Add the NLP resource data -- nltk (base NLTK package) 44/184144/2
author jingjin.geng <jingjin.geng@samsung.com>
Mon, 16 Jul 2018 19:09:58 +0000 (03:09 +0800)
committer jingjing geng <jingjin.geng@samsung.com>
Mon, 16 Jul 2018 03:24:51 +0000 (03:24 +0000)
Change-Id: Ica4dbe967dc2a9e22a8bd07bc39fa7e77572823d

537 files changed:
nlp_resource_data/CMakeLists.txt
nlp_resource_data/nltk/VERSION [new file with mode: 0755]
nlp_resource_data/nltk/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/app/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/app/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/app/chartparser_app.py [new file with mode: 0755]
nlp_resource_data/nltk/app/chartparser_app.pyc [new file with mode: 0755]
nlp_resource_data/nltk/app/chunkparser_app.py [new file with mode: 0755]
nlp_resource_data/nltk/app/chunkparser_app.pyc [new file with mode: 0755]
nlp_resource_data/nltk/app/collocations_app.py [new file with mode: 0755]
nlp_resource_data/nltk/app/collocations_app.pyc [new file with mode: 0755]
nlp_resource_data/nltk/app/concordance_app.py [new file with mode: 0755]
nlp_resource_data/nltk/app/concordance_app.pyc [new file with mode: 0755]
nlp_resource_data/nltk/app/nemo_app.py [new file with mode: 0755]
nlp_resource_data/nltk/app/nemo_app.pyc [new file with mode: 0755]
nlp_resource_data/nltk/app/rdparser_app.py [new file with mode: 0755]
nlp_resource_data/nltk/app/rdparser_app.pyc [new file with mode: 0755]
nlp_resource_data/nltk/app/srparser_app.py [new file with mode: 0755]
nlp_resource_data/nltk/app/srparser_app.pyc [new file with mode: 0755]
nlp_resource_data/nltk/app/wordfreq_app.py [new file with mode: 0755]
nlp_resource_data/nltk/app/wordfreq_app.pyc [new file with mode: 0755]
nlp_resource_data/nltk/app/wordnet_app.py [new file with mode: 0755]
nlp_resource_data/nltk/app/wordnet_app.pyc [new file with mode: 0755]
nlp_resource_data/nltk/book.py [new file with mode: 0755]
nlp_resource_data/nltk/book.pyc [new file with mode: 0755]
nlp_resource_data/nltk/ccg/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/ccg/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/ccg/api.py [new file with mode: 0755]
nlp_resource_data/nltk/ccg/api.pyc [new file with mode: 0755]
nlp_resource_data/nltk/ccg/chart.py [new file with mode: 0755]
nlp_resource_data/nltk/ccg/chart.pyc [new file with mode: 0755]
nlp_resource_data/nltk/ccg/combinator.py [new file with mode: 0755]
nlp_resource_data/nltk/ccg/combinator.pyc [new file with mode: 0755]
nlp_resource_data/nltk/ccg/lexicon.py [new file with mode: 0755]
nlp_resource_data/nltk/ccg/lexicon.pyc [new file with mode: 0755]
nlp_resource_data/nltk/ccg/logic.py [new file with mode: 0755]
nlp_resource_data/nltk/ccg/logic.pyc [new file with mode: 0755]
nlp_resource_data/nltk/chat/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/chat/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/chat/eliza.py [new file with mode: 0755]
nlp_resource_data/nltk/chat/eliza.pyc [new file with mode: 0755]
nlp_resource_data/nltk/chat/iesha.py [new file with mode: 0755]
nlp_resource_data/nltk/chat/iesha.pyc [new file with mode: 0755]
nlp_resource_data/nltk/chat/rude.py [new file with mode: 0755]
nlp_resource_data/nltk/chat/rude.pyc [new file with mode: 0755]
nlp_resource_data/nltk/chat/suntsu.py [new file with mode: 0755]
nlp_resource_data/nltk/chat/suntsu.pyc [new file with mode: 0755]
nlp_resource_data/nltk/chat/util.py [new file with mode: 0755]
nlp_resource_data/nltk/chat/util.pyc [new file with mode: 0755]
nlp_resource_data/nltk/chat/zen.py [new file with mode: 0755]
nlp_resource_data/nltk/chat/zen.pyc [new file with mode: 0755]
nlp_resource_data/nltk/chunk/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/chunk/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/chunk/api.py [new file with mode: 0755]
nlp_resource_data/nltk/chunk/api.pyc [new file with mode: 0755]
nlp_resource_data/nltk/chunk/named_entity.py [new file with mode: 0755]
nlp_resource_data/nltk/chunk/named_entity.pyc [new file with mode: 0755]
nlp_resource_data/nltk/chunk/regexp.py [new file with mode: 0755]
nlp_resource_data/nltk/chunk/regexp.pyc [new file with mode: 0755]
nlp_resource_data/nltk/chunk/util.py [new file with mode: 0755]
nlp_resource_data/nltk/chunk/util.pyc [new file with mode: 0755]
nlp_resource_data/nltk/classify/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/classify/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/classify/api.py [new file with mode: 0755]
nlp_resource_data/nltk/classify/api.pyc [new file with mode: 0755]
nlp_resource_data/nltk/classify/decisiontree.py [new file with mode: 0755]
nlp_resource_data/nltk/classify/decisiontree.pyc [new file with mode: 0755]
nlp_resource_data/nltk/classify/maxent.py [new file with mode: 0755]
nlp_resource_data/nltk/classify/maxent.pyc [new file with mode: 0755]
nlp_resource_data/nltk/classify/megam.py [new file with mode: 0755]
nlp_resource_data/nltk/classify/megam.pyc [new file with mode: 0755]
nlp_resource_data/nltk/classify/naivebayes.py [new file with mode: 0755]
nlp_resource_data/nltk/classify/naivebayes.pyc [new file with mode: 0755]
nlp_resource_data/nltk/classify/positivenaivebayes.py [new file with mode: 0755]
nlp_resource_data/nltk/classify/positivenaivebayes.pyc [new file with mode: 0755]
nlp_resource_data/nltk/classify/rte_classify.py [new file with mode: 0755]
nlp_resource_data/nltk/classify/rte_classify.pyc [new file with mode: 0755]
nlp_resource_data/nltk/classify/scikitlearn.py [new file with mode: 0755]
nlp_resource_data/nltk/classify/scikitlearn.pyc [new file with mode: 0755]
nlp_resource_data/nltk/classify/senna.py [new file with mode: 0755]
nlp_resource_data/nltk/classify/senna.pyc [new file with mode: 0755]
nlp_resource_data/nltk/classify/svm.py [new file with mode: 0755]
nlp_resource_data/nltk/classify/svm.pyc [new file with mode: 0755]
nlp_resource_data/nltk/classify/tadm.py [new file with mode: 0755]
nlp_resource_data/nltk/classify/tadm.pyc [new file with mode: 0755]
nlp_resource_data/nltk/classify/textcat.py [new file with mode: 0755]
nlp_resource_data/nltk/classify/textcat.pyc [new file with mode: 0755]
nlp_resource_data/nltk/classify/util.py [new file with mode: 0755]
nlp_resource_data/nltk/classify/util.pyc [new file with mode: 0755]
nlp_resource_data/nltk/classify/weka.py [new file with mode: 0755]
nlp_resource_data/nltk/classify/weka.pyc [new file with mode: 0755]
nlp_resource_data/nltk/cluster/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/cluster/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/cluster/api.py [new file with mode: 0755]
nlp_resource_data/nltk/cluster/api.pyc [new file with mode: 0755]
nlp_resource_data/nltk/cluster/em.py [new file with mode: 0755]
nlp_resource_data/nltk/cluster/em.pyc [new file with mode: 0755]
nlp_resource_data/nltk/cluster/gaac.py [new file with mode: 0755]
nlp_resource_data/nltk/cluster/gaac.pyc [new file with mode: 0755]
nlp_resource_data/nltk/cluster/kmeans.py [new file with mode: 0755]
nlp_resource_data/nltk/cluster/kmeans.pyc [new file with mode: 0755]
nlp_resource_data/nltk/cluster/util.py [new file with mode: 0755]
nlp_resource_data/nltk/cluster/util.pyc [new file with mode: 0755]
nlp_resource_data/nltk/collections.py [new file with mode: 0755]
nlp_resource_data/nltk/collections.pyc [new file with mode: 0755]
nlp_resource_data/nltk/collocations.py [new file with mode: 0755]
nlp_resource_data/nltk/collocations.pyc [new file with mode: 0755]
nlp_resource_data/nltk/compat.py [new file with mode: 0755]
nlp_resource_data/nltk/compat.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/europarl_raw.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/europarl_raw.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/aligned.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/aligned.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/api.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/api.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/bnc.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/bnc.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/bracket_parse.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/bracket_parse.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/categorized_sents.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/categorized_sents.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/chasen.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/chasen.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/childes.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/childes.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/chunked.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/chunked.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/cmudict.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/cmudict.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/comparative_sents.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/comparative_sents.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/conll.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/conll.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/crubadan.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/crubadan.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/dependency.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/dependency.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/framenet.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/framenet.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/ieer.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/ieer.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/indian.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/indian.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/ipipan.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/ipipan.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/knbc.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/knbc.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/lin.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/lin.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/mte.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/mte.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/nkjp.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/nkjp.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/nombank.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/nombank.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/nps_chat.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/nps_chat.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/opinion_lexicon.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/opinion_lexicon.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/panlex_lite.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/panlex_lite.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/pl196x.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/pl196x.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/plaintext.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/plaintext.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/ppattach.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/ppattach.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/propbank.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/propbank.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/pros_cons.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/pros_cons.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/reviews.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/reviews.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/rte.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/rte.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/semcor.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/semcor.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/senseval.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/senseval.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/sentiwordnet.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/sentiwordnet.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/sinica_treebank.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/sinica_treebank.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/string_category.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/string_category.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/switchboard.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/switchboard.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/tagged.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/tagged.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/timit.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/timit.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/toolbox.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/toolbox.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/twitter.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/twitter.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/udhr.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/udhr.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/util.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/util.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/verbnet.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/verbnet.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/wordlist.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/wordlist.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/wordnet.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/wordnet.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/xmldocs.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/xmldocs.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/ycoe.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/reader/ycoe.pyc [new file with mode: 0755]
nlp_resource_data/nltk/corpus/util.py [new file with mode: 0755]
nlp_resource_data/nltk/corpus/util.pyc [new file with mode: 0755]
nlp_resource_data/nltk/data.py [new file with mode: 0755]
nlp_resource_data/nltk/data.pyc [new file with mode: 0755]
nlp_resource_data/nltk/decorators.py [new file with mode: 0755]
nlp_resource_data/nltk/decorators.pyc [new file with mode: 0755]
nlp_resource_data/nltk/downloader.py [new file with mode: 0755]
nlp_resource_data/nltk/downloader.pyc [new file with mode: 0755]
nlp_resource_data/nltk/draw/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/draw/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/draw/cfg.py [new file with mode: 0755]
nlp_resource_data/nltk/draw/cfg.pyc [new file with mode: 0755]
nlp_resource_data/nltk/draw/dispersion.py [new file with mode: 0755]
nlp_resource_data/nltk/draw/dispersion.pyc [new file with mode: 0755]
nlp_resource_data/nltk/draw/table.py [new file with mode: 0755]
nlp_resource_data/nltk/draw/table.pyc [new file with mode: 0755]
nlp_resource_data/nltk/draw/tree.py [new file with mode: 0755]
nlp_resource_data/nltk/draw/tree.pyc [new file with mode: 0755]
nlp_resource_data/nltk/draw/util.py [new file with mode: 0755]
nlp_resource_data/nltk/draw/util.pyc [new file with mode: 0755]
nlp_resource_data/nltk/featstruct.py [new file with mode: 0755]
nlp_resource_data/nltk/featstruct.pyc [new file with mode: 0755]
nlp_resource_data/nltk/grammar.py [new file with mode: 0755]
nlp_resource_data/nltk/grammar.pyc [new file with mode: 0755]
nlp_resource_data/nltk/help.py [new file with mode: 0755]
nlp_resource_data/nltk/help.pyc [new file with mode: 0755]
nlp_resource_data/nltk/inference/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/inference/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/inference/api.py [new file with mode: 0755]
nlp_resource_data/nltk/inference/api.pyc [new file with mode: 0755]
nlp_resource_data/nltk/inference/discourse.py [new file with mode: 0755]
nlp_resource_data/nltk/inference/discourse.pyc [new file with mode: 0755]
nlp_resource_data/nltk/inference/mace.py [new file with mode: 0755]
nlp_resource_data/nltk/inference/mace.pyc [new file with mode: 0755]
nlp_resource_data/nltk/inference/nonmonotonic.py [new file with mode: 0755]
nlp_resource_data/nltk/inference/nonmonotonic.pyc [new file with mode: 0755]
nlp_resource_data/nltk/inference/prover9.py [new file with mode: 0755]
nlp_resource_data/nltk/inference/prover9.pyc [new file with mode: 0755]
nlp_resource_data/nltk/inference/resolution.py [new file with mode: 0755]
nlp_resource_data/nltk/inference/resolution.pyc [new file with mode: 0755]
nlp_resource_data/nltk/inference/tableau.py [new file with mode: 0755]
nlp_resource_data/nltk/inference/tableau.pyc [new file with mode: 0755]
nlp_resource_data/nltk/internals.py [new file with mode: 0755]
nlp_resource_data/nltk/internals.pyc [new file with mode: 0755]
nlp_resource_data/nltk/jsontags.py [new file with mode: 0755]
nlp_resource_data/nltk/jsontags.pyc [new file with mode: 0755]
nlp_resource_data/nltk/lazyimport.py [new file with mode: 0755]
nlp_resource_data/nltk/lazyimport.pyc [new file with mode: 0755]
nlp_resource_data/nltk/metrics/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/metrics/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/metrics/agreement.py [new file with mode: 0755]
nlp_resource_data/nltk/metrics/agreement.pyc [new file with mode: 0755]
nlp_resource_data/nltk/metrics/aline.py [new file with mode: 0755]
nlp_resource_data/nltk/metrics/aline.pyc [new file with mode: 0755]
nlp_resource_data/nltk/metrics/association.py [new file with mode: 0755]
nlp_resource_data/nltk/metrics/association.pyc [new file with mode: 0755]
nlp_resource_data/nltk/metrics/confusionmatrix.py [new file with mode: 0755]
nlp_resource_data/nltk/metrics/confusionmatrix.pyc [new file with mode: 0755]
nlp_resource_data/nltk/metrics/distance.py [new file with mode: 0755]
nlp_resource_data/nltk/metrics/distance.pyc [new file with mode: 0755]
nlp_resource_data/nltk/metrics/paice.py [new file with mode: 0755]
nlp_resource_data/nltk/metrics/paice.pyc [new file with mode: 0755]
nlp_resource_data/nltk/metrics/scores.py [new file with mode: 0755]
nlp_resource_data/nltk/metrics/scores.pyc [new file with mode: 0755]
nlp_resource_data/nltk/metrics/segmentation.py [new file with mode: 0755]
nlp_resource_data/nltk/metrics/segmentation.pyc [new file with mode: 0755]
nlp_resource_data/nltk/metrics/spearman.py [new file with mode: 0755]
nlp_resource_data/nltk/metrics/spearman.pyc [new file with mode: 0755]
nlp_resource_data/nltk/misc/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/misc/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/misc/babelfish.py [new file with mode: 0755]
nlp_resource_data/nltk/misc/babelfish.pyc [new file with mode: 0755]
nlp_resource_data/nltk/misc/chomsky.py [new file with mode: 0755]
nlp_resource_data/nltk/misc/chomsky.pyc [new file with mode: 0755]
nlp_resource_data/nltk/misc/minimalset.py [new file with mode: 0755]
nlp_resource_data/nltk/misc/minimalset.pyc [new file with mode: 0755]
nlp_resource_data/nltk/misc/sort.py [new file with mode: 0755]
nlp_resource_data/nltk/misc/sort.pyc [new file with mode: 0755]
nlp_resource_data/nltk/misc/wordfinder.py [new file with mode: 0755]
nlp_resource_data/nltk/misc/wordfinder.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/api.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/api.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/bllip.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/bllip.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/chart.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/chart.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/corenlp.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/corenlp.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/dependencygraph.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/dependencygraph.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/earleychart.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/earleychart.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/evaluate.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/evaluate.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/featurechart.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/featurechart.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/generate.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/generate.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/malt.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/malt.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/nonprojectivedependencyparser.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/nonprojectivedependencyparser.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/pchart.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/pchart.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/projectivedependencyparser.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/projectivedependencyparser.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/recursivedescent.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/recursivedescent.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/shiftreduce.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/shiftreduce.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/stanford.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/stanford.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/transitionparser.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/transitionparser.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/util.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/util.pyc [new file with mode: 0755]
nlp_resource_data/nltk/parse/viterbi.py [new file with mode: 0755]
nlp_resource_data/nltk/parse/viterbi.pyc [new file with mode: 0755]
nlp_resource_data/nltk/probability.py [new file with mode: 0755]
nlp_resource_data/nltk/probability.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sem/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/sem/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sem/boxer.py [new file with mode: 0755]
nlp_resource_data/nltk/sem/boxer.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sem/chat80.py [new file with mode: 0755]
nlp_resource_data/nltk/sem/chat80.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sem/cooper_storage.py [new file with mode: 0755]
nlp_resource_data/nltk/sem/cooper_storage.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sem/drt.py [new file with mode: 0755]
nlp_resource_data/nltk/sem/drt.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sem/drt_glue_demo.py [new file with mode: 0755]
nlp_resource_data/nltk/sem/drt_glue_demo.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sem/evaluate.py [new file with mode: 0755]
nlp_resource_data/nltk/sem/evaluate.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sem/glue.py [new file with mode: 0755]
nlp_resource_data/nltk/sem/glue.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sem/hole.py [new file with mode: 0755]
nlp_resource_data/nltk/sem/hole.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sem/lfg.py [new file with mode: 0755]
nlp_resource_data/nltk/sem/lfg.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sem/linearlogic.py [new file with mode: 0755]
nlp_resource_data/nltk/sem/linearlogic.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sem/logic.py [new file with mode: 0755]
nlp_resource_data/nltk/sem/logic.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sem/relextract.py [new file with mode: 0755]
nlp_resource_data/nltk/sem/relextract.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sem/skolemize.py [new file with mode: 0755]
nlp_resource_data/nltk/sem/skolemize.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sem/util.py [new file with mode: 0755]
nlp_resource_data/nltk/sem/util.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sentiment/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/sentiment/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sentiment/sentiment_analyzer.py [new file with mode: 0755]
nlp_resource_data/nltk/sentiment/sentiment_analyzer.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sentiment/util.py [new file with mode: 0755]
nlp_resource_data/nltk/sentiment/util.pyc [new file with mode: 0755]
nlp_resource_data/nltk/sentiment/vader.py [new file with mode: 0755]
nlp_resource_data/nltk/sentiment/vader.pyc [new file with mode: 0755]
nlp_resource_data/nltk/stem/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/stem/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/stem/api.py [new file with mode: 0755]
nlp_resource_data/nltk/stem/api.pyc [new file with mode: 0755]
nlp_resource_data/nltk/stem/arlstem.py [new file with mode: 0755]
nlp_resource_data/nltk/stem/arlstem.pyc [new file with mode: 0755]
nlp_resource_data/nltk/stem/isri.py [new file with mode: 0755]
nlp_resource_data/nltk/stem/isri.pyc [new file with mode: 0755]
nlp_resource_data/nltk/stem/lancaster.py [new file with mode: 0755]
nlp_resource_data/nltk/stem/lancaster.pyc [new file with mode: 0755]
nlp_resource_data/nltk/stem/porter.py [new file with mode: 0755]
nlp_resource_data/nltk/stem/porter.pyc [new file with mode: 0755]
nlp_resource_data/nltk/stem/regexp.py [new file with mode: 0755]
nlp_resource_data/nltk/stem/regexp.pyc [new file with mode: 0755]
nlp_resource_data/nltk/stem/rslp.py [new file with mode: 0755]
nlp_resource_data/nltk/stem/rslp.pyc [new file with mode: 0755]
nlp_resource_data/nltk/stem/snowball.py [new file with mode: 0755]
nlp_resource_data/nltk/stem/snowball.pyc [new file with mode: 0755]
nlp_resource_data/nltk/stem/util.py [new file with mode: 0755]
nlp_resource_data/nltk/stem/util.pyc [new file with mode: 0755]
nlp_resource_data/nltk/stem/wordnet.py [new file with mode: 0755]
nlp_resource_data/nltk/stem/wordnet.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tag/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/tag/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tag/api.py [new file with mode: 0755]
nlp_resource_data/nltk/tag/api.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tag/brill.py [new file with mode: 0755]
nlp_resource_data/nltk/tag/brill.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tag/brill_trainer.py [new file with mode: 0755]
nlp_resource_data/nltk/tag/brill_trainer.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tag/crf.py [new file with mode: 0755]
nlp_resource_data/nltk/tag/crf.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tag/hmm.py [new file with mode: 0755]
nlp_resource_data/nltk/tag/hmm.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tag/hunpos.py [new file with mode: 0755]
nlp_resource_data/nltk/tag/hunpos.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tag/mapping.py [new file with mode: 0755]
nlp_resource_data/nltk/tag/mapping.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tag/perceptron.py [new file with mode: 0755]
nlp_resource_data/nltk/tag/perceptron.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tag/senna.py [new file with mode: 0755]
nlp_resource_data/nltk/tag/senna.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tag/sequential.py [new file with mode: 0755]
nlp_resource_data/nltk/tag/sequential.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tag/stanford.py [new file with mode: 0755]
nlp_resource_data/nltk/tag/stanford.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tag/tnt.py [new file with mode: 0755]
nlp_resource_data/nltk/tag/tnt.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tag/util.py [new file with mode: 0755]
nlp_resource_data/nltk/tag/util.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tbl/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/tbl/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tbl/api.py [new file with mode: 0755]
nlp_resource_data/nltk/tbl/api.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tbl/demo.py [new file with mode: 0755]
nlp_resource_data/nltk/tbl/demo.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tbl/erroranalysis.py [new file with mode: 0755]
nlp_resource_data/nltk/tbl/erroranalysis.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tbl/feature.py [new file with mode: 0755]
nlp_resource_data/nltk/tbl/feature.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tbl/rule.py [new file with mode: 0755]
nlp_resource_data/nltk/tbl/rule.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tbl/template.py [new file with mode: 0755]
nlp_resource_data/nltk/tbl/template.pyc [new file with mode: 0755]
nlp_resource_data/nltk/text.py [new file with mode: 0755]
nlp_resource_data/nltk/text.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tgrep.py [new file with mode: 0755]
nlp_resource_data/nltk/tgrep.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/api.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/api.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/casual.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/casual.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/moses.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/moses.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/mwe.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/mwe.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/nist.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/nist.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/punkt.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/punkt.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/regexp.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/regexp.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/repp.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/repp.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/sexpr.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/sexpr.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/simple.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/simple.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/stanford.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/stanford.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/stanford_segmenter.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/stanford_segmenter.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/texttiling.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/texttiling.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/toktok.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/toktok.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/treebank.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/treebank.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/util.py [new file with mode: 0755]
nlp_resource_data/nltk/tokenize/util.pyc [new file with mode: 0755]
nlp_resource_data/nltk/toolbox.py [new file with mode: 0755]
nlp_resource_data/nltk/toolbox.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/api.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/api.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/bleu_score.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/bleu_score.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/chrf_score.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/chrf_score.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/gale_church.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/gale_church.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/gdfa.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/gdfa.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/gleu_score.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/gleu_score.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/ibm1.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/ibm1.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/ibm2.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/ibm2.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/ibm3.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/ibm3.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/ibm4.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/ibm4.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/ibm5.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/ibm5.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/ibm_model.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/ibm_model.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/metrics.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/metrics.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/nist_score.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/nist_score.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/phrase_based.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/phrase_based.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/ribes_score.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/ribes_score.pyc [new file with mode: 0755]
nlp_resource_data/nltk/translate/stack_decoder.py [new file with mode: 0755]
nlp_resource_data/nltk/translate/stack_decoder.pyc [new file with mode: 0755]
nlp_resource_data/nltk/tree.py [new file with mode: 0755]
nlp_resource_data/nltk/tree.pyc [new file with mode: 0755]
nlp_resource_data/nltk/treeprettyprinter.py [new file with mode: 0755]
nlp_resource_data/nltk/treeprettyprinter.pyc [new file with mode: 0755]
nlp_resource_data/nltk/treetransforms.py [new file with mode: 0755]
nlp_resource_data/nltk/treetransforms.pyc [new file with mode: 0755]
nlp_resource_data/nltk/twitter/__init__.py [new file with mode: 0755]
nlp_resource_data/nltk/twitter/__init__.pyc [new file with mode: 0755]
nlp_resource_data/nltk/twitter/api.py [new file with mode: 0755]
nlp_resource_data/nltk/twitter/api.pyc [new file with mode: 0755]
nlp_resource_data/nltk/twitter/common.py [new file with mode: 0755]
nlp_resource_data/nltk/twitter/common.pyc [new file with mode: 0755]
nlp_resource_data/nltk/twitter/twitter_demo.py [new file with mode: 0755]
nlp_resource_data/nltk/twitter/twitter_demo.pyc [new file with mode: 0755]
nlp_resource_data/nltk/twitter/twitterclient.py [new file with mode: 0755]
nlp_resource_data/nltk/twitter/twitterclient.pyc [new file with mode: 0755]
nlp_resource_data/nltk/twitter/util.py [new file with mode: 0755]
nlp_resource_data/nltk/twitter/util.pyc [new file with mode: 0755]
nlp_resource_data/nltk/util.py [new file with mode: 0755]
nlp_resource_data/nltk/util.pyc [new file with mode: 0755]
nlp_resource_data/nltk/wsd.py [new file with mode: 0755]
nlp_resource_data/nltk/wsd.pyc [new file with mode: 0755]
packaging/nlp.spec

diff --git a/nlp_resource_data/CMakeLists.txt b/nlp_resource_data/CMakeLists.txt
index 474de41..684ff4f 100755 (executable)
@@ -4,7 +4,7 @@ PROJECT(${fw_name} C)
 
 INCLUDE(FindPkgConfig)
 
-#INSTALL(DIRECTORY nltk DESTINATION /usr/lib/python2.7/site-packages)
+INSTALL(DIRECTORY nltk DESTINATION /usr/lib/python2.7/site-packages)
 INSTALL(DIRECTORY langdetect DESTINATION /usr/lib/python2.7/site-packages)
 #INSTALL(DIRECTORY nltk_data DESTINATION /usr/local/lib/)
 
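With INSTALL(DIRECTORY nltk DESTINATION /usr/lib/python2.7/site-packages) enabled, the bundled nltk tree is installed alongside langdetect at image build time. A minimal sketch of checking the packaged copy on a target (assuming the rpm built from packaging/nlp.spec is installed; this snippet is not part of the change):

    # Python 2.7 on the target image
    import nltk

    # The version string is read from the bundled VERSION file added below.
    print(nltk.__version__)   # expected: 3.2.5

    # The package should resolve from the packaged location, not a pip install.
    print(nltk.__file__)      # expected prefix: /usr/lib/python2.7/site-packages/nltk/
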
diff --git a/nlp_resource_data/nltk/VERSION b/nlp_resource_data/nltk/VERSION
new file mode 100755 (executable)
index 0000000..5ae69bd
--- /dev/null
@@ -0,0 +1 @@
+3.2.5
diff --git a/nlp_resource_data/nltk/__init__.py b/nlp_resource_data/nltk/__init__.py
new file mode 100755 (executable)
index 0000000..c47f6be
--- /dev/null
@@ -0,0 +1,185 @@
+# Natural Language Toolkit (NLTK)
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors: Steven Bird <stevenbird1@gmail.com>
+#          Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+The Natural Language Toolkit (NLTK) is an open source Python library
+for Natural Language Processing.  A free online book is available.
+(If you use the library for academic research, please cite the book.)
+
+Steven Bird, Ewan Klein, and Edward Loper (2009).
+Natural Language Processing with Python.  O'Reilly Media Inc.
+http://nltk.org/book
+"""
+from __future__ import print_function, absolute_import
+
+import os
+
+# //////////////////////////////////////////////////////
+# Metadata
+# //////////////////////////////////////////////////////
+
+# Version.  For each new release, the version number should be updated
+# in the file VERSION.
+try:
+    # If a VERSION file exists, use it!
+    version_file = os.path.join(os.path.dirname(__file__), 'VERSION')
+    with open(version_file, 'r') as infile:
+        __version__ = infile.read().strip()
+except NameError:
+    __version__ = 'unknown (running code interactively?)'
+except IOError as ex:
+    __version__ = "unknown (%s)" % ex
+
+if __doc__ is not None:  # fix for the ``python -OO``
+    __doc__ += '\n@version: ' + __version__
+
+
+# Copyright notice
+__copyright__ = """\
+Copyright (C) 2001-2017 NLTK Project.
+
+Distributed and Licensed under the Apache License, Version 2.0,
+which is included by reference.
+"""
+
+__license__ = "Apache License, Version 2.0"
+# Description of the toolkit, keywords, and the project's primary URL.
+__longdescr__ = """\
+The Natural Language Toolkit (NLTK) is a Python package for
+natural language processing.  NLTK requires Python 2.6 or higher."""
+__keywords__ = ['NLP', 'CL', 'natural language processing',
+                'computational linguistics', 'parsing', 'tagging',
+                'tokenizing', 'syntax', 'linguistics', 'language',
+                'natural language', 'text analytics']
+__url__ = "http://nltk.org/"
+
+# Maintainer, contributors, etc.
+__maintainer__ = "Steven Bird, Edward Loper, Ewan Klein"
+__maintainer_email__ = "stevenbird1@gmail.com"
+__author__ = __maintainer__
+__author_email__ = __maintainer_email__
+
+# "Trove" classifiers for Python Package Index.
+__classifiers__ = [
+    'Development Status :: 5 - Production/Stable',
+    'Intended Audience :: Developers',
+    'Intended Audience :: Education',
+    'Intended Audience :: Information Technology',
+    'Intended Audience :: Science/Research',
+    'License :: OSI Approved :: Apache Software License',
+    'Operating System :: OS Independent',
+    'Programming Language :: Python :: 2.6',
+    'Programming Language :: Python :: 2.7',
+    'Topic :: Scientific/Engineering',
+    'Topic :: Scientific/Engineering :: Artificial Intelligence',
+    'Topic :: Scientific/Engineering :: Human Machine Interfaces',
+    'Topic :: Scientific/Engineering :: Information Analysis',
+    'Topic :: Text Processing',
+    'Topic :: Text Processing :: Filters',
+    'Topic :: Text Processing :: General',
+    'Topic :: Text Processing :: Indexing',
+    'Topic :: Text Processing :: Linguistic',
+]
+
+from nltk.internals import config_java
+
+# support numpy from pypy
+try:
+    import numpypy
+except ImportError:
+    pass
+
+# Override missing methods on environments where it cannot be used like GAE.
+import subprocess
+if not hasattr(subprocess, 'PIPE'):
+    def _fake_PIPE(*args, **kwargs):
+        raise NotImplementedError('subprocess.PIPE is not supported.')
+    subprocess.PIPE = _fake_PIPE
+if not hasattr(subprocess, 'Popen'):
+    def _fake_Popen(*args, **kwargs):
+        raise NotImplementedError('subprocess.Popen is not supported.')
+    subprocess.Popen = _fake_Popen
+
+###########################################################
+# TOP-LEVEL MODULES
+###########################################################
+
+# Import top-level functionality into top-level namespace
+
+from nltk.collocations import *
+from nltk.decorators import decorator, memoize
+from nltk.featstruct import *
+from nltk.grammar import *
+from nltk.probability import *
+from nltk.text import *
+from nltk.tree import *
+from nltk.util import *
+from nltk.jsontags import *
+
+###########################################################
+# PACKAGES
+###########################################################
+
+from nltk.chunk import *
+from nltk.classify import *
+from nltk.inference import *
+from nltk.metrics import *
+from nltk.parse import *
+from nltk.tag import *
+from nltk.tokenize import *
+from nltk.translate import *
+from nltk.sem import *
+from nltk.stem import *
+
+# Packages which can be lazily imported
+# (a) we don't import *
+# (b) they're slow to import or have run-time dependencies
+#     that can safely fail at run time
+
+from nltk import lazyimport
+app = lazyimport.LazyModule('nltk.app', locals(), globals())
+chat = lazyimport.LazyModule('nltk.chat', locals(), globals())
+corpus = lazyimport.LazyModule('nltk.corpus', locals(), globals())
+draw = lazyimport.LazyModule('nltk.draw', locals(), globals())
+toolbox = lazyimport.LazyModule('nltk.toolbox', locals(), globals())
+
+# Optional loading
+
+try:
+    import numpy
+except ImportError:
+    pass
+else:
+    from nltk import cluster
+
+from nltk.downloader import download, download_shell
+try:
+    from six.moves import tkinter
+except ImportError:
+    pass
+else:
+    try:
+        from nltk.downloader import download_gui
+    except RuntimeError as e:
+        import warnings
+        warnings.warn("Corpus downloader GUI not loaded "
+                      "(RuntimeError during import: %s)" % str(e))
+
+# explicitly import all top-level modules (ensuring
+# they override the same names inadvertently imported
+# from a subpackage)
+
+from nltk import ccg, chunk, classify, collocations
+from nltk import data, featstruct, grammar, help, inference, metrics
+from nltk import misc, parse, probability, sem, stem, wsd
+from nltk import tag, tbl, text, tokenize, translate, tree, treetransforms, util
+
+
+# override any accidentally imported demo
+def demo():
+    print("To run the demo code for a module, type nltk.module.demo()")
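The wildcard imports above flatten most of the library into the top-level nltk namespace, while nltk.app, nltk.chat, nltk.corpus, nltk.draw and nltk.toolbox stay behind LazyModule placeholders until first use. A small usage sketch of the resulting namespace (nothing below is added by this commit, and no corpus data is needed):

    import nltk

    # FreqDist is defined in nltk.probability but re-exported at the top
    # level by the wildcard imports in this __init__.py.
    fd = nltk.FreqDist("abracadabra")
    print(fd.most_common(1))   # [('a', 5)]

    # demo() is the stub defined at the end of this file.
    nltk.demo()

    # nltk.corpus, nltk.app, nltk.chat, nltk.draw and nltk.toolbox are
    # LazyModule objects; the real subpackages are imported only on the
    # first attribute access.
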
diff --git a/nlp_resource_data/nltk/__init__.pyc b/nlp_resource_data/nltk/__init__.pyc
new file mode 100755 (executable)
index 0000000..f2f789e
Binary files /dev/null and b/nlp_resource_data/nltk/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/app/__init__.py b/nlp_resource_data/nltk/app/__init__.py
new file mode 100755 (executable)
index 0000000..b843d88
--- /dev/null
@@ -0,0 +1,52 @@
+# Natural Language Toolkit: Applications package
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Interactive NLTK Applications:
+
+chartparser:  Chart Parser
+chunkparser:  Regular-Expression Chunk Parser
+collocations: Find collocations in text
+concordance:  Part-of-speech concordancer
+nemo:         Finding (and Replacing) Nemo regular expression tool
+rdparser:     Recursive Descent Parser
+srparser:     Shift-Reduce Parser
+wordnet:      WordNet Browser
+"""
+
+
+# Import Tkinter-based modules if Tkinter is installed
+try:
+    from six.moves import tkinter
+except ImportError:
+    import warnings
+    warnings.warn("nltk.app package not loaded "
+                  "(please install Tkinter library).")
+else:
+    from nltk.app.chartparser_app import app as chartparser
+    from nltk.app.chunkparser_app import app as chunkparser
+    from nltk.app.collocations_app import app as collocations
+    from nltk.app.concordance_app import app as concordance
+    from nltk.app.nemo_app import app as nemo
+    from nltk.app.rdparser_app import app as rdparser
+    from nltk.app.srparser_app import app as srparser
+    from nltk.app.wordnet_app import app as wordnet
+
+    try:
+        from matplotlib import pylab
+    except ImportError:
+        import warnings
+        warnings.warn("nltk.app.wordfreq not loaded "
+                      "(requires the matplotlib library).")
+    else:
+        from nltk.app.wordfreq_app import app as wordfreq
+
+# skip doctests from this package
+def setup_module(module):
+    from nose import SkipTest
+    raise SkipTest("nltk.app examples are not doctests")
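When Tkinter is importable, each tool listed in the docstring becomes a callable on nltk.app (chartparser, chunkparser, collocations, concordance, nemo, rdparser, srparser, wordnet, plus wordfreq when matplotlib is present). A sketch of launching one of them (requires Tkinter and a display, which a headless image may not provide):

    import nltk.app

    # Opens the shift-reduce parser demo in a Tk window.
    nltk.app.srparser()
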
diff --git a/nlp_resource_data/nltk/app/__init__.pyc b/nlp_resource_data/nltk/app/__init__.pyc
new file mode 100755 (executable)
index 0000000..9fa49cd
Binary files /dev/null and b/nlp_resource_data/nltk/app/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/app/chartparser_app.py b/nlp_resource_data/nltk/app/chartparser_app.py
new file mode 100755 (executable)
index 0000000..bc68d88
--- /dev/null
@@ -0,0 +1,2276 @@
+# Natural Language Toolkit: Chart Parser Application
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Jean Mark Gawron <gawron@mail.sdsu.edu>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A graphical tool for exploring chart parsing.
+
+Chart parsing is a flexible parsing algorithm that uses a data
+structure called a "chart" to record hypotheses about syntactic
+constituents.  Each hypothesis is represented by a single "edge" on
+the chart.  A set of "chart rules" determine when new edges can be
+added to the chart.  This set of rules controls the overall behavior
+of the parser (e.g. whether it parses top-down or bottom-up).
+
+The chart parsing tool demonstrates the process of parsing a single
+sentence, with a given grammar and lexicon.  Its display is divided
+into three sections: the bottom section displays the chart; the middle
+section displays the sentence; and the top section displays the
+partial syntax tree corresponding to the selected edge.  Buttons along
+the bottom of the window are used to control the execution of the
+algorithm.
+
+The chart parsing tool allows for flexible control of the parsing
+algorithm.  At each step of the algorithm, you can select which rule
+or strategy you wish to apply.  This allows you to experiment with
+mixing different strategies (e.g. top-down and bottom-up).  You can
+exercise fine-grained control over the algorithm by selecting which
+edge you wish to apply a rule to.
+"""
+
+# At some point, we should rewrite this tool to use the new canvas
+# widget system.
+
+
+from __future__ import division
+import pickle
+import os.path
+
+from six.moves.tkinter import (Button, Canvas, Checkbutton, Frame, IntVar,
+                               Label, Menu, Scrollbar, Tk, Toplevel)
+from six.moves.tkinter_font import Font
+from six.moves.tkinter_messagebox import showerror, showinfo
+from six.moves.tkinter_tkfiledialog import asksaveasfilename, askopenfilename
+
+from nltk.parse.chart import (BottomUpPredictCombineRule, BottomUpPredictRule,
+                              Chart, LeafEdge, LeafInitRule, SingleEdgeFundamentalRule,
+                              SteppingChartParser, TopDownInitRule, TopDownPredictRule,
+                              TreeEdge)
+from nltk.tree import Tree
+from nltk.grammar import Nonterminal, CFG
+from nltk.util import in_idle
+from nltk.draw.util import (CanvasFrame, ColorizedList,
+                            EntryDialog, MutableOptionMenu,
+                            ShowText, SymbolWidget)
+from nltk.draw import CFGEditor, tree_to_treesegment, TreeSegmentWidget
+
+# Known bug: ChartView doesn't handle edges generated by epsilon
+# productions (e.g., [Production: PP -> ]) very well.
+
+#######################################################################
+# Edge List
+#######################################################################
+
+class EdgeList(ColorizedList):
+    ARROW = SymbolWidget.SYMBOLS['rightarrow']
+
+    def _init_colortags(self, textwidget, options):
+        textwidget.tag_config('terminal', foreground='#006000')
+        textwidget.tag_config('arrow', font='symbol', underline='0')
+        textwidget.tag_config('dot', foreground = '#000000')
+        textwidget.tag_config('nonterminal', foreground='blue',
+                              font=('helvetica', -12, 'bold'))
+
+    def _item_repr(self, item):
+        contents = []
+        contents.append(('%s\t' % item.lhs(), 'nonterminal'))
+        contents.append((self.ARROW, 'arrow'))
+        for i, elt in enumerate(item.rhs()):
+            if i == item.dot():
+                contents.append((' *', 'dot'))
+            if isinstance(elt, Nonterminal):
+                contents.append((' %s' % elt.symbol(), 'nonterminal'))
+            else:
+                contents.append((' %r' % elt, 'terminal'))
+        if item.is_complete():
+            contents.append((' *', 'dot'))
+        return contents
+
+#######################################################################
+# Chart Matrix View
+#######################################################################
+
+class ChartMatrixView(object):
+    """
+    A view of a chart that displays the contents of the corresponding matrix.
+    """
+    def __init__(self, parent, chart, toplevel=True, title='Chart Matrix',
+                 show_numedges=False):
+        self._chart = chart
+        self._cells = []
+        self._marks = []
+
+        self._selected_cell = None
+
+        if toplevel:
+            self._root = Toplevel(parent)
+            self._root.title(title)
+            self._root.bind('<Control-q>', self.destroy)
+            self._init_quit(self._root)
+        else:
+            self._root = Frame(parent)
+
+        self._init_matrix(self._root)
+        self._init_list(self._root)
+        if show_numedges:
+            self._init_numedges(self._root)
+        else:
+            self._numedges_label = None
+
+        self._callbacks = {}
+
+        self._num_edges = 0
+
+        self.draw()
+
+    def _init_quit(self, root):
+        quit = Button(root, text='Quit', command=self.destroy)
+        quit.pack(side='bottom', expand=0, fill='none')
+
+    def _init_matrix(self, root):
+        cframe = Frame(root, border=2, relief='sunken')
+        cframe.pack(expand=0, fill='none', padx=1, pady=3, side='top')
+        self._canvas = Canvas(cframe, width=200, height=200,
+                                      background='white')
+        self._canvas.pack(expand=0, fill='none')
+
+    def _init_numedges(self, root):
+        self._numedges_label = Label(root, text='0 edges')
+        self._numedges_label.pack(expand=0, fill='none', side='top')
+
+    def _init_list(self, root):
+        self._list = EdgeList(root, [], width=20, height=5)
+        self._list.pack(side='top', expand=1, fill='both', pady=3)
+        def cb(edge, self=self): self._fire_callbacks('select', edge)
+        self._list.add_callback('select', cb)
+        self._list.focus()
+
+    def destroy(self, *e):
+        if self._root is None: return
+        try: self._root.destroy()
+        except: pass
+        self._root = None
+
+    def set_chart(self, chart):
+        if chart is not self._chart:
+            self._chart = chart
+            self._num_edges = 0
+            self.draw()
+
+    def update(self):
+        if self._root is None: return
+
+        # Count the edges in each cell
+        N = len(self._cells)
+        cell_edges = [[0 for i in range(N)] for j in range(N)]
+        for edge in self._chart:
+            cell_edges[edge.start()][edge.end()] += 1
+
+        # Color the cells correspondingly.
+        for i in range(N):
+            for j in range(i, N):
+                if cell_edges[i][j] == 0:
+                    color = 'gray20'
+                else:
+                    color = ('#00%02x%02x' %
+                             (min(255, 50+128*cell_edges[i][j]/10),
+                              max(0, 128-128*cell_edges[i][j]/10)))
+                cell_tag = self._cells[i][j]
+                self._canvas.itemconfig(cell_tag, fill=color)
+                if (i,j) == self._selected_cell:
+                    self._canvas.itemconfig(cell_tag, outline='#00ffff',
+                                            width=3)
+                    self._canvas.tag_raise(cell_tag)
+                else:
+                    self._canvas.itemconfig(cell_tag, outline='black',
+                                            width=1)
+
+        # Update the edge list.
+        edges = list(self._chart.select(span=self._selected_cell))
+        self._list.set(edges)
+
+        # Update our edge count.
+        self._num_edges = self._chart.num_edges()
+        if self._numedges_label is not None:
+            self._numedges_label['text'] = '%d edges' % self._num_edges
+
+    def activate(self):
+        self._canvas.itemconfig('inactivebox', state='hidden')
+        self.update()
+
+    def inactivate(self):
+        self._canvas.itemconfig('inactivebox', state='normal')
+        self.update()
+
+    def add_callback(self, event, func):
+        self._callbacks.setdefault(event,{})[func] = 1
+
+    def remove_callback(self, event, func=None):
+        if func is None: del self._callbacks[event]
+        else:
+            try: del self._callbacks[event][func]
+            except: pass
+
+    def _fire_callbacks(self, event, *args):
+        if event not in self._callbacks: return
+        for cb_func in list(self._callbacks[event].keys()): cb_func(*args)
+
+    def select_cell(self, i, j):
+        if self._root is None: return
+
+        # If the cell is already selected (and the chart contents
+        # haven't changed), then do nothing.
+        if ((i,j) == self._selected_cell and
+            self._chart.num_edges() == self._num_edges): return
+
+        self._selected_cell = (i,j)
+        self.update()
+
+        # Fire the callback.
+        self._fire_callbacks('select_cell', i, j)
+
+    def deselect_cell(self):
+        if self._root is None: return
+        self._selected_cell = None
+        self._list.set([])
+        self.update()
+
+    def _click_cell(self, i, j):
+        if self._selected_cell == (i,j):
+            self.deselect_cell()
+        else:
+            self.select_cell(i, j)
+
+    def view_edge(self, edge):
+        self.select_cell(*edge.span())
+        self._list.view(edge)
+
+    def mark_edge(self, edge):
+        if self._root is None: return
+        self.select_cell(*edge.span())
+        self._list.mark(edge)
+
+    def unmark_edge(self, edge=None):
+        if self._root is None: return
+        self._list.unmark(edge)
+
+    def markonly_edge(self, edge):
+        if self._root is None: return
+        self.select_cell(*edge.span())
+        self._list.markonly(edge)
+
+    def draw(self):
+        if self._root is None: return
+        LEFT_MARGIN = BOT_MARGIN = 15
+        TOP_MARGIN = 5
+        c = self._canvas
+        c.delete('all')
+        N = self._chart.num_leaves()+1
+        dx = (int(c['width'])-LEFT_MARGIN)/N
+        dy = (int(c['height'])-TOP_MARGIN-BOT_MARGIN)/N
+
+        c.delete('all')
+
+        # Labels and dotted lines
+        for i in range(N):
+            c.create_text(LEFT_MARGIN-2, i*dy+dy/2+TOP_MARGIN,
+                          text=repr(i), anchor='e')
+            c.create_text(i*dx+dx/2+LEFT_MARGIN, N*dy+TOP_MARGIN+1,
+                          text=repr(i), anchor='n')
+            c.create_line(LEFT_MARGIN, dy*(i+1)+TOP_MARGIN,
+                          dx*N+LEFT_MARGIN, dy*(i+1)+TOP_MARGIN, dash='.')
+            c.create_line(dx*i+LEFT_MARGIN, TOP_MARGIN,
+                          dx*i+LEFT_MARGIN, dy*N+TOP_MARGIN, dash='.')
+
+        # A box around the whole thing
+        c.create_rectangle(LEFT_MARGIN, TOP_MARGIN,
+                           LEFT_MARGIN+dx*N, dy*N+TOP_MARGIN,
+                           width=2)
+
+        # Cells
+        self._cells = [[None for i in range(N)] for j in range(N)]
+        for i in range(N):
+            for j in range(i, N):
+                t = c.create_rectangle(j*dx+LEFT_MARGIN, i*dy+TOP_MARGIN,
+                                       (j+1)*dx+LEFT_MARGIN,
+                                       (i+1)*dy+TOP_MARGIN,
+                                       fill='gray20')
+                self._cells[i][j] = t
+                def cb(event, self=self, i=i, j=j): self._click_cell(i,j)
+                c.tag_bind(t, '<Button-1>', cb)
+
+        # Inactive box
+        xmax, ymax = int(c['width']), int(c['height'])
+        t = c.create_rectangle(-100, -100, xmax+100, ymax+100,
+                               fill='gray50', state='hidden',
+                               tag='inactivebox')
+        c.tag_lower(t)
+
+        # Update the cells.
+        self.update()
+
+    def pack(self, *args, **kwargs):
+        self._root.pack(*args, **kwargs)
+
+#######################################################################
+# Chart Results View
+#######################################################################
+
+class ChartResultsView(object):
+    def __init__(self, parent, chart, grammar, toplevel=True):
+        self._chart = chart
+        self._grammar = grammar
+        self._trees = []
+        self._y = 10
+        self._treewidgets = []
+        self._selection = None
+        self._selectbox = None
+
+        if toplevel:
+            self._root = Toplevel(parent)
+            self._root.title('Chart Parser Application: Results')
+            self._root.bind('<Control-q>', self.destroy)
+        else:
+            self._root = Frame(parent)
+
+        # Buttons
+        if toplevel:
+            buttons = Frame(self._root)
+            buttons.pack(side='bottom', expand=0, fill='x')
+            Button(buttons, text='Quit',
+                           command=self.destroy).pack(side='right')
+            Button(buttons, text='Print All',
+                           command=self.print_all).pack(side='left')
+            Button(buttons, text='Print Selection',
+                           command=self.print_selection).pack(side='left')
+
+        # Canvas frame.
+        self._cframe = CanvasFrame(self._root, closeenough=20)
+        self._cframe.pack(side='top', expand=1, fill='both')
+
+        # Initial update
+        self.update()
+
+    def update(self, edge=None):
+        if self._root is None: return
+        # If the edge isn't a parse edge, do nothing.
+        if edge is not None:
+            if edge.lhs() != self._grammar.start(): return
+            if edge.span() != (0, self._chart.num_leaves()): return
+
+        for parse in self._chart.parses(self._grammar.start()):
+            if parse not in self._trees:
+                self._add(parse)
+
+    def _add(self, parse):
+        # Add it to self._trees.
+        self._trees.append(parse)
+
+        # Create a widget for it.
+        c = self._cframe.canvas()
+        treewidget = tree_to_treesegment(c, parse)
+
+        # Add it to the canvas frame.
+        self._treewidgets.append(treewidget)
+        self._cframe.add_widget(treewidget, 10, self._y)
+
+        # Register callbacks.
+        treewidget.bind_click(self._click)
+
+        # Update y.
+        self._y = treewidget.bbox()[3] + 10
+
+    def _click(self, widget):
+        c = self._cframe.canvas()
+        if self._selection is not None:
+            c.delete(self._selectbox)
+        self._selection = widget
+        (x1, y1, x2, y2) = widget.bbox()
+        self._selectbox = c.create_rectangle(x1, y1, x2, y2,
+                                             width=2, outline='#088')
+
+    def _color(self, treewidget, color):
+        treewidget.label()['color'] = color
+        for child in treewidget.subtrees():
+            if isinstance(child, TreeSegmentWidget):
+                self._color(child, color)
+            else:
+                child['color'] = color
+
+    def print_all(self, *e):
+        if self._root is None: return
+        self._cframe.print_to_file()
+
+    def print_selection(self, *e):
+        if self._root is None: return
+        if self._selection is None:
+            showerror('Print Error', 'No tree selected')
+        else:
+            c = self._cframe.canvas()
+            for widget in self._treewidgets:
+                if widget is not self._selection:
+                    self._cframe.destroy_widget(widget)
+            c.delete(self._selectbox)
+            (x1,y1,x2,y2) = self._selection.bbox()
+            self._selection.move(10-x1,10-y1)
+            c['scrollregion'] = '0 0 %s %s' % (x2-x1+20, y2-y1+20)
+            self._cframe.print_to_file()
+
+            # Restore our state.
+            self._treewidgets = [self._selection]
+            self.clear()
+            self.update()
+
+    def clear(self):
+        if self._root is None: return
+        for treewidget in self._treewidgets:
+            self._cframe.destroy_widget(treewidget)
+        self._trees = []
+        self._treewidgets = []
+        if self._selection is not None:
+            self._cframe.canvas().delete(self._selectbox)
+        self._selection = None
+        self._y = 10
+
+    def set_chart(self, chart):
+        self.clear()
+        self._chart = chart
+        self.update()
+
+    def set_grammar(self, grammar):
+        self.clear()
+        self._grammar = grammar
+        self.update()
+
+    def destroy(self, *e):
+        if self._root is None: return
+        try: self._root.destroy()
+        except: pass
+        self._root = None
+
+    def pack(self, *args, **kwargs):
+        self._root.pack(*args, **kwargs)
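+
+    # Rough usage sketch (hedged; ``parent``, ``chart``, ``grammar`` and
+    # ``edge`` are hypothetical, pre-existing objects):
+    #
+    #     results = ChartResultsView(parent, chart, grammar, toplevel=False)
+    #     results.update(edge)
+    #
+    # update(edge) ignores edges that are not complete parses (wrong lhs, or
+    # not spanning the whole chart); otherwise it draws any parse trees that
+    # are not already displayed.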
+
+#######################################################################
+# Chart Comparer
+#######################################################################
+
+class ChartComparer(object):
+    """
+
+    :ivar _root: The root window
+
+    :ivar _charts: A dictionary mapping names to charts.  When
+        charts are loaded, they are added to this dictionary.
+
+    :ivar _left_chart: The left ``Chart``.
+    :ivar _left_name: The name of ``_left_chart`` (derived from filename)
+    :ivar _left_matrix: The ``ChartMatrixView`` for ``_left_chart``
+    :ivar _left_selector: The drop-down ``MutableOptionMenu`` used
+          to select ``_left_chart``.
+
+    :ivar _right_chart: The right ``Chart``.
+    :ivar _right_name: The name of ``_right_chart`` (derived from filename)
+    :ivar _right_matrix: The ``ChartMatrixView`` for ``_right_chart``
+    :ivar _right_selector: The drop-down ``MutableOptionMenu`` used
+          to select ``_right_chart``.
+
+    :ivar _out_chart: The output ``Chart``.
+    :ivar _out_name: The name of ``_out_chart`` (derived from filename)
+    :ivar _out_matrix: The ``ChartMatrixView`` for ``_out_chart``
+    :ivar _out_label: The label for ``_out_chart``.
+
+    :ivar _op_label: A Label containing the most recent operation.
+    """
+
+    _OPSYMBOL = {'-': '-',
+                 'and': SymbolWidget.SYMBOLS['intersection'],
+                 'or': SymbolWidget.SYMBOLS['union']}
+
+    def __init__(self, *chart_filenames):
+        # This chart is displayed when we don't have a value (e.g.,
+        # before any chart is loaded).
+        faketok = [''] * 8
+        self._emptychart = Chart(faketok)
+
+        # The left & right charts start out empty.
+        self._left_name = 'None'
+        self._right_name = 'None'
+        self._left_chart = self._emptychart
+        self._right_chart = self._emptychart
+
+        # The charts that have been loaded.
+        self._charts = {'None': self._emptychart}
+
+        # The output chart.
+        self._out_chart = self._emptychart
+
+        # The most recent operation
+        self._operator = None
+
+        # Set up the root window.
+        self._root = Tk()
+        self._root.title('Chart Comparison')
+        self._root.bind('<Control-q>', self.destroy)
+        self._root.bind('<Control-x>', self.destroy)
+
+        # Initialize all widgets, etc.
+        self._init_menubar(self._root)
+        self._init_chartviews(self._root)
+        self._init_divider(self._root)
+        self._init_buttons(self._root)
+        self._init_bindings(self._root)
+
+        # Load any specified charts.
+        for filename in chart_filenames:
+            self.load_chart(filename)
+
+    def destroy(self, *e):
+        if self._root is None: return
+        try: self._root.destroy()
+        except: pass
+        self._root = None
+
+    def mainloop(self, *args, **kwargs):
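+        # Note: the unconditional return below makes this a no-op; the
+        # Tk mainloop call that follows is never reached.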
+        return
+        self._root.mainloop(*args, **kwargs)
+
+    #////////////////////////////////////////////////////////////
+    # Initialization
+    #////////////////////////////////////////////////////////////
+
+    def _init_menubar(self, root):
+        menubar = Menu(root)
+
+        # File menu
+        filemenu = Menu(menubar, tearoff=0)
+        filemenu.add_command(label='Load Chart', accelerator='Ctrl-o',
+                             underline=0, command=self.load_chart_dialog)
+        filemenu.add_command(label='Save Output', accelerator='Ctrl-s',
+                             underline=0, command=self.save_chart_dialog)
+        filemenu.add_separator()
+        filemenu.add_command(label='Exit', underline=1,
+                             command=self.destroy, accelerator='Ctrl-x')
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
+
+        # Compare menu
+        opmenu = Menu(menubar, tearoff=0)
+        opmenu.add_command(label='Intersection',
+                           command=self._intersection,
+                           accelerator='+')
+        opmenu.add_command(label='Union',
+                           command=self._union,
+                           accelerator='*')
+        opmenu.add_command(label='Difference',
+                           command=self._difference,
+                           accelerator='-')
+        opmenu.add_separator()
+        opmenu.add_command(label='Swap Charts',
+                           command=self._swapcharts)
+        menubar.add_cascade(label='Compare', underline=0, menu=opmenu)
+
+        # Add the menu
+        self._root.config(menu=menubar)
+
+    def _init_divider(self, root):
+        divider = Frame(root, border=2, relief='sunken')
+        divider.pack(side='top', fill='x', ipady=2)
+
+    def _init_chartviews(self, root):
+        opfont=('symbol', -36) # Font for operator.
+        eqfont=('helvetica', -36) # Font for equals sign.
+
+        frame = Frame(root, background='#c0c0c0')
+        frame.pack(side='top', expand=1, fill='both')
+
+        # The left matrix.
+        cv1_frame = Frame(frame, border=3, relief='groove')
+        cv1_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both')
+        self._left_selector = MutableOptionMenu(
+            cv1_frame, list(self._charts.keys()), command=self._select_left)
+        self._left_selector.pack(side='top', pady=5, fill='x')
+        self._left_matrix = ChartMatrixView(cv1_frame, self._emptychart,
+                                            toplevel=False,
+                                            show_numedges=True)
+        self._left_matrix.pack(side='bottom', padx=5, pady=5,
+                               expand=1, fill='both')
+        self._left_matrix.add_callback('select', self.select_edge)
+        self._left_matrix.add_callback('select_cell', self.select_cell)
+        self._left_matrix.inactivate()
+
+        # The operator.
+        self._op_label = Label(frame, text=' ', width=3,
+                                       background='#c0c0c0', font=opfont)
+        self._op_label.pack(side='left', padx=5, pady=5)
+
+        # The right matrix.
+        cv2_frame = Frame(frame, border=3, relief='groove')
+        cv2_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both')
+        self._right_selector = MutableOptionMenu(
+            cv2_frame, list(self._charts.keys()), command=self._select_right)
+        self._right_selector.pack(side='top', pady=5, fill='x')
+        self._right_matrix = ChartMatrixView(cv2_frame, self._emptychart,
+                                            toplevel=False,
+                                            show_numedges=True)
+        self._right_matrix.pack(side='bottom', padx=5, pady=5,
+                               expand=1, fill='both')
+        self._right_matrix.add_callback('select', self.select_edge)
+        self._right_matrix.add_callback('select_cell', self.select_cell)
+        self._right_matrix.inactivate()
+
+        # The equals sign
+        Label(frame, text='=', width=3, background='#c0c0c0',
+                      font=eqfont).pack(side='left', padx=5, pady=5)
+
+        # The output matrix.
+        out_frame = Frame(frame, border=3, relief='groove')
+        out_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both')
+        self._out_label = Label(out_frame, text='Output')
+        self._out_label.pack(side='top', pady=9)
+        self._out_matrix = ChartMatrixView(out_frame, self._emptychart,
+                                            toplevel=False,
+                                            show_numedges=True)
+        self._out_matrix.pack(side='bottom', padx=5, pady=5,
+                                 expand=1, fill='both')
+        self._out_matrix.add_callback('select', self.select_edge)
+        self._out_matrix.add_callback('select_cell', self.select_cell)
+        self._out_matrix.inactivate()
+
+    def _init_buttons(self, root):
+        buttons = Frame(root)
+        buttons.pack(side='bottom', pady=5, fill='x', expand=0)
+        Button(buttons, text='Intersection',
+                       command=self._intersection).pack(side='left')
+        Button(buttons, text='Union',
+                       command=self._union).pack(side='left')
+        Button(buttons, text='Difference',
+                       command=self._difference).pack(side='left')
+        Frame(buttons, width=20).pack(side='left')
+        Button(buttons, text='Swap Charts',
+                       command=self._swapcharts).pack(side='left')
+
+        Button(buttons, text='Detach Output',
+                       command=self._detatch_out).pack(side='right')
+
+    def _init_bindings(self, root):
+        #root.bind('<Control-s>', self.save_chart)
+        root.bind('<Control-o>', self.load_chart_dialog)
+        #root.bind('<Control-r>', self.reset)
+
+    #////////////////////////////////////////////////////////////
+    # Input Handling
+    #////////////////////////////////////////////////////////////
+
+    def _select_left(self, name):
+        self._left_name = name
+        self._left_chart = self._charts[name]
+        self._left_matrix.set_chart(self._left_chart)
+        if name == 'None': self._left_matrix.inactivate()
+        self._apply_op()
+
+    def _select_right(self, name):
+        self._right_name = name
+        self._right_chart = self._charts[name]
+        self._right_matrix.set_chart(self._right_chart)
+        if name == 'None': self._right_matrix.inactivate()
+        self._apply_op()
+
+    def _apply_op(self):
+        if self._operator == '-': self._difference()
+        elif self._operator == 'or': self._union()
+        elif self._operator == 'and': self._intersection()
+
+
+    #////////////////////////////////////////////////////////////
+    # File
+    #////////////////////////////////////////////////////////////
+    CHART_FILE_TYPES = [('Pickle file', '.pickle'),
+                        ('All files', '*')]
+
+    def save_chart_dialog(self, *args):
+        filename = asksaveasfilename(filetypes=self.CHART_FILE_TYPES,
+                                     defaultextension='.pickle')
+        if not filename: return
+        try:
+            with open(filename, 'wb') as outfile:
+                pickle.dump(self._out_chart, outfile)
+        except Exception as e:
+            showerror('Error Saving Chart',
+                                   'Unable to open file: %r\n%s' %
+                                   (filename, e))
+
+    def load_chart_dialog(self, *args):
+        filename = askopenfilename(filetypes=self.CHART_FILE_TYPES,
+                                   defaultextension='.pickle')
+        if not filename: return
+        try: self.load_chart(filename)
+        except Exception as e:
+            showerror('Error Loading Chart',
+                                   'Unable to open file: %r\n%s' %
+                                   (filename, e))
+
+    def load_chart(self, filename):
+        with open(filename, 'rb') as infile:
+            chart = pickle.load(infile)
+        name = os.path.basename(filename)
+        if name.endswith('.pickle'): name = name[:-7]
+        if name.endswith('.chart'): name = name[:-6]
+        self._charts[name] = chart
+        self._left_selector.add(name)
+        self._right_selector.add(name)
+
+        # If either left_matrix or right_matrix is empty, then
+        # display the new chart.
+        if self._left_chart is self._emptychart:
+            self._left_selector.set(name)
+        elif self._right_chart is self._emptychart:
+            self._right_selector.set(name)
+
+    def _update_chartviews(self):
+        self._left_matrix.update()
+        self._right_matrix.update()
+        self._out_matrix.update()
+
+    #////////////////////////////////////////////////////////////
+    # Selection
+    #////////////////////////////////////////////////////////////
+
+    def select_edge(self, edge):
+        if edge in self._left_chart:
+            self._left_matrix.markonly_edge(edge)
+        else:
+            self._left_matrix.unmark_edge()
+        if edge in self._right_chart:
+            self._right_matrix.markonly_edge(edge)
+        else:
+            self._right_matrix.unmark_edge()
+        if edge in self._out_chart:
+            self._out_matrix.markonly_edge(edge)
+        else:
+            self._out_matrix.unmark_edge()
+
+    def select_cell(self, i, j):
+        self._left_matrix.select_cell(i, j)
+        self._right_matrix.select_cell(i, j)
+        self._out_matrix.select_cell(i, j)
+
+    #////////////////////////////////////////////////////////////
+    # Operations
+    #////////////////////////////////////////////////////////////
+
+    def _difference(self):
+        if not self._checkcompat(): return
+
+        out_chart = Chart(self._left_chart.tokens())
+        for edge in self._left_chart:
+            if edge not in self._right_chart:
+                out_chart.insert(edge, [])
+
+        self._update('-', out_chart)
+
+    def _intersection(self):
+        if not self._checkcompat(): return
+
+        out_chart = Chart(self._left_chart.tokens())
+        for edge in self._left_chart:
+            if edge in self._right_chart:
+                out_chart.insert(edge, [])
+
+        self._update('and', out_chart)
+
+    def _union(self):
+        if not self._checkcompat(): return
+
+        out_chart = Chart(self._left_chart.tokens())
+        for edge in self._left_chart:
+            out_chart.insert(edge, [])
+        for edge in self._right_chart:
+            out_chart.insert(edge, [])
+
+        self._update('or', out_chart)
+
+    def _swapcharts(self):
+        left, right = self._left_name, self._right_name
+        self._left_selector.set(right)
+        self._right_selector.set(left)
+
+    def _checkcompat(self):
+        if (self._left_chart.tokens() != self._right_chart.tokens() or
+            self._left_chart.property_names() !=
+            self._right_chart.property_names() or
+            self._left_chart == self._emptychart or
+            self._right_chart == self._emptychart):
+            # Clear & inactivate the output chart.
+            self._out_chart = self._emptychart
+            self._out_matrix.set_chart(self._out_chart)
+            self._out_matrix.inactivate()
+            self._out_label['text'] = 'Output'
+            # Issue some other warning?
+            return False
+        else:
+            return True
+
+    def _update(self, operator, out_chart):
+        self._operator = operator
+        self._op_label['text'] = self._OPSYMBOL[operator]
+        self._out_chart = out_chart
+        self._out_matrix.set_chart(out_chart)
+        self._out_label['text'] = '%s %s %s' % (self._left_name,
+                                                self._operator,
+                                                self._right_name)
+
+    def _clear_out_chart(self):
+        self._out_chart = self._emptychart
+        self._out_matrix.set_chart(self._out_chart)
+        self._op_label['text'] = ' '
+        self._out_matrix.inactivate()
+
+    def _detatch_out(self):
+        ChartMatrixView(self._root, self._out_chart,
+                        title=self._out_label['text'])
+
+
+
+
+
+
+
+
+#######################################################################
+# Chart View
+#######################################################################
+
+class ChartView(object):
+    """
+    A component for viewing charts.  This is used by ``ChartParserApp`` to
+    allow students to interactively experiment with various chart
+    parsing techniques.  It is also used by ``Chart.draw()``.
+
+    :ivar _chart: The chart that we are giving a view of.  This chart
+       may be modified; after it is modified, you should call
+       ``update``.
+    :ivar _sentence: The list of tokens that the chart spans.
+
+    :ivar _root: The root window.
+    :ivar _chart_canvas: The canvas we're using to display the chart
+        itself.
+    :ivar _tree_canvas: The canvas we're using to display the tree
+        that each edge spans.  May be None, if we're not displaying
+        trees.
+    :ivar _sentence_canvas: The canvas we're using to display the sentence
+        text.  May be None, if we're not displaying the sentence text.
+    :ivar _edgetags: A dictionary mapping from edges to the tags of
+        the canvas elements (lines, etc) used to display that edge.
+        The values of this dictionary have the form
+        ``(linetag, rhstag1, dottag, rhstag2, lhstag)``.
+    :ivar _treetags: A list of all the tags that make up the tree;
+        used to erase the tree (without erasing the loclines).
+    :ivar _chart_height: The height of the chart canvas.
+    :ivar _sentence_height: The height of the sentence canvas.
+    :ivar _tree_height: The height of the tree
+
+    :ivar _text_height: The height of a text string (in the normal
+        font).
+
+    :ivar _edgelevels: A list of edges at each level of the chart (the
+        top level is the 0th element).  This list is used to remember
+        where edges should be drawn, and to make sure that no edges
+        overlap on the chart view.
+
+    :ivar _unitsize: Pixel size of one unit (from the location).  This
+       is determined by the span of the chart's location, and the
+       width of the chart display canvas.
+
+    :ivar _fontsize: The current font size
+
+    :ivar _marks: A dictionary from edges to marks.  Marks are
+        strings, specifying colors (e.g. 'green').
+    """
+
+    _LEAF_SPACING = 10
+    _MARGIN = 10
+    _TREE_LEVEL_SIZE = 12
+    _CHART_LEVEL_SIZE = 40
+
+    def __init__(self, chart, root=None, **kw):
+        """
+        Construct a new ``Chart`` display.
+        """
+        # Process keyword args.
+        draw_tree = kw.get('draw_tree', 0)
+        draw_sentence = kw.get('draw_sentence', 1)
+        self._fontsize = kw.get('fontsize', -12)
+
+        # The chart!
+        self._chart = chart
+
+        # Callback functions
+        self._callbacks = {}
+
+        # Keep track of drawn edges
+        self._edgelevels = []
+        self._edgetags = {}
+
+        # Keep track of which edges are marked.
+        self._marks = {}
+
+        # These are used to keep track of the set of tree tokens
+        # currently displayed in the tree canvas.
+        self._treetoks = []
+        self._treetoks_edge = None
+        self._treetoks_index = 0
+
+        # Keep track of the tags used to draw the tree
+        self._tree_tags = []
+
+        # Put multiple edges on each level?
+        self._compact = 0
+
+        # If they didn't provide a main window, then set one up.
+        if root is None:
+            top = Tk()
+            top.title('Chart View')
+            def destroy1(e, top=top): top.destroy()
+            def destroy2(top=top): top.destroy()
+            top.bind('q', destroy1)
+            b = Button(top, text='Done', command=destroy2)
+            b.pack(side='bottom')
+            self._root = top
+        else:
+            self._root = root
+
+        # Create some fonts.
+        self._init_fonts(self._root)
+
+        # Create the chart canvas.
+        (self._chart_sb, self._chart_canvas) = self._sb_canvas(self._root)
+        self._chart_canvas['height'] = 300
+        self._chart_canvas['closeenough'] = 15
+
+        # Create the sentence canvas.
+        if draw_sentence:
+            cframe = Frame(self._root, relief='sunk', border=2)
+            cframe.pack(fill='both', side='bottom')
+            self._sentence_canvas = Canvas(cframe, height=50)
+            self._sentence_canvas['background'] = '#e0e0e0'
+            self._sentence_canvas.pack(fill='both')
+            #self._sentence_canvas['height'] = self._sentence_height
+        else:
+            self._sentence_canvas = None
+
+        # Create the tree canvas.
+        if draw_tree:
+            (sb, canvas) = self._sb_canvas(self._root, 'n', 'x')
+            (self._tree_sb, self._tree_canvas) = (sb, canvas)
+            self._tree_canvas['height'] = 200
+        else:
+            self._tree_canvas = None
+
+        # Do some analysis to figure out how big the window should be
+        self._analyze()
+        self.draw()
+        self._resize()
+        self._grow()
+
+        # Set up the configure callback, which will be called whenever
+        # the window is resized.
+        self._chart_canvas.bind('<Configure>', self._configure)
+
+    def _init_fonts(self, root):
+        self._boldfont = Font(family='helvetica', weight='bold',
+                                    size=self._fontsize)
+        self._font = Font(family='helvetica',
+                                    size=self._fontsize)
+        # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
+        self._sysfont = Font(font=Button()["font"])
+        root.option_add("*Font", self._sysfont)
+
+    def _sb_canvas(self, root, expand='y',
+                   fill='both', side='bottom'):
+        """
+        Helper for __init__: construct a canvas with a scrollbar.
+        """
+        cframe = Frame(root, relief='sunk', border=2)
+        cframe.pack(fill=fill, expand=expand, side=side)
+        canvas = Canvas(cframe, background='#e0e0e0')
+
+        # Give the canvas a scrollbar.
+        sb = Scrollbar(cframe, orient='vertical')
+        sb.pack(side='right', fill='y')
+        canvas.pack(side='left', fill=fill, expand='yes')
+
+        # Connect the scrollbars to the canvas.
+        sb['command']= canvas.yview
+        canvas['yscrollcommand'] = sb.set
+
+        return (sb, canvas)
+
+    def scroll_up(self, *e):
+        self._chart_canvas.yview('scroll', -1, 'units')
+
+    def scroll_down(self, *e):
+        self._chart_canvas.yview('scroll', 1, 'units')
+
+    def page_up(self, *e):
+        self._chart_canvas.yview('scroll', -1, 'pages')
+
+    def page_down(self, *e):
+        self._chart_canvas.yview('scroll', 1, 'pages')
+
+    def _grow(self):
+        """
+        Grow the window, if necessary
+        """
+        # Grow, if need-be
+        N = self._chart.num_leaves()
+        width = max(int(self._chart_canvas['width']),
+                    N * self._unitsize + ChartView._MARGIN * 2 )
+
+        # It won't resize without the second (height) line, but I
+        # don't understand why not.
+        self._chart_canvas.configure(width=width)
+        self._chart_canvas.configure(height=self._chart_canvas['height'])
+
+        self._unitsize = (width - 2*ChartView._MARGIN) / N
+
+        # Reset the height for the sentence window.
+        if self._sentence_canvas is not None:
+            self._sentence_canvas['height'] = self._sentence_height
+
+    def set_font_size(self, size):
+        self._font.configure(size=-abs(size))
+        self._boldfont.configure(size=-abs(size))
+        self._sysfont.configure(size=-abs(size))
+        self._analyze()
+        self._grow()
+        self.draw()
+
+    def get_font_size(self):
+        return abs(self._fontsize)
+
+    def _configure(self, e):
+        """
+        The configure callback.  This is called whenever the window is
+        resized.  It is also called when the window is first mapped.
+        It figures out the unit size, and redraws the contents of each
+        canvas.
+        """
+        N = self._chart.num_leaves()
+        self._unitsize = (e.width - 2*ChartView._MARGIN) / N
+        self.draw()
+
+    def update(self, chart=None):
+        """
+        Draw any edges that have not been drawn.  This is typically
+        called after the chart that this ``ChartView`` is displaying
+        has been modified.  ``update`` will cause any edges that have
+        been added to the chart to be drawn.
+
+        If update is given a ``chart`` argument, then it will replace
+        the current chart with the given chart.
+        """
+        if chart is not None:
+            self._chart = chart
+            self._edgelevels = []
+            self._marks = {}
+            self._analyze()
+            self._grow()
+            self.draw()
+            self.erase_tree()
+            self._resize()
+        else:
+            for edge in self._chart:
+                if edge not in self._edgetags:
+                    self._add_edge(edge)
+            self._resize()
+
+
+    def _edge_conflict(self, edge, lvl):
+        """
+        Return True if the given edge overlaps with any edge on the given
+        level.  This is used by _add_edge to figure out what level a
+        new edge should be added to.
+        """
+        (s1, e1) = edge.span()
+        for otheredge in self._edgelevels[lvl]:
+            (s2, e2) = otheredge.span()
+            if (s1 <= s2 < e1) or (s2 <= s1 < e2) or (s1==s2==e1==e2):
+                return True
+        return False
+
+    def _analyze_edge(self, edge):
+        """
+        Given a new edge, recalculate:
+
+            - _text_height
+            - _unitsize (if the edge text is too big for the current
+              _unitsize, then increase _unitsize)
+        """
+        c = self._chart_canvas
+
+        if isinstance(edge, TreeEdge):
+            lhs = edge.lhs()
+            rhselts = []
+            for elt in edge.rhs():
+                if isinstance(elt, Nonterminal):
+                    rhselts.append(str(elt.symbol()))
+                else:
+                    rhselts.append(repr(elt))
+            rhs = " ".join(rhselts)
+        else:
+            lhs = edge.lhs()
+            rhs = ''
+
+        for s in (lhs, rhs):
+            tag = c.create_text(0,0, text=s,
+                                font=self._boldfont,
+                                anchor='nw', justify='left')
+            bbox = c.bbox(tag)
+            c.delete(tag)
+            width = bbox[2] #+ ChartView._LEAF_SPACING
+            edgelen = max(edge.length(), 1)
+            self._unitsize = max(self._unitsize, width/edgelen)
+            self._text_height = max(self._text_height, bbox[3] - bbox[1])
+
+    def _add_edge(self, edge, minlvl=0):
+        """
+        Add a single edge to the ChartView:
+
+            - Call _analyze_edge to recalculate display parameters
+            - Find an available level
+            - Call _draw_edge
+        """
+        # Do NOT show leaf edges in the chart.
+        if isinstance(edge, LeafEdge): return
+
+        if edge in self._edgetags: return
+        self._analyze_edge(edge)
+        self._grow()
+
+        if not self._compact:
+            self._edgelevels.append([edge])
+            lvl = len(self._edgelevels)-1
+            self._draw_edge(edge, lvl)
+            self._resize()
+            return
+
+        # Figure out what level to draw the edge on.
+        lvl = 0
+        while True:
+            # If this level doesn't exist yet, create it.
+            while lvl >= len(self._edgelevels):
+                self._edgelevels.append([])
+                self._resize()
+
+            # Check if we can fit the edge in this level.
+            if lvl>=minlvl and not self._edge_conflict(edge, lvl):
+                # Go ahead and draw it.
+                self._edgelevels[lvl].append(edge)
+                break
+
+            # Try the next level.
+            lvl += 1
+
+        self._draw_edge(edge, lvl)
+
+    def view_edge(self, edge):
+        level = None
+        for i in range(len(self._edgelevels)):
+            if edge in self._edgelevels[i]:
+                level = i
+                break
+        if level is None: return
+        # Try to view the new edge..
+        y = (level+1) * self._chart_level_size
+        dy = self._text_height + 10
+        self._chart_canvas.yview('moveto', 1.0)
+        if self._chart_height != 0:
+            self._chart_canvas.yview('moveto',
+                                     (y-dy)/self._chart_height)
+
+    def _draw_edge(self, edge, lvl):
+        """
+        Draw a single edge on the ChartView.
+        """
+        c = self._chart_canvas
+
+        # Draw the arrow.
+        x1 = (edge.start() * self._unitsize + ChartView._MARGIN)
+        x2 = (edge.end() * self._unitsize + ChartView._MARGIN)
+        if x2 == x1: x2 += max(4, self._unitsize/5)
+        y = (lvl+1) * self._chart_level_size
+        linetag = c.create_line(x1, y, x2, y, arrow='last', width=3)
+
+        # Draw a label for the edge.
+        if isinstance(edge, TreeEdge):
+            rhs = []
+            for elt in edge.rhs():
+                if isinstance(elt, Nonterminal):
+                    rhs.append(str(elt.symbol()))
+                else:
+                    rhs.append(repr(elt))
+            pos = edge.dot()
+        else:
+            rhs = []
+            pos = 0
+
+        rhs1 = " ".join(rhs[:pos])
+        rhs2 = " ".join(rhs[pos:])
+        rhstag1 = c.create_text(x1+3, y, text=rhs1,
+                                font=self._font,
+                                anchor='nw')
+        dotx = c.bbox(rhstag1)[2] + 6
+        doty = (c.bbox(rhstag1)[1]+c.bbox(rhstag1)[3])/2
+        dottag = c.create_oval(dotx-2, doty-2, dotx+2, doty+2)
+        rhstag2 = c.create_text(dotx+6, y, text=rhs2,
+                                font=self._font,
+                                anchor='nw')
+        lhstag =  c.create_text((x1+x2)/2, y, text=str(edge.lhs()),
+                                anchor='s',
+                                font=self._boldfont)
+
+        # Keep track of the edge's tags.
+        self._edgetags[edge] = (linetag, rhstag1,
+                                dottag, rhstag2, lhstag)
+
+        # Register a callback for clicking on the edge.
+        def cb(event, self=self, edge=edge):
+            self._fire_callbacks('select', edge)
+        c.tag_bind(rhstag1, '<Button-1>', cb)
+        c.tag_bind(rhstag2, '<Button-1>', cb)
+        c.tag_bind(linetag, '<Button-1>', cb)
+        c.tag_bind(dottag, '<Button-1>', cb)
+        c.tag_bind(lhstag, '<Button-1>', cb)
+
+        self._color_edge(edge)
+
+    def _color_edge(self, edge, linecolor=None, textcolor=None):
+        """
+        Color in an edge with the given colors.
+        If no colors are specified, use intelligent defaults
+        (dependent on selection, etc.)
+        """
+        if edge not in self._edgetags: return
+        c = self._chart_canvas
+
+        if linecolor is not None and textcolor is not None:
+            if edge in self._marks:
+                linecolor = self._marks[edge]
+            tags = self._edgetags[edge]
+            c.itemconfig(tags[0], fill=linecolor)
+            c.itemconfig(tags[1], fill=textcolor)
+            c.itemconfig(tags[2], fill=textcolor,
+                         outline=textcolor)
+            c.itemconfig(tags[3], fill=textcolor)
+            c.itemconfig(tags[4], fill=textcolor)
+            return
+        else:
+            N = self._chart.num_leaves()
+            if edge in self._marks:
+                self._color_edge(self._marks[edge])
+            if (edge.is_complete() and edge.span() == (0, N)):
+                self._color_edge(edge, '#084', '#042')
+            elif isinstance(edge, LeafEdge):
+                self._color_edge(edge, '#48c', '#246')
+            else:
+                self._color_edge(edge, '#00f', '#008')
+
+    def mark_edge(self, edge, mark='#0df'):
+        """
+        Mark an edge
+        """
+        self._marks[edge] = mark
+        self._color_edge(edge)
+
+    def unmark_edge(self, edge=None):
+        """
+        Unmark an edge (or all edges)
+        """
+        if edge is None:
+            old_marked_edges = list(self._marks.keys())
+            self._marks = {}
+            for edge in old_marked_edges:
+                self._color_edge(edge)
+        else:
+            del self._marks[edge]
+            self._color_edge(edge)
+
+    def markonly_edge(self, edge, mark='#0df'):
+        self.unmark_edge()
+        self.mark_edge(edge, mark)
+
+    def _analyze(self):
+        """
+        Analyze the sentence string to figure out how big a unit needs
+        to be, how big the tree should be, etc.
+        """
+        # Figure out the text height and the unit size.
+        unitsize = 70 # min unitsize
+        text_height = 0
+        c = self._chart_canvas
+
+        # Check against all tokens
+        for leaf in self._chart.leaves():
+            tag = c.create_text(0,0, text=repr(leaf),
+                                font=self._font,
+                                anchor='nw', justify='left')
+            bbox = c.bbox(tag)
+            c.delete(tag)
+            width = bbox[2] + ChartView._LEAF_SPACING
+            unitsize = max(width, unitsize)
+            text_height = max(text_height, bbox[3] - bbox[1])
+
+        self._unitsize = unitsize
+        self._text_height = text_height
+        self._sentence_height = (self._text_height +
+                               2*ChartView._MARGIN)
+
+        # Check against edges.
+        for edge in self._chart.edges():
+            self._analyze_edge(edge)
+
+        # Size of chart levels
+        self._chart_level_size = self._text_height * 2
+
+        # Default tree size..
+        self._tree_height = (3 * (ChartView._TREE_LEVEL_SIZE +
+                                  self._text_height))
+
+        # Resize the scrollregions.
+        self._resize()
+
+    def _resize(self):
+        """
+        Update the scroll-regions for each canvas.  This ensures that
+        everything is within a scroll-region, so the user can use the
+        scrollbars to view the entire display.  This does *not*
+        resize the window.
+        """
+        c = self._chart_canvas
+
+        # Reset the chart scroll region
+        width = ( self._chart.num_leaves() * self._unitsize +
+                  ChartView._MARGIN * 2 )
+
+        levels = len(self._edgelevels)
+        self._chart_height = (levels+2)*self._chart_level_size
+        c['scrollregion']=(0,0,width,self._chart_height)
+
+        # Reset the tree scroll region
+        if self._tree_canvas:
+            self._tree_canvas['scrollregion'] = (0, 0, width,
+                                                 self._tree_height)
+
+    def _draw_loclines(self):
+        """
+        Draw location lines.  These are vertical gridlines used to
+        show where each location unit is.
+        """
+        BOTTOM = 50000
+        c1 = self._tree_canvas
+        c2 = self._sentence_canvas
+        c3 = self._chart_canvas
+        margin = ChartView._MARGIN
+        self._loclines = []
+        for i in range(0, self._chart.num_leaves()+1):
+            x = i*self._unitsize + margin
+
+            if c1:
+                t1=c1.create_line(x, 0, x, BOTTOM)
+                c1.tag_lower(t1)
+            if c2:
+                t2=c2.create_line(x, 0, x, self._sentence_height)
+                c2.tag_lower(t2)
+            t3=c3.create_line(x, 0, x, BOTTOM)
+            c3.tag_lower(t3)
+            t4=c3.create_text(x+2, 0, text=repr(i), anchor='nw',
+                              font=self._font)
+            c3.tag_lower(t4)
+            #if i % 4 == 0:
+            #    if c1: c1.itemconfig(t1, width=2, fill='gray60')
+            #    if c2: c2.itemconfig(t2, width=2, fill='gray60')
+            #    c3.itemconfig(t3, width=2, fill='gray60')
+            if i % 2 == 0:
+                if c1: c1.itemconfig(t1, fill='gray60')
+                if c2: c2.itemconfig(t2, fill='gray60')
+                c3.itemconfig(t3, fill='gray60')
+            else:
+                if c1: c1.itemconfig(t1, fill='gray80')
+                if c2: c2.itemconfig(t2, fill='gray80')
+                c3.itemconfig(t3, fill='gray80')
+
+    def _draw_sentence(self):
+        """Draw the sentence string."""
+        if self._chart.num_leaves() == 0: return
+        c = self._sentence_canvas
+        margin = ChartView._MARGIN
+        y = ChartView._MARGIN
+
+        for i, leaf in enumerate(self._chart.leaves()):
+            x1 = i * self._unitsize + margin
+            x2 = x1 + self._unitsize
+            x = (x1+x2)/2
+            tag = c.create_text(x, y, text=repr(leaf),
+                                font=self._font,
+                                anchor='n', justify='left')
+            bbox = c.bbox(tag)
+            rt=c.create_rectangle(x1+2, bbox[1]-(ChartView._LEAF_SPACING/2),
+                                  x2-2, bbox[3]+(ChartView._LEAF_SPACING/2),
+                                  fill='#f0f0f0', outline='#f0f0f0')
+            c.tag_lower(rt)
+
+    def erase_tree(self):
+        for tag in self._tree_tags: self._tree_canvas.delete(tag)
+        self._treetoks = []
+        self._treetoks_edge = None
+        self._treetoks_index = 0
+
+    def draw_tree(self, edge=None):
+        if edge is None and self._treetoks_edge is None: return
+        if edge is None: edge = self._treetoks_edge
+
+        # If it's a new edge, then get a new list of treetoks.
+        if self._treetoks_edge != edge:
+            self._treetoks = [t for t in self._chart.trees(edge)
+                              if isinstance(t, Tree)]
+            self._treetoks_edge = edge
+            self._treetoks_index = 0
+
+        # Make sure there's something to draw.
+        if len(self._treetoks) == 0: return
+
+        # Erase the old tree.
+        for tag in self._tree_tags: self._tree_canvas.delete(tag)
+
+        # Draw the new tree.
+        tree = self._treetoks[self._treetoks_index]
+        self._draw_treetok(tree, edge.start())
+
+        # Show how many trees are available for the edge.
+        self._draw_treecycle()
+
+        # Update the scroll region.
+        w = self._chart.num_leaves()*self._unitsize+2*ChartView._MARGIN
+        h = tree.height() * (ChartView._TREE_LEVEL_SIZE+self._text_height)
+        self._tree_canvas['scrollregion'] = (0, 0, w, h)
+
+    def cycle_tree(self):
+        self._treetoks_index = (self._treetoks_index+1)%len(self._treetoks)
+        self.draw_tree(self._treetoks_edge)
+
+    def _draw_treecycle(self):
+        if len(self._treetoks) <= 1: return
+
+        # Draw the label.
+        label = '%d Trees' % len(self._treetoks)
+        c = self._tree_canvas
+        margin = ChartView._MARGIN
+        right = self._chart.num_leaves()*self._unitsize+margin-2
+        tag = c.create_text(right, 2, anchor='ne', text=label,
+                            font=self._boldfont)
+        self._tree_tags.append(tag)
+        _, _, _, y = c.bbox(tag)
+
+        # Draw the triangles.
+        for i in range(len(self._treetoks)):
+            x = right - 20*(len(self._treetoks)-i-1)
+            if i == self._treetoks_index: fill = '#084'
+            else: fill = '#fff'
+            tag = c.create_polygon(x, y+10, x-5, y, x-10, y+10,
+                             fill=fill, outline='black')
+            self._tree_tags.append(tag)
+
+            # Set up a callback: show the tree if they click on its
+            # triangle.
+            def cb(event, self=self, i=i):
+                self._treetoks_index = i
+                self.draw_tree()
+            c.tag_bind(tag, '<Button-1>', cb)
+
+    def _draw_treetok(self, treetok, index, depth=0):
+        """
+        :param index: The index of the first leaf in the tree.
+        :return: The index of the first leaf after the tree.
+        """
+        c = self._tree_canvas
+        margin = ChartView._MARGIN
+
+        # Draw the children
+        child_xs = []
+        for child in treetok:
+            if isinstance(child, Tree):
+                child_x, index = self._draw_treetok(child, index, depth+1)
+                child_xs.append(child_x)
+            else:
+                child_xs.append((2*index+1)*self._unitsize/2 + margin)
+                index += 1
+
+        # If we have children, then get the node's x by averaging their
+        # node x's.  Otherwise, make room for ourselves.
+        if child_xs:
+            nodex = sum(child_xs)/len(child_xs)
+        else:
+            # [XX] breaks for null productions.
+            nodex = (2*index+1)*self._unitsize/2 + margin
+            index += 1
+
+        # Draw the node
+        nodey = depth * (ChartView._TREE_LEVEL_SIZE + self._text_height)
+        tag = c.create_text(nodex, nodey, anchor='n', justify='center',
+                            text=str(treetok.label()), fill='#042',
+                            font=self._boldfont)
+        self._tree_tags.append(tag)
+
+        # Draw lines to the children.
+        childy = nodey + ChartView._TREE_LEVEL_SIZE + self._text_height
+        for childx, child in zip(child_xs, treetok):
+            if isinstance(child, Tree) and child:
+                # A "real" tree token:
+                tag = c.create_line(nodex, nodey + self._text_height,
+                                    childx, childy, width=2, fill='#084')
+                self._tree_tags.append(tag)
+            if isinstance(child, Tree) and not child:
+                # An unexpanded tree token:
+                tag = c.create_line(nodex, nodey + self._text_height,
+                                    childx, childy, width=2,
+                                    fill='#048', dash='2 3')
+                self._tree_tags.append(tag)
+            if not isinstance(child, Tree):
+                # A leaf:
+                tag = c.create_line(nodex, nodey + self._text_height,
+                                    childx, 10000, width=2, fill='#084')
+                self._tree_tags.append(tag)
+
+        return nodex, index
+
+    def draw(self):
+        """
+        Draw everything (from scratch).
+        """
+        if self._tree_canvas:
+            self._tree_canvas.delete('all')
+            self.draw_tree()
+
+        if self._sentence_canvas:
+            self._sentence_canvas.delete('all')
+            self._draw_sentence()
+
+        self._chart_canvas.delete('all')
+        self._edgetags = {}
+
+        # Redraw any edges we erased.
+        for lvl in range(len(self._edgelevels)):
+            for edge in self._edgelevels[lvl]:
+                self._draw_edge(edge, lvl)
+
+        for edge in self._chart:
+            self._add_edge(edge)
+
+        self._draw_loclines()
+
+    def add_callback(self, event, func):
+        self._callbacks.setdefault(event,{})[func] = 1
+
+    def remove_callback(self, event, func=None):
+        if func is None: del self._callbacks[event]
+        else:
+            try: del self._callbacks[event][func]
+            except KeyError: pass
+
+    def _fire_callbacks(self, event, *args):
+        if event not in self._callbacks: return
+        for cb_func in list(self._callbacks[event].keys()): cb_func(*args)
+
+#######################################################################
+# Edge Rules
+#######################################################################
+# These versions of the chart rules apply only to a specific edge.
+# This lets the user select an edge, and then apply a rule.
+
+class EdgeRule(object):
+    """
+    To create an edge rule, define an empty class that uses EdgeRule
+    as its first base class, and the basic rule as its second base
+    class.  (Order matters!)
+    """
+    def __init__(self, edge):
+        super = self.__class__.__bases__[1]
+        self._edge = edge
+        self.NUM_EDGES = super.NUM_EDGES-1
+    def apply(self, chart, grammar, *edges):
+        super = self.__class__.__bases__[1]
+        edges += (self._edge,)
+        for e in super.apply(self, chart, grammar, *edges): yield e
+    def __str__(self):
+        super = self.__class__.__bases__[1]
+        return super.__str__(self)
+
+class TopDownPredictEdgeRule(EdgeRule, TopDownPredictRule):
+    pass
+class BottomUpEdgeRule(EdgeRule, BottomUpPredictRule):
+    pass
+class BottomUpLeftCornerEdgeRule(EdgeRule, BottomUpPredictCombineRule):
+    pass
+class FundamentalEdgeRule(EdgeRule, SingleEdgeFundamentalRule):
+    pass
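+
+# A minimal sketch of applying one of these edge-specific rules (hedged;
+# ``chart``, ``grammar`` and ``edge`` are hypothetical, pre-existing objects):
+#
+#     rule = FundamentalEdgeRule(edge)
+#     for new_edge in rule.apply(chart, grammar):
+#         print(new_edge)
+#
+# EdgeRule.apply() appends the stored edge to the edge tuple and delegates to
+# the second base class's apply() (here SingleEdgeFundamentalRule).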
+
+#######################################################################
+# Chart Parser Application
+#######################################################################
+
+class ChartParserApp(object):
+    def __init__(self, grammar, tokens, title='Chart Parser Application'):
+        # Initialize the parser
+        self._init_parser(grammar, tokens)
+
+        self._root = None
+        try:
+            # Create the root window.
+            self._root = Tk()
+            self._root.title(title)
+            self._root.bind('<Control-q>', self.destroy)
+
+            # Set up some frames.
+            frame3 = Frame(self._root)
+            frame2 = Frame(self._root)
+            frame1 = Frame(self._root)
+            frame3.pack(side='bottom', fill='none')
+            frame2.pack(side='bottom', fill='x')
+            frame1.pack(side='bottom', fill='both', expand=1)
+
+            self._init_fonts(self._root)
+            self._init_animation()
+            self._init_chartview(frame1)
+            self._init_rulelabel(frame2)
+            self._init_buttons(frame3)
+            self._init_menubar()
+
+            self._matrix = None
+            self._results = None
+
+            # Set up keyboard bindings.
+            self._init_bindings()
+
+        except:
+            print('Error creating Chart Parser Application')
+            self.destroy()
+            raise
+
+    def destroy(self, *args):
+        if self._root is None: return
+        self._root.destroy()
+        self._root = None
+
+    def mainloop(self, *args, **kwargs):
+        """
+        Enter the Tkinter mainloop.  This function must be called if
+        this demo is created from a non-interactive program (e.g.
+        from a script); otherwise, the demo will close as soon as
+        the script completes.
+        """
+        if in_idle(): return
+        self._root.mainloop(*args, **kwargs)
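+
+    # Rough usage sketch (hedged; the toy grammar and sentence are made up,
+    # and CFG is assumed to be importable from nltk):
+    #
+    #     from nltk import CFG
+    #     grammar = CFG.fromstring("S -> NP VP\nNP -> 'I'\nVP -> 'walk'")
+    #     ChartParserApp(grammar, 'I walk'.split()).mainloop()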
+
+    #////////////////////////////////////////////////////////////
+    # Initialization Helpers
+    #////////////////////////////////////////////////////////////
+
+    def _init_parser(self, grammar, tokens):
+        self._grammar = grammar
+        self._tokens = tokens
+        self._reset_parser()
+
+    def _reset_parser(self):
+        self._cp = SteppingChartParser(self._grammar)
+        self._cp.initialize(self._tokens)
+        self._chart = self._cp.chart()
+
+        # Insert LeafEdges before the parsing starts.
+        for _new_edge in LeafInitRule().apply(self._chart, self._grammar):
+            pass
+
+        # The step iterator -- use this to generate new edges
+        self._cpstep = self._cp.step()
+
+        # The currently selected edge
+        self._selection = None
+
+    def _init_fonts(self, root):
+        # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
+        self._sysfont = Font(font=Button()["font"])
+        root.option_add("*Font", self._sysfont)
+
+        # What's our font size (default=same as sysfont)
+        self._size = IntVar(root)
+        self._size.set(self._sysfont.cget('size'))
+
+        self._boldfont = Font(family='helvetica', weight='bold',
+                                    size=self._size.get())
+        self._font = Font(family='helvetica',
+                                    size=self._size.get())
+
+    def _init_animation(self):
+        # Are we stepping? (default=yes)
+        self._step = IntVar(self._root)
+        self._step.set(1)
+
+        # What's our animation speed (default=fast)
+        self._animate = IntVar(self._root)
+        self._animate.set(3) # Default speed = fast
+
+        # Are we currently animating?
+        self._animating = 0
+
+    def _init_chartview(self, parent):
+        self._cv = ChartView(self._chart, parent,
+                             draw_tree=1, draw_sentence=1)
+        self._cv.add_callback('select', self._click_cv_edge)
+
+    def _init_rulelabel(self, parent):
+        ruletxt = 'Last edge generated by:'
+
+        self._rulelabel1 = Label(parent,text=ruletxt,
+                                         font=self._boldfont)
+        self._rulelabel2 = Label(parent, width=40,
+                                         relief='groove', anchor='w',
+                                         font=self._boldfont)
+        self._rulelabel1.pack(side='left')
+        self._rulelabel2.pack(side='left')
+        step = Checkbutton(parent, variable=self._step,
+                                   text='Step')
+        step.pack(side='right')
+
+    def _init_buttons(self, parent):
+        frame1 = Frame(parent)
+        frame2 = Frame(parent)
+        frame1.pack(side='bottom', fill='x')
+        frame2.pack(side='top', fill='none')
+
+        Button(frame1, text='Reset\nParser',
+                       background='#90c0d0', foreground='black',
+                       command=self.reset).pack(side='right')
+        # Button(frame1, text='Pause',
+        #               background='#90c0d0', foreground='black',
+        #               command=self.pause).pack(side='left')
+
+        Button(frame1, text='Top Down\nStrategy',
+                       background='#90c0d0', foreground='black',
+                       command=self.top_down_strategy).pack(side='left')
+        Button(frame1, text='Bottom Up\nStrategy',
+                       background='#90c0d0', foreground='black',
+                       command=self.bottom_up_strategy).pack(side='left')
+        Button(frame1, text='Bottom Up\nLeft-Corner Strategy',
+                       background='#90c0d0', foreground='black',
+                       command=self.bottom_up_leftcorner_strategy).pack(side='left')
+
+        Button(frame2, text='Top Down Init\nRule',
+                       background='#90f090', foreground='black',
+                       command=self.top_down_init).pack(side='left')
+        Button(frame2, text='Top Down Predict\nRule',
+                       background='#90f090', foreground='black',
+                       command=self.top_down_predict).pack(side='left')
+        Frame(frame2, width=20).pack(side='left')
+
+        Button(frame2, text='Bottom Up Predict\nRule',
+                       background='#90f090', foreground='black',
+                       command=self.bottom_up).pack(side='left')
+        Frame(frame2, width=20).pack(side='left')
+
+        Button(frame2, text='Bottom Up Left-Corner\nPredict Rule',
+                       background='#90f090', foreground='black',
+                       command=self.bottom_up_leftcorner).pack(side='left')
+        Frame(frame2, width=20).pack(side='left')
+
+        Button(frame2, text='Fundamental\nRule',
+                       background='#90f090', foreground='black',
+                       command=self.fundamental).pack(side='left')
+
+    def _init_bindings(self):
+        self._root.bind('<Up>', self._cv.scroll_up)
+        self._root.bind('<Down>', self._cv.scroll_down)
+        self._root.bind('<Prior>', self._cv.page_up)
+        self._root.bind('<Next>', self._cv.page_down)
+        self._root.bind('<Control-q>', self.destroy)
+        self._root.bind('<Control-x>', self.destroy)
+        self._root.bind('<F1>', self.help)
+
+        self._root.bind('<Control-s>', self.save_chart)
+        self._root.bind('<Control-o>', self.load_chart)
+        self._root.bind('<Control-r>', self.reset)
+
+        self._root.bind('t', self.top_down_strategy)
+        self._root.bind('b', self.bottom_up_strategy)
+        self._root.bind('c', self.bottom_up_leftcorner_strategy)
+        self._root.bind('<space>', self._stop_animation)
+
+        self._root.bind('<Control-g>', self.edit_grammar)
+        self._root.bind('<Control-t>', self.edit_sentence)
+
+        # Animation speed control
+        self._root.bind('-', lambda e,a=self._animate:a.set(1))
+        self._root.bind('=', lambda e,a=self._animate:a.set(2))
+        self._root.bind('+', lambda e,a=self._animate:a.set(3))
+
+        # Step control
+        self._root.bind('s', lambda e,s=self._step:s.set(not s.get()))
+
+    def _init_menubar(self):
+        menubar = Menu(self._root)
+
+        filemenu = Menu(menubar, tearoff=0)
+        filemenu.add_command(label='Save Chart', underline=0,
+                             command=self.save_chart, accelerator='Ctrl-s')
+        filemenu.add_command(label='Load Chart', underline=0,
+                             command=self.load_chart, accelerator='Ctrl-o')
+        filemenu.add_command(label='Reset Chart', underline=0,
+                             command=self.reset, accelerator='Ctrl-r')
+        filemenu.add_separator()
+        filemenu.add_command(label='Save Grammar',
+                             command=self.save_grammar)
+        filemenu.add_command(label='Load Grammar',
+                             command=self.load_grammar)
+        filemenu.add_separator()
+        filemenu.add_command(label='Exit', underline=1,
+                             command=self.destroy, accelerator='Ctrl-x')
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
+
+        editmenu = Menu(menubar, tearoff=0)
+        editmenu.add_command(label='Edit Grammar', underline=5,
+                             command=self.edit_grammar,
+                             accelerator='Ctrl-g')
+        editmenu.add_command(label='Edit Text', underline=5,
+                             command=self.edit_sentence,
+                             accelerator='Ctrl-t')
+        menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
+
+        viewmenu = Menu(menubar, tearoff=0)
+        viewmenu.add_command(label='Chart Matrix', underline=6,
+                             command=self.view_matrix)
+        viewmenu.add_command(label='Results', underline=0,
+                             command=self.view_results)
+        menubar.add_cascade(label='View', underline=0, menu=viewmenu)
+
+        rulemenu = Menu(menubar, tearoff=0)
+        rulemenu.add_command(label='Top Down Strategy', underline=0,
+                             command=self.top_down_strategy,
+                             accelerator='t')
+        rulemenu.add_command(label='Bottom Up Strategy', underline=0,
+                             command=self.bottom_up_strategy,
+                             accelerator='b')
+        rulemenu.add_command(label='Bottom Up Left-Corner Strategy', underline=0,
+                             command=self.bottom_up_leftcorner_strategy,
+                             accelerator='c')
+        rulemenu.add_separator()
+        rulemenu.add_command(label='Bottom Up Rule',
+                             command=self.bottom_up)
+        rulemenu.add_command(label='Bottom Up Left-Corner Rule',
+                             command=self.bottom_up_leftcorner)
+        rulemenu.add_command(label='Top Down Init Rule',
+                             command=self.top_down_init)
+        rulemenu.add_command(label='Top Down Predict Rule',
+                             command=self.top_down_predict)
+        rulemenu.add_command(label='Fundamental Rule',
+                             command=self.fundamental)
+        menubar.add_cascade(label='Apply', underline=0, menu=rulemenu)
+
+        animatemenu = Menu(menubar, tearoff=0)
+        animatemenu.add_checkbutton(label="Step", underline=0,
+                                    variable=self._step,
+                                    accelerator='s')
+        animatemenu.add_separator()
+        animatemenu.add_radiobutton(label="No Animation", underline=0,
+                                    variable=self._animate, value=0)
+        animatemenu.add_radiobutton(label="Slow Animation", underline=0,
+                                    variable=self._animate, value=1,
+                                    accelerator='-')
+        animatemenu.add_radiobutton(label="Normal Animation", underline=0,
+                                    variable=self._animate, value=2,
+                                    accelerator='=')
+        animatemenu.add_radiobutton(label="Fast Animation", underline=0,
+                                    variable=self._animate, value=3,
+                                    accelerator='+')
+        menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
+
+        zoommenu = Menu(menubar, tearoff=0)
+        zoommenu.add_radiobutton(label='Tiny', variable=self._size,
+                                 underline=0, value=10, command=self.resize)
+        zoommenu.add_radiobutton(label='Small', variable=self._size,
+                                 underline=0, value=12, command=self.resize)
+        zoommenu.add_radiobutton(label='Medium', variable=self._size,
+                                 underline=0, value=14, command=self.resize)
+        zoommenu.add_radiobutton(label='Large', variable=self._size,
+                                 underline=0, value=18, command=self.resize)
+        zoommenu.add_radiobutton(label='Huge', variable=self._size,
+                                 underline=0, value=24, command=self.resize)
+        menubar.add_cascade(label='Zoom', underline=0, menu=zoommenu)
+
+        helpmenu = Menu(menubar, tearoff=0)
+        helpmenu.add_command(label='About', underline=0,
+                             command=self.about)
+        helpmenu.add_command(label='Instructions', underline=0,
+                             command=self.help, accelerator='F1')
+        menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
+
+        self._root.config(menu=menubar)
+
+    #////////////////////////////////////////////////////////////
+    # Selection Handling
+    #////////////////////////////////////////////////////////////
+
+    def _click_cv_edge(self, edge):
+        if edge != self._selection:
+            # Clicking on a new edge selects it.
+            self._select_edge(edge)
+        else:
+            # Repeated clicks on one edge cycle its trees.
+            self._cv.cycle_tree()
+            # [XX] this can get confused if animation is running
+            # faster than the callbacks...
+
+    def _select_matrix_edge(self, edge):
+        self._select_edge(edge)
+        self._cv.view_edge(edge)
+
+    def _select_edge(self, edge):
+        self._selection = edge
+        # Update the chart view.
+        self._cv.markonly_edge(edge, '#f00')
+        self._cv.draw_tree(edge)
+        # Update the matrix view.
+        if self._matrix: self._matrix.markonly_edge(edge)
+        if self._matrix: self._matrix.view_edge(edge)
+
+    def _deselect_edge(self):
+        self._selection = None
+        # Update the chart view.
+        self._cv.unmark_edge()
+        self._cv.erase_tree()
+        # Update the matrix view
+        if self._matrix: self._matrix.unmark_edge()
+
+    def _show_new_edge(self, edge):
+        self._display_rule(self._cp.current_chartrule())
+        # Update the chart view.
+        self._cv.update()
+        self._cv.draw_tree(edge)
+        self._cv.markonly_edge(edge, '#0df')
+        self._cv.view_edge(edge)
+        # Update the matrix view.
+        if self._matrix: self._matrix.update()
+        if self._matrix: self._matrix.markonly_edge(edge)
+        if self._matrix: self._matrix.view_edge(edge)
+        # Update the results view.
+        if self._results: self._results.update(edge)
+
+    #////////////////////////////////////////////////////////////
+    # Help/usage
+    #////////////////////////////////////////////////////////////
+
+    def help(self, *e):
+        self._animating = 0
+        # The default font's not very legible; try using 'fixed' instead.
+        try:
+            ShowText(self._root, 'Help: Chart Parser Application',
+                     (__doc__ or '').strip(), width=75, font='fixed')
+        except:
+            ShowText(self._root, 'Help: Chart Parser Application',
+                     (__doc__ or '').strip(), width=75)
+
+    def about(self, *e):
+        ABOUT = ("NLTK Chart Parser Application\n"+
+                 "Written by Edward Loper")
+        showinfo('About: Chart Parser Application', ABOUT)
+
+    #////////////////////////////////////////////////////////////
+    # File Menu
+    #////////////////////////////////////////////////////////////
+
+    CHART_FILE_TYPES = [('Pickle file', '.pickle'),
+                        ('All files', '*')]
+    GRAMMAR_FILE_TYPES = [('Plaintext grammar file', '.cfg'),
+                          ('Pickle file', '.pickle'),
+                          ('All files', '*')]
+
+    def load_chart(self, *args):
+        "Load a chart from a pickle file"
+        filename = askopenfilename(filetypes=self.CHART_FILE_TYPES,
+                                   defaultextension='.pickle')
+        if not filename: return
+        try:
+            with open(filename, 'rb') as infile:
+                chart = pickle.load(infile)
+            self._chart = chart
+            self._cv.update(chart)
+            if self._matrix: self._matrix.set_chart(chart)
+            if self._matrix: self._matrix.deselect_cell()
+            if self._results: self._results.set_chart(chart)
+            self._cp.set_chart(chart)
+        except Exception as e:
+            showerror('Error Loading Chart',
+                                   'Unable to open file: %r' % filename)
+
+    def save_chart(self, *args):
+        "Save a chart to a pickle file"
+        filename = asksaveasfilename(filetypes=self.CHART_FILE_TYPES,
+                                     defaultextension='.pickle')
+        if not filename: return
+        try:
+            with open(filename, 'wb') as outfile:
+                pickle.dump(self._chart, outfile)
+        except Exception as e:
+            showerror('Error Saving Chart',
+                                   'Unable to open file: %r' % filename)
+
+    def load_grammar(self, *args):
+        "Load a grammar from a pickle file"
+        filename = askopenfilename(filetypes=self.GRAMMAR_FILE_TYPES,
+                                   defaultextension='.cfg')
+        if not filename: return
+        try:
+            if filename.endswith('.pickle'):
+                with open(filename, 'rb') as infile:
+                    grammar = pickle.load(infile)
+            else:
+                with open(filename, 'r') as infile:
+                    grammar = CFG.fromstring(infile.read())
+            self.set_grammar(grammar)
+        except Exception as e:
+            showerror('Error Loading Grammar',
+                                   'Unable to open file: %r' % filename)
+
+    def save_grammar(self, *args):
+        filename = asksaveasfilename(filetypes=self.GRAMMAR_FILE_TYPES,
+                                     defaultextension='.cfg')
+        if not filename: return
+        try:
+            if filename.endswith('.pickle'):
+                with open(filename, 'wb') as outfile:
+                    pickle.dump((self._chart, self._tokens), outfile)
+            else:
+                with open(filename, 'w') as outfile:
+                    prods = self._grammar.productions()
+                    start = [p for p in prods if p.lhs() == self._grammar.start()]
+                    rest = [p for p in prods if p.lhs() != self._grammar.start()]
+                    for prod in start: outfile.write('%s\n' % prod)
+                    for prod in rest: outfile.write('%s\n' % prod)
+        except Exception as e:
+            showerror('Error Saving Grammar',
+                                   'Unable to open file: %r' % filename)
+
+    def reset(self, *args):
+        self._animating = 0
+        self._reset_parser()
+        self._cv.update(self._chart)
+        if self._matrix: self._matrix.set_chart(self._chart)
+        if self._matrix: self._matrix.deselect_cell()
+        if self._results: self._results.set_chart(self._chart)
+
+    #////////////////////////////////////////////////////////////
+    # Edit
+    #////////////////////////////////////////////////////////////
+
+    def edit_grammar(self, *e):
+        CFGEditor(self._root, self._grammar, self.set_grammar)
+
+    def set_grammar(self, grammar):
+        self._grammar = grammar
+        self._cp.set_grammar(grammar)
+        if self._results: self._results.set_grammar(grammar)
+
+    def edit_sentence(self, *e):
+        sentence = " ".join(self._tokens)
+        title = 'Edit Text'
+        instr = 'Enter a new sentence to parse.'
+        EntryDialog(self._root, sentence, instr, self.set_sentence, title)
+
+    def set_sentence(self, sentence):
+        self._tokens = list(sentence.split())
+        self.reset()
+
+    #////////////////////////////////////////////////////////////
+    # View Menu
+    #////////////////////////////////////////////////////////////
+
+    def view_matrix(self, *e):
+        if self._matrix is not None: self._matrix.destroy()
+        self._matrix = ChartMatrixView(self._root, self._chart)
+        self._matrix.add_callback('select', self._select_matrix_edge)
+
+    def view_results(self, *e):
+        if self._results is not None: self._results.destroy()
+        self._results = ChartResultsView(self._root, self._chart,
+                                         self._grammar)
+
+    #////////////////////////////////////////////////////////////
+    # Zoom Menu
+    #////////////////////////////////////////////////////////////
+
+    def resize(self):
+        self._animating = 0
+        self.set_font_size(self._size.get())
+
+    def set_font_size(self, size):
+        self._cv.set_font_size(size)
+        self._font.configure(size=-abs(size))
+        self._boldfont.configure(size=-abs(size))
+        self._sysfont.configure(size=-abs(size))
+
+    def get_font_size(self):
+        return abs(self._size.get())
+
+    #////////////////////////////////////////////////////////////
+    # Parsing
+    #////////////////////////////////////////////////////////////
+
+    def apply_strategy(self, strategy, edge_strategy=None):
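+        # Apply a parsing strategy (a list of chart rules).  In step
+        # mode, generate a single new edge -- restricted to the selected
+        # edge if one is selected and an edge_strategy is given.
+        # Otherwise, run the whole strategy, either animated or all at
+        # once, depending on the current animation setting.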
+        # If we're animating, then stop.
+        if self._animating:
+            self._animating = 0
+            return
+
+        # Clear the rule display & mark.
+        self._display_rule(None)
+        #self._cv.unmark_edge()
+
+        if self._step.get():
+            selection = self._selection
+            if (selection is not None) and (edge_strategy is not None):
+                # Apply the given strategy to the selected edge.
+                self._cp.set_strategy([edge_strategy(selection)])
+                newedge = self._apply_strategy()
+
+                # If it failed, then clear the selection.
+                if newedge is None:
+                    self._cv.unmark_edge()
+                    self._selection = None
+            else:
+                self._cp.set_strategy(strategy)
+                self._apply_strategy()
+
+        else:
+            self._cp.set_strategy(strategy)
+            if self._animate.get():
+                self._animating = 1
+                self._animate_strategy()
+            else:
+                for edge in self._cpstep:
+                    if edge is None: break
+                self._cv.update()
+                if self._matrix: self._matrix.update()
+                if self._results: self._results.update()
+
+    def _stop_animation(self, *e):
+        self._animating = 0
+
+    def _animate_strategy(self, speed=1):
+        if self._animating == 0: return
+        if self._apply_strategy() is not None:
+            if self._animate.get() == 0 or self._step.get() == 1:
+                return
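+            # Reschedule ourselves after a delay determined by the
+            # animation-speed setting: 1 = slow (3s), 2 = normal (1s),
+            # anything else = fast (20ms).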
+            if self._animate.get() == 1:
+                self._root.after(3000, self._animate_strategy)
+            elif self._animate.get() == 2:
+                self._root.after(1000, self._animate_strategy)
+            else:
+                self._root.after(20, self._animate_strategy)
+
+    def _apply_strategy(self):
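+        # Ask the step iterator for the next edge (None if the current
+        # strategy can add no more edges), display it, and return it.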
+        new_edge = next(self._cpstep)
+
+        if new_edge is not None:
+            self._show_new_edge(new_edge)
+        return new_edge
+
+    def _display_rule(self, rule):
+        if rule is None:
+            self._rulelabel2['text'] = ''
+        else:
+            name = str(rule)
+            self._rulelabel2['text'] = name
+            size = self._cv.get_font_size()
+
+    #////////////////////////////////////////////////////////////
+    # Parsing Strategies
+    #////////////////////////////////////////////////////////////
+
+    # Basic rules:
+    _TD_INIT     = [TopDownInitRule()]
+    _TD_PREDICT  = [TopDownPredictRule()]
+    _BU_RULE     = [BottomUpPredictRule()]
+    _BU_LC_RULE  = [BottomUpPredictCombineRule()]
+    _FUNDAMENTAL = [SingleEdgeFundamentalRule()]
+
+    # Complete strategies:
+    _TD_STRATEGY =  _TD_INIT + _TD_PREDICT + _FUNDAMENTAL
+    _BU_STRATEGY = _BU_RULE + _FUNDAMENTAL
+    _BU_LC_STRATEGY = _BU_LC_RULE + _FUNDAMENTAL
+
+    # Button callback functions:
+    def top_down_init(self, *e):
+        self.apply_strategy(self._TD_INIT, None)
+    def top_down_predict(self, *e):
+        self.apply_strategy(self._TD_PREDICT, TopDownPredictEdgeRule)
+    def bottom_up(self, *e):
+        self.apply_strategy(self._BU_RULE, BottomUpEdgeRule)
+    def bottom_up_leftcorner(self, *e):
+        self.apply_strategy(self._BU_LC_RULE, BottomUpLeftCornerEdgeRule)
+    def fundamental(self, *e):
+        self.apply_strategy(self._FUNDAMENTAL, FundamentalEdgeRule)
+    def bottom_up_strategy(self, *e):
+        self.apply_strategy(self._BU_STRATEGY, BottomUpEdgeRule)
+    def bottom_up_leftcorner_strategy(self, *e):
+        self.apply_strategy(self._BU_LC_STRATEGY, BottomUpLeftCornerEdgeRule)
+    def top_down_strategy(self, *e):
+        self.apply_strategy(self._TD_STRATEGY, TopDownPredictEdgeRule)
+
+def app():
+    grammar = CFG.fromstring("""
+    # Grammatical productions.
+        S -> NP VP
+        VP -> VP PP | V NP | V
+        NP -> Det N | NP PP
+        PP -> P NP
+    # Lexical productions.
+        NP -> 'John' | 'I'
+        Det -> 'the' | 'my' | 'a'
+        N -> 'dog' | 'cookie' | 'table' | 'cake' | 'fork'
+        V -> 'ate' | 'saw'
+        P -> 'on' | 'under' | 'with'
+    """)
+
+    # sent = 'John ate the cake on the table with a fork'
+    sent = 'John ate the cake on the table'
+    tokens = list(sent.split())
+
+    print('grammar= (')
+    for rule in grammar.productions():
+        print('    %s,' % repr(rule))
+    print(')')
+    print('tokens = %r' % tokens)
+    print('Calling "ChartParserApp(grammar, tokens)"...')
+    ChartParserApp(grammar, tokens).mainloop()
+
+if __name__ == '__main__':
+    app()
+
+    # Chart comparer:
+    #charts = ['/tmp/earley.pickle',
+    #          '/tmp/topdown.pickle',
+    #          '/tmp/bottomup.pickle']
+    #ChartComparer(*charts).mainloop()
+
+    #import profile
+    #profile.run('demo2()', '/tmp/profile.out')
+    #import pstats
+    #p = pstats.Stats('/tmp/profile.out')
+    #p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
+    #p.strip_dirs().sort_stats('cum', 'time').print_stats(60)
+
+__all__ = ['app']
diff --git a/nlp_resource_data/nltk/app/chartparser_app.pyc b/nlp_resource_data/nltk/app/chartparser_app.pyc
new file mode 100755 (executable)
index 0000000..f494ece
Binary files /dev/null and b/nlp_resource_data/nltk/app/chartparser_app.pyc differ
diff --git a/nlp_resource_data/nltk/app/chunkparser_app.py b/nlp_resource_data/nltk/app/chunkparser_app.py
new file mode 100755 (executable)
index 0000000..5e08421
--- /dev/null
@@ -0,0 +1,1262 @@
+# Natural Language Toolkit: Regexp Chunk Parser Application
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A graphical tool for exploring the regular expression based chunk
+parser ``nltk.chunk.RegexpChunkParser``.
+"""
+
+# Todo: Add a way to select the development set from the menubar.  This
+# might just need to be a selection box (conll vs treebank etc) plus
+# configuration parameters to select what's being chunked (eg VP vs NP)
+# and what part of the data is being used as the development set.
+
+from __future__ import division
+import time
+import textwrap
+import re
+import random
+
+from six.moves.tkinter import (Button, Canvas, Checkbutton, Frame, IntVar,
+                               Label, Menu, Scrollbar, Text, Tk)
+from six.moves.tkinter_tkfiledialog import askopenfilename, asksaveasfilename
+from six.moves.tkinter_font import Font
+
+from nltk.tree import Tree
+from nltk.util import in_idle
+from nltk.draw.util import ShowText
+from nltk.corpus import conll2000, treebank_chunk
+from nltk.chunk import ChunkScore, RegexpChunkParser
+from nltk.chunk.regexp import RegexpChunkRule
+
+class RegexpChunkApp(object):
+    """
+    A graphical tool for exploring the regular expression based chunk
+    parser ``nltk.chunk.RegexpChunkParser``.
+
+    See ``HELP`` for instructional text.
+    """
+
+    ##/////////////////////////////////////////////////////////////////
+    ##  Help Text
+    ##/////////////////////////////////////////////////////////////////
+
+    #: A dictionary mapping from part of speech tags to descriptions,
+    #: which is used in the help text.  (This should probably live with
+    #: the conll and/or treebank corpus instead.)
+    TAGSET = {
+        'CC':   'Coordinating conjunction',   'PRP$': 'Possessive pronoun',
+        'CD':   'Cardinal number',            'RB':   'Adverb',
+        'DT':   'Determiner',                 'RBR':  'Adverb, comparative',
+        'EX':   'Existential there',          'RBS':  'Adverb, superlative',
+        'FW':   'Foreign word',               'RP':   'Particle',
+        'JJ':   'Adjective',                  'TO':   'to',
+        'JJR':  'Adjective, comparative',     'UH':   'Interjection',
+        'JJS':  'Adjective, superlative',     'VB':   'Verb, base form',
+        'LS':   'List item marker',           'VBD':  'Verb, past tense',
+        'MD':   'Modal',                      'NNS':  'Noun, plural',
+        'NN':   'Noun, singular or mass',     'VBN':  'Verb, past participle',
+        'VBZ':  'Verb,3rd ps. sing. present', 'NNP':  'Proper noun, singular',
+        'NNPS': 'Proper noun, plural',        'WDT':  'wh-determiner',
+        'PDT':  'Predeterminer',              'WP':   'wh-pronoun',
+        'POS':  'Possessive ending',          'WP$':  'Possessive wh-pronoun',
+        'PRP':  'Personal pronoun',           'WRB':  'wh-adverb',
+        '(':    'open parenthesis',           ')':    'close parenthesis',
+        '``':   'open quote',                 ',':    'comma',
+        "''":   'close quote',                '.':    'period',
+        '#':    'pound sign (currency marker)',
+        '$':    'dollar sign (currency marker)',
+        'IN':   'Preposition/subord. conjunction',
+        'SYM':  'Symbol (mathematical or scientific)',
+        'VBG':  'Verb, gerund/present participle',
+        'VBP':  'Verb, non-3rd ps. sing. present',
+        ':':    'colon',
+        }
+
+    #: Contents for the help box.  This is a list of tuples, one for
+    #: each help page, where each tuple has four elements:
+    #:   - A title (displayed as a tab)
+    #:   - A string description of tabstops (see Tkinter.Text for details)
+    #:   - The text contents for the help page.  You can use expressions
+    #:     like <red>...</red> to colorize the text; see ``HELP_AUTOTAG``
+    #:     for a list of tags you can use for colorizing.
+    HELP = [
+        ('Help', '20',
+         "Welcome to the regular expression chunk-parser grammar editor.  "
+         "You can use this editor to develop and test chunk parser grammars "
+         "based on NLTK's RegexpChunkParser class.\n\n"
+         # Help box.
+         "Use this box ('Help') to learn more about the editor; click on the "
+         "tabs for help on specific topics:"
+         "<indent>\n"
+         "Rules: grammar rule types\n"
+         "Regexps: regular expression syntax\n"
+         "Tags: part of speech tags\n</indent>\n"
+         # Grammar.
+         "Use the upper-left box ('Grammar') to edit your grammar.  "
+         "Each line of your grammar specifies a single 'rule', "
+         "which performs an action such as creating a chunk or merging "
+         "two chunks.\n\n"
+         # Dev set.
+         "The lower-left box ('Development Set') runs your grammar on the "
+         "development set, and displays the results.  "
+         "Your grammar's chunks are <highlight>highlighted</highlight>, and "
+         "the correct (gold standard) chunks are "
+         "<underline>underlined</underline>.  If they "
+         "match, they are displayed in <green>green</green>; otherwise, "
+         "they are displayed in <red>red</red>.  The box displays a single "
+         "sentence from the development set at a time; use the scrollbar or "
+         "the next/previous buttons view additional sentences.\n\n"
+         # Performance
+         "The lower-right box ('Evaluation') tracks the performance of "
+         "your grammar on the development set.  The 'precision' axis "
+         "indicates how many of your grammar's chunks are correct; and "
+         "the 'recall' axis indicates how many of the gold standard "
+         "chunks your system generated.  Typically, you should try to "
+         "design a grammar that scores high on both metrics.  The "
+         "exact precision and recall of the current grammar, as well "
+         "as their harmonic mean (the 'f-score'), are displayed in "
+         "the status bar at the bottom of the window."
+         ),
+        ('Rules', '10',
+         "<h1>{...regexp...}</h1>"
+         "<indent>\nChunk rule: creates new chunks from words matching "
+         "regexp.</indent>\n\n"
+         "<h1>}...regexp...{</h1>"
+         "<indent>\nChink rule: removes words matching regexp from existing "
+         "chunks.</indent>\n\n"
+         "<h1>...regexp1...}{...regexp2...</h1>"
+         "<indent>\nSplit rule: splits chunks that match regexp1 followed by "
+         "regexp2 in two.</indent>\n\n"
+         "<h1>...regexp...{}...regexp...</h1>"
+         "<indent>\nMerge rule: joins consecutive chunks that match regexp1 "
+         "and regexp2</indent>\n"
+         ),
+        ('Regexps', '10 60',
+         #"Regular Expression Syntax Summary:\n\n"
+         "<h1>Pattern\t\tMatches...</h1>\n"
+         "<hangindent>"
+         "\t<<var>T</var>>\ta word with tag <var>T</var> "
+         "(where <var>T</var> may be a regexp).\n"
+         "\t<var>x</var>?\tan optional <var>x</var>\n"
+         "\t<var>x</var>+\ta sequence of 1 or more <var>x</var>'s\n"
+         "\t<var>x</var>*\ta sequence of 0 or more <var>x</var>'s\n"
+         "\t<var>x</var>|<var>y</var>\t<var>x</var> or <var>y</var>\n"
+         "\t.\tmatches any character\n"
+         "\t(<var>x</var>)\tTreats <var>x</var> as a group\n"
+         "\t# <var>x...</var>\tTreats <var>x...</var> "
+         "(to the end of the line) as a comment\n"
+         "\t\\<var>C</var>\tmatches character <var>C</var> "
+         "(useful when <var>C</var> is a special character "
+         "like + or #)\n"
+         "</hangindent>"
+         "\n<h1>Examples:</h1>\n"
+         "<hangindent>"
+         '\t<regexp><NN></regexp>\n'
+         '\t\tMatches <match>"cow/NN"</match>\n'
+         '\t\tMatches <match>"green/NN"</match>\n'
+         '\t<regexp><VB.*></regexp>\n'
+         '\t\tMatches <match>"eating/VBG"</match>\n'
+         '\t\tMatches <match>"ate/VBD"</match>\n'
+         '\t<regexp><IN><DT><NN></regexp>\n'
+         '\t\tMatches <match>"on/IN the/DT car/NN"</match>\n'
+         '\t<regexp><RB>?<VBD></regexp>\n'
+         '\t\tMatches <match>"ran/VBD"</match>\n'
+         '\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
+         '\t<regexp><\#><CD> # This is a comment...</regexp>\n'
+         '\t\tMatches <match>"#/# 100/CD"</match>\n'
+         "</hangindent>"
+         ),
+        ('Tags', '10 60',
+         "<h1>Part of Speech Tags:</h1>\n" +
+         '<hangindent>' +
+         '<<TAGSET>>' + # this gets auto-substituted w/ self.TAGSET
+         '</hangindent>\n')
+        ]
+
+    HELP_AUTOTAG = [
+        ('red', dict(foreground='#a00')),
+        ('green', dict(foreground='#080')),
+        ('highlight', dict(background='#ddd')),
+        ('underline', dict(underline=True)),
+        ('h1', dict(underline=True)),
+        ('indent', dict(lmargin1=20, lmargin2=20)),
+        ('hangindent', dict(lmargin1=0, lmargin2=60)),
+        ('var', dict(foreground='#88f')),
+        ('regexp', dict(foreground='#ba7')),
+        ('match', dict(foreground='#6a6')),
+        ]
+
+    ##/////////////////////////////////////////////////////////////////
+    ##  Config Parameters
+    ##/////////////////////////////////////////////////////////////////
+
+    _EVAL_DELAY = 1
+    """If the user has not pressed any key for this amount of time (in
+       seconds), and the current grammar has not been evaluated, then
+       the eval demon will evaluate it."""
+
+    _EVAL_CHUNK = 15
+    """The number of sentences that should be evaluated by the eval
+       demon each time it runs."""
+    _EVAL_FREQ = 0.2
+    """The frequency (in seconds) at which the eval demon is run"""
+    _EVAL_DEMON_MIN = .02
+    """The minimum amount of time that the eval demon should take each time
+       it runs -- if it takes less than this time, _EVAL_CHUNK will be
+       modified upwards."""
+    _EVAL_DEMON_MAX = .04
+    """The maximum amount of time that the eval demon should take each time
+       it runs -- if it takes more than this time, _EVAL_CHUNK will be
+       modified downwards."""
+
+    _GRAMMARBOX_PARAMS = dict(
+        width=40, height=12, background='#efe', highlightbackground='#efe',
+        highlightthickness=1, relief='groove', border=2, wrap='word')
+    _HELPBOX_PARAMS = dict(
+        width=15, height=15, background='#efe', highlightbackground='#efe',
+        foreground='#555',
+        highlightthickness=1, relief='groove', border=2, wrap='word')
+    _DEVSETBOX_PARAMS = dict(
+        width=70, height=10, background='#eef', highlightbackground='#eef',
+        highlightthickness=1, relief='groove', border=2, wrap='word',
+        tabs=(30,))
+    _STATUS_PARAMS = dict(
+        background='#9bb', relief='groove', border=2)
+    _FONT_PARAMS = dict(
+        family='helvetica', size=-20)
+    _FRAME_PARAMS = dict(
+        background='#777', padx=2, pady=2, border=3)
+    _EVALBOX_PARAMS = dict(
+        background='#eef', highlightbackground='#eef',
+        highlightthickness=1, relief='groove', border=2,
+        width=300, height=280)
+    _BUTTON_PARAMS = dict(
+        background='#777', activebackground='#777',
+        highlightbackground='#777')
+    _HELPTAB_BG_COLOR = '#aba'
+    _HELPTAB_FG_COLOR = '#efe'
+
+    _HELPTAB_FG_PARAMS = dict(background='#efe')
+    _HELPTAB_BG_PARAMS = dict(background='#aba')
+    _HELPTAB_SPACER = 6
+
+    def normalize_grammar(self, grammar):
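+        # Normalize a grammar string (strip comments, collapse
+        # whitespace) so that superficially different grammars compare
+        # equal -- e.g., when checking whether the grammar has changed
+        # since the last evaluation.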
+        # Strip comments
+        grammar = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', grammar)
+        # Normalize whitespace
+        grammar = re.sub(' +', ' ', grammar)
+        grammar = re.sub(r'\n\s+', '\n', grammar)
+        grammar = grammar.strip()
+        # [xx] Hack: automatically backslash $!
+        grammar = re.sub(r'([^\\])\$', r'\1\\$', grammar)
+        return grammar
+
+    def __init__(self, devset_name='conll2000', devset=None,
+                 grammar = '', chunk_label='NP', tagset=None):
+        """
+        :param devset_name: The name of the development set; used for
+            display & for save files.  If either the name 'treebank'
+            or the name 'conll2000' is used, and devset is None, then
+            devset will be set automatically.
+        :param devset: A list of chunked sentences
+        :param grammar: The initial grammar to display.
+        :param tagset: Dictionary from tags to string descriptions, used
+            for the help page.  Defaults to ``self.TAGSET``.
+        """
+        self._chunk_label = chunk_label
+
+        if tagset is None: tagset = self.TAGSET
+        self.tagset = tagset
+
+        # Named development sets:
+        if devset is None:
+            if devset_name == 'conll2000':
+                devset = conll2000.chunked_sents('train.txt')#[:100]
+            elif devset_name == 'treebank':
+                devset = treebank_chunk.chunked_sents()#[:100]
+            else:
+                raise ValueError('Unknown development set %s' % devset_name)
+
+        self.chunker = None
+        """The chunker built from the grammar string"""
+
+        self.grammar = grammar
+        """The unparsed grammar string"""
+
+        self.normalized_grammar = None
+        """A normalized version of ``self.grammar``."""
+
+        self.grammar_changed = 0
+        """The last time() that the grammar was changed."""
+
+        self.devset = devset
+        """The development set -- a list of chunked sentences."""
+
+        self.devset_name = devset_name
+        """The name of the development set (for save files)."""
+
+        self.devset_index = -1
+        """The index into the development set of the first instance
+           that's currently being viewed."""
+
+        self._last_keypress = 0
+        """The time() when a key was most recently pressed"""
+
+        self._history = []
+        """A list of (grammar, precision, recall, fscore) tuples for
+           grammars that the user has already tried."""
+
+        self._history_index = 0
+        """When the user is scrolling through previous grammars, this
+           is used to keep track of which grammar they're looking at."""
+
+        self._eval_grammar = None
+        """The grammar that is being currently evaluated by the eval
+           demon."""
+
+        self._eval_normalized_grammar = None
+        """A normalized copy of ``_eval_grammar``."""
+
+        self._eval_index = 0
+        """The index of the next sentence in the development set that
+           should be looked at by the eval demon."""
+
+        self._eval_score = ChunkScore(chunk_label=chunk_label)
+        """The ``ChunkScore`` object that's used to keep track of the score
+        of the current grammar on the development set."""
+
+        # Set up the main window.
+        top = self.top = Tk()
+        top.geometry('+50+50')
+        top.title('Regexp Chunk Parser App')
+        top.bind('<Control-q>', self.destroy)
+
+        # Variable that restricts how much of the devset we look at.
+        self._devset_size = IntVar(top)
+        self._devset_size.set(100)
+
+        # Set up all the tkinter widgets
+        self._init_fonts(top)
+        self._init_widgets(top)
+        self._init_bindings(top)
+        self._init_menubar(top)
+        self.grammarbox.focus()
+
+
+        # If a grammar was given, then display it.
+        if grammar:
+            self.grammarbox.insert('end', grammar+'\n')
+            self.grammarbox.mark_set('insert', '1.0')
+
+        # Display the first item in the development set
+        self.show_devset(0)
+        self.update()
+
+    def _init_bindings(self, top):
+        top.bind('<Control-n>', self._devset_next)
+        top.bind('<Control-p>', self._devset_prev)
+        top.bind('<Control-t>', self.toggle_show_trace)
+        top.bind('<KeyPress>', self.update)
+        top.bind('<Control-s>', lambda e: self.save_grammar())
+        top.bind('<Control-o>', lambda e: self.load_grammar())
+        self.grammarbox.bind('<Control-t>', self.toggle_show_trace)
+        self.grammarbox.bind('<Control-n>', self._devset_next)
+        self.grammarbox.bind('<Control-p>', self._devset_prev)
+
+        # Redraw the eval graph when the window size changes
+        self.evalbox.bind('<Configure>', self._eval_plot)
+
+    def _init_fonts(self, top):
+        # What's our font size (default=same as sysfont)
+        self._size = IntVar(top)
+        self._size.set(20)
+        self._font = Font(family='helvetica',
+                                 size=-self._size.get())
+        self._smallfont = Font(family='helvetica',
+                                      size=-(int(self._size.get()*14//20)))
+
+    def _init_menubar(self, parent):
+        menubar = Menu(parent)
+
+        filemenu = Menu(menubar, tearoff=0)
+        filemenu.add_command(label='Reset Application', underline=0,
+                             command=self.reset)
+        filemenu.add_command(label='Save Current Grammar', underline=0,
+                             accelerator='Ctrl-s',
+                             command=self.save_grammar)
+        filemenu.add_command(label='Load Grammar', underline=0,
+                             accelerator='Ctrl-o',
+                             command=self.load_grammar)
+
+        filemenu.add_command(label='Save Grammar History', underline=13,
+                             command=self.save_history)
+
+        filemenu.add_command(label='Exit', underline=1,
+                             command=self.destroy, accelerator='Ctrl-q')
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
+
+        viewmenu = Menu(menubar, tearoff=0)
+        viewmenu.add_radiobutton(label='Tiny', variable=self._size,
+                                 underline=0, value=10, command=self.resize)
+        viewmenu.add_radiobutton(label='Small', variable=self._size,
+                                 underline=0, value=16, command=self.resize)
+        viewmenu.add_radiobutton(label='Medium', variable=self._size,
+                                 underline=0, value=20, command=self.resize)
+        viewmenu.add_radiobutton(label='Large', variable=self._size,
+                                 underline=0, value=24, command=self.resize)
+        viewmenu.add_radiobutton(label='Huge', variable=self._size,
+                                 underline=0, value=34, command=self.resize)
+        menubar.add_cascade(label='View', underline=0, menu=viewmenu)
+
+        devsetmenu = Menu(menubar, tearoff=0)
+        devsetmenu.add_radiobutton(label='50 sentences',
+                                   variable=self._devset_size,
+                                   value=50, command=self.set_devset_size)
+        devsetmenu.add_radiobutton(label='100 sentences',
+                                   variable=self._devset_size,
+                                   value=100, command=self.set_devset_size)
+        devsetmenu.add_radiobutton(label='200 sentences',
+                                   variable=self._devset_size,
+                                   value=200, command=self.set_devset_size)
+        devsetmenu.add_radiobutton(label='500 sentences',
+                                   variable=self._devset_size,
+                                   value=500, command=self.set_devset_size)
+        menubar.add_cascade(label='Development-Set', underline=0,
+                            menu=devsetmenu)
+
+        helpmenu = Menu(menubar, tearoff=0)
+        helpmenu.add_command(label='About', underline=0,
+                             command=self.about)
+        menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
+
+        parent.config(menu=menubar)
+
+    def toggle_show_trace(self, *e):
+        if self._showing_trace:
+            self.show_devset()
+        else:
+            self.show_trace()
+        return 'break'
+
+
+    _SCALE_N = 5 # center on the last 5 examples.
+    _DRAW_LINES = False
+    def _eval_plot(self, *e, **config):
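+        # Redraw the evaluation canvas: a recall (x) vs. precision (y)
+        # plot of every grammar in self._history, highlighting the
+        # grammar that is currently being viewed.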
+        width = config.get('width', self.evalbox.winfo_width())
+        height = config.get('height', self.evalbox.winfo_height())
+
+        # Clear the canvas
+        self.evalbox.delete('all')
+
+        # Draw the precision & recall labels.
+        tag = self.evalbox.create_text(10, height//2-10, justify='left',
+                                 anchor='w', text='Precision')
+        left, right = self.evalbox.bbox(tag)[2] + 5, width-10
+        tag = self.evalbox.create_text(left + (width-left)//2, height-10,
+                                anchor='s', text='Recall', justify='center')
+        top, bot = 10, self.evalbox.bbox(tag)[1]-10
+
+        # Draw masks for clipping the plot.
+        bg = self._EVALBOX_PARAMS['background']
+        self.evalbox.lower(self.evalbox.create_rectangle(0, 0, left-1, 5000,
+                                                         fill=bg, outline=bg))
+        self.evalbox.lower(self.evalbox.create_rectangle(0, bot+1, 5000, 5000,
+                                                         fill=bg, outline=bg))
+
+        # Calculate the plot's scale.
+        if self._autoscale.get() and len(self._history) > 1:
+            max_precision = max_recall = 0
+            min_precision = min_recall = 1
+            for i in range(1, min(len(self._history), self._SCALE_N+1)):
+                grammar, precision, recall, fmeasure = self._history[-i]
+                min_precision = min(precision, min_precision)
+                min_recall = min(recall, min_recall)
+                max_precision = max(precision, max_precision)
+                max_recall = max(recall, max_recall)
+#             if max_precision-min_precision > max_recall-min_recall:
+#                 min_recall -= (max_precision-min_precision)/2
+#                 max_recall += (max_precision-min_precision)/2
+#             else:
+#                 min_precision -= (max_recall-min_recall)/2
+#                 max_precision += (max_recall-min_recall)/2
+#             if min_recall < 0:
+#                 max_recall -= min_recall
+#                 min_recall = 0
+#             if min_precision < 0:
+#                 max_precision -= min_precision
+#                 min_precision = 0
+            min_precision = max(min_precision-.01, 0)
+            min_recall = max(min_recall-.01, 0)
+            max_precision = min(max_precision+.01, 1)
+            max_recall = min(max_recall+.01, 1)
+        else:
+            min_precision = min_recall = 0
+            max_precision = max_recall = 1
+
+        # Draw the axis lines & grid lines
+        for i in range(11):
+            x = left + (right-left)*((i/10.-min_recall)/
+                                     (max_recall-min_recall))
+            y = bot - (bot-top)*((i/10.-min_precision)/
+                                 (max_precision-min_precision))
+            if left < x < right:
+                self.evalbox.create_line(x, top, x, bot, fill='#888')
+            if top < y < bot:
+                self.evalbox.create_line(left, y, right, y, fill='#888')
+        self.evalbox.create_line(left, top, left, bot)
+        self.evalbox.create_line(left, bot, right, bot)
+
+        # Display the plot's scale
+        self.evalbox.create_text(
+            left-3, bot, justify='right', anchor='se',
+            text='%d%%' % (100*min_precision))
+        self.evalbox.create_text(
+            left-3, top, justify='right', anchor='ne',
+            text='%d%%' % (100*max_precision))
+        self.evalbox.create_text(
+            left, bot+3, justify='center', anchor='nw',
+            text='%d%%' % (100*min_recall))
+        self.evalbox.create_text(
+            right, bot+3, justify='center', anchor='ne',
+            text='%d%%' % (100*max_recall))
+
+        # Display the scores.
+        prev_x = prev_y = None
+        for i, (_, precision, recall, fscore) in enumerate(self._history):
+            x = left + (right-left) * ((recall-min_recall) /
+                                (max_recall-min_recall))
+            y = bot - (bot-top) * ((precision-min_precision) /
+                                (max_precision-min_precision))
+            if i == self._history_index:
+                self.evalbox.create_oval(x-2,y-2,x+2,y+2,
+                                         fill='#0f0', outline='#000')
+                self.status['text'] = (
+                    'Precision: %.2f%%\t' % (precision*100)+
+                    'Recall: %.2f%%\t' % (recall*100)+
+                    'F-score: %.2f%%' % (fscore*100))
+            else:
+                self.evalbox.lower(
+                    self.evalbox.create_oval(x-2,y-2,x+2,y+2,
+                                             fill='#afa', outline='#8c8'))
+            if prev_x is not None and self._eval_lines.get():
+                self.evalbox.lower(
+                    self.evalbox.create_line(prev_x, prev_y, x, y,
+                                             fill='#8c8'))
+            prev_x, prev_y = x, y
+
+    _eval_demon_running = False
+    def _eval_demon(self):
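+        # Background evaluation loop, driven by Tk's after(): score the
+        # current grammar on the next _EVAL_CHUNK development-set
+        # sentences, then reschedule itself until the (size-limited)
+        # development set has been fully scored.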
+        if self.top is None: return
+        if self.chunker is None:
+            self._eval_demon_running = False
+            return
+
+        # Note our starting time.
+        t0 = time.time()
+
+        # If the user is still typing, then wait for them to finish.
+        if (time.time()-self._last_keypress < self._EVAL_DELAY and
+            self.normalized_grammar != self._eval_normalized_grammar):
+            self._eval_demon_running = True
+            return self.top.after(int(self._EVAL_FREQ*1000), self._eval_demon)
+
+        # If the grammar changed, restart the evaluation.
+        if self.normalized_grammar != self._eval_normalized_grammar:
+            # Check if we've seen this grammar already.  If so, then
+            # just use the old evaluation values.
+            for (g, p, r, f) in self._history:
+                if self.normalized_grammar == self.normalize_grammar(g):
+                    self._history.append( (g, p, r, f) )
+                    self._history_index = len(self._history) - 1
+                    self._eval_plot()
+                    self._eval_demon_running = False
+                    self._eval_normalized_grammar = None
+                    return
+            self._eval_index = 0
+            self._eval_score = ChunkScore(chunk_label=self._chunk_label)
+            self._eval_grammar = self.grammar
+            self._eval_normalized_grammar = self.normalized_grammar
+
+        # If the grammar is empty, then don't bother evaluating it, or
+        # recording it in history -- the score will just be 0.
+        if self.normalized_grammar.strip() == '':
+            #self._eval_index = self._devset_size.get()
+            self._eval_demon_running = False
+            return
+
+        # Score the next set of examples
+        for gold in self.devset[self._eval_index:
+                                min(self._eval_index+self._EVAL_CHUNK,
+                                    self._devset_size.get())]:
+            guess = self._chunkparse(gold.leaves())
+            self._eval_score.score(gold, guess)
+
+        # update our index in the devset.
+        self._eval_index += self._EVAL_CHUNK
+
+        # Check if we're done
+        if self._eval_index >= self._devset_size.get():
+            self._history.append( (self._eval_grammar,
+                                   self._eval_score.precision(),
+                                   self._eval_score.recall(),
+                                   self._eval_score.f_measure()) )
+            self._history_index = len(self._history)-1
+            self._eval_plot()
+            self._eval_demon_running = False
+            self._eval_normalized_grammar = None
+        else:
+            progress = 100*self._eval_index/self._devset_size.get()
+            self.status['text'] = ('Evaluating on Development Set (%d%%)' %
+                                   progress)
+            self._eval_demon_running = True
+            self._adaptively_modify_eval_chunk(time.time() - t0)
+            self.top.after(int(self._EVAL_FREQ*1000), self._eval_demon)
+
+    def _adaptively_modify_eval_chunk(self, t):
+        """
+        Modify _EVAL_CHUNK to try to keep the amount of time that the
+        eval demon takes between _EVAL_DEMON_MIN and _EVAL_DEMON_MAX.
+
+        :param t: The amount of time that the eval demon took.
+        """
+        if t > self._EVAL_DEMON_MAX and self._EVAL_CHUNK > 5:
+            self._EVAL_CHUNK = min(self._EVAL_CHUNK-1,
+                         max(int(self._EVAL_CHUNK*(self._EVAL_DEMON_MAX/t)),
+                             self._EVAL_CHUNK-10))
+        elif t < self._EVAL_DEMON_MIN:
+            self._EVAL_CHUNK = max(self._EVAL_CHUNK+1,
+                         min(int(self._EVAL_CHUNK*(self._EVAL_DEMON_MIN/t)),
+                             self._EVAL_CHUNK+10))
+
+    def _init_widgets(self, top):
+        frame0 = Frame(top, **self._FRAME_PARAMS)
+        frame0.grid_columnconfigure(0, weight=4)
+        frame0.grid_columnconfigure(3, weight=2)
+        frame0.grid_rowconfigure(1, weight=1)
+        frame0.grid_rowconfigure(5, weight=1)
+
+        # The grammar
+        self.grammarbox = Text(frame0, font=self._font,
+                               **self._GRAMMARBOX_PARAMS)
+        self.grammarlabel = Label(frame0, font=self._font, text='Grammar:',
+                      highlightcolor='black',
+                      background=self._GRAMMARBOX_PARAMS['background'])
+        self.grammarlabel.grid(column=0, row=0, sticky='SW')
+        self.grammarbox.grid(column=0, row=1, sticky='NEWS')
+
+        # Scroll bar for grammar
+        grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview)
+        grammar_scrollbar.grid(column=1, row=1, sticky='NWS')
+        self.grammarbox.config(yscrollcommand=grammar_scrollbar.set)
+
+        # grammar buttons
+        bg = self._FRAME_PARAMS['background']
+        frame3 = Frame(frame0, background=bg)
+        frame3.grid(column=0, row=2, sticky='EW')
+        Button(frame3, text='Prev Grammar', command=self._history_prev,
+               **self._BUTTON_PARAMS).pack(side='left')
+        Button(frame3, text='Next Grammar', command=self._history_next,
+               **self._BUTTON_PARAMS).pack(side='left')
+
+        # Help box
+        self.helpbox = Text(frame0, font=self._smallfont,
+                            **self._HELPBOX_PARAMS)
+        self.helpbox.grid(column=3, row=1, sticky='NEWS')
+        self.helptabs = {}
+        bg = self._FRAME_PARAMS['background']
+        helptab_frame = Frame(frame0, background=bg)
+        helptab_frame.grid(column=3, row=0, sticky='SW')
+        for i, (tab, tabstops, text) in enumerate(self.HELP):
+            label = Label(helptab_frame, text=tab, font=self._smallfont)
+            label.grid(column=i*2, row=0, sticky='S')
+            #help_frame.grid_columnconfigure(i, weight=1)
+            #label.pack(side='left')
+            label.bind('<ButtonPress>', lambda e, tab=tab: self.show_help(tab))
+            self.helptabs[tab] = label
+            Frame(helptab_frame, height=1, width=self._HELPTAB_SPACER,
+                  background=bg).grid(column=i*2+1, row=0)
+        self.helptabs[self.HELP[0][0]].configure(font=self._font)
+        self.helpbox.tag_config('elide', elide=True)
+        for (tag, params) in self.HELP_AUTOTAG:
+            self.helpbox.tag_config('tag-%s' % tag, **params)
+        self.show_help(self.HELP[0][0])
+
+        # Scroll bar for helpbox
+        help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview)
+        self.helpbox.config(yscrollcommand=help_scrollbar.set)
+        help_scrollbar.grid(column=4, row=1, sticky='NWS')
+
+        # The dev set
+        frame4 = Frame(frame0, background=self._FRAME_PARAMS['background'])
+        self.devsetbox = Text(frame4, font=self._font,
+                              **self._DEVSETBOX_PARAMS)
+        self.devsetbox.pack(expand=True, fill='both')
+        self.devsetlabel = Label(frame0, font=self._font,
+                      text='Development Set:', justify='right',
+                      background=self._DEVSETBOX_PARAMS['background'])
+        self.devsetlabel.grid(column=0, row=4, sticky='SW')
+        frame4.grid(column=0, row=5, sticky='NEWS')
+
+        # dev set scrollbars
+        self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll)
+        self.devset_scroll.grid(column=1, row=5, sticky='NWS')
+        self.devset_xscroll = Scrollbar(frame4, command=self.devsetbox.xview,
+                                        orient='horiz')
+        self.devsetbox['xscrollcommand'] = self.devset_xscroll.set
+        self.devset_xscroll.pack(side='bottom', fill='x')
+
+        # dev set buttons
+        bg = self._FRAME_PARAMS['background']
+        frame1 = Frame(frame0, background=bg)
+        frame1.grid(column=0, row=7, sticky='EW')
+        Button(frame1, text='Prev Example (Ctrl-p)',
+               command=self._devset_prev,
+               **self._BUTTON_PARAMS).pack(side='left')
+        Button(frame1, text='Next Example (Ctrl-n)',
+               command=self._devset_next,
+               **self._BUTTON_PARAMS).pack(side='left')
+        self.devset_button = Button(frame1, text='Show example',
+                                   command=self.show_devset,
+                                    state='disabled',
+                                   **self._BUTTON_PARAMS)
+        self.devset_button.pack(side='right')
+        self.trace_button = Button(frame1, text='Show trace',
+                                   command=self.show_trace,
+                                   **self._BUTTON_PARAMS)
+        self.trace_button.pack(side='right')
+
+
+        # evaluation box
+        self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS)
+        label = Label(frame0, font=self._font, text='Evaluation:',
+              justify='right', background=self._EVALBOX_PARAMS['background'])
+        label.grid(column=3, row=4, sticky='SW')
+        self.evalbox.grid(column=3, row=5, sticky='NEWS', columnspan=2)
+
+        # evaluation box buttons
+        bg = self._FRAME_PARAMS['background']
+        frame2 = Frame(frame0, background=bg)
+        frame2.grid(column=3, row=7, sticky='EW')
+        self._autoscale = IntVar(self.top)
+        self._autoscale.set(False)
+        Checkbutton(frame2, variable=self._autoscale, command=self._eval_plot,
+                    text='Zoom', **self._BUTTON_PARAMS).pack(side='left')
+        self._eval_lines = IntVar(self.top)
+        self._eval_lines.set(False)
+        Checkbutton(frame2, variable=self._eval_lines, command=self._eval_plot,
+                    text='Lines', **self._BUTTON_PARAMS).pack(side='left')
+        Button(frame2, text='History',
+               **self._BUTTON_PARAMS).pack(side='right')
+
+        # The status label
+        self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS)
+        self.status.grid(column=0, row=9, sticky='NEW', padx=3, pady=2,
+                         columnspan=5)
+
+        # Help box & devset box can't be edited.
+        self.helpbox['state'] = 'disabled'
+        self.devsetbox['state'] = 'disabled'
+
+        # Spacers
+        bg = self._FRAME_PARAMS['background']
+        Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3)
+        Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0)
+        Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8)
+
+        # pack the frame.
+        frame0.pack(fill='both', expand=True)
+
+        # Set up colors for the devset box
+        self.devsetbox.tag_config('true-pos', background='#afa',
+                                  underline='True')
+        self.devsetbox.tag_config('false-neg', underline='True',
+                                foreground='#800')
+        self.devsetbox.tag_config('false-pos', background='#faa')
+        self.devsetbox.tag_config('trace', foreground='#666', wrap='none')
+        self.devsetbox.tag_config('wrapindent', lmargin2=30, wrap='none')
+        self.devsetbox.tag_config('error', foreground='#800')
+
+        # And for the grammarbox
+        self.grammarbox.tag_config('error', background='#fec')
+        self.grammarbox.tag_config('comment', foreground='#840')
+        self.grammarbox.tag_config('angle', foreground='#00f')
+        self.grammarbox.tag_config('brace', foreground='#0a0')
+        self.grammarbox.tag_config('hangindent', lmargin1=0, lmargin2=40)
+
+    _showing_trace = False
+    def show_trace(self, *e):
+        self._showing_trace = True
+        self.trace_button['state'] = 'disabled'
+        self.devset_button['state'] = 'normal'
+
+        self.devsetbox['state'] = 'normal'
+        #self.devsetbox['wrap'] = 'none'
+        self.devsetbox.delete('1.0', 'end')
+        self.devsetlabel['text'] = 'Development Set (%d/%d)' % (
+            self.devset_index + 1, self._devset_size.get())
+
+        if self.chunker is None:
+            self.devsetbox.insert('1.0', 'Trace: waiting for a valid grammar.')
+            self.devsetbox.tag_add('error', '1.0', 'end')
+            return # can't do anything more
+
+        gold_tree = self.devset[self.devset_index]
+        rules = self.chunker.rules()
+
+        # Calculate the tag sequence
+        tagseq = '\t'
+        charnum = [1]
+        for wordnum, (word, pos) in enumerate(gold_tree.leaves()):
+            tagseq += '%s ' % pos
+            charnum.append(len(tagseq))
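+        # charnum maps (rule index, word index) to a character column in the
+        # tag-sequence line, and linenum maps each rule index to the line
+        # holding its tag sequence, so _color_chunk can locate chunk spans.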
+        self.charnum = dict(((i, j), charnum[j])
+                            for i in range(len(rules)+1)
+                            for j in range(len(charnum)))
+        self.linenum = dict((i,i*2+2) for i in range(len(rules)+1))
+
+        for i in range(len(rules)+1):
+            if i == 0:
+                self.devsetbox.insert('end', 'Start:\n')
+                self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
+            else:
+                self.devsetbox.insert('end', 'Apply %s:\n' % rules[i-1])
+                self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
+            # Display the tag sequence.
+            self.devsetbox.insert('end', tagseq+'\n')
+            self.devsetbox.tag_add('wrapindent','end -2c linestart','end -2c')
+            # Run a partial parser, and extract gold & test chunks
+            chunker = RegexpChunkParser(rules[:i])
+            test_tree = self._chunkparse(gold_tree.leaves())
+            gold_chunks = self._chunks(gold_tree)
+            test_chunks = self._chunks(test_tree)
+            # Compare them.
+            for chunk in gold_chunks.intersection(test_chunks):
+                self._color_chunk(i, chunk, 'true-pos')
+            for chunk in gold_chunks - test_chunks:
+                self._color_chunk(i, chunk, 'false-neg')
+            for chunk in test_chunks - gold_chunks:
+                self._color_chunk(i, chunk, 'false-pos')
+        self.devsetbox.insert('end', 'Finished.\n')
+        self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
+
+        # This is a hack, because the x-scrollbar isn't updating its
+        # position right -- I'm not sure what the underlying cause is
+        # though.  (This is on OS X w/ python 2.5)
+        self.top.after(100, self.devset_xscroll.set, 0, .3)
+
+    def show_help(self, tab):
+        self.helpbox['state'] = 'normal'
+        self.helpbox.delete('1.0', 'end')
+        for (name, tabstops, text) in self.HELP:
+            if name == tab:
+                text = text.replace('<<TAGSET>>', '\n'.join(
+                    ('\t%s\t%s' % item for item in sorted(list(self.tagset.items()),
+                    key=lambda t_w:re.match(r'\w+',t_w[0]) and (0,t_w[0]) or (1,t_w[0])))))
+
+                self.helptabs[name].config(**self._HELPTAB_FG_PARAMS)
+                self.helpbox.config(tabs=tabstops)
+                self.helpbox.insert('1.0', text+'\n'*20)
+                C = '1.0 + %d chars'
+                for (tag, params) in self.HELP_AUTOTAG:
+                    pattern = '(?s)(<%s>)(.*?)(</%s>)' % (tag, tag)
+                    for m in re.finditer(pattern, text):
+                        self.helpbox.tag_add('elide',
+                                             C % m.start(1), C % m.end(1))
+                        self.helpbox.tag_add('tag-%s' % tag,
+                                             C % m.start(2), C % m.end(2))
+                        self.helpbox.tag_add('elide',
+                                             C % m.start(3), C % m.end(3))
+            else:
+                self.helptabs[name].config(**self._HELPTAB_BG_PARAMS)
+        self.helpbox['state'] = 'disabled'
+
+    def _history_prev(self, *e):
+        self._view_history(self._history_index-1)
+        return 'break'
+
+    def _history_next(self, *e):
+        self._view_history(self._history_index+1)
+        return 'break'
+
+    def _view_history(self, index):
+        # Bounds & sanity checking:
+        index = max(0, min(len(self._history)-1, index))
+        if not self._history: return
+        # Already viewing the requested history item?
+        if index == self._history_index:
+            return
+        # Show the requested grammar.  It will get added to _history
+        # only if they edit it (causing self.update() to get run.)
+        self.grammarbox['state'] = 'normal'
+        self.grammarbox.delete('1.0', 'end')
+        self.grammarbox.insert('end', self._history[index][0])
+        self.grammarbox.mark_set('insert', '1.0')
+        self._history_index = index
+        self._syntax_highlight_grammar(self._history[index][0])
+        # Record the normalized grammar & regenerate the chunker.
+        self.normalized_grammar = self.normalize_grammar(
+            self._history[index][0])
+        if self.normalized_grammar:
+            rules = [RegexpChunkRule.fromstring(line)
+                     for line in self.normalized_grammar.split('\n')]
+        else:
+            rules = []
+        self.chunker = RegexpChunkParser(rules)
+        # Show the score.
+        self._eval_plot()
+        # Update the devset box
+        self._highlight_devset()
+        if self._showing_trace: self.show_trace()
+        # Update the grammar label
+        if self._history_index < len(self._history)-1:
+            self.grammarlabel['text'] = 'Grammar %s/%s:' % (
+                self._history_index+1, len(self._history))
+        else:
+            self.grammarlabel['text'] = 'Grammar:'
+
+    def _devset_next(self, *e):
+        self._devset_scroll('scroll', 1, 'page')
+        return 'break'
+
+    def _devset_prev(self, *e):
+        self._devset_scroll('scroll', -1, 'page')
+        return 'break'
+
+    def destroy(self, *e):
+        if self.top is None: return
+        self.top.destroy()
+        self.top = None
+
+    def _devset_scroll(self, command, *args):
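+        # Translate Tk scrollbar commands ('scroll' / 'moveto') into movement
+        # through the development set, one sentence per page.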
+        N = 1 # size of a page -- one sentence.
+        showing_trace = self._showing_trace
+        if command == 'scroll' and args[1].startswith('unit'):
+            self.show_devset(self.devset_index+int(args[0]))
+        elif command == 'scroll' and args[1].startswith('page'):
+            self.show_devset(self.devset_index+N*int(args[0]))
+        elif command == 'moveto':
+            self.show_devset(int(float(args[0])*self._devset_size.get()))
+        else:
+            assert 0, 'bad scroll command %s %s' % (command, args)
+        if showing_trace:
+            self.show_trace()
+
+    def show_devset(self, index=None):
+        if index is None: index = self.devset_index
+
+        # Bounds checking
+        index = min(max(0, index), self._devset_size.get()-1)
+
+        if index == self.devset_index and not self._showing_trace: return
+        self.devset_index = index
+
+        self._showing_trace = False
+        self.trace_button['state'] = 'normal'
+        self.devset_button['state'] = 'disabled'
+
+        # Clear the text box.
+        self.devsetbox['state'] = 'normal'
+        self.devsetbox['wrap'] = 'word'
+        self.devsetbox.delete('1.0', 'end')
+        self.devsetlabel['text'] = 'Development Set (%d/%d)' % (
+            self.devset_index + 1, self._devset_size.get())
+
+        # Add the sentences
+        sample = self.devset[self.devset_index:self.devset_index+1]
+        self.charnum = {}
+        self.linenum = {0:1}
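+        # Record the character offset of each word so that _color_chunk can
+        # translate (word start, word end) spans into Text-widget indices.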
+        for sentnum, sent in enumerate(sample):
+            linestr = ''
+            for wordnum, (word, pos) in enumerate(sent.leaves()):
+                self.charnum[sentnum, wordnum] = len(linestr)
+                linestr += '%s/%s ' % (word, pos)
+                self.charnum[sentnum, wordnum+1] = len(linestr)
+            self.devsetbox.insert('end', linestr[:-1]+'\n\n')
+
+        # Highlight chunks in the dev set
+        if self.chunker is not None:
+            self._highlight_devset()
+        self.devsetbox['state'] = 'disabled'
+
+        # Update the scrollbar
+        first = self.devset_index/self._devset_size.get()
+        last = (self.devset_index + 2) / self._devset_size.get()
+        self.devset_scroll.set(first, last)
+
+    def _chunks(self, tree):
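+        # Return the set of (start, end) word-index spans covered by chunks
+        # whose label matches self._chunk_label.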
+        chunks = set()
+        wordnum = 0
+        for child in tree:
+            if isinstance(child, Tree):
+                if child.label() == self._chunk_label:
+                    chunks.add( (wordnum, wordnum+len(child)) )
+                wordnum += len(child)
+            else:
+                wordnum += 1
+        return chunks
+
+    def _syntax_highlight_grammar(self, grammar):
+        if self.top is None: return
+        self.grammarbox.tag_remove('comment', '1.0', 'end')
+        self.grammarbox.tag_remove('angle', '1.0', 'end')
+        self.grammarbox.tag_remove('brace', '1.0', 'end')
+        self.grammarbox.tag_add('hangindent', '1.0', 'end')
+        for lineno, line in enumerate(grammar.split('\n')):
+            if not line.strip(): continue
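+            # group(2), if present, spans the first unescaped '#' comment.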
+            m = re.match(r'(\\.|[^#])*(#.*)?', line)
+            comment_start = None
+            if m.group(2):
+                comment_start = m.start(2)
+                s = '%d.%d' % (lineno+1, m.start(2))
+                e = '%d.%d' % (lineno+1, m.end(2))
+                self.grammarbox.tag_add('comment', s, e)
+            for m in re.finditer('[<>{}]', line):
+                if comment_start is not None and m.start() >= comment_start:
+                    break
+                s = '%d.%d' % (lineno+1, m.start())
+                e = '%d.%d' % (lineno+1, m.end())
+                if m.group() in '<>':
+                    self.grammarbox.tag_add('angle', s, e)
+                else:
+                    self.grammarbox.tag_add('brace', s, e)
+
+
+    def _grammarcheck(self, grammar):
+        if self.top is None: return
+        self.grammarbox.tag_remove('error', '1.0', 'end')
+        self._grammarcheck_errs = []
+        for lineno, line in enumerate(grammar.split('\n')):
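+            # Strip any trailing comment before trying to parse the rule.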
+            line = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', line)
+            line = line.strip()
+            if line:
+                try:
+                    RegexpChunkRule.fromstring(line)
+                except ValueError as e:
+                    self.grammarbox.tag_add('error', '%s.0' % (lineno+1),
+                                            '%s.0 lineend' % (lineno+1))
+        self.status['text'] = ''
+
+    def update(self, *event):
+        # Record when update was called (for grammarcheck)
+        if event:
+            self._last_keypress = time.time()
+
+        # Read the grammar from the Text box.
+        self.grammar = grammar = self.grammarbox.get('1.0', 'end')
+
+        # If the grammar hasn't changed, do nothing:
+        normalized_grammar = self.normalize_grammar(grammar)
+        if normalized_grammar == self.normalized_grammar:
+            return
+        else:
+            self.normalized_grammar = normalized_grammar
+
+        # If the grammar has changed, and we're looking at history,
+        # then stop looking at history.
+        if self._history_index < len(self._history)-1:
+            self.grammarlabel['text'] = 'Grammar:'
+
+        self._syntax_highlight_grammar(grammar)
+
+        # The grammar has changed; try parsing it.  If it doesn't
+        # parse, do nothing.  (flag error location?)
+        try:
+            # Note: the normalized grammar has no blank lines.
+            if normalized_grammar:
+                rules = [RegexpChunkRule.fromstring(line)
+                         for line in normalized_grammar.split('\n')]
+            else:
+                rules = []
+        except ValueError as e:
+            # Use the un-normalized grammar for error highlighting.
+            self._grammarcheck(grammar)
+            self.chunker = None
+            return
+
+        self.chunker = RegexpChunkParser(rules)
+        self.grammarbox.tag_remove('error', '1.0', 'end')
+        self.grammar_changed = time.time()
+        # Display the results
+        if self._showing_trace:
+            self.show_trace()
+        else:
+            self._highlight_devset()
+        # Start the eval demon
+        if not self._eval_demon_running:
+            self._eval_demon()
+
+    def _highlight_devset(self, sample=None):
+        if sample is None:
+            sample = self.devset[self.devset_index:self.devset_index+1]
+
+        self.devsetbox.tag_remove('true-pos', '1.0', 'end')
+        self.devsetbox.tag_remove('false-neg', '1.0', 'end')
+        self.devsetbox.tag_remove('false-pos', '1.0', 'end')
+
+        # Run the grammar on the test cases.
+        for sentnum, gold_tree in enumerate(sample):
+            # Run the chunk parser
+            test_tree = self._chunkparse(gold_tree.leaves())
+            # Extract gold & test chunks
+            gold_chunks = self._chunks(gold_tree)
+            test_chunks = self._chunks(test_tree)
+            # Compare them.
+            for chunk in gold_chunks.intersection(test_chunks):
+                self._color_chunk(sentnum, chunk, 'true-pos')
+            for chunk in gold_chunks - test_chunks:
+                self._color_chunk(sentnum, chunk, 'false-neg')
+            for chunk in test_chunks - gold_chunks:
+                self._color_chunk(sentnum, chunk, 'false-pos')
+
+    def _chunkparse(self, words):
+        try:
+            return self.chunker.parse(words)
+        except (ValueError, IndexError) as e:
+            # There's an error somewhere in the grammar, but we're not sure
+            # exactly where, so just mark the whole grammar as bad.
+            # E.g., this is caused by: "({<NN>})"
+            self.grammarbox.tag_add('error', '1.0', 'end')
+            # Treat it as tagging nothing:
+            return words
+
+    def _color_chunk(self, sentnum, chunk, tag):
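+        # Convert the (start, end) word span into Text-widget indices via
+        # linenum/charnum and apply the given highlight tag.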
+        start, end = chunk
+        self.devsetbox.tag_add(tag,
+            '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, start]),
+            '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, end]-1))
+
+    def reset(self):
+        # Clear various variables
+        self.chunker = None
+        self.grammar = None
+        self.normalized_grammar = None
+        self.grammar_changed = 0
+        self._history = []
+        self._history_index = 0
+        # Update the on-screen display.
+        self.grammarbox.delete('1.0', 'end')
+        self.show_devset(0)
+        self.update()
+        #self._eval_plot()
+
+    SAVE_GRAMMAR_TEMPLATE = (
+        '# Regexp Chunk Parsing Grammar\n'
+        '# Saved %(date)s\n'
+        '#\n'
+        '# Development set: %(devset)s\n'
+        '#   Precision: %(precision)s\n'
+        '#   Recall:    %(recall)s\n'
+        '#   F-score:   %(fscore)s\n\n'
+        '%(grammar)s\n')
+
+    def save_grammar(self, filename=None):
+        if not filename:
+            ftypes = [('Chunk Grammar', '.chunk'),
+                      ('All files', '*')]
+            filename = asksaveasfilename(filetypes=ftypes,
+                                                      defaultextension='.chunk')
+            if not filename: return
+        if (self._history and self.normalized_grammar ==
+            self.normalize_grammar(self._history[-1][0])):
+            precision, recall, fscore = ['%.2f%%' % (100*v) for v in
+                                         self._history[-1][1:]]
+        elif self.chunker is None:
+            precision = recall = fscore = 'Grammar not well formed'
+        else:
+            precision = recall = fscore = 'Not finished evaluation yet'
+
+        with open(filename, 'w') as outfile:
+            outfile.write(self.SAVE_GRAMMAR_TEMPLATE % dict(
+                date=time.ctime(), devset=self.devset_name,
+                precision=precision, recall=recall, fscore=fscore,
+                grammar=self.grammar.strip()))
+
+    def load_grammar(self, filename=None):
+        if not filename:
+            ftypes = [('Chunk Grammar', '.chunk'),
+                      ('All files', '*')]
+            filename = askopenfilename(filetypes=ftypes,
+                                                    defaultextension='.chunk')
+            if not filename: return
+        self.grammarbox.delete('1.0', 'end')
+        self.update()
+        with open(filename, 'r') as infile:
+            grammar = infile.read()
+        grammar = re.sub(r'^# Regexp Chunk Parsing Grammar[\s\S]*'
+                         r'F-score:.*\n', '', grammar).lstrip()
+        self.grammarbox.insert('1.0', grammar)
+        self.update()
+
+    def save_history(self, filename=None):
+        if not filename:
+            ftypes = [('Chunk Grammar History', '.txt'),
+                      ('All files', '*')]
+            filename = asksaveasfilename(filetypes=ftypes,
+                                                      defaultextension='.txt')
+            if not filename: return
+
+        with open(filename, 'w') as outfile:
+            outfile.write('# Regexp Chunk Parsing Grammar History\n')
+            outfile.write('# Saved %s\n' % time.ctime())
+            outfile.write('# Development set: %s\n' % self.devset_name)
+            for i, (g, p, r, f) in enumerate(self._history):
+                hdr = ('Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, '
+                       'fscore=%.2f%%)' % (i+1, len(self._history),
+                                           p*100, r*100, f*100))
+                outfile.write('\n%s\n' % hdr)
+                outfile.write(''.join('  %s\n' % line for line in g.strip().split('\n')))
+
+            if not (self._history and self.normalized_grammar ==
+                    self.normalize_grammar(self._history[-1][0])):
+                if self.chunker is None:
+                    outfile.write('\nCurrent Grammar (not well-formed)\n')
+                else:
+                    outfile.write('\nCurrent Grammar (not evaluated)\n')
+                outfile.write(''.join('  %s\n' % line for line
+                                  in self.grammar.strip().split('\n')))
+
+    def about(self, *e):
+        ABOUT = ("NLTK RegExp Chunk Parser Application\n"+
+                 "Written by Edward Loper")
+        TITLE = 'About: Regular Expression Chunk Parser Application'
+        try:
+            from six.moves.tkinter_messagebox import Message
+            Message(message=ABOUT, title=TITLE).show()
+        except:
+            ShowText(self.top, TITLE, ABOUT)
+
+    def set_devset_size(self, size=None):
+        if size is not None: self._devset_size.set(size)
+        self._devset_size.set(min(len(self.devset), self._devset_size.get()))
+        self.show_devset(1)
+        self.show_devset(0)
+        # what about history?  Evaluated at diff dev set sizes!
+
+    def resize(self, size=None):
+        if size is not None: self._size.set(size)
+        size = self._size.get()
+        self._font.configure(size=-(abs(size)))
+        self._smallfont.configure(size=min(-10, -(abs(size))*14//20))
+
+    def mainloop(self, *args, **kwargs):
+        """
+        Enter the Tkinter mainloop.  This function must be called if
+        this demo is created from a non-interactive program (e.g.
+        from a script); otherwise, the demo will close as soon as
+        the script completes.
+        """
+        if in_idle(): return
+        self.top.mainloop(*args, **kwargs)
+
+def app():
+    RegexpChunkApp().mainloop()
+
+if __name__ == '__main__':
+    app()
+
+__all__ = ['app']
diff --git a/nlp_resource_data/nltk/app/chunkparser_app.pyc b/nlp_resource_data/nltk/app/chunkparser_app.pyc
new file mode 100755 (executable)
index 0000000..4777811
Binary files /dev/null and b/nlp_resource_data/nltk/app/chunkparser_app.pyc differ
diff --git a/nlp_resource_data/nltk/app/collocations_app.py b/nlp_resource_data/nltk/app/collocations_app.py
new file mode 100755 (executable)
index 0000000..49cbb8b
--- /dev/null
@@ -0,0 +1,347 @@
+# Natural Language Toolkit: Collocations Application
+# Much of the GUI code is adapted from concordance.py; we intend to merge these tools together.
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+
+from __future__ import division
+
+import threading
+
+from six.moves import queue as q
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (Button, END, Frame, IntVar, LEFT, Label, Menu,
+                               OptionMenu, SUNKEN, Scrollbar, StringVar,
+                               Text, Tk)
+
+from nltk.corpus import (cess_cat, brown, nps_chat, treebank, sinica_treebank, alpino,
+                         indian, floresta, mac_morpho, machado, cess_esp)
+from nltk.util import in_idle
+from nltk.probability import FreqDist
+
+
+CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
+ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'
+POLL_INTERVAL = 100
+
+_DEFAULT = 'English: Brown Corpus (Humor)'
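+# NB All corpora are wrapped in lambda expressions so that they are not
+# loaded when the module is imported.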
+_CORPORA = {
+            'Catalan: CESS-CAT Corpus':
+                lambda: cess_cat.words(),
+            'English: Brown Corpus':
+                lambda: brown.words(),
+            'English: Brown Corpus (Press)':
+                lambda: brown.words(categories=['news', 'editorial', 'reviews']),
+            'English: Brown Corpus (Religion)':
+                lambda: brown.words(categories='religion'),
+            'English: Brown Corpus (Learned)':
+                lambda: brown.words(categories='learned'),
+            'English: Brown Corpus (Science Fiction)':
+                lambda: brown.words(categories='science_fiction'),
+            'English: Brown Corpus (Romance)':
+                lambda: brown.words(categories='romance'),
+            'English: Brown Corpus (Humor)':
+                lambda: brown.words(categories='humor'),
+            'English: NPS Chat Corpus':
+                lambda: nps_chat.words(),
+            'English: Wall Street Journal Corpus':
+                lambda: treebank.words(),
+            'Chinese: Sinica Corpus':
+                lambda: sinica_treebank.words(),
+            'Dutch: Alpino Corpus':
+                lambda: alpino.words(),
+            'Hindi: Indian Languages Corpus':
+                lambda: indian.words(files='hindi.pos'),
+            'Portuguese: Floresta Corpus (Portugal)':
+                lambda: floresta.words(),
+            'Portuguese: MAC-MORPHO Corpus (Brazil)':
+                lambda: mac_morpho.words(),
+            'Portuguese: Machado Corpus (Brazil)':
+                lambda: machado.words(),
+            'Spanish: CESS-ESP Corpus':
+                lambda: cess_esp.words()
+           }
+
+class CollocationsView:
+    _BACKGROUND_COLOUR='#FFF' #white
+
+    def __init__(self):
+        self.queue = q.Queue()
+        self.model = CollocationsModel(self.queue)
+        self.top = Tk()
+        self._init_top(self.top)
+        self._init_menubar()
+        self._init_widgets(self.top)
+        self.load_corpus(self.model.DEFAULT_CORPUS)
+        self.after = self.top.after(POLL_INTERVAL, self._poll)
+
+    def _init_top(self, top):
+        top.geometry('550x650+50+50')
+        top.title('NLTK Collocations List')
+        top.bind('<Control-q>', self.destroy)
+        top.protocol('WM_DELETE_WINDOW', self.destroy)
+        top.minsize(550,650)
+
+    def _init_widgets(self, parent):
+        self.main_frame = Frame(parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1))
+        self._init_corpus_select(self.main_frame)
+        self._init_results_box(self.main_frame)
+        self._init_paging(self.main_frame)
+        self._init_status(self.main_frame)
+        self.main_frame.pack(fill='both', expand=True)
+
+    def _init_corpus_select(self, parent):
+        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
+        self.var = StringVar(innerframe)
+        self.var.set(self.model.DEFAULT_CORPUS)
+        Label(innerframe, justify=LEFT, text=' Corpus: ', background=self._BACKGROUND_COLOUR, padx = 2, pady = 1, border = 0).pack(side='left')
+
+        other_corpora = list(self.model.CORPORA.keys()).remove(self.model.DEFAULT_CORPUS)
+        om = OptionMenu(innerframe, self.var, self.model.DEFAULT_CORPUS, command=self.corpus_selected, *self.model.non_default_corpora())
+        om['borderwidth'] = 0
+        om['highlightthickness'] = 1
+        om.pack(side='left')
+        innerframe.pack(side='top', fill='x', anchor='n')
+
+    def _init_status(self, parent):
+        self.status = Label(parent, justify=LEFT, relief=SUNKEN, background=self._BACKGROUND_COLOUR, border=0, padx = 1, pady = 0)
+        self.status.pack(side='top', anchor='sw')
+
+    def _init_menubar(self):
+        self._result_size = IntVar(self.top)
+        menubar = Menu(self.top)
+
+        filemenu = Menu(menubar, tearoff=0, borderwidth=0)
+        filemenu.add_command(label='Exit', underline=1,
+                   command=self.destroy, accelerator='Ctrl-q')
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
+
+        editmenu = Menu(menubar, tearoff=0)
+        rescntmenu = Menu(editmenu, tearoff=0)
+        rescntmenu.add_radiobutton(label='20', variable=self._result_size,
+                     underline=0, value=20, command=self.set_result_size)
+        rescntmenu.add_radiobutton(label='50', variable=self._result_size,
+                     underline=0, value=50, command=self.set_result_size)
+        rescntmenu.add_radiobutton(label='100', variable=self._result_size,
+                     underline=0, value=100, command=self.set_result_size)
+        rescntmenu.invoke(1)
+        editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu)
+
+        menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
+        self.top.config(menu=menubar)
+
+    def set_result_size(self, **kwargs):
+        self.model.result_count = self._result_size.get()
+
+    def _init_results_box(self, parent):
+        innerframe = Frame(parent)
+        i1 = Frame(innerframe)
+        i2 = Frame(innerframe)
+        vscrollbar = Scrollbar(i1, borderwidth=1)
+        hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
+        self.results_box = Text(i1,
+                    font=Font(family='courier', size='16'),
+                    state='disabled', borderwidth=1,
+                    yscrollcommand=vscrollbar.set,
+                    xscrollcommand=hscrollbar.set, wrap='none', width='40', height = '20', exportselection=1)
+        self.results_box.pack(side='left', fill='both', expand=True)
+        vscrollbar.pack(side='left', fill='y', anchor='e')
+        vscrollbar.config(command=self.results_box.yview)
+        hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
+        hscrollbar.config(command=self.results_box.xview)
+        # There is no other way to avoid the scrollbars overlapping while using the pack layout manager.
+        Label(i2, text='   ', background=self._BACKGROUND_COLOUR).pack(side='left', anchor='e')
+        i1.pack(side='top', fill='both', expand=True, anchor='n')
+        i2.pack(side='bottom', fill='x', anchor='s')
+        innerframe.pack(side='top', fill='both', expand=True)
+
+    def _init_paging(self, parent):
+        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
+        self.prev = prev = Button(innerframe, text='Previous', command=self.previous, width='10', borderwidth=1, highlightthickness=1, state='disabled')
+        prev.pack(side='left', anchor='center')
+        self.next = next = Button(innerframe, text='Next', command=self.__next__, width='10', borderwidth=1, highlightthickness=1, state='disabled')
+        next.pack(side='right', anchor='center')
+        innerframe.pack(side='top', fill='y')
+        self.reset_current_page()
+
+    def reset_current_page(self):
+        self.current_page = -1
+
+    def _poll(self):
+        try:
+            event = self.queue.get(block=False)
+        except q.Empty:
+            pass
+        else:
+            if event == CORPUS_LOADED_EVENT:
+                self.handle_corpus_loaded(event)
+            elif event == ERROR_LOADING_CORPUS_EVENT:
+                self.handle_error_loading_corpus(event)
+        self.after = self.top.after(POLL_INTERVAL, self._poll)
+
+    def handle_error_loading_corpus(self, event):
+        self.status['text'] = 'Error in loading ' + self.var.get()
+        self.unfreeze_editable()
+        self.clear_results_box()
+        self.freeze_editable()
+        self.reset_current_page()
+
+    def handle_corpus_loaded(self, event):
+        self.status['text'] = self.var.get() + ' is loaded'
+        self.unfreeze_editable()
+        self.clear_results_box()
+        self.reset_current_page()
+        #self.next()
+        collocations = self.model.next(self.current_page + 1)
+        self.write_results(collocations)
+        self.current_page += 1
+
+    def corpus_selected(self, *args):
+        new_selection = self.var.get()
+        self.load_corpus(new_selection)
+
+    def previous(self):
+        self.freeze_editable()
+        collocations = self.model.prev(self.current_page - 1)
+        self.current_page = self.current_page - 1
+        self.clear_results_box()
+        self.write_results(collocations)
+        self.unfreeze_editable()
+
+    def __next__(self):
+        self.freeze_editable()
+        collocations = self.model.next(self.current_page + 1)
+        self.clear_results_box()
+        self.write_results(collocations)
+        self.current_page += 1
+        self.unfreeze_editable()
+
+    def load_corpus(self, selection):
+        if self.model.selected_corpus != selection:
+            self.status['text'] = 'Loading ' + selection + '...'
+            self.freeze_editable()
+            self.model.load_corpus(selection)
+
+    def freeze_editable(self):
+        self.prev['state'] = 'disabled'
+        self.next['state'] = 'disabled'
+
+    def clear_results_box(self):
+        self.results_box['state'] = 'normal'
+        self.results_box.delete("1.0", END)
+        self.results_box['state'] = 'disabled'
+
+    def fire_event(self, event):
+        # Fire an event so that widget rendering happens in the mainloop thread.
+        self.top.event_generate(event, when='tail')
+
+    def destroy(self, *e):
+        if self.top is None: return
+        self.top.after_cancel(self.after)
+        self.top.destroy()
+        self.top = None
+
+    def mainloop(self, *args, **kwargs):
+        if in_idle(): return
+        self.top.mainloop(*args, **kwargs)
+
+    def unfreeze_editable(self):
+        self.set_paging_button_states()
+
+    def set_paging_button_states(self):
+        if self.current_page == -1 or self.current_page == 0:
+            self.prev['state'] = 'disabled'
+        else:
+            self.prev['state'] = 'normal'
+        if self.model.is_last_page(self.current_page):
+            self.next['state'] = 'disabled'
+        else:
+            self.next['state'] = 'normal'
+
+    def write_results(self, results):
+        self.results_box['state'] = 'normal'
+        row = 1
+        for each in results:
+            self.results_box.insert(str(row) + '.0', each[0] + " " + each[1] + "\n")
+            row += 1
+        self.results_box['state'] = 'disabled'
+
+class CollocationsModel:
+    def __init__(self, queue):
+        self.result_count = None
+        self.selected_corpus = None
+        self.collocations = None
+        self.CORPORA = _CORPORA
+        self.DEFAULT_CORPUS = _DEFAULT
+        self.queue = queue
+        self.reset_results()
+
+    def reset_results(self):
+        self.result_pages = []
+        self.results_returned = 0
+
+    def load_corpus(self, name):
+        self.selected_corpus = name
+        self.collocations = None
+        runner_thread = self.LoadCorpus(name, self)
+        runner_thread.start()
+        self.reset_results()
+
+    def non_default_corpora(self):
+        copy = []
+        copy.extend(list(self.CORPORA.keys()))
+        copy.remove(self.DEFAULT_CORPUS)
+        copy.sort()
+        return copy
+
+    def is_last_page(self, number):
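+        # Pages that were already fetched are never the last; otherwise check
+        # whether fetching up to this page would exhaust the collocation list.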
+        if number < len(self.result_pages):
+            return False
+        return self.results_returned + (number - len(self.result_pages)) * self.result_count >= len(self.collocations)
+
+    def next(self, page):
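+        # Fetch (and cache) result pages up to the requested page number.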
+        if (len(self.result_pages) - 1) < page:
+            for i in range(page - (len(self.result_pages) - 1)):
+                self.result_pages.append(self.collocations[self.results_returned:self.results_returned+self.result_count])
+                self.results_returned += self.result_count
+        return self.result_pages[page]
+
+    def prev(self, page):
+        if page == -1:
+            return []
+        return self.result_pages[page]
+
+    class LoadCorpus(threading.Thread):
+        def __init__(self, name, model):
+            threading.Thread.__init__(self)
+            self.model, self.name = model, name
+
+        def run(self):
+            try:
+                words = self.model.CORPORA[self.name]()
+                from operator import itemgetter
+                text = [w for w in words if len(w) > 2]
+                fd = FreqDist(tuple(text[i:i+2]) for i in range(len(text)-1))
+                vocab = FreqDist(text)
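+                # Score each adjacent bigram by freq(w1, w2)**3 / (freq(w1) * freq(w2)),
+                # which favours pairs that co-occur much more often than their
+                # individual frequencies would suggest.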
+                scored = [((w1,w2), fd[(w1,w2)] ** 3 / (vocab[w1] * vocab[w2])) for w1, w2 in fd]
+                scored.sort(key=itemgetter(1), reverse=True)
+                self.model.collocations = list(map(itemgetter(0), scored))
+                self.model.queue.put(CORPUS_LOADED_EVENT)
+            except Exception as e:
+                print(e)
+                self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)
+
+#def collocations():
+#    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]]
+
+def app():
+    c = CollocationsView()
+    c.mainloop()
+
+if __name__ == '__main__':
+    app()
+
+__all__ = ['app']
diff --git a/nlp_resource_data/nltk/app/collocations_app.pyc b/nlp_resource_data/nltk/app/collocations_app.pyc
new file mode 100755 (executable)
index 0000000..b21f97c
Binary files /dev/null and b/nlp_resource_data/nltk/app/collocations_app.pyc differ
diff --git a/nlp_resource_data/nltk/app/concordance_app.py b/nlp_resource_data/nltk/app/concordance_app.py
new file mode 100755 (executable)
index 0000000..53c7167
--- /dev/null
@@ -0,0 +1,568 @@
+# Natural Language Toolkit: Concordance Application
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+
+import nltk.compat
+import re
+import threading
+
+from six.moves import queue as q
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (Tk, Button, END, Entry, Frame, IntVar, LEFT,
+                               Label, Menu, OptionMenu, SUNKEN, Scrollbar,
+                               StringVar, Text)
+
+from nltk.corpus import (cess_cat, brown, nps_chat, treebank, sinica_treebank,
+                         alpino, indian, floresta, mac_morpho, cess_esp)
+from nltk.util import in_idle
+from nltk.draw.util import ShowText
+
+WORD_OR_TAG = '[^/ ]+'
+BOUNDARY = r'\b'
+
+CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
+SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>'
+SEARCH_ERROR_EVENT = '<<SE_EVENT>>'
+ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'
+
+POLL_INTERVAL = 50
+
+# NB All corpora must be specified in a lambda expression so as not to be
+# loaded when the module is imported.
+
+_DEFAULT = 'English: Brown Corpus (Humor, simplified)'
+_CORPORA = {
+            'Catalan: CESS-CAT Corpus (simplified)':
+                lambda: cess_cat.tagged_sents(tagset='universal'),
+            'English: Brown Corpus':
+                lambda: brown.tagged_sents(),
+            'English: Brown Corpus (simplified)':
+                lambda: brown.tagged_sents(tagset='universal'),
+            'English: Brown Corpus (Press, simplified)':
+                lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], tagset='universal'),
+            'English: Brown Corpus (Religion, simplified)':
+                lambda: brown.tagged_sents(categories='religion', tagset='universal'),
+            'English: Brown Corpus (Learned, simplified)':
+                lambda: brown.tagged_sents(categories='learned', tagset='universal'),
+            'English: Brown Corpus (Science Fiction, simplified)':
+                lambda: brown.tagged_sents(categories='science_fiction', tagset='universal'),
+            'English: Brown Corpus (Romance, simplified)':
+                lambda: brown.tagged_sents(categories='romance', tagset='universal'),
+            'English: Brown Corpus (Humor, simplified)':
+                lambda: brown.tagged_sents(categories='humor', tagset='universal'),
+            'English: NPS Chat Corpus':
+                lambda: nps_chat.tagged_posts(),
+            'English: NPS Chat Corpus (simplified)':
+                lambda: nps_chat.tagged_posts(tagset='universal'),
+            'English: Wall Street Journal Corpus':
+                lambda: treebank.tagged_sents(),
+            'English: Wall Street Journal Corpus (simplified)':
+                lambda: treebank.tagged_sents(tagset='universal'),
+            'Chinese: Sinica Corpus':
+                lambda: sinica_treebank.tagged_sents(),
+            'Chinese: Sinica Corpus (simplified)':
+                lambda: sinica_treebank.tagged_sents(tagset='universal'),
+            'Dutch: Alpino Corpus':
+                lambda: alpino.tagged_sents(),
+            'Dutch: Alpino Corpus (simplified)':
+                lambda: alpino.tagged_sents(tagset='universal'),
+            'Hindi: Indian Languages Corpus':
+                lambda: indian.tagged_sents(files='hindi.pos'),
+            'Hindi: Indian Languages Corpus (simplified)':
+                lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
+            'Portuguese: Floresta Corpus (Portugal)':
+                lambda: floresta.tagged_sents(),
+            'Portuguese: Floresta Corpus (Portugal, simplified)':
+                lambda: floresta.tagged_sents(tagset='universal'),
+            'Portuguese: MAC-MORPHO Corpus (Brazil)':
+                lambda: mac_morpho.tagged_sents(),
+            'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
+                lambda: mac_morpho.tagged_sents(tagset='universal'),
+            'Spanish: CESS-ESP Corpus (simplified)':
+                lambda: cess_esp.tagged_sents(tagset='universal'),
+           }
+
+class ConcordanceSearchView(object):
+    _BACKGROUND_COLOUR='#FFF' #white
+
+    #Colour of highlighted results
+    _HIGHLIGHT_WORD_COLOUR='#F00' #red
+    _HIGHLIGHT_WORD_TAG='HL_WRD_TAG'
+
+    _HIGHLIGHT_LABEL_COLOUR='#C0C0C0' # light grey
+    _HIGHLIGHT_LABEL_TAG='HL_LBL_TAG'
+
+
+    #Percentage of text left of the scrollbar position
+    _FRACTION_LEFT_TEXT=0.30
+
+    def __init__(self):
+        self.queue = q.Queue()
+        self.model = ConcordanceSearchModel(self.queue)
+        self.top = Tk()
+        self._init_top(self.top)
+        self._init_menubar()
+        self._init_widgets(self.top)
+        self.load_corpus(self.model.DEFAULT_CORPUS)
+        self.after = self.top.after(POLL_INTERVAL, self._poll)
+
+    def _init_top(self, top):
+        top.geometry('950x680+50+50')
+        top.title('NLTK Concordance Search')
+        top.bind('<Control-q>', self.destroy)
+        top.protocol('WM_DELETE_WINDOW', self.destroy)
+        top.minsize(950,680)
+
+    def _init_widgets(self, parent):
+        self.main_frame = Frame(parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1))
+        self._init_corpus_select(self.main_frame)
+        self._init_query_box(self.main_frame)
+        self._init_results_box(self.main_frame)
+        self._init_paging(self.main_frame)
+        self._init_status(self.main_frame)
+        self.main_frame.pack(fill='both', expand=True)
+
+    def _init_menubar(self):
+        self._result_size = IntVar(self.top)
+        self._cntx_bf_len = IntVar(self.top)
+        self._cntx_af_len = IntVar(self.top)
+        menubar = Menu(self.top)
+
+        filemenu = Menu(menubar, tearoff=0, borderwidth=0)
+        filemenu.add_command(label='Exit', underline=1,
+                             command=self.destroy, accelerator='Ctrl-q')
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
+
+        editmenu = Menu(menubar, tearoff=0)
+        rescntmenu = Menu(editmenu, tearoff=0)
+        rescntmenu.add_radiobutton(label='20', variable=self._result_size,
+                                   underline=0, value=20,
+                                   command=self.set_result_size)
+        rescntmenu.add_radiobutton(label='50', variable=self._result_size,
+                                   underline=0, value=50,
+                                   command=self.set_result_size)
+        rescntmenu.add_radiobutton(label='100', variable=self._result_size,
+                                   underline=0, value=100,
+                                   command=self.set_result_size)
+        rescntmenu.invoke(1)
+        editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu)
+
+        cntxmenu = Menu(editmenu, tearoff=0)
+        cntxbfmenu = Menu(cntxmenu, tearoff=0)
+        cntxbfmenu.add_radiobutton(label='60 characters',
+                                   variable=self._cntx_bf_len,
+                                   underline=0, value=60,
+                                   command=self.set_cntx_bf_len)
+        cntxbfmenu.add_radiobutton(label='80 characters',
+                                   variable=self._cntx_bf_len,
+                                   underline=0, value=80,
+                                   command=self.set_cntx_bf_len)
+        cntxbfmenu.add_radiobutton(label='100 characters',
+                                   variable=self._cntx_bf_len,
+                                   underline=0, value=100,
+                                   command=self.set_cntx_bf_len)
+        cntxbfmenu.invoke(1)
+        cntxmenu.add_cascade(label='Before', underline=0, menu=cntxbfmenu)
+
+        cntxafmenu = Menu(cntxmenu, tearoff=0)
+        cntxafmenu.add_radiobutton(label='70 characters',
+                                   variable=self._cntx_af_len,
+                                   underline=0, value=70,
+                                   command=self.set_cntx_af_len)
+        cntxafmenu.add_radiobutton(label='90 characters',
+                                   variable=self._cntx_af_len,
+                                   underline=0, value=90,
+                                   command=self.set_cntx_af_len)
+        cntxafmenu.add_radiobutton(label='110 characters',
+                                   variable=self._cntx_af_len,
+                                   underline=0, value=110,
+                                   command=self.set_cntx_af_len)
+        cntxafmenu.invoke(1)
+        cntxmenu.add_cascade(label='After', underline=0, menu=cntxafmenu)
+
+        editmenu.add_cascade(label='Context', underline=0, menu=cntxmenu)
+
+        menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
+
+        self.top.config(menu=menubar)
+
+    def set_result_size(self, **kwargs):
+        self.model.result_count = self._result_size.get()
+
+    def set_cntx_af_len(self, **kwargs):
+        self._char_after = self._cntx_af_len.get()
+
+    def set_cntx_bf_len(self, **kwargs):
+        self._char_before = self._cntx_bf_len.get()
+
+    def _init_corpus_select(self, parent):
+        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
+        self.var = StringVar(innerframe)
+        self.var.set(self.model.DEFAULT_CORPUS)
+        Label(innerframe, justify=LEFT, text=' Corpus: ',
+              background=self._BACKGROUND_COLOUR, padx = 2, pady = 1, border = 0).pack(side='left')
+
+        other_corpora = list(self.model.CORPORA.keys()).remove(self.model.DEFAULT_CORPUS)
+        om = OptionMenu(innerframe, self.var, self.model.DEFAULT_CORPUS, command=self.corpus_selected, *self.model.non_default_corpora())
+        om['borderwidth'] = 0
+        om['highlightthickness'] = 1
+        om.pack(side='left')
+        innerframe.pack(side='top', fill='x', anchor='n')
+
+    def _init_status(self, parent):
+        self.status = Label(parent, justify=LEFT, relief=SUNKEN, background=self._BACKGROUND_COLOUR, border=0, padx = 1, pady = 0)
+        self.status.pack(side='top', anchor='sw')
+
+    def _init_query_box(self, parent):
+        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
+        another = Frame(innerframe, background=self._BACKGROUND_COLOUR)
+        self.query_box = Entry(another, width=60)
+        self.query_box.pack(side='left', fill='x', pady=25, anchor='center')
+        self.search_button = Button(another, text='Search', command=self.search, borderwidth=1, highlightthickness=1)
+        self.search_button.pack(side='left', fill='x', pady=25, anchor='center')
+        self.query_box.bind('<KeyPress-Return>', self.search_enter_keypress_handler)
+        another.pack()
+        innerframe.pack(side='top', fill='x', anchor='n')
+
+    def search_enter_keypress_handler(self, *event):
+        self.search()
+
+    def _init_results_box(self, parent):
+        innerframe = Frame(parent)
+        i1 = Frame(innerframe)
+        i2 = Frame(innerframe)
+        vscrollbar = Scrollbar(i1, borderwidth=1)
+        hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
+        self.results_box = Text(i1,
+                                font=Font(family='courier', size='16'),
+                                state='disabled', borderwidth=1,
+                                yscrollcommand=vscrollbar.set,
+                                xscrollcommand=hscrollbar.set, wrap='none', width='40', height='20', exportselection=1)
+        self.results_box.pack(side='left', fill='both', expand=True)
+        self.results_box.tag_config(self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR)
+        self.results_box.tag_config(self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR)
+        vscrollbar.pack(side='left', fill='y', anchor='e')
+        vscrollbar.config(command=self.results_box.yview)
+        hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
+        hscrollbar.config(command=self.results_box.xview)
+        # There is no other way to avoid the scrollbars overlapping while using the pack layout manager.
+        Label(i2, text='   ', background=self._BACKGROUND_COLOUR).pack(side='left', anchor='e')
+        i1.pack(side='top', fill='both', expand=True, anchor='n')
+        i2.pack(side='bottom', fill='x', anchor='s')
+        innerframe.pack(side='top', fill='both', expand=True)
+
+    def _init_paging(self, parent):
+        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
+        self.prev = prev = Button(innerframe, text='Previous', command=self.previous, width='10', borderwidth=1, highlightthickness=1, state='disabled')
+        prev.pack(side='left', anchor='center')
+        self.next = next = Button(innerframe, text='Next', command=self.__next__, width='10', borderwidth=1, highlightthickness=1, state='disabled')
+        next.pack(side='right', anchor='center')
+        innerframe.pack(side='top', fill='y')
+        self.current_page = 0
+
+    def previous(self):
+        self.clear_results_box()
+        self.freeze_editable()
+        self.model.prev(self.current_page - 1)
+
+    def __next__(self):
+        self.clear_results_box()
+        self.freeze_editable()
+        self.model.next(self.current_page + 1)
+
+    def about(self, *e):
+        ABOUT = ("NLTK Concordance Search Demo\n")
+        TITLE = 'About: NLTK Concordance Search Demo'
+        try:
+            from six.moves.tkinter_messagebox import Message
+            Message(message=ABOUT, title=TITLE, parent=self.main_frame).show()
+        except:
+            ShowText(self.top, TITLE, ABOUT)
+
+    def _bind_event_handlers(self):
+        self.top.bind(CORPUS_LOADED_EVENT, self.handle_corpus_loaded)
+        self.top.bind(SEARCH_TERMINATED_EVENT, self.handle_search_terminated)
+        self.top.bind(SEARCH_ERROR_EVENT, self.handle_search_error)
+        self.top.bind(ERROR_LOADING_CORPUS_EVENT, self.handle_error_loading_corpus)
+
+    def _poll(self):
+        try:
+            event = self.queue.get(block=False)
+        except q.Empty:
+            pass
+        else:
+            if event == CORPUS_LOADED_EVENT:
+                self.handle_corpus_loaded(event)
+            elif event == SEARCH_TERMINATED_EVENT:
+                self.handle_search_terminated(event)
+            elif event == SEARCH_ERROR_EVENT:
+                self.handle_search_error(event)
+            elif event == ERROR_LOADING_CORPUS_EVENT:
+                self.handle_error_loading_corpus(event)
+        self.after = self.top.after(POLL_INTERVAL, self._poll)
+
+    def handle_error_loading_corpus(self, event):
+        self.status['text'] = 'Error in loading ' + self.var.get()
+        self.unfreeze_editable()
+        self.clear_all()
+        self.freeze_editable()
+
+    def handle_corpus_loaded(self, event):
+        self.status['text'] = self.var.get() + ' is loaded'
+        self.unfreeze_editable()
+        self.clear_all()
+        self.query_box.focus_set()
+
+    def handle_search_terminated(self, event):
+        # TODO: refactor the model so that it is less state-sensitive.
+        results = self.model.get_results()
+        self.write_results(results)
+        self.status['text'] = ''
+        if len(results) == 0:
+            self.status['text'] = 'No results found for ' + self.model.query
+        else:
+            self.current_page = self.model.last_requested_page
+        self.unfreeze_editable()
+        self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT)
+
+    def handle_search_error(self, event):
+        self.status['text'] = 'Error in query ' + self.model.query
+        self.unfreeze_editable()
+
+    def corpus_selected(self, *args):
+        new_selection = self.var.get()
+        self.load_corpus(new_selection)
+
+    def load_corpus(self, selection):
+        if self.model.selected_corpus != selection:
+            self.status['text'] = 'Loading ' + selection + '...'
+            self.freeze_editable()
+            self.model.load_corpus(selection)
+
+    def search(self):
+        self.current_page = 0
+        self.clear_results_box()
+        self.model.reset_results()
+        query = self.query_box.get()
+        if (len(query.strip()) == 0): return
+        self.status['text'] = 'Searching for ' + query
+        self.freeze_editable()
+        self.model.search(query, self.current_page + 1)
+
+
+    def write_results(self, results):
+        self.results_box['state'] = 'normal'
+        row = 1
+        for each in results:
+            sent, pos1, pos2 = each[0].strip(), each[1], each[2]
+            if len(sent) != 0:
+                if (pos1 < self._char_before):
+                    sent, pos1, pos2 = self.pad(sent, pos1, pos2)
+                sentence = sent[pos1-self._char_before:pos1+self._char_after]
+                if not row == len(results):
+                    sentence += '\n'
+                self.results_box.insert(str(row) + '.0', sentence)
+                word_markers, label_markers = self.words_and_labels(sent, pos1, pos2)
+                for marker in word_markers: self.results_box.tag_add(self._HIGHLIGHT_WORD_TAG, str(row) + '.' + str(marker[0]), str(row) + '.' + str(marker[1]))
+                for marker in label_markers: self.results_box.tag_add(self._HIGHLIGHT_LABEL_TAG, str(row) + '.' + str(marker[0]), str(row) + '.' + str(marker[1]))
+                row += 1
+        self.results_box['state'] = 'disabled'
+
+    def words_and_labels(self, sentence, pos1, pos2):
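+        # Compute (start, end) character offsets, relative to the displayed
+        # row, for each word and each POS label in the matched region so the
+        # two can be highlighted with different tags.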
+        search_exp = sentence[pos1:pos2]
+        words, labels = [], []
+        labeled_words = search_exp.split(' ')
+        index = 0
+        for each in labeled_words:
+            if each == '':
+                index += 1
+            else:
+                word, label = each.split('/')
+                words.append((self._char_before + index, self._char_before + index + len(word)))
+                index += len(word) + 1
+                labels.append((self._char_before + index, self._char_before + index + len(label)))
+                index += len(label)
+            index += 1
+        return words, labels
+
+    def pad(self, sent, hstart, hend):
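+        # Left-pad the sentence with spaces when the match starts too close
+        # to the beginning, so that the context window before it is full width.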
+        if hstart >= self._char_before:
+            return sent, hstart, hend
+        d = self._char_before - hstart
+        sent = ''.join([' '] * d) + sent
+        return sent, hstart + d, hend + d
+
+    def destroy(self, *e):
+        if self.top is None: return
+        self.top.after_cancel(self.after)
+        self.top.destroy()
+        self.top = None
+
+    def clear_all(self):
+        self.query_box.delete(0, END)
+        self.model.reset_query()
+        self.clear_results_box()
+
+    def clear_results_box(self):
+        self.results_box['state'] = 'normal'
+        self.results_box.delete("1.0", END)
+        self.results_box['state'] = 'disabled'
+
+    def freeze_editable(self):
+        self.query_box['state'] = 'disabled'
+        self.search_button['state'] = 'disabled'
+        self.prev['state'] = 'disabled'
+        self.next['state'] = 'disabled'
+
+    def unfreeze_editable(self):
+        self.query_box['state'] = 'normal'
+        self.search_button['state'] = 'normal'
+        self.set_paging_button_states()
+
+    def set_paging_button_states(self):
+        if self.current_page == 0 or self.current_page == 1:
+            self.prev['state'] = 'disabled'
+        else:
+            self.prev['state'] = 'normal'
+        if self.model.has_more_pages(self.current_page):
+            self.next['state'] = 'normal'
+        else:
+            self.next['state'] = 'disabled'
+
+    def fire_event(self, event):
+        # Fire an event so that widget rendering happens in the mainloop thread.
+        self.top.event_generate(event, when='tail')
+
+    def mainloop(self, *args, **kwargs):
+        if in_idle(): return
+        self.top.mainloop(*args, **kwargs)
+
+class ConcordanceSearchModel(object):
+    def __init__(self, queue):
+        self.queue = queue
+        self.CORPORA = _CORPORA
+        self.DEFAULT_CORPUS = _DEFAULT
+        self.selected_corpus = None
+        self.reset_query()
+        self.reset_results()
+        self.result_count = None
+        self.last_sent_searched = 0
+
+    def non_default_corpora(self):
+        copy = []
+        copy.extend(list(self.CORPORA.keys()))
+        copy.remove(self.DEFAULT_CORPUS)
+        copy.sort()
+        return copy
+
+    def load_corpus(self, name):
+        self.selected_corpus = name
+        self.tagged_sents = []
+        runner_thread = self.LoadCorpus(name, self)
+        runner_thread.start()
+
+    def search(self, query, page):
+        self.query = query
+        self.last_requested_page = page
+        self.SearchCorpus(self, page, self.result_count).start()
+
+    def next(self, page):
+        self.last_requested_page = page
+        if len(self.results) < page:
+            self.search(self.query, page)
+        else:
+            self.queue.put(SEARCH_TERMINATED_EVENT)
+
+    def prev(self, page):
+        self.last_requested_page = page
+        self.queue.put(SEARCH_TERMINATED_EVENT)
+
+    def reset_results(self):
+        self.last_sent_searched = 0
+        self.results = []
+        self.last_page = None
+
+    def reset_query(self):
+        self.query = None
+
+    def set_results(self, page, resultset):
+        self.results.insert(page - 1, resultset)
+
+    def get_results(self):
+        return self.results[self.last_requested_page - 1]
+
+    def has_more_pages(self, page):
+        if self.results == [] or self.results[0] == []:
+            return False
+        if self.last_page is None:
+            return True
+        return page < self.last_page
+
+    class LoadCorpus(threading.Thread):
+        def __init__(self, name, model):
+            threading.Thread.__init__(self)
+            self.model, self.name = model, name
+
+        def run(self):
+            try:
+                ts = self.model.CORPORA[self.name]()
+                self.model.tagged_sents = [' '.join(w+'/'+t for (w,t) in sent) for sent in ts]
+                self.model.queue.put(CORPUS_LOADED_EVENT)
+            except Exception as e:
+                print(e)
+                self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)
+
+    class SearchCorpus(threading.Thread):
+        def __init__(self, model, page, count):
+            self.model, self.count, self.page = model, count, page
+            threading.Thread.__init__(self)
+
+        def run(self):
+            q = self.processed_query()
+            sent_pos, i, sent_count = [], 0, 0
+            for sent in self.model.tagged_sents[self.model.last_sent_searched:]:
+                try:
+                    m = re.search(q, sent)
+                except re.error:
+                    self.model.reset_results()
+                    self.model.queue.put(SEARCH_ERROR_EVENT)
+                    return
+                if m:
+                    sent_pos.append((sent, m.start(), m.end()))
+                    i += 1
+                    if i > self.count:
+                        self.model.last_sent_searched += sent_count - 1
+                        break
+                sent_count += 1
+            if (self.count >= len(sent_pos)):
+                self.model.last_sent_searched += sent_count - 1
+                self.model.last_page = self.page
+                self.model.set_results(self.page, sent_pos)
+            else:
+                self.model.set_results(self.page, sent_pos[:-1])
+            self.model.queue.put(SEARCH_TERMINATED_EVENT)
+
+        def processed_query(self):
+            new = []
+            for term in self.model.query.split():
+                term = re.sub(r'\.', r'[^/ ]', term)
+                if re.match('[A-Z]+$', term):
+                    new.append(BOUNDARY + WORD_OR_TAG + '/' + term + BOUNDARY)
+                elif '/' in term:
+                    new.append(BOUNDARY + term + BOUNDARY)
+                else:
+                    new.append(BOUNDARY + term + '/' + WORD_OR_TAG + BOUNDARY)
+            return ' '.join(new)
+
+def app():
+    d = ConcordanceSearchView()
+    d.mainloop()
+
+if __name__ == '__main__':
+    app()
+
+__all__ = ['app']
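
For reference, a minimal non-GUI sketch of the query-to-regex step performed by SearchCorpus.processed_query above. BOUNDARY and WORD_OR_TAG are module-level constants defined earlier in this file; the values shown here are assumptions for illustration only.

import re

BOUNDARY = r'\b'        # assumed value of the module-level constant
WORD_OR_TAG = '[^/ ]+'  # assumed value of the module-level constant

def processed_query(query):
    # Mirror SearchCorpus.processed_query: each query term becomes a regex
    # fragment over "word/TAG" tokens.
    new = []
    for term in query.split():
        term = re.sub(r'\.', r'[^/ ]', term)   # '.' stands for any single character
        if re.match('[A-Z]+$', term):          # bare tag, e.g. "DT"
            new.append(BOUNDARY + WORD_OR_TAG + '/' + term + BOUNDARY)
        elif '/' in term:                      # explicit word/tag pair
            new.append(BOUNDARY + term + BOUNDARY)
        else:                                  # bare word, e.g. "dog"
            new.append(BOUNDARY + term + '/' + WORD_OR_TAG + BOUNDARY)
    return ' '.join(new)

pattern = processed_query('DT dog')
print(pattern)                                               # \b[^/ ]+/DT\b \bdog/[^/ ]+\b
print(bool(re.search(pattern, 'the/DT dog/NN barked/VBD')))  # True

A bare tag matches any word carrying that tag, a bare word matches it under any tag, and an explicit word/TAG pair is matched literally.
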
diff --git a/nlp_resource_data/nltk/app/concordance_app.pyc b/nlp_resource_data/nltk/app/concordance_app.pyc
new file mode 100755 (executable)
index 0000000..bca3c83
Binary files /dev/null and b/nlp_resource_data/nltk/app/concordance_app.pyc differ
diff --git a/nlp_resource_data/nltk/app/nemo_app.py b/nlp_resource_data/nltk/app/nemo_app.py
new file mode 100755 (executable)
index 0000000..4b142fc
--- /dev/null
@@ -0,0 +1,156 @@
+# Finding (and Replacing) Nemo, Version 1.1, Aristide Grange 2006/06/06
+# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496783
+
+"""
+Finding (and Replacing) Nemo
+
+Instant Regular Expressions
+Created by Aristide Grange
+"""
+
+from six.moves.tkinter import (Frame, Label, PhotoImage, Scrollbar, Text, Tk,
+                               SEL_FIRST, SEL_LAST)
+import re
+import itertools
+
+windowTitle = "Finding (and Replacing) Nemo"
+initialFind = r"n(.*?)e(.*?)m(.*?)o"
+initialRepl = r"M\1A\2K\3I"
+initialText = """\
+Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+"""
+images = {
+    "FIND":"R0lGODlhMAAiAPcAMf/////37//35//n1v97Off///f/9/f37/fexvfOvfeEQvd7QvdrQvdrKfdaKfdSMfdSIe/v9+/v7+/v5+/n3u/e1u/Wxu/Gre+1lO+tnO+thO+Ua+97Y+97Oe97Me9rOe9rMe9jOe9jMe9jIe9aMefe5+fe3ufezuece+eEWudzQudaIedSIedKMedKIedCKedCId7e1t7Wzt7Oxt7Gvd69vd69rd61pd6ljN6UjN6Ue96EY95zY95rUt5rQt5jMd5SId5KIdbn59be3tbGztbGvda1rdaEa9Z7a9Z7WtZzQtZzOdZzMdZjMdZaQtZSOdZSMdZKMdZCKdZCGNY5Ic7W1s7Oxs7Gtc69xs69tc69rc6tpc6llM6clM6cjM6Ue86EY85zWs5rSs5SKc5KKc5KGMa1tcatrcalvcalnMaUpcZ7c8ZzMcZrUsZrOcZrMcZaQsZSOcZSMcZKMcZCKcZCGMYxIcYxGL3Gxr21tb21rb2lpb2crb2cjL2UnL2UlL2UhL2Ec717Wr17Ur1zWr1rMb1jUr1KMb1KIb1CIb0xGLWlrbWlpbWcnLWEe7V7c7VzY7VzUrVSKbVKMbVCMbVCIbU5KbUxIbUxEK2lta2lpa2clK2UjK2MnK2MlK2Ea617e61za61rY61rMa1jSq1aUq1aSq1SQq1KKa0xEKWlnKWcnKWUnKWUhKWMjKWEa6Vza6VrWqVjMaVaUqVaKaVSMaVCMaU5KaUxIaUxGJyclJyMe5yElJyEhJx7e5x7c5xrOZxaQpxSOZxKQpw5IZSMhJSEjJR7c5Rre5RrY5RrUpRSQpRSKZRCOZRCKZQxKZQxIYyEhIx7hIxza4xzY4xrc4xjUoxaa4xaUoxSSoxKQoxCMYw5GIR7c4Rzc4Rre4RjY4RjWoRaa4RSWoRSUoRSMYRKQoRCOYQ5KYQxIXtra3taY3taSntKOXtCMXtCKXNCMXM5MXMxIWtSUmtKSmtKQmtCOWs5MWs5KWs5IWNCKWMxIVIxKUIQCDkhGAAAACH+AS4ALAAAAAAwACIAAAj/AAEIHEiwoMGDCBMqXMiwoUOHMqxIeEiRoZVp7cpZ29WrF4WKIAd208dGAQEVbiTVChUjZMU9+pYQmPmBZpxgvVw+nDdKwQICNVcIXQEkTgKdDdUJ+/nggVAXK1xI3TEA6UIr2uJ8iBqka1cXXTlkqGoVYRZ7iLyqBSs0iiEtZQVKiDGxBI1u3NR6lUpGDKg8MSgEQCphU7Z22vhg0dILXRCpYLuSCcYJT4wqXASBQaBzU7klHxC127OHD7ZDJFpERqRt0x5OnwQpmZmCLEhrbgg4WIHO1RY+nbQ9WRGEDJlmnXwJ+9FBgXMCIzYMVijBBgYMFxIMqJBMSc0Ht7qh/+Gjpte2rnYsYeNlasWIBgQ6yCewIoPCCp/cyP/wgUGbXVu0QcADZNBDnh98gHMLGXYQUw02w61QU3wdbNWDbQVVIIhMMwFF1DaZiPLBAy7E04kafrjSizaK3LFNNc0AAYRQDsAHHQlJ2IDQJ2zE1+EKDjiAijShkECCC8Qgw4cr7ZgyzC2WaHPNLWWoNeNWPiRAw0QFWQFMhz8C+QQ20yAiVSrY+MGOJCsccsst2GCzoHFxxEGGC+8hgs0MB2kyCpgzrUDCbs1Es41UdtATHFFkWELMOtsoQsYcgvRRQw5RSDgGOjZMR1AvPQIq6KCo9AKOJWDd48owQlHR4DXEKP9iyRrK+DNNBTu4RwIPFeTAGUG7hAomkA84gEg1m6ADljy9PBKGGJY4ig0xlsTBRSn98FOFDUC8pwQOPkgHbCGAzhTkA850s0c7j6Hjix9+gBIrMXLeAccWXUCyiRBcBEECdEJ98KtAqtBCYQc/OvDENnl4gYpUxISCIjjzylkGGV9okYUVNogRhAOBuuAEhjG08wOgDYzAgA5bCjIoCe5uwUk80RKTTSppPREGGGCIISOQ9AXBg6cC6WIywvCpoMHAocRBwhP4bHLFLujYkV42xNxBRhAyGrc113EgYtRBerDDDHMoDCyQEL5sE083EkgwQyBhxGFHMM206DUixGxmE0wssbQjCQ4JCaFKFwgQTVAVVhQUwAVPIFJKrHfYYRwi6OCDzzuIJIFhXAD0EccPsYRiSyqKSDpFcWSMIcZRoBMkQyA2BGZDIKSYcggih8TRRg4VxM5QABVYYLxgwiev/PLMCxQQADs=",
+    "find":"R0lGODlhMAAiAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OSkpKRgYGAAAAAAAAAAAAAAAAAAAACH+AS4ALAAAAAAwACIAAAX/ICCOZGmeaKquY2AGLiuvMCAUBuHWc48Kh0iFInEYCb4kSQCxPBiMxkMigRQEgJiSFVBYHNGG0RiZOHjblWAiiY4fkDhEYoBp06dAWfyAQyKAgAwDaHgnB0RwgYASgQ0IhDuGJDAIFhMRVFSLEX8QCJJ4AQM5AgQHTZqqjBAOCQQEkWkCDRMUFQsICQ4Vm5maEwwHOAsPDTpKMAsUDlO4CssTcb+2DAp8YGCyNFoCEsZwFQ3QDRTTVBRS0g1QbgsCd5QAAwgIBwYFAwStzQ8UEdCKVchky0yVBw7YuXkAKt4IAg74vXHVagqFBRgXSCAyYWAVCH0SNhDTitCJfSL5/4RbAPKPhQYYjVCYYAvCP0BxEDaD8CheAAHNwqh8MMGPSwgLeJWhwHSjqkYI+xg4MMCEgQjtRvZ7UAYCpghMF7CxONOWJkYR+rCpY4JlVpVxKDwYWEactKW9mhYRtqCTgwgWEMArERSK1j5q//6T8KXonFsShpiJkAECgQYVjykooCVA0JGHEWNiYCHThTFeb3UkoiCCBgwGEKQ1kuAJlhFwhA71h5SukwUM5qqeCSGBgicEWkfNiWSERtBad4JNIBaQBaQah1ToyGZBAnsIuIJs1qnqiAIVjIE2gnAB1T5x0icgzXT79ipgMOOEH6HBbREBMJCeGEY08IoLAkzB1YYFwjxwSUGSNULQJnNUwRYlCcyEkALIxECAP9cNMMABYpRhy3ZsSLDaR70oUAiABGCkAxowCGCAAfDYIQACXoElGRsdXWDBdg2Y90IWktDYGYAB9PWHP0PMdFZaF07SQgAFNDAMAQg0QA1UC8xoZQl22JGFPgWkOUCOL1pZQyhjxinnnCWEAAA7",
+    "REPL":"R0lGODlhMAAjAPcAMf/////3//+lOf+UKf+MEPf///f39/f35/fv7/ecQvecOfecKfeUIfeUGPeUEPeUCPeMAO/37+/v9+/v3u/n3u/n1u+9jO+9c++1hO+ta++tY++tWu+tUu+tSu+lUu+lQu+lMe+UMe+UKe+UGO+UEO+UAO+MCOfv5+fvxufn7+fn5+fnzue9lOe9c+e1jOe1e+e1c+e1a+etWuetUuelQuecOeeUUueUCN7e597e3t7e1t7ezt7evd7Wzt7Oxt7Ovd7Otd7Opd7OnN7Gtd7Gpd69lN61hN6ta96lStbextberdbW3tbWztbWxtbOvdbOrda1hNalUtaECM7W1s7Ozs7Oxs7Otc7Gxs7Gvc69tc69rc69pc61jM6lc8bWlMbOvcbGxsbGpca9tca9pca1nMaMAL3OhL3Gtb21vb21tb2tpb2tnL2tlLW9tbW9pbW9e7W1pbWtjLWcKa21nK2tra2tnK2tlK2lpa2llK2ljK2le6WlnKWljKWUe6WUc6WUY5y1QpyclJycjJychJyUc5yMY5StY5SUe5SMhJSMe5SMc5SMWpSEa5SESoyUe4yMhIyEY4SlKYScWoSMe4SEe4SEa4R7c4R7Y3uMY3uEe3t7e3t7c3tza3tzY3trKXtjIXOcAHOUMXOEY3Nzc3NzWnNrSmulCGuUMWuMGGtzWmtrY2taMWtaGGOUOWOMAGNzUmNjWmNjSmNaUmNaQmNaOWNaIWNSCFqcAFpjUlpSMVpSIVpSEFpKKVKMAFJSUlJSSlJSMVJKMVJKGFJKAFI5CEqUAEqEAEpzQkpKIUpCQkpCGEpCAEo5EEoxAEJjOUJCOUJCAEI5IUIxADl7ADlaITlCOTkxMTkxKTkxEDkhADFzADFrGDE5OTExADEpEClrCCkxKSkpKSkpISkpACkhCCkhACkYACFzACFrACEhCCEYGBhjEBhjABghABgYCBgYABgQEBgQABAQABAIAAhjAAhSAAhKAAgIEAgICABaAABCAAAhAAAQAAAIAAAAAAAAACH+AS4ALAAAAAAwACMAAAj/AAEIHEiwoMGDCBMqXMiwocOHAA4cgEixIIIJO3JMmAjADIqKFU/8MHIkg5EgYXx4iaTkI0iHE6wE2TCggYILQayEAgXIy8uGCKz8sDCAQAMRG3iEcXULlJkJPwli3OFjh9UdYYLE6NBhA04UXHoVA2XoTZgfPKBWlOBDphAWOdfMcfMDLloeO3hIMjbWVCQ5Fn6E2UFxgpsgFjYIEBADrZU6luqEEfqjTqpt54z1uuWqTIcgWAk7PECGzIUQDRosDmxlUrVJkwQJkqVuX71v06YZcyUlROAdbnLAJKPFyAYFAhoMwFlnEh0rWkpz8raPHm7dqKKc/KFFkBUrVn1M/ziBcEIeLUEQI8/AYk0i9Be4sqjsrN66c9/OnbobhpR3HkIUoZ0WVnBE0AGLFKKFD0HAFUQe77HQgQI1hRBDEHMcY0899bBzihZuCPILJD8EccEGGzwAQhFaUHHQH82sUkgeNHISDBk8WCCCcsqFUEQWmOyzjz3sUGNNOO5Y48YOEgowAAQhnBScQV00k82V47jzjy9CXZBcjziFoco//4CDiSOyhPMPLkJZkEBqJmRQxA9uZGEQD8Ncmc044/zzDF2IZQBCCDYE8QMZz/iiCSx0neHGI7BIhhhNn+1gxRpokEcQAp7seWU7/PwTyxqG/iCEEVzQmUombnDRxRExzP9nBR2PCKLFD3UJwcMPa/SRqUGNWJmNOVn+M44ukMRB4KGcWDNLVhuUMEIJAlzwA3DJBHMJIXm4sQYhqyxCRQQGLSIsn1qac2UzysQSyzX/hLMGD0F0IMCODYAQBA9W/PKPOcRiw0wzwxTiokF9dLMnuv/Mo+fCZF7jBr0xbDDCACWEYKgb1vzjDp/jZNOMLX0IZxAKq2TZTjtaOjwOsXyG+s8sZJTIQsUdIGHoJPf8w487QI/TDSt5mGwQFZxc406o8HiDJchk/ltLHpSlJwSvz5DpTjvmuGNOM57koelBOaAhiCaaPBLL0wwbm003peRBnBZqJMJL1ECz/HXYYx/NdAIOOVCxQyLorswymU93o0wuwfAiTDNR/xz0MLXU0XdCE+UwSTRZAq2lsSATu+4wkGvt+TjNzPLrQyegAUku2Hij5cd8LhxyM8QIg4w18HgcdC6BTBFSDmfQqsovttveDcG7lFLHI75cE841sARCxeWsnxC4G9HADPK6ywzDCRqBo0EHHWhMgT1IJzziNci1N7PMKnSYfML96/90AiJKey/0KtbLX1QK0rrNnQ541xugQ7SHhkXBghN0SKACWRc4KlAhBwKcIOYymJCAAAA7",
+    "repl":"R0lGODlhMAAjAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OTExMSkpKSEhIRgYGBAQEAgICAAAACH+AS4ALAAAAAAwACMAAAX/ICCOZGmeaKqubOu+gCDANBkIQ1EMQhAghFptYEAkEgjEwXBo7ISvweGgWCwUysPjwTgEoCafTySYIhYMxgLBjEQgCULvCw0QdAZdoVhUIJUFChISEAxYeQM1N1OMTAp+UwZ5eA4TEhFbDWYFdC4ECVMJjwl5BwsQa0umEhUVlhESDgqlBp0rAn5nVpBMDxeZDRQbHBgWFBSWDgtLBnFjKwRYCI9VqQsPs0YKEcMXFq0UEalFDWx4BAO2IwPjppAKDkrTWKYUGd7fEJJFEZpM00cOzCgh4EE8SaoWxKNixQooBRMyZMBwAYIRBhUgLDGS4MoBJeoANMhAgQsaCRZm/5lqaCUJhA4cNHjDoKEDBlJUHqkBlYBTiQUZNGjYMMxDhY3VWk6R4MEDBoMUak5AqoYBqANIBo4wcGGDUKIeLlzVZmWJggsVIkwAZaQSA3kdZzlKkIiEAAlDvW5oOkEBs488JTw44oeUIwdvVTFTUK7uiAAPgubt8GFDhQepqETAQCFU1UMGzlqAgFhUsAcCS0AO6lUDhw8xNRSbENGDhgWSHjWUe6ACbKITizmopZoBa6KvOwj9uuHDhwxyj3xekgDDhw5EvWKo0IB4iQLCOCC/njc7ZQ8UeGvza+ABZZgcxJNc4FO1gc0cOsCUrHevc8tdIMTIAhc4F198G2Qwwd8CBIQUAwEINABBBJUwR9R5wElgVRLwWODBBx4cGB8GEzDQIAo33CGJA8gh+JoH/clUgQU0YvDhdfmJdwEFC6Sjgg8yEPAABsPkh2F22cl2AQbn6QdTghTQ5eAJAQyQAAQV0MSBB9gRVZ4GE1mw5JZOAmiAVi1UWcAZDrDyZXYTeaOhA/bIVuIBPtKQ4h7ViYekUPdcEAEbzTzCRp5CADmAAwj+ORGPBcgwAAHo9ABGCYtm0ChwFHShlRiXhmHlkAcCiOeUodqQw5W0oXLAiamy4MOkjOyAaqxUymApDCEAADs=",
+}
+colors = ["#FF7B39","#80F121"]
+emphColors = ["#DAFC33","#F42548"]
+fieldParams = {
+    "height":3,
+    "width":70,
+    "font":("monaco",14),
+    "highlightthickness":0,
+    "borderwidth":0,
+    "background":"white",
+}
+textParams = {
+    "bg":"#F7E0D4",
+    "fg":"#2321F1",
+    "highlightthickness":0,
+    "width":1,
+    "height":10,
+    "font":("verdana",16),
+    "wrap":"word",
+}
+
+
+class Zone:
+    def __init__(self, image, initialField, initialText):
+        frm = Frame(root)
+        frm.config(background="white")
+        self.image = PhotoImage(format='gif',data=images[image.upper()])
+        self.imageDimmed = PhotoImage(format='gif',data=images[image])
+        self.img = Label(frm)
+        self.img.config(borderwidth=0)
+        self.img.pack(side = "left")
+        self.fld = Text(frm, **fieldParams)
+        self.initScrollText(frm,self.fld,initialField)
+        frm = Frame(root)
+        self.txt = Text(frm, **textParams)
+        self.initScrollText(frm,self.txt,initialText)
+        for i in range(2):
+            self.txt.tag_config(colors[i], background = colors[i])
+            self.txt.tag_config("emph"+colors[i], foreground = emphColors[i])
+    def initScrollText(self,frm,txt,contents):
+        scl = Scrollbar(frm)
+        scl.config(command = txt.yview)
+        scl.pack(side="right",fill="y")
+        txt.pack(side = "left", expand=True, fill="x")
+        txt.config(yscrollcommand = scl.set)
+        txt.insert("1.0",contents)
+        frm.pack(fill = "x")
+        Frame(height=2, bd=1, relief="ridge").pack(fill="x")
+    def refresh(self):
+        self.colorCycle = itertools.cycle(colors)
+        try:
+            self.substitute()
+            self.img.config(image = self.image)
+        except re.error:
+            self.img.config(image = self.imageDimmed)
+
+
+class FindZone(Zone):
+    def addTags(self,m):
+        color = next(self.colorCycle)
+        self.txt.tag_add(color,"1.0+%sc"%m.start(),"1.0+%sc"%m.end())
+        try:
+            self.txt.tag_add("emph"+color,"1.0+%sc"%m.start("emph"),
+                             "1.0+%sc"%m.end("emph"))
+        except:
+            pass
+    def substitute(self,*args):
+        for color in colors:
+            self.txt.tag_remove(color,"1.0","end")
+            self.txt.tag_remove("emph"+color,"1.0","end")
+        self.rex = re.compile("") # default value in case of malformed regexp
+        self.rex = re.compile(self.fld.get("1.0","end")[:-1],re.MULTILINE)
+        try:
+            re.compile("(?P<emph>%s)" % self.fld.get(SEL_FIRST,
+                                                      SEL_LAST))
+            self.rexSel = re.compile("%s(?P<emph>%s)%s" % (
+                self.fld.get("1.0",SEL_FIRST),
+                self.fld.get(SEL_FIRST,SEL_LAST),
+                self.fld.get(SEL_LAST,"end")[:-1],
+            ),re.MULTILINE)
+        except:
+            self.rexSel = self.rex
+        self.rexSel.sub(self.addTags,self.txt.get("1.0","end"))
+
+
+class ReplaceZone(Zone):
+    def addTags(self,m):
+        s = sz.rex.sub(self.repl,m.group())
+        self.txt.delete("1.0+%sc"%(m.start()+self.diff),
+                        "1.0+%sc"%(m.end()+self.diff))
+        self.txt.insert("1.0+%sc"%(m.start()+self.diff),s,
+                        next(self.colorCycle))
+        self.diff += len(s) - (m.end() - m.start())
+    def substitute(self):
+        self.txt.delete("1.0","end")
+        self.txt.insert("1.0",sz.txt.get("1.0","end")[:-1])
+        self.diff = 0
+        self.repl = rex0.sub(r"\\g<\1>",self.fld.get("1.0","end")[:-1])
+        sz.rex.sub(self.addTags,sz.txt.get("1.0","end")[:-1])
+
+
+def launchRefresh(_):
+    sz.fld.after_idle(sz.refresh)
+    rz.fld.after_idle(rz.refresh)
+
+
+def app():
+    global root, sz, rz, rex0
+    root = Tk()
+    root.resizable(height=False,width=True)
+    root.title(windowTitle)
+    root.minsize(width=250,height=0)
+    sz = FindZone("find",initialFind,initialText)
+    sz.fld.bind("<Button-1>",launchRefresh)
+    sz.fld.bind("<ButtonRelease-1>",launchRefresh)
+    sz.fld.bind("<B1-Motion>",launchRefresh)
+    sz.rexSel = re.compile("")
+    rz = ReplaceZone("repl",initialRepl,"")
+    rex0 = re.compile(r"(?<!\\)\\([0-9]+)")
+    root.bind_all("<Key>",launchRefresh)
+    launchRefresh(None)
+    root.mainloop()
+
+if __name__ == '__main__':
+    app()
+
+__all__ = ['app']
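
The Tkinter classes above are thin wrappers around two plain re calls. A minimal sketch of the underlying find-and-replace step, using the same initial patterns as the app: rex0 rewrites backreferences such as \1 into the explicit \g<1> form, which stays unambiguous when the replacement text continues with a digit.

import re

find = r"n(.*?)e(.*?)m(.*?)o"   # initialFind from the app
repl = r"M\1A\2K\3I"            # initialRepl from the app
text = "finding nemo"           # any sample text will do

# Rewrite \N backreferences as \g<N>, as ReplaceZone.substitute does.
rex0 = re.compile(r"(?<!\\)\\([0-9]+)")
repl = rex0.sub(r"\\g<\1>", repl)            # -> M\g<1>A\g<2>K\g<3>I

print(re.sub(find, repl, text, flags=re.MULTILINE))   # fiMding nAKI
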
diff --git a/nlp_resource_data/nltk/app/nemo_app.pyc b/nlp_resource_data/nltk/app/nemo_app.pyc
new file mode 100755 (executable)
index 0000000..c6c06a2
Binary files /dev/null and b/nlp_resource_data/nltk/app/nemo_app.pyc differ
diff --git a/nlp_resource_data/nltk/app/rdparser_app.py b/nlp_resource_data/nltk/app/rdparser_app.py
new file mode 100755 (executable)
index 0000000..b791767
--- /dev/null
@@ -0,0 +1,892 @@
+# Natural Language Toolkit: Recursive Descent Parser Application
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A graphical tool for exploring the recursive descent parser.
+
+The recursive descent parser maintains a tree, which records the
+structure of the portion of the text that has been parsed.  It uses
+CFG productions to expand the fringe of the tree, and matches its
+leaves against the text.  Initially, the tree contains the start
+symbol ("S").  It is shown in the main canvas, to the right of the
+list of available expansions.
+
+The parser builds up a tree structure for the text using three
+operations:
+
+  - "expand" uses a CFG production to add children to a node on the
+    fringe of the tree.
+  - "match" compares a leaf in the tree to a text token.
+  - "backtrack" returns the tree to its state before the most recent
+    expand or match operation.
+
+The parser maintains a list of tree locations called a "frontier" to
+remember which nodes have not yet been expanded and which leaves have
+not yet been matched against the text.  The leftmost frontier node is
+shown in green, and the other frontier nodes are shown in blue.  The
+parser always performs expand and match operations on the leftmost
+element of the frontier.
+
+You can control the parser's operation by using the "expand," "match,"
+and "backtrack" buttons; or you can use the "step" button to let the
+parser automatically decide which operation to apply.  The parser uses
+the following rules to decide which operation to apply:
+
+  - If the leftmost frontier element is a token, try matching it.
+  - If the leftmost frontier element is a node, try expanding it with
+    the first untried expansion.
+  - Otherwise, backtrack.
+
+The "expand" button applies the untried expansion whose CFG production
+is listed earliest in the grammar.  To manually choose which expansion
+to apply, click on a CFG production from the list of available
+expansions, on the left side of the main window.
+
+The "autostep" button will let the parser continue applying
+operations to the tree until it reaches a complete parse.  You can
+cancel an autostep in progress at any time by clicking on the
+"autostep" button again.
+
+Keyboard Shortcuts::
+      [Space]\t Perform the next expand, match, or backtrack operation
+      [a]\t Step through operations until the next complete parse
+      [e]\t Perform an expand operation
+      [m]\t Perform a match operation
+      [b]\t Perform a backtrack operation
+      [Delete]\t Reset the parser
+      [g]\t Show/hide available expansions list
+      [h]\t Help
+      [Ctrl-p]\t Print
+      [q]\t Quit
+"""
+from __future__ import division
+
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (Listbox, IntVar, Button, Frame, Label, Menu,
+                               Scrollbar, Tk)
+
+from nltk.tree import Tree
+from nltk.util import in_idle
+from nltk.parse import SteppingRecursiveDescentParser
+from nltk.draw.util import TextWidget, ShowText, CanvasFrame, EntryDialog
+from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment
+
+class RecursiveDescentApp(object):
+    """
+    A graphical tool for exploring the recursive descent parser.  The tool
+    displays the parser's tree and the remaining text, and allows the
+    user to control the parser's operation.  In particular, the user
+    can expand subtrees on the frontier, match tokens on the frontier
+    against the text, and backtrack.  A "step" button simply steps
+    through the parsing process, performing the operations that
+    ``RecursiveDescentParser`` would use.
+    """
+    def __init__(self, grammar, sent, trace=0):
+        self._sent = sent
+        self._parser = SteppingRecursiveDescentParser(grammar, trace)
+
+        # Set up the main window.
+        self._top = Tk()
+        self._top.title('Recursive Descent Parser Application')
+
+        # Set up key bindings.
+        self._init_bindings()
+
+        # Initialize the fonts.
+        self._init_fonts(self._top)
+
+        # Animations.  animating_lock is a lock to prevent the demo
+        # from performing new operations while it's animating.
+        self._animation_frames = IntVar(self._top)
+        self._animation_frames.set(5)
+        self._animating_lock = 0
+        self._autostep = 0
+
+        # The user can hide the grammar.
+        self._show_grammar = IntVar(self._top)
+        self._show_grammar.set(1)
+
+        # Create the basic frames.
+        self._init_menubar(self._top)
+        self._init_buttons(self._top)
+        self._init_feedback(self._top)
+        self._init_grammar(self._top)
+        self._init_canvas(self._top)
+
+        # Initialize the parser.
+        self._parser.initialize(self._sent)
+
+        # Resize callback
+        self._canvas.bind('<Configure>', self._configure)
+
+    #########################################
+    ##  Initialization Helpers
+    #########################################
+
+    def _init_fonts(self, root):
+        # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
+        self._sysfont = Font(font=Button()["font"])
+        root.option_add("*Font", self._sysfont)
+
+        # What's our font size (default=same as sysfont)
+        self._size = IntVar(root)
+        self._size.set(self._sysfont.cget('size'))
+
+        self._boldfont = Font(family='helvetica', weight='bold',
+                                    size=self._size.get())
+        self._font = Font(family='helvetica',
+                                    size=self._size.get())
+        if self._size.get() < 0: big = self._size.get()-2
+        else: big = self._size.get()+2
+        self._bigfont = Font(family='helvetica', weight='bold',
+                                    size=big)
+
+    def _init_grammar(self, parent):
+        # Grammar view.
+        self._prodframe = listframe = Frame(parent)
+        self._prodframe.pack(fill='both', side='left', padx=2)
+        self._prodlist_label = Label(self._prodframe, font=self._boldfont,
+                                     text='Available Expansions')
+        self._prodlist_label.pack()
+        self._prodlist = Listbox(self._prodframe, selectmode='single',
+                                 relief='groove', background='white',
+                                 foreground='#909090', font=self._font,
+                                 selectforeground='#004040',
+                                 selectbackground='#c0f0c0')
+
+        self._prodlist.pack(side='right', fill='both', expand=1)
+
+        self._productions = list(self._parser.grammar().productions())
+        for production in self._productions:
+            self._prodlist.insert('end', ('  %s' % production))
+        self._prodlist.config(height=min(len(self._productions), 25))
+
+        # Add a scrollbar if there are more than 25 productions.
+        if len(self._productions) > 25:
+            listscroll = Scrollbar(self._prodframe,
+                                   orient='vertical')
+            self._prodlist.config(yscrollcommand = listscroll.set)
+            listscroll.config(command=self._prodlist.yview)
+            listscroll.pack(side='left', fill='y')
+
+        # If they select a production, apply it.
+        self._prodlist.bind('<<ListboxSelect>>', self._prodlist_select)
+
+    def _init_bindings(self):
+        # Key bindings are a good thing.
+        self._top.bind('<Control-q>', self.destroy)
+        self._top.bind('<Control-x>', self.destroy)
+        self._top.bind('<Escape>', self.destroy)
+        self._top.bind('e', self.expand)
+        #self._top.bind('<Alt-e>', self.expand)
+        #self._top.bind('<Control-e>', self.expand)
+        self._top.bind('m', self.match)
+        self._top.bind('<Alt-m>', self.match)
+        self._top.bind('<Control-m>', self.match)
+        self._top.bind('b', self.backtrack)
+        self._top.bind('<Alt-b>', self.backtrack)
+        self._top.bind('<Control-b>', self.backtrack)
+        self._top.bind('<Control-z>', self.backtrack)
+        self._top.bind('<BackSpace>', self.backtrack)
+        self._top.bind('a', self.autostep)
+        #self._top.bind('<Control-a>', self.autostep)
+        self._top.bind('<Control-space>', self.autostep)
+        self._top.bind('<Control-c>', self.cancel_autostep)
+        self._top.bind('<space>', self.step)
+        self._top.bind('<Delete>', self.reset)
+        self._top.bind('<Control-p>', self.postscript)
+        #self._top.bind('<h>', self.help)
+        #self._top.bind('<Alt-h>', self.help)
+        self._top.bind('<Control-h>', self.help)
+        self._top.bind('<F1>', self.help)
+        #self._top.bind('<g>', self.toggle_grammar)
+        #self._top.bind('<Alt-g>', self.toggle_grammar)
+        #self._top.bind('<Control-g>', self.toggle_grammar)
+        self._top.bind('<Control-g>', self.edit_grammar)
+        self._top.bind('<Control-t>', self.edit_sentence)
+
+    def _init_buttons(self, parent):
+        # Set up the frames.
+        self._buttonframe = buttonframe = Frame(parent)
+        buttonframe.pack(fill='none', side='bottom', padx=3, pady=2)
+        Button(buttonframe, text='Step',
+               background='#90c0d0', foreground='black',
+               command=self.step,).pack(side='left')
+        Button(buttonframe, text='Autostep',
+               background='#90c0d0', foreground='black',
+               command=self.autostep,).pack(side='left')
+        Button(buttonframe, text='Expand', underline=0,
+               background='#90f090', foreground='black',
+               command=self.expand).pack(side='left')
+        Button(buttonframe, text='Match', underline=0,
+               background='#90f090', foreground='black',
+               command=self.match).pack(side='left')
+        Button(buttonframe, text='Backtrack', underline=0,
+               background='#f0a0a0', foreground='black',
+               command=self.backtrack).pack(side='left')
+        # Replace autostep...
+#         self._autostep_button = Button(buttonframe, text='Autostep',
+#                                        underline=0, command=self.autostep)
+#         self._autostep_button.pack(side='left')
+
+    def _configure(self, event):
+        self._autostep = 0
+        (x1, y1, x2, y2) = self._cframe.scrollregion()
+        y2 = event.height - 6
+        self._canvas['scrollregion'] = '%d %d %d %d' % (x1,y1,x2,y2)
+        self._redraw()
+
+    def _init_feedback(self, parent):
+        self._feedbackframe = feedbackframe = Frame(parent)
+        feedbackframe.pack(fill='x', side='bottom', padx=3, pady=3)
+        self._lastoper_label = Label(feedbackframe, text='Last Operation:',
+                                     font=self._font)
+        self._lastoper_label.pack(side='left')
+        lastoperframe = Frame(feedbackframe, relief='sunken', border=1)
+        lastoperframe.pack(fill='x', side='right', expand=1, padx=5)
+        self._lastoper1 = Label(lastoperframe, foreground='#007070',
+                                background='#f0f0f0', font=self._font)
+        self._lastoper2 = Label(lastoperframe, anchor='w', width=30,
+                                foreground='#004040', background='#f0f0f0',
+                                font=self._font)
+        self._lastoper1.pack(side='left')
+        self._lastoper2.pack(side='left', fill='x', expand=1)
+
+    def _init_canvas(self, parent):
+        self._cframe = CanvasFrame(parent, background='white',
+                                   #width=525, height=250,
+                                   closeenough=10,
+                                   border=2, relief='sunken')
+        self._cframe.pack(expand=1, fill='both', side='top', pady=2)
+        canvas = self._canvas = self._cframe.canvas()
+
+        # Initially, there's no tree or text
+        self._tree = None
+        self._textwidgets = []
+        self._textline = None
+
+    def _init_menubar(self, parent):
+        menubar = Menu(parent)
+
+        filemenu = Menu(menubar, tearoff=0)
+        filemenu.add_command(label='Reset Parser', underline=0,
+                             command=self.reset, accelerator='Del')
+        filemenu.add_command(label='Print to Postscript', underline=0,
+                             command=self.postscript, accelerator='Ctrl-p')
+        filemenu.add_command(label='Exit', underline=1,
+                             command=self.destroy, accelerator='Ctrl-x')
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
+
+        editmenu = Menu(menubar, tearoff=0)
+        editmenu.add_command(label='Edit Grammar', underline=5,
+                             command=self.edit_grammar,
+                             accelerator='Ctrl-g')
+        editmenu.add_command(label='Edit Text', underline=5,
+                             command=self.edit_sentence,
+                             accelerator='Ctrl-t')
+        menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
+
+        rulemenu = Menu(menubar, tearoff=0)
+        rulemenu.add_command(label='Step', underline=1,
+                             command=self.step, accelerator='Space')
+        rulemenu.add_separator()
+        rulemenu.add_command(label='Match', underline=0,
+                             command=self.match, accelerator='Ctrl-m')
+        rulemenu.add_command(label='Expand', underline=0,
+                             command=self.expand, accelerator='Ctrl-e')
+        rulemenu.add_separator()
+        rulemenu.add_command(label='Backtrack', underline=0,
+                             command=self.backtrack, accelerator='Ctrl-b')
+        menubar.add_cascade(label='Apply', underline=0, menu=rulemenu)
+
+        viewmenu = Menu(menubar, tearoff=0)
+        viewmenu.add_checkbutton(label="Show Grammar", underline=0,
+                                 variable=self._show_grammar,
+                                 command=self._toggle_grammar)
+        viewmenu.add_separator()
+        viewmenu.add_radiobutton(label='Tiny', variable=self._size,
+                                 underline=0, value=10, command=self.resize)
+        viewmenu.add_radiobutton(label='Small', variable=self._size,
+                                 underline=0, value=12, command=self.resize)
+        viewmenu.add_radiobutton(label='Medium', variable=self._size,
+                                 underline=0, value=14, command=self.resize)
+        viewmenu.add_radiobutton(label='Large', variable=self._size,
+                                 underline=0, value=18, command=self.resize)
+        viewmenu.add_radiobutton(label='Huge', variable=self._size,
+                                 underline=0, value=24, command=self.resize)
+        menubar.add_cascade(label='View', underline=0, menu=viewmenu)
+
+        animatemenu = Menu(menubar, tearoff=0)
+        animatemenu.add_radiobutton(label="No Animation", underline=0,
+                                    variable=self._animation_frames,
+                                    value=0)
+        animatemenu.add_radiobutton(label="Slow Animation", underline=0,
+                                    variable=self._animation_frames,
+                                    value=10, accelerator='-')
+        animatemenu.add_radiobutton(label="Normal Animation", underline=0,
+                                    variable=self._animation_frames,
+                                    value=5, accelerator='=')
+        animatemenu.add_radiobutton(label="Fast Animation", underline=0,
+                                    variable=self._animation_frames,
+                                    value=2, accelerator='+')
+        menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
+
+
+        helpmenu = Menu(menubar, tearoff=0)
+        helpmenu.add_command(label='About', underline=0,
+                             command=self.about)
+        helpmenu.add_command(label='Instructions', underline=0,
+                             command=self.help, accelerator='F1')
+        menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
+
+        parent.config(menu=menubar)
+
+    #########################################
+    ##  Helper
+    #########################################
+
+    def _get(self, widget, treeloc):
+        for i in treeloc: widget = widget.subtrees()[i]
+        if isinstance(widget, TreeSegmentWidget):
+            widget = widget.label()
+        return widget
+
+    #########################################
+    ##  Main draw procedure
+    #########################################
+
+    def _redraw(self):
+        canvas = self._canvas
+
+        # Delete the old tree, widgets, etc.
+        if self._tree is not None:
+            self._cframe.destroy_widget(self._tree)
+        for twidget in self._textwidgets:
+            self._cframe.destroy_widget(twidget)
+        if self._textline is not None:
+            self._canvas.delete(self._textline)
+
+        # Draw the tree.
+        helv = ('helvetica', -self._size.get())
+        bold = ('helvetica', -self._size.get(), 'bold')
+        attribs = {'tree_color': '#000000', 'tree_width': 2,
+                   'node_font': bold, 'leaf_font': helv,}
+        tree = self._parser.tree()
+        self._tree = tree_to_treesegment(canvas, tree, **attribs)
+        self._cframe.add_widget(self._tree, 30, 5)
+
+        # Draw the text.
+        helv = ('helvetica', -self._size.get())
+        bottom = y = self._cframe.scrollregion()[3]
+        self._textwidgets = [TextWidget(canvas, word, font=self._font)
+                             for word in self._sent]
+        for twidget in self._textwidgets:
+            self._cframe.add_widget(twidget, 0, 0)
+            twidget.move(0, bottom-twidget.bbox()[3]-5)
+            y = min(y, twidget.bbox()[1])
+
+        # Draw a line over the text, to separate it from the tree.
+        self._textline = canvas.create_line(-5000, y-5, 5000, y-5, dash='.')
+
+        # Highlight appropriate nodes.
+        self._highlight_nodes()
+        self._highlight_prodlist()
+
+        # Make sure the text lines up.
+        self._position_text()
+
+
+    def _redraw_quick(self):
+        # This should be more-or-less sufficient after an animation.
+        self._highlight_nodes()
+        self._highlight_prodlist()
+        self._position_text()
+
+    def _highlight_nodes(self):
+        # Highlight the list of nodes to be checked.
+        bold = ('helvetica', -self._size.get(), 'bold')
+        for treeloc in self._parser.frontier()[:1]:
+            self._get(self._tree, treeloc)['color'] = '#20a050'
+            self._get(self._tree, treeloc)['font'] = bold
+        for treeloc in self._parser.frontier()[1:]:
+            self._get(self._tree, treeloc)['color'] = '#008080'
+
+    def _highlight_prodlist(self):
+        # Highlight the productions that can be expanded.
+        # Boy, too bad tkinter doesn't implement Listbox.itemconfig;
+        # that would be pretty useful here.
+        self._prodlist.delete(0, 'end')
+        expandable = self._parser.expandable_productions()
+        untried = self._parser.untried_expandable_productions()
+        productions = self._productions
+        for index in range(len(productions)):
+            if productions[index] in expandable:
+                if productions[index] in untried:
+                    self._prodlist.insert(index, ' %s' % productions[index])
+                else:
+                    self._prodlist.insert(index, ' %s (TRIED)' %
+                                          productions[index])
+                self._prodlist.selection_set(index)
+            else:
+                self._prodlist.insert(index, ' %s' % productions[index])
+
+    def _position_text(self):
+        # Line up the text widgets that are matched against the tree
+        numwords = len(self._sent)
+        num_matched = numwords - len(self._parser.remaining_text())
+        leaves = self._tree_leaves()[:num_matched]
+        xmax = self._tree.bbox()[0]
+        for i in range(0, len(leaves)):
+            widget = self._textwidgets[i]
+            leaf = leaves[i]
+            widget['color'] = '#006040'
+            leaf['color'] = '#006040'
+            widget.move(leaf.bbox()[0] - widget.bbox()[0], 0)
+            xmax = widget.bbox()[2] + 10
+
+        # Line up the text widgets that are not matched against the tree.
+        for i in range(len(leaves), numwords):
+            widget = self._textwidgets[i]
+            widget['color'] = '#a0a0a0'
+            widget.move(xmax - widget.bbox()[0], 0)
+            xmax = widget.bbox()[2] + 10
+
+        # If we have a complete parse, make everything green :)
+        if self._parser.currently_complete():
+            for twidget in self._textwidgets:
+                twidget['color'] = '#00a000'
+
+        # Move the matched leaves down to the text.
+        for i in range(0, len(leaves)):
+            widget = self._textwidgets[i]
+            leaf = leaves[i]
+            dy = widget.bbox()[1] - leaf.bbox()[3] - 10.0
+            dy = max(dy, leaf.parent().label().bbox()[3] - leaf.bbox()[3] + 10)
+            leaf.move(0, dy)
+
+    def _tree_leaves(self, tree=None):
+        if tree is None: tree = self._tree
+        if isinstance(tree, TreeSegmentWidget):
+            leaves = []
+            for child in tree.subtrees(): leaves += self._tree_leaves(child)
+            return leaves
+        else:
+            return [tree]
+
+    #########################################
+    ##  Button Callbacks
+    #########################################
+
+    def destroy(self, *e):
+        self._autostep = 0
+        if self._top is None: return
+        self._top.destroy()
+        self._top = None
+
+    def reset(self, *e):
+        self._autostep = 0
+        self._parser.initialize(self._sent)
+        self._lastoper1['text'] = 'Reset Application'
+        self._lastoper2['text'] = ''
+        self._redraw()
+
+    def autostep(self, *e):
+        if self._animation_frames.get() == 0:
+            self._animation_frames.set(2)
+        if self._autostep:
+            self._autostep = 0
+        else:
+            self._autostep = 1
+            self._step()
+
+    def cancel_autostep(self, *e):
+        #self._autostep_button['text'] = 'Autostep'
+        self._autostep = 0
+
+    # Make sure to stop auto-stepping if we get any user input.
+    def step(self, *e): self._autostep = 0; self._step()
+    def match(self, *e): self._autostep = 0; self._match()
+    def expand(self, *e): self._autostep = 0; self._expand()
+    def backtrack(self, *e): self._autostep = 0; self._backtrack()
+
+    def _step(self):
+        if self._animating_lock: return
+
+        # Try expanding, matching, and backtracking (in that order)
+        if self._expand(): pass
+        elif self._parser.untried_match() and self._match(): pass
+        elif self._backtrack(): pass
+        else:
+            self._lastoper1['text'] = 'Finished'
+            self._lastoper2['text'] = ''
+            self._autostep = 0
+
+        # Check if we just completed a parse.
+        if self._parser.currently_complete():
+            self._autostep = 0
+            self._lastoper2['text'] += '    [COMPLETE PARSE]'
+
+    def _expand(self, *e):
+        if self._animating_lock: return
+        old_frontier = self._parser.frontier()
+        rv = self._parser.expand()
+        if rv is not None:
+            self._lastoper1['text'] = 'Expand:'
+            self._lastoper2['text'] = rv
+            self._prodlist.selection_clear(0, 'end')
+            index = self._productions.index(rv)
+            self._prodlist.selection_set(index)
+            self._animate_expand(old_frontier[0])
+            return True
+        else:
+            self._lastoper1['text'] = 'Expand:'
+            self._lastoper2['text'] = '(all expansions tried)'
+            return False
+
+    def _match(self, *e):
+        if self._animating_lock: return
+        old_frontier = self._parser.frontier()
+        rv = self._parser.match()
+        if rv is not None:
+            self._lastoper1['text'] = 'Match:'
+            self._lastoper2['text'] = rv
+            self._animate_match(old_frontier[0])
+            return True
+        else:
+            self._lastoper1['text'] = 'Match:'
+            self._lastoper2['text'] = '(failed)'
+            return False
+
+    def _backtrack(self, *e):
+        if self._animating_lock: return
+        if self._parser.backtrack():
+            elt = self._parser.tree()
+            for i in self._parser.frontier()[0]:
+                elt = elt[i]
+            self._lastoper1['text'] = 'Backtrack'
+            self._lastoper2['text'] = ''
+            if isinstance(elt, Tree):
+                self._animate_backtrack(self._parser.frontier()[0])
+            else:
+                self._animate_match_backtrack(self._parser.frontier()[0])
+            return True
+        else:
+            self._autostep = 0
+            self._lastoper1['text'] = 'Finished'
+            self._lastoper2['text'] = ''
+            return False
+
+    def about(self, *e):
+        ABOUT = ("NLTK Recursive Descent Parser Application\n"+
+                 "Written by Edward Loper")
+        TITLE = 'About: Recursive Descent Parser Application'
+        try:
+            from six.moves.tkinter_messagebox import Message
+            Message(message=ABOUT, title=TITLE).show()
+        except:
+            ShowText(self._top, TITLE, ABOUT)
+
+    def help(self, *e):
+        self._autostep = 0
+        # The default font's not very legible; try using 'fixed' instead.
+        try:
+            ShowText(self._top, 'Help: Recursive Descent Parser Application',
+                     (__doc__ or '').strip(), width=75, font='fixed')
+        except:
+            ShowText(self._top, 'Help: Recursive Descent Parser Application',
+                     (__doc__ or '').strip(), width=75)
+
+    def postscript(self, *e):
+        self._autostep = 0
+        self._cframe.print_to_file()
+
+    def mainloop(self, *args, **kwargs):
+        """
+        Enter the Tkinter mainloop.  This function must be called if
+        this demo is created from a non-interactive program (e.g.
+        from a script); otherwise, the demo will close as soon as
+        the script completes.
+        """
+        if in_idle(): return
+        self._top.mainloop(*args, **kwargs)
+
+    def resize(self, size=None):
+        if size is not None: self._size.set(size)
+        size = self._size.get()
+        self._font.configure(size=-(abs(size)))
+        self._boldfont.configure(size=-(abs(size)))
+        self._sysfont.configure(size=-(abs(size)))
+        self._bigfont.configure(size=-(abs(size+2)))
+        self._redraw()
+
+    #########################################
+    ##  Expand Production Selection
+    #########################################
+
+    def _toggle_grammar(self, *e):
+        if self._show_grammar.get():
+            self._prodframe.pack(fill='both', side='left', padx=2,
+                                 after=self._feedbackframe)
+            self._lastoper1['text'] = 'Show Grammar'
+        else:
+            self._prodframe.pack_forget()
+            self._lastoper1['text'] = 'Hide Grammar'
+        self._lastoper2['text'] = ''
+
+#     def toggle_grammar(self, *e):
+#         self._show_grammar = not self._show_grammar
+#         if self._show_grammar:
+#             self._prodframe.pack(fill='both', expand='y', side='left',
+#                                  after=self._feedbackframe)
+#             self._lastoper1['text'] = 'Show Grammar'
+#         else:
+#             self._prodframe.pack_forget()
+#             self._lastoper1['text'] = 'Hide Grammar'
+#         self._lastoper2['text'] = ''
+
+    def _prodlist_select(self, event):
+        selection = self._prodlist.curselection()
+        if len(selection) != 1: return
+        index = int(selection[0])
+        old_frontier = self._parser.frontier()
+        production = self._parser.expand(self._productions[index])
+
+        if production:
+            self._lastoper1['text'] = 'Expand:'
+            self._lastoper2['text'] = production
+            self._prodlist.selection_clear(0, 'end')
+            self._prodlist.selection_set(index)
+            self._animate_expand(old_frontier[0])
+        else:
+            # Reset the production selections.
+            self._prodlist.selection_clear(0, 'end')
+            for prod in self._parser.expandable_productions():
+                index = self._productions.index(prod)
+                self._prodlist.selection_set(index)
+
+    #########################################
+    ##  Animation
+    #########################################
+
+    def _animate_expand(self, treeloc):
+        oldwidget = self._get(self._tree, treeloc)
+        oldtree = oldwidget.parent()
+        top = not isinstance(oldtree.parent(), TreeSegmentWidget)
+
+        tree = self._parser.tree()
+        for i in treeloc:
+            tree = tree[i]
+
+        widget = tree_to_treesegment(self._canvas, tree,
+                                     node_font=self._boldfont,
+                                     leaf_color='white',
+                                     tree_width=2, tree_color='white',
+                                     node_color='white',
+                                     leaf_font=self._font)
+        widget.label()['color'] = '#20a050'
+
+        (oldx, oldy) = oldtree.label().bbox()[:2]
+        (newx, newy) = widget.label().bbox()[:2]
+        widget.move(oldx-newx, oldy-newy)
+
+        if top:
+            self._cframe.add_widget(widget, 0, 5)
+            widget.move(30-widget.label().bbox()[0], 0)
+            self._tree = widget
+        else:
+            oldtree.parent().replace_child(oldtree, widget)
+
+        # Move the children over so they don't overlap.
+        # Line the children up in a strange way.
+        if widget.subtrees():
+            dx = (oldx + widget.label().width()/2 -
+                  widget.subtrees()[0].bbox()[0]/2 -
+                  widget.subtrees()[0].bbox()[2]/2)
+            for subtree in widget.subtrees(): subtree.move(dx, 0)
+
+        self._makeroom(widget)
+
+        if top:
+            self._cframe.destroy_widget(oldtree)
+        else:
+            oldtree.destroy()
+
+        colors = ['gray%d' % (10*int(10*x/self._animation_frames.get()))
+                  for x in range(self._animation_frames.get(),0,-1)]
+
+        # Move the text string down, if necessary.
+        dy = widget.bbox()[3] + 30 - self._canvas.coords(self._textline)[1]
+        if dy > 0:
+            for twidget in self._textwidgets: twidget.move(0, dy)
+            self._canvas.move(self._textline, 0, dy)
+
+        self._animate_expand_frame(widget, colors)
+
+    def _makeroom(self, treeseg):
+        """
+        Make sure that no sibling tree bboxes overlap.
+        """
+        parent = treeseg.parent()
+        if not isinstance(parent, TreeSegmentWidget): return
+
+        index = parent.subtrees().index(treeseg)
+
+        # Handle siblings to the right
+        rsiblings = parent.subtrees()[index+1:]
+        if rsiblings:
+            dx = treeseg.bbox()[2] - rsiblings[0].bbox()[0] + 10
+            for sibling in rsiblings: sibling.move(dx, 0)
+
+        # Handle siblings to the left
+        if index > 0:
+            lsibling = parent.subtrees()[index-1]
+            dx = max(0, lsibling.bbox()[2] - treeseg.bbox()[0] + 10)
+            treeseg.move(dx, 0)
+
+        # Keep working up the tree.
+        self._makeroom(parent)
+
+    def _animate_expand_frame(self, widget, colors):
+        if len(colors) > 0:
+            self._animating_lock = 1
+            widget['color'] = colors[0]
+            for subtree in widget.subtrees():
+                if isinstance(subtree, TreeSegmentWidget):
+                    subtree.label()['color'] = colors[0]
+                else:
+                    subtree['color'] = colors[0]
+            self._top.after(50, self._animate_expand_frame,
+                            widget, colors[1:])
+        else:
+            widget['color'] = 'black'
+            for subtree in widget.subtrees():
+                if isinstance(subtree, TreeSegmentWidget):
+                    subtree.label()['color'] = 'black'
+                else:
+                    subtree['color'] = 'black'
+            self._redraw_quick()
+            widget.label()['color'] = 'black'
+            self._animating_lock = 0
+            if self._autostep: self._step()
+
+    def _animate_backtrack(self, treeloc):
+        # Flash red first, if we're animating.
+        if self._animation_frames.get() == 0: colors = []
+        else: colors = ['#a00000', '#000000', '#a00000']
+        colors += ['gray%d' % (10*int(10*x/(self._animation_frames.get())))
+                   for x in range(1, self._animation_frames.get()+1)]
+
+        widgets = [self._get(self._tree, treeloc).parent()]
+        for subtree in widgets[0].subtrees():
+            if isinstance(subtree, TreeSegmentWidget):
+                widgets.append(subtree.label())
+            else:
+                widgets.append(subtree)
+
+        self._animate_backtrack_frame(widgets, colors)
+
+    def _animate_backtrack_frame(self, widgets, colors):
+        if len(colors) > 0:
+            self._animating_lock = 1
+            for widget in widgets: widget['color'] = colors[0]
+            self._top.after(50, self._animate_backtrack_frame,
+                            widgets, colors[1:])
+        else:
+            for widget in widgets[0].subtrees():
+                widgets[0].remove_child(widget)
+                widget.destroy()
+            self._redraw_quick()
+            self._animating_lock = 0
+            if self._autostep: self._step()
+
+    def _animate_match_backtrack(self, treeloc):
+        widget = self._get(self._tree, treeloc)
+        node = widget.parent().label()
+        dy = ((node.bbox()[3] - widget.bbox()[1] + 14) /
+              max(1, self._animation_frames.get()))
+        self._animate_match_backtrack_frame(self._animation_frames.get(),
+                                            widget, dy)
+
+    def _animate_match(self, treeloc):
+        widget = self._get(self._tree, treeloc)
+
+        dy = ((self._textwidgets[0].bbox()[1] - widget.bbox()[3] - 10.0) /
+              max(1, self._animation_frames.get()))
+        self._animate_match_frame(self._animation_frames.get(), widget, dy)
+
+    def _animate_match_frame(self, frame, widget, dy):
+        if frame > 0:
+            self._animating_lock = 1
+            widget.move(0, dy)
+            self._top.after(10, self._animate_match_frame,
+                            frame-1, widget, dy)
+        else:
+            widget['color'] = '#006040'
+            self._redraw_quick()
+            self._animating_lock = 0
+            if self._autostep: self._step()
+
+    def _animate_match_backtrack_frame(self, frame, widget, dy):
+        if frame > 0:
+            self._animating_lock = 1
+            widget.move(0, dy)
+            self._top.after(10, self._animate_match_backtrack_frame,
+                            frame-1, widget, dy)
+        else:
+            widget.parent().remove_child(widget)
+            widget.destroy()
+            self._animating_lock = 0
+            if self._autostep: self._step()
+
+    def edit_grammar(self, *e):
+        CFGEditor(self._top, self._parser.grammar(), self.set_grammar)
+
+    def set_grammar(self, grammar):
+        self._parser.set_grammar(grammar)
+        self._productions = list(grammar.productions())
+        self._prodlist.delete(0, 'end')
+        for production in self._productions:
+            self._prodlist.insert('end', (' %s' % production))
+
+    def edit_sentence(self, *e):
+        sentence = " ".join(self._sent)
+        title = 'Edit Text'
+        instr = 'Enter a new sentence to parse.'
+        EntryDialog(self._top, sentence, instr, self.set_sentence, title)
+
+    def set_sentence(self, sentence):
+        self._sent = sentence.split() #[XX] use tagged?
+        self.reset()
+
+def app():
+    """
+    Create a recursive descent parser demo, using a simple grammar and
+    text.
+    """
+    from nltk.grammar import CFG
+    grammar = CFG.fromstring("""
+    # Grammatical productions.
+        S -> NP VP
+        NP -> Det N PP | Det N
+        VP -> V NP PP | V NP | V
+        PP -> P NP
+    # Lexical productions.
+        NP -> 'I'
+        Det -> 'the' | 'a'
+        N -> 'man' | 'park' | 'dog' | 'telescope'
+        V -> 'ate' | 'saw'
+        P -> 'in' | 'under' | 'with'
+    """)
+
+    sent = 'the dog saw a man in the park'.split()
+
+    RecursiveDescentApp(grammar, sent).mainloop()
+
+if __name__ == '__main__':
+    app()
+
+__all__ = ['app']
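
A non-GUI counterpart of the demo above, for quick reference: the same toy grammar and sentence driven by nltk.parse.RecursiveDescentParser, which applies the expand, match, and backtrack operations automatically instead of interactively.

from nltk.grammar import CFG
from nltk.parse import RecursiveDescentParser

grammar = CFG.fromstring("""
    S -> NP VP
    NP -> Det N PP | Det N | 'I'
    VP -> V NP PP | V NP | V
    PP -> P NP
    Det -> 'the' | 'a'
    N -> 'man' | 'park' | 'dog' | 'telescope'
    V -> 'ate' | 'saw'
    P -> 'in' | 'under' | 'with'
""")

parser = RecursiveDescentParser(grammar)
for tree in parser.parse('the dog saw a man in the park'.split()):
    print(tree)   # prints both PP attachments
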
diff --git a/nlp_resource_data/nltk/app/rdparser_app.pyc b/nlp_resource_data/nltk/app/rdparser_app.pyc
new file mode 100755 (executable)
index 0000000..916c96a
Binary files /dev/null and b/nlp_resource_data/nltk/app/rdparser_app.pyc differ
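
The next file, srparser_app.py, is the shift-reduce counterpart of the demo above. A comparable non-GUI sketch with nltk.parse.ShiftReduceParser is shown here; by the shift/reduce rules described in its module docstring, the greedy strategy reduces VP -> V as soon as the verb is shifted with this grammar, so it gets stuck and yields no parse for the same sentence. That is exactly the behaviour the graphical tool lets you step through and override by hand.

from nltk.grammar import CFG
from nltk.parse import ShiftReduceParser

grammar = CFG.fromstring("""
    S -> NP VP
    NP -> Det N PP | Det N | 'I'
    VP -> V NP PP | V NP | V
    PP -> P NP
    Det -> 'the' | 'a'
    N -> 'man' | 'park' | 'dog' | 'telescope'
    V -> 'ate' | 'saw'
    P -> 'in' | 'under' | 'with'
""")

sr = ShiftReduceParser(grammar)
parses = list(sr.parse('the dog saw a man in the park'.split()))
print(parses)   # expected: [] (the parser reduces too eagerly and gets stuck)
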
diff --git a/nlp_resource_data/nltk/app/srparser_app.py b/nlp_resource_data/nltk/app/srparser_app.py
new file mode 100755 (executable)
index 0000000..0dd5786
--- /dev/null
@@ -0,0 +1,809 @@
+# Natural Language Toolkit: Shift-Reduce Parser Application
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A graphical tool for exploring the shift-reduce parser.
+
+The shift-reduce parser maintains a stack, which records the structure
+of the portion of the text that has been parsed.  The stack is
+initially empty.  Its contents are shown on the left side of the main
+canvas.
+
+On the right side of the main canvas is the remaining text.  This is
+the portion of the text which has not yet been considered by the
+parser.
+
+The parser builds up a tree structure for the text using two
+operations:
+
+  - "shift" moves the first token from the remaining text to the top
+    of the stack.  In the demo, the top of the stack is its right-hand
+    side.
+  - "reduce" uses a grammar production to combine the rightmost stack
+    elements into a single tree token.
+
+You can control the parser's operation by using the "shift" and
+"reduce" buttons; or you can use the "step" button to let the parser
+automatically decide which operation to apply.  The parser uses the
+following rules to decide which operation to apply:
+
+  - Only shift if no reductions are available.
+  - If multiple reductions are available, then apply the reduction
+    whose CFG production is listed earliest in the grammar.
+
+The "reduce" button applies the reduction whose CFG production is
+listed earliest in the grammar.  There are two ways to manually choose
+which reduction to apply:
+
+  - Click on a CFG production from the list of available reductions,
+    on the left side of the main window.  The reduction based on that
+    production will be applied to the top of the stack.
+  - Click on one of the stack elements.  A popup window will appear,
+    containing all available reductions.  Select one, and it will be
+    applied to the top of the stack.
+
+Note that reductions can only be applied to the top of the stack.
+
+Keyboard Shortcuts::
+      [Space]\t Perform the next shift or reduce operation
+      [s]\t Perform a shift operation
+      [r]\t Perform a reduction operation
+      [Ctrl-z]\t Undo most recent operation
+      [Delete]\t Reset the parser
+      [g]\t Show/hide available production list
+      [Ctrl-a]\t Toggle animations
+      [h]\t Help
+      [Ctrl-p]\t Print
+      [q]\t Quit
+
+"""
+
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (IntVar, Listbox, Button, Frame, Label, Menu,
+                               Scrollbar, Tk)
+
+from nltk.tree import Tree
+from nltk.parse import SteppingShiftReduceParser
+from nltk.util import in_idle
+from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget
+from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment
+
+"""
+Possible future improvements:
+  - button/window to change and/or select text.  Just pop up a window
+    with an entry, and let them modify the text; and then retokenize
+    it?  Maybe give a warning if it contains tokens whose types are
+    not in the grammar.
+  - button/window to change and/or select grammar.  Select from
+    several alternative grammars?  Or actually change the grammar?  If
+    the latter, then I'd want to define nltk.draw.cfg, which would be
+    responsible for that.
+"""
+
+class ShiftReduceApp(object):
+    """
+    A graphical tool for exploring the shift-reduce parser.  The tool
+    displays the parser's stack and the remaining text, and allows the
+    user to control the parser's operation.  In particular, the user
+    can shift tokens onto the stack, and can perform reductions on the
+    top elements of the stack.  A "step" button simply steps through
+    the parsing process, performing the operations that
+    ``nltk.parse.ShiftReduceParser`` would use.
+    """
+    def __init__(self, grammar, sent, trace=0):
+        self._sent = sent
+        self._parser = SteppingShiftReduceParser(grammar, trace)
+
+        # Set up the main window.
+        self._top = Tk()
+        self._top.title('Shift Reduce Parser Application')
+
+        # Animations.  animating_lock is a lock to prevent the demo
+        # from performing new operations while it's animating.
+        self._animating_lock = 0
+        self._animate = IntVar(self._top)
+        self._animate.set(10) # = medium
+
+        # The user can hide the grammar.
+        self._show_grammar = IntVar(self._top)
+        self._show_grammar.set(1)
+
+        # Initialize fonts.
+        self._init_fonts(self._top)
+
+        # Set up key bindings.
+        self._init_bindings()
+
+        # Create the basic frames.
+        self._init_menubar(self._top)
+        self._init_buttons(self._top)
+        self._init_feedback(self._top)
+        self._init_grammar(self._top)
+        self._init_canvas(self._top)
+
+        # A popup menu for reducing.
+        self._reduce_menu = Menu(self._canvas, tearoff=0)
+
+        # Reset the demo, and set the feedback frame to empty.
+        self.reset()
+        self._lastoper1['text'] = ''
+
+    #########################################
+    ##  Initialization Helpers
+    #########################################
+
+    def _init_fonts(self, root):
+        # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
+        self._sysfont = Font(font=Button()["font"])
+        root.option_add("*Font", self._sysfont)
+
+        # What's our font size? (default = same as sysfont)
+        self._size = IntVar(root)
+        self._size.set(self._sysfont.cget('size'))
+
+        self._boldfont = Font(family='helvetica', weight='bold',
+                                    size=self._size.get())
+        self._font = Font(family='helvetica',
+                                    size=self._size.get())
+
+    def _init_grammar(self, parent):
+        # Grammar view.
+        self._prodframe = listframe = Frame(parent)
+        self._prodframe.pack(fill='both', side='left', padx=2)
+        self._prodlist_label = Label(self._prodframe,
+                                     font=self._boldfont,
+                                     text='Available Reductions')
+        self._prodlist_label.pack()
+        self._prodlist = Listbox(self._prodframe, selectmode='single',
+                                 relief='groove', background='white',
+                                 foreground='#909090',
+                                 font=self._font,
+                                 selectforeground='#004040',
+                                 selectbackground='#c0f0c0')
+
+        self._prodlist.pack(side='right', fill='both', expand=1)
+
+        self._productions = list(self._parser.grammar().productions())
+        for production in self._productions:
+            self._prodlist.insert('end', (' %s' % production))
+        self._prodlist.config(height=min(len(self._productions), 25))
+
+        # Add a scrollbar.  (The original "more than 25 productions" check is
+        # disabled, so the scrollbar is always shown.)
+        if 1:  # len(self._productions) > 25:
+            listscroll = Scrollbar(self._prodframe,
+                                   orient='vertical')
+            self._prodlist.config(yscrollcommand = listscroll.set)
+            listscroll.config(command=self._prodlist.yview)
+            listscroll.pack(side='left', fill='y')
+
+        # If they select a production, apply it.
+        self._prodlist.bind('<<ListboxSelect>>', self._prodlist_select)
+
+        # When they hover over a production, highlight it.
+        self._hover = -1
+        self._prodlist.bind('<Motion>', self._highlight_hover)
+        self._prodlist.bind('<Leave>', self._clear_hover)
+
+    def _init_bindings(self):
+        # Quit
+        self._top.bind('<Control-q>', self.destroy)
+        self._top.bind('<Control-x>', self.destroy)
+        self._top.bind('<Alt-q>', self.destroy)
+        self._top.bind('<Alt-x>', self.destroy)
+
+        # Ops (step, shift, reduce, undo)
+        self._top.bind('<space>', self.step)
+        self._top.bind('<s>', self.shift)
+        self._top.bind('<Alt-s>', self.shift)
+        self._top.bind('<Control-s>', self.shift)
+        self._top.bind('<r>', self.reduce)
+        self._top.bind('<Alt-r>', self.reduce)
+        self._top.bind('<Control-r>', self.reduce)
+        self._top.bind('<Delete>', self.reset)
+        self._top.bind('<u>', self.undo)
+        self._top.bind('<Alt-u>', self.undo)
+        self._top.bind('<Control-u>', self.undo)
+        self._top.bind('<Control-z>', self.undo)
+        self._top.bind('<BackSpace>', self.undo)
+
+        # Misc
+        self._top.bind('<Control-p>', self.postscript)
+        self._top.bind('<Control-h>', self.help)
+        self._top.bind('<F1>', self.help)
+        self._top.bind('<Control-g>', self.edit_grammar)
+        self._top.bind('<Control-t>', self.edit_sentence)
+
+        # Animation speed control
+        self._top.bind('-', lambda e,a=self._animate:a.set(20))
+        self._top.bind('=', lambda e,a=self._animate:a.set(10))
+        self._top.bind('+', lambda e,a=self._animate:a.set(4))
+
+    def _init_buttons(self, parent):
+        # Set up the frames.
+        self._buttonframe = buttonframe = Frame(parent)
+        buttonframe.pack(fill='none', side='bottom')
+        Button(buttonframe, text='Step',
+               background='#90c0d0', foreground='black',
+               command=self.step,).pack(side='left')
+        Button(buttonframe, text='Shift', underline=0,
+               background='#90f090', foreground='black',
+               command=self.shift).pack(side='left')
+        Button(buttonframe, text='Reduce', underline=0,
+               background='#90f090', foreground='black',
+               command=self.reduce).pack(side='left')
+        Button(buttonframe, text='Undo', underline=0,
+               background='#f0a0a0', foreground='black',
+               command=self.undo).pack(side='left')
+
+    def _init_menubar(self, parent):
+        menubar = Menu(parent)
+
+        filemenu = Menu(menubar, tearoff=0)
+        filemenu.add_command(label='Reset Parser', underline=0,
+                             command=self.reset, accelerator='Del')
+        filemenu.add_command(label='Print to Postscript', underline=0,
+                             command=self.postscript, accelerator='Ctrl-p')
+        filemenu.add_command(label='Exit', underline=1,
+                             command=self.destroy, accelerator='Ctrl-x')
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
+
+        editmenu = Menu(menubar, tearoff=0)
+        editmenu.add_command(label='Edit Grammar', underline=5,
+                             command=self.edit_grammar,
+                             accelerator='Ctrl-g')
+        editmenu.add_command(label='Edit Text', underline=5,
+                             command=self.edit_sentence,
+                             accelerator='Ctrl-t')
+        menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
+
+        rulemenu = Menu(menubar, tearoff=0)
+        rulemenu.add_command(label='Step', underline=1,
+                             command=self.step, accelerator='Space')
+        rulemenu.add_separator()
+        rulemenu.add_command(label='Shift', underline=0,
+                             command=self.shift, accelerator='Ctrl-s')
+        rulemenu.add_command(label='Reduce', underline=0,
+                             command=self.reduce, accelerator='Ctrl-r')
+        rulemenu.add_separator()
+        rulemenu.add_command(label='Undo', underline=0,
+                             command=self.undo, accelerator='Ctrl-u')
+        menubar.add_cascade(label='Apply', underline=0, menu=rulemenu)
+
+        viewmenu = Menu(menubar, tearoff=0)
+        viewmenu.add_checkbutton(label="Show Grammar", underline=0,
+                                 variable=self._show_grammar,
+                                 command=self._toggle_grammar)
+        viewmenu.add_separator()
+        viewmenu.add_radiobutton(label='Tiny', variable=self._size,
+                                 underline=0, value=10, command=self.resize)
+        viewmenu.add_radiobutton(label='Small', variable=self._size,
+                                 underline=0, value=12, command=self.resize)
+        viewmenu.add_radiobutton(label='Medium', variable=self._size,
+                                 underline=0, value=14, command=self.resize)
+        viewmenu.add_radiobutton(label='Large', variable=self._size,
+                                 underline=0, value=18, command=self.resize)
+        viewmenu.add_radiobutton(label='Huge', variable=self._size,
+                                 underline=0, value=24, command=self.resize)
+        menubar.add_cascade(label='View', underline=0, menu=viewmenu)
+
+        animatemenu = Menu(menubar, tearoff=0)
+        animatemenu.add_radiobutton(label="No Animation", underline=0,
+                                    variable=self._animate, value=0)
+        animatemenu.add_radiobutton(label="Slow Animation", underline=0,
+                                    variable=self._animate, value=20,
+                                    accelerator='-')
+        animatemenu.add_radiobutton(label="Normal Animation", underline=0,
+                                    variable=self._animate, value=10,
+                                    accelerator='=')
+        animatemenu.add_radiobutton(label="Fast Animation", underline=0,
+                                    variable=self._animate, value=4,
+                                    accelerator='+')
+        menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
+
+
+        helpmenu = Menu(menubar, tearoff=0)
+        helpmenu.add_command(label='About', underline=0,
+                             command=self.about)
+        helpmenu.add_command(label='Instructions', underline=0,
+                             command=self.help, accelerator='F1')
+        menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
+
+        parent.config(menu=menubar)
+
+    def _init_feedback(self, parent):
+        self._feedbackframe = feedbackframe = Frame(parent)
+        feedbackframe.pack(fill='x', side='bottom', padx=3, pady=3)
+        self._lastoper_label = Label(feedbackframe, text='Last Operation:',
+                                     font=self._font)
+        self._lastoper_label.pack(side='left')
+        lastoperframe = Frame(feedbackframe, relief='sunken', border=1)
+        lastoperframe.pack(fill='x', side='right', expand=1, padx=5)
+        self._lastoper1 = Label(lastoperframe, foreground='#007070',
+                                background='#f0f0f0', font=self._font)
+        self._lastoper2 = Label(lastoperframe, anchor='w', width=30,
+                                foreground='#004040', background='#f0f0f0',
+                                font=self._font)
+        self._lastoper1.pack(side='left')
+        self._lastoper2.pack(side='left', fill='x', expand=1)
+
+    def _init_canvas(self, parent):
+        self._cframe = CanvasFrame(parent, background='white',
+                                   width=525, closeenough=10,
+                                   border=2, relief='sunken')
+        self._cframe.pack(expand=1, fill='both', side='top', pady=2)
+        canvas = self._canvas = self._cframe.canvas()
+
+        self._stackwidgets = []
+        self._rtextwidgets = []
+        self._titlebar = canvas.create_rectangle(0,0,0,0, fill='#c0f0f0',
+                                                 outline='black')
+        self._exprline = canvas.create_line(0,0,0,0, dash='.')
+        self._stacktop = canvas.create_line(0,0,0,0, fill='#408080')
+        size = self._size.get()+4
+        self._stacklabel = TextWidget(canvas, 'Stack', color='#004040',
+                                      font=self._boldfont)
+        self._rtextlabel = TextWidget(canvas, 'Remaining Text',
+                                      color='#004040', font=self._boldfont)
+        self._cframe.add_widget(self._stacklabel)
+        self._cframe.add_widget(self._rtextlabel)
+
+    #########################################
+    ##  Main draw procedure
+    #########################################
+
+    def _redraw(self):
+        scrollregion = self._canvas['scrollregion'].split()
+        (cx1, cy1, cx2, cy2) = [int(c) for c in scrollregion]
+
+        # Delete the old stack & rtext widgets.
+        for stackwidget in self._stackwidgets:
+            self._cframe.destroy_widget(stackwidget)
+        self._stackwidgets = []
+        for rtextwidget in self._rtextwidgets:
+            self._cframe.destroy_widget(rtextwidget)
+        self._rtextwidgets = []
+
+        # Position the titlebar & exprline
+        (x1, y1, x2, y2) = self._stacklabel.bbox()
+        y = y2-y1+10
+        self._canvas.coords(self._titlebar, -5000, 0, 5000, y-4)
+        self._canvas.coords(self._exprline, 0, y*2-10, 5000, y*2-10)
+
+        # Position the titlebar labels..
+        (x1, y1, x2, y2) = self._stacklabel.bbox()
+        self._stacklabel.move(5-x1, 3-y1)
+        (x1, y1, x2, y2) = self._rtextlabel.bbox()
+        self._rtextlabel.move(cx2-x2-5, 3-y1)
+
+        # Draw the stack.
+        stackx = 5
+        for tok in self._parser.stack():
+            if isinstance(tok, Tree):
+                attribs = {'tree_color': '#4080a0', 'tree_width': 2,
+                           'node_font': self._boldfont,
+                           'node_color': '#006060',
+                           'leaf_color': '#006060', 'leaf_font':self._font}
+                widget = tree_to_treesegment(self._canvas, tok,
+                                             **attribs)
+                widget.label()['color'] = '#000000'
+            else:
+                widget = TextWidget(self._canvas, tok,
+                                    color='#000000', font=self._font)
+            widget.bind_click(self._popup_reduce)
+            self._stackwidgets.append(widget)
+            self._cframe.add_widget(widget, stackx, y)
+            stackx = widget.bbox()[2] + 10
+
+        # Draw the remaining text.
+        rtextwidth = 0
+        for tok in self._parser.remaining_text():
+            widget = TextWidget(self._canvas, tok,
+                                color='#000000', font=self._font)
+            self._rtextwidgets.append(widget)
+            self._cframe.add_widget(widget, rtextwidth, y)
+            rtextwidth = widget.bbox()[2] + 4
+
+        # Allow enough room to shift the next token (for animations)
+        if len(self._rtextwidgets) > 0:
+            stackx += self._rtextwidgets[0].width()
+
+        # Move the remaining text to the correct location (keep it
+        # right-justified, when possible); and move the remaining text
+        # label, if necessary.
+        stackx = max(stackx, self._stacklabel.width()+25)
+        rlabelwidth = self._rtextlabel.width()+10
+        if stackx >= cx2-max(rtextwidth, rlabelwidth):
+            cx2 = stackx + max(rtextwidth, rlabelwidth)
+        for rtextwidget in self._rtextwidgets:
+            rtextwidget.move(4+cx2-rtextwidth, 0)
+        self._rtextlabel.move(cx2-self._rtextlabel.bbox()[2]-5, 0)
+
+        midx = (stackx + cx2-max(rtextwidth, rlabelwidth))/2
+        self._canvas.coords(self._stacktop, midx, 0, midx, 5000)
+        (x1, y1, x2, y2) = self._stacklabel.bbox()
+
+        # Set up binding to allow them to shift a token by dragging it.
+        if len(self._rtextwidgets) > 0:
+            def drag_shift(widget, midx=midx, self=self):
+                if widget.bbox()[0] < midx: self.shift()
+                else: self._redraw()
+            self._rtextwidgets[0].bind_drag(drag_shift)
+            self._rtextwidgets[0].bind_click(self.shift)
+
+        # Draw the stack top.
+        self._highlight_productions()
+
+    def _draw_stack_top(self, widget):
+        # hack..
+        midx = widget.bbox()[2]+50
+        self._canvas.coords(self._stacktop, midx, 0, midx, 5000)
+
+    def _highlight_productions(self):
+        # Highlight the productions that can be reduced.
+        self._prodlist.selection_clear(0, 'end')
+        for prod in self._parser.reducible_productions():
+            index = self._productions.index(prod)
+            self._prodlist.selection_set(index)
+
+    #########################################
+    ##  Button Callbacks
+    #########################################
+
+    def destroy(self, *e):
+        if self._top is None: return
+        self._top.destroy()
+        self._top = None
+
+    def reset(self, *e):
+        self._parser.initialize(self._sent)
+        self._lastoper1['text'] = 'Reset App'
+        self._lastoper2['text'] = ''
+        self._redraw()
+
+    def step(self, *e):
+        if self.reduce(): return True
+        elif self.shift(): return True
+        else:
+            if list(self._parser.parses()):
+                self._lastoper1['text'] = 'Finished:'
+                self._lastoper2['text'] = 'Success'
+            else:
+                self._lastoper1['text'] = 'Finished:'
+                self._lastoper2['text'] = 'Failure'
+
+    def shift(self, *e):
+        if self._animating_lock: return
+        if self._parser.shift():
+            tok = self._parser.stack()[-1]
+            self._lastoper1['text'] = 'Shift:'
+            self._lastoper2['text'] = '%r' % tok
+            if self._animate.get():
+                self._animate_shift()
+            else:
+                self._redraw()
+            return True
+        return False
+
+    def reduce(self, *e):
+        if self._animating_lock: return
+        production = self._parser.reduce()
+        if production:
+            self._lastoper1['text'] = 'Reduce:'
+            self._lastoper2['text'] = '%s' % production
+            if self._animate.get():
+                self._animate_reduce()
+            else:
+                self._redraw()
+        return production
+
+    def undo(self, *e):
+        if self._animating_lock: return
+        if self._parser.undo():
+            self._redraw()
+
+    def postscript(self, *e):
+        self._cframe.print_to_file()
+
+    def mainloop(self, *args, **kwargs):
+        """
+        Enter the Tkinter mainloop.  This function must be called if
+        this demo is created from a non-interactive program (e.g.
+        from a script); otherwise, the demo will close as soon as
+        the script completes.
+        """
+        if in_idle(): return
+        self._top.mainloop(*args, **kwargs)
+
+    #########################################
+    ##  Menubar callbacks
+    #########################################
+
+    def resize(self, size=None):
+        if size is not None: self._size.set(size)
+        size = self._size.get()
+        self._font.configure(size=-(abs(size)))
+        self._boldfont.configure(size=-(abs(size)))
+        self._sysfont.configure(size=-(abs(size)))
+
+        #self._stacklabel['font'] = ('helvetica', -size-4, 'bold')
+        #self._rtextlabel['font'] = ('helvetica', -size-4, 'bold')
+        #self._lastoper_label['font'] = ('helvetica', -size)
+        #self._lastoper1['font'] = ('helvetica', -size)
+        #self._lastoper2['font'] = ('helvetica', -size)
+        #self._prodlist['font'] = ('helvetica', -size)
+        #self._prodlist_label['font'] = ('helvetica', -size-2, 'bold')
+        self._redraw()
+
+    def help(self, *e):
+        # The default font's not very legible; try using 'fixed' instead.
+        try:
+            ShowText(self._top, 'Help: Shift-Reduce Parser Application',
+                     (__doc__ or '').strip(), width=75, font='fixed')
+        except:
+            ShowText(self._top, 'Help: Shift-Reduce Parser Application',
+                     (__doc__ or '').strip(), width=75)
+
+    def about(self, *e):
+        ABOUT = ("NLTK Shift-Reduce Parser Application\n"+
+                 "Written by Edward Loper")
+        TITLE = 'About: Shift-Reduce Parser Application'
+        try:
+            from six.moves.tkinter_messagebox import Message
+            Message(message=ABOUT, title=TITLE).show()
+        except:
+            ShowText(self._top, TITLE, ABOUT)
+
+    def edit_grammar(self, *e):
+        CFGEditor(self._top, self._parser.grammar(), self.set_grammar)
+
+    def set_grammar(self, grammar):
+        self._parser.set_grammar(grammar)
+        self._productions = list(grammar.productions())
+        self._prodlist.delete(0, 'end')
+        for production in self._productions:
+            self._prodlist.insert('end', (' %s' % production))
+
+    def edit_sentence(self, *e):
+        sentence = " ".join(self._sent)
+        title = 'Edit Text'
+        instr = 'Enter a new sentence to parse.'
+        EntryDialog(self._top, sentence, instr, self.set_sentence, title)
+
+    def set_sentence(self, sent):
+        self._sent = sent.split() #[XX] use tagged?
+        self.reset()
+
+    #########################################
+    ##  Reduce Production Selection
+    #########################################
+
+    def _toggle_grammar(self, *e):
+        if self._show_grammar.get():
+            self._prodframe.pack(fill='both', side='left', padx=2,
+                                 after=self._feedbackframe)
+            self._lastoper1['text'] = 'Show Grammar'
+        else:
+            self._prodframe.pack_forget()
+            self._lastoper1['text'] = 'Hide Grammar'
+        self._lastoper2['text'] = ''
+
+    def _prodlist_select(self, event):
+        selection = self._prodlist.curselection()
+        if len(selection) != 1: return
+        index = int(selection[0])
+        production = self._parser.reduce(self._productions[index])
+        if production:
+            self._lastoper1['text'] = 'Reduce:'
+            self._lastoper2['text'] = '%s' % production
+            if self._animate.get():
+                self._animate_reduce()
+            else:
+                self._redraw()
+        else:
+            # Reset the production selections.
+            self._prodlist.selection_clear(0, 'end')
+            for prod in self._parser.reducible_productions():
+                index = self._productions.index(prod)
+                self._prodlist.selection_set(index)
+
+    def _popup_reduce(self, widget):
+        # Remove old commands.
+        productions = self._parser.reducible_productions()
+        if len(productions) == 0: return
+
+        self._reduce_menu.delete(0, 'end')
+        for production in productions:
+            self._reduce_menu.add_command(label=str(production),
+                                          command=self.reduce)
+        self._reduce_menu.post(self._canvas.winfo_pointerx(),
+                               self._canvas.winfo_pointery())
+
+    #########################################
+    ##  Animations
+    #########################################
+
+    def _animate_shift(self):
+        # What widget are we shifting?
+        widget = self._rtextwidgets[0]
+
+        # Where are we shifting from & to?
+        right = widget.bbox()[0]
+        if len(self._stackwidgets) == 0: left = 5
+        else: left = self._stackwidgets[-1].bbox()[2]+10
+
+        # Start animating.
+        dt = self._animate.get()
+        dx = (left-right)*1.0/dt
+        self._animate_shift_frame(dt, widget, dx)
+
+    def _animate_shift_frame(self, frame, widget, dx):
+        if frame > 0:
+            self._animating_lock = 1
+            widget.move(dx, 0)
+            self._top.after(10, self._animate_shift_frame,
+                            frame-1, widget, dx)
+        else:
+            # but: stacktop??
+
+            # Shift the widget to the stack.
+            del self._rtextwidgets[0]
+            self._stackwidgets.append(widget)
+            self._animating_lock = 0
+
+            # Display the available productions.
+            self._draw_stack_top(widget)
+            self._highlight_productions()
+
+    def _animate_reduce(self):
+        # What widgets are we shifting?
+        numwidgets = len(self._parser.stack()[-1]) # number of children
+        widgets = self._stackwidgets[-numwidgets:]
+
+        # How far are we moving?
+        if isinstance(widgets[0], TreeSegmentWidget):
+            ydist = 15 + widgets[0].label().height()
+        else:
+            ydist = 15 + widgets[0].height()
+
+        # Start animating.
+        dt = self._animate.get()
+        dy = ydist*2.0/dt
+        self._animate_reduce_frame(dt/2, widgets, dy)
+
+    def _animate_reduce_frame(self, frame, widgets, dy):
+        if frame > 0:
+            self._animating_lock = 1
+            for widget in widgets: widget.move(0, dy)
+            self._top.after(10, self._animate_reduce_frame,
+                            frame-1, widgets, dy)
+        else:
+            del self._stackwidgets[-len(widgets):]
+            for widget in widgets:
+                self._cframe.remove_widget(widget)
+            tok = self._parser.stack()[-1]
+            if not isinstance(tok, Tree): raise ValueError()
+            label = TextWidget(self._canvas, str(tok.label()), color='#006060',
+                               font=self._boldfont)
+            widget = TreeSegmentWidget(self._canvas, label, widgets,
+                                       width=2)
+            (x1, y1, x2, y2) = self._stacklabel.bbox()
+            y = y2-y1+10
+            if not self._stackwidgets: x = 5
+            else: x = self._stackwidgets[-1].bbox()[2] + 10
+            self._cframe.add_widget(widget, x, y)
+            self._stackwidgets.append(widget)
+
+            # Display the available productions.
+            self._draw_stack_top(widget)
+            self._highlight_productions()
+
+#             # Delete the old widgets..
+#             del self._stackwidgets[-len(widgets):]
+#             for widget in widgets:
+#                 self._cframe.destroy_widget(widget)
+#
+#             # Make a new one.
+#             tok = self._parser.stack()[-1]
+#             if isinstance(tok, Tree):
+#                 attribs = {'tree_color': '#4080a0', 'tree_width': 2,
+#                            'node_font': bold, 'node_color': '#006060',
+#                            'leaf_color': '#006060', 'leaf_font':self._font}
+#                 widget = tree_to_treesegment(self._canvas, tok.type(),
+#                                              **attribs)
+#                 widget.node()['color'] = '#000000'
+#             else:
+#                 widget = TextWidget(self._canvas, tok.type(),
+#                                     color='#000000', font=self._font)
+#             widget.bind_click(self._popup_reduce)
+#             (x1, y1, x2, y2) = self._stacklabel.bbox()
+#             y = y2-y1+10
+#             if not self._stackwidgets: x = 5
+#             else: x = self._stackwidgets[-1].bbox()[2] + 10
+#             self._cframe.add_widget(widget, x, y)
+#             self._stackwidgets.append(widget)
+
+            #self._redraw()
+            self._animating_lock = 0
+
+    #########################################
+    ##  Hovering.
+    #########################################
+
+    def _highlight_hover(self, event):
+        # What production are we hovering over?
+        index = self._prodlist.nearest(event.y)
+        if self._hover == index: return
+
+        # Clear any previous hover highlighting.
+        self._clear_hover()
+
+        # If the production corresponds to an available reduction,
+        # highlight the stack.
+        selection = [int(s) for s in self._prodlist.curselection()]
+        if index in selection:
+            rhslen = len(self._productions[index].rhs())
+            for stackwidget in self._stackwidgets[-rhslen:]:
+                if isinstance(stackwidget, TreeSegmentWidget):
+                    stackwidget.label()['color'] = '#00a000'
+                else:
+                    stackwidget['color'] = '#00a000'
+
+        # Remember what production we're hovering over.
+        self._hover = index
+
+    def _clear_hover(self, *event):
+        # Clear any previous hover highlighting.
+        if self._hover == -1: return
+        self._hover = -1
+        for stackwidget in self._stackwidgets:
+            if isinstance(stackwidget, TreeSegmentWidget):
+                stackwidget.label()['color'] = 'black'
+            else:
+                stackwidget['color'] = 'black'
+
+
+def app():
+    """
+    Create a shift reduce parser app, using a simple grammar and
+    text.
+    """
+
+    from nltk.grammar import Nonterminal, Production, CFG
+    nonterminals = 'S VP NP PP P N Name V Det'
+    (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s)
+                                           for s in nonterminals.split()]
+
+    productions = (
+        # Syntactic Productions
+        Production(S, [NP, VP]),
+        Production(NP, [Det, N]),
+        Production(NP, [NP, PP]),
+        Production(VP, [VP, PP]),
+        Production(VP, [V, NP, PP]),
+        Production(VP, [V, NP]),
+        Production(PP, [P, NP]),
+
+        # Lexical Productions
+        Production(NP, ['I']),   Production(Det, ['the']),
+        Production(Det, ['a']),  Production(N, ['man']),
+        Production(V, ['saw']),  Production(P, ['in']),
+        Production(P, ['with']), Production(N, ['park']),
+        Production(N, ['dog']),  Production(N, ['statue']),
+        Production(Det, ['my']),
+        )
+
+    grammar = CFG(S, productions)
+
+    # tokenize the sentence
+    sent = 'my dog saw a man in the park with a statue'.split()
+
+    ShiftReduceApp(grammar, sent).mainloop()
+
+if __name__ == '__main__':
+    app()
+
+__all__ = ['app']
diff --git a/nlp_resource_data/nltk/app/srparser_app.pyc b/nlp_resource_data/nltk/app/srparser_app.pyc
new file mode 100755 (executable)
index 0000000..584ef70
Binary files /dev/null and b/nlp_resource_data/nltk/app/srparser_app.pyc differ
diff --git a/nlp_resource_data/nltk/app/wordfreq_app.py b/nlp_resource_data/nltk/app/wordfreq_app.py
new file mode 100755 (executable)
index 0000000..23bc796
--- /dev/null
@@ -0,0 +1,32 @@
+# Natural Language Toolkit: Wordfreq Application
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from matplotlib import pylab
+from nltk.text import Text
+from nltk.corpus import gutenberg
+
+def plot_word_freq_dist(text):
+    fd = text.vocab()
+
+    samples = [item for item, _ in fd.most_common(50)]
+    values = [fd[sample] for sample in samples]
+    values = [sum(values[:i+1]) * 100.0/fd.N() for i in range(len(values))]
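+    # e.g. with top-sample counts [5, 3, 2, ...] and fd.N() == 20 tokens in
+    # total, this yields [25.0, 40.0, 50.0, ...]: each point is the running
+    # total of counts expressed as a percentage of all tokens.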
+    pylab.title(text.name)
+    pylab.xlabel("Samples")
+    pylab.ylabel("Cumulative Percentage")
+    pylab.plot(values)
+    pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90)
+    pylab.show()
+
+def app():
+    t1 = Text(gutenberg.words('melville-moby_dick.txt'))
+    plot_word_freq_dist(t1)
+
+if __name__ == '__main__':
+    app()
+
+__all__ = ['app']
diff --git a/nlp_resource_data/nltk/app/wordfreq_app.pyc b/nlp_resource_data/nltk/app/wordfreq_app.pyc
new file mode 100755 (executable)
index 0000000..fde9608
Binary files /dev/null and b/nlp_resource_data/nltk/app/wordfreq_app.pyc differ
diff --git a/nlp_resource_data/nltk/app/wordnet_app.py b/nlp_resource_data/nltk/app/wordnet_app.py
new file mode 100755 (executable)
index 0000000..13807bc
--- /dev/null
@@ -0,0 +1,970 @@
+# Natural Language Toolkit: WordNet Browser Application
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
+#         Paul Bone <pbone@students.csse.unimelb.edu.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A WordNet Browser application which launches the default browser
+(if it is not already running) and opens a new tab with a connection
+to http://localhost:port/ .  It also starts an HTTP server on the
+specified port and begins serving browser requests.  The default
+port is 8000.  (For command-line help, run "python wordnet -h")
+This application requires that the user's web browser supports
+Javascript.
+
+BrowServer is a server for browsing the NLTK Wordnet database.  It first
+launches a browser client to be used for browsing and then starts
+serving the requests of that client, and possibly of others.
+
+Usage::
+
+    browserver.py -h
+    browserver.py [-s] [-p <port>]
+
+Options::
+
+    -h or --help
+        Display this help message.
+
+    -l <file> or --log-file <file>
+        Log messages to the given file.  If this option is not specified,
+        messages are silently dropped.
+
+    -p <port> or --port <port>
+        Run the web server on this TCP port, defaults to 8000.
+
+    -s or --server-mode
+        Do not start a web browser, and do not allow a user to
+        shut down the server through the web interface.
+"""
+# TODO: throughout this package variable names and docstrings need
+# modifying to be compliant with NLTK's coding standards.  Tests also
+# need to be developed to ensure this continues to work in the face of
+# changes to other NLTK packages.
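+#
+# The server can also be started programmatically, without the command-line
+# wrapper; a minimal sketch, assuming the WordNet corpus data is installed:
+#
+#     from nltk.app.wordnet_app import wnb
+#
+#     # Serve on port 8000 and open the default web browser; this call blocks
+#     # until the server is interrupted or killed.
+#     wnb(port=8000, runBrowser=True)
+#
+#     # Headless variant: no browser, log requests to a file.
+#     # wnb(port=8080, runBrowser=False, logfilename='wordnet_browser.log')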
+from __future__ import print_function
+
+# Allow this program to run inside the NLTK source tree.
+from sys import path
+
+import os
+import sys
+from sys import argv
+from collections import defaultdict
+import webbrowser
+import datetime
+import re
+import threading
+import time
+import getopt
+import base64
+import pickle
+import copy
+
+from six.moves.urllib.parse import unquote_plus
+
+from nltk import compat
+from nltk.corpus import wordnet as wn
+from nltk.corpus.reader.wordnet import Synset, Lemma
+
+if compat.PY3:
+    from http.server import HTTPServer, BaseHTTPRequestHandler
+else:
+    from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
+
+# now included in local file
+# from util import html_header, html_trailer, \
+#    get_static_index_page, get_static_page_by_path, \
+#    page_from_word, page_from_href
+
+firstClient = True
+
+# True if we're not also running a web browser.  The value of server_mode
+# is set by wnb().
+server_mode = None
+
+# If set, this is a file object for writing log messages.
+logfile = None
+
+
+class MyServerHandler(BaseHTTPRequestHandler):
+
+    def do_HEAD(self):
+        self.send_head()
+
+    def do_GET(self):
+        global firstClient
+        sp = self.path[1:]
+        if unquote_plus(sp) == 'SHUTDOWN THE SERVER':
+            if server_mode:
+                page = "Server must be killed with SIGTERM."
+                type = "text/plain"
+            else:
+                print('Server shutting down!')
+                os._exit(0)
+
+        elif sp == '': # First request.
+            type = 'text/html'
+            if not server_mode and firstClient:
+                firstClient = False
+                page = get_static_index_page(True)
+            else:
+                page = get_static_index_page(False)
+            word = 'green'
+
+        elif sp.endswith('.html'): # Trying to fetch an HTML file. TODO:
+            type = 'text/html'
+            usp = unquote_plus(sp)
+            if usp == 'NLTK Wordnet Browser Database Info.html':
+                word = '* Database Info *'
+                if os.path.isfile(usp):
+                    with open(usp, 'r') as infile:
+                        page = infile.read()
+                else:
+                    page = (html_header % word) + \
+                        '<p>The database info file:'\
+                        '<p><b>' + usp + '</b>' + \
+                        '<p>was not found. Run this:' + \
+                        '<p><b>python dbinfo_html.py</b>' + \
+                        '<p>to produce it.' + html_trailer
+            else:
+                # Handle files here.
+                word = sp
+                page = get_static_page_by_path(usp)
+        elif sp.startswith("search"):
+            # This doesn't seem to work with MWEs.
+            type = 'text/html'
+            parts = (sp.split("?")[1]).split("&")
+            word = [p.split("=")[1].replace("+", " ")
+                    for p in parts if p.startswith("nextWord")][0]
+            page, word = page_from_word(word)
+        elif sp.startswith("lookup_"):
+            # TODO: add a variation of this that takes a non-encoded word or MWE.
+            type = 'text/html'
+            sp = sp[len("lookup_"):]
+            page, word = page_from_href(sp)
+        elif sp == "start_page":
+            # if this is the first request we should display help
+            # information, and possibly set a default word.
+            type = 'text/html'
+            page, word = page_from_word("wordnet")
+        else:
+            type = 'text/plain'
+            page = "Could not parse request: '%s'" % sp
+
+        # Send result.
+        self.send_head(type)
+        self.wfile.write(page.encode('utf8'))
+
+
+    def send_head(self, type=None):
+        self.send_response(200)
+        self.send_header('Content-type', type)
+        self.end_headers()
+
+    def log_message(self, format, *args):
+        global logfile
+
+        if logfile:
+            logfile.write(
+                "%s - - [%s] %s\n" %
+                (self.address_string(),
+                 self.log_date_time_string(),
+                 format%args))
+
+
+def get_unique_counter_from_url(sp):
+    """
+    Extract the unique counter from the URL if it has one.  Otherwise return
+    None.
+    """
+    pos = sp.rfind('%23')
+    if pos != -1:
+        return int(sp[(pos + 3):])
+    else:
+        return None
+
+
+def wnb(port=8000, runBrowser=True, logfilename=None):
+    """
+    Run NLTK Wordnet Browser Server.
+
+    :param port: The port number for the server to listen on, defaults to
+                 8000
+    :type  port: int
+
+    :param runBrowser: True to start a web browser and point it at the web
+                       server.
+    :type  runBrowser: bool
+    """
+    # The webbrowser module is unpredictable: typically it blocks if it uses
+    # a console web browser, and doesn't block if it uses a GUI web browser,
+    # so we need to force it to have clear, correct behaviour.
+    #
+    # Normally the server should run for as long as the user wants.  They
+    # should ideally be able to control this from the UI by closing the
+    # window or tab.  Second best would be clicking a button to say
+    # 'Shutdown' that first shuts down the server and closes the window or
+    # tab, or exits the text-mode browser.  Both of these are unfeasible.
+    #
+    # The next best alternative is to start the server, have it close when
+    # it receives SIGTERM (default), and run the browser as well.  The user
+    # may have to shutdown both programs.
+    #
+    # Since webbrowser may block, and the webserver will block, we must run
+    # them in separate threads.
+    #
+    global server_mode, logfile
+    server_mode = not runBrowser
+
+    # Setup logging.
+    if logfilename:
+        try:
+            logfile = open(logfilename, "a", 1) # 1 means 'line buffering'
+        except IOError as e:
+            sys.stderr.write("Couldn't open %s for writing: %s\n" %
+                             (logfilename, e))
+            sys.exit(1)
+    else:
+        logfile = None
+
+    # Compute URL and start web browser
+    url = 'http://localhost:' + str(port)
+
+    server_ready = None
+    browser_thread = None
+
+    if runBrowser:
+        server_ready = threading.Event()
+        browser_thread = startBrowser(url, server_ready)
+
+    # Start the server.
+    server = HTTPServer(('', port), MyServerHandler)
+    if logfile:
+        logfile.write(
+            'NLTK Wordnet browser server running, serving: %s\n' % url)
+    if runBrowser:
+        server_ready.set()
+
+    try:
+        server.serve_forever()
+    except KeyboardInterrupt:
+        pass
+
+    if runBrowser:
+        browser_thread.join()
+
+    if logfile:
+        logfile.close()
+
+
+def startBrowser(url, server_ready):
+    def run():
+        server_ready.wait()
+        time.sleep(1) # Wait a little bit more, there's still the chance of
+                      # a race condition.
+        webbrowser.open(url, new = 2, autoraise = 1)
+    t = threading.Thread(target=run)
+    t.start()
+    return t
+
+#####################################################################
+# Utilities
+#####################################################################
+
+
+"""
+WordNet Browser Utilities.
+
+This provides a backend to both wxbrowse and browserver.py.
+"""
+
+\f
+################################################################################
+#
+# Main logic for wordnet browser.
+#
+
+# This is wrapped inside a function since wn is only available if the
+# WordNet corpus is installed.
+def _pos_tuples():
+    return [
+        (wn.NOUN,'N','noun'),
+        (wn.VERB,'V','verb'),
+        (wn.ADJ,'J','adj'),
+        (wn.ADV,'R','adv')]
+
+def _pos_match(pos_tuple):
+    """
+    This function returns the complete pos tuple for the partial pos
+    tuple given to it.  It attempts to match it against the first
+    non-null component of the given pos tuple.
+    """
+    if pos_tuple[0] == 's':
+        pos_tuple = ('a', pos_tuple[1], pos_tuple[2])
+    for n,x in enumerate(pos_tuple):
+        if x is not None:
+            break
+    for pt in _pos_tuples():
+        if pt[n] == pos_tuple[n]: return pt
+    return None
+
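+# For example, _pos_match((None, 'V', None)) and _pos_match((wn.VERB, None, None))
+# both return (wn.VERB, 'V', 'verb'); a satellite-adjective tuple ('s', ...) is
+# folded into the plain adjective entry before matching.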
+
+HYPONYM = 0
+HYPERNYM = 1
+CLASS_REGIONAL = 2
+PART_HOLONYM = 3
+PART_MERONYM = 4
+ATTRIBUTE = 5
+SUBSTANCE_HOLONYM = 6
+SUBSTANCE_MERONYM = 7
+MEMBER_HOLONYM = 8
+MEMBER_MERONYM = 9
+VERB_GROUP = 10
+INSTANCE_HYPONYM = 12
+INSTANCE_HYPERNYM = 13
+CAUSE = 14
+ALSO_SEE = 15
+SIMILAR = 16
+ENTAILMENT = 17
+ANTONYM = 18
+FRAMES = 19
+PERTAINYM = 20
+
+CLASS_CATEGORY = 21
+CLASS_USAGE = 22
+CLASS_REGIONAL = 23
+CLASS_USAGE = 24
+CLASS_CATEGORY = 11
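+# Note: several of the CLASS_* names above are bound more than once; only the
+# last assignment of each name takes effect (CLASS_CATEGORY ends up as 11,
+# CLASS_REGIONAL as 23 and CLASS_USAGE as 24).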
+
+DERIVATIONALLY_RELATED_FORM = 25
+
+INDIRECT_HYPERNYMS = 26
+
+
+def lemma_property(word, synset, func):
+
+    def flatten(l):
+        if l == []:
+            return []
+        else:
+            return l[0] + flatten(l[1:])
+
+    return flatten([func(l) for l in synset.lemmas() if l.name() == word])
+
+
+def rebuild_tree(orig_tree):
+    node = orig_tree[0]
+    children = orig_tree[1:]
+    return (node, [rebuild_tree(t) for t in children])
+
+
+def get_relations_data(word, synset):
+    """
+    Get synset relations data for a synset.  Note that this doesn't
+    yet support things such as full hyponym vs direct hyponym.
+    """
+    if synset.pos() == wn.NOUN:
+        return ((HYPONYM, 'Hyponyms',
+                   synset.hyponyms()),
+                (INSTANCE_HYPONYM , 'Instance hyponyms',
+                   synset.instance_hyponyms()),
+                (HYPERNYM, 'Direct hypernyms',
+                   synset.hypernyms()),
+                (INDIRECT_HYPERNYMS, 'Indirect hypernyms',
+                   rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1]),
+#  hypernyms', 'Sister terms',
+                (INSTANCE_HYPERNYM , 'Instance hypernyms',
+                   synset.instance_hypernyms()),
+#            (CLASS_REGIONAL, ['domain term region'], ),
+                (PART_HOLONYM, 'Part holonyms',
+                   synset.part_holonyms()),
+                (PART_MERONYM, 'Part meronyms',
+                   synset.part_meronyms()),
+                (SUBSTANCE_HOLONYM, 'Substance holonyms',
+                   synset.substance_holonyms()),
+                (SUBSTANCE_MERONYM, 'Substance meronyms',
+                   synset.substance_meronyms()),
+                (MEMBER_HOLONYM, 'Member holonyms',
+                   synset.member_holonyms()),
+                (MEMBER_MERONYM, 'Member meronyms',
+                   synset.member_meronyms()),
+                (ATTRIBUTE, 'Attributes',
+                   synset.attributes()),
+                (ANTONYM, "Antonyms",
+                   lemma_property(word, synset, lambda l: l.antonyms())),
+                (DERIVATIONALLY_RELATED_FORM, "Derivationally related form",
+                   lemma_property(word, synset, lambda l: l.derivationally_related_forms())))
+    elif synset.pos() == wn.VERB:
+        return ((ANTONYM, 'Antonym',
+                   lemma_property(word, synset, lambda l: l.antonyms())),
+                (HYPONYM, 'Hyponym',
+                   synset.hyponyms()),
+                (HYPERNYM, 'Direct hypernyms',
+                   synset.hypernyms()),
+                (INDIRECT_HYPERNYMS, 'Indirect hypernyms',
+                   rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1]),
+                (ENTAILMENT, 'Entailments',
+                   synset.entailments()),
+                (CAUSE, 'Causes',
+                   synset.causes()),
+                (ALSO_SEE, 'Also see',
+                   synset.also_sees()),
+                (VERB_GROUP, 'Verb Groups',
+                   synset.verb_groups()),
+                (DERIVATIONALLY_RELATED_FORM, "Derivationally related form",
+                   lemma_property(word, synset, lambda l: l.derivationally_related_forms())))
+    elif synset.pos() == wn.ADJ or synset.pos() == wn.ADJ_SAT:
+        return ((ANTONYM, 'Antonym',
+                   lemma_property(word, synset, lambda l: l.antonyms())),
+                (SIMILAR, 'Similar to',
+                   synset.similar_tos()),
+                # Participle of verb - not supported by corpus
+                (PERTAINYM, 'Pertainyms',
+                   lemma_property(word, synset, lambda l: l.pertainyms())),
+                (ATTRIBUTE, 'Attributes',
+                   synset.attributes()),
+                (ALSO_SEE, 'Also see',
+                   synset.also_sees()))
+    elif synset.pos() == wn.ADV:
+        # This is weird: adverbs such as 'quick' and 'fast' don't seem
+        # to have antonyms returned by the corpus.
+        return ((ANTONYM, 'Antonym',
+                   lemma_property(word, synset, lambda l: l.antonyms())),)
+                # Derived from adjective - not supported by corpus
+    else:
+        raise TypeError("Unhandled synset POS type: " + str(synset.pos()))
+
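+# A minimal sketch of how this relation data can be consumed, assuming the
+# WordNet corpus data is installed:
+#
+#     syn = wn.synsets('dog', wn.NOUN)[0]
+#     for _relation_id, display_name, targets in get_relations_data('dog', syn):
+#         if targets:
+#             print(display_name, targets)   # e.g. 'Hyponyms', 'Direct hypernyms', ...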
+
+html_header = '''
+<!DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
+'http://www.w3.org/TR/html4/strict.dtd'>
+<html>
+<head>
+<meta name='generator' content=
+'HTML Tidy for Windows (vers 14 February 2006), see www.w3.org'>
+<meta http-equiv='Content-Type' content=
+'text/html; charset=us-ascii'>
+<title>NLTK Wordnet Browser display of: %s</title></head>
+<body bgcolor='#F5F5F5' text='#000000'>
+'''
+html_trailer = '''
+</body>
+</html>
+'''
+
+explanation  = '''
+<h3>Search Help</h3>
+<ul><li>The display below the line is an example of the output the browser
+shows you when you enter a search word. The search word was <b>green</b>.</li>
+<li>The search result shows for different parts of speech the <b>synsets</b>
+i.e. different meanings for the word.</li>
+<li>All underlined texts are hypertext links. There are two types of links:
+word links and others. Clicking a word link carries out a search for the word
+in the Wordnet database.</li>
+<li>Clicking a link of the other type opens a display section of data attached
+to that link. Clicking that link a second time closes the section again.</li>
+<li>Clicking <u>S:</u> opens a section showing the relations for that synset.
+</li>
+<li>Clicking on a relation name opens a section that displays the associated
+synsets.</li>
+<li>Type a search word in the <b>Word</b> field and start the search by the
+<b>Enter/Return</b> key or click the <b>Search</b> button.</li>
+</ul>
+<hr width='100%'>
+'''
+
+# HTML oriented functions
+
+def _bold(txt): return '<b>%s</b>' % txt
+
+def _center(txt): return '<center>%s</center>' % txt
+
+def _hlev(n,txt): return '<h%d>%s</h%d>' % (n,txt,n)
+
+def _italic(txt): return '<i>%s</i>' % txt
+
+def _li(txt): return '<li>%s</li>' % txt
+
+def pg(word, body):
+    '''
+    Return an HTML page in NLTK Browser format constructed from the
+    word and body.
+
+    :param word: The word that the body corresponds to
+    :type word: str
+    :param body: The HTML body corresponding to the word
+    :type body: str
+    :return: an HTML page for the word-body combination
+    :rtype: str
+    '''
+    return (html_header % word) + body + html_trailer
+
+def _ul(txt): return '<ul>' + txt + '</ul>'
+
+def _abbc(txt):
+    """
+    abbc = asterisks, breaks, bold, center
+    """
+    return _center(_bold('<br>'*10 + '*'*10 + ' ' + txt + ' ' + '*'*10))
+
+full_hyponym_cont_text = \
+    _ul(_li(_italic('(has full hyponym continuation)'))) + '\n'
+
+
+def _get_synset(synset_key):
+    """
+    The synset key is the unique name of the synset; it can be
+    retrieved via synset.name().
+    """
+    return wn.synset(synset_key)
+
+def _collect_one_synset(word, synset, synset_relations):
+    '''
+    Returns the HTML string for one synset or word
+
+    :param word: the current word
+    :type word: str
+    :param synset: a synset
+    :type synset: synset
+    :param synset_relations: information about which synset relations
+    to display.
+    :type synset_relations: dict(synset_key, set(relation_id))
+    :return: The HTML string built for this synset
+    :rtype: str
+    '''
+    if isinstance(synset, tuple): # It's a word
+        raise NotImplementedError("word not supported by _collect_one_synset")
+
+    typ = 'S'
+    pos_tuple = _pos_match((synset.pos(), None, None))
+    assert pos_tuple is not None, "pos_tuple is null: synset.pos(): %s" % synset.pos()
+    descr = pos_tuple[2]
+    ref = copy.deepcopy(Reference(word, synset_relations))
+    ref.toggle_synset(synset)
+    synset_label = typ + ";"
+    if synset.name() in synset_relations:
+        synset_label = _bold(synset_label)
+    s = '<li>%s (%s) ' % (make_lookup_link(ref, synset_label), descr)
+    def format_lemma(w):
+        w = w.replace('_', ' ')
+        if w.lower() == word:
+            return _bold(w)
+        else:
+            ref = Reference(w)
+            return make_lookup_link(ref, w)
+
+    s += ', '.join(format_lemma(l.name()) for l in synset.lemmas())
+
+    gl = " (%s) <i>%s</i> " % \
+        (synset.definition(),
+         "; ".join("\"%s\"" % e for e in synset.examples()))
+    return s + gl + _synset_relations(word, synset, synset_relations) + '</li>\n'
+
+def _collect_all_synsets(word, pos, synset_relations=dict()):
+    """
+    Return an HTML unordered list of synsets for the given word and
+    part of speech.
+    """
+    return '<ul>%s\n</ul>\n' % \
+        ''.join((_collect_one_synset(word, synset, synset_relations)
+                 for synset
+                 in wn.synsets(word, pos)))
+
+def _synset_relations(word, synset, synset_relations):
+    '''
+    Builds the HTML string for the relations of a synset
+
+    :param word: The current word
+    :type word: str
+    :param synset: The synset for which we're building the relations.
+    :type synset: Synset
+    :param synset_relations: synset keys and relation types for which to display relations.
+    :type synset_relations: dict(synset_key, set(relation_type))
+    :return: The HTML for a synset's relations
+    :rtype: str
+    '''
+
+    if not synset.name() in synset_relations:
+        return ""
+    ref = Reference(word, synset_relations)
+
+    def relation_html(r):
+        if isinstance(r, Synset):
+            return make_lookup_link(Reference(r.lemma_names()[0]), r.lemma_names()[0])
+        elif isinstance(r, Lemma):
+            return relation_html(r.synset())
+        elif isinstance(r, tuple):
+            # It's probably a tuple containing a Synset and a list of
+            # similar tuples.  This forms a tree of synsets.
+            return "%s\n<ul>%s</ul>\n" % \
+                (relation_html(r[0]),
+                 ''.join('<li>%s</li>\n' % relation_html(sr) for sr in r[1]))
+        else:
+            raise TypeError("r must be a synset, lemma or list, it was: type(r) = %s, r = %s" % (type(r), r))
+
+    def make_synset_html(db_name, disp_name, rels):
+        synset_html = '<i>%s</i>\n' % \
+            make_lookup_link(
+                copy.deepcopy(ref).toggle_synset_relation(synset, db_name).encode(),
+                disp_name)
+
+        if db_name in ref.synset_relations[synset.name()]:
+            synset_html += '<ul>%s</ul>\n' % \
+                ''.join("<li>%s</li>\n" % relation_html(r) for r in rels)
+
+        return synset_html
+
+    html = '<ul>' + \
+        '\n'.join(("<li>%s</li>" % make_synset_html(*rel_data) for rel_data
+                   in get_relations_data(word, synset)
+                   if rel_data[2] != [])) + \
+        '</ul>'
+
+    return html
+
+
+class Reference(object):
+    """
+    A reference to a page that may be generated by page_word
+    """
+
+    def __init__(self, word, synset_relations=dict()):
+        """
+        Build a reference to a new page.
+
+        word is the word or words (separated by commas) for which to
+        search for synsets.
+
+        synset_relations is a dictionary mapping synset keys to sets of
+        synset relation identifiers, naming the relations to unfold for
+        each synset.
+        """
+        self.word = word
+        self.synset_relations = synset_relations
+
+    def encode(self):
+        """
+        Encode this reference into a string to be used in a URL.
+        """
+        # This uses a tuple rather than an object since the python
+        # pickle representation is much smaller and there is no need
+        # to represent the complete object.
+        string = pickle.dumps((self.word, self.synset_relations), -1)
+        return base64.urlsafe_b64encode(string).decode()
+
+    @staticmethod
+    def decode(string):
+        """
+        Decode a reference encoded with Reference.encode
+        """
+        string = base64.urlsafe_b64decode(string.encode())
+        word, synset_relations = pickle.loads(string)
+        return Reference(word, synset_relations)
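+    # A round trip through encode()/decode(), for illustration:
+    #
+    #     ref = Reference('dog')
+    #     token = ref.encode()              # URL-safe base64 of a pickled tuple
+    #     Reference.decode(token).word      # -> 'dog'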
+
+    def toggle_synset_relation(self, synset, relation):
+        """
+        Toggle the display of the relations for the given synset and
+        relation type.
+
+        This function will throw a KeyError if the synset is currently
+        not being displayed.
+        """
+        if relation in self.synset_relations[synset.name()]:
+            self.synset_relations[synset.name()].remove(relation)
+        else:
+            self.synset_relations[synset.name()].add(relation)
+
+        return self
+
+    def toggle_synset(self, synset):
+        """
+        Toggle displaying of the relation types for the given synset
+        """
+        if synset.name() in self.synset_relations:
+            del self.synset_relations[synset.name()]
+        else:
+            self.synset_relations[synset.name()] = set()
+
+        return self
+
+
+def make_lookup_link(ref, label):
+    return '<a href="lookup_%s">%s</a>' % (ref.encode(), label)
+
+
+def page_from_word(word):
+    """
+    Return an HTML page for the given word.
+
+    :param word: The currently active word
+    :type word: str
+    :return: A tuple (page,word), where page is the new current HTML page
+             to be sent to the browser and
+             word is the new current word
+    :rtype: A tuple (str,str)
+    """
+    return page_from_reference(Reference(word))
+
+def page_from_href(href):
+    '''
+    Returns a tuple of the HTML page built and the new current word
+
+    :param href: The hypertext reference to be resolved
+    :type href: str
+    :return: A tuple (page,word), where page is the new current HTML page
+             to be sent to the browser and
+             word is the new current word
+    :rtype: A tuple (str,str)
+    '''
+    return page_from_reference(Reference.decode(href))
+
+def page_from_reference(href):
+    '''
+    Returns a tuple of the HTML page built and the new current word
+
+    :param href: The reference to be resolved
+    :type href: Reference
+    :return: A tuple (page,word), where page is the new current HTML page
+             to be sent to the browser and
+             word is the new current word
+    :rtype: A tuple (str,str)
+    '''
+    word = href.word
+    pos_forms = defaultdict(list)
+    words = word.split(',')
+    words = [w for w in [w.strip().lower().replace(' ', '_')
+                         for w in words]
+             if w != ""]
+    if len(words) == 0:
+        # No words were found.
+        return "", "Please specify a word to search for."
+
+    # This looks up multiple words at once.  This is probably not
+    # necessary and may lead to problems.
+    for w in words:
+        for pos in [wn.NOUN, wn.VERB, wn.ADJ, wn.ADV]:
+            form = wn.morphy(w, pos)
+            if form and form not in pos_forms[pos]:
+                pos_forms[pos].append(form)
+    body = ''
+    for pos,pos_str,name in _pos_tuples():
+        if pos in pos_forms:
+            body += _hlev(3, name) + '\n'
+            for w in pos_forms[pos]:
+                # Not all words of exc files are in the database, skip
+                # to the next word if a KeyError is raised.
+                try:
+                    body += _collect_all_synsets(w, pos, href.synset_relations)
+                except KeyError:
+                    pass
+    if not body:
+        body = "The word or words '%s' where not found in the dictonary." % word
+    return body, word
+
+
+\f
+#####################################################################
+# Static pages
+#####################################################################
+
+def get_static_page_by_path(path):
+    """
+    Return a static HTML page from the path given.
+    """
+    if path == "index_2.html":
+        return get_static_index_page(False)
+    elif path == "index.html":
+        return get_static_index_page(True)
+    elif path == "NLTK Wordnet Browser Database Info.html":
+        return "Display of Wordnet Database Statistics is not supported"
+    elif path == "upper_2.html":
+        return get_static_upper_page(False)
+    elif path == "upper.html":
+        return get_static_upper_page(True)
+    elif path == "web_help.html":
+        return get_static_web_help_page()
+    elif path == "wx_help.html":
+        return get_static_wx_help_page()
+    else:
+        return "Internal error: Path for static page '%s' is unknown" % path
+
+
+def get_static_web_help_page():
+    """
+    Return the static web help page.
+    """
+    return \
+"""
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+     <!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
+            Copyright (C) 2001-2017 NLTK Project
+            Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
+            URL: <http://nltk.org/>
+            For license information, see LICENSE.TXT -->
+     <head>
+          <meta http-equiv='Content-Type' content='text/html; charset=us-ascii'>
+          <title>NLTK Wordnet Browser display of: * Help *</title>
+     </head>
+<body bgcolor='#F5F5F5' text='#000000'>
+<h2>NLTK Wordnet Browser Help</h2>
+<p>The NLTK Wordnet Browser is a tool for browsing the Wordnet database. It tries to behave like the Wordnet project's own web interface, the difference being that the NLTK Wordnet Browser uses a local Wordnet database.</p>
+<p><b>You are using the Javascript client part of the NLTK Wordnet BrowserServer.</b> We assume that your browser has tabbed browsing enabled.</p>
+<p>For background information on Wordnet, see the Wordnet project home page: <a href="http://wordnet.princeton.edu/"><b> http://wordnet.princeton.edu/</b></a>. For more information on the NLTK project, see the project home:
+<a href="http://nltk.sourceforge.net/"><b>http://nltk.sourceforge.net/</b></a>. To get an idea of what the Wordnet version used by this browser includes choose <b>Show Database Info</b> from the <b>View</b> submenu.</p>
+<h3>Word search</h3>
+<p>The word to be searched is typed into the <b>Next Word</b> field and the search is started with Enter or by clicking the <b>Search</b> button. There is no uppercase/lowercase distinction: the search word is transformed to lowercase before the search.</p>
+<p>In addition, the word does not have to be in base form. The browser tries to find the possible base form(s) by making certain morphological substitutions. Typing <b>fLIeS</b> as an obscure example gives one <a href="MfLIeS">this</a>. Click the previous link to see what this kind of search looks like and then come back to this page by using the <b>Alt+LeftArrow</b> key combination.</p>
+<p>The result of a search is a display of one or more
+<b>synsets</b> for every part of speech in which a form of the
+search word was found to occur. A synset is a set of words
+having the same sense or meaning. Each word in a synset that is
+underlined is a hyperlink which can be clicked to trigger an
+automatic search for that word.</p>
+<p>Every synset has a hyperlink <b>S:</b> at the start of its
+display line. Clicking that symbol shows you the name of every
+<b>relation</b> that this synset is part of. Every relation name is a hyperlink that opens up a display for that relation. Clicking it another time closes the display again. Clicking another relation name on a line that has an opened relation closes the open relation and opens the clicked relation.</p>
+<p>It is also possible to give two or more words or collocations to be searched at the same time, separating them with a comma, like this: <a href="Mcheer up,clear up">cheer up,clear up</a>. Click the previous link to see what this kind of search looks like and then come back to this page by using the <b>Alt+LeftArrow</b> key combination. As you can see, the search result lists the synsets in the same order as the forms were given in the search field.</p>
+<p>
+There are also word level (lexical) relations recorded in the Wordnet database. Opening this kind of relation displays lines with a hyperlink <b>W:</b> at their beginning. Clicking this link shows more info on the word in question.</p>
+<h3>The Buttons</h3>
+<p>The <b>Search</b> and <b>Help</b> buttons need no more explanation. </p>
+<p>The <b>Show Database Info</b> button shows a collection of Wordnet database statistics.</p>
+<p>The <b>Shutdown the Server</b> button is shown only for the first client of the BrowServer program, i.e. for the client that is launched automatically when the BrowServer is started, but not for succeeding clients, in order to protect the server from accidental shutdowns.
+</p></body>
+</html>
+"""
+
+
+def get_static_welcome_message():
+    """
+    Get the static welcome page.
+    """
+    return \
+"""
+<h3>Search Help</h3>
+<ul><li>The display below the line is an example of the output the browser
+shows you when you enter a search word. The search word was <b>green</b>.</li>
+<li>The search result shows, for each part of speech, the <b>synsets</b>,
+i.e. the different meanings of the word.</li>
+<li>All underlined texts are hypertext links. There are two types of links:
+word links and others. Clicking a word link carries out a search for the word
+in the Wordnet database.</li>
+<li>Clicking a link of the other type opens a display section of data attached
+to that link. Clicking that link a second time closes the section again.</li>
+<li>Clicking <u>S:</u> opens a section showing the relations for that synset.</li>
+<li>Clicking on a relation name opens a section that displays the associated
+synsets.</li>
+<li>Type a search word in the <b>Next Word</b> field and start the search with the
+<b>Enter/Return</b> key or by clicking the <b>Search</b> button.</li>
+</ul>
+"""
+
+def get_static_index_page(with_shutdown):
+    """
+    Get the static index page.
+    """
+    template = \
+"""
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN"  "http://www.w3.org/TR/html4/frameset.dtd">
+<HTML>
+     <!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
+            Copyright (C) 2001-2017 NLTK Project
+            Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
+            URL: <http://nltk.org/>
+            For license information, see LICENSE.TXT -->
+     <HEAD>
+         <TITLE>NLTK Wordnet Browser</TITLE>
+     </HEAD>
+
+<frameset rows="7%%,93%%">
+    <frame src="%s" name="header">
+    <frame src="start_page" name="body">
+</frameset>
+</HTML>
+"""
+    if with_shutdown:
+        upper_link = "upper.html"
+    else:
+        upper_link = "upper_2.html"
+
+    return template % upper_link
+
+
+def get_static_upper_page(with_shutdown):
+    """
+    Return the upper frame page.
+
+    If with_shutdown is True then a 'shutdown' button is also provided
+    to shut down the server.
+    """
+    template = \
+"""
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+    <!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
+        Copyright (C) 2001-2017 NLTK Project
+        Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
+        URL: <http://nltk.org/>
+        For license information, see LICENSE.TXT -->
+    <head>
+                <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
+        <title>Untitled Document</title>
+    </head>
+    <body>
+    <form method="GET" action="search" target="body">
+            Current Word:&nbsp;<input type="text" id="currentWord" size="10" disabled>
+            Next Word:&nbsp;<input type="text" id="nextWord" name="nextWord" size="10">
+            <input name="searchButton" type="submit" value="Search">
+    </form>
+        <a target="body" href="web_help.html">Help</a>
+        %s
+
+</body>
+</html>
+"""
+    if with_shutdown:
+        shutdown_link = "<a href=\"SHUTDOWN THE SERVER\">Shutdown</a>"
+    else:
+        shutdown_link = ""
+
+    return template % shutdown_link
+
+
+
+def usage():
+    """
+    Display the command line help message.
+    """
+    print(__doc__)
+
+def app():
+    # Parse and interpret options.
+    (opts, _) = getopt.getopt(argv[1:], "l:p:sh",
+                              ["logfile=", "port=", "server-mode", "help"])
+    port = 8000
+    server_mode = False
+    help_mode = False
+    logfilename = None
+    for (opt, value) in opts:
+        if (opt == "-l") or (opt == "--logfile"):
+            logfilename = str(value)
+        elif (opt == "-p") or (opt == "--port"):
+            port = int(value)
+        elif (opt == "-s") or (opt == "--server-mode"):
+            server_mode = True
+        elif (opt == "-h") or (opt == "--help"):
+            help_mode = True
+
+    if help_mode:
+        usage()
+    else:
+        wnb(port, not server_mode, logfilename)
+
+if __name__ == '__main__':
+    app()
+
+__all__ = ['app']
diff --git a/nlp_resource_data/nltk/app/wordnet_app.pyc b/nlp_resource_data/nltk/app/wordnet_app.pyc
new file mode 100755 (executable)
index 0000000..c800f95
Binary files /dev/null and b/nlp_resource_data/nltk/app/wordnet_app.pyc differ
diff --git a/nlp_resource_data/nltk/book.py b/nlp_resource_data/nltk/book.py
new file mode 100755 (executable)
index 0000000..5394736
--- /dev/null
@@ -0,0 +1,91 @@
+# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function
+
+from nltk.corpus import (gutenberg, genesis, inaugural,
+                         nps_chat, webtext, treebank, wordnet)
+from nltk.text import Text
+from nltk.probability import FreqDist
+from nltk.util import bigrams
+
+print("*** Introductory Examples for the NLTK Book ***")
+print("Loading text1, ..., text9 and sent1, ..., sent9")
+print("Type the name of the text or sentence to view it.")
+print("Type: 'texts()' or 'sents()' to list the materials.")
+
+text1 = Text(gutenberg.words('melville-moby_dick.txt'))
+print("text1:", text1.name)
+
+text2 = Text(gutenberg.words('austen-sense.txt'))
+print("text2:", text2.name)
+
+text3 = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis")
+print("text3:", text3.name)
+
+text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
+print("text4:", text4.name)
+
+text5 = Text(nps_chat.words(), name="Chat Corpus")
+print("text5:", text5.name)
+
+text6 = Text(webtext.words('grail.txt'),
+             name="Monty Python and the Holy Grail")
+print("text6:", text6.name)
+
+text7 = Text(treebank.words(), name="Wall Street Journal")
+print("text7:", text7.name)
+
+text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
+print("text8:", text8.name)
+
+text9 = Text(gutenberg.words('chesterton-thursday.txt'))
+print("text9:", text9.name)
+
+
+def texts():
+    print("text1:", text1.name)
+    print("text2:", text2.name)
+    print("text3:", text3.name)
+    print("text4:", text4.name)
+    print("text5:", text5.name)
+    print("text6:", text6.name)
+    print("text7:", text7.name)
+    print("text8:", text8.name)
+    print("text9:", text9.name)
+
+sent1 = ["Call", "me", "Ishmael", "."]
+sent2 = ["The", "family", "of", "Dashwood", "had", "long",
+         "been", "settled", "in", "Sussex", "."]
+sent3 = ["In", "the", "beginning", "God", "created", "the",
+         "heaven", "and", "the", "earth", "."]
+sent4 = ["Fellow", "-", "Citizens", "of", "the", "Senate",
+         "and", "of", "the", "House", "of", "Representatives", ":"]
+sent5 = ["I", "have", "a", "problem", "with", "people",
+         "PMing", "me", "to", "lol", "JOIN"]
+sent6 = ['SCENE', '1', ':', '[', 'wind', ']', '[', 'clop', 'clop',
+         'clop', ']', 'KING', 'ARTHUR', ':', 'Whoa', 'there', '!']
+sent7 = ["Pierre", "Vinken", ",", "61", "years", "old", ",",
+         "will", "join", "the", "board", "as", "a", "nonexecutive",
+         "director", "Nov.", "29", "."]
+sent8 = ['25', 'SEXY', 'MALE', ',', 'seeks', 'attrac', 'older',
+         'single', 'lady', ',', 'for', 'discreet', 'encounters', '.']
+sent9 = ["THE", "suburb", "of", "Saffron", "Park", "lay", "on", "the",
+         "sunset", "side", "of", "London", ",", "as", "red", "and",
+         "ragged", "as", "a", "cloud", "of", "sunset", "."]
+
+
+def sents():
+    print("sent1:", " ".join(sent1))
+    print("sent2:", " ".join(sent2))
+    print("sent3:", " ".join(sent3))
+    print("sent4:", " ".join(sent4))
+    print("sent5:", " ".join(sent5))
+    print("sent6:", " ".join(sent6))
+    print("sent7:", " ".join(sent7))
+    print("sent8:", " ".join(sent8))
+    print("sent9:", " ".join(sent9))
diff --git a/nlp_resource_data/nltk/book.pyc b/nlp_resource_data/nltk/book.pyc
new file mode 100755 (executable)
index 0000000..87310ef
Binary files /dev/null and b/nlp_resource_data/nltk/book.pyc differ
diff --git a/nlp_resource_data/nltk/ccg/__init__.py b/nlp_resource_data/nltk/ccg/__init__.py
new file mode 100755 (executable)
index 0000000..630c182
--- /dev/null
@@ -0,0 +1,22 @@
+# Natural Language Toolkit: Combinatory Categorial Grammar
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Combinatory Categorial Grammar.
+
+For more information see nltk/doc/contrib/ccg/ccg.pdf
+"""
+
+from nltk.ccg.combinator import (UndirectedBinaryCombinator, DirectedBinaryCombinator,
+                                 ForwardCombinator, BackwardCombinator,
+                                 UndirectedFunctionApplication, ForwardApplication,
+                                 BackwardApplication, UndirectedComposition,
+                                 ForwardComposition, BackwardComposition,
+                                 BackwardBx, UndirectedSubstitution, ForwardSubstitution,
+                                 BackwardSx, UndirectedTypeRaise, ForwardT, BackwardT)
+from nltk.ccg.chart import CCGEdge, CCGLeafEdge, CCGChartParser, CCGChart
+from nltk.ccg.lexicon import CCGLexicon
diff --git a/nlp_resource_data/nltk/ccg/__init__.pyc b/nlp_resource_data/nltk/ccg/__init__.pyc
new file mode 100755 (executable)
index 0000000..70c36ec
Binary files /dev/null and b/nlp_resource_data/nltk/ccg/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/ccg/api.py b/nlp_resource_data/nltk/ccg/api.py
new file mode 100755 (executable)
index 0000000..79c6b77
--- /dev/null
@@ -0,0 +1,359 @@
+# Natural Language Toolkit: CCG Categories
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
+from functools import total_ordering
+
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
+from nltk.internals import raise_unorderable_types
+from nltk.compat import (python_2_unicode_compatible, unicode_repr)
+
+
+@add_metaclass(ABCMeta)
+@total_ordering
+class AbstractCCGCategory(object):
+    '''
+    Interface for categories in combinatory grammars.
+    '''
+
+    @abstractmethod
+    def is_primitive(self):
+        """
+        Returns true if the category is primitive.
+        """
+
+    @abstractmethod
+    def is_function(self):
+        """
+        Returns true if the category is a function application.
+        """
+
+    @abstractmethod
+    def is_var(self):
+        """
+        Returns true if the category is a variable.
+        """
+
+    @abstractmethod
+    def substitute(self, substitutions):
+        """
+        Takes a set of (var, category) substitutions, and replaces every
+        occurrence of the variable with the corresponding category.
+        """
+
+    @abstractmethod
+    def can_unify(self, other):
+        """
+        Determines whether two categories can be unified.
+         - Returns None if they cannot be unified
+         - Returns a list of necessary substitutions if they can.
+        """
+
+    # Utility functions: comparison, strings and hashing.
+    @abstractmethod
+    def __str__(self):
+        pass
+
+    def __eq__(self, other):
+        return (self.__class__ is other.__class__ and
+                self._comparison_key == other._comparison_key)
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        if not isinstance(other, AbstractCCGCategory):
+            raise_unorderable_types("<", self, other)
+        if self.__class__ is other.__class__:
+            return self._comparison_key < other._comparison_key
+        else:
+            return self.__class__.__name__ < other.__class__.__name__
+
+    def __hash__(self):
+        try:
+            return self._hash
+        except AttributeError:
+            self._hash = hash(self._comparison_key)
+            return self._hash
+
+
+@python_2_unicode_compatible
+class CCGVar(AbstractCCGCategory):
+    '''
+    Class representing a variable CCG category.
+    Used for conjunctions (and possibly type-raising, if implemented as a
+    unary rule).
+    '''
+    _maxID = 0
+
+    def __init__(self, prim_only=False):
+        """Initialize a variable (selects a new identifier)
+
+        :param prim_only: a boolean that determines whether the variable is
+                          restricted to primitives
+        :type prim_only: bool
+        """
+        self._id = self.new_id()
+        self._prim_only = prim_only
+        self._comparison_key = self._id
+
+    @classmethod
+    def new_id(cls):
+        """
+        A class method allowing generation of unique variable identifiers.
+        """
+        cls._maxID = cls._maxID + 1
+        return cls._maxID - 1
+
+    @classmethod
+    def reset_id(cls):
+        cls._maxID = 0
+
+    def is_primitive(self):
+        return False
+
+    def is_function(self):
+        return False
+
+    def is_var(self):
+        return True
+
+    def substitute(self, substitutions):
+        """If there is a substitution corresponding to this variable,
+        return the substituted category.
+        """
+        for (var, cat) in substitutions:
+            if var == self:
+                return cat
+        return self
+
+    def can_unify(self, other):
+        """ If the variable can be replaced with other
+        a substitution is returned.
+        """
+        if other.is_primitive() or not self._prim_only:
+            return [(self, other)]
+        return None
+
+    def id(self):
+        return self._id
+
+    def __str__(self):
+        return "_var" + str(self._id)
+
+
+@total_ordering
+@python_2_unicode_compatible
+class Direction(object):
+    '''
+    Class representing the direction of a function application.
+    Also maintains information as to which combinators
+    may be used with the category.
+    '''
+    def __init__(self, dir, restrictions):
+        self._dir = dir
+        self._restrs = restrictions
+        self._comparison_key = (dir, tuple(restrictions))
+
+    # Testing the application direction
+    def is_forward(self):
+        return self._dir == '/'
+
+    def is_backward(self):
+        return self._dir == '\\'
+
+    def dir(self):
+        return self._dir
+
+    def restrs(self):
+        """A list of restrictions on the combinators.
+        '.' denotes that permuting operations are disallowed
+        ',' denotes that function composition is disallowed
+        '_' denotes that the direction has variable restrictions.
+        (This is redundant in the current implementation of type-raising)
+        """
+        return self._restrs
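+
+    # Illustrative note (added comment, not part of the original source):
+    # a Direction such as Direction('/', ['.', ',']) is forward, disallows
+    # permutation ('.') and composition (','), so both can_compose() and
+    # can_cross() return False for it.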
+
+    def is_variable(self):
+        return self._restrs == '_'
+
+    # Unification and substitution of variable directions.
+    # Used only if type-raising is implemented as a unary rule, as it
+    # must inherit restrictions from the argument category.
+    def can_unify(self, other):
+        if other.is_variable():
+            return [('_', self.restrs())]
+        elif self.is_variable():
+            return [('_', other.restrs())]
+        else:
+            if self.restrs() == other.restrs():
+                return []
+        return None
+
+    def substitute(self, subs):
+        if not self.is_variable():
+            return self
+
+        for (var, restrs) in subs:
+            if var == '_':
+                return Direction(self._dir, restrs)
+        return self
+
+    # Testing permitted combinators
+    def can_compose(self):
+        return (',' not in self._restrs)
+
+    def can_cross(self):
+        return ('.' not in self._restrs)
+
+    def __eq__(self, other):
+        return (self.__class__ is other.__class__ and
+                self._comparison_key == other._comparison_key)
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        if not isinstance(other, Direction):
+            raise_unorderable_types("<", self, other)
+        if self.__class__ is other.__class__:
+            return self._comparison_key < other._comparison_key
+        else:
+            return self.__class__.__name__ < other.__class__.__name__
+
+    def __hash__(self):
+        try:
+            return self._hash
+        except AttributeError:
+            self._hash = hash(self._comparison_key)
+            return self._hash
+
+    def __str__(self):
+        r_str = ""
+        for r in self._restrs:
+            r_str = r_str + "%s" % r
+        return "%s%s" % (self._dir, r_str)
+
+    # The negation operator reverses the direction of the application
+    def __neg__(self):
+        if self._dir == '/':
+            return Direction('\\', self._restrs)
+        else:
+            return Direction('/', self._restrs)
+
+
+@python_2_unicode_compatible
+class PrimitiveCategory(AbstractCCGCategory):
+    '''
+    Class representing primitive categories.
+    Takes a string representation of the category, and a
+    list of strings specifying the morphological subcategories.
+    '''
+    def __init__(self, categ, restrictions=[]):
+        self._categ = categ
+        self._restrs = restrictions
+        self._comparison_key = (categ, tuple(restrictions))
+
+    def is_primitive(self):
+        return True
+
+    def is_function(self):
+        return False
+
+    def is_var(self):
+        return False
+
+    def restrs(self):
+        return self._restrs
+
+    def categ(self):
+        return self._categ
+
+    # Substitution does nothing to a primitive category
+    def substitute(self, subs):
+        return self
+
+    # A primitive can be unified with a class of the same
+    # base category, given that the other category shares all
+    # of its subclasses, or with a variable.
+    def can_unify(self, other):
+        if not other.is_primitive():
+            return None
+        if other.is_var():
+            return [(other, self)]
+        if other.categ() == self.categ():
+            for restr in self._restrs:
+                if restr not in other.restrs():
+                    return None
+            return []
+        return None
+
+    def __str__(self):
+        if self._restrs == []:
+            return "%s" % self._categ
+        restrictions = "[%s]" % ",".join(unicode_repr(r) for r in self._restrs)
+        return "%s%s" % (self._categ, restrictions)
+
+
+@python_2_unicode_compatible
+class FunctionalCategory(AbstractCCGCategory):
+    '''
+    Class that represents a function application category.
+    Consists of argument and result categories, together with
+    an application direction.
+    '''
+    def __init__(self, res, arg, dir):
+        self._res = res
+        self._arg = arg
+        self._dir = dir
+        self._comparison_key = (arg, dir, res)
+
+    def is_primitive(self):
+        return False
+
+    def is_function(self):
+        return True
+
+    def is_var(self):
+        return False
+
+    # Substitution returns the category consisting of the
+    # substitution applied to each of its constituents.
+    def substitute(self, subs):
+        sub_res = self._res.substitute(subs)
+        sub_dir = self._dir.substitute(subs)
+        sub_arg = self._arg.substitute(subs)
+        return FunctionalCategory(sub_res, sub_arg, self._dir)
+
+    # A function can unify with another function, so long as its
+    # constituents can unify, or with an unrestricted variable.
+    def can_unify(self, other):
+        if other.is_var():
+            return [(other, self)]
+        if other.is_function():
+            sa = self._res.can_unify(other.res())
+            sd = self._dir.can_unify(other.dir())
+            if sa is not None and sd is not None:
+                sb = self._arg.substitute(sa).can_unify(
+                    other.arg().substitute(sa))
+                if sb is not None:
+                    return sa + sb
+        return None
+
+    # Constituent accessors
+    def arg(self):
+        return self._arg
+
+    def res(self):
+        return self._res
+
+    def dir(self):
+        return self._dir
+
+    def __str__(self):
+        return "(%s%s%s)" % (self._res, self._dir, self._arg)
diff --git a/nlp_resource_data/nltk/ccg/api.pyc b/nlp_resource_data/nltk/ccg/api.pyc
new file mode 100755 (executable)
index 0000000..9972117
Binary files /dev/null and b/nlp_resource_data/nltk/ccg/api.pyc differ
diff --git a/nlp_resource_data/nltk/ccg/chart.py b/nlp_resource_data/nltk/ccg/chart.py
new file mode 100755 (executable)
index 0000000..e2f04b1
--- /dev/null
@@ -0,0 +1,394 @@
+# Natural Language Toolkit: Combinatory Categorial Grammar
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+The lexicon is constructed by calling
+``lexicon.fromstring(<lexicon string>)``.
+
+In order to construct a parser, you also need a rule set.
+The standard English rules are provided in chart as
+``chart.DefaultRuleSet``.
+
+The parser can then be constructed by calling, for example:
+``parser = chart.CCGChartParser(<lexicon>, <ruleset>)``
+
+Parsing is then performed by running
+``parser.parse(<sentence>.split())``.
+
+While this returns a list of trees, the default representation
+of the produced trees is not very enlightening, particularly
+given that it uses the same tree class as the CFG parsers.
+It is probably better to call:
+``chart.printCCGDerivation(<parse tree extracted from list>)``
+which should print a nice representation of the derivation.
+
+This entire process is shown far more clearly in the demonstration:
+python chart.py
+"""
+from __future__ import print_function, division, unicode_literals
+
+import itertools
+
+from six import string_types
+
+from nltk.parse import ParserI
+from nltk.parse.chart import AbstractChartRule, EdgeI, Chart
+from nltk.tree import Tree
+
+from nltk.ccg.lexicon import fromstring, Token
+from nltk.ccg.combinator import (ForwardT, BackwardT, ForwardApplication,
+                                 BackwardApplication, ForwardComposition,
+                                 BackwardComposition, ForwardSubstitution,
+                                 BackwardBx, BackwardSx)
+from nltk.compat import python_2_unicode_compatible
+from nltk.ccg.combinator import *
+from nltk.ccg.logic import *
+from nltk.sem.logic import *
+
+# Based on the EdgeI class from NLTK.
+# A number of the properties of the EdgeI interface don't
+# transfer well to CCGs, however.
+class CCGEdge(EdgeI):
+    def __init__(self, span, categ, rule):
+        self._span = span
+        self._categ = categ
+        self._rule = rule
+        self._comparison_key = (span, categ, rule)
+
+    # Accessors
+    def lhs(self): return self._categ
+    def span(self): return self._span
+    def start(self): return self._span[0]
+    def end(self): return self._span[1]
+    def length(self): return self._span[1] - self._span[0]
+    def rhs(self): return ()
+    def dot(self): return 0
+    def is_complete(self): return True
+    def is_incomplete(self): return False
+    def nextsym(self): return None
+
+    def categ(self): return self._categ
+    def rule(self): return self._rule
+
+class CCGLeafEdge(EdgeI):
+    '''
+    Class representing leaf edges in a CCG derivation.
+    '''
+    def __init__(self, pos, token, leaf):
+        self._pos = pos
+        self._token = token
+        self._leaf = leaf
+        self._comparison_key = (pos, token.categ(), leaf)
+
+    # Accessors
+    def lhs(self): return self._token.categ()
+    def span(self): return (self._pos, self._pos+1)
+    def start(self): return self._pos
+    def end(self): return self._pos + 1
+    def length(self): return 1
+    def rhs(self): return self._leaf
+    def dot(self): return 0
+    def is_complete(self): return True
+    def is_incomplete(self): return False
+    def nextsym(self): return None
+
+    def token(self): return self._token
+    def categ(self): return self._token.categ()
+    def leaf(self): return self._leaf
+
+@python_2_unicode_compatible
+class BinaryCombinatorRule(AbstractChartRule):
+    '''
+    Class implementing application of a binary combinator to a chart.
+    Takes the directed combinator to apply.
+    '''
+    NUMEDGES = 2
+    def __init__(self,combinator):
+        self._combinator = combinator
+
+    # Apply a combinator
+    def apply(self, chart, grammar, left_edge, right_edge):
+        # The left & right edges must be touching.
+        if not (left_edge.end() == right_edge.start()):
+            return
+
+        # Check if the two edges are permitted to combine.
+        # If so, generate the corresponding edge.
+        if self._combinator.can_combine(left_edge.categ(),right_edge.categ()):
+            for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
+                new_edge = CCGEdge(span=(left_edge.start(), right_edge.end()),categ=res,rule=self._combinator)
+                if chart.insert(new_edge,(left_edge,right_edge)):
+                    yield new_edge
+
+    # The representation of the combinator (for printing derivations)
+    def __str__(self):
+        return "%s" % self._combinator
+
+# Type-raising must be handled slightly differently to the other rules, as the
+# resulting rules only span a single edge, rather than both edges.
+@python_2_unicode_compatible
+class ForwardTypeRaiseRule(AbstractChartRule):
+    '''
+    Class for applying forward type raising
+    '''
+    NUMEDGES = 2
+
+    def __init__(self):
+        self._combinator = ForwardT
+    def apply(self, chart, grammar, left_edge, right_edge):
+        if not (left_edge.end() == right_edge.start()):
+            return
+
+        for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
+            new_edge = CCGEdge(span=left_edge.span(),categ=res,rule=self._combinator)
+            if chart.insert(new_edge,(left_edge,)):
+                yield new_edge
+
+    def __str__(self):
+        return "%s" % self._combinator
+
+@python_2_unicode_compatible
+class BackwardTypeRaiseRule(AbstractChartRule):
+    '''
+    Class for applying backward type raising.
+    '''
+    NUMEDGES = 2
+
+    def __init__(self):
+        self._combinator = BackwardT
+    def apply(self, chart, grammar, left_edge, right_edge):
+        if not (left_edge.end() == right_edge.start()):
+            return
+
+        for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
+            new_edge = CCGEdge(span=right_edge.span(),categ=res,rule=self._combinator)
+            if chart.insert(new_edge,(right_edge,)):
+                yield new_edge
+
+    def __str__(self):
+        return "%s" % self._combinator
+
+
+# Common sets of combinators used for English derivations.
+ApplicationRuleSet = [BinaryCombinatorRule(ForwardApplication),
+                        BinaryCombinatorRule(BackwardApplication)]
+CompositionRuleSet = [BinaryCombinatorRule(ForwardComposition),
+                        BinaryCombinatorRule(BackwardComposition),
+                        BinaryCombinatorRule(BackwardBx)]
+SubstitutionRuleSet = [BinaryCombinatorRule(ForwardSubstitution),
+                        BinaryCombinatorRule(BackwardSx)]
+TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()]
+
+# The standard English rule set.
+DefaultRuleSet = ApplicationRuleSet + CompositionRuleSet + \
+                    SubstitutionRuleSet + TypeRaiseRuleSet
+
+class CCGChartParser(ParserI):
+    '''
+    Chart parser for CCGs.
+    Based largely on the ChartParser class from NLTK.
+    '''
+    def __init__(self, lexicon, rules, trace=0):
+        self._lexicon = lexicon
+        self._rules = rules
+        self._trace = trace
+
+    def lexicon(self):
+        return self._lexicon
+
+    # Implements the CYK algorithm
+    def parse(self, tokens):
+        tokens = list(tokens)
+        chart = CCGChart(list(tokens))
+        lex = self._lexicon
+
+        # Initialize leaf edges.
+        for index in range(chart.num_leaves()):
+            for token in lex.categories(chart.leaf(index)):
+                new_edge = CCGLeafEdge(index, token, chart.leaf(index))
+                chart.insert(new_edge, ())
+
+
+        # Select a span for the new edges
+        for span in range(2,chart.num_leaves()+1):
+            for start in range(0,chart.num_leaves()-span+1):
+                # Try all possible pairs of edges that could generate
+                # an edge for that span
+                for part in range(1,span):
+                    lstart = start
+                    mid = start + part
+                    rend = start + span
+
+                    for left in chart.select(span=(lstart,mid)):
+                        for right in chart.select(span=(mid,rend)):
+                            # Generate all possible combinations of the two edges
+                            for rule in self._rules:
+                                edges_added_by_rule = 0
+                                for newedge in rule.apply(chart,lex,left,right):
+                                    edges_added_by_rule += 1
+
+        # Output the resulting parses
+        return chart.parses(lex.start())
+
+class CCGChart(Chart):
+    def __init__(self, tokens):
+        Chart.__init__(self, tokens)
+
+    # Constructs the trees for a given parse. Unfortunately, the parse trees need to be
+    # constructed slightly differently to those in the default Chart class, so it has to
+    # be reimplemented
+    def _trees(self, edge, complete, memo, tree_class):
+        assert complete, "CCGChart cannot build incomplete trees"
+
+        if edge in memo:
+            return memo[edge]
+
+        if isinstance(edge,CCGLeafEdge):
+            word = tree_class(edge.token(), [self._tokens[edge.start()]])
+            leaf = tree_class((edge.token(), "Leaf"), [word])
+            memo[edge] = [leaf]
+            return [leaf]
+
+        memo[edge] = []
+        trees = []
+
+        for cpl in self.child_pointer_lists(edge):
+            child_choices = [self._trees(cp, complete, memo, tree_class)
+                             for cp in cpl]
+            for children in itertools.product(*child_choices):
+                lhs = (Token(self._tokens[edge.start():edge.end()], edge.lhs(), compute_semantics(children, edge)), str(edge.rule()))
+                trees.append(tree_class(lhs, children))
+
+        memo[edge] = trees
+        return trees
+
+
+def compute_semantics(children, edge):
+    if children[0].label()[0].semantics() is None:
+        return None
+
+    if len(children) == 2:
+        if isinstance(edge.rule(), BackwardCombinator):
+            children = [children[1],children[0]]
+
+        combinator = edge.rule()._combinator
+        function = children[0].label()[0].semantics()
+        argument = children[1].label()[0].semantics()
+
+        if isinstance(combinator, UndirectedFunctionApplication):
+            return compute_function_semantics(function, argument)
+        elif isinstance(combinator, UndirectedComposition):
+            return compute_composition_semantics(function, argument)
+        elif isinstance(combinator, UndirectedSubstitution):
+            return compute_substitution_semantics(function, argument)
+        else:
+            raise AssertionError('Unsupported combinator \'%s\'' % combinator)
+    else:
+        return compute_type_raised_semantics(children[0].label()[0].semantics())
+
+#--------
+# Displaying derivations
+#--------
+def printCCGDerivation(tree):
+    # Get the leaves and initial categories
+    leafcats = tree.pos()
+    leafstr = ''
+    catstr = ''
+
+    # Construct a string with both the leaf word and corresponding
+    # category aligned.
+    for (leaf, cat) in leafcats:
+        str_cat = "%s" % cat
+        nextlen = 2 + max(len(leaf), len(str_cat))
+        lcatlen = (nextlen - len(str_cat)) // 2
+        rcatlen = lcatlen + (nextlen - len(str_cat)) % 2
+        catstr += ' '*lcatlen + str_cat + ' '*rcatlen
+        lleaflen = (nextlen - len(leaf)) // 2
+        rleaflen = lleaflen + (nextlen - len(leaf)) % 2
+        leafstr += ' '*lleaflen + leaf + ' '*rleaflen
+    print(leafstr.rstrip())
+    print(catstr.rstrip())
+
+    # Display the derivation steps
+    printCCGTree(0,tree)
+
+# Prints the sequence of derivation steps.
+def printCCGTree(lwidth,tree):
+    rwidth = lwidth
+
+    # Is a leaf (word).
+    # Increment the span by the space occupied by the leaf.
+    if not isinstance(tree, Tree):
+        return 2 + lwidth + len(tree)
+
+    # Find the width of the current derivation step
+    for child in tree:
+        rwidth = max(rwidth, printCCGTree(rwidth,child))
+
+    # Is a leaf node.
+    # Don't print anything, but account for the space occupied.
+    if not isinstance(tree.label(), tuple):
+        return max(rwidth,2 + lwidth + len("%s" % tree.label()),
+                  2 + lwidth + len(tree[0]))
+
+    (token, op) = tree.label()
+
+    if op == 'Leaf':
+        return rwidth
+
+    # Pad to the left with spaces, followed by a sequence of '-'
+    # and the derivation rule.
+    print(lwidth*' ' + (rwidth-lwidth)*'-' + "%s" % op)
+    # Print the resulting category on a new line.
+    str_res = "%s" % (token.categ())
+    if token.semantics() is not None:
+        str_res += " {" + str(token.semantics()) + "}"
+    respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth
+    print(respadlen*' ' + str_res)
+    return rwidth
+
+### Demonstration code
+
+# Construct the lexicon
+lex = fromstring('''
+    :- S, NP, N, VP    # Primitive categories, S is the target primitive
+
+    Det :: NP/N         # Family of words
+    Pro :: NP
+    TV :: VP/NP
+    Modal :: (S\\NP)/VP # Backslashes need to be escaped
+
+    I => Pro             # Word -> Category mapping
+    you => Pro
+
+    the => Det
+
+    # Variables have the special keyword 'var'
+    # '.' prevents permutation
+    # ',' prevents composition
+    and => var\\.,var/.,var
+
+    which => (N\\N)/(S/NP)
+
+    will => Modal # Categories can be either explicit, or families.
+    might => Modal
+
+    cook => TV
+    eat => TV
+
+    mushrooms => N
+    parsnips => N
+    bacon => N
+    ''')
+
+def demo():
+    parser = CCGChartParser(lex, DefaultRuleSet)
+    for parse in parser.parse("I might cook and eat the bacon".split()):
+        printCCGDerivation(parse)
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/ccg/chart.pyc b/nlp_resource_data/nltk/ccg/chart.pyc
new file mode 100755 (executable)
index 0000000..e1df2b6
Binary files /dev/null and b/nlp_resource_data/nltk/ccg/chart.pyc differ
diff --git a/nlp_resource_data/nltk/ccg/combinator.py b/nlp_resource_data/nltk/ccg/combinator.py
new file mode 100755 (executable)
index 0000000..1fecd5c
--- /dev/null
@@ -0,0 +1,343 @@
+# Natural Language Toolkit: Combinatory Categorial Grammar
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+"""
+CCG Combinators
+"""
+
+from __future__ import unicode_literals
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
+from nltk.compat import python_2_unicode_compatible
+from nltk.ccg.api import FunctionalCategory
+
+
+@add_metaclass(ABCMeta)
+class UndirectedBinaryCombinator(object):
+    """
+    Abstract class for representing a binary combinator.
+    Merely defines functions for checking if the function and argument
+    are able to be combined, and what the resulting category is.
+
+    Note that as no assumptions are made as to direction, the unrestricted
+    combinators can perform all backward, forward and crossed variations
+    of the combinators; these restrictions must be added in the rule
+    class.
+    """
+    @abstractmethod
+    def can_combine(self, function, argument):
+        pass
+
+    @abstractmethod
+    def combine(self, function, argument):
+        pass
+
+
+@add_metaclass(ABCMeta)
+class DirectedBinaryCombinator(object):
+    """
+    Wrapper for the undirected binary combinator.
+    It takes left and right categories, and decides which is to be
+    the function, and which the argument.
+    It then decides whether or not they can be combined.
+    """
+    @abstractmethod
+    def can_combine(self, left, right):
+        pass
+
+    @abstractmethod
+    def combine(self, left, right):
+        pass
+
+
+@python_2_unicode_compatible
+class ForwardCombinator(DirectedBinaryCombinator):
+    """
+    Class representing combinators where the primary functor is on the left.
+
+    Takes an undirected combinator, and a predicate which adds constraints
+    restricting the cases in which it may apply.
+    """
+    def __init__(self, combinator, predicate, suffix=''):
+        self._combinator = combinator
+        self._predicate = predicate
+        self._suffix = suffix
+
+    def can_combine(self, left, right):
+        return (self._combinator.can_combine(left, right) and
+                self._predicate(left, right))
+
+    def combine(self, left, right):
+        for cat in self._combinator.combine(left, right):
+            yield cat
+
+    def __str__(self):
+        return ">%s%s" % (self._combinator, self._suffix)
+
+
+@python_2_unicode_compatible
+class BackwardCombinator(DirectedBinaryCombinator):
+    """
+    The backward equivalent of the ForwardCombinator class.
+    """
+    def __init__(self, combinator, predicate, suffix=''):
+        self._combinator = combinator
+        self._predicate = predicate
+        self._suffix = suffix
+
+    def can_combine(self, left, right):
+        return (self._combinator.can_combine(right, left) and
+                self._predicate(left, right))
+
+    def combine(self, left, right):
+        for cat in self._combinator.combine(right, left):
+            yield cat
+
+    def __str__(self):
+        return "<%s%s" % (self._combinator, self._suffix)
+
+
+@python_2_unicode_compatible
+class UndirectedFunctionApplication(UndirectedBinaryCombinator):
+    """
+    Class representing function application.
+    Implements rules of the form:
+    X/Y Y -> X (>)
+    And the corresponding backwards application rule
+    """
+
+    def can_combine(self, function, argument):
+        if not function.is_function():
+            return False
+
+        return function.arg().can_unify(argument) is not None
+
+    def combine(self, function, argument):
+        if not function.is_function():
+            return
+
+        subs = function.arg().can_unify(argument)
+        if subs is None:
+            return
+
+        yield function.res().substitute(subs)
+
+    def __str__(self):
+        return ''
+
+
+# Predicates for function application.
+
+# Ensures the left functor takes an argument on the right
+def forwardOnly(left, right):
+    return left.dir().is_forward()
+
+
+# Ensures the right functor takes an argument on the left
+def backwardOnly(left, right):
+    return right.dir().is_backward()
+
+
+# Application combinator instances
+ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(),
+                                       forwardOnly)
+BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(),
+                                         backwardOnly)
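+
+# Example sketch (illustrative comment, not part of the original source):
+# with N = PrimitiveCategory('N'), NP = PrimitiveCategory('NP') and
+# F = FunctionalCategory(NP, N, Direction('/', [])) (all from nltk.ccg.api),
+# ForwardApplication.can_combine(F, N) is True and
+# list(ForwardApplication.combine(F, N)) yields [NP], i.e. NP/N N -> NP.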
+
+
+@python_2_unicode_compatible
+class UndirectedComposition(UndirectedBinaryCombinator):
+    """
+    Functional composition (harmonic) combinator.
+    Implements rules of the form
+    X/Y Y/Z -> X/Z (B>)
+    And the corresponding backwards and crossed variations.
+    """
+    def can_combine(self, function, argument):
+        # Can only combine two functions, and both functions must
+        # allow composition.
+        if not (function.is_function() and argument.is_function()):
+            return False
+        if function.dir().can_compose() and argument.dir().can_compose():
+            return function.arg().can_unify(argument.res()) is not None
+        return False
+
+    def combine(self, function, argument):
+        if not (function.is_function() and argument.is_function()):
+            return
+        if function.dir().can_compose() and argument.dir().can_compose():
+            subs = function.arg().can_unify(argument.res())
+            if subs is not None:
+                yield FunctionalCategory(function.res().substitute(subs),
+                                         argument.arg().substitute(subs),
+                                         argument.dir())
+
+    def __str__(self):
+        return 'B'
+
+
+# Predicates for restricting application of straight composition.
+def bothForward(left, right):
+    return left.dir().is_forward() and right.dir().is_forward()
+
+
+def bothBackward(left, right):
+    return left.dir().is_backward() and right.dir().is_backward()
+
+
+# Predicates for crossed composition
+def crossedDirs(left, right):
+    return left.dir().is_forward() and right.dir().is_backward()
+
+
+def backwardBxConstraint(left, right):
+    # The functors must be crossed inwards
+    if not crossedDirs(left, right):
+        return False
+    # Permuting combinators must be allowed
+    if not (left.dir().can_cross() and right.dir().can_cross()):
+        return False
+    # The resulting argument category is restricted to be primitive
+    return left.arg().is_primitive()
+
+
+# Straight composition combinators
+ForwardComposition = ForwardCombinator(UndirectedComposition(),
+                                       forwardOnly)
+BackwardComposition = BackwardCombinator(UndirectedComposition(),
+                                         backwardOnly)
+
+# Backward crossed composition
+BackwardBx = BackwardCombinator(UndirectedComposition(), backwardBxConstraint,
+                                suffix='x')
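+
+# Example sketch (illustrative comment, not part of the original source):
+# forward-composing S/NP with NP/N yields S/N, i.e. with S, NP, N as
+# PrimitiveCategory instances and slash = Direction('/', []) (nltk.ccg.api),
+# list(ForwardComposition.combine(FunctionalCategory(S, NP, slash),
+#                                 FunctionalCategory(NP, N, slash)))
+# contains a single category equal to FunctionalCategory(S, N, slash).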
+
+
+@python_2_unicode_compatible
+class UndirectedSubstitution(UndirectedBinaryCombinator):
+    """
+    Substitution (permutation) combinator.
+    Implements rules of the form
+    Y/Z (X\Y)/Z -> X/Z (<Sx)
+    And other variations.
+    """
+    def can_combine(self, function, argument):
+        if function.is_primitive() or argument.is_primitive():
+            return False
+
+        # These could potentially be moved to the predicates, as the
+        # constraints may not be general to all languages.
+        if function.res().is_primitive():
+            return False
+        if not function.arg().is_primitive():
+            return False
+
+        if not (function.dir().can_compose() and argument.dir().can_compose()):
+            return False
+        return (function.res().arg() == argument.res()) and (function.arg() == argument.arg())
+
+    def combine(self, function, argument):
+        if self.can_combine(function, argument):
+            yield FunctionalCategory(function.res().res(), argument.arg(),
+                                     argument.dir())
+
+    def __str__(self):
+        return 'S'
+
+
+# Predicate for forward substitution
+def forwardSConstraint(left, right):
+    if not bothForward(left, right):
+        return False
+    return left.res().dir().is_forward() and left.arg().is_primitive()
+
+
+# Predicate for backward crossed substitution
+def backwardSxConstraint(left, right):
+    if not (left.dir().can_cross() and right.dir().can_cross()):
+        return False
+    if not bothForward(left, right):
+        return False
+    return right.res().dir().is_backward() and right.arg().is_primitive()
+
+
+# Instances of substitution combinators
+ForwardSubstitution = ForwardCombinator(UndirectedSubstitution(),
+                                        forwardSConstraint)
+BackwardSx = BackwardCombinator(UndirectedSubstitution(),
+                                backwardSxConstraint, 'x')
+
+
+# Retrieves the left-most functional category.
+# ie, (N\N)/(S/NP) => N\N
+def innermostFunction(categ):
+    while categ.res().is_function():
+        categ = categ.res()
+    return categ
+
+
+@python_2_unicode_compatible
+class UndirectedTypeRaise(UndirectedBinaryCombinator):
+    """
+    Undirected combinator for type raising.
+    """
+    def can_combine(self, function, arg):
+        # The argument must be a function.
+        # The restriction that arg.res() must be a function
+        # merely reduces redundant type-raising; if arg.res() is
+        # primitive, we have:
+        # X Y\X =>(<T) Y/(Y\X) Y\X =>(>) Y
+        # which is equivalent to
+        # X Y\X =>(<) Y
+        if not (arg.is_function() and arg.res().is_function()):
+            return False
+
+        arg = innermostFunction(arg)
+
+        # Check that the category being raised can unify with the argument
+        # of the innermost application (mirrors combine() below).
+        subs = function.can_unify(arg.arg())
+        if subs is not None:
+            return True
+        return False
+
+    def combine(self, function, arg):
+        if not (function.is_primitive() and
+                arg.is_function() and arg.res().is_function()):
+            return
+
+        # Type-raising matches only the innermost application.
+        arg = innermostFunction(arg)
+
+        subs = function.can_unify(arg.arg())
+        if subs is not None:
+            xcat = arg.res().substitute(subs)
+            yield FunctionalCategory(xcat,
+                                     FunctionalCategory(xcat, function,
+                                                        arg.dir()),
+                                     -(arg.dir()))
+
+    def __str__(self):
+        return 'T'
+
+
+# Predicates for type-raising
+# The direction of the innermost category must be towards
+# the primary functor.
+# The restriction that the variable must be primitive is not
+# common to all versions of CCGs; some authors have other restrictions.
+def forwardTConstraint(left, right):
+    arg = innermostFunction(right)
+    return arg.dir().is_backward() and arg.res().is_primitive()
+
+
+def backwardTConstraint(left, right):
+    arg = innermostFunction(left)
+    return arg.dir().is_forward() and arg.res().is_primitive()
+
+
+# Instances of type-raising combinators
+ForwardT = ForwardCombinator(UndirectedTypeRaise(), forwardTConstraint)
+BackwardT = BackwardCombinator(UndirectedTypeRaise(), backwardTConstraint)
diff --git a/nlp_resource_data/nltk/ccg/combinator.pyc b/nlp_resource_data/nltk/ccg/combinator.pyc
new file mode 100755 (executable)
index 0000000..0f5e6d5
Binary files /dev/null and b/nlp_resource_data/nltk/ccg/combinator.pyc differ
diff --git a/nlp_resource_data/nltk/ccg/lexicon.py b/nlp_resource_data/nltk/ccg/lexicon.py
new file mode 100755 (executable)
index 0000000..699dd87
--- /dev/null
@@ -0,0 +1,328 @@
+# Natural Language Toolkit: Combinatory Categorial Grammar
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+"""
+CCG Lexicons
+"""
+
+from __future__ import unicode_literals
+
+import re
+from collections import defaultdict
+
+from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory
+from nltk.compat import python_2_unicode_compatible
+from nltk.internals import deprecated
+
+from nltk.sem.logic import *
+
+#------------
+# Regular expressions used for parsing components of the lexicon
+#------------
+
+# Parses a primitive category and subscripts
+PRIM_RE = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''')
+
+# Separates the next primitive category from the remainder of the
+# string
+NEXTPRIM_RE = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''')
+
+# Separates the next application operator from the remainder
+APP_RE = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''')
+
+# Parses the definition of the right-hand side (rhs) of either a word or a family
+LEX_RE = re.compile(r'''([\S_]+)\s*(::|[-=]+>)\s*(.+)''', re.UNICODE)
+
+# Parses the right hand side that contains category and maybe semantic predicate
+RHS_RE = re.compile(r'''([^{}]*[^ {}])\s*(\{[^}]+\})?''', re.UNICODE)
+
+# Parses the semantic predicate
+SEMANTICS_RE = re.compile(r'''\{([^}]+)\}''', re.UNICODE)
+
+# Strips comments from a line
+COMMENTS_RE = re.compile('''([^#]*)(?:#.*)?''')
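+
+# Illustrative note (added comment, not part of the original source): for the
+# entry "the => Det", LEX_RE.match(...).groups() is ('the', '=>', 'Det'); for
+# the family definition "Det :: NP/N" it is ('Det', '::', 'NP/N').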
+
+class Token(object):
+    """
+    Class representing a token.
+
+    token => category {semantics}
+    e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)}
+
+    * `token` (string)
+    * `categ` (string)
+    * `semantics` (Expression)
+    """
+    def __init__(self, token, categ, semantics=None):
+        self._token = token
+        self._categ = categ
+        self._semantics = semantics
+        
+    def categ(self):
+        return self._categ
+    
+    def semantics(self):
+        return self._semantics
+        
+    def __str__(self):
+        semantics_str = ""
+        if self._semantics is not None:
+            semantics_str = " {" + str(self._semantics) + "}"
+        return "" + str(self._categ) + semantics_str
+    
+    def __cmp__(self, other):
+        if not isinstance(other, Token):
+            return -1
+        return cmp((self._categ, self._semantics),
+                   (other.categ(), other.semantics()))
+
+@python_2_unicode_compatible
+class CCGLexicon(object):
+    """
+    Class representing a lexicon for CCG grammars.
+
+    * `primitives`: The list of primitive categories for the lexicon
+    * `families`: Families of categories
+    * `entries`: A mapping of words to possible categories
+    """
+    def __init__(self, start, primitives, families, entries):
+        self._start = PrimitiveCategory(start)
+        self._primitives = primitives
+        self._families = families
+        self._entries = entries
+
+
+    def categories(self, word):
+        """
+        Returns all the possible categories for a word
+        """
+        return self._entries[word]
+
+
+    def start(self):
+        """
+        Return the target category for the parser
+        """
+        return self._start
+
+    def __str__(self):
+        """
+        String representation of the lexicon. Used for debugging.
+        """
+        string = ""
+        first = True
+        for ident in sorted(self._entries):
+            if not first:
+                string = string + "\n"
+            string = string + ident + " => "
+
+            first = True
+            for cat in self._entries[ident]:
+                if not first:
+                    string = string + " | "
+                else:
+                    first = False
+                string = string + "%s" % cat
+        return string
+
+
+#-----------
+# Parsing lexicons
+#-----------
+
+
+def matchBrackets(string):
+    """
+    Separate the contents matching the first set of brackets from the rest of
+    the input.
+    """
+    rest = string[1:]
+    inside = "("
+
+    while rest != "" and not rest.startswith(')'):
+        if rest.startswith('('):
+            (part, rest) = matchBrackets(rest)
+            inside = inside + part
+        else:
+            inside = inside + rest[0]
+            rest = rest[1:]
+    if rest.startswith(')'):
+        return (inside + ')', rest[1:])
+    raise AssertionError('Unmatched bracket in string \'' + string + '\'')
+
+
+def nextCategory(string):
+    """
+    Separate the string for the next portion of the category from the rest
+    of the string
+    """
+    if string.startswith('('):
+        return matchBrackets(string)
+    return NEXTPRIM_RE.match(string).groups()
+
+def parseApplication(app):
+    """
+    Parse an application operator
+    """
+    return Direction(app[0], app[1:])
+
+
+def parseSubscripts(subscr):
+    """
+    Parse the subscripts for a primitive category
+    """
+    if subscr:
+        return subscr[1:-1].split(',')
+    return []
+
+
+def parsePrimitiveCategory(chunks, primitives, families, var):
+    """
+    Parse a primitive category
+
+    If the primitive is the special category 'var', replace it with the
+    correct `CCGVar`.
+    """
+    if chunks[0] == "var":
+        if chunks[1] is None:
+            if var is None:
+                var = CCGVar()
+            return (var, var)
+
+    catstr = chunks[0]
+    if catstr in families:
+        (cat, cvar) = families[catstr]
+        if var is None:
+            var = cvar
+        else:
+            cat = cat.substitute([(cvar, var)])
+        return (cat, var)
+
+    if catstr in primitives:
+        subscrs = parseSubscripts(chunks[1])
+        return (PrimitiveCategory(catstr, subscrs), var)
+    raise AssertionError('String \'' + catstr + '\' is neither a family nor primitive category.')
+
+
+def augParseCategory(line, primitives, families, var=None):
+    """
+    Parse a string representing a category, and return a tuple containing
+    the parsed category and (possibly) the CCG variable for the category.
+    """
+    (cat_string, rest) = nextCategory(line)
+
+    if cat_string.startswith('('):
+        (res, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
+
+    else:
+#        print rePrim.match(str).groups()
+        (res, var) =\
+            parsePrimitiveCategory(PRIM_RE.match(cat_string).groups(), primitives,
+                                   families, var)
+
+    while rest != "":
+        app = APP_RE.match(rest).groups()
+        direction = parseApplication(app[0:3])
+        rest = app[3]
+
+        (cat_string, rest) = nextCategory(rest)
+        if cat_string.startswith('('):
+            (arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
+        else:
+            (arg, var) =\
+                parsePrimitiveCategory(PRIM_RE.match(cat_string).groups(),
+                                       primitives, families, var)
+        res = FunctionalCategory(res, arg, direction)
+
+    return (res, var)
+
+def fromstring(lex_str, include_semantics=False):
+    """
+    Convert string representation into a lexicon for CCGs.
+    """
+    CCGVar.reset_id()
+    primitives = []
+    families = {}
+    entries = defaultdict(list)
+    for line in lex_str.splitlines():
+        # Strip comments and leading/trailing whitespace.
+        line = COMMENTS_RE.match(line).groups()[0].strip()
+        if line == "":
+            continue
+
+        if line.startswith(':-'):
+            # A line of primitive categories.
+            # The first one is the target category
+            # ie, :- S, N, NP, VP
+            primitives = primitives + [prim.strip() for prim in line[2:].strip().split(',')]
+        else:
+            # Either a family definition, or a word definition
+            (ident, sep, rhs) = LEX_RE.match(line).groups()
+            (catstr, semantics_str) = RHS_RE.match(rhs).groups()
+            (cat, var) = augParseCategory(catstr, primitives, families)
+
+            if sep == '::':
+                # Family definition
+                # ie, Det :: NP/N
+                families[ident] = (cat, var)
+            else:
+                semantics = None
+                if include_semantics is True:
+                    if semantics_str is None:
+                        raise AssertionError(line + " must contain semantics because include_semantics is set to True")
+                    else:
+                        semantics = Expression.fromstring(SEMANTICS_RE.match(semantics_str).groups()[0])
+                # Word definition
+                # ie, which => (N\N)/(S/NP)
+                entries[ident].append(Token(ident, cat, semantics))
+    return CCGLexicon(primitives[0], primitives, families, entries)
+
+
+@deprecated('Use fromstring() instead.')
+def parseLexicon(lex_str):
+    return fromstring(lex_str)
+
+openccg_tinytiny = fromstring("""
+    # Rather minimal lexicon based on the openccg `tinytiny' grammar.
+    # Only incorporates a subset of the morphological subcategories, however.
+    :- S,NP,N                    # Primitive categories
+    Det :: NP/N                  # Determiners
+    Pro :: NP
+    IntransVsg :: S\\NP[sg]    # Tensed intransitive verbs (singular)
+    IntransVpl :: S\\NP[pl]    # Plural
+    TransVsg :: S\\NP[sg]/NP   # Tensed transitive verbs (singular)
+    TransVpl :: S\\NP[pl]/NP   # Plural
+
+    the => NP[sg]/N[sg]
+    the => NP[pl]/N[pl]
+
+    I => Pro
+    me => Pro
+    we => Pro
+    us => Pro
+
+    book => N[sg]
+    books => N[pl]
+
+    peach => N[sg]
+    peaches => N[pl]
+
+    policeman => N[sg]
+    policemen => N[pl]
+
+    boy => N[sg]
+    boys => N[pl]
+
+    sleep => IntransVsg
+    sleep => IntransVpl
+
+    eat => IntransVpl
+    eat => TransVpl
+    eats => IntransVsg
+    eats => TransVsg
+
+    see => TransVpl
+    sees => TransVsg
+    """)
diff --git a/nlp_resource_data/nltk/ccg/lexicon.pyc b/nlp_resource_data/nltk/ccg/lexicon.pyc
new file mode 100755 (executable)
index 0000000..fd53711
Binary files /dev/null and b/nlp_resource_data/nltk/ccg/lexicon.pyc differ
diff --git a/nlp_resource_data/nltk/ccg/logic.py b/nlp_resource_data/nltk/ccg/logic.py
new file mode 100755 (executable)
index 0000000..39d2ba2
--- /dev/null
@@ -0,0 +1,46 @@
+# Natural Language Toolkit: Combinatory Categorial Grammar
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Tanin Na Nakorn (@tanin)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+"""
+Helper functions for CCG semantics computation
+"""
+
+from nltk.sem.logic import *
+
+def compute_type_raised_semantics(semantics):
+    core = semantics
+    parent = None
+    while isinstance(core, LambdaExpression):
+        parent = core
+        core = core.term
+        
+    var = Variable("F")
+    while var in core.free():
+        var = unique_variable(pattern=var)
+    core = ApplicationExpression(FunctionVariableExpression(var), core)
+    
+    if parent is not None:
+        parent.term = core
+    else:
+        semantics = core
+    
+    return LambdaExpression(var, semantics)
+
+def compute_function_semantics(function, argument):
+    return ApplicationExpression(function, argument).simplify()
+
+def compute_composition_semantics(function, argument):
+    assert isinstance(argument, LambdaExpression), "`" + str(argument) + "` must be a lambda expression"
+    return LambdaExpression(argument.variable, ApplicationExpression(function, argument.term).simplify())
+
+def compute_substitution_semantics(function, argument):
+    assert isinstance(function, LambdaExpression) and isinstance(function.term, LambdaExpression), "`" + str(function) + "` must be a lambda expression with 2 arguments"
+    assert isinstance(argument, LambdaExpression), "`" + str(argument) + "` must be a lambda expression"
+
+    new_argument = ApplicationExpression(argument, VariableExpression(function.variable)).simplify()
+    new_term = ApplicationExpression(function.term, new_argument).simplify() 
+
+    return LambdaExpression(function.variable, new_term)
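
A short sketch of how the application and composition helpers above combine
`nltk.sem.logic` expressions (the lambda terms here are made-up examples):

    from nltk.sem.logic import Expression
    from nltk.ccg.logic import (compute_function_semantics,
                                compute_composition_semantics)

    f = Expression.fromstring(r'\x.run(x)')
    a = Expression.fromstring(r'john')
    print(compute_function_semantics(f, a))      # run(john)

    g = Expression.fromstring(r'\y.see(y)')
    h = Expression.fromstring(r'\x.man(x)')
    print(compute_composition_semantics(g, h))   # \x.see(man(x))
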
diff --git a/nlp_resource_data/nltk/ccg/logic.pyc b/nlp_resource_data/nltk/ccg/logic.pyc
new file mode 100755 (executable)
index 0000000..ad8e9d7
Binary files /dev/null and b/nlp_resource_data/nltk/ccg/logic.pyc differ
diff --git a/nlp_resource_data/nltk/chat/__init__.py b/nlp_resource_data/nltk/chat/__init__.py
new file mode 100755 (executable)
index 0000000..574d770
--- /dev/null
@@ -0,0 +1,49 @@
+# Natural Language Toolkit: Chatbots
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
+# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
+
+"""
+A class for simple chatbots.  These perform simple pattern matching on sentences
+typed by users, and respond with automatically generated sentences.
+
+These chatbots may not work using the Windows command line or the
+Windows IDLE GUI.
+"""
+from __future__ import print_function
+
+from nltk.chat.util import Chat
+from nltk.chat.eliza import eliza_chat
+from nltk.chat.iesha import iesha_chat
+from nltk.chat.rude import rude_chat
+from nltk.chat.suntsu import suntsu_chat
+from nltk.chat.zen import zen_chat
+
+bots = [
+    (eliza_chat,  'Eliza (psycho-babble)'),
+    (iesha_chat,  'Iesha (teen anime junky)'),
+    (rude_chat,   'Rude (abusive bot)'),
+    (suntsu_chat, 'Suntsu (Chinese sayings)'),
+    (zen_chat,    'Zen (gems of wisdom)')]
+
+def chatbots():
+    import sys
+    print('Which chatbot would you like to talk to?')
+    botcount = len(bots)
+    for i in range(botcount):
+        print('  %d: %s' % (i+1, bots[i][1]))
+    while True:
+        print('\nEnter a number in the range 1-%d: ' % botcount, end=' ')
+        choice = sys.stdin.readline().strip()
+        if choice.isdigit() and (int(choice) - 1) in range(botcount):
+            break
+        else:
+            print('   Error: bad chatbot number')
+
+    chatbot = bots[int(choice)-1][0]
+    chatbot()
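
The module-level `bots` table above can also be inspected non-interactively,
for example to list the available chatbots without prompting on stdin:

    from nltk.chat import bots

    for bot_callable, description in bots:
        print(description)
    # Eliza (psycho-babble)
    # Iesha (teen anime junky)
    # ...
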
diff --git a/nlp_resource_data/nltk/chat/__init__.pyc b/nlp_resource_data/nltk/chat/__init__.pyc
new file mode 100755 (executable)
index 0000000..bb344a0
Binary files /dev/null and b/nlp_resource_data/nltk/chat/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/chat/eliza.py b/nlp_resource_data/nltk/chat/eliza.py
new file mode 100755 (executable)
index 0000000..c550306
--- /dev/null
@@ -0,0 +1,244 @@
+# Natural Language Toolkit: Eliza
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors: Steven Bird <stevenbird1@gmail.com>
+#          Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
+# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <mailto:jez@jezuk.co.uk>.
+
+# a translation table used to convert things you say into things the
+# computer says back, e.g. "I am" --> "you are"
+
+from __future__ import print_function
+from nltk.chat.util import Chat, reflections
+
+# a table of response pairs, where each pair consists of a
+# regular expression, and a list of possible responses,
+# with group-macros labelled as %1, %2.
+
+pairs = (
+  (r'I need (.*)',
+  ( "Why do you need %1?",
+    "Would it really help you to get %1?",
+    "Are you sure you need %1?")),
+
+  (r'Why don\'t you (.*)',
+  ( "Do you really think I don't %1?",
+    "Perhaps eventually I will %1.",
+    "Do you really want me to %1?")),
+
+  (r'Why can\'t I (.*)',
+  ( "Do you think you should be able to %1?",
+    "If you could %1, what would you do?",
+    "I don't know -- why can't you %1?",
+    "Have you really tried?")),
+
+  (r'I can\'t (.*)',
+  ( "How do you know you can't %1?",
+    "Perhaps you could %1 if you tried.",
+    "What would it take for you to %1?")),
+
+  (r'I am (.*)',
+  ( "Did you come to me because you are %1?",
+    "How long have you been %1?",
+    "How do you feel about being %1?")),
+
+  (r'I\'m (.*)',
+  ( "How does being %1 make you feel?",
+    "Do you enjoy being %1?",
+    "Why do you tell me you're %1?",
+    "Why do you think you're %1?")),
+
+  (r'Are you (.*)',
+  ( "Why does it matter whether I am %1?",
+    "Would you prefer it if I were not %1?",
+    "Perhaps you believe I am %1.",
+    "I may be %1 -- what do you think?")),
+
+  (r'What (.*)',
+  ( "Why do you ask?",
+    "How would an answer to that help you?",
+    "What do you think?")),
+
+  (r'How (.*)',
+  ( "How do you suppose?",
+    "Perhaps you can answer your own question.",
+    "What is it you're really asking?")),
+
+  (r'Because (.*)',
+  ( "Is that the real reason?",
+    "What other reasons come to mind?",
+    "Does that reason apply to anything else?",
+    "If %1, what else must be true?")),
+
+  (r'(.*) sorry (.*)',
+  ( "There are many times when no apology is needed.",
+    "What feelings do you have when you apologize?")),
+
+  (r'Hello(.*)',
+  ( "Hello... I'm glad you could drop by today.",
+    "Hi there... how are you today?",
+    "Hello, how are you feeling today?")),
+
+  (r'I think (.*)',
+  ( "Do you doubt %1?",
+    "Do you really think so?",
+    "But you're not sure %1?")),
+
+  (r'(.*) friend (.*)',
+  ( "Tell me more about your friends.",
+    "When you think of a friend, what comes to mind?",
+    "Why don't you tell me about a childhood friend?")),
+
+  (r'Yes',
+  ( "You seem quite sure.",
+    "OK, but can you elaborate a bit?")),
+
+  (r'(.*) computer(.*)',
+  ( "Are you really talking about me?",
+    "Does it seem strange to talk to a computer?",
+    "How do computers make you feel?",
+    "Do you feel threatened by computers?")),
+
+  (r'Is it (.*)',
+  ( "Do you think it is %1?",
+    "Perhaps it's %1 -- what do you think?",
+    "If it were %1, what would you do?",
+    "It could well be that %1.")),
+
+  (r'It is (.*)',
+  ( "You seem very certain.",
+    "If I told you that it probably isn't %1, what would you feel?")),
+
+  (r'Can you (.*)',
+  ( "What makes you think I can't %1?",
+    "If I could %1, then what?",
+    "Why do you ask if I can %1?")),
+
+  (r'Can I (.*)',
+  ( "Perhaps you don't want to %1.",
+    "Do you want to be able to %1?",
+    "If you could %1, would you?")),
+
+  (r'You are (.*)',
+  ( "Why do you think I am %1?",
+    "Does it please you to think that I'm %1?",
+    "Perhaps you would like me to be %1.",
+    "Perhaps you're really talking about yourself?")),
+
+  (r'You\'re (.*)',
+  ( "Why do you say I am %1?",
+    "Why do you think I am %1?",
+    "Are we talking about you, or me?")),
+
+  (r'I don\'t (.*)',
+  ( "Don't you really %1?",
+    "Why don't you %1?",
+    "Do you want to %1?")),
+
+  (r'I feel (.*)',
+  ( "Good, tell me more about these feelings.",
+    "Do you often feel %1?",
+    "When do you usually feel %1?",
+    "When you feel %1, what do you do?")),
+
+  (r'I have (.*)',
+  ( "Why do you tell me that you've %1?",
+    "Have you really %1?",
+    "Now that you have %1, what will you do next?")),
+
+  (r'I would (.*)',
+  ( "Could you explain why you would %1?",
+    "Why would you %1?",
+    "Who else knows that you would %1?")),
+
+  (r'Is there (.*)',
+  ( "Do you think there is %1?",
+    "It's likely that there is %1.",
+    "Would you like there to be %1?")),
+
+  (r'My (.*)',
+  ( "I see, your %1.",
+    "Why do you say that your %1?",
+    "When your %1, how do you feel?")),
+
+  (r'You (.*)',
+  ( "We should be discussing you, not me.",
+    "Why do you say that about me?",
+    "Why do you care whether I %1?")),
+
+  (r'Why (.*)',
+  ( "Why don't you tell me the reason why %1?",
+    "Why do you think %1?" )),
+
+  (r'I want (.*)',
+  ( "What would it mean to you if you got %1?",
+    "Why do you want %1?",
+    "What would you do if you got %1?",
+    "If you got %1, then what would you do?")),
+
+  (r'(.*) mother(.*)',
+  ( "Tell me more about your mother.",
+    "What was your relationship with your mother like?",
+    "How do you feel about your mother?",
+    "How does this relate to your feelings today?",
+    "Good family relations are important.")),
+
+  (r'(.*) father(.*)',
+  ( "Tell me more about your father.",
+    "How did your father make you feel?",
+    "How do you feel about your father?",
+    "Does your relationship with your father relate to your feelings today?",
+    "Do you have trouble showing affection with your family?")),
+
+  (r'(.*) child(.*)',
+  ( "Did you have close friends as a child?",
+    "What is your favorite childhood memory?",
+    "Do you remember any dreams or nightmares from childhood?",
+    "Did the other children sometimes tease you?",
+    "How do you think your childhood experiences relate to your feelings today?")),
+
+  (r'(.*)\?',
+  ( "Why do you ask that?",
+    "Please consider whether you can answer your own question.",
+    "Perhaps the answer lies within yourself?",
+    "Why don't you tell me?")),
+
+  (r'quit',
+  ( "Thank you for talking with me.",
+    "Good-bye.",
+    "Thank you, that will be $150.  Have a good day!")),
+
+  (r'(.*)',
+  ( "Please tell me more.",
+    "Let's change focus a bit... Tell me about your family.",
+    "Can you elaborate on that?",
+    "Why do you say that %1?",
+    "I see.",
+    "Very interesting.",
+    "%1.",
+    "I see.  And what does that tell you?",
+    "How does that make you feel?",
+    "How do you feel when you say that?"))
+)
+
+eliza_chatbot = Chat(pairs, reflections)
+
+def eliza_chat():
+    print("Therapist\n---------")
+    print("Talk to the program by typing in plain English, using normal upper-")
+    print('and lower-case letters and punctuation.  Enter "quit" when done.')
+    print('='*72)
+    print("Hello.  How are you feeling today?")
+
+    eliza_chatbot.converse()
+
+def demo():
+    eliza_chat()
+
+if __name__ == "__main__":
+    demo()
+
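
Because the module-level `eliza_chatbot` is a plain `Chat` instance, the pair
table above can also be exercised without the interactive `converse()` loop.
A small sketch (the input sentence is made up; the reply is one of several
random choices for the matching pattern):

    from nltk.chat.eliza import eliza_chatbot

    print(eliza_chatbot.respond("I need a holiday"))
    # e.g. "Why do you need a holiday?"
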
diff --git a/nlp_resource_data/nltk/chat/eliza.pyc b/nlp_resource_data/nltk/chat/eliza.pyc
new file mode 100755 (executable)
index 0000000..68f33c8
Binary files /dev/null and b/nlp_resource_data/nltk/chat/eliza.pyc differ
diff --git a/nlp_resource_data/nltk/chat/iesha.py b/nlp_resource_data/nltk/chat/iesha.py
new file mode 100755 (executable)
index 0000000..68d52be
--- /dev/null
@@ -0,0 +1,140 @@
+# Natural Language Toolkit: Teen Chatbot
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Selina Dennis <sjmd@csse.unimelb.edu.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+This chatbot is a tongue-in-cheek take on the average teen
+anime junky that frequents YahooMessenger or MSNM.
+All spelling mistakes and flawed grammar are intentional.
+"""
+from __future__ import print_function
+
+from nltk.chat.util import Chat
+
+reflections = {
+    "am"     : "r",
+    "was"    : "were",
+    "i"      : "u",
+    "i'd"    : "u'd",
+    "i've"   : "u'v",
+    "ive"    : "u'v",
+    "i'll"   : "u'll",
+    "my"     : "ur",
+    "are"    : "am",
+    "you're" : "im",
+    "you've" : "ive",
+    "you'll" : "i'll",
+    "your"   : "my",
+    "yours"  : "mine",
+    "you"    : "me",
+    "u"      : "me",
+    "ur"     : "my",
+    "urs"    : "mine",
+    "me"     : "u"
+}
+
+# Note: %1/2/etc. are used without a preceding space because the chatbot
+# seems to add a superfluous space when matching.
+
+pairs = (
+    (r'I\'m (.*)',
+    ( "ur%1?? that's so cool! kekekekeke ^_^ tell me more!",
+      "ur%1? neat!! kekeke >_<")),
+
+    (r'(.*) don\'t you (.*)',
+    ( "u think I can%2??! really?? kekeke \<_\<",
+      "what do u mean%2??!",
+      "i could if i wanted, don't you think!! kekeke")),
+
+    (r'ye[as] [iI] (.*)',
+    ( "u%1? cool!! how?",
+      "how come u%1??",
+      "u%1? so do i!!")),
+
+    (r'do (you|u) (.*)\??',
+    ( "do i%2? only on tuesdays! kekeke *_*",
+      "i dunno! do u%2??")),
+
+    (r'(.*)\?',
+    ( "man u ask lots of questions!",
+      "booooring! how old r u??",
+      "boooooring!! ur not very fun")),
+
+    (r'(cos|because) (.*)',
+    ( "hee! i don't believe u! >_<",
+      "nuh-uh! >_<",
+      "ooooh i agree!")),
+
+    (r'why can\'t [iI] (.*)',
+    ( "i dunno! y u askin me for!",
+      "try harder, silly! hee! ^_^",
+      "i dunno! but when i can't%1 i jump up and down!")),
+
+    (r'I can\'t (.*)',
+    ( "u can't what??! >_<",
+      "that's ok! i can't%1 either! kekekekeke ^_^",
+      "try harder, silly! hee! ^&^")),
+
+    (r'(.*) (like|love|watch) anime',
+    ( "omg i love anime!! do u like sailor moon??! ^&^",
+      "anime yay! anime rocks sooooo much!",
+      "oooh anime! i love anime more than anything!",
+      "anime is the bestest evar! evangelion is the best!",
+      "hee anime is the best! do you have ur fav??")),
+
+    (r'I (like|love|watch|play) (.*)',
+    ( "yay! %2 rocks!",
+      "yay! %2 is neat!",
+      "cool! do u like other stuff?? ^_^")),
+
+    (r'anime sucks|(.*) (hate|detest) anime',
+    ( "ur a liar! i'm not gonna talk to u nemore if u h8 anime *;*",
+      "no way! anime is the best ever!",
+      "nuh-uh, anime is the best!")),
+
+    (r'(are|r) (you|u) (.*)',
+    ( "am i%1??! how come u ask that!",
+      "maybe!  y shud i tell u?? kekeke >_>")),
+
+    (r'what (.*)',
+    ( "hee u think im gonna tell u? .v.",
+      "booooooooring! ask me somethin else!")),
+
+    (r'how (.*)',
+    ( "not tellin!! kekekekekeke ^_^",)),
+
+    (r'(hi|hello|hey) (.*)',
+    ( "hi!!! how r u!!",)),
+
+    (r'quit',
+    ( "mom says i have to go eat dinner now :,( bye!!",
+      "awww u have to go?? see u next time!!",
+      "how to see u again soon! ^_^")),
+
+    (r'(.*)',
+    ( "ur funny! kekeke",
+      "boooooring! talk about something else! tell me wat u like!",
+      "do u like anime??",
+      "do u watch anime? i like sailor moon! ^_^",
+      "i wish i was a kitty!! kekekeke ^_^"))
+    )
+
+iesha_chatbot = Chat(pairs, reflections)
+
+def iesha_chat():
+    print("Iesha the TeenBoT\n---------")
+    print("Talk to the program by typing in plain English, using normal upper-")
+    print('and lower-case letters and punctuation.  Enter "quit" when done.')
+    print('='*72)
+    print("hi!! i'm iesha! who r u??!")
+
+    iesha_chatbot.converse()
+
+def demo():
+    iesha_chat()
+
+if __name__ == "__main__":
+    demo()
diff --git a/nlp_resource_data/nltk/chat/iesha.pyc b/nlp_resource_data/nltk/chat/iesha.pyc
new file mode 100755 (executable)
index 0000000..ec19f91
Binary files /dev/null and b/nlp_resource_data/nltk/chat/iesha.pyc differ
diff --git a/nlp_resource_data/nltk/chat/rude.py b/nlp_resource_data/nltk/chat/rude.py
new file mode 100755 (executable)
index 0000000..0e571d7
--- /dev/null
@@ -0,0 +1,92 @@
+# Natural Language Toolkit: Rude Chatbot
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Peter Spiller <pspiller@csse.unimelb.edu.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function
+
+from nltk.chat.util import Chat, reflections
+
+pairs = (
+    (r'We (.*)',
+        ("What do you mean, 'we'?",
+        "Don't include me in that!",
+        "I wouldn't be so sure about that.")),
+
+    (r'You should (.*)',
+        ("Don't tell me what to do, buddy.",
+        "Really? I should, should I?")),
+
+    (r'You\'re(.*)',
+        ("More like YOU'RE %1!",
+        "Hah! Look who's talking.",
+        "Come over here and tell me I'm %1.")),
+
+    (r'You are(.*)',
+        ("More like YOU'RE %1!",
+        "Hah! Look who's talking.",
+        "Come over here and tell me I'm %1.")),
+
+    (r'I can\'t(.*)',
+        ("You do sound like the type who can't %1.",
+        "Hear that splashing sound? That's my heart bleeding for you.",
+        "Tell somebody who might actually care.")),
+
+    (r'I think (.*)',
+        ("I wouldn't think too hard if I were you.",
+        "You actually think? I'd never have guessed...")),
+
+    (r'I (.*)',
+        ("I'm getting a bit tired of hearing about you.",
+        "How about we talk about me instead?",
+        "Me, me, me... Frankly, I don't care.")),
+
+    (r'How (.*)',
+        ("How do you think?",
+        "Take a wild guess.",
+        "I'm not even going to dignify that with an answer.")),
+
+    (r'What (.*)',
+        ("Do I look like an encyclopedia?",
+        "Figure it out yourself.")),
+
+    (r'Why (.*)',
+        ("Why not?",
+        "That's so obvious I thought even you'd have already figured it out.")),
+
+    (r'(.*)shut up(.*)',
+        ("Make me.",
+        "Getting angry at a feeble NLP assignment? Somebody's losing it.",
+        "Say that again, I dare you.")),
+
+    (r'Shut up(.*)',
+        ("Make me.",
+        "Getting angry at a feeble NLP assignment? Somebody's losing it.",
+        "Say that again, I dare you.")),
+
+    (r'Hello(.*)',
+        ("Oh good, somebody else to talk to. Joy.",
+        "'Hello'? How original...")),
+
+    (r'(.*)',
+        ("I'm getting bored here. Become more interesting.",
+        "Either become more thrilling or get lost, buddy.",
+        "Change the subject before I die of fatal boredom."))
+)
+
+rude_chatbot = Chat(pairs, reflections)
+
+def rude_chat():
+    print("Talk to the program by typing in plain English, using normal upper-")
+    print('and lower-case letters and punctuation.  Enter "quit" when done.')
+    print('='*72)
+    print("I suppose I should say hello.")
+
+    rude_chatbot.converse()
+
+def demo():
+    rude_chat()
+
+if __name__ == "__main__":
+    demo()
diff --git a/nlp_resource_data/nltk/chat/rude.pyc b/nlp_resource_data/nltk/chat/rude.pyc
new file mode 100755 (executable)
index 0000000..2398cf5
Binary files /dev/null and b/nlp_resource_data/nltk/chat/rude.pyc differ
diff --git a/nlp_resource_data/nltk/chat/suntsu.py b/nlp_resource_data/nltk/chat/suntsu.py
new file mode 100755 (executable)
index 0000000..f2f1e1b
--- /dev/null
@@ -0,0 +1,117 @@
+# Natural Language Toolkit: Sun Tsu-Bot
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Sam Huston 2007
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Tsu bot responds to all queries with a Sun Tsu saying.
+
+Quoted from Sun Tsu's The Art of War
+Translated by LIONEL GILES, M.A. 1910
+Hosted by the Gutenberg Project
+http://www.gutenberg.org/
+"""
+from __future__ import print_function
+
+from nltk.chat.util import Chat, reflections
+
+pairs = (
+
+  (r'quit',
+  ( "Good-bye.",
+    "Plan well",
+    "May victory be your future")),
+
+  (r'[^\?]*\?',
+  ("Please consider whether you can answer your own question.",
+   "Ask me no questions!")),
+
+  (r'[0-9]+(.*)',
+  ("It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
+   "There are five essentials for victory")),
+
+
+  (r'[A-Ca-c](.*)',
+  ("The art of war is of vital importance to the State.",
+   "All warfare is based on deception.",
+   "If your opponent is secure at all points, be prepared for him. If he is in superior strength, evade him.",
+   "If the campaign is protracted, the resources of the State will not be equal to the strain.",
+   "Attack him where he is unprepared, appear where you are not expected.",
+   "There is no instance of a country having benefited from prolonged warfare.")),
+
+  (r'[D-Fd-f](.*)',
+  ("The skillful soldier does not raise a second levy, neither are his supply-wagons loaded more than twice.",
+   "Bring war material with you from home, but forage on the enemy.",
+   "In war, then, let your great object be victory, not lengthy campaigns.",
+   "To fight and conquer in all your battles is not supreme excellence; supreme excellence consists in breaking the enemy's resistance without fighting.")),
+
+  (r'[G-Ig-i](.*)',
+  ("Heaven signifies night and day, cold and heat, times and seasons.",
+   "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
+   "The good fighters of old first put themselves beyond the possibility of defeat, and then waited for an opportunity of defeating the enemy.",
+   "One may know how to conquer without being able to do it.")),
+
+  (r'[J-Lj-l](.*)',
+  ("There are three ways in which a ruler can bring misfortune upon his army.",
+   "By commanding the army to advance or to retreat, being ignorant of the fact that it cannot obey. This is called hobbling the army.",
+   "By attempting to govern an army in the same way as he administers a kingdom, being ignorant of the conditions which obtain in an army. This causes restlessness in the soldier's minds.",
+   "By employing the officers of his army without discrimination, through ignorance of the military principle of adaptation to circumstances. This shakes the confidence of the soldiers.",
+   "There are five essentials for victory",
+   "He will win who knows when to fight and when not to fight.",
+   "He will win who knows how to handle both superior and inferior forces.",
+   "He will win whose army is animated by the same spirit throughout all its ranks.",
+   "He will win who, prepared himself, waits to take the enemy unprepared.",
+   "He will win who has military capacity and is not interfered with by the sovereign.")),
+
+  (r'[M-Om-o](.*)',
+  ("If you know the enemy and know yourself, you need not fear the result of a hundred battles.",
+   "If you know yourself but not the enemy, for every victory gained you will also suffer a defeat.",
+   "If you know neither the enemy nor yourself, you will succumb in every battle.",
+   "The control of a large force is the same principle as the control of a few men: it is merely a question of dividing up their numbers.")),
+
+  (r'[P-Rp-r](.*)',
+  ("Security against defeat implies defensive tactics; ability to defeat the enemy means taking the offensive.",
+   "Standing on the defensive indicates insufficient strength; attacking, a superabundance of strength.",
+   "He wins his battles by making no mistakes. Making no mistakes is what establishes the certainty of victory, for it means conquering an enemy that is already defeated.",
+   "A victorious army opposed to a routed one, is as a pound's weight placed in the scale against a single grain.",
+   "The onrush of a conquering force is like the bursting of pent-up waters into a chasm a thousand fathoms deep.")),
+
+  (r'[S-Us-u](.*)',
+  ("What the ancients called a clever fighter is one who not only wins, but excels in winning with ease.",
+   "Hence his victories bring him neither reputation for wisdom nor credit for courage.",
+   "Hence the skillful fighter puts himself into a position which makes defeat impossible, and does not miss the moment for defeating the enemy.",
+   "In war the victorious strategist only seeks battle after the victory has been won, whereas he who is destined to defeat first fights and afterwards looks for victory.",
+   "There are not more than five musical notes, yet the combinations of these five give rise to more melodies than can ever be heard.",
+   "Appear at points which the enemy must hasten to defend; march swiftly to places where you are not expected.")),
+
+  (r'[V-Zv-z](.*)',
+  ("It is a matter of life and death, a road either to safety or to ruin.",
+  "Hold out baits to entice the enemy. Feign disorder, and crush him.",
+  "All men can see the tactics whereby I conquer, but what none can see is the strategy out of which victory is evolved.",
+  "Do not repeat the tactics which have gained you one victory, but let your methods be regulated by the infinite variety of circumstances.",
+  "So in war, the way is to avoid what is strong and to strike at what is weak.",
+  "Just as water retains no constant shape, so in warfare there are no constant conditions.")),
+
+  (r'(.*)',
+  ( "Your statement insults me.",
+    ""))
+)
+
+suntsu_chatbot = Chat(pairs, reflections)
+
+def suntsu_chat():
+    print("Talk to the program by typing in plain English, using normal upper-")
+    print('and lower-case letters and punctuation.  Enter "quit" when done.')
+    print('='*72)
+    print("You seek enlightenment?")
+
+    suntsu_chatbot.converse()
+
+def demo():
+    suntsu_chat()
+
+if __name__ == "__main__":
+    demo()
+
diff --git a/nlp_resource_data/nltk/chat/suntsu.pyc b/nlp_resource_data/nltk/chat/suntsu.pyc
new file mode 100755 (executable)
index 0000000..2110fe4
Binary files /dev/null and b/nlp_resource_data/nltk/chat/suntsu.pyc differ
diff --git a/nlp_resource_data/nltk/chat/util.py b/nlp_resource_data/nltk/chat/util.py
new file mode 100755 (executable)
index 0000000..c38b90a
--- /dev/null
@@ -0,0 +1,122 @@
+# Natural Language Toolkit: Chatbot Utilities
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
+# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
+from __future__ import print_function
+
+import re
+import random
+
+from six.moves import input
+
+
+reflections = {
+  "i am"       : "you are",
+  "i was"      : "you were",
+  "i"          : "you",
+  "i'm"        : "you are",
+  "i'd"        : "you would",
+  "i've"       : "you have",
+  "i'll"       : "you will",
+  "my"         : "your",
+  "you are"    : "I am",
+  "you were"   : "I was",
+  "you've"     : "I have",
+  "you'll"     : "I will",
+  "your"       : "my",
+  "yours"      : "mine",
+  "you"        : "me",
+  "me"         : "you"
+}
+
+class Chat(object):
+    def __init__(self, pairs, reflections={}):
+        """
+        Initialize the chatbot.  Pairs is a list of patterns and responses.  Each
+        pattern is a regular expression matching the user's statement or question,
+        e.g. r'I like (.*)'.  For each such pattern a list of possible responses
+        is given, e.g. ['Why do you like %1', 'Did you ever dislike %1'].  Material
+        which is matched by parenthesized sections of the patterns (e.g. .*) is mapped to
+        the numbered positions in the responses, e.g. %1.
+
+        :type pairs: list of tuple
+        :param pairs: The patterns and responses
+        :type reflections: dict
+        :param reflections: A mapping between first and second person expressions
+        :rtype: None
+        """
+
+        self._pairs = [(re.compile(x, re.IGNORECASE),y) for (x,y) in pairs]
+        self._reflections = reflections
+        self._regex = self._compile_reflections()
+
+
+    def _compile_reflections(self):
+        sorted_refl = sorted(self._reflections.keys(), key=len,
+                reverse=True)
+        return  re.compile(r"\b({0})\b".format("|".join(map(re.escape,
+            sorted_refl))), re.IGNORECASE)
+
+    def _substitute(self, str):
+        """
+        Substitute words in the string, according to the specified reflections,
+        e.g. "I'm" -> "you are"
+
+        :type str: str
+        :param str: The string to be mapped
+        :rtype: str
+        """
+
+        return self._regex.sub(lambda mo:
+                self._reflections[mo.string[mo.start():mo.end()]],
+                    str.lower())
+
+    def _wildcards(self, response, match):
+        pos = response.find('%')
+        while pos >= 0:
+            num = int(response[pos+1:pos+2])
+            response = response[:pos] + \
+                self._substitute(match.group(num)) + \
+                response[pos+2:]
+            pos = response.find('%')
+        return response
+
+    def respond(self, str):
+        """
+        Generate a response to the user input.
+
+        :type str: str
+        :param str: The string to be mapped
+        :rtype: str
+        """
+
+        # check each pattern
+        for (pattern, response) in self._pairs:
+            match = pattern.match(str)
+
+            # did the pattern match?
+            if match:
+                resp = random.choice(response)    # pick a random response
+                resp = self._wildcards(resp, match) # process wildcards
+
+                # fix munged punctuation at the end
+                if resp[-2:] == '?.': resp = resp[:-2] + '.'
+                if resp[-2:] == '??': resp = resp[:-2] + '?'
+                return resp
+
+    # Hold a conversation with a chatbot
+    def converse(self, quit="quit"):
+        user_input = ""
+        while user_input != quit:
+            user_input = quit
+            try: user_input = input(">")
+            except EOFError:
+                print(user_input)
+            if user_input:
+                while user_input[-1] in "!.": user_input = user_input[:-1]
+                print(self.respond(user_input))
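
A tiny sketch of building a custom bot with the `Chat` class above, showing how
a parenthesized group is substituted for `%1` after reflection ("my" -> "your");
the pair table here is made up for illustration:

    from nltk.chat.util import Chat, reflections

    pairs = (
        (r'I like (.*)',
         ("Why do you like %1?",
          "Did you ever dislike %1?")),
        (r'(.*)',
         ("Tell me more.",)),
    )

    bot = Chat(pairs, reflections)
    print(bot.respond("I like my dog"))   # e.g. "Why do you like your dog?"
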
diff --git a/nlp_resource_data/nltk/chat/util.pyc b/nlp_resource_data/nltk/chat/util.pyc
new file mode 100755 (executable)
index 0000000..bef18fc
Binary files /dev/null and b/nlp_resource_data/nltk/chat/util.pyc differ
diff --git a/nlp_resource_data/nltk/chat/zen.py b/nlp_resource_data/nltk/chat/zen.py
new file mode 100755 (executable)
index 0000000..c06a122
--- /dev/null
@@ -0,0 +1,282 @@
+# Natural Language Toolkit: Zen Chatbot
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Amy Holland <amyrh@csse.unimelb.edu.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Zen Chatbot talks in gems of Zen wisdom.
+
+This is a sample conversation with Zen Chatbot:
+ZC:    Welcome, my child.
+me:    Good afternoon.
+ZC:    Ask the question you have come to ask.
+me:    How can I achieve enlightenment?
+ZC:    How do you suppose?
+me:    Through meditation.
+ZC:    Form is emptiness, and emptiness form.
+me:    How can I empty my mind of worldly troubles?
+ZC:    Will an answer to that really help in your search for enlightenment?
+me:    Yes.
+ZC:    It is better to be right than to be certain.
+me:    I seek truth and wisdom.
+ZC:    The search for truth is a long journey.
+me:    Are you sure?
+ZC:    Maybe sure, maybe not sure.
+
+
+The chatbot structure is based on that of chat.eliza. Thus, it uses
+a translation table to convert from question to response
+i.e. "I am" --> "you are"
+
+Of course, since Zen Chatbot does not understand the meaning of any words,
+responses are very limited. Zen Chatbot will usually answer very vaguely, or
+respond to a question by asking a different question, in much the same way
+as Eliza.
+"""
+from __future__ import print_function
+
+from nltk.chat.util import Chat, reflections
+
+# responses are matched top to bottom, so non-specific matches occur later
+# for each match, a list of possible responses is provided
+responses = (
+
+# Zen Chatbot opens with the line "Welcome, my child." The usual
+# response will be a greeting problem: 'good' matches "good morning",
+# "good day" etc, but also "good grief!"  and other sentences starting
+# with the word 'good' that may not be a greeting
+
+    (r'(hello(.*))|(good [a-zA-Z]+)',
+    ( "The path to enlightenment is often difficult to see.",
+      "Greetings. I sense your mind is troubled. Tell me of your troubles.",
+      "Ask the question you have come to ask.",
+      "Hello. Do you seek englightenment?")),
+
+
+# "I need" and "I want" can be followed by a thing (eg 'help')
+# or an action (eg 'to see you')
+#
+# This is a problem with this style of response -
+# person:    "I need you"
+# chatbot:    "me can be achieved by hard work and dedication of the mind"
+# i.e. 'you' is not really a thing that can be mapped this way, so this
+# interpretation only makes sense for some inputs
+#
+    (r'i need (.*)',
+    ( "%1 can be achieved by hard work and dedication of the mind.",
+      "%1 is not a need, but a desire of the mind. Clear your mind of such concerns.",
+      "Focus your mind on%1, and you will find what you need.")),
+
+    (r'i want (.*)',
+    ( "Desires of the heart will distract you from the path to enlightenment.",
+      "Will%1 help you attain enlightenment?",
+      "Is%1 a desire of the mind, or of the heart?")),
+
+
+# why questions are separated into three types:
+# "why..I"     e.g. "why am I here?" "Why do I like cake?"
+# "why..you"    e.g. "why are you here?" "Why won't you tell me?"
+# "why..."    e.g. "Why is the sky blue?"
+# problems:
+#     person:  "Why can't you tell me?"
+#     chatbot: "Are you sure I tell you?"
+# - this style works for positives (e.g. "why do you like cake?")
+#   but does not work for negatives (e.g. "why don't you like cake?")
+    (r'why (.*) i (.*)\?',
+    ( "You%1%2?",
+      "Perhaps you only think you%1%2")),
+
+    (r'why (.*) you(.*)\?',
+    ( "Why%1 you%2?",
+      "%2 I%1",
+      "Are you sure I%2?")),
+
+    (r'why (.*)\?',
+    ( "I cannot tell you why%1.",
+      "Why do you think %1?" )),
+
+# e.g. "are you listening?", "are you a duck"
+    (r'are you (.*)\?',
+    ( "Maybe%1, maybe not%1.",
+      "Whether I am%1 or not is God's business.")),
+
+# e.g. "am I a duck?", "am I going to die?"
+    (r'am i (.*)\?',
+    ( "Perhaps%1, perhaps not%1.",
+      "Whether you are%1 or not is not for me to say.")),
+
+# what questions, e.g. "what time is it?"
+# problems:
+#     person:  "What do you want?"
+#    chatbot: "Seek truth, not what do me want."
+    (r'what (.*)\?',
+    ( "Seek truth, not what%1.",
+      "What%1 should not concern you.")),
+
+# how questions, e.g. "how do you do?"
+    (r'how (.*)\?',
+    ( "How do you suppose?",
+      "Will an answer to that really help in your search for enlightenment?",
+      "Ask yourself not how, but why.")),
+
+# can questions, e.g. "can you run?", "can you come over here please?"
+    (r'can you (.*)\?',
+    ( "I probably can, but I may not.",
+      "Maybe I can%1, and maybe I cannot.",
+      "I can do all, and I can do nothing.")),
+
+# can questions, e.g. "can I have some cake?", "can I know truth?"
+    (r'can i (.*)\?',
+    ( "You can%1 if you believe you can%1, and have a pure spirit.",
+      "Seek truth and you will know if you can%1.")),
+
+# e.g. "It is raining" - implies the speaker is certain of a fact
+    (r'it is (.*)',
+    ( "How can you be certain that%1, when you do not even know yourself?",
+      "Whether it is%1 or not does not change the way the world is.")),
+
+# e.g. "is there a doctor in the house?"
+    (r'is there (.*)\?',
+    ( "There is%1 if you believe there is.",
+      "It is possible that there is%1.")),
+
+# e.g. "is it possible?", "is this true?"
+    (r'is(.*)\?',
+    ( "%1 is not relevant.",
+      "Does this matter?")),
+
+# non-specific question
+    (r'(.*)\?',
+    ( "Do you think %1?",
+      "You seek the truth. Does the truth seek you?",
+      "If you intentionally pursue the answers to your questions, the answers become hard to see.",
+      "The answer to your question cannot be told. It must be experienced.")),
+
+# expression of hate of form "I hate you" or "Kelly hates cheese"
+    (r'(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)',
+    ( "Perhaps it is not about hating %2, but about hate from within.",
+      "Weeds only grow when we dislike them",
+      "Hate is a very strong emotion.")),
+
+# statement containing the word 'truth'
+    (r'(.*) truth(.*)',
+    ( "Seek truth, and truth will seek you.",
+      "Remember, it is not the spoon which bends - only yourself.",
+      "The search for truth is a long journey.")),
+
+# desire to do an action
+# e.g. "I want to go shopping"
+    (r'i want to (.*)',
+    ( "You may %1 if your heart truly desires to.",
+      "You may have to %1.")),
+
+# desire for an object
+# e.g. "I want a pony"
+    (r'i want (.*)',
+    ( "Does your heart truly desire %1?",
+      "Is this a desire of the heart, or of the mind?")),
+
+# e.g. "I can't wait" or "I can't do this"
+    (r'i can\'t (.*)',
+    ( "What we can and can't do is a limitation of the mind.",
+      "There are limitations of the body, and limitations of the mind.",
+      "Have you tried to%1 with a clear mind?")),
+
+# "I think.." indicates uncertainty. e.g. "I think so."
+# problem: exceptions...
+# e.g. "I think, therefore I am"
+    (r'i think (.*)',
+    ( "Uncertainty in an uncertain world.",
+     "Indeed, how can we be certain of anything in such uncertain times.",
+     "Are you not, in fact, certain that%1?")),
+
+# "I feel...emotions/sick/light-headed..."
+    (r'i feel (.*)',
+    ( "Your body and your emotions are both symptoms of your mind."
+      "What do you believe is the root of such feelings?",
+      "Feeling%1 can be a sign of your state-of-mind.")),
+
+
+# exclaimation mark indicating emotion
+# e.g. "Wow!" or "No!"
+    (r'(.*)!',
+    ( "I sense that you are feeling emotional today.",
+      "You need to calm your emotions.")),
+
+# because [statement]
+# e.g. "because I said so"
+    (r'because (.*)',
+    ( "Does knowning the reasons behind things help you to understand"
+      " the things themselves?",
+      "If%1, what else must be true?")),
+
+# yes or no - raise an issue of certainty/correctness
+    (r'(yes)|(no)',
+    ( "Is there certainty in an uncertain world?",
+      "It is better to be right than to be certain.")),
+
+# sentence containing word 'love'
+    (r'(.*)love(.*)',
+    ( "Think of the trees: they let the birds perch and fly with no intention to call them when they come, and no longing for their return when they fly away. Let your heart be like the trees.",
+      "Free love!")),
+
+# sentence containing word 'understand' - r
+    (r'(.*)understand(.*)',
+    ( "If you understand, things are just as they are;"
+      " if you do not understand, things are just as they are.",
+      "Imagination is more important than knowledge.")),
+
+# 'I', 'me', 'my' - person is talking about themself.
+# this breaks down when words contain these - eg 'Thyme', 'Irish'
+    (r'(.*)(me )|( me)|(my)|(mine)|(i)(.*)',
+    ( "'I', 'me', 'my'... these are selfish expressions.",
+      "Have you ever considered that you might be a selfish person?",
+      "Try to consider others, not just yourself.",
+      "Think not just of yourself, but of others.")),
+
+# 'you' starting a sentence
+# e.g. "you stink!"
+    (r'you (.*)',
+    ( "My path is not of conern to you.",
+      "I am but one, and you but one more.")),
+
+# say goodbye with some extra Zen wisdom.
+    (r'exit',
+    ( "Farewell. The obstacle is the path.",
+      "Farewell. Life is a journey, not a destination.",
+      "Good bye. We are cups, constantly and quietly being filled."
+      "\nThe trick is knowning how to tip ourselves over and let the beautiful stuff out.")),
+
+
+# fall through case -
+# when stumped, respond with generic zen wisdom
+#
+    (r'(.*)',
+    ( "When you're enlightened, every word is wisdom.",
+      "Random talk is useless.",
+      "The reverse side also has a reverse side.",
+      "Form is emptiness, and emptiness is form.",
+      "I pour out a cup of water. Is the cup empty?"))
+)
+
+zen_chatbot = Chat(responses, reflections)
+
+def zen_chat():
+    print('*'*75)
+    print("Zen Chatbot!".center(75))
+    print('*'*75)
+    print('"Look beyond mere words and letters - look into your mind"'.center(75))
+    print("* Talk your way to truth with Zen Chatbot.")
+    print("* Type 'quit' when you have had enough.")
+    print('*'*75)
+    print("Welcome, my child.")
+
+    zen_chatbot.converse()
+
+def demo():
+    zen_chat()
+
+if __name__ == "__main__":
+    demo()
diff --git a/nlp_resource_data/nltk/chat/zen.pyc b/nlp_resource_data/nltk/chat/zen.pyc
new file mode 100755 (executable)
index 0000000..87a0630
Binary files /dev/null and b/nlp_resource_data/nltk/chat/zen.pyc differ
diff --git a/nlp_resource_data/nltk/chunk/__init__.py b/nlp_resource_data/nltk/chunk/__init__.py
new file mode 100755 (executable)
index 0000000..8520202
--- /dev/null
@@ -0,0 +1,190 @@
+# Natural Language Toolkit: Chunkers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+"""
+Classes and interfaces for identifying non-overlapping linguistic
+groups (such as base noun phrases) in unrestricted text.  This task is
+called "chunk parsing" or "chunking", and the identified groups are
+called "chunks".  The chunked text is represented using a shallow
+tree called a "chunk structure."  A chunk structure is a tree
+containing tokens and chunks, where each chunk is a subtree containing
+only tokens.  For example, the chunk structure for base noun phrase
+chunks in the sentence "I saw the big dog on the hill" is::
+
+  (SENTENCE:
+    (NP: <I>)
+    <saw>
+    (NP: <the> <big> <dog>)
+    <on>
+    (NP: <the> <hill>))
+
+To convert a chunk structure back to a list of tokens, simply use the
+chunk structure's ``leaves()`` method.
+
+This module defines ``ChunkParserI``, a standard interface for
+chunking texts; and ``RegexpChunkParser``, a regular-expression based
+implementation of that interface. It also defines ``ChunkScore``, a
+utility class for scoring chunk parsers.
+
+RegexpChunkParser
+=================
+
+``RegexpChunkParser`` is an implementation of the chunk parser interface
+that uses regular-expressions over tags to chunk a text.  Its
+``parse()`` method first constructs a ``ChunkString``, which encodes a
+particular chunking of the input text.  Initially, nothing is
+chunked.  ``parse.RegexpChunkParser`` then applies a sequence of
+``RegexpChunkRule`` rules to the ``ChunkString``, each of which modifies
+the chunking that it encodes.  Finally, the ``ChunkString`` is
+transformed back into a chunk structure, which is returned.
+
+``RegexpChunkParser`` can only be used to chunk a single kind of phrase.
+For example, you can use an ``RegexpChunkParser`` to chunk the noun
+phrases in a text, or the verb phrases in a text; but you can not
+use it to simultaneously chunk both noun phrases and verb phrases in
+the same text.  (This is a limitation of ``RegexpChunkParser``, not of
+chunk parsers in general.)
+
+RegexpChunkRules
+----------------
+
+A ``RegexpChunkRule`` is a transformational rule that updates the
+chunking of a text by modifying its ``ChunkString``.  Each
+``RegexpChunkRule`` defines the ``apply()`` method, which modifies
+the chunking encoded by a ``ChunkString``.  The
+``RegexpChunkRule`` class itself can be used to implement any
+transformational rule based on regular expressions.  There are
+also a number of subclasses, which can be used to implement
+simpler types of rules:
+
+    - ``ChunkRule`` chunks anything that matches a given regular
+      expression.
+    - ``ChinkRule`` chinks anything that matches a given regular
+      expression.
+    - ``UnChunkRule`` will un-chunk any chunk that matches a given
+      regular expression.
+    - ``MergeRule`` can be used to merge two contiguous chunks.
+    - ``SplitRule`` can be used to split a single chunk into two
+      smaller chunks.
+    - ``ExpandLeftRule`` will expand a chunk to incorporate new
+      unchunked material on the left.
+    - ``ExpandRightRule`` will expand a chunk to incorporate new
+      unchunked material on the right.
+
+Tag Patterns
+~~~~~~~~~~~~
+
+A ``RegexpChunkRule`` uses a modified version of regular
+expression patterns, called "tag patterns".  Tag patterns are
+used to match sequences of tags.  Examples of tag patterns are::
+
+     r'(<DT>|<JJ>|<NN>)+'
+     r'<NN>+'
+     r'<NN.*>'
+
+The differences between regular expression patterns and tag
+patterns are:
+
+    - In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so
+      ``'<NN>+'`` matches one or more repetitions of ``'<NN>'``, not
+      ``'<NN'`` followed by one or more repetitions of ``'>'``.
+    - Whitespace in tag patterns is ignored.  So
+      ``'<DT> | <NN>'`` is equivalent to ``'<DT>|<NN>'``
+    - In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so
+      ``'<NN.*>'`` matches any single tag starting with ``'NN'``.
+
+The function ``tag_pattern2re_pattern`` can be used to transform
+a tag pattern to an equivalent regular expression pattern.
+
+Efficiency
+----------
+
+Preliminary tests indicate that ``RegexpChunkParser`` can chunk at a
+rate of about 300 tokens/second, with a moderately complex rule set.
+
+There may be problems if ``RegexpChunkParser`` is used with more than
+5,000 tokens at a time.  In particular, evaluation of some regular
+expressions may cause the Python regular expression engine to
+exceed its maximum recursion depth.  We have attempted to minimize
+these problems, but it is impossible to avoid them completely.  We
+therefore recommend that you apply the chunk parser to a single
+sentence at a time.
+
+Emacs Tip
+---------
+
+If you evaluate the following elisp expression in emacs, it will
+colorize a ``ChunkString`` when you use an interactive python shell
+with emacs or xemacs ("C-c !")::
+
+    (let ()
+      (defconst comint-mode-font-lock-keywords
+        '(("<[^>]+>" 0 'font-lock-reference-face)
+          ("[{}]" 0 'font-lock-function-name-face)))
+      (add-hook 'comint-mode-hook (lambda () (turn-on-font-lock))))
+
+You can evaluate this code by copying it to a temporary buffer,
+placing the cursor after the last close parenthesis, and typing
+"``C-x C-e``".  You should evaluate it before running the interactive
+session.  The change will last until you close emacs.
+
+Unresolved Issues
+-----------------
+
+If we use the ``re`` module for regular expressions, Python's
+regular expression engine generates "maximum recursion depth
+exceeded" errors when processing very large texts, even for
+regular expressions that should not require any recursion.  We
+therefore use the ``pre`` module instead.  But note that ``pre``
+does not include Unicode support, so this module will not work
+with unicode strings.  Note also that ``pre`` regular expressions
+are not quite as advanced as ``re`` ones (e.g., no leftward
+zero-length assertions).
+
+:type CHUNK_TAG_PATTERN: regexp
+:var CHUNK_TAG_PATTERN: A regular expression to test whether a tag
+     pattern is valid.
+"""
+
+from nltk.data import load
+
+from nltk.chunk.api import ChunkParserI
+from nltk.chunk.util import (ChunkScore, accuracy, tagstr2tree, conllstr2tree,
+                             conlltags2tree, tree2conlltags, tree2conllstr,
+                             ieerstr2tree)
+from nltk.chunk.regexp import RegexpChunkParser, RegexpParser
+
+# Standard named entity chunker models
+_BINARY_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_binary.pickle'
+_MULTICLASS_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_multiclass.pickle'
+
+def ne_chunk(tagged_tokens, binary=False):
+    """
+    Use NLTK's currently recommended named entity chunker to
+    chunk the given list of tagged tokens.
+    """
+    if binary:
+        chunker_pickle = _BINARY_NE_CHUNKER
+    else:
+        chunker_pickle = _MULTICLASS_NE_CHUNKER
+    chunker = load(chunker_pickle)
+    return chunker.parse(tagged_tokens)
+
+def ne_chunk_sents(tagged_sentences, binary=False):
+    """
+    Use NLTK's currently recommended named entity chunker to chunk the
+    given list of tagged sentences, each consisting of a list of tagged tokens.
+    """
+    if binary:
+        chunker_pickle = _BINARY_NE_CHUNKER
+    else:
+        chunker_pickle = _MULTICLASS_NE_CHUNKER
+    chunker = load(chunker_pickle)
+    return chunker.parse_sents(tagged_sentences)
+
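
A short sketch of the `RegexpParser` interface documented above, using one of
the tag patterns from the docstring (the tagged sentence is made up; `ne_chunk`
is omitted here because it additionally needs the `maxent_ne_chunker` pickle):

    from nltk.chunk import RegexpParser

    grammar = r'NP: {(<DT>|<JJ>|<NN>)+}'     # a chunk rule over a tag pattern
    chunker = RegexpParser(grammar)
    sent = [('the', 'DT'), ('big', 'JJ'), ('dog', 'NN'),
            ('barked', 'VBD'), ('loudly', 'RB')]
    print(chunker.parse(sent))
    # (S (NP the/DT big/JJ dog/NN) barked/VBD loudly/RB)
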
diff --git a/nlp_resource_data/nltk/chunk/__init__.pyc b/nlp_resource_data/nltk/chunk/__init__.pyc
new file mode 100755 (executable)
index 0000000..8ad6b0d
Binary files /dev/null and b/nlp_resource_data/nltk/chunk/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/chunk/api.py b/nlp_resource_data/nltk/chunk/api.py
new file mode 100755 (executable)
index 0000000..5e41f7a
--- /dev/null
@@ -0,0 +1,51 @@
+# Natural Language Toolkit: Chunk parsing API
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com> (minor additions)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+##//////////////////////////////////////////////////////
+##  Chunk Parser Interface
+##//////////////////////////////////////////////////////
+
+from nltk.parse import ParserI
+
+from nltk.chunk.util import ChunkScore
+
+class ChunkParserI(ParserI):
+    """
+    A processing interface for identifying non-overlapping groups in
+    unrestricted text.  Typically, chunk parsers are used to find base
+    syntactic constituents, such as base noun phrases.  Unlike
+    ``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method
+    will always generate a parse.
+    """
+    def parse(self, tokens):
+        """
+        Find the best chunk structure for the given tokens
+        and return it as a tree.
+
+        :param tokens: The list of (word, tag) tokens to be chunked.
+        :type tokens: list(tuple)
+        :rtype: Tree
+        """
+        raise NotImplementedError()
+
+    def evaluate(self, gold):
+        """
+        Score the accuracy of the chunker against the gold standard.
+        Strip the chunking from the gold standard text, rechunk it using
+        the chunker, and return a ``ChunkScore`` object
+        reflecting the performance of this chunk parser.
+
+        :type gold: list(Tree)
+        :param gold: The list of chunked sentences to score the chunker on.
+        :rtype: ChunkScore
+        """
+        chunkscore = ChunkScore()
+        for correct in gold:
+            chunkscore.score(correct, self.parse(correct.leaves()))
+        return chunkscore
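+
+# Illustrative evaluation sketch (assumes the 'conll2000' corpus has been
+# downloaded, and that ``my_chunker`` is some ChunkParserI implementation):
+#
+#     from nltk.corpus import conll2000
+#     gold = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
+#     print(my_chunker.evaluate(gold))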
+
diff --git a/nlp_resource_data/nltk/chunk/api.pyc b/nlp_resource_data/nltk/chunk/api.pyc
new file mode 100755 (executable)
index 0000000..86a759a
Binary files /dev/null and b/nlp_resource_data/nltk/chunk/api.pyc differ
diff --git a/nlp_resource_data/nltk/chunk/named_entity.py b/nlp_resource_data/nltk/chunk/named_entity.py
new file mode 100755 (executable)
index 0000000..9867b0a
--- /dev/null
@@ -0,0 +1,331 @@
+# Natural Language Toolkit: Chunk parsing API
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Named entity chunker
+"""
+from __future__ import print_function
+
+import os, re, pickle
+from xml.etree import ElementTree as ET
+
+from nltk.tag import ClassifierBasedTagger, pos_tag
+
+try:
+    from nltk.classify import MaxentClassifier
+except ImportError:
+    pass
+
+from nltk.tree import Tree
+from nltk.tokenize import word_tokenize
+from nltk.data import find
+
+from nltk.chunk.api import ChunkParserI
+from nltk.chunk.util import ChunkScore
+
+class NEChunkParserTagger(ClassifierBasedTagger):
+    """
+    The IOB tagger used by the chunk parser.
+    """
+    def __init__(self, train):
+        ClassifierBasedTagger.__init__(
+            self, train=train,
+            classifier_builder=self._classifier_builder)
+
+    def _classifier_builder(self, train):
+        return MaxentClassifier.train(train, algorithm='megam',
+                                           gaussian_prior_sigma=1,
+                                           trace=2)
+
+    def _english_wordlist(self):
+        try:
+            wl = self._en_wordlist
+        except AttributeError:
+            from nltk.corpus import words
+            self._en_wordlist = set(words.words('en-basic'))
+            wl = self._en_wordlist
+        return wl
+
+    def _feature_detector(self, tokens, index, history):
+        word = tokens[index][0]
+        pos = simplify_pos(tokens[index][1])
+        if index == 0:
+            prevword = prevprevword = None
+            prevpos = prevprevpos = None
+            prevshape = prevtag = prevprevtag = None
+        elif index == 1:
+            prevword = tokens[index-1][0].lower()
+            prevprevword = None
+            prevpos = simplify_pos(tokens[index-1][1])
+            prevprevpos = None
+            prevtag = history[index-1]
+            prevshape = prevprevtag = None
+        else:
+            prevword = tokens[index-1][0].lower()
+            prevprevword = tokens[index-2][0].lower()
+            prevpos = simplify_pos(tokens[index-1][1])
+            prevprevpos = simplify_pos(tokens[index-2][1])
+            prevtag = history[index-1]
+            prevprevtag = history[index-2]
+            prevshape = shape(prevword)
+        if index == len(tokens)-1:
+            nextword = nextnextword = None
+            nextpos = nextnextpos = None
+        elif index == len(tokens)-2:
+            nextword = tokens[index+1][0].lower()
+            nextpos = tokens[index+1][1].lower()
+            nextnextword = None
+            nextnextpos = None
+        else:
+            nextword = tokens[index+1][0].lower()
+            nextpos = tokens[index+1][1].lower()
+            nextnextword = tokens[index+2][0].lower()
+            nextnextpos = tokens[index+2][1].lower()
+
+        # 89.6
+        features = {
+            'bias': True,
+            'shape': shape(word),
+            'wordlen': len(word),
+            'prefix3': word[:3].lower(),
+            'suffix3': word[-3:].lower(),
+            'pos': pos,
+            'word': word,
+            'en-wordlist': (word in self._english_wordlist()),
+            'prevtag': prevtag,
+            'prevpos': prevpos,
+            'nextpos': nextpos,
+            'prevword': prevword,
+            'nextword': nextword,
+            'word+nextpos': '{0}+{1}'.format(word.lower(), nextpos),
+            'pos+prevtag': '{0}+{1}'.format(pos, prevtag),
+            'shape+prevtag': '{0}+{1}'.format(prevshape, prevtag),
+            }
+
+        return features
+
+class NEChunkParser(ChunkParserI):
+    """
+    Expected input: list of pos-tagged words
+    """
+    def __init__(self, train):
+        self._train(train)
+
+    def parse(self, tokens):
+        """
+        Each token should be a pos-tagged word
+        """
+        tagged = self._tagger.tag(tokens)
+        tree = self._tagged_to_parse(tagged)
+        return tree
+
+    def _train(self, corpus):
+        # Convert to tagged sequence
+        corpus = [self._parse_to_tagged(s) for s in corpus]
+
+        self._tagger = NEChunkParserTagger(train=corpus)
+
+    def _tagged_to_parse(self, tagged_tokens):
+        """
+        Convert a list of tagged tokens to a chunk-parse tree.
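+
+        For example (illustrative), the tagged tokens::
+
+            [('John', 'B-PERSON'), ('Smith', 'I-PERSON'), ('ran', 'O')]
+
+        are converted to ``Tree('S', [Tree('PERSON', ['John', 'Smith']), 'ran'])``.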
+        """
+        sent = Tree('S', [])
+
+        for (tok,tag) in tagged_tokens:
+            if tag == 'O':
+                sent.append(tok)
+            elif tag.startswith('B-'):
+                sent.append(Tree(tag[2:], [tok]))
+            elif tag.startswith('I-'):
+                if (sent and isinstance(sent[-1], Tree) and
+                    sent[-1].label() == tag[2:]):
+                    sent[-1].append(tok)
+                else:
+                    sent.append(Tree(tag[2:], [tok]))
+        return sent
+
+    @staticmethod
+    def _parse_to_tagged(sent):
+        """
+        Convert a chunk-parse tree to a list of tagged tokens.
+        """
+        toks = []
+        for child in sent:
+            if isinstance(child, Tree):
+                if len(child) == 0:
+                    print("Warning -- empty chunk in sentence")
+                    continue
+                toks.append((child[0], 'B-{0}'.format(child.label())))
+                for tok in child[1:]:
+                    toks.append((tok, 'I-{0}'.format(child.label())))
+            else:
+                toks.append((child, 'O'))
+        return toks
+
+def shape(word):
+    if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word, re.UNICODE):
+        return 'number'
+    elif re.match('\W+$', word, re.UNICODE):
+        return 'punct'
+    elif re.match('\w+$', word, re.UNICODE):
+        if word.istitle():
+            return 'upcase'
+        elif word.islower():
+            return 'downcase'
+        else:
+            return 'mixedcase'
+    else:
+        return 'other'
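+
+# Illustrative values (sketch): shape('2018') == 'number',
+# shape('Samsung') == 'upcase', shape('nltk') == 'downcase',
+# shape('NLTK') == 'mixedcase', shape('...') == 'punct'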
+
+def simplify_pos(s):
+    if s.startswith('V'): return "V"
+    else: return s.split('-')[0]
+
+def postag_tree(tree):
+    # Part-of-speech tagging.
+    words = tree.leaves()
+    tag_iter = (pos for (word, pos) in pos_tag(words))
+    newtree = Tree('S', [])
+    for child in tree:
+        if isinstance(child, Tree):
+            newtree.append(Tree(child.label(), []))
+            for subchild in child:
+                newtree[-1].append( (subchild, next(tag_iter)) )
+        else:
+            newtree.append( (child, next(tag_iter)) )
+    return newtree
+
+def load_ace_data(roots, fmt='binary', skip_bnews=True):
+    for root in roots:
+        for root, dirs, files in os.walk(root):
+            if root.endswith('bnews') and skip_bnews:
+                continue
+            for f in files:
+                if f.endswith('.sgm'):
+                    for sent in load_ace_file(os.path.join(root, f), fmt):
+                        yield sent
+
+def load_ace_file(textfile, fmt):
+    print('  - {0}'.format(os.path.split(textfile)[1]))
+    annfile = textfile+'.tmx.rdc.xml'
+
+    # Read the xml file, and get a list of entities
+    entities = []
+    with open(annfile, 'r') as infile:
+        xml = ET.parse(infile).getroot()
+    for entity in xml.findall('document/entity'):
+        typ = entity.find('entity_type').text
+        for mention in entity.findall('entity_mention'):
+            if mention.get('TYPE') != 'NAME': continue # only NEs
+            s = int(mention.find('head/charseq/start').text)
+            e = int(mention.find('head/charseq/end').text)+1
+            entities.append( (s, e, typ) )
+
+    # Read the text file, and mark the entities.
+    with open(textfile, 'r') as infile:
+        text = infile.read()
+
+    # Strip XML tags, since they don't count towards the indices
+    text = re.sub('<(?!/?TEXT)[^>]+>', '', text)
+
+    # Blank out anything before/after <TEXT>
+    def subfunc(m): return ' '*(m.end()-m.start()-6)
+    text = re.sub('[\s\S]*<TEXT>', subfunc, text)
+    text = re.sub('</TEXT>[\s\S]*', '', text)
+
+    # Simplify quotes
+    text = re.sub("``", ' "', text)
+    text = re.sub("''", '" ', text)
+
+    entity_types = set(typ for (s,e,typ) in entities)
+
+    # Binary distinction (NE or not NE)
+    if fmt == 'binary':
+        i = 0
+        toks = Tree('S', [])
+        for (s,e,typ) in sorted(entities):
+            if s < i: s = i # Overlapping!  Deal with this better?
+            if e <= s: continue
+            toks.extend(word_tokenize(text[i:s]))
+            toks.append(Tree('NE', text[s:e].split()))
+            i = e
+        toks.extend(word_tokenize(text[i:]))
+        yield toks
+
+    # Multiclass distinction (NE type)
+    elif fmt == 'multiclass':
+        i = 0
+        toks = Tree('S', [])
+        for (s,e,typ) in sorted(entities):
+            if s < i: s = i # Overlapping!  Deal with this better?
+            if e <= s: continue
+            toks.extend(word_tokenize(text[i:s]))
+            toks.append(Tree(typ, text[s:e].split()))
+            i = e
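+    A minimal usage sketch (illustrative)::
+
+        >>> from nltk.chunk.regexp import ChunkRule, RegexpChunkParser
+        >>> rule = ChunkRule('<DT>?<JJ>*<NN>', 'Chunk det, adjectives and noun')
+        >>> parser = RegexpChunkParser([rule], chunk_label='NP')
+        >>> print(parser.parse([('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]))
+        (S (NP the/DT cat/NN) sat/VBD)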
+        toks.extend(word_tokenize(text[i:]))
+        yield toks
+
+    else:
+        raise ValueError('bad fmt value')
+
+# This probably belongs in a more general-purpose location (as does
+# the parse_to_tagged function).
+def cmp_chunks(correct, guessed):
+    correct = NEChunkParser._parse_to_tagged(correct)
+    guessed = NEChunkParser._parse_to_tagged(guessed)
+    ellipsis = False
+    for (w, ct), (w, gt) in zip(correct, guessed):
+        if ct == gt == 'O':
+            if not ellipsis:
+                print("  {:15} {:15} {2}".format(ct, gt, w))
+                print('  {:15} {:15} {2}'.format('...', '...', '...'))
+                ellipsis = True
+        else:
+            ellipsis = False
+            print("  {:15} {:15} {2}".format(ct, gt, w))
+
+def build_model(fmt='binary'):
+    print('Loading training data...')
+    train_paths = [find('corpora/ace_data/ace.dev'),
+                   find('corpora/ace_data/ace.heldout'),
+                   find('corpora/ace_data/bbn.dev'),
+                   find('corpora/ace_data/muc.dev')]
+    train_trees = load_ace_data(train_paths, fmt)
+    train_data = [postag_tree(t) for t in train_trees]
+    print('Training...')
+    cp = NEChunkParser(train_data)
+    del train_data
+
+    print('Loading eval data...')
+    eval_paths = [find('corpora/ace_data/ace.eval')]
+    eval_trees = load_ace_data(eval_paths, fmt)
+    eval_data = [postag_tree(t) for t in eval_trees]
+
+    print('Evaluating...')
+    chunkscore = ChunkScore()
+    for i, correct in enumerate(eval_data):
+        guess = cp.parse(correct.leaves())
+        chunkscore.score(correct, guess)
+        if i < 3: cmp_chunks(correct, guess)
+    print(chunkscore)
+
+    outfilename = '/tmp/ne_chunker_{0}.pickle'.format(fmt)
+    print('Saving chunker to {0}...'.format(outfilename))
+
+    with open(outfilename, 'wb') as outfile:
+        pickle.dump(cp, outfile, -1)
+
+    return cp
+
+
+if __name__ == '__main__':
+    # Make sure that the pickled object has the right class name:
+    from nltk.chunk.named_entity import build_model
+
+    build_model('binary')
+    build_model('multiclass')
+
diff --git a/nlp_resource_data/nltk/chunk/named_entity.pyc b/nlp_resource_data/nltk/chunk/named_entity.pyc
new file mode 100755 (executable)
index 0000000..d8feefb
Binary files /dev/null and b/nlp_resource_data/nltk/chunk/named_entity.pyc differ
diff --git a/nlp_resource_data/nltk/chunk/regexp.py b/nlp_resource_data/nltk/chunk/regexp.py
new file mode 100755 (executable)
index 0000000..63855b0
--- /dev/null
@@ -0,0 +1,1390 @@
+# Natural Language Toolkit: Regular Expression Chunkers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com> (minor additions)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+from __future__ import division
+
+import re
+
+from six import string_types
+
+from nltk.tree import Tree
+from nltk.chunk.api import ChunkParserI
+from nltk.compat import python_2_unicode_compatible, unicode_repr
+
+##//////////////////////////////////////////////////////
+##  ChunkString
+##//////////////////////////////////////////////////////
+
+@python_2_unicode_compatible
+class ChunkString(object):
+    """
+    A string-based encoding of a particular chunking of a text.
+    Internally, the ``ChunkString`` class uses a single string to
+    encode the chunking of the input text.  This string contains a
+    sequence of angle-bracket delimited tags, with chunking indicated
+    by braces.  An example of this encoding is::
+
+        {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>
+
+    ``ChunkString`` objects are created from tagged texts (i.e., lists of
+    ``tokens`` whose type is ``TaggedType``).  Initially, nothing is
+    chunked.
+
+    The chunking of a ``ChunkString`` can be modified with the ``xform()``
+    method, which uses a regular expression to transform the string
+    representation.  These transformations should only add and remove
+    braces; they should *not* modify the sequence of angle-bracket
+    delimited tags.
+
+    :type _str: str
+    :ivar _str: The internal string representation of the text's
+        encoding.  This string representation contains a sequence of
+        angle-bracket delimited tags, with chunking indicated by
+        braces.  An example of this encoding is::
+
+            {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>
+
+    :type _pieces: list(tagged tokens and chunks)
+    :ivar _pieces: The tagged tokens and chunks encoded by this ``ChunkString``.
+    :ivar _debug: The debug level.  See the constructor docs.
+
+    :cvar IN_CHUNK_PATTERN: A zero-width regexp pattern string that
+        will only match positions that are in chunks.
+    :cvar IN_CHINK_PATTERN: A zero-width regexp pattern string that
+        will only match positions that are in chinks.
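+
+    A minimal construction sketch (illustrative)::
+
+        >>> from nltk.tree import Tree
+        >>> from nltk.chunk.regexp import ChunkString
+        >>> ChunkString(Tree('S', [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]))
+        <ChunkString: '<DT><NN><VBD>'>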
+    """
+    CHUNK_TAG_CHAR = r'[^\{\}<>]'
+    CHUNK_TAG = r'(<%s+?>)' % CHUNK_TAG_CHAR
+
+    IN_CHUNK_PATTERN = r'(?=[^\{]*\})'
+    IN_CHINK_PATTERN = r'(?=[^\}]*(\{|$))'
+
+    # These are used by _verify
+    _CHUNK = r'(\{%s+?\})+?' % CHUNK_TAG
+    _CHINK = r'(%s+?)+?' % CHUNK_TAG
+    _VALID = re.compile(r'^(\{?%s\}?)*?$' % CHUNK_TAG)
+    _BRACKETS = re.compile('[^\{\}]+')
+    _BALANCED_BRACKETS = re.compile(r'(\{\})*$')
+
+    def __init__(self, chunk_struct, debug_level=1):
+        """
+        Construct a new ``ChunkString`` that encodes the chunking of
+        the text ``tagged_tokens``.
+
+        :type chunk_struct: Tree
+        :param chunk_struct: The chunk structure to be further chunked.
+        :type debug_level: int
+        :param debug_level: The level of debugging which should be
+            applied to transformations on the ``ChunkString``.  The
+            valid levels are:
+                - 0: no checks
+                - 1: full check on to_chunkstruct
+                - 2: full check on to_chunkstruct and cursory check after
+                   each transformation.
+                - 3: full check on to_chunkstruct and full check after
+                   each transformation.
+            We recommend you use at least level 1.  You should
+            probably use level 3 if you use any non-standard
+            subclasses of ``RegexpChunkRule``.
+        """
+        self._root_label = chunk_struct.label()
+        self._pieces = chunk_struct[:]
+        tags = [self._tag(tok) for tok in self._pieces]
+        self._str = '<' + '><'.join(tags) + '>'
+        self._debug = debug_level
+
+    def _tag(self, tok):
+        if isinstance(tok, tuple):
+            return tok[1]
+        elif isinstance(tok, Tree):
+            return tok.label()
+        else:
+            raise ValueError('chunk structures must contain tagged '
+                             'tokens or trees')
+
+    def _verify(self, s, verify_tags):
+        """
+        Check to make sure that ``s`` still corresponds to some chunked
+        version of ``_pieces``.
+
+        :type verify_tags: bool
+        :param verify_tags: Whether the individual tags should be
+            checked.  If this is false, ``_verify`` will check to make
+            sure that ``_str`` encodes a chunked version of *some*
+            list of tokens.  If this is true, then ``_verify`` will
+            check to make sure that the tags in ``_str`` match those in
+            ``_pieces``.
+
+        :raise ValueError: if the internal string representation of
+            this ``ChunkString`` is invalid or not consistent with _pieces.
+        """
+        # Check overall form
+        if not ChunkString._VALID.match(s):
+            raise ValueError('Transformation generated invalid '
+                             'chunkstring:\n  %s' % s)
+
+        # Check that parens are balanced.  If the string is long, we
+        # have to do this in pieces, to avoid a maximum recursion
+        # depth limit for regular expressions.
+        brackets = ChunkString._BRACKETS.sub('', s)
+        for i in range(1 + len(brackets) // 5000):
+            substr = brackets[i*5000:i*5000+5000]
+            if not ChunkString._BALANCED_BRACKETS.match(substr):
+                raise ValueError('Transformation generated invalid '
+                                 'chunkstring:\n  %s' % s)
+
+        if verify_tags<=0: return
+
+        tags1 = (re.split(r'[\{\}<>]+', s))[1:-1]
+        tags2 = [self._tag(piece) for piece in self._pieces]
+        if tags1 != tags2:
+            raise ValueError('Transformation generated invalid '
+                             'chunkstring: tag changed')
+
+    def to_chunkstruct(self, chunk_label='CHUNK'):
+        """
+        Return the chunk structure encoded by this ``ChunkString``.
+
+        :rtype: Tree
+        :raise ValueError: If a transformation has generated an
+            invalid chunkstring.
+        """
+        if self._debug > 0: self._verify(self._str, 1)
+
+        # Use this alternating list to create the chunkstruct.
+        pieces = []
+        index = 0
+        piece_in_chunk = 0
+        for piece in re.split('[{}]', self._str):
+
+            # Find the list of tokens contained in this piece.
+            length = piece.count('<')
+            subsequence = self._pieces[index:index+length]
+
+            # Add this list of tokens to our pieces.
+            if piece_in_chunk:
+                pieces.append(Tree(chunk_label, subsequence))
+            else:
+                pieces += subsequence
+
+            # Update index, piece_in_chunk
+            index += length
+            piece_in_chunk = not piece_in_chunk
+
+        return Tree(self._root_label, pieces)
+
+    def xform(self, regexp, repl):
+        """
+        Apply the given transformation to the string encoding of this
+        ``ChunkString``.  In particular, find all occurrences that match
+        ``regexp``, and replace them using ``repl`` (as done by
+        ``re.sub``).
+
+        This transformation should only add and remove braces; it
+        should *not* modify the sequence of angle-bracket delimited
+        tags.  Furthermore, this transformation may not result in
+        improper bracketing.  Note, in particular, that bracketing may
+        not be nested.
+
+        :type regexp: str or regexp
+        :param regexp: A regular expression matching the substring
+            that should be replaced.  This will typically include a
+            named group, which can be used by ``repl``.
+        :type repl: str
+        :param repl: An expression specifying what should replace the
+            matched substring.  Typically, this will include a named
+            replacement group, specified by ``regexp``.
+        :rtype: None
+        :raise ValueError: If this transformation generated an
+            invalid chunkstring.
+        """
+        # Do the actual substitution
+        s = re.sub(regexp, repl, self._str)
+
+        # The substitution might have generated "empty chunks"
+        # (substrings of the form "{}").  Remove them, so they don't
+        # interfere with other transformations.
+        s = re.sub('\{\}', '', s)
+
+        # Make sure that the transformation was legal.
+        if self._debug > 1: self._verify(s, self._debug-2)
+
+        # Commit the transformation.
+        self._str = s
+
+    def __repr__(self):
+        """
+        Return a string representation of this ``ChunkString``.
+        It has the form::
+
+            <ChunkString: '{<DT><JJ><NN>}<VBN><IN>{<DT><NN>}'>
+
+        :rtype: str
+        """
+        return '<ChunkString: %s>' % unicode_repr(self._str)
+
+    def __str__(self):
+        """
+        Return a formatted representation of this ``ChunkString``.
+        This representation will include extra spaces to ensure that
+        tags will line up with the representation of other
+        ``ChunkStrings`` for the same text, regardless of the chunking.
+
+        :rtype: str
+        """
+        # Add spaces to make everything line up.
+        str = re.sub(r'>(?!\})', r'> ', self._str)
+        str = re.sub(r'([^\{])<', r'\1 <', str)
+        if str[0] == '<': str = ' ' + str
+        return str
+
+##//////////////////////////////////////////////////////
+##  Chunking Rules
+##//////////////////////////////////////////////////////
+
+@python_2_unicode_compatible
+class RegexpChunkRule(object):
+    """
+    A rule specifying how to modify the chunking in a ``ChunkString``,
+    using a transformational regular expression.  The
+    ``RegexpChunkRule`` class itself can be used to implement any
+    transformational rule based on regular expressions.  There are
+    also a number of subclasses, which can be used to implement
+    simpler types of rules, based on matching regular expressions.
+
+    Each ``RegexpChunkRule`` has a regular expression and a
+    replacement expression.  When a ``RegexpChunkRule`` is "applied"
+    to a ``ChunkString``, it searches the ``ChunkString`` for any
+    substring that matches the regular expression, and replaces it
+    using the replacement expression.  This search/replace operation
+    has the same semantics as ``re.sub``.
+
+    Each ``RegexpChunkRule`` also has a description string, which
+    gives a short (typically less than 75 characters) description of
+    the purpose of the rule.
+
+    The transformation defined by this ``RegexpChunkRule`` should
+    only add and remove braces; it should *not* modify the sequence
+    of angle-bracket delimited tags.  Furthermore, this transformation
+    may not result in nested or mismatched bracketing.
+    """
+    def __init__(self, regexp, repl, descr):
+        """
+        Construct a new RegexpChunkRule.
+
+        :type regexp: regexp or str
+        :param regexp: The regular expression for this ``RegexpChunkRule``.
+            When this rule is applied to a ``ChunkString``, any
+            substring that matches ``regexp`` will be replaced using
+            the replacement string ``repl``.  Note that this must be a
+            normal regular expression, not a tag pattern.
+        :type repl: str
+        :param repl: The replacement expression for this ``RegexpChunkRule``.
+            When this rule is applied to a ``ChunkString``, any substring
+            that matches ``regexp`` will be replaced using ``repl``.
+        :type descr: str
+        :param descr: A short description of the purpose and/or effect
+            of this rule.
+        """
+        if isinstance(regexp, string_types):
+            regexp = re.compile(regexp)
+        self._repl = repl
+        self._descr = descr
+        self._regexp = regexp
+
+    def apply(self, chunkstr):
+        # Keep docstring generic so we can inherit it.
+        """
+        Apply this rule to the given ``ChunkString``.  See the
+        class reference documentation for a description of what it
+        means to apply a rule.
+
+        :type chunkstr: ChunkString
+        :param chunkstr: The chunkstring to which this rule is applied.
+        :rtype: None
+        :raise ValueError: If this transformation generated an
+            invalid chunkstring.
+        """
+        chunkstr.xform(self._regexp, self._repl)
+
+    def descr(self):
+        """
+        Return a short description of the purpose and/or effect of
+        this rule.
+
+        :rtype: str
+        """
+        return self._descr
+
+    def __repr__(self):
+        """
+        Return a string representation of this rule.  It has the form::
+
+            <RegexpChunkRule: '{<IN|VB.*>}'->'<IN>'>
+
+        Note that this representation does not include the
+        description string; that string can be accessed
+        separately with the ``descr()`` method.
+
+        :rtype: str
+        """
+        return ('<RegexpChunkRule: '+unicode_repr(self._regexp.pattern)+
+                '->'+unicode_repr(self._repl)+'>')
+
+    @staticmethod
+    def fromstring(s):
+        """
+        Create a RegexpChunkRule from a string description.
+        Currently, the following formats are supported::
+
+          {regexp}         # chunk rule
+          }regexp{         # chink rule
+          regexp}{regexp   # split rule
+          regexp{}regexp   # merge rule
+
+        Where ``regexp`` is a regular expression for the rule.  Any
+        text following the comment marker (``#``) will be used as
+        the rule's description:
+
+        >>> from nltk.chunk.regexp import RegexpChunkRule
+        >>> RegexpChunkRule.fromstring('{<DT>?<NN.*>+}')
+        <ChunkRule: '<DT>?<NN.*>+'>
+        """
+        # Split off the comment (but don't split on '\#')
+        m = re.match(r'(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?', s)
+        rule = m.group('rule').strip()
+        comment = (m.group('comment') or '')[1:].strip()
+
+        # Pattern bodies: chunk, chink, split, merge
+        try:
+            if not rule:
+                raise ValueError('Empty chunk pattern')
+            if rule[0] == '{' and rule[-1] == '}':
+                return ChunkRule(rule[1:-1], comment)
+            elif rule[0] == '}' and rule[-1] == '{':
+                return ChinkRule(rule[1:-1], comment)
+            elif '}{' in rule:
+                left, right = rule.split('}{')
+                return SplitRule(left, right, comment)
+            elif '{}' in rule:
+                left, right = rule.split('{}')
+                return MergeRule(left, right, comment)
+            elif re.match('[^{}]*{[^{}]*}[^{}]*', rule):
+                left, chunk, right = re.split('[{}]', rule)
+                return ChunkRuleWithContext(left, chunk, right, comment)
+            else:
+                raise ValueError('Illegal chunk pattern: %s' % rule)
+        except (ValueError, re.error):
+            raise ValueError('Illegal chunk pattern: %s' % rule)
+
+
+@python_2_unicode_compatible
+class ChunkRule(RegexpChunkRule):
+    """
+    A rule specifying how to add chunks to a ``ChunkString``, using a
+    matching tag pattern.  When applied to a ``ChunkString``, it will
+    find any substring that matches this tag pattern and that is not
+    already part of a chunk, and create a new chunk containing that
+    substring.
+    """
+    def __init__(self, tag_pattern, descr):
+
+        """
+        Construct a new ``ChunkRule``.
+
+        :type tag_pattern: str
+        :param tag_pattern: This rule's tag pattern.  When
+            applied to a ``ChunkString``, this rule will
+            chunk any substring that matches this tag pattern and that
+            is not already part of a chunk.
+        :type descr: str
+        :param descr: A short description of the purpose and/or effect
+            of this rule.
+        """
+        self._pattern = tag_pattern
+        regexp = re.compile('(?P<chunk>%s)%s' %
+                            (tag_pattern2re_pattern(tag_pattern),
+                             ChunkString.IN_CHINK_PATTERN))
+        RegexpChunkRule.__init__(self, regexp, '{\g<chunk>}', descr)
+
+    def __repr__(self):
+        """
+        Return a string representation of this rule.  It has the form::
+
+            <ChunkRule: '<IN|VB.*>'>
+
+        Note that this representation does not include the
+        description string; that string can be accessed
+        separately with the ``descr()`` method.
+
+        :rtype: str
+        """
+        return '<ChunkRule: '+unicode_repr(self._pattern)+'>'
+
+@python_2_unicode_compatible
+class ChinkRule(RegexpChunkRule):
+    """
+    A rule specifying how to remove chinks from a ``ChunkString``,
+    using a matching tag pattern.  When applied to a
+    ``ChunkString``, it will find any substring that matches this
+    tag pattern and that is contained in a chunk, and remove it
+    from that chunk, thus creating two new chunks.
+    """
+    def __init__(self, tag_pattern, descr):
+        """
+        Construct a new ``ChinkRule``.
+
+        :type tag_pattern: str
+        :param tag_pattern: This rule's tag pattern.  When
+            applied to a ``ChunkString``, this rule will
+            find any substring that matches this tag pattern and that
+            is contained in a chunk, and remove it from that chunk,
+            thus creating two new chunks.
+        :type descr: str
+        :param descr: A short description of the purpose and/or effect
+            of this rule.
+        """
+        self._pattern = tag_pattern
+        regexp = re.compile('(?P<chink>%s)%s' %
+                            (tag_pattern2re_pattern(tag_pattern),
+                             ChunkString.IN_CHUNK_PATTERN))
+        RegexpChunkRule.__init__(self, regexp, '}\g<chink>{', descr)
+
+    def __repr__(self):
+        """
+        Return a string representation of this rule.  It has the form::
+
+            <ChinkRule: '<IN|VB.*>'>
+
+        Note that this representation does not include the
+        description string; that string can be accessed
+        separately with the ``descr()`` method.
+
+        :rtype: str
+        """
+        return '<ChinkRule: '+unicode_repr(self._pattern)+'>'
+
+
+@python_2_unicode_compatible
+class UnChunkRule(RegexpChunkRule):
+    """
+    A rule specifying how to remove chunks from a ``ChunkString``,
+    using a matching tag pattern.  When applied to a
+    ``ChunkString``, it will find any complete chunk that matches this
+    tag pattern, and un-chunk it.
+    """
+    def __init__(self, tag_pattern, descr):
+        """
+        Construct a new ``UnChunkRule``.
+
+        :type tag_pattern: str
+        :param tag_pattern: This rule's tag pattern.  When
+            applied to a ``ChunkString``, this rule will
+            find any complete chunk that matches this tag pattern,
+            and un-chunk it.
+        :type descr: str
+        :param descr: A short description of the purpose and/or effect
+            of this rule.
+        """
+        self._pattern = tag_pattern
+        regexp = re.compile('\{(?P<chunk>%s)\}' %
+                            tag_pattern2re_pattern(tag_pattern))
+        RegexpChunkRule.__init__(self, regexp, '\g<chunk>', descr)
+
+    def __repr__(self):
+        """
+        Return a string representation of this rule.  It has the form::
+
+            <UnChunkRule: '<IN|VB.*>'>
+
+        Note that this representation does not include the
+        description string; that string can be accessed
+        separately with the ``descr()`` method.
+
+        :rtype: str
+        """
+        return '<UnChunkRule: '+unicode_repr(self._pattern)+'>'
+
+
+@python_2_unicode_compatible
+class MergeRule(RegexpChunkRule):
+    """
+    A rule specifying how to merge chunks in a ``ChunkString``, using
+    two matching tag patterns: a left pattern, and a right pattern.
+    When applied to a ``ChunkString``, it will find any chunk whose end
+    matches the left pattern and that is immediately followed by a chunk
+    whose beginning matches the right pattern.  It will then merge those
+    two chunks into a single chunk.
+    """
+    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
+        """
+        Construct a new ``MergeRule``.
+
+        :type right_tag_pattern: str
+        :param right_tag_pattern: This rule's right tag
+            pattern.  When applied to a ``ChunkString``, this
+            rule will find any chunk whose end matches
+            ``left_tag_pattern``, and immediately followed by a chunk
+            whose beginning matches this pattern.  It will
+            then merge those two chunks into a single chunk.
+        :type left_tag_pattern: str
+        :param left_tag_pattern: This rule's left tag
+            pattern.  When applied to a ``ChunkString``, this
+            rule will find any chunk whose end matches
+            this pattern, and immediately followed by a chunk
+            whose beginning matches ``right_tag_pattern``.  It will
+            then merge those two chunks into a single chunk.
+
+        :type descr: str
+        :param descr: A short description of the purpose and/or effect
+            of this rule.
+        """
+        # Ensure that the individual patterns are coherent.  E.g., if
+        # left='(' and right=')', then this will raise an exception:
+        re.compile(tag_pattern2re_pattern(left_tag_pattern))
+        re.compile(tag_pattern2re_pattern(right_tag_pattern))
+
+        self._left_tag_pattern = left_tag_pattern
+        self._right_tag_pattern = right_tag_pattern
+        regexp = re.compile('(?P<left>%s)}{(?=%s)' %
+                            (tag_pattern2re_pattern(left_tag_pattern),
+                             tag_pattern2re_pattern(right_tag_pattern)))
+        RegexpChunkRule.__init__(self, regexp, '\g<left>', descr)
+
+    def __repr__(self):
+        """
+        Return a string representation of this rule.  It has the form::
+
+            <MergeRule: '<NN|DT|JJ>', '<NN|JJ>'>
+
+        Note that this representation does not include the
+        description string; that string can be accessed
+        separately with the ``descr()`` method.
+
+        :rtype: str
+        """
+        return ('<MergeRule: '+unicode_repr(self._left_tag_pattern)+', '+
+                unicode_repr(self._right_tag_pattern)+'>')
+
+
+@python_2_unicode_compatible
+class SplitRule(RegexpChunkRule):
+    """
+    A rule specifying how to split chunks in a ``ChunkString``, using
+    two matching tag patterns: a left pattern, and a right pattern.
+    When applied to a ``ChunkString``, it will find any chunk that
+    matches the left pattern followed by the right pattern.  It will
+    then split the chunk into two new chunks, at the point between the
+    two pattern matches.
+    """
+    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
+        """
+        Construct a new ``SplitRule``.
+
+        :type right_tag_pattern: str
+        :param right_tag_pattern: This rule's right tag
+            pattern.  When applied to a ``ChunkString``, this rule will
+            find any chunk containing a substring that matches
+            ``left_tag_pattern`` followed by this pattern.  It will
+            then split the chunk into two new chunks at the point
+            between these two matching patterns.
+        :type left_tag_pattern: str
+        :param left_tag_pattern: This rule's left tag
+            pattern.  When applied to a ``ChunkString``, this rule will
+            find any chunk containing a substring that matches this
+            pattern followed by ``right_tag_pattern``.  It will then
+            split the chunk into two new chunks at the point between
+            these two matching patterns.
+        :type descr: str
+        :param descr: A short description of the purpose and/or effect
+            of this rule.
+        """
+        # Ensure that the individual patterns are coherent.  E.g., if
+        # left='(' and right=')', then this will raise an exception:
+        re.compile(tag_pattern2re_pattern(left_tag_pattern))
+        re.compile(tag_pattern2re_pattern(right_tag_pattern))
+
+        self._left_tag_pattern = left_tag_pattern
+        self._right_tag_pattern = right_tag_pattern
+        regexp = re.compile('(?P<left>%s)(?=%s)' %
+                            (tag_pattern2re_pattern(left_tag_pattern),
+                             tag_pattern2re_pattern(right_tag_pattern)))
+        RegexpChunkRule.__init__(self, regexp, r'\g<left>}{', descr)
+
+    def __repr__(self):
+        """
+        Return a string representation of this rule.  It has the form::
+
+            <SplitRule: '<NN>', '<DT>'>
+
+        Note that this representation does not include the
+        description string; that string can be accessed
+        separately with the ``descr()`` method.
+
+        :rtype: str
+        """
+        return ('<SplitRule: '+unicode_repr(self._left_tag_pattern)+', '+
+                unicode_repr(self._right_tag_pattern)+'>')
+
+
+@python_2_unicode_compatible
+class ExpandLeftRule(RegexpChunkRule):
+    """
+    A rule specifying how to expand chunks in a ``ChunkString`` to the left,
+    using two matching tag patterns: a left pattern, and a right pattern.
+    When applied to a ``ChunkString``, it will find any chunk whose beginning
+    matches the right pattern and that is immediately preceded by a chink
+    whose end matches the left pattern.  It will then expand the chunk to
+    incorporate the new material on the left.
+    """
+    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
+        """
+        Construct a new ``ExpandLeftRule``.
+
+        :type right_tag_pattern: str
+        :param right_tag_pattern: This rule's right tag
+            pattern.  When applied to a ``ChunkString``, this
+            rule will find any chunk whose beginning matches
+            this pattern, and is immediately preceded by a chink
+            whose end matches ``left_tag_pattern``.  It will
+            then expand the chunk to incorporate the new material on the left.
+        :type left_tag_pattern: str
+        :param left_tag_pattern: This rule's left tag
+            pattern.  When applied to a ``ChunkString``, this
+            rule will find any chunk whose beginning matches
+            ``right_tag_pattern``, and is immediately preceded by a chink
+            whose end matches this pattern.  It will
+            then expand the chunk to incorporate the new material on the left.
+
+        :type descr: str
+        :param descr: A short description of the purpose and/or effect
+            of this rule.
+        """
+        # Ensure that the individual patterns are coherent.  E.g., if
+        # left='(' and right=')', then this will raise an exception:
+        re.compile(tag_pattern2re_pattern(left_tag_pattern))
+        re.compile(tag_pattern2re_pattern(right_tag_pattern))
+
+        self._left_tag_pattern = left_tag_pattern
+        self._right_tag_pattern = right_tag_pattern
+        regexp = re.compile('(?P<left>%s)\{(?P<right>%s)' %
+                            (tag_pattern2re_pattern(left_tag_pattern),
+                             tag_pattern2re_pattern(right_tag_pattern)))
+        RegexpChunkRule.__init__(self, regexp, '{\g<left>\g<right>', descr)
+
+    def __repr__(self):
+        """
+        Return a string representation of this rule.  It has the form::
+
+            <ExpandLeftRule: '<NN|DT|JJ>', '<NN|JJ>'>
+
+        Note that this representation does not include the
+        description string; that string can be accessed
+        separately with the ``descr()`` method.
+
+        :rtype: str
+        """
+        return ('<ExpandLeftRule: '+unicode_repr(self._left_tag_pattern)+', '+
+                unicode_repr(self._right_tag_pattern)+'>')
+
+
+@python_2_unicode_compatible
+class ExpandRightRule(RegexpChunkRule):
+    """
+    A rule specifying how to expand chunks in a ``ChunkString`` to the
+    right, using two matching tag patterns: a left pattern, and a
+    right pattern.  When applied to a ``ChunkString``, it will find any
+    chunk whose end matches the left pattern and that is immediately
+    followed by a chink whose beginning matches the right pattern.  It will
+    then expand the chunk to incorporate the new material on the right.
+    """
+    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
+        """
+        Construct a new ``ExpandRightRule``.
+
+        :type right_tag_pattern: str
+        :param right_tag_pattern: This rule's right tag
+            pattern.  When applied to a ``ChunkString``, this
+            rule will find any chunk whose end matches
+            ``left_tag_pattern``, and is immediately followed by a chink
+            whose beginning matches this pattern.  It will
+            then expand the chunk to incorporate the new material on the right.
+        :type left_tag_pattern: str
+        :param left_tag_pattern: This rule's left tag
+            pattern.  When applied to a ``ChunkString``, this
+            rule will find any chunk whose end matches
+            this pattern, and immediately followed by a chink
+            whose beginning matches ``right_tag_pattern``.  It will
+            then expand the chunk to incorporate the new material on the right.
+
+        :type descr: str
+        :param descr: A short description of the purpose and/or effect
+            of this rule.
+        """
+        # Ensure that the individual patterns are coherent.  E.g., if
+        # left='(' and right=')', then this will raise an exception:
+        re.compile(tag_pattern2re_pattern(left_tag_pattern))
+        re.compile(tag_pattern2re_pattern(right_tag_pattern))
+
+        self._left_tag_pattern = left_tag_pattern
+        self._right_tag_pattern = right_tag_pattern
+        regexp = re.compile('(?P<left>%s)\}(?P<right>%s)' %
+                            (tag_pattern2re_pattern(left_tag_pattern),
+                             tag_pattern2re_pattern(right_tag_pattern)))
+        RegexpChunkRule.__init__(self, regexp, '\g<left>\g<right>}', descr)
+
+    def __repr__(self):
+        """
+        Return a string representation of this rule.  It has the form::
+
+            <ExpandRightRule: '<NN|DT|JJ>', '<NN|JJ>'>
+
+        Note that this representation does not include the
+        description string; that string can be accessed
+        separately with the ``descr()`` method.
+
+        :rtype: str
+        """
+        return ('<ExpandRightRule: '+unicode_repr(self._left_tag_pattern)+', '+
+                unicode_repr(self._right_tag_pattern)+'>')
+
+
+@python_2_unicode_compatible
+class ChunkRuleWithContext(RegexpChunkRule):
+    """
+    A rule specifying how to add chunks to a ``ChunkString``, using
+    three matching tag patterns: one for the left context, one for the
+    chunk, and one for the right context.  When applied to a
+    ``ChunkString``, it will find any substring that matches the chunk
+    tag pattern, is surrounded by substrings that match the two
+    context patterns, and is not already part of a chunk; and create a
+    new chunk containing the substring that matched the chunk tag
+    pattern.
+
+    Caveat: Both the left and right context are consumed when this
+    rule matches; therefore, if you need to find overlapping matches,
+    you will need to apply your rule more than once.
+    """
+    def __init__(self, left_context_tag_pattern, chunk_tag_pattern,
+                 right_context_tag_pattern, descr):
+        """
+        Construct a new ``ChunkRuleWithContext``.
+
+        :type left_context_tag_pattern: str
+        :param left_context_tag_pattern: A tag pattern that must match
+            the left context of ``chunk_tag_pattern`` for this rule to
+            apply.
+        :type chunk_tag_pattern: str
+        :param chunk_tag_pattern: A tag pattern that must match for this
+            rule to apply.  If the rule does apply, then this pattern
+            also identifies the substring that will be made into a chunk.
+        :type right_context_tag_pattern: str
+        :param right_context_tag_pattern: A tag pattern that must match
+            the right context of ``chunk_tag_pattern`` for this rule to
+            apply.
+        :type descr: str
+        :param descr: A short description of the purpose and/or effect
+            of this rule.
+        """
+        # Ensure that the individual patterns are coherent.  E.g., if
+        # left='(' and right=')', then this will raise an exception:
+        re.compile(tag_pattern2re_pattern(left_context_tag_pattern))
+        re.compile(tag_pattern2re_pattern(chunk_tag_pattern))
+        re.compile(tag_pattern2re_pattern(right_context_tag_pattern))
+
+        self._left_context_tag_pattern = left_context_tag_pattern
+        self._chunk_tag_pattern = chunk_tag_pattern
+        self._right_context_tag_pattern = right_context_tag_pattern
+        regexp = re.compile('(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s' %
+                            (tag_pattern2re_pattern(left_context_tag_pattern),
+                             tag_pattern2re_pattern(chunk_tag_pattern),
+                             tag_pattern2re_pattern(right_context_tag_pattern),
+                             ChunkString.IN_CHINK_PATTERN))
+        replacement = r'\g<left>{\g<chunk>}\g<right>'
+        RegexpChunkRule.__init__(self, regexp, replacement, descr)
+
+    def __repr__(self):
+        """
+        Return a string representation of this rule.  It has the form::
+
+            <ChunkRuleWithContext: '<IN>', '<NN>', '<DT>'>
+
+        Note that this representation does not include the
+        description string; that string can be accessed
+        separately with the ``descr()`` method.
+
+        :rtype: str
+        """
+        return '<ChunkRuleWithContext:  %r, %r, %r>' % (
+            self._left_context_tag_pattern, self._chunk_tag_pattern,
+            self._right_context_tag_pattern)
+
+##//////////////////////////////////////////////////////
+##  Tag Pattern Format Conversion
+##//////////////////////////////////////////////////////
+
+# this should probably be made more strict than it is -- e.g., it
+# currently accepts 'foo'.
+CHUNK_TAG_PATTERN = re.compile(r'^((%s|<%s>)*)$' %
+                                ('([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+',
+                                 '[^\{\}<>]+'))
+
+
+def tag_pattern2re_pattern(tag_pattern):
+    """
+    Convert a tag pattern to a regular expression pattern.  A "tag
+    pattern" is a modified version of a regular expression, designed
+    for matching sequences of tags.  The differences between regular
+    expression patterns and tag patterns are:
+
+        - In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so
+          ``'<NN>+'`` matches one or more repetitions of ``'<NN>'``, not
+          ``'<NN'`` followed by one or more repetitions of ``'>'``.
+        - Whitespace in tag patterns is ignored.  So
+          ``'<DT> | <NN>'`` is equivalent to ``'<DT>|<NN>'``
+        - In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so
+          ``'<NN.*>'`` matches any single tag starting with ``'NN'``.
+
+    In particular, ``tag_pattern2re_pattern`` performs the following
+    transformations on the given pattern:
+
+        - Replace '.' with '[^<>{}]'
+        - Remove any whitespace
+        - Add extra parens around '<' and '>', to make '<' and '>' act
+          like parentheses.  E.g., so that in '<NN>+', the '+' has scope
+          over the entire '<NN>'; and so that in '<NN|IN>', the '|' has
+          scope over 'NN' and 'IN', but not '<' or '>'.
+        - Check to make sure the resulting pattern is valid.
+
+    :type tag_pattern: str
+    :param tag_pattern: The tag pattern to convert to a regular
+        expression pattern.
+    :raise ValueError: If ``tag_pattern`` is not a valid tag pattern.
+        In particular, ``tag_pattern`` should not include braces; and it
+        should not contain nested or mismatched angle-brackets.
+    :rtype: str
+    :return: A regular expression pattern corresponding to
+        ``tag_pattern``.
+    """
+    # Clean up the regular expression
+    tag_pattern = re.sub(r'\s', '', tag_pattern)
+    tag_pattern = re.sub(r'<', '(<(', tag_pattern)
+    tag_pattern = re.sub(r'>', ')>)', tag_pattern)
+
+    # Check the regular expression
+    if not CHUNK_TAG_PATTERN.match(tag_pattern):
+        raise ValueError('Bad tag pattern: %r' % tag_pattern)
+
+    # Replace "." with CHUNK_TAG_CHAR.
+    # We have to do this after, since it adds {}[]<>s, which would
+    # confuse CHUNK_TAG_PATTERN.
+    # PRE doesn't have lookback assertions, so reverse twice, and do
+    # the pattern backwards (with lookahead assertions).  This can be
+    # made much cleaner once we can switch back to SRE.
+    def reverse_str(str):
+        lst = list(str)
+        lst.reverse()
+        return ''.join(lst)
+    tc_rev = reverse_str(ChunkString.CHUNK_TAG_CHAR)
+    reversed = reverse_str(tag_pattern)
+    reversed = re.sub(r'\.(?!\\(\\\\)*($|[^\\]))', tc_rev, reversed)
+    tag_pattern = reverse_str(reversed)
+
+    return tag_pattern
+
+
+##//////////////////////////////////////////////////////
+##  RegexpChunkParser
+##//////////////////////////////////////////////////////
+
+@python_2_unicode_compatible
+class RegexpChunkParser(ChunkParserI):
+    """
+    A regular expression based chunk parser.  ``RegexpChunkParser`` uses a
+    sequence of "rules" to find chunks of a single type within a
+    text.  The chunking of the text is encoded using a ``ChunkString``,
+    and each rule acts by modifying the chunking in the
+    ``ChunkString``.  The rules are all implemented using regular
+    expression matching and substitution.
+
+    The ``RegexpChunkRule`` class and its subclasses (``ChunkRule``,
+    ``ChinkRule``, ``UnChunkRule``, ``MergeRule``, and ``SplitRule``)
+    define the rules that are used by ``RegexpChunkParser``.  Each rule
+    defines an ``apply()`` method, which modifies the chunking encoded
+    by a given ``ChunkString``.
+
+    :type _rules: list(RegexpChunkRule)
+    :ivar _rules: The list of rules that should be applied to a text.
+    :type _trace: int
+    :ivar _trace: The default level of tracing.
+
+    """
+    def __init__(self, rules, chunk_label='NP', root_label='S', trace=0):
+        """
+        Construct a new ``RegexpChunkParser``.
+
+        :type rules: list(RegexpChunkRule)
+        :param rules: The sequence of rules that should be used to
+            generate the chunking for a tagged text.
+        :type chunk_label: str
+        :param chunk_label: The node value that should be used for
+            chunk subtrees.  This is typically a short string
+            describing the type of information contained by the chunk,
+            such as ``"NP"`` for base noun phrases.
+        :type root_label: str
+        :param root_label: The node value that should be used for the
+            top node of the chunk structure.
+        :type trace: int
+        :param trace: The level of tracing that should be used when
+            parsing a text.  ``0`` will generate no tracing output;
+            ``1`` will generate normal tracing output; and ``2`` or
+            higher will generate verbose tracing output.
+        """
+        self._rules = rules
+        self._trace = trace
+        self._chunk_label = chunk_label
+        self._root_label = root_label
+
+    def _trace_apply(self, chunkstr, verbose):
+        """
+        Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``, in
+        turn.  Generate trace output between each rule.  If ``verbose``
+        is true, then generate verbose output.
+
+        :type chunkstr: ChunkString
+        :param chunkstr: The chunk string to which each rule should be
+            applied.
+        :type verbose: bool
+        :param verbose: Whether output should be verbose.
+        :rtype: None
+        """
+        print('# Input:')
+        print(chunkstr)
+        for rule in self._rules:
+            rule.apply(chunkstr)
+            if verbose:
+                print('#', rule.descr()+' ('+unicode_repr(rule)+'):')
+            else:
+                print('#', rule.descr()+':')
+            print(chunkstr)
+
+    def _notrace_apply(self, chunkstr):
+        """
+        Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``, in
+        turn.
+
+        :param chunkstr: The chunk string to which each rule should be
+            applied.
+        :type chunkstr: ChunkString
+        :rtype: None
+        """
+
+        for rule in self._rules:
+            rule.apply(chunkstr)
+
+    def parse(self, chunk_struct, trace=None):
+        """
+        :type chunk_struct: Tree
+        :param chunk_struct: the chunk structure to be (further) chunked
+        :type trace: int
+        :param trace: The level of tracing that should be used when
+            parsing a text.  ``0`` will generate no tracing output;
+            ``1`` will generate normal tracing output; and ``2`` or
+            higher will generate verbose tracing output.  This value
+            overrides the trace level value that was given to the
+            constructor.
+        :rtype: Tree
+        :return: a chunk structure that encodes the chunks in a given
+            tagged sentence.  A chunk is a non-overlapping linguistic
+            group, such as a noun phrase.  The set of chunks
+            identified in the chunk structure depends on the rules
+            used to define this ``RegexpChunkParser``.
+        """
+        if len(chunk_struct) == 0:
+            print('Warning: parsing empty text')
+            return Tree(self._root_label, [])
+
+        try:
+            chunk_struct.label()
+        except AttributeError:
+            chunk_struct = Tree(self._root_label, chunk_struct)
+
+        # Use the default trace value?
+        if trace is None: trace = self._trace
+
+        chunkstr = ChunkString(chunk_struct)
+
+        # Apply the sequence of rules to the chunkstring.
+        if trace:
+            verbose = (trace>1)
+            self._trace_apply(chunkstr, verbose)
+        else:
+            self._notrace_apply(chunkstr)
+
+        # Use the chunkstring to create a chunk structure.
+        return chunkstr.to_chunkstruct(self._chunk_label)
+
+    def rules(self):
+        """
+        :return: the sequence of rules used by ``RegexpChunkParser``.
+        :rtype: list(RegexpChunkRule)
+        """
+        return self._rules
+
+    def __repr__(self):
+        """
+        :return: a concise string representation of this
+            ``RegexpChunkParser``.
+        :rtype: str
+        """
+        return "<RegexpChunkParser with %d rules>" % len(self._rules)
+
+    def __str__(self):
+        """
+        :return: a verbose string representation of this ``RegexpChunkParser``.
+        :rtype: str
+        """
+        s = "RegexpChunkParser with %d rules:\n" % len(self._rules)
+        margin = 0
+        for rule in self._rules:
+            margin = max(margin, len(rule.descr()))
+        if margin < 35:
+            format = "    %" + repr(-(margin+3)) + "s%s\n"
+        else:
+            format = "    %s\n      %s\n"
+        for rule in self._rules:
+            s += format % (rule.descr(), unicode_repr(rule))
+        return s[:-1]
+
+##//////////////////////////////////////////////////////
+##  Chunk Grammar
+##//////////////////////////////////////////////////////
+
+@python_2_unicode_compatible
+class RegexpParser(ChunkParserI):
+    """
+    A grammar-based chunk parser.  ``chunk.RegexpParser`` uses a set of
+    regular expression patterns to specify the behavior of the parser.
+    The chunking of the text is encoded using a ``ChunkString``, and
+    each rule acts by modifying the chunking in the ``ChunkString``.
+    The rules are all implemented using regular expression matching
+    and substitution.
+
+    A grammar contains one or more clauses in the following form::
+
+     NP:
+       {<DT|JJ>}          # chunk determiners and adjectives
+       }<[\.VI].*>+{      # chink any tag beginning with V, I, or .
+       <.*>}{<DT>         # split a chunk at a determiner
+       <DT|JJ>{}<NN.*>    # merge chunk ending with det/adj
+                          # with one starting with a noun
+
+    The patterns of a clause are executed in order.  An earlier
+    pattern may introduce a chunk boundary that prevents a later
+    pattern from executing.  Sometimes an individual pattern will
+    match on multiple, overlapping extents of the input.  As with
+    regular expression substitution more generally, the chunker
+    identifies the first possible match and then continues looking for
+    matches starting after that one ends.
+
+    The clauses of a grammar are also executed in order.  A cascaded
+    chunk parser is one having more than one clause.  The maximum depth
+    of a parse tree created by this chunk parser is the same as the
+    number of clauses in the grammar.
+
+    When tracing is turned on, the comment portion of a line is displayed
+    each time the corresponding pattern is applied.
+
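+    A minimal usage sketch (a one-clause NP grammar like the one used in
+    ``demo()`` below; the output shown is only illustrative)::
+
+        >>> parser = RegexpParser("NP: {<DT>?<JJ>*<NN>}")              # doctest: +SKIP
+        >>> parser.parse([("the", "DT"), ("little", "JJ"),
+        ...               ("cat", "NN"), ("sat", "VBD")])              # doctest: +SKIP
+        Tree('S', [Tree('NP', [('the', 'DT'), ('little', 'JJ'), ('cat', 'NN')]), ('sat', 'VBD')])
+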
+    :type _start: str
+    :ivar _start: The start symbol of the grammar (the root node of
+        resulting trees)
+    :type _stages: int
+    :ivar _stages: The list of parsing stages corresponding to the grammar
+
+    """
+    def __init__(self, grammar, root_label='S', loop=1, trace=0):
+        """
+        Create a new chunk parser, from the given start state
+        and set of chunk patterns.
+
+        :param grammar: The grammar, or a list of RegexpChunkParser objects
+        :type grammar: str or list(RegexpChunkParser)
+        :param root_label: The top node of the tree being created
+        :type root_label: str or Nonterminal
+        :param loop: The number of times to run through the patterns
+        :type loop: int
+        :type trace: int
+        :param trace: The level of tracing that should be used when
+            parsing a text.  ``0`` will generate no tracing output;
+            ``1`` will generate normal tracing output; and ``2`` or
+            higher will generate verbose tracing output.
+        """
+        self._trace = trace
+        self._stages = []
+        self._grammar = grammar
+        self._loop = loop
+
+        if isinstance(grammar, string_types):
+            self._read_grammar(grammar, root_label, trace)
+        else:
+            # Make sure the grammar looks like it has the right type:
+            type_err = ('Expected string or list of RegexpChunkParsers '
+                        'for the grammar.')
+            try: grammar = list(grammar)
+            except: raise TypeError(type_err)
+            for elt in grammar:
+                if not isinstance(elt, RegexpChunkParser):
+                    raise TypeError(type_err)
+            self._stages = grammar
+
+    def _read_grammar(self, grammar, root_label, trace):
+        """
+        Helper function for __init__: read the grammar if it is a
+        string.
+        """
+        rules = []
+        lhs = None
+        for line in grammar.split('\n'):
+            line = line.strip()
+
+            # New stage begins if there's an unescaped ':'
+            m = re.match('(?P<nonterminal>(\\.|[^:])*)(:(?P<rule>.*))', line)
+            if m:
+                # Record the stage that we just completed.
+                self._add_stage(rules, lhs, root_label, trace)
+                # Start a new stage.
+                lhs = m.group('nonterminal').strip()
+                rules = []
+                line = m.group('rule').strip()
+
+            # Skip blank & comment-only lines
+            if line=='' or line.startswith('#'): continue
+
+            # Add the rule
+            rules.append(RegexpChunkRule.fromstring(line))
+
+        # Record the final stage
+        self._add_stage(rules, lhs, root_label, trace)
+
+    def _add_stage(self, rules, lhs, root_label, trace):
+        """
+        Helper function for __init__: add a new stage to the parser.
+        """
+        if rules != []:
+            if not lhs:
+                raise ValueError('Expected stage marker (eg NP:)')
+            parser = RegexpChunkParser(rules, chunk_label=lhs,
+                                       root_label=root_label, trace=trace)
+            self._stages.append(parser)
+
+    def parse(self, chunk_struct, trace=None):
+        """
+        Apply the chunk parser to this input.
+
+        :type chunk_struct: Tree
+        :param chunk_struct: the chunk structure to be (further) chunked
+            (this tree is modified, and is also returned)
+        :type trace: int
+        :param trace: The level of tracing that should be used when
+            parsing a text.  ``0`` will generate no tracing output;
+            ``1`` will generate normal tracing output; and ``2`` or
+            higher will generate verbose tracing output.  This value
+            overrides the trace level value that was given to the
+            constructor.
+        :return: the chunked output.
+        :rtype: Tree
+        """
+        if trace is None: trace = self._trace
+        for i in range(self._loop):
+            for parser in self._stages:
+                chunk_struct = parser.parse(chunk_struct, trace=trace)
+        return chunk_struct
+
+    def __repr__(self):
+        """
+        :return: a concise string representation of this ``chunk.RegexpParser``.
+        :rtype: str
+        """
+        return "<chunk.RegexpParser with %d stages>" % len(self._stages)
+
+    def __str__(self):
+        """
+        :return: a verbose string representation of this
+            ``RegexpParser``.
+        :rtype: str
+        """
+        s = "chunk.RegexpParser with %d stages:\n" % len(self._stages)
+        margin = 0
+        for parser in self._stages:
+            s += "%s\n" % parser
+        return s[:-1]
+
+##//////////////////////////////////////////////////////
+##  Demonstration code
+##//////////////////////////////////////////////////////
+
+def demo_eval(chunkparser, text):
+    """
+    Demonstration code for evaluating a chunk parser, using a
+    ``ChunkScore``.  This function assumes that ``text`` contains one
+    sentence per line, and that each sentence has the form expected by
+    ``tree.chunk``.  It runs the given chunk parser on each sentence in
+    the text, and scores the result.  It prints the final score
+    (precision, recall, and f-measure); and reports the set of chunks
+    that were missed and the set of chunks that were incorrect.  (At
+    most 10 missing chunks and 10 incorrect chunks are reported).
+
+    :param chunkparser: The chunkparser to be tested
+    :type chunkparser: ChunkParserI
+    :param text: The chunked tagged text that should be used for
+        evaluation.
+    :type text: str
+    """
+    from nltk import chunk
+    from nltk.tree import Tree
+
+    # Evaluate our chunk parser.
+    chunkscore = chunk.ChunkScore()
+
+    for sentence in text.split('\n'):
+        print(sentence)
+        sentence = sentence.strip()
+        if not sentence: continue
+        gold = chunk.tagstr2tree(sentence)
+        tokens = gold.leaves()
+        test = chunkparser.parse(Tree('S', tokens), trace=1)
+        chunkscore.score(gold, test)
+        print()
+
+    print('/'+('='*75)+'\\')
+    print('Scoring', chunkparser)
+    print(('-'*77))
+    print('Precision: %5.1f%%' % (chunkscore.precision()*100), ' '*4, end=' ')
+    print('Recall: %5.1f%%' % (chunkscore.recall()*100), ' '*6, end=' ')
+    print('F-Measure: %5.1f%%' % (chunkscore.f_measure()*100))
+
+
+    # Missed chunks.
+    if chunkscore.missed():
+        print('Missed:')
+        missed = chunkscore.missed()
+        for chunk in missed[:10]:
+            print('  ', ' '.join(map(str,chunk)))
+        if len(chunkscore.missed()) > 10:
+            print('  ...')
+
+    # Incorrect chunks.
+    if chunkscore.incorrect():
+        print('Incorrect:')
+        incorrect = chunkscore.incorrect()
+        for chunk in incorrect[:10]:
+            print('  ', ' '.join(map(str,chunk)))
+        if len(chunkscore.incorrect()) > 10:
+            print('  ...')
+
+    print('\\'+('='*75)+'/')
+    print()
+
+def demo():
+    """
+    A demonstration for the ``RegexpChunkParser`` class.  A single text is
+    parsed with four different chunk parsers, using a variety of rules
+    and strategies.
+    """
+
+    from nltk import chunk, Tree
+
+    text = """\
+    [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
+    [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
+    [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
+    """
+
+    print('*'*75)
+    print('Evaluation text:')
+    print(text)
+    print('*'*75)
+    print()
+
+    grammar = r"""
+    NP:                   # NP stage
+      {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
+      {<NNP>+}            # chunk proper nouns
+    """
+    cp = chunk.RegexpParser(grammar)
+    demo_eval(cp, text)
+
+    grammar = r"""
+    NP:
+      {<.*>}              # start by chunking each tag
+      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
+      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
+    """
+    cp = chunk.RegexpParser(grammar)
+    demo_eval(cp, text)
+
+    grammar = r"""
+    NP: {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
+    VP: {<TO>?<VB.*>}       # VP = verb words
+    """
+    cp = chunk.RegexpParser(grammar)
+    demo_eval(cp, text)
+
+    grammar = r"""
+    NP: {<.*>*}             # start by chunking everything
+        }<[\.VI].*>+{       # chink any verbs, prepositions or periods
+        <.*>}{<DT>          # separate on determiners
+    PP: {<IN><NP>}          # PP = preposition + noun phrase
+    VP: {<VB.*><NP|PP>*}    # VP = verb words + NPs and PPs
+    """
+    cp = chunk.RegexpParser(grammar)
+    demo_eval(cp, text)
+
+# Evaluation
+
+    from nltk.corpus import conll2000
+
+    print()
+    print("Demonstration of empty grammar:")
+
+    cp = chunk.RegexpParser("")
+    print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt',
+                                                     chunk_types=('NP',))))
+
+    print()
+    print("Demonstration of accuracy evaluation using CoNLL tags:")
+
+    grammar = r"""
+    NP:
+      {<.*>}              # start by chunking each tag
+      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
+      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
+    """
+    cp = chunk.RegexpParser(grammar)
+    print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5]))
+
+    print()
+    print("Demonstration of tagged token input")
+
+    grammar = r"""
+    NP: {<.*>*}             # start by chunking everything
+        }<[\.VI].*>+{       # chink any verbs, prepositions or periods
+        <.*>}{<DT>          # separate on determiners
+    PP: {<IN><NP>}          # PP = preposition + noun phrase
+    VP: {<VB.*><NP|PP>*}    # VP = verb words + NPs and PPs
+    """
+    cp = chunk.RegexpParser(grammar)
+    print(cp.parse([("the","DT"), ("little","JJ"), ("cat", "NN"),
+                    ("sat", "VBD"), ("on", "IN"), ("the", "DT"),
+                    ("mat", "NN"), (".", ".")]))
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/chunk/regexp.pyc b/nlp_resource_data/nltk/chunk/regexp.pyc
new file mode 100755 (executable)
index 0000000..15a5aed
Binary files /dev/null and b/nlp_resource_data/nltk/chunk/regexp.pyc differ
diff --git a/nlp_resource_data/nltk/chunk/util.py b/nlp_resource_data/nltk/chunk/util.py
new file mode 100755 (executable)
index 0000000..0a99dc6
--- /dev/null
@@ -0,0 +1,600 @@
+# Natural Language Toolkit: Chunk format conversions
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com> (minor additions)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
+
+import re
+
+from nltk.tree import Tree
+from nltk.tag.mapping import map_tag
+from nltk.tag.util import str2tuple
+from nltk.compat import python_2_unicode_compatible
+
+##//////////////////////////////////////////////////////
+## EVALUATION
+##//////////////////////////////////////////////////////
+
+from nltk.metrics import accuracy as _accuracy
+def accuracy(chunker, gold):
+    """
+    Score the accuracy of the chunker against the gold standard.
+    Strip the chunk information from the gold standard and rechunk it using
+    the chunker, then compute the accuracy score.
+
+    :type chunker: ChunkParserI
+    :param chunker: The chunker being evaluated.
+    :type gold: tree
+    :param gold: The chunk structures to score the chunker on.
+    :rtype: float
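+
+    A minimal sketch of the intended call pattern (``my_chunker`` is a
+    placeholder for any ``ChunkParserI`` implementation)::
+
+        >>> from nltk.corpus import conll2000                           # doctest: +SKIP
+        >>> accuracy(my_chunker, conll2000.chunked_sents()[:10])        # doctest: +SKIP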
+    """
+
+    gold_tags = []
+    test_tags = []
+    for gold_tree in gold:
+        test_tree = chunker.parse(gold_tree.flatten())
+        gold_tags += tree2conlltags(gold_tree)
+        test_tags += tree2conlltags(test_tree)
+
+#    print 'GOLD:', gold_tags[:50]
+#    print 'TEST:', test_tags[:50]
+    return _accuracy(gold_tags, test_tags)
+
+
+# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
+#  -- statistics are evaluated only on demand, instead of at every sentence evaluation
+#
+# SB: use nltk.metrics for precision/recall scoring?
+#
+class ChunkScore(object):
+    """
+    A utility class for scoring chunk parsers.  ``ChunkScore`` can
+    evaluate a chunk parser's output, based on a number of statistics
+    (precision, recall, f-measure, missed chunks, incorrect chunks).
+    It can also combine the scores from the parsing of multiple texts;
+    this makes it significantly easier to evaluate a chunk parser that
+    operates one sentence at a time.
+
+    Texts are evaluated with the ``score`` method.  The results of
+    evaluation can be accessed via a number of accessor methods, such
+    as ``precision`` and ``f_measure``.  A typical use of the
+    ``ChunkScore`` class is::
+
+        >>> chunkscore = ChunkScore()           # doctest: +SKIP
+        >>> for correct in correct_sentences:   # doctest: +SKIP
+        ...     guess = chunkparser.parse(correct.leaves())   # doctest: +SKIP
+        ...     chunkscore.score(correct, guess)              # doctest: +SKIP
+        >>> print('F Measure:', chunkscore.f_measure())       # doctest: +SKIP
+        F Measure: 0.823
+
+    :ivar kwargs: Keyword arguments:
+
+        - max_tp_examples: The maximum number of actual examples of true
+          positives to record.  This affects the ``correct`` member
+          function: ``correct`` will not return more than this number
+          of true positive examples.  This does *not* affect any of
+          the numerical metrics (precision, recall, or f-measure)
+
+        - max_fp_examples: The maximum number of actual examples of false
+          positives to record.  This affects the ``incorrect`` member
+          function and the ``guessed`` member function: ``incorrect``
+          will not return more than this number of examples, and
+          ``guessed`` will not return more than this number of true
+          positive examples.  This does *not* affect any of the
+          numerical metrics (precision, recall, or f-measure)
+
+        - max_fn_examples: The maximum number of actual examples of false
+          negatives to record.  This affects the ``missed`` member
+          function and the ``correct`` member function: ``missed``
+          will not return more than this number of examples, and
+          ``correct`` will not return more than this number of true
+          negative examples.  This does *not* affect any of the
+          numerical metrics (precision, recall, or f-measure)
+
+        - chunk_label: A regular expression indicating which chunks
+          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).
+
+    :type _tp: list(Token)
+    :ivar _tp: List of true positives
+    :type _fp: list(Token)
+    :ivar _fp: List of false positives
+    :type _fn: list(Token)
+    :ivar _fn: List of false negatives
+
+    :type _tp_num: int
+    :ivar _tp_num: Number of true positives
+    :type _fp_num: int
+    :ivar _fp_num: Number of false positives
+    :type _fn_num: int
+    :ivar _fn_num: Number of false negatives.
+    """
+    def __init__(self, **kwargs):
+        self._correct = set()
+        self._guessed = set()
+        self._tp = set()
+        self._fp = set()
+        self._fn = set()
+        self._max_tp = kwargs.get('max_tp_examples', 100)
+        self._max_fp = kwargs.get('max_fp_examples', 100)
+        self._max_fn = kwargs.get('max_fn_examples', 100)
+        self._chunk_label = kwargs.get('chunk_label', '.*')
+        self._tp_num = 0
+        self._fp_num = 0
+        self._fn_num = 0
+        self._count = 0
+        self._tags_correct = 0.0
+        self._tags_total = 0.0
+
+        self._measuresNeedUpdate = False
+
+    def _updateMeasures(self):
+        if self._measuresNeedUpdate:
+            self._tp = self._guessed & self._correct
+            self._fn = self._correct - self._guessed
+            self._fp = self._guessed - self._correct
+            self._tp_num = len(self._tp)
+            self._fp_num = len(self._fp)
+            self._fn_num = len(self._fn)
+            self._measuresNeedUpdate = False
+
+    def score(self, correct, guessed):
+        """
+        Given a correctly chunked sentence, score another chunked
+        version of the same sentence.
+
+        :type correct: chunk structure
+        :param correct: The known-correct ("gold standard") chunked
+            sentence.
+        :type guessed: chunk structure
+        :param guessed: The chunked sentence to be scored.
+        """
+        self._correct |= _chunksets(correct, self._count, self._chunk_label)
+        self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
+        self._count += 1
+        self._measuresNeedUpdate = True
+        # Keep track of per-tag accuracy (if possible)
+        try:
+            correct_tags = tree2conlltags(correct)
+            guessed_tags = tree2conlltags(guessed)
+        except ValueError:
+            # This exception case is for nested chunk structures,
+            # where tree2conlltags will fail with a ValueError: "Tree
+            # is too deeply nested to be printed in CoNLL format."
+            correct_tags = guessed_tags = ()
+        self._tags_total += len(correct_tags)
+        self._tags_correct += sum(1 for (t,g) in zip(guessed_tags,
+                                                     correct_tags)
+                                  if t==g)
+
+    def accuracy(self):
+        """
+        Return the overall tag-based accuracy for all texts that have
+        been scored by this ``ChunkScore``, using the IOB (conll2000)
+        tag encoding.
+
+        :rtype: float
+        """
+        if self._tags_total == 0: return 1
+        return self._tags_correct/self._tags_total
+
+    def precision(self):
+        """
+        Return the overall precision for all texts that have been
+        scored by this ``ChunkScore``.
+
+        :rtype: float
+        """
+        self._updateMeasures()
+        div = self._tp_num + self._fp_num
+        if div == 0: return 0
+        else: return self._tp_num / div
+
+    def recall(self):
+        """
+        Return the overall recall for all texts that have been
+        scored by this ``ChunkScore``.
+
+        :rtype: float
+        """
+        self._updateMeasures()
+        div = self._tp_num + self._fn_num
+        if div == 0: return 0
+        else: return self._tp_num / div
+
+    def f_measure(self, alpha=0.5):
+        """
+        Return the overall F measure for all texts that have been
+        scored by this ``ChunkScore``.
+
+        :param alpha: the relative weighting of precision and recall.
+            Larger alpha biases the score towards the precision value,
+            while smaller alpha biases the score towards the recall
+            value.  ``alpha`` should have a value in the range [0,1].
+        :type alpha: float
+        :rtype: float
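+
+        With the default ``alpha=0.5`` this is the usual harmonic mean;
+        for example, precision 0.8 and recall 0.6 give
+        ``1/(0.5/0.8 + 0.5/0.6) ~= 0.686``.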
+        """
+        self._updateMeasures()
+        p = self.precision()
+        r = self.recall()
+        if p == 0 or r == 0:    # what if alpha is 0 or 1?
+            return 0
+        return 1/(alpha/p + (1-alpha)/r)
+
+    def missed(self):
+        """
+        Return the chunks which were included in the
+        correct chunk structures, but not in the guessed chunk
+        structures, listed in input order.
+
+        :rtype: list of chunks
+        """
+        self._updateMeasures()
+        chunks = list(self._fn)
+        return [c[1] for c in chunks]  # discard position information
+
+    def incorrect(self):
+        """
+        Return the chunks which were included in the guessed chunk structures,
+        but not in the correct chunk structures, listed in input order.
+
+        :rtype: list of chunks
+        """
+        self._updateMeasures()
+        chunks = list(self._fp)
+        return [c[1] for c in chunks]  # discard position information
+
+    def correct(self):
+        """
+        Return the chunks which were included in the correct
+        chunk structures, listed in input order.
+
+        :rtype: list of chunks
+        """
+        chunks = list(self._correct)
+        return [c[1] for c in chunks]  # discard position information
+
+    def guessed(self):
+        """
+        Return the chunks which were included in the guessed
+        chunk structures, listed in input order.
+
+        :rtype: list of chunks
+        """
+        chunks = list(self._guessed)
+        return [c[1] for c in chunks]  # discard position information
+
+    def __len__(self):
+        self._updateMeasures()
+        return self._tp_num + self._fn_num
+
+    def __repr__(self):
+        """
+        Return a concise representation of this ``ChunkScore``.
+
+        :rtype: str
+        """
+        return '<ChunkScoring of '+repr(len(self))+' chunks>'
+
+    def __str__(self):
+        """
+        Return a verbose representation of this ``ChunkScore``.
+        This representation includes the precision, recall, and
+        f-measure scores.  For other information about the score,
+        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).
+
+        :rtype: str
+        """
+        return ("ChunkParse score:\n" +
+                ("    IOB Accuracy: {:5.1f}%%\n".format(self.accuracy()*100)) +
+                ("    Precision:    {:5.1f}%%\n".format(self.precision()*100)) +
+                ("    Recall:       {:5.1f}%%\n".format(self.recall()*100))+
+                ("    F-Measure:    {:5.1f}%%".format(self.f_measure()*100)))
+
+# extract chunks, and assign unique id, the absolute position of
+# the first word of the chunk
+def _chunksets(t, count, chunk_label):
+    pos = 0
+    chunks = []
+    for child in t:
+        if isinstance(child, Tree):
+            if re.match(chunk_label, child.label()):
+                chunks.append(((count, pos), child.freeze()))
+            pos += len(child.leaves())
+        else:
+            pos += 1
+    return set(chunks)
+
+
+def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/',
+                source_tagset=None, target_tagset=None):
+    """
+    Divide a string of bracketed tagged text into
+    chunks and unchunked tokens, and produce a Tree.
+    Chunks are marked by square brackets (``[...]``).  Words are
+    delimited by whitespace, and each word should have the form
+    ``text/tag``.  Words that do not contain a slash are
+    assigned a ``tag`` of None.
+
+    :param s: The string to be converted
+    :type s: str
+    :param chunk_label: The label to use for chunk nodes
+    :type chunk_label: str
+    :param root_label: The label to use for the root of the tree
+    :type root_label: str
+    :rtype: Tree
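+
+    A minimal sketch using the default separator (output shown for
+    illustration)::
+
+        >>> tagstr2tree("[ the/DT cat/NN ] sat/VBD")                    # doctest: +SKIP
+        Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]), ('sat', 'VBD')])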
+    """
+
+    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')
+
+    stack = [Tree(root_label, [])]
+    for match in WORD_OR_BRACKET.finditer(s):
+        text = match.group()
+        if text[0] == '[':
+            if len(stack) != 1:
+                raise ValueError('Unexpected [ at char {:d}'.format(match.start()))
+            chunk = Tree(chunk_label, [])
+            stack[-1].append(chunk)
+            stack.append(chunk)
+        elif text[0] == ']':
+            if len(stack) != 2:
+                raise ValueError('Unexpected ] at char {:d}'.format(match.start()))
+            stack.pop()
+        else:
+            if sep is None:
+                stack[-1].append(text)
+            else:
+                word, tag = str2tuple(text, sep)
+                if source_tagset and target_tagset:
+                    tag = map_tag(source_tagset, target_tagset, tag)
+                stack[-1].append((word, tag))
+
+    if len(stack) != 1:
+        raise ValueError('Expected ] at char {:d}'.format(len(s)))
+    return stack[0]
+
+### CONLL
+
+_LINE_RE = re.compile('(\S+)\s+(\S+)\s+([IOB])-?(\S+)?')
+def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"):
+    """
+    Return a chunk structure for a single sentence
+    encoded in the given CONLL 2000 style string.
+    This function converts a CoNLL IOB string into a tree.
+    It uses the specified chunk types
+    (defaults to NP, PP and VP), and creates a tree rooted at a node
+    labeled S (by default).
+
+    :param s: The CoNLL string to be converted.
+    :type s: str
+    :param chunk_types: The chunk types to be converted.
+    :type chunk_types: tuple
+    :param root_label: The node label to use for the root.
+    :type root_label: str
+    :rtype: Tree
+    """
+
+    stack = [Tree(root_label, [])]
+
+    for lineno, line in enumerate(s.split('\n')):
+        if not line.strip(): continue
+
+        # Decode the line.
+        match = _LINE_RE.match(line)
+        if match is None:
+            raise ValueError('Error on line {:d}'.format(lineno))
+        (word, tag, state, chunk_type) = match.groups()
+
+        # If it's a chunk type we don't care about, treat it as O.
+        if (chunk_types is not None and
+            chunk_type not in chunk_types):
+            state = 'O'
+
+        # For "Begin"/"Outside", finish any completed chunks -
+        # also do so for "Inside" which don't match the previous token.
+        mismatch_I = state == 'I' and chunk_type != stack[-1].label()
+        if state in 'BO' or mismatch_I:
+            if len(stack) == 2: stack.pop()
+
+        # For "Begin", start a new chunk.
+        if state == 'B' or mismatch_I:
+            chunk = Tree(chunk_type, [])
+            stack[-1].append(chunk)
+            stack.append(chunk)
+
+        # Add the new word token.
+        stack[-1].append((word, tag))
+
+    return stack[0]
+
+def tree2conlltags(t):
+    """
+    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
+    Convert a tree to the CoNLL IOB tag format.
+
+    :param t: The tree to be converted.
+    :type t: Tree
+    :rtype: list(tuple)
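+
+    A minimal sketch (output shown for illustration)::
+
+        >>> tree2conlltags(Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]),
+        ...                           ('sat', 'VBD')]))                 # doctest: +SKIP
+        [('the', 'DT', 'B-NP'), ('cat', 'NN', 'I-NP'), ('sat', 'VBD', 'O')]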
+    """
+
+    tags = []
+    for child in t:
+        try:
+            category = child.label()
+            prefix = "B-"
+            for contents in child:
+                if isinstance(contents, Tree):
+                    raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
+                tags.append((contents[0], contents[1], prefix+category))
+                prefix = "I-"
+        except AttributeError:
+            tags.append((child[0], child[1], "O"))
+    return tags
+
+def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
+                   root_label='S', strict=False):
+    """
+    Convert the CoNLL IOB format to a tree.
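+
+    A minimal sketch, the inverse of the ``tree2conlltags`` example
+    (output shown for illustration)::
+
+        >>> conlltags2tree([('the', 'DT', 'B-NP'), ('cat', 'NN', 'I-NP'),
+        ...                 ('sat', 'VBD', 'O')])                       # doctest: +SKIP
+        Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]), ('sat', 'VBD')])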
+    """
+    tree = Tree(root_label, [])
+    for (word, postag, chunktag) in sentence:
+        if chunktag is None:
+            if strict:
+                raise ValueError("Bad conll tag sequence")
+            else:
+                # Treat as O
+                tree.append((word,postag))
+        elif chunktag.startswith('B-'):
+            tree.append(Tree(chunktag[2:], [(word,postag)]))
+        elif chunktag.startswith('I-'):
+            if (len(tree)==0 or not isinstance(tree[-1], Tree) or
+                tree[-1].label() != chunktag[2:]):
+                if strict:
+                    raise ValueError("Bad conll tag sequence")
+                else:
+                    # Treat as B-*
+                    tree.append(Tree(chunktag[2:], [(word,postag)]))
+            else:
+                tree[-1].append((word,postag))
+        elif chunktag == 'O':
+            tree.append((word,postag))
+        else:
+            raise ValueError("Bad conll tag {0!r}".format(chunktag))
+    return tree
+
+def tree2conllstr(t):
+    """
+    Return a multiline string where each line contains a word, tag and IOB tag.
+    Convert a tree to the CoNLL IOB string format
+
+    :param t: The tree to be converted.
+    :type t: Tree
+    :rtype: str
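+
+    A minimal sketch (output shown for illustration)::
+
+        >>> print(tree2conllstr(Tree('S', [Tree('NP', [('the', 'DT')]),
+        ...                                ('sat', 'VBD')])))           # doctest: +SKIP
+        the DT B-NP
+        sat VBD O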
+    """
+    lines = [" ".join(token) for token in tree2conlltags(t)]
+    return '\n'.join(lines)
+
+### IEER
+
+_IEER_DOC_RE = re.compile(r'<DOC>\s*'
+                          r'(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?'
+                          r'(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?'
+                          r'(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?'
+                          r'<BODY>\s*'
+                          r'(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?'
+                          r'<TEXT>(?P<text>.*?)</TEXT>\s*'
+                          r'</BODY>\s*</DOC>\s*', re.DOTALL)
+
+_IEER_TYPE_RE = re.compile('<b_\w+\s+[^>]*?type="(?P<type>\w+)"')
+
+def _ieer_read_text(s, root_label):
+    stack = [Tree(root_label, [])]
+    # s will be None if there is no headline in the text
+    # return the empty list in place of a Tree
+    if s is None:
+        return []
+    for piece_m in re.finditer('<[^>]+>|[^\s<]+', s):
+        piece = piece_m.group()
+        try:
+            if piece.startswith('<b_'):
+                m = _IEER_TYPE_RE.match(piece)
+                if m is None: print('XXXX', piece)
+                chunk = Tree(m.group('type'), [])
+                stack[-1].append(chunk)
+                stack.append(chunk)
+            elif piece.startswith('<e_'):
+                stack.pop()
+#           elif piece.startswith('<'):
+#               print "ERROR:", piece
+#               raise ValueError # Unexpected HTML
+            else:
+                stack[-1].append(piece)
+        except (IndexError, ValueError):
+            raise ValueError('Bad IEER string (error at character {:d})'.format \
+                             (piece_m.start()))
+    if len(stack) != 1:
+        raise ValueError('Bad IEER string')
+    return stack[0]
+
+def ieerstr2tree(s, chunk_types = ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION',
+               'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'], root_label="S"):
+    """
+    Return a chunk structure containing the chunked tagged text that is
+    encoded in the given IEER style string.
+    Convert a string of chunked tagged text in the IEER named
+    entity format into a chunk structure.  Chunks are of several
+    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
+    PERCENT, MONEY, and MEASURE.
+
+    :rtype: Tree
+    """
+
+    # Try looking for a single document.  If that doesn't work, then just
+    # treat everything as if it was within the <TEXT>...</TEXT>.
+    m = _IEER_DOC_RE.match(s)
+    if m:
+        return {
+            'text': _ieer_read_text(m.group('text'), root_label),
+            'docno': m.group('docno'),
+            'doctype': m.group('doctype'),
+            'date_time': m.group('date_time'),
+            #'headline': m.group('headline')
+            # we want to capture NEs in the headline too!
+            'headline': _ieer_read_text(m.group('headline'), root_label),
+            }
+    else:
+        return _ieer_read_text(s, root_label)
+
+
+def demo():
+
+    s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
+    import nltk
+    t = nltk.chunk.tagstr2tree(s, chunk_label='NP')
+    t.pprint()
+    print()
+
+    s = """
+These DT B-NP
+research NN I-NP
+protocols NNS I-NP
+offer VBP B-VP
+to TO B-PP
+the DT B-NP
+patient NN I-NP
+not RB O
+only RB O
+the DT B-NP
+very RB I-NP
+best JJS I-NP
+therapy NN I-NP
+which WDT B-NP
+we PRP B-NP
+have VBP B-VP
+established VBN I-VP
+today NN B-NP
+but CC B-NP
+also RB I-NP
+the DT B-NP
+hope NN I-NP
+of IN B-PP
+something NN B-NP
+still RB B-ADJP
+better JJR I-ADJP
+. . O
+"""
+
+    conll_tree = conllstr2tree(s, chunk_types=('NP', 'PP'))
+    conll_tree.pprint()
+
+    # Demonstrate CoNLL output
+    print("CoNLL output:")
+    print(nltk.chunk.tree2conllstr(conll_tree))
+    print()
+
+
+if __name__ == '__main__':
+    demo()
+
diff --git a/nlp_resource_data/nltk/chunk/util.pyc b/nlp_resource_data/nltk/chunk/util.pyc
new file mode 100755 (executable)
index 0000000..3e7a886
Binary files /dev/null and b/nlp_resource_data/nltk/chunk/util.pyc differ
diff --git a/nlp_resource_data/nltk/classify/__init__.py b/nlp_resource_data/nltk/classify/__init__.py
new file mode 100755 (executable)
index 0000000..2acfbfa
--- /dev/null
@@ -0,0 +1,98 @@
+# Natural Language Toolkit: Classifiers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Classes and interfaces for labeling tokens with category labels (or
+"class labels").  Typically, labels are represented with strings
+(such as ``'health'`` or ``'sports'``).  Classifiers can be used to
+perform a wide range of classification tasks.  For example,
+classifiers can be used...
+
+- to classify documents by topic
+- to classify ambiguous words by which word sense is intended
+- to classify acoustic signals by which phoneme they represent
+- to classify sentences by their author
+
+Features
+========
+In order to decide which category label is appropriate for a given
+token, classifiers examine one or more 'features' of the token.  These
+"features" are typically chosen by hand, and indicate which aspects
+of the token are relevant to the classification decision.  For
+example, a document classifier might use a separate feature for each
+word, recording how often that word occurred in the document.
+
+Featuresets
+===========
+The features describing a token are encoded using a "featureset",
+which is a dictionary that maps from "feature names" to "feature
+values".  Feature names are unique strings that indicate what aspect
+of the token is encoded by the feature.  Examples include
+``'prevword'``, for a feature whose value is the previous word; and
+``'contains-word(library)'`` for a feature that is true when a document
+contains the word ``'library'``.  Feature values are typically
+booleans, numbers, or strings, depending on which feature they
+describe.
+
+Featuresets are typically constructed using a "feature detector"
+(also known as a "feature extractor").  A feature detector is a
+function that takes a token (and sometimes information about its
+context) as its input, and returns a featureset describing that token.
+For example, the following feature detector converts a document
+(stored as a list of words) to a featureset describing the set of
+words included in the document:
+
+    >>> # Define a feature detector function.
+    >>> def document_features(document):
+    ...     return dict([('contains-word(%s)' % w, True) for w in document])
+
+Feature detectors are typically applied to each token before it is fed
+to the classifier:
+
+    >>> # Classify each Gutenberg document.
+    >>> from nltk.corpus import gutenberg
+    >>> for fileid in gutenberg.fileids(): # doctest: +SKIP
+    ...     doc = gutenberg.words(fileid) # doctest: +SKIP
+    ...     print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP
+
+The parameters that a feature detector expects will vary, depending on
+the task and the needs of the feature detector.  For example, a
+feature detector for word sense disambiguation (WSD) might take as its
+input a sentence, and the index of a word that should be classified,
+and return a featureset for that word.  The following feature detector
+for WSD includes features describing the left and right contexts of
+the target word:
+
+    >>> def wsd_features(sentence, index):
+    ...     featureset = {}
+    ...     for i in range(max(0, index-3), index):
+    ...         featureset['left-context(%s)' % sentence[i]] = True
+    ...     for i in range(index, min(index+3, len(sentence))):
+    ...         featureset['right-context(%s)' % sentence[i]] = True
+    ...     return featureset
+
+Training Classifiers
+====================
+Most classifiers are built by training them on a list of hand-labeled
+examples, known as the "training set".  Training sets are represented
+as lists of ``(featuredict, label)`` tuples.
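+
+A minimal sketch of this layout (the tiny training set and the feature
+names here are made up purely for illustration)::
+
+    >>> train = [({'contains-word(cat)': True}, 'animal'),
+    ...          ({'contains-word(car)': True}, 'vehicle')]
+    >>> classifier = NaiveBayesClassifier.train(train)        # doctest: +SKIP
+    >>> classifier.classify({'contains-word(cat)': True})     # doctest: +SKIP
+    'animal'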
+"""
+
+from nltk.classify.api import ClassifierI, MultiClassifierI
+from nltk.classify.megam import config_megam, call_megam
+from nltk.classify.weka import WekaClassifier, config_weka
+from nltk.classify.naivebayes import NaiveBayesClassifier
+from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier
+from nltk.classify.decisiontree import DecisionTreeClassifier
+from nltk.classify.rte_classify import rte_classifier, rte_features, RTEFeatureExtractor
+from nltk.classify.util import accuracy, apply_features, log_likelihood
+from nltk.classify.scikitlearn import SklearnClassifier
+from nltk.classify.maxent import (MaxentClassifier, BinaryMaxentFeatureEncoding,
+                                  TypedMaxentFeatureEncoding,
+                                  ConditionalExponentialClassifier)
+from nltk.classify.senna import Senna
+from nltk.classify.textcat import TextCat
diff --git a/nlp_resource_data/nltk/classify/__init__.pyc b/nlp_resource_data/nltk/classify/__init__.pyc
new file mode 100755 (executable)
index 0000000..4a9eded
Binary files /dev/null and b/nlp_resource_data/nltk/classify/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/classify/api.py b/nlp_resource_data/nltk/classify/api.py
new file mode 100755 (executable)
index 0000000..fc32b0d
--- /dev/null
@@ -0,0 +1,193 @@
+# Natural Language Toolkit: Classifier Interface
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com> (minor additions)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Interfaces for labeling tokens with category labels (or "class labels").
+
+``ClassifierI`` is a standard interface for "single-category
+classification", in which the set of categories is known, the number
+of categories is finite, and each text belongs to exactly one
+category.
+
+``MultiClassifierI`` is a standard interface for "multi-category
+classification", which is like single-category classification except
+that each text belongs to zero or more categories.
+"""
+from nltk.internals import overridden
+
+##//////////////////////////////////////////////////////
+#{ Classification Interfaces
+##//////////////////////////////////////////////////////
+
+class ClassifierI(object):
+    """
+    A processing interface for labeling tokens with a single category
+    label (or "class").  Labels are typically strs or
+    ints, but can be any immutable type.  The set of labels
+    that the classifier chooses from must be fixed and finite.
+
+    Subclasses must define:
+      - ``labels()``
+      - either ``classify()`` or ``classify_many()`` (or both)
+
+    Subclasses may define:
+      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
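+
+    A minimal sketch of a conforming subclass (a made-up classifier that
+    always returns the same label, for illustration only)::
+
+        >>> class ConstantClassifier(ClassifierI):
+        ...     def __init__(self, label): self._label = label
+        ...     def labels(self): return [self._label]
+        ...     def classify(self, featureset): return self._label
+        >>> ConstantClassifier('spam').classify({'any': 'feature'})
+        'spam'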
+    """
+    def labels(self):
+        """
+        :return: the list of category labels used by this classifier.
+        :rtype: list of (immutable)
+        """
+        raise NotImplementedError()
+
+    def classify(self, featureset):
+        """
+        :return: the most appropriate label for the given featureset.
+        :rtype: label
+        """
+        if overridden(self.classify_many):
+            return self.classify_many([featureset])[0]
+        else:
+            raise NotImplementedError()
+
+    def prob_classify(self, featureset):
+        """
+        :return: a probability distribution over labels for the given
+            featureset.
+        :rtype: ProbDistI
+        """
+        if overridden(self.prob_classify_many):
+            return self.prob_classify_many([featureset])[0]
+        else:
+            raise NotImplementedError()
+
+    def classify_many(self, featuresets):
+        """
+        Apply ``self.classify()`` to each element of ``featuresets``.  I.e.:
+
+            return [self.classify(fs) for fs in featuresets]
+
+        :rtype: list(label)
+        """
+        return [self.classify(fs) for fs in featuresets]
+
+    def prob_classify_many(self, featuresets):
+        """
+        Apply ``self.prob_classify()`` to each element of ``featuresets``.  I.e.:
+
+            return [self.prob_classify(fs) for fs in featuresets]
+
+        :rtype: list(ProbDistI)
+        """
+        return [self.prob_classify(fs) for fs in featuresets]
+
+
+class MultiClassifierI(object):
+    """
+    A processing interface for labeling tokens with zero or more
+    category labels (or "labels").  Labels are typically strs
+    or ints, but can be any immutable type.  The set of labels
+    that the multi-classifier chooses from must be fixed and finite.
+
+    Subclasses must define:
+      - ``labels()``
+      - either ``classify()`` or ``classify_many()`` (or both)
+
+    Subclasses may define:
+      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
+    """
+    def labels(self):
+        """
+        :return: the list of category labels used by this classifier.
+        :rtype: list of (immutable)
+        """
+        raise NotImplementedError()
+
+    def classify(self, featureset):
+        """
+        :return: the most appropriate set of labels for the given featureset.
+        :rtype: set(label)
+        """
+        if overridden(self.classify_many):
+            return self.classify_many([featureset])[0]
+        else:
+            raise NotImplementedError()
+
+    def prob_classify(self, featureset):
+        """
+        :return: a probability distribution over sets of labels for the
+            given featureset.
+        :rtype: ProbDistI
+        """
+        if overridden(self.prob_classify_many):
+            return self.prob_classify_many([featureset])[0]
+        else:
+            raise NotImplementedError()
+
+    def classify_many(self, featuresets):
+        """
+        Apply ``self.classify()`` to each element of ``featuresets``.  I.e.:
+
+            return [self.classify(fs) for fs in featuresets]
+
+        :rtype: list(set(label))
+        """
+        return [self.classify(fs) for fs in featuresets]
+
+    def prob_classify_many(self, featuresets):
+        """
+        Apply ``self.prob_classify()`` to each element of ``featuresets``.  I.e.:
+
+            return [self.prob_classify(fs) for fs in featuresets]
+
+        :rtype: list(ProbDistI)
+        """
+        return [self.prob_classify(fs) for fs in featuresets]
+
+
+# # [XX] IN PROGRESS:
+# class SequenceClassifierI(object):
+#     """
+#     A processing interface for labeling sequences of tokens with a
+#     single category label (or "class").  Labels are typically
+#     strs or ints, but can be any immutable type.  The set
+#     of labels that the classifier chooses from must be fixed and
+#     finite.
+#     """
+#     def labels(self):
+#         """
+#         :return: the list of category labels used by this classifier.
+#         :rtype: list of (immutable)
+#         """
+#         raise NotImplementedError()
+
+#     def prob_classify(self, featureset):
+#         """
+#         Return a probability distribution over labels for the given
+#         featureset.
+
+#         If ``featureset`` is a list of featuresets, then return a
+#         corresponding list containing the probability distribution
+#         over labels for each of the given featuresets, where the
+#         *i*\ th element of this list is the most appropriate label for
+#         the *i*\ th element of ``featuresets``.
+#         """
+#         raise NotImplementedError()
+
+#     def classify(self, featureset):
+#         """
+#         Return the most appropriate label for the given featureset.
+
+#         If ``featureset`` is a list of featuresets, then return a
+#         corresponding list containing the most appropriate label for
+#         each of the given featuresets, where the *i*\ th element of
+#         this list is the most appropriate label for the *i*\ th element
+#         of ``featuresets``.
+#         """
+#         raise NotImplementedError()
+
diff --git a/nlp_resource_data/nltk/classify/api.pyc b/nlp_resource_data/nltk/classify/api.pyc
new file mode 100755 (executable)
index 0000000..735c40b
Binary files /dev/null and b/nlp_resource_data/nltk/classify/api.pyc differ
diff --git a/nlp_resource_data/nltk/classify/decisiontree.py b/nlp_resource_data/nltk/classify/decisiontree.py
new file mode 100755 (executable)
index 0000000..2bf5742
--- /dev/null
@@ -0,0 +1,295 @@
+# Natural Language Toolkit: Decision Tree Classifiers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A classifier model that decides which label to assign to a token on
+the basis of a tree structure, where branches correspond to conditions
+on feature values, and leaves correspond to label assignments.
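+
+A minimal usage sketch (the toy featuresets are made up for illustration)::
+
+    >>> train = [({'size': 'big'}, 'mammal'), ({'size': 'small'}, 'insect')]
+    >>> tree = DecisionTreeClassifier.train(train)             # doctest: +SKIP
+    >>> tree.classify({'size': 'big'})                         # doctest: +SKIP
+    'mammal'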
+"""
+from __future__ import print_function, unicode_literals, division
+
+from collections import defaultdict
+
+from nltk.probability import FreqDist, MLEProbDist, entropy
+from nltk.classify.api import ClassifierI
+from nltk.compat import python_2_unicode_compatible
+
+@python_2_unicode_compatible
+class DecisionTreeClassifier(ClassifierI):
+    def __init__(self, label, feature_name=None, decisions=None, default=None):
+        """
+        :param label: The most likely label for tokens that reach
+            this node in the decision tree.  If this decision tree
+            has no children, then this label will be assigned to
+            any token that reaches this decision tree.
+        :param feature_name: The name of the feature that this
+            decision tree selects for.
+        :param decisions: A dictionary mapping from feature values
+            for the feature identified by ``feature_name`` to
+            child decision trees.
+        :param default: The child that will be used if the value of
+            feature ``feature_name`` does not match any of the keys in
+            ``decisions``.  This is used when constructing binary
+            decision trees.
+        """
+        self._label = label
+        self._fname = feature_name
+        self._decisions = decisions
+        self._default = default
+
+    def labels(self):
+        labels = [self._label]
+        if self._decisions is not None:
+            for dt in self._decisions.values():
+                labels.extend(dt.labels())
+        if self._default is not None:
+            labels.extend(self._default.labels())
+        return list(set(labels))
+
+    def classify(self, featureset):
+        # Decision leaf:
+        if self._fname is None:
+            return self._label
+
+        # Decision tree:
+        fval = featureset.get(self._fname)
+        if fval in self._decisions:
+            return self._decisions[fval].classify(featureset)
+        elif self._default is not None:
+            return self._default.classify(featureset)
+        else:
+            return self._label
+
+    def error(self, labeled_featuresets):
+        errors = 0
+        for featureset, label in labeled_featuresets:
+            if self.classify(featureset) != label:
+                errors += 1
+        return errors/len(labeled_featuresets)
+
+    def pretty_format(self, width=70, prefix='', depth=4):
+        """
+        Return a string containing a pretty-printed version of this
+        decision tree.  Each line in this string corresponds to a
+        single decision tree node or leaf, and indentation is used to
+        display the structure of the decision tree.
+        """
+        # [xx] display default!!
+        if self._fname is None:
+            n = width-len(prefix)-15
+            return '{0}{1} {2}\n'.format(prefix, '.'*n, self._label)
+        s = ''
+        for i, (fval, result) in enumerate(sorted(self._decisions.items())):
+            hdr = '{0}{1}={2}? '.format(prefix, self._fname, fval)
+            n = width-15-len(hdr)
+            s += '{0}{1} {2}\n'.format(hdr, '.'*(n), result._label)
+            if result._fname is not None and depth>1:
+                s += result.pretty_format(width, prefix+'  ', depth-1)
+        if self._default is not None:
+            n = width-len(prefix)-21
+            s += '{0}else: {1} {2}\n'.format(prefix, '.'*n, self._default._label)
+            if self._default._fname is not None and depth>1:
+                s += self._default.pretty_format(width, prefix+'  ', depth-1)
+        return s
+
+    def pseudocode(self, prefix='', depth=4):
+        """
+        Return a string representation of this decision tree that
+        expresses the decisions it makes as a nested set of pseudocode
+        if statements.
+        """
+        if self._fname is None:
+            return "{0}return {1!r}\n".format(prefix, self._label)
+        s = ''
+        for (fval, result) in sorted(self._decisions.items()):
+            s += '{0}if {1} == {2!r}: '.format(prefix, self._fname, fval)
+            if result._fname is not None and depth>1:
+                s += '\n'+result.pseudocode(prefix+'  ', depth-1)
+            else:
+                s += 'return {0!r}\n'.format(result._label)
+        if self._default is not None:
+            if len(self._decisions) == 1:
+                s += '{0}if {1} != {2!r}: '.format(prefix, self._fname,
+                                         list(self._decisions.keys())[0])
+            else:
+                s += '{0}else: '.format(prefix)
+            if self._default._fname is not None and depth>1:
+                s += '\n'+self._default.pseudocode(prefix+'  ', depth-1)
+            else:
+                s += 'return {0!r}\n'.format(self._default._label)
+        return s
+
+    def __str__(self):
+        return self.pretty_format()
+
+    @staticmethod
+    def train(labeled_featuresets, entropy_cutoff=0.05, depth_cutoff=100,
+              support_cutoff=10, binary=False, feature_values=None,
+              verbose=False):
+        """
+        :param binary: If true, then treat all feature/value pairs as
+            individual binary features, rather than using a single n-way
+            branch for each feature.
+        """
+        # Collect a list of all feature names.
+        feature_names = set()
+        for featureset, label in labeled_featuresets:
+            for fname in featureset:
+                feature_names.add(fname)
+
+        # Collect a list of the values each feature can take.
+        if feature_values is None and binary:
+            feature_values = defaultdict(set)
+            for featureset, label in labeled_featuresets:
+                for fname, fval in featureset.items():
+                    feature_values[fname].add(fval)
+
+        # Start with a stump.
+        if not binary:
+            tree = DecisionTreeClassifier.best_stump(
+                feature_names, labeled_featuresets, verbose)
+        else:
+            tree = DecisionTreeClassifier.best_binary_stump(
+                feature_names, labeled_featuresets, feature_values, verbose)
+
+        # Refine the stump.
+        tree.refine(labeled_featuresets, entropy_cutoff, depth_cutoff-1,
+                    support_cutoff, binary, feature_values, verbose)
+
+        # Return it
+        return tree
+
+    @staticmethod
+    def leaf(labeled_featuresets):
+        label = FreqDist(label for (featureset, label)
+                         in labeled_featuresets).max()
+        return DecisionTreeClassifier(label)
+
+    @staticmethod
+    def stump(feature_name, labeled_featuresets):
+        label = FreqDist(label for (featureset, label)
+                         in labeled_featuresets).max()
+
+        # Find the best label for each value.
+        freqs = defaultdict(FreqDist) # freq(label|value)
+        for featureset, label in labeled_featuresets:
+            feature_value = featureset.get(feature_name)
+            freqs[feature_value][label] += 1
+
+        decisions = dict((val, DecisionTreeClassifier(freqs[val].max()))
+                         for val in freqs)
+        return DecisionTreeClassifier(label, feature_name, decisions)
+
+    def refine(self, labeled_featuresets, entropy_cutoff, depth_cutoff,
+               support_cutoff, binary=False, feature_values=None,
+               verbose=False):
+        if len(labeled_featuresets) <= support_cutoff: return
+        if self._fname is None: return
+        if depth_cutoff <= 0: return
+        for fval in self._decisions:
+            fval_featuresets = [(featureset, label) for (featureset, label)
+                                in labeled_featuresets
+                                if featureset.get(self._fname) == fval]
+
+            label_freqs = FreqDist(label for (featureset, label)
+                                   in fval_featuresets)
+            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
+                self._decisions[fval] = DecisionTreeClassifier.train(
+                    fval_featuresets, entropy_cutoff, depth_cutoff,
+                    support_cutoff, binary, feature_values, verbose)
+        if self._default is not None:
+            default_featuresets = [(featureset, label) for (featureset, label)
+                                   in labeled_featuresets
+                                   if featureset.get(self._fname) not in
+                                   self._decisions]
+            label_freqs = FreqDist(label for (featureset, label)
+                                   in default_featuresets)
+            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
+                self._default = DecisionTreeClassifier.train(
+                    default_featuresets, entropy_cutoff, depth_cutoff,
+                    support_cutoff, binary, feature_values, verbose)
+
+    @staticmethod
+    def best_stump(feature_names, labeled_featuresets, verbose=False):
+        best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
+        best_error = best_stump.error(labeled_featuresets)
+        for fname in feature_names:
+            stump = DecisionTreeClassifier.stump(fname, labeled_featuresets)
+            stump_error = stump.error(labeled_featuresets)
+            if stump_error < best_error:
+                best_error = stump_error
+                best_stump = stump
+        if verbose:
+            print('best stump for {:6d} toks uses {:20} err={:6.4f}'.format(
+                len(labeled_featuresets), best_stump._fname, best_error))
+        return best_stump
+
+    @staticmethod
+    def binary_stump(feature_name, feature_value, labeled_featuresets):
+        label = FreqDist(label for (featureset, label)
+                         in labeled_featuresets).max()
+
+        # Find the best label for each value.
+        pos_fdist = FreqDist()
+        neg_fdist = FreqDist()
+        for featureset, label in labeled_featuresets:
+            if featureset.get(feature_name) == feature_value:
+                pos_fdist[label] += 1
+            else:
+                neg_fdist[label] += 1
+
+        decisions = {}
+        default = label
+        # But hopefully we have observations!
+        if pos_fdist.N() > 0:
+            decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
+        if neg_fdist.N() > 0:
+            default = DecisionTreeClassifier(neg_fdist.max())
+
+        return DecisionTreeClassifier(label, feature_name, decisions, default)
+
+    @staticmethod
+    def best_binary_stump(feature_names, labeled_featuresets, feature_values,
+                          verbose=False):
+        best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
+        best_error = best_stump.error(labeled_featuresets)
+        for fname in feature_names:
+            for fval in feature_values[fname]:
+                stump = DecisionTreeClassifier.binary_stump(
+                    fname, fval, labeled_featuresets)
+                stump_error = stump.error(labeled_featuresets)
+                if stump_error < best_error:
+                    best_error = stump_error
+                    best_stump = stump
+        if verbose:
+            if best_stump._decisions:
+                descr = '{0}={1}'.format(best_stump._fname,
+                                         list(best_stump._decisions.keys())[0])
+            else:
+                descr = '(default)'
+            print('best stump for {:6d} toks uses {:20} err={:6.4f}'.format(
+                len(labeled_featuresets), descr, best_error))
+        return best_stump
+
+##//////////////////////////////////////////////////////
+##  Demo
+##//////////////////////////////////////////////////////
+
+def f(x):
+    # Thin wrapper handed to names_demo() in demo() below; trains a binary
+    # decision tree with verbose output.
+    return DecisionTreeClassifier.train(x, binary=True, verbose=True)
+
+def demo():
+    from nltk.classify.util import names_demo, binary_names_demo_features
+    classifier = names_demo(f, #DecisionTreeClassifier.train,
+                            binary_names_demo_features)
+    print(classifier.pp(depth=7))
+    print(classifier.pseudocode(depth=7))
+
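+
+# --- Illustrative sketch (editor's addition, not part of upstream NLTK) ---
+# A minimal, hedged usage example for the training routine above; the toy
+# feature names, values, and labels are invented purely for illustration.
+def _toy_decision_tree_demo():  # hypothetical helper, not an NLTK API
+    toy = [({'size': 'big', 'color': 'red'}, 'apple'),
+           ({'size': 'small', 'color': 'red'}, 'cherry'),
+           ({'size': 'big', 'color': 'green'}, 'apple')]
+    tree = DecisionTreeClassifier.train(toy, entropy_cutoff=0.05,
+                                        depth_cutoff=5, support_cutoff=1)
+    return tree.classify({'size': 'small', 'color': 'red'})
+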
+if __name__ == '__main__':
+    demo()
+
diff --git a/nlp_resource_data/nltk/classify/decisiontree.pyc b/nlp_resource_data/nltk/classify/decisiontree.pyc
new file mode 100755 (executable)
index 0000000..c66ab20
Binary files /dev/null and b/nlp_resource_data/nltk/classify/decisiontree.pyc differ
diff --git a/nlp_resource_data/nltk/classify/maxent.py b/nlp_resource_data/nltk/classify/maxent.py
new file mode 100755 (executable)
index 0000000..f067394
--- /dev/null
@@ -0,0 +1,1495 @@
+# Natural Language Toolkit: Maximum Entropy Classifiers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Dmitry Chichkov <dchichkov@gmail.com> (TypedMaxentFeatureEncoding)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A classifier model based on the maximum entropy modeling framework.  This
+framework considers all of the probability distributions that are
+empirically consistent with the training data, and chooses the
+distribution with the highest entropy.  A probability distribution is
+"empirically consistent" with a set of training data if its estimate of
+the frequency with which a class and a feature vector value co-occur is
+equal to the actual frequency in the data.
+
+Terminology: 'feature'
+======================
+The term *feature* is usually used to refer to some property of an
+unlabeled token.  For example, when performing word sense
+disambiguation, we might define a ``'prevword'`` feature whose value is
+the word preceding the target word.  However, in the context of
+maxent modeling, the term *feature* is typically used to refer to a
+property of a "labeled" token.  In order to prevent confusion, we
+will introduce two distinct terms to disambiguate these two different
+concepts:
+
+  - An "input-feature" is a property of an unlabeled token.
+  - A "joint-feature" is a property of a labeled token.
+
+In the rest of the ``nltk.classify`` module, the term "features" is
+used to refer to what we will call "input-features" in this module.
+
+In literature that describes and discusses maximum entropy models,
+input-features are typically called "contexts", and joint-features
+are simply referred to as "features".
+
+Converting Input-Features to Joint-Features
+-------------------------------------------
+In maximum entropy models, joint-features are required to have numeric
+values.  Typically, each input-feature ``input_feat`` is mapped to a
+set of joint-features of the form:
+
+|   joint_feat(token, label) = { 1 if input_feat(token) == feat_val
+|                              {      and label == some_label
+|                              {
+|                              { 0 otherwise
+
+For all values of ``feat_val`` and ``some_label``.  This mapping is
+performed by classes that implement the ``MaxentFeatureEncodingI``
+interface.
+"""
+from __future__ import print_function, unicode_literals
+
+try:
+    import numpy
+except ImportError:
+    pass
+
+import tempfile
+import os
+from collections import defaultdict
+
+from six import integer_types
+
+from nltk import compat
+from nltk.data import gzip_open_unicode
+from nltk.util import OrderedDict
+from nltk.probability import DictionaryProbDist
+
+from nltk.classify.api import ClassifierI
+from nltk.classify.util import CutoffChecker, accuracy, log_likelihood
+from nltk.classify.megam import (call_megam,
+                                 write_megam_file, parse_megam_weights)
+from nltk.classify.tadm import call_tadm, write_tadm_file, parse_tadm_weights
+
+__docformat__ = 'epytext en'
+
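+# --- Illustrative sketch (editor's addition, not part of upstream NLTK) ---
+# A small, hedged rendering of the input-feature -> joint-feature indicator
+# mapping described in the module docstring above.  The feature name
+# 'prevword', the value 'the', and the label 'NOUN' are invented examples.
+def _joint_feature_demo(featureset, label,
+                        fname='prevword', fval='the', some_label='NOUN'):
+    # joint_feat is 1 iff the input-feature has the given value AND the
+    # label matches; otherwise it is 0.
+    return 1 if (featureset.get(fname) == fval and label == some_label) else 0
+
+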
+######################################################################
+#{ Classifier Model
+######################################################################
+
+@compat.python_2_unicode_compatible
+class MaxentClassifier(ClassifierI):
+    """
+    A maximum entropy classifier (also known as a "conditional
+    exponential classifier").  This classifier is parameterized by a
+    set of "weights", which are used to combine the joint-features
+    that are generated from a featureset by an "encoding".  In
+    particular, the encoding maps each ``(featureset, label)`` pair to
+    a vector.  The probability of each label is then computed using
+    the following equation::
+
+                                dotprod(weights, encode(fs,label))
+      prob(fs|label) = ---------------------------------------------------
+                       sum(dotprod(weights, encode(fs,l)) for l in labels)
+
+    Where ``dotprod`` is the dot product::
+
+      dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))
+    """
+    def __init__(self, encoding, weights, logarithmic=True):
+        """
+        Construct a new maxent classifier model.  Typically, new
+        classifier models are created using the ``train()`` method.
+
+        :type encoding: MaxentFeatureEncodingI
+        :param encoding: An encoding that is used to convert the
+            featuresets that are given to the ``classify`` method into
+            joint-feature vectors, which are used by the maxent
+            classifier model.
+
+        :type weights: list of float
+        :param weights:  The feature weight vector for this classifier.
+
+        :type logarithmic: bool
+        :param logarithmic: If false, then use non-logarithmic weights.
+        """
+        self._encoding = encoding
+        self._weights = weights
+        self._logarithmic = logarithmic
+        #self._logarithmic = False
+        assert encoding.length() == len(weights)
+
+    def labels(self):
+        return self._encoding.labels()
+
+    def set_weights(self, new_weights):
+        """
+        Set the feature weight vector for this classifier.
+        :param new_weights: The new feature weight vector.
+        :type new_weights: list of float
+        """
+        self._weights = new_weights
+        assert self._encoding.length() == len(new_weights)
+
+    def weights(self):
+        """
+        :return: The feature weight vector for this classifier.
+        :rtype: list of float
+        """
+        return self._weights
+
+    def classify(self, featureset):
+        return self.prob_classify(featureset).max()
+
+    def prob_classify(self, featureset):
+        prob_dict = {}
+        for label in self._encoding.labels():
+            feature_vector = self._encoding.encode(featureset, label)
+
+            if self._logarithmic:
+                total = 0.0
+                for (f_id, f_val) in feature_vector:
+                    total += self._weights[f_id] * f_val
+                prob_dict[label] = total
+
+            else:
+                prod = 1.0
+                for (f_id, f_val) in feature_vector:
+                    prod *= self._weights[f_id] ** f_val
+                prob_dict[label] = prod
+
+        # Normalize the dictionary to give a probability distribution
+        return DictionaryProbDist(prob_dict, log=self._logarithmic,
+                                  normalize=True)
+
+    def explain(self, featureset, columns=4):
+        """
+        Print a table showing the effect of each of the features in
+        the given feature set, and how they combine to determine the
+        probabilities of each label for that featureset.
+        """
+        descr_width = 50
+        TEMPLATE = '  %-'+str(descr_width-2)+'s%s%8.3f'
+
+        pdist = self.prob_classify(featureset)
+        labels = sorted(pdist.samples(), key=pdist.prob, reverse=True)
+        labels = labels[:columns]
+        print('  Feature'.ljust(descr_width)+''.join(
+            '%8s' % (("%s" % l)[:7]) for l in labels))
+        print('  '+'-'*(descr_width-2+8*len(labels)))
+        sums = defaultdict(int)
+        for i, label in enumerate(labels):
+            feature_vector = self._encoding.encode(featureset, label)
+            feature_vector.sort(key=lambda fid__: abs(self._weights[fid__[0]]),
+                                reverse=True)
+            for (f_id, f_val) in feature_vector:
+                if self._logarithmic:
+                    score = self._weights[f_id] * f_val
+                else: score = self._weights[f_id] ** f_val
+                descr = self._encoding.describe(f_id)
+                descr = descr.split(' and label is ')[0] # hack
+                descr += ' (%s)' % f_val                 # hack
+                if len(descr) > 47:
+                    descr = descr[:44]+'...'
+                print(TEMPLATE % (descr, i*8*' ', score))
+                sums[label] += score
+        print('  '+'-'*(descr_width-1+8*len(labels)))
+        print('  TOTAL:'.ljust(descr_width)+''.join(
+            '%8.3f' % sums[l] for l in labels))
+        print('  PROBS:'.ljust(descr_width)+''.join(
+            '%8.3f' % pdist.prob(l) for l in labels))
+
+    def show_most_informative_features(self, n=10, show='all'):
+        """
+        :param show: all, neg, or pos (for negative-only or positive-only)
+        """
+        fids = sorted(list(range(len(self._weights))),
+                      key=lambda fid: abs(self._weights[fid]),
+                      reverse=True)
+        if show == 'pos':
+            fids = [fid for fid in fids if self._weights[fid] > 0]
+        elif show == 'neg':
+            fids = [fid for fid in fids if self._weights[fid] < 0]
+        for fid in fids[:n]:
+            print('%8.3f %s' % (self._weights[fid],
+                                self._encoding.describe(fid)))
+
+    def __repr__(self):
+        return ('<ConditionalExponentialClassifier: %d labels, %d features>' %
+                (len(self._encoding.labels()), self._encoding.length()))
+
+    #: A list of the algorithm names that are accepted for the
+    #: ``train()`` method's ``algorithm`` parameter.
+    ALGORITHMS = ['GIS', 'IIS', 'MEGAM', 'TADM']
+
+    @classmethod
+    def train(cls, train_toks, algorithm=None, trace=3, encoding=None,
+              labels=None, gaussian_prior_sigma=0, **cutoffs):
+        """
+        Train a new maxent classifier based on the given corpus of
+        training samples.  This classifier will have its weights
+        chosen to maximize entropy while remaining empirically
+        consistent with the training corpus.
+
+        :rtype: MaxentClassifier
+        :return: The new maxent classifier
+
+        :type train_toks: list
+        :param train_toks: Training data, represented as a list of
+            pairs, the first member of which is a featureset,
+            and the second of which is a classification label.
+
+        :type algorithm: str
+        :param algorithm: A case-insensitive string, specifying which
+            algorithm should be used to train the classifier.  The
+            following algorithms are currently available.
+
+            - Iterative Scaling Methods: Generalized Iterative Scaling (``'GIS'``),
+              Improved Iterative Scaling (``'IIS'``)
+            - External Libraries (requiring megam):
+              LM-BFGS algorithm, with training performed by Megam (``'megam'``)
+            - External Libraries (requiring tadm):
+              training performed by the external TADM tool (``'tadm'``)
+
+            The default algorithm is ``'IIS'``.
+
+        :type trace: int
+        :param trace: The level of diagnostic tracing output to produce.
+            Higher values produce more verbose output.
+        :type encoding: MaxentFeatureEncodingI
+        :param encoding: A feature encoding, used to convert featuresets
+            into feature vectors.  If none is specified, then a
+            ``BinaryMaxentFeatureEncoding`` will be built based on the
+            features that are attested in the training corpus.
+        :type labels: list(str)
+        :param labels: The set of possible labels.  If none is given, then
+            the set of all labels attested in the training data will be
+            used instead.
+        :param gaussian_prior_sigma: The sigma value for a gaussian
+            prior on model weights.  Currently, this is supported by
+            ``megam``. For other algorithms, its value is ignored.
+        :param cutoffs: Arguments specifying various conditions under
+            which the training should be halted.  (Some of the cutoff
+            conditions are not supported by some algorithms.)
+
+            - ``max_iter=v``: Terminate after ``v`` iterations.
+            - ``min_ll=v``: Terminate after the negative average
+              log-likelihood drops under ``v``.
+            - ``min_lldelta=v``: Terminate if a single iteration improves
+              log likelihood by less than ``v``.
+        """
+        if algorithm is None:
+            algorithm = 'iis'
+        for key in cutoffs:
+            if key not in ('max_iter', 'min_ll', 'min_lldelta',
+                           'max_acc', 'min_accdelta', 'count_cutoff',
+                           'norm', 'explicit', 'bernoulli'):
+                raise TypeError('Unexpected keyword arg %r' % key)
+        algorithm = algorithm.lower()
+        if algorithm == 'iis':
+            return train_maxent_classifier_with_iis(
+                train_toks, trace, encoding, labels, **cutoffs)
+        elif algorithm == 'gis':
+            return train_maxent_classifier_with_gis(
+                train_toks, trace, encoding, labels, **cutoffs)
+        elif algorithm == 'megam':
+            return train_maxent_classifier_with_megam(
+                train_toks, trace, encoding, labels,
+                gaussian_prior_sigma, **cutoffs)
+        elif algorithm == 'tadm':
+            kwargs = cutoffs
+            kwargs['trace'] = trace
+            kwargs['encoding'] = encoding
+            kwargs['labels'] = labels
+            kwargs['gaussian_prior_sigma'] = gaussian_prior_sigma
+            return TadmMaxentClassifier.train(train_toks, **kwargs)
+        else:
+            raise ValueError('Unknown algorithm %s' % algorithm)
+
+
+#: Alias for MaxentClassifier.
+ConditionalExponentialClassifier = MaxentClassifier
+
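+# --- Illustrative sketch (editor's addition, not part of upstream NLTK) ---
+# A minimal, hedged example of calling MaxentClassifier.train() as documented
+# above.  The training pairs are invented, and the 'gis' algorithm needs numpy.
+def _maxent_train_demo():  # hypothetical helper, not an NLTK API
+    train = [({'outlook': 'sunny'}, 'play'),
+             ({'outlook': 'rainy'}, 'stay'),
+             ({'outlook': 'sunny'}, 'play')]
+    clf = MaxentClassifier.train(train, algorithm='gis', trace=0, max_iter=5)
+    return clf.classify({'outlook': 'sunny'})
+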
+
+######################################################################
+#{ Feature Encodings
+######################################################################
+
+class MaxentFeatureEncodingI(object):
+    """
+    A mapping that converts a set of input-feature values to a vector
+    of joint-feature values, given a label.  This conversion is
+    necessary to translate featuresets into a format that can be used
+    by maximum entropy models.
+
+    The set of joint-features used by a given encoding is fixed, and
+    each index in the generated joint-feature vectors corresponds to a
+    single joint-feature.  The length of the generated joint-feature
+    vectors is therefore constant (for a given encoding).
+
+    Because the joint-feature vectors generated by
+    ``MaxentFeatureEncodingI`` are typically very sparse, they are
+    represented as a list of ``(index, value)`` tuples, specifying the
+    value of each non-zero joint-feature.
+
+    Feature encodings are generally created using the ``train()``
+    method, which generates an appropriate encoding based on the
+    input-feature values and labels that are present in a given
+    corpus.
+    """
+    def encode(self, featureset, label):
+        """
+        Given a (featureset, label) pair, return the corresponding
+        vector of joint-feature values.  This vector is represented as
+        a list of ``(index, value)`` tuples, specifying the value of
+        each non-zero joint-feature.
+
+        :type featureset: dict
+        :rtype: list(tuple(int, int))
+        """
+        raise NotImplementedError()
+
+    def length(self):
+        """
+        :return: The size of the fixed-length joint-feature vectors
+            that are generated by this encoding.
+        :rtype: int
+        """
+        raise NotImplementedError()
+
+    def labels(self):
+        """
+        :return: A list of the \"known labels\" -- i.e., all labels
+            ``l`` such that ``self.encode(fs,l)`` can be a nonzero
+            joint-feature vector for some value of ``fs``.
+        :rtype: list
+        """
+        raise NotImplementedError()
+
+    def describe(self, fid):
+        """
+        :return: A string describing the value of the joint-feature
+            whose index in the generated feature vectors is ``fid``.
+        :rtype: str
+        """
+        raise NotImplementedError()
+
+    def train(cls, train_toks):
+        """
+        Construct and return new feature encoding, based on a given
+        training corpus ``train_toks``.
+
+        :type train_toks: list(tuple(dict, str))
+        :param train_toks: Training data, represented as a list of
+            pairs, the first member of which is a feature dictionary,
+            and the second of which is a classification label.
+        """
+        raise NotImplementedError()
+
+class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI):
+    """
+    A feature encoding that calls a user-supplied function to map a
+    given featureset/label pair to a sparse joint-feature vector.
+    """
+    def __init__(self, func, length, labels):
+        """
+        Construct a new feature encoding based on the given function.
+
+        :type func: (callable)
+        :param func: A function that takes two arguments, a featureset
+             and a label, and returns the sparse joint feature vector
+             that encodes them::
+
+                 func(featureset, label) -> feature_vector
+
+             This sparse joint feature vector (``feature_vector``) is a
+             list of ``(index,value)`` tuples.
+
+        :type length: int
+        :param length: The size of the fixed-length joint-feature
+            vectors that are generated by this encoding.
+
+        :type labels: list
+        :param labels: A list of the \"known labels\" for this
+            encoding -- i.e., all labels ``l`` such that
+            ``self.encode(fs,l)`` can be a nonzero joint-feature vector
+            for some value of ``fs``.
+        """
+        self._length = length
+        self._func = func
+        self._labels = labels
+
+    def encode(self, featureset, label):
+        return self._func(featureset, label)
+
+    def length(self):
+        return self._length
+
+    def labels(self):
+        return self._labels
+
+    def describe(self, fid):
+        return 'no description available'
+
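+# --- Illustrative sketch (editor's addition, not part of upstream NLTK) ---
+# A hedged example of wrapping a hand-written encoder with the class above;
+# the single joint-feature, its feature name, and the labels are invented.
+def _function_backed_demo():  # hypothetical helper, not an NLTK API
+    def one_feature(featureset, label):
+        # Fires joint-feature 0 when the made-up input-feature 'prevword'
+        # equals 'the' and the label is 'NOUN'; otherwise nothing fires.
+        if featureset.get('prevword') == 'the' and label == 'NOUN':
+            return [(0, 1)]
+        return []
+    enc = FunctionBackedMaxentFeatureEncoding(one_feature, 1, ['NOUN', 'VERB'])
+    return enc.encode({'prevword': 'the'}, 'NOUN')
+
+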
+class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
+    """
+    A feature encoding that generates vectors containing a binary
+    joint-features of the form:
+
+    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
+    |                      {
+    |                      { 0 otherwise
+
+    Where ``fname`` is the name of an input-feature, ``fval`` is a value
+    for that input-feature, and ``label`` is a label.
+
+    Typically, these features are constructed based on a training
+    corpus, using the ``train()`` method.  This method will create one
+    feature for each combination of ``fname``, ``fval``, and ``label``
+    that occurs at least once in the training corpus.
+
+    The ``unseen_features`` parameter can be used to add "unseen-value
+    features", which are used whenever an input feature has a value
+    that was not encountered in the training corpus.  These features
+    have the form:
+
+    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
+    |                      {      and l == label
+    |                      {
+    |                      { 0 otherwise
+
+    Where ``is_unseen(fname, fval)`` is true if the encoding does not
+    contain any joint features that are true when ``fs[fname]==fval``.
+
+    The ``alwayson_features`` parameter can be used to add "always-on
+    features", which have the form::
+
+    |  joint_feat(fs, l) = { 1 if (l == label)
+    |                      {
+    |                      { 0 otherwise
+
+    These always-on features allow the maxent model to directly model
+    the prior probabilities of each label.
+    """
+    def __init__(self, labels, mapping, unseen_features=False,
+                 alwayson_features=False):
+        """
+        :param labels: A list of the \"known labels\" for this encoding.
+
+        :param mapping: A dictionary mapping from ``(fname,fval,label)``
+            tuples to corresponding joint-feature indexes.  These
+            indexes must be the set of integers from 0...len(mapping).
+            If ``mapping[fname,fval,label]=id``, then
+            ``self.encode(..., fname:fval, ..., label)[id]`` is 1;
+            otherwise, it is 0.
+
+        :param unseen_features: If true, then include unseen value
+           features in the generated joint-feature vectors.
+
+        :param alwayson_features: If true, then include always-on
+           features in the generated joint-feature vectors.
+        """
+        if set(mapping.values()) != set(range(len(mapping))):
+            raise ValueError('Mapping values must be exactly the '
+                             'set of integers from 0...len(mapping)')
+
+        self._labels = list(labels)
+        """A list of attested labels."""
+
+        self._mapping = mapping
+        """dict mapping from (fname,fval,label) -> fid"""
+
+        self._length = len(mapping)
+        """The length of generated joint feature vectors."""
+
+        self._alwayson = None
+        """dict mapping from label -> fid"""
+
+        self._unseen = None
+        """dict mapping from fname -> fid"""
+
+        if alwayson_features:
+            self._alwayson = dict((label, i+self._length)
+                                  for (i, label) in enumerate(labels))
+            self._length += len(self._alwayson)
+
+        if unseen_features:
+            fnames = set(fname for (fname, fval, label) in mapping)
+            self._unseen = dict((fname, i+self._length)
+                                for (i, fname) in enumerate(fnames))
+            self._length += len(fnames)
+
+    def encode(self, featureset, label):
+        # Inherit docs.
+        encoding = []
+
+        # Convert input-features to joint-features:
+        for fname, fval in featureset.items():
+            # Known feature name & value:
+            if (fname, fval, label) in self._mapping:
+                encoding.append((self._mapping[fname, fval, label], 1))
+
+            # Otherwise, we might want to fire an "unseen-value feature".
+            elif self._unseen:
+                # Have we seen this fname/fval combination with any label?
+                for label2 in self._labels:
+                    if (fname, fval, label2) in self._mapping:
+                        break # we've seen this fname/fval combo
+                # We haven't -- fire the unseen-value feature
+                else:
+                    if fname in self._unseen:
+                        encoding.append((self._unseen[fname], 1))
+
+        # Add always-on features:
+        if self._alwayson and label in self._alwayson:
+            encoding.append((self._alwayson[label], 1))
+
+        return encoding
+
+    def describe(self, f_id):
+        # Inherit docs.
+        if not isinstance(f_id, integer_types):
+            raise TypeError('describe() expected an int')
+        try:
+            self._inv_mapping
+        except AttributeError:
+            self._inv_mapping = [-1]*len(self._mapping)
+            for (info, i) in self._mapping.items():
+                self._inv_mapping[i] = info
+
+        if f_id < len(self._mapping):
+            (fname, fval, label) = self._inv_mapping[f_id]
+            return '%s==%r and label is %r' % (fname, fval, label)
+        elif self._alwayson and f_id in self._alwayson.values():
+            for (label, f_id2) in self._alwayson.items():
+                if f_id == f_id2:
+                    return 'label is %r' % label
+        elif self._unseen and f_id in self._unseen.values():
+            for (fname, f_id2) in self._unseen.items():
+                if f_id == f_id2:
+                    return '%s is unseen' % fname
+        else:
+            raise ValueError('Bad feature id')
+
+    def labels(self):
+        # Inherit docs.
+        return self._labels
+
+    def length(self):
+        # Inherit docs.
+        return self._length
+
+    @classmethod
+    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
+        """
+        Construct and return new feature encoding, based on a given
+        training corpus ``train_toks``.  See the class description
+        ``BinaryMaxentFeatureEncoding`` for a description of the
+        joint-features that will be included in this encoding.
+
+        :type train_toks: list(tuple(dict, str))
+        :param train_toks: Training data, represented as a list of
+            pairs, the first member of which is a feature dictionary,
+            and the second of which is a classification label.
+
+        :type count_cutoff: int
+        :param count_cutoff: A cutoff value that is used to discard
+            rare joint-features.  If a joint-feature has the value 1
+            fewer than ``count_cutoff`` times in the training corpus,
+            then that joint-feature is not included in the generated
+            encoding.
+
+        :type labels: list
+        :param labels: A list of labels that should be used by the
+            classifier.  If not specified, then the set of labels
+            attested in ``train_toks`` will be used.
+
+        :param options: Extra parameters for the constructor, such as
+            ``unseen_features`` and ``alwayson_features``.
+        """
+        mapping = {}              # maps (fname, fval, label) -> fid
+        seen_labels = set()       # The set of labels we've encountered
+        count = defaultdict(int)  # maps (fname, fval) -> count
+
+        for (tok, label) in train_toks:
+            if labels and label not in labels:
+                raise ValueError('Unexpected label %s' % label)
+            seen_labels.add(label)
+
+            # Record each of the features.
+            for (fname, fval) in tok.items():
+
+                # If a count cutoff is given, then only add a joint
+                # feature once the corresponding (fname, fval, label)
+                # tuple exceeds that cutoff.
+                count[fname, fval] += 1
+                if count[fname, fval] >= count_cutoff:
+                    if (fname, fval, label) not in mapping:
+                        mapping[fname, fval, label] = len(mapping)
+
+        if labels is None:
+            labels = seen_labels
+        return cls(labels, mapping, **options)
+
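+# --- Illustrative sketch (editor's addition, not part of upstream NLTK) ---
+# A hedged example of training the binary encoding above on two invented
+# tokens and inspecting the sparse (feature_id, value) vector it produces.
+def _binary_encoding_demo():  # hypothetical helper, not an NLTK API
+    toks = [({'prevword': 'the'}, 'NOUN'),
+            ({'prevword': 'to'}, 'VERB')]
+    enc = BinaryMaxentFeatureEncoding.train(toks)
+    # Each encoded entry is (feature_id, 1) for an attested (fname, fval, label).
+    return enc.encode({'prevword': 'the'}, 'NOUN'), enc.length()
+
+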
+class GISEncoding(BinaryMaxentFeatureEncoding):
+    """
+    A binary feature encoding which adds one new joint-feature to the
+    joint-features defined by ``BinaryMaxentFeatureEncoding``: a
+    correction feature, whose value is chosen to ensure that the
+    sparse vector always sums to a constant non-negative number.  This
+    new feature is used to ensure two preconditions for the GIS
+    training algorithm:
+
+      - At least one feature vector index must be nonzero for every
+        token.
+      - The feature vector must sum to a constant non-negative number
+        for every token.
+    """
+    def __init__(self, labels, mapping, unseen_features=False,
+                 alwayson_features=False, C=None):
+        """
+        :param C: The correction constant.  The value of the correction
+            feature is based on this value.  In particular, its value is
+            ``C - sum([v for (f,v) in encoding])``.
+        :seealso: ``BinaryMaxentFeatureEncoding.__init__``
+        """
+        BinaryMaxentFeatureEncoding.__init__(
+            self, labels, mapping, unseen_features, alwayson_features)
+        if C is None:
+            C = len(set(fname for (fname, fval, label) in mapping))+1
+        self._C = C
+
+    @property
+    def C(self):
+        """The non-negative constant that all encoded feature vectors
+        will sum to."""
+        return self._C
+
+    def encode(self, featureset, label):
+        # Get the basic encoding.
+        encoding = BinaryMaxentFeatureEncoding.encode(self, featureset, label)
+        base_length = BinaryMaxentFeatureEncoding.length(self)
+
+        # Add a correction feature.
+        total = sum(v for (f, v) in encoding)
+        if total >= self._C:
+            raise ValueError('Correction feature is not high enough!')
+        encoding.append((base_length, self._C-total))
+
+        # Return the result
+        return encoding
+
+    def length(self):
+        return BinaryMaxentFeatureEncoding.length(self) + 1
+
+    def describe(self, f_id):
+        if f_id == BinaryMaxentFeatureEncoding.length(self):
+            return 'Correction feature (%s)' % self._C
+        else:
+            return BinaryMaxentFeatureEncoding.describe(self, f_id)
+
+
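+# --- Illustrative sketch (editor's addition, not part of upstream NLTK) ---
+# A hedged look at the correction feature added by GISEncoding above: every
+# encoded vector sums to the constant C.  The tokens below are invented.
+def _gis_encoding_demo():  # hypothetical helper, not an NLTK API
+    toks = [({'prevword': 'the'}, 'NOUN'),
+            ({'prevword': 'to'}, 'VERB')]
+    enc = GISEncoding.train(toks)
+    vector = enc.encode({'prevword': 'the'}, 'NOUN')
+    # The last entry is the correction feature; the values sum to enc.C.
+    return vector, sum(v for (_, v) in vector) == enc.C
+
+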
+class TadmEventMaxentFeatureEncoding(BinaryMaxentFeatureEncoding):
+    def __init__(self, labels, mapping, unseen_features=False,
+                       alwayson_features=False):
+        self._mapping = OrderedDict(mapping)
+        self._label_mapping = OrderedDict()
+        BinaryMaxentFeatureEncoding.__init__(self, labels, self._mapping,
+                                             unseen_features,
+                                             alwayson_features)
+
+    def encode(self, featureset, label):
+        encoding = []
+        for feature, value in featureset.items():
+            if (feature, label) not in self._mapping:
+                self._mapping[(feature, label)] = len(self._mapping)
+            if value not in self._label_mapping:
+                if not isinstance(value, int):
+                    self._label_mapping[value] = len(self._label_mapping)
+                else:
+                    self._label_mapping[value] = value
+            encoding.append((self._mapping[(feature, label)],
+                             self._label_mapping[value]))
+        return encoding
+
+    def labels(self):
+        return self._labels
+
+    def describe(self, fid):
+        for (feature, label) in self._mapping:
+            if self._mapping[(feature, label)] == fid:
+                return (feature, label)
+
+    def length(self):
+        return len(self._mapping)
+
+    @classmethod
+    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
+        mapping = OrderedDict()
+        if not labels:
+            labels = []
+
+        # This gets read twice, so compute the values in case it's lazy.
+        train_toks = list(train_toks)
+
+        for (featureset, label) in train_toks:
+            if label not in labels:
+                labels.append(label)
+
+        for (featureset, label) in train_toks:
+            for label in labels:
+                for feature in featureset:
+                    if (feature, label) not in mapping:
+                        mapping[(feature, label)] = len(mapping)
+
+        return cls(labels, mapping, **options)
+
+
+class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
+    """
+    A feature encoding that generates vectors containing integer,
+    float and binary joint-features of the form:
+
+    Binary (for string and boolean features):
+
+    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
+    |                      {
+    |                      { 0 otherwise
+
+    Value (for integer and float features):
+
+    |  joint_feat(fs, l) = { fval if     (fs[fname] == type(fval))
+    |                      {         and (l == label)
+    |                      {
+    |                      { not encoded otherwise
+
+    Where ``fname`` is the name of an input-feature, ``fval`` is a value
+    for that input-feature, and ``label`` is a label.
+
+    Typically, these features are constructed based on a training
+    corpus, using the ``train()`` method.
+
+    For string and boolean features [type(fval) not in (int, float)]
+    this method will create one feature for each combination of
+    ``fname``, ``fval``, and ``label`` that occurs at least once in the
+    training corpus.
+
+    For integer and float features [type(fval) in (int, float)] this
+    method will create one feature for each combination of ``fname``
+    and ``label`` that occurs at least once in the training corpus.
+
+    For binary features the ``unseen_features`` parameter can be used
+    to add "unseen-value features", which are used whenever an input
+    feature has a value that was not encountered in the training
+    corpus.  These features have the form:
+
+    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
+    |                      {      and l == label
+    |                      {
+    |                      { 0 otherwise
+
+    Where ``is_unseen(fname, fval)`` is true if the encoding does not
+    contain any joint features that are true when ``fs[fname]==fval``.
+
+    The ``alwayson_features`` parameter can be used to add "always-on
+    features", which have the form:
+
+    |  joint_feat(fs, l) = { 1 if (l == label)
+    |                      {
+    |                      { 0 otherwise
+
+    These always-on features allow the maxent model to directly model
+    the prior probabilities of each label.
+    """
+    def __init__(self, labels, mapping, unseen_features=False,
+                 alwayson_features=False):
+        """
+        :param labels: A list of the \"known labels\" for this encoding.
+
+        :param mapping: A dictionary mapping from ``(fname,fval,label)``
+            tuples to corresponding joint-feature indexes.  These
+            indexes must be the set of integers from 0...len(mapping).
+            If ``mapping[fname,fval,label]=id``, then
+            ``self.encode(..., fname:fval, ..., label)[id]`` is 1;
+            otherwise, it is 0.
+
+        :param unseen_features: If true, then include unseen value
+           features in the generated joint-feature vectors.
+
+        :param alwayson_features: If true, then include always-on
+           features in the generated joint-feature vectors.
+        """
+        if set(mapping.values()) != set(range(len(mapping))):
+            raise ValueError('Mapping values must be exactly the '
+                             'set of integers from 0...len(mapping)')
+
+        self._labels = list(labels)
+        """A list of attested labels."""
+
+        self._mapping = mapping
+        """dict mapping from (fname,fval,label) -> fid"""
+
+        self._length = len(mapping)
+        """The length of generated joint feature vectors."""
+
+        self._alwayson = None
+        """dict mapping from label -> fid"""
+
+        self._unseen = None
+        """dict mapping from fname -> fid"""
+
+        if alwayson_features:
+            self._alwayson = dict((label, i+self._length)
+                                  for (i, label) in enumerate(labels))
+            self._length += len(self._alwayson)
+
+        if unseen_features:
+            fnames = set(fname for (fname, fval, label) in mapping)
+            self._unseen = dict((fname, i+self._length)
+                                for (i, fname) in enumerate(fnames))
+            self._length += len(fnames)
+
+    def encode(self, featureset, label):
+        # Inherit docs.
+        encoding = []
+
+        # Convert input-features to joint-features:
+        for fname, fval in featureset.items():
+            if isinstance(fval, (integer_types, float)):
+                # Known feature name & value:
+                if (fname, type(fval), label) in self._mapping:
+                    encoding.append((self._mapping[fname, type(fval),
+                                                   label], fval))
+            else:
+                # Known feature name & value:
+                if (fname, fval, label) in self._mapping:
+                    encoding.append((self._mapping[fname, fval, label], 1))
+
+                # Otherwise, we might want to fire an "unseen-value feature".
+                elif self._unseen:
+                    # Have we seen this fname/fval combination with any label?
+                    for label2 in self._labels:
+                        if (fname, fval, label2) in self._mapping:
+                            break # we've seen this fname/fval combo
+                    # We haven't -- fire the unseen-value feature
+                    else:
+                        if fname in self._unseen:
+                            encoding.append((self._unseen[fname], 1))
+
+        # Add always-on features:
+        if self._alwayson and label in self._alwayson:
+            encoding.append((self._alwayson[label], 1))
+
+        return encoding
+
+    def describe(self, f_id):
+        # Inherit docs.
+        if not isinstance(f_id, integer_types):
+            raise TypeError('describe() expected an int')
+        try:
+            self._inv_mapping
+        except AttributeError:
+            self._inv_mapping = [-1]*len(self._mapping)
+            for (info, i) in self._mapping.items():
+                self._inv_mapping[i] = info
+
+        if f_id < len(self._mapping):
+            (fname, fval, label) = self._inv_mapping[f_id]
+            return '%s==%r and label is %r' % (fname, fval, label)
+        elif self._alwayson and f_id in self._alwayson.values():
+            for (label, f_id2) in self._alwayson.items():
+                if f_id == f_id2:
+                    return 'label is %r' % label
+        elif self._unseen and f_id in self._unseen.values():
+            for (fname, f_id2) in self._unseen.items():
+                if f_id == f_id2:
+                    return '%s is unseen' % fname
+        else:
+            raise ValueError('Bad feature id')
+
+    def labels(self):
+        # Inherit docs.
+        return self._labels
+
+    def length(self):
+        # Inherit docs.
+        return self._length
+
+    @classmethod
+    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
+        """
+        Construct and return new feature encoding, based on a given
+        training corpus ``train_toks``.  See the class description
+        ``TypedMaxentFeatureEncoding`` for a description of the
+        joint-features that will be included in this encoding.
+
+        Note: the recognized feature value types are (int, float); other
+        types are interpreted as regular binary features.
+
+        :type train_toks: list(tuple(dict, str))
+        :param train_toks: Training data, represented as a list of
+            pairs, the first member of which is a feature dictionary,
+            and the second of which is a classification label.
+
+        :type count_cutoff: int
+        :param count_cutoff: A cutoff value that is used to discard
+            rare joint-features.  If a joint-feature has the value 1
+            fewer than ``count_cutoff`` times in the training corpus,
+            then that joint-feature is not included in the generated
+            encoding.
+
+        :type labels: list
+        :param labels: A list of labels that should be used by the
+            classifier.  If not specified, then the set of labels
+            attested in ``train_toks`` will be used.
+
+        :param options: Extra parameters for the constructor, such as
+            ``unseen_features`` and ``alwayson_features``.
+        """
+        mapping = {}              # maps (fname, fval, label) -> fid
+        seen_labels = set()       # The set of labels we've encountered
+        count = defaultdict(int)  # maps (fname, fval) -> count
+
+        for (tok, label) in train_toks:
+            if labels and label not in labels:
+                raise ValueError('Unexpected label %s' % label)
+            seen_labels.add(label)
+
+            # Record each of the features.
+            for (fname, fval) in tok.items():
+                if type(fval) in (int, float):
+                    fval = type(fval)
+                # If a count cutoff is given, then only add a joint
+                # feature once the corresponding (fname, fval, label)
+                # tuple exceeds that cutoff.
+                count[fname, fval] += 1
+                if count[fname, fval] >= count_cutoff:
+                    if (fname, fval, label) not in mapping:
+                        mapping[fname, fval, label] = len(mapping)
+
+        if labels is None:
+            labels = seen_labels
+        return cls(labels, mapping, **options)
+
+
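+# --- Illustrative sketch (editor's addition, not part of upstream NLTK) ---
+# A hedged example of the typed encoding above: integer features keep their
+# numeric value, while string features become 1/0 indicators.  The feature
+# names, values, and labels are invented for illustration only.
+def _typed_encoding_demo():  # hypothetical helper, not an NLTK API
+    toks = [({'length': 5, 'suffix': 'ing'}, 'VERB'),
+            ({'length': 3, 'suffix': 'the'}, 'DET')]
+    enc = TypedMaxentFeatureEncoding.train(toks)
+    # The 'length' entry carries the value 5; the 'suffix' entry carries 1.
+    return enc.encode({'length': 5, 'suffix': 'ing'}, 'VERB')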
+
+
+######################################################################
+#{ Classifier Trainer: Generalized Iterative Scaling
+######################################################################
+
+def train_maxent_classifier_with_gis(train_toks, trace=3, encoding=None,
+                                     labels=None, **cutoffs):
+    """
+    Train a new ``ConditionalExponentialClassifier``, using the given
+    training samples, using the Generalized Iterative Scaling
+    algorithm.  This ``ConditionalExponentialClassifier`` will encode
+    the model that maximizes entropy from all the models that are
+    empirically consistent with ``train_toks``.
+
+    :see: ``train_maxent_classifier()`` for parameter descriptions.
+    """
+    cutoffs.setdefault('max_iter', 100)
+    cutoffchecker = CutoffChecker(cutoffs)
+
+    # Construct an encoding from the training data.
+    if encoding is None:
+        encoding = GISEncoding.train(train_toks, labels=labels)
+
+    if not hasattr(encoding, 'C'):
+        raise TypeError('The GIS algorithm requires an encoding that '
+                        'defines C (e.g., GISEncoding).')
+
+    # Cinv is the inverse of the sum of each joint feature vector.
+    # This controls the learning rate: higher Cinv (or lower C) gives
+    # faster learning.
+    Cinv = 1.0/encoding.C
+
+    # Count how many times each feature occurs in the training data.
+    empirical_fcount = calculate_empirical_fcount(train_toks, encoding)
+
+    # Check for any features that are not attested in train_toks.
+    unattested = set(numpy.nonzero(empirical_fcount == 0)[0])
+
+    # Build the classifier.  Start with weight=0 for each attested
+    # feature, and weight=-infinity for each unattested feature.
+    weights = numpy.zeros(len(empirical_fcount), 'd')
+    for fid in unattested:
+        weights[fid] = numpy.NINF
+    classifier = ConditionalExponentialClassifier(encoding, weights)
+
+    # Take the log of the empirical fcount.
+    log_empirical_fcount = numpy.log2(empirical_fcount)
+    del empirical_fcount
+
+    if trace > 0:
+        print('  ==> Training (%d iterations)' % cutoffs['max_iter'])
+    if trace > 2:
+        print()
+        print('      Iteration    Log Likelihood    Accuracy')
+        print('      ---------------------------------------')
+
+    # Train the classifier.
+    try:
+        while True:
+            if trace > 2:
+                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
+                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
+                iternum = cutoffchecker.iter
+                print('     %9d    %14.5f    %9.3f' % (iternum, ll, acc))
+
+            # Use the model to estimate the number of times each
+            # feature should occur in the training data.
+            estimated_fcount = calculate_estimated_fcount(
+                classifier, train_toks, encoding)
+
+            # Take the log of the estimated fcount (avoiding log(0)).
+            for fid in unattested:
+                estimated_fcount[fid] += 1
+            log_estimated_fcount = numpy.log2(estimated_fcount)
+            del estimated_fcount
+
+            # Update the classifier weights
+            weights = classifier.weights()
+            weights += (log_empirical_fcount - log_estimated_fcount) * Cinv
+            classifier.set_weights(weights)
+
+            # Check the log-likelihood & accuracy cutoffs.
+            if cutoffchecker.check(classifier, train_toks):
+                break
+
+    except KeyboardInterrupt:
+        print('      Training stopped: keyboard interrupt')
+    except:
+        raise
+
+    if trace > 2:
+        ll = log_likelihood(classifier, train_toks)
+        acc = accuracy(classifier, train_toks)
+        print('         Final    %14.5f    %9.3f' % (ll, acc))
+
+    # Return the classifier.
+    return classifier
+
+def calculate_empirical_fcount(train_toks, encoding):
+    fcount = numpy.zeros(encoding.length(), 'd')
+
+    for tok, label in train_toks:
+        for (index, val) in encoding.encode(tok, label):
+            fcount[index] += val
+
+    return fcount
+
+def calculate_estimated_fcount(classifier, train_toks, encoding):
+    fcount = numpy.zeros(encoding.length(), 'd')
+
+    for tok, label in train_toks:
+        pdist = classifier.prob_classify(tok)
+        for label in pdist.samples():
+            prob = pdist.prob(label)
+            for (fid, fval) in encoding.encode(tok, label):
+                fcount[fid] += prob*fval
+
+    return fcount
+
+
+######################################################################
+#{ Classifier Trainer: Improved Iterative Scaling
+######################################################################
+
+def train_maxent_classifier_with_iis(train_toks, trace=3, encoding=None,
+                                     labels=None, **cutoffs):
+    """
+    Train a new ``ConditionalExponentialClassifier``, using the given
+    training samples, using the Improved Iterative Scaling algorithm.
+    This ``ConditionalExponentialClassifier`` will encode the model
+    that maximizes entropy from all the models that are empirically
+    consistent with ``train_toks``.
+
+    :see: ``train_maxent_classifier()`` for parameter descriptions.
+    """
+    cutoffs.setdefault('max_iter', 100)
+    cutoffchecker = CutoffChecker(cutoffs)
+
+    # Construct an encoding from the training data.
+    if encoding is None:
+        encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels)
+
+    # Count how many times each feature occurs in the training data.
+    empirical_ffreq = (calculate_empirical_fcount(train_toks, encoding) /
+                       len(train_toks))
+
+    # Find the nf map, and related variables nfarray and nfident.
+    # nf is the sum of the features for a given labeled text.
+    # nfmap compresses this sparse set of values to a dense list.
+    # nfarray performs the reverse operation.  nfident is
+    # nfarray multiplied by an identity matrix.
+    nfmap = calculate_nfmap(train_toks, encoding)
+    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), 'd')
+    nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))
+
+    # Check for any features that are not attested in train_toks.
+    unattested = set(numpy.nonzero(empirical_ffreq == 0)[0])
+
+    # Build the classifier.  Start with weight=0 for each attested
+    # feature, and weight=-infinity for each unattested feature.
+    weights = numpy.zeros(len(empirical_ffreq), 'd')
+    for fid in unattested:
+        weights[fid] = numpy.NINF
+    classifier = ConditionalExponentialClassifier(encoding, weights)
+
+    if trace > 0:
+        print('  ==> Training (%d iterations)' % cutoffs['max_iter'])
+    if trace > 2:
+        print()
+        print('      Iteration    Log Likelihood    Accuracy')
+        print('      ---------------------------------------')
+
+    # Train the classifier.
+    try:
+        while True:
+            if trace > 2:
+                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
+                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
+                iternum = cutoffchecker.iter
+                print('     %9d    %14.5f    %9.3f' % (iternum, ll, acc))
+
+            # Calculate the deltas for this iteration, using Newton's method.
+            deltas = calculate_deltas(
+                train_toks, classifier, unattested, empirical_ffreq,
+                nfmap, nfarray, nftranspose, encoding)
+
+            # Use the deltas to update our weights.
+            weights = classifier.weights()
+            weights += deltas
+            classifier.set_weights(weights)
+
+            # Check the log-likelihood & accuracy cutoffs.
+            if cutoffchecker.check(classifier, train_toks):
+                break
+
+    except KeyboardInterrupt:
+        print('      Training stopped: keyboard interrupt')
+    except:
+        raise
+
+    if trace > 2:
+        ll = log_likelihood(classifier, train_toks)
+        acc = accuracy(classifier, train_toks)
+        print('         Final    %14.5f    %9.3f' % (ll, acc))
+
+    # Return the classifier.
+    return classifier
+
+def calculate_nfmap(train_toks, encoding):
+    """
+    Construct a map that can be used to compress ``nf`` (which is
+    typically sparse).
+
+    *nf(feature_vector)* is the sum of the feature values for
+    *feature_vector*.
+
+    This represents the number of features that are active for a
+    given labeled text.  This method finds all values of *nf(t)*
+    that are attested for at least one token in the given list of
+    training tokens; and constructs a dictionary mapping these
+    attested values to a continuous range *0...N*.  For example,
+    if the only values of *nf()* that were attested were 3, 5, and
+    7, then ``_nfmap`` might return the dictionary ``{3:0, 5:1, 7:2}``.
+
+    :return: A map that can be used to compress ``nf`` to a dense
+        vector.
+    :rtype: dict(int -> int)
+    """
+    # Map from nf to indices.  This allows us to use smaller arrays.
+    nfset = set()
+    for tok, _ in train_toks:
+        for label in encoding.labels():
+            nfset.add(sum(val for (id, val) in encoding.encode(tok, label)))
+    return dict((nf, i) for (i, nf) in enumerate(nfset))
+
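+# --- Illustrative sketch (editor's addition, not part of upstream NLTK) ---
+# A hedged example of the nf-compression above: on two invented tokens, each
+# attested value of nf (the number of active joint-features for an encoding)
+# is mapped to a small dense index; the exact ordering may vary.
+def _nfmap_demo():  # hypothetical helper, not an NLTK API
+    toks = [({'a': 1, 'b': 2}, 'x'),
+            ({'a': 1}, 'y')]
+    enc = BinaryMaxentFeatureEncoding.train(toks)
+    return calculate_nfmap(toks, enc)
+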
+def calculate_deltas(train_toks, classifier, unattested, ffreq_empirical,
+                     nfmap, nfarray, nftranspose, encoding):
+    """
+    Calculate the update values for the classifier weights for
+    this iteration of IIS.  These update weights are the value of
+    ``delta`` that solves the equation::
+
+      ffreq_empirical[i]
+             =
+      SUM[fs,l] (classifier.prob_classify(fs).prob(l) *
+                 feature_vector(fs,l)[i] *
+                 exp(delta[i] * nf(feature_vector(fs,l))))
+
+    Where:
+        - *(fs,l)* is a (featureset, label) tuple from ``train_toks``
+        - *feature_vector(fs,l)* = ``encoding.encode(fs,l)``
+        - *nf(vector)* = ``sum([val for (id,val) in vector])``
+
+    This method uses Newton's method to solve this equation for
+    *delta[i]*.  In particular, it starts with a guess of
+    ``delta[i]`` = 1; and iteratively updates ``delta`` with:
+
+    | delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i])
+
+    until convergence, where *sum1* and *sum2* are defined as:
+
+    |    sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta)
+    |    sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta).nf(feature_vector(fs,l)))
+    |    f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) .
+    |                        feature_vector(fs,l)[i] .
+    |                        exp(delta[i] . nf(feature_vector(fs,l))))
+
+    Note that *sum1* and *sum2* depend on ``delta``; so they need
+    to be re-computed each iteration.
+
+    The variables ``nfmap``, ``nfarray``, and ``nftranspose`` are
+    used to generate a dense encoding for *nf(ltext)*.  This
+    allows ``_deltas`` to calculate *sum1* and *sum2* using
+    matrices, which yields a significant performance improvement.
+
+    :param train_toks: The set of training tokens.
+    :type train_toks: list(tuple(dict, str))
+    :param classifier: The current classifier.
+    :type classifier: ClassifierI
+    :param ffreq_empirical: An array containing the empirical
+        frequency for each feature.  The *i*\ th element of this
+        array is the empirical frequency for feature *i*.
+    :type ffreq_empirical: sequence of float
+    :param unattested: An array that is 1 for features that are
+        not attested in the training data; and 0 for features that
+        are attested.  In other words, ``unattested[i]==0`` iff
+        ``ffreq_empirical[i]==0``.
+    :type unattested: sequence of int
+    :param nfmap: A map that can be used to compress ``nf`` to a dense
+        vector.
+    :type nfmap: dict(int -> int)
+    :param nfarray: An array that can be used to uncompress ``nf``
+        from a dense vector.
+    :type nfarray: array(float)
+    :param nftranspose: The transpose of ``nfarray``
+    :type nftranspose: array(float)
+    """
+    # These parameters control when we decide that we've
+    # converged.  It probably should be possible to set these
+    # manually, via keyword arguments to train.
+    NEWTON_CONVERGE = 1e-12
+    MAX_NEWTON = 300
+
+    deltas = numpy.ones(encoding.length(), 'd')
+
+    # Precompute the A matrix:
+    # A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) )
+    # over all label,fs s.t. num_features[label,fs]=nf
+    A = numpy.zeros((len(nfmap), encoding.length()), 'd')
+
+    for tok, label in train_toks:
+        dist = classifier.prob_classify(tok)
+
+        for label in encoding.labels():
+            # Generate the feature vector
+            feature_vector = encoding.encode(tok, label)
+            # Find the number of active features
+            nf = sum(val for (id, val) in feature_vector)
+            # Update the A matrix
+            for (id, val) in feature_vector:
+                A[nfmap[nf], id] += dist.prob(label) * val
+    A /= len(train_toks)
+
+    # Iteratively solve for delta.  Use the following variables:
+    #   - nf_delta[x][y] = nfarray[x] * delta[y]
+    #   - exp_nf_delta[x][y] = exp(nf[x] * delta[y])
+    #   - nf_exp_nf_delta[x][y] = nf[x] * exp(nf[x] * delta[y])
+    #   - sum1[i][nf] = sum p(fs)p(label|fs)f[i](label,fs)
+    #                       exp(delta[i]nf)
+    #   - sum2[i][nf] = sum p(fs)p(label|fs)f[i](label,fs)
+    #                       nf exp(delta[i]nf)
+    for rangenum in range(MAX_NEWTON):
+        nf_delta = numpy.outer(nfarray, deltas)
+        exp_nf_delta = 2 ** nf_delta
+        nf_exp_nf_delta = nftranspose * exp_nf_delta
+        sum1 = numpy.sum(exp_nf_delta * A, axis=0)
+        sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)
+
+        # Avoid division by zero.
+        for fid in unattested:
+            sum2[fid] += 1
+
+        # Update the deltas.
+        deltas -= (ffreq_empirical - sum1) / -sum2
+
+        # We can stop once we converge.
+        n_error = (numpy.sum(abs(ffreq_empirical - sum1)) /
+                   numpy.sum(abs(deltas)))
+        if n_error < NEWTON_CONVERGE:
+            return deltas
+
+    return deltas
+
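+# Editorial sketch (not part of the upstream NLTK code): the update above is
+# ordinary Newton's method applied elementwise.  For one feature with
+# empirical frequency b, it solves g(delta) = b - sum1(delta) = 0 via
+# delta -= g(delta) / g'(delta), where g'(delta) = -sum2(delta).  The helper
+# below shows the same iteration for a one-dimensional g; it is illustrative
+# only and is never called by the library code.
+def _newton_sketch(g, g_prime, delta=1.0, tol=1e-12, max_iter=300):
+    """Solve g(delta) = 0 by Newton's method (editorial example only)."""
+    for _ in range(max_iter):
+        step = g(delta) / g_prime(delta)
+        delta -= step
+        if abs(step) < tol:
+            break
+    return delta
+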
+######################################################################
+#{ Classifier Trainer: megam
+######################################################################
+
+# [xx] possible extension: add support for using implicit file format;
+# this would need to put requirements on what encoding is used.  But
+# we may need this for other maxent classifier trainers that require
+# implicit formats anyway.
+def train_maxent_classifier_with_megam(train_toks, trace=3, encoding=None,
+                                       labels=None, gaussian_prior_sigma=0,
+                                       **kwargs):
+    """
+    Train a new ``ConditionalExponentialClassifier``, using the given
+    training samples, using the external ``megam`` library.  This
+    ``ConditionalExponentialClassifier`` will encode the model that
+    maximizes entropy from all the models that are empirically
+    consistent with ``train_toks``.
+
+    :see: ``train_maxent_classifier()`` for parameter descriptions.
+    :see: ``nltk.classify.megam``
+    """
+
+    explicit = True
+    bernoulli = True
+    if 'explicit' in kwargs:
+        explicit = kwargs['explicit']
+    if 'bernoulli' in kwargs:
+        bernoulli = kwargs['bernoulli']
+
+    # Construct an encoding from the training data.
+    if encoding is None:
+        # Count cutoff can also be controlled by megam with the -minfc
+        # option. Not sure where the best place for it is.
+        count_cutoff = kwargs.get('count_cutoff', 0)
+        encoding = BinaryMaxentFeatureEncoding.train(train_toks, count_cutoff,
+                                                     labels=labels,
+                                                     alwayson_features=True)
+    elif labels is not None:
+        raise ValueError('Specify encoding or labels, not both')
+
+    # Write a training file for megam.
+    try:
+        fd, trainfile_name = tempfile.mkstemp(prefix='nltk-')
+        with open(trainfile_name, 'w') as trainfile:
+            write_megam_file(train_toks, encoding, trainfile,
+                             explicit=explicit, bernoulli=bernoulli)
+        os.close(fd)
+    except (OSError, IOError, ValueError) as e:
+        raise ValueError('Error while creating megam training file: %s' % e)
+
+    # Run megam on the training file.
+    options = []
+    options += ['-nobias', '-repeat', '10']
+    if explicit:
+        options += ['-explicit']
+    if not bernoulli:
+        options += ['-fvals']
+    if gaussian_prior_sigma:
+        # Lambda is just the precision of the Gaussian prior, i.e. it's the
+        # inverse variance, so the parameter conversion is 1.0/sigma**2.
+        # See http://www.umiacs.umd.edu/~hal/docs/daume04cg-bfgs.pdf.
+        inv_variance = 1.0 / gaussian_prior_sigma**2
+    else:
+        inv_variance = 0
+    options += ['-lambda', '%.2f' % inv_variance, '-tune']
+    if trace < 3:
+        options += ['-quiet']
+    if 'max_iter' in kwargs:
+        options += ['-maxi', '%s' % kwargs['max_iter']]
+    if 'll_delta' in kwargs:
+        # [xx] this is actually a perplexity delta, not a log
+        # likelihood delta
+        options += ['-dpp', '%s' % abs(kwargs['ll_delta'])]
+    if hasattr(encoding, 'cost'):
+        options += ['-multilabel']  # each possible label has its own cost
+    options += ['multiclass', trainfile_name]
+    stdout = call_megam(options)
+    # print './megam_i686.opt ', ' '.join(options)
+    # Delete the training file
+    try:
+        os.remove(trainfile_name)
+    except (OSError, IOError) as e:
+        print('Warning: unable to delete %s: %s' % (trainfile_name, e))
+
+    # Parse the generated weight vector.
+    weights = parse_megam_weights(stdout, encoding.length(), explicit)
+
+    # Convert from base-e to base-2 weights.
+    weights *= numpy.log2(numpy.e)
+
+    # Build the classifier
+    return MaxentClassifier(encoding, weights)
+
+######################################################################
+#{ Classifier Trainer: tadm
+######################################################################
+
+class TadmMaxentClassifier(MaxentClassifier):
+    @classmethod
+    def train(cls, train_toks, **kwargs):
+        algorithm = kwargs.get('algorithm', 'tao_lmvm')
+        trace = kwargs.get('trace', 3)
+        encoding = kwargs.get('encoding', None)
+        labels = kwargs.get('labels', None)
+        sigma = kwargs.get('gaussian_prior_sigma', 0)
+        count_cutoff = kwargs.get('count_cutoff', 0)
+        max_iter = kwargs.get('max_iter')
+        ll_delta = kwargs.get('min_lldelta')
+
+        # Construct an encoding from the training data.
+        if not encoding:
+            encoding = TadmEventMaxentFeatureEncoding.train(train_toks,
+                                                            count_cutoff,
+                                                            labels=labels)
+
+        trainfile_fd, trainfile_name = \
+            tempfile.mkstemp(prefix='nltk-tadm-events-', suffix='.gz')
+        weightfile_fd, weightfile_name = \
+            tempfile.mkstemp(prefix='nltk-tadm-weights-')
+
+        trainfile = gzip_open_unicode(trainfile_name, 'w')
+        write_tadm_file(train_toks, encoding, trainfile)
+        trainfile.close()
+
+        options = []
+        options.extend(['-monitor'])
+        options.extend(['-method', algorithm])
+        if sigma:
+            options.extend(['-l2', '%.6f' % sigma**2])
+        if max_iter:
+            options.extend(['-max_it', '%d' % max_iter])
+        if ll_delta:
+            options.extend(['-fatol', '%.6f' % abs(ll_delta)])
+        options.extend(['-events_in', trainfile_name])
+        options.extend(['-params_out', weightfile_name])
+        if trace < 3:
+            options.extend(['2>&1'])
+        else:
+            options.extend(['-summary'])
+
+        call_tadm(options)
+
+        with open(weightfile_name, 'r') as weightfile:
+            weights = parse_tadm_weights(weightfile)
+
+        os.remove(trainfile_name)
+        os.remove(weightfile_name)
+
+        # Convert from base-e to base-2 weights.
+        weights *= numpy.log2(numpy.e)
+
+        # Build the classifier
+        return cls(encoding, weights)
+
+######################################################################
+#{ Demo
+######################################################################
+def demo():
+    from nltk.classify.util import names_demo
+    classifier = names_demo(MaxentClassifier.train)
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/classify/maxent.pyc b/nlp_resource_data/nltk/classify/maxent.pyc
new file mode 100755 (executable)
index 0000000..51f13dc
Binary files /dev/null and b/nlp_resource_data/nltk/classify/maxent.pyc differ
diff --git a/nlp_resource_data/nltk/classify/megam.py b/nlp_resource_data/nltk/classify/megam.py
new file mode 100755 (executable)
index 0000000..2db484d
--- /dev/null
@@ -0,0 +1,179 @@
+# Natural Language Toolkit: Interface to Megam Classifier
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A set of functions used to interface with the external megam_ maxent
+optimization package. Before megam can be used, you should tell NLTK where it
+can find the megam binary, using the ``config_megam()`` function. Typical
+usage:
+
+    >>> from nltk.classify import megam
+    >>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP
+    [Found megam: ...]
+
+Use with MaxentClassifier. Example below; see the MaxentClassifier documentation
+for details.
+
+    nltk.classify.MaxentClassifier.train(corpus, 'megam')
+
+.. _megam: http://www.umiacs.umd.edu/~hal/megam/index.html
+"""
+from __future__ import print_function
+
+import subprocess
+
+from six import string_types
+
+from nltk import compat
+from nltk.internals import find_binary
+try:
+    import numpy
+except ImportError:
+    numpy = None
+
+######################################################################
+#{ Configuration
+######################################################################
+
+_megam_bin = None
+def config_megam(bin=None):
+    """
+    Configure NLTK's interface to the ``megam`` maxent optimization
+    package.
+
+    :param bin: The full path to the ``megam`` binary.  If not specified,
+        then nltk will search the system for a ``megam`` binary; and if
+        one is not found, it will raise a ``LookupError`` exception.
+    :type bin: str
+    """
+    global _megam_bin
+    _megam_bin = find_binary(
+        'megam', bin,
+        env_vars=['MEGAM'],
+        binary_names=['megam.opt', 'megam', 'megam_686', 'megam_i686.opt'],
+        url='http://www.umiacs.umd.edu/~hal/megam/index.html')
+
+######################################################################
+#{ Megam Interface Functions
+######################################################################
+
+def write_megam_file(train_toks, encoding, stream,
+                     bernoulli=True, explicit=True):
+    """
+    Generate an input file for ``megam`` based on the given corpus of
+    classified tokens.
+
+    :type train_toks: list(tuple(dict, str))
+    :param train_toks: Training data, represented as a list of
+        pairs, the first member of which is a feature dictionary,
+        and the second of which is a classification label.
+
+    :type encoding: MaxentFeatureEncodingI
+    :param encoding: A feature encoding, used to convert featuresets
+        into feature vectors. May optionally implement a cost() method
+        in order to assign different costs to different class predictions.
+
+    :type stream: stream
+    :param stream: The stream to which the megam input file should be
+        written.
+
+    :param bernoulli: If true, then use the 'bernoulli' format.  I.e.,
+        all joint features have binary values, and are listed iff they
+        are true.  Otherwise, list feature values explicitly.  If
+        ``bernoulli=False``, then you must call ``megam`` with the
+        ``-fvals`` option.
+
+    :param explicit: If true, then use the 'explicit' format.  I.e.,
+        list the features that would fire for any of the possible
+        labels, for each token.  If ``explicit=True``, then you must
+        call ``megam`` with the ``-explicit`` option.
+    """
+    # Look up the set of labels.
+    labels = encoding.labels()
+    labelnum = dict((label, i) for (i, label) in enumerate(labels))
+
+    # Write the file, which contains one line per instance.
+    for featureset, label in train_toks:
+        # First, the instance number (or, in the weighted multiclass case, the cost of each label).
+        if hasattr(encoding, 'cost'):
+            stream.write(':'.join(str(encoding.cost(featureset, label, l))
+                                  for l in labels))
+        else:
+            stream.write('%d' % labelnum[label])
+
+        # For implicit file formats, just list the features that fire
+        # for this instance's actual label.
+        if not explicit:
+            _write_megam_features(encoding.encode(featureset, label),
+                                  stream, bernoulli)
+
+        # For explicit formats, list the features that would fire for
+        # any of the possible labels.
+        else:
+            for l in labels:
+                stream.write(' #')
+                _write_megam_features(encoding.encode(featureset, l),
+                                      stream, bernoulli)
+
+        # End of the instance.
+        stream.write('\n')
+
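+# Editorial sketch (not upstream NLTK code): writing a single training
+# instance with a minimal stand-in encoding, to show the line format that
+# ``write_megam_file`` produces with the default explicit/bernoulli settings.
+# The ``_ToyEncoding`` class and its feature ids are hypothetical, and this
+# helper is never called by the library.
+def _write_megam_file_sketch():
+    from six import StringIO
+
+    class _ToyEncoding(object):
+        def labels(self):
+            return ['pos', 'neg']
+
+        def encode(self, featureset, label):
+            # Pretend features 0 and 1 fire for 'pos', and feature 0 for 'neg'.
+            return [(0, 1), (1, 1)] if label == 'pos' else [(0, 1)]
+
+    out = StringIO()
+    write_megam_file([({'dummy': True}, 'pos')], _ToyEncoding(), out)
+    return out.getvalue()  # "0 # 0 1 # 0\n"
+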
+def parse_megam_weights(s, features_count, explicit=True):
+    """
+    Given the stdout output generated by ``megam`` when training a
+    model, return a ``numpy`` array containing the corresponding weight
+    vector.  This function does not currently handle bias features.
+    """
+    if numpy is None:
+        raise ValueError('This function requires that numpy be installed')
+    assert explicit, 'non-explicit not supported yet'
+    lines = s.strip().split('\n')
+    weights = numpy.zeros(features_count, 'd')
+    for line in lines:
+        if line.strip():
+            fid, weight = line.split()
+            weights[int(fid)] = float(weight)
+    return weights
+
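+# Editorial sketch (not upstream NLTK code): parsing a fabricated weight dump
+# of the "<feature-id> <weight>" form that ``parse_megam_weights`` expects.
+# The numbers are made up; requires numpy; never called by the library.
+def _parse_megam_weights_sketch():
+    fake_stdout = "0 0.75\n1 -0.25\n3 1.5\n"
+    return parse_megam_weights(fake_stdout, features_count=4)
+    # -> array([ 0.75, -0.25,  0.  ,  1.5 ])
+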
+def _write_megam_features(vector, stream, bernoulli):
+    if not vector:
+        raise ValueError('MEGAM classifier requires the use of an '
+                         'always-on feature.')
+    for (fid, fval) in vector:
+        if bernoulli:
+            if fval == 1:
+                stream.write(' %s' % fid)
+            elif fval != 0:
+                raise ValueError('If bernoulli=True, then all '
+                                 'features must be binary.')
+        else:
+            stream.write(' %s %s' % (fid, fval))
+
+def call_megam(args):
+    """
+    Call the ``megam`` binary with the given arguments.
+    """
+    if isinstance(args, string_types):
+        raise TypeError('args should be a list of strings')
+    if _megam_bin is None:
+        config_megam()
+
+    # Call megam via a subprocess
+    cmd = [_megam_bin] + args
+    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+    (stdout, stderr) = p.communicate()
+
+    # Check the return code.
+    if p.returncode != 0:
+        print()
+        print(stderr)
+        raise OSError('megam command failed!')
+
+    if isinstance(stdout, string_types):
+        return stdout
+    else:
+        return stdout.decode('utf-8')
diff --git a/nlp_resource_data/nltk/classify/megam.pyc b/nlp_resource_data/nltk/classify/megam.pyc
new file mode 100755 (executable)
index 0000000..8eef864
Binary files /dev/null and b/nlp_resource_data/nltk/classify/megam.pyc differ
diff --git a/nlp_resource_data/nltk/classify/naivebayes.py b/nlp_resource_data/nltk/classify/naivebayes.py
new file mode 100755 (executable)
index 0000000..b547a7a
--- /dev/null
@@ -0,0 +1,242 @@
+# Natural Language Toolkit: Naive Bayes Classifiers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A classifier based on the Naive Bayes algorithm.  In order to find the
+probability for a label, this algorithm first uses the Bayes rule to
+express P(label|features) in terms of P(label) and P(features|label):
+
+|                       P(label) * P(features|label)
+|  P(label|features) = ------------------------------
+|                              P(features)
+
+The algorithm then makes the 'naive' assumption that all features are
+independent, given the label:
+
+|                       P(label) * P(f1|label) * ... * P(fn|label)
+|  P(label|features) = --------------------------------------------
+|                                         P(features)
+
+Rather than computing P(features) explicitly, the algorithm just
+calculates the numerator for each label, and normalizes them so they
+sum to one:
+
+|                       P(label) * P(f1|label) * ... * P(fn|label)
+|  P(label|features) = --------------------------------------------
+|                        SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
+"""
+from __future__ import print_function, unicode_literals
+
+from collections import defaultdict
+
+from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs
+from nltk.classify.api import ClassifierI
+
+##//////////////////////////////////////////////////////
+##  Naive Bayes Classifier
+##//////////////////////////////////////////////////////
+
+class NaiveBayesClassifier(ClassifierI):
+    """
+    A Naive Bayes classifier.  Naive Bayes classifiers are
+    paramaterized by two probability distributions:
+
+      - P(label) gives the probability that an input will receive each
+        label, given no information about the input's features.
+
+      - P(fname=fval|label) gives the probability that a given feature
+        (fname) will receive a given value (fval), given the
+        label (label).
+
+    If the classifier encounters an input with a feature that has
+    never been seen with any label, then rather than assigning a
+    probability of 0 to all labels, it will ignore that feature.
+
+    The feature value 'None' is reserved for unseen feature values;
+    you generally should not use 'None' as a feature value for one of
+    your own features.
+    """
+    def __init__(self, label_probdist, feature_probdist):
+        """
+        :param label_probdist: P(label), the probability distribution
+            over labels.  It is expressed as a ``ProbDistI`` whose
+            samples are labels.  I.e., P(label) =
+            ``label_probdist.prob(label)``.
+
+        :param feature_probdist: P(fname=fval|label), the probability
+            distribution for feature values, given labels.  It is
+            expressed as a dictionary whose keys are ``(label, fname)``
+            pairs and whose values are ``ProbDistI`` objects over feature
+            values.  I.e., P(fname=fval|label) =
+            ``feature_probdist[label,fname].prob(fval)``.  If a given
+            ``(label,fname)`` is not a key in ``feature_probdist``, then
+            it is assumed that the corresponding P(fname=fval|label)
+            is 0 for all values of ``fval``.
+        """
+        self._label_probdist = label_probdist
+        self._feature_probdist = feature_probdist
+        self._labels = list(label_probdist.samples())
+
+    def labels(self):
+        return self._labels
+
+    def classify(self, featureset):
+        return self.prob_classify(featureset).max()
+
+    def prob_classify(self, featureset):
+        # Discard any feature names that we've never seen before.
+        # Otherwise, we'll just assign a probability of 0 to
+        # everything.
+        featureset = featureset.copy()
+        for fname in list(featureset.keys()):
+            for label in self._labels:
+                if (label, fname) in self._feature_probdist:
+                    break
+            else:
+                #print 'Ignoring unseen feature %s' % fname
+                del featureset[fname]
+
+        # Find the log probability of each label, given the features.
+        # Start with the log probability of the label itself.
+        logprob = {}
+        for label in self._labels:
+            logprob[label] = self._label_probdist.logprob(label)
+
+        # Then add in the log probability of features given labels.
+        for label in self._labels:
+            for (fname, fval) in featureset.items():
+                if (label, fname) in self._feature_probdist:
+                    feature_probs = self._feature_probdist[label, fname]
+                    logprob[label] += feature_probs.logprob(fval)
+                else:
+                    # nb: This case will never come up if the
+                    # classifier was created by
+                    # NaiveBayesClassifier.train().
+                    logprob[label] += sum_logs([]) # = -INF.
+
+        return DictionaryProbDist(logprob, normalize=True, log=True)
+
+    def show_most_informative_features(self, n=10):
+        # Determine the most relevant features, and display them.
+        cpdist = self._feature_probdist
+        print('Most Informative Features')
+
+        for (fname, fval) in self.most_informative_features(n):
+            def labelprob(l):
+                return cpdist[l, fname].prob(fval)
+
+            labels = sorted([l for l in self._labels
+                             if fval in cpdist[l, fname].samples()],
+                            key=labelprob)
+            if len(labels) == 1:
+                continue
+            l0 = labels[0]
+            l1 = labels[-1]
+            if cpdist[l0, fname].prob(fval) == 0:
+                ratio = 'INF'
+            else:
+                ratio = '%8.1f' % (cpdist[l1, fname].prob(fval) /
+                                   cpdist[l0, fname].prob(fval))
+            print(('%24s = %-14r %6s : %-6s = %s : 1.0' %
+                   (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)))
+
+    def most_informative_features(self, n=100):
+        """
+        Return a list of the 'most informative' features used by this
+        classifier.  For the purpose of this function, the
+        informativeness of a feature ``(fname,fval)`` is equal to the
+        highest value of P(fname=fval|label), for any label, divided by
+        the lowest value of P(fname=fval|label), for any label:
+
+        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
+        """
+        # The set of (fname, fval) pairs used by this classifier.
+        features = set()
+        # The max & min probability associated w/ each (fname, fval)
+        # pair.  Maps (fname,fval) -> float.
+        maxprob = defaultdict(lambda: 0.0)
+        minprob = defaultdict(lambda: 1.0)
+
+        for (label, fname), probdist in self._feature_probdist.items():
+            for fval in probdist.samples():
+                feature = (fname, fval)
+                features.add(feature)
+                p = probdist.prob(fval)
+                maxprob[feature] = max(p, maxprob[feature])
+                minprob[feature] = min(p, minprob[feature])
+                if minprob[feature] == 0:
+                    features.discard(feature)
+
+        # Convert features to a list, & sort it by how informative
+        # features are.
+        features = sorted(features,
+                          key=lambda feature_:
+                          minprob[feature_]/maxprob[feature_])
+        return features[:n]
+
+    @classmethod
+    def train(cls, labeled_featuresets, estimator=ELEProbDist):
+        """
+        :param labeled_featuresets: A list of classified featuresets,
+            i.e., a list of tuples ``(featureset, label)``.
+        """
+        label_freqdist = FreqDist()
+        feature_freqdist = defaultdict(FreqDist)
+        feature_values = defaultdict(set)
+        fnames = set()
+
+        # Count up how many times each feature value occurred, given
+        # the label and featurename.
+        for featureset, label in labeled_featuresets:
+            label_freqdist[label] += 1
+            for fname, fval in featureset.items():
+                # Increment freq(fval|label, fname)
+                feature_freqdist[label, fname][fval] += 1
+                # Record that fname can take the value fval.
+                feature_values[fname].add(fval)
+                # Keep a list of all feature names.
+                fnames.add(fname)
+
+        # If a feature didn't have a value given for an instance, then
+        # we assume that it gets the implicit value 'None.'  This loop
+        # counts up the number of 'missing' feature values for each
+        # (label,fname) pair, and increments the count of the fval
+        # 'None' by that amount.
+        for label in label_freqdist:
+            num_samples = label_freqdist[label]
+            for fname in fnames:
+                count = feature_freqdist[label, fname].N()
+                # Only add a None key when necessary, i.e. if there are
+                # any samples with feature 'fname' missing.
+                if num_samples - count > 0:
+                    feature_freqdist[label, fname][None] += num_samples - count
+                    feature_values[fname].add(None)
+
+        # Create the P(label) distribution
+        label_probdist = estimator(label_freqdist)
+
+        # Create the P(fval|label, fname) distribution
+        feature_probdist = {}
+        for ((label, fname), freqdist) in feature_freqdist.items():
+            probdist = estimator(freqdist, bins=len(feature_values[fname]))
+            feature_probdist[label, fname] = probdist
+
+        return cls(label_probdist, feature_probdist)
+
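+# Editorial sketch (not part of the NLTK source): training the classifier
+# above on a tiny hand-made data set and querying it.  The feature name and
+# labels are invented for the example; this helper is never called by the
+# library.
+def _toy_naive_bayes_sketch():
+    train = [({'last_letter': 'a'}, 'female'),
+             ({'last_letter': 'k'}, 'male'),
+             ({'last_letter': 'a'}, 'female'),
+             ({'last_letter': 'n'}, 'male')]
+    classifier = NaiveBayesClassifier.train(train)
+    dist = classifier.prob_classify({'last_letter': 'a'})
+    # For this toy data, dist.prob('female') exceeds dist.prob('male').
+    return classifier.classify({'last_letter': 'a'})  # 'female'
+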
+##//////////////////////////////////////////////////////
+##  Demo
+##//////////////////////////////////////////////////////
+
+def demo():
+    from nltk.classify.util import names_demo
+    classifier = names_demo(NaiveBayesClassifier.train)
+    classifier.show_most_informative_features()
+
+if __name__ == '__main__':
+    demo()
+
+
diff --git a/nlp_resource_data/nltk/classify/naivebayes.pyc b/nlp_resource_data/nltk/classify/naivebayes.pyc
new file mode 100755 (executable)
index 0000000..3a7aa44
Binary files /dev/null and b/nlp_resource_data/nltk/classify/naivebayes.pyc differ
diff --git a/nlp_resource_data/nltk/classify/positivenaivebayes.py b/nlp_resource_data/nltk/classify/positivenaivebayes.py
new file mode 100755 (executable)
index 0000000..c8f5511
--- /dev/null
@@ -0,0 +1,170 @@
+# Natural Language Toolkit: Positive Naive Bayes Classifier
+#
+# Copyright (C) 2012 NLTK Project
+# Author: Alessandro Presta <alessandro.presta@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A variant of the Naive Bayes Classifier that performs binary classification with
+partially-labeled training sets. In other words, assume we want to build a classifier
+that assigns each example to one of two complementary classes (e.g., male names and
+female names).
+If we have a training set with labeled examples for both classes, we can use a
+standard Naive Bayes Classifier. However, consider the case when we only have labeled
+examples for one of the classes, and other, unlabeled, examples.
+Then, assuming a prior distribution on the two labels, we can use the unlabeled set
+to estimate the frequencies of the various features.
+
+Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1
+and unlabeled examples. We are also given an estimate of P(1).
+
+We compute P(feature|1) exactly as in the standard case.
+
+To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are
+assuming that the unlabeled examples are drawn according to the given prior distribution)
+and then express the conditional probability as:
+
+|                  P(feature) - P(feature|1) * P(1)
+|  P(feature|0) = ----------------------------------
+|                               P(0)
+
+Example:
+
+    >>> from nltk.classify import PositiveNaiveBayesClassifier
+
+Some sentences about sports:
+
+    >>> sports_sentences = [ 'The team dominated the game',
+    ...                      'They lost the ball',
+    ...                      'The game was intense',
+    ...                      'The goalkeeper catched the ball',
+    ...                      'The other team controlled the ball' ]
+
+Mixed topics, including sports:
+
+    >>> various_sentences = [ 'The President did not comment',
+    ...                       'I lost the keys',
+    ...                       'The team won the game',
+    ...                       'Sara has two kids',
+    ...                       'The ball went off the court',
+    ...                       'They had the ball for the whole game',
+    ...                       'The show is over' ]
+
+The features of a sentence are simply the words it contains:
+
+    >>> def features(sentence):
+    ...     words = sentence.lower().split()
+    ...     return dict(('contains(%s)' % w, True) for w in words)
+
+We use the sports sentences as positive examples and the mixed ones as unlabeled examples:
+
+    >>> positive_featuresets = list(map(features, sports_sentences))
+    >>> unlabeled_featuresets = list(map(features, various_sentences))
+    >>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
+    ...                                                 unlabeled_featuresets)
+
+Is the following sentence about sports?
+
+    >>> classifier.classify(features('The cat is on the table'))
+    False
+
+What about this one?
+
+    >>> classifier.classify(features('My team lost the game'))
+    True
+"""
+
+from collections import defaultdict
+
+from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist
+
+from nltk.classify.naivebayes import NaiveBayesClassifier
+
+##//////////////////////////////////////////////////////
+##  Positive Naive Bayes Classifier
+##//////////////////////////////////////////////////////
+
+class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
+    @staticmethod
+    def train(positive_featuresets, unlabeled_featuresets, positive_prob_prior=0.5,
+              estimator=ELEProbDist):
+        """
+        :param positive_featuresets: A list of featuresets that are known as positive
+            examples (i.e., their label is ``True``).
+
+        :param unlabeled_featuresets: A list of featuresets whose label is unknown.
+
+        :param positive_prob_prior: A prior estimate of the probability of the label
+            ``True`` (default 0.5).
+        """
+        positive_feature_freqdist = defaultdict(FreqDist)
+        unlabeled_feature_freqdist = defaultdict(FreqDist)
+        feature_values = defaultdict(set)
+        fnames = set()
+
+        # Count up how many times each feature value occurred in positive examples.
+        for featureset in positive_featuresets:
+            for fname, fval in featureset.items():
+                positive_feature_freqdist[fname][fval] += 1
+                feature_values[fname].add(fval)
+                fnames.add(fname)
+
+        # Count up how many times each feature value occurred in unlabeled examples.
+        for featureset in unlabeled_featuresets:
+            for fname, fval in featureset.items():
+                unlabeled_feature_freqdist[fname][fval] += 1
+                feature_values[fname].add(fval)
+                fnames.add(fname)
+
+        # If a feature didn't have a value given for an instance, then we assume that
+        # it gets the implicit value 'None'.
+        num_positive_examples = len(positive_featuresets)
+        for fname in fnames:
+            count = positive_feature_freqdist[fname].N()
+            positive_feature_freqdist[fname][None] += num_positive_examples - count
+            feature_values[fname].add(None)
+
+        num_unlabeled_examples = len(unlabeled_featuresets)
+        for fname in fnames:
+            count = unlabeled_feature_freqdist[fname].N()
+            unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count
+            feature_values[fname].add(None)
+
+        negative_prob_prior = 1.0 - positive_prob_prior
+
+        # Create the P(label) distribution.
+        label_probdist = DictionaryProbDist({True: positive_prob_prior,
+                                             False: negative_prob_prior})
+
+        # Create the P(fval|label, fname) distribution.
+        feature_probdist = {}
+        for fname, freqdist in positive_feature_freqdist.items():
+            probdist = estimator(freqdist, bins=len(feature_values[fname]))
+            feature_probdist[True, fname] = probdist
+
+        for fname, freqdist in unlabeled_feature_freqdist.items():
+            global_probdist = estimator(freqdist, bins=len(feature_values[fname]))
+            negative_feature_probs = {}
+            for fval in feature_values[fname]:
+                prob = (global_probdist.prob(fval)
+                        - positive_prob_prior *
+                        feature_probdist[True, fname].prob(fval)) \
+                        / negative_prob_prior
+                # TODO: We need to add some kind of smoothing here, instead of
+                # setting negative probabilities to zero and normalizing.
+                negative_feature_probs[fval] = max(prob, 0.0)
+            feature_probdist[False, fname] = DictionaryProbDist(negative_feature_probs,
+                                                                normalize=True)
+
+        return PositiveNaiveBayesClassifier(label_probdist, feature_probdist)
+
+##//////////////////////////////////////////////////////
+##  Demo
+##//////////////////////////////////////////////////////
+
+def demo():
+    from nltk.classify.util import partial_names_demo
+    classifier = partial_names_demo(PositiveNaiveBayesClassifier.train)
+    classifier.show_most_informative_features()
+
diff --git a/nlp_resource_data/nltk/classify/positivenaivebayes.pyc b/nlp_resource_data/nltk/classify/positivenaivebayes.pyc
new file mode 100755 (executable)
index 0000000..92faaf5
Binary files /dev/null and b/nlp_resource_data/nltk/classify/positivenaivebayes.pyc differ
diff --git a/nlp_resource_data/nltk/classify/rte_classify.py b/nlp_resource_data/nltk/classify/rte_classify.py
new file mode 100755 (executable)
index 0000000..f396d23
--- /dev/null
@@ -0,0 +1,160 @@
+# Natural Language Toolkit: RTE Classifier
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Simple classifier for RTE corpus.
+
+It calculates the overlap in words and named entities between text and
+hypothesis, and also whether there are words / named entities in the
+hypothesis which fail to occur in the text, since this is an indicator that
+the hypothesis is more informative than (i.e. not entailed by) the text.
+
+TO DO: better Named Entity classification
+TO DO: add lemmatization
+"""
+from __future__ import print_function
+
+import nltk  # needed by RTEFeatureExtractor._lemmatize (nltk.corpus.wordnet)
+from nltk.tokenize import RegexpTokenizer
+from nltk.classify.util import accuracy, check_megam_config
+from nltk.classify.maxent import MaxentClassifier
+
+class RTEFeatureExtractor(object):
+    """
+    This builds a bag of words for both the text and the hypothesis after
+    throwing away some stopwords, then calculates overlap and difference.
+    """
+    def __init__(self, rtepair, stop=True, use_lemmatize=False):
+        """
+        :param rtepair: a ``RTEPair`` from which features should be extracted
+        :param stop: if ``True``, stopwords are thrown away.
+        :type stop: bool
+        """
+        self.stop = stop
+        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
+                              'have', 'are', 'were', 'and', 'very', '.', ','])
+
+        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
+                             'denied'])
+        # Try to tokenize so that abbreviations, monetary amounts, email
+        # addresses, URLs are single tokens.
+        tokenizer = RegexpTokenizer(r'[\w.@:/]+|\w+|\$[\d.]+')
+
+        #Get the set of word types for text and hypothesis
+        self.text_tokens = tokenizer.tokenize(rtepair.text)
+        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
+        self.text_words = set(self.text_tokens)
+        self.hyp_words = set(self.hyp_tokens)
+
+        if use_lemmatize:
+            self.text_words = set(self._lemmatize(token) for token in self.text_tokens)
+            self.hyp_words = set(self._lemmatize(token) for token in self.hyp_tokens)
+
+        if self.stop:
+            self.text_words = self.text_words - self.stopwords
+            self.hyp_words = self.hyp_words - self.stopwords
+
+        self._overlap = self.hyp_words & self.text_words
+        self._hyp_extra = self.hyp_words - self.text_words
+        self._txt_extra = self.text_words - self.hyp_words
+
+
+    def overlap(self, toktype, debug=False):
+        """
+        Compute the overlap between text and hypothesis.
+
+        :param toktype: distinguish Named Entities from ordinary words
+        :type toktype: 'ne' or 'word'
+        """
+        ne_overlap = set(token for token in self._overlap if self._ne(token))
+        if toktype == 'ne':
+            if debug:
+                print("ne overlap", ne_overlap)
+            return ne_overlap
+        elif toktype == 'word':
+            if debug:
+                print("word overlap", self._overlap - ne_overlap)
+            return self._overlap - ne_overlap
+        else:
+            raise ValueError("Type not recognized:'%s'" % toktype)
+
+    def hyp_extra(self, toktype, debug=True):
+        """
+        Compute the extraneous material in the hypothesis.
+
+        :param toktype: distinguish Named Entities from ordinary words
+        :type toktype: 'ne' or 'word'
+        """
+        ne_extra = set(token for token in self._hyp_extra if self._ne(token))
+        if toktype == 'ne':
+            return ne_extra
+        elif toktype == 'word':
+            return self._hyp_extra - ne_extra
+        else:
+            raise ValueError("Type not recognized: '%s'" % toktype)
+
+    @staticmethod
+    def _ne(token):
+        """
+        This just assumes that words in all caps or titles are
+        named entities.
+
+        :type token: str
+        """
+        if token.istitle() or token.isupper():
+            return True
+        return False
+
+    @staticmethod
+    def _lemmatize(word):
+        """
+        Use morphy from WordNet to find the base form of verbs.
+        """
+        lemma = nltk.corpus.wordnet.morphy(word, pos=nltk.corpus.wordnet.VERB)
+        if lemma is not None:
+            return lemma
+        return word
+
+
+def rte_features(rtepair):
+    extractor = RTEFeatureExtractor(rtepair)
+    features = {}
+    features['alwayson'] = True
+    features['word_overlap'] = len(extractor.overlap('word'))
+    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
+    features['ne_overlap'] = len(extractor.overlap('ne'))
+    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
+    features['neg_txt'] = len(extractor.negwords & extractor.text_words)
+    features['neg_hyp'] = len(extractor.negwords & extractor.hyp_words)
+    return features
+
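+# Editorial sketch (not part of the NLTK source): extracting the overlap
+# features above for one hand-made text/hypothesis pair.  ``_ToyPair`` is a
+# hypothetical stand-in that only mimics the ``text`` and ``hyp`` attributes
+# the extractor reads; this helper is never called by the library.
+def _rte_features_sketch():
+    from collections import namedtuple
+    _ToyPair = namedtuple('_ToyPair', ['text', 'hyp'])
+    pair = _ToyPair(text='John Smith bought three books in Paris',
+                    hyp='John Smith bought a book')
+    return rte_features(pair)
+    # e.g. {'alwayson': True, 'word_overlap': 1, 'ne_overlap': 2, ...}
+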
+
+def rte_featurize(rte_pairs):
+    return [(rte_features(pair), pair.value) for pair in rte_pairs]
+
+
+def rte_classifier(algorithm):
+    from nltk.corpus import rte as rte_corpus
+    train_set = rte_corpus.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
+    test_set = rte_corpus.pairs(['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
+    featurized_train_set = rte_featurize(train_set)
+    featurized_test_set = rte_featurize(test_set)
+    # Train the classifier
+    print('Training classifier...')
+    if algorithm in ['megam', 'BFGS']: # MEGAM based algorithms.
+        # Ensure that MEGAM is configured first.
+        check_megam_config()
+        clf = MaxentClassifier.train(featurized_train_set, algorithm)
+    elif algorithm in ['GIS', 'IIS']: # Use default GIS/IIS MaxEnt algorithm
+        clf = MaxentClassifier.train(featurized_train_set, algorithm)
+    else:
+        err_msg = str("RTEClassifier only supports these algorithms:\n "
+                      "'megam', 'BFGS', 'GIS', 'IIS'.\n")
+        raise Exception(err_msg)
+    print('Testing classifier...')
+    acc = accuracy(clf, featurized_test_set)
+    print('Accuracy: %6.4f' % acc)
+    return clf
diff --git a/nlp_resource_data/nltk/classify/rte_classify.pyc b/nlp_resource_data/nltk/classify/rte_classify.pyc
new file mode 100755 (executable)
index 0000000..05f7b45
Binary files /dev/null and b/nlp_resource_data/nltk/classify/rte_classify.pyc differ
diff --git a/nlp_resource_data/nltk/classify/scikitlearn.py b/nlp_resource_data/nltk/classify/scikitlearn.py
new file mode 100755 (executable)
index 0000000..b7c7b6d
--- /dev/null
@@ -0,0 +1,153 @@
+# Natural Language Toolkit: Interface to scikit-learn classifiers
+#
+# Author: Lars Buitinck <L.J.Buitinck@uva.nl>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+"""
+scikit-learn (http://scikit-learn.org) is a machine learning library for
+Python. It supports many classification algorithms, including SVMs,
+Naive Bayes, logistic regression (MaxEnt) and decision trees.
+
+This package implements a wrapper around scikit-learn classifiers. To use this
+wrapper, construct a scikit-learn estimator object, then use that to construct
+a SklearnClassifier. E.g., to wrap a linear SVM with default settings:
+
+>>> from sklearn.svm import LinearSVC
+>>> from nltk.classify.scikitlearn import SklearnClassifier
+>>> classif = SklearnClassifier(LinearSVC())
+
+A scikit-learn classifier may include preprocessing steps when it's wrapped
+in a Pipeline object. The following constructs and wraps a Naive Bayes text
+classifier with tf-idf weighting and chi-square feature selection to get the
+best 1000 features:
+
+>>> from sklearn.feature_extraction.text import TfidfTransformer
+>>> from sklearn.feature_selection import SelectKBest, chi2
+>>> from sklearn.naive_bayes import MultinomialNB
+>>> from sklearn.pipeline import Pipeline
+>>> pipeline = Pipeline([('tfidf', TfidfTransformer()),
+...                      ('chi2', SelectKBest(chi2, k=1000)),
+...                      ('nb', MultinomialNB())])
+>>> classif = SklearnClassifier(pipeline)
+"""
+from __future__ import print_function, unicode_literals
+
+from six.moves import zip
+
+from nltk.classify.api import ClassifierI
+from nltk.probability import DictionaryProbDist
+from nltk import compat
+
+try:
+    from sklearn.feature_extraction import DictVectorizer
+    from sklearn.preprocessing import LabelEncoder
+except ImportError:
+    pass
+
+__all__ = ['SklearnClassifier']
+
+
+@compat.python_2_unicode_compatible
+class SklearnClassifier(ClassifierI):
+    """Wrapper for scikit-learn classifiers."""
+
+    def __init__(self, estimator, dtype=float, sparse=True):
+        """
+        :param estimator: scikit-learn classifier object.
+
+        :param dtype: data type used when building feature array.
+            scikit-learn estimators work exclusively on numeric data. The
+            default value should be fine for almost all situations.
+
+        :param sparse: Whether to use sparse matrices internally.
+            The estimator must support these; not all scikit-learn classifiers
+            do (see their respective documentation and look for "sparse
+            matrix"). The default value is True, since most NLP problems
+            involve sparse feature sets. Setting this to False may take a
+            great amount of memory.
+        :type sparse: boolean.
+        """
+        self._clf = estimator
+        self._encoder = LabelEncoder()
+        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
+
+    def __repr__(self):
+        return "<SklearnClassifier(%r)>" % self._clf
+
+    def classify_many(self, featuresets):
+        """Classify a batch of samples.
+
+        :param featuresets: An iterable over featuresets, each a dict mapping
+            strings to either numbers, booleans or strings.
+        :return: The predicted class label for each input sample.
+        :rtype: list
+        """
+        X = self._vectorizer.transform(featuresets)
+        classes = self._encoder.classes_
+        return [classes[i] for i in self._clf.predict(X)]
+
+    def prob_classify_many(self, featuresets):
+        """Compute per-class probabilities for a batch of samples.
+
+        :param featuresets: An iterable over featuresets, each a dict mapping
+            strings to either numbers, booleans or strings.
+        :rtype: list of ``ProbDistI``
+        """
+        X = self._vectorizer.transform(featuresets)
+        y_proba_list = self._clf.predict_proba(X)
+        return [self._make_probdist(y_proba) for y_proba in y_proba_list]
+
+    def labels(self):
+        """The class labels used by this classifier.
+
+        :rtype: list
+        """
+        return list(self._encoder.classes_)
+
+    def train(self, labeled_featuresets):
+        """
+        Train (fit) the scikit-learn estimator.
+
+        :param labeled_featuresets: A list of ``(featureset, label)``
+            where each ``featureset`` is a dict mapping strings to either
+            numbers, booleans or strings.
+        """
+
+        X, y = list(zip(*labeled_featuresets))
+        X = self._vectorizer.fit_transform(X)
+        y = self._encoder.fit_transform(y)
+        self._clf.fit(X, y)
+
+        return self
+
+    def _make_probdist(self, y_proba):
+        classes = self._encoder.classes_
+        return DictionaryProbDist(dict((classes[i], p)
+                                       for i, p in enumerate(y_proba)))
+
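+# Editorial sketch (not part of the NLTK source): the usual train /
+# classify_many round trip for the wrapper above.  It assumes scikit-learn is
+# installed; the feature dicts and labels are invented, and this helper is
+# never called by the library.
+def _sklearn_wrapper_sketch():
+    from sklearn.naive_bayes import BernoulliNB
+
+    train = [({'contains(ball)': True, 'contains(game)': True}, 'sports'),
+             ({'contains(vote)': True, 'contains(senate)': True}, 'politics'),
+             ({'contains(goal)': True, 'contains(game)': True}, 'sports')]
+    classif = SklearnClassifier(BernoulliNB()).train(train)
+    return classif.classify_many([{'contains(game)': True},
+                                  {'contains(vote)': True}])
+    # expected for this toy data: ['sports', 'politics']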
+
+# skip doctests if scikit-learn is not installed
+def setup_module(module):
+    from nose import SkipTest
+    try:
+        import sklearn
+    except ImportError:
+        raise SkipTest("scikit-learn is not installed")
+
+
+if __name__ == "__main__":
+    from nltk.classify.util import names_demo, names_demo_features
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.naive_bayes import BernoulliNB
+
+    # Bernoulli Naive Bayes is designed for binary classification. We set the
+    # binarize option to False since we know we're passing boolean features.
+    print("scikit-learn Naive Bayes:")
+    names_demo(SklearnClassifier(BernoulliNB(binarize=False)).train,
+               features=names_demo_features)
+
+    # The C parameter on logistic regression (MaxEnt) controls regularization.
+    # The higher it's set, the less regularized the classifier is.
+    print("\n\nscikit-learn logistic regression:")
+    names_demo(SklearnClassifier(LogisticRegression(C=1000)).train,
+               features=names_demo_features)
diff --git a/nlp_resource_data/nltk/classify/scikitlearn.pyc b/nlp_resource_data/nltk/classify/scikitlearn.pyc
new file mode 100755 (executable)
index 0000000..baa3b0b
Binary files /dev/null and b/nlp_resource_data/nltk/classify/scikitlearn.pyc differ
diff --git a/nlp_resource_data/nltk/classify/senna.py b/nlp_resource_data/nltk/classify/senna.py
new file mode 100755 (executable)
index 0000000..d8d71db
--- /dev/null
@@ -0,0 +1,183 @@
+# encoding: utf-8
+# Natural Language Toolkit: Senna Interface
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A general interface to the SENNA pipeline that supports any of the
+operations specified in SUPPORTED_OPERATIONS.
+
+Applying multiple operations at once has a speed advantage. For example,
+Senna will automatically determine POS tags if you are extracting named
+entities. Applying both operations costs only the time of extracting the
+named entities.
+
+The SENNA pipeline has a fixed maximum size for the sentences it can read.
+By default it is 1024 tokens per sentence. If you have larger sentences,
+consider changing the MAX_SENTENCE_SIZE value in SENNA_main.c and rebuilding
+your system-specific binary. Otherwise this could introduce misalignment
+errors.
+
+The input is:
+- the path to the directory that contains the SENNA executables. If the path
+  is incorrect, Senna will automatically search for the executable file
+  specified in the SENNA environment variable
+- a list of the operations to be performed
+- (optionally) the encoding of the input data (default: utf-8)
+
+Note: Unit tests for this module can be found in test/unit/test_senna.py
+
+    >>> from __future__ import unicode_literals
+    >>> from nltk.classify import Senna
+    >>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
+    >>> sent = 'Dusseldorf is an international business center'.split()
+    >>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP
+    [('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
+    ('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
+"""
+
+
+from __future__ import unicode_literals
+from os import path, sep, environ
+from subprocess import Popen, PIPE
+from platform import architecture, system
+
+from six import text_type
+
+from nltk.tag.api import TaggerI
+from nltk.compat import python_2_unicode_compatible
+
+_senna_url = 'http://ml.nec-labs.com/senna/'
+
+
+@python_2_unicode_compatible
+class Senna(TaggerI):
+
+    SUPPORTED_OPERATIONS = ['pos', 'chk', 'ner']
+
+    def __init__(self, senna_path, operations, encoding='utf-8'):
+        self._encoding = encoding
+        self._path = path.normpath(senna_path) + sep
+
+        # Verifies the existence of the executable on the self._path first
+        #senna_binary_file_1 = self.executable(self._path)
+        exe_file_1 = self.executable(self._path)
+        if not path.isfile(exe_file_1):
+            # Check for the system environment
+            if 'SENNA' in environ:
+                #self._path = path.join(environ['SENNA'],'')
+                self._path = path.normpath(environ['SENNA']) + sep
+                exe_file_2 = self.executable(self._path)
+                if not path.isfile(exe_file_2):
+                    raise OSError("Senna executable expected at %s or %s but not found" % (exe_file_1,exe_file_2))
+
+        self.operations = operations
+
+
+    def executable(self, base_path):
+        """
+        The function that determines the system-specific binary that should be
+        used in the pipeline. If the system is not known, the default senna
+        binary will be used.
+        """
+        os_name = system()
+        if os_name == 'Linux':
+            bits = architecture()[0]
+            if bits == '64bit':
+                return path.join(base_path, 'senna-linux64')
+            return path.join(base_path, 'senna-linux32')
+        if os_name == 'Windows':
+            return path.join(base_path, 'senna-win32.exe')
+        if os_name == 'Darwin':
+            return path.join(base_path, 'senna-osx')
+        return path.join(base_path, 'senna')
+
+    def _map(self):
+        """
+        A method that calculates the order of the columns that the SENNA
+        pipeline will output the tags into. The order follows the fixed
+        ordering of SUPPORTED_OPERATIONS, restricted to the requested operations.
+        """
+        _map = {}
+        i = 1
+        for operation in Senna.SUPPORTED_OPERATIONS:
+            if operation in self.operations:
+                _map[operation] = i
+                i += 1
+        return _map
+
+    def tag(self, tokens):
+        """
+        Applies the specified operation(s) on a list of tokens.
+        """
+        return self.tag_sents([tokens])[0]
+
+    def tag_sents(self, sentences):
+        """
+        Applies the tag method over a list of sentences. This method will return a
+        list of dictionaries. Every dictionary will contain a word with its
+        calculated annotations/tags.
+        """
+        encoding = self._encoding
+
+        if not path.isfile(self.executable(self._path)):
+            raise OSError("Senna executable expected at %s but not found" % self.executable(self._path))
+
+
+        # Build the senna command to run the tagger
+        _senna_cmd = [self.executable(self._path), '-path', self._path, '-usrtokens', '-iobtags']
+        _senna_cmd.extend(['-'+op for op in self.operations])
+
+        # Serialize the actual sentences to a temporary string
+        _input = '\n'.join((' '.join(x) for x in sentences))+'\n'
+        if isinstance(_input, text_type) and encoding:
+            _input = _input.encode(encoding)
+
+        # Run the tagger and get the output
+        p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
+        (stdout, stderr) = p.communicate(input=_input)
+        senna_output = stdout
+
+        # Check the return code.
+        if p.returncode != 0:
+            raise RuntimeError('Senna command failed! Details: %s' % stderr)
+
+        if encoding:
+            senna_output = stdout.decode(encoding)
+
+        # Output the tagged sentences
+        map_ = self._map()
+        tagged_sentences = [[]]
+        sentence_index = 0
+        token_index = 0
+        for tagged_word in senna_output.strip().split("\n"):
+            if not tagged_word:
+                tagged_sentences.append([])
+                sentence_index += 1
+                token_index = 0
+                continue
+            tags = tagged_word.split('\t')
+            result = {}
+            for tag in map_:
+                result[tag] = tags[map_[tag]].strip()
+            try:
+                result['word'] = sentences[sentence_index][token_index]
+            except IndexError:
+                raise IndexError(
+                    "Misalignment error occurred at sentence number %d. Possible reason"
+                    " is that the sentence size exceeded the maximum size. Check the "
+                    "documentation of Senna class for more information."
+                    % sentence_index)
+            tagged_sentences[-1].append(result)
+            token_index += 1
+        return tagged_sentences
+
+
+# skip doctests if Senna is not installed
+def setup_module(module):
+    from nose import SkipTest
+    try:
+        tagger = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
+    except OSError:
+        raise SkipTest("Senna executable not found")
diff --git a/nlp_resource_data/nltk/classify/senna.pyc b/nlp_resource_data/nltk/classify/senna.pyc
new file mode 100755 (executable)
index 0000000..c08c733
Binary files /dev/null and b/nlp_resource_data/nltk/classify/senna.pyc differ
diff --git a/nlp_resource_data/nltk/classify/svm.py b/nlp_resource_data/nltk/classify/svm.py
new file mode 100755 (executable)
index 0000000..98a4008
--- /dev/null
@@ -0,0 +1,15 @@
+# Natural Language Toolkit: SVM-based classifier
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Leon Derczynski <leon@dcs.shef.ac.uk>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+"""
+nltk.classify.svm was deprecated. For classification based
+on support vector machines SVMs use nltk.classify.scikitlearn
+(or `scikit-learn <http://scikit-learn.org>`_ directly).
+"""
+class SvmClassifier(object):
+    def __init__(self, *args, **kwargs):
+        raise NotImplementedError(__doc__)
diff --git a/nlp_resource_data/nltk/classify/svm.pyc b/nlp_resource_data/nltk/classify/svm.pyc
new file mode 100755 (executable)
index 0000000..a5c547e
Binary files /dev/null and b/nlp_resource_data/nltk/classify/svm.pyc differ
diff --git a/nlp_resource_data/nltk/classify/tadm.py b/nlp_resource_data/nltk/classify/tadm.py
new file mode 100755 (executable)
index 0000000..615523c
--- /dev/null
@@ -0,0 +1,113 @@
+# Natural Language Toolkit: Interface to TADM Classifier
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Joseph Frazee <jfrazee@mail.utexas.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+
+import sys
+import subprocess
+
+from six import string_types
+
+from nltk.internals import find_binary
+try:
+    import numpy
+except ImportError:
+    pass
+
+_tadm_bin = None
+def config_tadm(bin=None):
+    global _tadm_bin
+    _tadm_bin = find_binary(
+        'tadm', bin,
+        env_vars=['TADM'],
+        binary_names=['tadm'],
+        url='http://tadm.sf.net')
+
+def write_tadm_file(train_toks, encoding, stream):
+    """
+    Generate an input file for ``tadm`` based on the given corpus of
+    classified tokens.
+
+    :type train_toks: list(tuple(dict, str))
+    :param train_toks: Training data, represented as a list of
+        pairs, the first member of which is a feature dictionary,
+        and the second of which is a classification label.
+    :type encoding: TadmEventMaxentFeatureEncoding
+    :param encoding: A feature encoding, used to convert featuresets
+        into feature vectors.
+    :type stream: stream
+    :param stream: The stream to which the ``tadm`` input file should be
+        written.
+    """
+    # See the following for a file format description:
+    #
+    # http://sf.net/forum/forum.php?thread_id=1391502&forum_id=473054
+    # http://sf.net/forum/forum.php?thread_id=1675097&forum_id=473054
+    labels = encoding.labels()
+    for featureset, label in train_toks:
+        length_line = '%d\n' % len(labels)
+        stream.write(length_line)
+        for known_label in labels:
+            v = encoding.encode(featureset, known_label)
+            line = '%d %d %s\n' % (
+                int(label == known_label),
+                len(v),
+                ' '.join('%d %d' % u for u in v)
+            )
+            stream.write(line)
+
+def parse_tadm_weights(paramfile):
+    """
+    Given the stdout output generated by ``tadm`` when training a
+    model, return a ``numpy`` array containing the corresponding weight
+    vector.
+    """
+    weights = []
+    for line in paramfile:
+        weights.append(float(line.strip()))
+    return numpy.array(weights, 'd')
+
+def call_tadm(args):
+    """
+    Call the ``tadm`` binary with the given arguments.
+    """
+    if isinstance(args, string_types):
+        raise TypeError('args should be a list of strings')
+    if _tadm_bin is None:
+        config_tadm()
+
+    # Call tadm via a subprocess, piping stderr so that it can be reported
+    # if the command fails.
+    cmd = [_tadm_bin] + args
+    p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=subprocess.PIPE)
+    (stdout, stderr) = p.communicate()
+
+    # Check the return code.
+    if p.returncode != 0:
+        print()
+        print(stderr)
+        raise OSError('tadm command failed!')
+
+def names_demo():
+    from nltk.classify.util import names_demo
+    from nltk.classify.maxent import TadmMaxentClassifier
+    classifier = names_demo(TadmMaxentClassifier.train)
+
+def encoding_demo():
+    import sys
+    from nltk.classify.maxent import TadmEventMaxentFeatureEncoding
+    tokens = [({'f0':1, 'f1':1, 'f3':1}, 'A'),
+              ({'f0':1, 'f2':1, 'f4':1}, 'B'),
+              ({'f0':2, 'f2':1, 'f3':1, 'f4':1}, 'A')]
+    encoding = TadmEventMaxentFeatureEncoding.train(tokens)
+    write_tadm_file(tokens, encoding, sys.stdout)
+    print()
+    for i in range(encoding.length()):
+        print('%s --> %d' % (encoding.describe(i), i))
+    print()
+
+if __name__ == '__main__':
+    encoding_demo()
+    names_demo()
diff --git a/nlp_resource_data/nltk/classify/tadm.pyc b/nlp_resource_data/nltk/classify/tadm.pyc
new file mode 100755 (executable)
index 0000000..886063a
Binary files /dev/null and b/nlp_resource_data/nltk/classify/tadm.pyc differ
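For reference, write_tadm_file emits one block per training token: a line giving the number of labels, then one line per candidate label holding the 0/1 outcome, the count of active features, and the feature-id/value pairs (encoding_demo above prints such a block). The companion parser can be exercised on its own; a small sketch, assuming numpy is available and using an in-memory stand-in for tadm's parameter output:

    import io
    from nltk.classify.tadm import parse_tadm_weights

    # tadm writes one weight per line; parse_tadm_weights collects them into a vector.
    params = io.StringIO(u"0.25\n-1.5\n3.0\n")
    print(parse_tadm_weights(params))   # -> array([ 0.25, -1.5 ,  3.  ])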
diff --git a/nlp_resource_data/nltk/classify/textcat.py b/nlp_resource_data/nltk/classify/textcat.py
new file mode 100755 (executable)
index 0000000..10c3ad2
--- /dev/null
@@ -0,0 +1,193 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Language ID module using TextCat algorithm
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Avital Pekker <avital.pekker@utoronto.ca>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A module for language identification using the TextCat algorithm.
+An implementation of the text categorization algorithm
+presented in Cavnar, W. B. and J. M. Trenkle, 
+"N-Gram-Based Text Categorization".
+
+The algorithm takes advantage of Zipf's law: it uses
+n-gram frequencies to build profiles of the known languages and of the
+text yet to be identified, then compares them using a distance measure.
+
+Language n-grams are provided by the "An Crubadan"
+project. A corpus reader was created separately to read
+those files.
+
+For details regarding the algorithm, see:
+http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
+
+For details about An Crubadan, see:
+http://borel.slu.edu/crubadan/index.html
+"""
+
+# Ensure that literal strings default to unicode rather than str.
+from __future__ import print_function, unicode_literals
+
+from nltk.compat import PY3
+from nltk.util import trigrams
+
+if PY3:
+    from sys import maxsize
+else:
+    from sys import maxint
+
+# Note: this is NOT "re" you're likely used to. The regex module
+# is an alternative to the standard re module that supports
+# Unicode codepoint properties with the \p{} syntax.
+# You may have to "pip install regex".
+try:
+    import regex as re
+except ImportError:
+    re = None
+######################################################################
+##  Language identification using TextCat
+######################################################################
+
+class TextCat(object):
+
+    _corpus = None
+    fingerprints = {}
+    _START_CHAR = "<"
+    _END_CHAR = ">"
+    
+    last_distances = {}
+    
+    def __init__(self):
+        if not re:
+            raise EnvironmentError("classify.textcat requires the regex module that "
+                                   "supports unicode. Try '$ pip install regex' and "
+                                   "see https://pypi.python.org/pypi/regex for "
+                                   "further details.")
+
+        from nltk.corpus import crubadan
+        self._corpus = crubadan
+        # Load all language ngrams into cache
+        for lang in self._corpus.langs():
+            self._corpus.lang_freq(lang)
+        
+    def remove_punctuation(self, text):
+        ''' Get rid of punctuation except apostrophes '''
+        return re.sub(r"[^\P{P}\']+", "", text)
+    
+    def profile(self, text):
+        ''' Create FreqDist of trigrams within text '''
+        from nltk import word_tokenize, FreqDist
+
+        clean_text = self.remove_punctuation(text)
+        tokens = word_tokenize(clean_text)
+        
+        fingerprint = FreqDist()
+        for t in tokens:
+            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
+            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
+
+            for cur_trigram in token_trigrams:
+                if cur_trigram in fingerprint:
+                    fingerprint[cur_trigram] += 1
+                else:
+                    fingerprint[cur_trigram] = 1
+
+        return fingerprint
+        
+    def calc_dist(self, lang, trigram, text_profile):
+        ''' Calculate the "out-of-place" measure between the
+            text and language profile for a single trigram '''
+
+        lang_fd = self._corpus.lang_freq(lang)
+        dist = 0
+
+        if trigram in lang_fd:
+            idx_lang_profile = list(lang_fd.keys()).index(trigram)
+            idx_text = list(text_profile.keys()).index(trigram)
+
+            #print(idx_lang_profile, ", ", idx_text)
+            dist = abs(idx_lang_profile - idx_text) 
+        else:
+            # Arbitrary but should be larger than
+            # any possible trigram file length
+            # in terms of total lines
+            if PY3:
+                dist = maxsize
+            else:
+                dist = maxint
+
+        return dist
+        
+    def lang_dists(self, text):
+        ''' Calculate the "out-of-place" measure between
+            the text and all languages '''
+        
+        distances = {}
+        profile = self.profile(text)
+        # For all the languages
+        for lang in self._corpus._all_lang_freq.keys():
+            # Calculate distance metric for every trigram in
+            # input text to be identified
+            lang_dist = 0
+            for trigram in profile:
+                lang_dist += self.calc_dist(lang, trigram, profile)
+        
+            distances[lang] = lang_dist
+            
+        return distances
+    
+    def guess_language(self, text):
+        ''' Find the language with the min distance
+            to the text and return its ISO 639-3 code '''
+        self.last_distances = self.lang_dists(text)
+        
+        return min(self.last_distances, key=self.last_distances.get)
+
+def demo():
+    from nltk.corpus import udhr
+
+    langs = ['Kurdish-UTF8', 'Abkhaz-UTF8', 'Farsi_Persian-UTF8',
+             'Hindi-UTF8', 'Hawaiian-UTF8', 'Russian-UTF8', 'Vietnamese-UTF8',
+             'Serbian_Srpski-UTF8','Esperanto-UTF8']
+
+    friendly = {'kmr':'Northern Kurdish',
+                'abk':'Abkhazian',
+                'pes':'Iranian Persian',
+                'hin':'Hindi',
+                'haw':'Hawaiian',
+                'rus':'Russian',
+                'vie':'Vietnamese',
+                'srp':'Serbian',
+                'epo':'Esperanto'}
+        
+    tc = TextCat()
+
+    for cur_lang in langs:
+        # Get raw data from UDHR corpus
+        raw_sentences = udhr.sents(cur_lang)
+        rows = len(raw_sentences) - 1
+        cols = list(map(len, raw_sentences))
+
+        sample = ''
+          
+        # Generate a sample text of the language
+        for i in range(0, rows):
+            cur_sent = ''
+            for j in range(0, cols[i]):
+                cur_sent += ' ' + raw_sentences[i][j]
+            
+            sample += cur_sent
+          
+        # Try to detect what it is
+        print('Language snippet: ' + sample[0:140] + '...')
+        guess = tc.guess_language(sample)
+        print('Language detection: %s (%s)' % (guess, friendly[guess]))
+        print('#' * 140)
+
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/classify/textcat.pyc b/nlp_resource_data/nltk/classify/textcat.pyc
new file mode 100755 (executable)
index 0000000..ce0a5cb
Binary files /dev/null and b/nlp_resource_data/nltk/classify/textcat.pyc differ
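The "out-of-place" measure used above compares the rank of each trigram in the text profile with its rank in a language profile, with a large penalty for unseen trigrams. A simplified, self-contained sketch of that idea follows, with invented toy profiles and no dependency on the crubadan corpus or the regex module (this is not the exact code path of TextCat.calc_dist, which falls back to sys.maxsize instead of a fixed penalty):

    from nltk import FreqDist

    def out_of_place(text_profile, lang_profile, penalty=1000):
        # Rank trigrams by frequency in each profile and sum the rank differences;
        # trigrams missing from the language profile receive a fixed penalty.
        text_ranks = {g: r for r, (g, _) in enumerate(text_profile.most_common())}
        lang_ranks = {g: r for r, (g, _) in enumerate(lang_profile.most_common())}
        return sum(abs(r - lang_ranks[g]) if g in lang_ranks else penalty
                   for g, r in text_ranks.items())

    text = FreqDist({'<th': 5, 'the': 4, 'he>': 4, '<an': 1})
    english = FreqDist({'<th': 9, 'the': 8, 'he>': 7, '<of': 3})
    print(out_of_place(text, english))   # small rank distance plus one penalty for '<an'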
diff --git a/nlp_resource_data/nltk/classify/util.py b/nlp_resource_data/nltk/classify/util.py
new file mode 100755 (executable)
index 0000000..bc80933
--- /dev/null
@@ -0,0 +1,324 @@
+# Natural Language Toolkit: Classifier Utility Functions
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com> (minor additions)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Utility functions and classes for classifiers.
+"""
+from __future__ import print_function, division
+
+import math
+
+#from nltk.util import Deprecated
+import nltk.classify.util # for accuracy & log_likelihood
+from nltk.util import LazyMap
+
+######################################################################
+#{ Helper Functions
+######################################################################
+
+# alternative name possibility: 'map_featurefunc()'?
+# alternative name possibility: 'detect_features()'?
+# alternative name possibility: 'map_featuredetect()'?
+# or.. just have users use LazyMap directly?
+def apply_features(feature_func, toks, labeled=None):
+    """
+    Use the ``LazyMap`` class to construct a lazy list-like
+    object that is analogous to ``map(feature_func, toks)``.  In
+    particular, if ``labeled=False``, then the returned list-like
+    object's values are equal to::
+
+        [feature_func(tok) for tok in toks]
+
+    If ``labeled=True``, then the returned list-like object's values
+    are equal to::
+
+        [(feature_func(tok), label) for (tok, label) in toks]
+
+    The primary purpose of this function is to avoid the memory
+    overhead involved in storing all the featuresets for every token
+    in a corpus.  Instead, these featuresets are constructed lazily,
+    as-needed.  The reduction in memory overhead can be especially
+    significant when the underlying list of tokens is itself lazy (as
+    is the case with many corpus readers).
+
+    :param feature_func: The function that will be applied to each
+        token.  It should return a featureset -- i.e., a dict
+        mapping feature names to feature values.
+    :param toks: The list of tokens to which ``feature_func`` should be
+        applied.  If ``labeled=True``, then the list elements will be
+        passed directly to ``feature_func()``.  If ``labeled=False``,
+        then the list elements should be tuples ``(tok,label)``, and
+        ``tok`` will be passed to ``feature_func()``.
+    :param labeled: If true, then ``toks`` contains labeled tokens --
+        i.e., tuples of the form ``(tok, label)``.  (Default:
+        auto-detect based on types.)
+    """
+    if labeled is None:
+        labeled = toks and isinstance(toks[0], (tuple, list))
+    if labeled:
+        def lazy_func(labeled_token):
+            return (feature_func(labeled_token[0]), labeled_token[1])
+        return LazyMap(lazy_func, toks)
+    else:
+        return LazyMap(feature_func, toks)
+
+def attested_labels(tokens):
+    """
+    :return: A tuple of all labels that are attested in the given list
+        of tokens.
+    :rtype: tuple of (immutable)
+    :param tokens: The list of classified tokens from which to extract
+        labels.  A classified token has the form ``(token, label)``.
+    :type tokens: list
+    """
+    return tuple(set(label for (tok, label) in tokens))
+
+def log_likelihood(classifier, gold):
+    results = classifier.prob_classify_many([fs for (fs, l) in gold])
+    ll = [pdist.prob(l) for ((fs, l), pdist) in zip(gold, results)]
+    return math.log(sum(ll) / len(ll))
+
+def accuracy(classifier, gold):
+    results = classifier.classify_many([fs for (fs, l) in gold])
+    correct = [l == r for ((fs, l), r) in zip(gold, results)]
+    if correct:
+        return sum(correct) / len(correct)
+    else:
+        return 0
+
+class CutoffChecker(object):
+    """
+    A helper class that implements cutoff checks based on number of
+    iterations and log likelihood.
+
+    Accuracy cutoffs are also implemented, but they're almost never
+    a good idea to use.
+    """
+    def __init__(self, cutoffs):
+        # Normalise the stored copy rather than mutating the caller's dict.
+        self.cutoffs = cutoffs.copy()
+        if 'min_ll' in cutoffs:
+            self.cutoffs['min_ll'] = -abs(cutoffs['min_ll'])
+        if 'min_lldelta' in cutoffs:
+            self.cutoffs['min_lldelta'] = abs(cutoffs['min_lldelta'])
+        self.ll = None
+        self.acc = None
+        self.iter = 1
+
+    def check(self, classifier, train_toks):
+        cutoffs = self.cutoffs
+        self.iter += 1
+        if 'max_iter' in cutoffs and self.iter >= cutoffs['max_iter']:
+            return True # iteration cutoff.
+
+        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
+        if math.isnan(new_ll):
+            return True
+
+        if 'min_ll' in cutoffs or 'min_lldelta' in cutoffs:
+            if 'min_ll' in cutoffs and new_ll >= cutoffs['min_ll']:
+                return True # log likelihood cutoff
+            if ('min_lldelta' in cutoffs and self.ll and
+                ((new_ll - self.ll) <= abs(cutoffs['min_lldelta']))):
+                return True # log likelihood delta cutoff
+            self.ll = new_ll
+
+        if 'max_acc' in cutoffs or 'min_accdelta' in cutoffs:
+            # Accuracy cutoffs need the accuracy, not the log likelihood.
+            new_acc = nltk.classify.util.accuracy(
+                classifier, train_toks)
+            if 'max_acc' in cutoffs and new_acc >= cutoffs['max_acc']:
+                return True # accuracy cutoff
+            if ('min_accdelta' in cutoffs and self.acc and
+                ((new_acc - self.acc) <= abs(cutoffs['min_accdelta']))):
+                return True # accuracy delta cutoff
+            self.acc = new_acc
+
+        return False # no cutoff reached.
+
+######################################################################
+#{ Demos
+######################################################################
+
+def names_demo_features(name):
+    features = {}
+    features['alwayson'] = True
+    features['startswith'] = name[0].lower()
+    features['endswith'] = name[-1].lower()
+    for letter in 'abcdefghijklmnopqrstuvwxyz':
+        features['count(%s)' % letter] = name.lower().count(letter)
+        features['has(%s)' % letter] = letter in name.lower()
+    return features
+
+def binary_names_demo_features(name):
+    features = {}
+    features['alwayson'] = True
+    features['startswith(vowel)'] = name[0].lower() in 'aeiouy'
+    features['endswith(vowel)'] = name[-1].lower() in 'aeiouy'
+    for letter in 'abcdefghijklmnopqrstuvwxyz':
+        features['count(%s)' % letter] = name.lower().count(letter)
+        features['has(%s)' % letter] = letter in name.lower()
+        features['startswith(%s)' % letter] = (letter == name[0].lower())
+        features['endswith(%s)' % letter] = (letter == name[-1].lower())
+    return features
+
+def names_demo(trainer, features=names_demo_features):
+    from nltk.corpus import names
+    import random
+
+    # Construct a list of classified names, using the names corpus.
+    namelist = ([(name, 'male') for name in names.words('male.txt')] +
+                [(name, 'female') for name in names.words('female.txt')])
+
+    # Randomly split the names into a test & train set.
+    random.seed(123456)
+    random.shuffle(namelist)
+    train = namelist[:5000]
+    test = namelist[5000:5500]
+
+    # Train up a classifier.
+    print('Training classifier...')
+    classifier = trainer( [(features(n), g) for (n, g) in train] )
+
+    # Run the classifier on the test data.
+    print('Testing classifier...')
+    acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
+    print('Accuracy: %6.4f' % acc)
+
+    # For classifiers that can find probabilities, show the log
+    # likelihood and some sample probability distributions.
+    try:
+        test_featuresets = [features(n) for (n, g) in test]
+        pdists = classifier.prob_classify_many(test_featuresets)
+        ll = [pdist.logprob(gold)
+              for ((name, gold), pdist) in zip(test, pdists)]
+        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
+        print()
+        print('Unseen Names      P(Male)  P(Female)\n'+'-'*40)
+        for ((name, gender), pdist) in list(zip(test, pdists))[:5]:
+            if gender == 'male':
+                fmt = '  %-15s *%6.4f   %6.4f'
+            else:
+                fmt = '  %-15s  %6.4f  *%6.4f'
+            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
+    except NotImplementedError:
+        pass
+
+    # Return the classifier
+    return classifier
+
+def partial_names_demo(trainer, features=names_demo_features):
+    from nltk.corpus import names
+    import random
+
+    male_names = names.words('male.txt')
+    female_names = names.words('female.txt')
+
+    random.seed(654321)
+    random.shuffle(male_names)
+    random.shuffle(female_names)
+
+    # Create a list of male names to be used as positive-labeled examples for training
+    positive = map(features, male_names[:2000])
+
+    # Create a list of male and female names to be used as unlabeled examples
+    unlabeled = map(features, male_names[2000:2500] + female_names[:500])
+
+    # Create a test set with correctly-labeled male and female names
+    test = [(name, True) for name in male_names[2500:2750]] \
+        + [(name, False) for name in female_names[500:750]]
+
+    random.shuffle(test)
+
+    # Train up a classifier.
+    print('Training classifier...')
+    classifier = trainer(positive, unlabeled)
+
+    # Run the classifier on the test data.
+    print('Testing classifier...')
+    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
+    print('Accuracy: %6.4f' % acc)
+
+    # For classifiers that can find probabilities, show the log
+    # likelihood and some sample probability distributions.
+    try:
+        test_featuresets = [features(n) for (n, m) in test]
+        pdists = classifier.prob_classify_many(test_featuresets)
+        ll = [pdist.logprob(gold)
+              for ((name, gold), pdist) in zip(test, pdists)]
+        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
+        print()
+        print('Unseen Names      P(Male)  P(Female)\n'+'-'*40)
+        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
+            if is_male:
+                fmt = '  %-15s *%6.4f   %6.4f'
+            else:
+                fmt = '  %-15s  %6.4f  *%6.4f'
+            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
+    except NotImplementedError:
+        pass
+
+    # Return the classifier
+    return classifier
+
+_inst_cache = {}
+def wsd_demo(trainer, word, features, n=1000):
+    from nltk.corpus import senseval
+    import random
+
+    # Get the instances.
+    print('Reading data...')
+    global _inst_cache
+    if word not in _inst_cache:
+        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
+    instances = _inst_cache[word][:]
+    if n > len(instances):
+        n = len(instances)
+    senses = list(set(l for (i, l) in instances))
+    print('  Senses: ' + ' '.join(senses))
+
+    # Randomly split the names into a test & train set.
+    print('Splitting into test & train...')
+    random.seed(123456)
+    random.shuffle(instances)
+    train = instances[:int(.8*n)]
+    test = instances[int(.8*n):n]
+
+    # Train up a classifier.
+    print('Training classifier...')
+    classifier = trainer([(features(i), l) for (i, l) in train])
+
+    # Run the classifier on the test data.
+    print('Testing classifier...')
+    acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
+    print('Accuracy: %6.4f' % acc)
+
+    # For classifiers that can find probabilities, show the log
+    # likelihood and some sample probability distributions.
+    try:
+        test_featuresets = [features(i) for (i, n) in test]
+        pdists = classifier.prob_classify_many(test_featuresets)
+        ll = [pdist.logprob(gold)
+              for ((name, gold), pdist) in zip(test, pdists)]
+        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
+    except NotImplementedError:
+        pass
+
+    # Return the classifier
+    return classifier
+
+
+
+def check_megam_config(self):
+    """
+    Checks whether the MEGAM binary is configured.
+    """
+    try:
+        _megam_bin
+    except NameError:
+        err_msg = ("Please configure your megam binary first, e.g.\n"
+                   ">>> nltk.config_megam('/usr/local/bin/megam')")
+        raise NameError(err_msg)
diff --git a/nlp_resource_data/nltk/classify/util.pyc b/nlp_resource_data/nltk/classify/util.pyc
new file mode 100755 (executable)
index 0000000..9d91693
Binary files /dev/null and b/nlp_resource_data/nltk/classify/util.pyc differ
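A short sketch of the lazy featureset construction that apply_features provides; the feature function and name/label pairs below are invented for illustration:

    from nltk.classify.util import apply_features

    def gender_features(name):
        # Toy feature function.
        return {'last_letter': name[-1].lower()}

    toks = [('Alice', 'female'), ('Bob', 'male')]
    lazy = apply_features(gender_features, toks, labeled=True)
    # Featuresets are only computed when an element is actually accessed.
    print(lazy[0])    # ({'last_letter': 'e'}, 'female')
    print(len(lazy))  # 2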
diff --git a/nlp_resource_data/nltk/classify/weka.py b/nlp_resource_data/nltk/classify/weka.py
new file mode 100755 (executable)
index 0000000..2c0ab4b
--- /dev/null
@@ -0,0 +1,346 @@
+# Natural Language Toolkit: Interface to Weka Classifiers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Classifiers that make use of the external 'Weka' package.
+"""
+from __future__ import print_function
+import time
+import tempfile
+import os
+import subprocess
+import re
+import zipfile
+from sys import stdin
+
+from six import integer_types, string_types
+
+from nltk.probability import DictionaryProbDist
+from nltk.internals import java, config_java
+
+from nltk.classify.api import ClassifierI
+
+_weka_classpath = None
+_weka_search = ['.',
+                '/usr/share/weka',
+                '/usr/local/share/weka',
+                '/usr/lib/weka',
+                '/usr/local/lib/weka',]
+def config_weka(classpath=None):
+    global _weka_classpath
+
+    # Make sure java's configured first.
+    config_java()
+
+    if classpath is not None:
+        _weka_classpath = classpath
+
+    if _weka_classpath is None:
+        searchpath = _weka_search
+        if 'WEKAHOME' in os.environ:
+            searchpath.insert(0, os.environ['WEKAHOME'])
+
+        for path in searchpath:
+            if os.path.exists(os.path.join(path, 'weka.jar')):
+                _weka_classpath = os.path.join(path, 'weka.jar')
+                version = _check_weka_version(_weka_classpath)
+                if version:
+                    print(('[Found Weka: %s (version %s)]' %
+                           (_weka_classpath, version)))
+                else:
+                    print('[Found Weka: %s]' % _weka_classpath)
+
+    if _weka_classpath is None:
+        raise LookupError('Unable to find weka.jar!  Use config_weka() '
+                          'or set the WEKAHOME environment variable. '
+                          'For more information about Weka, please see '
+                          'http://www.cs.waikato.ac.nz/ml/weka/')
+
+def _check_weka_version(jar):
+    try:
+        zf = zipfile.ZipFile(jar)
+    except (SystemExit, KeyboardInterrupt):
+        raise
+    except:
+        return None
+    try:
+        try:
+            return zf.read('weka/core/version.txt')
+        except KeyError:
+            return None
+    finally:
+        zf.close()
+
+class WekaClassifier(ClassifierI):
+    def __init__(self, formatter, model_filename):
+        self._formatter = formatter
+        self._model = model_filename
+
+    def prob_classify_many(self, featuresets):
+        return self._classify_many(featuresets, ['-p', '0', '-distribution'])
+
+    def classify_many(self, featuresets):
+        return self._classify_many(featuresets, ['-p', '0'])
+
+    def _classify_many(self, featuresets, options):
+        # Make sure we can find java & weka.
+        config_weka()
+
+        temp_dir = tempfile.mkdtemp()
+        try:
+            # Write the test data file.
+            test_filename = os.path.join(temp_dir, 'test.arff')
+            self._formatter.write(test_filename, featuresets)
+
+            # Call weka to classify the data.
+            cmd = ['weka.classifiers.bayes.NaiveBayes',
+                   '-l', self._model, '-T', test_filename] + options
+            (stdout, stderr) = java(cmd, classpath=_weka_classpath,
+                                    stdout=subprocess.PIPE,
+                                    stderr=subprocess.PIPE)
+
+            # Check if something went wrong:
+            if stderr and not stdout:
+                if 'Illegal options: -distribution' in stderr:
+                    raise ValueError('The installed version of weka does '
+                                     'not support probability distribution '
+                                     'output.')
+                else:
+                    raise ValueError('Weka failed to generate output:\n%s'
+                                     % stderr)
+
+            # Parse weka's output.
+            return self.parse_weka_output(stdout.decode(stdin.encoding).split('\n'))
+
+        finally:
+            for f in os.listdir(temp_dir):
+                os.remove(os.path.join(temp_dir, f))
+            os.rmdir(temp_dir)
+
+    def parse_weka_distribution(self, s):
+        probs = [float(v) for v in re.split('[*,]+', s) if v.strip()]
+        probs = dict(zip(self._formatter.labels(), probs))
+        return DictionaryProbDist(probs)
+
+    def parse_weka_output(self, lines):
+        # Strip unwanted text from stdout
+        for i,line in enumerate(lines):
+            if line.strip().startswith("inst#"):
+                lines = lines[i:]
+                break
+
+        if lines[0].split() == ['inst#', 'actual', 'predicted',
+                                'error', 'prediction']:
+            return [line.split()[2].split(':')[1]
+                    for line in lines[1:] if line.strip()]
+        elif lines[0].split() == ['inst#', 'actual', 'predicted',
+                                  'error', 'distribution']:
+            return [self.parse_weka_distribution(line.split()[-1])
+                    for line in lines[1:] if line.strip()]
+
+        # Is this safe?
+        elif re.match(r'^0 \w+ [01]\.[0-9]* \?\s*$', lines[0]):
+            return [line.split()[1] for line in lines if line.strip()]
+
+        else:
+            for line in lines[:10]:
+                print(line)
+            raise ValueError('Unhandled output format -- your version '
+                             'of weka may not be supported.\n'
+                             '  Header: %s' % lines[0])
+
+
+    # [xx] full list of classifiers (some may be abstract?):
+    # ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule,
+    # DecisionStump, DecisionTable, HyperPipes, IB1, IBk, Id3, J48,
+    # JRip, KStar, LBR, LeastMedSq, LinearRegression, LMT, Logistic,
+    # LogisticBase, M5Base, MultilayerPerceptron,
+    # MultipleClassifiersCombiner, NaiveBayes, NaiveBayesMultinomial,
+    # NaiveBayesSimple, NBTree, NNge, OneR, PaceRegression, PART,
+    # PreConstructedLinearModel, Prism, RandomForest,
+    # RandomizableClassifier, RandomTree, RBFNetwork, REPTree, Ridor,
+    # RuleNode, SimpleLinearRegression, SimpleLogistic,
+    # SingleClassifierEnhancer, SMO, SMOreg, UserClassifier, VFI,
+    # VotedPerceptron, Winnow, ZeroR
+
+    _CLASSIFIER_CLASS = {
+        'naivebayes': 'weka.classifiers.bayes.NaiveBayes',
+        'C4.5': 'weka.classifiers.trees.J48',
+        'log_regression': 'weka.classifiers.functions.Logistic',
+        'svm': 'weka.classifiers.functions.SMO',
+        'kstar': 'weka.classifiers.lazy.KStar',
+        'ripper': 'weka.classifiers.rules.JRip',
+        }
+    @classmethod
+    def train(cls, model_filename, featuresets,
+              classifier='naivebayes', options=[], quiet=True):
+        # Make sure we can find java & weka.
+        config_weka()
+
+        # Build an ARFF formatter.
+        formatter = ARFF_Formatter.from_train(featuresets)
+
+        temp_dir = tempfile.mkdtemp()
+        try:
+            # Write the training data file.
+            train_filename = os.path.join(temp_dir, 'train.arff')
+            formatter.write(train_filename, featuresets)
+
+            if classifier in cls._CLASSIFIER_CLASS:
+                javaclass = cls._CLASSIFIER_CLASS[classifier]
+            elif classifier in cls._CLASSIFIER_CLASS.values():
+                javaclass = classifier
+            else:
+                raise ValueError('Unknown classifier %s' % classifier)
+
+            # Train the weka model.
+            cmd = [javaclass, '-d', model_filename, '-t', train_filename]
+            cmd += list(options)
+            if quiet:
+                stdout = subprocess.PIPE
+            else: stdout = None
+            java(cmd, classpath=_weka_classpath, stdout=stdout)
+
+            # Return the new classifier.
+            return WekaClassifier(formatter, model_filename)
+
+        finally:
+            for f in os.listdir(temp_dir):
+                os.remove(os.path.join(temp_dir, f))
+            os.rmdir(temp_dir)
+
+
+class ARFF_Formatter:
+    """
+    Converts featuresets and labeled featuresets to ARFF-formatted
+    strings, appropriate for input into Weka.
+
+    Features and classes can be specified manually in the constructor, or may
+    be determined from data using ``from_train``.
+    """
+
+    def __init__(self, labels, features):
+        """
+        :param labels: A list of all class labels that can be generated.
+        :param features: A list of feature specifications, where
+            each feature specification is a tuple (fname, ftype);
+            and ftype is an ARFF type string such as NUMERIC or
+            STRING.
+        """
+        self._labels = labels
+        self._features = features
+
+    def format(self, tokens):
+        """Returns a string representation of ARFF output for the given data."""
+        return self.header_section() + self.data_section(tokens)
+
+    def labels(self):
+        """Returns the list of classes."""
+        return list(self._labels)
+
+    def write(self, outfile, tokens):
+        """Writes ARFF data to a file for the given data."""
+        if not hasattr(outfile, 'write'):
+            outfile = open(outfile, 'w')
+        outfile.write(self.format(tokens))
+        outfile.close()
+
+    @staticmethod
+    def from_train(tokens):
+        """
+        Constructs an ARFF_Formatter instance with class labels and feature
+        types determined from the given data. Handles boolean, numeric and
+        string (note: not nominal) types.
+        """
+        # Find the set of all attested labels.
+        labels = set(label for (tok, label) in tokens)
+
+        # Determine the types of all features.
+        features = {}
+        for tok, label in tokens:
+            for (fname, fval) in tok.items():
+                if issubclass(type(fval), bool):
+                    ftype = '{True, False}'
+                elif issubclass(type(fval), (integer_types, float, bool)):
+                    ftype = 'NUMERIC'
+                elif issubclass(type(fval), string_types):
+                    ftype = 'STRING'
+                elif fval is None:
+                    continue # can't tell the type.
+                else:
+                    raise ValueError('Unsupported value type %r' % type(fval))
+
+                if features.get(fname, ftype) != ftype:
+                    raise ValueError('Inconsistent type for %s' % fname)
+                features[fname] = ftype
+        features = sorted(features.items())
+
+        return ARFF_Formatter(labels, features)
+
+    def header_section(self):
+        """Returns an ARFF header as a string."""
+        # Header comment.
+        s = ('% Weka ARFF file\n' +
+             '% Generated automatically by NLTK\n' +
+             '%% %s\n\n' % time.ctime())
+
+        # Relation name
+        s += '@RELATION rel\n\n'
+
+        # Input attribute specifications
+        for fname, ftype in self._features:
+            s += '@ATTRIBUTE %-30r %s\n' % (fname, ftype)
+
+        # Label attribute specification
+        s += '@ATTRIBUTE %-30r {%s}\n' % ('-label-', ','.join(self._labels))
+
+        return s
+
+    def data_section(self, tokens, labeled=None):
+        """
+        Returns the ARFF data section for the given data.
+
+        :param tokens: a list of featuresets (dicts) or labelled featuresets
+            which are tuples (featureset, label).
+        :param labeled: Indicates whether the given tokens are labeled
+            or not.  If None, then the tokens will be assumed to be
+            labeled if the first token's value is a tuple or list.
+        """
+        # Check if the tokens are labeled or unlabeled.  If unlabeled,
+        # then use 'None'
+        if labeled is None:
+            labeled = tokens and isinstance(tokens[0], (tuple, list))
+        if not labeled:
+            tokens = [(tok, None) for tok in tokens]
+
+        # Data section
+        s = '\n@DATA\n'
+        for (tok, label) in tokens:
+            for fname, ftype in self._features:
+                s += '%s,' % self._fmt_arff_val(tok.get(fname))
+            s += '%s\n' % self._fmt_arff_val(label)
+
+        return s
+
+    def _fmt_arff_val(self, fval):
+        if fval is None:
+            return '?'
+        elif isinstance(fval, (bool, integer_types)):
+            return '%s' % fval
+        elif isinstance(fval, float):
+            return '%r' % fval
+        else:
+            return '%r' % fval
+
+
+if __name__ == '__main__':
+    from nltk.classify.util import names_demo, binary_names_demo_features
+    def make_classifier(featuresets):
+        return WekaClassifier.train('/tmp/name.model', featuresets,
+                                    'C4.5')
+    classifier = names_demo(make_classifier, binary_names_demo_features)
diff --git a/nlp_resource_data/nltk/classify/weka.pyc b/nlp_resource_data/nltk/classify/weka.pyc
new file mode 100755 (executable)
index 0000000..0c25340
Binary files /dev/null and b/nlp_resource_data/nltk/classify/weka.pyc differ
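The WekaClassifier itself needs a Java runtime and weka.jar, but the ARFF_Formatter can be exercised on its own. A minimal sketch with invented featuresets, showing the header and data sections it generates:

    from nltk.classify.weka import ARFF_Formatter

    # Toy featuresets; the feature names are illustrative.
    train = [({'length': 5, 'vowel': True}, 'male'),
             ({'length': 3, 'vowel': False}, 'female')]
    formatter = ARFF_Formatter.from_train(train)
    # Prints the @RELATION/@ATTRIBUTE header followed by the @DATA rows.
    print(formatter.format(train))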
diff --git a/nlp_resource_data/nltk/cluster/__init__.py b/nlp_resource_data/nltk/cluster/__init__.py
new file mode 100755 (executable)
index 0000000..bf37a77
--- /dev/null
@@ -0,0 +1,86 @@
+# Natural Language Toolkit: Clusterers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+This module contains a number of basic clustering algorithms. Clustering
+describes the task of discovering groups of similar items within a large
+collection. It is also described as unsupervised machine learning, as the data
+from which it learns is not annotated with class information, as it would be
+for supervised learning.  Annotated data is difficult and expensive to obtain
+in the quantities required for the majority of supervised learning algorithms.
+This problem, the knowledge acquisition bottleneck, is common to most natural
+language processing tasks, thus fueling the need for quality unsupervised
+approaches.
+
+This module contains a k-means clusterer, E-M clusterer and a group average
+agglomerative clusterer (GAAC). All these clusterers involve finding good
+cluster groupings for a set of vectors in multi-dimensional space.
+
+The K-means clusterer starts with k arbitrarily chosen means, then allocates each
+vector to the cluster with the closest mean. It then recalculates the means of
+each cluster as the centroid of the vectors in the cluster. This process
+repeats until the cluster memberships stabilise. This is a hill-climbing
+algorithm which may converge to a local maximum. Hence the clustering is
+often repeated with random initial means and the most commonly occurring
+output means are chosen.
+
+The GAAC clusterer starts with each of the *N* vectors as singleton clusters.
+It then iteratively merges pairs of clusters which have the closest centroids.
+This continues until there is only one cluster. The order of merges gives rise
+to a dendrogram - a tree with the earlier merges lower than later merges. The
+membership of a given number of clusters *c*, *1 <= c <= N*, can be found by
+cutting the dendrogram at depth *c*.
+
+The Gaussian EM clusterer models the vectors as being produced by a mixture
+of k Gaussian sources. The parameters of these sources (prior probability,
+mean and covariance matrix) are then found to maximise the likelihood of the
+given data. This is done with the expectation maximisation algorithm. It
+starts with k arbitrarily chosen means, priors and covariance matrices. It
+then calculates the membership probabilities for each vector in each of the
+clusters - this is the 'E' step. The cluster parameters are then updated in
+the 'M' step using the maximum likelihood estimate from the cluster membership
+probabilities. This process continues until the likelihood of the data does
+not significantly increase.
+
+They all extend the ClusterI interface, which defines the common operations
+available with each clusterer. These operations include:
+   - cluster: clusters a sequence of vectors
+   - classify: assigns a vector to a cluster
+   - classification_probdist: gives the probability distribution over cluster memberships
+
+The existing clusterers also extend cluster.VectorSpaceClusterer, an
+abstract class which allows for singular value decomposition (SVD) and vector
+normalisation. SVD is used to reduce the dimensionality of the vector space in
+such a manner as to preserve as much of the variation as possible, by
+reparameterising the axes in order of variability and discarding all bar the
+first d dimensions. Normalisation ensures that vectors fall in the unit
+hypersphere.
+
+Usage example (see also demo())::
+
+    from nltk import cluster
+    from nltk.cluster import euclidean_distance
+    from numpy import array
+
+    vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
+
+    # initialise the clusterer (will also assign the vectors to clusters)
+    clusterer = cluster.KMeansClusterer(2, euclidean_distance)
+    clusterer.cluster(vectors, True)
+
+    # classify a new vector
+    print(clusterer.classify(array([3, 3])))
+
+Note that the vectors must use numpy array-like
+objects. nltk_contrib.unimelb.tacohn.SparseArrays may be used for
+efficiency when required.
+"""
+
+from nltk.cluster.util import (VectorSpaceClusterer, Dendrogram,
+                               euclidean_distance, cosine_distance)
+from nltk.cluster.kmeans import KMeansClusterer
+from nltk.cluster.gaac import GAAClusterer
+from nltk.cluster.em import EMClusterer
diff --git a/nlp_resource_data/nltk/cluster/__init__.pyc b/nlp_resource_data/nltk/cluster/__init__.pyc
new file mode 100755 (executable)
index 0000000..9ed4138
Binary files /dev/null and b/nlp_resource_data/nltk/cluster/__init__.pyc differ
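Alongside the k-means usage shown in the module docstring, the distance helpers re-exported here can be used directly; a short sketch with invented vectors:

    from numpy import array
    from nltk.cluster import euclidean_distance, cosine_distance

    u, v = array([3.0, 3.0]), array([1.0, 2.0])
    # euclidean_distance is the L2 distance; cosine_distance is 1 - cos(angle(u, v)).
    print(euclidean_distance(u, v))   # sqrt(5), roughly 2.236
    print(cosine_distance(u, v))      # roughly 0.051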
diff --git a/nlp_resource_data/nltk/cluster/api.py b/nlp_resource_data/nltk/cluster/api.py
new file mode 100755 (executable)
index 0000000..8679324
--- /dev/null
@@ -0,0 +1,74 @@
+# Natural Language Toolkit: Clusterer Interfaces
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
+# Porting: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
+from nltk.probability import DictionaryProbDist
+
+@add_metaclass(ABCMeta)
+class ClusterI(object):
+    """
+    Interface covering basic clustering functionality.
+    """
+    @abstractmethod
+    def cluster(self, vectors, assign_clusters=False):
+        """
+        Assigns the vectors to clusters, learning the clustering parameters
+        from the data. Returns a cluster identifier for each vector.
+        """
+
+    @abstractmethod
+    def classify(self, token):
+        """
+        Classifies the token into a cluster, setting the token's CLUSTER
+        parameter to that cluster identifier.
+        """
+
+    def likelihood(self, vector, label):
+        """
+        Returns the likelihood (a float) of the token having the
+        corresponding cluster.
+        """
+        if self.classify(vector) == label:
+            return 1.0
+        else:
+            return 0.0
+
+    def classification_probdist(self, vector):
+        """
+        Classifies the token into a cluster, returning
+        a probability distribution over the cluster identifiers.
+        """
+        likelihoods = {}
+        sum = 0.0
+        for cluster in self.cluster_names():
+            likelihoods[cluster] = self.likelihood(vector, cluster)
+            sum += likelihoods[cluster]
+        for cluster in self.cluster_names():
+            likelihoods[cluster] /= sum
+        return DictionaryProbDist(likelihoods)
+
+    @abstractmethod
+    def num_clusters(self):
+        """
+        Returns the number of clusters.
+        """
+
+    def cluster_names(self):
+        """
+        Returns the names of the clusters.
+        :rtype: list
+        """
+        return list(range(self.num_clusters()))
+
+    def cluster_name(self, index):
+        """
+        Returns the names of the cluster at index.
+        """
+        return index
diff --git a/nlp_resource_data/nltk/cluster/api.pyc b/nlp_resource_data/nltk/cluster/api.pyc
new file mode 100755 (executable)
index 0000000..68a9bac
Binary files /dev/null and b/nlp_resource_data/nltk/cluster/api.pyc differ
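classification_probdist above simply normalises the per-cluster likelihoods, so any subclass that implements cluster, classify and num_clusters inherits it for free. A toy subclass as a sketch; the thresholding rule is invented purely for illustration:

    from nltk.cluster.api import ClusterI

    class SignClusterer(ClusterI):
        """Toy clusterer: cluster 0 if the first coordinate is negative, else 1."""

        def cluster(self, vectors, assign_clusters=False):
            return [self.classify(v) for v in vectors]

        def classify(self, vector):
            return 0 if vector[0] < 0 else 1

        def num_clusters(self):
            return 2

    c = SignClusterer()
    print(c.cluster([[-1, 0], [2, 5]]))                # [0, 1]
    print(c.classification_probdist([2, 5]).prob(1))   # 1.0 (all mass on cluster 1)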
diff --git a/nlp_resource_data/nltk/cluster/em.py b/nlp_resource_data/nltk/cluster/em.py
new file mode 100755 (executable)
index 0000000..54b42f5
--- /dev/null
@@ -0,0 +1,247 @@
+# Natural Language Toolkit: Expectation Maximization Clusterer
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+try:
+    import numpy
+except ImportError:
+    pass
+
+from nltk.compat import python_2_unicode_compatible
+from nltk.cluster.util import VectorSpaceClusterer
+
+@python_2_unicode_compatible
+class EMClusterer(VectorSpaceClusterer):
+    """
+    The Gaussian EM clusterer models the vectors as being produced by
+    a mixture of k Gaussian sources. The parameters of these sources
+    (prior probability, mean and covariance matrix) are then found to
+    maximise the likelihood of the given data. This is done with the
+    expectation maximisation algorithm. It starts with k arbitrarily
+    chosen means, priors and covariance matrices. It then calculates
+    the membership probabilities for each vector in each of the
+    clusters; this is the 'E' step. The cluster parameters are then
+    updated in the 'M' step using the maximum likelihood estimate from
+    the cluster membership probabilities. This process continues until
+    the likelihood of the data does not significantly increase.
+    """
+
+    def __init__(self, initial_means, priors=None, covariance_matrices=None,
+                       conv_threshold=1e-6, bias=0.1, normalise=False,
+                       svd_dimensions=None):
+        """
+        Creates an EM clusterer with the given starting parameters,
+        convergence threshold and vector mangling parameters.
+
+        :param  initial_means: the means of the gaussian cluster centers
+        :type   initial_means: [seq of] numpy array or seq of SparseArray
+        :param  priors: the prior probability for each cluster
+        :type   priors: numpy array or seq of float
+        :param  covariance_matrices: the covariance matrix for each cluster
+        :type   covariance_matrices: [seq of] numpy array
+        :param  conv_threshold: maximum change in likelihood before deemed
+                    convergent
+        :type   conv_threshold: int or float
+        :param  bias: variance bias used to ensure non-singular covariance
+                      matrices
+        :type   bias: float
+        :param  normalise:  should vectors be normalised to length 1
+        :type   normalise:  boolean
+        :param  svd_dimensions: number of dimensions to use in reducing vector
+                               dimensionality with SVD
+        :type   svd_dimensions: int
+        """
+        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
+        self._means = numpy.array(initial_means, numpy.float64)
+        self._num_clusters = len(initial_means)
+        self._conv_threshold = conv_threshold
+        self._covariance_matrices = covariance_matrices
+        self._priors = priors
+        self._bias = bias
+
+    def num_clusters(self):
+        return self._num_clusters
+
+    def cluster_vectorspace(self, vectors, trace=False):
+        assert len(vectors) > 0
+
+        # set the parameters to initial values
+        dimensions = len(vectors[0])
+        means = self._means
+        priors = self._priors
+        if not priors:
+            priors = self._priors = numpy.ones(self._num_clusters,
+                                        numpy.float64) / self._num_clusters
+        covariances = self._covariance_matrices
+        if not covariances:
+            covariances = self._covariance_matrices = \
+                [ numpy.identity(dimensions, numpy.float64)
+                  for i in range(self._num_clusters) ]
+
+        # do the E and M steps until the likelihood plateaus
+        lastl = self._loglikelihood(vectors, priors, means, covariances)
+        converged = False
+
+        while not converged:
+            if trace: print('iteration; loglikelihood', lastl)
+            # E-step, calculate hidden variables, h[i,j]
+            h = numpy.zeros((len(vectors), self._num_clusters),
+                numpy.float64)
+            for i in range(len(vectors)):
+                for j in range(self._num_clusters):
+                    h[i,j] = priors[j] * self._gaussian(means[j],
+                                               covariances[j], vectors[i])
+                h[i,:] /= sum(h[i,:])
+
+            # M-step, update parameters - cvm, p, mean
+            for j in range(self._num_clusters):
+                covariance_before = covariances[j]
+                new_covariance = numpy.zeros((dimensions, dimensions),
+                            numpy.float64)
+                new_mean = numpy.zeros(dimensions, numpy.float64)
+                sum_hj = 0.0
+                for i in range(len(vectors)):
+                    delta = vectors[i] - means[j]
+                    new_covariance += h[i,j] * \
+                        numpy.multiply.outer(delta, delta)
+                    sum_hj += h[i,j]
+                    new_mean += h[i,j] * vectors[i]
+                covariances[j] = new_covariance / sum_hj
+                means[j] = new_mean / sum_hj
+                priors[j] = sum_hj / len(vectors)
+
+                # bias term to stop covariance matrix being singular
+                covariances[j] += self._bias * \
+                    numpy.identity(dimensions, numpy.float64)
+
+            # calculate likelihood - FIXME: may be broken
+            l = self._loglikelihood(vectors, priors, means, covariances)
+
+            # check for convergence
+            if abs(lastl - l) < self._conv_threshold:
+                converged = True
+            lastl = l
+
+    def classify_vectorspace(self, vector):
+        best = None
+        for j in range(self._num_clusters):
+            p = self._priors[j] * self._gaussian(self._means[j],
+                                    self._covariance_matrices[j], vector)
+            if not best or p > best[0]:
+                best = (p, j)
+        return best[1]
+
+    def likelihood_vectorspace(self, vector, cluster):
+        # Index by the resolved cluster id rather than the raw cluster label.
+        cid = self.cluster_names().index(cluster)
+        return self._priors[cid] * self._gaussian(self._means[cid],
+                                self._covariance_matrices[cid], vector)
+
+    def _gaussian(self, mean, cvm, x):
+        m = len(mean)
+        assert cvm.shape == (m, m), \
+            'bad sized covariance matrix, %s' % str(cvm.shape)
+        try:
+            det = numpy.linalg.det(cvm)
+            inv = numpy.linalg.inv(cvm)
+            a = det ** -0.5 * (2 * numpy.pi) ** (-m / 2.0)
+            dx = x - mean
+            b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
+            return a * numpy.exp(b)
+        except OverflowError:
+            # happens when the exponent is negative infinity - i.e. b = 0
+            # i.e. the inverse of cvm is huge (cvm is almost zero)
+            return 0
+
+    def _loglikelihood(self, vectors, priors, means, covariances):
+        llh = 0.0
+        for vector in vectors:
+            p = 0
+            for j in range(len(priors)):
+                p += priors[j] * \
+                         self._gaussian(means[j], covariances[j], vector)
+            llh += numpy.log(p)
+        return llh
+
+    def __repr__(self):
+        return '<EMClusterer means=%s>' % list(self._means)
+
+def demo():
+    """
+    Non-interactive demonstration of the clusterers with simple 2-D data.
+    """
+
+    from nltk import cluster
+
+    # example from figure 14.10, page 519, Manning and Schutze
+
+    vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
+    means = [[4, 2], [4, 2.01]]
+
+    clusterer = cluster.EMClusterer(means, bias=0.1)
+    clusters = clusterer.cluster(vectors, True, trace=True)
+
+    print('Clustered:', vectors)
+    print('As:       ', clusters)
+    print()
+
+    for c in range(2):
+        print('Cluster:', c)
+        print('Prior:  ', clusterer._priors[c])
+        print('Mean:   ', clusterer._means[c])
+        print('Covar:  ', clusterer._covariance_matrices[c])
+        print()
+
+    # classify a new vector
+    vector = numpy.array([2, 2])
+    print('classify(%s):' % vector, end=' ')
+    print(clusterer.classify(vector))
+
+    # show the classification probabilities
+    vector = numpy.array([2, 2])
+    print('classification_probdist(%s):' % vector)
+    pdist = clusterer.classification_probdist(vector)
+    for sample in pdist.samples():
+        print('%s => %.0f%%' % (sample,
+                    pdist.prob(sample) *100))
+
+#
+#     The following demo code is broken.
+#
+#     # use a set of tokens with 2D indices
+#     vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
+
+#     # test the EM clusterer with means given by k-means (2) and
+#     # dimensionality reduction
+#     clusterer = cluster.KMeans(2, euclidean_distance, svd_dimensions=1)
+#     print 'Clusterer:', clusterer
+#     clusters = clusterer.cluster(vectors)
+#     means = clusterer.means()
+#     print 'Means:', clusterer.means()
+#     print
+
+#     clusterer = cluster.EMClusterer(means, svd_dimensions=1)
+#     clusters = clusterer.cluster(vectors, True)
+#     print 'Clusterer:', clusterer
+#     print 'Clustered:', str(vectors)[:60], '...'
+#     print 'As:', str(clusters)[:60], '...'
+#     print
+
+#     # classify a new vector
+#     vector = numpy.array([3, 3])
+#     print 'classify(%s):' % vector,
+#     print clusterer.classify(vector)
+#     print
+
+#     # show the classification probabilities
+#     vector = numpy.array([2.2, 2])
+#     print 'classification_probdist(%s)' % vector
+#     pdist = clusterer.classification_probdist(vector)
+#     for sample in pdist:
+#         print '%s => %.0f%%' % (sample, pdist.prob(sample) *100)
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/cluster/em.pyc b/nlp_resource_data/nltk/cluster/em.pyc
new file mode 100755 (executable)
index 0000000..eeabc37
Binary files /dev/null and b/nlp_resource_data/nltk/cluster/em.pyc differ
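The _gaussian helper evaluates a multivariate normal density. As a quick, standalone numpy check of that formula (values invented; this mirrors the expression used inside EMClusterer rather than calling it):

    import numpy

    def gaussian_density(mean, cvm, x):
        # N(x; mean, cvm) = |cvm|**-0.5 * (2*pi)**(-m/2) * exp(-0.5 * (x-mean)' inv(cvm) (x-mean))
        m = len(mean)
        det = numpy.linalg.det(cvm)
        inv = numpy.linalg.inv(cvm)
        dx = x - mean
        return det ** -0.5 * (2 * numpy.pi) ** (-m / 2.0) * numpy.exp(-0.5 * dx.dot(inv).dot(dx))

    mean = numpy.array([0.0, 0.0])
    cvm = numpy.identity(2)
    print(gaussian_density(mean, cvm, numpy.array([0.0, 0.0])))   # 1 / (2*pi), roughly 0.159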
diff --git a/nlp_resource_data/nltk/cluster/gaac.py b/nlp_resource_data/nltk/cluster/gaac.py
new file mode 100755 (executable)
index 0000000..2ec63c4
--- /dev/null
@@ -0,0 +1,168 @@
+# Natural Language Toolkit: Group Average Agglomerative Clusterer
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
+
+try:
+    import numpy
+except ImportError:
+    pass
+
+from nltk.cluster.util import VectorSpaceClusterer, Dendrogram, cosine_distance
+from nltk.compat import python_2_unicode_compatible
+
+@python_2_unicode_compatible
+class GAAClusterer(VectorSpaceClusterer):
+    """
+    The Group Average Agglomerative Clusterer (GAAC) starts with each of the N vectors as singleton
+    clusters. It then iteratively merges pairs of clusters which have the
+    closest centroids.  This continues until there is only one cluster. The
+    order of merges gives rise to a dendrogram: a tree with the earlier merges
+    lower than later merges. The membership of a given number of clusters c, 1
+    <= c <= N, can be found by cutting the dendrogram at depth c.
+
+    This clusterer uses the cosine similarity metric only, which allows for
+    efficient speed-up in the clustering process.
+    """
+
+    def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
+        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
+        self._num_clusters = num_clusters
+        self._dendrogram = None
+        self._groups_values = None
+
+    def cluster(self, vectors, assign_clusters=False, trace=False):
+        # stores the merge order
+        self._dendrogram = Dendrogram(
+            [numpy.array(vector, numpy.float64) for vector in vectors])
+        return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, trace)
+
+    def cluster_vectorspace(self, vectors, trace=False):
+        # variables describing the initial situation
+        N = len(vectors)
+        cluster_len = [1]*N
+        cluster_count = N
+        index_map = numpy.arange(N)
+
+        # construct the similarity matrix
+        dims = (N, N)
+        dist = numpy.ones(dims, dtype=float) * numpy.inf
+        for i in range(N):
+            for j in range(i+1, N):
+                dist[i, j] = cosine_distance(vectors[i], vectors[j])
+
+        while cluster_count > max(self._num_clusters, 1):
+            i, j = numpy.unravel_index(dist.argmin(), dims)
+            if trace:
+                print("merging %d and %d" % (i, j))
+
+            # update similarities for merging i and j
+            self._merge_similarities(dist, cluster_len, i, j)
+
+            # remove j
+            dist[:, j] = numpy.inf
+            dist[j, :] = numpy.inf
+
+            # merge the clusters
+            cluster_len[i] = cluster_len[i]+cluster_len[j]
+            self._dendrogram.merge(index_map[i], index_map[j])
+            cluster_count -= 1
+
+            # update the index map to reflect the indexes if we
+            # had removed j
+            index_map[j+1:] -= 1
+            index_map[j] = N
+
+        self.update_clusters(self._num_clusters)
+
+    def _merge_similarities(self, dist, cluster_len, i, j):
+        # the new cluster i merged from i and j adopts the average of
+        # i and j's similarity to each other cluster, weighted by the
+        # number of points in the clusters i and j
+        i_weight = cluster_len[i]
+        j_weight = cluster_len[j]
+        weight_sum = i_weight+j_weight
+
+        # update for x<i
+        dist[:i, i] = dist[:i, i]*i_weight + dist[:i, j]*j_weight
+        dist[:i, i] /= weight_sum
+        # update for i<x<j
+        dist[i, i+1:j] = dist[i, i+1:j]*i_weight + dist[i+1:j, j]*j_weight
+        # update for i<j<x
+        dist[i, j+1:] = dist[i, j+1:]*i_weight + dist[j, j+1:]*j_weight
+        dist[i, i+1:] /= weight_sum
+
+    def update_clusters(self, num_clusters):
+        clusters = self._dendrogram.groups(num_clusters)
+        self._centroids = []
+        for cluster in clusters:
+            assert len(cluster) > 0
+            if self._should_normalise:
+                centroid = self._normalise(cluster[0])
+            else:
+                centroid = numpy.array(cluster[0])
+            for vector in cluster[1:]:
+                if self._should_normalise:
+                    centroid += self._normalise(vector)
+                else:
+                    centroid += vector
+            centroid /= len(cluster)
+            self._centroids.append(centroid)
+        self._num_clusters = len(self._centroids)
+
+    def classify_vectorspace(self, vector):
+        best = None
+        for i in range(self._num_clusters):
+            centroid = self._centroids[i]
+            dist = cosine_distance(vector, centroid)
+            if not best or dist < best[0]:
+                best = (dist, i)
+        return best[1]
+
+    def dendrogram(self):
+        """
+        :return: The dendrogram representing the current clustering
+        :rtype:  Dendrogram
+        """
+        return self._dendrogram
+
+    def num_clusters(self):
+        return self._num_clusters
+
+    def __repr__(self):
+        return '<GroupAverageAgglomerative Clusterer n=%d>' % self._num_clusters
+
+def demo():
+    """
+    Non-interactive demonstration of the clusterers with simple 2-D data.
+    """
+
+    from nltk.cluster import GAAClusterer
+
+    # use a set of tokens with 2D indices
+    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
+
+    # test the GAAC clusterer with 4 clusters
+    clusterer = GAAClusterer(4)
+    clusters = clusterer.cluster(vectors, True)
+
+    print('Clusterer:', clusterer)
+    print('Clustered:', vectors)
+    print('As:', clusters)
+    print()
+
+    # show the dendrogram
+    clusterer.dendrogram().show()
+
+    # classify a new vector
+    vector = numpy.array([3, 3])
+    print('classify(%s):' % vector, end=' ')
+    print(clusterer.classify(vector))
+    print()
+
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/cluster/gaac.pyc b/nlp_resource_data/nltk/cluster/gaac.pyc
new file mode 100755 (executable)
index 0000000..db3db2d
Binary files /dev/null and b/nlp_resource_data/nltk/cluster/gaac.pyc differ
diff --git a/nlp_resource_data/nltk/cluster/kmeans.py b/nlp_resource_data/nltk/cluster/kmeans.py
new file mode 100755 (executable)
index 0000000..2da6c7c
--- /dev/null
@@ -0,0 +1,221 @@
+# Natural Language Toolkit: K-Means Clusterer
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
+
+import copy
+import random
+import sys
+
+try:
+    import numpy
+except ImportError:
+    pass
+
+
+from nltk.cluster.util import VectorSpaceClusterer
+from nltk.compat import python_2_unicode_compatible
+
+
+@python_2_unicode_compatible
+class KMeansClusterer(VectorSpaceClusterer):
+    """
+    The K-means clusterer starts with k arbitrarily chosen means then allocates
+    each vector to the cluster with the closest mean. It then recalculates the
+    means of each cluster as the centroid of the vectors in the cluster. This
+    process repeats until the cluster memberships stabilise. This is a
+    hill-climbing algorithm which may converge to a local maximum. Hence the
+    clustering is often repeated with random initial means and the most
+    commonly occurring output means are chosen.
+    """
+
+    def __init__(self, num_means, distance, repeats=1,
+                       conv_test=1e-6, initial_means=None,
+                       normalise=False, svd_dimensions=None,
+                       rng=None, avoid_empty_clusters=False):
+
+        """
+        :param  num_means:  the number of means to use (may use fewer)
+        :type   num_means:  int
+        :param  distance:   measure of distance between two vectors
+        :type   distance:   function taking two vectors and returning a float
+        :param  repeats:    number of randomised clustering trials to use
+        :type   repeats:    int
+        :param  conv_test:  maximum variation in mean differences before
+                            being deemed convergent
+        :type   conv_test:  number
+        :param  initial_means: set of k initial means
+        :type   initial_means: sequence of vectors
+        :param  normalise:  should vectors be normalised to length 1
+        :type   normalise:  boolean
+        :param svd_dimensions: number of dimensions to use in reducing vector
+                               dimensionality with SVD
+        :type svd_dimensions: int
+        :param  rng:        random number generator (or None)
+        :type   rng:        Random
+        :param avoid_empty_clusters: include current centroid in computation
+                                     of next one; avoids undefined behavior
+                                     when clusters become empty
+        :type avoid_empty_clusters: boolean
+        """
+        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
+        self._num_means = num_means
+        self._distance = distance
+        self._max_difference = conv_test
+        assert not initial_means or len(initial_means) == num_means
+        self._means = initial_means
+        assert repeats >= 1
+        assert not (initial_means and repeats > 1)
+        self._repeats = repeats
+        self._rng = (rng if rng else random.Random())
+        self._avoid_empty_clusters = avoid_empty_clusters
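+        # Typical construction (editor's sketch mirroring the demo below, not
+        # upstream NLTK code):
+        #     clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
+        #     assignments = clusterer.cluster(vectors, assign_clusters=True)
+        # where euclidean_distance comes from nltk.cluster.util and vectors is
+        # a list of numpy arrays.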
+
+    def cluster_vectorspace(self, vectors, trace=False):
+        if self._means and self._repeats > 1:
+            print('Warning: means will be discarded for subsequent trials')
+
+        meanss = []
+        for trial in range(self._repeats):
+            if trace: print('k-means trial', trial)
+            if not self._means or trial > 1:
+                self._means = self._rng.sample(list(vectors), self._num_means)
+            self._cluster_vectorspace(vectors, trace)
+            meanss.append(self._means)
+
+        if len(meanss) > 1:
+            # sort the means first (so that different cluster numbering won't
+            # affect the distance comparison)
+            for means in meanss:
+                means.sort(key=sum)
+
+            # find the set of means that's minimally different from the others
+            min_difference = min_means = None
+            for i in range(len(meanss)):
+                d = 0
+                for j in range(len(meanss)):
+                    if i != j:
+                        d += self._sum_distances(meanss[i], meanss[j])
+                if min_difference is None or d < min_difference:
+                    min_difference, min_means = d, meanss[i]
+
+            # use the best means
+            self._means = min_means
+
+    def _cluster_vectorspace(self, vectors, trace=False):
+        if self._num_means < len(vectors):
+            # perform k-means clustering
+            converged = False
+            while not converged:
+                # assign the tokens to clusters based on minimum distance to
+                # the cluster means
+                clusters = [[] for m in range(self._num_means)]
+                for vector in vectors:
+                    index = self.classify_vectorspace(vector)
+                    clusters[index].append(vector)
+
+                if trace: print('iteration')
+                #for i in range(self._num_means):
+                    #print '  mean', i, 'allocated', len(clusters[i]), 'vectors'
+
+                # recalculate cluster means by computing the centroid of each cluster
+                new_means = list(map(self._centroid, clusters, self._means))
+
+                # measure the degree of change from the previous step for convergence
+                difference = self._sum_distances(self._means, new_means)
+                if difference < self._max_difference:
+                    converged = True
+
+                # remember the new means
+                self._means = new_means
+
+    def classify_vectorspace(self, vector):
+        # finds the closest cluster centroid
+        # returns that cluster's index
+        best_distance = best_index = None
+        for index in range(len(self._means)):
+            mean = self._means[index]
+            dist = self._distance(vector, mean)
+            if best_distance is None or dist < best_distance:
+                best_index, best_distance = index, dist
+        return best_index
+
+    def num_clusters(self):
+        if self._means:
+            return len(self._means)
+        else:
+            return self._num_means
+
+    def means(self):
+        """
+        The means used for clustering.
+        """
+        return self._means
+
+    def _sum_distances(self, vectors1, vectors2):
+        difference = 0.0
+        for u, v in zip(vectors1, vectors2):
+            difference += self._distance(u, v)
+        return difference
+
+    def _centroid(self, cluster, mean):
+        if self._avoid_empty_clusters:
+            centroid = copy.copy(mean)
+            for vector in cluster:
+                centroid += vector
+            return centroid / (1+len(cluster))
+        else:
+            if not len(cluster):
+                sys.stderr.write('Error: no centroid defined for empty cluster.\n')
+                sys.stderr.write('Try setting argument \'avoid_empty_clusters\' to True\n')
+                assert(False)
+            centroid = copy.copy(cluster[0])
+            for vector in cluster[1:]:
+                centroid += vector
+            return centroid / len(cluster)
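+            # e.g. (editor's illustration) the centroid of [[0, 0], [2, 2]] is
+            # [1, 1]; with avoid_empty_clusters=True the previous mean counts as
+            # one extra point, so an empty cluster simply keeps its old mean.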
+
+    def __repr__(self):
+        return '<KMeansClusterer means=%s repeats=%d>' % \
+                    (self._means, self._repeats)
+
+#################################################################################
+
+def demo():
+    # example from figure 14.9, page 517, Manning and Schutze
+
+    from nltk.cluster import KMeansClusterer, euclidean_distance
+
+    vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
+    means = [[4, 3], [5, 5]]
+
+    clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
+    clusters = clusterer.cluster(vectors, True, trace=True)
+
+    print('Clustered:', vectors)
+    print('As:', clusters)
+    print('Means:', clusterer.means())
+    print()
+
+    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
+
+    # test k-means using the euclidean distance metric, 2 means and repeat
+    # clustering 10 times with random seeds
+
+    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
+    clusters = clusterer.cluster(vectors, True)
+    print('Clustered:', vectors)
+    print('As:', clusters)
+    print('Means:', clusterer.means())
+    print()
+
+    # classify a new vector
+    vector = numpy.array([3, 3])
+    print('classify(%s):' % vector, end=' ')
+    print(clusterer.classify(vector))
+    print()
+
+if __name__ == '__main__':
+    demo()
+
diff --git a/nlp_resource_data/nltk/cluster/kmeans.pyc b/nlp_resource_data/nltk/cluster/kmeans.pyc
new file mode 100755 (executable)
index 0000000..bc5fc15
Binary files /dev/null and b/nlp_resource_data/nltk/cluster/kmeans.pyc differ
diff --git a/nlp_resource_data/nltk/cluster/util.py b/nlp_resource_data/nltk/cluster/util.py
new file mode 100755 (executable)
index 0000000..08930aa
--- /dev/null
@@ -0,0 +1,304 @@
+# Natural Language Toolkit: Clusterer Utilities
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
+# Contributor: J Richard Snape
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
+from abc import abstractmethod
+
+import copy
+from sys import stdout
+from math import sqrt
+
+try:
+    import numpy
+except ImportError:
+    pass
+
+from nltk.cluster.api import ClusterI
+from nltk.compat import python_2_unicode_compatible
+
+
+class VectorSpaceClusterer(ClusterI):
+    """
+    Abstract clusterer which takes tokens and maps them into a vector space.
+    Optionally performs singular value decomposition to reduce the
+    dimensionality.
+    """
+    def __init__(self, normalise=False, svd_dimensions=None):
+        """
+        :param normalise:       should vectors be normalised to length 1
+        :type normalise:        boolean
+        :param svd_dimensions:  number of dimensions to use in reducing vector
+                                dimensionality with SVD
+        :type svd_dimensions:   int
+        """
+        self._Tt = None
+        self._should_normalise = normalise
+        self._svd_dimensions = svd_dimensions
+
+    def cluster(self, vectors, assign_clusters=False, trace=False):
+        assert len(vectors) > 0
+
+        # normalise the vectors
+        if self._should_normalise:
+            vectors = list(map(self._normalise, vectors))
+
+        # use SVD to reduce the dimensionality
+        if self._svd_dimensions and self._svd_dimensions < len(vectors[0]):
+            [u, d, vt] = numpy.linalg.svd(numpy.transpose(
+                            numpy.array(vectors)))
+            S = d[:self._svd_dimensions] * \
+                numpy.identity(self._svd_dimensions, numpy.float64)
+            T = u[:, :self._svd_dimensions]
+            Dt = vt[:self._svd_dimensions, :]
+            vectors = numpy.transpose(numpy.dot(S, Dt))
+            self._Tt = numpy.transpose(T)
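+            # (editor's note) self._Tt holds the transposed, truncated left
+            # singular vectors; classify(), likelihood() and vector() below use
+            # it to project new vectors into the same reduced space.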
+
+        # call abstract method to cluster the vectors
+        self.cluster_vectorspace(vectors, trace)
+
+        # assign the vectors to clusters
+        if assign_clusters:
+            return [self.classify(vector) for vector in vectors]
+
+    @abstractmethod
+    def cluster_vectorspace(self, vectors, trace):
+        """
+        Finds the clusters using the given set of vectors.
+        """
+
+    def classify(self, vector):
+        if self._should_normalise:
+            vector = self._normalise(vector)
+        if self._Tt is not None:
+            vector = numpy.dot(self._Tt, vector)
+        cluster = self.classify_vectorspace(vector)
+        return self.cluster_name(cluster)
+
+    @abstractmethod
+    def classify_vectorspace(self, vector):
+        """
+        Returns the index of the appropriate cluster for the vector.
+        """
+
+    def likelihood(self, vector, label):
+        if self._should_normalise:
+            vector = self._normalise(vector)
+        if self._Tt is not None:
+            vector = numpy.dot(self._Tt, vector)
+        return self.likelihood_vectorspace(vector, label)
+
+    def likelihood_vectorspace(self, vector, cluster):
+        """
+        Returns the likelihood of the vector belonging to the cluster.
+        """
+        predicted = self.classify_vectorspace(vector)
+        return (1.0 if cluster == predicted else 0.0)
+
+    def vector(self, vector):
+        """
+        Returns the vector after normalisation and dimensionality reduction
+        """
+        if self._should_normalise:
+            vector = self._normalise(vector)
+        if self._Tt is not None:
+            vector = numpy.dot(self._Tt, vector)
+        return vector
+
+    def _normalise(self, vector):
+        """
+        Normalises the vector to unit length.
+        """
+        return vector / sqrt(numpy.dot(vector, vector))
+
+
+def euclidean_distance(u, v):
+    """
+    Returns the euclidean distance between vectors u and v. This is equivalent
+    to the length of the vector (u - v).
+    """
+    diff = u - v
+    return sqrt(numpy.dot(diff, diff))
+
+
+def cosine_distance(u, v):
+    """
+    Returns 1 minus the cosine of the angle between vectors v and u. This is
+    equal to 1 - (u.v / |u||v|).
+    """
+    return 1 - (numpy.dot(u, v) / (
+                sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))
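+# Quick numeric checks (editor's illustration, not part of upstream NLTK):
+#     euclidean_distance(numpy.array([3.0, 4.0]), numpy.array([0.0, 0.0])) == 5.0
+#     cosine_distance(numpy.array([1.0, 0.0]), numpy.array([0.0, 1.0])) == 1.0
+# and two vectors pointing in the same direction have cosine distance 0.0.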
+
+
+class _DendrogramNode(object):
+    """ Tree node of a dendrogram. """
+
+    def __init__(self, value, *children):
+        self._value = value
+        self._children = children
+
+    def leaves(self, values=True):
+        if self._children:
+            leaves = []
+            for child in self._children:
+                leaves.extend(child.leaves(values))
+            return leaves
+        elif values:
+            return [self._value]
+        else:
+            return [self]
+
+    def groups(self, n):
+        queue = [(self._value, self)]
+
+        while len(queue) < n:
+            priority, node = queue.pop()
+            if not node._children:
+                queue.append((priority, node))
+                break
+            for child in node._children:
+                if child._children:
+                    queue.append((child._value, child))
+                else:
+                    queue.append((0, child))
+            # makes the earliest merges at the start, latest at the end
+            queue.sort()
+
+        groups = []
+        for priority, node in queue:
+            groups.append(node.leaves())
+        return groups
+
+    def __lt__(self, comparator):
+        return cosine_distance(self._value, comparator._value) < 0
+
+
+@python_2_unicode_compatible
+class Dendrogram(object):
+    """
+    Represents a dendrogram, a tree with a specified branching order.  This
+    must be initialised with the leaf items; merge() is then called iteratively
+    for each branch. This class constructs a tree representing the order of calls
+    to the merge function.
+    """
+
+    def __init__(self, items=[]):
+        """
+        :param  items: the items at the leaves of the dendrogram
+        :type   items: sequence of (any)
+        """
+        self._items = [_DendrogramNode(item) for item in items]
+        self._original_items = copy.copy(self._items)
+        self._merge = 1
+
+    def merge(self, *indices):
+        """
+        Merges nodes at the given indices in the dendrogram. The nodes are
+        combined into a single node, which replaces the first node specified.
+        All other nodes involved in the merge are removed.
+
+        :param  indices: indices of the items to merge (at least two)
+        :type   indices: seq of int
+        """
+        assert len(indices) >= 2
+        node = _DendrogramNode(self._merge, *[self._items[i] for i in indices])
+        self._merge += 1
+        self._items[indices[0]] = node
+        for i in indices[1:]:
+            del self._items[i]
+
+    def groups(self, n):
+        """
+        Finds the n-groups of items (leaves) reachable from a cut at depth n.
+        :param  n: number of groups
+        :type   n: int
+        """
+        if len(self._items) > 1:
+            root = _DendrogramNode(self._merge, *self._items)
+        else:
+            root = self._items[0]
+        return root.groups(n)
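+        # Illustration (editor's note, not upstream NLTK): for
+        #     d = Dendrogram([1, 2, 3, 4]); d.merge(0, 1); d.merge(0, 1)
+        # the call d.groups(2) returns [[4], [1, 2, 3]].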
+
+    def show(self, leaf_labels=[]):
+        """
+        Print the dendrogram in ASCII art to standard out.
+        :param leaf_labels: an optional list of strings to use for labeling the
+                            leaves
+        :type leaf_labels: list
+        """
+
+        # ASCII rendering characters
+        JOIN, HLINK, VLINK = '+', '-', '|'
+
+        # find the root (or create one)
+        if len(self._items) > 1:
+            root = _DendrogramNode(self._merge, *self._items)
+        else:
+            root = self._items[0]
+        leaves = self._original_items
+
+        if leaf_labels:
+            last_row = leaf_labels
+        else:
+            last_row = ["%s" % leaf._value for leaf in leaves]
+
+        # find the bottom row and the best cell width
+        width = max(map(len, last_row)) + 1
+        lhalf = width // 2
+        rhalf = int(width - lhalf - 1)
+
+        # display functions
+        def format(centre, left=' ', right=' '):
+            return '%s%s%s' % (lhalf*left, centre, right*rhalf)
+
+        def display(str):
+            stdout.write(str)
+
+        # for each merge, top down
+        queue = [(root._value, root)]
+        verticals = [format(' ') for leaf in leaves]
+        while queue:
+            priority, node = queue.pop()
+            child_left_leaf = list(map(
+                                lambda c: c.leaves(False)[0], node._children))
+            indices = list(map(leaves.index, child_left_leaf))
+            if child_left_leaf:
+                min_idx = min(indices)
+                max_idx = max(indices)
+            for i in range(len(leaves)):
+                if leaves[i] in child_left_leaf:
+                    if i == min_idx:
+                        display(format(JOIN, ' ', HLINK))
+                    elif i == max_idx:
+                        display(format(JOIN, HLINK, ' '))
+                    else:
+                        display(format(JOIN, HLINK, HLINK))
+                    verticals[i] = format(VLINK)
+                elif min_idx <= i <= max_idx:
+                    display(format(HLINK, HLINK, HLINK))
+                else:
+                    display(verticals[i])
+            display('\n')
+            for child in node._children:
+                if child._children:
+                    queue.append((child._value, child))
+            queue.sort()
+
+            for vertical in verticals:
+                display(vertical)
+            display('\n')
+
+        # finally, display the last line
+        display(''.join(item.center(width) for item in last_row))
+        display('\n')
+
+    def __repr__(self):
+        if len(self._items) > 1:
+            root = _DendrogramNode(self._merge, *self._items)
+        else:
+            root = self._items[0]
+        leaves = root.leaves(False)
+        return '<Dendrogram with %d leaves>' % len(leaves)
diff --git a/nlp_resource_data/nltk/cluster/util.pyc b/nlp_resource_data/nltk/cluster/util.pyc
new file mode 100755 (executable)
index 0000000..9d704a3
Binary files /dev/null and b/nlp_resource_data/nltk/cluster/util.pyc differ
diff --git a/nlp_resource_data/nltk/collections.py b/nlp_resource_data/nltk/collections.py
new file mode 100755 (executable)
index 0000000..1107f7d
--- /dev/null
@@ -0,0 +1,687 @@
+# Natural Language Toolkit: Collections
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, absolute_import
+
+import locale
+import re
+import types
+import textwrap
+import pydoc
+import bisect
+import os
+from itertools import islice, chain, combinations
+from functools import total_ordering
+from collections import defaultdict, deque, Counter
+
+from six import text_type
+
+from nltk.internals import slice_bounds, raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible
+
+
+##########################################################################
+# Ordered Dictionary
+##########################################################################
+
+class OrderedDict(dict):
+    def __init__(self, data=None, **kwargs):
+        self._keys = self.keys(data, kwargs.get('keys'))
+        self._default_factory = kwargs.get('default_factory')
+        if data is None:
+            dict.__init__(self)
+        else:
+            dict.__init__(self, data)
+
+    def __delitem__(self, key):
+        dict.__delitem__(self, key)
+        self._keys.remove(key)
+
+    def __getitem__(self, key):
+        try:
+            return dict.__getitem__(self, key)
+        except KeyError:
+            return self.__missing__(key)
+
+    def __iter__(self):
+        return (key for key in self.keys())
+
+    def __missing__(self, key):
+        if not self._default_factory and key not in self._keys:
+            raise KeyError()
+        return self._default_factory()
+
+    def __setitem__(self, key, item):
+        dict.__setitem__(self, key, item)
+        if key not in self._keys:
+            self._keys.append(key)
+
+    def clear(self):
+        dict.clear(self)
+        self._keys.clear()
+
+    def copy(self):
+        d = dict.copy(self)
+        d._keys = self._keys
+        return d
+
+    def items(self):
+        # returns iterator under python 3 and list under python 2
+        return zip(self.keys(), self.values())
+
+    def keys(self, data=None, keys=None):
+        if data:
+            if keys:
+                assert isinstance(keys, list)
+                assert len(data) == len(keys)
+                return keys
+            else:
+                assert isinstance(data, dict) or \
+                       isinstance(data, OrderedDict) or \
+                       isinstance(data, list)
+                if isinstance(data, dict) or isinstance(data, OrderedDict):
+                    return data.keys()
+                elif isinstance(data, list):
+                    return [key for (key, value) in data]
+        elif '_keys' in self.__dict__:
+            return self._keys
+        else:
+            return []
+
+    def popitem(self):
+        if not self._keys:
+            raise KeyError()
+
+        key = self._keys.pop()
+        value = self[key]
+        del self[key]
+        return (key, value)
+
+    def setdefault(self, key, failobj=None):
+        dict.setdefault(self, key, failobj)
+        if key not in self._keys:
+            self._keys.append(key)
+
+    def update(self, data):
+        dict.update(self, data)
+        for key in self.keys(data):
+            if key not in self._keys:
+                self._keys.append(key)
+
+    def values(self):
+        # returns iterator under python 3
+        return map(self.get, self._keys)
+
+######################################################################
+# Lazy Sequences
+######################################################################
+
+@total_ordering
+@python_2_unicode_compatible
+class AbstractLazySequence(object):
+    """
+    An abstract base class for read-only sequences whose values are
+    computed as needed.  Lazy sequences act like tuples -- they can be
+    indexed, sliced, and iterated over; but they may not be modified.
+
+    The most common application of lazy sequences in NLTK is for
+    corpus view objects, which provide access to the contents of a
+    corpus without loading the entire corpus into memory, by loading
+    pieces of the corpus from disk as needed.
+
+    The result of modifying a mutable element of a lazy sequence is
+    undefined.  In particular, the modifications made to the element
+    may or may not persist, depending on whether and when the lazy
+    sequence caches that element's value or reconstructs it from
+    scratch.
+
+    Subclasses are required to define two methods: ``__len__()``
+    and ``iterate_from()``.
+    """
+    def __len__(self):
+        """
+        Return the number of tokens in the corpus file underlying this
+        corpus view.
+        """
+        raise NotImplementedError('should be implemented by subclass')
+
+    def iterate_from(self, start):
+        """
+        Return an iterator that generates the tokens in the corpus
+        file underlying this corpus view, starting at the token number
+        ``start``.  If ``start>=len(self)``, then this iterator will
+        generate no tokens.
+        """
+        raise NotImplementedError('should be implemented by subclass')
+
+    def __getitem__(self, i):
+        """
+        Return the *i* th token in the corpus file underlying this
+        corpus view.  Negative indices and spans are both supported.
+        """
+        if isinstance(i, slice):
+            start, stop = slice_bounds(self, i)
+            return LazySubsequence(self, start, stop)
+        else:
+            # Handle negative indices
+            if i < 0: i += len(self)
+            if i < 0: raise IndexError('index out of range')
+            # Use iterate_from to extract it.
+            try:
+                return next(self.iterate_from(i))
+            except StopIteration:
+                raise IndexError('index out of range')
+
+    def __iter__(self):
+        """Return an iterator that generates the tokens in the corpus
+        file underlying this corpus view."""
+        return self.iterate_from(0)
+
+    def count(self, value):
+        """Return the number of times this list contains ``value``."""
+        return sum(1 for elt in self if elt==value)
+
+    def index(self, value, start=None, stop=None):
+        """Return the index of the first occurrence of ``value`` in this
+        list that is greater than or equal to ``start`` and less than
+        ``stop``.  Negative start and stop values are treated like negative
+        slice bounds -- i.e., they count from the end of the list."""
+        start, stop = slice_bounds(self, slice(start, stop))
+        for i, elt in enumerate(islice(self, start, stop)):
+            if elt == value: return i+start
+        raise ValueError('index(x): x not in list')
+
+    def __contains__(self, value):
+        """Return true if this list contains ``value``."""
+        return bool(self.count(value))
+
+    def __add__(self, other):
+        """Return a list concatenating self with other."""
+        return LazyConcatenation([self, other])
+
+    def __radd__(self, other):
+        """Return a list concatenating other with self."""
+        return LazyConcatenation([other, self])
+
+    def __mul__(self, count):
+        """Return a list concatenating self with itself ``count`` times."""
+        return LazyConcatenation([self] * count)
+
+    def __rmul__(self, count):
+        """Return a list concatenating self with itself ``count`` times."""
+        return LazyConcatenation([self] * count)
+
+    _MAX_REPR_SIZE = 60
+    def __repr__(self):
+        """
+        Return a string representation for this corpus view that is
+        similar to a list's representation; but if it would be more
+        than 60 characters long, it is truncated.
+        """
+        pieces = []
+        length = 5
+        for elt in self:
+            pieces.append(repr(elt))
+            length += len(pieces[-1]) + 2
+            if length > self._MAX_REPR_SIZE and len(pieces) > 2:
+                return '[%s, ...]' % text_type(', ').join(pieces[:-1])
+        return '[%s]' % text_type(', ').join(pieces)
+
+    def __eq__(self, other):
+        return (type(self) == type(other) and list(self) == list(other))
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        if type(other) != type(self):
+            raise_unorderable_types("<", self, other)
+        return list(self) < list(other)
+
+    def __hash__(self):
+        """
+        :raise ValueError: Corpus view objects are unhashable.
+        """
+        raise ValueError('%s objects are unhashable' %
+                         self.__class__.__name__)
+
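+# Editor's sketch (not part of upstream NLTK): a minimal concrete subclass only
+# needs __len__ and iterate_from, for example:
+#
+#     class LazyRange(AbstractLazySequence):
+#         def __init__(self, n): self._n = n
+#         def __len__(self): return self._n
+#         def iterate_from(self, start): return iter(range(start, self._n))
+#
+# LazyRange(5)[2:4] is then computed lazily and yields [2, 3].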
+
+class LazySubsequence(AbstractLazySequence):
+    """
+    A subsequence produced by slicing a lazy sequence.  This slice
+    keeps a reference to its source sequence, and generates its values
+    by looking them up in the source sequence.
+    """
+
+    MIN_SIZE = 100
+    """
+    The minimum size for which lazy slices should be created.  If
+    ``LazySubsequence()`` is called with a subsequence that is
+    shorter than ``MIN_SIZE``, then a list will be returned instead.
+    """
+
+    def __new__(cls, source, start, stop):
+        """
+        Construct a new slice from a given underlying sequence.  The
+        ``start`` and ``stop`` indices should be absolute indices --
+        i.e., they should not be negative (for indexing from the back
+        of a list) or greater than the length of ``source``.
+        """
+        # If the slice is small enough, just use a tuple.
+        if stop-start < cls.MIN_SIZE:
+            return list(islice(source.iterate_from(start), stop-start))
+        else:
+            return object.__new__(cls)
+
+    def __init__(self, source, start, stop):
+        self._source = source
+        self._start = start
+        self._stop = stop
+
+    def __len__(self):
+        return self._stop - self._start
+
+    def iterate_from(self, start):
+        return islice(self._source.iterate_from(start+self._start),
+                      max(0, len(self)-start))
+
+
+class LazyConcatenation(AbstractLazySequence):
+    """
+    A lazy sequence formed by concatenating a list of lists.  This
+    underlying list of lists may itself be lazy.  ``LazyConcatenation``
+    maintains an index that it uses to keep track of the relationship
+    between offsets in the concatenated lists and offsets in the
+    sublists.
+    """
+    def __init__(self, list_of_lists):
+        self._list = list_of_lists
+        self._offsets = [0]
+
+    def __len__(self):
+        if len(self._offsets) <= len(self._list):
+            for tok in self.iterate_from(self._offsets[-1]): pass
+        return self._offsets[-1]
+
+    def iterate_from(self, start_index):
+        if start_index < self._offsets[-1]:
+            sublist_index = bisect.bisect_right(self._offsets, start_index)-1
+        else:
+            sublist_index = len(self._offsets)-1
+
+        index = self._offsets[sublist_index]
+
+        # Construct an iterator over the sublists.
+        if isinstance(self._list, AbstractLazySequence):
+            sublist_iter = self._list.iterate_from(sublist_index)
+        else:
+            sublist_iter = islice(self._list, sublist_index, None)
+
+        for sublist in sublist_iter:
+            if sublist_index == (len(self._offsets)-1):
+                assert index+len(sublist) >= self._offsets[-1], (
+                        'offsets not monotonically increasing!')
+                self._offsets.append(index+len(sublist))
+            else:
+                assert self._offsets[sublist_index+1] == index+len(sublist), (
+                        'inconsistent list value (num elts)')
+
+            for value in sublist[max(0, start_index-index):]:
+                yield value
+
+            index += len(sublist)
+            sublist_index += 1
+
+
+class LazyMap(AbstractLazySequence):
+    """
+    A lazy sequence whose elements are formed by applying a given
+    function to each element in one or more underlying lists.  The
+    function is applied lazily -- i.e., when you read a value from the
+    list, ``LazyMap`` will calculate that value by applying its
+    function to the underlying lists' value(s).  ``LazyMap`` is
+    essentially a lazy version of the Python primitive function
+    ``map``.  In particular, the following two expressions are
+    equivalent:
+
+        >>> from nltk.collections import LazyMap
+        >>> function = str
+        >>> sequence = [1,2,3]
+        >>> map(function, sequence) # doctest: +SKIP
+        ['1', '2', '3']
+        >>> list(LazyMap(function, sequence))
+        ['1', '2', '3']
+
+    Like the Python ``map`` primitive, if the source lists do not have
+    equal size, then the value None will be supplied for the
+    'missing' elements.
+
+    Lazy maps can be useful for conserving memory, in cases where
+    individual values take up a lot of space.  This is especially true
+    if the underlying list's values are constructed lazily, as is the
+    case with many corpus readers.
+
+    A typical example of a use case for this class is performing
+    feature detection on the tokens in a corpus.  Since featuresets
+    are encoded as dictionaries, which can take up a lot of memory,
+    using a ``LazyMap`` can significantly reduce memory usage when
+    training and running classifiers.
+    """
+    def __init__(self, function, *lists, **config):
+        """
+        :param function: The function that should be applied to
+            elements of ``lists``.  It should take as many arguments
+            as there are ``lists``.
+        :param lists: The underlying lists.
+        :param cache_size: Determines the size of the cache used
+            by this lazy map.  (default=5)
+        """
+        if not lists:
+            raise TypeError('LazyMap requires at least two args')
+
+        self._lists = lists
+        self._func = function
+        self._cache_size = config.get('cache_size', 5)
+        self._cache = ({} if self._cache_size > 0 else None)
+
+        # Note: taking bool() of sum() here would make _all_lazy true whenever
+        # at least one list is an AbstractLazySequence, which is presumably
+        # not what is intended; instead require that every list is lazy.
+        self._all_lazy = sum(isinstance(lst, AbstractLazySequence)
+                             for lst in lists) == len(lists)
+
+    def iterate_from(self, index):
+        # Special case: one lazy sublist
+        if len(self._lists) == 1 and self._all_lazy:
+            for value in self._lists[0].iterate_from(index):
+                yield self._func(value)
+            return
+
+        # Special case: one non-lazy sublist
+        elif len(self._lists) == 1:
+            while True:
+                try: yield self._func(self._lists[0][index])
+                except IndexError: return
+                index += 1
+
+        # Special case: n lazy sublists
+        elif self._all_lazy:
+            iterators = [lst.iterate_from(index) for lst in self._lists]
+            while True:
+                elements = []
+                for iterator in iterators:
+                    try: elements.append(next(iterator))
+                    except StopIteration: elements.append(None)
+                if elements == [None] * len(self._lists):
+                    return
+                yield self._func(*elements)
+                index += 1
+
+        # general case
+        else:
+            while True:
+                try: elements = [lst[index] for lst in self._lists]
+                except IndexError:
+                    elements = [None] * len(self._lists)
+                    for i, lst in enumerate(self._lists):
+                        try: elements[i] = lst[index]
+                        except IndexError: pass
+                    if elements == [None] * len(self._lists):
+                        return
+                yield self._func(*elements)
+                index += 1
+
+    def __getitem__(self, index):
+        if isinstance(index, slice):
+            sliced_lists = [lst[index] for lst in self._lists]
+            return LazyMap(self._func, *sliced_lists)
+        else:
+            # Handle negative indices
+            if index < 0: index += len(self)
+            if index < 0: raise IndexError('index out of range')
+            # Check the cache
+            if self._cache is not None and index in self._cache:
+                return self._cache[index]
+            # Calculate the value
+            try: val = next(self.iterate_from(index))
+            except StopIteration:
+                raise IndexError('index out of range')
+            # Update the cache
+            if self._cache is not None:
+                if len(self._cache) > self._cache_size:
+                    self._cache.popitem() # discard random entry
+                self._cache[index] = val
+            # Return the value
+            return val
+
+    def __len__(self):
+        return max(len(lst) for lst in self._lists)
+
+
+class LazyZip(LazyMap):
+    """
+    A lazy sequence whose elements are tuples, each containing the i-th
+    element from each of the argument sequences.  The returned list is
+    truncated in length to the length of the shortest argument sequence. The
+    tuples are constructed lazily -- i.e., when you read a value from the
+    list, ``LazyZip`` will calculate that value by forming a tuple from
+    the i-th element of each of the argument sequences.
+
+    ``LazyZip`` is essentially a lazy version of the Python primitive function
+    ``zip``.  In particular, an evaluated LazyZip is equivalent to a zip:
+
+        >>> from nltk.collections import LazyZip
+        >>> sequence1, sequence2 = [1, 2, 3], ['a', 'b', 'c']
+        >>> zip(sequence1, sequence2) # doctest: +SKIP
+        [(1, 'a'), (2, 'b'), (3, 'c')]
+        >>> list(LazyZip(sequence1, sequence2))
+        [(1, 'a'), (2, 'b'), (3, 'c')]
+        >>> sequences = [sequence1, sequence2, [6,7,8,9]]
+        >>> list(zip(*sequences)) == list(LazyZip(*sequences))
+        True
+
+    Lazy zips can be useful for conserving memory in cases where the argument
+    sequences are particularly long.
+
+    A typical example of a use case for this class is combining long sequences
+    of gold standard and predicted values in a classification or tagging task
+    in order to calculate accuracy.  By constructing tuples lazily and
+    avoiding the creation of an additional long sequence, memory usage can be
+    significantly reduced.
+    """
+    def __init__(self, *lists):
+        """
+        :param lists: the underlying lists
+        :type lists: list(list)
+        """
+        LazyMap.__init__(self, lambda *elts: elts, *lists)
+
+    def iterate_from(self, index):
+        iterator = LazyMap.iterate_from(self, index)
+        while index < len(self):
+            yield next(iterator)
+            index += 1
+        return
+
+    def __len__(self):
+        return min(len(lst) for lst in self._lists)
+
+
+class LazyEnumerate(LazyZip):
+    """
+    A lazy sequence whose elements are tuples, each containing a count (from
+    zero) and a value yielded by the underlying sequence.  ``LazyEnumerate`` is
+    useful for obtaining an indexed list. The tuples are constructed lazily
+    -- i.e., when you read a value from the list, ``LazyEnumerate`` will
+    calculate that value by forming a tuple from the count of the i-th
+    element and the i-th element of the underlying sequence.
+
+    ``LazyEnumerate`` is essentially a lazy version of the Python primitive
+    function ``enumerate``.  In particular, the following two expressions are
+    equivalent:
+
+        >>> from nltk.collections import LazyEnumerate
+        >>> sequence = ['first', 'second', 'third']
+        >>> list(enumerate(sequence))
+        [(0, 'first'), (1, 'second'), (2, 'third')]
+        >>> list(LazyEnumerate(sequence))
+        [(0, 'first'), (1, 'second'), (2, 'third')]
+
+    Lazy enumerations can be useful for conserving memory in cases where the
+    argument sequences are particularly long.
+
+    A typical example of a use case for this class is obtaining an indexed
+    list for a long sequence of values.  By constructing tuples lazily and
+    avoiding the creation of an additional long sequence, memory usage can be
+    significantly reduced.
+    """
+
+    def __init__(self, lst):
+        """
+        :param lst: the underlying list
+        :type lst: list
+        """
+        LazyZip.__init__(self, range(len(lst)), lst)
+
+class LazyIteratorList(AbstractLazySequence):
+    """
+    Wraps an iterator, loading its elements on demand
+    and making them subscriptable.
+    __repr__ displays only the first few elements.
+    """
+    def __init__(self, it, known_len=None):
+        self._it = it
+        self._len = known_len
+        self._cache = []
+
+    def __len__(self):
+        if self._len:
+            return self._len
+        for x in self.iterate_from(len(self._cache)):
+            pass
+        self._len = len(self._cache)
+        return self._len
+
+    def iterate_from(self, start):
+        """Create a new iterator over this list starting at the given offset."""
+        while len(self._cache)<start:
+            v = next(self._it)
+            self._cache.append(v)
+        i = start
+        while i<len(self._cache):
+            yield self._cache[i]
+            i += 1
+        while True:
+            v = next(self._it)
+            self._cache.append(v)
+            yield v
+            i += 1
+
+    def __add__(self, other):
+        """Return a list concatenating self with other."""
+        return type(self)(chain(self, other))
+
+    def __radd__(self, other):
+        """Return a list concatenating other with self."""
+        return type(self)(chain(other, self))
+
+######################################################################
+# Trie Implementation
+######################################################################
+class Trie(defaultdict):
+    """A Trie implementation for strings"""
+    LEAF = True
+
+    def __init__(self, strings=None):
+        """Builds a Trie object, which is built around a ``defaultdict``
+
+        If ``strings`` is provided (a ``list`` of ``str``), each string is
+        inserted into the Trie. Otherwise, an empty Trie is constructed.
+
+        :param strings: List of strings to insert into the trie
+            (Default is ``None``)
+        :type strings: list(str)
+
+        """
+        defaultdict.__init__(self, Trie)
+        if strings:
+            for string in strings:
+                self.insert(string)
+
+    def insert(self, string):
+        """Inserts ``string`` into the Trie
+
+        :param string: String to insert into the trie
+        :type string: str
+
+        :Example:
+
+        >>> from nltk.collections import Trie
+        >>> trie = Trie(["ab"])
+        >>> trie
+        defaultdict(<class 'nltk.collections.Trie'>, {'a': defaultdict(<class 'nltk.collections.Trie'>, {'b': defaultdict(<class 'nltk.collections.Trie'>, {True: None})})})
+
+        """
+        if len(string):
+            self[string[0]].insert(string[1:])
+        else:
+            # mark the string as complete
+            self[Trie.LEAF] = None
+
+    def __str__(self):
+        return str(self.as_dict())
+
+    def as_dict(self, d=None):
+        """Convert ``defaultdict`` to common ``dict`` representation.
+
+        :param d: A defaultdict containing strings mapped to nested defaultdicts.
+            This is the structure of the trie. (Default is None)
+        :type d: defaultdict(str -> defaultdict)
+        :return: Even though ``defaultdict`` is a subclass of ``dict`` and could
+            be converted with ``dict()``, the structure here is a nested
+            ``defaultdict``, so this method converts it recursively, giving the
+            ``dict`` representation of the ``Trie`` without the
+            ``defaultdict(<class 'nltk.collections.Trie'>, ...`` wrappers.
+        :rtype: dict(str -> dict(bool -> None))
+            Note: there can be an arbitrarily deeply nested
+            ``dict(str -> dict(str -> dict(..))``, but the last
+            level will have ``dict(str -> dict(bool -> None))``
+
+        :Example:
+
+        >>> from nltk.collections import Trie
+        >>> trie = Trie(["abc", "def"])
+        >>> expected = {'a': {'b': {'c': {True: None}}}, 'd': {'e': {'f': {True: None}}}}
+        >>> trie.as_dict() == expected
+        True
+
+        """
+        def _default_to_regular(d):
+            """
+            Source: http://stackoverflow.com/a/26496899/4760801
+
+            :param d: Nested ``defaultdict`` to convert to regular ``dict``
+            :type d: defaultdict(str -> defaultdict(...))
+            :return: A dict representation of the defaultdict
+            :rtype: dict(str -> dict(str -> ...))
+
+            :Example:
+
+            >>> from collections import defaultdict
+            >>> d = defaultdict(defaultdict)
+            >>> d["one"]["two"] = "three"
+            >>> d
+            defaultdict(<type 'collections.defaultdict'>, {'one': defaultdict(None, {'two': 'three'})})
+            >>> _default_to_regular(d)
+            {'one': {'two': 'three'}}
+
+            """
+            if isinstance(d, defaultdict):
+                d = {k: _default_to_regular(v) for k, v in d.items()}
+            return d
+
+        return _default_to_regular(self)
diff --git a/nlp_resource_data/nltk/collections.pyc b/nlp_resource_data/nltk/collections.pyc
new file mode 100755 (executable)
index 0000000..128856b
Binary files /dev/null and b/nlp_resource_data/nltk/collections.pyc differ
diff --git a/nlp_resource_data/nltk/collocations.py b/nlp_resource_data/nltk/collocations.py
new file mode 100755 (executable)
index 0000000..0b5a1f5
--- /dev/null
@@ -0,0 +1,388 @@
+# Natural Language Toolkit: Collocations and Association Measures
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Joel Nothman <jnothman@student.usyd.edu.au>
+# URL: <http://nltk.org>
+# For license information, see LICENSE.TXT
+#
+"""
+Tools to identify collocations --- words that often appear consecutively
+--- within corpora. They may also be used to find other associations between
+word occurrences.
+See Manning and Schutze ch. 5 at http://nlp.stanford.edu/fsnlp/promo/colloc.pdf
+and the Text::NSP Perl package at http://ngram.sourceforge.net
+
+Finding collocations requires first calculating the frequencies of words and
+their appearance in the context of other words. Often the collection of words
+will then require filtering to only retain useful content terms. Each ngram
+of words may then be scored according to some association measure, in order
+to determine the relative likelihood of each ngram being a collocation.
+
+The ``BigramCollocationFinder`` and ``TrigramCollocationFinder`` classes provide
+these functionalities, dependent on being provided a function which scores a
+ngram given appropriate frequency counts. A number of standard association
+measures are provided in bigram_measures and trigram_measures.
+"""
+from __future__ import print_function
+
+# Possible TODOs:
+# - consider the distinction between f(x,_) and f(x) and whether our
+#   approximation is good enough for fragmented data, and mention it
+# - add a n-gram collocation finder with measures which only utilise n-gram
+#   and unigram counts (raw_freq, pmi, student_t)
+
+import itertools as _itertools
+from six import iteritems
+
+from nltk.probability import FreqDist
+from nltk.util import ngrams
+from nltk.metrics import ContingencyMeasures, BigramAssocMeasures, TrigramAssocMeasures
+from nltk.metrics.spearman import ranks_from_scores, spearman_correlation
+
+
+class AbstractCollocationFinder(object):
+    """
+    An abstract base class for collocation finders whose purpose is to
+    collect collocation candidate frequencies, filter and rank them.
+
+    As a minimum, collocation finders require the frequencies of each
+    word in a corpus, and the joint frequency of word tuples. This data
+    should be provided through nltk.probability.FreqDist objects or an
+    identical interface.
+    """
+
+    def __init__(self, word_fd, ngram_fd):
+        self.word_fd = word_fd
+        self.N = word_fd.N()
+        self.ngram_fd = ngram_fd
+
+    @classmethod
+    def _build_new_documents(cls, documents, window_size, pad_left=False, pad_right=False, pad_symbol=None):
+        '''
+        Pad the document with the place holder according to the window_size
+        '''
+        padding = (pad_symbol,) * (window_size - 1)
+        if pad_right:
+            return _itertools.chain.from_iterable(_itertools.chain(doc, padding) for doc in documents)
+        if pad_left:
+            return _itertools.chain.from_iterable(_itertools.chain(padding, doc) for doc in documents)
+
+    @classmethod
+    def from_documents(cls, documents):
+        """Constructs a collocation finder given a collection of documents,
+        each of which is a list (or iterable) of tokens.
+        """
+        #return cls.from_words(_itertools.chain(*documents))
+        return cls.from_words(cls._build_new_documents(documents, cls.default_ws, pad_right=True))
+
+    @staticmethod
+    def _ngram_freqdist(words, n):
+        return FreqDist(tuple(words[i:i + n]) for i in range(len(words) - n + 1))
+
+    def _apply_filter(self, fn=lambda ngram, freq: False):
+        """Generic filter removes ngrams from the frequency distribution
+        if the function returns True when passed an ngram tuple.
+        """
+        tmp_ngram = FreqDist()
+        for ngram, freq in iteritems(self.ngram_fd):
+            if not fn(ngram, freq):
+                tmp_ngram[ngram] = freq
+        self.ngram_fd = tmp_ngram
+
+    def apply_freq_filter(self, min_freq):
+        """Removes candidate ngrams which have frequency less than min_freq."""
+        self._apply_filter(lambda ng, freq: freq < min_freq)
+
+    def apply_ngram_filter(self, fn):
+        """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
+        evaluates to True.
+        """
+        self._apply_filter(lambda ng, f: fn(*ng))
+
+    def apply_word_filter(self, fn):
+        """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
+        ...) evaluates to True.
+        """
+        self._apply_filter(lambda ng, f: any(fn(w) for w in ng))
+
+    def _score_ngrams(self, score_fn):
+        """Generates of (ngram, score) pairs as determined by the scoring
+        function provided.
+        """
+        for tup in self.ngram_fd:
+            score = self.score_ngram(score_fn, *tup)
+            if score is not None:
+                yield tup, score
+
+    def score_ngrams(self, score_fn):
+        """Returns a sequence of (ngram, score) pairs ordered from highest to
+        lowest score, as determined by the scoring function provided.
+        """
+        return sorted(self._score_ngrams(score_fn), key=lambda t: (-t[1], t[0]))
+
+    def nbest(self, score_fn, n):
+        """Returns the top n ngrams when scored by the given function."""
+        return [p for p, s in self.score_ngrams(score_fn)[:n]]
+
+    def above_score(self, score_fn, min_score):
+        """Returns a sequence of ngrams, ordered by decreasing score, whose
+        scores each exceed the given minimum score.
+        """
+        for ngram, score in self.score_ngrams(score_fn):
+            if score > min_score:
+                yield ngram
+            else:
+                break
+
+
+class BigramCollocationFinder(AbstractCollocationFinder):
+    """A tool for the finding and ranking of bigram collocations or other
+    association measures. It is often useful to use from_words() rather than
+    constructing an instance directly.
+    """
+    default_ws = 2
+
+    def __init__(self, word_fd, bigram_fd, window_size=2):
+        """Construct a BigramCollocationFinder, given FreqDists for
+        appearances of words and (possibly non-contiguous) bigrams.
+        """
+        AbstractCollocationFinder.__init__(self, word_fd, bigram_fd)
+        self.window_size = window_size
+
+    @classmethod
+    def from_words(cls, words, window_size=2):
+        """Construct a BigramCollocationFinder for all bigrams in the given
+        sequence.  When window_size > 2, count non-contiguous bigrams, in the
+        style of Church and Hanks's (1990) association ratio.
+        """
+        wfd = FreqDist()
+        bfd = FreqDist()
+
+        if window_size < 2:
+            raise ValueError("Specify window_size at least 2")
+
+        for window in ngrams(words, window_size, pad_right=True):
+            w1 = window[0]
+            if w1 is None:
+                continue
+            wfd[w1] += 1
+            for w2 in window[1:]:
+                if w2 is not None:
+                    bfd[(w1, w2)] += 1
+        return cls(wfd, bfd, window_size=window_size)
+
+    def score_ngram(self, score_fn, w1, w2):
+        """Returns the score for a given bigram using the given scoring
+        function.  Following Church and Hanks (1990), counts are scaled by
+        a factor of 1/(window_size - 1).
+        """
+        n_all = self.N
+        n_ii = self.ngram_fd[(w1, w2)] / (self.window_size - 1.0)
+        if not n_ii:
+            return
+        n_ix = self.word_fd[w1]
+        n_xi = self.word_fd[w2]
+        return score_fn(n_ii, (n_ix, n_xi), n_all)
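+        # e.g. (editor's illustration) with window_size=3, a bigram observed 4
+        # times contributes n_ii = 4 / (3 - 1.0) = 2.0 to the score.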
+
+
+class TrigramCollocationFinder(AbstractCollocationFinder):
+    """A tool for the finding and ranking of trigram collocations or other
+    association measures. It is often useful to use from_words() rather than
+    constructing an instance directly.
+    """
+    default_ws = 3
+
+    def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd):
+        """Construct a TrigramCollocationFinder, given FreqDists for
+        appearances of words, bigrams, two words with any word between them,
+        and trigrams.
+        """
+        AbstractCollocationFinder.__init__(self, word_fd, trigram_fd)
+        self.wildcard_fd = wildcard_fd
+        self.bigram_fd = bigram_fd
+
+    @classmethod
+    def from_words(cls, words, window_size=3):
+        """Construct a TrigramCollocationFinder for all trigrams in the given
+        sequence.
+        """
+        if window_size < 3:
+            raise ValueError("Specify window_size at least 3")
+
+        wfd = FreqDist()
+        wildfd = FreqDist()
+        bfd = FreqDist()
+        tfd = FreqDist()
+        for window in ngrams(words, window_size, pad_right=True):
+            w1 = window[0]
+            if w1 is None:
+                continue
+            for w2, w3 in _itertools.combinations(window[1:], 2):
+                wfd[w1] += 1
+                if w2 is None:
+                    continue
+                bfd[(w1, w2)] += 1
+                if w3 is None:
+                    continue
+                wildfd[(w1, w3)] += 1
+                tfd[(w1, w2, w3)] += 1
+        return cls(wfd, bfd, wildfd, tfd)
+
+    def bigram_finder(self):
+        """Constructs a bigram collocation finder with the bigram and unigram
+        data from this finder. Note that this does not include any filtering
+        applied to this finder.
+        """
+        return BigramCollocationFinder(self.word_fd, self.bigram_fd)
+
+    def score_ngram(self, score_fn, w1, w2, w3):
+        """Returns the score for a given trigram using the given scoring
+        function.
+        """
+        n_all = self.N
+        n_iii = self.ngram_fd[(w1, w2, w3)]
+        if not n_iii:
+            return
+        n_iix = self.bigram_fd[(w1, w2)]
+        n_ixi = self.wildcard_fd[(w1, w3)]
+        n_xii = self.bigram_fd[(w2, w3)]
+        n_ixx = self.word_fd[w1]
+        n_xix = self.word_fd[w2]
+        n_xxi = self.word_fd[w3]
+        return score_fn(n_iii,
+                        (n_iix, n_ixi, n_xii),
+                        (n_ixx, n_xix, n_xxi),
+                        n_all)
+
+
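+# A comparable sketch for TrigramCollocationFinder, including the
+# bigram_finder() conversion (illustrative; measures assumed to come from
+# nltk.metrics):
+#
+#     from nltk.metrics import TrigramAssocMeasures
+#     tokens = 'people who like people are the luckiest people'.split()
+#     tcf = TrigramCollocationFinder.from_words(tokens)
+#     best_trigrams = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 2)
+#     bcf = tcf.bigram_finder()   # reuse the unigram/bigram counts
+
+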
+class QuadgramCollocationFinder(AbstractCollocationFinder):
+    """A tool for the finding and ranking of quadgram collocations or other association measures.
+    It is often useful to use from_words() rather than constructing an instance directly.
+    """
+    default_ws = 4
+
+    def __init__(self, word_fd, quadgram_fd, ii, iii, ixi, ixxi, iixi, ixii):
+        """Construct a QuadgramCollocationFinder, given FreqDists for appearances of words,
+        bigrams, trigrams, two words with one word and two words between them, three words
+        with a word between them in both variations.
+        """
+        AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd)
+        self.iii = iii
+        self.ii = ii
+        self.ixi = ixi
+        self.ixxi = ixxi
+        self.iixi = iixi
+        self.ixii = ixii
+
+    @classmethod
+    def from_words(cls, words, window_size=4):
+        if window_size < 4:
+            raise ValueError("Specify window_size at least 4")
+        ixxx = FreqDist()
+        iiii = FreqDist()
+        ii = FreqDist()
+        iii = FreqDist()
+        ixi = FreqDist()
+        ixxi = FreqDist()
+        iixi = FreqDist()
+        ixii = FreqDist()
+
+        for window in ngrams(words, window_size, pad_right=True):
+            w1 = window[0]
+            if w1 is None:
+                continue
+            for w2, w3, w4 in _itertools.combinations(window[1:], 3):
+                ixxx[w1] += 1
+                if w2 is None:
+                    continue
+                ii[(w1, w2)] += 1
+                if w3 is None:
+                    continue
+                iii[(w1, w2, w3)] += 1
+                ixi[(w1, w3)] += 1
+                if w4 is None:
+                    continue
+                iiii[(w1, w2, w3, w4)] += 1
+                ixxi[(w1, w4)] += 1
+                ixii[(w1, w3, w4)] += 1
+                iixi[(w1, w2, w4)] += 1
+
+        return cls(ixxx, iiii, ii, iii, ixi, ixxi, iixi, ixii)
+
+    def score_ngram(self, score_fn, w1, w2, w3, w4):
+        n_all = self.N
+        n_iiii = self.ngram_fd[(w1, w2, w3, w4)]
+        if not n_iiii:
+            return
+        n_iiix = self.iii[(w1, w2, w3)]
+        n_xiii = self.iii[(w2, w3, w4)]
+        n_iixi = self.iixi[(w1, w2, w4)]
+        n_ixii = self.ixii[(w1, w3, w4)]
+
+        n_iixx = self.ii[(w1, w2)]
+        n_xxii = self.ii[(w3, w4)]
+        n_xiix = self.ii[(w2, w3)]
+        n_ixix = self.ixi[(w1, w3)]
+        n_ixxi = self.ixxi[(w1, w4)]
+        n_xixi = self.ixi[(w2, w4)]
+
+        n_ixxx = self.word_fd[w1]
+        n_xixx = self.word_fd[w2]
+        n_xxix = self.word_fd[w3]
+        n_xxxi = self.word_fd[w4]
+        return score_fn(n_iiii,
+                        (n_iiix, n_iixi, n_ixii, n_xiii),
+                        (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
+                        (n_ixxx, n_xixx, n_xxix, n_xxxi),
+                        n_all)
+
+
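+# A comparable sketch for QuadgramCollocationFinder (illustrative; it assumes
+# QuadgramAssocMeasures is exported from nltk.metrics in this snapshot):
+#
+#     from nltk.metrics import QuadgramAssocMeasures
+#     tokens = 'one two three four one two three four'.split()
+#     qcf = QuadgramCollocationFinder.from_words(tokens)
+#     best = qcf.nbest(QuadgramAssocMeasures.likelihood_ratio, 1)
+
+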
+def demo(scorer=None, compare_scorer=None):
+    """Finds bigram collocations in the files of the WebText corpus."""
+    from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores
+
+    if scorer is None:
+        scorer = BigramAssocMeasures.likelihood_ratio
+    if compare_scorer is None:
+        compare_scorer = BigramAssocMeasures.raw_freq
+
+    from nltk.corpus import stopwords, webtext
+
+    ignored_words = stopwords.words('english')
+    word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words
+
+    for file in webtext.fileids():
+        words = [word.lower()
+                 for word in webtext.words(file)]
+
+        cf = BigramCollocationFinder.from_words(words)
+        cf.apply_freq_filter(3)
+        cf.apply_word_filter(word_filter)
+
+        corr = spearman_correlation(ranks_from_scores(cf.score_ngrams(scorer)),
+                                    ranks_from_scores(cf.score_ngrams(compare_scorer)))
+        print(file)
+        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
+        print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__, corr))
+
+# Slows down loading too much
+# bigram_measures = BigramAssocMeasures()
+# trigram_measures = TrigramAssocMeasures()
+
+if __name__ == '__main__':
+    import sys
+    from nltk.metrics import BigramAssocMeasures
+
+    try:
+        scorer = eval('BigramAssocMeasures.' + sys.argv[1])
+    except IndexError:
+        scorer = None
+    try:
+        compare_scorer = eval('BigramAssocMeasures.' + sys.argv[2])
+    except IndexError:
+        compare_scorer = None
+
+    demo(scorer, compare_scorer)
+
+__all__ = ['BigramCollocationFinder',
+           'TrigramCollocationFinder', 'QuadgramCollocationFinder']
diff --git a/nlp_resource_data/nltk/collocations.pyc b/nlp_resource_data/nltk/collocations.pyc
new file mode 100755 (executable)
index 0000000..7487fce
Binary files /dev/null and b/nlp_resource_data/nltk/collocations.pyc differ
diff --git a/nlp_resource_data/nltk/compat.py b/nlp_resource_data/nltk/compat.py
new file mode 100755 (executable)
index 0000000..8efda7e
--- /dev/null
@@ -0,0 +1,365 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Compatibility
+#
+# Copyright (C) 2001-2017 NLTK Project
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import absolute_import, print_function
+import os
+import sys
+from functools import update_wrapper, wraps
+import fractions
+import unicodedata
+
+from six import string_types, text_type
+
+# Python 2/3 compatibility layer. Based on six.
+
+PY3 = sys.version_info[0] == 3
+
+if PY3:
+    def get_im_class(meth):
+        return meth.__self__.__class__
+
+    import io
+    StringIO = io.StringIO
+    BytesIO = io.BytesIO
+
+    from datetime import timezone
+    UTC = timezone.utc
+
+    from tempfile import TemporaryDirectory
+
+else:
+    def get_im_class(meth):
+        return meth.im_class
+
+    try:
+        from cStringIO import StringIO
+    except ImportError:
+        from StringIO import StringIO
+    BytesIO = StringIO
+
+    from datetime import tzinfo, timedelta
+
+    ZERO = timedelta(0)
+    HOUR = timedelta(hours=1)
+
+    # A UTC class for python 2.7
+    class UTC(tzinfo):
+        """UTC"""
+
+        def utcoffset(self, dt):
+            return ZERO
+
+        def tzname(self, dt):
+            return "UTC"
+
+        def dst(self, dt):
+            return ZERO
+
+    UTC = UTC()
+
+    import csv
+    import codecs
+    import cStringIO
+
+    class UnicodeWriter:
+        """
+        A CSV writer which will write rows to CSV file "f",
+        which is encoded in the given encoding.
+        see https://docs.python.org/2/library/csv.html
+        """
+
+        def __init__(self, f, dialect=csv.excel, encoding="utf-8",
+                     errors='replace', **kwds):
+            # Redirect output to a queue
+            self.queue = cStringIO.StringIO()
+            self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
+            self.stream = f
+            encoder_cls = codecs.getincrementalencoder(encoding)
+            self.encoder = encoder_cls(errors=errors)
+
+        def encode(self, data):
+            if isinstance(data, string_types):
+                return data.encode("utf-8")
+            else:
+                return data
+
+        def writerow(self, row):
+            self.writer.writerow([self.encode(s) for s in row])
+            # Fetch UTF-8 output from the queue ...
+            data = self.queue.getvalue()
+            data = data.decode("utf-8")
+            # ... and reencode it into the target encoding
+            data = self.encoder.encode(data, 'replace')
+            # write to the target stream
+            self.stream.write(data)
+            # empty queue
+            self.queue.truncate(0)
+
+    import warnings as _warnings
+    import os as _os
+    from tempfile import mkdtemp
+
+    class TemporaryDirectory(object):
+        """Create and return a temporary directory.  This has the same
+        behavior as mkdtemp but can be used as a context manager.  For
+        example:
+
+            with TemporaryDirectory() as tmpdir:
+                ...
+
+        Upon exiting the context, the directory and everything contained
+        in it are removed.
+
+        http://stackoverflow.com/questions/19296146/tempfile-temporarydirectory-context-manager-in-python-2-7
+        """
+
+        def __init__(self, suffix="", prefix="tmp", dir=None):
+            self._closed = False
+            self.name = None  # Handle mkdtemp raising an exception
+            self.name = mkdtemp(suffix, prefix, dir)
+
+        def __repr__(self):
+            return "<{} {!r}>".format(self.__class__.__name__, self.name)
+
+        def __enter__(self):
+            return self.name
+
+        def cleanup(self, _warn=False):
+            if self.name and not self._closed:
+                try:
+                    self._rmtree(self.name)
+                except (TypeError, AttributeError) as ex:
+                    # Issue #10188: Emit a warning on stderr
+                    # if the directory could not be cleaned
+                    # up due to missing globals
+                    if "None" not in str(ex):
+                        raise
+                    print("ERROR: {!r} while cleaning up {!r}".format(ex,
+                                                                      self),
+                          file=sys.stderr)
+                    return
+                self._closed = True
+                if _warn:
+                    self._warn("Implicitly cleaning up {!r}".format(self),
+                               Warning)
+
+        def __exit__(self, exc, value, tb):
+            self.cleanup()
+
+        def __del__(self):
+            # Issue a Warning if implicit cleanup needed
+            self.cleanup(_warn=True)
+
+        # XXX (ncoghlan): The following code attempts to make
+        # this class tolerant of the module nulling out process
+        # that happens during CPython interpreter shutdown
+        # Alas, it doesn't actually manage it. See issue #10188
+        _listdir = staticmethod(_os.listdir)
+        _path_join = staticmethod(_os.path.join)
+        _isdir = staticmethod(_os.path.isdir)
+        _islink = staticmethod(_os.path.islink)
+        _remove = staticmethod(_os.remove)
+        _rmdir = staticmethod(_os.rmdir)
+        _warn = _warnings.warn
+
+        def _rmtree(self, path):
+            # Essentially a stripped down version of shutil.rmtree.  We can't
+            # use globals because they may be None'ed out at shutdown.
+            for name in self._listdir(path):
+                fullname = self._path_join(path, name)
+                try:
+                    isdir = (self._isdir(fullname) and not
+                             self._islink(fullname))
+                except OSError:
+                    isdir = False
+                if isdir:
+                    self._rmtree(fullname)
+                else:
+                    try:
+                        self._remove(fullname)
+                    except OSError:
+                        pass
+            try:
+                self._rmdir(path)
+            except OSError:
+                pass
+
+# ======= Compatibility for datasets that care about Python versions ========
+
+# The following datasets have a /PY3 subdirectory containing
+# a full copy of the data which has been re-encoded or repickled.
+DATA_UPDATES = [("chunkers", "maxent_ne_chunker"),
+                ("help", "tagsets"),
+                ("taggers", "maxent_treebank_pos_tagger"),
+                ("tokenizers", "punkt")]
+
+_PY3_DATA_UPDATES = [os.path.join(*path_list) for path_list in DATA_UPDATES]
+
+
+def add_py3_data(path):
+    if PY3:
+        for item in _PY3_DATA_UPDATES:
+            if item in str(path) and "/PY3" not in str(path):
+                pos = path.index(item) + len(item)
+                if path[pos:pos + 4] == ".zip":
+                    pos += 4
+                path = path[:pos] + "/PY3" + path[pos:]
+                break
+    return path
+
+
+# for use in adding /PY3 to the second (filename) argument
+# of the file pointers in data.py
+def py3_data(init_func):
+    def _decorator(*args, **kwargs):
+        args = (args[0], add_py3_data(args[1])) + args[2:]
+        return init_func(*args, **kwargs)
+    return wraps(init_func)(_decorator)
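+
+# For illustration, under Python 3 the rewriting above maps, e.g.,
+#     add_py3_data("tokenizers/punkt/english.pickle")
+# to "tokenizers/punkt/PY3/english.pickle"; for a zipped package such as
+# "tokenizers/punkt.zip", "/PY3" is inserted after the ".zip" component.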
+
+
+# ======= Compatibility layer for __str__ and __repr__ ==========
+def remove_accents(text):
+
+    if isinstance(text, bytes):
+        text = text.decode('ascii')
+
+    category = unicodedata.category  # this gives a small (~10%) speedup
+    return ''.join(
+        c for c in unicodedata.normalize('NFKD', text) if category(c) != 'Mn'
+    )
+
+
+# Select the best transliteration method:
+try:
+    # Older versions of Unidecode are licensed under Artistic License;
+    # assume an older version is installed.
+    from unidecode import unidecode as transliterate
+except ImportError:
+    try:
+        # The text-unidecode implementation is worse than the Unidecode
+        # implementation, so Unidecode is preferred.
+        from text_unidecode import unidecode as transliterate
+    except ImportError:
+        # This transliteration method should be enough
+        # for many Western languages.
+        transliterate = remove_accents
+
+
+def python_2_unicode_compatible(klass):
+    """
+    This decorator defines a __unicode__ method and fixes the
+    __repr__ and __str__ methods under Python 2.
+
+    To support Python 2 and 3 with a single code base,
+    define __str__ and __repr__ methods returning unicode
+    text and apply this decorator to the class.
+
+    The original __repr__ and __str__ remain available
+    as unicode_repr and __unicode__ (under both Python 2
+    and Python 3).
+    """
+
+    if not issubclass(klass, object):
+        raise ValueError("This decorator doesn't work for old-style classes")
+
+    # both __unicode__ and unicode_repr are public because they
+    # may be useful in console under Python 2.x
+
+    # if __str__ or __repr__ are not overridden in a subclass,
+    # they may have already been fixed by this decorator in a parent class
+    # and we shouldn't fix them again
+
+    if not _was_fixed(klass.__str__):
+        klass.__unicode__ = klass.__str__
+        if not PY3:
+            klass.__str__ = _7bit(_transliterated(klass.__unicode__))
+
+    if not _was_fixed(klass.__repr__):
+        klass.unicode_repr = klass.__repr__
+        if not PY3:
+            klass.__repr__ = _7bit(klass.unicode_repr)
+
+    return klass
+
+
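+# A minimal sketch of applying the decorator above (the class name is
+# hypothetical, for illustration only): define __str__/__repr__ returning
+# unicode text and the decorator supplies encoded versions under Python 2
+# while leaving Python 3 untouched.
+#
+#     @python_2_unicode_compatible
+#     class Greeting(object):
+#         def __str__(self):
+#             return u'caf\xe9'
+#         def __repr__(self):
+#             return u'Greeting(caf\xe9)'
+
+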
+def unicode_repr(obj):
+    """
+    For classes that were fixed with @python_2_unicode_compatible,
+    ``unicode_repr`` returns ``obj.unicode_repr()``; for unicode strings
+    the result is returned without the "u" prefix (to make the output the
+    same under Python 2.x and Python 3.x); for other objects
+    it is the same as ``repr``.
+    """
+    if PY3:
+        return repr(obj)
+
+    # Python 2.x
+    if hasattr(obj, 'unicode_repr'):
+        return obj.unicode_repr()
+
+    if isinstance(obj, text_type):
+        return repr(obj)[1:]  # strip "u" letter from output
+
+    return repr(obj)
+
+
+def _transliterated(method):
+    def wrapper(self):
+        return transliterate(method(self))
+
+    update_wrapper(wrapper, method, ["__name__", "__doc__"])
+    if hasattr(method, "_nltk_compat_7bit"):
+        wrapper._nltk_compat_7bit = method._nltk_compat_7bit
+
+    wrapper._nltk_compat_transliterated = True
+    return wrapper
+
+
+def _7bit(method):
+    def wrapper(self):
+        return method(self).encode('ascii', 'backslashreplace')
+
+    update_wrapper(wrapper, method, ["__name__", "__doc__"])
+
+    if hasattr(method, "_nltk_compat_transliterated"):
+        wrapper._nltk_compat_transliterated = (
+            method._nltk_compat_transliterated
+        )
+
+    wrapper._nltk_compat_7bit = True
+    return wrapper
+
+
+def _was_fixed(method):
+    return (getattr(method, "_nltk_compat_7bit", False) or
+            getattr(method, "_nltk_compat_transliterated", False))
+
+
+class Fraction(fractions.Fraction):
+    """
+    This is a simplified, backwards-compatible version of fractions.Fraction
+    from Python >= 3.5.  It adds the `_normalize` parameter so that the
+    numerator and denominator are not reduced by their greatest common
+    divisor (gcd), e.g. when the numerator is 0.
+
+    This is most probably only used by nltk.translate.bleu_score, where the
+    numerators and denominators of the different ngram precisions are mutable.
+    The idea of a "mutable" fraction may not be applicable to other usages;
+    see http://stackoverflow.com/questions/34561265
+
+    This class should be deprecated once NLTK stops supporting Python < 3.5.
+    See https://github.com/nltk/nltk/issues/1330
+    """
+    def __new__(cls, numerator=0, denominator=None, _normalize=True):
+        cls = super(Fraction, cls).__new__(cls, numerator, denominator)
+        # To emulate fraction.Fraction.from_float across Python >=2.7,
+        # check that numerator is an integer and denominator is not None.
+        if not _normalize and type(numerator) == int and denominator:
+            cls._numerator = numerator
+            cls._denominator = denominator
+        return cls
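+
+
+# A small illustration of the `_normalize` flag (the reason this subclass
+# exists): with `_normalize=False`, an integer numerator/denominator pair is
+# stored as given rather than reduced by its gcd.
+#
+#     Fraction(0, 5)                    # reduces to Fraction(0, 1)
+#     Fraction(0, 5, _normalize=False)  # keeps denominator 5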
diff --git a/nlp_resource_data/nltk/compat.pyc b/nlp_resource_data/nltk/compat.pyc
new file mode 100755 (executable)
index 0000000..2f1580a
Binary files /dev/null and b/nlp_resource_data/nltk/compat.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/__init__.py b/nlp_resource_data/nltk/corpus/__init__.py
new file mode 100755 (executable)
index 0000000..d9ccb54
--- /dev/null
@@ -0,0 +1,341 @@
+# Natural Language Toolkit: Corpus Readers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+# TODO this docstring isn't up-to-date!
+"""
+NLTK corpus readers.  The modules in this package provide functions
+that can be used to read corpus files in a variety of formats.  These
+functions can be used to read both the corpus files that are
+distributed in the NLTK corpus package, and corpus files that are part
+of external corpora.
+
+Available Corpora
+=================
+
+Please see http://www.nltk.org/nltk_data/ for a complete list.
+Install corpora using nltk.download().
+
+Corpus Reader Functions
+=======================
+Each corpus module defines one or more "corpus reader functions",
+which can be used to read documents from that corpus.  These functions
+take an argument, ``item``, which is used to indicate which document
+should be read from the corpus:
+
+- If ``item`` is one of the unique identifiers listed in the corpus
+  module's ``items`` variable, then the corresponding document will
+  be loaded from the NLTK corpus package.
+- If ``item`` is a filename, then that file will be read.
+
+Additionally, corpus reader functions can be given lists of item
+names; in which case, they will return a concatenation of the
+corresponding documents.
+
+Corpus reader functions are named based on the type of information
+they return.  Some common examples, and their return types, are:
+
+- words(): list of str
+- sents(): list of (list of str)
+- paras(): list of (list of (list of str))
+- tagged_words(): list of (str,str) tuple
+- tagged_sents(): list of (list of (str,str))
+- tagged_paras(): list of (list of (list of (str,str)))
+- chunked_sents(): list of (Tree w/ (str,str) leaves)
+- parsed_sents(): list of (Tree with str leaves)
+- parsed_paras(): list of (list of (Tree with str leaves))
+- xml(): A single xml ElementTree
+- raw(): unprocessed corpus contents
+
+For example, to read a list of the words in the Brown Corpus, use
+``nltk.corpus.brown.words()``:
+
+    >>> from nltk.corpus import brown
+    >>> print(", ".join(brown.words()))
+    The, Fulton, County, Grand, Jury, said, ...
+
+"""
+
+import re
+
+from nltk.tokenize import RegexpTokenizer
+from nltk.corpus.util import LazyCorpusLoader
+from nltk.corpus.reader import *
+
+abc = LazyCorpusLoader(
+    'abc', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding=[
+            ('science', 'latin_1'),
+            ('rural', 'utf8')])
+alpino = LazyCorpusLoader(
+    'alpino', AlpinoCorpusReader, tagset='alpino')
+brown = LazyCorpusLoader(
+    'brown', CategorizedTaggedCorpusReader, r'c[a-z]\d\d',
+    cat_file='cats.txt', tagset='brown', encoding="ascii")
+cess_cat = LazyCorpusLoader(
+    'cess_cat', BracketParseCorpusReader, r'(?!\.).*\.tbf',
+    tagset='unknown', encoding='ISO-8859-15')
+cess_esp = LazyCorpusLoader(
+    'cess_esp', BracketParseCorpusReader, r'(?!\.).*\.tbf',
+    tagset='unknown', encoding='ISO-8859-15')
+cmudict = LazyCorpusLoader(
+    'cmudict', CMUDictCorpusReader, ['cmudict'])
+comtrans = LazyCorpusLoader(
+    'comtrans', AlignedCorpusReader, r'(?!\.).*\.txt')
+comparative_sentences = LazyCorpusLoader(
+    'comparative_sentences', ComparativeSentencesCorpusReader, r'labeledSentences\.txt',
+    encoding='latin-1')
+conll2000 = LazyCorpusLoader(
+    'conll2000', ConllChunkCorpusReader,
+    ['train.txt', 'test.txt'], ('NP','VP','PP'),
+    tagset='wsj', encoding='ascii')
+conll2002 = LazyCorpusLoader(
+    'conll2002', ConllChunkCorpusReader, '.*\.(test|train).*',
+    ('LOC', 'PER', 'ORG', 'MISC'), encoding='utf-8')
+conll2007 = LazyCorpusLoader(
+    'conll2007', DependencyCorpusReader, '.*\.(test|train).*', encoding=[
+        ('eus', 'ISO-8859-2'),
+        ('esp', 'utf8')])
+crubadan = LazyCorpusLoader(
+    'crubadan', CrubadanCorpusReader, '.*\.txt')
+dependency_treebank = LazyCorpusLoader(
+    'dependency_treebank', DependencyCorpusReader, '.*\.dp',
+    encoding='ascii')
+floresta = LazyCorpusLoader(
+    'floresta', BracketParseCorpusReader, r'(?!\.).*\.ptb', '#',
+    tagset='unknown', encoding='ISO-8859-15')
+framenet15 = LazyCorpusLoader(
+    'framenet_v15', FramenetCorpusReader, ['frRelation.xml','frameIndex.xml','fulltextIndex.xml','luIndex.xml','semTypes.xml'])
+framenet = LazyCorpusLoader(
+    'framenet_v17', FramenetCorpusReader, ['frRelation.xml','frameIndex.xml','fulltextIndex.xml','luIndex.xml','semTypes.xml'])
+gazetteers = LazyCorpusLoader(
+    'gazetteers', WordListCorpusReader, r'(?!LICENSE|\.).*\.txt',
+    encoding='ISO-8859-2')
+genesis = LazyCorpusLoader(
+    'genesis', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding=[
+        ('finnish|french|german', 'latin_1'),
+        ('swedish', 'cp865'),
+        ('.*', 'utf_8')])
+gutenberg = LazyCorpusLoader(
+    'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1')
+ieer = LazyCorpusLoader(
+    'ieer', IEERCorpusReader, r'(?!README|\.).*')
+inaugural = LazyCorpusLoader(
+    'inaugural', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1')
+# [XX] This should probably just use TaggedCorpusReader:
+indian = LazyCorpusLoader(
+    'indian', IndianCorpusReader, r'(?!\.).*\.pos',
+    tagset='unknown', encoding='utf8')
+
+jeita = LazyCorpusLoader(
+    'jeita', ChasenCorpusReader, r'.*\.chasen', encoding='utf-8')
+knbc = LazyCorpusLoader(
+    'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')
+lin_thesaurus = LazyCorpusLoader(
+    'lin_thesaurus', LinThesaurusCorpusReader, r'.*\.lsp')
+mac_morpho = LazyCorpusLoader(
+    'mac_morpho', MacMorphoCorpusReader, r'(?!\.).*\.txt',
+    tagset='unknown', encoding='latin-1')
+machado = LazyCorpusLoader(
+    'machado', PortugueseCategorizedPlaintextCorpusReader,
+    r'(?!\.).*\.txt', cat_pattern=r'([a-z]*)/.*', encoding='latin-1')
+masc_tagged = LazyCorpusLoader(
+    'masc_tagged', CategorizedTaggedCorpusReader, r'(spoken|written)/.*\.txt',
+    cat_file='categories.txt', tagset='wsj', encoding="utf-8", sep="_")
+movie_reviews = LazyCorpusLoader(
+    'movie_reviews', CategorizedPlaintextCorpusReader,
+    r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
+    encoding='ascii')
+multext_east = LazyCorpusLoader(
+    'mte_teip5', MTECorpusReader, r'(oana).*\.xml', encoding="utf-8")
+names = LazyCorpusLoader(
+    'names', WordListCorpusReader, r'(?!\.).*\.txt', encoding='ascii')
+nps_chat = LazyCorpusLoader(
+    'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml', tagset='wsj')
+opinion_lexicon = LazyCorpusLoader(
+    'opinion_lexicon', OpinionLexiconCorpusReader, r'(\w+)\-words\.txt',
+    encoding='ISO-8859-2')
+ppattach = LazyCorpusLoader(
+    'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset'])
+product_reviews_1 = LazyCorpusLoader(
+    'product_reviews_1', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8')
+product_reviews_2 = LazyCorpusLoader(
+    'product_reviews_2', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8')
+pros_cons = LazyCorpusLoader(
+    'pros_cons', ProsConsCorpusReader, r'Integrated(Cons|Pros)\.txt',
+    cat_pattern=r'Integrated(Cons|Pros)\.txt', encoding='ISO-8859-2')
+ptb = LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions
+    'ptb', CategorizedBracketParseCorpusReader, r'(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG',
+    cat_file='allcats.txt', tagset='wsj')
+qc = LazyCorpusLoader(
+    'qc', StringCategoryCorpusReader, ['train.txt', 'test.txt'], encoding='ISO-8859-2')
+reuters = LazyCorpusLoader(
+    'reuters', CategorizedPlaintextCorpusReader, '(training|test).*',
+    cat_file='cats.txt', encoding='ISO-8859-2')
+rte = LazyCorpusLoader(
+    'rte', RTECorpusReader, r'(?!\.).*\.xml')
+senseval = LazyCorpusLoader(
+    'senseval', SensevalCorpusReader, r'(?!\.).*\.pos')
+sentence_polarity = LazyCorpusLoader(
+    'sentence_polarity', CategorizedSentencesCorpusReader, r'rt-polarity\.(neg|pos)',
+    cat_pattern=r'rt-polarity\.(neg|pos)', encoding='utf-8')
+sentiwordnet = LazyCorpusLoader(
+    'sentiwordnet', SentiWordNetCorpusReader, 'SentiWordNet_3.0.0.txt', encoding='utf-8')
+shakespeare = LazyCorpusLoader(
+    'shakespeare', XMLCorpusReader, r'(?!\.).*\.xml')
+sinica_treebank = LazyCorpusLoader(
+    'sinica_treebank', SinicaTreebankCorpusReader, ['parsed'],
+    tagset='unknown', encoding='utf-8')
+state_union = LazyCorpusLoader(
+    'state_union', PlaintextCorpusReader, r'(?!\.).*\.txt',
+    encoding='ISO-8859-2')
+stopwords = LazyCorpusLoader(
+    'stopwords', WordListCorpusReader, r'(?!README|\.).*', encoding='utf8')
+subjectivity = LazyCorpusLoader(
+    'subjectivity', CategorizedSentencesCorpusReader, r'(quote.tok.gt9|plot.tok.gt9)\.5000',
+    cat_map={'quote.tok.gt9.5000':['subj'], 'plot.tok.gt9.5000':['obj']}, encoding='latin-1')
+swadesh = LazyCorpusLoader(
+    'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8')
+swadesh110 = LazyCorpusLoader(
+    'panlex_swadesh', SwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8')
+swadesh207 = LazyCorpusLoader(
+    'panlex_swadesh', SwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8')
+switchboard = LazyCorpusLoader(
+    'switchboard', SwitchboardCorpusReader, tagset='wsj')
+timit = LazyCorpusLoader(
+    'timit', TimitCorpusReader)
+timit_tagged = LazyCorpusLoader(
+    'timit', TimitTaggedCorpusReader, '.+\.tags',
+    tagset='wsj', encoding='ascii')
+toolbox = LazyCorpusLoader(
+    'toolbox', ToolboxCorpusReader, r'(?!.*(README|\.)).*\.(dic|txt)')
+treebank = LazyCorpusLoader(
+    'treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg',
+    tagset='wsj', encoding='ascii')
+treebank_chunk = LazyCorpusLoader(
+    'treebank/tagged', ChunkedCorpusReader, r'wsj_.*\.pos',
+    sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
+    para_block_reader=tagged_treebank_para_block_reader, tagset='wsj', encoding='ascii')
+treebank_raw = LazyCorpusLoader(
+    'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2')
+twitter_samples = LazyCorpusLoader(
+    'twitter_samples', TwitterCorpusReader, '.*\.json')
+udhr = LazyCorpusLoader(
+    'udhr', UdhrCorpusReader)
+udhr2 = LazyCorpusLoader(
+    'udhr2', PlaintextCorpusReader, r'.*\.txt', encoding='utf8')
+universal_treebanks = LazyCorpusLoader(
+    'universal_treebanks_v20', ConllCorpusReader, r'.*\.conll',
+    columntypes = ('ignore', 'words', 'ignore', 'ignore', 'pos',
+                   'ignore', 'ignore', 'ignore', 'ignore', 'ignore'))
+verbnet = LazyCorpusLoader(
+    'verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
+webtext = LazyCorpusLoader(
+    'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt', encoding='ISO-8859-2')
+wordnet = LazyCorpusLoader(
+    'wordnet', WordNetCorpusReader,
+    LazyCorpusLoader('omw', CorpusReader, r'.*/wn-data-.*\.tab', encoding='utf8'))
+wordnet_ic = LazyCorpusLoader(
+    'wordnet_ic', WordNetICCorpusReader, '.*\.dat')
+words = LazyCorpusLoader(
+    'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii')
+
+# defined after treebank
+propbank = LazyCorpusLoader(
+    'propbank', PropbankCorpusReader,
+    'prop.txt', 'frames/.*\.xml', 'verbs.txt',
+    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
+    treebank) # Must be defined *after* treebank corpus.
+nombank = LazyCorpusLoader(
+    'nombank.1.0', NombankCorpusReader,
+    'nombank.1.0', 'frames/.*\.xml', 'nombank.1.0.words',
+    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
+    treebank) # Must be defined *after* treebank corpus.
+propbank_ptb = LazyCorpusLoader(
+    'propbank', PropbankCorpusReader,
+    'prop.txt', 'frames/.*\.xml', 'verbs.txt',
+    lambda filename: filename.upper(),
+    ptb) # Must be defined *after* ptb corpus.
+nombank_ptb = LazyCorpusLoader(
+    'nombank.1.0', NombankCorpusReader,
+    'nombank.1.0', 'frames/.*\.xml', 'nombank.1.0.words',
+    lambda filename: filename.upper(),
+    ptb) # Must be defined *after* ptb corpus.
+semcor = LazyCorpusLoader(
+    'semcor', SemcorCorpusReader, r'brown./tagfiles/br-.*\.xml',
+    wordnet) # Must be defined *after* wordnet corpus.
+
+nonbreaking_prefixes = LazyCorpusLoader(
+    'nonbreaking_prefixes', NonbreakingPrefixesCorpusReader, r'(?!README|\.).*', encoding='utf8')
+perluniprops = LazyCorpusLoader(
+    'perluniprops', UnicharsCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
+
+# mwa_ppdb = LazyCorpusLoader(
+#     'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
+
+# See https://github.com/nltk/nltk/issues/1579
+# and https://github.com/nltk/nltk/issues/1716
+#
+# pl196x = LazyCorpusLoader(
+#     'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
+#     cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
+#
+# ipipan = LazyCorpusLoader(
+#     'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
+#
+# nkjp = LazyCorpusLoader(
+#     'nkjp', NKJPCorpusReader, r'', encoding='utf8')
+#
+#panlex_lite = LazyCorpusLoader(
+#    'panlex_lite', PanLexLiteCorpusReader)
+#
+# ycoe = LazyCorpusLoader(
+#     'ycoe', YCOECorpusReader)
+#
+# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
+#hebrew_treebank = LazyCorpusLoader(
+#    'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
+
+
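+# A minimal usage sketch for the lazy loaders defined above (the corresponding
+# data package must already be installed, e.g. via nltk.download('stopwords')):
+#
+#     from nltk.corpus import stopwords
+#     english_stops = set(stopwords.words('english'))
+
+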
+def demo():
+    # This is out-of-date:
+    abc.demo()
+    brown.demo()
+#    chat80.demo()
+    cmudict.demo()
+    conll2000.demo()
+    conll2002.demo()
+    genesis.demo()
+    gutenberg.demo()
+    ieer.demo()
+    inaugural.demo()
+    indian.demo()
+    names.demo()
+    ppattach.demo()
+    senseval.demo()
+    shakespeare.demo()
+    sinica_treebank.demo()
+    state_union.demo()
+    stopwords.demo()
+    timit.demo()
+    toolbox.demo()
+    treebank.demo()
+    udhr.demo()
+    webtext.demo()
+    words.demo()
+#    ycoe.demo()
+
+if __name__ == '__main__':
+    #demo()
+    pass
+
+# ** this is for nose **
+# unload all corpus after tests
+def teardown_module(module=None):
+    import nltk.corpus
+    for name in dir(nltk.corpus):
+        obj = getattr(nltk.corpus, name, None)
+        if isinstance(obj, CorpusReader) and hasattr(obj, '_unload'):
+            obj._unload()
diff --git a/nlp_resource_data/nltk/corpus/__init__.pyc b/nlp_resource_data/nltk/corpus/__init__.pyc
new file mode 100755 (executable)
index 0000000..3d1a4cc
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/europarl_raw.py b/nlp_resource_data/nltk/corpus/europarl_raw.py
new file mode 100755 (executable)
index 0000000..a8e62a5
--- /dev/null
@@ -0,0 +1,44 @@
+# Natural Language Toolkit: Europarl Corpus Readers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author:  Nitin Madnani <nmadnani@umiacs.umd.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+import re
+from nltk.corpus.util import LazyCorpusLoader
+from nltk.corpus.reader import *
+
+# Create a new corpus reader instance for each European language
+danish = LazyCorpusLoader(
+    'europarl_raw/danish', EuroparlCorpusReader, r'ep-.*\.da', encoding='utf-8')
+
+dutch = LazyCorpusLoader(
+    'europarl_raw/dutch', EuroparlCorpusReader, r'ep-.*\.nl', encoding='utf-8')
+
+english = LazyCorpusLoader(
+    'europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')
+
+finnish = LazyCorpusLoader(
+    'europarl_raw/finnish', EuroparlCorpusReader, r'ep-.*\.fi', encoding='utf-8')
+
+french = LazyCorpusLoader(
+    'europarl_raw/french', EuroparlCorpusReader, r'ep-.*\.fr', encoding='utf-8')
+
+german = LazyCorpusLoader(
+    'europarl_raw/german', EuroparlCorpusReader, r'ep-.*\.de', encoding='utf-8')
+
+greek = LazyCorpusLoader(
+    'europarl_raw/greek', EuroparlCorpusReader, r'ep-.*\.el', encoding='utf-8')
+
+italian = LazyCorpusLoader(
+    'europarl_raw/italian', EuroparlCorpusReader, r'ep-.*\.it', encoding='utf-8')
+
+portuguese = LazyCorpusLoader(
+    'europarl_raw/portuguese', EuroparlCorpusReader, r'ep-.*\.pt', encoding='utf-8')
+
+spanish = LazyCorpusLoader(
+    'europarl_raw/spanish', EuroparlCorpusReader, r'ep-.*\.es', encoding='utf-8')
+
+swedish = LazyCorpusLoader(
+    'europarl_raw/swedish', EuroparlCorpusReader, r'ep-.*\.sv', encoding='utf-8')
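+
+# A minimal usage sketch (requires the europarl_raw data package to be
+# installed):
+#
+#     from nltk.corpus.europarl_raw import english
+#     first_words = english.words()[:10]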
diff --git a/nlp_resource_data/nltk/corpus/europarl_raw.pyc b/nlp_resource_data/nltk/corpus/europarl_raw.pyc
new file mode 100755 (executable)
index 0000000..2ce76dc
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/europarl_raw.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/__init__.py b/nlp_resource_data/nltk/corpus/reader/__init__.py
new file mode 100755 (executable)
index 0000000..77e0eb0
--- /dev/null
@@ -0,0 +1,147 @@
+# Natural Language Toolkit: Corpus Readers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+NLTK corpus readers.  The modules in this package provide functions
+that can be used to read corpus fileids in a variety of formats.  These
+functions can be used to read both the corpus fileids that are
+distributed in the NLTK corpus package, and corpus fileids that are part
+of external corpora.
+
+Corpus Reader Functions
+=======================
+Each corpus module defines one or more "corpus reader functions",
+which can be used to read documents from that corpus.  These functions
+take an argument, ``item``, which is used to indicate which document
+should be read from the corpus:
+
+- If ``item`` is one of the unique identifiers listed in the corpus
+  module's ``items`` variable, then the corresponding document will
+  be loaded from the NLTK corpus package.
+- If ``item`` is a fileid, then that file will be read.
+
+Additionally, corpus reader functions can be given lists of item
+names; in which case, they will return a concatenation of the
+corresponding documents.
+
+Corpus reader functions are named based on the type of information
+they return.  Some common examples, and their return types, are:
+
+- words(): list of str
+- sents(): list of (list of str)
+- paras(): list of (list of (list of str))
+- tagged_words(): list of (str,str) tuple
+- tagged_sents(): list of (list of (str,str))
+- tagged_paras(): list of (list of (list of (str,str)))
+- chunked_sents(): list of (Tree w/ (str,str) leaves)
+- parsed_sents(): list of (Tree with str leaves)
+- parsed_paras(): list of (list of (Tree with str leaves))
+- xml(): A single xml ElementTree
+- raw(): unprocessed corpus contents
+
+For example, to read a list of the words in the Brown Corpus, use
+``nltk.corpus.brown.words()``:
+
+    >>> from nltk.corpus import brown
+    >>> print(", ".join(brown.words()))
+    The, Fulton, County, Grand, Jury, said, ...
+
+"""
+
+from nltk.corpus.reader.plaintext import *
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+from nltk.corpus.reader.tagged import *
+from nltk.corpus.reader.cmudict import *
+from nltk.corpus.reader.conll import *
+from nltk.corpus.reader.chunked import *
+from nltk.corpus.reader.wordlist import *
+from nltk.corpus.reader.xmldocs import *
+from nltk.corpus.reader.ppattach import *
+from nltk.corpus.reader.senseval import *
+from nltk.corpus.reader.ieer import *
+from nltk.corpus.reader.sinica_treebank import *
+from nltk.corpus.reader.bracket_parse import *
+from nltk.corpus.reader.indian import *
+from nltk.corpus.reader.toolbox import *
+from nltk.corpus.reader.timit import *
+from nltk.corpus.reader.ycoe import *
+from nltk.corpus.reader.rte import *
+from nltk.corpus.reader.string_category import *
+from nltk.corpus.reader.propbank import *
+from nltk.corpus.reader.verbnet import *
+from nltk.corpus.reader.bnc import *
+from nltk.corpus.reader.nps_chat import *
+from nltk.corpus.reader.wordnet import *
+from nltk.corpus.reader.switchboard import *
+from nltk.corpus.reader.dependency import *
+from nltk.corpus.reader.nombank import *
+from nltk.corpus.reader.ipipan import *
+from nltk.corpus.reader.pl196x import *
+from nltk.corpus.reader.knbc import *
+from nltk.corpus.reader.chasen import *
+from nltk.corpus.reader.childes import *
+from nltk.corpus.reader.aligned import *
+from nltk.corpus.reader.lin import *
+from nltk.corpus.reader.semcor import *
+from nltk.corpus.reader.framenet import *
+from nltk.corpus.reader.udhr import *
+from nltk.corpus.reader.bnc import *
+from nltk.corpus.reader.sentiwordnet import *
+from nltk.corpus.reader.twitter import *
+from nltk.corpus.reader.nkjp import *
+from nltk.corpus.reader.crubadan import *
+from nltk.corpus.reader.mte import *
+from nltk.corpus.reader.reviews import *
+from nltk.corpus.reader.opinion_lexicon import *
+from nltk.corpus.reader.pros_cons import *
+from nltk.corpus.reader.categorized_sents import *
+from nltk.corpus.reader.comparative_sents import *
+from nltk.corpus.reader.panlex_lite import *
+
+# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
+# the function bracket_parse() defined in nltk.tree:
+from nltk.corpus.reader import bracket_parse
+
+__all__ = [
+    'CorpusReader', 'CategorizedCorpusReader',
+    'PlaintextCorpusReader', 'find_corpus_fileids',
+    'TaggedCorpusReader', 'CMUDictCorpusReader',
+    'ConllChunkCorpusReader', 'WordListCorpusReader',
+    'PPAttachmentCorpusReader', 'SensevalCorpusReader',
+    'IEERCorpusReader', 'ChunkedCorpusReader',
+    'SinicaTreebankCorpusReader', 'BracketParseCorpusReader',
+    'IndianCorpusReader', 'ToolboxCorpusReader',
+    'TimitCorpusReader', 'YCOECorpusReader',
+    'MacMorphoCorpusReader', 'SyntaxCorpusReader',
+    'AlpinoCorpusReader', 'RTECorpusReader',
+    'StringCategoryCorpusReader','EuroparlCorpusReader',
+    'CategorizedBracketParseCorpusReader',
+    'CategorizedTaggedCorpusReader',
+    'CategorizedPlaintextCorpusReader',
+    'PortugueseCategorizedPlaintextCorpusReader',
+    'tagged_treebank_para_block_reader',
+    'PropbankCorpusReader', 'VerbnetCorpusReader',
+    'BNCCorpusReader', 'ConllCorpusReader',
+    'XMLCorpusReader', 'NPSChatCorpusReader',
+    'SwadeshCorpusReader', 'WordNetCorpusReader',
+    'WordNetICCorpusReader', 'SwitchboardCorpusReader',
+    'DependencyCorpusReader', 'NombankCorpusReader',
+    'IPIPANCorpusReader', 'Pl196xCorpusReader',
+    'TEICorpusView', 'KNBCorpusReader', 'ChasenCorpusReader',
+    'CHILDESCorpusReader', 'AlignedCorpusReader',
+    'TimitTaggedCorpusReader', 'LinThesaurusCorpusReader',
+    'SemcorCorpusReader', 'FramenetCorpusReader', 'UdhrCorpusReader',
+    'BNCCorpusReader', 'SentiWordNetCorpusReader', 'SentiSynset',
+    'TwitterCorpusReader', 'NKJPCorpusReader', 'CrubadanCorpusReader',
+    'MTECorpusReader', 'ReviewsCorpusReader', 'OpinionLexiconCorpusReader',
+    'ProsConsCorpusReader', 'CategorizedSentencesCorpusReader',
+    'ComparativeSentencesCorpusReader', 'PanLexLiteCorpusReader',
+    'NonbreakingPrefixesCorpusReader', 'UnicharsCorpusReader',
+    'MWAPPDBCorpusReader',
+]
diff --git a/nlp_resource_data/nltk/corpus/reader/__init__.pyc b/nlp_resource_data/nltk/corpus/reader/__init__.pyc
new file mode 100755 (executable)
index 0000000..c947e12
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/aligned.py b/nlp_resource_data/nltk/corpus/reader/aligned.py
new file mode 100755 (executable)
index 0000000..0b341c9
--- /dev/null
@@ -0,0 +1,115 @@
+# Natural Language Toolkit: Aligned Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org/>
+# Author: Steven Bird <stevenbird1@gmail.com>
+# For license information, see LICENSE.TXT
+
+from six import string_types
+
+from nltk.tokenize import WhitespaceTokenizer, RegexpTokenizer
+from nltk.translate import AlignedSent, Alignment
+
+from nltk.corpus.reader.api import CorpusReader
+from nltk.corpus.reader.util import StreamBackedCorpusView, concat,\
+    read_alignedsent_block
+
+class AlignedCorpusReader(CorpusReader):
+    """
+    Reader for corpora of word-aligned sentences.  Tokens are assumed
+    to be separated by whitespace.  Sentences begin on separate lines.
+    """
+    def __init__(self, root, fileids,
+                 sep='/', word_tokenizer=WhitespaceTokenizer(),
+                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
+                 alignedsent_block_reader=read_alignedsent_block,
+                 encoding='latin1'):
+        """
+        Construct a new Aligned Corpus reader for a set of documents
+        located at the given root directory.  Example usage:
+
+            >>> root = '/...path to corpus.../'
+            >>> reader = AlignedCorpusReader(root, '.*\.txt') # doctest: +SKIP
+
+        :param root: The root directory for this corpus.
+        :param fileids: A list or regexp specifying the fileids in this corpus.
+        """
+        CorpusReader.__init__(self, root, fileids, encoding)
+        self._sep = sep
+        self._word_tokenizer = word_tokenizer
+        self._sent_tokenizer = sent_tokenizer
+        self._alignedsent_block_reader = alignedsent_block_reader
+
+    def raw(self, fileids=None):
+        """
+        :return: the given file(s) as a single string.
+        :rtype: str
+        """
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def words(self, fileids=None):
+        """
+        :return: the given file(s) as a list of words
+            and punctuation symbols.
+        :rtype: list(str)
+        """
+        return concat([AlignedSentCorpusView(fileid, enc, False, False,
+                                             self._word_tokenizer,
+                                             self._sent_tokenizer,
+                                             self._alignedsent_block_reader)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def sents(self, fileids=None):
+        """
+        :return: the given file(s) as a list of
+            sentences or utterances, each encoded as a list of word
+            strings.
+        :rtype: list(list(str))
+        """
+        return concat([AlignedSentCorpusView(fileid, enc, False, True,
+                                             self._word_tokenizer,
+                                             self._sent_tokenizer,
+                                             self._alignedsent_block_reader)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def aligned_sents(self, fileids=None):
+        """
+        :return: the given file(s) as a list of AlignedSent objects.
+        :rtype: list(AlignedSent)
+        """
+        return concat([AlignedSentCorpusView(fileid, enc, True, True,
+                                             self._word_tokenizer,
+                                             self._sent_tokenizer,
+                                             self._alignedsent_block_reader)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+class AlignedSentCorpusView(StreamBackedCorpusView):
+    """
+    A specialized corpus view for aligned sentences.
+    ``AlignedSentCorpusView`` objects are typically created by
+    ``AlignedCorpusReader`` (not directly by nltk users).
+    """
+    def __init__(self, corpus_file, encoding, aligned, group_by_sent,
+                 word_tokenizer, sent_tokenizer, alignedsent_block_reader):
+        self._aligned = aligned
+        self._group_by_sent = group_by_sent
+        self._word_tokenizer = word_tokenizer
+        self._sent_tokenizer = sent_tokenizer
+        self._alignedsent_block_reader = alignedsent_block_reader
+        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
+
+    def read_block(self, stream):
+        block = [self._word_tokenizer.tokenize(sent_str)
+                 for alignedsent_str in self._alignedsent_block_reader(stream)
+                 for sent_str in self._sent_tokenizer.tokenize(alignedsent_str)]
+        if self._aligned:
+            block[2] = Alignment.fromstring(" ".join(block[2])) # kludge; we shouldn't have tokenized the alignment string
+            block = [AlignedSent(*block)]
+        elif self._group_by_sent:
+            block = [block[0]]
+        else:
+            block = block[0]
+
+        return block
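+
+# A minimal usage sketch via the comtrans loader defined in nltk.corpus, which
+# is backed by AlignedCorpusReader (requires the comtrans data package):
+#
+#     from nltk.corpus import comtrans
+#     first_pair = comtrans.aligned_sents()[0]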
diff --git a/nlp_resource_data/nltk/corpus/reader/aligned.pyc b/nlp_resource_data/nltk/corpus/reader/aligned.pyc
new file mode 100755 (executable)
index 0000000..44f5996
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/aligned.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/api.py b/nlp_resource_data/nltk/corpus/reader/api.py
new file mode 100755 (executable)
index 0000000..fae5a11
--- /dev/null
@@ -0,0 +1,448 @@
+# Natural Language Toolkit: API for Corpus Readers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+API for corpus readers.
+"""
+from __future__ import unicode_literals
+
+import os
+import re
+from collections import defaultdict
+from itertools import chain
+
+from six import string_types
+
+from nltk import compat
+from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
+
+from nltk.corpus.reader.util import *
+
+@compat.python_2_unicode_compatible
+class CorpusReader(object):
+    """
+    A base class for "corpus reader" classes, each of which can be
+    used to read a specific corpus format.  Each individual corpus
+    reader instance is used to read a specific corpus, consisting of
+    one or more files under a common root directory.  Each file is
+    identified by its ``file identifier``, which is the relative path
+    to the file from the root directory.
+
+    A separate subclass is defined for each corpus format.  These
+    subclasses define one or more methods that provide 'views' on the
+    corpus contents, such as ``words()`` (for a list of words) and
+    ``parsed_sents()`` (for a list of parsed sentences).  Called with
+    no arguments, these methods will return the contents of the entire
+    corpus.  For most corpora, these methods define one or more
+    selection arguments, such as ``fileids`` or ``categories``, which can
+    be used to select which portion of the corpus should be returned.
+    """
+
+    def __init__(self, root, fileids, encoding='utf8', tagset=None):
+        """
+        :type root: PathPointer or str
+        :param root: A path pointer identifying the root directory for
+            this corpus.  If a string is specified, then it will be
+            converted to a ``PathPointer`` automatically.
+        :param fileids: A list of the files that make up this corpus.
+            This list can either be specified explicitly, as a list of
+            strings; or implicitly, as a regular expression over file
+            paths.  The absolute path for each file will be constructed
+            by joining the reader's root to each file name.
+        :param encoding: The default unicode encoding for the files
+            that make up the corpus.  The value of ``encoding`` can be any
+            of the following:
+            - A string: ``encoding`` is the encoding name for all files.
+            - A dictionary: ``encoding[file_id]`` is the encoding
+              name for the file whose identifier is ``file_id``.  If
+              ``file_id`` is not in ``encoding``, then the file
+              contents will be processed using non-unicode byte strings.
+            - A list: ``encoding`` should be a list of ``(regexp, encoding)``
+              tuples.  The encoding for a file whose identifier is ``file_id``
+              will be the ``encoding`` value for the first tuple whose
+              ``regexp`` matches the ``file_id``.  If no tuple's ``regexp``
+              matches the ``file_id``, the file contents will be processed
+              using non-unicode byte strings.
+            - None: the file contents of all files will be
+              processed using non-unicode byte strings.
+        :param tagset: The name of the tagset used by this corpus, to be used
+              for normalizing or converting the POS tags returned by the
+              tagged_...() methods.
+        """
+        # Convert the root to a path pointer, if necessary.
+        if isinstance(root, string_types) and not isinstance(root, PathPointer):
+            m = re.match('(.*\.zip)/?(.*)$|', root)
+            zipfile, zipentry = m.groups()
+            if zipfile:
+                root = ZipFilePathPointer(zipfile, zipentry)
+            else:
+                root = FileSystemPathPointer(root)
+        elif not isinstance(root, PathPointer):
+            raise TypeError('CorpusReader: expected a string or a PathPointer')
+
+        # If `fileids` is a regexp, then expand it.
+        if isinstance(fileids, string_types):
+            fileids = find_corpus_fileids(root, fileids)
+
+        self._fileids = fileids
+        """A list of the relative paths for the fileids that make up
+        this corpus."""
+
+        self._root = root
+        """The root directory for this corpus."""
+
+        # If encoding was specified as a list of regexps, then convert
+        # it to a dictionary.
+        if isinstance(encoding, list):
+            encoding_dict = {}
+            for fileid in self._fileids:
+                for x in encoding:
+                    (regexp, enc) = x
+                    if re.match(regexp, fileid):
+                        encoding_dict[fileid] = enc
+                        break
+            encoding = encoding_dict
+
+        self._encoding = encoding
+        """The default unicode encoding for the fileids that make up
+           this corpus.  If ``encoding`` is None, then the file
+           contents are processed using byte strings."""
+        self._tagset = tagset
+
+    def __repr__(self):
+        if isinstance(self._root, ZipFilePathPointer):
+            path = '%s/%s' % (self._root.zipfile.filename, self._root.entry)
+        else:
+            path = '%s' % self._root.path
+        return '<%s in %r>' % (self.__class__.__name__, path)
+
+    def ensure_loaded(self):
+        """
+        Load this corpus (if it has not already been loaded).  This is
+        used by LazyCorpusLoader as a simple method that can be used to
+        make sure a corpus is loaded -- e.g., in case a user wants to
+        do help(some_corpus).
+        """
+        pass # no need to actually do anything.
+
+    def readme(self):
+        """
+        Return the contents of the corpus README file, if it exists.
+        """
+        return self.open("README").read()
+
+    def license(self):
+        """
+        Return the contents of the corpus LICENSE file, if it exists.
+        """
+        return self.open("LICENSE").read()
+
+    def citation(self):
+        """
+        Return the contents of the corpus citation.bib file, if it exists.
+        """
+        return self.open("citation.bib").read()
+
+    def fileids(self):
+        """
+        Return a list of file identifiers for the fileids that make up
+        this corpus.
+        """
+        return self._fileids
+
+    def abspath(self, fileid):
+        """
+        Return the absolute path for the given file.
+
+        :type fileid: str
+        :param fileid: The file identifier for the file whose path
+            should be returned.
+        :rtype: PathPointer
+        """
+        return self._root.join(fileid)
+
+    def abspaths(self, fileids=None, include_encoding=False,
+                 include_fileid=False):
+        """
+        Return a list of the absolute paths for all fileids in this corpus;
+        or for the given list of fileids, if specified.
+
+        :type fileids: None or str or list
+        :param fileids: Specifies the set of fileids for which paths should
+            be returned.  Can be None, for all fileids; a list of
+            file identifiers, for a specified set of fileids; or a single
+            file identifier, for a single file.  Note that the return
+            value is always a list of paths, even if ``fileids`` is a
+            single file identifier.
+
+        :param include_encoding: If true, then return a list of
+            ``(path_pointer, encoding)`` tuples.
+
+        :rtype: list(PathPointer)
+        """
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+
+        paths = [self._root.join(f) for f in fileids]
+
+        if include_encoding and include_fileid:
+            return list(zip(paths, [self.encoding(f) for f in fileids], fileids))
+        elif include_fileid:
+            return list(zip(paths, fileids))
+        elif include_encoding:
+            return list(zip(paths, [self.encoding(f) for f in fileids]))
+        else:
+            return paths
+
+    def open(self, file):
+        """
+        Return an open stream that can be used to read the given file.
+        If the file's encoding is not None, then the stream will
+        automatically decode the file's contents into unicode.
+
+        :param file: The file identifier of the file to read.
+        """
+        encoding = self.encoding(file)
+        stream = self._root.join(file).open(encoding)
+        return stream
+
+    def encoding(self, file):
+        """
+        Return the unicode encoding for the given corpus file, if known.
+        If the encoding is unknown, or if the given file should be
+        processed using byte strings (str), then return None.
+        """
+        if isinstance(self._encoding, dict):
+            return self._encoding.get(file)
+        else:
+            return self._encoding
+
+    def _get_root(self): return self._root
+    root = property(_get_root, doc="""
+        The directory where this corpus is stored.
+
+        :type: PathPointer""")
+
+
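+# Illustrative sketch (not part of upstream NLTK): how the generic accessors
+# defined above are typically used once a concrete reader subclass has been
+# constructed.  The corpus root and file pattern below are assumptions, not
+# part of this module.
+#
+#     >>> from nltk.corpus.reader import PlaintextCorpusReader
+#     >>> reader = PlaintextCorpusReader('/path/to/corpus', r'.*\.txt')
+#     >>> reader.fileids()                        # file identifiers
+#     >>> reader.abspaths(include_encoding=True)  # [(PathPointer, encoding), ...]
+#     >>> reader.open(reader.fileids()[0]).read()[:80]
+#     >>> reader.readme()                         # contents of the README file
+
+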
+######################################################################
+#{ Corpora containing categorized items
+######################################################################
+
+class CategorizedCorpusReader(object):
+    """
+    A mixin class used to aid in the implementation of corpus readers
+    for categorized corpora.  This class defines the method
+    ``categories()``, which returns a list of the categories for the
+    corpus or for a specified set of fileids; and overrides ``fileids()``
+    to take a ``categories`` argument, restricting the set of fileids to
+    be returned.
+
+    Subclasses are expected to:
+
+      - Call ``__init__()`` to set up the mapping.
+
+      - Override all view methods to accept a ``categories`` parameter,
+        which can be used *instead* of the ``fileids`` parameter, to
+        select which fileids should be included in the returned view.
+    """
+
+    def __init__(self, kwargs):
+        """
+        Initialize this mapping based on keyword arguments, as
+        follows:
+
+          - cat_pattern: A regular expression pattern used to find the
+            category for each file identifier.  The pattern will be
+            applied to each file identifier, and the first matching
+            group will be used as the category label for that file.
+
+          - cat_map: A dictionary, mapping from file identifiers to
+            category labels.
+
+          - cat_file: The name of a file that contains the mapping
+            from file identifiers to categories.  The argument
+            ``cat_delimiter`` can be used to specify a delimiter.
+
+        The corresponding argument will be deleted from ``kwargs``.  If
+        more than one argument is specified, an exception will be
+        raised.
+        """
+        self._f2c = None #: file-to-category mapping
+        self._c2f = None #: category-to-file mapping
+
+        self._pattern = None #: regexp specifying the mapping
+        self._map = None #: dict specifying the mapping
+        self._file = None #: fileid of file containing the mapping
+        self._delimiter = None #: delimiter for ``self._file``
+
+        if 'cat_pattern' in kwargs:
+            self._pattern = kwargs['cat_pattern']
+            del kwargs['cat_pattern']
+        elif 'cat_map' in kwargs:
+            self._map = kwargs['cat_map']
+            del kwargs['cat_map']
+        elif 'cat_file' in kwargs:
+            self._file = kwargs['cat_file']
+            del kwargs['cat_file']
+            if 'cat_delimiter' in kwargs:
+                self._delimiter = kwargs['cat_delimiter']
+                del kwargs['cat_delimiter']
+        else:
+            raise ValueError('Expected keyword argument cat_pattern or '
+                             'cat_map or cat_file.')
+
+
+        if ('cat_pattern' in kwargs or 'cat_map' in kwargs or
+            'cat_file' in kwargs):
+            raise ValueError('Specify exactly one of: cat_pattern, '
+                             'cat_map, cat_file.')
+
+    def _init(self):
+        self._f2c = defaultdict(set)
+        self._c2f = defaultdict(set)
+
+        if self._pattern is not None:
+            for file_id in self._fileids:
+                category = re.match(self._pattern, file_id).group(1)
+                self._add(file_id, category)
+
+        elif self._map is not None:
+            for (file_id, categories) in self._map.items():
+                for category in categories:
+                    self._add(file_id, category)
+
+        elif self._file is not None:
+            for line in self.open(self._file).readlines():
+                line = line.strip()
+                file_id, categories = line.split(self._delimiter, 1)
+                if file_id not in self.fileids():
+                    raise ValueError('In category mapping file %s: %s '
+                                     'not found' % (self._file, file_id))
+                for category in categories.split(self._delimiter):
+                    self._add(file_id, category)
+
+    def _add(self, file_id, category):
+        self._f2c[file_id].add(category)
+        self._c2f[category].add(file_id)
+
+    def categories(self, fileids=None):
+        """
+        Return a list of the categories that are defined for this corpus,
+        or for the file(s) if it is given.
+        """
+        if self._f2c is None:
+            self._init()
+        if fileids is None:
+            return sorted(self._c2f)
+        if isinstance(fileids, string_types):
+            fileids = [fileids]
+        return sorted(set.union(*[self._f2c[d] for d in fileids]))
+
+    def fileids(self, categories=None):
+        """
+        Return a list of file identifiers for the files that make up
+        this corpus, or that make up the given category(s) if specified.
+        """
+        if categories is None:
+            return super(CategorizedCorpusReader, self).fileids()
+        elif isinstance(categories, string_types):
+            if self._f2c is None:
+                self._init()
+            if categories in self._c2f:
+                return sorted(self._c2f[categories])
+            else:
+                raise ValueError('Category %s not found' % categories)
+        else:
+            if self._f2c is None:
+                self._init()
+            return sorted(set.union(*[self._c2f[c] for c in categories]))
+
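+# Illustrative sketch (not part of upstream NLTK): the three mutually
+# exclusive ways of handing the category mapping to a categorized reader,
+# shown with CategorizedPlaintextCorpusReader (defined in plaintext.py).
+# The root directory and file layout are assumptions.
+#
+#     >>> from nltk.corpus.reader import CategorizedPlaintextCorpusReader
+#     >>> # 1. derive the category from the fileid via a regexp group
+#     >>> reader = CategorizedPlaintextCorpusReader(
+#     ...     '/path/to/corpus', r'.*\.txt', cat_pattern=r'(\w+)/.*\.txt')
+#     >>> # 2. give an explicit fileid -> list-of-categories mapping
+#     >>> reader = CategorizedPlaintextCorpusReader(
+#     ...     '/path/to/corpus', r'.*\.txt', cat_map={'news/a.txt': ['news']})
+#     >>> # 3. read the mapping from a file shipped with the corpus
+#     >>> reader = CategorizedPlaintextCorpusReader(
+#     ...     '/path/to/corpus', r'.*\.txt', cat_file='cats.txt')
+#     >>> reader.categories(), reader.fileids(categories='news')
+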
+######################################################################
+#{ Treebank readers
+######################################################################
+
+#[xx] is it worth it to factor this out?
+class SyntaxCorpusReader(CorpusReader):
+    """
+    An abstract base class for reading corpora consisting of
+    syntactically parsed text.  Subclasses should define:
+
+      - ``__init__``, which specifies the location of the corpus
+        and a method for detecting the sentence blocks in corpus files.
+      - ``_read_block``, which reads a block from the input stream.
+      - ``_word``, which takes a block and returns a list of list of words.
+      - ``_tag``, which takes a block and returns a list of list of tagged
+        words.
+      - ``_parse``, which takes a block and returns a list of parsed
+        sentences.
+    """
+    def _parse(self, s):
+        raise NotImplementedError()
+    def _word(self, s):
+        raise NotImplementedError()
+    def _tag(self, s):
+        raise NotImplementedError()
+    def _read_block(self, stream):
+        raise NotImplementedError()
+
+    def raw(self, fileids=None):
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def parsed_sents(self, fileids=None):
+        reader = self._read_parsed_sent_block
+        return concat([StreamBackedCorpusView(fileid, reader, encoding=enc)
+                       for fileid, enc in self.abspaths(fileids, True)])
+
+    def tagged_sents(self, fileids=None, tagset=None):
+        def reader(stream):
+            return self._read_tagged_sent_block(stream, tagset)
+        return concat([StreamBackedCorpusView(fileid, reader, encoding=enc)
+                       for fileid, enc in self.abspaths(fileids, True)])
+
+    def sents(self, fileids=None):
+        reader = self._read_sent_block
+        return concat([StreamBackedCorpusView(fileid, reader, encoding=enc)
+                       for fileid, enc in self.abspaths(fileids, True)])
+
+    def tagged_words(self, fileids=None, tagset=None):
+        def reader(stream):
+            return self._read_tagged_word_block(stream, tagset)
+        return concat([StreamBackedCorpusView(fileid, reader, encoding=enc)
+                       for fileid, enc in self.abspaths(fileids, True)])
+
+    def words(self, fileids=None):
+        return concat([StreamBackedCorpusView(fileid,
+                                              self._read_word_block,
+                                              encoding=enc)
+                       for fileid, enc in self.abspaths(fileids, True)])
+
+    #------------------------------------------------------------
+    #{ Block Readers
+
+    def _read_word_block(self, stream):
+        return list(chain(*self._read_sent_block(stream)))
+
+    def _read_tagged_word_block(self, stream, tagset=None):
+        return list(chain(*self._read_tagged_sent_block(stream, tagset)))
+
+    def _read_sent_block(self, stream):
+        return list(filter(None, [self._word(t) for t in self._read_block(stream)]))
+
+    def _read_tagged_sent_block(self, stream, tagset=None):
+        return list(filter(None, [self._tag(t, tagset)
+                             for t in self._read_block(stream)]))
+
+    def _read_parsed_sent_block(self, stream):
+        return list(filter(None, [self._parse(t) for t in self._read_block(stream)]))
+
+    #} End of Block Readers
+    #------------------------------------------------------------
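+
+# Illustrative sketch (not part of upstream NLTK): the four hooks a
+# SyntaxCorpusReader subclass is expected to provide.  The block format
+# (one bracketed parse per blank-line-separated block) and the two regexps
+# are assumptions; concrete readers such as BracketParseCorpusReader do
+# essentially this.
+#
+#     class ToySyntaxReader(SyntaxCorpusReader):
+#         def _read_block(self, stream):
+#             return read_blankline_block(stream)
+#         def _word(self, block):
+#             return WORD_RE.findall(block)        # hypothetical regexp
+#         def _tag(self, block, tagset=None):
+#             return TAGWORD_RE.findall(block)     # hypothetical regexp
+#         def _parse(self, block):
+#             return Tree.fromstring(block)        # requires nltk.tree.Tree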
diff --git a/nlp_resource_data/nltk/corpus/reader/api.pyc b/nlp_resource_data/nltk/corpus/reader/api.pyc
new file mode 100755 (executable)
index 0000000..d927c3f
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/api.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/bnc.py b/nlp_resource_data/nltk/corpus/reader/bnc.py
new file mode 100755 (executable)
index 0000000..01ad9a1
--- /dev/null
@@ -0,0 +1,252 @@
+# Natural Language Toolkit: British National Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""Corpus reader for the XML version of the British National Corpus."""
+
+from nltk.corpus.reader.util import concat
+from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView, ElementTree
+
+
+class BNCCorpusReader(XMLCorpusReader):
+    """Corpus reader for the XML version of the British National Corpus.
+
+    For access to the complete XML data structure, use the ``xml()``
+    method.  For access to simple word lists and tagged word lists, use
+    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
+
+    You can obtain the full version of the BNC corpus at
+    http://www.ota.ox.ac.uk/desc/2554
+
+    If you extracted the archive to a directory called `BNC`, then you can
+    instantiate the reader as::
+
+        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
+
+    """
+
+    def __init__(self, root, fileids, lazy=True):
+        XMLCorpusReader.__init__(self, root, fileids)
+        self._lazy = lazy
+
+    def words(self, fileids=None, strip_space=True, stem=False):
+        """
+        :return: the given file(s) as a list of words
+            and punctuation symbols.
+        :rtype: list(str)
+
+        :param strip_space: If true, then strip trailing spaces from
+            word tokens.  Otherwise, leave the spaces on the tokens.
+        :param stem: If true, then use word stems instead of word strings.
+        """
+        return self._views(fileids, False, None, strip_space, stem)
+
+    def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
+        """
+        :return: the given file(s) as a list of tagged
+            words and punctuation symbols, encoded as tuples
+            ``(word,tag)``.
+        :rtype: list(tuple(str,str))
+
+        :param c5: If true, then the tags used will be the more detailed
+            c5 tags.  Otherwise, the simplified tags will be used.
+        :param strip_space: If true, then strip trailing spaces from
+            word tokens.  Otherwise, leave the spaces on the tokens.
+        :param stem: If true, then use word stems instead of word strings.
+        """
+        tag = 'c5' if c5 else 'pos'
+        return self._views(fileids, False, tag, strip_space, stem)
+
+    def sents(self, fileids=None, strip_space=True, stem=False):
+        """
+        :return: the given file(s) as a list of
+            sentences or utterances, each encoded as a list of word
+            strings.
+        :rtype: list(list(str))
+
+        :param strip_space: If true, then strip trailing spaces from
+            word tokens.  Otherwise, leave the spaces on the tokens.
+        :param stem: If true, then use word stems instead of word strings.
+        """
+        return self._views(fileids, True, None, strip_space, stem)
+
+    def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
+        """
+        :return: the given file(s) as a list of
+            sentences, each encoded as a list of ``(word,tag)`` tuples.
+        :rtype: list(list(tuple(str,str)))
+
+        :param c5: If true, then the tags used will be the more detailed
+            c5 tags.  Otherwise, the simplified tags will be used.
+        :param strip_space: If true, then strip trailing spaces from
+            word tokens.  Otherwise, leave the spaces on the tokens.
+        :param stem: If true, then use word stems instead of word strings.
+        """
+        tag = 'c5' if c5 else 'pos'
+        return self._views(fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem)
+
+    def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
+        """A helper function that instantiates BNCWordViews or the list of words/sentences."""
+        f = BNCWordView if self._lazy else self._words
+        return concat([f(fileid, sent, tag, strip_space, stem) for fileid in self.abspaths(fileids)])
+
+    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
+        """
+        Helper used to implement the view methods -- returns a list of
+        words or a list of sentences, optionally tagged.
+
+        :param fileid: The name of the underlying file.
+        :param bracket_sent: If true, include sentence bracketing.
+        :param tag: The name of the tagset to use, or None for no tags.
+        :param strip_space: If true, strip spaces from word tokens.
+        :param stem: If true, then substitute stems for words.
+        """
+        result = []
+
+        xmldoc = ElementTree.parse(fileid).getroot()
+        for xmlsent in xmldoc.findall('.//s'):
+            sent = []
+            for xmlword in _all_xmlwords_in(xmlsent):
+                word = xmlword.text
+                if not word:
+                    word = ""  # fixes issue 337?
+                if strip_space or stem:
+                    word = word.strip()
+                if stem:
+                    word = xmlword.get('hw', word)
+                if tag == 'c5':
+                    word = (word, xmlword.get('c5'))
+                elif tag == 'pos':
+                    word = (word, xmlword.get('pos', xmlword.get('c5')))
+                sent.append(word)
+            if bracket_sent:
+                result.append(BNCSentence(xmlsent.attrib['n'], sent))
+            else:
+                result.extend(sent)
+
+        assert None not in result
+        return result
+
+
+def _all_xmlwords_in(elt, result=None):
+    if result is None:
+        result = []
+    for child in elt:
+        if child.tag in ('c', 'w'):
+            result.append(child)
+        else:
+            _all_xmlwords_in(child, result)
+    return result
+
+
+class BNCSentence(list):
+    """
+    A list of words, augmented by an attribute ``num`` used to record
+    the sentence identifier (the ``n`` attribute from the XML).
+    """
+    def __init__(self, num, items):
+        self.num = num
+        list.__init__(self, items)
+
+
+class BNCWordView(XMLCorpusView):
+    """
+    A stream backed corpus view specialized for use with the BNC corpus.
+    """
+
+    tags_to_ignore = set(
+        ['pb', 'gap', 'vocal', 'event', 'unclear', 'shift', 'pause', 'align']
+    )
+    """These tags are ignored. For their description refer to the
+    technical documentation, for example,
+    http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html
+
+    """
+
+    def __init__(self, fileid, sent, tag, strip_space, stem):
+        """
+        :param fileid: The name of the underlying file.
+        :param sent: If true, include sentence bracketing.
+        :param tag: The name of the tagset to use, or None for no tags.
+        :param strip_space: If true, strip spaces from word tokens.
+        :param stem: If true, then substitute stems for words.
+        """
+        if sent:
+            tagspec = '.*/s'
+        else:
+            tagspec = '.*/s/(.*/)?(c|w)'
+        self._sent = sent
+        self._tag = tag
+        self._strip_space = strip_space
+        self._stem = stem
+
+        self.title = None  #: Title of the document.
+        self.author = None  #: Author of the document.
+        self.editor = None  #: Editor
+        self.resps = None  #: Statement of responsibility
+
+        XMLCorpusView.__init__(self, fileid, tagspec)
+
+        # Read in a tasty header.
+        self._open()
+        self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
+        self.close()
+
+        # Reset tag context.
+        self._tag_context = {0: ()}
+
+    def handle_header(self, elt, context):
+        # Set up some metadata!
+        titles = elt.findall('titleStmt/title')
+        if titles:
+            self.title = '\n'.join(title.text.strip() for title in titles)
+
+        authors = elt.findall('titleStmt/author')
+        if authors:
+            self.author = '\n'.join(author.text.strip() for author in authors)
+
+        editors = elt.findall('titleStmt/editor')
+        if editors:
+            self.editor = '\n'.join(editor.text.strip() for editor in editors)
+
+        resps = elt.findall('titleStmt/respStmt')
+        if resps:
+            self.resps = '\n\n'.join(
+                '\n'.join(
+                    resp_elt.text.strip() for resp_elt in resp
+                ) for resp in resps
+            )
+
+    def handle_elt(self, elt, context):
+        if self._sent:
+            return self.handle_sent(elt)
+        else:
+            return self.handle_word(elt)
+
+    def handle_word(self, elt):
+        word = elt.text
+        if not word:
+            word = ""  # fixes issue 337?
+        if self._strip_space or self._stem:
+            word = word.strip()
+        if self._stem:
+            word = elt.get('hw', word)
+        if self._tag == 'c5':
+            word = (word, elt.get('c5'))
+        elif self._tag == 'pos':
+            word = (word, elt.get('pos', elt.get('c5')))
+        return word
+
+    def handle_sent(self, elt):
+        sent = []
+        for child in elt:
+            if child.tag in ('mw', 'hi', 'corr', 'trunc'):
+                sent += [self.handle_word(w) for w in child]
+            elif child.tag in ('w', 'c'):
+                sent.append(self.handle_word(child))
+            elif child.tag not in self.tags_to_ignore:
+                raise ValueError('Unexpected element %s' % child.tag)
+        return BNCSentence(elt.attrib['n'], sent)
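+
+# Illustrative sketch (not part of upstream NLTK): typical use of the reader
+# defined above.  The corpus root and fileid pattern follow the class
+# docstring and are assumptions about the local BNC installation.
+#
+#     >>> bnc = BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
+#     >>> bnc.words()[:10]                  # plain word list
+#     >>> bnc.tagged_words(c5=True)[:10]    # (word, C5 tag) pairs
+#     >>> bnc.sents(stem=True)[0]           # first sentence, with word stems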
diff --git a/nlp_resource_data/nltk/corpus/reader/bnc.pyc b/nlp_resource_data/nltk/corpus/reader/bnc.pyc
new file mode 100755 (executable)
index 0000000..1c4da62
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/bnc.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/bracket_parse.py b/nlp_resource_data/nltk/corpus/reader/bracket_parse.py
new file mode 100755 (executable)
index 0000000..0944075
--- /dev/null
@@ -0,0 +1,227 @@
+# Natural Language Toolkit: Penn Treebank Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+"""
+Corpus reader for corpora that consist of parenthesis-delineated parse trees.
+"""
+
+import sys
+
+from nltk.tree import Tree
+from nltk.tag import map_tag
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+# we use [^\s()]+ instead of \S+? to avoid matching ()
+SORTTAGWRD = re.compile(r'\((\d+) ([^\s()]+) ([^\s()]+)\)') 
+TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')
+WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)')
+EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')
+
+class BracketParseCorpusReader(SyntaxCorpusReader):
+    """
+    Reader for corpora that consist of parenthesis-delineated parse trees,
+    like those found in the "combined" section of the Penn Treebank,
+    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".
+
+    """
+    def __init__(self, root, fileids, comment_char=None,
+                 detect_blocks='unindented_paren', encoding='utf8',
+                 tagset=None):
+        """
+        :param root: The root directory for this corpus.
+        :param fileids: A list or regexp specifying the fileids in this corpus.
+        :param comment_char: The character which can appear at the start of
+            a line to indicate that the rest of the line is a comment.
+        :param detect_blocks: The method that is used to find blocks
+          in the corpus; can be 'unindented_paren' (every unindented
+          parenthesis starts a new parse) or 'sexpr' (brackets are
+          matched).
+        :param tagset: The name of the tagset used by this corpus, to be used
+              for normalizing or converting the POS tags returned by the
+              tagged_...() methods.
+        """
+        CorpusReader.__init__(self, root, fileids, encoding)
+        self._comment_char = comment_char
+        self._detect_blocks = detect_blocks
+        self._tagset = tagset
+
+    def _read_block(self, stream):
+        if self._detect_blocks == 'sexpr':
+            return read_sexpr_block(stream, comment_char=self._comment_char)
+        elif self._detect_blocks == 'blankline':
+            return read_blankline_block(stream)
+        elif self._detect_blocks == 'unindented_paren':
+            # Tokens start with unindented left parens.
+            toks = read_regexp_block(stream, start_re=r'^\(')
+            # Strip any comments out of the tokens.
+            if self._comment_char:
+                toks = [re.sub('(?m)^%s.*'%re.escape(self._comment_char),
+                               '', tok)
+                        for tok in toks]
+            return toks
+        else:
+            assert 0, 'bad block type'
+
+    def _normalize(self, t):
+        # If there's an empty set of brackets surrounding the actual
+        # parse, then strip them off.
+        if EMPTY_BRACKETS.match(t):
+            t = t.strip()[1:-1]
+        # Replace leaves of the form (!), (,), with (! !), (, ,)
+        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
+        # Replace leaves of the form (tag word root) with (tag word)
+        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
+        return t
+
+    def _parse(self, t):
+        try:
+            return Tree.fromstring(self._normalize(t))
+
+        except ValueError as e:
+            sys.stderr.write("Bad tree detected; trying to recover...\n")
+            # Try to recover, if we can:
+            if e.args == ('mismatched parens',):
+                for n in range(1, 5):
+                    try:
+                        v = Tree.fromstring(self._normalize(t+')'*n))
+                        sys.stderr.write("  Recovered by adding %d close "
+                                         "paren(s)\n" % n)
+                        return v
+                    except ValueError: pass
+            # Try something else:
+            sys.stderr.write("  Recovered by returning a flat parse.\n")
+            #sys.stderr.write(' '.join(t.split())+'\n')
+            return Tree('S', self._tag(t))
+
+    def _tag(self, t, tagset=None):
+        tagged_sent = [(w,p) for (p,w) in TAGWORD.findall(self._normalize(t))]
+        if tagset and tagset != self._tagset:
+            tagged_sent = [(w, map_tag(self._tagset, tagset, p)) for (w,p) in tagged_sent]
+        return tagged_sent
+
+    def _word(self, t):
+        return WORD.findall(self._normalize(t))
+
+class CategorizedBracketParseCorpusReader(CategorizedCorpusReader,
+                                          BracketParseCorpusReader):
+    """
+    A reader for parsed corpora whose documents are
+    divided into categories based on their file identifiers.
+    @author: Nathan Schneider <nschneid@cs.cmu.edu>
+    """
+    def __init__(self, *args, **kwargs):
+        """
+        Initialize the corpus reader.  Categorization arguments
+        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
+        the ``CategorizedCorpusReader`` constructor.  The remaining
+        arguments are passed to the ``BracketParseCorpusReader``
+        constructor.
+        """
+        CategorizedCorpusReader.__init__(self, kwargs)
+        BracketParseCorpusReader.__init__(self, *args, **kwargs)
+
+    def _resolve(self, fileids, categories):
+        if fileids is not None and categories is not None:
+            raise ValueError('Specify fileids or categories, not both')
+        if categories is not None:
+            return self.fileids(categories)
+        else:
+            return fileids
+    def raw(self, fileids=None, categories=None):
+        return BracketParseCorpusReader.raw(
+            self, self._resolve(fileids, categories))
+    def words(self, fileids=None, categories=None):
+        return BracketParseCorpusReader.words(
+            self, self._resolve(fileids, categories))
+    def sents(self, fileids=None, categories=None):
+        return BracketParseCorpusReader.sents(
+            self, self._resolve(fileids, categories))
+    def paras(self, fileids=None, categories=None):
+        return BracketParseCorpusReader.paras(
+            self, self._resolve(fileids, categories))
+    def tagged_words(self, fileids=None, categories=None, tagset=None):
+        return BracketParseCorpusReader.tagged_words(
+            self, self._resolve(fileids, categories), tagset)
+    def tagged_sents(self, fileids=None, categories=None, tagset=None):
+        return BracketParseCorpusReader.tagged_sents(
+            self, self._resolve(fileids, categories), tagset)
+    def tagged_paras(self, fileids=None, categories=None, tagset=None):
+        return BracketParseCorpusReader.tagged_paras(
+            self, self._resolve(fileids, categories), tagset)
+    def parsed_words(self, fileids=None, categories=None):
+        return BracketParseCorpusReader.parsed_words(
+            self, self._resolve(fileids, categories))
+    def parsed_sents(self, fileids=None, categories=None):
+        return BracketParseCorpusReader.parsed_sents(
+            self, self._resolve(fileids, categories))
+    def parsed_paras(self, fileids=None, categories=None):
+        return BracketParseCorpusReader.parsed_paras(
+            self, self._resolve(fileids, categories))
+
+class AlpinoCorpusReader(BracketParseCorpusReader):
+    """
+    Reader for the Alpino Dutch Treebank.
+    This corpus has a lexical breakdown structure embedded, as read by _parse
+    Unfortunately this puts punctuation and some other words out of the sentence
+    order in the xml element tree. This is no good for tag_ and word_
+    _tag and _word will be overridden to use a non-default new parameter 'ordered'
+    to the overridden _normalize function. The _parse function can then remain 
+    untouched.
+    """
+    def __init__(self, root, encoding='ISO-8859-1', tagset=None):
+        BracketParseCorpusReader.__init__(self, root, 'alpino\.xml',
+                                 detect_blocks='blankline',
+                                 encoding=encoding,
+                                 tagset=tagset)
+
+    def _normalize(self, t, ordered = False):
+        """Normalize the xml sentence element in t.
+        The sentence elements <alpino_ds>, although embedded in a few
+        enclosing xml elements, are separated by blank lines.  That is how
+        the reader can deliver them one at a time.
+        Each sentence has a few category subnodes that are of no use to us.
+        The remaining word nodes may or may not appear in the proper order.
+        Each word node has attributes, among which:
+        - begin : the position of the word in the sentence
+        - pos   : the part-of-speech tag
+        - word  : the actual word
+        The return value is a string with all xml elements replaced by
+        clauses: either a cat clause with nested clauses, or a word clause.
+        The order of the bracket clauses closely follows the xml.
+        If ordered == True, the word clauses include an order sequence number.
+        If ordered == False, the word clauses only have pos and word parts.
+        """
+        if t[:10] != "<alpino_ds":
+            return ""
+        # convert XML to sexpr notation
+        t = re.sub(r'  <node .*? cat="(\w+)".*>', r"(\1", t)
+        if ordered:
+            t = re.sub(r'  <node .*?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2 \3)", t)
+        else: 
+            t = re.sub(r'  <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
+        t = re.sub(r"  </node>", r")", t)
+        t = re.sub(r"<sentence>.*</sentence>", r"", t)
+        t = re.sub(r"</?alpino_ds.*>", r"", t)
+        return t
+
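+    # Illustrative sketch (not part of upstream NLTK): the effect of
+    # _normalize above on a simplified, hypothetical Alpino word node
+    # (attribute names follow the docstring; the exact XML is an assumption):
+    #
+    #     <node begin="4" pos="verb" word="loopt"/>
+    #       ordered=False  ->  (verb loopt)
+    #       ordered=True   ->  (4 verb loopt)
+    #     and a category node <node ... cat="np" ...> opens a clause: "(np"
+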
+    def _tag(self, t, tagset=None):
+        tagged_sent = [(int(o), w, p) for (o,p,w) in SORTTAGWRD.findall(self._normalize(t, ordered = True))]
+        tagged_sent.sort()
+        if tagset and tagset != self._tagset:
+            tagged_sent = [(w, map_tag(self._tagset, tagset, p)) for (o,w,p) in tagged_sent]
+        else:
+            tagged_sent = [(w,p) for (o,w,p) in tagged_sent]
+        return tagged_sent
+
+    def _word(self, t):
+        """Return a correctly ordered list if words"""
+        tagged_sent = self._tag(t)
+        return [w for (w,p) in tagged_sent]      
+
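+
+# Illustrative sketch (not part of upstream NLTK): reading a directory of
+# bracketed parse files with the reader defined above.  The path and the
+# ``wsj_.*\.mrg`` fileid pattern are assumptions about the local treebank
+# layout.
+#
+#     >>> ptb = BracketParseCorpusReader('/path/to/treebank/combined',
+#     ...                                r'wsj_.*\.mrg')
+#     >>> ptb.parsed_sents()[0].pprint()    # an nltk.Tree
+#     >>> ptb.tagged_sents()[0][:5]         # [(word, tag), ...]
+#     >>> ptb.words()[:10]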
diff --git a/nlp_resource_data/nltk/corpus/reader/bracket_parse.pyc b/nlp_resource_data/nltk/corpus/reader/bracket_parse.pyc
new file mode 100755 (executable)
index 0000000..627a986
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/bracket_parse.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/categorized_sents.py b/nlp_resource_data/nltk/corpus/reader/categorized_sents.py
new file mode 100755 (executable)
index 0000000..fa139c2
--- /dev/null
@@ -0,0 +1,179 @@
+# Natural Language Toolkit: Categorized Sentences Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+CorpusReader structured for corpora that contain one instance on each row.
+This CorpusReader is specifically used for the Subjectivity Dataset and the
+Sentence Polarity Dataset.
+
+- Subjectivity Dataset information -
+
+Authors: Bo Pang and Lillian Lee.
+Url: http://www.cs.cornell.edu/people/pabo/movie-review-data
+
+Distributed with permission.
+
+Related papers:
+
+- Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using
+    Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL,
+    2004.
+
+- Sentence Polarity Dataset information -
+
+Authors: Bo Pang and Lillian Lee.
+Url: http://www.cs.cornell.edu/people/pabo/movie-review-data
+
+Related papers:
+
+- Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for
+    sentiment categorization with respect to rating scales". Proceedings of the
+    ACL, 2005.
+"""
+from six import string_types
+
+from nltk.corpus.reader.api import *
+from nltk.tokenize import *
+
+class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
+    """
+    A reader for corpora in which each row represents a single instance, mainly
+    a sentence.  Instances are divided into categories based on their file
+    identifiers (see CategorizedCorpusReader).
+    Since many corpora allow rows that contain more than one sentence, it is
+    possible to specify a sentence tokenizer to retrieve all sentences instead
+    of all rows.
+
+    Examples using the Subjectivity Dataset:
+
+    >>> from nltk.corpus import subjectivity
+    >>> subjectivity.sents()[23]
+    ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits',
+    'happened', 'off', 'screen', '.']
+    >>> subjectivity.categories()
+    ['obj', 'subj']
+    >>> subjectivity.words(categories='subj')
+    ['smart', 'and', 'alert', ',', 'thirteen', ...]
+
+    Examples using the Sentence Polarity Dataset:
+
+    >>> from nltk.corpus import sentence_polarity
+    >>> sentence_polarity.sents()
+    [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish',
+    'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find',
+    'it', 'funny', '.'], ...]
+    >>> sentence_polarity.categories()
+    ['neg', 'pos']
+    """
+
+    CorpusView = StreamBackedCorpusView
+
+    def __init__(self, root, fileids, word_tokenizer=WhitespaceTokenizer(),
+                 sent_tokenizer=None, encoding='utf8', **kwargs):
+        """
+        :param root: The root directory for the corpus.
+        :param fileids: a list or regexp specifying the fileids in the corpus.
+        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
+            into words. Default: `WhitespaceTokenizer`
+        :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences.
+        :param encoding: the encoding that should be used to read the corpus.
+        :param kwargs: additional parameters passed to CategorizedCorpusReader.
+        """
+
+        CorpusReader.__init__(self, root, fileids, encoding)
+        CategorizedCorpusReader.__init__(self, kwargs)
+        self._word_tokenizer = word_tokenizer
+        self._sent_tokenizer = sent_tokenizer
+
+    def _resolve(self, fileids, categories):
+        if fileids is not None and categories is not None:
+            raise ValueError('Specify fileids or categories, not both')
+        if categories is not None:
+            return self.fileids(categories)
+        else:
+            return fileids
+
+    def raw(self, fileids=None, categories=None):
+        """
+        :param fileids: a list or regexp specifying the fileids that have to be
+            returned as a raw string.
+        :param categories: a list specifying the categories whose files have to
+            be returned as a raw string.
+        :return: the given file(s) as a single string.
+        :rtype: str
+        """
+        fileids = self._resolve(fileids, categories)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def readme(self):
+        """
+        Return the contents of the corpus README file.
+        """
+        return self.open("README").read()
+
+    def sents(self, fileids=None, categories=None):
+        """
+        Return all sentences in the corpus or in the specified file(s).
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            sentences have to be returned.
+        :param categories: a list specifying the categories whose sentences have
+            to be returned.
+        :return: the given file(s) as a list of sentences.
+            Each sentence is tokenized using the specified word_tokenizer.
+        :rtype: list(list(str))
+        """
+        fileids = self._resolve(fileids, categories)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
+            for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+    def words(self, fileids=None, categories=None):
+        """
+        Return all words and punctuation symbols in the corpus or in the specified
+        file(s).
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            words have to be returned.
+        :param categories: a list specifying the categories whose words have to
+            be returned.
+        :return: the given file(s) as a list of words and punctuation symbols.
+        :rtype: list(str)
+        """
+        fileids = self._resolve(fileids, categories)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+            for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+    def _read_sent_block(self, stream):
+        sents = []
+        for i in range(20): # Read 20 lines at a time.
+            line = stream.readline()
+            if not line:
+                continue
+            if self._sent_tokenizer:
+                sents.extend([self._word_tokenizer.tokenize(sent)
+                              for sent in self._sent_tokenizer.tokenize(line)])
+            else:
+                sents.append(self._word_tokenizer.tokenize(line))
+        return sents
+
+    def _read_word_block(self, stream):
+        words = []
+        for sent in self._read_sent_block(stream):
+            words.extend(sent)
+        return words
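+
+# Illustrative sketch (not part of upstream NLTK): constructing the reader
+# directly for a corpus laid out as one file per category.  The root, the
+# file names and the cat_pattern are assumptions; the bundled subjectivity
+# and sentence_polarity corpora are normally reached through nltk.corpus.
+#
+#     >>> from nltk.tokenize import RegexpTokenizer
+#     >>> reader = CategorizedSentencesCorpusReader(
+#     ...     '/path/to/rotten_imdb', r'.*\.5000',
+#     ...     word_tokenizer=RegexpTokenizer(r"\w+|[^\w\s]+"),
+#     ...     cat_pattern=r'(quote|plot)\.')
+#     >>> reader.categories()
+#     >>> reader.sents(categories='quote')[:2]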
diff --git a/nlp_resource_data/nltk/corpus/reader/categorized_sents.pyc b/nlp_resource_data/nltk/corpus/reader/categorized_sents.pyc
new file mode 100755 (executable)
index 0000000..7ea0ec1
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/categorized_sents.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/chasen.py b/nlp_resource_data/nltk/corpus/reader/chasen.py
new file mode 100755 (executable)
index 0000000..eaf85dc
--- /dev/null
@@ -0,0 +1,140 @@
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Masato Hagiwara <hagisan@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
+from __future__ import print_function
+
+import sys
+
+from six import string_types
+
+from nltk.corpus.reader import util
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+class ChasenCorpusReader(CorpusReader):
+
+    def __init__(self, root, fileids, encoding='utf8', sent_splitter=None):
+        self._sent_splitter = sent_splitter
+        CorpusReader.__init__(self, root, fileids, encoding)
+
+    def raw(self, fileids=None):
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def words(self, fileids=None):
+        return concat([ChasenCorpusView(fileid, enc,
+                                        False, False, False, self._sent_splitter)
+            for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def tagged_words(self, fileids=None):
+        return concat([ChasenCorpusView(fileid, enc,
+                                        True, False, False, self._sent_splitter)
+            for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def sents(self, fileids=None):
+        return concat([ChasenCorpusView(fileid, enc,
+                                        False, True, False, self._sent_splitter)
+            for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def tagged_sents(self, fileids=None):
+        return concat([ChasenCorpusView(fileid, enc,
+                                        True, True, False, self._sent_splitter)
+            for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def paras(self, fileids=None):
+        return concat([ChasenCorpusView(fileid, enc,
+                                        False, True, True, self._sent_splitter)
+            for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def tagged_paras(self, fileids=None):
+        return concat([ChasenCorpusView(fileid, enc,
+                                        True, True, True, self._sent_splitter)
+            for (fileid, enc) in self.abspaths(fileids, True)])
+
+
+class ChasenCorpusView(StreamBackedCorpusView):
+    """
+    A specialized corpus view for ChasenCorpusReader.  Similar to
+    ``TaggedCorpusView``, but it uses a fixed word and sentence tokenizer.
+    """
+
+    def __init__(self, corpus_file, encoding,
+                 tagged, group_by_sent, group_by_para, sent_splitter=None):
+        self._tagged = tagged
+        self._group_by_sent = group_by_sent
+        self._group_by_para = group_by_para
+        self._sent_splitter = sent_splitter
+        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
+
+
+    def read_block(self, stream):
+        """Reads one paragraph at a time."""
+        block = []
+        for para_str in read_regexp_block(stream, r".", r"^EOS\n"):
+
+            para = []
+
+            sent = []
+            for line in para_str.splitlines():
+
+                _eos = line.strip() == 'EOS'
+                _cells = line.split('\t')
+                w = (_cells[0], '\t'.join(_cells[1:]))
+                if not _eos: sent.append(w)
+
+                if _eos or (self._sent_splitter and self._sent_splitter(w)):
+                    if not self._tagged:
+                        sent = [w for (w,t) in sent]
+                    if self._group_by_sent:
+                        para.append(sent)
+                    else:
+                        para.extend(sent)
+                    sent = []
+
+            if len(sent)>0:
+                if not self._tagged:
+                    sent = [w for (w,t) in sent]
+
+                if self._group_by_sent:
+                    para.append(sent)
+                else:
+                    para.extend(sent)
+
+            if self._group_by_para:
+                block.append(para)
+            else:
+                block.extend(para)
+
+        return block
+
+def demo():
+
+    import nltk
+    from nltk.corpus.util import LazyCorpusLoader
+
+    jeita = LazyCorpusLoader(
+        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
+    print('/'.join( jeita.words()[22100:22140] ))
+
+
+    print('\nEOS\n'.join('\n'.join("%s/%s" % (w[0],w[1].split('\t')[2]) for w in sent)
+                          for sent in jeita.tagged_sents()[2170:2173]))
+
+def test():
+
+    from nltk.corpus.util import LazyCorpusLoader
+
+    jeita = LazyCorpusLoader(
+        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
+
+    assert isinstance(jeita.tagged_words()[0][1], string_types)
+
+if __name__ == '__main__':
+    demo()
+    test()
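+
+# Illustrative sketch (not part of upstream NLTK): the optional sent_splitter
+# callback receives each (surface, features) token and should return True at
+# a sentence boundary.  Splitting on the Japanese full stop is an assumption
+# about the corpus at hand.
+#
+#     >>> split_on_maru = lambda w: w[0] == u'\u3002'   # the character '。'
+#     >>> jeita = LazyCorpusLoader('jeita', ChasenCorpusReader, r'.*chasen',
+#     ...                          encoding='utf-8',
+#     ...                          sent_splitter=split_on_maru)
+#     >>> jeita.sents()[:2]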
diff --git a/nlp_resource_data/nltk/corpus/reader/chasen.pyc b/nlp_resource_data/nltk/corpus/reader/chasen.pyc
new file mode 100755 (executable)
index 0000000..45a8040
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/chasen.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/childes.py b/nlp_resource_data/nltk/corpus/reader/childes.py
new file mode 100755 (executable)
index 0000000..0b092f1
--- /dev/null
@@ -0,0 +1,522 @@
+# CHILDES XML Corpus Reader
+
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Tomonori Nagano <tnagano@gc.cuny.edu>
+#         Alexis Dimitriadis <A.Dimitriadis@uu.nl>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Corpus reader for the XML version of the CHILDES corpus.
+"""
+from __future__ import print_function, division
+
+__docformat__ = 'epytext en'
+
+import re
+from collections import defaultdict
+from six import string_types
+
+from nltk.util import flatten, LazyMap, LazyConcatenation
+
+from nltk.corpus.reader.util import concat
+from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
+
+# to resolve the namespace issue
+NS = 'http://www.talkbank.org/ns/talkbank'
+
+class CHILDESCorpusReader(XMLCorpusReader):
+    """
+    Corpus reader for the XML version of the CHILDES corpus.
+    The CHILDES corpus is available at ``http://childes.psy.cmu.edu/``. The XML
+    version of CHILDES is located at ``http://childes.psy.cmu.edu/data-xml/``.
+    Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
+    (``nltk_data/corpora/CHILDES/``).
+
+    For access to the file text use the usual nltk functions,
+    ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
+    """
+    def __init__(self, root, fileids, lazy=True):
+        XMLCorpusReader.__init__(self, root, fileids)
+        self._lazy = lazy
+
+    def words(self, fileids=None, speaker='ALL', stem=False,
+            relation=False, strip_space=True, replace=False):
+        """
+        :return: the given file(s) as a list of words
+        :rtype: list(str)
+
+        :param speaker: If specified, select specific speaker(s) defined
+            in the corpus. Default is 'ALL' (all participants). Common choices
+            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
+            researchers)
+        :param stem: If true, then use word stems instead of word strings.
+        :param relation: If true, then return tuples of (stem, index,
+            dependent_index)
+        :param strip_space: If true, then strip trailing spaces from word
+            tokens. Otherwise, leave the spaces on the tokens.
+        :param replace: If true, then use the replaced (intended) word instead
+            of the original word (e.g., 'wat' will be replaced with 'watch')
+        """
+        sent=None
+        pos=False
+        if not self._lazy:
+            return [self._get_words(fileid, speaker, sent, stem, relation,
+                pos, strip_space, replace) for fileid in self.abspaths(fileids)]
+
+        get_words = lambda fileid: self._get_words(fileid, speaker, sent, stem, relation,
+            pos, strip_space, replace)
+        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
+
+    def tagged_words(self, fileids=None, speaker='ALL', stem=False,
+            relation=False, strip_space=True, replace=False):
+        """
+        :return: the given file(s) as a list of tagged
+            words and punctuation symbols, encoded as tuples
+            ``(word,tag)``.
+        :rtype: list(tuple(str,str))
+
+        :param speaker: If specified, select specific speaker(s) defined
+            in the corpus. Default is 'ALL' (all participants). Common choices
+            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
+            researchers)
+        :param stem: If true, then use word stems instead of word strings.
+        :param relation: If true, then return tuples of (stem, index,
+            dependent_index)
+        :param strip_space: If true, then strip trailing spaces from word
+            tokens. Otherwise, leave the spaces on the tokens.
+        :param replace: If true, then use the replaced (intended) word instead
+            of the original word (e.g., 'wat' will be replaced with 'watch')
+        """
+        sent=None
+        pos=True
+        if not self._lazy:
+            return [self._get_words(fileid, speaker, sent, stem, relation,
+                pos, strip_space, replace) for fileid in self.abspaths(fileids)]
+
+        get_words = lambda fileid: self._get_words(fileid, speaker, sent, stem, relation,
+            pos, strip_space, replace)
+        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
+
+    def sents(self, fileids=None, speaker='ALL', stem=False,
+            relation=None, strip_space=True, replace=False):
+        """
+        :return: the given file(s) as a list of sentences or utterances, each
+            encoded as a list of word strings.
+        :rtype: list(list(str))
+
+        :param speaker: If specified, select specific speaker(s) defined
+            in the corpus. Default is 'ALL' (all participants). Common choices
+            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
+            researchers)
+        :param stem: If true, then use word stems instead of word strings.
+        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
+            If there is manually-annotated relation info, it will return
+            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
+        :param strip_space: If true, then strip trailing spaces from word
+            tokens. Otherwise, leave the spaces on the tokens.
+        :param replace: If true, then use the replaced (intended) word instead
+            of the original word (e.g., 'wat' will be replaced with 'watch')
+        """
+        sent=True
+        pos=False
+        if not self._lazy:
+            return [self._get_words(fileid, speaker, sent, stem, relation,
+                pos, strip_space, replace) for fileid in self.abspaths(fileids)]
+
+        get_words = lambda fileid: self._get_words(fileid, speaker, sent, stem, relation,
+            pos, strip_space, replace)
+        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
+
+    def tagged_sents(self, fileids=None, speaker='ALL', stem=False,
+            relation=None, strip_space=True, replace=False):
+        """
+        :return: the given file(s) as a list of
+            sentences, each encoded as a list of ``(word,tag)`` tuples.
+        :rtype: list(list(tuple(str,str)))
+
+        :param speaker: If specified, select specific speaker(s) defined
+            in the corpus. Default is 'ALL' (all participants). Common choices
+            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
+            researchers)
+        :param stem: If true, then use word stems instead of word strings.
+        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
+            If there is manually-annotated relation info, it will return
+            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
+        :param strip_space: If true, then strip trailing spaces from word
+            tokens. Otherwise, leave the spaces on the tokens.
+        :param replace: If true, then use the replaced (intended) word instead
+            of the original word (e.g., 'wat' will be replaced with 'watch')
+        """
+        sent=True
+        pos=True
+        if not self._lazy:
+            return [self._get_words(fileid, speaker, sent, stem, relation,
+                pos, strip_space, replace) for fileid in self.abspaths(fileids)]
+
+        get_words = lambda fileid: self._get_words(fileid, speaker, sent, stem, relation,
+            pos, strip_space, replace)
+        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
+
+    def corpus(self, fileids=None):
+        """
+        :return: the given file(s) as a dict of ``(corpus_property_key, value)``
+        :rtype: list(dict)
+        """
+        if not self._lazy:
+            return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)]
+        return LazyMap(self._get_corpus, self.abspaths(fileids))
+
+    def _get_corpus(self, fileid):
+        results = dict()
+        xmldoc = ElementTree.parse(fileid).getroot()
+        for key, value in xmldoc.items():
+            results[key] = value
+        return results
+
+    def participants(self, fileids=None):
+        """
+        :return: the given file(s) as a dict of
+            ``(participant_property_key, value)``
+        :rtype: list(dict)
+        """
+        if not self._lazy:
+            return [self._get_participants(fileid) for fileid in self.abspaths(fileids)]
+        return LazyMap(self._get_participants, self.abspaths(fileids))
+
+    def _get_participants(self, fileid):
+        # multidimensional dicts
+        def dictOfDicts():
+            return defaultdict(dictOfDicts)
+
+        xmldoc = ElementTree.parse(fileid).getroot()
+        # getting participants' data
+        pat = dictOfDicts()
+        for participant in xmldoc.findall('.//{%s}Participants/{%s}participant'
+                                          % (NS,NS)):
+            for (key,value) in participant.items():
+                pat[participant.get('id')][key] = value
+        return pat
+
+    def age(self, fileids=None, speaker='CHI', month=False):
+        """
+        :return: the given file(s) as string or int
+        :rtype: list or int
+
+        :param month: If true, return months instead of year-month-date
+        """
+        if not self._lazy:
+            return [self._get_age(fileid, speaker, month)
+                for fileid in self.abspaths(fileids)]
+        get_age = lambda fileid: self._get_age(fileid, speaker, month)
+        return LazyMap(get_age, self.abspaths(fileids))
+
+    def _get_age(self, fileid, speaker, month):
+        xmldoc = ElementTree.parse(fileid).getroot()
+        for pat in xmldoc.findall('.//{%s}Participants/{%s}participant'
+                                  % (NS,NS)):
+            try:
+                if pat.get('id') == speaker:
+                    age = pat.get('age')
+                    if month:
+                        age = self.convert_age(age)
+                    return age
+            # some files don't have age data
+            except (TypeError, AttributeError) as e:
+                return None
+
+    def convert_age(self, age_year):
+        "Caclculate age in months from a string in CHILDES format"
+        m = re.match("P(\d+)Y(\d+)M?(\d?\d?)D?",age_year)
+        age_month = int(m.group(1))*12 + int(m.group(2))
+        try:
+            if int(m.group(3)) > 15:
+                age_month += 1
+        # some corpora don't have age information?
+        except ValueError as e:
+            pass
+        return age_month
+
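+    # Illustrative worked example (not part of upstream NLTK) for
+    # convert_age above: "P2Y6M14D" -> 2*12 + 6 = 30 months (the day part,
+    # 14, is <= 15, so no extra month); "P1Y11M20D" -> 23 + 1 = 24 months.
+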
+    def MLU(self, fileids=None, speaker='CHI'):
+        """
+        :return: the given file(s) as a floating number
+        :rtype: list(float)
+        """
+        if not self._lazy:
+            return [self._getMLU(fileid, speaker=speaker)
+                for fileid in self.abspaths(fileids)]
+        get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker)
+        return LazyMap(get_MLU, self.abspaths(fileids))
+
+    def _getMLU(self, fileid, speaker):
+        sents = self._get_words(fileid, speaker=speaker, sent=True, stem=True,
+                    relation=False, pos=True, strip_space=True, replace=True)
+        results = []
+        lastSent = []
+        numFillers = 0
+        sentDiscount = 0
+        for sent in sents:
+            posList = [pos for (word,pos) in sent]
+            # if any part of the sentence is unintelligible, skip it
+            if any(pos == 'unk' for pos in posList):
+                pass
+            # if the sentence is null, skip it
+            elif sent == []:
+                pass
+            # if the sentence is the same as the last sentence, skip it
+            elif sent == lastSent:
+                pass
+            else:
+                results.append([word for (word,pos) in sent])
+                # count number of fillers
+                if len(set(['co',None]).intersection(posList)) > 0:
+                    numFillers += posList.count('co')
+                    numFillers += posList.count(None)
+                    sentDiscount += 1
+            lastSent = sent
+        try:
+            thisWordList = flatten(results)
+            # count number of morphemes
+            # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
+            numWords = len(flatten([word.split('-')
+                                          for word in thisWordList])) - numFillers
+            numSents = len(results) - sentDiscount
+            mlu = numWords/numSents
+        except ZeroDivisionError:
+            mlu = 0
+        # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
+        return mlu
+
+    def _get_words(self, fileid, speaker, sent, stem, relation, pos,
+            strip_space, replace):
+        if isinstance(speaker, string_types) and speaker != 'ALL':  # ensure we have a list of speakers
+            speaker = [ speaker ]
+        xmldoc = ElementTree.parse(fileid).getroot()
+        # processing each xml doc
+        results = []
+        for xmlsent in xmldoc.findall('.//{%s}u' % NS):
+            sents = []
+            # select speakers
+            if speaker == 'ALL' or xmlsent.get('who') in speaker:
+                for xmlword in xmlsent.findall('.//{%s}w' % NS):
+                    infl = None ; suffixStem = None; suffixTag = None
+                    # getting replaced words
+                    if replace and xmlsent.find('.//{%s}w/{%s}replacement'
+                                                % (NS,NS)) is not None:
+                        xmlword = xmlsent.find('.//{%s}w/{%s}replacement/{%s}w'
+                                               % (NS,NS,NS))
+                    elif replace and xmlsent.find('.//{%s}w/{%s}wk' % (NS,NS)) is not None:
+                        xmlword = xmlsent.find('.//{%s}w/{%s}wk' % (NS,NS))
+                    # get text
+                    if xmlword.text:
+                        word = xmlword.text
+                    else:
+                        word = ''
+                    # strip tailing space
+                    if strip_space:
+                        word = word.strip()
+                    # stem
+                    if relation or stem:
+                        try:
+                            xmlstem = xmlword.find('.//{%s}stem' % NS)
+                            word = xmlstem.text
+                        except AttributeError as e:
+                            pass
+                        # if there is an inflection
+                        try:
+                            xmlinfl = xmlword.find('.//{%s}mor/{%s}mw/{%s}mk'
+                                                   % (NS,NS,NS))
+                            word += '-' + xmlinfl.text
+                        except:
+                            pass
+                        # if there is a suffix
+                        try:
+                            xmlsuffix = xmlword.find('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
+                                                     % (NS,NS,NS,NS))
+                            suffixStem = xmlsuffix.text
+                        except AttributeError:
+                            suffixStem = ""
+                        if suffixStem:
+                            word += "~"+suffixStem
+                    # pos
+                    if relation or pos:
+                        try:
+                            xmlpos = xmlword.findall(".//{%s}c" % NS)
+                            xmlpos2 = xmlword.findall(".//{%s}s" % NS)
+                            if xmlpos2 != []:
+                                tag = xmlpos[0].text+":"+xmlpos2[0].text
+                            else:
+                                tag = xmlpos[0].text
+                        except (AttributeError,IndexError) as e:
+                            tag = ""
+                        try:
+                            xmlsuffixpos = xmlword.findall('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c'
+                                                     % (NS,NS,NS,NS,NS))
+                            xmlsuffixpos2 = xmlword.findall('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s'
+                                                     % (NS,NS,NS,NS,NS))
+                            if xmlsuffixpos2:
+                                suffixTag = xmlsuffixpos[0].text+":"+xmlsuffixpos2[0].text
+                            else:
+                                suffixTag = xmlsuffixpos[0].text
+                        except:
+                            pass
+                        if suffixTag:
+                            tag += "~"+suffixTag
+                        word = (word, tag)
+                    # relational
+                    # the gold standard is stored in
+                    # <mor></mor><mor type="trn"><gra type="grt">
+                    if relation == True:
+                        for xmlstem_rel in xmlword.findall('.//{%s}mor/{%s}gra'
+                                                           % (NS,NS)):
+                            if not xmlstem_rel.get('type') == 'grt':
+                                word = (word[0], word[1],
+                                        xmlstem_rel.get('index')
+                                        + "|" + xmlstem_rel.get('head')
+                                        + "|" + xmlstem_rel.get('relation'))
+                            else:
+                                word = (word[0], word[1], word[2],
+                                        word[0], word[1],
+                                        xmlstem_rel.get('index')
+                                        + "|" + xmlstem_rel.get('head')
+                                        + "|" + xmlstem_rel.get('relation'))
+                        try:
+                            for xmlpost_rel in xmlword.findall('.//{%s}mor/{%s}mor-post/{%s}gra'
+                                                               % (NS,NS,NS)):
+                                if not xmlpost_rel.get('type') == 'grt':
+                                    suffixStem = (suffixStem[0],
+                                                  suffixStem[1],
+                                                  xmlpost_rel.get('index')
+                                                  + "|" + xmlpost_rel.get('head')
+                                                  + "|" + xmlpost_rel.get('relation'))
+                                else:
+                                    suffixStem = (suffixStem[0], suffixStem[1],
+                                                  suffixStem[2], suffixStem[0],
+                                                  suffixStem[1],
+                                                  xmlpost_rel.get('index')
+                                                  + "|" + xmlpost_rel.get('head')
+                                                  + "|" + xmlpost_rel.get('relation'))
+                        except:
+                            pass
+                    sents.append(word)
+                if sent or relation:
+                    results.append(sents)
+                else:
+                    results.extend(sents)
+        return LazyMap(lambda x: x, results)
+
+
+    # Ready-to-use browser opener
+
+    """
+    The base URL for viewing files on the childes website. This
+    shouldn't need to be changed, unless CHILDES changes the configuration
+    of their server or unless the user sets up their own corpus webserver.
+    """
+    childes_url_base = r'http://childes.psy.cmu.edu/browser/index.php?url='
+
+
+    def webview_file(self, fileid, urlbase=None):
+        """Map a corpus file to its web version on the CHILDES website,
+        and open it in a web browser.
+
+        The complete URL to be used is:
+            childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')
+
+        If no urlbase is passed, we try to calculate it.  This
+        requires that the childes corpus was set up to mirror the
+        folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
+        nltk_data/corpora/childes/Eng-USA/Cornell/??? or
+        nltk_data/corpora/childes/Romance/Spanish/Aguirre/???
+
+        The function first looks (as a special case) if "Eng-USA" is
+        on the path consisting of <corpus root>+fileid; then if
+        "childes", possibly followed by "data-xml", appears. If neither
+        one is found, we use the unmodified fileid and hope for the best.
+        If this is not right, specify urlbase explicitly, e.g., if the
+        corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
+        """
+
+        import webbrowser, re
+
+        if urlbase:
+            path = urlbase+"/"+fileid
+        else:
+            full = self.root + "/" + fileid
+            full = re.sub(r'\\', '/', full)
+            if '/childes/' in full.lower():
+                # Discard /data-xml/ if present
+                path = re.findall(r'(?i)/childes(?:/data-xml)?/(.*)\.xml', full)[0]
+            elif 'eng-usa' in full.lower():
+                path = 'Eng-USA/' + re.findall(r'(?i)/Eng-USA/(.*)\.xml', full)[0]
+            else:
+                path = fileid
+
+        # Strip ".xml" and add ".cha", as necessary:
+        if path.endswith('.xml'):
+            path = path[:-4]
+
+        if not path.endswith('.cha'):
+            path = path+'.cha'
+
+        url = self.childes_url_base + path
+
+        webbrowser.open_new_tab(url)
+        print("Opening in browser:", url)
+        # Pausing is a good idea, but it's up to the user...
+        # raw_input("Hit Return to continue")
+
+
+
+def demo(corpus_root=None):
+    """
+    The CHILDES corpus should be manually downloaded and saved
+    to ``[NLTK_Data_Dir]/corpora/childes/``
+    """
+    if not corpus_root:
+        from nltk.data import find
+        corpus_root = find('corpora/childes/data-xml/Eng-USA/')
+
+    try:
+        childes = CHILDESCorpusReader(corpus_root, '.*.xml')
+        # describe all corpus
+        for file in childes.fileids()[:5]:
+            corpus = ''
+            corpus_id = ''
+            for (key,value) in childes.corpus(file)[0].items():
+                if key == "Corpus": corpus = value
+                if key == "Id": corpus_id = value
+            print('Reading', corpus,corpus_id,' .....')
+            print("words:", childes.words(file)[:7],"...")
+            print("words with replaced words:", childes.words(file, replace=True)[:7]," ...")
+            print("words with pos tags:", childes.tagged_words(file)[:7]," ...")
+            print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
+            print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
+            print("stemmed words:", childes.words(file, stem=True)[:7]," ...")
+            print("words with relations and pos-tag:", childes.words(file, relation=True)[:5]," ...")
+            print("sentence:", childes.sents(file)[:2]," ...")
+            for (participant, values) in childes.participants(file)[0].items():
+                    for (key, value) in values.items():
+                        print("\tparticipant", participant, key, ":", value)
+            print("num of sent:", len(childes.sents(file)))
+            print("num of morphemes:", len(childes.words(file, stem=True)))
+            print("age:", childes.age(file))
+            print("age in month:", childes.age(file, month=True))
+            print("MLU:", childes.MLU(file))
+            print()
+
+    except LookupError as e:
+        print("""The CHILDES corpus, or the parts you need, should be manually
+        downloaded from http://childes.psy.cmu.edu/data-xml/ and saved at
+        [NLTK_Data_Dir]/corpora/childes/
+            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
+        demo('/path/to/childes/data-xml/Eng-USA/')
+        """)
+        #corpus_root_http = urllib2.urlopen('http://childes.psy.cmu.edu/data-xml/Eng-USA/Bates.zip')
+        #corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
+        ##this fails
+        #childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
+
+
+if __name__ == "__main__":
+    demo()
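The reader above exposes age() and MLU() alongside the usual words()/sents() accessors. A minimal usage sketch, assuming the CHILDES XML data (here the Valian corpus, chosen only for illustration) has been unpacked under nltk_data/corpora/childes/data-xml/Eng-USA/:

    # sketch only: the corpus path and fileid pattern below are assumptions
    import nltk
    from nltk.corpus.reader import CHILDESCorpusReader

    corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA/')
    childes = CHILDESCorpusReader(corpus_root, r'Valian/.*\.xml')

    fileid = childes.fileids()[0]
    print(childes.age(fileid, month=True))            # target child's age in months
    print(childes.MLU(fileid))                        # mean length of utterance for CHI
    print(childes.words(fileid, speaker='MOT')[:10])  # first ten words by the mother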
diff --git a/nlp_resource_data/nltk/corpus/reader/childes.pyc b/nlp_resource_data/nltk/corpus/reader/childes.pyc
new file mode 100755 (executable)
index 0000000..922081e
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/childes.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/chunked.py b/nlp_resource_data/nltk/corpus/reader/chunked.py
new file mode 100755 (executable)
index 0000000..b87ae06
--- /dev/null
@@ -0,0 +1,212 @@
+# Natural Language Toolkit: Chunked Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A reader for corpora that contain chunked (and optionally tagged)
+documents.
+"""
+
+import os.path, codecs
+
+from six import string_types
+
+import nltk
+from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
+from nltk.tree import Tree
+from nltk.tokenize import *
+from nltk.chunk import tagstr2tree
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+class ChunkedCorpusReader(CorpusReader):
+    """
+    Reader for chunked (and optionally tagged) corpora.  Paragraphs
+    are split using a block reader.  They are then tokenized into
+    sentences using a sentence tokenizer.  Finally, these sentences
+    are parsed into chunk trees using a string-to-chunktree conversion
+    function.  Each of these steps can be performed using a default
+    function or a custom function.  By default, paragraphs are split
+    on blank lines; sentences are listed one per line; and sentences
+    are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
+    """
+    def __init__(self, root, fileids, extension='',
+                 str2chunktree=tagstr2tree,
+                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
+                 para_block_reader=read_blankline_block,
+                 encoding='utf8', tagset=None):
+        """
+        :param root: The root directory for this corpus.
+        :param fileids: A list or regexp specifying the fileids in this corpus.
+        """
+        CorpusReader.__init__(self, root, fileids, encoding)
+        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)
+        """Arguments for corpus views generated by this corpus: a tuple
+        (str2chunktree, sent_tokenizer, para_block_tokenizer)"""
+
+    def raw(self, fileids=None):
+        """
+        :return: the given file(s) as a single string.
+        :rtype: str
+        """
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def words(self, fileids=None):
+        """
+        :return: the given file(s) as a list of words
+            and punctuation symbols.
+        :rtype: list(str)
+        """
+        return concat([ChunkedCorpusView(f, enc, 0, 0, 0, 0, *self._cv_args)
+                       for (f, enc) in self.abspaths(fileids, True)])
+
+    def sents(self, fileids=None):
+        """
+        :return: the given file(s) as a list of
+            sentences or utterances, each encoded as a list of word
+            strings.
+        :rtype: list(list(str))
+        """
+        return concat([ChunkedCorpusView(f, enc, 0, 1, 0, 0, *self._cv_args)
+                       for (f, enc) in self.abspaths(fileids, True)])
+
+    def paras(self, fileids=None):
+        """
+        :return: the given file(s) as a list of
+            paragraphs, each encoded as a list of sentences, which are
+            in turn encoded as lists of word strings.
+        :rtype: list(list(list(str)))
+        """
+        return concat([ChunkedCorpusView(f, enc, 0, 1, 1, 0, *self._cv_args)
+                       for (f, enc) in self.abspaths(fileids, True)])
+
+    def tagged_words(self, fileids=None, tagset=None):
+        """
+        :return: the given file(s) as a list of tagged
+            words and punctuation symbols, encoded as tuples
+            ``(word,tag)``.
+        :rtype: list(tuple(str,str))
+        """
+        return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 0, *self._cv_args, target_tagset=tagset)
+                       for (f, enc) in self.abspaths(fileids, True)])
+
+    def tagged_sents(self, fileids=None, tagset=None):
+        """
+        :return: the given file(s) as a list of
+            sentences, each encoded as a list of ``(word,tag)`` tuples.
+
+        :rtype: list(list(tuple(str,str)))
+        """
+        return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 0, *self._cv_args, target_tagset=tagset)
+                       for (f, enc) in self.abspaths(fileids, True)])
+
+    def tagged_paras(self, fileids=None, tagset=None):
+        """
+        :return: the given file(s) as a list of
+            paragraphs, each encoded as a list of sentences, which are
+            in turn encoded as lists of ``(word,tag)`` tuples.
+        :rtype: list(list(list(tuple(str,str))))
+        """
+        return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 0, *self._cv_args, target_tagset=tagset)
+                       for (f, enc) in self.abspaths(fileids, True)])
+
+    def chunked_words(self, fileids=None, tagset=None):
+        """
+        :return: the given file(s) as a list of tagged
+            words and chunks.  Words are encoded as ``(word, tag)``
+            tuples (if the corpus has tags) or word strings (if the
+            corpus has no tags).  Chunks are encoded as depth-one
+            trees over ``(word,tag)`` tuples or word strings.
+        :rtype: list(tuple(str,str) and Tree)
+        """
+        return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 1, *self._cv_args, target_tagset=tagset)
+                       for (f, enc) in self.abspaths(fileids, True)])
+
+    def chunked_sents(self, fileids=None, tagset=None):
+        """
+        :return: the given file(s) as a list of
+            sentences, each encoded as a shallow Tree.  The leaves
+            of these trees are encoded as ``(word, tag)`` tuples (if
+            the corpus has tags) or word strings (if the corpus has no
+            tags).
+        :rtype: list(Tree)
+        """
+        return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 1, *self._cv_args, target_tagset=tagset)
+                       for (f, enc) in self.abspaths(fileids, True)])
+
+    def chunked_paras(self, fileids=None, tagset=None):
+        """
+        :return: the given file(s) as a list of
+            paragraphs, each encoded as a list of sentences, which are
+            in turn encoded as a shallow Tree.  The leaves of these
+            trees are encoded as ``(word, tag)`` tuples (if the corpus
+            has tags) or word strings (if the corpus has no tags).
+        :rtype: list(list(Tree))
+        """
+        return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 1, *self._cv_args, target_tagset=tagset)
+                       for (f, enc) in self.abspaths(fileids, True)])
+
+    def _read_block(self, stream):
+        return [tagstr2tree(t) for t in read_blankline_block(stream)]
+
+class ChunkedCorpusView(StreamBackedCorpusView):
+    def __init__(self, fileid, encoding, tagged, group_by_sent,
+                 group_by_para, chunked, str2chunktree, sent_tokenizer,
+                 para_block_reader, source_tagset=None, target_tagset=None):
+        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
+        self._tagged = tagged
+        self._group_by_sent = group_by_sent
+        self._group_by_para = group_by_para
+        self._chunked = chunked
+        self._str2chunktree = str2chunktree
+        self._sent_tokenizer = sent_tokenizer
+        self._para_block_reader = para_block_reader
+        self._source_tagset = source_tagset
+        self._target_tagset = target_tagset
+
+    def read_block(self, stream):
+        block = []
+        for para_str in self._para_block_reader(stream):
+            para = []
+            for sent_str in self._sent_tokenizer.tokenize(para_str):
+                sent = self._str2chunktree(sent_str, source_tagset=self._source_tagset,
+                                           target_tagset=self._target_tagset)
+
+                # If requested, throw away the tags.
+                if not self._tagged:
+                    sent = self._untag(sent)
+
+                # If requested, throw away the chunks.
+                if not self._chunked:
+                    sent = sent.leaves()
+
+                # Add the sentence to `para`.
+                if self._group_by_sent:
+                    para.append(sent)
+                else:
+                    para.extend(sent)
+
+            # Add the paragraph to `block`.
+            if self._group_by_para:
+                block.append(para)
+            else:
+                block.extend(para)
+
+        # Return the block
+        return block
+
+    def _untag(self, tree):
+        for i, child in enumerate(tree):
+            if isinstance(child, Tree):
+                self._untag(child)
+            elif isinstance(child, tuple):
+                tree[i] = child[0]
+            else:
+                raise ValueError('expected child to be Tree or tuple')
+        return tree
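ChunkedCorpusReader's defaults expect one tagged sentence per line, blank lines between paragraphs, and square brackets marking chunks (the format consumed by nltk.chunk.tagstr2tree). A minimal sketch, where the directory, file name, and sample line are illustrative assumptions:

    # sketch only: path and sample data are assumptions
    from nltk.corpus.reader import ChunkedCorpusReader

    # /tmp/chunked_demo/sample.txt might contain lines such as:
    #   [ The/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
    reader = ChunkedCorpusReader('/tmp/chunked_demo', r'sample\.txt')

    print(reader.words()[:5])            # ['The', 'little', 'cat', 'sat', 'on']
    print(reader.tagged_sents()[0][:3])  # [('The', 'DT'), ('little', 'JJ'), ('cat', 'NN')]
    print(reader.chunked_sents()[0])     # a shallow Tree with NP chunk subtrees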
diff --git a/nlp_resource_data/nltk/corpus/reader/chunked.pyc b/nlp_resource_data/nltk/corpus/reader/chunked.pyc
new file mode 100755 (executable)
index 0000000..c004ebe
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/chunked.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/cmudict.py b/nlp_resource_data/nltk/corpus/reader/cmudict.py
new file mode 100755 (executable)
index 0000000..6009dad
--- /dev/null
@@ -0,0 +1,97 @@
+# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
+ftp://ftp.cs.cmu.edu/project/speech/dict/
+Copyright 1998 Carnegie Mellon University
+
+File Format: Each line consists of an uppercased word, a counter
+(for alternative pronunciations), and a transcription.  Vowels are
+marked for stress (1=primary, 2=secondary, 0=no stress).  E.g.:
+NATURAL 1 N AE1 CH ER0 AH0 L
+
+The dictionary contains 127069 entries.  Of these, 119400 words are assigned
+a unique pronunciation, 6830 words have two pronunciations, and 839 words have
+three or more pronunciations.  Many of these are fast-speech variants.
+
+Phonemes: There are 39 phonemes, as shown below:
+
+Phoneme Example Translation    Phoneme Example Translation
+------- ------- -----------    ------- ------- -----------
+AA      odd     AA D           AE      at      AE T
+AH      hut     HH AH T        AO      ought   AO T
+AW      cow     K AW           AY      hide    HH AY D
+B       be      B IY           CH      cheese  CH IY Z
+D       dee     D IY           DH      thee    DH IY
+EH      Ed      EH D           ER      hurt    HH ER T
+EY      ate     EY T           F       fee     F IY
+G       green   G R IY N       HH      he      HH IY
+IH      it      IH T           IY      eat     IY T
+JH      gee     JH IY          K       key     K IY
+L       lee     L IY           M       me      M IY
+N       knee    N IY           NG      ping    P IH NG
+OW      oat     OW T           OY      toy     T OY
+P       pee     P IY           R       read    R IY D
+S       sea     S IY           SH      she     SH IY
+T       tea     T IY           TH      theta   TH EY T AH
+UH      hood    HH UH D        UW      two     T UW
+V       vee     V IY           W       we      W IY
+Y       yield   Y IY L D       Z       zee     Z IY
+ZH      seizure S IY ZH ER
+"""
+
+import codecs
+
+from six import string_types
+
+from nltk import compat
+from nltk.util import Index
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+class CMUDictCorpusReader(CorpusReader):
+    def entries(self):
+        """
+        :return: the cmudict lexicon as a list of entries
+        containing (word, transcriptions) tuples.
+        """
+        return concat([StreamBackedCorpusView(fileid, read_cmudict_block,
+                                              encoding=enc)
+                       for fileid, enc in self.abspaths(None, True)])
+
+    def raw(self):
+        """
+        :return: the cmudict lexicon as a raw string.
+        """
+        fileids = self._fileids
+        if isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def words(self):
+        """
+        :return: a list of all words defined in the cmudict lexicon.
+        """
+        return [word.lower() for (word, _) in self.entries()]
+
+    def dict(self):
+        """
+        :return: the cmudict lexicon as a dictionary, whose keys are
+        lowercase words and whose values are lists of pronunciations.
+        """
+        return dict(Index(self.entries()))
+
+def read_cmudict_block(stream):
+    entries = []
+    while len(entries) < 100: # Read 100 at a time.
+        line = stream.readline()
+        if line == '': return entries # end of file.
+        pieces = line.split()
+        entries.append( (pieces[0].lower(), pieces[2:]) )
+    return entries
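Through the standard corpus loader this reader backs nltk.corpus.cmudict, so entries(), words() and dict() are available directly. A small sketch, assuming the 'cmudict' data package has been installed via nltk.download('cmudict'):

    # sketch only: assumes the 'cmudict' data package is available
    from nltk.corpus import cmudict

    pron = cmudict.dict()        # lowercase word -> list of pronunciations
    print(pron['natural'])       # e.g. [['N', 'AE1', 'CH', 'ER0', 'AH0', 'L'], ...]
    print(len(cmudict.words()))  # number of entries in the lexicon

    # count syllables by counting stress-marked vowel phones
    def nsyl(word):
        return min(sum(ph[-1].isdigit() for ph in p) for p in pron[word])
    print(nsyl('natural'))       # 3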
diff --git a/nlp_resource_data/nltk/corpus/reader/cmudict.pyc b/nlp_resource_data/nltk/corpus/reader/cmudict.pyc
new file mode 100755 (executable)
index 0000000..70b4973
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/cmudict.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/comparative_sents.py b/nlp_resource_data/nltk/corpus/reader/comparative_sents.py
new file mode 100755 (executable)
index 0000000..1d81049
--- /dev/null
@@ -0,0 +1,280 @@
+# Natural Language Toolkit: Comparative Sentence Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+CorpusReader for the Comparative Sentence Dataset.
+
+- Comparative Sentence Dataset information -
+
+Annotated by: Nitin Jindal and Bing Liu, 2006.
+              Department of Computer Science
+              University of Illinois at Chicago
+
+Contact: Nitin Jindal, njindal@cs.uic.edu
+         Bing Liu, liub@cs.uic.edu (http://www.cs.uic.edu/~liub)
+
+Distributed with permission.
+
+Related papers:
+
+- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
+   Proceedings of the ACM SIGIR International Conference on Information Retrieval
+   (SIGIR-06), 2006.
+
+- Nitin Jindal and Bing Liu. "Mining Comprative Sentences and Relations".
+   Proceedings of Twenty First National Conference on Artificial Intelligence
+   (AAAI-2006), 2006.
+
+- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
+    Proceedings of the 22nd International Conference on Computational Linguistics
+    (Coling-2008), Manchester, 18-22 August, 2008.
+"""
+import re
+
+from six import string_types
+
+from nltk.corpus.reader.api import *
+from nltk.tokenize import *
+
+# Regular expressions for dataset components
+STARS = re.compile(r'^\*+$')
+COMPARISON = re.compile(r'<cs-[1234]>')
+CLOSE_COMPARISON = re.compile(r'</cs-[1234]>')
+GRAD_COMPARISON = re.compile(r'<cs-[123]>')
+NON_GRAD_COMPARISON = re.compile(r'<cs-4>')
+ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
+KEYWORD = re.compile(r'\((?!.*\()(.*)\)$')
+
+class Comparison(object):
+    """
+    A Comparison represents a comparative sentence and its constituents.
+    """
+    def __init__(self, text=None, comp_type=None, entity_1=None, entity_2=None,
+                 feature=None, keyword=None):
+        """
+        :param text: a string (optionally tokenized) containing a comparison.
+        :param comp_type: an integer defining the type of comparison expressed.
+            Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
+            4 (Non-gradable).
+        :param entity_1: the first entity considered in the comparison relation.
+        :param entity_2: the second entity considered in the comparison relation.
+        :param feature: the feature considered in the comparison relation.
+        :param keyword: the word or phrase which is used for that comparative relation.
+        """
+        self.text = text
+        self.comp_type = comp_type
+        self.entity_1 = entity_1
+        self.entity_2 = entity_2
+        self.feature = feature
+        self.keyword = keyword
+
+    def __repr__(self):
+        return ("Comparison(text=\"{}\", comp_type={}, entity_1=\"{}\", entity_2=\"{}\", "
+                "feature=\"{}\", keyword=\"{}\")").format(self.text, self.comp_type,
+                self.entity_1, self.entity_2, self.feature, self.keyword)
+
+class ComparativeSentencesCorpusReader(CorpusReader):
+    """
+    Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).
+
+        >>> from nltk.corpus import comparative_sentences
+        >>> comparison = comparative_sentences.comparisons()[0]
+        >>> comparison.text
+        ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
+        'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
+        'had', '.']
+        >>> comparison.entity_2
+        'models'
+        >>> (comparison.feature, comparison.keyword)
+        ('rewind', 'more')
+        >>> len(comparative_sentences.comparisons())
+        853
+    """
+    CorpusView = StreamBackedCorpusView
+
+    def __init__(self, root, fileids, word_tokenizer=WhitespaceTokenizer(),
+                 sent_tokenizer=None, encoding='utf8'):
+        """
+        :param root: The root directory for this corpus.
+        :param fileids: a list or regexp specifying the fileids in this corpus.
+        :param word_tokenizer: tokenizer for breaking sentences or paragraphs
+            into words. Default: `WhitespaceTokenizer`
+        :param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
+        :param encoding: the encoding that should be used to read the corpus.
+        """
+
+        CorpusReader.__init__(self, root, fileids, encoding)
+        self._word_tokenizer = word_tokenizer
+        self._sent_tokenizer = sent_tokenizer
+
+    def comparisons(self, fileids=None):
+        """
+        Return all comparisons in the corpus.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            comparisons have to be returned.
+        :return: the given file(s) as a list of Comparison objects.
+        :rtype: list(Comparison)
+        """
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.CorpusView(path, self._read_comparison_block, encoding=enc)
+            for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+    def keywords(self, fileids=None):
+        """
+        Return a set of all keywords used in the corpus.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            keywords have to be returned.
+        :return: the set of keywords and comparative phrases used in the corpus.
+        :rtype: set(str)
+        """
+        all_keywords = concat([self.CorpusView(path, self._read_keyword_block, encoding=enc)
+                       for (path, enc, fileid)
+                       in self.abspaths(fileids, True, True)])
+
+        keywords_set = set([keyword.lower() for keyword in all_keywords if keyword])
+        return keywords_set
+
+    def keywords_readme(self):
+        """
+        Return the list of words and constituents considered as clues of a
+        comparison (from listOfkeywords.txt).
+        """
+        keywords = []
+        raw_text = self.open("listOfkeywords.txt").read()
+        for line in raw_text.split("\n"):
+            if not line or line.startswith("//"):
+                continue
+            keywords.append(line.strip())
+        return keywords
+
+    def raw(self, fileids=None):
+        """
+        :param fileids: a list or regexp specifying the fileids that have to be
+            returned as a raw string.
+        :return: the given file(s) as a single string.
+        :rtype: str
+        """
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def readme(self):
+        """
+        Return the contents of the corpus readme file.
+        """
+        return self.open("README.txt").read()
+
+    def sents(self, fileids=None):
+        """
+        Return all sentences in the corpus.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            sentences have to be returned.
+        :return: all sentences of the corpus as lists of tokens (or as plain
+            strings, if no word tokenizer is specified).
+        :rtype: list(list(str)) or list(str)
+        """
+        return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
+            for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+    def words(self, fileids=None):
+        """
+        Return all words and punctuation symbols in the corpus.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            words have to be returned.
+        :return: the given file(s) as a list of words and punctuation symbols.
+        :rtype: list(str)
+        """
+        return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+                       for (path, enc, fileid)
+                       in self.abspaths(fileids, True, True)])
+
+    def _read_comparison_block(self, stream):
+        while True:
+            line = stream.readline()
+            if not line:
+                return [] # end of file.
+            comparison_tags = re.findall(COMPARISON, line)
+            if comparison_tags:
+                grad_comparisons = re.findall(GRAD_COMPARISON, line)
+                non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)
+                # Advance to the next line (it contains the comparative sentence)
+                comparison_text = stream.readline().strip()
+                if self._word_tokenizer:
+                    comparison_text = self._word_tokenizer.tokenize(comparison_text)
+                # Skip the next line (it contains closing comparison tags)
+                stream.readline()
+                # If gradable comparisons are found, create Comparison instances
+                # and populate their fields
+                comparison_bundle = []
+                if grad_comparisons:
+                    # Each comparison tag has its own relations on a separate line
+                    for comp in grad_comparisons:
+                        comp_type = int(re.match(r'<cs-(\d)>', comp).group(1))
+                        comparison = Comparison(text=comparison_text, comp_type=comp_type)
+                        line = stream.readline()
+                        entities_feats = ENTITIES_FEATS.findall(line)
+                        if entities_feats:
+                            for (code, entity_feat) in entities_feats:
+                                if code == '1':
+                                    comparison.entity_1 = entity_feat.strip()
+                                elif code == '2':
+                                    comparison.entity_2 = entity_feat.strip()
+                                elif code == '3':
+                                    comparison.feature = entity_feat.strip()
+                        keyword = KEYWORD.findall(line)
+                        if keyword:
+                            comparison.keyword = keyword[0]
+                        comparison_bundle.append(comparison)
+                # If non-gradable comparisons are found, create a simple Comparison
+                # instance for each one
+                if non_grad_comparisons:
+                    for comp in non_grad_comparisons:
+                        # comp_type in this case should always be 4.
+                        comp_type = int(re.match(r'<cs-(\d)>', comp).group(1))
+                        comparison = Comparison(text=comparison_text, comp_type=comp_type)
+                        comparison_bundle.append(comparison)
+                # Flatten the list of comparisons before returning them
+                # return concat([comparison_bundle])
+                return comparison_bundle
+
+    def _read_keyword_block(self, stream):
+        keywords = []
+        for comparison in self._read_comparison_block(stream):
+            keywords.append(comparison.keyword)
+        return keywords
+
+    def _read_sent_block(self, stream):
+        while True:
+            line = stream.readline()
+            if re.match(STARS, line):
+                while True:
+                    line = stream.readline()
+                    if re.match(STARS, line):
+                        break
+                continue
+            if not re.findall(COMPARISON, line) and not ENTITIES_FEATS.findall(line) \
+            and not re.findall(CLOSE_COMPARISON, line):
+                if self._sent_tokenizer:
+                    return [self._word_tokenizer.tokenize(sent)
+                        for sent in self._sent_tokenizer.tokenize(line)]
+                else:
+                    return [self._word_tokenizer.tokenize(line)]
+
+    def _read_word_block(self, stream):
+        words = []
+        for sent in self._read_sent_block(stream):
+            words.extend(sent)
+        return words
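Besides the comparisons() shown in the class doctest, the reader exposes keywords(), sents() and words(). A small sketch, assuming the 'comparative_sentences' data package has been downloaded:

    # sketch only: assumes nltk.download('comparative_sentences') has been run
    from nltk.corpus import comparative_sentences

    print(sorted(comparative_sentences.keywords())[:5])  # lowercased comparative cue words
    print(comparative_sentences.sents()[0][:8])          # first tokenized sentence
    print(comparative_sentences.readme()[:60])           # the bundled corpus README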
diff --git a/nlp_resource_data/nltk/corpus/reader/comparative_sents.pyc b/nlp_resource_data/nltk/corpus/reader/comparative_sents.pyc
new file mode 100755 (executable)
index 0000000..972ab0d
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/comparative_sents.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/conll.py b/nlp_resource_data/nltk/corpus/reader/conll.py
new file mode 100755 (executable)
index 0000000..34d559f
--- /dev/null
@@ -0,0 +1,523 @@
+# Natural Language Toolkit: CONLL Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Read CoNLL-style chunk fileids.
+"""
+
+from __future__ import unicode_literals
+
+import os
+import codecs
+import textwrap
+
+from six import string_types
+
+from nltk import compat
+from nltk.tree import Tree
+from nltk.util import LazyMap, LazyConcatenation
+from nltk.tag import map_tag
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+class ConllCorpusReader(CorpusReader):
+    """
+    A corpus reader for CoNLL-style files.  These files consist of a
+    series of sentences, separated by blank lines.  Each sentence is
+    encoded using a table (or "grid") of values, where each line
+    corresponds to a single word, and each column corresponds to an
+    annotation type.  The set of columns used by CoNLL-style files can
+    vary from corpus to corpus; the ``ConllCorpusReader`` constructor
+    therefore takes an argument, ``columntypes``, which is used to
+    specify the columns that are used by a given corpus.
+
+    @todo: Add support for reading from corpora where different
+        parallel files contain different columns.
+    @todo: Possibly add caching of the grid corpus view?  This would
+        allow the same grid view to be used by different data access
+        methods (eg words() and parsed_sents() could both share the
+        same grid corpus view object).
+    @todo: Better support for -DOCSTART-.  Currently, we just ignore
+        it, but it could be used to define methods that retrieve a
+        document at a time (eg parsed_documents()).
+    """
+
+    #/////////////////////////////////////////////////////////////////
+    # Column Types
+    #/////////////////////////////////////////////////////////////////
+
+    WORDS = 'words'   #: column type for words
+    POS = 'pos'       #: column type for part-of-speech tags
+    TREE = 'tree'     #: column type for parse trees
+    CHUNK = 'chunk'   #: column type for chunk structures
+    NE = 'ne'         #: column type for named entities
+    SRL = 'srl'       #: column type for semantic role labels
+    IGNORE = 'ignore' #: column type for column that should be ignored
+
+    #: A list of all column types supported by the conll corpus reader.
+    COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)
+
+    #/////////////////////////////////////////////////////////////////
+    # Constructor
+    #/////////////////////////////////////////////////////////////////
+
+    def __init__(self, root, fileids, columntypes,
+                 chunk_types=None, root_label='S', pos_in_tree=False,
+                 srl_includes_roleset=True, encoding='utf8',
+                 tree_class=Tree, tagset=None):
+        for columntype in columntypes:
+            if columntype not in self.COLUMN_TYPES:
+                raise ValueError('Bad column type %r' % columntype)
+        if isinstance(chunk_types, string_types):
+            chunk_types = [chunk_types]
+        self._chunk_types = chunk_types
+        self._colmap = dict((c,i) for (i,c) in enumerate(columntypes))
+        self._pos_in_tree = pos_in_tree
+        self._root_label = root_label # for chunks
+        self._srl_includes_roleset = srl_includes_roleset
+        self._tree_class = tree_class
+        CorpusReader.__init__(self, root, fileids, encoding)
+        self._tagset = tagset
+
+    #/////////////////////////////////////////////////////////////////
+    # Data Access Methods
+    #/////////////////////////////////////////////////////////////////
+
+    def raw(self, fileids=None):
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def words(self, fileids=None):
+        self._require(self.WORDS)
+        return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids)))
+
+    def sents(self, fileids=None):
+        self._require(self.WORDS)
+        return LazyMap(self._get_words, self._grids(fileids))
+
+    def tagged_words(self, fileids=None, tagset=None):
+        self._require(self.WORDS, self.POS)
+        def get_tagged_words(grid):
+            return self._get_tagged_words(grid, tagset)
+        return LazyConcatenation(LazyMap(get_tagged_words,
+                                         self._grids(fileids)))
+
+    def tagged_sents(self, fileids=None, tagset=None):
+        self._require(self.WORDS, self.POS)
+        def get_tagged_words(grid):
+            return self._get_tagged_words(grid, tagset)
+        return LazyMap(get_tagged_words, self._grids(fileids))
+
+    def chunked_words(self, fileids=None, chunk_types=None,
+                      tagset=None):
+        self._require(self.WORDS, self.POS, self.CHUNK)
+        if chunk_types is None: chunk_types = self._chunk_types
+        def get_chunked_words(grid): # capture chunk_types as local var
+            return self._get_chunked_words(grid, chunk_types, tagset)
+        return LazyConcatenation(LazyMap(get_chunked_words,
+                                         self._grids(fileids)))
+
+    def chunked_sents(self, fileids=None, chunk_types=None,
+                      tagset=None):
+        self._require(self.WORDS, self.POS, self.CHUNK)
+        if chunk_types is None: chunk_types = self._chunk_types
+        def get_chunked_words(grid): # capture chunk_types as local var
+            return self._get_chunked_words(grid, chunk_types, tagset)
+        return LazyMap(get_chunked_words, self._grids(fileids))
+
+    def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
+        self._require(self.WORDS, self.POS, self.TREE)
+        if pos_in_tree is None: pos_in_tree = self._pos_in_tree
+        def get_parsed_sent(grid): # capture pos_in_tree as local var
+            return self._get_parsed_sent(grid, pos_in_tree, tagset)
+        return LazyMap(get_parsed_sent, self._grids(fileids))
+
+    def srl_spans(self, fileids=None):
+        self._require(self.SRL)
+        return LazyMap(self._get_srl_spans, self._grids(fileids))
+
+    def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
+        self._require(self.WORDS, self.POS, self.TREE, self.SRL)
+        if pos_in_tree is None: pos_in_tree = self._pos_in_tree
+        def get_srl_instances(grid): # capture pos_in_tree as local var
+            return self._get_srl_instances(grid, pos_in_tree)
+        result = LazyMap(get_srl_instances, self._grids(fileids))
+        if flatten: result = LazyConcatenation(result)
+        return result
+
+    def iob_words(self, fileids=None, tagset=None):
+        """
+        :return: a list of word/tag/IOB tuples
+        :rtype: list(tuple)
+        :param fileids: the list of fileids that make up this corpus
+        :type fileids: None or str or list
+        """
+        self._require(self.WORDS, self.POS, self.CHUNK)
+        def get_iob_words(grid):
+            return self._get_iob_words(grid, tagset)
+        return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))
+
+    def iob_sents(self, fileids=None, tagset=None):
+        """
+        :return: a list of lists of word/tag/IOB tuples
+        :rtype: list(list)
+        :param fileids: the list of fileids that make up this corpus
+        :type fileids: None or str or list
+        """
+        self._require(self.WORDS, self.POS, self.CHUNK)
+        def get_iob_words(grid):
+            return self._get_iob_words(grid, tagset)
+        return LazyMap(get_iob_words, self._grids(fileids))
+
+    #/////////////////////////////////////////////////////////////////
+    # Grid Reading
+    #/////////////////////////////////////////////////////////////////
+
+    def _grids(self, fileids=None):
+        # n.b.: we could cache the object returned here (keyed on
+        # fileids), which would let us reuse the same corpus view for
+        # different things (eg srl and parse trees).
+        return concat([StreamBackedCorpusView(fileid, self._read_grid_block,
+                                              encoding=enc)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def _read_grid_block(self, stream):
+        grids = []
+        for block in read_blankline_block(stream):
+            block = block.strip()
+            if not block: continue
+
+            grid = [line.split() for line in block.split('\n')]
+
+            # If there's a docstart row, then discard. ([xx] eventually it
+            # would be good to actually use it)
+            if grid[0][self._colmap.get('words', 0)] == '-DOCSTART-':
+                del grid[0]
+
+            # Check that the grid is consistent.
+            for row in grid:
+                if len(row) != len(grid[0]):
+                    raise ValueError('Inconsistent number of columns:\n%s'
+                                     % block)
+            grids.append(grid)
+        return grids
+
+    #/////////////////////////////////////////////////////////////////
+    # Transforms
+    #/////////////////////////////////////////////////////////////////
+    # given a grid, transform it into some representation (e.g.,
+    # a list of words or a parse tree).
+
+    def _get_words(self, grid):
+        return self._get_column(grid, self._colmap['words'])
+
+    def _get_tagged_words(self, grid, tagset=None):
+        pos_tags = self._get_column(grid, self._colmap['pos'])
+        if tagset and tagset != self._tagset:
+            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
+        return list(zip(self._get_column(grid, self._colmap['words']), pos_tags))
+
+    def _get_iob_words(self, grid, tagset=None):
+        pos_tags = self._get_column(grid, self._colmap['pos'])
+        if tagset and tagset != self._tagset:
+            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
+        return list(zip(self._get_column(grid, self._colmap['words']), pos_tags,
+                   self._get_column(grid, self._colmap['chunk'])))
+
+    def _get_chunked_words(self, grid, chunk_types, tagset=None):
+        # n.b.: this method is very similar to conllstr2tree.
+        words = self._get_column(grid, self._colmap['words'])
+        pos_tags = self._get_column(grid, self._colmap['pos'])
+        if tagset and tagset != self._tagset:
+            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
+        chunk_tags = self._get_column(grid, self._colmap['chunk'])
+
+        stack = [Tree(self._root_label, [])]
+
+        for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
+            if chunk_tag == 'O':
+                state, chunk_type = 'O', ''
+            else:
+                (state, chunk_type) = chunk_tag.split('-')
+            # If it's a chunk we don't care about, treat it as O.
+            if chunk_types is not None and chunk_type not in chunk_types:
+                state = 'O'
+            # Treat a mismatching I like a B.
+            if state == 'I' and chunk_type != stack[-1].label():
+                state = 'B'
+            # For B or I: close any open chunks
+            if state in 'BO' and len(stack) == 2:
+                stack.pop()
+            # For B: start a new chunk.
+            if state == 'B':
+                new_chunk = Tree(chunk_type, [])
+                stack[-1].append(new_chunk)
+                stack.append(new_chunk)
+            # Add the word token.
+            stack[-1].append((word, pos_tag))
+
+        return stack[0]
+
+    def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
+        words = self._get_column(grid, self._colmap['words'])
+        pos_tags = self._get_column(grid, self._colmap['pos'])
+        if tagset and tagset != self._tagset:
+            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
+        parse_tags = self._get_column(grid, self._colmap['tree'])
+
+        treestr = ''
+        for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
+            if word == '(': word = '-LRB-'
+            if word == ')': word = '-RRB-'
+            if pos_tag == '(': pos_tag = '-LRB-'
+            if pos_tag == ')': pos_tag = '-RRB-'
+            (left, right) = parse_tag.split('*')
+            right = right.count(')')*')' # only keep ')'.
+            treestr += '%s (%s %s) %s' % (left, pos_tag, word, right)
+        try:
+            tree = self._tree_class.fromstring(treestr)
+        except (ValueError, IndexError):
+            tree = self._tree_class.fromstring('(%s %s)' %
+                                          (self._root_label, treestr))
+
+        if not pos_in_tree:
+            for subtree in tree.subtrees():
+                for i, child in enumerate(subtree):
+                    if (isinstance(child, Tree) and len(child)==1 and
+                        isinstance(child[0], string_types)):
+                        subtree[i] = (child[0], child.label())
+
+        return tree
+
+    def _get_srl_spans(self, grid):
+        """
+        list of lists of ((start, end), tag) tuples
+        """
+        if self._srl_includes_roleset:
+            predicates = self._get_column(grid, self._colmap['srl']+1)
+            start_col = self._colmap['srl']+2
+        else:
+            predicates = self._get_column(grid, self._colmap['srl'])
+            start_col = self._colmap['srl']+1
+
+        # Count how many predicates there are.  This tells us how many
+        # columns to expect for SRL data.
+        num_preds = len([p for p in predicates if p != '-'])
+
+        spanlists = []
+        for i in range(num_preds):
+            col = self._get_column(grid, start_col+i)
+            spanlist = []
+            stack = []
+            for wordnum, srl_tag in enumerate(col):
+                (left, right) = srl_tag.split('*')
+                for tag in left.split('('):
+                    if tag:
+                        stack.append((tag, wordnum))
+                for i in range(right.count(')')):
+                    (tag, start) = stack.pop()
+                    spanlist.append( ((start, wordnum+1), tag) )
+            spanlists.append(spanlist)
+
+        return spanlists
+
+    def _get_srl_instances(self, grid, pos_in_tree):
+        tree = self._get_parsed_sent(grid, pos_in_tree)
+        spanlists = self._get_srl_spans(grid)
+        if self._srl_includes_roleset:
+            predicates = self._get_column(grid, self._colmap['srl']+1)
+            rolesets = self._get_column(grid, self._colmap['srl'])
+        else:
+            predicates = self._get_column(grid, self._colmap['srl'])
+            rolesets = [None] * len(predicates)
+
+        instances = ConllSRLInstanceList(tree)
+        for wordnum, predicate in enumerate(predicates):
+            if predicate == '-': continue
+            # Decide which spanlist to use.  Don't assume that they're
+            # sorted in the same order as the predicates (even though
+            # they usually are).
+            for spanlist in spanlists:
+                for (start, end), tag in spanlist:
+                    if wordnum in range(start,end) and tag in ('V', 'C-V'):
+                        break
+                else: continue
+                break
+            else:
+                raise ValueError('No srl column found for %r' % predicate)
+            instances.append(ConllSRLInstance(tree, wordnum, predicate,
+                                              rolesets[wordnum], spanlist))
+
+        return instances
+
+    #/////////////////////////////////////////////////////////////////
+    # Helper Methods
+    #/////////////////////////////////////////////////////////////////
+
+    def _require(self, *columntypes):
+        for columntype in columntypes:
+            if columntype not in self._colmap:
+                raise ValueError('This corpus does not contain a %s '
+                                 'column.' % columntype)
+
+    @staticmethod
+    def _get_column(grid, column_index):
+        return [grid[i][column_index] for i in range(len(grid))]
+
+
+@compat.python_2_unicode_compatible
+class ConllSRLInstance(object):
+    """
+    An SRL instance from a CoNLL corpus, which identifies and
+    provides labels for the arguments of a single verb.
+    """
+    # [xx] add inst.core_arguments, inst.argm_arguments?
+
+    def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
+        self.verb = []
+        """A list of the word indices of the words that compose the
+           verb whose arguments are identified by this instance.
+           This will contain multiple word indices when multi-word
+           verbs are used (e.g. 'turn on')."""
+
+        self.verb_head = verb_head
+        """The word index of the head word of the verb whose arguments
+           are identified by this instance.  E.g., for a sentence that
+           uses the verb 'turn on,' ``verb_head`` will be the word index
+           of the word 'turn'."""
+
+        self.verb_stem = verb_stem
+
+        self.roleset = roleset
+
+        self.arguments = []
+        """A list of ``(argspan, argid)`` tuples, specifying the location
+           and type for each of the arguments identified by this
+           instance.  ``argspan`` is a tuple ``start, end``, indicating
+           that the argument consists of the ``words[start:end]``."""
+
+        self.tagged_spans = tagged_spans
+        """A list of ``(span, id)`` tuples, specifying the location and
+           type for each of the arguments, as well as the verb pieces,
+           that make up this instance."""
+
+        self.tree = tree
+        """The parse tree for the sentence containing this instance."""
+
+        self.words = tree.leaves()
+        """A list of the words in the sentence containing this
+           instance."""
+
+        # Fill in the self.verb and self.arguments values.
+        for (start, end), tag in tagged_spans:
+            if tag in ('V', 'C-V'):
+                self.verb += list(range(start, end))
+            else:
+                self.arguments.append( ((start, end), tag) )
+
+    def __repr__(self):
+        plural = len(self.arguments)!=1 and 's' or ''
+        return '<ConllSRLInstance for %r with %d argument%s>' % (
+            (self.verb_stem, len(self.arguments), plural))
+
+    def pprint(self):
+        verbstr = ' '.join(self.words[i][0] for i in self.verb)
+        hdr = 'SRL for %r (stem=%r):\n' % (verbstr, self.verb_stem)
+        s = ''
+        for i, word in enumerate(self.words):
+            if isinstance(word, tuple): word = word[0]
+            for (start, end), argid in self.arguments:
+                if i == start: s += '[%s ' % argid
+                if i == end: s += '] '
+            if i in self.verb: word = '<<%s>>' % word
+            s += word + ' '
+        return hdr + textwrap.fill(s.replace(' ]', ']'),
+                                   initial_indent='    ',
+                                   subsequent_indent='    ')
+
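+# Instances of the class above are normally produced by
+# ConllCorpusReader.srl_instances() rather than constructed directly.  A rough
+# usage sketch (the corpus root, file name, and column layout are assumptions):
+#
+#   >>> reader = ConllCorpusReader('/path/to/conll-srl', ['test.txt'],
+#   ...                            ('words', 'pos', 'tree', 'srl'))
+#   >>> inst = reader.srl_instances()[0]
+#   >>> print(inst.pprint())
+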
+@compat.python_2_unicode_compatible
+class ConllSRLInstanceList(list):
+    """
+    Set of instances for a single sentence
+    """
+    def __init__(self, tree, instances=()):
+        self.tree = tree
+        list.__init__(self, instances)
+
+    def __str__(self):
+        return self.pprint()
+
+    def pprint(self, include_tree=False):
+        # Sanity check: trees should be the same
+        for inst in self:
+            if inst.tree != self.tree:
+                raise ValueError('Tree mismatch!')
+
+        # The word list is always needed; the POS and syntax columns are
+        # only filled in when the tree is included in the output.
+        words = self.tree.leaves()
+        if include_tree:
+            pos = [None] * len(words)
+            synt = ['*'] * len(words)
+            self._tree2conll(self.tree, 0, words, pos, synt)
+
+        s = ''
+        for i in range(len(words)):
+            # optional tree columns
+            if include_tree:
+                s += '%-20s ' % words[i]
+                s += '%-8s ' % pos[i]
+                s += '%15s*%-8s ' % tuple(synt[i].split('*'))
+
+            # verb head column
+            for inst in self:
+                if i == inst.verb_head:
+                    s += '%-20s ' % inst.verb_stem
+                    break
+            else:
+                s += '%-20s ' % '-'
+            # Remaining columns: self
+            for inst in self:
+                argstr = '*'
+                for (start, end), argid in inst.tagged_spans:
+                    if i==start: argstr = '(%s%s' % (argid, argstr)
+                    if i==(end-1): argstr += ')'
+                s += '%-12s ' % argstr
+            s += '\n'
+        return s
+
+    def _tree2conll(self, tree, wordnum, words, pos, synt):
+        assert isinstance(tree, Tree)
+        if len(tree) == 1 and isinstance(tree[0], string_types):
+            pos[wordnum] = tree.label()
+            assert words[wordnum] == tree[0]
+            return wordnum+1
+        elif len(tree) == 1 and isinstance(tree[0], tuple):
+            assert len(tree[0]) == 2
+            words[wordnum], pos[wordnum] = tree[0]
+            return wordnum+1
+        else:
+            synt[wordnum] = '(%s%s' % (tree.label(), synt[wordnum])
+            for child in tree:
+                wordnum = self._tree2conll(child, wordnum, words,
+                                                  pos, synt)
+            synt[wordnum-1] += ')'
+            return wordnum
+
+class ConllChunkCorpusReader(ConllCorpusReader):
+    """
+    A ConllCorpusReader whose data file contains three columns: words,
+    pos, and chunk.
+    """
+    def __init__(self, root, fileids, chunk_types, encoding='utf8',
+                 tagset=None):
+        ConllCorpusReader.__init__(
+            self, root, fileids, ('words', 'pos', 'chunk'),
+            chunk_types=chunk_types, encoding=encoding,
+            tagset=tagset)
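+
+# Illustrative usage sketch; the corpus root, file name, and chunk types below
+# are assumptions:
+#
+#   >>> reader = ConllChunkCorpusReader('/path/to/corpus', ['train.txt'],
+#   ...                                 chunk_types=('NP', 'VP', 'PP'))
+#   >>> reader.chunked_sents()[0]    # an nltk.Tree with one subtree per chunk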
diff --git a/nlp_resource_data/nltk/corpus/reader/conll.pyc b/nlp_resource_data/nltk/corpus/reader/conll.pyc
new file mode 100755 (executable)
index 0000000..4b5866a
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/conll.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/crubadan.py b/nlp_resource_data/nltk/corpus/reader/crubadan.py
new file mode 100755 (executable)
index 0000000..84f603e
--- /dev/null
@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: An Crubadan N-grams Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Avital Pekker <avital.pekker@utoronto.ca>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+An NLTK interface for the n-gram statistics gathered from
+the corpora for each language using An Crubadan.
+
+There are multiple potential applications for the data but
+this reader was created with the goal of using it in the
+context of language identification.
+
+For details about An Crubadan, this data, and its potential uses, see:
+http://borel.slu.edu/crubadan/index.html
+"""
+
+from __future__ import print_function, unicode_literals
+
+import re
+from nltk.compat import PY3
+from os import path
+from nltk.corpus.reader import CorpusReader
+from nltk.probability import FreqDist
+from nltk.data import ZipFilePathPointer
+
+class CrubadanCorpusReader(CorpusReader):
+    """
+    A corpus reader for accessing the per-language An Crubadan n-gram files.
+    """
+    
+    _LANG_MAPPER_FILE = 'table.txt'
+    _all_lang_freq = {}
+    
+    def __init__(self, root, fileids, encoding='utf8', tagset=None):
+        super(CrubadanCorpusReader, self).__init__(root, fileids, encoding=encoding)
+        self._lang_mapping_data = []
+        self._load_lang_mapping_data()
+        
+    def lang_freq(self, lang):
+        ''' Return n-gram FreqDist for a specific language
+            given ISO 639-3 language code '''
+        
+        if lang not in self._all_lang_freq:
+            self._all_lang_freq[lang] = self._load_lang_ngrams(lang)
+
+        return self._all_lang_freq[lang]
+    
+    def langs(self):
+        ''' Return a list of supported languages as ISO 639-3 codes '''
+        return [row[1] for row in self._lang_mapping_data]
+            
+    def iso_to_crubadan(self, lang):
+        ''' Return internal Crubadan code based on ISO 639-3 code '''
+        for i in self._lang_mapping_data:
+            if i[1].lower() == lang.lower():
+                return i[0]
+    
+    def crubadan_to_iso(self, lang):
+        ''' Return ISO 639-3 code given internal Crubadan code '''
+        for i in self._lang_mapping_data:
+            if i[0].lower() == lang.lower():
+                return i[1]
+    
+    def _load_lang_mapping_data(self):
+        ''' Load language mappings between codes and descriptions from table.txt '''
+        if isinstance(self.root, ZipFilePathPointer):
+            raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")
+        
+        mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
+        if self._LANG_MAPPER_FILE not in self.fileids():
+            raise RuntimeError("Could not find language mapper file: " + mapper_file)
+
+        if PY3:
+            raw = open(mapper_file, 'r', encoding='utf-8').read().strip()
+        else:
+            raw = open(mapper_file, 'rU').read().decode('utf-8').strip()
+
+        self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
+        
+    def _load_lang_ngrams(self, lang):
+        ''' Load single n-gram language file given the ISO 639-3 language code
+            and return its FreqDist '''
+
+        if lang not in self.langs():
+            raise RuntimeError("Unsupported language.")
+
+        crubadan_code = self.iso_to_crubadan(lang)
+        ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')
+
+        if not path.isfile(ngram_file):
+            raise RuntimeError("No N-gram file found for requested language.")
+
+        counts = FreqDist()
+        if PY3:
+            f = open(ngram_file, 'r', encoding='utf-8')
+        else:
+            f = open(ngram_file, 'rU')
+
+        for line in f:
+            if PY3:
+                data = line.split(' ')
+            else:
+                data = line.decode('utf8').split(' ')
+
+            ngram = data[1].strip('\n')
+            freq = int(data[0])
+            
+            counts[ngram] = freq
+            
+        return counts
+        
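+# Illustrative usage sketch; in practice the reader is exposed as
+# ``nltk.corpus.crubadan`` once the 'crubadan' corpus has been downloaded, and
+# the language code below assumes English ('eng') is present in table.txt:
+#
+#   >>> from nltk.corpus import crubadan
+#   >>> 'eng' in crubadan.langs()
+#   >>> fd = crubadan.lang_freq('eng')   # character-trigram FreqDist
+#   >>> fd.most_common(3)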
diff --git a/nlp_resource_data/nltk/corpus/reader/crubadan.pyc b/nlp_resource_data/nltk/corpus/reader/crubadan.pyc
new file mode 100755 (executable)
index 0000000..ed7be9e
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/crubadan.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/dependency.py b/nlp_resource_data/nltk/corpus/reader/dependency.py
new file mode 100755 (executable)
index 0000000..c8a3a39
--- /dev/null
@@ -0,0 +1,101 @@
+# Natural Language Toolkit: Dependency Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
+#         Iker Manterola <returntothehangar@hotmail.com>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+import codecs
+
+from nltk.parse import DependencyGraph
+from nltk.tokenize import *
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+class DependencyCorpusReader(SyntaxCorpusReader):
+
+    def __init__(self, root, fileids, encoding='utf8',
+                 word_tokenizer=TabTokenizer(),
+                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
+                 para_block_reader=read_blankline_block):
+
+        CorpusReader.__init__(self, root, fileids, encoding)
+
+    #########################################################
+
+    def raw(self, fileids=None):
+        """
+        :return: the given file(s) as a single string.
+        :rtype: str
+        """
+        result = []
+        for fileid, encoding in self.abspaths(fileids, include_encoding=True):
+            if isinstance(fileid, PathPointer):
+                result.append(fileid.open(encoding=encoding).read())
+            else:
+                with codecs.open(fileid, "r", encoding) as fp:
+                    result.append(fp.read())
+        return concat(result)
+
+    def words(self, fileids=None):
+        return concat([DependencyCorpusView(fileid, False, False, False, encoding=enc)
+                       for fileid, enc in self.abspaths(fileids, include_encoding=True)])
+
+    def tagged_words(self, fileids=None):
+        return concat([DependencyCorpusView(fileid, True, False, False, encoding=enc)
+                       for fileid, enc in self.abspaths(fileids, include_encoding=True)])
+
+    def sents(self, fileids=None):
+        return concat([DependencyCorpusView(fileid, False, True, False, encoding=enc)
+                       for fileid, enc in self.abspaths(fileids, include_encoding=True)])
+
+    def tagged_sents(self, fileids=None):
+            return concat([DependencyCorpusView(fileid, True, True, False, encoding=enc)
+                           for fileid, enc in self.abspaths(fileids, include_encoding=True)])
+
+    def parsed_sents(self, fileids=None):
+        sents=concat([DependencyCorpusView(fileid, False, True, True, encoding=enc)
+                      for fileid, enc in self.abspaths(fileids, include_encoding=True)])
+        return [DependencyGraph(sent) for sent in sents]
+
+
+class DependencyCorpusView(StreamBackedCorpusView):
+    _DOCSTART = '-DOCSTART- -DOCSTART- O\n' # marks the start of a document
+
+    def __init__(self, corpus_file, tagged, group_by_sent, dependencies,
+                 chunk_types=None, encoding='utf8'):
+        self._tagged = tagged
+        self._dependencies = dependencies
+        self._group_by_sent = group_by_sent
+        self._chunk_types = chunk_types
+        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
+
+    def read_block(self, stream):
+        # Read the next sentence.
+        sent = read_blankline_block(stream)[0].strip()
+        # Strip off the docstart marker, if present.
+        if sent.startswith(self._DOCSTART):
+            sent = sent[len(self._DOCSTART):].lstrip()
+
+        # extract word and tag from any of the formats
+        if not self._dependencies:
+            lines = [line.split('\t') for line in sent.split('\n')]
+            if len(lines[0]) == 3 or len(lines[0]) == 4:
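+                # 3- or 4-column rows: word is the first field, tag the second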
+                sent = [(line[0], line[1]) for line in lines]
+            elif len(lines[0]) == 10:
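+                # 10-column rows (CoNLL-X style): FORM is column 2, POSTAG column 5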
+                sent = [(line[1], line[4]) for line in lines]
+            else:
+                raise ValueError('Unexpected number of fields in dependency tree file')
+
+            # discard tags if they weren't requested
+            if not self._tagged:
+                sent = [word for (word, tag) in sent]
+
+        # Return the result.
+        if self._group_by_sent:
+            return [sent]
+        else:
+            return list(sent)
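+
+# Illustrative usage sketch; the corpus root and file name are assumptions:
+#
+#   >>> reader = DependencyCorpusReader('/path/to/deps', ['train.conll'])
+#   >>> graph = reader.parsed_sents()[0]   # an nltk.parse.DependencyGraph
+#   >>> print(graph.tree())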
diff --git a/nlp_resource_data/nltk/corpus/reader/dependency.pyc b/nlp_resource_data/nltk/corpus/reader/dependency.pyc
new file mode 100755 (executable)
index 0000000..8047c8c
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/dependency.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/framenet.py b/nlp_resource_data/nltk/corpus/reader/framenet.py
new file mode 100755 (executable)
index 0000000..344efb4
--- /dev/null
@@ -0,0 +1,3057 @@
+# Natural Language Toolkit: Framenet Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors: Chuck Wooters <wooters@icsi.berkeley.edu>,
+#          Nathan Schneider <nathan.schneider@georgetown.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+
+"""
+Corpus reader for the FrameNet 1.7 lexicon and corpus.
+"""
+from __future__ import print_function, unicode_literals
+
+import os, sys
+import re
+import textwrap
+import itertools
+import types
+
+from six import string_types, text_type
+from six.moves import zip_longest
+
+from collections import defaultdict, OrderedDict
+from pprint import pprint, pformat
+from nltk.internals import ElementWrapper
+from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView
+from nltk.compat import python_2_unicode_compatible
+from nltk.util import AbstractLazySequence, LazyConcatenation, LazyMap, LazyIteratorList
+
+__docformat__ = 'epytext en'
+
+def mimic_wrap(lines, wrap_at=65, **kwargs):
+    """
+    Wrap the first of 'lines' with textwrap and the remaining lines at exactly the same
+    positions as the first.
+    """
+    l0 = textwrap.fill(lines[0], wrap_at, drop_whitespace=False).split('\n')
+    yield l0
+
+    def _(line):
+        il0 = 0
+        while line and il0<len(l0)-1:
+            yield line[:len(l0[il0])]
+            line = line[len(l0[il0]):]
+            il0 += 1
+        if line: # Remaining stuff on this line past the end of the mimicked line.
+            # So just textwrap this line.
+            for ln in textwrap.fill(line, wrap_at, drop_whitespace=False).split('\n'):
+                yield ln
+
+    for l in lines[1:]:
+        yield list(_(l))
+
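+# A rough sketch of how mimic_wrap is used later in this module: several
+# parallel annotation lines are wrapped at identical break points so their
+# columns stay aligned (the sample strings are made up):
+#
+#   lines = ['The quick brown fox jumps over the lazy dog .',
+#            '          =====                     ====      ']
+#   for chunk in zip_longest(*mimic_wrap(lines, wrap_at=20), fillvalue=' '):
+#       print('\n'.join(chunk))
+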
+def _pretty_longstring(defstr, prefix='', wrap_at=65):
+
+    """
+    Helper function for pretty-printing a long string.
+
+    :param defstr: The string to be printed.
+    :type defstr: str
+    :return: A nicely formatted string representation of the long string.
+    :rtype: str
+    """
+
+    outstr = ""
+    for line in textwrap.fill(defstr, wrap_at).split('\n'):
+        outstr += prefix + line + '\n'
+    return outstr
+
+def _pretty_any(obj):
+
+    """
+    Helper function for pretty-printing any AttrDict object.
+
+    :param obj: The obj to be printed.
+    :type obj: AttrDict
+    :return: A nicely formatted string representation of the AttrDict object.
+    :rtype: str
+    """
+
+    outstr = ""
+    for k in obj:
+        if isinstance(obj[k], string_types) and len(obj[k]) > 65:
+            outstr += "[{0}]\n".format(k)
+            outstr += "{0}".format(_pretty_longstring(obj[k], prefix='  '))
+            outstr += '\n'
+        else:
+            outstr += "[{0}] {1}\n".format(k, obj[k])
+
+    return outstr
+
+def _pretty_semtype(st):
+
+    """
+    Helper function for pretty-printing a semantic type.
+
+    :param st: The semantic type to be printed.
+    :type st: AttrDict
+    :return: A nicely formatted string representation of the semantic type.
+    :rtype: str
+    """
+
+    semkeys = st.keys()
+    if len(semkeys) == 1: return "<None>"
+
+    outstr = ""
+    outstr += "semantic type ({0.ID}): {0.name}\n".format(st)
+    if 'abbrev' in semkeys:
+        outstr += "[abbrev] {0}\n".format(st.abbrev)
+    if 'definition' in semkeys:
+        outstr += "[definition]\n"
+        outstr += _pretty_longstring(st.definition,'  ')
+    outstr += "[rootType] {0}({1})\n".format(st.rootType.name, st.rootType.ID)
+    if st.superType is None:
+        outstr += "[superType] <None>\n"
+    else:
+        outstr += "[superType] {0}({1})\n".format(st.superType.name, st.superType.ID)
+    outstr += "[subTypes] {0} subtypes\n".format(len(st.subTypes))
+    outstr += "  " + ", ".join('{0}({1})'.format(x.name, x.ID) for x in st.subTypes) + '\n'*(len(st.subTypes)>0)
+    return outstr
+
+def _pretty_frame_relation_type(freltyp):
+
+    """
+    Helper function for pretty-printing a frame relation type.
+
+    :param freltyp: The frame relation type to be printed.
+    :type freltyp: AttrDict
+    :return: A nicely formatted string representation of the frame relation type.
+    :rtype: str
+    """
+    outstr = "<frame relation type ({0.ID}): {0.superFrameName} -- {0.name} -> {0.subFrameName}>".format(freltyp)
+    return outstr
+
+def _pretty_frame_relation(frel):
+
+    """
+    Helper function for pretty-printing a frame relation.
+
+    :param frel: The frame relation to be printed.
+    :type frel: AttrDict
+    :return: A nicely formatted string representation of the frame relation.
+    :rtype: str
+    """
+    outstr = "<{0.type.superFrameName}={0.superFrameName} -- {0.type.name} -> {0.type.subFrameName}={0.subFrameName}>".format(frel)
+    return outstr
+
+def _pretty_fe_relation(ferel):
+
+    """
+    Helper function for pretty-printing an FE relation.
+
+    :param ferel: The FE relation to be printed.
+    :type ferel: AttrDict
+    :return: A nicely formatted string representation of the FE relation.
+    :rtype: str
+    """
+    outstr = "<{0.type.superFrameName}={0.frameRelation.superFrameName}.{0.superFEName} -- {0.type.name} -> {0.type.subFrameName}={0.frameRelation.subFrameName}.{0.subFEName}>".format(ferel)
+    return outstr
+
+def _pretty_lu(lu):
+
+    """
+    Helper function for pretty-printing a lexical unit.
+
+    :param lu: The lu to be printed.
+    :type lu: AttrDict
+    :return: A nicely formatted string representation of the lexical unit.
+    :rtype: str
+    """
+
+    lukeys = lu.keys()
+    outstr = ""
+    outstr += "lexical unit ({0.ID}): {0.name}\n\n".format(lu)
+    if 'definition' in lukeys:
+        outstr += "[definition]\n"
+        outstr += _pretty_longstring(lu.definition,'  ')
+    if 'frame' in lukeys:
+        outstr += "\n[frame] {0}({1})\n".format(lu.frame.name,lu.frame.ID)
+    if 'incorporatedFE' in lukeys:
+        outstr += "\n[incorporatedFE] {0}\n".format(lu.incorporatedFE)
+    if 'POS' in lukeys:
+        outstr += "\n[POS] {0}\n".format(lu.POS)
+    if 'status' in lukeys:
+        outstr += "\n[status] {0}\n".format(lu.status)
+    if 'totalAnnotated' in lukeys:
+        outstr += "\n[totalAnnotated] {0} annotated examples\n".format(lu.totalAnnotated)
+    if 'lexemes' in lukeys:
+        outstr += "\n[lexemes] {0}\n".format(' '.join('{0}/{1}'.format(lex.name,lex.POS) for lex in lu.lexemes))
+    if 'semTypes' in lukeys:
+        outstr += "\n[semTypes] {0} semantic types\n".format(len(lu.semTypes))
+        outstr += "  "*(len(lu.semTypes)>0) + ", ".join('{0}({1})'.format(x.name, x.ID) for x in lu.semTypes) + '\n'*(len(lu.semTypes)>0)
+    if 'URL' in lukeys:
+        outstr += "\n[URL] {0}\n".format(lu.URL)
+    if 'subCorpus' in lukeys:
+        subc = [x.name for x in lu.subCorpus]
+        outstr += "\n[subCorpus] {0} subcorpora\n".format(len(lu.subCorpus))
+        for line in textwrap.fill(", ".join(sorted(subc)), 60).split('\n'):
+            outstr += "  {0}\n".format(line)
+    if 'exemplars' in lukeys:
+        outstr += "\n[exemplars] {0} sentences across all subcorpora\n".format(len(lu.exemplars))
+
+    return outstr
+
+def _pretty_exemplars(exemplars, lu):
+    """
+    Helper function for pretty-printing a list of exemplar sentences for a lexical unit.
+
+    :param exemplars: The list of exemplar sentences to be printed.
+    :type exemplars: list(AttrDict)
+    :return: An index of the text of the exemplar sentences.
+    :rtype: str
+    """
+
+    outstr = ""
+    outstr += "exemplar sentences for {0.name} in {0.frame.name}:\n\n".format(lu)
+    for i,sent in enumerate(exemplars):
+        outstr += "[{0}] {1}\n".format(i, sent.text)
+    outstr += "\n"
+    return outstr
+
+def _pretty_fulltext_sentences(sents):
+    """
+    Helper function for pretty-printing a list of annotated sentences for a full-text document.
+
+    :param sents: The list of sentences to be printed.
+    :type sents: list(AttrDict)
+    :return: An index of the text of the sentences.
+    :rtype: str
+    """
+
+    outstr = ""
+    outstr += "full-text document ({0.ID}) {0.name}:\n\n".format(sents)
+    outstr += "[corpid] {0.corpid}\n[corpname] {0.corpname}\n[description] {0.description}\n[URL] {0.URL}\n\n".format(sents)
+    outstr += "[sentence]\n".format(sents)
+    for i,sent in enumerate(sents.sentence):
+        outstr += "[{0}] {1}\n".format(i, sent.text)
+    outstr += "\n"
+    return outstr
+
+def _pretty_fulltext_sentence(sent):
+    """
+    Helper function for pretty-printing an annotated sentence from a full-text document.
+
+    :param sent: The sentence to be printed.
+    :type sent: AttrDict
+    :return: The text of the sentence with annotation set indices on frame targets.
+    :rtype: str
+    """
+
+    outstr = ""
+    outstr += "full-text sentence ({0.ID}) in {1}:\n\n".format(sent, sent.doc.get('name',sent.doc.description))
+    outstr += "\n[POS] {0} tags\n".format(len(sent.POS))
+    outstr += "\n[POS_tagset] {0}\n\n".format(sent.POS_tagset)
+    outstr += "[text] + [annotationSet]\n\n"
+    outstr += sent._ascii() # -> _annotation_ascii()
+    outstr += "\n"
+    return outstr
+
+def _pretty_pos(aset):
+    """
+    Helper function for pretty-printing a sentence with its POS tags.
+
+    :param aset: The POS annotation set of the sentence to be printed.
+    :type aset: AttrDict
+    :return: The text of the sentence and its POS tags.
+    :rtype: str
+    """
+
+    outstr = ""
+    outstr += "POS annotation set ({0.ID}) {0.POS_tagset} in sentence {0.sent.ID}:\n\n".format(aset)
+
+    # list the target spans and their associated aset index
+    overt = sorted(aset.POS)
+
+    sent = aset.sent
+    s0 = sent.text
+    s1 = ''
+    s2 = ''
+    i = 0
+    adjust = 0
+    for j,k,lbl in overt:
+        assert j>=i,('Overlapping targets?',(j,k,lbl))
+        s1 += ' '*(j-i) + '-'*(k-j)
+        if len(lbl)>(k-j):
+            # add space in the sentence to make room for the annotation index
+            amt = len(lbl)-(k-j)
+            s0 = s0[:k+adjust]+ '~'*amt + s0[k+adjust:] # '~' to prevent line wrapping
+            s1 = s1[:k+adjust]+ ' '*amt + s1[k+adjust:]
+            adjust += amt
+        s2 += ' '*(j-i) + lbl.ljust(k-j)
+        i = k
+
+    long_lines = [s0, s1, s2]
+
+    outstr += '\n\n'.join(map('\n'.join, zip_longest(*mimic_wrap(long_lines), fillvalue=' '))).replace('~',' ')
+    outstr += "\n"
+    return outstr
+
+def _pretty_annotation(sent, aset_level=False):
+    """
+    Helper function for pretty-printing an exemplar sentence for a lexical unit.
+
+    :param sent: An annotation set or exemplar sentence to be printed.
+    :param aset_level: If True, 'sent' is actually an annotation set within a sentence.
+    :type sent: AttrDict
+    :return: A nicely formatted string representation of the exemplar sentence
+    with its target, frame, and FE annotations.
+    :rtype: str
+    """
+
+    sentkeys = sent.keys()
+    outstr = "annotation set" if aset_level else "exemplar sentence"
+    outstr += " ({0.ID}):\n".format(sent)
+    if aset_level: # TODO: any UNANN exemplars?
+        outstr += "\n[status] {0}\n".format(sent.status)
+    for k in ('corpID', 'docID', 'paragNo', 'sentNo', 'aPos'):
+        if k in sentkeys:
+            outstr += "[{0}] {1}\n".format(k, sent[k])
+    outstr += "\n[LU] ({0.ID}) {0.name} in {0.frame.name}\n".format(sent.LU) if sent.LU else '\n[LU] Not found!'
+    outstr += "\n[frame] ({0.ID}) {0.name}\n".format(sent.frame)    # redundant with above, but .frame is convenient
+    if not aset_level:
+        outstr += "\n[annotationSet] {0} annotation sets\n".format(len(sent.annotationSet))
+        outstr += "\n[POS] {0} tags\n".format(len(sent.POS))
+        outstr += "\n[POS_tagset] {0}\n".format(sent.POS_tagset)
+    outstr += "\n[GF] {0} relation{1}\n".format(len(sent.GF), "s" if len(sent.GF)!=1 else "")
+    outstr += "\n[PT] {0} phrase{1}\n".format(len(sent.PT), "s" if len(sent.PT)!=1 else "")
+    """
+    Special Layers
+    --------------
+
+    The 'NER' layer contains, for some of the data, named entity labels.
+
+    The 'WSL' (word status layer) contains, for some of the data,
+    spans which should not in principle be considered targets (NT).
+
+    The 'Other' layer records relative clause constructions (Rel=relativizer, Ant=antecedent),
+    pleonastic 'it' (Null), and existential 'there' (Exist).
+    On occasion they are duplicated by accident (e.g., annotationSet 1467275 in lu6700.xml).
+
+    The 'Sent' layer appears to contain labels that the annotator has flagged the
+    sentence with for their convenience: values include
+    'sense1', 'sense2', 'sense3', etc.;
+    'Blend', 'Canonical', 'Idiom', 'Metaphor', 'Special-Sent',
+    'keepS', 'deleteS', 'reexamine'
+    (sometimes they are duplicated for no apparent reason).
+
+    The POS-specific layers may contain the following kinds of spans:
+    Asp (aspectual particle), Non-Asp (non-aspectual particle),
+    Cop (copula), Supp (support), Ctrlr (controller),
+    Gov (governor), X. Gov and X always cooccur.
+
+    >>> from nltk.corpus import framenet as fn
+>>> def f(luRE, lyr, ignore=set()):
+...   for i,ex in enumerate(fn.exemplars(luRE)):
+...     if lyr in ex and ex[lyr] and set(zip(*ex[lyr])[2]) - ignore:
+...       print(i,ex[lyr])
+
+    - Verb: Asp, Non-Asp
+    - Noun: Cop, Supp, Ctrlr, Gov, X
+    - Adj: Cop, Supp, Ctrlr, Gov, X
+    - Prep: Cop, Supp, Ctrlr
+    - Adv: Ctrlr
+    - Scon: (none)
+    - Art: (none)
+    """
+    for lyr in ('NER', 'WSL', 'Other', 'Sent'):
+        if lyr in sent and sent[lyr]:
+            outstr += "\n[{0}] {1} entr{2}\n".format(lyr, len(sent[lyr]), "ies" if len(sent[lyr])!=1 else "y")
+    outstr += "\n[text] + [Target] + [FE]"
+    # POS-specific layers: syntactically important words that are neither the target
+    # nor the FEs. Include these along with the first FE layer but with '^' underlining.
+    for lyr in ('Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art'):
+        if lyr in sent and sent[lyr]:
+            outstr += " + [{0}]".format(lyr)
+    if 'FE2' in sentkeys:
+        outstr += " + [FE2]"
+        if 'FE3' in sentkeys:
+            outstr += " + [FE3]"
+    outstr += "\n\n"
+    outstr += sent._ascii() # -> _annotation_ascii()
+    outstr += "\n"
+
+    return outstr
+
+def _annotation_ascii(sent):
+    '''
+    Given a sentence or FE annotation set, construct the width-limited string showing
+    an ASCII visualization of the sentence's annotations, calling either
+    _annotation_ascii_frames() or _annotation_ascii_FEs() as appropriate.
+    This will be attached as a method to appropriate AttrDict instances
+    and called in the full pretty-printing of the instance.
+    '''
+    if sent._type=='fulltext_sentence' or ('annotationSet' in sent and len(sent.annotationSet)>2):
+        # a full-text sentence OR sentence with multiple targets.
+        # (multiple targets = >2 annotation sets, because the first annotation set is POS.)
+        return _annotation_ascii_frames(sent)
+    else:   # an FE annotation set, or an LU sentence with 1 target
+        return _annotation_ascii_FEs(sent)
+
+def _annotation_ascii_frames(sent):
+    '''
+    ASCII string rendering of the sentence along with its targets and frame names.
+    Called for all full-text sentences, as well as the few LU sentences with multiple
+    targets (e.g., fn.lu(6412).exemplars[82] has two want.v targets).
+    Line-wrapped to limit the display width.
+    '''
+    # list the target spans and their associated aset index
+    overt = []
+    for a,aset in enumerate(sent.annotationSet[1:]):
+        for j,k in aset.Target:
+            indexS = "[{0}]".format(a+1)
+            if aset.status=='UNANN' or aset.LU.status=='Problem':
+                indexS += " "
+                if aset.status=='UNANN':
+                    indexS += "!" # warning indicator that there is a frame annotation but no FE annotation
+                if aset.LU.status=='Problem':
+                    indexS += "?" # warning indicator that there is a missing LU definition (because the LU has Problem status)
+            overt.append((j,k,aset.LU.frame.name,indexS))
+    overt = sorted(overt)
+
+    duplicates = set()
+    for o,(j,k,fname,asetIndex) in enumerate(overt):
+        if o>0 and j<=overt[o-1][1]:
+            # multiple annotation sets on the same target
+            # (e.g. due to a coordination construction or multiple annotators)
+            if overt[o-1][:2]==(j,k) and overt[o-1][2]==fname:    # same target, same frame
+                # splice indices together
+                combinedIndex = overt[o-1][3] + asetIndex    # e.g., '[1][2]', '[1]! [2]'
+                combinedIndex = combinedIndex.replace(' !', '! ').replace(' ?', '? ')
+                overt[o-1] = overt[o-1][:3]+(combinedIndex,)
+                duplicates.add(o)
+            else:   # different frames, same or overlapping targets
+                s = sent.text
+                for j,k,fname,asetIndex in overt:
+                    s += '\n' + asetIndex + ' ' + sent.text[j:k] + ' :: ' + fname
+                s += '\n(Unable to display sentence with targets marked inline due to overlap)'
+                return s
+    for o in reversed(sorted(duplicates)):
+        del overt[o]
+
+    s0 = sent.text
+    s1 = ''
+    s11 = ''
+    s2 = ''
+    i = 0
+    adjust = 0
+    fAbbrevs = OrderedDict()
+    for j,k,fname,asetIndex in overt:
+        if not j>=i:
+            assert j>=i,('Overlapping targets?'+(' UNANN' if any(aset.status=='UNANN' for aset in sent.annotationSet[1:]) else ''),(j,k,asetIndex))
+        s1 += ' '*(j-i) + '*'*(k-j)
+        short = fname[:k-j]
+        if (k-j)<len(fname):
+            r = 0
+            while short in fAbbrevs:
+                if fAbbrevs[short]==fname:
+                    break
+                r += 1
+                short = fname[:k-j-1] + str(r)
+            else:   # short not in fAbbrevs
+                fAbbrevs[short] = fname
+        s11 += ' '*(j-i) + short.ljust(k-j)
+        if len(asetIndex)>(k-j):
+            # add space in the sentence to make room for the annotation index
+            amt = len(asetIndex)-(k-j)
+            s0 = s0[:k+adjust]+ '~'*amt + s0[k+adjust:] # '~' to prevent line wrapping
+            s1 = s1[:k+adjust]+ ' '*amt + s1[k+adjust:]
+            s11 = s11[:k+adjust]+ ' '*amt + s11[k+adjust:]
+            adjust += amt
+        s2 += ' '*(j-i) + asetIndex.ljust(k-j)
+        i = k
+
+    long_lines = [s0, s1, s11, s2]
+
+    outstr = '\n\n'.join(map('\n'.join, zip_longest(*mimic_wrap(long_lines), fillvalue=' '))).replace('~',' ')
+    outstr += '\n'
+    if fAbbrevs:
+        outstr += ' ('+', '.join('='.join(pair) for pair in fAbbrevs.items())+')'
+        assert len(fAbbrevs)==len(dict(fAbbrevs)),'Abbreviation clash'
+
+    return outstr
+
+def _annotation_ascii_FE_layer(overt, ni, feAbbrevs):
+    '''Helper for _annotation_ascii_FEs().'''
+    s1 = ''
+    s2 = ''
+    i = 0
+    for j,k,fename in overt:
+        s1 += ' '*(j-i) + ('^' if fename.islower() else '-')*(k-j)
+        short = fename[:k-j]
+        if len(fename)>len(short):
+            r = 0
+            while short in feAbbrevs:
+                if feAbbrevs[short]==fename:
+                    break
+                r += 1
+                short = fename[:k-j-1] + str(r)
+            else:   # short not in feAbbrevs
+                feAbbrevs[short] = fename
+        s2 += ' '*(j-i) + short.ljust(k-j)
+        i = k
+
+    sNI = ''
+    if ni:
+        sNI += ' ['+', '.join(':'.join(x) for x in sorted(ni.items()))+']'
+    return [s1,s2,sNI]
+
+def _annotation_ascii_FEs(sent):
+    '''
+    ASCII string rendering of the sentence along with a single target and its FEs.
+    Secondary and tertiary FE layers are included if present.
+    'sent' can be an FE annotation set or an LU sentence with a single target.
+    Line-wrapped to limit the display width.
+    '''
+    feAbbrevs = OrderedDict()
+    posspec = []    # POS-specific layer spans (e.g., Supp[ort], Cop[ula])
+    posspec_separate = False
+    for lyr in ('Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art'):
+        if lyr in sent and sent[lyr]:
+            for a,b,lbl in sent[lyr]:
+                if lbl=='X': # skip this, which covers an entire phrase typically containing the target and all its FEs
+                    # (but do display the Gov)
+                    continue
+                if any(1 for x,y,felbl in sent.FE[0] if x<=a<y or a<=x<b):
+                    # overlap between one of the POS-specific layers and first FE layer
+                    posspec_separate = True # show POS-specific layers on a separate line
+                posspec.append((a,b,lbl.lower().replace('-',''))) # lowercase Cop=>cop, Non-Asp=>nonasp, etc. to distinguish from FE names
+    if posspec_separate:
+        POSSPEC = _annotation_ascii_FE_layer(posspec, {}, feAbbrevs)
+    FE1 = _annotation_ascii_FE_layer(sorted(sent.FE[0] + (posspec if not posspec_separate else [])), sent.FE[1], feAbbrevs)
+    FE2 = FE3 = None
+    if 'FE2' in sent:
+        FE2 = _annotation_ascii_FE_layer(sent.FE2[0], sent.FE2[1], feAbbrevs)
+        if 'FE3' in sent:
+            FE3 = _annotation_ascii_FE_layer(sent.FE3[0], sent.FE3[1], feAbbrevs)
+
+    for i,j in sent.Target:
+        FE1span, FE1name, FE1exp = FE1
+        if len(FE1span)<j:
+            FE1span += ' '*(j-len(FE1span))
+        if len(FE1name)<j:
+            FE1name += ' '*(j-len(FE1name))
+            FE1[1] = FE1name
+        FE1[0] = FE1span[:i] + FE1span[i:j].replace(' ','*').replace('-','=') + FE1span[j:]
+    long_lines = [sent.text]
+    if posspec_separate:
+        long_lines.extend(POSSPEC[:2])
+    long_lines.extend([FE1[0], FE1[1]+FE1[2]]) # lines with no length limit
+    if FE2:
+        long_lines.extend([FE2[0], FE2[1]+FE2[2]])
+        if FE3:
+            long_lines.extend([FE3[0], FE3[1]+FE3[2]])
+    long_lines.append('')
+    outstr = '\n'.join(map('\n'.join, zip_longest(*mimic_wrap(long_lines), fillvalue=' ')))
+    if feAbbrevs:
+        outstr += '('+', '.join('='.join(pair) for pair in feAbbrevs.items())+')'
+        assert len(feAbbrevs)==len(dict(feAbbrevs)),'Abbreviation clash'
+    outstr += "\n"
+
+    return outstr
+
+def _pretty_fe(fe):
+
+    """
+    Helper function for pretty-printing a frame element.
+
+    :param fe: The frame element to be printed.
+    :type fe: AttrDict
+    :return: A nicely formatted string representation of the frame element.
+    :rtype: str
+    """
+    fekeys = fe.keys()
+    outstr = ""
+    outstr += "frame element ({0.ID}): {0.name}\n    of {1.name}({1.ID})\n".format(fe, fe.frame)
+    if 'definition' in fekeys:
+        outstr += "[definition]\n"
+        outstr += _pretty_longstring(fe.definition,'  ')
+    if 'abbrev' in fekeys:
+        outstr += "[abbrev] {0}\n".format(fe.abbrev)
+    if 'coreType' in fekeys:
+        outstr += "[coreType] {0}\n".format(fe.coreType)
+    if 'requiresFE' in fekeys:
+        outstr += "[requiresFE] "
+        if fe.requiresFE is None:
+            outstr += "<None>\n"
+        else:
+            outstr += "{0}({1})\n".format(fe.requiresFE.name, fe.requiresFE.ID)
+    if 'excludesFE' in fekeys:
+        outstr += "[excludesFE] "
+        if fe.excludesFE is None:
+            outstr += "<None>\n"
+        else:
+            outstr += "{0}({1})\n".format(fe.excludesFE.name, fe.excludesFE.ID)
+    if 'semType' in fekeys:
+        outstr += "[semType] "
+        if fe.semType is None:
+            outstr += "<None>\n"
+        else:
+            outstr += "\n  " + "{0}({1})".format(fe.semType.name, fe.semType.ID) + '\n'
+
+    return outstr
+
+def _pretty_frame(frame):
+
+    """
+    Helper function for pretty-printing a frame.
+
+    :param frame: The frame to be printed.
+    :type frame: AttrDict
+    :return: A nicely formatted string representation of the frame.
+    :rtype: str
+    """
+
+    outstr = ""
+    outstr += "frame ({0.ID}): {0.name}\n\n".format(frame)
+    outstr += "[URL] {0}\n\n".format(frame.URL)
+    outstr += "[definition]\n"
+    outstr += _pretty_longstring(frame.definition, '  ') + '\n'
+
+    outstr += "[semTypes] {0} semantic types\n".format(len(frame.semTypes))
+    outstr += "  "*(len(frame.semTypes)>0) + ", ".join("{0}({1})".format(x.name, x.ID) for x in frame.semTypes) + '\n'*(len(frame.semTypes)>0)
+
+    outstr += "\n[frameRelations] {0} frame relations\n".format(len(frame.frameRelations))
+    outstr += '  ' + '\n  '.join(repr(frel) for frel in frame.frameRelations) + '\n'
+
+    outstr += "\n[lexUnit] {0} lexical units\n".format(len(frame.lexUnit))
+    lustrs = []
+    for luName,lu in sorted(frame.lexUnit.items()):
+        tmpstr = '{0} ({1})'.format(luName, lu.ID)
+        lustrs.append(tmpstr)
+    outstr += "{0}\n".format(_pretty_longstring(', '.join(lustrs),prefix='  '))
+
+    outstr += "\n[FE] {0} frame elements\n".format(len(frame.FE))
+    fes = {}
+    for feName,fe in sorted(frame.FE.items()):
+        try:
+            fes[fe.coreType].append("{0} ({1})".format(feName, fe.ID))
+        except KeyError:
+            fes[fe.coreType] = []
+            fes[fe.coreType].append("{0} ({1})".format(feName, fe.ID))
+    for ct in sorted(fes.keys(), key=lambda ct2: ['Core','Core-Unexpressed','Peripheral','Extra-Thematic'].index(ct2)):
+        outstr += "{0:>16}: {1}\n".format(ct, ', '.join(sorted(fes[ct])))
+
+    outstr += "\n[FEcoreSets] {0} frame element core sets\n".format(len(frame.FEcoreSets))
+    outstr += "  " + '\n  '.join(", ".join([x.name for x in coreSet]) for coreSet in frame.FEcoreSets) + '\n'
+
+    return outstr
+
+class FramenetError(Exception):
+
+    """An exception class for framenet-related errors."""
+
+@python_2_unicode_compatible
+class AttrDict(dict):
+
+    """A class that wraps a dict and allows accessing the keys of the
+    dict as if they were attributes. Taken from here:
+       http://stackoverflow.com/a/14620633/8879
+
+    >>> foo = {'a':1, 'b':2, 'c':3}
+    >>> bar = AttrDict(foo)
+    >>> pprint(dict(bar))
+    {'a': 1, 'b': 2, 'c': 3}
+    >>> bar.b
+    2
+    >>> bar.d = 4
+    >>> pprint(dict(bar))
+    {'a': 1, 'b': 2, 'c': 3, 'd': 4}
+    """
+
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        #self.__dict__ = self
+
+    def __setattr__(self, name, value):
+        self[name] = value
+    def __getattr__(self, name):
+        if name=='_short_repr':
+            return self._short_repr
+        return self[name]
+    def __getitem__(self, name):
+        v = super(AttrDict,self).__getitem__(name)
+        if isinstance(v,Future):
+            return v._data()
+        return v
+
+    def _short_repr(self):
+        if '_type' in self:
+            if self['_type'].endswith('relation'):
+                return self.__repr__()
+            try:
+                return "<{0} ID={1} name={2}>".format(self['_type'], self['ID'], self['name'])
+            except KeyError:
+                try:    # no ID--e.g., for _type=lusubcorpus
+                    return "<{0} name={1}>".format(self['_type'], self['name'])
+                except KeyError:    # no name--e.g., for _type=lusentence
+                    return "<{0} ID={1}>".format(self['_type'], self['ID'])
+        else:
+            return self.__repr__()
+
+    def _str(self):
+        outstr = ""
+
+        if not '_type' in self:
+            outstr = _pretty_any(self)
+        elif self['_type'] == 'frame':
+            outstr = _pretty_frame(self)
+        elif self['_type'] == 'fe':
+            outstr = _pretty_fe(self)
+        elif self['_type'] == 'lu':
+            outstr = _pretty_lu(self)
+        elif self['_type'] == 'luexemplars': # list of ALL exemplars for LU
+            outstr = _pretty_exemplars(self, self[0].LU)
+        elif self['_type'] == 'fulltext_annotation': # list of all sentences for full-text doc
+            outstr = _pretty_fulltext_sentences(self)
+        elif self['_type'] == 'lusentence':
+            outstr = _pretty_annotation(self)
+        elif self['_type'] == 'fulltext_sentence':
+            outstr = _pretty_fulltext_sentence(self)
+        elif self['_type'] in ('luannotationset', 'fulltext_annotationset'):
+            outstr = _pretty_annotation(self, aset_level=True)
+        elif self['_type'] == 'posannotationset':
+            outstr = _pretty_pos(self)
+        elif self['_type'] == 'semtype':
+            outstr = _pretty_semtype(self)
+        elif self['_type'] == 'framerelationtype':
+            outstr = _pretty_frame_relation_type(self)
+        elif self['_type'] == 'framerelation':
+            outstr = _pretty_frame_relation(self)
+        elif self['_type'] == 'ferelation':
+            outstr = _pretty_fe_relation(self)
+        else:
+            outstr = _pretty_any(self)
+
+        # ensure result is unicode string prior to applying the
+        # @python_2_unicode_compatible decorator (because non-ASCII characters
+        # could in principle occur in the data and would trigger an encoding error when
+        # passed as arguments to str.format()).
+        # assert isinstance(outstr, unicode) # not in Python 3.2
+        return outstr
+
+    def __str__(self):
+        return self._str()
+    def __repr__(self):
+        return self.__str__()
+
+@python_2_unicode_compatible
+class SpecialList(list):
+    """
+    A list subclass which adds a '_type' attribute for special printing
+    (similar to an AttrDict, though this is NOT an AttrDict subclass).
+    """
+    def __init__(self, typ, *args, **kwargs):
+        super(SpecialList,self).__init__(*args, **kwargs)
+        self._type = typ
+
+    def _str(self):
+        outstr = ""
+
+        assert self._type
+        if len(self)==0:
+            outstr = "[]"
+        elif self._type == 'luexemplars': # list of ALL exemplars for LU
+            outstr = _pretty_exemplars(self, self[0].LU)
+        else:
+            assert False,self._type
+        return outstr
+
+    def __str__(self):
+        return self._str()
+    def __repr__(self):
+        return self.__str__()
+
+class Future(object):
+    """
+    Wraps and acts as a proxy for a value to be loaded lazily (on demand).
+    Adapted from https://gist.github.com/sergey-miryanov/2935416
+    """
+    def __init__(self, loader, *args, **kwargs):
+        """
+        :param loader: when called with no arguments, returns the value to be stored
+        :type loader: callable
+        """
+        super (Future, self).__init__(*args, **kwargs)
+        self._loader = loader
+        self._d = None
+    def _data(self):
+        if callable(self._loader):
+            self._d = self._loader()
+            self._loader = None # the data is now cached
+        return self._d
+
+    def __nonzero__(self):
+        return bool(self._data())
+    def __len__(self):
+        return len(self._data())
+
+    def __setitem__(self, key, value):
+        return self._data ().__setitem__(key, value)
+    def __getitem__(self, key):
+        return self._data ().__getitem__(key)
+    def __getattr__(self, key):
+        return self._data().__getattr__(key)
+
+    def __str__(self):
+        return self._data().__str__()
+    def __repr__(self):
+        return self._data().__repr__()
+
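+# A minimal sketch of how Future is used in this module: the loader runs only
+# the first time the wrapped value is actually needed and is then cached (the
+# names below are made up):
+#
+#   frame = Future(lambda: expensive_frame_lookup(frame_id))
+#   frame.name   # triggers expensive_frame_lookup, then acts like the frame
+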
+@python_2_unicode_compatible
+class PrettyDict(AttrDict):
+    """
+    Displays an abbreviated repr of values where possible.
+    Inherits from AttrDict, so a callable value will
+    be lazily converted to an actual value.
+    """
+    def __init__(self, *args, **kwargs):
+        _BREAK_LINES = kwargs.pop('breakLines', False)
+        super(PrettyDict, self).__init__(*args, **kwargs)
+        dict.__setattr__(self, '_BREAK_LINES', _BREAK_LINES)
+    def __repr__(self):
+        parts = []
+        for k,v in sorted(self.items()):
+            kv = repr(k)+': '
+            try:
+                kv += v._short_repr()
+            except AttributeError:
+                kv += repr(v)
+            parts.append(kv)
+        return '{'+(',\n ' if self._BREAK_LINES else ', ').join(parts)+'}'
+
+@python_2_unicode_compatible
+class PrettyList(list):
+    """
+    Displays an abbreviated repr of only the first several elements, not the whole list.
+    """
+    # from nltk.util
+    def __init__(self, *args, **kwargs):
+        self._MAX_REPR_SIZE = kwargs.pop('maxReprSize', 60)
+        self._BREAK_LINES = kwargs.pop('breakLines', False)
+        super(PrettyList, self).__init__(*args, **kwargs)
+    def __repr__(self):
+        """
+        Return a string representation for this corpus view that is
+        similar to a list's representation; but if it would be more
+        than 60 characters long, it is truncated.
+        """
+        pieces = []
+        length = 5
+
+        for elt in self:
+            pieces.append(elt._short_repr()) # key difference from inherited version: call to _short_repr()
+            length += len(pieces[-1]) + 2
+            if self._MAX_REPR_SIZE and length > self._MAX_REPR_SIZE and len(pieces) > 2:
+                return "[%s, ...]" % text_type(',\n ' if self._BREAK_LINES else ', ').join(pieces[:-1])
+        return "[%s]" % text_type(',\n ' if self._BREAK_LINES else ', ').join(pieces)
+
+@python_2_unicode_compatible
+class PrettyLazyMap(LazyMap):
+    """
+    Displays an abbreviated repr of only the first several elements, not the whole list.
+    """
+    # from nltk.util
+    _MAX_REPR_SIZE = 60
+    def __repr__(self):
+        """
+        Return a string representation for this corpus view that is
+        similar to a list's representation; but if it would be more
+        than 60 characters long, it is truncated.
+        """
+        pieces = []
+        length = 5
+        for elt in self:
+            pieces.append(elt._short_repr()) # key difference from inherited version: call to _short_repr()
+            length += len(pieces[-1]) + 2
+            if length > self._MAX_REPR_SIZE and len(pieces) > 2:
+                return "[%s, ...]" % text_type(', ').join(pieces[:-1])
+        return "[%s]" % text_type(', ').join(pieces)
+
+@python_2_unicode_compatible
+class PrettyLazyIteratorList(LazyIteratorList):
+    """
+    Displays an abbreviated repr of only the first several elements, not the whole list.
+    """
+    # from nltk.util
+    _MAX_REPR_SIZE = 60
+    def __repr__(self):
+        """
+        Return a string representation for this corpus view that is
+        similar to a list's representation; but if it would be more
+        than 60 characters long, it is truncated.
+        """
+        pieces = []
+        length = 5
+        for elt in self:
+            pieces.append(elt._short_repr()) # key difference from inherited version: call to _short_repr()
+            length += len(pieces[-1]) + 2
+            if length > self._MAX_REPR_SIZE and len(pieces) > 2:
+                return "[%s, ...]" % text_type(', ').join(pieces[:-1])
+        return "[%s]" % text_type(', ').join(pieces)
+
+@python_2_unicode_compatible
+class PrettyLazyConcatenation(LazyConcatenation):
+    """
+    Displays an abbreviated repr of only the first several elements, not the whole list.
+    """
+    # from nltk.util
+    _MAX_REPR_SIZE = 60
+    def __repr__(self):
+        """
+        Return a string representation for this corpus view that is
+        similar to a list's representation; but if it would be more
+        than 60 characters long, it is truncated.
+        """
+        pieces = []
+        length = 5
+        for elt in self:
+            pieces.append(elt._short_repr()) # key difference from inherited version: call to _short_repr()
+            length += len(pieces[-1]) + 2
+            if length > self._MAX_REPR_SIZE and len(pieces) > 2:
+                return "[%s, ...]" % text_type(', ').join(pieces[:-1])
+        return "[%s]" % text_type(', ').join(pieces)
+
+    def __add__(self, other):
+        """Return a list concatenating self with other."""
+        return PrettyLazyIteratorList(itertools.chain(self, other))
+
+    def __radd__(self, other):
+        """Return a list concatenating other with self."""
+        return PrettyLazyIteratorList(itertools.chain(other, self))
+
+
+class FramenetCorpusReader(XMLCorpusReader):
+    """A corpus reader for the Framenet Corpus.
+
+    >>> from nltk.corpus import framenet as fn
+    >>> fn.lu(3238).frame.lexUnit['glint.v'] is fn.lu(3238)
+    True
+    >>> fn.frame_by_name('Replacing') is fn.lus('replace.v')[0].frame
+    True
+    >>> fn.lus('prejudice.n')[0].frame.frameRelations == fn.frame_relations('Partiality')
+    True
+    """
+
+    _bad_statuses = ['Problem']
+    """
+    When loading LUs for a frame, those whose status is in this list will be ignored.
+    Due to caching, if user code modifies this, it should do so before loading any data.
+    'Problem' should always be listed for FrameNet 1.5, as these LUs are not included
+    in the XML index.
+    """
+
+    _warnings = False
+
+    def warnings(self, v):
+        """Enable or disable warnings of data integrity issues as they are encountered.
+        If v is truthy, warnings will be enabled.
+
+        (This is a function rather than just an attribute/property to ensure that if
+        enabling warnings is the first action taken, the corpus reader is instantiated first.)
+        """
+        self._warnings = v
+
+    def __init__(self, root, fileids):
+        XMLCorpusReader.__init__(self, root, fileids)
+
+        # framenet corpus sub dirs
+        # sub dir containing the xml files for frames
+        self._frame_dir = "frame"
+        # sub dir containing the xml files for lexical units
+        self._lu_dir = "lu"
+        # sub dir containing the xml files for fulltext annotation files
+        self._fulltext_dir = "fulltext"
+
+        # location of latest development version of FrameNet
+        self._fnweb_url = "https://framenet2.icsi.berkeley.edu/fnReports/data"
+
+        # Indexes used for faster look-ups
+        self._frame_idx = None
+        self._cached_frames = {}    # name -> ID
+        self._lu_idx = None
+        self._fulltext_idx = None
+        self._semtypes = None
+        self._freltyp_idx = None    # frame relation types (Inheritance, Using, etc.)
+        self._frel_idx = None   # frame-to-frame relation instances
+        self._ferel_idx = None  # FE-to-FE relation instances
+        self._frel_f_idx = None # frame-to-frame relations associated with each frame
+
+    def help(self, attrname=None):
+        """Display help information summarizing the main methods."""
+
+        if attrname is not None:
+            return help(self.__getattribute__(attrname))
+
+        # No need to mention frame_by_name() or frame_by_id(),
+        # as it's easier to just call frame().
+        # Also not mentioning lu_basic().
+
+
+        msg = """
+Citation: Nathan Schneider and Chuck Wooters (2017), 
+"The NLTK FrameNet API: Designing for Discoverability with a Rich Linguistic Resource". 
+Proceedings of EMNLP: System Demonstrations. https://arxiv.org/abs/1703.07438
+
+Use the following methods to access data in FrameNet.
+Provide a method name to `help()` for more information.
+
+FRAMES
+======
+
+frame() to look up a frame by its exact name or ID
+frames() to get frames matching a name pattern
+frames_by_lemma() to get frames containing an LU matching a name pattern
+frame_ids_and_names() to get a mapping from frame IDs to names
+
+FRAME ELEMENTS
+==============
+
+fes() to get frame elements (a.k.a. roles) matching a name pattern, optionally constrained
+  by a frame name pattern
+
+LEXICAL UNITS
+=============
+
+lu() to look up an LU by its ID
+lus() to get lexical units matching a name pattern, optionally constrained by frame
+lu_ids_and_names() to get a mapping from LU IDs to names
+
+RELATIONS
+=========
+
+frame_relation_types() to get the different kinds of frame-to-frame relations
+  (Inheritance, Subframe, Using, etc.).
+frame_relations() to get the relation instances, optionally constrained by
+  frame(s) or relation type
+fe_relations() to get the frame element pairs belonging to a frame-to-frame relation
+
+SEMANTIC TYPES
+==============
+
+semtypes() to get the different kinds of semantic types that can be applied to
+  FEs, LUs, and entire frames
+semtype() to look up a particular semtype by name, ID, or abbreviation
+semtype_inherits() to check whether two semantic types have a subtype-supertype
+  relationship in the semtype hierarchy
+propagate_semtypes() to apply inference rules that distribute semtypes over relations
+  between FEs
+
+ANNOTATIONS
+===========
+
+annotations() to get annotation sets, in which a token in a sentence is annotated
+  with a lexical unit in a frame, along with its frame elements and their syntactic properties;
+  can be constrained by LU name pattern and limited to lexicographic exemplars or full-text.
+  Sentences of full-text annotation can have multiple annotation sets.
+sents() to get annotated sentences illustrating one or more lexical units
+exemplars() to get sentences of lexicographic annotation, most of which have
+  just 1 annotation set; can be constrained by LU name pattern, frame, and overt FE(s)
+doc() to look up a document of full-text annotation by its ID
+docs() to get documents of full-text annotation that match a name pattern
+docs_metadata() to get metadata about all full-text documents without loading them
+ft_sents() to iterate over sentences of full-text annotation
+
+UTILITIES
+=========
+
+buildindexes() loads metadata about all frames, LUs, etc. into memory to avoid
+  delay when one is accessed for the first time. It does not load annotations.
+readme() gives the text of the FrameNet README file
+warnings(True) to display corpus consistency warnings when loading data
+        """
+        print(msg)
+
+    def _buildframeindex(self):
+        # The total number of Frames in Framenet is fairly small (~1200) so
+        # this index should not be very large
+        if not self._frel_idx:
+            self._buildrelationindex()  # always load frame relations before frames,
+            # otherwise weird ordering effects might result in incomplete information
+        self._frame_idx = {}
+        for f in XMLCorpusView(self.abspath("frameIndex.xml"),
+                               'frameIndex/frame', self._handle_elt):
+            self._frame_idx[f['ID']] = f
+
+    def _buildcorpusindex(self):
+        # The total number of fulltext annotated documents in Framenet
+        # is fairly small (~90) so this index should not be very large
+        self._fulltext_idx = {}
+        for doclist in XMLCorpusView(self.abspath("fulltextIndex.xml"),
+                                     'fulltextIndex/corpus',
+                                     self._handle_fulltextindex_elt):
+            for doc in doclist:
+                self._fulltext_idx[doc.ID] = doc
+
+    def _buildluindex(self):
+        # The number of LUs in Framenet is about 13,000 so this index
+        # should not be very large
+        self._lu_idx = {}
+        for lu in XMLCorpusView(self.abspath("luIndex.xml"),
+                                'luIndex/lu', self._handle_elt):
+            self._lu_idx[lu['ID']] = lu # populate with LU index entries. if any of these
+            # are looked up they will be replaced by full LU objects.
+
+    def _buildrelationindex(self):
+        #print('building relation index...', file=sys.stderr)
+        freltypes = PrettyList(x for x in XMLCorpusView(self.abspath("frRelation.xml"),
+                                            'frameRelations/frameRelationType',
+                                            self._handle_framerelationtype_elt))
+        self._freltyp_idx = {}
+        self._frel_idx = {}
+        self._frel_f_idx = defaultdict(set)
+        self._ferel_idx = {}
+
+        for freltyp in freltypes:
+            self._freltyp_idx[freltyp.ID] = freltyp
+            for frel in freltyp.frameRelations:
+                supF = frel.superFrame = frel[freltyp.superFrameName] = Future((lambda fID: lambda: self.frame_by_id(fID))(frel.supID))
+                subF = frel.subFrame = frel[freltyp.subFrameName] = Future((lambda fID: lambda: self.frame_by_id(fID))(frel.subID))
+                self._frel_idx[frel.ID] = frel
+                self._frel_f_idx[frel.supID].add(frel.ID)
+                self._frel_f_idx[frel.subID].add(frel.ID)
+                for ferel in frel.feRelations:
+                    ferel.superFrame = supF
+                    ferel.subFrame = subF
+                    ferel.superFE = Future((lambda fer: lambda: fer.superFrame.FE[fer.superFEName])(ferel))
+                    ferel.subFE = Future((lambda fer: lambda: fer.subFrame.FE[fer.subFEName])(ferel))
+                    self._ferel_idx[ferel.ID] = ferel
+        #print('...done building relation index', file=sys.stderr)
+
+    def _warn(self, *message, **kwargs):
+        if self._warnings:
+            kwargs.setdefault('file', sys.stderr)
+            print(*message, **kwargs)
+
+    def readme(self):
+        """
+        Return the contents of the corpus README.txt (or README) file.
+        """
+        try:
+            return self.open("README.txt").read()
+        except IOError:
+            return self.open("README").read()
+
+    def buildindexes(self):
+        """
+        Build the internal indexes to make look-ups faster.
+        """
+        # Frames
+        self._buildframeindex()
+        # LUs
+        self._buildluindex()
+        # Fulltext annotation corpora index
+        self._buildcorpusindex()
+        # frame and FE relations
+        self._buildrelationindex()
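+
+    # Example (a sketch): eagerly build all metadata indexes so the first call to
+    # frame()/lu()/docs() does not pay the index-loading cost; annotation data is
+    # still loaded lazily. The frame name follows the doctests elsewhere in this file.
+    #
+    #   >>> from nltk.corpus import framenet as fn
+    #   >>> fn.buildindexes()        # frame, LU, full-text, and relation indexes
+    #   >>> fn.frame('Apply_heat')   # now served without an index-building delay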
+
+    def doc(self, fn_docid):
+        """
+        Returns the annotated document whose id number is
+        ``fn_docid``. This id number can be obtained by calling the
+        ``docs()`` or ``docs_metadata()`` function.
+
+        The dict that is returned from this function will contain the
+        following keys:
+
+        - '_type'      : 'fulltextannotation'
+        - 'sentence'   : a list of sentences in the document
+           - Each item in the list is a dict containing the following keys:
+              - 'ID'    : the ID number of the sentence
+              - '_type' : 'sentence'
+              - 'text'  : the text of the sentence
+              - 'paragNo' : the paragraph number
+              - 'sentNo'  : the sentence number
+              - 'docID'   : the document ID number
+              - 'corpID'  : the corpus ID number
+              - 'aPos'    : the annotation position
+              - 'annotationSet' : a list of annotation layers for the sentence
+                 - Each item in the list is a dict containing the following keys:
+                    - 'ID'       : the ID number of the annotation set
+                    - '_type'    : 'annotationset'
+                    - 'status'   : either 'MANUAL' or 'UNANN'
+                    - 'luName'   : (only if status is 'MANUAL')
+                    - 'luID'     : (only if status is 'MANUAL')
+                    - 'frameID'  : (only if status is 'MANUAL')
+                    - 'frameName': (only if status is 'MANUAL')
+                    - 'layer' : a list of labels for the layer
+                       - Each item in the layer is a dict containing the
+                         following keys:
+                          - '_type': 'layer'
+                          - 'rank'
+                          - 'name'
+                          - 'label' : a list of labels in the layer
+                             - Each item is a dict containing the following keys:
+                                - 'start'
+                                - 'end'
+                                - 'name'
+                                - 'feID' (optional)
+
+        :param fn_docid: The Framenet id number of the document
+        :type fn_docid: int
+        :return: Information about the annotated document
+        :rtype: dict
+        """
+        try:
+            xmlfname = self._fulltext_idx[fn_docid].filename
+        except TypeError:  # happens when self._fulltext_idx == None
+            # build the index
+            self._buildcorpusindex()
+            xmlfname = self._fulltext_idx[fn_docid].filename
+        except KeyError:  # probably means that fn_docid was not in the index
+            raise FramenetError("Unknown document id: {0}".format(fn_docid))
+
+        # construct the path name for the xml file containing the document info
+        locpath = os.path.join(
+            "{0}".format(self._root), self._fulltext_dir, xmlfname)
+
+        # Grab the top-level xml element containing the fulltext annotation
+        elt = XMLCorpusView(locpath, 'fullTextAnnotation')[0]
+        info = self._handle_fulltextannotation_elt(elt)
+        # add metadata
+        for k,v in self._fulltext_idx[fn_docid].items():
+            info[k] = v
+        return info
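+
+    # Example (a sketch; document IDs vary across FrameNet releases, so the ID is
+    # taken from the metadata index rather than hard-coded):
+    #
+    #   >>> from nltk.corpus import framenet as fn
+    #   >>> meta = fn.docs_metadata()[0]      # metadata for one full-text document
+    #   >>> d = fn.doc(meta.ID)               # full annotation for that document
+    #   >>> (d._type, len(d.sentence))        # ('fulltextannotation', <#sentences>)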
+
+    def frame_by_id(self, fn_fid, ignorekeys=[]):
+        """
+        Get the details for the specified Frame using the frame's id
+        number.
+
+        Usage examples:
+
+        >>> from nltk.corpus import framenet as fn
+        >>> f = fn.frame_by_id(256)
+        >>> f.ID
+        256
+        >>> f.name
+        'Medical_specialties'
+        >>> f.definition
+        "This frame includes words that name ..."
+
+        :param fn_fid: The Framenet id number of the frame
+        :type fn_fid: int
+        :param ignorekeys: The keys to ignore. These keys will not be
+            included in the output. (optional)
+        :type ignorekeys: list(str)
+        :return: Information about a frame
+        :rtype: dict
+
+        Also see the ``frame()`` function for details about what is
+        contained in the dict that is returned.
+        """
+
+        # get the name of the frame with this id number
+        try:
+            fentry = self._frame_idx[fn_fid]
+            if '_type' in fentry:
+                return fentry   # full frame object is cached
+            name = fentry['name']
+        except TypeError:
+            self._buildframeindex()
+            name = self._frame_idx[fn_fid]['name']
+        except KeyError:
+            raise FramenetError('Unknown frame id: {0}'.format(fn_fid))
+
+        return self.frame_by_name(name, ignorekeys, check_cache=False)
+
+    def frame_by_name(self, fn_fname, ignorekeys=[], check_cache=True):
+        """
+        Get the details for the specified Frame using the frame's name.
+
+        Usage examples:
+
+        >>> from nltk.corpus import framenet as fn
+        >>> f = fn.frame_by_name('Medical_specialties')
+        >>> f.ID
+        256
+        >>> f.name
+        'Medical_specialties'
+        >>> f.definition
+        "This frame includes words that name ..."
+
+        :param fn_fname: The name of the frame
+        :type fn_fname: str
+        :param ignorekeys: The keys to ignore. These keys will not be
+            included in the output. (optional)
+        :type ignorekeys: list(str)
+        :return: Information about a frame
+        :rtype: dict
+
+        Also see the ``frame()`` function for details about what is
+        contained in the dict that is returned.
+        """
+
+        if check_cache and fn_fname in self._cached_frames:
+            return self._frame_idx[self._cached_frames[fn_fname]]
+        elif not self._frame_idx:
+            self._buildframeindex()
+
+        # construct the path name for the xml file containing the Frame info
+        locpath = os.path.join(
+            "{0}".format(self._root), self._frame_dir, fn_fname + ".xml")
+        #print(locpath, file=sys.stderr)
+        # Grab the xml for the frame
+        try:
+            elt = XMLCorpusView(locpath, 'frame')[0]
+        except IOError:
+            raise FramenetError('Unknown frame: {0}'.format(fn_fname))
+
+        fentry = self._handle_frame_elt(elt, ignorekeys)
+        assert fentry
+
+        fentry.URL = self._fnweb_url + '/' + self._frame_dir + '/' + fn_fname + '.xml'
+
+        # INFERENCE RULE: propagate lexical semtypes from the frame to all its LUs
+        for st in fentry.semTypes:
+            if st.rootType.name=='Lexical_type':
+                for lu in fentry.lexUnit.values():
+                    if not any(x is st for x in lu.semTypes):  # identity containment check
+                        lu.semTypes.append(st)
+
+
+        self._frame_idx[fentry.ID] = fentry
+        self._cached_frames[fentry.name] = fentry.ID
+        '''
+        # now set up callables to resolve the LU pointers lazily.
+        # (could also do this here--caching avoids infinite recursion.)
+        for luName,luinfo in fentry.lexUnit.items():
+            fentry.lexUnit[luName] = (lambda luID: Future(lambda: self.lu(luID)))(luinfo.ID)
+        '''
+        return fentry
+
+    def frame(self, fn_fid_or_fname, ignorekeys=[]):
+        """
+        Get the details for the specified Frame using the frame's name
+        or id number.
+
+        Usage examples:
+
+        >>> from nltk.corpus import framenet as fn
+        >>> f = fn.frame(256)
+        >>> f.name
+        'Medical_specialties'
+        >>> f = fn.frame('Medical_specialties')
+        >>> f.ID
+        256
+        >>> # ensure non-ASCII character in definition doesn't trigger an encoding error:
+        >>> fn.frame('Imposing_obligation')
+        frame (1494): Imposing_obligation...
+
+        The dict that is returned from this function will contain the
+        following information about the Frame:
+
+        - 'name'       : the name of the Frame (e.g. 'Birth', 'Apply_heat', etc.)
+        - 'definition' : textual definition of the Frame
+        - 'ID'         : the internal ID number of the Frame
+        - 'semTypes'   : a list of semantic types for this frame
+           - Each item in the list is a dict containing the following keys:
+              - 'name' : can be used with the semtype() function
+              - 'ID'   : can be used with the semtype() function
+
+        - 'lexUnit'    : a dict containing all of the LUs for this frame.
+                         The keys in this dict are the names of the LUs and
+                         the value for each key is itself a dict containing
+                         info about the LU (see the lu() function for more info.)
+
+        - 'FE' : a dict containing the Frame Elements that are part of this frame
+                 The keys in this dict are the names of the FEs (e.g. 'Body_system')
+                 and the values are dicts containing the following keys
+              - 'definition' : The definition of the FE
+              - 'name'       : The name of the FE e.g. 'Body_system'
+              - 'ID'         : The id number
+              - '_type'      : 'fe'
+              - 'abbrev'     : Abbreviation e.g. 'bod'
+              - 'coreType'   : one of "Core", "Peripheral", or "Extra-Thematic"
+              - 'semType'    : if not None, a dict with the following two keys:
+                 - 'name' : name of the semantic type. can be used with
+                            the semtype() function
+                 - 'ID'   : id number of the semantic type. can be used with
+                            the semtype() function
+              - 'requiresFE' : if not None, a dict with the following two keys:
+                 - 'name' : the name of another FE in this frame
+                 - 'ID'   : the id of the other FE in this frame
+              - 'excludesFE' : if not None, a dict with the following two keys:
+                 - 'name' : the name of another FE in this frame
+                 - 'ID'   : the id of the other FE in this frame
+
+        - 'frameRelation'      : a list of objects describing frame relations
+        - 'FEcoreSets'  : a list of Frame Element core sets for this frame
+           - Each item in the list is a list of FE objects
+
+        :param fn_fid_or_fname: The Framenet name or id number of the frame
+        :type fn_fid_or_fname: int or str
+        :param ignorekeys: The keys to ignore. These keys will not be
+            included in the output. (optional)
+        :type ignorekeys: list(str)
+        :return: Information about a frame
+        :rtype: dict
+        """
+
+        # get the frame info by name or id number
+        if isinstance(fn_fid_or_fname, string_types):
+            f = self.frame_by_name(fn_fid_or_fname, ignorekeys)
+        else:
+            f = self.frame_by_id(fn_fid_or_fname, ignorekeys)
+
+        return f
+
+    def frames_by_lemma(self, pat):
+        """
+        Returns a list of all frames that contain LUs in which the
+        ``name`` attribute of the LU matches the given regular expression
+        ``pat``. Note that LU names are composed of "lemma.POS", where
+        the "lemma" part can be made up of either a single lexeme
+        (e.g. 'run') or multiple lexemes (e.g. 'a little').
+
+        Note: if you intend to do a lot of this kind of searching, you
+        should build an index that maps from lemmas to frames, because
+        each call to frames_by_lemma() has to search through ALL of the
+        frame XML files in the db.
+
+        >>> from nltk.corpus import framenet as fn
+        >>> fn.frames_by_lemma(r'(?i)a little') # doctest: +ELLIPSIS
+        [<frame ID=189 name=Quanti...>, <frame ID=2001 name=Degree>]
+
+        :return: A list of frame objects.
+        :rtype: list(AttrDict)
+        """
+        return PrettyList(f for f in self.frames() if any(re.search(pat, luName) for luName in f.lexUnit))
+
+    def lu_basic(self, fn_luid):
+        """
+        Returns basic information about the LU whose id is
+        ``fn_luid``. This is basically just a wrapper around the
+        ``lu()`` function with "subCorpus" and "exemplars" info excluded.
+
+        >>> from nltk.corpus import framenet as fn
+        >>> lu = PrettyDict(fn.lu_basic(256), breakLines=True)
+        >>> # ellipses account for differences between FN 1.5 and 1.7
+        >>> lu # doctest: +ELLIPSIS
+        {'ID': 256,
+         'POS': 'V',
+         'URL': u'https://framenet2.icsi.berkeley.edu/fnReports/data/lu/lu256.xml',
+         '_type': 'lu',
+         'cBy': ...,
+         'cDate': '02/08/2001 01:27:50 PST Thu',
+         'definition': 'COD: be aware of beforehand; predict.',
+         'definitionMarkup': 'COD: be aware of beforehand; predict.',
+         'frame': <frame ID=26 name=Expectation>,
+         'lemmaID': 15082,
+         'lexemes': [{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}],
+         'name': 'foresee.v',
+         'semTypes': [],
+         'sentenceCount': {'annotated': ..., 'total': ...},
+         'status': 'FN1_Sent'}
+
+        :param fn_luid: The id number of the desired LU
+        :type fn_luid: int
+        :return: Basic information about the lexical unit
+        :rtype: dict
+        """
+        return self.lu(fn_luid, ignorekeys=['subCorpus', 'exemplars'])
+
+    def lu(self, fn_luid, ignorekeys=[], luName=None, frameID=None, frameName=None):
+        """
+        Access a lexical unit by its ID. luName, frameID, and frameName are used
+        only in the event that the LU does not have a file in the database
+        (which is the case for LUs with "Problem" status); in this case,
+        a placeholder LU is created which just contains its name, ID, and frame.
+
+
+        Usage examples:
+
+        >>> from nltk.corpus import framenet as fn
+        >>> fn.lu(256).name
+        'foresee.v'
+        >>> fn.lu(256).definition
+        'COD: be aware of beforehand; predict.'
+        >>> fn.lu(256).frame.name
+        'Expectation'
+        >>> pprint(list(map(PrettyDict, fn.lu(256).lexemes)))
+        [{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}]
+
+        >>> fn.lu(227).exemplars[23]
+        exemplar sentence (352962):
+        [sentNo] 0
+        [aPos] 59699508
+        <BLANKLINE>
+        [LU] (227) guess.v in Coming_to_believe
+        <BLANKLINE>
+        [frame] (23) Coming_to_believe
+        <BLANKLINE>
+        [annotationSet] 2 annotation sets
+        <BLANKLINE>
+        [POS] 18 tags
+        <BLANKLINE>
+        [POS_tagset] BNC
+        <BLANKLINE>
+        [GF] 3 relations
+        <BLANKLINE>
+        [PT] 3 phrases
+        <BLANKLINE>
+        [Other] 1 entry
+        <BLANKLINE>
+        [text] + [Target] + [FE]
+        <BLANKLINE>
+        When he was inside the house , Culley noticed the characteristic
+                                                      ------------------
+                                                      Content
+        <BLANKLINE>
+        he would n't have guessed at .
+        --                ******* --
+        Co                        C1 [Evidence:INI]
+         (Co=Cognizer, C1=Content)
+        <BLANKLINE>
+        <BLANKLINE>
+
+        The dict that is returned from this function will contain most of the
+        following information about the LU. Note that some LUs do not contain
+        all of these pieces of information - particularly 'totalAnnotated' and
+        'incorporatedFE' may be missing in some LUs:
+
+        - 'name'       : the name of the LU (e.g. 'merger.n')
+        - 'definition' : textual definition of the LU
+        - 'ID'         : the internal ID number of the LU
+        - '_type'      : 'lu'
+        - 'status'     : e.g. 'Created'
+        - 'frame'      : Frame that this LU belongs to
+        - 'POS'        : the part of speech of this LU (e.g. 'N')
+        - 'totalAnnotated' : total number of examples annotated with this LU
+        - 'incorporatedFE' : FE that incorporates this LU (e.g. 'Ailment')
+        - 'sentenceCount'  : a dict with the following two keys:
+                 - 'annotated': number of sentences annotated with this LU
+                 - 'total'    : total number of sentences with this LU
+
+        - 'lexemes'  : a list of dicts describing the lemma of this LU.
+           Each dict in the list contains these keys:
+           - 'POS'     : part of speech e.g. 'N'
+           - 'name'    : either single-lexeme e.g. 'merger' or
+                         multi-lexeme e.g. 'a little'
+           - 'order': the order of the lexeme in the lemma (starting from 1)
+           - 'headword': a boolean ('true' or 'false')
+           - 'breakBefore': Can this lexeme be separated from the previous lexeme?
+                Consider: "take over.v" as in:
+                         Germany took over the Netherlands in 2 days.
+                         Germany took the Netherlands over in 2 days.
+                In this case, 'breakBefore' would be "true" for the lexeme
+                "over". Contrast this with "take after.v" as in:
+                         Mary takes after her grandmother.
+                        *Mary takes her grandmother after.
+                In this case, 'breakBefore' would be "false" for the lexeme "after"
+
+        - 'lemmaID'    : Can be used to connect lemmas in different LUs
+        - 'semTypes'   : a list of semantic type objects for this LU
+        - 'subCorpus'  : a list of subcorpora
+           - Each item in the list is a dict containing the following keys:
+              - 'name' :
+              - 'sentence' : a list of sentences in the subcorpus
+                 - each item in the list is a dict with the following keys:
+                    - 'ID':
+                    - 'sentNo':
+                    - 'text': the text of the sentence
+                    - 'aPos':
+                    - 'annotationSet': a list of annotation sets
+                       - each item in the list is a dict with the following keys:
+                          - 'ID':
+                          - 'status':
+                          - 'layer': a list of layers
+                             - each layer is a dict containing the following keys:
+                                - 'name': layer name (e.g. 'BNC')
+                                - 'rank':
+                                - 'label': a list of labels for the layer
+                                   - each label is a dict containing the following keys:
+                                      - 'start': start pos of label in sentence 'text' (0-based)
+                                      - 'end': end pos of label in sentence 'text' (0-based)
+                                      - 'name': name of label (e.g. 'NN1')
+
+        Under the hood, this implementation looks up the lexical unit information
+        in the *frame* definition file. That file does not contain
+        corpus annotations, so the LU files will be accessed on demand if those are
+        needed. In principle, valence patterns could be loaded here too,
+        though these are not currently supported.
+
+        :param fn_luid: The id number of the lexical unit
+        :type fn_luid: int
+        :param ignorekeys: The keys to ignore. These keys will not be
+            included in the output. (optional)
+        :type ignorekeys: list(str)
+        :return: All information about the lexical unit
+        :rtype: dict
+        """
+        # look for this LU in cache
+        if not self._lu_idx:
+            self._buildluindex()
+        OOV = object()
+        luinfo = self._lu_idx.get(fn_luid, OOV)
+        if luinfo is OOV:
+            # LU not in the index. We create a placeholder by falling back to
+            # luName, frameID, and frameName. However, this will not be listed
+            # among the LUs for its frame.
+            self._warn('LU ID not found: {0} ({1}) in {2} ({3})'.format(luName, fn_luid, frameName, frameID))
+            luinfo = AttrDict({'_type': 'lu', 'ID': fn_luid, 'name': luName,
+                               'frameID': frameID, 'status': 'Problem'})
+            f = self.frame_by_id(luinfo.frameID)
+            assert f.name==frameName,(f.name,frameName)
+            luinfo['frame'] = f
+            self._lu_idx[fn_luid] = luinfo
+        elif '_type' not in luinfo:
+            # we only have an index entry for the LU. loading the frame will replace this.
+            f = self.frame_by_id(luinfo.frameID)
+            luinfo = self._lu_idx[fn_luid]
+        if ignorekeys:
+            return AttrDict(dict((k, v) for k, v in luinfo.items() if k not in ignorekeys))
+
+        return luinfo
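+
+    # Example (a sketch, reusing LU 256 from the doctests above; exemplar counts
+    # differ between FrameNet releases):
+    #
+    #   >>> from nltk.corpus import framenet as fn
+    #   >>> lu = fn.lu(256)            # basic info comes from the frame file
+    #   >>> (lu.name, lu.frame.name)   # ('foresee.v', 'Expectation')
+    #   >>> exs = lu.exemplars         # first access reads lu256.xml on demand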
+
+    def _lu_file(self, lu, ignorekeys=[]):
+        """
+        Augment the LU information that was loaded from the frame file
+        with additional information from the LU file.
+        """
+        fn_luid = lu.ID
+
+        fname = "lu{0}.xml".format(fn_luid)
+        locpath = os.path.join("{0}".format(self._root), self._lu_dir, fname)
+        #print(locpath, file=sys.stderr)
+        if not self._lu_idx:
+            self._buildluindex()
+
+        try:
+            elt = XMLCorpusView(locpath, 'lexUnit')[0]
+        except IOError:
+            raise FramenetError('Unknown LU id: {0}'.format(fn_luid))
+
+        lu2 = self._handle_lexunit_elt(elt, ignorekeys)
+        lu.URL = self._fnweb_url + '/' + self._lu_dir + '/' + fname
+        lu.subCorpus = lu2.subCorpus
+        lu.exemplars = SpecialList('luexemplars',
+                                   [sent for subc in lu.subCorpus for sent in subc.sentence])
+        for sent in lu.exemplars:
+            sent['LU'] = lu
+            sent['frame'] = lu.frame
+            for aset in sent.annotationSet:
+                aset['LU'] = lu
+                aset['frame'] = lu.frame
+
+        return lu
+
+    def _loadsemtypes(self):
+        """Create the semantic types index."""
+        self._semtypes = AttrDict()
+        semtypeXML = [x for x in XMLCorpusView(self.abspath("semTypes.xml"),
+                                             'semTypes/semType',
+                                             self._handle_semtype_elt)]
+        for st in semtypeXML:
+            n = st['name']
+            a = st['abbrev']
+            i = st['ID']
+            # Both name and abbrev should be able to retrieve the
+            # ID. The ID will retrieve the semantic type dict itself.
+            self._semtypes[n] = i
+            self._semtypes[a] = i
+            self._semtypes[i] = st
+        # now that all individual semtype XML is loaded, we can link them together
+        roots = []
+        for st in self.semtypes():
+            if st.superType:
+                st.superType = self.semtype(st.superType.supID)
+                st.superType.subTypes.append(st)
+            else:
+                if st not in roots: roots.append(st)
+                st.rootType = st
+        queue = list(roots)
+        assert queue
+        while queue:
+            st = queue.pop(0)
+            for child in st.subTypes:
+                child.rootType = st.rootType
+                queue.append(child)
+        #self.propagate_semtypes()  # apply inferencing over FE relations
+
+    def propagate_semtypes(self):
+        """
+        Apply inference rules to distribute semtypes over relations between FEs.
+        For FrameNet 1.5, this results in 1011 semtypes being propagated.
+        (Not done by default because it requires loading all frame files,
+        which takes several seconds. If this needed to be fast, it could be rewritten
+        to traverse the neighboring relations on demand for each FE semtype.)
+
+        >>> from nltk.corpus import framenet as fn
+        >>> x = sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType)
+        >>> fn.propagate_semtypes()
+        >>> y = sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType)
+        >>> y-x > 1000
+        True
+        """
+        if not self._semtypes:
+            self._loadsemtypes()
+        if not self._ferel_idx:
+            self._buildrelationindex()
+        changed = True
+        i = 0
+        nPropagations = 0
+        while changed:
+            # make a pass and see if anything needs to be propagated
+            i += 1
+            changed = False
+            for ferel in self.fe_relations():
+                superST = ferel.superFE.semType
+                subST = ferel.subFE.semType
+                try:
+                    if superST and superST is not subST:
+                        # propagate downward
+                        assert subST is None or self.semtype_inherits(subST, superST),(superST.name,ferel,subST.name)
+                        if subST is None:
+                            ferel.subFE.semType = subST = superST
+                            changed = True
+                            nPropagations += 1
+                    if ferel.type.name in ['Perspective_on', 'Subframe', 'Precedes'] and subST \
+                        and subST is not superST:
+                        # propagate upward
+                        assert superST is None,(superST.name,ferel,subST.name)
+                        ferel.superFE.semType = superST = subST
+                        changed = True
+                        nPropagations += 1
+                except AssertionError as ex:
+                    # bug in the data! ignore
+                    #print(ex, file=sys.stderr)
+                    continue
+            #print(i, nPropagations, file=sys.stderr)
+
+    def semtype(self, key):
+        """
+        >>> from nltk.corpus import framenet as fn
+        >>> fn.semtype(233).name
+        'Temperature'
+        >>> fn.semtype(233).abbrev
+        'Temp'
+        >>> fn.semtype('Temperature').ID
+        233
+
+        :param key: The name, abbreviation, or id number of the semantic type
+        :type key: string or int
+        :return: Information about a semantic type
+        :rtype: dict
+        """
+        if isinstance(key, int):
+            stid = key
+        else:
+            try:
+                stid = self._semtypes[key]
+            except TypeError:
+                self._loadsemtypes()
+                stid = self._semtypes[key]
+
+        try:
+            st = self._semtypes[stid]
+        except TypeError:
+            self._loadsemtypes()
+            st = self._semtypes[stid]
+
+        return st
+
+    def semtype_inherits(self, st, superST):
+        if not isinstance(st, dict):
+            st = self.semtype(st)
+        if not isinstance(superST, dict):
+            superST = self.semtype(superST)
+        par = st.superType
+        while par:
+            if par is superST:
+                return True
+            par = par.superType
+        return False
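+
+    # Example (a sketch; 'Temperature' is taken from the semtype() doctests above):
+    #
+    #   >>> from nltk.corpus import framenet as fn
+    #   >>> st = fn.semtype('Temperature')
+    #   >>> fn.semtype_inherits(st, st.rootType)   # True unless st is itself a root type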
+
+    def frames(self, name=None):
+        """
+        Obtain details for frames, optionally restricted to those whose
+        names match a regular expression pattern.
+
+        >>> from nltk.corpus import framenet as fn
+        >>> len(fn.frames()) in (1019, 1221)    # FN 1.5 and 1.7, resp.
+        True
+        >>> x = PrettyList(fn.frames(r'(?i)crim'), maxReprSize=0, breakLines=True)
+        >>> x.sort(key=lambda f: f.ID)
+        >>> x
+        [<frame ID=200 name=Criminal_process>,
+         <frame ID=500 name=Criminal_investigation>,
+         <frame ID=692 name=Crime_scenario>,
+         <frame ID=700 name=Committing_crime>]
+
+        A brief intro to Frames (excerpted from "FrameNet II: Extended
+        Theory and Practice" by Ruppenhofer et al., 2010):
+
+        A Frame is a script-like conceptual structure that describes a
+        particular type of situation, object, or event along with the
+        participants and props that are needed for that Frame. For
+        example, the "Apply_heat" frame describes a common situation
+        involving a Cook, some Food, and a Heating_Instrument, and is
+        evoked by words such as bake, blanch, boil, broil, brown,
+        simmer, steam, etc.
+
+        We call the roles of a Frame "frame elements" (FEs) and the
+        frame-evoking words are called "lexical units" (LUs).
+
+        FrameNet includes relations between Frames. Several types of
+        relations are defined, of which the most important are:
+
+           - Inheritance: An IS-A relation. The child frame is a subtype
+             of the parent frame, and each FE in the parent is bound to
+             a corresponding FE in the child. An example is the
+             "Revenge" frame which inherits from the
+             "Rewards_and_punishments" frame.
+
+           - Using: The child frame presupposes the parent frame as
+             background, e.g. the "Speed" frame "uses" (or presupposes)
+             the "Motion" frame; however, not all parent FEs need to be
+             bound to child FEs.
+
+           - Subframe: The child frame is a subevent of a complex event
+             represented by the parent, e.g. the "Criminal_process" frame
+             has subframes of "Arrest", "Arraignment", "Trial", and
+             "Sentencing".
+
+           - Perspective_on: The child frame provides a particular
+             perspective on an un-perspectivized parent frame. A pair of
+             examples consists of the "Hiring" and "Get_a_job" frames,
+             which perspectivize the "Employment_start" frame from the
+             Employer's and the Employee's point of view, respectively.
+
+        :param name: A regular expression pattern used to match against
+            Frame names. If 'name' is None, then a list of all
+            Framenet Frames will be returned.
+        :type name: str
+        :return: A list of matching Frames (or all Frames).
+        :rtype: list(AttrDict)
+        """
+        try:
+            fIDs = list(self._frame_idx.keys())
+        except AttributeError:
+            self._buildframeindex()
+            fIDs = list(self._frame_idx.keys())
+
+        if name is not None:
+            return PrettyList(self.frame(fID) for fID,finfo in self.frame_ids_and_names(name).items())
+        else:
+            return PrettyLazyMap(self.frame, fIDs)
+
+    def frame_ids_and_names(self, name=None):
+        """
+        Uses the frame index, which is much faster than looking up each frame definition
+        if only the names and IDs are needed.
+        """
+        if not self._frame_idx:
+            self._buildframeindex()
+        return dict((fID, finfo.name) for fID,finfo in self._frame_idx.items() if name is None or re.search(name, finfo.name) is not None)
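+
+    # Example (a sketch; the pattern mirrors the frames() doctest above):
+    #
+    #   >>> from nltk.corpus import framenet as fn
+    #   >>> fn.frame_ids_and_names(r'(?i)crim')   # e.g. {200: 'Criminal_process', ...}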
+
+    def fes(self, name=None, frame=None):
+        '''
+        Lists frame element objects. If 'name' is provided, this is treated as
+        a case-insensitive regular expression to filter by frame element name.
+        (Case-insensitivity is because casing of frame element names is not always
+        consistent across frames.) Specify 'frame' to filter by a frame name pattern,
+        ID, or object.
+
+        >>> from nltk.corpus import framenet as fn
+        >>> fn.fes('Noise_maker')
+        [<fe ID=6043 name=Noise_maker>]
+        >>> sorted([(fe.frame.name,fe.name) for fe in fn.fes('sound')])
+        [('Cause_to_make_noise', 'Sound_maker'), ('Make_noise', 'Sound'),
+         ('Make_noise', 'Sound_source'), ('Sound_movement', 'Location_of_sound_source'),
+         ('Sound_movement', 'Sound'), ('Sound_movement', 'Sound_source'),
+         ('Sounds', 'Component_sound'), ('Sounds', 'Location_of_sound_source'),
+         ('Sounds', 'Sound_source'), ('Vocalizations', 'Location_of_sound_source'),
+         ('Vocalizations', 'Sound_source')]
+        >>> sorted([(fe.frame.name,fe.name) for fe in fn.fes('sound',r'(?i)make_noise')])
+        [('Cause_to_make_noise', 'Sound_maker'),
+         ('Make_noise', 'Sound'),
+         ('Make_noise', 'Sound_source')]
+        >>> sorted(set(fe.name for fe in fn.fes('^sound')))
+        ['Sound', 'Sound_maker', 'Sound_source']
+        >>> len(fn.fes('^sound$'))
+        2
+
+        :param name: A regular expression pattern used to match against
+            frame element names. If 'name' is None, then a list of all
+            frame elements will be returned.
+        :type name: str
+        :return: A list of matching frame elements
+        :rtype: list(AttrDict)
+        '''
+        # what frames are we searching in?
+        if frame is not None:
+            if isinstance(frame, int):
+                frames = [self.frame(frame)]
+            elif isinstance(frame, string_types):
+                frames = self.frames(frame)
+            else:
+                frames = [frame]
+        else:
+            frames = self.frames()
+
+        return PrettyList(fe for f in frames for fename,fe in f.FE.items() if name is None or re.search(name, fename, re.I))
+
+    def lus(self, name=None, frame=None):
+        """
+        Obtain details for lexical units.
+        Optionally restrict by lexical unit name pattern, and/or to a certain frame
+        or frames whose name matches a pattern.
+
+        >>> from nltk.corpus import framenet as fn
+        >>> len(fn.lus()) in (11829, 13572) # FN 1.5 and 1.7, resp.
+        True
+        >>> PrettyList(fn.lus(r'(?i)a little'), maxReprSize=0, breakLines=True)
+        [<lu ID=14744 name=a little bit.adv>,
+         <lu ID=14733 name=a little.n>,
+         <lu ID=14743 name=a little.adv>]
+        >>> fn.lus(r'interest', r'(?i)stimulus')
+        [<lu ID=14920 name=interesting.a>, <lu ID=14894 name=interested.a>]
+
+        A brief intro to Lexical Units (excerpted from "FrameNet II:
+        Extended Theory and Practice" by Ruppenhofer et al., 2010):
+
+        A lexical unit (LU) is a pairing of a word with a meaning. For
+        example, the "Apply_heat" Frame describes a common situation
+        involving a Cook, some Food, and a Heating Instrument, and is
+        _evoked_ by words such as bake, blanch, boil, broil, brown,
+        simmer, steam, etc. These frame-evoking words are the LUs in the
+        Apply_heat frame. Each sense of a polysemous word is a different
+        LU.
+
+        We have used the word "word" in talking about LUs. The reality
+        is actually rather complex. When we say that the word "bake" is
+        polysemous, we mean that the lemma "bake.v" (which has the
+        word-forms "bake", "bakes", "baked", and "baking") is linked to
+        three different frames:
+
+           - Apply_heat: "Michelle baked the potatoes for 45 minutes."
+
+           - Cooking_creation: "Michelle baked her mother a cake for her birthday."
+
+           - Absorb_heat: "The potatoes have to bake for more than 30 minutes."
+
+        These constitute three different LUs, with different
+        definitions.
+
+        Multiword expressions such as "given name" and hyphenated words
+        like "shut-eye" can also be LUs. Idiomatic phrases such as
+        "middle of nowhere" and "give the slip (to)" are also defined as
+        LUs in the appropriate frames ("Isolated_places" and "Evading",
+        respectively), and their internal structure is not analyzed.
+
+        Framenet provides multiple annotated examples of each sense of a
+        word (i.e. each LU).  Moreover, the set of examples
+        (approximately 20 per LU) illustrates all of the combinatorial
+        possibilities of the lexical unit.
+
+        Each LU is linked to a Frame, and hence to the other words which
+        evoke that Frame. This makes the FrameNet database similar to a
+        thesaurus, grouping together semantically similar words.
+
+        In the simplest case, frame-evoking words are verbs such as
+        "fried" in:
+
+           "Matilde fried the catfish in a heavy iron skillet."
+
+        Sometimes event nouns may evoke a Frame. For example,
+        "reduction" evokes "Cause_change_of_scalar_position" in:
+
+           "...the reduction of debt levels to $665 million from $2.6 billion."
+
+        Adjectives may also evoke a Frame. For example, "asleep" may
+        evoke the "Sleep" frame as in:
+
+           "They were asleep for hours."
+
+        Many common nouns, such as artifacts like "hat" or "tower",
+        typically serve as dependents rather than clearly evoking their
+        own frames.
+
+        :param name: A regular expression pattern used to search the LU
+            names. Note that LU names take the form of a dotted
+            string (e.g. "run.v" or "a little.adv") in which a
+            lemma precedes the "." and a POS follows the
+            dot. The lemma may be composed of a single lexeme
+            (e.g. "run") or of multiple lexemes (e.g. "a
+            little"). If 'name' is not given, then all LUs will
+            be returned.
+
+            The valid POSes are:
+
+                   v    - verb
+                   n    - noun
+                   a    - adjective
+                   adv  - adverb
+                   prep - preposition
+                   num  - numbers
+                   intj - interjection
+                   art  - article
+                   c    - conjunction
+                   scon - subordinating conjunction
+
+        :type name: str
+        :type frame: str or int or frame
+        :return: A list of selected (or all) lexical units
+        :rtype: list of LU objects (dicts). See the lu() function for info
+          about the specifics of LU objects.
+
+        """
+        if not self._lu_idx:
+            self._buildluindex()
+
+        if name is not None:    # match LUs, then restrict by frame
+            result = PrettyList(self.lu(luID) for luID,luName in self.lu_ids_and_names(name).items())
+            if frame is not None:
+                if isinstance(frame, int):
+                    frameIDs = {frame}
+                elif isinstance(frame, string_types):
+                    frameIDs = {f.ID for f in self.frames(frame)}
+                else:
+                    frameIDs = {frame.ID}
+                result = PrettyList(lu for lu in result if lu.frame.ID in frameIDs)
+        elif frame is not None: # all LUs in matching frames
+            if isinstance(frame, int):
+                frames = [self.frame(frame)]
+            elif isinstance(frame, string_types):
+                frames = self.frames(frame)
+            else:
+                frames = [frame]
+            result = PrettyLazyIteratorList(iter(LazyConcatenation(list(f.lexUnit.values()) for f in frames)))
+        else:   # all LUs
+            luIDs = [luID for luID,lu in self._lu_idx.items() if lu.status not in self._bad_statuses]
+            result = PrettyLazyMap(self.lu, luIDs)
+        return result
+
+    def lu_ids_and_names(self, name=None):
+        """
+        Uses the LU index, which is much faster than looking up each LU definition
+        if only the names and IDs are needed.
+        """
+        if not self._lu_idx:
+            self._buildluindex()
+        return {luID: luinfo.name for luID,luinfo in self._lu_idx.items()
+                if luinfo.status not in self._bad_statuses
+                    and (name is None or re.search(name, luinfo.name) is not None)}
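+
+    # Example (a sketch; the pattern mirrors the lus() doctest above):
+    #
+    #   >>> from nltk.corpus import framenet as fn
+    #   >>> fn.lu_ids_and_names(r'(?i)a little')   # e.g. {14743: 'a little.adv', ...}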
+
+    def docs_metadata(self, name=None):
+        """
+        Return an index of the annotated documents in Framenet.
+
+        Details for a specific annotated document can be obtained by calling
+        this class's doc() function and passing it the value of the 'ID' field.
+
+        >>> from nltk.corpus import framenet as fn
+        >>> len(fn.docs()) in (78, 107) # FN 1.5 and 1.7, resp.
+        True
+        >>> set([x.corpname for x in fn.docs_metadata()])>=set(['ANC', 'KBEval', \
+                    'LUCorpus-v0.3', 'Miscellaneous', 'NTI', 'PropBank'])
+        True
+
+        :param name: A regular expression pattern used to search the
+            file name of each annotated document. The document's
+            file name contains the name of the corpus that the
+            document is from, followed by two underscores "__"
+            followed by the document name. So, for example, the
+            file name "LUCorpus-v0.3__20000410_nyt-NEW.xml" is
+            from the corpus named "LUCorpus-v0.3" and the
+            document name is "20000410_nyt-NEW.xml".
+        :type name: str
+        :return: A list of selected (or all) annotated documents
+        :rtype: list of dicts, where each dict object contains the following
+                keys:
+
+                - 'name'
+                - 'ID'
+                - 'corpid'
+                - 'corpname'
+                - 'description'
+                - 'filename'
+        """
+        try:
+            ftlist = PrettyList(self._fulltext_idx.values())
+        except AttributeError:
+            self._buildcorpusindex()
+            ftlist = PrettyList(self._fulltext_idx.values())
+
+        if name is None:
+            return ftlist
+        else:
+            return PrettyList(x for x in ftlist if re.search(name, x['filename']) is not None)
+
+    def docs(self, name=None):
+        """
+        Return a list of the annotated full-text documents in FrameNet,
+        optionally filtered by a regex to be matched against the document name.
+        """
+        return PrettyLazyMap((lambda x: self.doc(x.ID)), self.docs_metadata(name))
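+
+    # Example (a sketch; the 'ANC' corpus prefix and the filename format follow the
+    # docs_metadata() docstring above):
+    #
+    #   >>> from nltk.corpus import framenet as fn
+    #   >>> anc_docs = fn.docs(r'^ANC__')        # full-text documents from the ANC corpus
+    #   >>> next(iter(anc_docs)).sentence[0].text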
+
+    def sents(self, exemplars=True, full_text=True):
+        """
+        Annotated sentences matching the specified criteria.
+        """
+        if exemplars:
+            if full_text:
+                return self.exemplars() + self.ft_sents()
+            else:
+                return self.exemplars()
+        elif full_text:
+            return self.ft_sents()
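+
+    # Example (a sketch): the two annotation sources can also be requested separately.
+    #
+    #   >>> from nltk.corpus import framenet as fn
+    #   >>> lex_only = fn.sents(full_text=False)   # lexicographic exemplars only
+    #   >>> ft_only = fn.sents(exemplars=False)    # full-text sentences only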
+
+    def annotations(self, luNamePattern=None, exemplars=True, full_text=True):
+        """
+        Frame annotation sets matching the specified criteria.
+        """
+
+        if exemplars:
+            epart = PrettyLazyIteratorList(sent.frameAnnotation for sent in self.exemplars(luNamePattern))
+        else:
+            epart = []
+
+        if full_text:
+            if luNamePattern is not None:
+                matchedLUIDs = set(self.lu_ids_and_names(luNamePattern).keys())
+            ftpart = PrettyLazyIteratorList(aset for sent in self.ft_sents() for aset in sent.annotationSet[1:] if luNamePattern is None or aset.get('luID','CXN_ASET') in matchedLUIDs)
+        else:
+            ftpart = []
+
+        if exemplars:
+            if full_text:
+                return epart + ftpart
+            else:
+                return epart
+        elif full_text:
+            return ftpart
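+
+    # Example (a sketch, reusing the LU name from the lu() doctests above):
+    #
+    #   >>> from nltk.corpus import framenet as fn
+    #   >>> asets = fn.annotations(r'(?i)foresee', full_text=False)
+    #   >>> aset = next(iter(asets))   # one frame annotation set for an exemplar sentence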
+
+    def exemplars(self, luNamePattern=None, frame=None, fe=None, fe2=None):
+        """
+        Lexicographic exemplar sentences, optionally filtered by LU name and/or 1-2 FEs that
+        are realized overtly. 'frame' may be a name pattern, frame ID, or frame instance.
+        'fe' may be a name pattern or FE instance; if specified, 'fe2' may also
+        be specified to retrieve sentences with both overt FEs (in either order).
+        """
+        if fe is None and fe2 is not None:
+            raise FramenetError('exemplars(..., fe=None, fe2=<value>) is not allowed')
+        elif fe is not None and fe2 is not None:
+            if not isinstance(fe2, string_types):
+                if isinstance(fe, string_types):
+                    # fe2 is specific to a particular frame. swap fe and fe2 so fe is always used to determine the frame.
+                    fe, fe2 = fe2, fe
+                elif fe.frame is not fe2.frame: # ensure frames match
+                    raise FramenetError('exemplars() call with inconsistent `fe` and `fe2` specification (frames must match)')
+        if frame is None and fe is not None and not isinstance(fe, string_types):
+            frame = fe.frame
+
+        # narrow down to frames matching criteria
+
+        lusByFrame = defaultdict(list)   # frame name -> matching LUs, if luNamePattern is specified
+        if frame is not None or luNamePattern is not None:
+            if frame is None or isinstance(frame, string_types):
+                if luNamePattern is not None:
+                    frames = set()
+                    for lu in self.lus(luNamePattern, frame=frame):
+                        frames.add(lu.frame.ID)
+                        lusByFrame[lu.frame.name].append(lu)
+                    frames = LazyMap(self.frame, list(frames))
+                else:
+                    frames = self.frames(frame)
+            else:
+                if isinstance(frame,int):
+                    frames = [self.frame(frame)]
+                else:   # frame object
+                    frames = [frame]
+
+                if luNamePattern is not None:
+                    lusByFrame = {frame.name: self.lus(luNamePattern, frame=frame)}
+
+            if fe is not None:  # narrow to frames that define this FE
+                if isinstance(fe, string_types):
+                    frames = PrettyLazyIteratorList(f for f in frames if fe in f.FE or any(re.search(fe, ffe, re.I) for ffe in f.FE.keys()))
+                else:
+                    if fe.frame not in frames:
+                        raise FramenetError('exemplars() call with inconsistent `frame` and `fe` specification')
+                    frames = [fe.frame]
+
+                if fe2 is not None: # narrow to frames that ALSO define this FE
+                    if isinstance(fe2, string_types):
+                        frames = PrettyLazyIteratorList(f for f in frames if fe2 in f.FE or any(re.search(fe2, ffe, re.I) for ffe in f.FE.keys()))
+                    # else we already narrowed it to a single frame
+        else:   # frame, luNamePattern are None. fe, fe2 are None or strings
+            if fe is not None:
+                frames = {ffe.frame.ID for ffe in self.fes(fe)}
+                if fe2 is not None:
+                    frames2 = {ffe.frame.ID for ffe in self.fes(fe2)}
+                    frames = frames & frames2
+                frames = LazyMap(self.frame, list(frames))
+            else:
+                frames = self.frames()
+
+        # we've narrowed down 'frames'
+        # now get exemplars for relevant LUs in those frames
+
+        def _matching_exs():
+            for f in frames:
+                fes = fes2 = None   # FEs of interest
+                if fe is not None:
+                    fes = {ffe for ffe in f.FE.keys() if re.search(fe, ffe, re.I)} if isinstance(fe, string_types) else {fe.name}
+                    if fe2 is not None:
+                        fes2 = {ffe for ffe in f.FE.keys() if re.search(fe2, ffe, re.I)} if isinstance(fe2, string_types) else {fe2.name}
+
+                for lu in lusByFrame[f.name] if luNamePattern is not None else f.lexUnit.values():
+                    for ex in lu.exemplars:
+                        if (fes is None or self._exemplar_of_fes(ex, fes)) and (fes2 is None or self._exemplar_of_fes(ex, fes2)):
+                            yield ex
+
+        return PrettyLazyIteratorList(_matching_exs())
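+
+    # Example (a sketch; the frame, LU, and FE names come from the frames()/lus()
+    # docstrings above):
+    #
+    #   >>> from nltk.corpus import framenet as fn
+    #   >>> exs = fn.exemplars(r'(?i)bake', frame='Apply_heat', fe='Food')
+    #   >>> ex = next(iter(exs))   # an exemplar of bake.v with an overt Food FE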
+
+    def _exemplar_of_fes(self, ex, fes=None):
+        """
+        Given an exemplar sentence and a set of FE names, return the subset of FE names
+        that are realized overtly in the sentence on the FE, FE2, or FE3 layer.
+
+        If 'fes' is None, returns all overt FE names.
+        """
+        overtNames = set(list(zip(*ex.FE[0]))[2]) if ex.FE[0] else set()
+        if 'FE2' in ex:
+            overtNames |= set(list(zip(*ex.FE2[0]))[2]) if ex.FE2[0] else set()
+            if 'FE3' in ex:
+                overtNames |= set(list(zip(*ex.FE3[0]))[2]) if ex.FE3[0] else set()
+        return overtNames & fes if fes is not None else overtNames
+
+    def ft_sents(self, docNamePattern=None):
+        """
+        Full-text annotation sentences, optionally filtered by document name.
+        """
+        return PrettyLazyIteratorList(sent for d in self.docs(docNamePattern) for sent in d.sentence)
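+
+    # Example (a sketch; the document-name pattern follows the filename format shown
+    # in the docs_metadata() docstring above):
+    #
+    #   >>> from nltk.corpus import framenet as fn
+    #   >>> sents = fn.ft_sents(r'(?i)nyt')      # sentences from NYT full-text documents
+    #   >>> next(iter(sents)).text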
+
+
+    def frame_relation_types(self):
+        """
+        Obtain a list of frame relation types.
+
+        >>> from nltk.corpus import framenet as fn
+        >>> frts = list(fn.frame_relation_types())
+        >>> isinstance(frts, list)
+        True
+        >>> len(frts) in (9, 10)    # FN 1.5 and 1.7, resp.
+        True
+        >>> PrettyDict(frts[0], breakLines=True)
+        {'ID': 1,
+         '_type': 'framerelationtype',
+         'frameRelations': [<Parent=Event -- Inheritance -> Child=Change_of_consistency>, <Parent=Event -- Inheritance -> Child=Rotting>, ...],
+         'name': 'Inheritance',
+         'subFrameName': 'Child',
+         'superFrameName': 'Parent'}
+
+        :return: A list of all of the frame relation types in framenet
+        :rtype: list(dict)
+        """
+        if not self._freltyp_idx:
+            self._buildrelationindex()
+        return self._freltyp_idx.values()
+
+    def frame_relations(self, frame=None, frame2=None, type=None):
+        """
+        :param frame: (optional) frame object, name, or ID; only relations involving
+            this frame will be returned
+        :param frame2: (optional; 'frame' must be a different frame) only show relations
+            between the two specified frames, in either direction
+        :param type: (optional) frame relation type (name or object); show only relations
+            of this type
+        :type frame: int or str or AttrDict
+        :return: A list of all of the frame relations in framenet
+        :rtype: list(dict)
+
+        >>> from nltk.corpus import framenet as fn
+        >>> frels = fn.frame_relations()
+        >>> isinstance(frels, list)
+        True
+        >>> len(frels) in (1676, 2070)  # FN 1.5 and 1.7, resp.
+        True
+        >>> PrettyList(fn.frame_relations('Cooking_creation'), maxReprSize=0, breakLines=True)
+        [<Parent=Intentionally_create -- Inheritance -> Child=Cooking_creation>,
+         <Parent=Apply_heat -- Using -> Child=Cooking_creation>,
+         <MainEntry=Apply_heat -- See_also -> ReferringEntry=Cooking_creation>]
+        >>> PrettyList(fn.frame_relations(274), breakLines=True)
+        [<Parent=Avoiding -- Inheritance -> Child=Dodging>,
+         <Parent=Avoiding -- Inheritance -> Child=Evading>, ...]
+        >>> PrettyList(fn.frame_relations(fn.frame('Cooking_creation')), breakLines=True)
+        [<Parent=Intentionally_create -- Inheritance -> Child=Cooking_creation>,
+         <Parent=Apply_heat -- Using -> Child=Cooking_creation>, ...]
+        >>> PrettyList(fn.frame_relations('Cooking_creation', type='Inheritance'))
+        [<Parent=Intentionally_create -- Inheritance -> Child=Cooking_creation>]
+        >>> PrettyList(fn.frame_relations('Cooking_creation', 'Apply_heat'), breakLines=True)
+        [<Parent=Apply_heat -- Using -> Child=Cooking_creation>,
+        <MainEntry=Apply_heat -- See_also -> ReferringEntry=Cooking_creation>]
+        """
+        relation_type = type
+
+        if not self._frel_idx:
+            self._buildrelationindex()
+
+        rels = None
+
+        if relation_type is not None:
+            if not isinstance(relation_type, dict):
+                type = [rt for rt in self.frame_relation_types() if rt.name==type][0]
+                assert isinstance(type,dict)
+
+        # lookup by 'frame'
+        if frame is not None:
+            if isinstance(frame,dict) and 'frameRelations' in frame:
+                rels = PrettyList(frame.frameRelations)
+            else:
+                if not isinstance(frame, int):
+                    if isinstance(frame, dict):
+                        frame = frame.ID
+                    else:
+                        frame = self.frame_by_name(frame).ID
+                rels = [self._frel_idx[frelID] for frelID in self._frel_f_idx[frame]]
+
+            # filter by 'type'
+            if type is not None:
+                rels = [rel for rel in rels if rel.type is type]
+        elif type is not None:
+            # lookup by 'type'
+            rels = type.frameRelations
+        else:
+            rels = self._frel_idx.values()
+
+        # filter by 'frame2'
+        if frame2 is not None:
+            if frame is None:
+                raise FramenetError("frame_relations(frame=None, frame2=<value>) is not allowed")
+            if not isinstance(frame2, int):
+                if isinstance(frame2, dict):
+                    frame2 = frame2.ID
+                else:
+                    frame2 = self.frame_by_name(frame2).ID
+            if frame==frame2:
+                raise FramenetError("The two frame arguments to frame_relations() must be different frames")
+            rels = [rel for rel in rels if rel.superFrame.ID==frame2 or rel.subFrame.ID==frame2]
+
+        return PrettyList(sorted(rels,
+                key=lambda frel: (frel.type.ID, frel.superFrameName, frel.subFrameName)))
+
+    def fe_relations(self):
+        """
+        Obtain a list of frame element relations.
+
+        >>> from nltk.corpus import framenet as fn
+        >>> ferels = fn.fe_relations()
+        >>> isinstance(ferels, list)
+        True
+        >>> len(ferels) in (10020, 12393)   # FN 1.5 and 1.7, resp.
+        True
+        >>> PrettyDict(ferels[0], breakLines=True)
+        {'ID': 14642,
+        '_type': 'ferelation',
+        'frameRelation': <Parent=Abounding_with -- Inheritance -> Child=Lively_place>,
+        'subFE': <fe ID=11370 name=Degree>,
+        'subFEName': 'Degree',
+        'subFrame': <frame ID=1904 name=Lively_place>,
+        'subID': 11370,
+        'supID': 2271,
+        'superFE': <fe ID=2271 name=Degree>,
+        'superFEName': 'Degree',
+        'superFrame': <frame ID=262 name=Abounding_with>,
+        'type': <framerelationtype ID=1 name=Inheritance>}
+
+        :return: A list of all of the frame element relations in framenet
+        :rtype: list(dict)
+        """
+        if not self._ferel_idx:
+            self._buildrelationindex()
+        return PrettyList(sorted(self._ferel_idx.values(),
+                key=lambda ferel: (ferel.type.ID, ferel.frameRelation.superFrameName,
+                    ferel.superFEName, ferel.frameRelation.subFrameName, ferel.subFEName)))
+
+    def semtypes(self):
+        """
+        Obtain a list of semantic types.
+
+        >>> from nltk.corpus import framenet as fn
+        >>> stypes = fn.semtypes()
+        >>> len(stypes) in (73, 109) # FN 1.5 and 1.7, resp.
+        True
+        >>> sorted(stypes[0].keys())
+        ['ID', '_type', 'abbrev', 'definition', 'definitionMarkup', 'name', 'rootType', 'subTypes', 'superType']
+
+        :return: A list of all of the semantic types in framenet
+        :rtype: list(dict)
+        """
+        if not self._semtypes:
+            self._loadsemtypes()
+        return PrettyList(self._semtypes[i] for i in self._semtypes if isinstance(i, int))
+
+    def _load_xml_attributes(self, d, elt):
+        """
+        Extracts a subset of the attributes from the given element and
+        returns them in a dictionary.
+
+        :param d: A dictionary in which to store the attributes.
+        :type d: dict
+        :param elt: An ElementTree Element
+        :type elt: Element
+        :return: Returns the input dict ``d`` possibly including attributes from ``elt``
+        :rtype: dict
+        """
+
+        d = type(d)(d)
+
+        try:
+            attr_dict = elt.attrib
+        except AttributeError:
+            return d
+
+        if attr_dict is None:
+            return d
+
+        # Ignore these attributes when loading attributes from an xml node
+        ignore_attrs = [ #'cBy', 'cDate', 'mDate', # <-- annotation metadata that could be of interest
+                        'xsi', 'schemaLocation', 'xmlns', 'bgColor', 'fgColor']
+
+        for attr in attr_dict:
+
+            if any(attr.endswith(x) for x in ignore_attrs):
+                continue
+
+            val = attr_dict[attr]
+            if val.isdigit():
+                d[attr] = int(val)
+            else:
+                d[attr] = val
+
+        return d
+
+    def _strip_tags(self, data):
+        """
+        Gets rid of all tags and newline characters from the given input
+
+        :return: A cleaned-up version of the input string
+        :rtype: str
+        """
+
+        try:
+            '''
+            # Look for boundary issues in markup. (Sometimes FEs are pluralized in definitions.)
+            m = re.search(r'\w[<][^/]|[<][/][^>]+[>](s\w|[a-rt-z0-9])', data)
+            if m:
+                print('Markup boundary:', data[max(0,m.start(0)-10):m.end(0)+10].replace('\n',' '), file=sys.stderr)
+            '''
+
+            data = data.replace('<t>', '')
+            data = data.replace('</t>', '')
+            data = re.sub('<fex name="[^"]+">', '', data)
+            data = data.replace('</fex>', '')
+            data = data.replace('<fen>', '')
+            data = data.replace('</fen>', '')
+            data = data.replace('<m>', '')
+            data = data.replace('</m>', '')
+            data = data.replace('<ment>', '')
+            data = data.replace('</ment>', '')
+            data = data.replace('<ex>', "'")
+            data = data.replace('</ex>', "'")
+            data = data.replace('<gov>', '')
+            data = data.replace('</gov>', '')
+            data = data.replace('<x>', '')
+            data = data.replace('</x>', '')
+
+            # Get rid of <def-root> and </def-root> tags
+            data = data.replace('<def-root>', '')
+            data = data.replace('</def-root>', '')
+
+            data = data.replace('\n', ' ')
+        except AttributeError:
+            pass
+
+        return data
+
+    def _handle_elt(self, elt, tagspec=None):
+        """Extracts and returns the attributes of the given element"""
+        return self._load_xml_attributes(AttrDict(), elt)
+
+    def _handle_fulltextindex_elt(self, elt, tagspec=None):
+        """
+        Extracts corpus/document info from the fulltextIndex.xml file.
+
+        Note that this function "flattens" the information contained
+        in each of the "corpus" elements, so that each "document"
+        element will contain attributes for the corpus name
+        (``corpname``) and the corpus ID (``corpid``). Also, each of
+        the "document" items will contain a new attribute called
+        "filename" that is the base file name of the xml file for the
+        document in the "fulltext" subdir of the Framenet corpus.
+        """
+        ftinfo = self._load_xml_attributes(AttrDict(), elt)
+        corpname = ftinfo.name
+        corpid = ftinfo.ID
+        retlist = []
+        for sub in elt:
+            if sub.tag.endswith('document'):
+                doc = self._load_xml_attributes(AttrDict(), sub)
+                if 'name' in doc:
+                    docname = doc.name
+                else:
+                    docname = doc.description
+                doc.filename = "{0}__{1}.xml".format(corpname, docname)
+                doc.URL = self._fnweb_url + '/' + self._fulltext_dir + '/' + doc.filename
+                doc.corpname = corpname
+                doc.corpid = corpid
+                retlist.append(doc)
+
+        return retlist
+
+    def _handle_frame_elt(self, elt, ignorekeys=[]):
+        """Load the info for a Frame from a frame xml file"""
+        frinfo = self._load_xml_attributes(AttrDict(), elt)
+
+        frinfo['_type'] = 'frame'
+        frinfo['definition'] = ""
+        frinfo['definitionMarkup'] = ""
+        frinfo['FE'] = PrettyDict()
+        frinfo['FEcoreSets'] = []
+        frinfo['lexUnit'] = PrettyDict()
+        frinfo['semTypes'] = []
+        for k in ignorekeys:
+            if k in frinfo:
+                del frinfo[k]
+
+        for sub in elt:
+            if sub.tag.endswith('definition') and 'definition' not in ignorekeys:
+                frinfo['definitionMarkup'] = sub.text
+                frinfo['definition'] = self._strip_tags(sub.text)
+            elif sub.tag.endswith('FE') and 'FE' not in ignorekeys:
+                feinfo = self._handle_fe_elt(sub)
+                frinfo['FE'][feinfo.name] = feinfo
+                feinfo['frame'] = frinfo    # backpointer
+            elif sub.tag.endswith('FEcoreSet') and 'FEcoreSet' not in ignorekeys:
+                coreset = self._handle_fecoreset_elt(sub)
+                # assumes all FEs have been loaded before coresets
+                frinfo['FEcoreSets'].append(PrettyList(frinfo['FE'][fe.name] for fe in coreset))
+            elif sub.tag.endswith('lexUnit') and 'lexUnit' not in ignorekeys:
+                luentry = self._handle_framelexunit_elt(sub)
+                if luentry['status'] in self._bad_statuses:
+                    # problematic LU entry; ignore it
+                    continue
+                luentry['frame'] = frinfo
+                luentry['URL'] = self._fnweb_url + '/' + self._lu_dir + '/' + "lu{0}.xml".format(luentry['ID'])
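+                # The (lambda lu: lambda: ...) wrapper below binds the current
+                # luentry in the closure, so each Future lazily loads the
+                # correct LU file on first access.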
+                luentry['subCorpus'] = Future((lambda lu: lambda: self._lu_file(lu).subCorpus)(luentry))
+                luentry['exemplars'] = Future((lambda lu: lambda: self._lu_file(lu).exemplars)(luentry))
+                frinfo['lexUnit'][luentry.name] = luentry
+                if not self._lu_idx:
+                    self._buildluindex()
+                self._lu_idx[luentry.ID] = luentry
+            elif sub.tag.endswith('semType') and 'semTypes' not in ignorekeys:
+                semtypeinfo = self._load_xml_attributes(AttrDict(), sub)
+                frinfo['semTypes'].append(self.semtype(semtypeinfo.ID))
+
+        frinfo['frameRelations'] = self.frame_relations(frame=frinfo)
+
+        # resolve 'requires' and 'excludes' links between FEs of this frame
+        for fe in frinfo.FE.values():
+            if fe.requiresFE:
+                name, ID = fe.requiresFE.name, fe.requiresFE.ID
+                fe.requiresFE = frinfo.FE[name]
+                assert fe.requiresFE.ID==ID
+            if fe.excludesFE:
+                name, ID = fe.excludesFE.name, fe.excludesFE.ID
+                fe.excludesFE = frinfo.FE[name]
+                assert fe.excludesFE.ID==ID
+
+        return frinfo
+
+    def _handle_fecoreset_elt(self, elt):
+        """Load fe coreset info from xml."""
+        info = self._load_xml_attributes(AttrDict(), elt)
+        tmp = []
+        for sub in elt:
+            tmp.append(self._load_xml_attributes(AttrDict(), sub))
+
+        return tmp
+
+    def _handle_framerelationtype_elt(self, elt, *args):
+        """Load frame-relation element and its child fe-relation elements from frRelation.xml."""
+        info = self._load_xml_attributes(AttrDict(), elt)
+        info['_type'] = 'framerelationtype'
+        info['frameRelations'] = PrettyList()
+
+        for sub in elt:
+            if sub.tag.endswith('frameRelation'):
+                frel = self._handle_framerelation_elt(sub)
+                frel['type'] = info   # backpointer
+                for ferel in frel.feRelations:
+                    ferel['type'] = info
+                info['frameRelations'].append(frel)
+
+        return info
+
+    def _handle_framerelation_elt(self, elt):
+        """Load frame-relation element and its child fe-relation elements from frRelation.xml."""
+        info = self._load_xml_attributes(AttrDict(), elt)
+        assert info['superFrameName']!=info['subFrameName'],(elt,info)
+        info['_type'] = 'framerelation'
+        info['feRelations'] = PrettyList()
+
+        for sub in elt:
+            if sub.tag.endswith('FERelation'):
+                ferel = self._handle_elt(sub)
+                ferel['_type'] = 'ferelation'
+                ferel['frameRelation'] = info   # backpointer
+                info['feRelations'].append(ferel)
+
+        return info
+
+    def _handle_fulltextannotation_elt(self, elt):
+        """Load full annotation info for a document from its xml
+        file. The main element (fullTextAnnotation) contains a 'header'
+        element (which we ignore here) and a bunch of 'sentence'
+        elements."""
+        info = AttrDict()
+        info['_type'] = 'fulltext_annotation'
+        info['sentence'] = []
+
+        for sub in elt:
+            if sub.tag.endswith('header'):
+                continue  # not used
+            elif sub.tag.endswith('sentence'):
+                s = self._handle_fulltext_sentence_elt(sub)
+                s.doc = info
+                info['sentence'].append(s)
+
+        return info
+
+    def _handle_fulltext_sentence_elt(self, elt):
+        """Load information from the given 'sentence' element. Each
+        'sentence' element contains a "text" and "annotationSet" sub
+        elements."""
+        info = self._load_xml_attributes(AttrDict(), elt)
+        info['_type'] = "fulltext_sentence"
+        info['annotationSet'] = []
+        info['targets'] = []
+        target_spans = set()
+        info['_ascii'] = types.MethodType(_annotation_ascii, info)  # attach a method for this instance
+        info['text'] = ""
+
+        for sub in elt:
+            if sub.tag.endswith('text'):
+                info['text'] = self._strip_tags(sub.text)
+            elif sub.tag.endswith('annotationSet'):
+                a = self._handle_fulltextannotationset_elt(sub, is_pos=(len(info['annotationSet'])==0))
+                if 'cxnID' in a: # ignoring construction annotations for now
+                    continue
+                a.sent = info
+                a.text = info.text
+                info['annotationSet'].append(a)
+                if 'Target' in a:
+                    for tspan in a.Target:
+                        if tspan in target_spans:
+                            self._warn('Duplicate target span "{0}"'.format(info.text[slice(*tspan)]),
+                                tspan, 'in sentence',info['ID'], info.text)
+                            # this can happen in cases like "chemical and biological weapons"
+                            # being annotated as "chemical weapons" and "biological weapons"
+                        else:
+                            target_spans.add(tspan)
+                    info['targets'].append((a.Target, a.luName, a.frameName))
+
+        assert info['annotationSet'][0].status=='UNANN'
+        info['POS'] = info['annotationSet'][0].POS
+        info['POS_tagset'] = info['annotationSet'][0].POS_tagset
+        return info
+
+    def _handle_fulltextannotationset_elt(self, elt, is_pos=False):
+        """Load information from the given 'annotationSet' element. Each
+        'annotationSet' contains several "layer" elements."""
+
+        info = self._handle_luannotationset_elt(elt, is_pos=is_pos)
+        if not is_pos:
+            info['_type'] = 'fulltext_annotationset'
+            if 'cxnID' not in info: # ignoring construction annotations for now
+                info['LU'] = self.lu(info.luID, luName=info.luName, frameID=info.frameID, frameName=info.frameName)
+                info['frame'] = info.LU.frame
+        return info
+
+    def _handle_fulltextlayer_elt(self, elt):
+        """Load information from the given 'layer' element. Each
+        'layer' contains several "label" elements."""
+        info = self._load_xml_attributes(AttrDict(), elt)
+        info['_type'] = 'layer'
+        info['label'] = []
+
+        for sub in elt:
+            if sub.tag.endswith('label'):
+                l = self._load_xml_attributes(AttrDict(), sub)
+                info['label'].append(l)
+
+        return info
+
+    def _handle_framelexunit_elt(self, elt):
+        """Load the lexical unit info from an xml element in a frame's xml file."""
+        luinfo = AttrDict()
+        luinfo['_type'] = 'lu'
+        luinfo = self._load_xml_attributes(luinfo, elt)
+        luinfo["definition"] = ""
+        luinfo["definitionMarkup"] = ""
+        luinfo["sentenceCount"] = PrettyDict()
+        luinfo['lexemes'] = PrettyList()   # multiword LUs have multiple lexemes
+        luinfo['semTypes'] = PrettyList()  # an LU can have multiple semtypes
+
+        for sub in elt:
+            if sub.tag.endswith('definition'):
+                luinfo['definitionMarkup'] = sub.text
+                luinfo['definition'] = self._strip_tags(sub.text)
+            elif sub.tag.endswith('sentenceCount'):
+                luinfo['sentenceCount'] = self._load_xml_attributes(
+                    PrettyDict(), sub)
+            elif sub.tag.endswith('lexeme'):
+                lexemeinfo = self._load_xml_attributes(PrettyDict(), sub)
+                if not isinstance(lexemeinfo.name, string_types):
+                    # some lexeme names are ints by default: e.g.,
+                    # thousand.num has lexeme with name="1000"
+                    lexemeinfo.name = str(lexemeinfo.name)
+                luinfo['lexemes'].append(lexemeinfo)
+            elif sub.tag.endswith('semType'):
+                semtypeinfo = self._load_xml_attributes(PrettyDict(), sub)
+                luinfo['semTypes'].append(self.semtype(semtypeinfo.ID))
+
+        # sort lexemes by 'order' attribute
+        # otherwise, e.g., 'write down.v' may have lexemes in wrong order
+        luinfo['lexemes'].sort(key=lambda x: x.order)
+
+        return luinfo
+
+    def _handle_lexunit_elt(self, elt, ignorekeys):
+        """
+        Load full info for a lexical unit from its xml file.
+        This should only be called when accessing corpus annotations
+        (which are not included in frame files).
+        """
+        luinfo = self._load_xml_attributes(AttrDict(), elt)
+        luinfo['_type'] = 'lu'
+        luinfo['definition'] = ""
+        luinfo['definitionMarkup'] = ""
+        luinfo['subCorpus'] = PrettyList()
+        luinfo['lexemes'] = PrettyList()   # multiword LUs have multiple lexemes
+        luinfo['semTypes'] = PrettyList()  # an LU can have multiple semtypes
+        for k in ignorekeys:
+            if k in luinfo:
+                del luinfo[k]
+
+        for sub in elt:
+            if sub.tag.endswith('header'):
+                continue  # not used
+            elif sub.tag.endswith('valences'):
+                continue  # not used
+            elif sub.tag.endswith('definition') and 'definition' not in ignorekeys:
+                luinfo['definitionMarkup'] = sub.text
+                luinfo['definition'] = self._strip_tags(sub.text)
+            elif sub.tag.endswith('subCorpus') and 'subCorpus' not in ignorekeys:
+                sc = self._handle_lusubcorpus_elt(sub)
+                if sc is not None:
+                    luinfo['subCorpus'].append(sc)
+            elif sub.tag.endswith('lexeme') and 'lexeme' not in ignorekeys:
+                luinfo['lexemes'].append(self._load_xml_attributes(PrettyDict(), sub))
+            elif sub.tag.endswith('semType') and 'semType' not in ignorekeys:
+                semtypeinfo = self._load_xml_attributes(AttrDict(), sub)
+                luinfo['semTypes'].append(self.semtype(semtypeinfo.ID))
+
+        return luinfo
+
+    def _handle_lusubcorpus_elt(self, elt):
+        """Load a subcorpus of a lexical unit from the given xml."""
+        sc = AttrDict()
+        try:
+            sc['name'] = elt.get('name')
+        except AttributeError:
+            return None
+        sc['_type'] = "lusubcorpus"
+        sc['sentence'] = []
+
+        for sub in elt:
+            if sub.tag.endswith('sentence'):
+                s = self._handle_lusentence_elt(sub)
+                if s is not None:
+                    sc['sentence'].append(s)
+
+        return sc
+
+    def _handle_lusentence_elt(self, elt):
+        """Load a sentence from a subcorpus of an LU from xml."""
+        info = self._load_xml_attributes(AttrDict(), elt)
+        info['_type'] = 'lusentence'
+        info['annotationSet'] = []
+        info['_ascii'] = types.MethodType(_annotation_ascii, info)  # attach a method for this instance
+        for sub in elt:
+            if sub.tag.endswith('text'):
+                info['text'] = self._strip_tags(sub.text)
+            elif sub.tag.endswith('annotationSet'):
+                annset = self._handle_luannotationset_elt(sub, is_pos=(len(info['annotationSet'])==0))
+                if annset is not None:
+                    assert annset.status=='UNANN' or 'FE' in annset,annset
+                    if annset.status!='UNANN':
+                        info['frameAnnotation'] = annset
+                    # copy layer info up to current level
+                    for k in ('Target', 'FE', 'FE2', 'FE3', 'GF', 'PT', 'POS', 'POS_tagset',
+                              'Other', 'Sent', 'Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art'):
+                        if k in annset:
+                            info[k] = annset[k]
+                    info['annotationSet'].append(annset)
+                    annset['sent'] = info
+                    annset['text'] = info.text
+        return info
+
+    def _handle_luannotationset_elt(self, elt, is_pos=False):
+        """Load an annotation set from a sentence in an subcorpus of an LU"""
+        info = self._load_xml_attributes(AttrDict(), elt)
+        info['_type'] = 'posannotationset' if is_pos else 'luannotationset'
+        info['layer'] = []
+        info['_ascii'] = types.MethodType(_annotation_ascii, info)  # attach a method for this instance
+
+        if 'cxnID' in info: # ignoring construction annotations for now.
+            return info
+
+        for sub in elt:
+            if sub.tag.endswith('layer'):
+                l = self._handle_lulayer_elt(sub)
+                if l is not None:
+                    overt = []
+                    ni = {} # null instantiations
+
+                    info['layer'].append(l)
+                    for lbl in l.label:
+                        if 'start' in lbl:
+                            thespan = (lbl.start,lbl.end+1,lbl.name)
+                            if l.name not in ('Sent','Other'):  # 'Sent' and 'Other' layers sometimes contain accidental duplicate spans
+                                assert thespan not in overt,(info.ID,l.name,thespan)
+                            overt.append(thespan)
+                        else: # null instantiation
+                            if lbl.name in ni:
+                                self._warn('FE with multiple NI entries:', lbl.name, ni[lbl.name], lbl.itype)
+                            else:
+                                ni[lbl.name] = lbl.itype
+                    overt = sorted(overt)
+
+                    if l.name=='Target':
+                        if not overt:
+                            self._warn('Skipping empty Target layer in annotation set ID={0}'.format(info.ID))
+                            continue
+                        assert all(lblname=='Target' for i,j,lblname in overt)
+                        if 'Target' in info:
+                            self._warn('Annotation set {0} has multiple Target layers'.format(info.ID))
+                        else:
+                            info['Target'] = [(i,j) for (i,j,_) in overt]
+                    elif l.name=='FE':
+                        if l.rank==1:
+                            assert 'FE' not in info
+                            info['FE'] = (overt, ni)
+                            #assert False,info
+                        else:
+                            # sometimes there are 3 FE layers! e.g. Change_position_on_a_scale.fall.v
+                            assert 2<=l.rank<=3,l.rank
+                            k = 'FE'+str(l.rank)
+                            assert k not in info
+                            info[k] = (overt, ni)
+                    elif l.name in ('GF', 'PT'):
+                        assert l.rank==1
+                        info[l.name] = overt
+                    elif l.name in ('BNC', 'PENN'):
+                        assert l.rank==1
+                        info['POS'] = overt
+                        info['POS_tagset'] = l.name
+                    else:
+                        if is_pos:
+                            if l.name not in ('NER', 'WSL'):
+                                self._warn('Unexpected layer in sentence annotationset:', l.name)
+                        else:
+                            if l.name not in ('Sent', 'Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art', 'Other'):
+                                self._warn('Unexpected layer in frame annotationset:', l.name)
+                        info[l.name] = overt
+        if not is_pos and 'cxnID' not in info:
+            if 'Target' not in info:
+                self._warn('Missing target in annotation set ID={0}'.format(info.ID))
+            assert 'FE' in info
+            if 'FE3' in info:
+                assert 'FE2' in info
+
+        return info
+
+    def _handle_lulayer_elt(self, elt):
+        """Load a layer from an annotation set"""
+        layer = self._load_xml_attributes(AttrDict(), elt)
+        layer['_type'] = 'lulayer'
+        layer['label'] = []
+
+        for sub in elt:
+            if sub.tag.endswith('label'):
+                l = self._load_xml_attributes(AttrDict(), sub)
+                if l is not None:
+                    layer['label'].append(l)
+        return layer
+
+    def _handle_fe_elt(self, elt):
+        feinfo = self._load_xml_attributes(AttrDict(), elt)
+        feinfo['_type'] = 'fe'
+        feinfo['definition'] = ""
+        feinfo['definitionMarkup'] = ""
+        feinfo['semType'] = None
+        feinfo['requiresFE'] = None
+        feinfo['excludesFE'] = None
+        for sub in elt:
+            if sub.tag.endswith('definition'):
+                feinfo['definitionMarkup'] = sub.text
+                feinfo['definition'] = self._strip_tags(sub.text)
+            elif sub.tag.endswith('semType'):
+                stinfo = self._load_xml_attributes(AttrDict(), sub)
+                feinfo['semType'] = self.semtype(stinfo.ID)
+            elif sub.tag.endswith('requiresFE'):
+                feinfo['requiresFE'] = self._load_xml_attributes(AttrDict(), sub)
+            elif sub.tag.endswith('excludesFE'):
+                feinfo['excludesFE'] = self._load_xml_attributes(AttrDict(), sub)
+
+        return feinfo
+
+    def _handle_semtype_elt(self, elt, tagspec=None):
+        semt = self._load_xml_attributes(AttrDict(), elt)
+        semt['_type'] = 'semtype'
+        semt['superType'] = None
+        semt['subTypes'] = PrettyList()
+        for sub in elt:
+            if sub.text is not None:
+                semt['definitionMarkup'] = sub.text
+                semt['definition'] = self._strip_tags(sub.text)
+            else:
+                supertypeinfo = self._load_xml_attributes(AttrDict(), sub)
+                semt['superType'] = supertypeinfo
+                # the supertype may not have been loaded yet
+
+        return semt
+
+
+#
+# Demo
+#
+def demo():
+    from nltk.corpus import framenet as fn
+
+    #
+    # It is not necessary to explicitly build the indexes by calling
+    # buildindexes(). We do this here just for demo purposes. If the
+    # indexes are not built explicitly, they will be built as needed.
+    #
+    print('Building the indexes...')
+    fn.buildindexes()
+
+    #
+    # Get some statistics about the corpus
+    #
+    print('Number of Frames:', len(fn.frames()))
+    print('Number of Lexical Units:', len(fn.lus()))
+    print('Number of annotated documents:', len(fn.docs()))
+    print()
+
+    #
+    # Frames
+    #
+    print('getting frames whose name matches the (case insensitive) regex: "(?i)medical"')
+    medframes = fn.frames(r'(?i)medical')
+    print(
+        'Found {0} Frames whose name matches "(?i)medical":'.format(len(medframes)))
+    print([(f.name, f.ID) for f in medframes])
+
+    #
+    # store the first frame in the list of frames
+    #
+    tmp_id = medframes[0].ID
+    m_frame = fn.frame(tmp_id)  # reads all info for the frame
+
+    #
+    # get the frame relations
+    #
+    print(
+        '\nNumber of frame relations for the "{0}" ({1}) frame:'.format(m_frame.name,
+                                                                        m_frame.ID),
+        len(m_frame.frameRelations))
+    for fr in m_frame.frameRelations:
+        print('   ', fr)
+
+    #
+    # get the names of the Frame Elements
+    #
+    print(
+        '\nNumber of Frame Elements in the "{0}" frame:'.format(m_frame.name),
+        len(m_frame.FE))
+    print('   ', [x for x in m_frame.FE])
+
+    #
+    # get the names of the "Core" Frame Elements
+    #
+    print(
+        '\nThe "core" Frame Elements in the "{0}" frame:'.format(m_frame.name))
+    print('   ', [x.name for x in m_frame.FE.values() if x.coreType == "Core"])
+
+    #
+    # get all of the Lexical Units that are incorporated in the
+    # 'Ailment' FE of the 'Medical_conditions' frame (id=239)
+    #
+    print('\nAll Lexical Units that are incorporated in the "Ailment" FE:')
+    m_frame = fn.frame(239)
+    ailment_lus = [x for x in m_frame.lexUnit.values() if 'incorporatedFE' in x and x.incorporatedFE == 'Ailment']
+    print('   ', [x.name for x in ailment_lus])
+
+    #
+    # get all of the Lexical Units for the frame
+    #
+    print('\nNumber of Lexical Units in the "{0}" frame:'.format(m_frame.name),
+          len(m_frame.lexUnit))
+    print('  ', [x.name for x in m_frame.lexUnit.values()][:5], '...')
+
+    #
+    # get basic info on the second LU in the frame
+    #
+    tmp_id = m_frame.lexUnit['ailment.n'].ID  # grab the id of the specified LU
+    luinfo = fn.lu_basic(tmp_id)  # get basic info on the LU
+    print('\nInformation on the LU: {0}'.format(luinfo.name))
+    pprint(luinfo)
+
+    #
+    # Get a list of all of the corpora used for fulltext annotation
+    #
+    print('\nNames of all of the corpora used for fulltext annotation:')
+    allcorpora = set([x.corpname for x in fn.docs_metadata()])
+    pprint(list(allcorpora))
+
+    #
+    # Get the names of the annotated documents in the first corpus
+    #
+    firstcorp = list(allcorpora)[0]
+    firstcorp_docs = fn.docs(firstcorp)
+    print(
+        '\nNames of the annotated documents in the "{0}" corpus:'.format(firstcorp))
+    pprint([x.filename for x in firstcorp_docs])
+
+    #
+    # Search for frames containing LUs whose name attribute matches a
+    # regexp pattern.
+    #
+    # Note: if you were going to be doing a lot of this type of
+    #       searching, you'd want to build an index that maps from
+    #       lemmas to frames because each time frames_by_lemma() is
+    #       called, it has to search through ALL of the frame XML files
+    #       in the db.
+    print('\nSearching for all Frames that have a lemma that matches the regexp: "^run.v$":')
+    pprint(fn.frames_by_lemma(r'^run.v$'))
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/corpus/reader/framenet.pyc b/nlp_resource_data/nltk/corpus/reader/framenet.pyc
new file mode 100755 (executable)
index 0000000..3b56c32
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/framenet.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/ieer.py b/nlp_resource_data/nltk/corpus/reader/ieer.py
new file mode 100755 (executable)
index 0000000..91b9425
--- /dev/null
@@ -0,0 +1,112 @@
+# Natural Language Toolkit: IEER Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Corpus reader for the Information Extraction and Entity Recognition Corpus.
+
+NIST 1999 Information Extraction: Entity Recognition Evaluation
+http://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm
+
+This corpus contains the NEWSWIRE development test data for the
+NIST 1999 IE-ER Evaluation.  The files were taken from the
+subdirectory: /ie_er_99/english/devtest/newswire/*.ref.nwt
+and filenames were shortened.
+
+The corpus contains the following files: APW_19980314, APW_19980424,
+APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.
+"""
+from __future__ import unicode_literals
+
+from six import string_types
+
+import nltk
+from nltk import compat
+from nltk.corpus.reader.api import *
+
+#: A dictionary whose keys are the names of documents in this corpus;
+#: and whose values are descriptions of those documents' contents.
+titles = {
+    'APW_19980314': 'Associated Press Weekly, 14 March 1998',
+    'APW_19980424': 'Associated Press Weekly, 24 April 1998',
+    'APW_19980429': 'Associated Press Weekly, 29 April 1998',
+    'NYT_19980315': 'New York Times, 15 March 1998',
+    'NYT_19980403': 'New York Times, 3 April 1998',
+    'NYT_19980407': 'New York Times, 7 April 1998',
+    }
+
+#: A list of all documents in this corpus.
+documents = sorted(titles)
+
+@compat.python_2_unicode_compatible
+class IEERDocument(object):
+    def __init__(self, text, docno=None, doctype=None,
+                 date_time=None, headline=''):
+        self.text = text
+        self.docno = docno
+        self.doctype = doctype
+        self.date_time = date_time
+        self.headline = headline
+
+    def __repr__(self):
+        if self.headline:
+            headline = ' '.join(self.headline.leaves())
+        else:
+            headline = ' '.join([w for w in self.text.leaves()
+                                 if w[:1] != '<'][:12])+'...'
+        if self.docno is not None:
+            return '<IEERDocument %s: %r>' % (self.docno, headline)
+        else:
+            return '<IEERDocument: %r>' % headline
+
+class IEERCorpusReader(CorpusReader):
+    """
+    """
+    def raw(self, fileids=None):
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def docs(self, fileids=None):
+        return concat([StreamBackedCorpusView(fileid, self._read_block,
+                                              encoding=enc)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def parsed_docs(self, fileids=None):
+        return concat([StreamBackedCorpusView(fileid,
+                                              self._read_parsed_block,
+                                              encoding=enc)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def _read_parsed_block(self,stream):
+        # TODO: figure out why empty documents are being returned
+        return [self._parse(doc) for doc in self._read_block(stream)
+                if self._parse(doc).docno is not None]
+
+    def _parse(self, doc):
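+        # ieerstr2tree returns a dict of document fields (text, docno,
+        # headline, ...) when the document headers can be parsed, and a bare
+        # chunk tree otherwise; both cases are wrapped in an IEERDocument.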
+        val = nltk.chunk.ieerstr2tree(doc, root_label="DOCUMENT")
+        if isinstance(val, dict):
+            return IEERDocument(**val)
+        else:
+            return IEERDocument(val)
+
+    def _read_block(self, stream):
+        out = []
+        # Skip any preamble.
+        while True:
+            line = stream.readline()
+            if not line: break
+            if line.strip() == '<DOC>': break
+        out.append(line)
+        # Read the document
+        while True:
+            line = stream.readline()
+            if not line: break
+            out.append(line)
+            if line.strip() == '</DOC>': break
+        # Return the document
+        return ['\n'.join(out)]
diff --git a/nlp_resource_data/nltk/corpus/reader/ieer.pyc b/nlp_resource_data/nltk/corpus/reader/ieer.pyc
new file mode 100755 (executable)
index 0000000..9eed214
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/ieer.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/indian.py b/nlp_resource_data/nltk/corpus/reader/indian.py
new file mode 100755 (executable)
index 0000000..1c50547
--- /dev/null
@@ -0,0 +1,85 @@
+# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Indian Language POS-Tagged Corpus
+Collected by A Kumaran, Microsoft Research, India
+Distributed with permission
+
+Contents:
+  - Bangla: IIT Kharagpur
+  - Hindi: Microsoft Research India
+  - Marathi: IIT Bombay
+  - Telugu: IIIT Hyderabad
+"""
+
+from six import string_types
+
+from nltk.tag import str2tuple, map_tag
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+class IndianCorpusReader(CorpusReader):
+    """
+    Reader for the POS-tagged corpus files: one sentence per line, with
+    tokens of the form ``word_TAG``. Lines starting with '<' are ignored.
+    """
+    def words(self, fileids=None):
+        return concat([IndianCorpusView(fileid, enc,
+                                        False, False)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def tagged_words(self, fileids=None, tagset=None):
+        if tagset and tagset != self._tagset:
+            tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
+        else:
+            tag_mapping_function = None
+        return concat([IndianCorpusView(fileid, enc,
+                                        True, False, tag_mapping_function)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def sents(self, fileids=None):
+        return concat([IndianCorpusView(fileid, enc,
+                                        False, True)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def tagged_sents(self, fileids=None, tagset=None):
+        if tagset and tagset != self._tagset:
+            tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
+        else:
+            tag_mapping_function = None
+        return concat([IndianCorpusView(fileid, enc,
+                                        True, True, tag_mapping_function)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def raw(self, fileids=None):
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+
+class IndianCorpusView(StreamBackedCorpusView):
+    def __init__(self, corpus_file, encoding, tagged,
+                 group_by_sent, tag_mapping_function=None):
+        self._tagged = tagged
+        self._group_by_sent = group_by_sent
+        self._tag_mapping_function = tag_mapping_function
+        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
+
+    def read_block(self, stream):
+        line = stream.readline()
+        if line.startswith('<'):
+            return []
+        sent = [str2tuple(word, sep='_') for word in line.split()]
+        if self._tag_mapping_function:
+            sent = [(w, self._tag_mapping_function(t)) for (w,t) in sent]
+        if not self._tagged: sent = [w for (w,t) in sent]
+        if self._group_by_sent:
+            return [sent]
+        else:
+            return sent
diff --git a/nlp_resource_data/nltk/corpus/reader/indian.pyc b/nlp_resource_data/nltk/corpus/reader/indian.pyc
new file mode 100755 (executable)
index 0000000..f0b521a
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/indian.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/ipipan.py b/nlp_resource_data/nltk/corpus/reader/ipipan.py
new file mode 100755 (executable)
index 0000000..bf9b73e
--- /dev/null
@@ -0,0 +1,332 @@
+# Natural Language Toolkit: IPI PAN Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Konrad Goluchowski <kodie@mimuw.edu.pl>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+import functools
+
+from six import string_types
+
+from nltk.corpus.reader.util import StreamBackedCorpusView, concat
+from nltk.corpus.reader.api import CorpusReader
+
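+# Decorator shared by the reader methods below: it drops any stray 'tags'
+# keyword argument and defaults ``fileids`` to all files in the corpus before
+# delegating to the wrapped method.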
+def _parse_args(fun):
+    @functools.wraps(fun)
+    def decorator(self, fileids=None, **kwargs):
+        kwargs.pop('tags', None)
+        if not fileids:
+            fileids = self.fileids()
+        return fun(self, fileids, **kwargs)
+    return decorator
+
+class IPIPANCorpusReader(CorpusReader):
+    """
+    Corpus reader designed to work with the corpus created by IPI PAN.
+    See http://korpus.pl/en/ for more details about the IPI PAN corpus.
+
+    The corpus includes information about text domain, channel and categories.
+    You can access the possible values using ``domains()``, ``channels()`` and
+    ``categories()``. You can also use this metadata to filter files, e.g.:
+    ``fileids(channels='prasa')``, ``fileids(categories='publicystyczny')``.
+
+    The reader supports methods: words, sents, paras and their tagged versions.
+    You can get the part of speech instead of the full tag by passing the
+    parameter "simplify_tags=True", e.g.: ``tagged_sents(simplify_tags=True)``.
+
+    You can also get all disambiguated tags by specifying the parameter
+    "one_tag=False", e.g.: ``tagged_paras(one_tag=False)``.
+
+    You can get all tags that were assigned by a morphological analyzer by
+    specifying the parameter "disamb_only=False", e.g.
+    ``tagged_words(disamb_only=False)``.
+
+    The IPI PAN corpus contains tags indicating whether there is a space
+    between two tokens. To add special "no space" markers, specify the
+    parameter "append_no_space=True", e.g. ``tagged_words(append_no_space=True)``.
+    As a result, wherever there should be no space between two tokens, the
+    pair ('', 'no-space') will be inserted (for tagged data) and just '' for
+    methods without tags.
+
+    The corpus reader can also try to append spaces between words. To enable
+    this option, specify the parameter "append_space=True", e.g.
+    ``words(append_space=True)``. As a result either ' ' or (' ', 'space')
+    will be inserted between tokens.
+
+    By default, xml entities like &quot; and &amp; are replaced by the
+    corresponding characters. You can turn off this feature by specifying the
+    parameter "replace_xmlentities=False", e.g. ``words(replace_xmlentities=False)``.
+    """
+
+    def __init__(self, root, fileids):
+        CorpusReader.__init__(self, root, fileids, None, None)
+
+    def raw(self, fileids=None):
+        if not fileids:
+            fileids = self.fileids()
+
+        filecontents = []
+        for fileid in self._list_morph_files(fileids):
+            with open(fileid, 'r') as infile:
+                filecontents.append(infile.read())
+        return ''.join(filecontents)
+
+    def channels(self, fileids=None):
+        if not fileids:
+            fileids = self.fileids()
+        return self._parse_header(fileids, 'channel')
+
+    def domains(self, fileids=None):
+        if not fileids:
+            fileids = self.fileids()
+        return self._parse_header(fileids, 'domain')
+
+    def categories(self, fileids=None):
+        if not fileids:
+            fileids = self.fileids()
+        return [self._map_category(cat)
+                for cat in self._parse_header(fileids, 'keyTerm')]
+
+    def fileids(self, channels=None, domains=None, categories=None):
+        if channels is not None and domains is not None and \
+                categories is not None:
+            raise ValueError('You can specify only one of channels, domains '
+                             'and categories parameter at once')
+        if channels is None and domains is None and \
+                categories is None:
+            return CorpusReader.fileids(self)
+        if isinstance(channels, string_types):
+            channels = [channels]
+        if isinstance(domains, string_types):
+            domains = [domains]
+        if isinstance(categories, string_types):
+            categories = [categories]
+        if channels:
+            return self._list_morph_files_by('channel', channels)
+        elif domains:
+            return self._list_morph_files_by('domain', domains)
+        else:
+            return self._list_morph_files_by('keyTerm', categories,
+                    map=self._map_category)
+
+    @_parse_args
+    def sents(self, fileids=None, **kwargs):
+        return concat([self._view(fileid,
+            mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs)
+            for fileid in self._list_morph_files(fileids)])
+
+    @_parse_args
+    def paras(self, fileids=None, **kwargs):
+        return concat([self._view(fileid,
+            mode=IPIPANCorpusView.PARAS_MODE, tags=False, **kwargs)
+            for fileid in self._list_morph_files(fileids)])
+
+    @_parse_args
+    def words(self, fileids=None, **kwargs):
+        return concat([self._view(fileid, tags=False, **kwargs)
+            for fileid in self._list_morph_files(fileids)])
+
+    @_parse_args
+    def tagged_sents(self, fileids=None, **kwargs):
+        return concat([self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE,
+            **kwargs)
+            for fileid in self._list_morph_files(fileids)])
+
+    @_parse_args
+    def tagged_paras(self, fileids=None, **kwargs):
+        return concat([self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE,
+            **kwargs)
+            for fileid in self._list_morph_files(fileids)])
+
+    @_parse_args
+    def tagged_words(self, fileids=None, **kwargs):
+        return concat([self._view(fileid, **kwargs)
+            for fileid in self._list_morph_files(fileids)])
+
+    def _list_morph_files(self, fileids):
+        return [f for f in self.abspaths(fileids)]
+
+    def _list_header_files(self, fileids):
+        return [f.replace('morph.xml', 'header.xml')
+                for f in self._list_morph_files(fileids)]
+
+    def _parse_header(self, fileids, tag):
+        values = set()
+        for f in self._list_header_files(fileids):
+            values_list = self._get_tag(f, tag)
+            for v in values_list:
+                values.add(v)
+        return list(values)
+
+    def _list_morph_files_by(self, tag, values, map=None):
+        fileids = self.fileids()
+        ret_fileids = set()
+        for f in fileids:
+            fp = self.abspath(f).replace('morph.xml', 'header.xml')
+            values_list = self._get_tag(fp, tag)
+            for value in values_list:
+                if map is not None:
+                    value = map(value)
+                if value in values:
+                    ret_fileids.add(f)
+        return list(ret_fileids)
+
+    def _get_tag(self, f, tag):
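+        # Return the text content of every <tag>...</tag> element in the
+        # header file ``f``, using simple string scanning rather than an XML
+        # parser.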
+        tags = []
+        with open(f, 'r') as infile:
+            header = infile.read()
+        tag_end = 0
+        while True:
+            tag_pos = header.find('<'+tag, tag_end)
+            if tag_pos < 0: return tags
+            tag_end = header.find('</'+tag+'>', tag_pos)
+            tags.append(header[tag_pos+len(tag)+2:tag_end])
+
+    def _map_category(self, cat):
+        pos = cat.find('>')
+        if pos == -1:
+            return cat
+        else:
+            return cat[pos+1:]
+
+    def _view(self, filename, **kwargs):
+        tags = kwargs.pop('tags', True)
+        mode = kwargs.pop('mode', 0)
+        simplify_tags = kwargs.pop('simplify_tags', False)
+        one_tag = kwargs.pop('one_tag', True)
+        disamb_only = kwargs.pop('disamb_only', True)
+        append_no_space = kwargs.pop('append_no_space', False)
+        append_space = kwargs.pop('append_space', False)
+        replace_xmlentities = kwargs.pop('replace_xmlentities', True)
+
+        if len(kwargs) > 0:
+            raise ValueError('Unexpected arguments: %s' % kwargs.keys())
+        if not one_tag and not disamb_only:
+            raise ValueError('You cannot specify both one_tag=False and '
+                             'disamb_only=False')
+        if not tags and (simplify_tags or not one_tag or not disamb_only):
+            raise ValueError('You cannot specify simplify_tags, one_tag or '
+                             'disamb_only with functions other than tagged_*')
+
+        return IPIPANCorpusView(filename,
+                 tags=tags, mode=mode, simplify_tags=simplify_tags,
+                 one_tag=one_tag, disamb_only=disamb_only,
+                 append_no_space=append_no_space,
+                 append_space=append_space,
+                 replace_xmlentities=replace_xmlentities
+                 )
+
+
+class IPIPANCorpusView(StreamBackedCorpusView):
+
+    WORDS_MODE = 0
+    SENTS_MODE = 1
+    PARAS_MODE = 2
+
+    def __init__(self, filename, startpos=0, **kwargs):
+        StreamBackedCorpusView.__init__(self, filename, None, startpos, None)
+        self.in_sentence = False
+        self.position = 0
+
+        self.show_tags = kwargs.pop('tags', True)
+        self.disamb_only = kwargs.pop('disamb_only', True)
+        self.mode = kwargs.pop('mode', IPIPANCorpusView.WORDS_MODE)
+        self.simplify_tags = kwargs.pop('simplify_tags', False)
+        self.one_tag = kwargs.pop('one_tag', True)
+        self.append_no_space = kwargs.pop('append_no_space', False)
+        self.append_space = kwargs.pop('append_space', False)
+        self.replace_xmlentities = kwargs.pop('replace_xmlentities', True)
+
+    def read_block(self, stream):
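+        # Scans the morph.xml stream line by line: <chunk type="s"> opens a
+        # sentence, <tok> a token, <orth> carries the surface form, <lex ...>
+        # carries the morphological tag(s), and <ns/> marks that no space
+        # precedes the next token.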
+        sentence = []
+        sentences = []
+        space = False
+        no_space = False
+
+        tags = set()
+
+        lines = self._read_data(stream)
+
+        while True:
+
+            # we may have only part of last line
+            if len(lines) <= 1:
+                self._seek(stream)
+                lines = self._read_data(stream)
+
+            if lines == ['']:
+                assert not sentences
+                return []
+
+            line = lines.pop()
+            self.position += len(line) + 1
+
+            if line.startswith('<chunk type="s"'):
+                self.in_sentence = True
+            elif line.startswith('<chunk type="p"'):
+                pass
+            elif line.startswith('<tok'):
+                if self.append_space and space and not no_space:
+                    self._append_space(sentence)
+                space = True
+                no_space = False
+                orth = ""
+                tags = set()
+            elif line.startswith('</chunk'):
+                if self.in_sentence:
+                    self.in_sentence = False
+                    self._seek(stream)
+                    if self.mode == self.SENTS_MODE:
+                        return [sentence]
+                    elif self.mode == self.WORDS_MODE:
+                        if self.append_space:
+                            self._append_space(sentence)
+                        return sentence
+                    else:
+                        sentences.append(sentence)
+                elif self.mode == self.PARAS_MODE:
+                    self._seek(stream)
+                    return [sentences]
+            elif line.startswith('<orth'):
+                orth = line[6:-7]
+                if self.replace_xmlentities:
+                    orth = orth.replace('&quot;', '"').replace('&amp;', '&')
+            elif line.startswith('<lex'):
+                if not self.disamb_only or line.find('disamb=') != -1:
+                    tag = line[line.index('<ctag')+6 : line.index('</ctag') ]
+                    tags.add(tag)
+            elif line.startswith('</tok'):
+                if self.show_tags:
+                    if self.simplify_tags:
+                        tags = [t.split(':')[0] for t in tags]
+                    if not self.one_tag or not self.disamb_only:
+                        sentence.append((orth, tuple(tags)))
+                    else:
+                        sentence.append((orth, tags.pop()))
+                else:
+                    sentence.append(orth)
+            elif line.startswith('<ns/>'):
+                if self.append_space:
+                    no_space = True
+                if self.append_no_space:
+                    if self.show_tags:
+                        sentence.append(('', 'no-space'))
+                    else:
+                        sentence.append('')
+            elif line.startswith('</cesAna'):
+                pass
+
+    def _read_data(self, stream):
+        self.position = stream.tell()
+        buff = stream.read(4096)
+        lines = buff.split('\n')
+        lines.reverse()
+        return lines
+
+    def _seek(self, stream):
+        stream.seek(self.position)
+
+    def _append_space(self, sentence):
+        if self.show_tags:
+            sentence.append((' ', 'space'))
+        else:
+            sentence.append(' ')
diff --git a/nlp_resource_data/nltk/corpus/reader/ipipan.pyc b/nlp_resource_data/nltk/corpus/reader/ipipan.pyc
new file mode 100755 (executable)
index 0000000..bb9ff46
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/ipipan.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/knbc.py b/nlp_resource_data/nltk/corpus/reader/knbc.py
new file mode 100755 (executable)
index 0000000..8ad90a7
--- /dev/null
@@ -0,0 +1,187 @@
+#! /usr/bin/env python
+# KNB Corpus reader
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Masato Hagiwara <hagisan@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
+from __future__ import print_function
+
+import re
+from six import string_types
+
+from nltk.parse import DependencyGraph
+
+from nltk.corpus.reader.util import (
+    FileSystemPathPointer,
+    find_corpus_fileids,
+    read_blankline_block,
+)
+from nltk.corpus.reader.api import SyntaxCorpusReader, CorpusReader
+
+# default function to convert morphlist to str for tree representation
+_morphs2str_default = lambda morphs: '/'.join(m[0] for m in morphs if m[0] != 'EOS')
+
+
+class KNBCorpusReader(SyntaxCorpusReader):
+    """
+    This class implements:
+      - ``__init__``, which specifies the location of the corpus
+        and a method for detecting the sentence blocks in corpus files.
+      - ``_read_block``, which reads a block from the input stream.
+      - ``_word``, which takes a block and returns a list of list of words.
+      - ``_tag``, which takes a block and returns a list of list of tagged
+        words.
+      - ``_parse``, which takes a block and returns a list of parsed
+        sentences.
+
+    The structure of tagged words:
+      tagged_word = (word(str), tags(tuple))
+      tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)
+
+    Usage example
+    -------------
+
+    >>> from nltk.corpus.util import LazyCorpusLoader
+    >>> knbc = LazyCorpusLoader(
+    ...     'knbc/corpus1',
+    ...     KNBCorpusReader,
+    ...     r'.*/KN.*',
+    ...     encoding='euc-jp',
+    ... )
+
+    >>> len(knbc.sents()[0])
+    9
+
+    """
+
+    def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
+        """
+        Initialize KNBCorpusReader.
+        morphs2str is a function that converts a list of morphs into a string,
+        used by _parse() for the tree representation.
+        """
+        CorpusReader.__init__(self, root, fileids, encoding)
+        self.morphs2str = morphs2str
+
+    def _read_block(self, stream):
+        # blocks are split by blankline (or EOF) - default
+        return read_blankline_block(stream)
+
+    def _word(self, t):
+        res = []
+        for line in t.splitlines():
+            # ignore EOS lines and the bunsetsu/tag/comment header lines
+            if not re.match(r"EOS|\*|\#|\+", line):
+                cells = line.strip().split(" ")
+                res.append(cells[0])
+
+        return res
+
+    # ignores tagset argument
+    def _tag(self, t, tagset=None):
+        res = []
+        for line in t.splitlines():
+            # ignore EOS lines and the bunsetsu/tag/comment header lines
+            if not re.match(r"EOS|\*|\#|\+", line):
+                cells = line.strip().split(" ")
+                # convert cells to morph tuples
+                res.append((cells[0], ' '.join(cells[1:])))
+
+        return res
+
+    def _parse(self, t):
+        dg = DependencyGraph()
+        i = 0
+        for line in t.splitlines():
+            if line[0] in '*+':
+                # start of bunsetsu or tag
+
+                cells = line.strip().split(" ", 3)
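+                # cells[1] encodes the dependency: a head bunsetsu index
+                # followed by a type letter (A/D/I/P); an index of -1 marks
+                # the root.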
+                m = re.match(r"([\-0-9]*)([ADIP])", cells[1])
+
+                assert m is not None
+
+                node = dg.nodes[i]
+                node.update(
+                    {
+                        'address': i,
+                        'rel': m.group(2),
+                        'word': [],
+                    }
+                )
+
+                dep_parent = int(m.group(1))
+
+                if dep_parent == -1:
+                    dg.root = node
+                else:
+                    dg.nodes[dep_parent]['deps'].append(i)
+
+                i += 1
+            elif line[0] != '#':
+                # normal morph
+                cells = line.strip().split(" ")
+                # convert cells to morph tuples
+                morph = cells[0], ' '.join(cells[1:])
+                dg.nodes[i - 1]['word'].append(morph)
+
+        if self.morphs2str:
+            for node in dg.nodes.values():
+                node['word'] = self.morphs2str(node['word'])
+
+        return dg.tree()
+
+######################################################################
+# Demo
+######################################################################
+
+
+def demo():
+
+    import nltk
+    from nltk.corpus.util import LazyCorpusLoader
+
+    root = nltk.data.find('corpora/knbc/corpus1')
+    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
+               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]
+
+    def _knbc_fileids_sort(x):
+        cells = x.split('-')
+        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))
+
+    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
+                            sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')
+
+    print(knbc.fileids()[:10])
+    print(''.join(knbc.words()[:100]))
+
+    print('\n\n'.join(str(tree) for tree in knbc.parsed_sents()[:2]))
+
+    knbc.morphs2str = lambda morphs: '/'.join(
+        "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
+    ).encode('utf-8')
+
+    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))
+
+    print(
+        '\n'.join(
+            ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
+            for sent in knbc.tagged_sents()[0:2]
+        )
+    )
+
+
+def test():
+
+    from nltk.corpus.util import LazyCorpusLoader
+    knbc = LazyCorpusLoader(
+        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')
+    assert isinstance(knbc.words()[0], string_types)
+    assert isinstance(knbc.sents()[0][0], string_types)
+    assert isinstance(knbc.tagged_words()[0], tuple)
+    assert isinstance(knbc.tagged_sents()[0][0], tuple)
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/corpus/reader/knbc.pyc b/nlp_resource_data/nltk/corpus/reader/knbc.pyc
new file mode 100755 (executable)
index 0000000..f344c95
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/knbc.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/lin.py b/nlp_resource_data/nltk/corpus/reader/lin.py
new file mode 100755 (executable)
index 0000000..49d8a93
--- /dev/null
@@ -0,0 +1,156 @@
+# Natural Language Toolkit: Lin's Thesaurus
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Dan Blanchard <dblanchard@ets.org>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.txt
+from __future__ import print_function
+
+import re
+from collections import defaultdict
+from functools import reduce
+
+from nltk.corpus.reader import CorpusReader
+
+
+class LinThesaurusCorpusReader(CorpusReader):
+    """ Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin. """
+
+    # Compiled regular expression for extracting the key from the first line of each
+    # thesaurus entry
+    _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')
+
+    @staticmethod
+    def __defaultdict_factory():
+        ''' Factory for creating defaultdict of defaultdict(dict)s '''
+        return defaultdict(dict)
+
+    def __init__(self, root, badscore=0.0):
+        '''
+        Initialize the thesaurus.
+
+        :param root: root directory containing thesaurus LISP files
+        :type root: C{string}
+        :param badscore: the score to give to words which do not appear in each other's sets of synonyms
+        :type badscore: C{float}
+        '''
+
+        super(LinThesaurusCorpusReader, self).__init__(root, r'sim[A-Z]\.lsp')
+        self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
+        self._badscore = badscore
+        for path, encoding, fileid in self.abspaths(include_encoding=True, include_fileid=True):
+            with open(path) as lin_file:
+                first = True
+                for line in lin_file:
+                    line = line.strip()
+                    # Start of entry
+                    if first:
+                        key = LinThesaurusCorpusReader._key_re.sub(r'\1', line)
+                        first = False
+                    # End of entry
+                    elif line == '))':
+                        first = True
+                    # Lines with pairs of ngrams and scores
+                    else:
+                        split_line = line.split('\t')
+                        if len(split_line) == 2:
+                            ngram, score = split_line
+                            self._thesaurus[fileid][key][ngram.strip('"')] = float(score)
+
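+    # Illustrative sketch (assumes entries of roughly this shape, with tab-separated
+    # ngram/score pairs, which is what the parser above expects):
+    #
+    #     ("business (desc 0.50)
+    #         enterprise<TAB>0.15
+    #         firm<TAB>0.12
+    #     ))
+    #
+    # Such an entry is stored as
+    #     self._thesaurus[fileid]['business'] == {'enterprise': 0.15, 'firm': 0.12}
+    # which is exactly what similarity(), scored_synonyms() and synonyms() below query.
+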
+    def similarity(self, ngram1, ngram2, fileid=None):
+        '''
+        Returns the similarity score for two ngrams.
+
+        :param ngram1: first ngram to compare
+        :type ngram1: C{string}
+        :param ngram2: second ngram to compare
+        :type ngram2: C{string}
+        :param fileid: thesaurus fileid to search in. If None, search all fileids.
+        :type fileid: C{string}
+        :return: If fileid is specified, just the score for the two ngrams; otherwise,
+                 list of tuples of fileids and scores.
+        '''
+        # Entries don't contain themselves, so make sure similarity between item and itself is 1.0
+        if ngram1 == ngram2:
+            if fileid:
+                return 1.0
+            else:
+                return [(fid, 1.0) for fid in self._fileids]
+        else:
+            if fileid:
+                return self._thesaurus[fileid][ngram1][ngram2] if ngram2 in self._thesaurus[fileid][ngram1] else self._badscore
+            else:
+                return [(fid, (self._thesaurus[fid][ngram1][ngram2] if ngram2 in self._thesaurus[fid][ngram1]
+                                  else self._badscore)) for fid in self._fileids]
+
+    def scored_synonyms(self, ngram, fileid=None):
+        '''
+        Returns a list of scored synonyms (tuples of synonyms and scores) for the given ngram.
+
+        :param ngram: ngram to lookup
+        :type ngram: C{string}
+        :param fileid: thesaurus fileid to search in. If None, search all fileids.
+        :type fileid: C{string}
+        :return: If fileid is specified, list of tuples of synonyms and scores; otherwise,
+                 list of tuples of fileids and lists, where inner lists consist of tuples of
+                 synonyms and scores.
+        '''
+        if fileid:
+            return self._thesaurus[fileid][ngram].items()
+        else:
+            return [(fileid, self._thesaurus[fileid][ngram].items()) for fileid in self._fileids]
+
+    def synonyms(self, ngram, fileid=None):
+        '''
+        Returns a list of synonyms for the given ngram.
+
+        :param ngram: ngram to lookup
+        :type ngram: C{string}
+        :param fileid: thesaurus fileid to search in. If None, search all fileids.
+        :type fileid: C{string}
+        :return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and
+                 lists, where inner lists contain synonyms.
+        '''
+        if fileid:
+            return self._thesaurus[fileid][ngram].keys()
+        else:
+            return [(fileid, self._thesaurus[fileid][ngram].keys()) for fileid in self._fileids]
+
+    def __contains__(self, ngram):
+        '''
+        Determines whether or not the given ngram is in the thesaurus.
+
+        :param ngram: ngram to lookup
+        :type ngram: C{string}
+        :return: whether the given ngram is in the thesaurus.
+        '''
+        return reduce(lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]), self._fileids, False)
+
+
+######################################################################
+# Demo
+######################################################################
+
+def demo():
+    from nltk.corpus import lin_thesaurus as thes
+
+    word1 = "business"
+    word2 = "enterprise"
+    print("Getting synonyms for " + word1)
+    print(thes.synonyms(word1))
+
+    print("Getting scored synonyms for " + word1)
+    print(thes.scored_synonyms(word1))
+
+    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
+    print(thes.synonyms(word1, fileid="simN.lsp"))
+
+    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
+    print(thes.synonyms(word1, fileid="simN.lsp"))
+
+    print("Similarity score for %s and %s:" % (word1, word2))
+    print(thes.similarity(word1, word2))
+
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/corpus/reader/lin.pyc b/nlp_resource_data/nltk/corpus/reader/lin.pyc
new file mode 100755 (executable)
index 0000000..59a51ae
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/lin.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/mte.py b/nlp_resource_data/nltk/corpus/reader/mte.py
new file mode 100755 (executable)
index 0000000..cd443a1
--- /dev/null
@@ -0,0 +1,311 @@
+"""
+A reader for corpora whose documents are in MTE format.
+"""
+import os
+import re
+
+from six import string_types
+
+from nltk.corpus.reader import concat, TaggedCorpusReader
+from nltk.corpus.reader.xmldocs import XMLCorpusView
+
+
+def xpath(root, path, ns):
+    return root.findall(path, ns)
+
+class MTECorpusView(XMLCorpusView):
+    """
+    Class for lazily viewing the MTE corpus.
+    """
+
+    def __init__(self, fileid, tagspec, elt_handler=None):
+        XMLCorpusView.__init__(self, fileid, tagspec, elt_handler)
+
+    def read_block(self, stream, tagspec=None, elt_handler=None):
+        return list(filter(lambda x: x is not None, XMLCorpusView.read_block(self, stream, tagspec, elt_handler)))
+
+class MTEFileReader:
+    """
+    Class for loading the content of the MULTEXT-East corpus. It
+    parses the XML files and does some tag filtering depending on the
+    given method parameters.
+    """
+    ns = {'tei': 'http://www.tei-c.org/ns/1.0',
+          'xml': 'http://www.w3.org/XML/1998/namespace'}
+    tag_ns = '{http://www.tei-c.org/ns/1.0}'
+    xml_ns = '{http://www.w3.org/XML/1998/namespace}'
+    word_path = "TEI/text/body/div/div/p/s/(w|c)"
+    sent_path = "TEI/text/body/div/div/p/s"
+    para_path = "TEI/text/body/div/div/p"
+
+
+    def __init__(self, file_path):
+        self.__file_path = file_path
+
+    @classmethod
+    def _word_elt(self, elt, context):
+        return elt.text
+
+    @classmethod
+    def _sent_elt(self, elt, context):
+        return [self._word_elt(w, None) for w in xpath(elt, '*', self.ns)]
+
+    @classmethod
+    def _para_elt(self, elt, context):
+        return [self._sent_elt(s, None) for s in xpath(elt, '*', self.ns)]
+
+    @classmethod
+    def _tagged_word_elt(self, elt, context):
+        if ('ana' not in elt.attrib):
+            return (elt.text, '')
+
+        if self.__tags == "" and self.__tagset == "msd":
+            return (elt.text, elt.attrib['ana'])
+        elif self.__tags == "" and self.__tagset == "universal":
+            return (elt.text, MTETagConverter.msd_to_universal(elt.attrib['ana']))
+        else:
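+            # the 'tags' argument is treated as an MSD prefix pattern in which '-'
+            # acts as a single-character wildcard (it is rewritten to '.' below)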
+            tags = re.compile('^' + re.sub("-", ".", self.__tags) + '.*$')
+            if (tags.match(elt.attrib['ana'])):
+                if self.__tagset == "msd":
+                    return (elt.text, elt.attrib['ana'])
+                else:
+                    return (elt.text, MTETagConverter.msd_to_universal(elt.attrib['ana']))
+            else:
+                return None
+
+    @classmethod
+    def _tagged_sent_elt(self, elt, context):
+        return list(filter(lambda x: x is not None, [self._tagged_word_elt(w, None) for w in xpath(elt, '*', self.ns)]))
+
+    @classmethod
+    def _tagged_para_elt(self, elt, context):
+        return list(filter(lambda x: x is not None, [self._tagged_sent_elt(s, None) for s in xpath(elt, '*', self.ns)]))
+
+    @classmethod
+    def _lemma_word_elt(self, elt, context):
+        if ('lemma' not in elt.attrib):
+            return (elt.text, '')
+        else:
+            return (elt.text, elt.attrib['lemma'])
+
+    @classmethod
+    def _lemma_sent_elt(self, elt, context):
+        return [self._lemma_word_elt(w, None) for w in xpath(elt, '*', self.ns)]
+
+    @classmethod
+    def _lemma_para_elt(self, elt, context):
+        return [self._lemma_sent_elt(s, None) for s in xpath(elt, '*', self.ns)]
+
+    def words(self):
+        return MTECorpusView(self.__file_path, MTEFileReader.word_path, MTEFileReader._word_elt)
+
+    def sents(self):
+        return MTECorpusView(self.__file_path, MTEFileReader.sent_path, MTEFileReader._sent_elt)
+
+    def paras(self):
+        return MTECorpusView(self.__file_path, MTEFileReader.para_path, MTEFileReader._para_elt)
+
+    def lemma_words(self):
+        return MTECorpusView(self.__file_path, MTEFileReader.word_path, MTEFileReader._lemma_word_elt)
+
+    def tagged_words(self, tagset, tags):
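+        # the tagset/tags filters are stashed on the class so the classmethod element
+        # handlers (_tagged_word_elt, _tagged_sent_elt, _tagged_para_elt) can read them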
+        MTEFileReader.__tagset = tagset
+        MTEFileReader.__tags = tags
+        return MTECorpusView(self.__file_path, MTEFileReader.word_path, MTEFileReader._tagged_word_elt)
+
+    def lemma_sents(self):
+        return MTECorpusView(self.__file_path, MTEFileReader.sent_path, MTEFileReader._lemma_sent_elt)
+
+    def tagged_sents(self, tagset, tags):
+        MTEFileReader.__tagset = tagset
+        MTEFileReader.__tags = tags
+        return MTECorpusView(self.__file_path, MTEFileReader.sent_path, MTEFileReader._tagged_sent_elt)
+
+    def lemma_paras(self):
+        return MTECorpusView(self.__file_path, MTEFileReader.para_path, MTEFileReader._lemma_para_elt)
+
+    def tagged_paras(self, tagset, tags):
+        MTEFileReader.__tagset = tagset
+        MTEFileReader.__tags = tags
+        return MTECorpusView(self.__file_path, MTEFileReader.para_path, MTEFileReader._tagged_para_elt)
+
+
+class MTETagConverter:
+    """
+    Class for converting MSD tags to universal tags; other conversion
+    options are currently not implemented.
+    """
+
+    mapping_msd_universal = {
+        'A': 'ADJ', 'S': 'ADP', 'R': 'ADV', 'C': 'CONJ',
+        'D': 'DET', 'N': 'NOUN', 'M': 'NUM', 'Q': 'PRT',
+        'P': 'PRON', 'V': 'VERB', '.': '.', '-': 'X'}
+
+    @staticmethod
+    def msd_to_universal(tag):
+        """
+        This function converts an annotation from the MULTEXT-East MSD tagset to the
+        universal tagset, as described in Chapter 5 of the NLTK book.
+
+        Unknown tags are mapped to X.  Punctuation marks are not covered by the MSD
+        tagset proper; a '.' indicator is passed through as the universal '.' tag.
+        """
+        indicator = tag[0] if not tag[0] == "#" else tag[1]
+
+        if not indicator in MTETagConverter.mapping_msd_universal:
+            indicator = '-'
+
+        return MTETagConverter.mapping_msd_universal[indicator]
+
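+# Illustrative usage of the conversion above (the results follow directly from the
+# mapping_msd_universal table; the example tags are hypothetical):
+#     MTETagConverter.msd_to_universal('Ncmsn')   # -> 'NOUN'
+#     MTETagConverter.msd_to_universal('#Vmip')   # -> 'VERB' (leading '#' is skipped)
+#     MTETagConverter.msd_to_universal('Z')       # -> 'X'    (unknown indicator)
+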
+class MTECorpusReader(TaggedCorpusReader):
+    """
+    Reader for corpora following the TEI P5 XML scheme, such as MULTEXT-East.
+    MULTEXT-East contains part-of-speech-tagged words with a fairly fine-grained
+    tagging scheme; these tags can be converted to the universal tagset.
+    """
+
+    def __init__(self, root=None, fileids=None, encoding='utf8'):
+        """
+        Construct a new MTECorpusReader for a set of documents
+        located at the given root directory.  Example usage:
+
+            >>> root = '/...path to corpus.../'
+            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP
+
+        :param root: The root directory for this corpus. (default points to location in multext config file)
+        :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
+        :param encoding: The encoding of the given files (default is utf8)
+        """
+        TaggedCorpusReader.__init__(self, root, fileids, encoding)
+
+    def __fileids(self, fileids):
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        # filter out invalid user input
+        fileids = [f for f in fileids if f in self._fileids]
+        # filter out MULTEXT-East source files that are not compatible with the TEI P5 specification
+        fileids = [f for f in fileids if f not in ["oana-bg.xml", "oana-mk.xml"]]
+        # lists (rather than filter objects) are used so the emptiness check below
+        # works on Python 3 and the result can be iterated more than once
+        if not fileids:
+            print("No valid MULTEXT-East file specified")
+        return fileids
+
+    def readme(self):
+        """
+        Returns some information about this corpus.
+
+        :return: the content of the attached README file
+        :rtype: str
+        """
+        return self.open("00README.txt").read()
+
+    def raw(self, fileids=None):
+        """
+        :param fileids: A list specifying the fileids that should be used.
+        :return: the given file(s) as a single string.
+        :rtype: str
+        """
+        return concat([self.open(f).read() for f in self.__fileids(fileids)])
+
+    def words(self, fileids=None):
+        """
+        :param fileids: A list specifying the fileids that should be used.
+        :return: the given file(s) as a list of words and punctuation symbols.
+        :rtype: list(str)
+        """
+        return concat([MTEFileReader(os.path.join(self._root, f)).words() for f in self.__fileids(fileids)])
+
+    def sents(self, fileids=None):
+        """
+        :param fileids: A list specifying the fileids that should be used.
+        :return: the given file(s) as a list of sentences or utterances,
+                 each encoded as a list of word strings
+        :rtype: list(list(str))
+        """
+        return concat([MTEFileReader(os.path.join(self._root, f)).sents() for f in self.__fileids(fileids)])
+
+    def paras(self, fileids=None):
+        """
+        :param fileids: A list specifying the fileids that should be used.
+        :return: the given file(s) as a list of paragraphs, each encoded as a list
+                 of sentences, which are in turn encoded as lists of word strings
+        :rtype: list(list(list(str)))
+        """
+        return concat([MTEFileReader(os.path.join(self._root, f)).paras() for f in self.__fileids(fileids)])
+
+    def lemma_words(self, fileids=None):
+        """
+        :param fileids: A list specifying the fileids that should be used.
+        :return: the given file(s) as a list of words, the corresponding lemmas
+                 and punctuation symbols, encoded as tuples (word, lemma)
+        :rtype: list(tuple(str,str))
+        """
+        return concat([MTEFileReader(os.path.join(self._root, f)).lemma_words() for f in self.__fileids(fileids)])
+
+    def tagged_words(self, fileids=None, tagset="msd", tags=""):
+        """
+        :param fileids: A list specifying the fileids that should be used.
+        :param tagset: The tagset that should be used in the returned object,
+                       either "universal" or "msd", "msd" is the default
+        :param tags: An MSD Tag that is used to filter all parts of the used corpus
+                     that are not more precise or at least equal to the given tag
+        :return: the given file(s) as a list of tagged words and punctuation symbols
+                 encoded as tuples (word, tag)
+        :rtype: list(tuple(str, str))
+        """
+        if tagset == "universal" or tagset == "msd":
+            return concat([MTEFileReader(os.path.join(self._root, f)).tagged_words(tagset, tags) for f in self.__fileids(fileids)])
+        else:
+            print("Unknown tagset specified.")
+
+    def lemma_sents(self, fileids=None):
+        """
+        :param fileids: A list specifying the fileids that should be used.
+        :return: the given file(s) as a list of sentences or utterances, each
+                 encoded as a list of tuples of the word and the corresponding
+                 lemma (word, lemma)
+        :rtype: list(list(tuple(str, str)))
+        """
+        return concat([MTEFileReader(os.path.join(self._root, f)).lemma_sents() for f in self.__fileids(fileids)])
+
+
+    def tagged_sents(self, fileids=None, tagset="msd", tags=""):
+        """
+        :param fileids: A list specifying the fileids that should be used.
+        :param tagset: The tagset that should be used in the returned object,
+                       either "universal" or "msd", "msd" is the default
+        :param tags: An MSD Tag that is used to filter all parts of the used corpus
+                     that are not more precise or at least equal to the given tag
+        :return: the given file(s) as a list of sentences or utterances, each
+                 encoded as a list of (word, tag) tuples
+        :rtype: list(list(tuple(str, str)))
+        """
+        if tagset == "universal" or tagset == "msd":
+            return concat([MTEFileReader(os.path.join(self._root, f)).tagged_sents(tagset, tags) for f in self.__fileids(fileids)])
+        else:
+            print("Unknown tagset specified.")
+
+    def lemma_paras(self, fileids=None):
+        """
+        :param fileids: A list specifying the fileids that should be used.
+        :return: the given file(s) as a list of paragraphs, each encoded as a
+                 list of sentences, which are in turn encoded as a list of
+                 tuples of the word and the corresponding lemma (word, lemma)
+        :rtype: list(list(list(tuple(str, str))))
+        """
+        return concat([MTEFileReader(os.path.join(self._root, f)).lemma_paras() for f in self.__fileids(fileids)])
+
+    def tagged_paras(self, fileids=None, tagset="msd", tags=""):
+        """
+        :param fileids: A list specifying the fileids that should be used.
+        :param tagset: The tagset that should be used in the returned object,
+                       either "universal" or "msd", "msd" is the default
+        :param tags: An MSD Tag that is used to filter all parts of the used corpus
+                     that are not more precise or at least equal to the given tag
+        :return: the given file(s) as a list of paragraphs, each encoded as a
+                 list of sentences, which are in turn encoded as a list
+                 of (word,tag) tuples
+        :rtype: list(list(list(tuple(str, str))))
+        """
+        if tagset == "universal" or tagset == "msd":
+            return concat([MTEFileReader(os.path.join(self._root, f)).tagged_paras(tagset, tags) for f in self.__fileids(fileids)])
+        else:
+            print("Unknown tagset specified.")
diff --git a/nlp_resource_data/nltk/corpus/reader/mte.pyc b/nlp_resource_data/nltk/corpus/reader/mte.pyc
new file mode 100755 (executable)
index 0000000..d5d0838
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/mte.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/nkjp.py b/nlp_resource_data/nltk/corpus/reader/nkjp.py
new file mode 100755 (executable)
index 0000000..6f141a2
--- /dev/null
@@ -0,0 +1,429 @@
+# Natural Language Toolkit: NKJP Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Gabriela Kaczka
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+import functools
+import os
+import tempfile
+
+from six import string_types
+
+from nltk.corpus.reader.util import concat
+from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
+import re
+
+
+def _parse_args(fun):
+    """
+    Wraps function arguments:
+    if fileids are not specified, the function falls back to the NKJPCorpusReader paths.
+    """
+    @functools.wraps(fun)
+    def decorator(self, fileids=None, **kwargs):
+        if not fileids:
+            fileids = self._paths
+        return fun(self, fileids, **kwargs)
+
+    return decorator
+
+
+class NKJPCorpusReader(XMLCorpusReader):
+    WORDS_MODE = 0
+    SENTS_MODE = 1
+    HEADER_MODE = 2
+    RAW_MODE = 3
+
+    def __init__(self, root, fileids='.*'):
+        """
+        Corpus reader designed to work with the National Corpus of Polish.
+        See http://nkjp.pl/ for more details about NKJP.
+        Usage example:
+        import nltk
+        import nkjp
+        from nkjp import NKJPCorpusReader
+        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus
+        x.header()
+        x.raw()
+        x.words()
+        x.tagged_words(tags=['subst', 'comp'])  #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html
+        x.sents()
+        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s)
+        x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
+        x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
+        """
+        if isinstance(fileids, string_types):
+            XMLCorpusReader.__init__(self, root, fileids + '.*/header.xml')
+        else:
+            XMLCorpusReader.__init__(self, root, [fileid + '/header.xml' for fileid in fileids])
+        self._paths = self.get_paths()
+
+    def get_paths(self):
+        return [os.path.join(str(self._root), f.split("header.xml")[0]) for f in self._fileids]
+
+    def fileids(self):
+        """
+        Returns a list of file identifiers for the fileids that make up
+        this corpus.
+        """
+        return [f.split("header.xml")[0] for f in self._fileids]
+
+    def _view(self, filename, tags=None, **kwargs):
+        """
+        Returns a view specialised for use with particular corpus file.
+        """
+        mode = kwargs.pop('mode', NKJPCorpusReader.WORDS_MODE)
+        if mode is NKJPCorpusReader.WORDS_MODE:
+            return NKJPCorpus_Morph_View(filename, tags=tags)
+        elif mode is NKJPCorpusReader.SENTS_MODE:
+            return NKJPCorpus_Segmentation_View(filename, tags=tags)
+        elif mode is NKJPCorpusReader.HEADER_MODE:
+            return NKJPCorpus_Header_View(filename, tags=tags)
+        elif mode is NKJPCorpusReader.RAW_MODE:
+            return NKJPCorpus_Text_View(filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE)
+
+        else:
+            raise NameError('No such mode!')
+
+    def add_root(self, fileid):
+        """
+        Add root if necessary to specified fileid.
+        """
+        if self.root in fileid:
+            return fileid
+        return self.root + fileid
+
+    @_parse_args
+    def header(self, fileids=None, **kwargs):
+        """
+        Returns header(s) of specified fileids.
+        """
+        return concat([self._view(self.add_root(fileid),
+                                  mode=NKJPCorpusReader.HEADER_MODE, **kwargs).handle_query()
+                       for fileid in fileids])
+
+    @_parse_args
+    def sents(self, fileids=None, **kwargs):
+        """
+        Returns sentences in specified fileids.
+        """
+        return concat([self._view(self.add_root(fileid),
+                                  mode=NKJPCorpusReader.SENTS_MODE, **kwargs).handle_query()
+                       for fileid in fileids])
+
+    @_parse_args
+    def words(self, fileids=None, **kwargs):
+        """
+        Returns words in specified fileids.
+        """
+
+        return concat([self._view(self.add_root(fileid),
+                                  mode=NKJPCorpusReader.WORDS_MODE, **kwargs).handle_query()
+                       for fileid in fileids])
+
+    @_parse_args
+    def tagged_words(self, fileids=None, **kwargs):
+        """
+        Call with specified tags as a list, e.g. tags=['subst', 'comp'].
+        Returns tagged words in specified fileids.
+        """
+        tags = kwargs.pop('tags', [])
+        return concat([self._view(self.add_root(fileid),
+                                  mode=NKJPCorpusReader.WORDS_MODE, tags=tags, **kwargs).handle_query()
+                       for fileid in fileids])
+
+    @_parse_args
+    def raw(self, fileids=None, **kwargs):
+        """
+        Returns words in specified fileids.
+        """
+        return concat([self._view(self.add_root(fileid),
+                                  mode=NKJPCorpusReader.RAW_MODE, **kwargs).handle_query()
+                       for fileid in fileids])
+
+
+class NKJPCorpus_Header_View(XMLCorpusView):
+
+    def __init__(self, filename, **kwargs):
+        """
+        HEADER_MODE
+        A stream backed corpus view specialized for use with
+        header.xml files in NKJP corpus.
+        """
+        self.tagspec = ".*/sourceDesc$"
+        XMLCorpusView.__init__(self, filename + 'header.xml', self.tagspec)
+
+    def handle_query(self):
+        self._open()
+        header = []
+        while True:
+            segm = XMLCorpusView.read_block(self, self._stream)
+            if len(segm) == 0:
+                break
+            header.extend(segm)
+        self.close()
+        return header
+
+    def handle_elt(self, elt, context):
+        titles = elt.findall('bibl/title')
+        title = []
+        if titles:
+            title = '\n'.join(title.text.strip() for title in titles)
+
+        authors = elt.findall('bibl/author')
+        author = []
+        if authors:
+            author = '\n'.join(author.text.strip() for author in authors)
+
+        dates = elt.findall('bibl/date')
+        date = []
+        if dates:
+            date = '\n'.join(date.text.strip() for date in dates)
+
+        publishers = elt.findall('bibl/publisher')
+        publisher = []
+        if publishers:
+            publisher = '\n'.join(publisher.text.strip() for publisher in publishers)
+
+        idnos = elt.findall('bibl/idno')
+        idno = []
+        if idnos:
+            idno = '\n'.join(idno.text.strip() for idno in idnos)
+
+        notes = elt.findall('bibl/note')
+        note = []
+        if notes:
+            note = '\n'.join(note.text.strip() for note in notes)
+
+        return {'title': title, 'author': author, 'date': date, 'publisher': publisher,
+                'idno': idno, 'note': note}
+
+
+class XML_Tool():
+    """
+    Helper class that rewrites an XML file into one without references to the nkjp: namespace.
+    That's needed because XMLCorpusView assumes that one can find short substrings
+    of XML that are valid XML, which is not true if a namespace is declared at top level.
+    """
+    def __init__(self, root, filename):
+        self.read_file = os.path.join(root, filename)
+        self.write_file = tempfile.NamedTemporaryFile(delete=False)
+
+    def build_preprocessed_file(self):
+        try:
+            fr = open(self.read_file, 'r')
+            fw = self.write_file
+            line = ' '
+            while len(line):
+                line = fr.readline()
+                x = re.split(r'nkjp:[^ ]* ', line)  #in all files
+                ret = ' '.join(x)
+                x = re.split('<nkjp:paren>', ret)   #in ann_segmentation.xml
+                ret = ' '.join(x)
+                x = re.split('</nkjp:paren>', ret)  #in ann_segmentation.xml
+                ret = ' '.join(x)
+                x = re.split('<choice>', ret)   #in ann_segmentation.xml
+                ret = ' '.join(x)
+                x = re.split('</choice>', ret)  #in ann_segmentation.xml
+                ret = ' '.join(x)
+                fw.write(ret)
+            fr.close()
+            fw.close()
+            return self.write_file.name
+        except Exception:
+            self.remove_preprocessed_file()
+            raise
+
+    def remove_preprocessed_file(self):
+        os.remove(self.write_file.name)
+
+
+class NKJPCorpus_Segmentation_View(XMLCorpusView):
+    """
+    A stream backed corpus view specialized for use with
+    ann_segmentation.xml files in NKJP corpus.
+    """
+
+    def __init__(self, filename, **kwargs):
+        self.tagspec = '.*p/.*s'
+        #intersperse NKJPCorpus_Text_View
+        self.text_view = NKJPCorpus_Text_View(filename, mode=NKJPCorpus_Text_View.SENTS_MODE)
+        self.text_view.handle_query()
+        #xml preprocessing
+        self.xml_tool = XML_Tool(filename, 'ann_segmentation.xml')
+        #base class init
+        XMLCorpusView.__init__(self, self.xml_tool.build_preprocessed_file(), self.tagspec)
+
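+    # The helpers below assume 'corresp' values of the form
+    # '...(<segment id>,<begin offset>,<length>)', e.g. something like
+    # 'string_range(segm_1.1-s,0,17)'; the id selects the text segment and the
+    # offsets locate the sentence inside it.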
+    def get_segm_id(self, example_word):
+        return example_word.split('(')[1].split(',')[0]
+
+    def get_sent_beg(self, beg_word):
+        #returns index of beginning letter in sentence
+        return int(beg_word.split(',')[1])
+
+    def get_sent_end(self, end_word):
+        #returns index of end letter in sentence
+        splitted = end_word.split(')')[0].split(',')
+        return int(splitted[1]) + int(splitted[2])
+
+    def get_sentences(self, sent_segm):
+        #returns one sentence
+        id = self.get_segm_id(sent_segm[0])
+        segm = self.text_view.segm_dict[id]    #text segment
+        beg = self.get_sent_beg(sent_segm[0])
+        end = self.get_sent_end(sent_segm[len(sent_segm)-1])
+        return segm[beg:end]
+
+    def remove_choice(self, segm):
+        ret = []
+        prev_txt_end = -1
+        prev_txt_nr = -1
+        for word in segm:
+            txt_nr = self.get_segm_id(word)
+            #get increasing sequence of ids: in case of choice get first possibility
+            if self.get_sent_beg(word) > prev_txt_end-1 or prev_txt_nr != txt_nr:
+                ret.append(word)
+                prev_txt_end = self.get_sent_end(word)
+            prev_txt_nr = txt_nr
+
+        return ret
+
+    def handle_query(self):
+        try:
+            self._open()
+            sentences = []
+            while True:
+                sent_segm = XMLCorpusView.read_block(self, self._stream)
+                if len(sent_segm) == 0:
+                    break
+                for segm in sent_segm:
+                    segm = self.remove_choice(segm)
+                    sentences.append(self.get_sentences(segm))
+            self.close()
+            self.xml_tool.remove_preprocessed_file()
+            return sentences
+        except Exception:
+            self.xml_tool.remove_preprocessed_file()
+            raise
+
+    def handle_elt(self, elt, context):
+        ret = []
+        for seg in elt:
+            ret.append(seg.get('corresp'))
+        return ret
+
+
+class NKJPCorpus_Text_View(XMLCorpusView):
+    """
+    A stream backed corpus view specialized for use with
+    text.xml files in NKJP corpus.
+    """
+    SENTS_MODE = 0
+    RAW_MODE = 1
+
+    def __init__(self, filename, **kwargs):
+        self.mode = kwargs.pop('mode', 0)
+        self.tagspec = '.*/div/ab'
+        self.segm_dict = dict()
+        #xml preprocessing
+        self.xml_tool = XML_Tool(filename, 'text.xml')
+        #base class init
+        XMLCorpusView.__init__(self, self.xml_tool.build_preprocessed_file(), self.tagspec)
+
+    def handle_query(self):
+        try:
+            self._open()
+            x = self.read_block(self._stream)
+            self.close()
+            self.xml_tool.remove_preprocessed_file()
+            return x
+        except Exception:
+            self.xml_tool.remove_preprocessed_file()
+            raise
+
+    def read_block(self, stream, tagspec=None, elt_handler=None):
+        """
+        Returns text as a list of sentences.
+        """
+        txt = []
+        while True:
+            segm = XMLCorpusView.read_block(self, stream)
+            if len(segm) == 0:
+                break
+            for part in segm:
+                txt.append(part)
+
+        return [' '.join([segm for segm in txt])]
+
+    def get_segm_id(self, elt):
+        for attr in elt.attrib:
+            if attr.endswith('id'):
+                return elt.get(attr)
+
+    def handle_elt(self, elt, context):
+        #fill dictionary to use later in sents mode
+        if self.mode is NKJPCorpus_Text_View.SENTS_MODE:
+            self.segm_dict[self.get_segm_id(elt)] = elt.text
+        return elt.text
+
+
+class NKJPCorpus_Morph_View(XMLCorpusView):
+    """
+    A stream backed corpus view specialized for use with
+    ann_morphosyntax.xml files in NKJP corpus.
+    """
+
+    def __init__(self, filename, **kwargs):
+        self.tags = kwargs.pop('tags', None)
+        self.tagspec = '.*/seg/fs'
+        self.xml_tool = XML_Tool(filename, 'ann_morphosyntax.xml')
+        XMLCorpusView.__init__(self, self.xml_tool.build_preprocessed_file(), self.tagspec)
+
+    def handle_query(self):
+        try:
+            self._open()
+            words = []
+            while True:
+                segm = XMLCorpusView.read_block(self, self._stream)
+                if len(segm) == 0:
+                    break
+                for part in segm:
+                    if part is not None:
+                        words.append(part)
+            self.close()
+            self.xml_tool.remove_preprocessed_file()
+            return words
+        except Exception:
+            self.xml_tool.remove_preprocessed_file()
+            raise
+
+    def handle_elt(self, elt, context):
+        word = ''
+        flag = False
+        is_not_interp = True
+        #if tags not specified, then always return word
+        if self.tags is None:
+            flag = True
+
+        for child in elt:
+
+            #get word
+            if 'name' in child.keys() and child.attrib['name'] == 'orth':
+                for symbol in child:
+                    if symbol.tag == 'string':
+                        word = symbol.text
+            elif 'name' in child.keys() and child.attrib['name'] == 'interps':
+                for symbol in child:
+                    if 'type' in symbol.keys() and symbol.attrib['type'] == 'lex':
+                        for symbol2 in symbol:
+                            if 'name' in symbol2.keys() and symbol2.attrib['name'] == 'ctag':
+                                for symbol3 in symbol2:
+                                    if 'value' in symbol3.keys() and self.tags is not None and symbol3.attrib['value'] in self.tags:
+                                        flag = True
+                                    elif 'value' in symbol3.keys() and symbol3.attrib['value'] == 'interp':
+                                        is_not_interp = False
+        if flag and is_not_interp:
+            return word
diff --git a/nlp_resource_data/nltk/corpus/reader/nkjp.pyc b/nlp_resource_data/nltk/corpus/reader/nkjp.pyc
new file mode 100755 (executable)
index 0000000..54e51b4
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/nkjp.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/nombank.py b/nlp_resource_data/nltk/corpus/reader/nombank.py
new file mode 100755 (executable)
index 0000000..c6d7d16
--- /dev/null
@@ -0,0 +1,421 @@
+# Natural Language Toolkit: NomBank Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors: Paul Bedaride <paul.bedaride@gmail.com>
+#          Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import unicode_literals
+from xml.etree import ElementTree
+from functools import total_ordering
+
+from six import string_types
+
+from nltk.tree import Tree
+from nltk.internals import raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+class NombankCorpusReader(CorpusReader):
+    """
+    Corpus reader for the nombank corpus, which augments the Penn
+    Treebank with information about the predicate argument structure
+    of every noun instance.  The corpus consists of two parts: the
+    predicate-argument annotations themselves, and a set of "frameset
+    files" which define the argument labels used by the annotations,
+    on a per-noun basis.  Each "frameset file" contains one or more
+    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
+    divided into coarse-grained word senses called "rolesets".  For
+    each "roleset", the frameset file provides descriptions of the
+    argument roles, along with examples.
+    """
+    def __init__(self, root, nomfile, framefiles='',
+                 nounsfile=None, parse_fileid_xform=None,
+                 parse_corpus=None, encoding='utf8'):
+        """
+        :param root: The root directory for this corpus.
+        :param nomfile: The name of the file containing the predicate-
+            argument annotations (relative to ``root``).
+        :param framefiles: A list or regexp specifying the frameset
+            fileids for this corpus.
+        :param parse_fileid_xform: A transform that should be applied
+            to the fileids in this corpus.  This should be a function
+            of one argument (a fileid) that returns a string (the new
+            fileid).
+        :param parse_corpus: The corpus containing the parse trees
+            corresponding to this corpus.  These parse trees are
+            necessary to resolve the tree pointers used by nombank.
+        """
+        # If framefiles is specified as a regexp, expand it.
+        if isinstance(framefiles, string_types):
+            framefiles = find_corpus_fileids(root, framefiles)
+        framefiles = list(framefiles)
+        # Initialize the corpus reader.
+        CorpusReader.__init__(self, root, [nomfile, nounsfile] + framefiles,
+                              encoding)
+
+        # Record our frame fileids & nom file.
+        self._nomfile = nomfile
+        self._framefiles = framefiles
+        self._nounsfile = nounsfile
+        self._parse_fileid_xform = parse_fileid_xform
+        self._parse_corpus = parse_corpus
+
+    def raw(self, fileids=None):
+        """
+        :return: the text contents of the given fileids, as a single string.
+        """
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def instances(self, baseform=None):
+        """
+        :return: a corpus view that acts as a list of
+        ``NombankInstance`` objects, one for each noun in the corpus.
+        """
+        kwargs = {}
+        if baseform is not None:
+            kwargs['instance_filter'] = lambda inst: inst.baseform==baseform
+        return StreamBackedCorpusView(self.abspath(self._nomfile),
+                                      lambda stream: self._read_instance_block(stream, **kwargs),
+                                      encoding=self.encoding(self._nomfile))
+
+    def lines(self):
+        """
+        :return: a corpus view that acts as a list of strings, one for
+        each line in the predicate-argument annotation file.
+        """
+        return StreamBackedCorpusView(self.abspath(self._nomfile),
+                                      read_line_block,
+                                      encoding=self.encoding(self._nomfile))
+
+    def roleset(self, roleset_id):
+        """
+        :return: the xml description for the given roleset.
+        """
+        baseform = roleset_id.split('.')[0]
+        baseform = baseform.replace('perc-sign','%')
+        baseform = baseform.replace('oneslashonezero', '1/10').replace('1/10','1-slash-10')
+        framefile = 'frames/%s.xml' % baseform
+        if framefile not in self._framefiles:
+            raise ValueError('Frameset file for %s not found' %
+                             roleset_id)
+
+        # n.b.: The encoding for XML fileids is specified by the file
+        # itself; so we ignore self._encoding here.
+        etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
+        for roleset in etree.findall('predicate/roleset'):
+            if roleset.attrib['id'] == roleset_id:
+                return roleset
+        raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))
+
+    def rolesets(self, baseform=None):
+        """
+        :return: list of xml descriptions for rolesets.
+        """
+        if baseform is not None:
+            framefile = 'frames/%s.xml' % baseform
+            if framefile not in self._framefiles:
+                raise ValueError('Frameset file for %s not found' %
+                                 baseform)
+            framefiles = [framefile]
+        else:
+            framefiles = self._framefiles
+
+        rsets = []
+        for framefile in framefiles:
+            # n.b.: The encoding for XML fileids is specified by the file
+            # itself; so we ignore self._encoding here.
+            etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
+            rsets.append(etree.findall('predicate/roleset'))
+        return LazyConcatenation(rsets)
+
+    def nouns(self):
+        """
+        :return: a corpus view that acts as a list of all noun lemmas
+        in this corpus (from the nombank.1.0.words file).
+        """
+        return StreamBackedCorpusView(self.abspath(self._nounsfile),
+                                      read_line_block,
+                                      encoding=self.encoding(self._nounsfile))
+
+    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
+        block = []
+
+        # Read 100 at a time.
+        for i in range(100):
+            line = stream.readline().strip()
+            if line:
+                inst = NombankInstance.parse(
+                    line, self._parse_fileid_xform,
+                    self._parse_corpus)
+                if instance_filter(inst):
+                    block.append(inst)
+
+        return block
+
+######################################################################
+#{ Nombank Instance & related datatypes
+######################################################################
+
+@python_2_unicode_compatible
+class NombankInstance(object):
+
+    def __init__(self, fileid, sentnum, wordnum, baseform, sensenumber,
+                 predicate, predid, arguments, parse_corpus=None):
+
+        self.fileid = fileid
+        """The name of the file containing the parse tree for this
+        instance's sentence."""
+
+        self.sentnum = sentnum
+        """The sentence number of this sentence within ``fileid``.
+        Indexing starts from zero."""
+
+        self.wordnum = wordnum
+        """The word number of this instance's predicate within its
+        containing sentence.  Word numbers are indexed starting from
+        zero, and include traces and other empty parse elements."""
+
+        self.baseform = baseform
+        """The baseform of the predicate."""
+
+        self.sensenumber = sensenumber
+        """The sense number of the predicate."""
+
+        self.predicate = predicate
+        """A ``NombankTreePointer`` indicating the position of this
+        instance's predicate within its containing sentence."""
+
+        self.predid = predid
+        """Identifier of the predicate."""
+
+        self.arguments = tuple(arguments)
+        """A list of tuples (argloc, argid), specifying the location
+        and identifier for each of the predicate's argument in the
+        containing sentence.  Argument identifiers are strings such as
+        ``'ARG0'`` or ``'ARGM-TMP'``.  This list does *not* contain
+        the predicate."""
+
+        self.parse_corpus = parse_corpus
+        """A corpus reader for the parse trees corresponding to the
+        instances in this nombank corpus."""
+
+    @property
+    def roleset(self):
+        """The name of the roleset used by this instance's predicate.
+        Use ``nombank.roleset() <NombankCorpusReader.roleset>`` to
+        look up information about the roleset."""
+        r = self.baseform.replace('%', 'perc-sign')
+        r = r.replace('1/10', '1-slash-10').replace('1-slash-10', 'oneslashonezero')
+        return '%s.%s'%(r, self.sensenumber)
+
+    def __repr__(self):
+        return ('<NombankInstance: %s, sent %s, word %s>' %
+                (self.fileid, self.sentnum, self.wordnum))
+
+    def __str__(self):
+        s = '%s %s %s %s %s' % (self.fileid, self.sentnum, self.wordnum,
+                                self.baseform, self.sensenumber)
+        items = self.arguments + ((self.predicate, 'rel'),)
+        for (argloc, argid) in sorted(items):
+            s += ' %s-%s' % (argloc, argid)
+        return s
+
+    def _get_tree(self):
+        if self.parse_corpus is None: return None
+        if self.fileid not in self.parse_corpus.fileids(): return None
+        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
+    tree = property(_get_tree, doc="""
+        The parse tree corresponding to this instance, or None if
+        the corresponding tree is not available.""")
+
+    @staticmethod
+    def parse(s, parse_fileid_xform=None, parse_corpus=None):
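+        # A nombank line is whitespace-separated:
+        #   <fileid> <sentnum> <wordnum> <baseform> <sensenumber> <arg>... where exactly
+        #   one of the trailing fields is the '<pointer>-rel' predicate location,
+        #   e.g. a line shaped like (hypothetical):
+        #   'wsj/05/wsj_0544.mrg 10 10 percent 01 11:0-rel 12:1-ARG1'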
+        pieces = s.split()
+        if len(pieces) < 6:
+            raise ValueError('Badly formatted nombank line: %r' % s)
+
+        # Divide the line into its basic pieces.
+        (fileid, sentnum, wordnum,
+          baseform, sensenumber) = pieces[:5]
+
+        args = pieces[5:]
+        rel = [args.pop(i) for i,p in enumerate(args) if '-rel' in p]
+        if len(rel) != 1:
+            raise ValueError('Badly formatted nombank line: %r' % s)
+
+        # Apply the fileid selector, if any.
+        if parse_fileid_xform is not None:
+            fileid = parse_fileid_xform(fileid)
+
+        # Convert sentence & word numbers to ints.
+        sentnum = int(sentnum)
+        wordnum = int(wordnum)
+
+        # Parse the predicate location.
+
+        predloc, predid = rel[0].split('-', 1)
+        predicate = NombankTreePointer.parse(predloc)
+
+        # Parse the arguments.
+        arguments = []
+        for arg in args:
+            argloc, argid = arg.split('-', 1)
+            arguments.append( (NombankTreePointer.parse(argloc), argid) )
+
+        # Put it all together.
+        return NombankInstance(fileid, sentnum, wordnum, baseform, sensenumber,
+                               predicate, predid, arguments, parse_corpus)
+
+class NombankPointer(object):
+    """
+    A pointer used by nombank to identify one or more constituents in
+    a parse tree.  ``NombankPointer`` is an abstract base class with
+    three concrete subclasses:
+
+    - ``NombankTreePointer`` is used to point to single constituents.
+    - ``NombankSplitTreePointer`` is used to point to 'split'
+      constituents, which consist of a sequence of two or more
+      ``NombankTreePointer`` pointers.
+    - ``NombankChainTreePointer`` is used to point to entire trace
+      chains in a tree.  It consists of a sequence of pieces, which
+      can be ``NombankTreePointer`` or ``NombankSplitTreePointer`` pointers.
+    """
+    def __init__(self):
+        if self.__class__ == NombankPointer:
+            raise NotImplementedError()
+
+@python_2_unicode_compatible
+class NombankChainTreePointer(NombankPointer):
+    def __init__(self, pieces):
+        self.pieces = pieces
+        """A list of the pieces that make up this chain.  Elements may
+           be either ``NombankSplitTreePointer`` or
+           ``NombankTreePointer`` pointers."""
+
+    def __str__(self):
+        return '*'.join('%s' % p for p in self.pieces)
+    def __repr__(self):
+        return '<NombankChainTreePointer: %s>' % self
+    def select(self, tree):
+        if tree is None: raise ValueError('Parse tree not available')
+        return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
+
+@python_2_unicode_compatible
+class NombankSplitTreePointer(NombankPointer):
+    def __init__(self, pieces):
+        self.pieces = pieces
+        """A list of the pieces that make up this chain.  Elements are
+           all ``NombankTreePointer`` pointers."""
+
+    def __str__(self):
+        return ','.join('%s' % p for p in self.pieces)
+    def __repr__(self):
+        return '<NombankSplitTreePointer: %s>' % self
+    def select(self, tree):
+        if tree is None: raise ValueError('Parse tree not available')
+        return Tree('*SPLIT*', [p.select(tree) for p in self.pieces])
+
+@total_ordering
+@python_2_unicode_compatible
+class NombankTreePointer(NombankPointer):
+    """
+    A pointer to a single constituent, written ``wordnum:height``.  Pointers may
+    be combined into chains with ``*`` (``wordnum:height*wordnum:height*...``) or
+    into split constituents with ``,`` (``wordnum:height,wordnum:height,...``);
+    see ``parse()``.
+
+    """
+    def __init__(self, wordnum, height):
+        self.wordnum = wordnum
+        self.height = height
+
+    @staticmethod
+    def parse(s):
+        # Deal with chains (xx*yy*zz)
+        pieces = s.split('*')
+        if len(pieces) > 1:
+            return NombankChainTreePointer([NombankTreePointer.parse(elt)
+                                              for elt in pieces])
+
+        # Deal with split args (xx,yy,zz)
+        pieces = s.split(',')
+        if len(pieces) > 1:
+            return NombankSplitTreePointer([NombankTreePointer.parse(elt)
+                                             for elt in pieces])
+
+        # Deal with normal pointers.
+        pieces = s.split(':')
+        if len(pieces) != 2: raise ValueError('bad nombank pointer %r' % s)
+        return NombankTreePointer(int(pieces[0]), int(pieces[1]))
+
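+    # Illustrative parses (following the rules implemented above):
+    #   NombankTreePointer.parse('3:0')      -> NombankTreePointer(3, 0)
+    #   NombankTreePointer.parse('3:0,5:0')  -> NombankSplitTreePointer over two pieces
+    #   NombankTreePointer.parse('3:0*7:1')  -> NombankChainTreePointer over a trace chain
+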
+    def __str__(self):
+        return '%s:%s' % (self.wordnum, self.height)
+
+    def __repr__(self):
+        return 'NombankTreePointer(%d, %d)' % (self.wordnum, self.height)
+
+    def __eq__(self, other):
+        while isinstance(other, (NombankChainTreePointer,
+                                 NombankSplitTreePointer)):
+            other = other.pieces[0]
+
+        if not isinstance(other, NombankTreePointer):
+            return self is other
+
+        return (self.wordnum == other.wordnum and self.height == other.height)
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        while isinstance(other, (NombankChainTreePointer,
+                                 NombankSplitTreePointer)):
+            other = other.pieces[0]
+
+        if not isinstance(other, NombankTreePointer):
+            return id(self) < id(other)
+
+        return (self.wordnum, -self.height) < (other.wordnum, -other.height)
+
+    def select(self, tree):
+        if tree is None: raise ValueError('Parse tree not available')
+        return tree[self.treepos(tree)]
+
+    def treepos(self, tree):
+        """
+        Convert this pointer to a standard 'tree position' pointer,
+        given that it points to the given tree.
+        """
+        if tree is None: raise ValueError('Parse tree not available')
+        stack = [tree]
+        treepos = []
+
+        wordnum = 0
+        while True:
+            # tree node:
+            if isinstance(stack[-1], Tree):
+                # Select the next child.
+                if len(treepos) < len(stack):
+                    treepos.append(0)
+                else:
+                    treepos[-1] += 1
+                # Update the stack.
+                if treepos[-1] < len(stack[-1]):
+                    stack.append(stack[-1][treepos[-1]])
+                else:
+                    # End of node's child list: pop up a level.
+                    stack.pop()
+                    treepos.pop()
+            # word node:
+            else:
+                if wordnum == self.wordnum:
+                    return tuple(treepos[:len(treepos)-self.height-1])
+                else:
+                    wordnum += 1
+                    stack.pop()
diff --git a/nlp_resource_data/nltk/corpus/reader/nombank.pyc b/nlp_resource_data/nltk/corpus/reader/nombank.pyc
new file mode 100755 (executable)
index 0000000..506b3eb
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/nombank.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/nps_chat.py b/nlp_resource_data/nltk/corpus/reader/nps_chat.py
new file mode 100755 (executable)
index 0000000..a2da13c
--- /dev/null
@@ -0,0 +1,73 @@
+# Natural Language Toolkit: NPS Chat Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
+
+import re
+import textwrap
+
+from nltk.util import LazyConcatenation
+from nltk.internals import ElementWrapper
+from nltk.tag import map_tag
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+from nltk.corpus.reader.xmldocs import *
+
+class NPSChatCorpusReader(XMLCorpusReader):
+
+    def __init__(self, root, fileids, wrap_etree=False, tagset=None):
+        XMLCorpusReader.__init__(self, root, fileids, wrap_etree)
+        self._tagset = tagset
+
+    def xml_posts(self, fileids=None):
+        if self._wrap_etree:
+            return concat([XMLCorpusView(fileid, 'Session/Posts/Post',
+                                         self._wrap_elt)
+                           for fileid in self.abspaths(fileids)])
+        else:
+            return concat([XMLCorpusView(fileid, 'Session/Posts/Post')
+                           for fileid in self.abspaths(fileids)])
+
+    def posts(self, fileids=None):
+        return concat([XMLCorpusView(fileid, 'Session/Posts/Post/terminals',
+                                     self._elt_to_words)
+                       for fileid in self.abspaths(fileids)])
+
+    def tagged_posts(self, fileids=None, tagset=None):
+        def reader(elt, handler):
+            return self._elt_to_tagged_words(elt, handler, tagset)
+        return concat([XMLCorpusView(fileid, 'Session/Posts/Post/terminals',
+                                     reader)
+                       for fileid in self.abspaths(fileids)])
+
+    def words(self, fileids=None):
+        return LazyConcatenation(self.posts(fileids))
+
+    def tagged_words(self, fileids=None, tagset=None):
+        return LazyConcatenation(self.tagged_posts(fileids, tagset))
+
+    def _wrap_elt(self, elt, handler):
+        return ElementWrapper(elt)
+
+    def _elt_to_words(self, elt, handler):
+        return [self._simplify_username(t.attrib['word'])
+                for t in elt.findall('t')]
+
+    def _elt_to_tagged_words(self, elt, handler, tagset=None):
+        tagged_post = [(self._simplify_username(t.attrib['word']),
+                        t.attrib['pos']) for t in elt.findall('t')]
+        if tagset and tagset != self._tagset:
+            tagged_post = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_post]
+        return tagged_post
+
+    @staticmethod
+    def _simplify_username(word):
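+        # e.g. a token like 'somePrefixUser123' is normalised to 'U123' (anything
+        # before 'User' is dropped); other tokens pass through unchanged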
+        if 'User' in word:
+            word = 'U' + word.split('User', 1)[1]
+        elif isinstance(word, bytes):
+            word = word.decode('ascii')
+        return word
diff --git a/nlp_resource_data/nltk/corpus/reader/nps_chat.pyc b/nlp_resource_data/nltk/corpus/reader/nps_chat.pyc
new file mode 100755 (executable)
index 0000000..5fefe8a
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/nps_chat.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/opinion_lexicon.py b/nlp_resource_data/nltk/corpus/reader/opinion_lexicon.py
new file mode 100755 (executable)
index 0000000..0c70278
--- /dev/null
@@ -0,0 +1,115 @@
+# Natural Language Toolkit: Opinion Lexicon Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+CorpusReader for the Opinion Lexicon.
+
+- Opinion Lexicon information -
+Authors: Minqing Hu and Bing Liu, 2004.
+    Department of Computer Science
+    University of Illinois at Chicago
+
+Contact: Bing Liu, liub@cs.uic.edu
+        http://www.cs.uic.edu/~liub
+
+Distributed with permission.
+
+Related papers:
+- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
+    Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery
+    & Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA.
+
+- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and
+    Comparing Opinions on the Web". Proceedings of the 14th International World
+    Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.
+"""
+from six import string_types
+
+from nltk.corpus.reader import WordListCorpusReader
+from nltk.corpus.reader.api import *
+
+class IgnoreReadmeCorpusView(StreamBackedCorpusView):
+    """
+    This CorpusView is used to skip the initial readme block of the corpus.
+    """
+    def __init__(self, *args, **kwargs):
+        StreamBackedCorpusView.__init__(self, *args, **kwargs)
+        # open self._stream
+        self._open()
+        # skip the readme block
+        read_blankline_block(self._stream)
+        # Set the initial position to the current stream position
+        self._filepos = [self._stream.tell()]
+
+
+class OpinionLexiconCorpusReader(WordListCorpusReader):
+    """
+    Reader for the Liu and Hu opinion lexicon.  Blank lines and the readme block are ignored.
+
+        >>> from nltk.corpus import opinion_lexicon
+        >>> opinion_lexicon.words()
+        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]
+
+    The OpinionLexiconCorpusReader provides shortcuts to retrieve positive/negative
+    words:
+
+        >>> opinion_lexicon.negative()
+        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]
+
+    Note that words from the `words()` method are sorted by file id, not alphabetically:
+
+        >>> opinion_lexicon.words()[0:10]
+        ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably',
+        'abominate', 'abomination', 'abort', 'aborted']
+        >>> sorted(opinion_lexicon.words())[0:10]
+        ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably',
+        'abominate', 'abomination', 'abort']
+    """
+
+    CorpusView = IgnoreReadmeCorpusView
+
+    def words(self, fileids=None):
+        """
+        Return all words in the opinion lexicon. Note that these words are not
+        sorted in alphabetical order.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            words have to be returned.
+        :return: the given file(s) as a list of words and punctuation symbols.
+        :rtype: list(str)
+        """
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+            for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+    def positive(self):
+        """
+        Return all positive words in alphabetical order.
+
+        :return: a list of positive words.
+        :rtype: list(str)
+        """
+        return self.words('positive-words.txt')
+
+    def negative(self):
+        """
+        Return all negative words in alphabetical order.
+
+        :return: a list of negative words.
+        :rtype: list(str)
+        """
+        return self.words('negative-words.txt')
+
+    def _read_word_block(self, stream):
+        words = []
+        for i in range(20): # Read 20 lines at a time.
+            line = stream.readline()
+            if not line:
+                continue
+            words.append(line.strip())
+        return words
diff --git a/nlp_resource_data/nltk/corpus/reader/opinion_lexicon.pyc b/nlp_resource_data/nltk/corpus/reader/opinion_lexicon.pyc
new file mode 100755 (executable)
index 0000000..a0cdfa5
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/opinion_lexicon.pyc differ
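For context, a minimal sketch of how the opinion lexicon can be used for a crude polarity count (assumes the `opinion_lexicon` data is installed; the sample sentence is arbitrary):

    from nltk.corpus import opinion_lexicon

    pos = set(opinion_lexicon.positive())
    neg = set(opinion_lexicon.negative())

    def naive_polarity(tokens):
        # Crude lexicon count: positive hits minus negative hits.
        return sum(t in pos for t in tokens) - sum(t in neg for t in tokens)

    print(naive_polarity(['a', 'truly', 'wonderful', 'but', 'noisy', 'camera']))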
diff --git a/nlp_resource_data/nltk/corpus/reader/panlex_lite.py b/nlp_resource_data/nltk/corpus/reader/panlex_lite.py
new file mode 100755 (executable)
index 0000000..08d3399
--- /dev/null
@@ -0,0 +1,165 @@
+# Natural Language Toolkit: PanLex Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: David Kamholz <kamholz@panlex.org>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+CorpusReader for PanLex Lite, a stripped down version of PanLex distributed
+as an SQLite database. See the README.txt in the panlex_lite corpus directory
+for more information on PanLex Lite.
+"""
+
+import os
+import sqlite3
+
+from nltk.corpus.reader.api import CorpusReader
+
+class PanLexLiteCorpusReader(CorpusReader):
+    MEANING_Q = """
+        SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
+        FROM dnx
+        JOIN ex ON (ex.ex = dnx.ex)
+        JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
+        JOIN ex ex2 ON (ex2.ex = dnx2.ex)
+        WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
+        ORDER BY dnx2.uq DESC
+    """
+
+    TRANSLATION_Q = """
+        SELECT s.tt, sum(s.uq) AS trq FROM (
+            SELECT ex2.tt, max(dnx.uq) AS uq
+            FROM dnx
+            JOIN ex ON (ex.ex = dnx.ex)
+            JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
+            JOIN ex ex2 ON (ex2.ex = dnx2.ex)
+            WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
+            GROUP BY ex2.tt, dnx.ui
+        ) s
+        GROUP BY s.tt
+        ORDER BY trq DESC, s.tt
+    """
+
+    def __init__(self, root):
+        self._c = sqlite3.connect(os.path.join(root, 'db.sqlite')).cursor()
+
+        self._uid_lv = {}
+        self._lv_uid = {}
+
+        for row in self._c.execute('SELECT uid, lv FROM lv'):
+            self._uid_lv[row[0]] = row[1]
+            self._lv_uid[row[1]] = row[0]
+
+    def language_varieties(self, lc=None):
+        """
+        Return a list of PanLex language varieties.
+
+        :param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
+            by this code. If unspecified, all varieties are returned.
+        :return: the specified language varieties as a list of tuples. The first
+            element is the language variety's seven-character uniform identifier,
+            and the second element is its default name.
+        :rtype: list(tuple)
+        """
+
+        if lc is None:
+            return self._c.execute('SELECT uid, tt FROM lv ORDER BY uid').fetchall()
+        else:
+            return self._c.execute('SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid', (lc,)).fetchall()
+
+    def meanings(self, expr_uid, expr_tt):
+        """
+        Return a list of meanings for an expression.
+
+        :param expr_uid: the expression's language variety, as a seven-character
+            uniform identifier.
+        :param expr_tt: the expression's text.
+        :return: a list of Meaning objects.
+        :rtype: list(Meaning)
+        """
+
+        expr_lv = self._uid_lv[expr_uid]
+
+        mn_info = {}
+
+        for i in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)):
+            mn = i[0]
+            uid = self._lv_uid[i[5]]
+
+            if mn not in mn_info:
+                mn_info[mn] = { 'uq': i[1], 'ap': i[2], 'ui': i[3], 'ex': { expr_uid: [expr_tt] } }
+
+            if uid not in mn_info[mn]['ex']:
+                mn_info[mn]['ex'][uid] = []
+
+            mn_info[mn]['ex'][uid].append(i[4])
+
+        return [ Meaning(mn, mn_info[mn]) for mn in mn_info ]
+
+    def translations(self, from_uid, from_tt, to_uid):
+        """
+        Return a list of translations for an expression into a single language
+            variety.
+
+        :param from_uid: the source expression's language variety, as a
+            seven-character uniform identifier.
+        :param from_tt: the source expression's text.
+        :param to_uid: the target language variety, as a seven-character
+            uniform identifier.
+        :return: a list of translation tuples. The first element is the expression
+            text and the second element is the translation quality.
+        :rtype: list(tuple)
+        """
+
+        from_lv = self._uid_lv[from_uid]
+        to_lv = self._uid_lv[to_uid]
+
+        return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall()
+
+class Meaning(dict):
+    """
+    Represents a single PanLex meaning. A meaning is a translation set derived
+    from a single source.
+    """
+
+    def __init__(self, mn, attr):
+        super(Meaning, self).__init__(**attr)
+        self['mn'] = mn
+
+    def id(self):
+        """
+        :return: the meaning's id.
+        :rtype: int
+        """
+        return self['mn']
+
+    def quality(self):
+        """
+        :return: the meaning's source's quality (0=worst, 9=best).
+        :rtype: int
+        """
+        return self['uq']
+
+    def source(self):
+        """
+        :return: the meaning's source id.
+        :rtype: int
+        """
+        return self['ap']
+
+    def source_group(self):
+        """
+        :return: the meaning's source group id.
+        :rtype: int
+        """
+        return self['ui']
+
+    def expressions(self):
+        """
+        :return: the meaning's expressions as a dictionary whose keys are language
+            variety uniform identifiers and whose values are lists of expression
+            texts.
+        :rtype: dict
+        """
+        return self['ex']
diff --git a/nlp_resource_data/nltk/corpus/reader/panlex_lite.pyc b/nlp_resource_data/nltk/corpus/reader/panlex_lite.pyc
new file mode 100755 (executable)
index 0000000..c57a041
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/panlex_lite.pyc differ
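A minimal usage sketch of the PanLex Lite reader (assumes the `panlex_lite` corpus data is installed; the uniform identifiers 'eng-000' and 'spa-000' are assumed English and Spanish varieties):

    from nltk.corpus import panlex_lite

    # Uniform identifiers are seven characters: ISO 639-3 code plus a variety number.
    print(panlex_lite.language_varieties('eng')[:3])
    # Translate an English expression into the assumed Spanish variety 'spa-000';
    # results are (text, quality) tuples ordered by descending quality.
    for tt, quality in panlex_lite.translations('eng-000', 'book', 'spa-000')[:5]:
        print(tt, quality)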
diff --git a/nlp_resource_data/nltk/corpus/reader/pl196x.py b/nlp_resource_data/nltk/corpus/reader/pl196x.py
new file mode 100755 (executable)
index 0000000..93b8b19
--- /dev/null
@@ -0,0 +1,292 @@
+# Natural Language Toolkit: Pl196x Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from six import string_types
+
+from nltk.corpus.reader.api import *
+from nltk.corpus.reader.xmldocs import XMLCorpusReader
+
+
+PARA = re.compile(r'<p(?: [^>]*){0,1}>(.*?)</p>')
+SENT = re.compile(r'<s(?: [^>]*){0,1}>(.*?)</s>')
+
+TAGGEDWORD = re.compile(r'<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>')
+WORD = re.compile(r'<[wc](?: [^>]*){0,1}>(.*?)</[wc]>')
+
+TYPE = re.compile(r'type="(.*?)"')
+ANA = re.compile(r'ana="(.*?)"')
+
+TEXTID = re.compile(r'text id="(.*?)"')
+
+
+class TEICorpusView(StreamBackedCorpusView):
+    def __init__(self, corpus_file,
+                 tagged, group_by_sent, group_by_para,
+                 tagset=None, head_len=0, textids=None):
+
+        self._tagged = tagged
+        self._textids = textids
+
+        self._group_by_sent = group_by_sent
+        self._group_by_para = group_by_para
+        # WARNING -- skip header
+        StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len)
+
+    _pagesize = 4096
+
+    def read_block(self, stream):
+        block = stream.readlines(self._pagesize)
+        block = concat(block)
+        while (block.count('<text id') > block.count('</text>')) \
+                or block.count('<text id') == 0:
+            tmp = stream.readline()
+            if len(tmp) <= 0:
+                break
+            block += tmp
+
+        block = block.replace('\n', '')
+
+        textids = TEXTID.findall(block)
+        if self._textids:
+            for tid in textids:
+                if tid not in self._textids:
+                    beg = block.find(tid) - 1
+                    end = block[beg:].find('</text>') + len('</text>')
+                    block = block[:beg] + block[beg + end:]
+
+        output = []
+        for para_str in PARA.findall(block):
+            para = []
+            for sent_str in SENT.findall(para_str):
+                if not self._tagged:
+                    sent = WORD.findall(sent_str)
+                else:
+                    sent = list(
+                        map(self._parse_tag, TAGGEDWORD.findall(sent_str)))
+                if self._group_by_sent:
+                    para.append(sent)
+                else:
+                    para.extend(sent)
+            if self._group_by_para:
+                output.append(para)
+            else:
+                output.extend(para)
+        return output
+
+    def _parse_tag(self, tag_word_tuple):
+        (tag, word) = tag_word_tuple
+        if tag.startswith('w'):
+            tag = ANA.search(tag).group(1)
+        else:  # tag.startswith('c')
+            tag = TYPE.search(tag).group(1)
+        return word, tag
+
+
+class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
+    head_len = 2770
+
+    def __init__(self, *args, **kwargs):
+        if 'textid_file' in kwargs:
+            self._textids = kwargs['textid_file']
+        else:
+            self._textids = None
+
+        XMLCorpusReader.__init__(self, *args)
+        CategorizedCorpusReader.__init__(self, kwargs)
+
+        self._init_textids()
+
+    def _init_textids(self):
+        self._f2t = defaultdict(list)
+        self._t2f = defaultdict(list)
+        if self._textids is not None:
+            with open(self._textids) as fp:
+                for line in fp:
+                    line = line.strip()
+                    file_id, text_ids = line.split(' ', 1)
+                    if file_id not in self.fileids():
+                        raise ValueError(
+                            'In text_id mapping file %s: %s not found'
+                            % (self._textids, file_id)
+                        )
+                    for text_id in text_ids.split(self._delimiter):
+                        self._add_textids(file_id, text_id)
+
+    def _add_textids(self, file_id, text_id):
+        self._f2t[file_id].append(text_id)
+        self._t2f[text_id].append(file_id)
+
+    def _resolve(self, fileids, categories, textids=None):
+        if fileids is None and categories is None and textids is None:
+            return None, None
+        if sum(1 for accessor in (fileids, categories, textids)
+               if accessor is not None) > 1:
+            raise ValueError('Specify no more than one of: fileids, '
+                             'categories or textids')
+
+        if fileids is not None:
+            return fileids, None
+
+        if categories is not None:
+            return self.fileids(categories), None
+
+        if textids is not None:
+            if isinstance(textids, string_types):
+                textids = [textids]
+            files = sum((self._t2f[t] for t in textids), [])
+            tdict = dict()
+            for f in files:
+                tdict[f] = (set(self._f2t[f]) & set(textids))
+            return files, tdict
+
+    def decode_tag(self, tag):
+        # to be implemented
+        return tag
+
+    def textids(self, fileids=None, categories=None):
+        """
+        In the pl196x corpus each category is stored in a single
+        file, so selecting by fileids or by categories is equivalent. In order
+        to accommodate finer granularity, a non-standard textids() method was
+        implemented. All the main functions can be supplied with a list
+        of required chunks---giving much more control to the user.
+        """
+        fileids, _ = self._resolve(fileids, categories)
+        if fileids is None: return sorted(self._t2f)
+
+        if isinstance(fileids, string_types):
+            fileids = [fileids]
+        return sorted(sum((self._f2t[d] for d in fileids), []))
+
+    def words(self, fileids=None, categories=None, textids=None):
+        fileids, textids = self._resolve(fileids, categories, textids)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+
+        if textids:
+            return concat([TEICorpusView(self.abspath(fileid),
+                                         False, False, False,
+                                         head_len=self.head_len,
+                                         textids=textids[fileid])
+                           for fileid in fileids])
+        else:
+            return concat([TEICorpusView(self.abspath(fileid),
+                                         False, False, False,
+                                         head_len=self.head_len)
+                           for fileid in fileids])
+
+    def sents(self, fileids=None, categories=None, textids=None):
+        fileids, textids = self._resolve(fileids, categories, textids)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+
+        if textids:
+            return concat([TEICorpusView(self.abspath(fileid),
+                                         False, True, False,
+                                         head_len=self.head_len,
+                                         textids=textids[fileid])
+                           for fileid in fileids])
+        else:
+            return concat([TEICorpusView(self.abspath(fileid),
+                                         False, True, False,
+                                         head_len=self.head_len)
+                           for fileid in fileids])
+
+    def paras(self, fileids=None, categories=None, textids=None):
+        fileids, textids = self._resolve(fileids, categories, textids)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+
+        if textids:
+            return concat([TEICorpusView(self.abspath(fileid),
+                                         False, True, True,
+                                         head_len=self.head_len,
+                                         textids=textids[fileid])
+                           for fileid in fileids])
+        else:
+            return concat([TEICorpusView(self.abspath(fileid),
+                                         False, True, True,
+                                         head_len=self.head_len)
+                           for fileid in fileids])
+
+    def tagged_words(self, fileids=None, categories=None, textids=None):
+        fileids, textids = self._resolve(fileids, categories, textids)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+
+        if textids:
+            return concat([TEICorpusView(self.abspath(fileid),
+                                         True, False, False,
+                                         head_len=self.head_len,
+                                         textids=textids[fileid])
+                           for fileid in fileids])
+        else:
+            return concat([TEICorpusView(self.abspath(fileid),
+                                         True, False, False,
+                                         head_len=self.head_len)
+                           for fileid in fileids])
+
+    def tagged_sents(self, fileids=None, categories=None, textids=None):
+        fileids, textids = self._resolve(fileids, categories, textids)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+
+        if textids:
+            return concat([TEICorpusView(self.abspath(fileid),
+                                         True, True, False,
+                                         head_len=self.head_len,
+                                         textids=textids[fileid])
+                           for fileid in fileids])
+        else:
+            return concat([TEICorpusView(self.abspath(fileid),
+                                         True, True, False,
+                                         head_len=self.head_len)
+                           for fileid in fileids])
+
+    def tagged_paras(self, fileids=None, categories=None, textids=None):
+        fileids, textids = self._resolve(fileids, categories, textids)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+
+        if textids:
+            return concat([TEICorpusView(self.abspath(fileid),
+                                         True, True, True,
+                                         head_len=self.head_len,
+                                         textids=textids[fileid])
+                           for fileid in fileids])
+        else:
+            return concat([TEICorpusView(self.abspath(fileid),
+                                         True, True, True,
+                                         head_len=self.head_len)
+                           for fileid in fileids])
+
+    def xml(self, fileids=None, categories=None):
+        fileids, _ = self._resolve(fileids, categories)
+        if len(fileids) == 1:
+            return XMLCorpusReader.xml(self, fileids[0])
+        else:
+            raise TypeError('Expected a single file')
+
+    def raw(self, fileids=None, categories=None):
+        fileids, _ = self._resolve(fileids, categories)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
diff --git a/nlp_resource_data/nltk/corpus/reader/pl196x.pyc b/nlp_resource_data/nltk/corpus/reader/pl196x.pyc
new file mode 100755 (executable)
index 0000000..9cafe4a
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/pl196x.pyc differ
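Illustrative usage of the pl196x reader -- a sketch assuming the `pl196x` corpus data (with its category and textid mapping files) is installed:

    from nltk.corpus import pl196x

    # Text ids give finer-grained selection than whole category files.
    tids = pl196x.textids()
    print(tids[:5])
    # Words and tagged sentences restricted to the first text id.
    print(pl196x.words(textids=tids[0])[:10])
    print(pl196x.tagged_sents(textids=tids[0])[0][:5])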
diff --git a/nlp_resource_data/nltk/corpus/reader/plaintext.py b/nlp_resource_data/nltk/corpus/reader/plaintext.py
new file mode 100755 (executable)
index 0000000..332b6aa
--- /dev/null
@@ -0,0 +1,232 @@
+# Natural Language Toolkit: Plaintext Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+#         Nitin Madnani <nmadnani@umiacs.umd.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A reader for corpora that consist of plaintext documents.
+"""
+
+from six import string_types
+import codecs
+
+import nltk.data
+from nltk.tokenize import *
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+class PlaintextCorpusReader(CorpusReader):
+    """
+    Reader for corpora that consist of plaintext documents.  Paragraphs
+    are assumed to be split using blank lines.  Sentences and words can
+    be tokenized using the default tokenizers, or by custom tokenizers
+    specified as parameters to the constructor.
+
+    This corpus reader can be customized (e.g., to skip preface
+    sections of specific document formats) by creating a subclass and
+    overriding the ``CorpusView`` class variable.
+    """
+
+    CorpusView = StreamBackedCorpusView
+    """The corpus view class used by this reader.  Subclasses of
+       ``PlaintextCorpusReader`` may specify alternative corpus view
+       classes (e.g., to skip the preface sections of documents.)"""
+
+    def __init__(self, root, fileids,
+                 word_tokenizer=WordPunctTokenizer(),
+                 sent_tokenizer=nltk.data.LazyLoader(
+                     'tokenizers/punkt/english.pickle'),
+                 para_block_reader=read_blankline_block,
+                 encoding='utf8'):
+        """
+        Construct a new plaintext corpus reader for a set of documents
+        located at the given root directory.  Example usage:
+
+            >>> root = '/usr/local/share/nltk_data/corpora/webtext/'
+            >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP
+
+        :param root: The root directory for this corpus.
+        :param fileids: A list or regexp specifying the fileids in this corpus.
+        :param word_tokenizer: Tokenizer for breaking sentences or
+            paragraphs into words.
+        :param sent_tokenizer: Tokenizer for breaking paragraphs
+            into sentences.
+        :param para_block_reader: The block reader used to divide the
+            corpus into paragraph blocks.
+        """
+        CorpusReader.__init__(self, root, fileids, encoding)
+        self._word_tokenizer = word_tokenizer
+        self._sent_tokenizer = sent_tokenizer
+        self._para_block_reader = para_block_reader
+
+    def raw(self, fileids=None):
+        """
+        :return: the given file(s) as a single string.
+        :rtype: str
+        """
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        raw_texts = []
+        for f in fileids:
+            _fin = self.open(f)
+            raw_texts.append(_fin.read())
+            _fin.close()
+        return concat(raw_texts)
+
+    def words(self, fileids=None):
+        """
+        :return: the given file(s) as a list of words
+            and punctuation symbols.
+        :rtype: list(str)
+        """
+        return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+                       for (path, enc, fileid)
+                       in self.abspaths(fileids, True, True)])
+
+    def sents(self, fileids=None):
+        """
+        :return: the given file(s) as a list of
+            sentences or utterances, each encoded as a list of word
+            strings.
+        :rtype: list(list(str))
+        """
+        if self._sent_tokenizer is None:
+            raise ValueError('No sentence tokenizer for this corpus')
+
+        return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
+                       for (path, enc, fileid)
+                       in self.abspaths(fileids, True, True)])
+
+    def paras(self, fileids=None):
+        """
+        :return: the given file(s) as a list of
+            paragraphs, each encoded as a list of sentences, which are
+            in turn encoded as lists of word strings.
+        :rtype: list(list(list(str)))
+        """
+        if self._sent_tokenizer is None:
+            raise ValueError('No sentence tokenizer for this corpus')
+
+        return concat([self.CorpusView(path, self._read_para_block, encoding=enc)
+                       for (path, enc, fileid)
+                       in self.abspaths(fileids, True, True)])
+
+    def _read_word_block(self, stream):
+        words = []
+        for i in range(20): # Read 20 lines at a time.
+            words.extend(self._word_tokenizer.tokenize(stream.readline()))
+        return words
+
+    def _read_sent_block(self, stream):
+        sents = []
+        for para in self._para_block_reader(stream):
+            sents.extend([self._word_tokenizer.tokenize(sent)
+                          for sent in self._sent_tokenizer.tokenize(para)])
+        return sents
+
+    def _read_para_block(self, stream):
+        paras = []
+        for para in self._para_block_reader(stream):
+            paras.append([self._word_tokenizer.tokenize(sent)
+                          for sent in self._sent_tokenizer.tokenize(para)])
+        return paras
+
+
+class CategorizedPlaintextCorpusReader(CategorizedCorpusReader,
+                                    PlaintextCorpusReader):
+    """
+    A reader for plaintext corpora whose documents are divided into
+    categories based on their file identifiers.
+    """
+    def __init__(self, *args, **kwargs):
+        """
+        Initialize the corpus reader.  Categorization arguments
+        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
+        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
+        are passed to the ``PlaintextCorpusReader`` constructor.
+        """
+        CategorizedCorpusReader.__init__(self, kwargs)
+        PlaintextCorpusReader.__init__(self, *args, **kwargs)
+
+    def _resolve(self, fileids, categories):
+        if fileids is not None and categories is not None:
+            raise ValueError('Specify fileids or categories, not both')
+        if categories is not None:
+            return self.fileids(categories)
+        else:
+            return fileids
+    def raw(self, fileids=None, categories=None):
+        return PlaintextCorpusReader.raw(
+            self, self._resolve(fileids, categories))
+    def words(self, fileids=None, categories=None):
+        return PlaintextCorpusReader.words(
+            self, self._resolve(fileids, categories))
+    def sents(self, fileids=None, categories=None):
+        return PlaintextCorpusReader.sents(
+            self, self._resolve(fileids, categories))
+    def paras(self, fileids=None, categories=None):
+        return PlaintextCorpusReader.paras(
+            self, self._resolve(fileids, categories))
+
+# is there a better way?
+class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
+    def __init__(self, *args, **kwargs):
+        CategorizedCorpusReader.__init__(self, kwargs)
+        kwargs['sent_tokenizer'] = nltk.data.LazyLoader('tokenizers/punkt/portuguese.pickle')
+        PlaintextCorpusReader.__init__(self, *args, **kwargs)
+
+class EuroparlCorpusReader(PlaintextCorpusReader):
+
+    """
+    Reader for Europarl corpora that consist of plaintext documents.
+    Documents are divided into chapters instead of paragraphs as
+    for regular plaintext documents. Chapters are separated using blank
+    lines. Everything is inherited from ``PlaintextCorpusReader`` except
+    that:
+      - Since the corpus is pre-processed and pre-tokenized, the
+        word tokenizer should just split the line at whitespaces.
+      - For the same reason, the sentence tokenizer should just
+        split the paragraph at line breaks.
+      - There is a new 'chapters()' method that returns chapters
+        instead of paragraphs.
+      - The 'paras()' method inherited from PlaintextCorpusReader is
+        made non-functional to remove any confusion between chapters
+        and paragraphs for Europarl.
+    """
+
+    def _read_word_block(self, stream):
+        words = []
+        for i in range(20): # Read 20 lines at a time.
+            words.extend(stream.readline().split())
+        return words
+
+    def _read_sent_block(self, stream):
+        sents = []
+        for para in self._para_block_reader(stream):
+            sents.extend([sent.split() for sent in para.splitlines()])
+        return sents
+
+    def _read_para_block(self, stream):
+        paras = []
+        for para in self._para_block_reader(stream):
+            paras.append([sent.split() for sent in para.splitlines()])
+        return paras
+
+    def chapters(self, fileids=None):
+        """
+        :return: the given file(s) as a list of
+            chapters, each encoded as a list of sentences, which are
+            in turn encoded as lists of word strings.
+        :rtype: list(list(list(str)))
+        """
+        return concat([self.CorpusView(fileid, self._read_para_block,
+                                       encoding=enc)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def paras(self, fileids=None):
+        raise NotImplementedError('The Europarl corpus reader does not support paragraphs. Please use chapters() instead.')
diff --git a/nlp_resource_data/nltk/corpus/reader/plaintext.pyc b/nlp_resource_data/nltk/corpus/reader/plaintext.pyc
new file mode 100755 (executable)
index 0000000..467729c
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/plaintext.pyc differ
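A minimal sketch of the plaintext reader applied to an arbitrary directory of text files (the root path is hypothetical; `sents()` and `paras()` additionally assume the punkt tokenizer data is installed):

    from nltk.corpus.reader import PlaintextCorpusReader

    corpus_root = '/tmp/my_corpus'          # hypothetical directory of .txt files
    reader = PlaintextCorpusReader(corpus_root, r'.*\.txt')
    print(reader.fileids())
    print(reader.words()[:10])      # flat token stream
    print(reader.sents()[0])        # first tokenized sentence
    print(len(reader.paras()))      # paragraphs split on blank lines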
diff --git a/nlp_resource_data/nltk/corpus/reader/ppattach.py b/nlp_resource_data/nltk/corpus/reader/ppattach.py
new file mode 100755 (executable)
index 0000000..9c0ac65
--- /dev/null
@@ -0,0 +1,96 @@
+# Natural Language Toolkit: PP Attachment Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Read lines from the Prepositional Phrase Attachment Corpus.
+
+The PP Attachment Corpus contains several files having the format:
+
+sentence_id verb noun1 preposition noun2 attachment
+
+For example:
+
+42960 gives authority to administration V
+46742 gives inventors of microchip N
+
+The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.:
+
+(VP gives (NP authority) (PP to administration))
+(VP gives (NP inventors (PP of microchip)))
+
+The corpus contains the following files:
+
+training:   training set
+devset:     development test set, used for algorithm development.
+test:       test set, used to report results
+bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal.
+
+Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional
+Phrase Attachment.  Proceedings of the ARPA Human Language Technology
+Conference.  [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps]
+
+The PP Attachment Corpus is distributed with NLTK with the permission
+of the author.
+"""
+from __future__ import unicode_literals
+
+from six import string_types
+
+from nltk import compat
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+
+@compat.python_2_unicode_compatible
+class PPAttachment(object):
+    def __init__(self, sent, verb, noun1, prep, noun2, attachment):
+        self.sent = sent
+        self.verb = verb
+        self.noun1 = noun1
+        self.prep = prep
+        self.noun2 = noun2
+        self.attachment = attachment
+
+    def __repr__(self):
+        return ('PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, '
+                'noun2=%r, attachment=%r)' %
+                (self.sent, self.verb, self.noun1, self.prep,
+                 self.noun2, self.attachment))
+
+class PPAttachmentCorpusReader(CorpusReader):
+    """
+    sentence_id verb noun1 preposition noun2 attachment
+    """
+    def attachments(self, fileids):
+        return concat([StreamBackedCorpusView(fileid, self._read_obj_block,
+                                              encoding=enc)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def tuples(self, fileids):
+        return concat([StreamBackedCorpusView(fileid, self._read_tuple_block,
+                                              encoding=enc)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def raw(self, fileids=None):
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def _read_tuple_block(self, stream):
+        line = stream.readline()
+        if line:
+            return [tuple(line.split())]
+        else:
+            return []
+
+    def _read_obj_block(self, stream):
+        line = stream.readline()
+        if line:
+            return [PPAttachment(*line.split())]
+        else:
+            return []
diff --git a/nlp_resource_data/nltk/corpus/reader/ppattach.pyc b/nlp_resource_data/nltk/corpus/reader/ppattach.pyc
new file mode 100755 (executable)
index 0000000..8b4f874
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/ppattach.pyc differ
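For reference, a minimal sketch of the PP Attachment reader using the 'training' fileid mentioned in its docstring (assumes the `ppattach` corpus data is installed):

    from nltk.corpus import ppattach

    # Each attachment record: sentence id, verb, noun1, preposition, noun2, N/V label.
    inst = ppattach.attachments('training')[0]
    print(inst.verb, inst.prep, inst.noun2, inst.attachment)
    # The same record as a plain tuple.
    print(ppattach.tuples('training')[0])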
diff --git a/nlp_resource_data/nltk/corpus/reader/propbank.py b/nlp_resource_data/nltk/corpus/reader/propbank.py
new file mode 100755 (executable)
index 0000000..343858a
--- /dev/null
@@ -0,0 +1,479 @@
+# Natural Language Toolkit: PropBank Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import unicode_literals
+import re
+from functools import total_ordering
+from xml.etree import ElementTree
+
+from six import string_types
+
+from nltk.tree import Tree
+from nltk.internals import raise_unorderable_types
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+class PropbankCorpusReader(CorpusReader):
+    """
+    Corpus reader for the propbank corpus, which augments the Penn
+    Treebank with information about the predicate argument structure
+    of every verb instance.  The corpus consists of two parts: the
+    predicate-argument annotations themselves, and a set of "frameset
+    files" which define the argument labels used by the annotations,
+    on a per-verb basis.  Each "frameset file" contains one or more
+    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
+    divided into coarse-grained word senses called "rolesets".  For
+    each "roleset", the frameset file provides descriptions of the
+    argument roles, along with examples.
+    """
+    def __init__(self, root, propfile, framefiles='',
+                 verbsfile=None, parse_fileid_xform=None,
+                 parse_corpus=None, encoding='utf8'):
+        """
+        :param root: The root directory for this corpus.
+        :param propfile: The name of the file containing the predicate-
+            argument annotations (relative to ``root``).
+        :param framefiles: A list or regexp specifying the frameset
+            fileids for this corpus.
+        :param parse_fileid_xform: A transform that should be applied
+            to the fileids in this corpus.  This should be a function
+            of one argument (a fileid) that returns a string (the new
+            fileid).
+        :param parse_corpus: The corpus containing the parse trees
+            corresponding to this corpus.  These parse trees are
+            necessary to resolve the tree pointers used by propbank.
+        """
+        # If framefiles is specified as a regexp, expand it.
+        if isinstance(framefiles, string_types):
+            framefiles = find_corpus_fileids(root, framefiles)
+        framefiles = list(framefiles)
+        # Initialize the corpus reader.
+        CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles,
+                              encoding)
+
+        # Record our frame fileids & prop file.
+        self._propfile = propfile
+        self._framefiles = framefiles
+        self._verbsfile = verbsfile
+        self._parse_fileid_xform = parse_fileid_xform
+        self._parse_corpus = parse_corpus
+
+    def raw(self, fileids=None):
+        """
+        :return: the text contents of the given fileids, as a single string.
+        """
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def instances(self, baseform=None):
+        """
+        :return: a corpus view that acts as a list of
+        ``PropbankInstance`` objects, one for each verb instance in the corpus.
+        """
+        kwargs = {}
+        if baseform is not None:
+            kwargs['instance_filter'] = lambda inst: inst.baseform==baseform
+        return StreamBackedCorpusView(self.abspath(self._propfile),
+                                      lambda stream: self._read_instance_block(stream, **kwargs),
+                                      encoding=self.encoding(self._propfile))
+
+    def lines(self):
+        """
+        :return: a corpus view that acts as a list of strings, one for
+        each line in the predicate-argument annotation file.
+        """
+        return StreamBackedCorpusView(self.abspath(self._propfile),
+                                      read_line_block,
+                                      encoding=self.encoding(self._propfile))
+
+    def roleset(self, roleset_id):
+        """
+        :return: the xml description for the given roleset.
+        """
+        baseform = roleset_id.split('.')[0]
+        framefile = 'frames/%s.xml' % baseform
+        if framefile not in self._framefiles:
+            raise ValueError('Frameset file for %s not found' %
+                             roleset_id)
+
+        # n.b.: The encoding for XML fileids is specified by the file
+        # itself; so we ignore self._encoding here.
+        etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
+        for roleset in etree.findall('predicate/roleset'):
+            if roleset.attrib['id'] == roleset_id:
+                return roleset
+        raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))
+
+    def rolesets(self, baseform=None):
+        """
+        :return: list of xml descriptions for rolesets.
+        """
+        if baseform is not None:
+            framefile = 'frames/%s.xml' % baseform
+            if framefile not in self._framefiles:
+                raise ValueError('Frameset file for %s not found' %
+                                 baseform)
+            framefiles = [framefile]
+        else:
+            framefiles = self._framefiles
+
+        rsets = []
+        for framefile in framefiles:
+            # n.b.: The encoding for XML fileids is specified by the file
+            # itself; so we ignore self._encoding here.
+            etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
+            rsets.append(etree.findall('predicate/roleset'))
+        return LazyConcatenation(rsets)
+
+    def verbs(self):
+        """
+        :return: a corpus view that acts as a list of all verb lemmas
+        in this corpus (from the verbs.txt file).
+        """
+        return StreamBackedCorpusView(self.abspath(self._verbsfile),
+                                      read_line_block,
+                                      encoding=self.encoding(self._verbsfile))
+
+    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
+        block = []
+
+        # Read 100 at a time.
+        for i in range(100):
+            line = stream.readline().strip()
+            if line:
+                inst = PropbankInstance.parse(
+                    line, self._parse_fileid_xform,
+                    self._parse_corpus)
+                if instance_filter(inst):
+                    block.append(inst)
+
+        return block
+
+######################################################################
+#{ Propbank Instance & related datatypes
+######################################################################
+
+@compat.python_2_unicode_compatible
+class PropbankInstance(object):
+
+    def __init__(self, fileid, sentnum, wordnum, tagger, roleset,
+                 inflection, predicate, arguments, parse_corpus=None):
+
+        self.fileid = fileid
+        """The name of the file containing the parse tree for this
+        instance's sentence."""
+
+        self.sentnum = sentnum
+        """The sentence number of this sentence within ``fileid``.
+        Indexing starts from zero."""
+
+        self.wordnum = wordnum
+        """The word number of this instance's predicate within its
+        containing sentence.  Word numbers are indexed starting from
+        zero, and include traces and other empty parse elements."""
+
+        self.tagger = tagger
+        """An identifier for the tagger who tagged this instance; or
+        ``'gold'`` if this is an adjudicated instance."""
+
+        self.roleset = roleset
+        """The name of the roleset used by this instance's predicate.
+        Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
+        look up information about the roleset."""
+
+        self.inflection = inflection
+        """A ``PropbankInflection`` object describing the inflection of
+        this instance's predicate."""
+
+        self.predicate = predicate
+        """A ``PropbankTreePointer`` indicating the position of this
+        instance's predicate within its containing sentence."""
+
+        self.arguments = tuple(arguments)
+        """A list of tuples (argloc, argid), specifying the location
+        and identifier for each of the predicate's argument in the
+        containing sentence.  Argument identifiers are strings such as
+        ``'ARG0'`` or ``'ARGM-TMP'``.  This list does *not* contain
+        the predicate."""
+
+        self.parse_corpus = parse_corpus
+        """A corpus reader for the parse trees corresponding to the
+        instances in this propbank corpus."""
+
+    @property
+    def baseform(self):
+        """The baseform of the predicate."""
+        return self.roleset.split('.')[0]
+
+    @property
+    def sensenumber(self):
+        """The sense number of the predicate."""
+        return self.roleset.split('.')[1]
+
+    @property
+    def predid(self):
+        """Identifier of the predicate."""
+        return 'rel'
+
+    def __repr__(self):
+        return ('<PropbankInstance: %s, sent %s, word %s>' %
+                (self.fileid, self.sentnum, self.wordnum))
+
+    def __str__(self):
+        s = '%s %s %s %s %s %s' % (self.fileid, self.sentnum, self.wordnum,
+                                   self.tagger, self.roleset, self.inflection)
+        items = self.arguments + ((self.predicate, 'rel'),)
+        for (argloc, argid) in sorted(items):
+            s += ' %s-%s' % (argloc, argid)
+        return s
+
+    def _get_tree(self):
+        if self.parse_corpus is None: return None
+        if self.fileid not in self.parse_corpus.fileids(): return None
+        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
+    tree = property(_get_tree, doc="""
+        The parse tree corresponding to this instance, or None if
+        the corresponding tree is not available.""")
+
+    @staticmethod
+    def parse(s, parse_fileid_xform=None, parse_corpus=None):
+        pieces = s.split()
+        if len(pieces) < 7:
+            raise ValueError('Badly formatted propbank line: %r' % s)
+
+        # Divide the line into its basic pieces.
+        (fileid, sentnum, wordnum,
+         tagger, roleset, inflection) = pieces[:6]
+        rel = [p for p in pieces[6:] if p.endswith('-rel')]
+        args = [p for p in pieces[6:] if not p.endswith('-rel')]
+        if len(rel) != 1:
+            raise ValueError('Badly formatted propbank line: %r' % s)
+
+        # Apply the fileid selector, if any.
+        if parse_fileid_xform is not None:
+            fileid = parse_fileid_xform(fileid)
+
+        # Convert sentence & word numbers to ints.
+        sentnum = int(sentnum)
+        wordnum = int(wordnum)
+
+        # Parse the inflection
+        inflection = PropbankInflection.parse(inflection)
+
+        # Parse the predicate location.
+        predicate = PropbankTreePointer.parse(rel[0][:-4])
+
+        # Parse the arguments.
+        arguments = []
+        for arg in args:
+            argloc, argid = arg.split('-', 1)
+            arguments.append( (PropbankTreePointer.parse(argloc), argid) )
+
+        # Put it all together.
+        return PropbankInstance(fileid, sentnum, wordnum, tagger,
+                                roleset, inflection, predicate,
+                                arguments, parse_corpus)
+
+class PropbankPointer(object):
+    """
+    A pointer used by propbank to identify one or more constituents in
+    a parse tree.  ``PropbankPointer`` is an abstract base class with
+    three concrete subclasses:
+
+      - ``PropbankTreePointer`` is used to point to single constituents.
+      - ``PropbankSplitTreePointer`` is used to point to 'split'
+        constituents, which consist of a sequence of two or more
+        ``PropbankTreePointer`` pointers.
+      - ``PropbankChainTreePointer`` is used to point to entire trace
+        chains in a tree.  It consists of a sequence of pieces, which
+        can be ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers.
+    """
+    def __init__(self):
+        if self.__class__ == PropbankPointer:
+            raise NotImplementedError()
+
+@compat.python_2_unicode_compatible
+class PropbankChainTreePointer(PropbankPointer):
+    def __init__(self, pieces):
+        self.pieces = pieces
+        """A list of the pieces that make up this chain.  Elements may
+           be either ``PropbankSplitTreePointer`` or
+           ``PropbankTreePointer`` pointers."""
+
+    def __str__(self):
+        return '*'.join('%s' % p for p in self.pieces)
+    def __repr__(self):
+        return '<PropbankChainTreePointer: %s>' % self
+    def select(self, tree):
+        if tree is None: raise ValueError('Parse tree not available')
+        return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
+
+
+@compat.python_2_unicode_compatible
+class PropbankSplitTreePointer(PropbankPointer):
+    def __init__(self, pieces):
+        self.pieces = pieces
+        """A list of the pieces that make up this chain.  Elements are
+           all ``PropbankTreePointer`` pointers."""
+
+    def __str__(self):
+        return ','.join('%s' % p for p in self.pieces)
+    def __repr__(self):
+        return '<PropbankSplitTreePointer: %s>' % self
+    def select(self, tree):
+        if tree is None: raise ValueError('Parse tree not available')
+        return Tree('*SPLIT*', [p.select(tree) for p in self.pieces])
+
+
+@total_ordering
+@compat.python_2_unicode_compatible
+class PropbankTreePointer(PropbankPointer):
+    """
+    wordnum:height*wordnum:height*...
+    wordnum:height,
+
+    """
+    def __init__(self, wordnum, height):
+        self.wordnum = wordnum
+        self.height = height
+
+    @staticmethod
+    def parse(s):
+        # Deal with chains (xx*yy*zz)
+        pieces = s.split('*')
+        if len(pieces) > 1:
+            return PropbankChainTreePointer([PropbankTreePointer.parse(elt)
+                                              for elt in pieces])
+
+        # Deal with split args (xx,yy,zz)
+        pieces = s.split(',')
+        if len(pieces) > 1:
+            return PropbankSplitTreePointer([PropbankTreePointer.parse(elt)
+                                             for elt in pieces])
+
+        # Deal with normal pointers.
+        pieces = s.split(':')
+        if len(pieces) != 2: raise ValueError('bad propbank pointer %r' % s)
+        return PropbankTreePointer(int(pieces[0]), int(pieces[1]))
+
+    def __str__(self):
+        return '%s:%s' % (self.wordnum, self.height)
+
+    def __repr__(self):
+        return 'PropbankTreePointer(%d, %d)' % (self.wordnum, self.height)
+
+    def __eq__(self, other):
+        while isinstance(other, (PropbankChainTreePointer,
+                                 PropbankSplitTreePointer)):
+            other = other.pieces[0]
+
+        if not isinstance(other, PropbankTreePointer):
+            return self is other
+
+        return (self.wordnum == other.wordnum and self.height == other.height)
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        while isinstance(other, (PropbankChainTreePointer,
+                                 PropbankSplitTreePointer)):
+            other = other.pieces[0]
+
+        if not isinstance(other, PropbankTreePointer):
+            return id(self) < id(other)
+
+        return (self.wordnum, -self.height) < (other.wordnum, -other.height)
+
+    def select(self, tree):
+        if tree is None: raise ValueError('Parse tree not available')
+        return tree[self.treepos(tree)]
+
+    def treepos(self, tree):
+        """
+        Convert this pointer to a standard 'tree position' pointer,
+        given that it points to the given tree.
+        """
+        if tree is None: raise ValueError('Parse tree not available')
+        stack = [tree]
+        treepos = []
+
+        wordnum = 0
+        while True:
+            #print treepos
+            #print stack[-1]
+            # tree node:
+            if isinstance(stack[-1], Tree):
+                # Select the next child.
+                if len(treepos) < len(stack):
+                    treepos.append(0)
+                else:
+                    treepos[-1] += 1
+                # Update the stack.
+                if treepos[-1] < len(stack[-1]):
+                    stack.append(stack[-1][treepos[-1]])
+                else:
+                    # End of node's child list: pop up a level.
+                    stack.pop()
+                    treepos.pop()
+            # word node:
+            else:
+                if wordnum == self.wordnum:
+                    return tuple(treepos[:len(treepos)-self.height-1])
+                else:
+                    wordnum += 1
+                    stack.pop()
+
+@compat.python_2_unicode_compatible
+class PropbankInflection(object):
+    #{ Inflection Form
+    INFINITIVE = 'i'
+    GERUND = 'g'
+    PARTICIPLE = 'p'
+    FINITE = 'v'
+    #{ Inflection Tense
+    FUTURE = 'f'
+    PAST = 'p'
+    PRESENT = 'n'
+    #{ Inflection Aspect
+    PERFECT = 'p'
+    PROGRESSIVE = 'o'
+    PERFECT_AND_PROGRESSIVE = 'b'
+    #{ Inflection Person
+    THIRD_PERSON = '3'
+    #{ Inflection Voice
+    ACTIVE = 'a'
+    PASSIVE = 'p'
+    #{ Inflection
+    NONE = '-'
+    #}
+
+    def __init__(self, form='-', tense='-', aspect='-', person='-', voice='-'):
+        self.form = form
+        self.tense = tense
+        self.aspect = aspect
+        self.person = person
+        self.voice = voice
+
+    def __str__(self):
+        return self.form+self.tense+self.aspect+self.person+self.voice
+
+    def __repr__(self):
+        return '<PropbankInflection: %s>' % self
+
+    _VALIDATE = re.compile(r'[igpv\-][fpn\-][pob\-][3\-][ap\-]$')
+
+    @staticmethod
+    def parse(s):
+        if not isinstance(s, string_types):
+            raise TypeError('expected a string')
+        if (len(s) != 5 or
+            not PropbankInflection._VALIDATE.match(s)):
+            raise ValueError('Bad propbank inflection string %r' % s)
+        return PropbankInflection(*s)
diff --git a/nlp_resource_data/nltk/corpus/reader/propbank.pyc b/nlp_resource_data/nltk/corpus/reader/propbank.pyc
new file mode 100755 (executable)
index 0000000..52302d3
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/propbank.pyc differ
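Illustrative usage of the PropBank reader -- a sketch assuming the `propbank` corpus data is installed; the roleset id 'turn.01' is assumed to exist in the frameset files:

    from nltk.corpus import propbank

    # One annotation instance: file, sentence/word position, and its roleset.
    inst = propbank.instances()[0]
    print(inst.fileid, inst.sentnum, inst.wordnum, inst.roleset)
    print(inst.predicate, inst.arguments[:2])
    # Frameset lookup returns an ElementTree element describing the argument roles.
    roleset = propbank.roleset('turn.01')
    for role in roleset.findall('roles/role'):
        print(role.attrib['n'], role.attrib['descr'])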
diff --git a/nlp_resource_data/nltk/corpus/reader/pros_cons.py b/nlp_resource_data/nltk/corpus/reader/pros_cons.py
new file mode 100755 (executable)
index 0000000..61e904e
--- /dev/null
@@ -0,0 +1,128 @@
+# Natural Language Toolkit: Pros and Cons Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+CorpusReader for the Pros and Cons dataset.
+
+- Pros and Cons dataset information -
+
+Contact: Bing Liu, liub@cs.uic.edu
+        http://www.cs.uic.edu/~liub
+
+Distributed with permission.
+
+Related papers:
+
+- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
+    Proceedings of the 22nd International Conference on Computational Linguistics
+    (Coling-2008), Manchester, 18-22 August, 2008.
+
+- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
+    Opinions on the Web". Proceedings of the 14th international World Wide Web
+    conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
+"""
+import re
+
+from six import string_types
+
+from nltk.corpus.reader.api import *
+from nltk.tokenize import *
+
+
+class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
+    """
+    Reader for the Pros and Cons sentence dataset.
+
+        >>> from nltk.corpus import pros_cons
+        >>> pros_cons.sents(categories='Cons')
+        [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
+        'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
+        ...]
+        >>> pros_cons.words('IntegratedPros.txt')
+        ['Easy', 'to', 'use', ',', 'economical', '!', ...]
+    """
+    CorpusView = StreamBackedCorpusView
+
+    def __init__(self, root, fileids, word_tokenizer=WordPunctTokenizer(),
+                 encoding='utf8', **kwargs):
+        """
+        :param root: The root directory for the corpus.
+        :param fileids: a list or regexp specifying the fileids in the corpus.
+        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
+            into words. Default: `WordPunctTokenizer`
+        :param encoding: the encoding that should be used to read the corpus.
+        :param kwargs: additional parameters passed to CategorizedCorpusReader.
+        """
+
+        CorpusReader.__init__(self, root, fileids, encoding)
+        CategorizedCorpusReader.__init__(self, kwargs)
+        self._word_tokenizer = word_tokenizer
+
+    def sents(self, fileids=None, categories=None):
+        """
+        Return all sentences in the corpus or in the specified files/categories.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            sentences have to be returned.
+        :param categories: a list specifying the categories whose sentences
+            have to be returned.
+        :return: the given file(s) as a list of sentences. Each sentence is
+            tokenized using the specified word_tokenizer.
+        :rtype: list(list(str))
+        """
+        fileids = self._resolve(fileids, categories)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
+            for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+    def words(self, fileids=None, categories=None):
+        """
+        Return all words and punctuation symbols in the corpus or in the specified
+        files/categories.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            words have to be returned.
+        :param categories: a list specifying the categories whose words have
+            to be returned.
+        :return: the given file(s) as a list of words and punctuation symbols.
+        :rtype: list(str)
+        """
+        fileids = self._resolve(fileids, categories)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+            for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+    def _read_sent_block(self, stream):
+        sents = []
+        for i in range(20): # Read 20 lines at a time.
+            line = stream.readline()
+            if not line:
+                continue
+            sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>", line)
+            if sent:
+                sents.append(self._word_tokenizer.tokenize(sent.group(2).strip()))
+        return sents
+
+    def _read_word_block(self, stream):
+        words = []
+        for sent in self._read_sent_block(stream):
+            words.extend(sent)
+        return words
+
+    def _resolve(self, fileids, categories):
+        if fileids is not None and categories is not None:
+            raise ValueError('Specify fileids or categories, not both')
+        if categories is not None:
+            return self.fileids(categories)
+        else:
+            return fileids
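
A minimal usage sketch for the ProsConsCorpusReader added above, assuming the pros_cons data package is installed and exposed as nltk.corpus.pros_cons (the fileid comes from the class doctest):

    from nltk.corpus import pros_cons

    # sents()/words() accept either fileids or categories, but not both (see _resolve).
    cons_sents = pros_cons.sents(categories='Cons')
    print(cons_sents[0])                               # first tokenized 'Cons' sentence
    print(pros_cons.words('IntegratedPros.txt')[:10])  # first ten tokens of the Pros file
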
diff --git a/nlp_resource_data/nltk/corpus/reader/pros_cons.pyc b/nlp_resource_data/nltk/corpus/reader/pros_cons.pyc
new file mode 100755 (executable)
index 0000000..a1daa86
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/pros_cons.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/reviews.py b/nlp_resource_data/nltk/corpus/reader/reviews.py
new file mode 100755 (executable)
index 0000000..1ce3d25
--- /dev/null
@@ -0,0 +1,330 @@
+# Natural Language Toolkit: Product Reviews Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+CorpusReader for reviews corpora (syntax based on Customer Review Corpus).
+
+- Customer Review Corpus information -
+Annotated by: Minqing Hu and Bing Liu, 2004.
+    Department of Computer Science
+    University of Illinois at Chicago
+
+Contact: Bing Liu, liub@cs.uic.edu
+        http://www.cs.uic.edu/~liub
+
+Distributed with permission.
+
+The "product_reviews_1" and "product_reviews_2" datasets respectively contain
+annotated customer reviews of 5 and 9 products from amazon.com.
+
+Related papers:
+
+- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
+    Proceedings of the ACM SIGKDD International Conference on Knowledge
+    Discovery & Data Mining (KDD-04), 2004.
+
+- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
+    Proceedings of the Nineteenth National Conference on Artificial Intelligence
+    (AAAI-2004), 2004.
+
+- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Approach to
+    Opinion Mining." Proceedings of First ACM International Conference on Web
+    Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
+    Stanford, California, USA.
+
+Symbols used in the annotated reviews:
+
+    [t] : the title of the review: Each [t] tag starts a review.
+    xxxx[+|-n]: xxxx is a product feature.
+    [+n]: Positive opinion; n is the opinion strength: 3 is strongest, 1 is weakest.
+          Note that the strength is quite subjective; you may want to ignore it
+          and consider only + and -.
+    [-n]: Negative opinion
+    ##  : start of each sentence. Each line is a sentence.
+    [u] : feature does not appear in the sentence.
+    [p] : feature does not appear in the sentence. Pronoun resolution is needed.
+    [s] : suggestion or recommendation.
+    [cc]: comparison with a competing product from a different brand.
+    [cs]: comparison with a competing product from the same brand.
+
+Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
+    provide separation between different reviews. This is due to the fact that
+    the dataset was specifically designed for aspect/feature-based sentiment
+    analysis, for which sentence-level annotation is sufficient. For document-
+    level classification and analysis, this peculiarity should be taken into
+    consideration.
+"""
+
+from __future__ import division
+
+from six import string_types
+
+import re
+
+from nltk.corpus.reader.api import *
+from nltk.tokenize import *
+
+TITLE = re.compile(r'^\[t\](.*)$') # [t] Title
+FEATURES = re.compile(r'((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]') # find 'feature' in feature[+3]
+NOTES = re.compile(r'\[(?!t)(p|u|s|cc|cs)\]') # find 'p' in camera[+2][p]
+SENT = re.compile(r'##(.*)$') # find tokenized sentence
+
+
+@compat.python_2_unicode_compatible
+class Review(object):
+    """
+    A Review is the main block of a ReviewsCorpusReader.
+    """
+    def __init__(self, title=None, review_lines=None):
+        """
+        :param title: the title of the review.
+        :param review_lines: the list of the ReviewLines that belong to the Review.
+        """
+        self.title = title
+        if review_lines is None:
+            self.review_lines = []
+        else:
+            self.review_lines = review_lines
+
+    def add_line(self, review_line):
+        """
+        Add a line (ReviewLine) to the review.
+
+        :param review_line: a ReviewLine instance that belongs to the Review.
+        """
+        assert isinstance(review_line, ReviewLine)
+        self.review_lines.append(review_line)
+
+    def features(self):
+        """
+        Return a list of features in the review. Each feature is a tuple made of
+        the specific item feature and the opinion strength about that feature.
+
+        :return: all features of the review as a list of tuples (feat, score).
+        :rtype: list(tuple)
+        """
+        features = []
+        for review_line in self.review_lines:
+            features.extend(review_line.features)
+        return features
+
+    def sents(self):
+        """
+        Return all tokenized sentences in the review.
+
+        :return: all sentences of the review as lists of tokens.
+        :rtype: list(list(str))
+        """
+        return [review_line.sent for review_line in self.review_lines]
+
+    def __repr__(self):
+        return 'Review(title=\"{}\", review_lines={})'.format(self.title, self.review_lines)
+
+
+@compat.python_2_unicode_compatible
+class ReviewLine(object):
+    """
+    A ReviewLine represents a sentence of the review, together with (optional)
+    annotations of its features and notes about the reviewed item.
+    """
+    def __init__(self, sent, features=None, notes=None):
+        self.sent = sent
+        if features is None:
+            self.features = []
+        else:
+            self.features = features
+
+        if notes is None:
+            self.notes = []
+        else:
+            self.notes = notes
+
+    def __repr__(self):
+        return ('ReviewLine(features={}, notes={}, sent={})'.format(
+            self.features, self.notes, self.sent))
+
+
+class ReviewsCorpusReader(CorpusReader):
+    """
+    Reader for the Customer Review Data dataset by Hu, Liu (2004).
+    Note: we are not applying any sentence tokenization at the moment, just word
+    tokenization.
+
+        >>> from nltk.corpus import product_reviews_1
+        >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
+        >>> review = camera_reviews[0]
+        >>> review.sents()[0]
+        ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
+        'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
+        >>> review.features()
+        [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
+        ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
+        ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
+        ('option', '+1')]
+
+    We can also reach the same information directly from the stream:
+
+        >>> product_reviews_1.features('Canon_G3.txt')
+        [('canon powershot g3', '+3'), ('use', '+2'), ...]
+
+    We can compute stats for specific product features:
+
+        >>> from __future__ import division
+        >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
+        >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
+        >>> # The __future__ division import above gives true division under Python 2.7
+        >>> mean = tot / n_reviews
+        >>> print(n_reviews, tot, mean)
+        15 24 1.6
+    """
+    CorpusView = StreamBackedCorpusView
+
+    def __init__(self, root, fileids, word_tokenizer=WordPunctTokenizer(),
+                 encoding='utf8'):
+        """
+        :param root: The root directory for the corpus.
+        :param fileids: a list or regexp specifying the fileids in the corpus.
+        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
+            into words. Default: `WordPunctTokenizer`
+        :param encoding: the encoding that should be used to read the corpus.
+        """
+
+        CorpusReader.__init__(self, root, fileids, encoding)
+        self._word_tokenizer = word_tokenizer
+
+    def features(self, fileids=None):
+        """
+        Return a list of features. Each feature is a tuple made of the specific
+        item feature and the opinion strength about that feature.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            features have to be returned.
+        :return: all features for the item(s) in the given file(s).
+        :rtype: list(tuple)
+        """
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.CorpusView(fileid, self._read_features, encoding=enc)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def raw(self, fileids=None):
+        """
+        :param fileids: a list or regexp specifying the fileids of the files that
+            have to be returned as a raw string.
+        :return: the given file(s) as a single string.
+        :rtype: str
+        """
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def readme(self):
+        """
+        Return the contents of the corpus README.txt file.
+        """
+        return self.open("README.txt").read()
+
+    def reviews(self, fileids=None):
+        """
+        Return all the reviews as a list of Review objects. If `fileids` is
+        specified, return all the reviews from each of the specified files.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            reviews have to be returned.
+        :return: the given file(s) as a list of reviews.
+        """
+        if fileids is None:
+            fileids = self._fileids
+        return concat([self.CorpusView(fileid, self._read_review_block, encoding=enc)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def sents(self, fileids=None):
+        """
+        Return all sentences in the corpus or in the specified files.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            sentences have to be returned.
+        :return: the given file(s) as a list of sentences, each encoded as a
+            list of word strings.
+        :rtype: list(list(str))
+        """
+        return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
+                       for (path, enc, fileid)
+                       in self.abspaths(fileids, True, True)])
+
+    def words(self, fileids=None):
+        """
+        Return all words and punctuation symbols in the corpus or in the specified
+        files.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            words have to be returned.
+        :return: the given file(s) as a list of words and punctuation symbols.
+        :rtype: list(str)
+        """
+        return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+                       for (path, enc, fileid)
+                       in self.abspaths(fileids, True, True)])
+
+    def _read_features(self, stream):
+        features = []
+        for i in range(20):
+            line = stream.readline()
+            if not line:
+                return features
+            features.extend(re.findall(FEATURES, line))
+        return features
+
+    def _read_review_block(self, stream):
+        while True:
+            line = stream.readline()
+            if not line:
+                return [] # end of file.
+            title_match = re.match(TITLE, line)
+            if title_match:
+                review = Review(title=title_match.group(1).strip()) # We create a new review
+                break
+
+        # Scan until we find another line matching the regexp, or EOF.
+        while True:
+            oldpos = stream.tell()
+            line = stream.readline()
+            # End of file:
+            if not line:
+                return [review]
+            # Start of a new review: backup to just before it starts, and
+            # return the review we've already collected.
+            if re.match(TITLE, line):
+                stream.seek(oldpos)
+                return [review]
+            # Anything else is part of the review line.
+            feats = re.findall(FEATURES, line)
+            notes = re.findall(NOTES, line)
+            sent = re.findall(SENT, line)
+            if sent:
+                sent = self._word_tokenizer.tokenize(sent[0])
+            review_line = ReviewLine(sent=sent, features=feats, notes=notes)
+            review.add_line(review_line)
+
+    def _read_sent_block(self, stream):
+        sents = []
+        for review in self._read_review_block(stream):
+            sents.extend([sent for sent in review.sents()])
+        return sents
+
+    def _read_word_block(self, stream):
+        words = []
+        for i in range(20): # Read 20 lines at a time.
+            line = stream.readline()
+            sent = re.findall(SENT, line)
+            if sent:
+                words.extend(self._word_tokenizer.tokenize(sent[0]))
+        return words
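
A short sketch of the Review/ReviewLine interface above, assuming the product_reviews_1 data package is installed; 'Canon_G3.txt' is the fileid used in the class doctest:

    from nltk.corpus import product_reviews_1

    review = product_reviews_1.reviews('Canon_G3.txt')[0]
    print(review.title)
    print(review.sents()[0])      # first tokenized sentence of the first review
    print(review.features()[:3])  # (feature, strength) tuples, e.g. ('use', '+2')

    # The same feature tuples can be streamed without building Review objects:
    print(product_reviews_1.features('Canon_G3.txt')[:3])
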
diff --git a/nlp_resource_data/nltk/corpus/reader/reviews.pyc b/nlp_resource_data/nltk/corpus/reader/reviews.pyc
new file mode 100755 (executable)
index 0000000..103b814
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/reviews.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/rte.py b/nlp_resource_data/nltk/corpus/reader/rte.py
new file mode 100755 (executable)
index 0000000..66c702d
--- /dev/null
@@ -0,0 +1,144 @@
+# Natural Language Toolkit: RTE Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author:  Ewan Klein <ewan@inf.ed.ac.uk>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora.
+
+The files were taken from the RTE1, RTE2 and RTE3 datasets and
+regularized.
+
+Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the
+gold standard annotated files.
+
+Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following
+example is taken from RTE3::
+
+ <pair id="1" entailment="YES" task="IE" length="short" >
+
+    <t>The sale was made to pay Yukos' US$ 27.5 billion tax bill,
+    Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known
+    company Baikalfinansgroup which was later bought by the Russian
+    state-owned oil company Rosneft .</t>
+
+   <h>Baikalfinansgroup was sold to Rosneft.</h>
+ </pair>
+
+In order to provide globally unique IDs for each pair, a new attribute
+``challenge`` has been added to the root element ``entailment-corpus`` of each
+file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
+challenge number and 'n' is the pair ID.
+"""
+from __future__ import unicode_literals
+
+from six import string_types
+
+from nltk import compat
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+from nltk.corpus.reader.xmldocs import *
+
+
+def norm(value_string):
+    """
+    Normalize the string value in an RTE pair's ``value`` or ``entailment``
+    attribute as an integer (1, 0).
+
+    :param value_string: the label used to classify a text/hypothesis pair
+    :type value_string: str
+    :rtype: int
+    """
+
+    valdict = {"TRUE": 1,
+                     "FALSE": 0,
+                     "YES": 1,
+                     "NO": 0}
+    return valdict[value_string.upper()]
+
+@compat.python_2_unicode_compatible
+class RTEPair(object):
+    """
+    Container for RTE text-hypothesis pairs.
+
+    The entailment relation is signalled by the ``value`` attribute in RTE1, and by
+    ``entailment`` in RTE2 and RTE3. These both get mapped on to the ``entailment``
+    attribute of this class.
+    """
+    def __init__(self, pair, challenge=None, id=None, text=None, hyp=None,
+             value=None, task=None, length=None):
+        """
+        :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
+        :param id: identifier for the pair
+        :param text: the text component of the pair
+        :param hyp: the hypothesis component of the pair
+        :param value: classification label for the pair
+        :param task: attribute for the particular NLP task that the data was drawn from
+        :param length: attribute for the length of the text of the pair
+        """
+        self.challenge =  challenge
+        self.id = pair.attrib["id"]
+        self.gid = "%s-%s" % (self.challenge, self.id)
+        self.text = pair[0].text
+        self.hyp = pair[1].text
+
+        if "value" in pair.attrib:
+            self.value = norm(pair.attrib["value"])
+        elif "entailment" in pair.attrib:
+            self.value = norm(pair.attrib["entailment"])
+        else:
+            self.value = value
+        if "task" in pair.attrib:
+            self.task = pair.attrib["task"]
+        else:
+            self.task = task
+        if "length" in pair.attrib:
+            self.length = pair.attrib["length"]
+        else:
+            self.length = length
+
+    def __repr__(self):
+        if self.challenge:
+            return '<RTEPair: gid=%s-%s>' % (self.challenge, self.id)
+        else:
+            return '<RTEPair: id=%s>' % self.id
+
+
+class RTECorpusReader(XMLCorpusReader):
+    """
+    Corpus reader for corpora in RTE challenges.
+
+    This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected
+    structure of input documents.
+    """
+
+    def _read_etree(self, doc):
+        """
+        Map the XML input into an RTEPair.
+
+        This uses the ``getiterator()`` method from the ElementTree package to
+        find all the ``<pair>`` elements.
+
+        :param doc: a parsed XML document
+        :rtype: list(RTEPair)
+        """
+        try:
+            challenge = doc.attrib['challenge']
+        except KeyError:
+            challenge = None
+        return [RTEPair(pair, challenge=challenge)
+                for pair in doc.getiterator("pair")]
+
+
+    def pairs(self, fileids):
+        """
+        Build a list of RTEPairs from an RTE corpus.
+
+        :param fileids: a list of RTE corpus fileids
+        :type fileids: list
+        :rtype: list(RTEPair)
+        """
+        if isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self._read_etree(self.xml(fileid)) for fileid in fileids])
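
A sketch of reading entailment pairs with the reader above, assuming the RTE data package is installed; the fileid is an assumption that follows the rte*_dev.xml naming scheme from the module docstring:

    from nltk.corpus import rte

    pairs = rte.pairs('rte3_dev.xml')   # a single fileid string is also accepted
    first = pairs[0]
    print(first.gid, first.value)       # globally unique id and 0/1 entailment label
    print(first.text)
    print(first.hyp)
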
diff --git a/nlp_resource_data/nltk/corpus/reader/rte.pyc b/nlp_resource_data/nltk/corpus/reader/rte.pyc
new file mode 100755 (executable)
index 0000000..fadecfc
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/rte.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/semcor.py b/nlp_resource_data/nltk/corpus/reader/semcor.py
new file mode 100755 (executable)
index 0000000..826439f
--- /dev/null
@@ -0,0 +1,256 @@
+# Natural Language Toolkit: SemCor Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Nathan Schneider <nschneid@cs.cmu.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Corpus reader for the SemCor Corpus.
+"""
+from __future__ import absolute_import, unicode_literals
+__docformat__ = 'epytext en'
+
+from nltk.corpus.reader.api import *
+from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
+from nltk.tree import Tree
+
+class SemcorCorpusReader(XMLCorpusReader):
+    """
+    Corpus reader for the SemCor Corpus.
+    For access to the complete XML data structure, use the ``xml()``
+    method.  For access to simple word lists and tagged word lists, use
+    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
+    """
+    def __init__(self, root, fileids, wordnet, lazy=True):
+        XMLCorpusReader.__init__(self, root, fileids)
+        self._lazy = lazy
+        self._wordnet = wordnet
+
+    def words(self, fileids=None):
+        """
+        :return: the given file(s) as a list of words and punctuation symbols.
+        :rtype: list(str)
+        """
+        return self._items(fileids, 'word', False, False, False)
+
+    def chunks(self, fileids=None):
+        """
+        :return: the given file(s) as a list of chunks,
+            each of which is a list of words and punctuation symbols
+            that form a unit.
+        :rtype: list(list(str))
+        """
+        return self._items(fileids, 'chunk', False, False, False)
+
+    def tagged_chunks(self, fileids=None, tag='pos'):
+        """
+        :return: the given file(s) as a list of tagged chunks, represented
+            in tree form.
+        :rtype: list(Tree)
+
+        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
+            to indicate the kind of tags to include.  Semantic tags consist of
+            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
+            without a specific entry in WordNet.  (Named entities of type 'other'
+            have no lemma.  Other chunks not in WordNet have no semantic tag.
+            Punctuation tokens have `None` for their part of speech tag.)
+        """
+        return self._items(fileids, 'chunk', False, tag!='sem', tag!='pos')
+
+    def sents(self, fileids=None):
+        """
+        :return: the given file(s) as a list of sentences, each encoded
+            as a list of word strings.
+        :rtype: list(list(str))
+        """
+        return self._items(fileids, 'word', True, False, False)
+
+    def chunk_sents(self, fileids=None):
+        """
+        :return: the given file(s) as a list of sentences, each encoded
+            as a list of chunks.
+        :rtype: list(list(list(str)))
+        """
+        return self._items(fileids, 'chunk', True, False, False)
+
+    def tagged_sents(self, fileids=None, tag='pos'):
+        """
+        :return: the given file(s) as a list of sentences. Each sentence
+            is represented as a list of tagged chunks (in tree form).
+        :rtype: list(list(Tree))
+
+        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
+            to indicate the kind of tags to include.  Semantic tags consist of
+            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
+            without a specific entry in WordNet.  (Named entities of type 'other'
+            have no lemma.  Other chunks not in WordNet have no semantic tag.
+            Punctuation tokens have `None` for their part of speech tag.)
+        """
+        return self._items(fileids, 'chunk', True, tag!='sem', tag!='pos')
+
+    def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
+        if unit=='word' and not bracket_sent:
+            # the result of the SemcorWordView may be a multiword unit, so the
+            # LazyConcatenation will make sure the sentence is flattened
+            _ = lambda *args: LazyConcatenation((SemcorWordView if self._lazy else self._words)(*args))
+        else:
+            _ = SemcorWordView if self._lazy else self._words
+        return concat([_(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
+                       for fileid in self.abspaths(fileids)])
+
+    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
+        """
+        Helper used to implement the view methods -- returns a list of
+        tokens, (segmented) words, chunks, or sentences. The tokens
+        and chunks may optionally be tagged (with POS and sense
+        information).
+
+        :param fileid: The name of the underlying file.
+        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
+        :param bracket_sent: If true, include sentence bracketing.
+        :param pos_tag: Whether to include part-of-speech tags.
+        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
+            and OOV named entity status.
+        """
+        assert unit in ('token', 'word', 'chunk')
+        result = []
+
+        xmldoc = ElementTree.parse(fileid).getroot()
+        for xmlsent in xmldoc.findall('.//s'):
+            sent = []
+            for xmlword in _all_xmlwords_in(xmlsent):
+                itm = SemcorCorpusReader._word(xmlword, unit, pos_tag, sem_tag, self._wordnet)
+                if unit=='word':
+                    sent.extend(itm)
+                else:
+                    sent.append(itm)
+
+            if bracket_sent:
+                result.append(SemcorSentence(xmlsent.attrib['snum'], sent))
+            else:
+                result.extend(sent)
+
+        assert None not in result
+        return result
+
+    @staticmethod
+    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
+        tkn = xmlword.text
+        if not tkn:
+            tkn = "" # fixes issue 337?
+
+        lemma = xmlword.get('lemma', tkn) # lemma or NE class
+        lexsn = xmlword.get('lexsn') # lex_sense (locator for the lemma's sense)
+        if lexsn is not None:
+            sense_key = lemma + '%' + lexsn
+            wnpos = ('n','v','a','r','s')[int(lexsn.split(':')[0])-1]   # see http://wordnet.princeton.edu/man/senseidx.5WN.html
+        else:
+            sense_key = wnpos = None
+        redef = xmlword.get('rdf', tkn)        # redefinition--this indicates the lookup string
+        # does not exactly match the enclosed string, e.g. due to typographical adjustments
+        # or discontinuity of a multiword expression. If a redefinition has occurred,
+        # the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
+        # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
+        sensenum = xmlword.get('wnsn')  # WordNet sense number
+        isOOVEntity = 'pn' in xmlword.keys()   # a "personal name" (NE) not in WordNet
+        pos = xmlword.get('pos')    # part of speech for the whole chunk (None for punctuation)
+
+        if unit=='token':
+            if not pos_tag and not sem_tag:
+                itm = tkn
+            else:
+                itm = (tkn,) + ((pos,) if pos_tag else ()) + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
+            return itm
+        else:
+            ww = tkn.split('_') # TODO: case where punctuation intervenes in MWE
+            if unit=='word':
+                return ww
+            else:
+                if sensenum is not None:
+                    try:
+                        sense = wordnet.lemma_from_key(sense_key)   # Lemma object
+                    except Exception:
+                        # cannot retrieve the wordnet.Lemma object. possible reasons:
+                        #  (a) the wordnet corpus is not downloaded;
+                        #  (b) a nonexistent sense is annotated: e.g., such.s.00 triggers:
+                        #  nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
+                        # solution: just use the lemma name as a string
+                        try:
+                            sense = '%s.%s.%02d' % (lemma, wnpos, int(sensenum))    # e.g.: reach.v.02
+                        except ValueError:
+                            sense = lemma+'.'+wnpos+'.'+sensenum  # e.g. the sense number may be "2;1"
+
+                bottom = [Tree(pos, ww)] if pos_tag else ww
+
+                if sem_tag and isOOVEntity:
+                    if sensenum is not None:
+                        return Tree(sense, [Tree('NE', bottom)])
+                    else:      # 'other' NE
+                        return Tree('NE', bottom)
+                elif sem_tag and sensenum is not None:
+                    return Tree(sense, bottom)
+                elif pos_tag:
+                    return bottom[0]
+                else:
+                    return bottom # chunk as a list
+
+def _all_xmlwords_in(elt, result=None):
+    if result is None: result = []
+    for child in elt:
+        if child.tag in ('wf', 'punc'): result.append(child)
+        else: _all_xmlwords_in(child, result)
+    return result
+
+class SemcorSentence(list):
+    """
+    A list of words, augmented by an attribute ``num`` used to record
+    the sentence identifier (the ``n`` attribute from the XML).
+    """
+    def __init__(self, num, items):
+        self.num = num
+        list.__init__(self, items)
+
+class SemcorWordView(XMLCorpusView):
+    """
+    A stream backed corpus view specialized for use with the SemCor corpus.
+    """
+    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
+        """
+        :param fileid: The name of the underlying file.
+        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
+        :param bracket_sent: If true, include sentence bracketing.
+        :param pos_tag: Whether to include part-of-speech tags.
+        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
+            and OOV named entity status.
+        """
+        if bracket_sent: tagspec = '.*/s'
+        else: tagspec = '.*/s/(punc|wf)'
+
+        self._unit = unit
+        self._sent = bracket_sent
+        self._pos_tag = pos_tag
+        self._sem_tag = sem_tag
+        self._wordnet = wordnet
+
+        XMLCorpusView.__init__(self, fileid, tagspec)
+
+    def handle_elt(self, elt, context):
+        if self._sent: return self.handle_sent(elt)
+        else: return self.handle_word(elt)
+
+    def handle_word(self, elt):
+        return SemcorCorpusReader._word(elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet)
+
+    def handle_sent(self, elt):
+        sent = []
+        for child in elt:
+            if child.tag in ('wf','punc'):
+                itm = self.handle_word(child)
+                if self._unit=='word':
+                    sent.extend(itm)
+                else:
+                    sent.append(itm)
+            else:
+                raise ValueError('Unexpected element %s' % child.tag)
+        return SemcorSentence(elt.attrib['snum'], sent)
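
A sketch of the SemCor interface above, assuming both the semcor and wordnet data packages are installed (the reader needs WordNet to resolve sense keys into Lemma objects):

    from nltk.corpus import semcor

    print(semcor.words()[:10])                 # flat list of word strings
    sent = semcor.tagged_sents(tag='sem')[0]   # chunks as Trees labelled with WordNet lemmas
    for chunk in sent[:5]:
        print(chunk)
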
diff --git a/nlp_resource_data/nltk/corpus/reader/semcor.pyc b/nlp_resource_data/nltk/corpus/reader/semcor.pyc
new file mode 100755 (executable)
index 0000000..4998693
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/semcor.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/senseval.py b/nlp_resource_data/nltk/corpus/reader/senseval.py
new file mode 100755 (executable)
index 0000000..e8a0f3e
--- /dev/null
@@ -0,0 +1,203 @@
+# Natural Language Toolkit: Senseval 2 Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
+#         Steven Bird <stevenbird1@gmail.com> (modifications)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Read from the Senseval 2 Corpus.
+
+SENSEVAL [http://www.senseval.org/]
+Evaluation exercises for Word Sense Disambiguation.
+Organized by ACL-SIGLEX [http://www.siglex.org/]
+
+Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
+http://www.d.umn.edu/~tpederse/data.html
+Distributed with permission.
+
+The NLTK version of the Senseval 2 files uses well-formed XML.
+Each instance of the ambiguous words "hard", "interest", "line", and "serve"
+is tagged with a sense identifier, and supplied with context.
+"""
+from __future__ import print_function, unicode_literals
+
+from six import string_types
+
+import re
+from xml.etree import ElementTree
+
+from nltk import compat
+from nltk.tokenize import *
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+@compat.python_2_unicode_compatible
+class SensevalInstance(object):
+    def __init__(self, word, position, context, senses):
+        self.word = word
+        self.senses = tuple(senses)
+        self.position = position
+        self.context = context
+
+    def __repr__(self):
+        return ('SensevalInstance(word=%r, position=%r, '
+                'context=%r, senses=%r)' %
+                (self.word, self.position, self.context, self.senses))
+
+
+class SensevalCorpusReader(CorpusReader):
+    def instances(self, fileids=None):
+        return concat([SensevalCorpusView(fileid, enc)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def raw(self, fileids=None):
+        """
+        :return: the text contents of the given fileids, as a single string.
+        """
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def _entry(self, tree):
+        elts = []
+        for lexelt in tree.findall('lexelt'):
+            for inst in lexelt.findall('instance'):
+                sense = inst[0].attrib['senseid']
+                context = [(w.text, w.attrib['pos'])
+                           for w in inst[1]]
+                elts.append( (sense, context) )
+        return elts
+
+
+class SensevalCorpusView(StreamBackedCorpusView):
+    def __init__(self, fileid, encoding):
+        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
+
+        self._word_tokenizer = WhitespaceTokenizer()
+        self._lexelt_starts = [0] # list of streampos
+        self._lexelts = [None] # list of lexelt names
+
+    def read_block(self, stream):
+        # Decide which lexical element we're in.
+        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell())-1
+        lexelt = self._lexelts[lexelt_num]
+
+        instance_lines = []
+        in_instance = False
+        while True:
+            line = stream.readline()
+            if line == '':
+                assert instance_lines == []
+                return []
+
+            # Start of a lexical element?
+            if line.lstrip().startswith('<lexelt'):
+                lexelt_num += 1
+                m = re.search('item=("[^"]+"|\'[^\']+\')', line)
+                assert m is not None # <lexelt> has no 'item=...'
+                lexelt = m.group(1)[1:-1]
+                if lexelt_num < len(self._lexelts):
+                    assert lexelt == self._lexelts[lexelt_num]
+                else:
+                    self._lexelts.append(lexelt)
+                    self._lexelt_starts.append(stream.tell())
+
+            # Start of an instance?
+            if line.lstrip().startswith('<instance'):
+                assert instance_lines == []
+                in_instance = True
+
+            # Body of an instance?
+            if in_instance:
+                instance_lines.append(line)
+
+            # End of an instance?
+            if line.lstrip().startswith('</instance'):
+                xml_block = '\n'.join(instance_lines)
+                xml_block = _fixXML(xml_block)
+                inst = ElementTree.fromstring(xml_block)
+                return [self._parse_instance(inst, lexelt)]
+
+    def _parse_instance(self, instance, lexelt):
+        senses = []
+        context = []
+        position = None
+        for child in instance:
+            if child.tag == 'answer':
+                senses.append(child.attrib['senseid'])
+            elif child.tag == 'context':
+                context += self._word_tokenizer.tokenize(child.text)
+                for cword in child:
+                    if cword.tag == 'compound':
+                        cword = cword[0] # is this ok to do?
+
+                    if cword.tag == 'head':
+                        # Some sanity checks:
+                        assert position is None, 'head specified twice'
+                        assert cword.text.strip() or len(cword)==1
+                        assert not (cword.text.strip() and len(cword)==1)
+                        # Record the position of the head:
+                        position = len(context)
+                        # Add the head word itself:
+                        if cword.text.strip():
+                            context.append(cword.text.strip())
+                        elif cword[0].tag == 'wf':
+                            context.append((cword[0].text,
+                                            cword[0].attrib['pos']))
+                            if cword[0].tail:
+                                context += self._word_tokenizer.tokenize(
+                                    cword[0].tail)
+                        else:
+                            assert False, 'expected CDATA or wf in <head>'
+                    elif cword.tag == 'wf':
+                        context.append((cword.text, cword.attrib['pos']))
+                    elif cword.tag == 's':
+                        pass # Sentence boundary marker.
+
+                    else:
+                        print('ACK', cword.tag)
+                        assert False, 'expected CDATA or <wf> or <head>'
+                    if cword.tail:
+                        context += self._word_tokenizer.tokenize(cword.tail)
+            else:
+                assert False, 'unexpected tag %s' % child.tag
+        return SensevalInstance(lexelt, position, context, senses)
+
+def _fixXML(text):
+    """
+    Fix the various issues with Senseval pseudo-XML.
+    """
+    # <~> or <^> => ~ or ^
+    text = re.sub(r'<([~\^])>', r'\1', text)
+    # fix lone &
+    text = re.sub(r'(\s+)\&(\s+)', r'\1&amp;\2', text)
+    # fix """
+    text = re.sub(r'"""', '\'"\'', text)
+    # fix <s snum=dd> => <s snum="dd"/>
+    text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
+    # fix foreign word tag
+    text = re.sub(r'<\&frasl>\s*<p[^>]*>', 'FRASL', text)
+    # remove <&I .>
+    text = re.sub(r'<\&I[^>]*>', '', text)
+    # fix <{word}>
+    text = re.sub(r'<{([^}]+)}>', r'\1', text)
+    # remove <@>, <p>, </p>
+    text = re.sub(r'<(@|/?p)>', r'', text)
+    # remove <&M .> and <&T .> and <&Ms .>
+    text = re.sub(r'<&\w+ \.>', r'', text)
+    # remove <!DOCTYPE... > lines
+    text = re.sub(r'<!DOCTYPE[^>]*>', r'', text)
+    # remove <[hi]> and <[/p]> etc
+    text = re.sub(r'<\[\/?[^>]+\]*>', r'', text)
+    # take the thing out of the brackets: <&hellip;>
+    text = re.sub(r'<(\&\w+;)>', r'\1', text)
+    # and remove the & for those patterns that aren't regular XML
+    text = re.sub(r'&(?!amp|gt|lt|apos|quot)', r'', text)
+    # fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
+    text = re.sub(r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>',
+                  r' <wf pos="\2">\1</wf>', text)
+    text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
+    return text
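
A sketch of the Senseval 2 reader above, assuming the senseval data package is installed; 'hard.pos' is assumed to be one of its fileids (one file per ambiguous word):

    from nltk.corpus import senseval

    inst = senseval.instances('hard.pos')[0]
    print(inst.word, inst.senses)         # lexical element and gold sense ids
    print(inst.context[inst.position])    # (token, pos) pair at the head position
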
diff --git a/nlp_resource_data/nltk/corpus/reader/senseval.pyc b/nlp_resource_data/nltk/corpus/reader/senseval.pyc
new file mode 100755 (executable)
index 0000000..500f9b5
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/senseval.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/sentiwordnet.py b/nlp_resource_data/nltk/corpus/reader/sentiwordnet.py
new file mode 100755 (executable)
index 0000000..afb398b
--- /dev/null
@@ -0,0 +1,137 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: SentiWordNet
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Christopher Potts <cgpotts@stanford.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+An NLTK interface for SentiWordNet
+
+SentiWordNet is a lexical resource for opinion mining.
+SentiWordNet assigns to each synset of WordNet three
+sentiment scores: positivity, negativity, and objectivity.
+
+For details about SentiWordNet see:
+http://sentiwordnet.isti.cnr.it/
+
+    >>> from nltk.corpus import sentiwordnet as swn
+    >>> print(swn.senti_synset('breakdown.n.03'))
+    <breakdown.n.03: PosScore=0.0 NegScore=0.25>
+    >>> list(swn.senti_synsets('slow'))
+    [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),
+    SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),
+    SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),
+    SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),
+    SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),
+    SentiSynset('behind.r.03')]
+    >>> happy = swn.senti_synsets('happy', 'a')
+    >>> happy0 = list(happy)[0]
+    >>> happy0.pos_score()
+    0.875
+    >>> happy0.neg_score()
+    0.0
+    >>> happy0.obj_score()
+    0.125
+"""
+
+import re
+from nltk.compat import python_2_unicode_compatible
+from nltk.corpus.reader import CorpusReader
+
+@python_2_unicode_compatible
+class SentiWordNetCorpusReader(CorpusReader):
+    def __init__(self, root, fileids, encoding='utf-8'):
+        """
+        Construct a new SentiWordNet Corpus Reader, using data from
+        the specified file.
+        """
+        super(SentiWordNetCorpusReader, self).__init__(root, fileids,
+                                                  encoding=encoding)
+        if len(self._fileids) != 1:
+            raise ValueError('Exactly one file must be specified')
+        self._db = {}
+        self._parse_src_file()
+
+    def _parse_src_file(self):
+        lines = self.open(self._fileids[0]).read().splitlines()
+        lines = filter((lambda x : not re.search(r"^\s*#", x)), lines)
+        for i, line in enumerate(lines):
+            fields = [field.strip() for field in re.split(r"\t+", line)]
+            try:            
+                pos, offset, pos_score, neg_score, synset_terms, gloss = fields
+            except:
+                raise ValueError('Line %s formatted incorrectly: %s\n' % (i, line))
+            if pos and offset:
+                offset = int(offset)
+                self._db[(pos, offset)] = (float(pos_score), float(neg_score))
+
+    def senti_synset(self, *vals):        
+        from nltk.corpus import wordnet as wn
+        if tuple(vals) in self._db:
+            pos_score, neg_score = self._db[tuple(vals)]
+            pos, offset = vals
+            if pos == 's':
+                pos = 'a'
+            synset = wn._synset_from_pos_and_offset(pos, offset)
+            return SentiSynset(pos_score, neg_score, synset)
+        else:
+            synset = wn.synset(vals[0])
+            pos = synset.pos()
+            if pos == 's':
+                pos = 'a'
+            offset = synset.offset()
+            if (pos, offset) in self._db:
+                pos_score, neg_score = self._db[(pos, offset)]
+                return SentiSynset(pos_score, neg_score, synset)
+            else:
+                return None
+
+    def senti_synsets(self, string, pos=None):
+        from nltk.corpus import wordnet as wn
+        sentis = []
+        synset_list = wn.synsets(string, pos)
+        for synset in synset_list:
+            sentis.append(self.senti_synset(synset.name()))
+        sentis = filter(lambda x : x, sentis)
+        return sentis
+
+    def all_senti_synsets(self):
+        from nltk.corpus import wordnet as wn
+        for key, fields in self._db.items():
+            pos, offset = key
+            pos_score, neg_score = fields
+            synset = wn._synset_from_pos_and_offset(pos, offset)
+            yield SentiSynset(pos_score, neg_score, synset)
+
+
+@python_2_unicode_compatible
+class SentiSynset(object):
+    def __init__(self, pos_score, neg_score, synset):
+        self._pos_score = pos_score
+        self._neg_score = neg_score
+        self._obj_score = 1.0 - (self._pos_score + self._neg_score)
+        self.synset = synset
+
+    def pos_score(self):
+        return self._pos_score
+
+    def neg_score(self):
+        return self._neg_score
+
+    def obj_score(self):
+        return self._obj_score
+
+    def __str__(self):
+        """Prints just the Pos/Neg scores for now."""
+        s = "<"
+        s += self.synset.name() + ": "
+        s += "PosScore=%s " % self._pos_score
+        s += "NegScore=%s" % self._neg_score
+        s += ">"
+        return s
+
+    def __repr__(self):
+        return "Senti" + repr(self.synset)
+                    
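
Beyond the doctest in the module docstring, senti_synset() also accepts a (pos, offset) pair, which looks up the internal score table directly; a small sketch, assuming the sentiwordnet and wordnet data packages are installed:

    from nltk.corpus import sentiwordnet as swn

    by_name = swn.senti_synset('breakdown.n.03')
    print(by_name.pos_score(), by_name.neg_score(), by_name.obj_score())

    # Equivalent lookup by part of speech and WordNet offset:
    by_key = swn.senti_synset('n', by_name.synset.offset())
    print(by_key)
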
diff --git a/nlp_resource_data/nltk/corpus/reader/sentiwordnet.pyc b/nlp_resource_data/nltk/corpus/reader/sentiwordnet.pyc
new file mode 100755 (executable)
index 0000000..cfbfcac
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/sentiwordnet.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/sinica_treebank.py b/nlp_resource_data/nltk/corpus/reader/sinica_treebank.py
new file mode 100755 (executable)
index 0000000..c63f7ad
--- /dev/null
@@ -0,0 +1,75 @@
+# Natural Language Toolkit: Sinica Treebank Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Sinica Treebank Corpus Sample
+
+http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm
+
+10,000 parsed sentences, drawn from the Academia Sinica Balanced
+Corpus of Modern Chinese.  Parse tree notation is based on
+Information-based Case Grammar.  Tagset documentation is available
+at http://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html
+
+Language and Knowledge Processing Group, Institute of Information
+Science, Academia Sinica
+
+It is distributed with the Natural Language Toolkit under the terms of
+the Creative Commons Attribution-NonCommercial-ShareAlike License
+[http://creativecommons.org/licenses/by-nc-sa/2.5/].
+
+References:
+
+Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
+The Construction of Sinica Treebank. Computational Linguistics and
+Chinese Language Processing, 4, pp 87-104.
+
+Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Zhao-Ming Gao, and
+Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
+Annotation Guidelines, and On-line Interface. Proceedings of 2nd
+Chinese Language Processing Workshop, Association for Computational
+Linguistics.
+
+Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
+Extraction, Proceedings of IJCNLP-04, pp560-565.
+"""
+
+import os
+import re
+
+from nltk.tree import sinica_parse
+from nltk.tag import map_tag
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+IDENTIFIER = re.compile(r'^#\S+\s')
+APPENDIX = re.compile(r'(?<=\))#.*$')
+TAGWORD = re.compile(r':([^:()|]+):([^:()|]+)')
+WORD = re.compile(r':[^:()|]+:([^:()|]+)')
+
+class SinicaTreebankCorpusReader(SyntaxCorpusReader):
+    """
+    Reader for the sinica treebank.
+    """
+    def _read_block(self, stream):
+        sent = stream.readline()
+        sent = IDENTIFIER.sub('', sent)
+        sent = APPENDIX.sub('', sent)
+        return [sent]
+
+    def _parse(self, sent):
+        return sinica_parse(sent)
+
+    def _tag(self, sent, tagset=None):
+        tagged_sent = [(w,t) for (t,w) in TAGWORD.findall(sent)]
+        if tagset and tagset != self._tagset:
+            tagged_sent = [(w, map_tag(self._tagset, tagset, t)) for (w,t) in tagged_sent]
+        return tagged_sent
+
+    def _word(self, sent):
+        return WORD.findall(sent)
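
A sketch of the reader above, assuming the sinica_treebank data package is installed; the public words()/tagged_words()/parsed_sents() methods come from the SyntaxCorpusReader base class pulled in by the api wildcard import:

    from nltk.corpus import sinica_treebank

    print(sinica_treebank.words()[:10])
    print(sinica_treebank.tagged_words()[:5])   # (word, tag) pairs extracted via TAGWORD
    print(sinica_treebank.parsed_sents()[0])    # Tree built by sinica_parse
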
diff --git a/nlp_resource_data/nltk/corpus/reader/sinica_treebank.pyc b/nlp_resource_data/nltk/corpus/reader/sinica_treebank.pyc
new file mode 100755 (executable)
index 0000000..57038c1
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/sinica_treebank.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/string_category.py b/nlp_resource_data/nltk/corpus/reader/string_category.py
new file mode 100755 (executable)
index 0000000..2afd080
--- /dev/null
@@ -0,0 +1,60 @@
+# Natural Language Toolkit: String Category Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Read tuples from a corpus consisting of categorized strings.
+For example, from the question classification corpus:
+
+NUM:dist How far is it from Denver to Aspen ?
+LOC:city What county is Modesto , California in ?
+HUM:desc Who was Galileo ?
+DESC:def What is an atom ?
+NUM:date When did Hawaii become a state ?
+"""
+
+# based on PPAttachmentCorpusReader
+from six import string_types
+
+from nltk import compat
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+# [xx] Should the order of the tuple be reversed -- in most other places
+# in nltk, we use the form (data, tag) -- e.g., tagged words and
+# labeled texts for classifiers.
+class StringCategoryCorpusReader(CorpusReader):
+    def __init__(self, root, fileids, delimiter=' ', encoding='utf8'):
+        """
+        :param root: The root directory for this corpus.
+        :param fileids: A list or regexp specifying the fileids in this corpus.
+        :param delimiter: Field delimiter
+        """
+        CorpusReader.__init__(self, root, fileids, encoding)
+        self._delimiter = delimiter
+
+    def tuples(self, fileids=None):
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([StreamBackedCorpusView(fileid, self._read_tuple_block,
+                                              encoding=enc)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def raw(self, fileids=None):
+        """
+        :return: the text contents of the given fileids, as a single string.
+        """
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def _read_tuple_block(self, stream):
+        line = stream.readline().strip()
+        if line:
+            return [tuple(line.split(self._delimiter, 1))]
+        else:
+            return []
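
A sketch of the tuple interface above; in standard NLTK installs this reader backs the question-classification corpus, assumed here to be exposed as nltk.corpus.qc with a 'train.txt' fileid:

    from nltk.corpus import qc   # question-classification corpus (assumed name)

    label, question = qc.tuples('train.txt')[0]
    print(label, '->', question)             # e.g. a NUM:dist label and its question
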
diff --git a/nlp_resource_data/nltk/corpus/reader/string_category.pyc b/nlp_resource_data/nltk/corpus/reader/string_category.pyc
new file mode 100755 (executable)
index 0000000..b9fdee2
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/string_category.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/switchboard.py b/nlp_resource_data/nltk/corpus/reader/switchboard.py
new file mode 100755 (executable)
index 0000000..f07e2f6
--- /dev/null
@@ -0,0 +1,119 @@
+# Natural Language Toolkit: Switchboard Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
+import re
+
+from nltk.tag import str2tuple, map_tag
+from nltk import compat
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+
+@compat.python_2_unicode_compatible
+class SwitchboardTurn(list):
+    """
+    A specialized list object used to encode switchboard utterances.
+    The elements of the list are the words in the utterance; and two
+    attributes, ``speaker`` and ``id``, are provided to retrieve the
+    speaker identifier and utterance id.  Note that utterance ids
+    are only unique within a given discourse.
+    """
+    def __init__(self, words, speaker, id):
+        list.__init__(self, words)
+        self.speaker = speaker
+        self.id = int(id)
+
+    def __repr__(self):
+        if len(self) == 0:
+            text = ''
+        elif isinstance(self[0], tuple):
+            text = ' '.join('%s/%s' % w for w in self)
+        else:
+            text = ' '.join(self)
+        return '<%s.%s: %r>' % (self.speaker, self.id, text)
+
+
+class SwitchboardCorpusReader(CorpusReader):
+    _FILES = ['tagged']
+    # Use the "tagged" file even for non-tagged data methods, since
+    # it's tokenized.
+
+    def __init__(self, root, tagset=None):
+        CorpusReader.__init__(self, root, self._FILES)
+        self._tagset = tagset
+
+    def words(self):
+        return StreamBackedCorpusView(self.abspath('tagged'),
+                                      self._words_block_reader)
+
+    def tagged_words(self, tagset=None):
+        def tagged_words_block_reader(stream):
+            return self._tagged_words_block_reader(stream, tagset)
+        return StreamBackedCorpusView(self.abspath('tagged'),
+                                      tagged_words_block_reader)
+
+    def turns(self):
+        return StreamBackedCorpusView(self.abspath('tagged'),
+                                      self._turns_block_reader)
+
+    def tagged_turns(self, tagset=None):
+        def tagged_turns_block_reader(stream):
+            return self._tagged_turns_block_reader(stream, tagset)
+        return StreamBackedCorpusView(self.abspath('tagged'),
+                                      tagged_turns_block_reader)
+
+    def discourses(self):
+        return StreamBackedCorpusView(self.abspath('tagged'),
+                                      self._discourses_block_reader)
+
+    def tagged_discourses(self, tagset=False):
+        def tagged_discourses_block_reader(stream):
+            return self._tagged_discourses_block_reader(stream, tagset)
+        return StreamBackedCorpusView(self.abspath('tagged'),
+                                      tagged_discourses_block_reader)
+
+    def _discourses_block_reader(self, stream):
+        # returns at most 1 discourse.  (The other methods depend on this.)
+        return [[self._parse_utterance(u, include_tag=False)
+                 for b in read_blankline_block(stream)
+                 for u in b.split('\n') if u.strip()]]
+
+    def _tagged_discourses_block_reader(self, stream, tagset=None):
+        # returns at most 1 discourse.  (The other methods depend on this.)
+        return [[self._parse_utterance(u, include_tag=True,
+                                       tagset=tagset)
+                 for b in read_blankline_block(stream)
+                 for u in b.split('\n') if u.strip()]]
+
+    def _turns_block_reader(self, stream):
+        return self._discourses_block_reader(stream)[0]
+
+    def _tagged_turns_block_reader(self, stream, tagset=None):
+        return self._tagged_discourses_block_reader(stream, tagset)[0]
+
+    def _words_block_reader(self, stream):
+        return sum(self._discourses_block_reader(stream)[0], [])
+
+    def _tagged_words_block_reader(self, stream, tagset=None):
+        return sum(self._tagged_discourses_block_reader(stream,
+                                                        tagset)[0], [])
+
+    _UTTERANCE_RE = re.compile(r'(\w+)\.(\d+):\s*(.*)')
+    _SEP = '/'
+    def _parse_utterance(self, utterance, include_tag, tagset=None):
+        m = self._UTTERANCE_RE.match(utterance)
+        if m is None:
+            raise ValueError('Bad utterance %r' % utterance)
+        speaker, id, text = m.groups()
+        words = [str2tuple(s, self._SEP) for s in text.split()]
+        if not include_tag:
+            words = [w for (w,t) in words]
+        elif tagset and tagset != self._tagset:
+            words = [(w, map_tag(self._tagset, tagset, t)) for (w,t) in words]
+        return SwitchboardTurn(words, speaker, id)
+
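+# A minimal usage sketch, not part of the reader: it illustrates how an
+# utterance line of the form "speaker.id: word/tag word/tag ..." is parsed.
+# The sample line below is hypothetical.
+if __name__ == '__main__':
+    import re
+    from nltk.tag import str2tuple
+    _sample = 'A.1: Hi/UH there/RB'
+    _speaker, _utt_id, _text = re.match(r'(\w+)\.(\d+)\:\s*(.*)', _sample).groups()
+    _words = [str2tuple(_t, '/') for _t in _text.split()]
+    print('%s %s %r' % (_speaker, _utt_id, _words))
+    # -> A 1 [('Hi', 'UH'), ('there', 'RB')]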
diff --git a/nlp_resource_data/nltk/corpus/reader/switchboard.pyc b/nlp_resource_data/nltk/corpus/reader/switchboard.pyc
new file mode 100755 (executable)
index 0000000..42731ae
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/switchboard.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/tagged.py b/nlp_resource_data/nltk/corpus/reader/tagged.py
new file mode 100755 (executable)
index 0000000..d7f563d
--- /dev/null
@@ -0,0 +1,295 @@
+# Natural Language Toolkit: Tagged Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+#         Jacob Perkins <japerk@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A reader for corpora whose documents contain part-of-speech-tagged words.
+"""
+
+import os
+
+from six import string_types
+
+from nltk.tag import str2tuple, map_tag
+from nltk.tokenize import *
+
+from nltk.corpus.reader.api import *
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.timit import read_timit_block
+
+class TaggedCorpusReader(CorpusReader):
+    """
+    Reader for simple part-of-speech tagged corpora.  Paragraphs are
+    assumed to be split using blank lines.  Sentences and words can be
+    tokenized using the default tokenizers, or by custom tokenizers
+    specified as parameters to the constructor.  Words are parsed
+    using ``nltk.tag.str2tuple``.  By default, ``'/'`` is used as the
+    separator.  I.e., words should have the form::
+
+       word1/tag1 word2/tag2 word3/tag3 ...
+
+    But custom separators may be specified as parameters to the
+    constructor.  Part of speech tags are case-normalized to upper
+    case.
+    """
+    def __init__(self, root, fileids,
+                 sep='/', word_tokenizer=WhitespaceTokenizer(),
+                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
+                 para_block_reader=read_blankline_block,
+                 encoding='utf8',
+                 tagset=None):
+        """
+        Construct a new Tagged Corpus reader for a set of documents
+        located at the given root directory.  Example usage:
+
+            >>> root = '/...path to corpus.../'
+            >>> reader = TaggedCorpusReader(root, r'.*\.txt') # doctest: +SKIP
+
+        :param root: The root directory for this corpus.
+        :param fileids: A list or regexp specifying the fileids in this corpus.
+        """
+        CorpusReader.__init__(self, root, fileids, encoding)
+        self._sep = sep
+        self._word_tokenizer = word_tokenizer
+        self._sent_tokenizer = sent_tokenizer
+        self._para_block_reader = para_block_reader
+        self._tagset = tagset
+
+    def raw(self, fileids=None):
+        """
+        :return: the given file(s) as a single string.
+        :rtype: str
+        """
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def words(self, fileids=None):
+        """
+        :return: the given file(s) as a list of words
+            and punctuation symbols.
+        :rtype: list(str)
+        """
+        return concat([TaggedCorpusView(fileid, enc,
+                                        False, False, False,
+                                        self._sep, self._word_tokenizer,
+                                        self._sent_tokenizer,
+                                        self._para_block_reader,
+                                        None)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def sents(self, fileids=None):
+        """
+        :return: the given file(s) as a list of
+            sentences or utterances, each encoded as a list of word
+            strings.
+        :rtype: list(list(str))
+        """
+        return concat([TaggedCorpusView(fileid, enc,
+                                        False, True, False,
+                                        self._sep, self._word_tokenizer,
+                                        self._sent_tokenizer,
+                                        self._para_block_reader,
+                                        None)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def paras(self, fileids=None):
+        """
+        :return: the given file(s) as a list of
+            paragraphs, each encoded as a list of sentences, which are
+            in turn encoded as lists of word strings.
+        :rtype: list(list(list(str)))
+        """
+        return concat([TaggedCorpusView(fileid, enc,
+                                        False, True, True,
+                                        self._sep, self._word_tokenizer,
+                                        self._sent_tokenizer,
+                                        self._para_block_reader,
+                                        None)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def tagged_words(self, fileids=None, tagset=None):
+        """
+        :return: the given file(s) as a list of tagged
+            words and punctuation symbols, encoded as tuples
+            ``(word,tag)``.
+        :rtype: list(tuple(str,str))
+        """
+        if tagset and tagset != self._tagset:
+            tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
+        else:
+            tag_mapping_function = None
+        return concat([TaggedCorpusView(fileid, enc,
+                                        True, False, False,
+                                        self._sep, self._word_tokenizer,
+                                        self._sent_tokenizer,
+                                        self._para_block_reader,
+                                        tag_mapping_function)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def tagged_sents(self, fileids=None, tagset=None):
+        """
+        :return: the given file(s) as a list of
+            sentences, each encoded as a list of ``(word,tag)`` tuples.
+
+        :rtype: list(list(tuple(str,str)))
+        """
+        if tagset and tagset != self._tagset:
+            tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
+        else:
+            tag_mapping_function = None
+        return concat([TaggedCorpusView(fileid, enc,
+                                        True, True, False,
+                                        self._sep, self._word_tokenizer,
+                                        self._sent_tokenizer,
+                                        self._para_block_reader,
+                                        tag_mapping_function)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def tagged_paras(self, fileids=None, tagset=None):
+        """
+        :return: the given file(s) as a list of
+            paragraphs, each encoded as a list of sentences, which are
+            in turn encoded as lists of ``(word,tag)`` tuples.
+        :rtype: list(list(list(tuple(str,str))))
+        """
+        if tagset and tagset != self._tagset:
+            tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
+        else:
+            tag_mapping_function = None
+        return concat([TaggedCorpusView(fileid, enc,
+                                        True, True, True,
+                                        self._sep, self._word_tokenizer,
+                                        self._sent_tokenizer,
+                                        self._para_block_reader,
+                                        tag_mapping_function)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+class CategorizedTaggedCorpusReader(CategorizedCorpusReader,
+                                    TaggedCorpusReader):
+    """
+    A reader for part-of-speech tagged corpora whose documents are
+    divided into categories based on their file identifiers.
+    """
+    def __init__(self, *args, **kwargs):
+        """
+        Initialize the corpus reader.  Categorization arguments
+        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
+        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
+        are passed to the ``TaggedCorpusReader``.
+        """
+        CategorizedCorpusReader.__init__(self, kwargs)
+        TaggedCorpusReader.__init__(self, *args, **kwargs)
+
+    def _resolve(self, fileids, categories):
+        if fileids is not None and categories is not None:
+            raise ValueError('Specify fileids or categories, not both')
+        if categories is not None:
+            return self.fileids(categories)
+        else:
+            return fileids
+    def raw(self, fileids=None, categories=None):
+        return TaggedCorpusReader.raw(
+            self, self._resolve(fileids, categories))
+    def words(self, fileids=None, categories=None):
+        return TaggedCorpusReader.words(
+            self, self._resolve(fileids, categories))
+    def sents(self, fileids=None, categories=None):
+        return TaggedCorpusReader.sents(
+            self, self._resolve(fileids, categories))
+    def paras(self, fileids=None, categories=None):
+        return TaggedCorpusReader.paras(
+            self, self._resolve(fileids, categories))
+    def tagged_words(self, fileids=None, categories=None, tagset=None):
+        return TaggedCorpusReader.tagged_words(
+            self, self._resolve(fileids, categories), tagset)
+    def tagged_sents(self, fileids=None, categories=None, tagset=None):
+        return TaggedCorpusReader.tagged_sents(
+            self, self._resolve(fileids, categories), tagset)
+    def tagged_paras(self, fileids=None, categories=None, tagset=None):
+        return TaggedCorpusReader.tagged_paras(
+            self, self._resolve(fileids, categories), tagset)
+
+class TaggedCorpusView(StreamBackedCorpusView):
+    """
+    A specialized corpus view for tagged documents.  It can be
+    customized via flags to divide the tagged corpus documents up by
+    sentence or paragraph, and to include or omit part of speech tags.
+    ``TaggedCorpusView`` objects are typically created by
+    ``TaggedCorpusReader`` (not directly by nltk users).
+    """
+    def __init__(self, corpus_file, encoding, tagged, group_by_sent,
+                 group_by_para, sep, word_tokenizer, sent_tokenizer,
+                 para_block_reader, tag_mapping_function=None):
+        self._tagged = tagged
+        self._group_by_sent = group_by_sent
+        self._group_by_para = group_by_para
+        self._sep = sep
+        self._word_tokenizer = word_tokenizer
+        self._sent_tokenizer = sent_tokenizer
+        self._para_block_reader = para_block_reader
+        self._tag_mapping_function = tag_mapping_function
+        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
+
+    def read_block(self, stream):
+        """Reads one paragraph at a time."""
+        block = []
+        for para_str in self._para_block_reader(stream):
+            para = []
+            for sent_str in self._sent_tokenizer.tokenize(para_str):
+                sent = [str2tuple(s, self._sep) for s in
+                        self._word_tokenizer.tokenize(sent_str)]
+                if self._tag_mapping_function:
+                    sent = [(w, self._tag_mapping_function(t)) for (w,t) in sent]
+                if not self._tagged:
+                    sent = [w for (w,t) in sent]
+                if self._group_by_sent:
+                    para.append(sent)
+                else:
+                    para.extend(sent)
+            if self._group_by_para:
+                block.append(para)
+            else:
+                block.extend(para)
+        return block
+
+# needs to implement simplified tags
+class MacMorphoCorpusReader(TaggedCorpusReader):
+    """
+    A corpus reader for the MAC_MORPHO corpus.  Each line contains a
+    single tagged word, using '_' as a separator.  Sentence boundaries
+    are based on the end-sentence tag ('_.').  Paragraph information
+    is not included in the corpus, so each paragraph returned by
+    ``self.paras()`` and ``self.tagged_paras()`` contains a single
+    sentence.
+    """
+    def __init__(self, root, fileids, encoding='utf8', tagset=None):
+        TaggedCorpusReader.__init__(
+            self, root, fileids, sep='_',
+            word_tokenizer=LineTokenizer(),
+            sent_tokenizer=RegexpTokenizer('.*\n'),
+            para_block_reader=self._read_block,
+            encoding=encoding,
+            tagset=tagset)
+
+    def _read_block(self, stream):
+        return read_regexp_block(stream, r'.*', r'.*_\.')
+
+class TimitTaggedCorpusReader(TaggedCorpusReader):
+    """
+    A corpus reader for tagged sentences that are included in the TIMIT corpus.
+    """
+    def __init__(self, *args, **kwargs):
+        TaggedCorpusReader.__init__(
+            self, para_block_reader=read_timit_block, *args, **kwargs)
+
+    def paras(self):
+        raise NotImplementedError('use sents() instead')
+
+    def tagged_paras(self):
+        raise NotImplementedError('use tagged_sents() instead')
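+
+
+# A minimal usage sketch, not part of the module API.  It exercises the
+# "word1/tag1 word2/tag2 ..." format described in the TaggedCorpusReader
+# docstring; the directory, file name and contents below are hypothetical.
+if __name__ == '__main__':
+    import tempfile
+    _root = tempfile.mkdtemp()
+    with open(os.path.join(_root, 'sample.pos'), 'w') as _f:
+        _f.write('The/AT dog/NN barked/VBD ./.\n')
+    _reader = TaggedCorpusReader(_root, r'.*\.pos')
+    print(_reader.words())         # ['The', 'dog', 'barked', '.']
+    print(_reader.tagged_sents())  # [[('The', 'AT'), ('dog', 'NN'), ...]]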
diff --git a/nlp_resource_data/nltk/corpus/reader/tagged.pyc b/nlp_resource_data/nltk/corpus/reader/tagged.pyc
new file mode 100755 (executable)
index 0000000..4253455
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/tagged.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/timit.py b/nlp_resource_data/nltk/corpus/reader/timit.py
new file mode 100755 (executable)
index 0000000..b8346df
--- /dev/null
@@ -0,0 +1,452 @@
+# Natural Language Toolkit: TIMIT Corpus Reader
+#
+# Copyright (C) 2001-2007 NLTK Project
+# Author: Haejoong Lee <haejoong@ldc.upenn.edu>
+#         Steven Bird <stevenbird1@gmail.com>
+#         Jacob Perkins <japerk@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+# [xx] this docstring is out-of-date:
+"""
+Read tokens, phonemes and audio data from the NLTK TIMIT Corpus.
+
+This corpus contains a selected portion of the TIMIT corpus.
+
+ - 16 speakers from 8 dialect regions
+ - 1 male and 1 female from each dialect region
+ - a total of 130 sentences (10 sentences per speaker; note that some
+   sentences are shared among speakers; in particular, sa1 and sa2 are
+   spoken by all speakers)
+ - a total of 160 recordings of sentences (10 recordings per speaker)
+ - audio format: NIST Sphere, single channel, 16kHz sampling,
+   16 bit samples, PCM encoding
+
+
+Module contents
+===============
+
+The timit corpus reader provides 4 functions and 4 data items.
+
+ - utterances
+
+   List of utterances in the corpus.  There are a total of 160 utterances,
+   each of which corresponds to a unique utterance of a speaker.
+   Here's an example of an utterance identifier in the list::
+
+       dr1-fvmh0/sx206
+         - _----  _---
+         | |  |   | |
+         | |  |   | |
+         | |  |   | `--- sentence number
+         | |  |   `----- sentence type (a:all, i:shared, x:exclusive)
+         | |  `--------- speaker ID
+         | `------------ sex (m:male, f:female)
+         `-------------- dialect region (1..8)
+
+ - speakers
+
+   List of speaker IDs.  An example of speaker ID::
+
+       dr1-fvmh0
+
+   Note that if you split an item ID on '/' and take the first element of
+   the result, you will get a speaker ID.
+
+       >>> itemid = 'dr1-fvmh0/sx206'
+       >>> spkrid , sentid = itemid.split('/')
+       >>> spkrid
+       'dr1-fvmh0'
+
+   The second element of the result is a sentence ID.
+
+ - dictionary()
+
+   Phonetic dictionary of words contained in this corpus.  This is a Python
+   dictionary from words to phoneme lists.
+
+ - spkrinfo()
+
+   Speaker information table.  It's a Python dictionary from speaker IDs to
+   records of 10 fields.  Speaker IDs are the same as the ones in timit.speakers.
+   Each record is a dictionary from field names to values, and the fields are
+   as follows::
+
+     id         speaker ID as defined in the original TIMIT speaker info table
+     sex        speaker gender (M:male, F:female)
+     dr         speaker dialect region (1:new england, 2:northern,
+                3:north midland, 4:south midland, 5:southern, 6:new york city,
+                7:western, 8:army brat (moved around))
+     use        corpus type (TRN:training, TST:test)
+                in this sample corpus only TRN is available
+     recdate    recording date
+     birthdate  speaker birth date
+     ht         speaker height
+     race       speaker race (WHT:white, BLK:black, AMR:american indian,
+                SPN:spanish-american, ORN:oriental, ???:unknown)
+     edu        speaker education level (HS:high school, AS:associate degree,
+                BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA),
+                PHD:doctorate degree (PhD, JD, MD), ??:unknown)
+     comments   comments by the recorder
+
+The 4 functions are as follows.
+
+ - tokenized(sentences=items, offset=False)
+
+   Given a list of items, returns an iterator of a list of word lists,
+   each of which corresponds to an item (sentence).  If offset is set to True,
+   each element of the word list is a tuple of word(string), start offset and
+   end offset, where offset is represented as a number of 16kHz samples.
+
+ - phonetic(sentences=items, offset=False)
+
+   Given a list of items, returns an iterator of a list of phoneme lists,
+   each of which corresponds to an item (sentence).  If offset is set to True,
+   each element of the phoneme list is a tuple of word(string), start offset
+   and end offset, where offset is represented as a number of 16kHz samples.
+
+ - audiodata(item, start=0, end=None)
+
+   Given an item, returns a chunk of audio samples formatted into a string.
+   When the function is called, if start and end are omitted, all of the
+   samples of the recording will be returned.  If only end is omitted,
+   samples from the start offset to the end of the recording will be returned.
+
+ - play(data)
+
+   Play the given audio samples. The audio samples can be obtained from the
+   timit.audiodata function.
+
+"""
+from __future__ import print_function, unicode_literals
+
+import sys
+import os
+import re
+import tempfile
+import time
+
+from six import string_types
+
+from nltk import compat
+from nltk.tree import Tree
+from nltk.internals import import_from_stdlib
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+class TimitCorpusReader(CorpusReader):
+    """
+    Reader for the TIMIT corpus (or any other corpus with the same
+    file layout and use of file formats).  The corpus root directory
+    should contain the following files:
+
+      - timitdic.txt: dictionary of standard transcriptions
+      - spkrinfo.txt: table of speaker information
+
+    In addition, the root directory should contain one subdirectory
+    for each speaker, containing three files for each utterance:
+
+      - <utterance-id>.txt: text content of utterances
+      - <utterance-id>.wrd: tokenized text content of utterances
+      - <utterance-id>.phn: phonetic transcription of utterances
+      - <utterance-id>.wav: utterance sound file
+    """
+
+    _FILE_RE = (r'(\w+-\w+/\w+\.(phn|txt|wav|wrd))|' +
+                r'timitdic\.txt|spkrinfo\.txt')
+    """A regexp matching fileids that are used by this corpus reader."""
+    _UTTERANCE_RE = r'\w+-\w+/\w+\.txt'
+
+    def __init__(self, root, encoding='utf8'):
+        """
+        Construct a new TIMIT corpus reader in the given directory.
+        :param root: The root directory for this corpus.
+        """
+        # Ensure that wave files don't get treated as unicode data:
+        if isinstance(encoding, string_types):
+            encoding = [('.*\.wav', None), ('.*', encoding)]
+
+        CorpusReader.__init__(self, root,
+                              find_corpus_fileids(root, self._FILE_RE),
+                              encoding=encoding)
+
+        self._utterances = [name[:-4] for name in
+                            find_corpus_fileids(root, self._UTTERANCE_RE)]
+        """A list of the utterance identifiers for all utterances in
+        this corpus."""
+
+        self._speakerinfo = None
+        self._root = root
+        self.speakers = sorted(set(u.split('/')[0] for u in self._utterances))
+
+    def fileids(self, filetype=None):
+        """
+        Return a list of file identifiers for the files that make up
+        this corpus.
+
+        :param filetype: If specified, then ``filetype`` indicates that
+            only the files that have the given type should be
+            returned.  Accepted values are: ``txt``, ``wrd``, ``phn``,
+            ``wav``, or ``metadata``.
+        """
+        if filetype is None:
+            return CorpusReader.fileids(self)
+        elif filetype in ('txt', 'wrd', 'phn', 'wav'):
+            return ['%s.%s' % (u, filetype) for u in self._utterances]
+        elif filetype == 'metadata':
+            return ['timitdic.txt', 'spkrinfo.txt']
+        else:
+            raise ValueError('Bad value for filetype: %r' % filetype)
+
+    def utteranceids(self, dialect=None, sex=None, spkrid=None,
+                   sent_type=None, sentid=None):
+        """
+        :return: A list of the utterance identifiers for all
+        utterances in this corpus, or for the given speaker, dialect
+        region, gender, sentence type, or sentence number, if
+        specified.
+        """
+        if isinstance(dialect, string_types): dialect = [dialect]
+        if isinstance(sex, string_types): sex = [sex]
+        if isinstance(spkrid, string_types): spkrid = [spkrid]
+        if isinstance(sent_type, string_types): sent_type = [sent_type]
+        if isinstance(sentid, string_types): sentid = [sentid]
+
+        utterances = self._utterances[:]
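+        # Utterance ids have the form 'dr1-fvmh0/sx206': u[2] is the dialect
+        # region digit, u[4] the sex letter, u[:9] the speaker id, u[10:] the
+        # sentence id, and u[11] the sentence type letter.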
+        if dialect is not None:
+            utterances = [u for u in utterances if u[2] in dialect]
+        if sex is not None:
+            utterances = [u for u in utterances if u[4] in sex]
+        if spkrid is not None:
+            utterances = [u for u in utterances if u[:9] in spkrid]
+        if sent_type is not None:
+            utterances = [u for u in utterances if u[11] in sent_type]
+        if sentid is not None:
+            utterances = [u for u in utterances if u[10:] in sentid]
+        return utterances
+
+    def transcription_dict(self):
+        """
+        :return: A dictionary giving the 'standard' transcription for
+        each word.
+        """
+        _transcriptions = {}
+        for line in self.open('timitdic.txt'):
+            if not line.strip() or line[0] == ';': continue
+            m = re.match(r'\s*(\S+)\s+/(.*)/\s*$', line)
+            if not m: raise ValueError('Bad line: %r' % line)
+            _transcriptions[m.group(1)] = m.group(2).split()
+        return _transcriptions
+
+    def spkrid(self, utterance):
+        return utterance.split('/')[0]
+
+    def sentid(self, utterance):
+        return utterance.split('/')[1]
+
+    def utterance(self, spkrid, sentid):
+        return '%s/%s' % (spkrid, sentid)
+
+    def spkrutteranceids(self, speaker):
+        """
+        :return: A list of all utterances associated with a given
+        speaker.
+        """
+        return [utterance for utterance in self._utterances
+                if utterance.startswith(speaker+'/')]
+
+    def spkrinfo(self, speaker):
+        """
+        :return: A ``SpeakerInfo`` record for the given speaker.
+        """
+        if speaker in self._utterances:
+            speaker = self.spkrid(speaker)
+
+        if self._speakerinfo is None:
+            self._speakerinfo = {}
+            for line in self.open('spkrinfo.txt'):
+                if not line.strip() or line[0] == ';': continue
+                rec = line.strip().split(None, 9)
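+                # e.g. rec[0]='VMH0', rec[1]='F', rec[2]='1' produces
+                # the key 'dr1-fvmh0'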
+                key = "dr%s-%s%s" % (rec[2],rec[1].lower(),rec[0].lower())
+                self._speakerinfo[key] = SpeakerInfo(*rec)
+
+        return self._speakerinfo[speaker]
+
+    def phones(self, utterances=None):
+        return [line.split()[-1]
+                for fileid in self._utterance_fileids(utterances, '.phn')
+                for line in self.open(fileid) if line.strip()]
+
+    def phone_times(self, utterances=None):
+        """
+        offset is represented as a number of 16kHz samples!
+        """
+        return [(line.split()[2], int(line.split()[0]), int(line.split()[1]))
+                for fileid in self._utterance_fileids(utterances, '.phn')
+                for line in self.open(fileid) if line.strip()]
+
+    def words(self, utterances=None):
+        return [line.split()[-1]
+                for fileid in self._utterance_fileids(utterances, '.wrd')
+                for line in self.open(fileid) if line.strip()]
+
+    def word_times(self, utterances=None):
+        return [(line.split()[2], int(line.split()[0]), int(line.split()[1]))
+                for fileid in self._utterance_fileids(utterances, '.wrd')
+                for line in self.open(fileid) if line.strip()]
+
+    def sents(self, utterances=None):
+        return [[line.split()[-1]
+                 for line in self.open(fileid) if line.strip()]
+                for fileid in self._utterance_fileids(utterances, '.wrd')]
+
+    def sent_times(self, utterances=None):
+        return [(line.split(None,2)[-1].strip(),
+                 int(line.split()[0]), int(line.split()[1]))
+                for fileid in self._utterance_fileids(utterances, '.txt')
+                for line in self.open(fileid) if line.strip()]
+
+    def phone_trees(self, utterances=None):
+        if utterances is None: utterances = self._utterances
+        if isinstance(utterances, string_types): utterances = [utterances]
+
+        trees = []
+        for utterance in utterances:
+            word_times = self.word_times(utterance)
+            phone_times = self.phone_times(utterance)
+            sent_times = self.sent_times(utterance)
+
+            while sent_times:
+                (sent, sent_start, sent_end) = sent_times.pop(0)
+                trees.append(Tree('S', []))
+                while (word_times and phone_times and
+                       phone_times[0][2] <= word_times[0][1]):
+                    trees[-1].append(phone_times.pop(0)[0])
+                while word_times and word_times[0][2] <= sent_end:
+                    (word, word_start, word_end) = word_times.pop(0)
+                    trees[-1].append(Tree(word, []))
+                    while phone_times and phone_times[0][2] <= word_end:
+                        trees[-1][-1].append(phone_times.pop(0)[0])
+                while phone_times and phone_times[0][2] <= sent_end:
+                    trees[-1].append(phone_times.pop(0)[0])
+        return trees
+
+    # [xx] NOTE: This is currently broken -- we're assuming that the
+    # fileids are WAV fileids (aka RIFF), but they're actually NIST SPHERE
+    # fileids.
+    def wav(self, utterance, start=0, end=None):
+        # nltk.chunk conflicts with the stdlib module 'chunk'
+        wave = import_from_stdlib('wave')
+
+        w = wave.open(self.open(utterance+'.wav'), 'rb')
+
+        if end is None:
+            end = w.getnframes()
+
+        # Skip past frames before start, then read the frames we want
+        w.readframes(start)
+        frames = w.readframes(end-start)
+
+        # Open a new temporary file -- the wave module requires
+        # an actual file, and won't work w/ stringio. :(
+        tf = tempfile.TemporaryFile()
+        out = wave.open(tf, 'w')
+
+        # Write the parameters & data to the new file.
+        out.setparams(w.getparams())
+        out.writeframes(frames)
+        out.close()
+
+        # Read the data back from the file, and return it.  The
+        # file will automatically be deleted when we return.
+        tf.seek(0)
+        return tf.read()
+
+    def audiodata(self, utterance, start=0, end=None):
+        assert(end is None or end > start)
+        headersize = 44
+        if end is None:
+            data = self.open(utterance+'.wav').read()
+        else:
+            data = self.open(utterance+'.wav').read(headersize+end*2)
+        return data[headersize+start*2:]
+
+    def _utterance_fileids(self, utterances, extension):
+        if utterances is None: utterances = self._utterances
+        if isinstance(utterances, string_types): utterances = [utterances]
+        return ['%s%s' % (u, extension) for u in utterances]
+
+    def play(self, utterance, start=0, end=None):
+        """
+        Play the given audio sample.
+
+        :param utterance: The utterance id of the sample to play
+        """
+        # Method 1: os audio dev.
+        try:
+            import ossaudiodev
+            try:
+                dsp = ossaudiodev.open('w')
+                dsp.setfmt(ossaudiodev.AFMT_S16_LE)
+                dsp.channels(1)
+                dsp.speed(16000)
+                dsp.write(self.audiodata(utterance, start, end))
+                dsp.close()
+            except IOError as e:
+                print(("can't acquire the audio device; please "
+                                     "activate your audio device."), file=sys.stderr)
+                print("system error message:", str(e), file=sys.stderr)
+            return
+        except ImportError:
+            pass
+
+        # Method 2: pygame
+        try:
+            # FIXME: this won't work under python 3
+            import pygame.mixer, StringIO
+            pygame.mixer.init(16000)
+            f = StringIO.StringIO(self.wav(utterance, start, end))
+            pygame.mixer.Sound(f).play()
+            while pygame.mixer.get_busy():
+                time.sleep(0.01)
+            return
+        except ImportError:
+            pass
+
+        # Method 3: complain. :)
+        print(("you must install pygame or ossaudiodev "
+                             "for audio playback."), file=sys.stderr)
+
+
+@compat.python_2_unicode_compatible
+class SpeakerInfo(object):
+    def __init__(self, id, sex, dr, use, recdate, birthdate,
+                 ht, race, edu, comments=None):
+        self.id = id
+        self.sex = sex
+        self.dr = dr
+        self.use = use
+        self.recdate = recdate
+        self.birthdate = birthdate
+        self.ht = ht
+        self.race = race
+        self.edu = edu
+        self.comments = comments
+
+    def __repr__(self):
+        attribs = 'id sex dr use recdate birthdate ht race edu comments'
+        args = ['%s=%r' % (attr, getattr(self, attr))
+                for attr in attribs.split()]
+        return 'SpeakerInfo(%s)' % (', '.join(args))
+
+
+def read_timit_block(stream):
+    """
+    Block reader for timit tagged sentences, which are preceded by a sentence
+    number that will be ignored.
+    """
+    line = stream.readline()
+    if not line: return []
+    n, sent = line.split(' ', 1)
+    return [sent]
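+
+
+# A minimal sketch, not part of the reader, of the offset arithmetic used by
+# ``TimitCorpusReader.audiodata()``: a 44-byte header is skipped and each
+# 16kHz sample occupies 2 bytes, so sample offsets map to byte offsets as
+# shown below.  The sample numbers are illustrative only.
+if __name__ == '__main__':
+    _headersize = 44
+    _start, _end = 8000, 16000              # samples: 0.5s .. 1.0s at 16kHz
+    print(_headersize + _start * 2,         # first byte of the slice: 16044
+          _headersize + _end * 2)           # end byte (exclusive): 32044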
diff --git a/nlp_resource_data/nltk/corpus/reader/timit.pyc b/nlp_resource_data/nltk/corpus/reader/timit.pyc
new file mode 100755 (executable)
index 0000000..fe95108
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/timit.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/toolbox.py b/nlp_resource_data/nltk/corpus/reader/toolbox.py
new file mode 100755 (executable)
index 0000000..169ed02
--- /dev/null
@@ -0,0 +1,68 @@
+# Natural Language Toolkit: Toolbox Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Greg Aumann <greg_aumann@sil.org>
+#         Stuart Robinson <Stuart.Robinson@mpi.nl>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Module for reading, writing and manipulating
+Toolbox databases and settings fileids.
+"""
+
+import os
+import re
+import codecs
+
+from six import string_types
+
+from nltk.toolbox import ToolboxData
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+class ToolboxCorpusReader(CorpusReader):
+    def xml(self, fileids, key=None):
+        return concat([ToolboxData(path, enc).parse(key=key)
+                       for (path, enc) in self.abspaths(fileids, True)])
+
+    def fields(self, fileids, strip=True, unwrap=True, encoding='utf8',
+               errors='strict', unicode_fields=None):
+        return concat([list(ToolboxData(fileid,enc).fields(
+                             strip, unwrap, encoding, errors, unicode_fields))
+                       for (fileid, enc)
+                       in self.abspaths(fileids, include_encoding=True)])
+
+    # should probably be done lazily:
+    def entries(self, fileids, **kwargs):
+        if 'key' in kwargs:
+            key = kwargs['key']
+            del kwargs['key']
+        else:
+            key = 'lx'  # the default key in MDF
+        entries = []
+        for marker, contents in self.fields(fileids, **kwargs):
+            if marker == key:
+                entries.append((contents, []))
+            else:
+                try:
+                    entries[-1][-1].append((marker, contents))
+                except IndexError:
+                    pass
+        return entries
+
+    def words(self, fileids, key='lx'):
+        return [contents for marker, contents in self.fields(fileids) if marker == key]
+
+    def raw(self, fileids):
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+
+def demo():
+    pass
+
+if __name__ == '__main__':
+    demo()
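+
+
+# A minimal sketch, not part of the reader, of how ``entries()`` groups a flat
+# sequence of (marker, contents) fields into (headword, fields) records keyed
+# on the 'lx' marker.  The field list below is hypothetical.
+if __name__ == '__main__':
+    _fields = [('lx', 'kaa'), ('ps', 'N'), ('ge', 'tree'),
+               ('lx', 'kaakaa'), ('ge', 'cockatoo')]
+    _entries = []
+    for _marker, _contents in _fields:
+        if _marker == 'lx':
+            _entries.append((_contents, []))
+        else:
+            _entries[-1][-1].append((_marker, _contents))
+    print(_entries)
+    # [('kaa', [('ps', 'N'), ('ge', 'tree')]), ('kaakaa', [('ge', 'cockatoo')])]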
diff --git a/nlp_resource_data/nltk/corpus/reader/toolbox.pyc b/nlp_resource_data/nltk/corpus/reader/toolbox.pyc
new file mode 100755 (executable)
index 0000000..582c6cd
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/toolbox.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/twitter.py b/nlp_resource_data/nltk/corpus/reader/twitter.py
new file mode 100755 (executable)
index 0000000..5b48dcf
--- /dev/null
@@ -0,0 +1,155 @@
+# Natural Language Toolkit: Twitter Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A reader for corpora that consist of Tweets. It is assumed that the Tweets
+have been serialised into line-delimited JSON.
+"""
+
+import json
+import os
+
+from six import string_types
+
+from nltk.tokenize import TweetTokenizer
+
+from nltk.corpus.reader.util import StreamBackedCorpusView, concat, ZipFilePathPointer
+from nltk.corpus.reader.api import CorpusReader
+
+
+class TwitterCorpusReader(CorpusReader):
+    """
+    Reader for corpora that consist of Tweets represented as line-delimited JSON.
+
+    Individual Tweets can be tokenized using the default tokenizer, or by a
+    custom tokenizer specified as a parameter to the constructor.
+
+    Construct a new Tweet corpus reader for a set of documents
+    located at the given root directory.
+
+    If you made your own tweet collection in a directory called
+    `twitter-files`, then you can initialise the reader as::
+
+        from nltk.corpus import TwitterCorpusReader
+        reader = TwitterCorpusReader('/path/to/twitter-files', '.*\.json')
+
+    However, the recommended approach is to set the relevant directory as the
+    value of the environmental variable `TWITTER`, and then invoke the reader
+    as follows::
+
+       root = os.environ['TWITTER']
+       reader = TwitterCorpusReader(root, '.*\.json')
+
+    If you want to work directly with the raw Tweets, the `json` library can
+    be used::
+
+       import json
+       for tweet in reader.docs():
+           print(json.dumps(tweet, indent=1, sort_keys=True))
+
+    """
+
+    CorpusView = StreamBackedCorpusView
+    """
+    The corpus view class used by this reader.
+    """
+
+    def __init__(self, root, fileids=None,
+                 word_tokenizer=TweetTokenizer(),
+                 encoding='utf8'):
+        """
+
+        :param root: The root directory for this corpus.
+
+        :param fileids: A list or regexp specifying the fileids in this corpus.
+
+        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
+        smaller units, including but not limited to words.
+
+        """
+        CorpusReader.__init__(self, root, fileids, encoding)
+
+        # Check that all user-created corpus files are non-empty.
+        for path in self.abspaths(self._fileids):
+            if isinstance(path, ZipFilePathPointer):
+                pass
+            elif os.path.getsize(path) == 0:
+                raise ValueError("File {} is empty".format(path))
+
+        self._word_tokenizer = word_tokenizer
+
+
+
+    def docs(self, fileids=None):
+        """
+        Returns the full Tweet objects, as specified by `Twitter
+        documentation on Tweets
+        <https://dev.twitter.com/docs/platform-objects/tweets>`_
+
+        :return: the given file(s) as a list of dictionaries deserialised
+        from JSON.
+        :rtype: list(dict)
+        """
+        return concat([self.CorpusView(path, self._read_tweets, encoding=enc)
+                       for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+
+    def strings(self, fileids=None):
+        """
+        Returns only the text content of Tweets in the file(s)
+
+        :return: the given file(s) as a list of Tweets.
+        :rtype: list(str)
+        """
+        fulltweets = self.docs(fileids)
+        tweets = []
+        for jsono in fulltweets:
+            try:
+                text = jsono['text']
+                if isinstance(text, bytes):
+                    text = text.decode(self.encoding)
+                tweets.append(text)
+            except KeyError:
+                pass
+        return tweets
+
+
+    def tokenized(self, fileids=None):
+        """
+        :return: the given file(s) as a list of the text content of Tweets,
+            each encoded as a list of words, screen names, hashtags, URLs and
+            punctuation symbols.
+
+        :rtype: list(list(str))
+        """
+        tweets = self.strings(fileids)
+        tokenizer = self._word_tokenizer
+        return [tokenizer.tokenize(t) for t in tweets]
+
+
+    def raw(self, fileids=None):
+        """
+        Return the corpora in their raw form.
+        """
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+
+    def _read_tweets(self, stream):
+        """
+        Assumes that each line in ``stream`` is a JSON-serialised object.
+        """
+        tweets = []
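+        # Read at most 10 Tweets per block; the corpus view calls this block
+        # reader repeatedly, so every line in the stream is eventually read.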
+        for i in range(10):
+            line = stream.readline()
+            if not line:
+                return tweets
+            tweet = json.loads(line)
+            tweets.append(tweet)
+        return tweets
diff --git a/nlp_resource_data/nltk/corpus/reader/twitter.pyc b/nlp_resource_data/nltk/corpus/reader/twitter.pyc
new file mode 100755 (executable)
index 0000000..15a6534
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/twitter.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/udhr.py b/nlp_resource_data/nltk/corpus/reader/udhr.py
new file mode 100755 (executable)
index 0000000..523c521
--- /dev/null
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+"""
+UDHR corpus reader. It mostly deals with encodings.
+"""
+from __future__ import absolute_import, unicode_literals
+
+from nltk.corpus.reader.util import find_corpus_fileids
+from nltk.corpus.reader.plaintext import PlaintextCorpusReader
+
+class UdhrCorpusReader(PlaintextCorpusReader):
+
+    ENCODINGS = [
+        ('.*-Latin1$', 'latin-1'),
+        ('.*-Hebrew$', 'hebrew'),
+        ('.*-Arabic$', 'cp1256'),
+        ('Czech_Cesky-UTF8', 'cp1250'), # despite the -UTF8 suffix, this file is cp1250
+        ('.*-Cyrillic$', 'cyrillic'),
+        ('.*-SJIS$', 'SJIS'),
+        ('.*-GB2312$', 'GB2312'),
+        ('.*-Latin2$', 'ISO-8859-2'),
+        ('.*-Greek$', 'greek'),
+        ('.*-UTF8$', 'utf-8'),
+
+        ('Hungarian_Magyar-Unicode', 'utf-16-le'),
+        ('Amahuaca', 'latin1'),
+        ('Turkish_Turkce-Turkish', 'latin5'),
+        ('Lithuanian_Lietuviskai-Baltic', 'latin4'),
+        ('Japanese_Nihongo-EUC', 'EUC-JP'),
+        ('Japanese_Nihongo-JIS', 'iso2022_jp'),
+        ('Chinese_Mandarin-HZ', 'hz'),
+        ('Abkhaz\-Cyrillic\+Abkh', 'cp1251'),
+    ]
+
+    SKIP = set([
+        # The following files are not fully decodable because they
+        # were truncated at wrong bytes:
+        'Burmese_Myanmar-UTF8',
+        'Japanese_Nihongo-JIS',
+        'Chinese_Mandarin-HZ',
+        'Chinese_Mandarin-UTF8',
+        'Gujarati-UTF8',
+        'Hungarian_Magyar-Unicode',
+        'Lao-UTF8',
+        'Magahi-UTF8',
+        'Marathi-UTF8',
+        'Tamil-UTF8',
+
+        # Unfortunately, encodings required for reading
+        # the following files are not supported by Python:
+        'Vietnamese-VPS',
+        'Vietnamese-VIQR',
+        'Vietnamese-TCVN',
+        'Magahi-Agra',
+        'Bhojpuri-Agra',
+        'Esperanto-T61', # latin3 raises an exception
+
+        # The following files are encoded for specific fonts:
+        'Burmese_Myanmar-WinResearcher',
+        'Armenian-DallakHelv',
+        'Tigrinya_Tigrigna-VG2Main',
+        'Amharic-Afenegus6..60375', # ?
+        'Navaho_Dine-Navajo-Navaho-font',
+
+        # What are these?
+        'Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117',
+        'Azeri_Azerbaijani_Latin-Az.Times.Lat0117',
+
+        # The following files are unintended:
+        'Czech-Latin2-err',
+        'Russian_Russky-UTF8~',
+    ])
+
+
+    def __init__(self, root='udhr'):
+        fileids = find_corpus_fileids(root, r'(?!README|\.).*')
+        super(UdhrCorpusReader, self).__init__(
+            root,
+            [fileid for fileid in fileids if fileid not in self.SKIP],
+            encoding=self.ENCODINGS
+        )
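+
+
+# A minimal sketch, not the library's implementation, of the first-match rule
+# that a (regexp, encoding) list such as ENCODINGS implies: the first pattern
+# matching a fileid determines the encoding used for it.  The fileids below
+# are illustrative.
+if __name__ == '__main__':
+    import re
+
+    def _pick_encoding(fileid, table=UdhrCorpusReader.ENCODINGS):
+        for _pattern, _enc in table:
+            if re.match(_pattern, fileid):
+                return _enc
+        return None
+
+    print(_pick_encoding('French_Francais-Latin1'))  # latin-1
+    print(_pick_encoding('Japanese_Nihongo-EUC'))    # EUC-JP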
diff --git a/nlp_resource_data/nltk/corpus/reader/udhr.pyc b/nlp_resource_data/nltk/corpus/reader/udhr.pyc
new file mode 100755 (executable)
index 0000000..026b536
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/udhr.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/util.py b/nlp_resource_data/nltk/corpus/reader/util.py
new file mode 100755 (executable)
index 0000000..cf44eb9
--- /dev/null
@@ -0,0 +1,804 @@
+# Natural Language Toolkit: Corpus Reader Utilities
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+import os
+import bisect
+import re
+import tempfile
+from six import string_types, text_type
+from functools import reduce
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
+# Use the c version of ElementTree, which is faster, if possible:
+try: from xml.etree import cElementTree as ElementTree
+except ImportError: from xml.etree import ElementTree
+
+from nltk.tokenize import wordpunct_tokenize
+from nltk.internals import slice_bounds
+from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
+from nltk.data import SeekableUnicodeStreamReader
+from nltk.util import AbstractLazySequence, LazySubsequence, LazyConcatenation, py25
+
+######################################################################
+#{ Corpus View
+######################################################################
+
+class StreamBackedCorpusView(AbstractLazySequence):
+    """
+    A 'view' of a corpus file, which acts like a sequence of tokens:
+    it can be accessed by index, iterated over, etc.  However, the
+    tokens are only constructed as-needed -- the entire corpus is
+    never stored in memory at once.
+
+    The constructor to ``StreamBackedCorpusView`` takes two arguments:
+    a corpus fileid (specified as a string or as a ``PathPointer``);
+    and a block reader.  A "block reader" is a function that reads
+    zero or more tokens from a stream, and returns them as a list.  A
+    very simple example of a block reader is:
+
+        >>> def simple_block_reader(stream):
+        ...     return stream.readline().split()
+
+    This simple block reader reads a single line at a time, and
+    returns a single token (consisting of a string) for each
+    whitespace-separated substring on the line.
+
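+    A hypothetical usage sketch (``'corpus.txt'`` is an assumed file name and
+    is not provided by this module):
+
+        >>> view = StreamBackedCorpusView('corpus.txt',
+        ...                               simple_block_reader) # doctest: +SKIP
+        >>> view[0] # doctest: +SKIP
+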
+    When deciding how to define the block reader for a given
+    corpus, careful consideration should be given to the size of
+    blocks handled by the block reader.  Smaller block sizes will
+    increase the memory requirements of the corpus view's internal
+    data structures (by 2 integers per block).  On the other hand,
+    larger block sizes may decrease performance for random access to
+    the corpus.  (But note that larger block sizes will *not*
+    decrease performance for iteration.)
+
+    Internally, ``CorpusView`` maintains a partial mapping from token
+    index to file position, with one entry per block.  When a token
+    with a given index *i* is requested, the ``CorpusView`` constructs
+    it as follows:
+
+      1. First, it searches the toknum/filepos mapping for the token
+         index closest to (but less than or equal to) *i*.
+
+      2. Then, starting at the file position corresponding to that
+         index, it reads one block at a time using the block reader
+         until it reaches the requested token.
+
+    The toknum/filepos mapping is created lazily: it is initially
+    empty, but every time a new block is read, the block's
+    initial token is added to the mapping.  (Thus, the toknum/filepos
+    map has one entry per block.)
+
+    In order to increase efficiency for random access patterns that
+    have high degrees of locality, the corpus view may cache one or
+    more blocks.
+
+    :note: Each ``CorpusView`` object internally maintains an open file
+        object for its underlying corpus file.  This file should be
+        automatically closed when the ``CorpusView`` is garbage collected,
+        but if you wish to close it manually, use the ``close()``
+        method.  If you access a ``CorpusView``'s items after it has been
+        closed, the file object will be automatically re-opened.
+
+    :warning: If the contents of the file are modified during the
+        lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior
+        is undefined.
+
+    :warning: If a unicode encoding is specified when constructing a
+        ``CorpusView``, then the block reader may only call
+        ``stream.seek()`` with offsets that have been returned by
+        ``stream.tell()``; in particular, calling ``stream.seek()`` with
+        relative offsets, or with offsets based on string lengths, may
+        lead to incorrect behavior.
+
+    :ivar _block_reader: The function used to read
+        a single block from the underlying file stream.
+    :ivar _toknum: A list containing the token index of each block
+        that has been processed.  In particular, ``_toknum[i]`` is the
+        token index of the first token in block ``i``.  Together
+        with ``_filepos``, this forms a partial mapping between token
+        indices and file positions.
+    :ivar _filepos: A list containing the file position of each block
+        that has been processed.  In particular, ``_filepos[i]`` is the
+        file position of the first character in block ``i``.  Together
+        with ``_toknum``, this forms a partial mapping between token
+        indices and file positions.
+    :ivar _stream: The stream used to access the underlying corpus file.
+    :ivar _len: The total number of tokens in the corpus, if known;
+        or None, if the number of tokens is not yet known.
+    :ivar _eofpos: The character position of the last character in the
+        file.  This is calculated when the corpus view is initialized,
+        and is used to decide when the end of file has been reached.
+    :ivar _cache: A cache of the most recently read block.  It
+       is encoded as a tuple (start_toknum, end_toknum, tokens), where
+       start_toknum is the token index of the first token in the block;
+       end_toknum is the token index of the first token not in the
+       block; and tokens is a list of the tokens in the block.
+    """
+    def __init__(self, fileid, block_reader=None, startpos=0,
+                 encoding='utf8'):
+        """
+        Create a new corpus view, based on the file ``fileid``, and
+        read with ``block_reader``.  See the class documentation
+        for more information.
+
+        :param fileid: The path to the file that is read by this
+            corpus view.  ``fileid`` can either be a string or a
+            ``PathPointer``.
+
+        :param startpos: The file position at which the view will
+            start reading.  This can be used to skip over preface
+            sections.
+
+        :param encoding: The unicode encoding that should be used to
+            read the file's contents.  If no encoding is specified,
+            then the file's contents will be read as a non-unicode
+            string (i.e., a str).
+        """
+        if block_reader:
+            self.read_block = block_reader
+        # Initialize our toknum/filepos mapping.
+        self._toknum = [0]
+        self._filepos = [startpos]
+        self._encoding = encoding
+        # We don't know our length (number of tokens) yet.
+        self._len = None
+
+        self._fileid = fileid
+        self._stream = None
+
+        self._current_toknum = None
+        """This variable is set to the index of the next token that
+           will be read, immediately before ``self.read_block()`` is
+           called.  This is provided for the benefit of the block
+           reader, which under rare circumstances may need to know
+           the current token number."""
+
+        self._current_blocknum = None
+        """This variable is set to the index of the next block that
+           will be read, immediately before ``self.read_block()`` is
+           called.  This is provided for the benefit of the block
+           reader, which under rare circumstances may need to know
+           the current block number."""
+
+        # Find the length of the file.
+        try:
+            if isinstance(self._fileid, PathPointer):
+                self._eofpos = self._fileid.file_size()
+            else:
+                self._eofpos = os.stat(self._fileid).st_size
+        except Exception as exc:
+            raise ValueError('Unable to open or access %r -- %s' %
+                             (fileid, exc))
+
+        # Maintain a cache of the most recently read block, to
+        # increase efficiency of random access.
+        self._cache = (-1, -1, None)
+
+    fileid = property(lambda self: self._fileid, doc="""
+        The fileid of the file that is accessed by this view.
+
+        :type: str or PathPointer""")
+
+    def read_block(self, stream):
+        """
+        Read a block from the input stream.
+
+        :return: a block of tokens from the input stream
+        :rtype: list(any)
+        :param stream: an input stream
+        :type stream: stream
+        """
+        raise NotImplementedError('Abstract Method')
+
+    def _open(self):
+        """
+        Open the file stream associated with this corpus view.  This
+        will be done automatically if any value is read from the view
+        while its file stream is closed.
+        """
+        if isinstance(self._fileid, PathPointer):
+            self._stream = self._fileid.open(self._encoding)
+        elif self._encoding:
+            self._stream = SeekableUnicodeStreamReader(
+                open(self._fileid, 'rb'), self._encoding)
+        else:
+            self._stream = open(self._fileid, 'rb')
+
+    def close(self):
+        """
+        Close the file stream associated with this corpus view.  This
+        can be useful if you are worried about running out of file
+        handles (although the stream should automatically be closed
+        upon garbage collection of the corpus view).  If the corpus
+        view is accessed after it is closed, it will be automatically
+        re-opened.
+        """
+        if self._stream is not None:
+            self._stream.close()
+        self._stream = None
+
+    def __len__(self):
+        if self._len is None:
+            # iterate_from() sets self._len when it reaches the end
+            # of the file:
+            for tok in self.iterate_from(self._toknum[-1]): pass
+        return self._len
+
+    def __getitem__(self, i):
+        if isinstance(i, slice):
+            start, stop = slice_bounds(self, i)
+            # Check if it's in the cache.
+            offset = self._cache[0]
+            if offset <= start and stop <= self._cache[1]:
+                return self._cache[2][start-offset:stop-offset]
+            # Construct & return the result.
+            return LazySubsequence(self, start, stop)
+        else:
+            # Handle negative indices
+            if i < 0: i += len(self)
+            if i < 0: raise IndexError('index out of range')
+            # Check if it's in the cache.
+            offset = self._cache[0]
+            if offset <= i < self._cache[1]:
+                return self._cache[2][i-offset]
+            # Use iterate_from to extract it.
+            try:
+                return next(self.iterate_from(i))
+            except StopIteration:
+                raise IndexError('index out of range')
+
+    # If we wanted to be thread-safe, then this method would need to
+    # do some locking.
+    def iterate_from(self, start_tok):
+        # Start by feeding from the cache, if possible.
+        if self._cache[0] <= start_tok < self._cache[1]:
+            for tok in self._cache[2][start_tok-self._cache[0]:]:
+                yield tok
+                start_tok += 1
+
+        # Decide where in the file we should start.  If `start` is in
+        # our mapping, then we can jump straight to the correct block;
+        # otherwise, start at the last block we've processed.
+        if start_tok < self._toknum[-1]:
+            block_index = bisect.bisect_right(self._toknum, start_tok)-1
+            toknum = self._toknum[block_index]
+            filepos = self._filepos[block_index]
+        else:
+            block_index = len(self._toknum)-1
+            toknum = self._toknum[-1]
+            filepos = self._filepos[-1]
+
+        # Open the stream, if it's not open already.
+        if self._stream is None:
+            self._open()
+
+        # If the file is empty, the while loop will never run.
+        # This *seems* to be all the state we need to set:
+        if self._eofpos == 0:
+            self._len = 0
+
+        # Each iteration through this loop, we read a single block
+        # from the stream.
+        while filepos < self._eofpos:
+            # Read the next block.
+            self._stream.seek(filepos)
+            self._current_toknum = toknum
+            self._current_blocknum = block_index
+            tokens = self.read_block(self._stream)
+            assert isinstance(tokens, (tuple, list, AbstractLazySequence)), (
+                'block reader %s() should return list or tuple.' %
+                self.read_block.__name__)
+            num_toks = len(tokens)
+            new_filepos = self._stream.tell()
+            assert new_filepos > filepos, (
+                'block reader %s() should consume at least 1 byte (filepos=%d)' %
+                (self.read_block.__name__, filepos))
+
+            # Update our cache.
+            self._cache = (toknum, toknum+num_toks, list(tokens))
+
+            # Update our mapping.
+            assert toknum <= self._toknum[-1]
+            if num_toks > 0:
+                block_index += 1
+                if toknum == self._toknum[-1]:
+                    assert new_filepos > self._filepos[-1] # monotonic!
+                    self._filepos.append(new_filepos)
+                    self._toknum.append(toknum+num_toks)
+                else:
+                    # Check for consistency:
+                    assert new_filepos == self._filepos[block_index], (
+                        'inconsistent block reader (num chars read)')
+                    assert toknum+num_toks == self._toknum[block_index], (
+                        'inconsistent block reader (num tokens returned)')
+
+            # If we reached the end of the file, then update self._len
+            if new_filepos == self._eofpos:
+                self._len = toknum + num_toks
+            # Generate the tokens in this block (but skip any tokens
+            # before start_tok).  Note that between yields, our state
+            # may be modified.
+            for tok in tokens[max(0, start_tok-toknum):]:
+                yield tok
+            # If we're at the end of the file, then we're done.
+            assert new_filepos <= self._eofpos
+            if new_filepos == self._eofpos:
+                break
+            # Update our indices
+            toknum += num_toks
+            filepos = new_filepos
+
+        # If we reach this point, then we should know our length.
+        assert self._len is not None
+        # We should have reached EOF once we're out of the while loop,
+        # so close the stream now that we're done with it.
+        self.close()
+
+    # Use concat for these, so we can use a ConcatenatedCorpusView
+    # when possible.
+    def __add__(self, other):
+        return concat([self, other])
+    def __radd__(self, other):
+        return concat([other, self])
+    def __mul__(self, count):
+        return concat([self] * count)
+    def __rmul__(self, count):
+        return concat([self] * count)
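+
+    # For example (illustrative; ``view_a`` and ``view_b`` stand for any two
+    # corpus views opened elsewhere), addition stays lazy instead of
+    # materializing either operand:
+    #
+    #     combined = view_a + view_b    # -> ConcatenatedCorpusView
+    #     sample = combined[:10]        # reads only the blocks it needs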
+
+class ConcatenatedCorpusView(AbstractLazySequence):
+    """
+    A 'view' of a corpus file that joins together one or more
+    ``StreamBackedCorpusViews<StreamBackedCorpusView>``.  At most
+    one file handle is left open at any time.
+    """
+    def __init__(self, corpus_views):
+        self._pieces = corpus_views
+        """A list of the corpus subviews that make up this
+        concatenation."""
+
+        self._offsets = [0]
+        """A list of offsets, indicating the index at which each
+        subview begins.  In particular::
+            offsets[i] = sum([len(p) for p in pieces[:i]])"""
+
+        self._open_piece = None
+        """The most recently accessed corpus subview (or None).
+        Before a new subview is accessed, this subview will be closed."""
+
+    def __len__(self):
+        if len(self._offsets) <= len(self._pieces):
+            # Iterate to the end of the corpus.
+            for tok in self.iterate_from(self._offsets[-1]): pass
+
+        return self._offsets[-1]
+
+    def close(self):
+        for piece in self._pieces:
+            piece.close()
+
+    def iterate_from(self, start_tok):
+        piecenum = bisect.bisect_right(self._offsets, start_tok)-1
+
+        while piecenum < len(self._pieces):
+            offset = self._offsets[piecenum]
+            piece = self._pieces[piecenum]
+
+            # If we've got another piece open, close it first.
+            if self._open_piece is not piece:
+                if self._open_piece is not None:
+                    self._open_piece.close()
+                self._open_piece = piece
+
+            # Get everything we can from this piece.
+            for tok in piece.iterate_from(max(0, start_tok-offset)):
+                yield tok
+
+            # Update the offset table.
+            if piecenum+1 == len(self._offsets):
+                self._offsets.append(self._offsets[-1] + len(piece))
+
+            # Move on to the next piece.
+            piecenum += 1
+
+def concat(docs):
+    """
+    Concatenate together the contents of multiple documents from a
+    single corpus, using an appropriate concatenation function.  This
+    utility function is used by corpus readers when the user requests
+    more than one document at a time.
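+
+    For example, plain strings and plain lists concatenate as one would
+    expect (a minimal illustrative sketch):
+
+        >>> concat(['How ', 'do ', 'you ', 'do?'])  # doctest: +SKIP
+        'How do you do?'
+        >>> concat([[1, 2], [3], [4, 5]])  # doctest: +SKIP
+        [1, 2, 3, 4, 5]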
+    """
+    if len(docs) == 1:
+        return docs[0]
+    if len(docs) == 0:
+        raise ValueError('concat() expects at least one object!')
+
+    types = set(d.__class__ for d in docs)
+
+    # If they're all strings, use string concatenation.
+    if all(isinstance(doc, string_types) for doc in docs):
+        return ''.join(docs)
+
+    # If they're all corpus views, then use ConcatenatedCorpusView.
+    for typ in types:
+        if not issubclass(typ, (StreamBackedCorpusView,
+                                ConcatenatedCorpusView)):
+            break
+    else:
+        return ConcatenatedCorpusView(docs)
+
+    # If they're all lazy sequences, use a lazy concatenation
+    for typ in types:
+        if not issubclass(typ, AbstractLazySequence):
+            break
+    else:
+        return LazyConcatenation(docs)
+
+    # Otherwise, see what we can do:
+    if len(types) == 1:
+        typ = list(types)[0]
+
+        if issubclass(typ, list):
+            return reduce((lambda a,b:a+b), docs, [])
+
+        if issubclass(typ, tuple):
+            return reduce((lambda a,b:a+b), docs, ())
+
+        if ElementTree.iselement(typ):
+            xmltree = ElementTree.Element('documents')
+            for doc in docs: xmltree.append(doc)
+            return xmltree
+
+    # No method found!
+    raise ValueError("Don't know how to concatenate types: %r" % types)
+
+######################################################################
+#{ Corpus View for Pickled Sequences
+######################################################################
+
+class PickleCorpusView(StreamBackedCorpusView):
+    """
+    A stream backed corpus view for corpus files that consist of
+    sequences of serialized Python objects (serialized using
+    ``pickle.dump``).  One use case for this class is to store the
+    result of running feature detection on a corpus to disk.  This can
+    be useful when performing feature detection is expensive (so we
+    don't want to repeat it), but the corpus is too large to store in
+    memory.  The following example illustrates this technique:
+
+        >>> from nltk.corpus.reader.util import PickleCorpusView
+        >>> from nltk.util import LazyMap
+        >>> feature_corpus = LazyMap(detect_features, corpus) # doctest: +SKIP
+        >>> PickleCorpusView.write(feature_corpus, some_fileid)  # doctest: +SKIP
+        >>> pcv = PickleCorpusView(some_fileid) # doctest: +SKIP
+    """
+    BLOCK_SIZE = 100
+    PROTOCOL = -1
+
+    def __init__(self, fileid, delete_on_gc=False):
+        """
+        Create a new corpus view that reads the pickle corpus
+        ``fileid``.
+
+        :param delete_on_gc: If true, then ``fileid`` will be deleted
+            whenever this object gets garbage-collected.
+        """
+        self._delete_on_gc = delete_on_gc
+        StreamBackedCorpusView.__init__(self, fileid)
+
+    def read_block(self, stream):
+        result = []
+        for i in range(self.BLOCK_SIZE):
+            try: result.append(pickle.load(stream))
+            except EOFError: break
+        return result
+
+    def __del__(self):
+        """
+        If ``delete_on_gc`` was set to true when this
+        ``PickleCorpusView`` was created, then delete the corpus view's
+        fileid.  (This method is called whenever a
+        ``PickleCorpusView`` is garbage-collected.)
+        """
+        if getattr(self, '_delete_on_gc', False):
+            if os.path.exists(self._fileid):
+                try: os.remove(self._fileid)
+                except (OSError, IOError): pass
+        self.__dict__.clear() # make the garbage collector's job easier
+
+    @classmethod
+    def write(cls, sequence, output_file):
+        if isinstance(output_file, string_types):
+            output_file = open(output_file, 'wb')
+        for item in sequence:
+            pickle.dump(item, output_file, cls.PROTOCOL)
+
+    @classmethod
+    def cache_to_tempfile(cls, sequence, delete_on_gc=True):
+        """
+        Write the given sequence to a temporary file as a pickle
+        corpus; and then return a ``PickleCorpusView`` view for that
+        temporary corpus file.
+
+        :param delete_on_gc: If true, then the temporary file will be
+            deleted whenever this object gets garbage-collected.
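+
+        A minimal illustrative sketch (the temporary file is created by
+        ``tempfile.mkstemp`` under the system temp directory):
+
+            >>> view = PickleCorpusView.cache_to_tempfile(['a', 'b', 'c'])  # doctest: +SKIP
+            >>> list(view)  # doctest: +SKIP
+            ['a', 'b', 'c']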
+        """
+        try:
+            fd, output_file_name = tempfile.mkstemp('.pcv', 'nltk-')
+            output_file = os.fdopen(fd, 'wb')
+            cls.write(sequence, output_file)
+            output_file.close()
+            return PickleCorpusView(output_file_name, delete_on_gc)
+        except (OSError, IOError) as e:
+            raise ValueError('Error while creating temp file: %s' % e)
+
+
+
+######################################################################
+#{ Block Readers
+######################################################################
+
+def read_whitespace_block(stream):
+    toks = []
+    for i in range(20): # Read 20 lines at a time.
+        toks.extend(stream.readline().split())
+    return toks
+
+def read_wordpunct_block(stream):
+    toks = []
+    for i in range(20): # Read 20 lines at a time.
+        toks.extend(wordpunct_tokenize(stream.readline()))
+    return toks
+
+def read_line_block(stream):
+    toks = []
+    for i in range(20):
+        line = stream.readline()
+        if not line: return toks
+        toks.append(line.rstrip('\n'))
+    return toks
+
+def read_blankline_block(stream):
+    s = ''
+    while True:
+        line = stream.readline()
+        # End of file:
+        if not line:
+            if s: return [s]
+            else: return []
+        # Blank line:
+        elif line and not line.strip():
+            if s: return [s]
+        # Other line:
+        else:
+            s += line
+
+def read_alignedsent_block(stream):
+    s = ''
+    while True:
+        line = stream.readline()
+        # End of file:
+        if not line:
+            if s: return [s]
+            else: return []
+        # Skip separator and blank lines:
+        if line[0] == '=' or line[0] == '\n' or line[:2] == '\r\n':
+            continue
+        # Other line:
+        s += line
+        if re.match(r'^\d+-\d+', line) is not None:
+            return [s]
+
+def read_regexp_block(stream, start_re, end_re=None):
+    """
+    Read a sequence of tokens from a stream, where tokens begin with
+    lines that match ``start_re``.  If ``end_re`` is specified, then
+    tokens end with lines that match ``end_re``; otherwise, tokens end
+    whenever the next line matching ``start_re`` or EOF is found.
+    """
+    # Scan until we find a line matching the start regexp.
+    while True:
+        line = stream.readline()
+        if not line: return [] # end of file.
+        if re.match(start_re, line): break
+
+    # Scan until we find another line matching the regexp, or EOF.
+    lines = [line]
+    while True:
+        oldpos = stream.tell()
+        line = stream.readline()
+        # End of file:
+        if not line:
+            return [''.join(lines)]
+        # End of token:
+        if end_re is not None and re.match(end_re, line):
+            return [''.join(lines)]
+        # Start of new token: backup to just before it starts, and
+        # return the token we've already collected.
+        if end_re is None and re.match(start_re, line):
+            stream.seek(oldpos)
+            return [''.join(lines)]
+        # Anything else is part of the token.
+        lines.append(line)
+
+def read_sexpr_block(stream, block_size=16384, comment_char=None):
+    """
+    Read a sequence of s-expressions from the stream, and leave the
+    stream's file position at the end the last complete s-expression
+    read.  This function will always return at least one s-expression,
+    unless there are no more s-expressions in the file.
+
+    If the file ends in the middle of an s-expression, then that
+    incomplete s-expression is returned when the end of the file is
+    reached.
+
+    :param block_size: The default block size for reading.  If an
+        s-expression is longer than one block, then more than one
+        block will be read.
+    :param comment_char: A character that marks comments.  Any lines
+        that begin with this character will be stripped out.
+        (If spaces or tabs precede the comment character, then the
+        line will not be stripped.)
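+
+    A small illustrative example, using an in-memory stream:
+
+        >>> from io import StringIO
+        >>> read_sexpr_block(StringIO('(a (b c)) (d e)'))  # doctest: +SKIP
+        ['(a (b c))', '(d e)']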
+    """
+    start = stream.tell()
+    block = stream.read(block_size)
+    encoding = getattr(stream, 'encoding', None)
+    assert encoding is not None or isinstance(block, text_type)
+    if encoding not in (None, 'utf-8'):
+        import warnings
+        warnings.warn('Parsing may fail, depending on the properties '
+                      'of the %s encoding!' % encoding)
+        # (e.g., the utf-16 encoding does not work because it insists
+        # on adding BOMs to the beginning of encoded strings.)
+
+    if comment_char:
+        COMMENT = re.compile('(?m)^%s.*$' % re.escape(comment_char))
+    while True:
+        try:
+            # If we're stripping comments, then make sure our block ends
+            # on a line boundary; and then replace any comments with
+            # space characters.  (We can't just strip them out -- that
+            # would make our offset wrong.)
+            if comment_char:
+                block += stream.readline()
+                block = re.sub(COMMENT, _sub_space, block)
+            # Read the block.
+            tokens, offset = _parse_sexpr_block(block)
+            # Skip whitespace
+            offset = re.compile(r'\s*').search(block, offset).end()
+
+            # Move to the end position.
+            if encoding is None:
+                stream.seek(start+offset)
+            else:
+                stream.seek(start+len(block[:offset].encode(encoding)))
+
+            # Return the list of tokens we processed
+            return tokens
+        except ValueError as e:
+            if e.args[0] == 'Block too small':
+                next_block = stream.read(block_size)
+                if next_block:
+                    block += next_block
+                    continue
+                else:
+                    # The file ended mid-sexpr -- return what we got.
+                    return [block.strip()]
+            else: raise
+
+def _sub_space(m):
+    """Helper function: given a regexp match, return a string of
+    spaces that's the same length as the matched string."""
+    return ' '*(m.end()-m.start())
+
+def _parse_sexpr_block(block):
+    tokens = []
+    start = end = 0
+
+    while end < len(block):
+        m = re.compile(r'\S').search(block, end)
+        if not m:
+            return tokens, end
+
+        start = m.start()
+
+        # Case 1: sexpr is not parenthesized.
+        if m.group() != '(':
+            m2 = re.compile(r'[\s(]').search(block, start)
+            if m2:
+                end = m2.start()
+            else:
+                if tokens: return tokens, end
+                raise ValueError('Block too small')
+
+        # Case 2: parenthesized sexpr.
+        else:
+            nesting = 0
+            for m in re.compile(r'[()]').finditer(block, start):
+                if m.group()=='(': nesting += 1
+                else: nesting -= 1
+                if nesting == 0:
+                    end = m.end()
+                    break
+            else:
+                if tokens: return tokens, end
+                raise ValueError('Block too small')
+
+        tokens.append(block[start:end])
+
+    return tokens, end
+
+
+######################################################################
+#{ Finding Corpus Items
+######################################################################
+
+def find_corpus_fileids(root, regexp):
+    if not isinstance(root, PathPointer):
+        raise TypeError('find_corpus_fileids: expected a PathPointer')
+    regexp += '$'
+
+    # Find fileids in a zipfile: scan the zipfile's namelist.  Filter
+    # out entries that end in '/' -- they're directories.
+    if isinstance(root, ZipFilePathPointer):
+        fileids = [name[len(root.entry):] for name in root.zipfile.namelist()
+                 if not name.endswith('/')]
+        items = [name for name in fileids if re.match(regexp, name)]
+        return sorted(items)
+
+    # Find fileids in a directory: use os.walk to search all (proper
+    # or symlinked) subdirectories, and match paths against the regexp.
+    elif isinstance(root, FileSystemPathPointer):
+        items = []
+        # workaround for py25 which doesn't support followlinks
+        kwargs = {}
+        if not py25():
+            kwargs = {'followlinks': True}
+        for dirname, subdirs, fileids in os.walk(root.path, **kwargs):
+            prefix = ''.join('%s/' % p for p in _path_from(root.path, dirname))
+            items += [prefix+fileid for fileid in fileids
+                      if re.match(regexp, prefix+fileid)]
+            # Don't visit svn directories:
+            if '.svn' in subdirs: subdirs.remove('.svn')
+        return sorted(items)
+
+    else:
+        raise AssertionError("Don't know how to handle %r" % root)
+
+def _path_from(parent, child):
+    if os.path.split(parent)[1] == '':
+        parent = os.path.split(parent)[0]
+    path = []
+    while parent != child:
+        child, dirname = os.path.split(child)
+        path.insert(0, dirname)
+        assert os.path.split(child)[0] != child
+    return path
+
+######################################################################
+#{ Paragraph structure in Treebank files
+######################################################################
+
+def tagged_treebank_para_block_reader(stream):
+    # Read the next paragraph.
+    para = ''
+    while True:
+        line = stream.readline()
+        # End of paragraph:
+        if re.match(r'======+\s*$', line):
+            if para.strip(): return [para]
+        # End of file:
+        elif line == '':
+            if para.strip(): return [para]
+            else: return []
+        # Content line:
+        else:
+            para += line
diff --git a/nlp_resource_data/nltk/corpus/reader/util.pyc b/nlp_resource_data/nltk/corpus/reader/util.pyc
new file mode 100755 (executable)
index 0000000..cb17813
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/util.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/verbnet.py b/nlp_resource_data/nltk/corpus/reader/verbnet.py
new file mode 100755 (executable)
index 0000000..641cff9
--- /dev/null
@@ -0,0 +1,595 @@
+# Natural Language Toolkit: Verbnet Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+An NLTK interface to the VerbNet verb lexicon
+
+For details about VerbNet see:
+https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
+"""
+from __future__ import unicode_literals
+
+import re
+import textwrap
+from collections import defaultdict
+
+from six import string_types
+
+from nltk.corpus.reader.xmldocs import XMLCorpusReader
+
+
+class VerbnetCorpusReader(XMLCorpusReader):
+    """
+    An NLTK interface to the VerbNet verb lexicon.
+
+    From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest
+    on-line verb lexicon currently available for English. It is a hierarchical
+    domain-independent, broad-coverage verb lexicon with mappings to other
+    lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), XTAG
+    (XTAG Research Group, 2001), and FrameNet (Baker et al., 1998)."
+
+    For details about VerbNet see:
+    https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
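+
+    A typical lookup, assuming the VerbNet corpus data is installed and
+    exposed as ``nltk.corpus.verbnet`` (illustrative):
+
+        >>> from nltk.corpus import verbnet
+        >>> 'put-9.1' in verbnet.classids()  # doctest: +SKIP
+        True
+        >>> 'put' in verbnet.lemmas('put-9.1')  # doctest: +SKIP
+        True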
+    """
+
+    # No unicode encoding param, since the data files are all XML.
+    def __init__(self, root, fileids, wrap_etree=False):
+        XMLCorpusReader.__init__(self, root, fileids, wrap_etree)
+
+        self._lemma_to_class = defaultdict(list)
+        """A dictionary mapping from verb lemma strings to lists of
+        VerbNet class identifiers."""
+
+        self._wordnet_to_class = defaultdict(list)
+        """A dictionary mapping from wordnet identifier strings to
+        lists of VerbNet class identifiers."""
+
+        self._class_to_fileid = {}
+        """A dictionary mapping from class identifiers to
+        corresponding file identifiers.  The keys of this dictionary
+        provide a complete list of all classes and subclasses."""
+
+        self._shortid_to_longid = {}
+
+        # Initialize the dictionaries.  Use the quick (regexp-based)
+        # method instead of the slow (xml-based) method, because it
+        # runs 2-30 times faster.
+        self._quick_index()
+
+    _LONGID_RE = re.compile(r'([^\-\.]*)-([\d+.\-]+)$')
+    """Regular expression that matches (and decomposes) longids"""
+
+    _SHORTID_RE = re.compile(r'[\d+.\-]+$')
+    """Regular expression that matches shortids"""
+
+    _INDEX_RE = re.compile(r'<MEMBER name="\??([^"]+)" wn="([^"]*)"[^>]+>|'
+                           r'<VNSUBCLASS ID="([^"]+)"/?>')
+    """Regular expression used by ``_index()`` to quickly scan the corpus
+       for basic information."""
+
+    def lemmas(self, vnclass=None):
+        """
+        Return a list of all verb lemmas that appear in any class, or
+        only those in the specified ``vnclass``.
+        """
+        if vnclass is None:
+            return sorted(self._lemma_to_class.keys())
+        else:
+            # [xx] should this include subclass members?
+            if isinstance(vnclass, string_types):
+                vnclass = self.vnclass(vnclass)
+            return [member.get('name') for member in
+                    vnclass.findall('MEMBERS/MEMBER')]
+
+    def wordnetids(self, vnclass=None):
+        """
+        Return a list of all wordnet identifiers that appear in any
+        class, or only those in the specified ``vnclass``.
+        """
+        if vnclass is None:
+            return sorted(self._wordnet_to_class.keys())
+        else:
+            # [xx] should this include subclass members?
+            if isinstance(vnclass, string_types):
+                vnclass = self.vnclass(vnclass)
+            return sum([member.get('wn', '').split() for member in
+                        vnclass.findall('MEMBERS/MEMBER')], [])
+
+    def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
+        """
+        Return a list of the VerbNet class identifiers.  If a file
+        identifier is specified, then return only the VerbNet class
+        identifiers for classes (and subclasses) defined by that file.
+        If a lemma is specified, then return only VerbNet class
+        identifiers for classes that contain that lemma as a member.
+        If a wordnetid is specified, then return only identifiers for
+        classes that contain that wordnetid as a member.  If a classid
+        is specified, then return only identifiers for subclasses of
+        the specified VerbNet class.
+        If nothing is specified, return all classids within VerbNet
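+
+        For example (illustrative; output not shown):
+
+            >>> from nltk.corpus import verbnet
+            >>> verbnet.classids(lemma='take')  # doctest: +SKIP
+            >>> verbnet.classids(fileid='put-9.1.xml')  # doctest: +SKIP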
+        """
+        if fileid is not None:
+            return [c for (c, f) in self._class_to_fileid.items()
+                    if f == fileid]
+        elif lemma is not None:
+            return self._lemma_to_class[lemma]
+        elif wordnetid is not None:
+            return self._wordnet_to_class[wordnetid]
+        elif classid is not None:
+            xmltree = self.vnclass(classid)
+            return [subclass.get('ID') for subclass in
+                    xmltree.findall('SUBCLASSES/VNSUBCLASS')]
+        else:
+            return sorted(self._class_to_fileid.keys())
+
+    def vnclass(self, fileid_or_classid):
+        """Returns VerbNet class ElementTree
+        
+        Return an ElementTree containing the xml for the specified
+        VerbNet class.
+
+        :param fileid_or_classid: An identifier specifying which class
+            should be returned.  Can be a file identifier (such as
+            ``'put-9.1.xml'``), or a VerbNet class identifier (such as
+            ``'put-9.1'``) or a short VerbNet class identifier (such as
+            ``'9.1'``).
+        """
+        # File identifier: just return the xml.
+        if fileid_or_classid in self._fileids:
+            return self.xml(fileid_or_classid)
+
+        # Class identifier: get the xml, and find the right elt.
+        classid = self.longid(fileid_or_classid)
+        if classid in self._class_to_fileid:
+            fileid = self._class_to_fileid[self.longid(classid)]
+            tree = self.xml(fileid)
+            if classid == tree.get('ID'):
+                return tree
+            else:
+                for subclass in tree.findall('.//VNSUBCLASS'):
+                    if classid == subclass.get('ID'):
+                        return subclass
+                else:
+                    assert False  # we saw it during _index()!
+
+        else:
+            raise ValueError('Unknown identifier {}'.format(fileid_or_classid))
+
+    def fileids(self, vnclass_ids=None):
+        """
+        Return a list of fileids that make up this corpus.  If
+        ``vnclass_ids`` is specified, then return the fileids that make
+        up the specified VerbNet class(es).
+        """
+        if vnclass_ids is None:
+            return self._fileids
+        elif isinstance(vnclass_ids, string_types):
+            return [self._class_to_fileid[self.longid(vnclass_ids)]]
+        else:
+            return [self._class_to_fileid[self.longid(vnclass_id)]
+                    for vnclass_id in vnclass_ids]
+
+    def frames(self, vnclass):
+        """Given a VerbNet class, this method returns VerbNet frames
+        
+        The members returned are:
+        1) Example
+        2) Description
+        3) Syntax
+        4) Semantics
+        
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+            containing the xml contents of a VerbNet class.
+        :return: frames - a list of frame dictionaries
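+
+        For example (illustrative; assumes the VerbNet corpus data is
+        installed):
+
+            >>> from nltk.corpus import verbnet
+            >>> frame = verbnet.frames('put-9.1')[0]  # doctest: +SKIP
+            >>> sorted(frame.keys())  # doctest: +SKIP
+            ['description', 'example', 'semantics', 'syntax']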
+        """
+        if isinstance(vnclass, string_types):
+            vnclass = self.vnclass(vnclass)
+        frames = []
+        vnframes = vnclass.findall('FRAMES/FRAME')
+        for vnframe in vnframes:
+            frames.append({
+                'example': self._get_example_within_frame(vnframe),
+                'description': self._get_description_within_frame(vnframe),
+                'syntax': self._get_syntactic_list_within_frame(vnframe),
+                'semantics': self._get_semantics_within_frame(vnframe)
+            })
+        return frames
+
+    def subclasses(self, vnclass):
+        """Returns subclass ids, if any exist 
+        
+        Given a VerbNet class, this method returns subclass ids (if they exist)
+        in a list of strings.
+        
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+            containing the xml contents of a VerbNet class.
+        :return: list of subclasses
+        """
+        if isinstance(vnclass, string_types):
+            vnclass = self.vnclass(vnclass)
+
+        subclasses = [subclass.get('ID') for subclass in
+                      vnclass.findall('SUBCLASSES/VNSUBCLASS')]
+        return subclasses
+
+    def themroles(self, vnclass):
+        """Returns thematic roles participating in a VerbNet class
+        
+        Members returned as part of roles are-
+        1) Type
+        2) Modifiers
+        
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+            containing the xml contents of a VerbNet class.
+        :return: themroles: A list of thematic roles in the VerbNet class
+        """
+        if isinstance(vnclass, string_types):
+            vnclass = self.vnclass(vnclass)
+
+        themroles = []
+        for trole in vnclass.findall('THEMROLES/THEMROLE'):
+            themroles.append({
+                'type': trole.get('type'),
+                'modifiers': [{'value': restr.get('Value'), 'type': restr.get('type')}
+                              for restr in trole.findall('SELRESTRS/SELRESTR')]
+            })
+        return themroles
+
+    ######################################################################
+    # { Index Initialization
+    ######################################################################
+
+    def _index(self):
+        """
+        Initialize the indexes ``_lemma_to_class``,
+        ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
+        through the corpus fileids.  This is fast with cElementTree
+        (<0.1 secs), but quite slow (>10 secs) with the python
+        implementation of ElementTree.
+        """
+        for fileid in self._fileids:
+            self._index_helper(self.xml(fileid), fileid)
+
+    def _index_helper(self, xmltree, fileid):
+        """Helper for ``_index()``"""
+        vnclass = xmltree.get('ID')
+        self._class_to_fileid[vnclass] = fileid
+        self._shortid_to_longid[self.shortid(vnclass)] = vnclass
+        for member in xmltree.findall('MEMBERS/MEMBER'):
+            self._lemma_to_class[member.get('name')].append(vnclass)
+            for wn in member.get('wn', '').split():
+                self._wordnet_to_class[wn].append(vnclass)
+        for subclass in xmltree.findall('SUBCLASSES/VNSUBCLASS'):
+            self._index_helper(subclass, fileid)
+
+    def _quick_index(self):
+        """
+        Initialize the indexes ``_lemma_to_class``,
+        ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
+        through the corpus fileids.  This doesn't do proper xml parsing,
+        but is good enough to find everything in the standard VerbNet
+        corpus -- and it runs about 30 times faster than xml parsing
+        (with the python ElementTree; only 2-3 times faster with
+        cElementTree).
+        """
+        # nb: if we got rid of wordnet_to_class, this would run 2-3
+        # times faster.
+        for fileid in self._fileids:
+            vnclass = fileid[:-4]  # strip the '.xml'
+            self._class_to_fileid[vnclass] = fileid
+            self._shortid_to_longid[self.shortid(vnclass)] = vnclass
+            for m in self._INDEX_RE.finditer(self.open(fileid).read()):
+                groups = m.groups()
+                if groups[0] is not None:
+                    self._lemma_to_class[groups[0]].append(vnclass)
+                    for wn in groups[1].split():
+                        self._wordnet_to_class[wn].append(vnclass)
+                elif groups[2] is not None:
+                    self._class_to_fileid[groups[2]] = fileid
+                    vnclass = groups[2]  # for <MEMBER> elts.
+                    self._shortid_to_longid[self.shortid(vnclass)] = vnclass
+                else:
+                    assert False, 'unexpected match condition'
+
+    ######################################################################
+    # { Identifier conversion
+    ######################################################################
+
+    def longid(self, shortid):
+        """Returns longid of a VerbNet class
+        
+        Given a short VerbNet class identifier (e.g. '37.10'), map it
+        to a long id (e.g. 'confess-37.10').  If ``shortid`` is already a
+        long id, then return it as-is."""
+        if self._LONGID_RE.match(shortid):
+            return shortid  # it's already a longid.
+        elif not self._SHORTID_RE.match(shortid):
+            raise ValueError('vnclass identifier %r not found' % shortid)
+        try:
+            return self._shortid_to_longid[shortid]
+        except KeyError:
+            raise ValueError('vnclass identifier %r not found' % shortid)
+
+    def shortid(self, longid):
+        """Returns shortid of a VerbNet class
+        
+        Given a long VerbNet class identifier (e.g. 'confess-37.10'),
+        map it to a short id (e.g. '37.10').  If ``longid`` is already a
+        short id, then return it as-is."""
+        if self._SHORTID_RE.match(longid):
+            return longid  # it's already a shortid.
+        m = self._LONGID_RE.match(longid)
+        if m:
+            return m.group(2)
+        else:
+            raise ValueError('vnclass identifier %r not found' % longid)
+
+    ######################################################################
+    # { Frame access utility functions
+    ######################################################################
+
+    def _get_semantics_within_frame(self, vnframe):
+        """Returns semantics within a single frame
+        
+        A utility function to retrieve semantics within a frame in VerbNet
+        Members of the semantics dictionary:
+        1) Predicate value 
+        2) Arguments
+        
+        :param vnframe: An ElementTree containing the xml contents of
+            a VerbNet frame.
+        :return: semantics_within_single_frame: a list of semantics dictionaries, one per predicate
+        """
+        semantics_within_single_frame = []
+        for pred in vnframe.findall('SEMANTICS/PRED'):
+            arguments = [{'type': arg.get('type'), 'value': arg.get('value')}
+                         for arg in pred.findall('ARGS/ARG')]
+            semantics_within_single_frame.append({
+                'predicate_value': pred.get('value'),
+                'arguments': arguments
+            })
+        return semantics_within_single_frame
+
+    def _get_example_within_frame(self, vnframe):
+        """Returns example within a frame
+        
+        A utility function to retrieve an example within a frame in VerbNet.
+        
+        :param vnframe: An ElementTree containing the xml contents of
+            a VerbNet frame.
+        :return: example_text: The example sentence for this particular frame
+        """
+        example_element = vnframe.find('EXAMPLES/EXAMPLE')
+        if example_element is not None:
+            example_text = example_element.text
+        else:
+            example_text = ""
+        return example_text
+
+    def _get_description_within_frame(self, vnframe):
+        """Returns member description within frame
+         
+        A utility function to retrieve a description of participating members
+        within a frame in VerbNet.
+        
+        :param vnframe: An ElementTree containing the xml contents of
+            a VerbNet frame.
+        :return: description: a description dictionary with members - primary and secondary 
+        """
+        description_element = vnframe.find('DESCRIPTION')
+        return {
+            'primary': description_element.attrib['primary'],
+            'secondary': description_element.get('secondary', '')
+        }
+
+    def _get_syntactic_list_within_frame(self, vnframe):
+        """Returns semantics within a frame
+        
+        A utility function to retrieve semantics within a frame in VerbNet.
+        Members of the syntactic dictionary:
+        1) POS Tag
+        2) Modifiers
+        
+        :param vnframe: An ElementTree containing the xml contents of
+            a VerbNet frame.
+        :return: syntax_within_single_frame
+        """
+        syntax_within_single_frame = []
+        for elt in vnframe.find('SYNTAX'):
+            pos_tag = elt.tag
+            modifiers = dict()
+            modifiers['value'] = elt.get('value') if 'value' in elt.attrib else ""
+            modifiers['selrestrs'] = [{'value': restr.get('Value'), 'type': restr.get('type')}
+                                      for restr in elt.findall('SELRESTRS/SELRESTR')]
+            modifiers['synrestrs'] = [{'value': restr.get('Value'), 'type': restr.get('type')}
+                                      for restr in elt.findall('SYNRESTRS/SYNRESTR')]
+            syntax_within_single_frame.append({
+                'pos_tag': pos_tag,
+                'modifiers': modifiers
+            })
+        return syntax_within_single_frame
+
+    ######################################################################
+    # { Pretty Printing
+    ######################################################################
+
+    def pprint(self, vnclass):
+        """Returns pretty printed version of a VerbNet class
+        
+        Return a string containing a pretty-printed representation of
+        the given VerbNet class.
+
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+            containing the xml contents of a VerbNet class.
+        """
+        if isinstance(vnclass, string_types):
+            vnclass = self.vnclass(vnclass)
+
+        s = vnclass.get('ID') + '\n'
+        s += self.pprint_subclasses(vnclass, indent='  ') + '\n'
+        s += self.pprint_members(vnclass, indent='  ') + '\n'
+        s += '  Thematic roles:\n'
+        s += self.pprint_themroles(vnclass, indent='    ') + '\n'
+        s += '  Frames:\n'
+        s += self.pprint_frames(vnclass, indent='    ')
+        return s
+
+    def pprint_subclasses(self, vnclass, indent=''):
+        """Returns pretty printed version of subclasses of VerbNet class
+        
+        Return a string containing a pretty-printed representation of
+        the given VerbNet class's subclasses.
+
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+            containing the xml contents of a VerbNet class.
+        """
+        if isinstance(vnclass, string_types):
+            vnclass = self.vnclass(vnclass)
+
+        subclasses = self.subclasses(vnclass)
+        if not subclasses: subclasses = ['(none)']
+        s = 'Subclasses: ' + ' '.join(subclasses)
+        return textwrap.fill(s, 70, initial_indent=indent,
+                             subsequent_indent=indent + '  ')
+
+    def pprint_members(self, vnclass, indent=''):
+        """Returns pretty printed version of members in a VerbNet class
+        
+        Return a string containing a pretty-printed representation of
+        the given VerbNet class's member verbs.
+
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+            containing the xml contents of a VerbNet class.
+        """
+        if isinstance(vnclass, string_types):
+            vnclass = self.vnclass(vnclass)
+
+        members = self.lemmas(vnclass)
+        if not members:
+            members = ['(none)']
+        s = 'Members: ' + ' '.join(members)
+        return textwrap.fill(s, 70, initial_indent=indent,
+                             subsequent_indent=indent + '  ')
+
+    def pprint_themroles(self, vnclass, indent=''):
+        """Returns pretty printed version of thematic roles in a VerbNet class
+        
+        Return a string containing a pretty-printed representation of
+        the given VerbNet class's thematic roles.
+
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+            containing the xml contents of a VerbNet class.
+        """
+        if isinstance(vnclass, string_types):
+            vnclass = self.vnclass(vnclass)
+
+        pieces = []
+        for themrole in self.themroles(vnclass):
+            piece = indent + '* ' + themrole.get('type')
+            modifiers = [modifier['value'] + modifier['type']
+                         for modifier in themrole['modifiers']]
+            if modifiers:
+                piece += '[{}]'.format(' '.join(modifiers))
+            pieces.append(piece)
+        return '\n'.join(pieces)
+
+    def pprint_frames(self, vnclass, indent=''):
+        """Returns pretty version of all frames in a VerbNet class
+        
+        Return a string containing a pretty-printed representation of
+        the list of frames within the VerbNet class.
+
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+            containing the xml contents of a VerbNet class.
+        """
+        if isinstance(vnclass, string_types):
+            vnclass = self.vnclass(vnclass)
+        pieces = []
+        for vnframe in self.frames(vnclass):
+            pieces.append(self._pprint_single_frame(vnframe, indent))
+        return '\n'.join(pieces)
+
+    def _pprint_single_frame(self, vnframe, indent=''):
+        """Returns pretty printed version of a single frame in a VerbNet class
+        
+        Returns a string containing a pretty-printed representation of
+        the given frame.
+        
+        :param vnframe: An ElementTree containing the xml contents of
+            a VerbNet frame.
+        """
+        frame_string = self._pprint_description_within_frame(vnframe, indent) + '\n'
+        frame_string += self._pprint_example_within_frame(vnframe, indent + ' ') + '\n'
+        frame_string += self._pprint_syntax_within_frame(vnframe, indent + '  Syntax: ') + '\n'
+        frame_string += indent + '  Semantics:\n'
+        frame_string += self._pprint_semantics_within_frame(vnframe, indent + '    ')
+        return frame_string
+
+    def _pprint_example_within_frame(self, vnframe, indent=''):
+        """Returns pretty printed version of example within frame in a VerbNet class
+        
+        Return a string containing a pretty-printed representation of
+        the given VerbNet frame example.
+
+        :param vnframe: An ElementTree containing the xml contents of
+            a Verbnet frame.
+        """
+        if vnframe['example']:
+            return indent + ' Example: ' + vnframe['example']
+        # Avoid returning None (the caller concatenates this string).
+        return ''
+
+    def _pprint_description_within_frame(self, vnframe, indent=''):
+        """Returns pretty printed version of a VerbNet frame description
+        
+        Return a string containing a pretty-printed representation of
+        the given VerbNet frame description.
+
+        :param vnframe: An ElementTree containing the xml contents of
+            a VerbNet frame.
+        """
+        description = indent + vnframe['description']['primary']
+        if vnframe['description']['secondary']:
+            description += ' ({})'.format(vnframe['description']['secondary'])
+        return description
+
+    def _pprint_syntax_within_frame(self, vnframe, indent=''):
+        """Returns pretty printed version of syntax within a frame in a VerbNet class 
+        
+        Return a string containing a pretty-printed representation of
+        the given VerbNet frame syntax.
+
+        :param vnframe: An ElementTree containing the xml contents of
+            a VerbNet frame.
+        """
+        pieces = []
+        for element in vnframe['syntax']:
+            piece = element['pos_tag']
+            modifier_list = []
+            if 'value' in element['modifiers'] and element['modifiers']['value']:
+                modifier_list.append(element['modifiers']['value'])
+            modifier_list += ['{}{}'.format(restr['value'], restr['type'])
+                              for restr in (element['modifiers']['selrestrs'] +
+                                            element['modifiers']['synrestrs'])]
+            if modifier_list:
+                piece += '[{}]'.format(' '.join(modifier_list))
+            pieces.append(piece)
+
+        return indent + ' '.join(pieces)
+
+    def _pprint_semantics_within_frame(self, vnframe, indent=''):
+        """Returns a pretty printed version of semantics within frame in a VerbNet class
+        
+        Return a string containing a pretty-printed representation of
+        the given VerbNet frame semantics.
+
+        :param vnframe: An ElementTree containing the xml contents of
+            a VerbNet frame.
+        """
+        pieces = []
+        for predicate in vnframe['semantics']:
+            arguments = [argument['value'] for argument in predicate['arguments']]
+            pieces.append('{}({})'.format(predicate['predicate_value'], ', '.join(arguments)))
+        return '\n'.join('{}* {}'.format(indent, piece) for piece in pieces)
diff --git a/nlp_resource_data/nltk/corpus/reader/verbnet.pyc b/nlp_resource_data/nltk/corpus/reader/verbnet.pyc
new file mode 100755 (executable)
index 0000000..a5f172c
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/verbnet.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/wordlist.py b/nlp_resource_data/nltk/corpus/reader/wordlist.py
new file mode 100755 (executable)
index 0000000..24e06ae
--- /dev/null
@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Word List Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from six import string_types
+
+from nltk.tokenize import line_tokenize
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+
+class WordListCorpusReader(CorpusReader):
+    """
+    List of words, one per line.  Blank lines are ignored.
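+
+    For example, the ``stopwords`` corpus is served by this reader
+    (illustrative; assumes the corpus data is installed):
+
+        >>> from nltk.corpus import stopwords
+        >>> 'the' in stopwords.words('english')  # doctest: +SKIP
+        True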
+    """
+    def words(self, fileids=None, ignore_lines_startswith='\n'):
+        return [line for line in line_tokenize(self.raw(fileids))
+                if not line.startswith(ignore_lines_startswith)]
+
+    def raw(self, fileids=None):
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+
+class SwadeshCorpusReader(WordListCorpusReader):
+    def entries(self, fileids=None):
+        """
+        :return: a tuple of words for the specified fileids.
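+
+        For example (illustrative; assumes the corpus data is installed):
+
+            >>> from nltk.corpus import swadesh
+            >>> swadesh.entries(['en', 'fr'])[0]  # doctest: +SKIP
+            ('I', 'je')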
+        """
+        if not fileids:
+            fileids = self.fileids()
+
+        wordlists = [self.words(f) for f in fileids]
+        return list(zip(*wordlists))
+
+
+class NonbreakingPrefixesCorpusReader(WordListCorpusReader):
+    """
+    This is a class to read the nonbreaking prefixes textfiles from the
+    Moses Machine Translation toolkit. These lists are used in the Python port
+    of the Moses word tokenizer.
+    """
+    available_langs = {'catalan': 'ca', 'czech': 'cs', 'german': 'de',
+                        'greek': 'el', 'english': 'en', 'spanish': 'es',
+                        'finnish': 'fi',  'french': 'fr', 'hungarian': 'hu',
+                        'icelandic': 'is', 'italian': 'it', 'latvian': 'lv',
+                        'dutch': 'nl', 'polish': 'pl', 'portuguese': 'pt',
+                        'romanian': 'ro', 'russian': 'ru', 'slovak': 'sk',
+                        'slovenian': 'sl', 'swedish': 'sv',  'tamil': 'ta'}
+    # Also, add the lang IDs as the keys.
+    available_langs.update({v:v for v in available_langs.values()})
+
+    def words(self, lang=None, fileids=None, ignore_lines_startswith='#'):
+        """
+        This method returns a list of nonbreaking prefixes for the specified
+        language(s).
+
+        >>> from nltk.corpus import nonbreaking_prefixes as nbp
+        >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']
+        True
+        >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89']
+        True
+
+        :return: a list words for the specified language(s).
+        """
+        # If *lang* in list of languages available, allocate apt fileid.
+        # Otherwise, the function returns non-breaking prefixes for
+        # all languages when fileids==None.
+        if lang in self.available_langs:
+            lang = self.available_langs[lang]
+            fileids = ['nonbreaking_prefix.'+lang]
+        return [line for line in line_tokenize(self.raw(fileids))
+                if not line.startswith(ignore_lines_startswith)]
+
+class UnicharsCorpusReader(WordListCorpusReader):
+    """
+    This class is used to read lists of characters from the Perl Unicode
+    Properties (see http://perldoc.perl.org/perluniprops.html).
+    The files in the perluniprop.zip are extracted using the Unicode::Tussle
+    module from http://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
+    """
+    # These are categories similar to the Perl Unicode Properties
+    available_categories = ['Close_Punctuation', 'Currency_Symbol',
+                            'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc',
+                            'IsSo', 'IsUpper', 'Line_Separator', 'Number',
+                            'Open_Punctuation', 'Punctuation', 'Separator',
+                            'Symbol']
+
+    def chars(self, category=None, fileids=None):
+        """
+        This method returns a list of characters from the Perl Unicode Properties.
+        They are very useful when porting Perl tokenizers to Python.
+
+        >>> from nltk.corpus import perluniprops as pup
+        >>> pup.chars('Open_Punctuation')[:5] == [u'(', u'[', u'{', u'\u0f3a', u'\u0f3c']
+        True
+        >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5']
+        True
+        >>> pup.available_categories
+        ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol']
+
+        :return: a list of characters given the specific unicode character category
+        """
+        if category in self.available_categories:
+            fileids = [category+'.txt']
+        return list(self.raw(fileids).strip())
+
+
+class MWAPPDBCorpusReader(WordListCorpusReader):
+    """
+    This class is used to read the list of word pairs from the subset of lexical
+    pairs of The Paraphrase Database (PPDB) XXXL used in the Monolingual Word
+    Alignment (MWA) algorithm described in Sultan et al. (2014a, 2014b, 2015):
+     - http://acl2014.org/acl2014/Q14/pdf/Q14-1017
+     - http://www.aclweb.org/anthology/S14-2039
+     - http://www.aclweb.org/anthology/S15-2027
+
+    The original source of the full PPDB corpus can be found on
+    http://www.cis.upenn.edu/~ccb/ppdb/
+
+    :return: a list of tuples of similar lexical terms.
+    """
+    mwa_ppdb_xxxl_file = 'ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs'
+    def entries(self, fileids=mwa_ppdb_xxxl_file):
+        """
+        :return: a tuple of synonym word pairs.
+        """
+        return [tuple(line.split('\t')) for line in line_tokenize(self.raw(fileids))]
diff --git a/nlp_resource_data/nltk/corpus/reader/wordlist.pyc b/nlp_resource_data/nltk/corpus/reader/wordlist.pyc
new file mode 100755 (executable)
index 0000000..1df236f
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/wordlist.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/wordnet.py b/nlp_resource_data/nltk/corpus/reader/wordnet.py
new file mode 100755 (executable)
index 0000000..7063aed
--- /dev/null
@@ -0,0 +1,2064 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: WordNet
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bethard <Steven.Bethard@colorado.edu>
+#         Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+#         Nitin Madnani <nmadnani@ets.org>
+#         Nasruddin A’aidil Shari
+#         Sim Wei Ying Geraldine
+#         Soe Lynn
+#         Francis Bond <bond@ieee.org>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+An NLTK interface for WordNet
+
+WordNet is a lexical database of English.
+Using synsets, helps find conceptual relationships between words
+such as hypernyms, hyponyms, synonyms, antonyms etc.
+
+For details about WordNet see:
+http://wordnet.princeton.edu/
+
+This module also allows you to find lemmas in languages
+other than English from the Open Multilingual Wordnet
+http://compling.hss.ntu.edu.sg/omw/
+
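+A small usage sketch (illustrative; assumes the WordNet data is installed):
+
+    >>> from nltk.corpus import wordnet as wn
+    >>> wn.synsets('dog')[0].name()  # doctest: +SKIP
+    'dog.n.01'
+    >>> wn.synset('dog.n.01').hypernyms()  # doctest: +SKIP
+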
+"""
+
+from __future__ import print_function, unicode_literals
+
+import math
+import re
+from itertools import islice, chain
+from functools import total_ordering
+from operator import itemgetter
+from collections import defaultdict, deque
+
+from six import iteritems
+from six.moves import range
+
+from nltk.corpus.reader import CorpusReader
+from nltk.util import binary_search_file as _binary_search_file
+from nltk.probability import FreqDist
+from nltk.compat import python_2_unicode_compatible
+from nltk.internals import deprecated
+
+######################################################################
+# Table of Contents
+######################################################################
+# - Constants
+# - Data Classes
+#   - WordNetError
+#   - Lemma
+#   - Synset
+# - WordNet Corpus Reader
+# - WordNet Information Content Corpus Reader
+# - Similarity Metrics
+# - Demo
+
+######################################################################
+# Constants
+######################################################################
+
+#: Positive infinity (for similarity functions)
+_INF = 1e300
+
+# { Part-of-speech constants
+ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
+# }
+
+POS_LIST = [NOUN, VERB, ADJ, ADV]
+
+# A table of strings that are used to express verb frames.
+VERB_FRAME_STRINGS = (
+    None,
+    "Something %s",
+    "Somebody %s",
+    "It is %sing",
+    "Something is %sing PP",
+    "Something %s something Adjective/Noun",
+    "Something %s Adjective/Noun",
+    "Somebody %s Adjective",
+    "Somebody %s something",
+    "Somebody %s somebody",
+    "Something %s somebody",
+    "Something %s something",
+    "Something %s to somebody",
+    "Somebody %s on something",
+    "Somebody %s somebody something",
+    "Somebody %s something to somebody",
+    "Somebody %s something from somebody",
+    "Somebody %s somebody with something",
+    "Somebody %s somebody of something",
+    "Somebody %s something on somebody",
+    "Somebody %s somebody PP",
+    "Somebody %s something PP",
+    "Somebody %s PP",
+    "Somebody's (body part) %s",
+    "Somebody %s somebody to INFINITIVE",
+    "Somebody %s somebody INFINITIVE",
+    "Somebody %s that CLAUSE",
+    "Somebody %s to somebody",
+    "Somebody %s to INFINITIVE",
+    "Somebody %s whether INFINITIVE",
+    "Somebody %s somebody into V-ing something",
+    "Somebody %s something with something",
+    "Somebody %s INFINITIVE",
+    "Somebody %s VERB-ing",
+    "It %s that CLAUSE",
+    "Something %s INFINITIVE")
+
+SENSENUM_RE = re.compile(r'\.[\d]+\.')
+
+
+######################################################################
+# Data Classes
+######################################################################
+
+
+class WordNetError(Exception):
+    """An exception class for wordnet-related errors."""
+
+
+@total_ordering
+class _WordNetObject(object):
+    """A common base class for lemmas and synsets."""
+
+    def hypernyms(self):
+        return self._related('@')
+
+    def _hypernyms(self):
+        return self._related('@')
+
+    def instance_hypernyms(self):
+        return self._related('@i')
+
+    def _instance_hypernyms(self):
+        return self._related('@i')
+
+    def hyponyms(self):
+        return self._related('~')
+
+    def instance_hyponyms(self):
+        return self._related('~i')
+
+    def member_holonyms(self):
+        return self._related('#m')
+
+    def substance_holonyms(self):
+        return self._related('#s')
+
+    def part_holonyms(self):
+        return self._related('#p')
+
+    def member_meronyms(self):
+        return self._related('%m')
+
+    def substance_meronyms(self):
+        return self._related('%s')
+
+    def part_meronyms(self):
+        return self._related('%p')
+
+    def topic_domains(self):
+        return self._related(';c')
+
+    def region_domains(self):
+        return self._related(';r')
+
+    def usage_domains(self):
+        return self._related(';u')
+
+    def attributes(self):
+        return self._related('=')
+
+    def entailments(self):
+        return self._related('*')
+
+    def causes(self):
+        return self._related('>')
+
+    def also_sees(self):
+        return self._related('^')
+
+    def verb_groups(self):
+        return self._related('$')
+
+    def similar_tos(self):
+        return self._related('&')
+
+    def __hash__(self):
+        return hash(self._name)
+
+    def __eq__(self, other):
+        return self._name == other._name
+
+    def __ne__(self, other):
+        return self._name != other._name
+
+    def __lt__(self, other):
+        return self._name < other._name
+
+
+@python_2_unicode_compatible
+class Lemma(_WordNetObject):
+    """
+    The lexical entry for a single morphological form of a
+    sense-disambiguated word.
+
+    Create a Lemma from a "<word>.<pos>.<number>.<lemma>" string where:
+    <word> is the morphological stem identifying the synset
+    <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
+    <number> is the sense number, counting from 0.
+    <lemma> is the morphological form of interest
+
+    Note that <word> and <lemma> can be different, e.g. the Synset
+    'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and
+    'salt.n.03.salinity'.
+
+    Lemma attributes, accessible via methods with the same name::
+
+    - name: The canonical name of this lemma.
+    - synset: The synset that this lemma belongs to.
+    - syntactic_marker: For adjectives, the WordNet string identifying the
+      syntactic position relative modified noun. See:
+      http://wordnet.princeton.edu/man/wninput.5WN.html#sect10
+      For all other parts of speech, this attribute is None.
+    - count: The frequency of this lemma in wordnet.
+
+    Lemma methods:
+
+    Lemmas have the following methods for retrieving related Lemmas. They
+    correspond to the names for the pointer symbols defined here:
+    http://wordnet.princeton.edu/man/wninput.5WN.html#sect3
+    These methods all return lists of Lemmas:
+
+    - antonyms
+    - hypernyms, instance_hypernyms
+    - hyponyms, instance_hyponyms
+    - member_holonyms, substance_holonyms, part_holonyms
+    - member_meronyms, substance_meronyms, part_meronyms
+    - topic_domains, region_domains, usage_domains
+    - attributes
+    - derivationally_related_forms
+    - entailments
+    - causes
+    - also_sees
+    - verb_groups
+    - similar_tos
+    - pertainyms
+    """
+
+    __slots__ = ['_wordnet_corpus_reader', '_name', '_syntactic_marker',
+                 '_synset', '_frame_strings', '_frame_ids',
+                 '_lexname_index', '_lex_id', '_lang', '_key']
+
+    def __init__(self, wordnet_corpus_reader, synset, name,
+                 lexname_index, lex_id, syntactic_marker):
+        self._wordnet_corpus_reader = wordnet_corpus_reader
+        self._name = name
+        self._syntactic_marker = syntactic_marker
+        self._synset = synset
+        self._frame_strings = []
+        self._frame_ids = []
+        self._lexname_index = lexname_index
+        self._lex_id = lex_id
+        self._lang = 'eng'
+
+        self._key = None  # gets set later.
+
+    def name(self):
+        return self._name
+
+    def syntactic_marker(self):
+        return self._syntactic_marker
+
+    def synset(self):
+        return self._synset
+
+    def frame_strings(self):
+        return self._frame_strings
+
+    def frame_ids(self):
+        return self._frame_ids
+
+    def lang(self):
+        return self._lang
+
+    def key(self):
+        return self._key
+
+    def __repr__(self):
+        tup = type(self).__name__, self._synset._name, self._name
+        return "%s('%s.%s')" % tup
+
+    def _related(self, relation_symbol):
+        get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset
+        return sorted([
+            get_synset(pos, offset)._lemmas[lemma_index]
+            for pos, offset, lemma_index
+            in self._synset._lemma_pointers[self._name, relation_symbol]
+        ])
+
+    def count(self):
+        """Return the frequency count for this Lemma"""
+        return self._wordnet_corpus_reader.lemma_count(self)
+
+    def antonyms(self):
+        return self._related('!')
+
+    def derivationally_related_forms(self):
+        return self._related('+')
+
+    def pertainyms(self):
+        return self._related('\\')
+
+
+@python_2_unicode_compatible
+class Synset(_WordNetObject):
+    """Create a Synset from a "<lemma>.<pos>.<number>" string where:
+    <lemma> is the word's morphological stem
+    <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
+    <number> is the sense number, counting from 0.
+
+    Synset attributes, accessible via methods with the same name:
+
+    - name: The canonical name of this synset, formed using the first lemma
+      of this synset. Note that this may be different from the name
+      passed to the constructor if that string used a different lemma to
+      identify the synset.
+    - pos: The synset's part of speech, matching one of the module level
+      attributes ADJ, ADJ_SAT, ADV, NOUN or VERB.
+    - lemmas: A list of the Lemma objects for this synset.
+    - definition: The definition for this synset.
+    - examples: A list of example strings for this synset.
+    - offset: The offset in the WordNet dict file of this synset.
+    - lexname: The name of the lexicographer file containing this synset.
+
+    Synset methods:
+
+    Synsets have the following methods for retrieving related Synsets.
+    They correspond to the names for the pointer symbols defined here:
+    http://wordnet.princeton.edu/man/wninput.5WN.html#sect3
+    These methods all return lists of Synsets.
+
+    - hypernyms, instance_hypernyms
+    - hyponyms, instance_hyponyms
+    - member_holonyms, substance_holonyms, part_holonyms
+    - member_meronyms, substance_meronyms, part_meronyms
+    - attributes
+    - entailments
+    - causes
+    - also_sees
+    - verb_groups
+    - similar_tos
+
+    Additionally, Synsets support the following methods specific to the
+    hypernym relation:
+
+    - root_hypernyms
+    - common_hypernyms
+    - lowest_common_hypernyms
+
+    Note that Synsets do not support the following relations because
+    these are defined by WordNet as lexical relations:
+
+    - antonyms
+    - derivationally_related_forms
+    - pertainyms
+    """
+
+    __slots__ = ['_pos', '_offset', '_name', '_frame_ids',
+                 '_lemmas', '_lemma_names',
+                 '_definition', '_examples', '_lexname',
+                 '_pointers', '_lemma_pointers', '_max_depth',
+                 '_min_depth']
+
+    def __init__(self, wordnet_corpus_reader):
+        self._wordnet_corpus_reader = wordnet_corpus_reader
+        # All of these attributes get initialized by
+        # WordNetCorpusReader._synset_from_pos_and_line()
+
+        self._pos = None
+        self._offset = None
+        self._name = None
+        self._frame_ids = []
+        self._lemmas = []
+        self._lemma_names = []
+        self._definition = None
+        self._examples = []
+        self._lexname = None  # lexicographer name
+        self._all_hypernyms = None
+
+        self._pointers = defaultdict(set)
+        self._lemma_pointers = defaultdict(set)
+
+    def pos(self):
+        return self._pos
+
+    def offset(self):
+        return self._offset
+
+    def name(self):
+        return self._name
+
+    def frame_ids(self):
+        return self._frame_ids
+
+    def definition(self):
+        return self._definition
+
+    def examples(self):
+        return self._examples
+
+    def lexname(self):
+        return self._lexname
+
+    def _needs_root(self):
+        if self._pos == NOUN:
+            if self._wordnet_corpus_reader.get_version() == '1.6':
+                return True
+            else:
+                return False
+        elif self._pos == VERB:
+            return True
+
+    def lemma_names(self, lang='eng'):
+        '''Return all the lemma_names associated with the synset'''
+        if lang == 'eng':
+            return self._lemma_names
+        else:
+            self._wordnet_corpus_reader._load_lang_data(lang)
+
+            i = self._wordnet_corpus_reader.ss2of(self)
+            if i in self._wordnet_corpus_reader._lang_data[lang][0]:
+                return self._wordnet_corpus_reader._lang_data[lang][0][i]
+            else:
+                return []
+
+    def lemmas(self, lang='eng'):
+        '''Return all the lemma objects associated with the synset'''
+        if lang == 'eng':
+            return self._lemmas
+        else:
+            self._wordnet_corpus_reader._load_lang_data(lang)
+            lemmark = []
+            lemmy = self.lemma_names(lang)
+            for lem in lemmy:
+                temp = Lemma(
+                    self._wordnet_corpus_reader,
+                    self,
+                    lem,
+                    self._wordnet_corpus_reader._lexnames.index(
+                        self.lexname()
+                    ),
+                    0,
+                    None
+                )
+                temp._lang = lang
+                lemmark.append(temp)
+            return lemmark
+
+    def root_hypernyms(self):
+        """Get the topmost hypernyms of this synset in WordNet."""
+
+        result = []
+        seen = set()
+        todo = [self]
+        while todo:
+            next_synset = todo.pop()
+            if next_synset not in seen:
+                seen.add(next_synset)
+                next_hypernyms = next_synset.hypernyms() + \
+                    next_synset.instance_hypernyms()
+                if not next_hypernyms:
+                    result.append(next_synset)
+                else:
+                    todo.extend(next_hypernyms)
+        return result
+
+# Simpler implementation which makes incorrect assumption that
+# hypernym hierarchy is acyclic:
+#
+#        if not self.hypernyms():
+#            return [self]
+#        else:
+#            return list(set(root for h in self.hypernyms()
+#                            for root in h.root_hypernyms()))
+    def max_depth(self):
+        """
+        :return: The length of the longest hypernym path from this
+        synset to the root.
+        """
+
+        if "_max_depth" not in self.__dict__:
+            hypernyms = self.hypernyms() + self.instance_hypernyms()
+            if not hypernyms:
+                self._max_depth = 0
+            else:
+                self._max_depth = 1 + max(h.max_depth() for h in hypernyms)
+        return self._max_depth
+
+    def min_depth(self):
+        """
+        :return: The length of the shortest hypernym path from this
+        synset to the root.
+        """
+
+        if "_min_depth" not in self.__dict__:
+            hypernyms = self.hypernyms() + self.instance_hypernyms()
+            if not hypernyms:
+                self._min_depth = 0
+            else:
+                self._min_depth = 1 + min(h.min_depth() for h in hypernyms)
+        return self._min_depth
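+
+    # Hedged example of the two depth measures (a sketch reusing the `wn`
+    # alias from the sketches above; exact values depend on the installed
+    # WordNet release):
+    #
+    #   dog = wn.synset('dog.n.01')
+    #   dog.min_depth()   # length of the shortest hypernym path to a root
+    #   dog.max_depth()   # length of the longest hypernym path to a root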
+
+    def closure(self, rel, depth=-1):
+        """Return the transitive closure of source under the rel
+        relationship, breadth-first
+
+            >>> from nltk.corpus import wordnet as wn
+            >>> dog = wn.synset('dog.n.01')
+            >>> hyp = lambda s:s.hypernyms()
+            >>> list(dog.closure(hyp))
+            [Synset('canine.n.02'), Synset('domestic_animal.n.01'),
+            Synset('carnivore.n.01'), Synset('animal.n.01'),
+            Synset('placental.n.01'), Synset('organism.n.01'),
+            Synset('mammal.n.01'), Synset('living_thing.n.01'),
+            Synset('vertebrate.n.01'), Synset('whole.n.02'),
+            Synset('chordate.n.01'), Synset('object.n.01'),
+            Synset('physical_entity.n.01'), Synset('entity.n.01')]
+
+        """
+        from nltk.util import breadth_first
+        synset_offsets = []
+        for synset in breadth_first(self, rel, depth):
+            if synset._offset != self._offset:
+                if synset._offset not in synset_offsets:
+                    synset_offsets.append(synset._offset)
+                    yield synset
+
+    def hypernym_paths(self):
+        """
+        Get the path(s) from this synset to the root, where each path is a
+        list of the synset nodes traversed on the way to the root.
+
+        :return: A list of lists, where each list gives the node sequence
+           connecting the initial ``Synset`` node and a root node.
+        """
+        paths = []
+
+        hypernyms = self.hypernyms() + self.instance_hypernyms()
+        if len(hypernyms) == 0:
+            paths = [[self]]
+
+        for hypernym in hypernyms:
+            for ancestor_list in hypernym.hypernym_paths():
+                ancestor_list.append(self)
+                paths.append(ancestor_list)
+        return paths
+
+    def common_hypernyms(self, other):
+        """
+        Find all synsets that are hypernyms of this synset and the
+        other synset.
+
+        :type other: Synset
+        :param other: other input synset.
+        :return: The synsets that are hypernyms of both synsets.
+        """
+        if not self._all_hypernyms:
+            self._all_hypernyms = set(
+                self_synset
+                for self_synsets in self._iter_hypernym_lists()
+                for self_synset in self_synsets
+            )
+        if not other._all_hypernyms:
+            other._all_hypernyms = set(
+                other_synset
+                for other_synsets in other._iter_hypernym_lists()
+                for other_synset in other_synsets
+            )
+        return list(self._all_hypernyms.intersection(other._all_hypernyms))
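+
+    # A hedged usage sketch (the synset names are the standard WordNet
+    # examples; the exact result set depends on the WordNet release):
+    #
+    #   dog = wn.synset('dog.n.01')
+    #   cat = wn.synset('cat.n.01')
+    #   dog.common_hypernyms(cat)   # e.g. includes Synset('carnivore.n.01')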
+
+    def lowest_common_hypernyms(
+        self, other, simulate_root=False, use_min_depth=False
+    ):
+        """
+        Get a list of lowest synset(s) that both synsets have as a hypernym.
+        When `use_min_depth == False` this means that the synset which appears
+        as a hypernym of both `self` and `other` with the lowest maximum depth
+        is returned, or, if there are multiple such synsets at the same depth,
+        they are all returned.
+
+        However, if `use_min_depth == True` then the synset(s) which has/have
+        the lowest minimum depth and appear(s) in both paths is/are returned.
+
+        By setting the use_min_depth flag to True, the behavior of NLTK2 can be
+        preserved. This was changed in NLTK3 to give more accurate results in a
+        small set of cases, generally with synsets concerning people. (eg:
+        'chef.n.01', 'fireman.n.01', etc.)
+
+        This method is an implementation of Ted Pedersen's "Lowest Common
+        Subsumer" method from the Perl Wordnet module. It can return either
+        "self" or "other" if they are a hypernym of the other.
+
+        :type other: Synset
+        :param other: other input synset
+        :type simulate_root: bool
+        :param simulate_root: The various verb taxonomies do not
+            share a single root which disallows this metric from working for
+            synsets that are not connected. This flag (False by default)
+            creates a fake root that connects all the taxonomies. Set it
+            to True to enable this behavior. For the noun taxonomy,
+            there is usually a default root except for WordNet version 1.6.
+            If you are using wordnet 1.6, a fake root will need to be added
+            for nouns as well.
+        :type use_min_depth: bool
+        :param use_min_depth: This setting mimics older (v2) behavior of NLTK
+            wordnet. If True, the min_depth function is used to calculate the
+            lowest common hypernyms. This is known to give strange results for
+            some synset pairs (eg: 'chef.n.01', 'fireman.n.01') but is retained
+            for backwards compatibility.
+        :return: The synsets that are the lowest common hypernyms of both
+            synsets
+        """
+        synsets = self.common_hypernyms(other)
+        if simulate_root:
+            fake_synset = Synset(None)
+            fake_synset._name = '*ROOT*'
+            fake_synset.hypernyms = lambda: []
+            fake_synset.instance_hypernyms = lambda: []
+            synsets.append(fake_synset)
+
+        try:
+            if use_min_depth:
+                max_depth = max(s.min_depth() for s in synsets)
+                unsorted_lch = [
+                    s for s in synsets if s.min_depth() == max_depth
+                ]
+            else:
+                max_depth = max(s.max_depth() for s in synsets)
+                unsorted_lch = [
+                    s for s in synsets if s.max_depth() == max_depth
+                ]
+            return sorted(unsorted_lch)
+        except ValueError:
+            return []
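+
+    # A hedged usage sketch (illustrative only; the result depends on the
+    # WordNet release and on the simulate_root/use_min_depth flags):
+    #
+    #   dog = wn.synset('dog.n.01')
+    #   cat = wn.synset('cat.n.01')
+    #   dog.lowest_common_hypernyms(cat)   # e.g. [Synset('carnivore.n.01')]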
+
+    def hypernym_distances(self, distance=0, simulate_root=False):
+        """
+        Get the path(s) from this synset to the root, counting the distance
+        of each node from the initial node on the way. A set of
+        (synset, distance) tuples is returned.
+
+        :type distance: int
+        :param distance: the distance (number of edges) from this hypernym to
+            the original hypernym ``Synset`` on which this method was called.
+        :return: A set of ``(Synset, int)`` tuples where each ``Synset`` is
+           a hypernym of the first ``Synset``.
+        """
+        distances = set([(self, distance)])
+        for hypernym in self._hypernyms() + self._instance_hypernyms():
+            distances |= hypernym.hypernym_distances(
+                distance+1,
+                simulate_root=False
+            )
+        if simulate_root:
+            fake_synset = Synset(None)
+            fake_synset._name = '*ROOT*'
+            fake_synset_distance = max(distances, key=itemgetter(1))[1]
+            distances.add((fake_synset, fake_synset_distance+1))
+        return distances
+
+    def _shortest_hypernym_paths(self, simulate_root):
+        if self._name == '*ROOT*':
+            return {self: 0}
+
+        queue = deque([(self, 0)])
+        path = {}
+
+        while queue:
+            s, depth = queue.popleft()
+            if s in path:
+                continue
+            path[s] = depth
+
+            depth += 1
+            queue.extend((hyp, depth) for hyp in s._hypernyms())
+            queue.extend((hyp, depth) for hyp in s._instance_hypernyms())
+
+        if simulate_root:
+            fake_synset = Synset(None)
+            fake_synset._name = '*ROOT*'
+            path[fake_synset] = max(path.values()) + 1
+
+        return path
+
+    def shortest_path_distance(self, other, simulate_root=False):
+        """
+        Returns the distance of the shortest path linking the two synsets (if
+        one exists). For each synset, all the ancestor nodes and their
+        distances are recorded and compared. The ancestor node common to both
+        synsets that can be reached with the minimum number of traversals is
+        used. If no ancestor nodes are common, None is returned. If a node is
+        compared with itself 0 is returned.
+
+        :type other: Synset
+        :param other: The Synset to which the shortest path will be found.
+        :return: The number of edges in the shortest path connecting the two
+            nodes, or None if no path exists.
+        """
+
+        if self == other:
+            return 0
+
+        dist_dict1 = self._shortest_hypernym_paths(simulate_root)
+        dist_dict2 = other._shortest_hypernym_paths(simulate_root)
+
+        # For each ancestor synset common to both subject synsets, find the
+        # connecting path length. Return the shortest of these.
+
+        inf = float('inf')
+        path_distance = inf
+        for synset, d1 in iteritems(dist_dict1):
+            d2 = dist_dict2.get(synset, inf)
+            path_distance = min(path_distance, d1 + d2)
+
+        return None if math.isinf(path_distance) else path_distance
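+
+    # A hedged usage sketch (the distance shown is illustrative and depends
+    # on the WordNet release):
+    #
+    #   dog = wn.synset('dog.n.01')
+    #   cat = wn.synset('cat.n.01')
+    #   dog.shortest_path_distance(cat)   # e.g. 4; None if no path exists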
+
+    def tree(self, rel, depth=-1, cut_mark=None):
+        """
+        >>> from nltk.corpus import wordnet as wn
+        >>> dog = wn.synset('dog.n.01')
+        >>> hyp = lambda s:s.hypernyms()
+        >>> from pprint import pprint
+        >>> pprint(dog.tree(hyp))
+        [Synset('dog.n.01'),
+         [Synset('canine.n.02'),
+          [Synset('carnivore.n.01'),
+           [Synset('placental.n.01'),
+            [Synset('mammal.n.01'),
+             [Synset('vertebrate.n.01'),
+              [Synset('chordate.n.01'),
+               [Synset('animal.n.01'),
+                [Synset('organism.n.01'),
+                 [Synset('living_thing.n.01'),
+                  [Synset('whole.n.02'),
+                   [Synset('object.n.01'),
+                    [Synset('physical_entity.n.01'),
+                     [Synset('entity.n.01')]]]]]]]]]]]]],
+         [Synset('domestic_animal.n.01'),
+          [Synset('animal.n.01'),
+           [Synset('organism.n.01'),
+            [Synset('living_thing.n.01'),
+             [Synset('whole.n.02'),
+              [Synset('object.n.01'),
+               [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]]
+        """
+
+        tree = [self]
+        if depth != 0:
+            tree += [x.tree(rel, depth-1, cut_mark) for x in rel(self)]
+        elif cut_mark:
+            tree += [cut_mark]
+        return tree
+
+    # interface to similarity methods
+    def path_similarity(self, other, verbose=False, simulate_root=True):
+        """
+        Path Distance Similarity:
+        Return a score denoting how similar two word senses are, based on the
+        shortest path that connects the senses in the is-a (hypernym/hyponym)
+        taxonomy. The score is in the range 0 to 1, except in those cases where
+        a path cannot be found (will only be true for verbs as there are many
+        distinct verb taxonomies), in which case None is returned. A score of
+        1 represents identity i.e. comparing a sense with itself will return 1.
+
+        :type other: Synset
+        :param other: The ``Synset`` that this ``Synset`` is being compared to.
+        :type simulate_root: bool
+        :param simulate_root: The various verb taxonomies do not
+            share a single root which disallows this metric from working for
+            synsets that are not connected. This flag (True by default)
+            creates a fake root that connects all the taxonomies. Set it
+            to false to disable this behavior. For the noun taxonomy,
+            there is usually a default root except for WordNet version 1.6.
+            If you are using wordnet 1.6, a fake root will be added for nouns
+            as well.
+        :return: A score denoting the similarity of the two ``Synset`` objects,
+            normally between 0 and 1. None is returned if no connecting path
+            could be found. 1 is returned if a ``Synset`` is compared with
+            itself.
+        """
+
+        distance = self.shortest_path_distance(
+            other,
+            simulate_root=simulate_root and self._needs_root()
+        )
+        if distance is None or distance < 0:
+            return None
+        return 1.0 / (distance + 1)
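+
+    # A hedged usage sketch tying the score to the computation above,
+    # 1.0 / (shortest_path_distance + 1); the value shown is illustrative:
+    #
+    #   dog = wn.synset('dog.n.01')
+    #   cat = wn.synset('cat.n.01')
+    #   dog.path_similarity(cat)   # e.g. 0.2 when the shortest path is 4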
+
+    def lch_similarity(self, other, verbose=False, simulate_root=True):
+        """
+        Leacock Chodorow Similarity:
+        Return a score denoting how similar two word senses are, based on the
+        shortest path that connects the senses (as above) and the maximum depth
+        of the taxonomy in which the senses occur. The relationship is given as
+        -log(p/2d) where p is the shortest path length and d is the taxonomy
+        depth.
+
+        :type  other: Synset
+        :param other: The ``Synset`` that this ``Synset`` is being compared to.
+        :type simulate_root: bool
+        :param simulate_root: The various verb taxonomies do not
+            share a single root which disallows this metric from working for
+            synsets that are not connected. This flag (True by default)
+            creates a fake root that connects all the taxonomies. Set it
+            to false to disable this behavior. For the noun taxonomy,
+            there is usually a default root except for WordNet version 1.6.
+            If you are using wordnet 1.6, a fake root will be added for nouns
+            as well.
+        :return: A score denoting the similarity of the two ``Synset`` objects,
+            normally greater than 0. None is returned if no connecting path
+            could be found. If a ``Synset`` is compared with itself, the
+            maximum score is returned, which varies depending on the taxonomy
+            depth.
+        """
+
+        if self._pos != other._pos:
+            raise WordNetError(
+                'Computing the lch similarity requires '
+                '%s and %s to have the same part of speech.' %
+                (self, other)
+            )
+
+        need_root = self._needs_root()
+
+        if self._pos not in self._wordnet_corpus_reader._max_depth:
+            self._wordnet_corpus_reader._compute_max_depth(
+                self._pos, need_root
+            )
+
+        depth = self._wordnet_corpus_reader._max_depth[self._pos]
+
+        distance = self.shortest_path_distance(
+            other,
+            simulate_root=simulate_root and need_root
+        )
+
+        if distance is None or distance < 0 or depth == 0:
+            return None
+        return -math.log((distance + 1) / (2.0 * depth))
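+
+    # A hedged usage sketch of the -log((distance + 1) / (2 * depth))
+    # computation above; the result varies with the WordNet release:
+    #
+    #   dog = wn.synset('dog.n.01')
+    #   cat = wn.synset('cat.n.01')
+    #   dog.lch_similarity(cat)   # a positive float, larger = more similar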
+
+    def wup_similarity(self, other, verbose=False, simulate_root=True):
+        """
+        Wu-Palmer Similarity:
+        Return a score denoting how similar two word senses are, based on the
+        depth of the two senses in the taxonomy and that of their Least Common
+        Subsumer (most specific ancestor node). Previously, the scores computed
+        by this implementation did _not_ always agree with those given by
+        Pedersen's Perl implementation of WordNet Similarity. However, with
+        the addition of the simulate_root flag (see below), the scores for
+        verbs now almost always agree, although they may still differ for
+        nouns.
+
+        The LCS does not necessarily feature in the shortest path connecting
+        the two senses, as it is by definition the common ancestor deepest in
+        the taxonomy, not closest to the two senses. Typically, however, it
+        will so feature. Where multiple candidates for the LCS exist, that
+        whose shortest path to the root node is the longest will be selected.
+        Where the LCS has multiple paths to the root, the longer path is used
+        for the purposes of the calculation.
+
+        :type  other: Synset
+        :param other: The ``Synset`` that this ``Synset`` is being compared to.
+        :type simulate_root: bool
+        :param simulate_root: The various verb taxonomies do not
+            share a single root which disallows this metric from working for
+            synsets that are not connected. This flag (True by default)
+            creates a fake root that connects all the taxonomies. Set it
+            to false to disable this behavior. For the noun taxonomy,
+            there is usually a default root except for WordNet version 1.6.
+            If you are using wordnet 1.6, a fake root will be added for nouns
+            as well.
+        :return: A float score denoting the similarity of the two ``Synset``
+            objects, normally greater than zero. If no connecting path between
+            the two senses can be found, None is returned.
+
+        """
+
+        need_root = self._needs_root()
+        # Note that to preserve behavior from NLTK2 we set use_min_depth=True
+        # It is possible that more accurate results could be obtained by
+        # removing this setting and it should be tested later on
+        subsumers = self.lowest_common_hypernyms(
+            other,
+            simulate_root=simulate_root and need_root, use_min_depth=True
+        )
+
+        # If no LCS was found return None
+        if len(subsumers) == 0:
+            return None
+
+        subsumer = self if self in subsumers else subsumers[0]
+
+        # Get the longest path from the LCS to the root,
+        # including a correction:
+        # - add one because the calculations include both the start and end
+        #   nodes
+        depth = subsumer.max_depth() + 1
+
+        # Note: No need for an additional add-one correction for non-nouns
+        # to account for an imaginary root node because that is now
+        # automatically handled by simulate_root
+        # if subsumer._pos != NOUN:
+        #     depth += 1
+
+        # Get the shortest path from the LCS to each of the synsets it is
+        # subsuming.  Add this to the LCS path length to get the path
+        # length from each synset to the root.
+        len1 = self.shortest_path_distance(
+            subsumer,
+            simulate_root=simulate_root and need_root
+        )
+        len2 = other.shortest_path_distance(
+            subsumer,
+            simulate_root=simulate_root and need_root
+        )
+        if len1 is None or len2 is None:
+            return None
+        len1 += depth
+        len2 += depth
+        return (2.0 * depth) / (len1 + len2)
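+
+    # A hedged usage sketch of the 2 * depth(LCS) / (len1 + len2) calculation
+    # performed above; the exact value depends on the WordNet release:
+    #
+    #   dog = wn.synset('dog.n.01')
+    #   cat = wn.synset('cat.n.01')
+    #   dog.wup_similarity(cat)   # a float in (0, 1], larger = more similar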
+
+    def res_similarity(self, other, ic, verbose=False):
+        """
+        Resnik Similarity:
+        Return a score denoting how similar two word senses are, based on the
+        Information Content (IC) of the Least Common Subsumer (most specific
+        ancestor node).
+
+        :type  other: Synset
+        :param other: The ``Synset`` that this ``Synset`` is being compared to.
+        :type ic: dict
+        :param ic: an information content object (as returned by
+            ``nltk.corpus.wordnet_ic.ic()``).
+        :return: A float score denoting the similarity of the two ``Synset``
+            objects. Synsets whose LCS is the root node of the taxonomy will
+            have a score of 0 (e.g. N['dog'][0] and N['table'][0]).
+        """
+
+        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
+        return lcs_ic
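+
+    # A hedged usage sketch (assumes the wordnet_ic corpus provides an
+    # information-content file named 'ic-brown.dat'):
+    #
+    #   from nltk.corpus import wordnet_ic
+    #   brown_ic = wordnet_ic.ic('ic-brown.dat')
+    #   wn.synset('dog.n.01').res_similarity(wn.synset('cat.n.01'), brown_ic)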
+
+    def jcn_similarity(self, other, ic, verbose=False):
+        """
+        Jiang-Conrath Similarity:
+        Return a score denoting how similar two word senses are, based on the
+        Information Content (IC) of the Least Common Subsumer (most specific
+        ancestor node) and that of the two input Synsets. The relationship is
+        given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).
+
+        :type  other: Synset
+        :param other: The ``Synset`` that this ``Synset`` is being compared to.
+        :type  ic: dict
+        :param ic: an information content object (as returned by
+            ``nltk.corpus.wordnet_ic.ic()``).
+        :return: A float score denoting the similarity of the two ``Synset``
+            objects.
+        """
+
+        if self == other:
+            return _INF
+
+        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
+
+        # If either of the input synsets are the root synset, or have a
+        # frequency of 0 (sparse data problem), return 0.
+        if ic1 == 0 or ic2 == 0:
+            return 0
+
+        ic_difference = ic1 + ic2 - 2 * lcs_ic
+
+        if ic_difference == 0:
+            return _INF
+
+        return 1 / ic_difference
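+
+    # A hedged usage sketch of 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)), reusing
+    # the brown_ic object from the res_similarity sketch above:
+    #
+    #   wn.synset('dog.n.01').jcn_similarity(wn.synset('cat.n.01'), brown_ic)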
+
+    def lin_similarity(self, other, ic, verbose=False):
+        """
+        Lin Similarity:
+        Return a score denoting how similar two word senses are, based on the
+        Information Content (IC) of the Least Common Subsumer (most specific
+        ancestor node) and that of the two input Synsets. The relationship is
+        given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).
+
+        :type other: Synset
+        :param other: The ``Synset`` that this ``Synset`` is being compared to.
+        :type ic: dict
+        :param ic: an information content object (as returned by
+            ``nltk.corpus.wordnet_ic.ic()``).
+        :return: A float score denoting the similarity of the two ``Synset``
+            objects, in the range 0 to 1.
+        """
+
+        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
+        return (2.0 * lcs_ic) / (ic1 + ic2)
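+
+    # A hedged usage sketch of 2 * IC(lcs) / (IC(s1) + IC(s2)), reusing the
+    # brown_ic object from the res_similarity sketch above; the score lies
+    # in the range 0 to 1:
+    #
+    #   wn.synset('dog.n.01').lin_similarity(wn.synset('cat.n.01'), brown_ic)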
+
+    def _iter_hypernym_lists(self):
+        """
+        :return: An iterator over lists of ``Synset`` objects that are
+        either proper hypernyms or instance hypernyms of the synset.
+        """
+        todo = [self]
+        seen = set()
+        while todo:
+            for synset in todo:
+                seen.add(synset)
+            yield todo
+            todo = [hypernym
+                    for synset in todo
+                    for hypernym in (
+                        synset.hypernyms() + synset.instance_hypernyms()
+                    )
+                    if hypernym not in seen]
+
+    def __repr__(self):
+        return "%s('%s')" % (type(self).__name__, self._name)
+
+    def _related(self, relation_symbol, sort=True):
+        get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset
+        pointer_tuples = self._pointers[relation_symbol]
+        r = [get_synset(pos, offset) for pos, offset in pointer_tuples]
+        if sort:
+            r.sort()
+        return r
+
+
+######################################################################
+# WordNet Corpus Reader
+######################################################################
+
+class WordNetCorpusReader(CorpusReader):
+    """
+    A corpus reader used to access wordnet or its variants.
+    """
+
+    _ENCODING = 'utf8'
+
+    # { Part-of-speech constants
+    ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
+    # }
+
+    # { Filename constants
+    _FILEMAP = {ADJ: 'adj', ADV: 'adv', NOUN: 'noun', VERB: 'verb'}
+    # }
+
+    # { Part of speech constants
+    _pos_numbers = {NOUN: 1, VERB: 2, ADJ: 3, ADV: 4, ADJ_SAT: 5}
+    _pos_names = dict(tup[::-1] for tup in _pos_numbers.items())
+    # }
+
+    #: A list of file identifiers for all the fileids used by this
+    #: corpus reader.
+    _FILES = ('cntlist.rev', 'lexnames', 'index.sense',
+              'index.adj', 'index.adv', 'index.noun', 'index.verb',
+              'data.adj', 'data.adv', 'data.noun', 'data.verb',
+              'adj.exc', 'adv.exc', 'noun.exc', 'verb.exc', )
+
+    def __init__(self, root, omw_reader):
+        """
+        Construct a new wordnet corpus reader, with the given root
+        directory.
+        """
+        super(WordNetCorpusReader, self).__init__(root, self._FILES,
+                                                  encoding=self._ENCODING)
+
+        # An index that provides the file offset
+        # Map from lemma -> pos -> synset_index -> offset
+        self._lemma_pos_offset_map = defaultdict(dict)
+
+        # A cache so we don't have to reconstruct synsets
+        # Map from pos -> offset -> synset
+        self._synset_offset_cache = defaultdict(dict)
+
+        # A lookup for the maximum depth of each part of speech.  Useful for
+        # the lch similarity metric.
+        self._max_depth = defaultdict(dict)
+
+        # Corpus reader containing omw data.
+        self._omw_reader = omw_reader
+
+        # A cache to store the wordnet data of multiple languages
+        self._lang_data = defaultdict(list)
+
+        self._data_file_map = {}
+        self._exception_map = {}
+        self._lexnames = []
+        self._key_count_file = None
+        self._key_synset_file = None
+
+        # Load the lexnames
+        for i, line in enumerate(self.open('lexnames')):
+            index, lexname, _ = line.split()
+            assert int(index) == i
+            self._lexnames.append(lexname)
+
+        # Load the indices for lemmas and synset offsets
+        self._load_lemma_pos_offset_map()
+
+        # load the exception file data into memory
+        self._load_exception_map()
+
+# Open Multilingual WordNet functions, contributed by
+# Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn
+
+    def of2ss(self, of):
+        ''' take an id (as returned by ss2of) and return the synset '''
+        return self.synset_from_pos_and_offset(of[-1], int(of[:8]))
+
+    def ss2of(self, ss):
+        ''' return the ID of the synset '''
+        return ("{:08d}-{}".format(ss.offset(), ss.pos()))
+
+    def _load_lang_data(self, lang):
+        ''' load the wordnet data of the requested language from the file to
+        the cache, _lang_data '''
+
+        if lang in self._lang_data.keys():
+            return
+
+        if lang not in self.langs():
+            raise WordNetError("Language is not supported.")
+
+        f = self._omw_reader.open('{0:}/wn-data-{0:}.tab'.format(lang))
+        self.custom_lemmas(f, lang)
+        f.close()
+
+    def langs(self):
+        ''' return a list of languages supported by Multilingual Wordnet '''
+        import os
+        langs = ['eng']
+        fileids = self._omw_reader.fileids()
+        for fileid in fileids:
+            file_name, file_extension = os.path.splitext(fileid)
+            if file_extension == '.tab':
+                langs.append(file_name.split('-')[-1])
+
+        return langs
+
+    def _load_lemma_pos_offset_map(self):
+        for suffix in self._FILEMAP.values():
+
+            # parse each line of the file (ignoring comment lines)
+            for i, line in enumerate(self.open('index.%s' % suffix)):
+                if line.startswith(' '):
+                    continue
+
+                _iter = iter(line.split())
+
+                def _next_token(): return next(_iter)
+
+                try:
+
+                    # get the lemma and part-of-speech
+                    lemma = _next_token()
+                    pos = _next_token()
+
+                    # get the number of synsets for this lemma
+                    n_synsets = int(_next_token())
+                    assert n_synsets > 0
+
+                    # get and ignore the pointer symbols for all synsets of
+                    # this lemma
+                    n_pointers = int(_next_token())
+                    [_next_token() for _ in range(n_pointers)]
+
+                    # same as number of synsets
+                    n_senses = int(_next_token())
+                    assert n_synsets == n_senses
+
+                    # get and ignore number of senses ranked according to
+                    # frequency
+                    _next_token()
+
+                    # get synset offsets
+                    synset_offsets = [
+                        int(_next_token()) for _ in range(n_synsets)
+                    ]
+
+                # raise more informative error with file name and line number
+                except (AssertionError, ValueError) as e:
+                    tup = ('index.%s' % suffix), (i + 1), e
+                    raise WordNetError('file %s, line %i: %s' % tup)
+
+                # map lemmas and parts of speech to synsets
+                self._lemma_pos_offset_map[lemma][pos] = synset_offsets
+                if pos == ADJ:
+                    self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets
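+
+    # For reference, a schematic index.<pos> line as consumed by the parser
+    # above (the field names are descriptive placeholders, not real data):
+    #
+    #   <lemma> <pos> <n_synsets> <n_pointers> <ptr_symbol>... <n_senses>
+    #       <n_tagged_senses> <synset_offset>...
+    #
+    # The pointer symbols and the tagged-sense count are read and discarded;
+    # only the lemma, pos and synset offsets are kept in the map.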
+
+    def _load_exception_map(self):
+        # load the exception file data into memory
+        for pos, suffix in self._FILEMAP.items():
+            self._exception_map[pos] = {}
+            for line in self.open('%s.exc' % suffix):
+                terms = line.split()
+                self._exception_map[pos][terms[0]] = terms[1:]
+        self._exception_map[ADJ_SAT] = self._exception_map[ADJ]
+
+    def _compute_max_depth(self, pos, simulate_root):
+        """
+        Compute the max depth for the given part of speech.  This is
+        used by the lch similarity metric.
+        """
+        depth = 0
+        for ii in self.all_synsets(pos):
+            try:
+                depth = max(depth, ii.max_depth())
+            except RuntimeError:
+                print(ii)
+        if simulate_root:
+            depth += 1
+        self._max_depth[pos] = depth
+
+    def get_version(self):
+        fh = self._data_file(ADJ)
+        for line in fh:
+            match = re.search(r'WordNet (\d+\.\d+) Copyright', line)
+            if match is not None:
+                version = match.group(1)
+                fh.seek(0)
+                return version
+
+    #############################################################
+    # Loading Lemmas
+    #############################################################
+
+    def lemma(self, name, lang='eng'):
+        '''Return lemma object that matches the name'''
+        # cannot simply split on first '.',
+        # e.g.: '.45_caliber.a.01..45_caliber'
+        separator = SENSENUM_RE.search(name).start()
+
+        leadingZero = int(name[separator+1]) == 0
+        if leadingZero:
+            synset_name, lemma_name = name[:separator+3], name[separator+4:]
+        else:
+            synset_name, lemma_name = name[:separator+2], name[separator+3:]
+
+        synset = self.synset(synset_name)
+        for lemma in synset.lemmas(lang):
+            if lemma._name == lemma_name:
+                return lemma
+        raise WordNetError('no lemma %r in %r' % (lemma_name, synset_name))
+
+    def lemma_from_key(self, key):
+        # Sense keys are stored lower-case and matched case-sensitively,
+        # so normalize the key before lookup.
+        key = key.lower()
+
+        lemma_name, lex_sense = key.split('%')
+        pos_number, lexname_index, lex_id, _, _ = lex_sense.split(':')
+        pos = self._pos_names[int(pos_number)]
+
+        # open the key -> synset file if necessary
+        if self._key_synset_file is None:
+            self._key_synset_file = self.open('index.sense')
+
+        # Find the synset for the lemma.
+        synset_line = _binary_search_file(self._key_synset_file, key)
+        if not synset_line:
+            raise WordNetError("No synset found for key %r" % key)
+        offset = int(synset_line.split()[1])
+        synset = self.synset_from_pos_and_offset(pos, offset)
+
+        # return the corresponding lemma
+        for lemma in synset._lemmas:
+            if lemma._key == key:
+                return lemma
+        raise WordNetError("No lemma found for for key %r" % key)
+
+    #############################################################
+    # Loading Synsets
+    #############################################################
+    def synset(self, name):
+        # split name into lemma, part of speech and synset number
+        lemma, pos, synset_index_str = name.lower().rsplit('.', 2)
+        synset_index = int(synset_index_str) - 1
+
+        # get the offset for this synset
+        try:
+            offset = self._lemma_pos_offset_map[lemma][pos][synset_index]
+        except KeyError:
+            message = 'no lemma %r with part of speech %r'
+            raise WordNetError(message % (lemma, pos))
+        except IndexError:
+            n_senses = len(self._lemma_pos_offset_map[lemma][pos])
+            message = "lemma %r with part of speech %r has only %i %s"
+            if n_senses == 1:
+                tup = lemma, pos, n_senses, "sense"
+            else:
+                tup = lemma, pos, n_senses, "senses"
+            raise WordNetError(message % tup)
+
+        # load synset information from the appropriate file
+        synset = self.synset_from_pos_and_offset(pos, offset)
+
+        # some basic sanity checks on loaded attributes
+        if pos == 's' and synset._pos == 'a':
+            message = ('adjective satellite requested but only plain '
+                       'adjective found for lemma %r')
+            raise WordNetError(message % lemma)
+        assert synset._pos == pos or (pos == 'a' and synset._pos == 's')
+
+        # Return the synset object.
+        return synset
+
+    def _data_file(self, pos):
+        """
+        Return an open file pointer for the data file for the given
+        part of speech.
+        """
+        if pos == ADJ_SAT:
+            pos = ADJ
+        if self._data_file_map.get(pos) is None:
+            fileid = 'data.%s' % self._FILEMAP[pos]
+            self._data_file_map[pos] = self.open(fileid)
+        return self._data_file_map[pos]
+
+    def synset_from_pos_and_offset(self, pos, offset):
+        # Check to see if the synset is in the cache
+        if offset in self._synset_offset_cache[pos]:
+            return self._synset_offset_cache[pos][offset]
+
+        data_file = self._data_file(pos)
+        data_file.seek(offset)
+        data_file_line = data_file.readline()
+        synset = self._synset_from_pos_and_line(pos, data_file_line)
+        assert synset._offset == offset
+        self._synset_offset_cache[pos][offset] = synset
+        return synset
+
+    @deprecated('Use public method synset_from_pos_and_offset() instead')
+    def _synset_from_pos_and_offset(self, *args, **kwargs):
+        """
+        Hack to help people like the readers of
+        http://stackoverflow.com/a/27145655/1709587
+        who were using this function before it was officially a public method
+        """
+        return self.synset_from_pos_and_offset(*args, **kwargs)
+
+    def _synset_from_pos_and_line(self, pos, data_file_line):
+        # Construct a new (empty) synset.
+        synset = Synset(self)
+
+        # parse the entry for this synset
+        try:
+
+            # parse out the definitions and examples from the gloss
+            columns_str, gloss = data_file_line.split('|')
+            gloss = gloss.strip()
+            definitions = []
+            for gloss_part in gloss.split(';'):
+                gloss_part = gloss_part.strip()
+                if gloss_part.startswith('"'):
+                    synset._examples.append(gloss_part.strip('"'))
+                else:
+                    definitions.append(gloss_part)
+            synset._definition = '; '.join(definitions)
+
+            # split the other info into fields
+            _iter = iter(columns_str.split())
+
+            def _next_token(): return next(_iter)
+
+            # get the offset
+            synset._offset = int(_next_token())
+
+            # determine the lexicographer file name
+            lexname_index = int(_next_token())
+            synset._lexname = self._lexnames[lexname_index]
+
+            # get the part of speech
+            synset._pos = _next_token()
+
+            # create Lemma objects for each lemma
+            n_lemmas = int(_next_token(), 16)
+            for _ in range(n_lemmas):
+                # get the lemma name
+                lemma_name = _next_token()
+                # get the lex_id (used for sense_keys)
+                lex_id = int(_next_token(), 16)
+                # If the lemma has a syntactic marker, extract it.
+                m = re.match(r'(.*?)(\(.*\))?$', lemma_name)
+                lemma_name, syn_mark = m.groups()
+                # create the lemma object
+                lemma = Lemma(self, synset, lemma_name, lexname_index,
+                              lex_id, syn_mark)
+                synset._lemmas.append(lemma)
+                synset._lemma_names.append(lemma._name)
+
+            # collect the pointer tuples
+            n_pointers = int(_next_token())
+            for _ in range(n_pointers):
+                symbol = _next_token()
+                offset = int(_next_token())
+                pos = _next_token()
+                lemma_ids_str = _next_token()
+                if lemma_ids_str == '0000':
+                    synset._pointers[symbol].add((pos, offset))
+                else:
+                    source_index = int(lemma_ids_str[:2], 16) - 1
+                    target_index = int(lemma_ids_str[2:], 16) - 1
+                    source_lemma_name = synset._lemmas[source_index]._name
+                    lemma_pointers = synset._lemma_pointers
+                    tups = lemma_pointers[source_lemma_name, symbol]
+                    tups.add((pos, offset, target_index))
+
+            # read the verb frames
+            try:
+                frame_count = int(_next_token())
+            except StopIteration:
+                pass
+            else:
+                for _ in range(frame_count):
+                    # read the plus sign
+                    plus = _next_token()
+                    assert plus == '+'
+                    # read the frame and lemma number
+                    frame_number = int(_next_token())
+                    frame_string_fmt = VERB_FRAME_STRINGS[frame_number]
+                    lemma_number = int(_next_token(), 16)
+                    # lemma number of 00 means all words in the synset
+                    if lemma_number == 0:
+                        synset._frame_ids.append(frame_number)
+                        for lemma in synset._lemmas:
+                            lemma._frame_ids.append(frame_number)
+                            lemma._frame_strings.append(
+                                frame_string_fmt % lemma._name
+                            )
+                    # only a specific word in the synset
+                    else:
+                        lemma = synset._lemmas[lemma_number - 1]
+                        lemma._frame_ids.append(frame_number)
+                        lemma._frame_strings.append(
+                            frame_string_fmt % lemma._name
+                        )
+
+        # raise a more informative error with line text
+        except ValueError as e:
+            raise WordNetError('line %r: %s' % (data_file_line, e))
+
+        # set sense keys for Lemma objects - note that this has to be
+        # done afterwards so that the relations are available
+        for lemma in synset._lemmas:
+            if synset._pos == ADJ_SAT:
+                head_lemma = synset.similar_tos()[0]._lemmas[0]
+                head_name = head_lemma._name
+                head_id = '%02d' % head_lemma._lex_id
+            else:
+                head_name = head_id = ''
+            tup = (lemma._name, WordNetCorpusReader._pos_numbers[synset._pos],
+                   lemma._lexname_index, lemma._lex_id, head_name, head_id)
+            lemma._key = ('%s%%%d:%02d:%02d:%s:%s' % tup).lower()
+
+        # the canonical name is based on the first lemma
+        lemma_name = synset._lemmas[0]._name.lower()
+        offsets = self._lemma_pos_offset_map[lemma_name][synset._pos]
+        sense_index = offsets.index(synset._offset)
+        tup = lemma_name, synset._pos, sense_index + 1
+        synset._name = '%s.%s.%02i' % tup
+
+        return synset
+
+    #############################################################
+    # Retrieve synsets and lemmas.
+    #############################################################
+
+    def synsets(self, lemma, pos=None, lang='eng', check_exceptions=True):
+        """Load all synsets with a given lemma and part of speech tag.
+        If no pos is specified, all synsets for all parts of speech
+        will be loaded.
+        If lang is specified, all the synsets associated with the lemma name
+        of that language will be returned.
+        """
+        lemma = lemma.lower()
+
+        if lang == 'eng':
+            get_synset = self.synset_from_pos_and_offset
+            index = self._lemma_pos_offset_map
+            if pos is None:
+                pos = POS_LIST
+            return [get_synset(p, offset)
+                    for p in pos
+                    for form in self._morphy(lemma, p, check_exceptions)
+                    for offset in index[form].get(p, [])]
+
+        else:
+            self._load_lang_data(lang)
+            synset_list = []
+            for l in self._lang_data[lang][1][lemma]:
+                if pos is not None and l[-1] != pos:
+                    continue
+                synset_list.append(self.of2ss(l))
+            return synset_list
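+
+    # Hedged usage sketches (the lang='ita' call assumes the Open
+    # Multilingual Wordnet data for Italian is installed alongside this
+    # corpus):
+    #
+    #   wn.synsets('dog')                # all parts of speech
+    #   wn.synsets('dog', pos=wn.VERB)   # only verb synsets
+    #   wn.synsets('cane', lang='ita')   # via OMW lemma names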
+
+    def lemmas(self, lemma, pos=None, lang='eng'):
+        """Return all Lemma objects with a name matching the specified lemma
+        name and part of speech tag. Matches any part of speech tag if none is
+        specified."""
+
+        lemma = lemma.lower()
+        if lang == 'eng':
+            return [lemma_obj
+                    for synset in self.synsets(lemma, pos)
+                    for lemma_obj in synset.lemmas()
+                    if lemma_obj.name().lower() == lemma]
+
+        else:
+            self._load_lang_data(lang)
+            lemmas = []
+            syn = self.synsets(lemma, lang=lang)
+            for s in syn:
+                if pos is not None and s.pos() != pos:
+                    continue
+                for lemma_obj in s.lemmas(lang=lang):
+                    if lemma_obj.name().lower() == lemma:
+                        lemmas.append(lemma_obj)
+            return lemmas
+
+    def all_lemma_names(self, pos=None, lang='eng'):
+        """Return all lemma names for all synsets for the given
+        part of speech tag and language or languages. If pos is
+        not specified, all synsets for all parts of speech will
+        be used."""
+
+        if lang == 'eng':
+            if pos is None:
+                return iter(self._lemma_pos_offset_map)
+            else:
+                return (
+                    lemma for lemma in self._lemma_pos_offset_map
+                    if pos in self._lemma_pos_offset_map[lemma]
+                )
+        else:
+            self._load_lang_data(lang)
+            lemma = []
+            for i in self._lang_data[lang][0]:
+                if pos is not None and i[-1] != pos:
+                    continue
+                lemma.extend(self._lang_data[lang][0][i])
+
+            lemma = list(set(lemma))
+            return lemma
+
+    def all_synsets(self, pos=None):
+        """Iterate over all synsets with a given part of speech tag.
+        If no pos is specified, all synsets for all parts of speech
+        will be loaded.
+        """
+        if pos is None:
+            pos_tags = self._FILEMAP.keys()
+        else:
+            pos_tags = [pos]
+
+        cache = self._synset_offset_cache
+        from_pos_and_line = self._synset_from_pos_and_line
+
+        # generate all synsets for each part of speech
+        for pos_tag in pos_tags:
+            # Open the file for reading.  Note that we cannot re-use
+            # the file pointers from self._data_file_map here, because
+            # we're defining an iterator, and those file pointers might
+            # be moved while we're not looking.
+            if pos_tag == ADJ_SAT:
+                pos_tag = ADJ
+            fileid = 'data.%s' % self._FILEMAP[pos_tag]
+            data_file = self.open(fileid)
+
+            try:
+                # generate synsets for each line in the POS file
+                offset = data_file.tell()
+                line = data_file.readline()
+                while line:
+                    if not line[0].isspace():
+                        if offset in cache[pos_tag]:
+                            # See if the synset is cached
+                            synset = cache[pos_tag][offset]
+                        else:
+                            # Otherwise, parse the line
+                            synset = from_pos_and_line(pos_tag, line)
+                            cache[pos_tag][offset] = synset
+
+                        # adjective satellites are in the same file as
+                        # adjectives so only yield the synset if it's actually
+                        # a satellite
+                        if synset._pos == ADJ_SAT:
+                            yield synset
+
+                        # for all other POS tags, yield all synsets (this means
+                        # that adjectives also include adjective satellites)
+                        else:
+                            yield synset
+                    offset = data_file.tell()
+                    line = data_file.readline()
+
+            # close the extra file handle we opened
+            except:
+                data_file.close()
+                raise
+            else:
+                data_file.close()
+
+    def words(self, lang='eng'):
+        """return lemmas of the given language as list of words"""
+        return self.all_lemma_names(lang=lang)
+
+    def license(self, lang='eng'):
+        """Return the contents of LICENSE (for omw)
+           use lang=lang to get the license for an individual language"""
+        if lang == 'eng':
+            return self.open("LICENSE").read()
+        elif lang in self.langs():
+            return self._omw_reader.open("{}/LICENSE".format(lang)).read()
+        elif lang == 'omw':
+            # under the assumption you don't mean Omwunra-Toqura
+            return self._omw_reader.open("LICENSE").read()
+        elif lang in self._lang_data:
+            raise WordNetError(
+                "Cannot determine license for user-provided tab file"
+            )
+        else:
+            raise WordNetError("Language is not supported.")
+
+    def readme(self, lang='omw'):
+        """Return the contents of README (for omw)
+           use lang=lang to get the readme for an individual language"""
+        if lang == 'eng':
+            return self.open("README").read()
+        elif lang in self.langs():
+            return self._omw_reader.open("{}/README".format(lang)).read()
+        elif lang == 'omw':
+            # under the assumption you don't mean Omwunra-Toqura
+            return self._omw_reader.open("README").read()
+        elif lang in self._lang_data:
+            raise WordNetError("No README for user-provided tab file")
+        else:
+            raise WordNetError("Language is not supported.")
+
+    def citation(self, lang='omw'):
+        """Return the contents of citation.bib file (for omw)
+           use lang=lang to get the citation for an individual language"""
+        if lang == 'eng':
+            return self.open("citation.bib").read()
+        elif lang in self.langs():
+            return self._omw_reader.open("{}/citation.bib".format(lang)).read()
+        elif lang == 'omw':
+            # under the assumption you don't mean Omwunra-Toqura
+            return self._omw_reader.open("citation.bib").read()
+        elif lang in self._lang_data:
+            raise WordNetError("citation not known for user-provided tab file")
+        else:
+            raise WordNetError("Language is not supported.")
+
+    #############################################################
+    # Misc
+    #############################################################
+    def lemma_count(self, lemma):
+        """Return the frequency count for this Lemma"""
+        # Currently, counts are only available for English
+        if lemma._lang != 'eng':
+            return 0
+        # open the count file if we haven't already
+        if self._key_count_file is None:
+            self._key_count_file = self.open('cntlist.rev')
+        # find the key in the counts file and return the count
+        line = _binary_search_file(self._key_count_file, lemma._key)
+        if line:
+            return int(line.rsplit(' ', 1)[-1])
+        else:
+            return 0
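+
+    # Illustrative usage (a sketch; assumes the standard English WordNet data
+    # and its 'cntlist.rev' counts file are installed):
+    #   >>> dog = wn.lemma('dog.n.01.dog')        # doctest: +SKIP
+    #   >>> wn.lemma_count(dog)                   # doctest: +SKIP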
+
+    def path_similarity(
+        self, synset1, synset2, verbose=False, simulate_root=True
+    ):
+        return synset1.path_similarity(synset2, verbose, simulate_root)
+    path_similarity.__doc__ = Synset.path_similarity.__doc__
+
+    def lch_similarity(
+        self, synset1, synset2, verbose=False, simulate_root=True
+    ):
+        return synset1.lch_similarity(synset2, verbose, simulate_root)
+    lch_similarity.__doc__ = Synset.lch_similarity.__doc__
+
+    def wup_similarity(
+        self, synset1, synset2, verbose=False, simulate_root=True
+    ):
+        return synset1.wup_similarity(synset2, verbose, simulate_root)
+    wup_similarity.__doc__ = Synset.wup_similarity.__doc__
+
+    def res_similarity(self, synset1, synset2, ic, verbose=False):
+        return synset1.res_similarity(synset2, ic, verbose)
+    res_similarity.__doc__ = Synset.res_similarity.__doc__
+
+    def jcn_similarity(self, synset1, synset2, ic, verbose=False):
+        return synset1.jcn_similarity(synset2, ic, verbose)
+    jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__
+
+    def lin_similarity(self, synset1, synset2, ic, verbose=False):
+        return synset1.lin_similarity(synset2, ic, verbose)
+    lin_similarity.__doc__ = Synset.lin_similarity.__doc__
+
+    #############################################################
+    # Morphy
+    #############################################################
+    # Morphy, adapted from Oliver Steele's pywordnet
+    def morphy(self, form, pos=None, check_exceptions=True):
+        """
+        Find a possible base form for the given form, with the given
+        part of speech, by checking WordNet's list of exceptional
+        forms, and by recursively stripping affixes for this part of
+        speech until a form in WordNet is found.
+
+        >>> from nltk.corpus import wordnet as wn
+        >>> print(wn.morphy('dogs'))
+        dog
+        >>> print(wn.morphy('churches'))
+        church
+        >>> print(wn.morphy('aardwolves'))
+        aardwolf
+        >>> print(wn.morphy('abaci'))
+        abacus
+        >>> wn.morphy('hardrock', wn.ADV)
+        >>> print(wn.morphy('book', wn.NOUN))
+        book
+        >>> wn.morphy('book', wn.ADJ)
+        """
+
+        if pos is None:
+            morphy = self._morphy
+            analyses = chain(a for p in POS_LIST for a in morphy(form, p))
+        else:
+            analyses = self._morphy(form, pos, check_exceptions)
+
+        # get the first one we find
+        first = list(islice(analyses, 1))
+        if len(first) == 1:
+            return first[0]
+        else:
+            return None
+
+    MORPHOLOGICAL_SUBSTITUTIONS = {
+        NOUN: [('s', ''), ('ses', 's'), ('ves', 'f'), ('xes', 'x'),
+               ('zes', 'z'), ('ches', 'ch'), ('shes', 'sh'),
+               ('men', 'man'), ('ies', 'y')],
+        VERB: [('s', ''), ('ies', 'y'), ('es', 'e'), ('es', ''),
+               ('ed', 'e'), ('ed', ''), ('ing', 'e'), ('ing', '')],
+        ADJ: [('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')],
+        ADV: []}
+
+    MORPHOLOGICAL_SUBSTITUTIONS[ADJ_SAT] = MORPHOLOGICAL_SUBSTITUTIONS[ADJ]
+
+    def _morphy(self, form, pos, check_exceptions=True):
+        # from jordanbg:
+        # Given an original string x
+        # 1. Apply rules once to the input to get y1, y2, y3, etc.
+        # 2. Return all that are in the database
+        # 3. If there are no matches, keep applying rules until you either
+        #    find a match or you can't go any further
+
+        exceptions = self._exception_map[pos]
+        substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos]
+
+        def apply_rules(forms):
+            return [form[:-len(old)] + new
+                    for form in forms
+                    for old, new in substitutions
+                    if form.endswith(old)]
+
+        def filter_forms(forms):
+            result = []
+            seen = set()
+            for form in forms:
+                if form in self._lemma_pos_offset_map:
+                    if pos in self._lemma_pos_offset_map[form]:
+                        if form not in seen:
+                            result.append(form)
+                            seen.add(form)
+            return result
+
+        # 0. Check the exception lists
+        if check_exceptions:
+            if form in exceptions:
+                return filter_forms([form] + exceptions[form])
+
+        # 1. Apply rules once to the input to get y1, y2, y3, etc.
+        forms = apply_rules([form])
+
+        # 2. Return all that are in the database (and check the original too)
+        results = filter_forms([form] + forms)
+        if results:
+            return results
+
+        # 3. If there are no matches, keep applying rules until we find a match
+        while forms:
+            forms = apply_rules(forms)
+            results = filter_forms(forms)
+            if results:
+                return results
+
+        # Return an empty list if we can't find anything
+        return []
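+
+    # Worked example (assuming the standard English WordNet data, and that the
+    # word is not in the noun exception list): for _morphy('churches', NOUN),
+    # apply_rules yields 'churche' (via 's' -> '') and 'church' (via
+    # 'ches' -> 'ch'); only 'church' appears in the noun lemma index, so
+    # ['church'] is returned at step 2.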
+
+    #############################################################
+    # Create information content from corpus
+    #############################################################
+    def ic(self, corpus, weight_senses_equally=False, smoothing=1.0):
+        """
+        Creates an information content lookup dictionary from a corpus.
+
+        :type corpus: CorpusReader
+        :param corpus: The corpus from which we create an information
+        content dictionary.
+        :type weight_senses_equally: bool
+        :param weight_senses_equally: If this is True, gives all
+        possible senses equal weight rather than dividing by the
+        number of possible senses.  (If a word has 3 synsets, each
+        sense gets 0.3333 per appearance when this is False, 1.0 when
+        it is True.)
+        :param smoothing: How much do we smooth synset counts (default is 1.0)
+        :type smoothing: float
+        :return: An information content dictionary
+        """
+        counts = FreqDist()
+        for ww in corpus.words():
+            counts[ww] += 1
+
+        ic = {}
+        for pp in POS_LIST:
+            ic[pp] = defaultdict(float)
+
+        # Initialize the counts with the smoothing value
+        if smoothing > 0.0:
+            for ss in self.all_synsets():
+                pos = ss._pos
+                if pos == ADJ_SAT:
+                    pos = ADJ
+                ic[pos][ss._offset] = smoothing
+
+        for ww in counts:
+            possible_synsets = self.synsets(ww)
+            if len(possible_synsets) == 0:
+                continue
+
+            # Distribute weight among possible synsets
+            weight = float(counts[ww])
+            if not weight_senses_equally:
+                weight /= float(len(possible_synsets))
+
+            for ss in possible_synsets:
+                pos = ss._pos
+                if pos == ADJ_SAT:
+                    pos = ADJ
+                for level in ss._iter_hypernym_lists():
+                    for hh in level:
+                        ic[pos][hh._offset] += weight
+                # Add the weight to the root
+                ic[pos][0] += weight
+        return ic
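+
+    # Illustrative usage (a sketch; assumes the Brown corpus and the WordNet
+    # data are installed; building the table over a full corpus can take a
+    # while):
+    #   >>> from nltk.corpus import brown, wordnet as wn          # doctest: +SKIP
+    #   >>> brown_ic = wn.ic(brown)                               # doctest: +SKIP
+    #   >>> wn.res_similarity(wn.synset('dog.n.01'),
+    #   ...                   wn.synset('cat.n.01'), brown_ic)    # doctest: +SKIP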
+
+    def custom_lemmas(self, tab_file, lang):
+        """
+        Reads a custom tab file containing mappings of lemmas in the given
+        language to Princeton WordNet 3.0 synset offsets, allowing NLTK's
+        WordNet functions to then be used with that language.
+
+        See the "Tab files" section at http://compling.hss.ntu.edu.sg/omw/ for
+        documentation on the Multilingual WordNet tab file format.
+
+        :param tab_file: Tab file as a file or file-like object
+        :type lang: str
+        :param lang: ISO 639-3 code of the language of the tab file
+        """
+        if len(lang) != 3:
+            raise ValueError('lang should be a (3 character) ISO 639-3 code')
+        self._lang_data[lang] = [defaultdict(list), defaultdict(list)]
+        for l in tab_file.readlines():
+            if isinstance(l, bytes):
+                # Support byte-stream files (e.g. as returned by Python 2's
+                # open() function) as well as text-stream ones
+                l = l.decode('utf-8')
+            l = l.replace('\n', '')
+            l = l.replace(' ', '_')
+            if l[0] != '#':
+                word = l.split('\t')
+                self._lang_data[lang][0][word[0]].append(word[2])
+                self._lang_data[lang][1][word[2].lower()].append(word[0])
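+
+    # Illustrative usage (a sketch; the language code 'qqq', the file name
+    # 'qqq.tab' and the lemma 'hund' are hypothetical):
+    #   >>> with open('qqq.tab', encoding='utf-8') as fin:        # doctest: +SKIP
+    #   ...     wn.custom_lemmas(fin, lang='qqq')
+    #   >>> wn.synsets('hund', lang='qqq')                        # doctest: +SKIP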
+
+
+######################################################################
+# WordNet Information Content Corpus Reader
+######################################################################
+
+class WordNetICCorpusReader(CorpusReader):
+    """
+    A corpus reader for the WordNet information content corpus.
+    """
+
+    def __init__(self, root, fileids):
+        CorpusReader.__init__(self, root, fileids, encoding='utf8')
+
+    # this load function would be more efficient if the data was pickled
+    # Note that we can't use NLTK's frequency distributions because
+    # synsets are overlapping (each instance of a synset also counts
+    # as an instance of its hypernyms)
+    def ic(self, icfile):
+        """
+        Load an information content file from the wordnet_ic corpus
+        and return a dictionary.  This dictionary has just two keys,
+        NOUN and VERB, whose values are dictionaries that map from
+        synsets to information content values.
+
+        :type icfile: str
+        :param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat")
+        :return: An information content dictionary
+        """
+        ic = {}
+        ic[NOUN] = defaultdict(float)
+        ic[VERB] = defaultdict(float)
+        for num, line in enumerate(self.open(icfile)):
+            if num == 0:  # skip the header
+                continue
+            fields = line.split()
+            offset = int(fields[0][:-1])
+            value = float(fields[1])
+            pos = _get_pos(fields[0])
+            if len(fields) == 3 and fields[2] == "ROOT":
+                # Store root count.
+                ic[pos][0] += value
+            if value != 0:
+                ic[pos][offset] = value
+        return ic
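+
+    # Illustrative usage (a sketch; assumes the optional 'wordnet_ic' data
+    # package has been downloaded):
+    #   >>> from nltk.corpus import wordnet_ic                    # doctest: +SKIP
+    #   >>> brown_ic = wordnet_ic.ic('ic-brown.dat')              # doctest: +SKIP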
+
+
+######################################################################
+# Similarity metrics
+######################################################################
+
+# TODO: Add in the option to manually add a new root node; this will be
+# useful for verb similarity as there exist multiple verb taxonomies.
+
+# More information about the metrics is available at
+# http://marimba.d.umn.edu/similarity/measures.html
+
+def path_similarity(synset1, synset2, verbose=False, simulate_root=True):
+    return synset1.path_similarity(synset2, verbose, simulate_root)
+
+
+def lch_similarity(synset1, synset2, verbose=False, simulate_root=True):
+    return synset1.lch_similarity(synset2, verbose, simulate_root)
+
+
+def wup_similarity(synset1, synset2, verbose=False, simulate_root=True):
+    return synset1.wup_similarity(synset2, verbose, simulate_root)
+
+
+def res_similarity(synset1, synset2, ic, verbose=False):
+    return synset1.res_similarity(synset2, ic, verbose)
+
+
+def jcn_similarity(synset1, synset2, ic, verbose=False):
+    return synset1.jcn_similarity(synset2, ic, verbose)
+
+
+def lin_similarity(synset1, synset2, ic, verbose=False):
+    return synset1.lin_similarity(synset2, ic, verbose)
+
+
+path_similarity.__doc__ = Synset.path_similarity.__doc__
+lch_similarity.__doc__ = Synset.lch_similarity.__doc__
+wup_similarity.__doc__ = Synset.wup_similarity.__doc__
+res_similarity.__doc__ = Synset.res_similarity.__doc__
+jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__
+lin_similarity.__doc__ = Synset.lin_similarity.__doc__
+
+
+def _lcs_ic(synset1, synset2, ic, verbose=False):
+    """
+    Get the information content of the least common subsumer that has
+    the highest information content value.  If two nodes have no
+    explicit common subsumer, assume that they share an artificial
+    root node that is the hypernym of all explicit roots.
+
+    :type synset1: Synset
+    :param synset1: First input synset.
+    :type synset2: Synset
+    :param synset2: Second input synset.  Must be the same part of
+    speech as the first synset.
+    :type  ic: dict
+    :param ic: an information content object (as returned by ``load_ic()``).
+    :return: The information content of the two synsets and their most
+    informative subsumer
+    """
+    if synset1._pos != synset2._pos:
+        raise WordNetError(
+            'Computing the least common subsumer requires '
+            '%s and %s to have the same part of speech.' %
+            (synset1, synset2)
+        )
+
+    ic1 = information_content(synset1, ic)
+    ic2 = information_content(synset2, ic)
+    subsumers = synset1.common_hypernyms(synset2)
+    if len(subsumers) == 0:
+        subsumer_ic = 0
+    else:
+        subsumer_ic = max(information_content(s, ic) for s in subsumers)
+
+    if verbose:
+        print("> LCS Subsumer by content:", subsumer_ic)
+
+    return ic1, ic2, subsumer_ic
+
+
+# Utility functions
+
+def information_content(synset, ic):
+    try:
+        icpos = ic[synset._pos]
+    except KeyError:
+        msg = 'Information content file has no entries for part-of-speech: %s'
+        raise WordNetError(msg % synset._pos)
+
+    counts = icpos[synset._offset]
+    if counts == 0:
+        return _INF
+    else:
+        return -math.log(counts / icpos[0])
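+
+# For example, with the natural log: if a noun synset's smoothed count is 10
+# and the noun root count icpos[0] is 1000, its information content is
+# -log(10/1000) = log(100), roughly 4.61.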
+
+
+# get the part of speech (NOUN or VERB) from the information content record
+# (each identifier has a 'n' or 'v' suffix)
+
+def _get_pos(field):
+    if field[-1] == 'n':
+        return NOUN
+    elif field[-1] == 'v':
+        return VERB
+    else:
+        msg = (
+            "Unidentified part of speech in WordNet Information Content file "
+            "for field %s" % field
+        )
+        raise ValueError(msg)
+
+
+# unload corpus after tests
+def teardown_module(module=None):
+    from nltk.corpus import wordnet
+    wordnet._unload()
+
diff --git a/nlp_resource_data/nltk/corpus/reader/wordnet.pyc b/nlp_resource_data/nltk/corpus/reader/wordnet.pyc
new file mode 100755 (executable)
index 0000000..5351bbe
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/wordnet.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/xmldocs.py b/nlp_resource_data/nltk/corpus/reader/xmldocs.py
new file mode 100755 (executable)
index 0000000..295e91e
--- /dev/null
@@ -0,0 +1,392 @@
+# Natural Language Toolkit: XML Corpus Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Corpus reader for corpora whose documents are xml files.
+
+(Note: not named 'xml', to avoid conflicting with the standard xml package.)
+"""
+from __future__ import print_function, unicode_literals
+
+import codecs
+
+# Use the c version of ElementTree, which is faster, if possible:
+try: from xml.etree import cElementTree as ElementTree
+except ImportError: from xml.etree import ElementTree
+
+from six import string_types
+
+from nltk.data import SeekableUnicodeStreamReader
+from nltk.tokenize import WordPunctTokenizer
+from nltk.internals import ElementWrapper
+
+from nltk.corpus.reader.api import CorpusReader
+from nltk.corpus.reader.util import *
+
+class XMLCorpusReader(CorpusReader):
+    """
+    Corpus reader for corpora whose documents are xml files.
+
+    Note that the ``XMLCorpusReader`` constructor does not take an
+    ``encoding`` argument, because the unicode encoding is specified by
+    the XML files themselves.  See the XML specs for more info.
+    """
+    def __init__(self, root, fileids, wrap_etree=False):
+        self._wrap_etree = wrap_etree
+        CorpusReader.__init__(self, root, fileids)
+
+    def xml(self, fileid=None):
+        # Make sure we have exactly one file -- no concatenating XML.
+        if fileid is None and len(self._fileids) == 1:
+            fileid = self._fileids[0]
+        if not isinstance(fileid, string_types):
+            raise TypeError('Expected a single file identifier string')
+        # Read the XML in using ElementTree.
+        elt = ElementTree.parse(self.abspath(fileid).open()).getroot()
+        # If requested, wrap it.
+        if self._wrap_etree:
+            elt = ElementWrapper(elt)
+        # Return the ElementTree element.
+        return elt
+
+    def words(self, fileid=None):
+        """
+        Returns all of the words and punctuation symbols in the specified file
+        that were in text nodes -- i.e., tags are ignored. Like the xml() method,
+        fileid can only specify one file.
+
+        :return: the given file's text nodes as a list of words and punctuation symbols
+        :rtype: list(str)
+        """
+
+        elt = self.xml(fileid)
+        encoding = self.encoding(fileid)
+        word_tokenizer = WordPunctTokenizer()
+        iterator = elt.getiterator()
+        out = []
+
+        for node in iterator:
+            text = node.text
+            if text is not None:
+                if isinstance(text, bytes):
+                    text = text.decode(encoding)
+                toks = word_tokenizer.tokenize(text)
+                out.extend(toks)
+        return out
+
+    def raw(self, fileids=None):
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
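+
+    # Illustrative usage (a sketch; assumes an XML corpus such as the
+    # 'shakespeare' sample from the NLTK data package is installed):
+    #   >>> from nltk.corpus import shakespeare                   # doctest: +SKIP
+    #   >>> shakespeare.words('dream.xml')[:5]                    # doctest: +SKIP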
+
+
+class XMLCorpusView(StreamBackedCorpusView):
+    """
+    A corpus view that selects out specified elements from an XML
+    file, and provides a flat list-like interface for accessing them.
+    (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
+    but may be used by subclasses of ``XMLCorpusReader``.)
+
+    Every XML corpus view has a "tag specification", indicating what
+    XML elements should be included in the view; and each (non-nested)
+    element that matches this specification corresponds to one item in
+    the view.  Tag specifications are regular expressions over tag
+    paths, where a tag path is a list of element tag names, separated
+    by '/', indicating the ancestry of the element.  Some examples:
+
+      - ``'foo'``: A top-level element whose tag is ``foo``.
+      - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
+        is a top-level element whose tag is ``foo``.
+      - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
+        in the xml tree.
+      - ``'.*/(foo|bar)'``: An element whose tag is ``foo`` or ``bar``,
+        appearing anywhere in the xml tree.
+
+    The view items are generated from the selected XML elements via
+    the method ``handle_elt()``.  By default, this method returns the
+    element as-is (i.e., as an ElementTree object); but it can be
+    overridden, either via subclassing or via the ``elt_handler``
+    constructor parameter.
+    """
+
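+    # Illustrative sketch (the file name 'plays.xml' and the <play> element
+    # are hypothetical; items are selected by the '.*/play' tag specification):
+    #   >>> view = XMLCorpusView('plays.xml', '.*/play')          # doctest: +SKIP
+    #   >>> view[0].tag                                           # doctest: +SKIP
+    #   'play'
+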
+    #: If true, then display debugging output to stdout when reading
+    #: blocks.
+    _DEBUG = False
+
+    #: The number of characters read at a time by this corpus reader.
+    _BLOCK_SIZE = 1024
+
+    def __init__(self, fileid, tagspec, elt_handler=None):
+        """
+        Create a new corpus view based on a specified XML file.
+
+        Note that the ``XMLCorpusView`` constructor does not take an
+        ``encoding`` argument, because the unicode encoding is
+        specified by the XML files themselves.
+
+        :type tagspec: str
+        :param tagspec: A tag specification, indicating what XML
+            elements should be included in the view.  Each non-nested
+            element that matches this specification corresponds to one
+            item in the view.
+
+        :param elt_handler: A function used to transform each element
+            to a value for the view.  If no handler is specified, then
+            ``self.handle_elt()`` is called, which returns the element
+            as an ElementTree object.  The signature of elt_handler is::
+
+                elt_handler(elt, tagspec) -> value
+        """
+        if elt_handler: self.handle_elt = elt_handler
+
+        self._tagspec = re.compile(tagspec+r'\Z')
+        """The tag specification for this corpus view."""
+
+        self._tag_context = {0: ()}
+        """A dictionary mapping from file positions (as returned by
+           ``stream.seek()`` to XML contexts.  An XML context is a
+           tuple of XML tag names, indicating which tags have not yet
+           been closed."""
+
+        encoding = self._detect_encoding(fileid)
+        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
+
+    def _detect_encoding(self, fileid):
+        if isinstance(fileid, PathPointer):
+            try:
+                infile = fileid.open()
+                s = infile.readline()
+            finally:
+                infile.close()
+        else:
+            with open(fileid, 'rb') as infile:
+                s = infile.readline()
+        if s.startswith(codecs.BOM_UTF16_BE):
+            return 'utf-16-be'
+        if s.startswith(codecs.BOM_UTF16_LE):
+            return 'utf-16-le'
+        if s.startswith(codecs.BOM_UTF32_BE):
+            return 'utf-32-be'
+        if s.startswith(codecs.BOM_UTF32_LE):
+            return 'utf-32-le'
+        if s.startswith(codecs.BOM_UTF8):
+            return 'utf-8'
+        m = re.match(br'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
+        if m:
+            return m.group(1).decode()
+        m = re.match(br"\s*<\?xml\b.*\bencoding='([^']+)'", s)
+        if m:
+            return m.group(1).decode()
+        # No encoding found -- what should the default be?
+        return 'utf-8'
+
+    def handle_elt(self, elt, context):
+        """
+        Convert an element into an appropriate value for inclusion in
+        the view.  Unless overridden by a subclass or by the
+        ``elt_handler`` constructor argument, this method simply
+        returns ``elt``.
+
+        :return: The view value corresponding to ``elt``.
+
+        :type elt: ElementTree
+        :param elt: The element that should be converted.
+
+        :type context: str
+        :param context: A string composed of element tags separated by
+            forward slashes, indicating the XML context of the given
+            element.  For example, the string ``'foo/bar/baz'``
+            indicates that the element is a ``baz`` element whose
+            parent is a ``bar`` element and whose grandparent is a
+            top-level ``foo`` element.
+        """
+        return elt
+
+    #: A regular expression that matches XML fragments that do not
+    #: contain any un-closed tags.
+    _VALID_XML_RE = re.compile(r"""
+        [^<]*
+        (
+          ((<!--.*?-->)                         |  # comment
+           (<![CDATA[.*?]])                     |  # raw character data
+           (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
+           (<[^!>][^>]*>))                         # tag or PI
+          [^<]*)*
+        \Z""",
+        re.DOTALL|re.VERBOSE)
+
+    #: A regular expression used to extract the tag name from a start tag,
+    #: end tag, or empty-elt tag string.
+    _XML_TAG_NAME = re.compile('<\s*/?\s*([^\s>]+)')
+
+    #: A regular expression used to find all start-tags, end-tags, and
+    #: empty-elt tags in an XML file.  This regexp is more lenient than
+    #: the XML spec -- e.g., it allows spaces in some places where the
+    #: spec does not.
+    _XML_PIECE = re.compile(r"""
+        # Include these so we can skip them:
+        (?P<COMMENT>        <!--.*?-->                          )|
+        (?P<CDATA>          <![CDATA[.*?]]>                     )|
+        (?P<PI>             <\?.*?\?>                           )|
+        (?P<DOCTYPE>        <!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>)|
+        # These are the ones we actually care about:
+        (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
+        (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
+        (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )""",
+        re.DOTALL|re.VERBOSE)
+
+    def _read_xml_fragment(self, stream):
+        """
+        Read a string from the given stream that does not contain any
+        un-closed tags.  In particular, this function first reads a
+        block from the stream of size ``self._BLOCK_SIZE``.  It then
+        checks if that block contains an un-closed tag.  If it does,
+        then this function either backtracks to the last '<', or reads
+        another block.
+        """
+        fragment = ''
+
+        if isinstance(stream, SeekableUnicodeStreamReader):
+            startpos = stream.tell()
+        while True:
+            # Read a block and add it to the fragment.
+            xml_block = stream.read(self._BLOCK_SIZE)
+            fragment += xml_block
+
+            # Do we have a well-formed xml fragment?
+            if self._VALID_XML_RE.match(fragment):
+                return fragment
+
+            # Do we have a fragment that will never be well-formed?
+            if re.search('[<>]', fragment).group(0) == '>':
+                pos = stream.tell() - (
+                    len(fragment)-re.search('[<>]', fragment).end())
+                raise ValueError('Unexpected ">" near char %s' % pos)
+
+            # End of file?
+            if not xml_block:
+                raise ValueError('Unexpected end of file: tag not closed')
+
+            # If not, then we must be in the middle of a <..tag..>.
+            # If appropriate, backtrack to the most recent '<'
+            # character.
+            last_open_bracket = fragment.rfind('<')
+            if last_open_bracket > 0:
+                if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
+                    if isinstance(stream, SeekableUnicodeStreamReader):
+                        stream.seek(startpos)
+                        stream.char_seek_forward(last_open_bracket)
+                    else:
+                        stream.seek(-(len(fragment)-last_open_bracket), 1)
+                    return fragment[:last_open_bracket]
+
+            # Otherwise, read another block. (i.e., return to the
+            # top of the loop.)
+
+    def read_block(self, stream, tagspec=None, elt_handler=None):
+        """
+        Read from ``stream`` until we find at least one element that
+        matches ``tagspec``, and return the result of applying
+        ``elt_handler`` to each element found.
+        """
+        if tagspec is None: tagspec = self._tagspec
+        if elt_handler is None: elt_handler = self.handle_elt
+
+        # Use a stack of strings to keep track of our context:
+        context = list(self._tag_context.get(stream.tell()))
+        assert context is not None # check this -- could it ever happen?
+
+        elts = []
+
+        elt_start = None # where does the elt start
+        elt_depth = None # what context depth
+        elt_text = ''
+
+        while elts==[] or elt_start is not None:
+            if isinstance(stream, SeekableUnicodeStreamReader):
+                startpos = stream.tell()
+            xml_fragment = self._read_xml_fragment(stream)
+
+            # End of file.
+            if not xml_fragment:
+                if elt_start is None: break
+                else: raise ValueError('Unexpected end of file')
+
+            # Process each <tag> in the xml fragment.
+            for piece in self._XML_PIECE.finditer(xml_fragment):
+                if self._DEBUG:
+                    print('%25s %s' % ('/'.join(context)[-20:], piece.group()))
+
+                if piece.group('START_TAG'):
+                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
+                    # Keep context up-to-date.
+                    context.append(name)
+                    # Is this one of the elts we're looking for?
+                    if elt_start is None:
+                        if re.match(tagspec, '/'.join(context)):
+                            elt_start = piece.start()
+                            elt_depth = len(context)
+
+                elif piece.group('END_TAG'):
+                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
+                    # sanity checks:
+                    if not context:
+                        raise ValueError('Unmatched tag </%s>' % name)
+                    if name != context[-1]:
+                        raise ValueError('Unmatched tag <%s>...</%s>' %
+                                         (context[-1], name))
+                    # Is this the end of an element?
+                    if elt_start is not None and elt_depth == len(context):
+                        elt_text += xml_fragment[elt_start:piece.end()]
+                        elts.append( (elt_text, '/'.join(context)) )
+                        elt_start = elt_depth = None
+                        elt_text = ''
+                    # Keep context up-to-date
+                    context.pop()
+
+                elif piece.group('EMPTY_ELT_TAG'):
+                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
+                    if elt_start is None:
+                        if re.match(tagspec, '/'.join(context)+'/'+name):
+                            elts.append((piece.group(),
+                                         '/'.join(context)+'/'+name))
+
+            if elt_start is not None:
+                # If we haven't found any elements yet, then keep
+                # looping until we do.
+                if elts == []:
+                    elt_text += xml_fragment[elt_start:]
+                    elt_start = 0
+
+                # If we've found at least one element, then try
+                # backtracking to the start of the element that we're
+                # inside of.
+                else:
+                    # take back the last start-tag, and return what
+                    # we've gotten so far (elts is non-empty).
+                    if self._DEBUG:
+                        print(' '*36+'(backtrack)')
+                    if isinstance(stream, SeekableUnicodeStreamReader):
+                        stream.seek(startpos)
+                        stream.char_seek_forward(elt_start)
+                    else:
+                        stream.seek(-(len(xml_fragment)-elt_start), 1)
+                    context = context[:elt_depth-1]
+                    elt_start = elt_depth = None
+                    elt_text = ''
+
+        # Update the _tag_context dict.
+        pos = stream.tell()
+        if pos in self._tag_context:
+            assert tuple(context) == self._tag_context[pos]
+        else:
+            self._tag_context[pos] = tuple(context)
+
+        return [elt_handler(ElementTree.fromstring(
+                                  elt.encode('ascii', 'xmlcharrefreplace')),
+                            context)
+                for (elt, context) in elts]
diff --git a/nlp_resource_data/nltk/corpus/reader/xmldocs.pyc b/nlp_resource_data/nltk/corpus/reader/xmldocs.pyc
new file mode 100755 (executable)
index 0000000..7ac3910
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/xmldocs.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/ycoe.py b/nlp_resource_data/nltk/corpus/reader/ycoe.py
new file mode 100755 (executable)
index 0000000..a8870b1
--- /dev/null
@@ -0,0 +1,243 @@
+# -*- coding: iso-8859-1 -*-
+
+# Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE)
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Selina Dennis <selina@tranzfusion.net>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
+English Prose (YCOE), a 1.5 million word syntactically-annotated
+corpus of Old English prose texts. The corpus is distributed by the
+Oxford Text Archive (http://www.ota.ahds.ac.uk/); it is not included
+with NLTK.
+
+The YCOE corpus is divided into 100 files, each representing
+an Old English prose text. The tags used within each text comply
+with the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
+"""
+
+import os
+import re
+
+from six import string_types
+
+from nltk.tokenize import RegexpTokenizer
+from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
+from nltk.corpus.reader.tagged import TaggedCorpusReader
+
+from nltk.corpus.reader.util import *
+from nltk.corpus.reader.api import *
+
+class YCOECorpusReader(CorpusReader):
+    """
+    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
+    English Prose (YCOE), a 1.5 million word syntactically-annotated
+    corpus of Old English prose texts.
+    """
+    def __init__(self, root, encoding='utf8'):
+        CorpusReader.__init__(self, root, [], encoding)
+
+        self._psd_reader = YCOEParseCorpusReader(
+            self.root.join('psd'), '.*', '.psd', encoding=encoding)
+        self._pos_reader = YCOETaggedCorpusReader(
+            self.root.join('pos'), '.*', '.pos')
+
+        # Make sure we have a consistent set of items:
+        documents = set(f[:-4] for f in self._psd_reader.fileids())
+        if set(f[:-4] for f in self._pos_reader.fileids()) != documents:
+            raise ValueError('Items in "psd" and "pos" '
+                             'subdirectories do not match.')
+
+        fileids = sorted(['%s.psd' % doc for doc in documents] +
+                       ['%s.pos' % doc for doc in documents])
+        CorpusReader.__init__(self, root, fileids, encoding)
+        self._documents = sorted(documents)
+
+    def documents(self, fileids=None):
+        """
+        Return a list of document identifiers for all documents in
+        this corpus, or for the documents with the given file(s) if
+        specified.
+        """
+        if fileids is None:
+            return self._documents
+        if isinstance(fileids, string_types):
+            fileids = [fileids]
+        for f in fileids:
+            if f not in self._fileids:
+                raise KeyError('File id %s not found' % f)
+        # Strip off the '.pos' and '.psd' extensions.
+        return sorted(set(f[:-4] for f in fileids))
+
+    def fileids(self, documents=None):
+        """
+        Return a list of file identifiers for the files that make up
+        this corpus, or that store the given document(s) if specified.
+        """
+        if documents is None:
+            return self._fileids
+        elif isinstance(documents, string_types):
+            documents = [documents]
+        return sorted(set(['%s.pos' % doc for doc in documents] +
+                          ['%s.psd' % doc for doc in documents]))
+
+    def _getfileids(self, documents, subcorpus):
+        """
+        Helper that selects the appropriate fileids for a given set of
+        documents from a given subcorpus (pos or psd).
+        """
+        if documents is None:
+            documents = self._documents
+        else:
+            if isinstance(documents, string_types):
+                documents = [documents]
+            for document in documents:
+                if document not in self._documents:
+                    if document[-4:] in ('.pos', '.psd'):
+                        raise ValueError(
+                            'Expected a document identifier, not a file '
+                            'identifier.  (Use corpus.documents() to get '
+                            'a list of document identifiers.)')
+                    else:
+                        raise ValueError('Document identifier %s not found'
+                                         % document)
+        return ['%s.%s' % (d, subcorpus) for d in documents]
+
+    # Delegate to one of our two sub-readers:
+    def words(self, documents=None):
+        return self._pos_reader.words(self._getfileids(documents, 'pos'))
+    def sents(self, documents=None):
+        return self._pos_reader.sents(self._getfileids(documents, 'pos'))
+    def paras(self, documents=None):
+        return self._pos_reader.paras(self._getfileids(documents, 'pos'))
+    def tagged_words(self, documents=None):
+        return self._pos_reader.tagged_words(self._getfileids(documents, 'pos'))
+    def tagged_sents(self, documents=None):
+        return self._pos_reader.tagged_sents(self._getfileids(documents, 'pos'))
+    def tagged_paras(self, documents=None):
+        return self._pos_reader.tagged_paras(self._getfileids(documents, 'pos'))
+    def parsed_sents(self, documents=None):
+        return self._psd_reader.parsed_sents(self._getfileids(documents, 'psd'))
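+
+    # Illustrative usage (a sketch; the YCOE corpus is licensed separately and
+    # must be installed manually under an nltk_data 'corpora/ycoe' directory):
+    #   >>> from nltk.corpus import ycoe                          # doctest: +SKIP
+    #   >>> ycoe.documents()[:2]                                  # doctest: +SKIP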
+
+
+class YCOEParseCorpusReader(BracketParseCorpusReader):
+    """Specialized version of the standard bracket parse corpus reader
+    that strips out (CODE ...) and (ID ...) nodes."""
+    def _parse(self, t):
+        t = re.sub(r'(?u)\((CODE|ID)[^\)]*\)', '', t)
+        if re.match(r'\s*\(\s*\)\s*$', t): return None
+        return BracketParseCorpusReader._parse(self, t)
+
+class YCOETaggedCorpusReader(TaggedCorpusReader):
+    def __init__(self, root, items, encoding='utf8'):
+        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
+        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
+        TaggedCorpusReader.__init__(self, root, items, sep='_',
+                                    sent_tokenizer=sent_tokenizer)
+
+#: A list of all documents and their titles in ycoe.
+documents = {
+    'coadrian.o34': 'Adrian and Ritheus',
+    'coaelhom.o3': 'Ælfric, Supplemental Homilies',
+    'coaelive.o3': 'Ælfric\'s Lives of Saints',
+    'coalcuin': 'Alcuin De virtutibus et vitiis',
+    'coalex.o23': 'Alexander\'s Letter to Aristotle',
+    'coapollo.o3': 'Apollonius of Tyre',
+    'coaugust': 'Augustine',
+    'cobede.o2': 'Bede\'s History of the English Church',
+    'cobenrul.o3': 'Benedictine Rule',
+    'coblick.o23': 'Blickling Homilies',
+    'coboeth.o2': 'Boethius\' Consolation of Philosophy',
+    'cobyrhtf.o3': 'Byrhtferth\'s Manual',
+    'cocanedgD': 'Canons of Edgar (D)',
+    'cocanedgX': 'Canons of Edgar (X)',
+    'cocathom1.o3': 'Ælfric\'s Catholic Homilies I',
+    'cocathom2.o3': 'Ælfric\'s Catholic Homilies II',
+    'cochad.o24': 'Saint Chad',
+    'cochdrul': 'Chrodegang of Metz, Rule',
+    'cochristoph': 'Saint Christopher',
+    'cochronA.o23': 'Anglo-Saxon Chronicle A',
+    'cochronC': 'Anglo-Saxon Chronicle C',
+    'cochronD': 'Anglo-Saxon Chronicle D',
+    'cochronE.o34': 'Anglo-Saxon Chronicle E',
+    'cocura.o2': 'Cura Pastoralis',
+    'cocuraC': 'Cura Pastoralis (Cotton)',
+    'codicts.o34': 'Dicts of Cato',
+    'codocu1.o1': 'Documents 1 (O1)',
+    'codocu2.o12': 'Documents 2 (O1/O2)',
+    'codocu2.o2': 'Documents 2 (O2)',
+    'codocu3.o23': 'Documents 3 (O2/O3)',
+    'codocu3.o3': 'Documents 3 (O3)',
+    'codocu4.o24': 'Documents 4 (O2/O4)',
+    'coeluc1': 'Honorius of Autun, Elucidarium 1',
+    'coeluc2': 'Honorius of Autun, Elucidarium 2',
+    'coepigen.o3': 'Ælfric\'s Epilogue to Genesis',
+    'coeuphr': 'Saint Euphrosyne',
+    'coeust': 'Saint Eustace and his companions',
+    'coexodusP': 'Exodus (P)',
+    'cogenesiC': 'Genesis (C)',
+    'cogregdC.o24': 'Gregory\'s Dialogues (C)',
+    'cogregdH.o23': 'Gregory\'s Dialogues (H)',
+    'coherbar': 'Pseudo-Apuleius, Herbarium',
+    'coinspolD.o34': 'Wulfstan\'s Institute of Polity (D)',
+    'coinspolX': 'Wulfstan\'s Institute of Polity (X)',
+    'cojames': 'Saint James',
+    'colacnu.o23': 'Lacnunga',
+    'colaece.o2': 'Leechdoms',
+    'colaw1cn.o3': 'Laws, Cnut I',
+    'colaw2cn.o3': 'Laws, Cnut II',
+    'colaw5atr.o3': 'Laws, Æthelred V',
+    'colaw6atr.o3': 'Laws, Æthelred VI',
+    'colawaf.o2': 'Laws, Alfred',
+    'colawafint.o2': 'Alfred\'s Introduction to Laws',
+    'colawger.o34': 'Laws, Gerefa',
+    'colawine.ox2': 'Laws, Ine',
+    'colawnorthu.o3': 'Northumbra Preosta Lagu',
+    'colawwllad.o4': 'Laws, William I, Lad',
+    'coleofri.o4': 'Leofric',
+    'colsigef.o3': 'Ælfric\'s Letter to Sigefyrth',
+    'colsigewB': 'Ælfric\'s Letter to Sigeweard (B)',
+    'colsigewZ.o34': 'Ælfric\'s Letter to Sigeweard (Z)',
+    'colwgeat': 'Ælfric\'s Letter to Wulfgeat',
+    'colwsigeT': 'Ælfric\'s Letter to Wulfsige (T)',
+    'colwsigeXa.o34': 'Ælfric\'s Letter to Wulfsige (Xa)',
+    'colwstan1.o3': 'Ælfric\'s Letter to Wulfstan I',
+    'colwstan2.o3': 'Ælfric\'s Letter to Wulfstan II',
+    'comargaC.o34': 'Saint Margaret (C)',
+    'comargaT': 'Saint Margaret (T)',
+    'comart1': 'Martyrology, I',
+    'comart2': 'Martyrology, II',
+    'comart3.o23': 'Martyrology, III',
+    'comarvel.o23': 'Marvels of the East',
+    'comary': 'Mary of Egypt',
+    'coneot': 'Saint Neot',
+    'conicodA': 'Gospel of Nicodemus (A)',
+    'conicodC': 'Gospel of Nicodemus (C)',
+    'conicodD': 'Gospel of Nicodemus (D)',
+    'conicodE': 'Gospel of Nicodemus (E)',
+    'coorosiu.o2': 'Orosius',
+    'cootest.o3': 'Heptateuch',
+    'coprefcath1.o3': 'Ælfric\'s Preface to Catholic Homilies I',
+    'coprefcath2.o3': 'Ælfric\'s Preface to Catholic Homilies II',
+    'coprefcura.o2': 'Preface to the Cura Pastoralis',
+    'coprefgen.o3': 'Ælfric\'s Preface to Genesis',
+    'copreflives.o3': 'Ælfric\'s Preface to Lives of Saints',
+    'coprefsolilo': 'Preface to Augustine\'s Soliloquies',
+    'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus',
+    'corood': 'History of the Holy Rood-Tree',
+    'cosevensl': 'Seven Sleepers',
+    'cosolilo': 'St. Augustine\'s Soliloquies',
+    'cosolsat1.o4': 'Solomon and Saturn I',
+    'cosolsat2': 'Solomon and Saturn II',
+    'cotempo.o3': 'Ælfric\'s De Temporibus Anni',
+    'coverhom': 'Vercelli Homilies',
+    'coverhomE': 'Vercelli Homilies (E)',
+    'coverhomL': 'Vercelli Homilies (L)',
+    'covinceB': 'Saint Vincent (Bodley 343)',
+    'covinsal': 'Vindicta Salvatoris',
+    'cowsgosp.o3': 'West-Saxon Gospels',
+    'cowulf.o34': 'Wulfstan\'s Homilies'
+    }
diff --git a/nlp_resource_data/nltk/corpus/reader/ycoe.pyc b/nlp_resource_data/nltk/corpus/reader/ycoe.pyc
new file mode 100755 (executable)
index 0000000..0572c2e
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/reader/ycoe.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/util.py b/nlp_resource_data/nltk/corpus/util.py
new file mode 100755 (executable)
index 0000000..d23c561
--- /dev/null
@@ -0,0 +1,144 @@
+# Natural Language Toolkit: Corpus Reader Utility Functions
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+######################################################################
+#{ Lazy Corpus Loader
+######################################################################
+
+from __future__ import unicode_literals
+import re
+import gc
+import nltk
+from nltk.compat import python_2_unicode_compatible
+
+TRY_ZIPFILE_FIRST = False
+
+@python_2_unicode_compatible
+class LazyCorpusLoader(object):
+    """
+    To see the API documentation for this lazily loaded corpus, first
+    run corpus.ensure_loaded(), and then run help(this_corpus).
+    
+    LazyCorpusLoader is a proxy object which is used to stand in for a
+    corpus object before the corpus is loaded.  This allows NLTK to
+    create an object for each corpus, but defer the costs associated
+    with loading those corpora until the first time that they're
+    actually accessed.
+
+    The first time this object is accessed in any way, it will load
+    the corresponding corpus, and transform itself into that corpus
+    (by modifying its own ``__class__`` and ``__dict__`` attributes).
+
+    If the corpus can not be found, then accessing this object will
+    raise an exception, displaying installation instructions for the
+    NLTK data package.  Once they've properly installed the data
+    package (or modified ``nltk.data.path`` to point to its location),
+    they can then use the corpus object without restarting python.
+    
+    :param name: The name of the corpus
+    :type name: str
+    :param reader_cls: The specific CorpusReader class, e.g. PlaintextCorpusReader, WordListCorpusReader
+    :type reader_cls: nltk.corpus.reader.api.CorpusReader
+    :param nltk_data_subdir: The subdirectory where the corpus is stored.
+    :type nltk_data_subdir: str
+    :param args: Any other non-keyword arguments that `reader_cls` might need.
+    :param kwargs: Any other keyword arguments that `reader_cls` might need.
+    """
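+    # Illustrative sketch of how a corpus is declared with this loader (this
+    # mirrors the pattern used in nltk.corpus; the exact arguments shown are
+    # an example, not a guarantee of the current declarations there):
+    #   gutenberg = LazyCorpusLoader(
+    #       'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1')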
+    def __init__(self, name, reader_cls, *args, **kwargs):
+        from nltk.corpus.reader.api import CorpusReader
+        assert issubclass(reader_cls, CorpusReader)
+        self.__name = self.__name__ = name
+        self.__reader_cls = reader_cls
+        # If nltk_data_subdir is set explicitly 
+        if 'nltk_data_subdir' in kwargs:
+            # Use the specified subdirectory path
+            self.subdir = kwargs['nltk_data_subdir']
+            # Pop the `nltk_data_subdir` argument; we don't need it anymore.
+            kwargs.pop('nltk_data_subdir', None)
+        else: # Otherwise use 'nltk_data/corpora'
+            self.subdir = 'corpora'
+        self.__args = args
+        self.__kwargs = kwargs
+
+    def __load(self):
+        # Find the corpus root directory.
+        zip_name = re.sub(r'(([^/]*)(/.*)?)', r'\2.zip/\1/', self.__name)
+        if TRY_ZIPFILE_FIRST:
+            try:
+                root = nltk.data.find('{}/{}'.format(self.subdir, zip_name))
+            except LookupError as e:
+                try: root = nltk.data.find('{}/{}'.format(self.subdir, self.__name))
+                except LookupError: raise e
+        else:
+            try:
+                root = nltk.data.find('{}/{}'.format(self.subdir, self.__name))
+            except LookupError as e:
+                try: root = nltk.data.find('{}/{}'.format(self.subdir, zip_name))
+                except LookupError: raise e
+
+        # Load the corpus.
+        corpus = self.__reader_cls(root, *self.__args, **self.__kwargs)
+
+        # This is where the magic happens!  Transform ourselves into
+        # the corpus by modifying our own __dict__ and __class__ to
+        # match that of the corpus.
+
+        args, kwargs  = self.__args, self.__kwargs
+        name, reader_cls = self.__name, self.__reader_cls
+
+        self.__dict__ = corpus.__dict__
+        self.__class__ = corpus.__class__
+
+        # _unload support: assign __dict__ and __class__ back, then do GC.
+        # after reassigning __dict__ there shouldn't be any references to
+        # corpus data so the memory should be deallocated after gc.collect()
+        def _unload(self):
+            lazy_reader = LazyCorpusLoader(name, reader_cls, *args, **kwargs)
+            self.__dict__ = lazy_reader.__dict__
+            self.__class__ = lazy_reader.__class__
+            gc.collect()
+
+        self._unload = _make_bound_method(_unload, self)
+
+    def __getattr__(self, attr):
+
+        # Fix for inspect.isclass under Python 2.6
+        # (see http://bugs.python.org/issue1225107).
+        # Without this fix tests may take extra 1.5GB RAM
+        # because all corpora get loaded during test collection.
+        if attr == '__bases__':
+            raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'")
+
+        self.__load()
+        # This looks circular, but it's not, since __load() changes our
+        # __class__ to something new:
+        return getattr(self, attr)
+
+    def __repr__(self):
+        return '<%s in %r (not loaded yet)>' % (
+            self.__reader_cls.__name__, '.../corpora/'+self.__name)
+
+    def _unload(self):
+        # If an exception occurs during corpus loading then
+        # '_unload' method may be unattached, so __getattr__ can be called;
+        # we shouldn't trigger corpus loading again in this case.
+        pass
+
+
+def _make_bound_method(func, self):
+    """
+    Magic for creating bound methods (used for _unload).
+    """
+    class Foo(object):
+        def meth(self): pass
+    f = Foo()
+    bound_method = type(f.meth)
+
+    try:
+        return bound_method(func, self, self.__class__)
+    except TypeError: # python3
+        return bound_method(func, self)
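+
+# For example, _make_bound_method(func, obj) returns a bound method whose call
+# is equivalent to func(obj); this is how the generated _unload closure above
+# is attached to the corpus instance after it has been transformed.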
diff --git a/nlp_resource_data/nltk/corpus/util.pyc b/nlp_resource_data/nltk/corpus/util.pyc
new file mode 100755 (executable)
index 0000000..7915a85
Binary files /dev/null and b/nlp_resource_data/nltk/corpus/util.pyc differ
diff --git a/nlp_resource_data/nltk/data.py b/nlp_resource_data/nltk/data.py
new file mode 100755 (executable)
index 0000000..3295bb8
--- /dev/null
@@ -0,0 +1,1485 @@
+# Natural Language Toolkit: Utility functions
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Functions to find and load NLTK resource files, such as corpora,
+grammars, and saved processing objects.  Resource files are identified
+using URLs, such as ``nltk:corpora/abc/rural.txt`` or
+``http://nltk.org/sample/toy.cfg``.  The following URL protocols are
+supported:
+
+  - ``file:path``: Specifies the file whose path is *path*.
+    Both relative and absolute paths may be used.
+
+  - ``http://host/path``: Specifies the file stored on the web
+    server *host* at path *path*.
+
+  - ``nltk:path``: Specifies the file stored in the NLTK data
+    package at *path*.  NLTK will search for these files in the
+    directories specified by ``nltk.data.path``.
+
+If no protocol is specified, then the default protocol ``nltk:`` will
+be used.
+
+This module provides two functions that can be used to access a
+resource file, given its URL: ``load()`` loads a given resource, and
+adds it to a resource cache; and ``retrieve()`` copies a given resource
+to a local file.
+"""
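+
+# Illustrative usage (a sketch; uses the example resource named in the
+# docstring above and assumes the 'abc' corpus from the NLTK data package is
+# installed):
+#   >>> import nltk.data
+#   >>> text = nltk.data.load('nltk:corpora/abc/rural.txt')       # doctest: +SKIP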
+from __future__ import print_function, unicode_literals
+from __future__ import division
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
+import functools
+import textwrap
+import io
+import os
+import re
+import sys
+import zipfile
+import codecs
+
+from gzip import GzipFile, READ as GZ_READ, WRITE as GZ_WRITE
+
+try: # Python 3.
+    textwrap_indent = functools.partial(textwrap.indent, prefix='  ')
+except AttributeError: # Python 2; indent() not available for Python2.
+    textwrap_fill = functools.partial(textwrap.fill,
+                                        initial_indent='  ',
+                                        subsequent_indent='  ',
+                                        replace_whitespace=False)
+    def textwrap_indent(text):
+        return '\n'.join(textwrap_fill(line) for line in text.splitlines())
+
+try:
+    from zlib import Z_SYNC_FLUSH as FLUSH
+except ImportError:
+    from zlib import Z_FINISH as FLUSH
+
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
+from six import string_types, text_type
+from six.moves.urllib.request import urlopen, url2pathname
+
+# this import should be more specific:
+import nltk
+from nltk.compat import py3_data, add_py3_data, BytesIO
+
+######################################################################
+# Search Path
+######################################################################
+
+path = []
+"""A list of directories where the NLTK data package might reside.
+   These directories will be checked in order when looking for a
+   resource in the data package.  Note that this allows users to
+   substitute in their own versions of resources, if they have them
+   (e.g., in their home directory under ~/nltk_data)."""
+
+# User-specified locations:
+_paths_from_env = os.environ.get('NLTK_DATA', str('')).split(os.pathsep)
+path += [d for d in _paths_from_env if d]
+if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
+    path.append(os.path.expanduser(str('~/nltk_data')))
+
+if sys.platform.startswith('win'):
+    # Common locations on Windows:
+    path += [
+        str(r'C:\nltk_data'), str(r'D:\nltk_data'), str(r'E:\nltk_data'),
+        os.path.join(sys.prefix, str('nltk_data')),
+        os.path.join(sys.prefix, str('lib'), str('nltk_data')),
+        os.path.join(
+            os.environ.get(str('APPDATA'), str('C:\\')), str('nltk_data'))
+    ]
+else:
+    # Common locations on UNIX & OS X:
+    path += [
+        str('/usr/share/nltk_data'),
+        str('/usr/local/share/nltk_data'),
+        str('/usr/lib/nltk_data'),
+        str('/usr/local/lib/nltk_data'),
+        os.path.join(sys.prefix, str('nltk_data')),
+        os.path.join(sys.prefix, str('lib'), str('nltk_data'))
+    ]
+
+
+######################################################################
+# Util Functions
+######################################################################
+
+def gzip_open_unicode(filename, mode="rb", compresslevel=9, encoding='utf-8',
+                      fileobj=None, errors=None, newline=None):
+    if fileobj is None:
+        fileobj = GzipFile(filename, mode, compresslevel, fileobj)
+    return io.TextIOWrapper(fileobj, encoding, errors, newline)
+
+
+def split_resource_url(resource_url):
+    """
+    Splits a resource url into "<protocol>:<path>".
+
+    >>> windows = sys.platform.startswith('win')
+    >>> split_resource_url('nltk:home/nltk')
+    ('nltk', 'home/nltk')
+    >>> split_resource_url('nltk:/home/nltk')
+    ('nltk', '/home/nltk')
+    >>> split_resource_url('file:/home/nltk')
+    ('file', '/home/nltk')
+    >>> split_resource_url('file:///home/nltk')
+    ('file', '/home/nltk')
+    >>> split_resource_url('file:///C:/home/nltk')
+    ('file', '/C:/home/nltk')
+    """
+    protocol, path_ = resource_url.split(':', 1)
+    if protocol == 'nltk':
+        pass
+    elif protocol == 'file':
+        if path_.startswith('/'):
+            path_ = '/' + path_.lstrip('/')
+    else:
+        path_ = re.sub(r'^/{0,2}', '', path_)
+    return protocol, path_
+
+
+def normalize_resource_url(resource_url):
+    r"""
+    Normalizes a resource url
+
+    >>> windows = sys.platform.startswith('win')
+    >>> os.path.normpath(split_resource_url(normalize_resource_url('file:grammar.fcfg'))[1]) == \
+    ... ('\\' if windows else '') + os.path.abspath(os.path.join(os.curdir, 'grammar.fcfg'))
+    True
+    >>> not windows or normalize_resource_url('file:C:/dir/file') == 'file:///C:/dir/file'
+    True
+    >>> not windows or normalize_resource_url('file:C:\\dir\\file') == 'file:///C:/dir/file'
+    True
+    >>> not windows or normalize_resource_url('file:C:\\dir/file') == 'file:///C:/dir/file'
+    True
+    >>> not windows or normalize_resource_url('file://C:/dir/file') == 'file:///C:/dir/file'
+    True
+    >>> not windows or normalize_resource_url('file:////C:/dir/file') == 'file:///C:/dir/file'
+    True
+    >>> not windows or normalize_resource_url('nltk:C:/dir/file') == 'file:///C:/dir/file'
+    True
+    >>> not windows or normalize_resource_url('nltk:C:\\dir\\file') == 'file:///C:/dir/file'
+    True
+    >>> windows or normalize_resource_url('file:/dir/file/toy.cfg') == 'file:///dir/file/toy.cfg'
+    True
+    >>> normalize_resource_url('nltk:home/nltk')
+    'nltk:home/nltk'
+    >>> windows or normalize_resource_url('nltk:/home/nltk') == 'file:///home/nltk'
+    True
+    >>> normalize_resource_url('http://example.com/dir/file')
+    'http://example.com/dir/file'
+    >>> normalize_resource_url('dir/file')
+    'nltk:dir/file'
+    """
+    try:
+        protocol, name = split_resource_url(resource_url)
+    except ValueError:
+        # the resource url has no protocol, use the nltk protocol by default
+        protocol = 'nltk'
+        name = resource_url
+    # use file protocol if the path is an absolute path
+    if protocol == 'nltk' and os.path.isabs(name):
+        protocol = 'file://'
+        name = normalize_resource_name(name, False, None)
+    elif protocol == 'file':
+        protocol = 'file://'
+        # name is absolute
+        name = normalize_resource_name(name, False, None)
+    elif protocol == 'nltk':
+        protocol = 'nltk:'
+        name = normalize_resource_name(name, True)
+    else:
+        # handled by urllib
+        protocol += '://'
+    return ''.join([protocol, name])
+
+
+def normalize_resource_name(resource_name, allow_relative=True, relative_path=None):
+    """
+    :type resource_name: str or unicode
+    :param resource_name: The name of the resource to search for.
+        Resource names are posix-style relative path names, such as
+        ``corpora/brown``.  Directory names will automatically
+        be converted to a platform-appropriate path separator.
+        Directory trailing slashes are preserved.
+
+    >>> windows = sys.platform.startswith('win')
+    >>> normalize_resource_name('.', True)
+    './'
+    >>> normalize_resource_name('./', True)
+    './'
+    >>> windows or normalize_resource_name('dir/file', False, '/') == '/dir/file'
+    True
+    >>> not windows or normalize_resource_name('C:/file', False, '/') == '/C:/file'
+    True
+    >>> windows or normalize_resource_name('/dir/file', False, '/') == '/dir/file'
+    True
+    >>> windows or normalize_resource_name('../dir/file', False, '/') == '/dir/file'
+    True
+    >>> not windows or normalize_resource_name('/dir/file', True, '/') == 'dir/file'
+    True
+    >>> windows or normalize_resource_name('/dir/file', True, '/') == '/dir/file'
+    True
+    """
+    is_dir = bool(re.search(r'[\\/.]$', resource_name)) or resource_name.endswith(os.path.sep)
+    if sys.platform.startswith('win'):
+        resource_name = resource_name.lstrip('/')
+    else:
+        resource_name = re.sub(r'^/+', '/', resource_name)
+    if allow_relative:
+        resource_name = os.path.normpath(resource_name)
+    else:
+        if relative_path is None:
+            relative_path = os.curdir
+        resource_name = os.path.abspath(
+            os.path.join(relative_path, resource_name))
+    resource_name = resource_name.replace('\\', '/').replace(os.path.sep, '/')
+    if sys.platform.startswith('win') and os.path.isabs(resource_name):
+        resource_name = '/' + resource_name
+    if is_dir and not resource_name.endswith('/'):
+        resource_name += '/'
+    return resource_name
+
+
+######################################################################
+# Path Pointers
+######################################################################
+
+@add_metaclass(ABCMeta)
+class PathPointer(object):
+    """
+    An abstract base class for 'path pointers,' used by NLTK's data
+    package to identify specific paths.  Two subclasses exist:
+    ``FileSystemPathPointer`` identifies a file that can be accessed
+    directly via a given absolute path.  ``ZipFilePathPointer``
+    identifies a file contained within a zipfile, that can be accessed
+    by reading that zipfile.
+    """
+
+    @abstractmethod
+    def open(self, encoding=None):
+        """
+        Return a seekable read-only stream that can be used to read
+        the contents of the file identified by this path pointer.
+
+        :raise IOError: If the path specified by this pointer does
+            not contain a readable file.
+        """
+
+    @abstractmethod
+    def file_size(self):
+        """
+        Return the size of the file pointed to by this path pointer,
+        in bytes.
+
+        :raise IOError: If the path specified by this pointer does
+            not contain a readable file.
+        """
+
+    @abstractmethod
+    def join(self, fileid):
+        """
+        Return a new path pointer formed by starting at the path
+        identified by this pointer, and then following the relative
+        path given by ``fileid``.  The path components of ``fileid``
+        should be separated by forward slashes, regardless of
+        the underlying file system's path separator character.
+        """
+
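+# A brief usage sketch (illustrative; the path below is hypothetical and
+# must exist locally): concrete pointers are constructed directly and
+# combined with join() before being opened:
+#
+#     ptr = FileSystemPathPointer('/usr/share/nltk_data')
+#     readme = ptr.join('corpora/brown/README')
+#     text = readme.open(encoding='utf-8').read()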
+
+class FileSystemPathPointer(PathPointer, text_type):
+    """
+    A path pointer that identifies a file which can be accessed
+    directly via a given absolute path.
+    """
+    @py3_data
+    def __init__(self, _path):
+        """
+        Create a new path pointer for the given absolute path.
+
+        :raise IOError: If the given path does not exist.
+        """
+
+        _path = os.path.abspath(_path)
+        if not os.path.exists(_path):
+            raise IOError('No such file or directory: %r' % _path)
+        self._path = _path
+
+        # There's no need to call str.__init__(), since it's a no-op;
+        # str does all of its setup work in __new__.
+
+    @property
+    def path(self):
+        """The absolute path identified by this path pointer."""
+        return self._path
+
+    def open(self, encoding=None):
+        stream = open(self._path, 'rb')
+        if encoding is not None:
+            stream = SeekableUnicodeStreamReader(stream, encoding)
+        return stream
+
+    def file_size(self):
+        return os.stat(self._path).st_size
+
+    def join(self, fileid):
+        _path = os.path.join(self._path, fileid)
+        return FileSystemPathPointer(_path)
+
+    def __repr__(self):
+        # This should be a byte string under Python 2.x;
+        # we don't want transliteration here so
+        # @python_2_unicode_compatible is not used.
+        return str('FileSystemPathPointer(%r)' % self._path)
+
+    def __str__(self):
+        return self._path
+
+
+class BufferedGzipFile(GzipFile):
+    """
+    A ``GzipFile`` subclass that buffers calls to ``read()`` and ``write()``.
+    This allows faster reads and writes of data to and from gzip-compressed
+    files at the cost of using more memory.
+
+    The default buffer size is 2MB.
+
+    ``BufferedGzipFile`` is useful for loading large gzipped pickle objects
+    as well as writing large encoded feature files for classifier training.
+    """
+    MB = 2 ** 20
+    SIZE = 2 * MB
+
+    @py3_data
+    def __init__(self, filename=None, mode=None, compresslevel=9,
+                 fileobj=None, **kwargs):
+        """
+        Return a buffered gzip file object.
+
+        :param filename: a filesystem path
+        :type filename: str
+        :param mode: a file mode which can be any of 'r', 'rb', 'a', 'ab',
+            'w', or 'wb'
+        :type mode: str
+        :param compresslevel: The compresslevel argument is an integer from 1
+            to 9 controlling the level of compression; 1 is fastest and
+            produces the least compression, and 9 is slowest and produces the
+            most compression. The default is 9.
+        :type compresslevel: int
+        :param fileobj: a BytesIO stream to read from instead of a file.
+        :type fileobj: BytesIO
+        :param size: number of bytes to buffer during calls to read() and write()
+        :type size: int
+        :rtype: BufferedGzipFile
+        """
+        GzipFile.__init__(self, filename, mode, compresslevel, fileobj)
+        self._size = kwargs.get('size', self.SIZE)
+        self._nltk_buffer = BytesIO()
+        # cStringIO does not support len.
+        self._len = 0
+
+    def _reset_buffer(self):
+        # For some reason calling BytesIO.truncate() here will lead to
+        # inconsistent writes so just set _buffer to a new BytesIO object.
+        self._nltk_buffer = BytesIO()
+        self._len = 0
+
+    def _write_buffer(self, data):
+        # Simply write to the buffer and increment the buffer size.
+        if data is not None:
+            self._nltk_buffer.write(data)
+            self._len += len(data)
+
+    def _write_gzip(self, data):
+        # Write the current buffer to the GzipFile.
+        GzipFile.write(self, self._nltk_buffer.getvalue())
+        # Then reset the buffer and write the new data to the buffer.
+        self._reset_buffer()
+        self._write_buffer(data)
+
+    def close(self):
+        # GzipFile.close() doesn't actually close anything.
+        if self.mode == GZ_WRITE:
+            self._write_gzip(None)
+            self._reset_buffer()
+        return GzipFile.close(self)
+
+    def flush(self, lib_mode=FLUSH):
+        self._nltk_buffer.flush()
+        GzipFile.flush(self, lib_mode)
+
+    def read(self, size=None):
+        if not size:
+            size = self._size
+            contents = BytesIO()
+            while True:
+                blocks = GzipFile.read(self, size)
+                if not blocks:
+                    contents.flush()
+                    break
+                contents.write(blocks)
+            return contents.getvalue()
+        else:
+            return GzipFile.read(self, size)
+
+    def write(self, data, size=-1):
+        """
+        :param data: bytes to write to file or buffer
+        :type data: bytes
+        :param size: buffer at least size bytes before writing to file
+        :type size: int
+        """
+        if not size:
+            size = self._size
+        if self._len + len(data) <= size:
+            self._write_buffer(data)
+        else:
+            self._write_gzip(data)
+
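+# A brief sketch of the intended use (illustrative; the file name is
+# hypothetical): reading a large gzip-compressed pickle through the
+# buffered wrapper:
+#
+#     import pickle
+#     stream = BufferedGzipFile('model.pickle.gz', 'rb')
+#     obj = pickle.load(stream)
+#     stream.close()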
+
+class GzipFileSystemPathPointer(FileSystemPathPointer):
+    """
+    A subclass of ``FileSystemPathPointer`` that identifies a gzip-compressed
+    file located at a given absolute path.  ``GzipFileSystemPathPointer`` is
+    appropriate for loading large gzip-compressed pickle objects efficiently.
+    """
+
+    def open(self, encoding=None):
+        # Note: In >= Python3.5, GzipFile is already using a
+        # buffered reader in the backend which has a variable self._buffer
+        # See https://github.com/nltk/nltk/issues/1308
+        if sys.version.startswith('2.7') or sys.version.startswith('3.4'):
+            stream = BufferedGzipFile(self._path, 'rb')
+        else:
+            stream = GzipFile(self._path, 'rb')
+        if encoding:
+            stream = SeekableUnicodeStreamReader(stream, encoding)
+        return stream
+
+
+class ZipFilePathPointer(PathPointer):
+    """
+    A path pointer that identifies a file contained within a zipfile,
+    which can be accessed by reading that zipfile.
+    """
+    @py3_data
+    def __init__(self, zipfile, entry=''):
+        """
+        Create a new path pointer pointing at the specified entry
+        in the given zipfile.
+
+        :raise IOError: If the given zipfile does not exist, or if it
+        does not contain the specified entry.
+        """
+        if isinstance(zipfile, string_types):
+            zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile))
+
+        # Normalize the entry string, it should be relative:
+        entry = normalize_resource_name(entry, True, '/').lstrip('/')
+
+        # Check that the entry exists:
+        if entry:
+            try:
+                zipfile.getinfo(entry)
+            except Exception:
+                # Sometimes directories aren't explicitly listed in
+                # the zip file.  So if `entry` is a directory name,
+                # then check if the zipfile contains any files that
+                # are under the given directory.
+                if (entry.endswith('/') and
+                        [n for n in zipfile.namelist() if n.startswith(entry)]):
+                    pass  # zipfile contains a file in that directory.
+                else:
+                    # Otherwise, complain.
+                    raise IOError('Zipfile %r does not contain %r' %
+                                  (zipfile.filename, entry))
+        self._zipfile = zipfile
+        self._entry = entry
+
+    @property
+    def zipfile(self):
+        """
+        The zipfile.ZipFile object used to access the zip file
+        containing the entry identified by this path pointer.
+        """
+        return self._zipfile
+
+    @property
+    def entry(self):
+        """
+        The name of the file within zipfile that this path
+        pointer points to.
+        """
+        return self._entry
+
+    def open(self, encoding=None):
+        data = self._zipfile.read(self._entry)
+        stream = BytesIO(data)
+        if self._entry.endswith('.gz'):
+            # Note: In >= Python3.5, GzipFile is already using a
+            # buffered reader in the backend which has a variable self._buffer
+            # See https://github.com/nltk/nltk/issues/1308
+            if sys.version.startswith('2.7') or sys.version.startswith('3.4'):
+                stream = BufferedGzipFile(self._entry, fileobj=stream)
+            else:
+                stream = GzipFile(self._entry, fileobj=stream)
+        elif encoding is not None:
+            stream = SeekableUnicodeStreamReader(stream, encoding)
+        return stream
+
+    def file_size(self):
+        return self._zipfile.getinfo(self._entry).file_size
+
+    def join(self, fileid):
+        entry = '%s/%s' % (self._entry, fileid)
+        return ZipFilePathPointer(self._zipfile, entry)
+
+    def __repr__(self):
+        return str('ZipFilePathPointer(%r, %r)') % (
+            self._zipfile.filename, self._entry)
+
+    def __str__(self):
+        return os.path.normpath(os.path.join(self._zipfile.filename,
+                                             self._entry))
+
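+# A brief sketch (illustrative; the zip path is hypothetical): pointing at
+# a single entry inside an installed corpus zipfile:
+#
+#     ptr = ZipFilePathPointer('/usr/share/nltk_data/corpora/brown.zip',
+#                              'brown/cats.txt')
+#     data = ptr.open(encoding='ascii').read()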
+
+######################################################################
+# Access Functions
+######################################################################
+
+# Don't use a weak dictionary, because in the common case this
+# causes a lot more reloading than necessary.
+_resource_cache = {}
+"""A dictionary used to cache resources so that they won't
+   need to be loaded more than once."""
+
+
+def find(resource_name, paths=None):
+    """
+    Find the given resource by searching through the directories and
+    zip files in paths, where a None or empty string specifies an absolute path.
+    Returns a corresponding path name.  If the given resource is not
+    found, raise a ``LookupError``, whose message gives a pointer to
+    the installation instructions for the NLTK downloader.
+
+    Zip File Handling:
+
+      - If ``resource_name`` contains a component with a ``.zip``
+        extension, then it is assumed to be a zipfile; and the
+        remaining path components are used to look inside the zipfile.
+
+      - If any element of ``nltk.data.path`` has a ``.zip`` extension,
+        then it is assumed to be a zipfile.
+
+      - If a given resource name that does not contain any zipfile
+        component is not found initially, then ``find()`` will make a
+        second attempt to find that resource, by replacing each
+        component *p* in the path with *p.zip/p*.  For example, this
+        allows ``find()`` to map the resource name
+        ``corpora/chat80/cities.pl`` to a zip file path pointer to
+        ``corpora/chat80.zip/chat80/cities.pl``.
+
+      - When using ``find()`` to locate a directory contained in a
+        zipfile, the resource name must end with the forward slash
+        character.  Otherwise, ``find()`` will not locate the
+        directory.
+
+    :type resource_name: str or unicode
+    :param resource_name: The name of the resource to search for.
+        Resource names are posix-style relative path names, such as
+        ``corpora/brown``.  Directory names will be
+        automatically converted to a platform-appropriate path separator.
+    :rtype: str
+    """
+    resource_name = normalize_resource_name(resource_name, True)
+
+    # Resolve default paths at runtime in-case the user overrides
+    # nltk.data.path
+    if paths is None:
+        paths = path
+
+    # Check if the resource name includes a zipfile name
+    m = re.match(r'(.*\.zip)/?(.*)$|', resource_name)
+    zipfile, zipentry = m.groups()
+
+    # Check each item in our path
+    for path_ in paths:
+        # Is the path item a zipfile?
+        if path_ and (os.path.isfile(path_) and path_.endswith('.zip')):
+            try:
+                return ZipFilePathPointer(path_, resource_name)
+            except IOError:
+                # resource not in zipfile
+                continue
+
+        # Is the path item a directory or is resource_name an absolute path?
+        elif not path_ or os.path.isdir(path_):
+            if zipfile is None:
+                p = os.path.join(path_, url2pathname(resource_name))
+                if os.path.exists(p):
+                    if p.endswith('.gz'):
+                        return GzipFileSystemPathPointer(p)
+                    else:
+                        return FileSystemPathPointer(p)
+            else:
+                p = os.path.join(path_, url2pathname(zipfile))
+                if os.path.exists(p):
+                    try:
+                        return ZipFilePathPointer(p, zipentry)
+                    except IOError:
+                        # resource not in zipfile
+                        continue
+
+    # Fallback: if the path doesn't include a zip file, then try
+    # again, assuming that one of the path components is inside a
+    # zipfile of the same name.
+    if zipfile is None:
+        pieces = resource_name.split('/')
+        for i in range(len(pieces)):
+            modified_name = '/'.join(pieces[:i] +
+                                     [pieces[i] + '.zip'] + pieces[i:])
+            try:
+                return find(modified_name, paths)
+            except LookupError:
+                pass
+
+    # Identify the package (i.e. the .zip file) to download.
+    resource_zipname = resource_name.split('/')[1]
+    if resource_zipname.endswith('.zip'):
+        resource_zipname = resource_zipname.rpartition('.')[0]
+    # Display a friendly error message if the resource wasn't found:
+    msg = str("Resource \33[93m{resource}\033[0m not found.\n"
+              "Please use the NLTK Downloader to obtain the resource:\n\n"
+              "\33[31m" # To display red text in terminal.
+              ">>> import nltk\n"
+              ">>> nltk.download(\'{resource}\')\n"
+              "\033[0m").format(resource=resource_zipname)
+    msg = textwrap_indent(msg)
+
+    msg += '\n  Searched in:' + ''.join('\n    - %r' % d for d in paths)
+    sep = '*' * 70
+    resource_not_found = '\n%s\n%s\n%s\n' % (sep, msg, sep)
+    raise LookupError(resource_not_found)
+
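+# A brief sketch (illustrative; the resource is assumed to be installed,
+# e.g. via the NLTK downloader): find() resolves either the unpacked
+# directory or the corresponding .zip file and returns a path pointer:
+#
+#     cats = find('corpora/brown/cats.txt')
+#     first_line = cats.open().readline()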
+
+def retrieve(resource_url, filename=None, verbose=True):
+    """
+    Copy the given resource to a local file.  If no filename is
+    specified, then use the URL's filename.  If there is already a
+    file named ``filename``, then raise a ``ValueError``.
+
+    :type resource_url: str
+    :param resource_url: A URL specifying where the resource should be
+        loaded from.  The default protocol is "nltk:", which searches
+        for the file in the NLTK data package.
+    """
+    resource_url = normalize_resource_url(resource_url)
+    if filename is None:
+        if resource_url.startswith('file:'):
+            filename = os.path.split(resource_url)[-1]
+        else:
+            filename = re.sub(r'(^\w+:)?.*/', '', resource_url)
+    if os.path.exists(filename):
+        filename = os.path.abspath(filename)
+        raise ValueError("File %r already exists!" % filename)
+
+    if verbose:
+        print('Retrieving %r, saving to %r' % (resource_url, filename))
+
+    # Open the input & output streams.
+    infile = _open(resource_url)
+
+    # Copy infile -> outfile, using 64k blocks.
+    with open(filename, "wb") as outfile:
+        while True:
+            s = infile.read(1024 * 64)  # 64k blocks.
+            outfile.write(s)
+            if not s:
+                break
+
+    infile.close()
+
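+# A brief sketch (illustrative; the resource is assumed to be installed):
+# copy a packaged grammar into the current working directory:
+#
+#     retrieve('nltk:grammars/sample_grammars/toy.cfg', 'toy.cfg')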
+
+#: A dictionary describing the formats that are supported by NLTK's
+#: load() method.  Keys are format names, and values are format
+#: descriptions.
+FORMATS = {
+    'pickle': "A serialized python object, stored using the pickle module.",
+    'json': "A serialized python object, stored using the json module.",
+    'yaml': "A serialized python object, stored using the yaml module.",
+    'cfg': "A context free grammar.",
+    'pcfg': "A probabilistic CFG.",
+    'fcfg': "A feature CFG.",
+    'fol': "A list of first order logic expressions, parsed with "
+            "nltk.sem.logic.Expression.fromstring.",
+    'logic': "A list of first order logic expressions, parsed with "
+            "nltk.sem.logic.LogicParser.  Requires an additional logic_parser "
+            "parameter",
+    'val': "A semantic valuation, parsed by nltk.sem.Valuation.fromstring.",
+    'raw': "The raw (byte string) contents of a file.",
+    'text': "The raw (unicode string) contents of a file. "
+}
+
+#: A dictionary mapping from file extensions to format names, used
+#: by load() when format="auto" to decide the format for a
+#: given resource url.
+AUTO_FORMATS = {
+    'pickle': 'pickle',
+    'json': 'json',
+    'yaml': 'yaml',
+    'cfg': 'cfg',
+    'pcfg': 'pcfg',
+    'fcfg': 'fcfg',
+    'fol': 'fol',
+    'logic': 'logic',
+    'val': 'val',
+    'txt': 'text',
+    'text': 'text',
+}
+
+
+def load(resource_url, format='auto', cache=True, verbose=False,
+         logic_parser=None, fstruct_reader=None, encoding=None):
+    """
+    Load a given resource from the NLTK data package.  The following
+    resource formats are currently supported:
+
+      - ``pickle``
+      - ``json``
+      - ``yaml``
+      - ``cfg`` (context free grammars)
+      - ``pcfg`` (probabilistic CFGs)
+      - ``fcfg`` (feature-based CFGs)
+      - ``fol`` (formulas of First Order Logic)
+      - ``logic`` (Logical formulas to be parsed by the given logic_parser)
+      - ``val`` (valuation of First Order Logic model)
+      - ``text`` (the file contents as a unicode string)
+      - ``raw`` (the raw file contents as a byte string)
+
+    If no format is specified, ``load()`` will attempt to determine a
+    format based on the resource name's file extension.  If that
+    fails, ``load()`` will raise a ``ValueError`` exception.
+
+    For all text formats (everything except ``pickle``, ``json``, ``yaml`` and ``raw``),
+    it tries to decode the raw contents using UTF-8, and if that doesn't
+    work, it tries with ISO-8859-1 (Latin-1), unless the ``encoding``
+    is specified.
+
+    :type resource_url: str
+    :param resource_url: A URL specifying where the resource should be
+        loaded from.  The default protocol is "nltk:", which searches
+        for the file in the NLTK data package.
+    :type cache: bool
+    :param cache: If true, add this resource to a cache.  If load()
+        finds a resource in its cache, then it will return it from the
+        cache rather than loading it.  The cache uses weak references,
+        so a resource will automatically be expunged from the cache
+        when no more objects are using it.
+    :type verbose: bool
+    :param verbose: If true, print a message when loading a resource.
+        Messages are not displayed when a resource is retrieved from
+        the cache.
+    :type logic_parser: LogicParser
+    :param logic_parser: The parser that will be used to parse logical
+        expressions.
+    :type fstruct_reader: FeatStructReader
+    :param fstruct_reader: The parser that will be used to parse the
+        feature structure of an fcfg.
+    :type encoding: str
+    :param encoding: the encoding of the input; only used for text formats.
+    """
+    resource_url = normalize_resource_url(resource_url)
+    resource_url = add_py3_data(resource_url)
+
+    # Determine the format of the resource.
+    if format == 'auto':
+        resource_url_parts = resource_url.split('.')
+        ext = resource_url_parts[-1]
+        if ext == 'gz':
+            ext = resource_url_parts[-2]
+        format = AUTO_FORMATS.get(ext)
+        if format is None:
+            raise ValueError('Could not determine format for %s based '
+                             'on its file\nextension; use the "format" '
+                             'argument to specify the format explicitly.'
+                             % resource_url)
+
+    if format not in FORMATS:
+        raise ValueError('Unknown format type: %s!' % (format,))
+
+    # If we've cached the resource, then just return it.
+    if cache:
+        resource_val = _resource_cache.get((resource_url, format))
+        if resource_val is not None:
+            if verbose:
+                print('<<Using cached copy of %s>>' % (resource_url,))
+            return resource_val
+
+    # Let the user know what's going on.
+    if verbose:
+        print('<<Loading %s>>' % (resource_url,))
+
+    # Load the resource.
+    opened_resource = _open(resource_url)
+
+    if format == 'raw':
+        resource_val = opened_resource.read()
+    elif format == 'pickle':
+        resource_val = pickle.load(opened_resource)
+    elif format == 'json':
+        import json
+        from nltk.jsontags import json_tags
+        resource_val = json.load(opened_resource)
+        tag = None
+        if len(resource_val) != 1:
+            # dict views are not iterators, so wrap the keys in iter()
+            tag = next(iter(resource_val.keys()))
+        if tag not in json_tags:
+            raise ValueError('Unknown json tag.')
+    elif format == 'yaml':
+        import yaml
+        resource_val = yaml.load(opened_resource)
+    else:
+        # The resource is a text format.
+        binary_data = opened_resource.read()
+        if encoding is not None:
+            string_data = binary_data.decode(encoding)
+        else:
+            try:
+                string_data = binary_data.decode('utf-8')
+            except UnicodeDecodeError:
+                string_data = binary_data.decode('latin-1')
+        if format == 'text':
+            resource_val = string_data
+        elif format == 'cfg':
+            resource_val = nltk.grammar.CFG.fromstring(
+                string_data, encoding=encoding)
+        elif format == 'pcfg':
+            resource_val = nltk.grammar.PCFG.fromstring(
+                string_data, encoding=encoding)
+        elif format == 'fcfg':
+            resource_val = nltk.grammar.FeatureGrammar.fromstring(
+                string_data, logic_parser=logic_parser,
+                fstruct_reader=fstruct_reader, encoding=encoding)
+        elif format == 'fol':
+            resource_val = nltk.sem.read_logic(
+                string_data, logic_parser=nltk.sem.logic.LogicParser(),
+                encoding=encoding)
+        elif format == 'logic':
+            resource_val = nltk.sem.read_logic(
+                string_data, logic_parser=logic_parser, encoding=encoding)
+        elif format == 'val':
+            resource_val = nltk.sem.read_valuation(
+                string_data, encoding=encoding)
+        else:
+            raise AssertionError("Internal NLTK error: Format %s isn't "
+                                 "handled by nltk.data.load()" % (format,))
+
+    opened_resource.close()
+
+    # If requested, add it to the cache.
+    if cache:
+        try:
+            _resource_cache[(resource_url, format)] = resource_val
+            # TODO: add this line
+            # print('<<Caching a copy of %s>>' % (resource_url,))
+        except TypeError:
+            # We can't create weak references to some object types, like
+            # strings and tuples.  For now, just don't cache them.
+            pass
+
+    return resource_val
+
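+# Brief sketches (illustrative; the resource identifiers are assumed to be
+# installed via the NLTK downloader; the format is inferred from the file
+# extension):
+#
+#     grammar = load('grammars/sample_grammars/toy.cfg')       # 'cfg'
+#     tagger = load('taggers/averaged_perceptron_tagger/'
+#                   'averaged_perceptron_tagger.pickle')       # 'pickle'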
+
+def show_cfg(resource_url, escape='##'):
+    """
+    Write out a grammar file, ignoring escaped and empty lines.
+
+    :type resource_url: str
+    :param resource_url: A URL specifying where the resource should be
+        loaded from.  The default protocol is "nltk:", which searches
+        for the file in the NLTK data package.
+    :type escape: str
+    :param escape: Prepended string that signals lines to be ignored
+    """
+    resource_url = normalize_resource_url(resource_url)
+    resource_val = load(resource_url, format='text', cache=False)
+    lines = resource_val.splitlines()
+    for l in lines:
+        if l.startswith(escape):
+            continue
+        if re.match('^$', l):
+            continue
+        print(l)
+
+
+def clear_cache():
+    """
+    Remove all objects from the resource cache.
+    :see: load()
+    """
+    _resource_cache.clear()
+
+
+def _open(resource_url):
+    """
+    Helper function that returns an open file object for a resource,
+    given its resource URL.  If the given resource URL uses the "nltk:"
+    protocol, or uses no protocol, then use ``nltk.data.find`` to find
+    its path, and open it with the given mode; if the resource URL
+    uses the 'file' protocol, then open the file with the given mode;
+    otherwise, delegate to ``urllib2.urlopen``.
+
+    :type resource_url: str
+    :param resource_url: A URL specifying where the resource should be
+        loaded from.  The default protocol is "nltk:", which searches
+        for the file in the NLTK data package.
+    """
+    resource_url = normalize_resource_url(resource_url)
+    protocol, path_ = split_resource_url(resource_url)
+
+    if protocol is None or protocol.lower() == 'nltk':
+        return find(path_, path + ['']).open()
+    elif protocol.lower() == 'file':
+        # urllib might not use mode='rb', so handle this one ourselves:
+        return find(path_, ['']).open()
+    else:
+        return urlopen(resource_url)
+
+######################################################################
+# Lazy Resource Loader
+######################################################################
+
+# We shouldn't apply @python_2_unicode_compatible
+# decorator to LazyLoader, this is resource.__class__ responsibility.
+
+
+class LazyLoader(object):
+
+    @py3_data
+    def __init__(self, _path):
+        self._path = _path
+
+    def __load(self):
+        resource = load(self._path)
+        # This is where the magic happens!  Transform ourselves into
+        # the object by modifying our own __dict__ and __class__ to
+        # match that of `resource`.
+        self.__dict__ = resource.__dict__
+        self.__class__ = resource.__class__
+
+    def __getattr__(self, attr):
+        self.__load()
+        # This looks circular, but it's not, since __load() changes our
+        # __class__ to something new:
+        return getattr(self, attr)
+
+    def __repr__(self):
+        self.__load()
+        # This looks circular, but it's not, since __load() changes our
+        # __class__ to something new:
+        return repr(self)
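+
+# A brief sketch (illustrative; the resource identifier is hypothetical):
+# the underlying load() call is deferred until the object is first used:
+#
+#     tagger = LazyLoader('taggers/my_tagger.pickle')
+#     # nothing is read from disk until an attribute is first accessed,
+#     # e.g. tagger.tag(['hello'])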
+
+######################################################################
+# Open-On-Demand ZipFile
+######################################################################
+
+
+class OpenOnDemandZipFile(zipfile.ZipFile):
+    """
+    A subclass of ``zipfile.ZipFile`` that closes its file pointer
+    whenever it is not using it; and re-opens it when it needs to read
+    data from the zipfile.  This is useful for reducing the number of
+    open file handles when many zip files are being accessed at once.
+    ``OpenOnDemandZipFile`` must be constructed from a filename, not a
+    file-like object (to allow re-opening).  ``OpenOnDemandZipFile`` is
+    read-only (i.e. ``write()`` and ``writestr()`` are disabled).
+    """
+    @py3_data
+    def __init__(self, filename):
+        if not isinstance(filename, string_types):
+            raise TypeError('ReopenableZipFile filename must be a string')
+        zipfile.ZipFile.__init__(self, filename)
+        assert self.filename == filename
+        self.close()
+        # After closing a ZipFile object, the _fileRefCnt needs to be cleared
+        # for Python 2 and 3 compatible code.
+        self._fileRefCnt = 0
+
+    def read(self, name):
+        assert self.fp is None
+        self.fp = open(self.filename, 'rb')
+        value = zipfile.ZipFile.read(self, name)
+        # Keep _fileRefCnt consistent for Python 2 and 3 compatible code.
+        # Since we only opened one file here, we add 1.
+        self._fileRefCnt += 1
+        self.close()
+        return value
+
+    def write(self, *args, **kwargs):
+        """:raise NotImplementedError: OpenOnDemandZipfile is read-only"""
+        raise NotImplementedError('OpenOnDemandZipfile is read-only')
+
+    def writestr(self, *args, **kwargs):
+        """:raise NotImplementedError: OpenOnDemandZipfile is read-only"""
+        raise NotImplementedError('OpenOnDemandZipfile is read-only')
+
+    def __repr__(self):
+        return repr(str('OpenOnDemandZipFile(%r)') % self.filename)
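+
+# A brief sketch (illustrative; the path is hypothetical): the underlying
+# file handle is re-opened for each read() and closed again afterwards:
+#
+#     zf = OpenOnDemandZipFile('/usr/share/nltk_data/corpora/brown.zip')
+#     raw = zf.read('brown/cats.txt')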
+
+######################################################################
+#{ Seekable Unicode Stream Reader
+######################################################################
+
+
+class SeekableUnicodeStreamReader(object):
+    """
+    A stream reader that automatically encodes the source byte stream
+    into unicode (like ``codecs.StreamReader``); but still supports the
+    ``seek()`` and ``tell()`` operations correctly.  This is in contrast
+    to ``codecs.StreamReader``, which provides *broken* ``seek()`` and
+    ``tell()`` methods.
+
+    This class was motivated by ``StreamBackedCorpusView``, which
+    makes extensive use of ``seek()`` and ``tell()``, and needs to be
+    able to handle unicode-encoded files.
+
+    Note: this class requires stateless decoders.  To my knowledge,
+    this shouldn't cause a problem with any of python's builtin
+    unicode encodings.
+    """
+    DEBUG = True  #: If true, then perform extra sanity checks.
+
+    @py3_data
+    def __init__(self, stream, encoding, errors='strict'):
+        # Rewind the stream to its beginning.
+        stream.seek(0)
+
+        self.stream = stream
+        """The underlying stream."""
+
+        self.encoding = encoding
+        """The name of the encoding that should be used to encode the
+           underlying stream."""
+
+        self.errors = errors
+        """The error mode that should be used when decoding data from
+           the underlying stream.  Can be 'strict', 'ignore', or
+           'replace'."""
+
+        self.decode = codecs.getdecoder(encoding)
+        """The function that is used to decode byte strings into
+           unicode strings."""
+
+        self.bytebuffer = b''
+        """A buffer to use bytes that have been read but have not yet
+           been decoded.  This is only used when the final bytes from
+           a read do not form a complete encoding for a character."""
+
+        self.linebuffer = None
+        """A buffer used by ``readline()`` to hold characters that have
+           been read, but have not yet been returned by ``read()`` or
+           ``readline()``.  This buffer consists of a list of unicode
+           strings, where each string corresponds to a single line.
+           The final element of the list may or may not be a complete
+           line.  Note that the existence of a linebuffer makes the
+           ``tell()`` operation more complex, because it must backtrack
+           to the beginning of the buffer to determine the correct
+           file position in the underlying byte stream."""
+
+        self._rewind_checkpoint = 0
+        """The file position at which the most recent read on the
+           underlying stream began.  This is used, together with
+           ``_rewind_numchars``, to backtrack to the beginning of
+           ``linebuffer`` (which is required by ``tell()``)."""
+
+        self._rewind_numchars = None
+        """The number of characters that have been returned since the
+           read that started at ``_rewind_checkpoint``.  This is used,
+           together with ``_rewind_checkpoint``, to backtrack to the
+           beginning of ``linebuffer`` (which is required by ``tell()``)."""
+
+        self._bom = self._check_bom()
+        """The length of the byte order marker at the beginning of
+           the stream (or None for no byte order marker)."""
+
+    #/////////////////////////////////////////////////////////////////
+    # Read methods
+    #/////////////////////////////////////////////////////////////////
+
+    def read(self, size=None):
+        """
+        Read up to ``size`` bytes, decode them using this reader's
+        encoding, and return the resulting unicode string.
+
+        :param size: The maximum number of bytes to read.  If not
+            specified, then read as many bytes as possible.
+        :type size: int
+        :rtype: unicode
+        """
+        chars = self._read(size)
+
+        # If linebuffer is not empty, then include it in the result
+        if self.linebuffer:
+            chars = ''.join(self.linebuffer) + chars
+            self.linebuffer = None
+            self._rewind_numchars = None
+
+        return chars
+
+    def readline(self, size=None):
+        """
+        Read a line of text, decode it using this reader's encoding,
+        and return the resulting unicode string.
+
+        :param size: The maximum number of bytes to read.  If no
+            newline is encountered before ``size`` bytes have been read,
+            then the returned value may not be a complete line of text.
+        :type size: int
+        """
+        # If we have a non-empty linebuffer, then return the first
+        # line from it.  (Note that the last element of linebuffer may
+        # not be a complete line; so let _read() deal with it.)
+        if self.linebuffer and len(self.linebuffer) > 1:
+            line = self.linebuffer.pop(0)
+            self._rewind_numchars += len(line)
+            return line
+
+        readsize = size or 72
+        chars = ''
+
+        # If there's a remaining incomplete line in the buffer, add it.
+        if self.linebuffer:
+            chars += self.linebuffer.pop()
+            self.linebuffer = None
+
+        while True:
+            startpos = self.stream.tell() - len(self.bytebuffer)
+            new_chars = self._read(readsize)
+
+            # If we're at a '\r', then read one extra character, since
+            # it might be a '\n', to get the proper line ending.
+            if new_chars and new_chars.endswith('\r'):
+                new_chars += self._read(1)
+
+            chars += new_chars
+            lines = chars.splitlines(True)
+            if len(lines) > 1:
+                line = lines[0]
+                self.linebuffer = lines[1:]
+                self._rewind_numchars = (len(new_chars) -
+                                         (len(chars) - len(line)))
+                self._rewind_checkpoint = startpos
+                break
+            elif len(lines) == 1:
+                line0withend = lines[0]
+                line0withoutend = lines[0].splitlines(False)[0]
+                if line0withend != line0withoutend:  # complete line
+                    line = line0withend
+                    break
+
+            if not new_chars or size is not None:
+                line = chars
+                break
+
+            # Read successively larger blocks of text.
+            if readsize < 8000:
+                readsize *= 2
+
+        return line
+
+    def readlines(self, sizehint=None, keepends=True):
+        """
+        Read this file's contents, decode them using this reader's
+        encoding, and return it as a list of unicode lines.
+
+        :rtype: list(unicode)
+        :param sizehint: Ignored.
+        :param keepends: If false, then strip newlines.
+        """
+        return self.read().splitlines(keepends)
+
+    def next(self):
+        """Return the next decoded line from the underlying stream."""
+        line = self.readline()
+        if line:
+            return line
+        else:
+            raise StopIteration
+
+    def __next__(self):
+        return self.next()
+
+    def __iter__(self):
+        """Return self"""
+        return self
+
+    def xreadlines(self):
+        """Return self"""
+        return self
+
+    #/////////////////////////////////////////////////////////////////
+    # Pass-through methods & properties
+    #/////////////////////////////////////////////////////////////////
+
+    @property
+    def closed(self):
+        """True if the underlying stream is closed."""
+        return self.stream.closed
+
+    @property
+    def name(self):
+        """The name of the underlying stream."""
+        return self.stream.name
+
+    @property
+    def mode(self):
+        """The mode of the underlying stream."""
+        return self.stream.mode
+
+    def close(self):
+        """
+        Close the underlying stream.
+        """
+        self.stream.close()
+
+    #/////////////////////////////////////////////////////////////////
+    # Seek and tell
+    #/////////////////////////////////////////////////////////////////
+
+    def seek(self, offset, whence=0):
+        """
+        Move the stream to a new file position.  If the reader is
+        maintaining any buffers, then they will be cleared.
+
+        :param offset: A byte count offset.
+        :param whence: If 0, then the offset is from the start of the file
+            (offset should be positive), if 1, then the offset is from the
+            current position (offset may be positive or negative); and if 2,
+            then the offset is from the end of the file (offset should
+            typically be negative).
+        """
+        if whence == 1:
+            raise ValueError('Relative seek is not supported for '
+                             'SeekableUnicodeStreamReader -- consider '
+                             'using char_seek_forward() instead.')
+        self.stream.seek(offset, whence)
+        self.linebuffer = None
+        self.bytebuffer = b''
+        self._rewind_numchars = None
+        self._rewind_checkpoint = self.stream.tell()
+
+    def char_seek_forward(self, offset):
+        """
+        Move the read pointer forward by ``offset`` characters.
+        """
+        if offset < 0:
+            raise ValueError('Negative offsets are not supported')
+        # Clear all buffers.
+        self.seek(self.tell())
+        # Perform the seek operation.
+        self._char_seek_forward(offset)
+
+    def _char_seek_forward(self, offset, est_bytes=None):
+        """
+        Move the file position forward by ``offset`` characters,
+        ignoring all buffers.
+
+        :param est_bytes: A hint, giving an estimate of the number of
+            bytes that will be needed to move forward by ``offset`` chars.
+            Defaults to ``offset``.
+        """
+        if est_bytes is None:
+            est_bytes = offset
+        bytes = b''
+
+        while True:
+            # Read in a block of bytes.
+            newbytes = self.stream.read(est_bytes - len(bytes))
+            bytes += newbytes
+
+            # Decode the bytes to characters.
+            chars, bytes_decoded = self._incr_decode(bytes)
+
+            # If we got the right number of characters, then seek
+            # backwards over any truncated characters, and return.
+            if len(chars) == offset:
+                self.stream.seek(-len(bytes) + bytes_decoded, 1)
+                return
+
+            # If we went too far, then we can back-up until we get it
+            # right, using the bytes we've already read.
+            if len(chars) > offset:
+                while len(chars) > offset:
+                    # Assume at least one byte/char.
+                    est_bytes += offset - len(chars)
+                    chars, bytes_decoded = self._incr_decode(bytes[:est_bytes])
+                self.stream.seek(-len(bytes) + bytes_decoded, 1)
+                return
+
+            # Otherwise, we haven't read enough bytes yet; loop again.
+            est_bytes += offset - len(chars)
+
+    def tell(self):
+        """
+        Return the current file position on the underlying byte
+        stream.  If this reader is maintaining any buffers, then the
+        returned file position will be the position of the beginning
+        of those buffers.
+        """
+        # If nothing's buffered, then just return our current filepos:
+        if self.linebuffer is None:
+            return self.stream.tell() - len(self.bytebuffer)
+
+        # Otherwise, we'll need to backtrack the filepos until we
+        # reach the beginning of the buffer.
+
+        # Store our original file position, so we can return here.
+        orig_filepos = self.stream.tell()
+
+        # Calculate an estimate of where we think the newline is.
+        bytes_read = ((orig_filepos - len(self.bytebuffer)) -
+                      self._rewind_checkpoint)
+        buf_size = sum(len(line) for line in self.linebuffer)
+        est_bytes = int((bytes_read * self._rewind_numchars /
+                         (self._rewind_numchars + buf_size)))
+
+        self.stream.seek(self._rewind_checkpoint)
+        self._char_seek_forward(self._rewind_numchars, est_bytes)
+        filepos = self.stream.tell()
+
+        # Sanity check
+        if self.DEBUG:
+            self.stream.seek(filepos)
+            check1 = self._incr_decode(self.stream.read(50))[0]
+            check2 = ''.join(self.linebuffer)
+            assert check1.startswith(check2) or check2.startswith(check1)
+
+        # Return to our original filepos (so we don't have to throw
+        # out our buffer.)
+        self.stream.seek(orig_filepos)
+
+        # Return the calculated filepos
+        return filepos
+
+    #/////////////////////////////////////////////////////////////////
+    # Helper methods
+    #/////////////////////////////////////////////////////////////////
+
+    def _read(self, size=None):
+        """
+        Read up to ``size`` bytes from the underlying stream, decode
+        them using this reader's encoding, and return the resulting
+        unicode string.  ``linebuffer`` is not included in the result.
+        """
+        if size == 0:
+            return ''
+
+        # Skip past the byte order marker, if present.
+        if self._bom and self.stream.tell() == 0:
+            self.stream.read(self._bom)
+
+        # Read the requested number of bytes.
+        if size is None:
+            new_bytes = self.stream.read()
+        else:
+            new_bytes = self.stream.read(size)
+        bytes = self.bytebuffer + new_bytes
+
+        # Decode the bytes into unicode characters
+        chars, bytes_decoded = self._incr_decode(bytes)
+
+        # If we got bytes but couldn't decode any, then read further.
+        if (size is not None) and (not chars) and (len(new_bytes) > 0):
+            while not chars:
+                new_bytes = self.stream.read(1)
+                if not new_bytes:
+                    break  # end of file.
+                bytes += new_bytes
+                chars, bytes_decoded = self._incr_decode(bytes)
+
+        # Record any bytes we didn't consume.
+        self.bytebuffer = bytes[bytes_decoded:]
+
+        # Return the result
+        return chars
+
+    def _incr_decode(self, bytes):
+        """
+        Decode the given byte string into a unicode string, using this
+        reader's encoding.  If an exception is encountered that
+        appears to be caused by a truncation error, then just decode
+        the byte string without the bytes that cause the truncation
+        error.
+
+        Return a tuple ``(chars, num_consumed)``, where ``chars`` is
+        the decoded unicode string, and ``num_consumed`` is the
+        number of bytes that were consumed.
+        """
+        while True:
+            try:
+                return self.decode(bytes, 'strict')
+            except UnicodeDecodeError as exc:
+                # If the exception occurs at the end of the string,
+                # then assume that it's a truncation error.
+                if exc.end == len(bytes):
+                    return self.decode(bytes[:exc.start], self.errors)
+
+                # Otherwise, if we're being strict, then raise it.
+                elif self.errors == 'strict':
+                    raise
+
+                # If we're not strict, then re-process it with our
+                # errors setting.  This *may* raise an exception.
+                else:
+                    return self.decode(bytes, self.errors)
+
+    _BOM_TABLE = {
+        'utf8': [(codecs.BOM_UTF8, None)],
+        'utf16': [(codecs.BOM_UTF16_LE, 'utf16-le'),
+                  (codecs.BOM_UTF16_BE, 'utf16-be')],
+        'utf16le': [(codecs.BOM_UTF16_LE, None)],
+        'utf16be': [(codecs.BOM_UTF16_BE, None)],
+        'utf32': [(codecs.BOM_UTF32_LE, 'utf32-le'),
+                  (codecs.BOM_UTF32_BE, 'utf32-be')],
+        'utf32le': [(codecs.BOM_UTF32_LE, None)],
+        'utf32be': [(codecs.BOM_UTF32_BE, None)],
+    }
+
+    def _check_bom(self):
+        # Normalize our encoding name
+        enc = re.sub('[ -]', '', self.encoding.lower())
+
+        # Look up our encoding in the BOM table.
+        bom_info = self._BOM_TABLE.get(enc)
+
+        if bom_info:
+            # Read a prefix, to check against the BOM(s)
+            bytes = self.stream.read(16)
+            self.stream.seek(0)
+
+            # Check for each possible BOM.
+            for (bom, new_encoding) in bom_info:
+                if bytes.startswith(bom):
+                    if new_encoding:
+                        self.encoding = new_encoding
+                    return len(bom)
+
+        return None
+
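+# A brief sketch using an in-memory stream (illustrative): tell() and
+# seek() work in byte offsets of the underlying stream, even though
+# read()/readline() return decoded text:
+#
+#     from io import BytesIO
+#     reader = SeekableUnicodeStreamReader(BytesIO(b'abc\ndef\n'), 'utf-8')
+#     reader.readline()        # 'abc\n'
+#     pos = reader.tell()      # byte position where 'def\n' starts
+#     reader.seek(pos)
+#     reader.read()            # 'def\n'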
+
+__all__ = ['path', 'PathPointer', 'FileSystemPathPointer', 'BufferedGzipFile',
+           'GzipFileSystemPathPointer', 'find', 'retrieve', 'FORMATS',
+           'AUTO_FORMATS', 'load', 'show_cfg', 'clear_cache', 'LazyLoader',
+           'OpenOnDemandZipFile', 'SeekableUnicodeStreamReader']
diff --git a/nlp_resource_data/nltk/data.pyc b/nlp_resource_data/nltk/data.pyc
new file mode 100755 (executable)
index 0000000..fe91094
Binary files /dev/null and b/nlp_resource_data/nltk/data.pyc differ
diff --git a/nlp_resource_data/nltk/decorators.py b/nlp_resource_data/nltk/decorators.py
new file mode 100755 (executable)
index 0000000..6350eae
--- /dev/null
@@ -0,0 +1,219 @@
+"""
+Decorator module by Michele Simionato <michelesimionato@libero.it>
+Copyright Michele Simionato, distributed under the terms of the BSD License (see below).
+http://www.phyast.pitt.edu/~micheles/python/documentation.html
+
+Included in NLTK for its support of a nice memoization decorator.
+"""
+from __future__ import print_function
+__docformat__ = 'restructuredtext en'
+
+## The basic trick is to generate the source code for the decorated function
+## with the right signature and to evaluate it.
+## Uncomment the statement 'print >> sys.stderr, func_src'  in _decorator
+## to understand what is going on.
+
+__all__ = ["decorator", "new_wrapper", "getinfo"]
+
+import sys
+
+# Hack to keep NLTK's "tokenize" module from colliding with the "tokenize" in
+# the Python standard library.
+old_sys_path = sys.path[:]
+sys.path = [p for p in sys.path if "nltk" not in p]
+import inspect
+sys.path = old_sys_path
+
+try:
+    set
+except NameError:
+    from sets import Set as set
+
+def getinfo(func):
+    """
+    Returns an info dictionary containing:
+    - name (the name of the function : str)
+    - argnames (the names of the arguments : list)
+    - defaults (the values of the default arguments : tuple)
+    - signature (the signature : str)
+    - doc (the docstring : str)
+    - module (the module name : str)
+    - dict (the function __dict__ : str)
+
+    >>> def f(self, x=1, y=2, *args, **kw): pass
+
+    >>> info = getinfo(f)
+
+    >>> info["name"]
+    'f'
+    >>> info["argnames"]
+    ['self', 'x', 'y', 'args', 'kw']
+
+    >>> info["defaults"]
+    (1, 2)
+
+    >>> info["signature"]
+    'self, x, y, *args, **kw'
+    """
+    assert inspect.ismethod(func) or inspect.isfunction(func)
+    if sys.version_info[0] >= 3:
+        argspec = inspect.getfullargspec(func)
+    else:
+        argspec = inspect.getargspec(func)
+    regargs, varargs, varkwargs, defaults = argspec[:4]
+    argnames = list(regargs)
+    if varargs:
+        argnames.append(varargs)
+    if varkwargs:
+        argnames.append(varkwargs)
+    signature = inspect.formatargspec(regargs, varargs, varkwargs, defaults,
+                                      formatvalue=lambda value: "")[1:-1]
+
+    # pypy compatibility
+    if hasattr(func, '__closure__'):
+        _closure = func.__closure__
+        _globals = func.__globals__
+    else:
+        _closure = func.func_closure
+        _globals = func.func_globals
+
+    return dict(name=func.__name__, argnames=argnames, signature=signature,
+                defaults = func.__defaults__, doc=func.__doc__,
+                module=func.__module__, dict=func.__dict__,
+                globals=_globals, closure=_closure)
+
+# akin to functools.update_wrapper
+def update_wrapper(wrapper, model, infodict=None):
+    infodict = infodict or getinfo(model)
+    wrapper.__name__ = infodict['name']
+    wrapper.__doc__ = infodict['doc']
+    wrapper.__module__ = infodict['module']
+    wrapper.__dict__.update(infodict['dict'])
+    wrapper.__defaults__ = infodict['defaults']
+    wrapper.undecorated = model
+    return wrapper
+
+def new_wrapper(wrapper, model):
+    """
+    An improvement over functools.update_wrapper. The wrapper is a generic
+    callable object. It works by generating a copy of the wrapper with the
+    right signature and by updating the copy, not the original.
+    Moreover, 'model' can be a dictionary with keys 'name', 'doc', 'module',
+    'dict', 'defaults'.
+    """
+    if isinstance(model, dict):
+        infodict = model
+    else: # assume model is a function
+        infodict = getinfo(model)
+    assert not '_wrapper_' in infodict["argnames"], (
+        '"_wrapper_" is a reserved argument name!')
+    src = "lambda %(signature)s: _wrapper_(%(signature)s)" % infodict
+    funcopy = eval(src, dict(_wrapper_=wrapper))
+    return update_wrapper(funcopy, model, infodict)
+
+# helper used in decorator_factory
+def __call__(self, func):
+    return new_wrapper(lambda *a, **k : self.call(func, *a, **k), func)
+
+def decorator_factory(cls):
+    """
+    Take a class with a ``.call`` method and return a callable decorator
+    object. It works by adding a suitable __call__ method to the class;
+    it raises a TypeError if the class already has a nontrivial __call__
+    method.
+    """
+    attrs = set(dir(cls))
+    if '__call__' in attrs:
+        raise TypeError('You cannot decorate a class with a nontrivial '
+                        '__call__ method')
+    if 'call' not in attrs:
+        raise TypeError('You cannot decorate a class without a '
+                        '.call method')
+    cls.__call__ = __call__
+    return cls
+
+def decorator(caller):
+    """
+    General purpose decorator factory: takes a caller function as
+    input and returns a decorator with the same attributes.
+    A caller function is any function like this::
+
+     def caller(func, *args, **kw):
+         # do something
+         return func(*args, **kw)
+
+    Here is an example of usage:
+
+    >>> @decorator
+    ... def chatty(f, *args, **kw):
+    ...     print("Calling %r" % f.__name__)
+    ...     return f(*args, **kw)
+
+    >>> chatty.__name__
+    'chatty'
+
+    >>> @chatty
+    ... def f(): pass
+    ...
+    >>> f()
+    Calling 'f'
+
+    decorator can also accept as input a class with a ``.call`` method; in
+    that case it converts the class into a factory of callable decorator objects.
+    See the documentation for an example.
+    """
+    if inspect.isclass(caller):
+        return decorator_factory(caller)
+    def _decorator(func): # the real meat is here
+        infodict = getinfo(func)
+        argnames = infodict['argnames']
+        assert not ('_call_' in argnames or '_func_' in argnames), (
+            'You cannot use _call_ or _func_ as argument names!')
+        src = "lambda %(signature)s: _call_(_func_, %(signature)s)" % infodict
+        # import sys; print >> sys.stderr, src # for debugging purposes
+        dec_func = eval(src, dict(_func_=func, _call_=caller))
+        return update_wrapper(dec_func, func, infodict)
+    return update_wrapper(_decorator, caller)
+
+def getattr_(obj, name, default_thunk):
+    "Similar to .setdefault in dictionaries."
+    try:
+        return getattr(obj, name)
+    except AttributeError:
+        default = default_thunk()
+        setattr(obj, name, default)
+        return default
+
+@decorator
+def memoize(func, *args):
+    dic = getattr_(func, "memoize_dic", dict)
+    # memoize_dic is created at the first call
+    if args in dic:
+        return dic[args]
+    else:
+        result = func(*args)
+        dic[args] = result
+        return result
+
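+# A brief usage sketch (illustrative; the function below is hypothetical):
+#
+#     @memoize
+#     def fib(n):
+#         return n if n < 2 else fib(n - 1) + fib(n - 2)
+#
+#     fib(30)   # repeated calls with the same argument reuse memoize_dic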
+
+##########################     LEGALESE    ###############################
+
+##   Redistributions of source code must retain the above copyright
+##   notice, this list of conditions and the following disclaimer.
+##   Redistributions in bytecode form must reproduce the above copyright
+##   notice, this list of conditions and the following disclaimer in
+##   the documentation and/or other materials provided with the
+##   distribution.
+
+##   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+##   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+##   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+##   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+##   HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+##   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+##   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+##   OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+##   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+##   TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+##   USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+##   DAMAGE.
diff --git a/nlp_resource_data/nltk/decorators.pyc b/nlp_resource_data/nltk/decorators.pyc
new file mode 100755 (executable)
index 0000000..5e3bd00
Binary files /dev/null and b/nlp_resource_data/nltk/decorators.pyc differ
diff --git a/nlp_resource_data/nltk/downloader.py b/nlp_resource_data/nltk/downloader.py
new file mode 100755 (executable)
index 0000000..452fade
--- /dev/null
@@ -0,0 +1,2278 @@
+# Natural Language Toolkit: Corpus & Model Downloader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+The NLTK corpus and module downloader.  This module defines several
+interfaces for downloading corpora, models, and other data packages
+for use with NLTK.
+
+Downloading Packages
+====================
+If called with no arguments, ``download()`` will display an interactive
+interface which can be used to download and install new packages.
+If Tkinter is available, then a graphical interface will be shown,
+otherwise a simple text interface will be provided.
+
+Individual packages can be downloaded by calling the ``download()``
+function with a single argument, giving the package identifier for the
+package that should be downloaded:
+
+    >>> download('treebank') # doctest: +SKIP
+    [nltk_data] Downloading package 'treebank'...
+    [nltk_data]   Unzipping corpora/treebank.zip.
+
+NLTK also provides a number of "package collections", consisting of
+a group of related packages.  To download all packages in a
+collection, simply call ``download()`` with the collection's
+identifier:
+
+    >>> download('all-corpora') # doctest: +SKIP
+    [nltk_data] Downloading package 'abc'...
+    [nltk_data]   Unzipping corpora/abc.zip.
+    [nltk_data] Downloading package 'alpino'...
+    [nltk_data]   Unzipping corpora/alpino.zip.
+      ...
+    [nltk_data] Downloading package 'words'...
+    [nltk_data]   Unzipping corpora/words.zip.
+
+Download Directory
+==================
+By default, packages are installed either in a system-wide directory
+(if Python has sufficient permission to write to it) or in the current
+user's home directory.  However, the ``download_dir`` argument may be
+used to specify a different installation target, if desired.
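+
+For example, to install the ``punkt`` package into an explicitly chosen
+directory (the path shown is just an example; any writable path will do):
+
+    >>> download('punkt', download_dir='/tmp/nltk_data') # doctest: +SKIP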
+
+See ``Downloader.default_download_dir()`` for a more detailed
+description of how the default download directory is chosen.
+
+NLTK Download Server
+====================
+Before downloading any packages, the corpus and module downloader
+contacts the NLTK download server, to retrieve an index file
+describing the available packages.  By default, this index file is
+loaded from ``https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml``.
+If necessary, it is possible to create a new ``Downloader`` object,
+specifying a different URL for the package index file.
+
+Usage::
+
+    python nltk/downloader.py [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS
+
+or::
+
+    python -m nltk.downloader [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS
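+
+For example (the package identifier and target directory are illustrative)::
+
+    python -m nltk.downloader -d /tmp/nltk_data punkt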
+"""
+#----------------------------------------------------------------------
+from __future__ import print_function, division, unicode_literals
+
+"""
+
+  0     1  2    3
+[label][----][label][----]
+[column  ][column     ]
+
+Notes
+=====
+Handling data files.  Some questions:
+
+* Should the data files be kept zipped or unzipped?  I say zipped.
+
+* Should the data files be kept in svn at all?  Advantages: history;
+  automatic version numbers; 'svn up' could be used rather than the
+  downloader to update the corpora.  Disadvantages: they're big,
+  which makes working from svn a bit of a pain.  And we're planning
+  to potentially make them much bigger.  I don't think we want
+  people to have to download 400MB corpora just to use nltk from svn.
+
+* Compromise: keep the data files in trunk/data rather than in
+  trunk/nltk.  That way you can check them out in svn if you want
+  to; but you don't need to, and you can use the downloader instead.
+
+* Also: keep models in mind.  When we change the code, we'd
+  potentially like the models to get updated.  This could require a
+  little thought.
+
+* So.. let's assume we have a trunk/data directory, containing a bunch
+  of packages.  The packages should be kept as zip files, because we
+  really shouldn't be editing them much (well -- we may edit models
+  more, but they tend to be binary-ish files anyway, where diffs
+  aren't that helpful).  So we'll have trunk/data, with a bunch of
+  files like abc.zip and treebank.zip and propbank.zip.  For each
+  package we could also have eg treebank.xml and propbank.xml,
+  describing the contents of the package (name, copyright, license,
+  etc).  Collections would also have .xml files.  Finally, we would
+  pull all these together to form a single index.xml file.  Some
+  directory structure wouldn't hurt.  So how about::
+
+    /trunk/data/ ....................... root of data svn
+      index.xml ........................ main index file
+      src/ ............................. python scripts
+      packages/ ........................ dir for packages
+        corpora/ ....................... zip & xml files for corpora
+        grammars/ ...................... zip & xml files for grammars
+        taggers/ ....................... zip & xml files for taggers
+        tokenizers/ .................... zip & xml files for tokenizers
+        etc.
+      collections/ ..................... xml files for collections
+
+  Where the root (/trunk/data) would contain a makefile; and src/
+  would contain a script to update the info.xml file.  It could also
+  contain scripts to rebuild some of the various model files.  The
+  script that builds index.xml should probably check that each zip
+  file expands entirely into a single subdir, whose name matches the
+  package's uid.
+
+Changes I need to make:
+  - in index: change "size" to "filesize" or "compressed-size"
+  - in index: add "unzipped-size"
+  - when checking status: check both compressed & uncompressed size.
+    uncompressed size is important to make sure we detect a problem
+    if something got partially unzipped.  define new status values
+    to differentiate stale vs corrupt vs corruptly-uncompressed??
+    (we shouldn't need to re-download the file if the zip file is ok
+    but it didn't get uncompressed fully.)
+  - add other fields to the index: author, license, copyright, contact,
+    etc.
+
+the current grammars/ package would become a single new package (eg
+toy-grammars or book-grammars).
+
+xml file should have:
+  - authorship info
+  - license info
+  - copyright info
+  - contact info
+  - info about what type of data/annotation it contains?
+  - recommended corpus reader?
+
+collections can contain other collections.  they can also contain
+multiple package types (corpora & models).  Have a single 'basics'
+package that includes everything we talk about in the book?
+
+n.b.: there will have to be a fallback to the punkt tokenizer, in case
+they didn't download that model.
+
+default: unzip or not?
+
+"""
+import time, os, zipfile, sys, textwrap, threading, itertools, shutil
+from hashlib import md5
+
+try:
+    TKINTER = True
+    from six.moves.tkinter import (Tk, Frame, Label, Entry, Button, Canvas,
+                                   Menu, IntVar, TclError)
+    from six.moves.tkinter_messagebox import showerror
+    from nltk.draw.table import Table
+    from nltk.draw.util import ShowText
+except ImportError:
+    TKINTER = False
+    TclError = ValueError
+
+from xml.etree import ElementTree
+
+from six import string_types, text_type
+from six.moves import input
+from six.moves.urllib.request import urlopen
+from six.moves.urllib.error import HTTPError, URLError
+
+import nltk
+from nltk.compat import python_2_unicode_compatible
+#urllib2 = nltk.internals.import_from_stdlib('urllib2')
+
+
+######################################################################
+# Directory entry objects (from the data server's index file)
+######################################################################
+
+@python_2_unicode_compatible
+class Package(object):
+    """
+    A directory entry for a downloadable package.  These entries are
+    extracted from the XML index file that is downloaded by
+    ``Downloader``.  Each package consists of a single file; but if
+    that file is a zip file, then it can be automatically decompressed
+    when the package is installed.
+    """
+    def __init__(self, id, url, name=None, subdir='',
+                 size=None, unzipped_size=None,
+                 checksum=None, svn_revision=None,
+                 copyright='Unknown', contact='Unknown',
+                 license='Unknown', author='Unknown',
+                 unzip=True,
+                 **kw):
+        self.id = id
+        """A unique identifier for this package."""
+
+        self.name = name or id
+        """A string name for this package."""
+
+        self.subdir = subdir
+        """The subdirectory where this package should be installed.
+           E.g., ``'corpora'`` or ``'taggers'``."""
+
+        self.url = url
+        """A URL that can be used to download this package's file."""
+
+        self.size = int(size)
+        """The filesize (in bytes) of the package file."""
+
+        self.unzipped_size = int(unzipped_size)
+        """The total filesize of the files contained in the package's
+           zipfile."""
+
+        self.checksum = checksum
+        """The MD-5 checksum of the package file."""
+
+        self.svn_revision = svn_revision
+        """A subversion revision number for this package."""
+
+        self.copyright = copyright
+        """Copyright holder for this package."""
+
+        self.contact = contact
+        """Name & email of the person who should be contacted with
+           questions about this package."""
+
+        self.license = license
+        """License information for this package."""
+
+        self.author = author
+        """Author of this package."""
+
+        ext = os.path.splitext(url.split('/')[-1])[1]
+        self.filename = os.path.join(subdir, id+ext)
+        """The filename that should be used for this package's file.  It
+           is formed by joining ``self.subdir`` with ``self.id``, and
+           using the same extension as ``url``."""
+
+        self.unzip = bool(int(unzip)) # '0' or '1'
+        """A flag indicating whether this corpus should be unzipped by
+           default."""
+
+        # Include any other attributes provided by the XML file.
+        self.__dict__.update(kw)
+
+    @staticmethod
+    def fromxml(xml):
+        if isinstance(xml, string_types):
+            xml = ElementTree.parse(xml)
+        for key in xml.attrib:
+            xml.attrib[key] = text_type(xml.attrib[key])
+        return Package(**xml.attrib)
+
+    def __lt__(self, other):
+        return self.id < other.id
+
+    def __repr__(self):
+        return '<Package %s>' % self.id
+
+@python_2_unicode_compatible
+class Collection(object):
+    """
+    A directory entry for a collection of downloadable packages.
+    These entries are extracted from the XML index file that is
+    downloaded by ``Downloader``.
+    """
+    def __init__(self, id, children, name=None, **kw):
+        self.id = id
+        """A unique identifier for this collection."""
+
+        self.name = name or id
+        """A string name for this collection."""
+
+        self.children = children
+        """A list of the ``Collections`` or ``Packages`` directly
+           contained by this collection."""
+
+        self.packages = None
+        """A list of ``Packages`` contained by this collection or any
+           collections it recursively contains."""
+
+        # Include any other attributes provided by the XML file.
+        self.__dict__.update(kw)
+
+    @staticmethod
+    def fromxml(xml):
+        if isinstance(xml, string_types):
+            xml = ElementTree.parse(xml)
+        for key in xml.attrib:
+            xml.attrib[key] = text_type(xml.attrib[key])
+        children = [child.get('ref') for child in xml.findall('item')]
+        return Collection(children=children, **xml.attrib)
+
+    def __lt__(self, other):
+        return self.id < other.id
+
+    def __repr__(self):
+        return '<Collection %s>' % self.id
+
+######################################################################
+# Message Passing Objects
+######################################################################
+
+class DownloaderMessage(object):
+    """A status message object, used by ``incr_download`` to
+       communicate its progress."""
+class StartCollectionMessage(DownloaderMessage):
+    """Data server has started working on a collection of packages."""
+    def __init__(self, collection): self.collection = collection
+class FinishCollectionMessage(DownloaderMessage):
+    """Data server has finished working on a collection of packages."""
+    def __init__(self, collection): self.collection = collection
+class StartPackageMessage(DownloaderMessage):
+    """Data server has started working on a package."""
+    def __init__(self, package): self.package = package
+class FinishPackageMessage(DownloaderMessage):
+    """Data server has finished working on a package."""
+    def __init__(self, package): self.package = package
+class StartDownloadMessage(DownloaderMessage):
+    """Data server has started downloading a package."""
+    def __init__(self, package): self.package = package
+class FinishDownloadMessage(DownloaderMessage):
+    """Data server has finished downloading a package."""
+    def __init__(self, package): self.package = package
+class StartUnzipMessage(DownloaderMessage):
+    """Data server has started unzipping a package."""
+    def __init__(self, package): self.package = package
+class FinishUnzipMessage(DownloaderMessage):
+    """Data server has finished unzipping a package."""
+    def __init__(self, package): self.package = package
+class UpToDateMessage(DownloaderMessage):
+    """The package download file is already up-to-date"""
+    def __init__(self, package): self.package = package
+class StaleMessage(DownloaderMessage):
+    """The package download file is out-of-date or corrupt"""
+    def __init__(self, package): self.package = package
+class ErrorMessage(DownloaderMessage):
+    """Data server encountered an error"""
+    def __init__(self, package, message):
+        self.package = package
+        if isinstance(message, Exception):
+            self.message = str(message)
+        else:
+            self.message = message
+
+class ProgressMessage(DownloaderMessage):
+    """Indicates how much progress the data server has made"""
+    def __init__(self, progress): self.progress = progress
+class SelectDownloadDirMessage(DownloaderMessage):
+    """Indicates what download directory the data server is using"""
+    def __init__(self, download_dir): self.download_dir = download_dir
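+
+# A sketch of how these messages are typically consumed (the package
+# identifier and the progress display are illustrative only):
+#
+#     for msg in Downloader().incr_download('treebank'):
+#         if isinstance(msg, ProgressMessage):
+#             print('%d%% done' % msg.progress)
+#         elif isinstance(msg, ErrorMessage):
+#             print(msg.message)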
+
+######################################################################
+# NLTK Data Server
+######################################################################
+
+class Downloader(object):
+    """
+    A class used to access the NLTK data server, which can be used to
+    download corpora and other data packages.
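+
+    A minimal sketch of typical use (the package identifier is just an
+    example)::
+
+        d = Downloader()
+        if not d.is_installed('treebank'):
+            d.download('treebank')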
+    """
+
+    #/////////////////////////////////////////////////////////////////
+    # Configuration
+    #/////////////////////////////////////////////////////////////////
+
+    INDEX_TIMEOUT = 60*60 # 1 hour
+    """The amount of time after which the cached copy of the data
+       server index will be considered 'stale,' and will be
+       re-downloaded."""
+
+    DEFAULT_URL = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml'
+    """The default URL for the NLTK data server's index.  An
+       alternative URL can be specified when creating a new
+       ``Downloader`` object."""
+
+    #/////////////////////////////////////////////////////////////////
+    # Status Constants
+    #/////////////////////////////////////////////////////////////////
+
+    INSTALLED = 'installed'
+    """A status string indicating that a package or collection is
+       installed and up-to-date."""
+    NOT_INSTALLED = 'not installed'
+    """A status string indicating that a package or collection is
+       not installed."""
+    STALE = 'out of date'
+    """A status string indicating that a package or collection is
+       corrupt or out-of-date."""
+    PARTIAL = 'partial'
+    """A status string indicating that a collection is partially
+       installed (i.e., only some of its packages are installed.)"""
+
+    #/////////////////////////////////////////////////////////////////
+    # Constructor
+    #/////////////////////////////////////////////////////////////////
+
+    def __init__(self, server_index_url=None, download_dir=None):
+        self._url = server_index_url or self.DEFAULT_URL
+        """The URL for the data server's index file."""
+
+        self._collections = {}
+        """Dictionary from collection identifier to ``Collection``"""
+
+        self._packages = {}
+        """Dictionary from package identifier to ``Package``"""
+
+        self._download_dir = download_dir
+        """The default directory to which packages will be downloaded."""
+
+        self._index = None
+        """The XML index file downloaded from the data server"""
+
+        self._index_timestamp = None
+        """Time at which ``self._index`` was downloaded.  If it is more
+           than ``INDEX_TIMEOUT`` seconds old, it will be re-downloaded."""
+
+        self._status_cache = {}
+        """Dictionary from package/collection identifier to status
+           string (``INSTALLED``, ``NOT_INSTALLED``, ``STALE``, or
+           ``PARTIAL``).  Cache is used for packages only, not
+           collections."""
+
+        self._errors = None
+        """Flag for telling if all packages got successfully downloaded or not."""
+
+        # decide where we're going to save things to.
+        if self._download_dir is None:
+            self._download_dir = self.default_download_dir()
+
+    #/////////////////////////////////////////////////////////////////
+    # Information
+    #/////////////////////////////////////////////////////////////////
+
+    def list(self, download_dir=None, show_packages=True,
+             show_collections=True, header=True, more_prompt=False,
+             skip_installed=False):
+        lines = 0 # for more_prompt
+        if download_dir is None:
+            download_dir = self._download_dir
+            print('Using default data directory (%s)' % download_dir)
+        if header:
+            print('='*(26+len(self._url)))
+            print(' Data server index for <%s>' % self._url)
+            print('='*(26+len(self._url)))
+            lines += 3 # for more_prompt
+        stale = partial = False
+
+        categories = []
+        if show_packages: categories.append('packages')
+        if show_collections: categories.append('collections')
+        for category in categories:
+            print('%s:' % category.capitalize())
+            lines += 1 # for more_prompt
+            for info in sorted(getattr(self, category)(), key=str):
+                status = self.status(info, download_dir)
+                if status == self.INSTALLED and skip_installed: continue
+                if status == self.STALE: stale = True
+                if status == self.PARTIAL: partial = True
+                prefix = {self.INSTALLED:'*', self.STALE:'-',
+                          self.PARTIAL:'P', self.NOT_INSTALLED: ' '}[status]
+                name = textwrap.fill('-'*27 + (info.name or info.id),
+                                     75, subsequent_indent=27*' ')[27:]
+                print('  [%s] %s %s' % (prefix, info.id.ljust(20, '.'), name))
+                lines += len(name.split('\n')) # for more_prompt
+                if more_prompt and lines > 20:
+                    user_input = input("Hit Enter to continue: ")
+                    if (user_input.lower() in ('x', 'q')): return
+                    lines = 0
+            print()
+        msg = '([*] marks installed packages'
+        if stale: msg += '; [-] marks out-of-date or corrupt packages'
+        if partial: msg += '; [P] marks partially installed collections'
+        print(textwrap.fill(msg+')', subsequent_indent=' ', width=76))
+
+    def packages(self):
+        self._update_index()
+        return self._packages.values()
+
+    def corpora(self):
+        self._update_index()
+        return [pkg for (id,pkg) in self._packages.items()
+                if pkg.subdir == 'corpora']
+
+    def models(self):
+        self._update_index()
+        return [pkg for (id,pkg) in self._packages.items()
+                if pkg.subdir != 'corpora']
+
+    def collections(self):
+        self._update_index()
+        return self._collections.values()
+
+    #/////////////////////////////////////////////////////////////////
+    # Downloading
+    #/////////////////////////////////////////////////////////////////
+
+    def _info_or_id(self, info_or_id):
+        if isinstance(info_or_id, string_types):
+            return self.info(info_or_id)
+        else:
+            return info_or_id
+
+    # [xx] When during downloading is it 'safe' to abort?  Only unsafe
+    # time is *during* an unzip -- we don't want to leave a
+    # partially-unzipped corpus in place because we wouldn't notice
+    # it.  But if we had the exact total size of the unzipped corpus,
+    # then that would be fine.  Then we could abort anytime we want!
+    # So this is really what we should do.  That way the threaded
+    # downloader in the gui can just kill the download thread anytime
+    # it wants.
+
+    def incr_download(self, info_or_id, download_dir=None, force=False):
+        # If they didn't specify a download_dir, then use the default one.
+        if download_dir is None:
+            download_dir = self._download_dir
+            yield SelectDownloadDirMessage(download_dir)
+
+        # If they gave us a list of ids, then download each one.
+        if isinstance(info_or_id, (list,tuple)):
+            for msg in self._download_list(info_or_id, download_dir, force):
+                yield msg
+            return
+
+        # Look up the requested collection or package.
+        try: info = self._info_or_id(info_or_id)
+        except (IOError, ValueError) as e:
+            yield ErrorMessage(None, 'Error loading %s: %s' %
+                               (info_or_id, e))
+            return
+
+        # Handle collections.
+        if isinstance(info, Collection):
+            yield StartCollectionMessage(info)
+            for msg in self.incr_download(info.children, download_dir, force):
+                yield msg
+            yield FinishCollectionMessage(info)
+
+        # Handle Packages (delegate to a helper function).
+        else:
+            for msg in self._download_package(info, download_dir, force):
+                yield msg
+
+    def _num_packages(self, item):
+        if isinstance(item, Package): return 1
+        else: return len(item.packages)
+
+    def _download_list(self, items, download_dir, force):
+        # Look up the requested items.
+        for i in range(len(items)):
+            try: items[i] = self._info_or_id(items[i])
+            except (IOError, ValueError) as e:
+                yield ErrorMessage(items[i], e)
+                return
+
+        # Download each item, re-scaling their progress.
+        num_packages = sum(self._num_packages(item) for item in items)
+        progress = 0
+        for i, item in enumerate(items):
+            if isinstance(item, Package):
+                delta = 1./num_packages
+            else:
+                delta = len(item.packages)/num_packages
+            for msg in self.incr_download(item, download_dir, force):
+                if isinstance(msg, ProgressMessage):
+                    yield ProgressMessage(progress + msg.progress*delta)
+                else:
+                    yield msg
+
+            progress += 100*delta
+
+    def _download_package(self, info, download_dir, force):
+        yield StartPackageMessage(info)
+        yield ProgressMessage(0)
+
+        # Do we already have the current version?
+        status = self.status(info, download_dir)
+        if not force and status == self.INSTALLED:
+            yield UpToDateMessage(info)
+            yield ProgressMessage(100)
+            yield FinishPackageMessage(info)
+            return
+
+        # Remove the package from our status cache
+        self._status_cache.pop(info.id, None)
+
+        # Check for (and remove) any old/stale version.
+        filepath = os.path.join(download_dir, info.filename)
+        if os.path.exists(filepath):
+            if status == self.STALE:
+                yield StaleMessage(info)
+            os.remove(filepath)
+
+        # Ensure the download_dir exists
+        if not os.path.exists(download_dir):
+            os.mkdir(download_dir)
+        if not os.path.exists(os.path.join(download_dir, info.subdir)):
+            os.mkdir(os.path.join(download_dir, info.subdir))
+
+        # Download the file.  This will raise an IOError if the url
+        # is not found.
+        yield StartDownloadMessage(info)
+        yield ProgressMessage(5)
+        try:
+            infile = urlopen(info.url)
+            with open(filepath, 'wb') as outfile:
+                #print info.size
+                num_blocks = max(1, info.size/(1024*16))
+                for block in itertools.count():
+                    s = infile.read(1024*16) # 16k blocks.
+                    outfile.write(s)
+                    if not s: break
+                    if block % 2 == 0: # how often?
+                        yield ProgressMessage(min(80, 5+75*(block/num_blocks)))
+            infile.close()
+        except IOError as e:
+            yield ErrorMessage(info, 'Error downloading %r from <%s>:'
+                               '\n  %s' % (info.id, info.url, e))
+            return
+        yield FinishDownloadMessage(info)
+        yield ProgressMessage(80)
+
+        # If it's a zipfile, uncompress it.
+        if info.filename.endswith('.zip'):
+            zipdir = os.path.join(download_dir, info.subdir)
+            # Unzip if we're unzipping by default; *or* if it's already
+            # been unzipped (presumably a previous version).
+            if info.unzip or os.path.exists(os.path.join(zipdir, info.id)):
+                yield StartUnzipMessage(info)
+                for msg in _unzip_iter(filepath, zipdir, verbose=False):
+                    # Somewhat of a hack, but we need a proper package reference
+                    msg.package = info
+                    yield msg
+                yield FinishUnzipMessage(info)
+
+        yield FinishPackageMessage(info)
+
+    def download(self, info_or_id=None, download_dir=None, quiet=False,
+                 force=False, prefix='[nltk_data] ', halt_on_error=True,
+                 raise_on_error=False):
+        # If no info or id is given, then use the interactive shell.
+        if info_or_id is None:
+            # [xx] hmm -- changing self._download_dir here seems like
+            # the wrong thing to do.  Maybe the _interactive_download
+            # function should make a new copy of self to use?
+            if download_dir is not None: self._download_dir = download_dir
+            self._interactive_download()
+            return True
+
+        else:
+            # Define a helper function for displaying output:
+            def show(s, prefix2=''):
+                print(textwrap.fill(s, initial_indent=prefix+prefix2,
+                                    subsequent_indent=prefix+prefix2+' '*4))
+
+            for msg in self.incr_download(info_or_id, download_dir, force):
+                # Error messages
+                if isinstance(msg, ErrorMessage):
+                    show(msg.message)
+                    if raise_on_error:
+                        raise ValueError(msg.message)
+                    if halt_on_error:
+                        return False
+                    self._errors = True
+                    if not quiet:
+                        print("Error installing package. Retry? [n/y/e]")
+                        choice = input().strip()
+                        if choice in ['y', 'Y']:
+                            if not self.download(msg.package.id, download_dir,
+                                                 quiet, force, prefix,
+                                                 halt_on_error, raise_on_error):
+                                return False
+                        elif choice in ['e', 'E']:
+                            return False
+
+                # All other messages
+                if not quiet:
+                    # Collection downloading messages:
+                    if isinstance(msg, StartCollectionMessage):
+                        show('Downloading collection %r' % msg.collection.id)
+                        prefix += '   | '
+                        print(prefix)
+                    elif isinstance(msg, FinishCollectionMessage):
+                        print(prefix)
+                        prefix = prefix[:-4]
+                        if self._errors:
+                            show('Downloaded collection %r with errors' %
+                                 msg.collection.id)
+                        else:
+                            show('Done downloading collection %s' %
+                                 msg.collection.id)
+
+                    # Package downloading messages:
+                    elif isinstance(msg, StartPackageMessage):
+                        show('Downloading package %s to %s...' %
+                             (msg.package.id, download_dir))
+                    elif isinstance(msg, UpToDateMessage):
+                        show('Package %s is already up-to-date!' %
+                             msg.package.id, '  ')
+                    #elif isinstance(msg, StaleMessage):
+                    #    show('Package %s is out-of-date or corrupt' %
+                    #         msg.package.id, '  ')
+                    elif isinstance(msg, StartUnzipMessage):
+                        show('Unzipping %s.' % msg.package.filename, '  ')
+
+                    # Data directory message:
+                    elif isinstance(msg, SelectDownloadDirMessage):
+                        download_dir = msg.download_dir
+        return True
+
+    def is_stale(self, info_or_id, download_dir=None):
+        return self.status(info_or_id, download_dir) == self.STALE
+
+    def is_installed(self, info_or_id, download_dir=None):
+        return self.status(info_or_id, download_dir) == self.INSTALLED
+
+    def clear_status_cache(self, id=None):
+        if id is None:
+            self._status_cache.clear()
+        else:
+            self._status_cache.pop(id, None)
+
+    def status(self, info_or_id, download_dir=None):
+        """
+        Return a constant describing the status of the given package
+        or collection.  Status can be one of ``INSTALLED``,
+        ``NOT_INSTALLED``, ``STALE``, or ``PARTIAL``.
+        """
+        if download_dir is None: download_dir = self._download_dir
+        info = self._info_or_id(info_or_id)
+
+        # Handle collections:
+        if isinstance(info, Collection):
+            pkg_status = [self.status(pkg.id) for pkg in info.packages]
+            if self.STALE in pkg_status:
+                return self.STALE
+            elif self.PARTIAL in pkg_status:
+                return self.PARTIAL
+            elif (self.INSTALLED in pkg_status and
+                  self.NOT_INSTALLED in pkg_status):
+                return self.PARTIAL
+            elif self.NOT_INSTALLED in pkg_status:
+                return self.NOT_INSTALLED
+            else:
+                return self.INSTALLED
+
+        # Handle packages:
+        else:
+            filepath = os.path.join(download_dir, info.filename)
+            if download_dir != self._download_dir:
+                return self._pkg_status(info, filepath)
+            else:
+                if info.id not in self._status_cache:
+                    self._status_cache[info.id] = self._pkg_status(info,
+                                                                   filepath)
+                return self._status_cache[info.id]
+
+    def _pkg_status(self, info, filepath):
+        if not os.path.exists(filepath):
+            return self.NOT_INSTALLED
+
+        # Check if the file has the correct size.
+        try: filestat = os.stat(filepath)
+        except OSError: return self.NOT_INSTALLED
+        if filestat.st_size != int(info.size):
+            return self.STALE
+
+        # Check if the file's checksum matches
+        if md5_hexdigest(filepath) != info.checksum:
+            return self.STALE
+
+        # If it's a zipfile, and it's been at least partially
+        # unzipped, then check if it's been fully unzipped.
+        if filepath.endswith('.zip'):
+            unzipdir = filepath[:-4]
+            if not os.path.exists(unzipdir):
+                return self.INSTALLED # but not unzipped -- ok!
+            if not os.path.isdir(unzipdir):
+                return self.STALE
+
+            unzipped_size = sum(os.stat(os.path.join(d, f)).st_size
+                                for d, _, files in os.walk(unzipdir)
+                                for f in files)
+            if unzipped_size != info.unzipped_size:
+                return self.STALE
+
+        # Otherwise, everything looks good.
+        return self.INSTALLED
+
+    def update(self, quiet=False, prefix='[nltk_data] '):
+        """
+        Re-download any packages whose status is STALE.
+        """
+        self.clear_status_cache()
+        for pkg in self.packages():
+            if self.status(pkg) == self.STALE:
+                self.download(pkg, quiet=quiet, prefix=prefix)
+
+    #/////////////////////////////////////////////////////////////////
+    # Index
+    #/////////////////////////////////////////////////////////////////
+
+    def _update_index(self, url=None):
+        """A helper function that ensures that self._index is
+        up-to-date.  If the index is older than self.INDEX_TIMEOUT,
+        then download it again."""
+        # Check if the index is already up-to-date.  If so, do nothing.
+        if not (self._index is None or url is not None or
+                time.time()-self._index_timestamp > self.INDEX_TIMEOUT):
+            return
+
+        # If a URL was specified, then update our URL.
+        self._url = url or self._url
+
+        # Download the index file.
+        self._index = nltk.internals.ElementWrapper(
+            ElementTree.parse(urlopen(self._url)).getroot())
+        self._index_timestamp = time.time()
+
+        # Build a dictionary of packages.
+        packages = [Package.fromxml(p) for p in
+                    self._index.findall('packages/package')]
+        self._packages = dict((p.id, p) for p in packages)
+
+        # Build a dictionary of collections.
+        collections = [Collection.fromxml(c) for c in
+                       self._index.findall('collections/collection')]
+        self._collections = dict((c.id, c) for c in collections)
+
+        # Replace identifiers with actual children in collection.children.
+        for collection in self._collections.values():
+            children = []
+            for child_id in collection.children:
+                if child_id in self._packages:
+                    children.append(self._packages[child_id])
+                elif child_id in self._collections:
+                    children.append(self._collections[child_id])
+                else:
+                    # Skip members missing from the index; building a new
+                    # list avoids mutating the one being iterated over.
+                    print('removing collection member with no package: {}'.format(child_id))
+            collection.children = children
+
+        # Fill in collection.packages for each collection.
+        for collection in self._collections.values():
+            packages = {}
+            queue = [collection]
+            for child in queue:
+                if isinstance(child, Collection):
+                    queue.extend(child.children)
+                elif isinstance(child, Package):
+                    packages[child.id] = child
+                else:
+                    pass
+            collection.packages = packages.values()
+
+        # Flush the status cache
+        self._status_cache.clear()
+
+    def index(self):
+        """
+        Return the XML index describing the packages available from
+        the data server.  If necessary, this index will be downloaded
+        from the data server.
+        """
+        self._update_index()
+        return self._index
+
+    def info(self, id):
+        """Return the ``Package`` or ``Collection`` record for the
+           given item."""
+        self._update_index()
+        if id in self._packages: return self._packages[id]
+        if id in self._collections: return self._collections[id]
+        raise ValueError('Package %r not found in index' % id)
+
+    def xmlinfo(self, id):
+        """Return the XML info record for the given item"""
+        self._update_index()
+        for package in self._index.findall('packages/package'):
+            if package.get('id') == id:
+                return package
+        for collection in self._index.findall('collections/collection'):
+            if collection.get('id') == id:
+                return collection
+        raise ValueError('Package %r not found in index' % id)
+
+    #/////////////////////////////////////////////////////////////////
+    # URL & Data Directory
+    #/////////////////////////////////////////////////////////////////
+
+    def _get_url(self):
+        """The URL for the data server's index file."""
+        return self._url
+    def _set_url(self, url):
+        """
+        Set a new URL for the data server. If we're unable to contact
+        the given url, then the original url is kept.
+        """
+        original_url = self._url
+        try:
+            self._update_index(url)
+        except:
+            self._url = original_url
+            raise
+    url = property(_get_url, _set_url)
+
+    def default_download_dir(self):
+        """
+        Return the directory to which packages will be downloaded by
+        default.  This value can be overridden using the constructor,
+        or on a case-by-case basis using the ``download_dir`` argument when
+        calling ``download()``.
+
+        On Windows, the default download directory is ``nltk_data``
+        inside the per-user application data directory given by the
+        ``APPDATA`` environment variable.
+
+        On all other platforms, the default directory is the first of
+        the following which exists or which can be created with write
+        permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
+        ``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
+        """
+        # Check if we are on GAE where we cannot write into filesystem.
+        if 'APPENGINE_RUNTIME' in os.environ:
+            return
+
+        # Check if we have sufficient permissions to install in a
+        # variety of system-wide locations.
+        for nltkdir in nltk.data.path:
+            if (os.path.exists(nltkdir) and
+                nltk.internals.is_writable(nltkdir)):
+                return nltkdir
+
+        # On Windows, use %APPDATA%
+        if sys.platform == 'win32' and 'APPDATA' in os.environ:
+            homedir = os.environ['APPDATA']
+
+        # Otherwise, install in the user's home directory.
+        else:
+            homedir = os.path.expanduser('~/')
+            if homedir == '~/':
+                raise ValueError("Could not find a default download directory")
+
+        # append "nltk_data" to the home directory
+        return os.path.join(homedir, 'nltk_data')
+
+    def _get_download_dir(self):
+        """
+        The default directory to which packages will be downloaded.
+        This defaults to the value returned by ``default_download_dir()``.
+        To override this default on a case-by-case basis, use the
+        ``download_dir`` argument when calling ``download()``.
+        """
+        return self._download_dir
+    def _set_download_dir(self, download_dir):
+        self._download_dir = download_dir
+        # Clear the status cache.
+        self._status_cache.clear()
+    download_dir = property(_get_download_dir, _set_download_dir)
+
+    #/////////////////////////////////////////////////////////////////
+    # Interactive Shell
+    #/////////////////////////////////////////////////////////////////
+
+    def _interactive_download(self):
+        # Try the GUI first; if that doesn't work, try the simple
+        # interactive shell.
+        if TKINTER:
+            try:
+                DownloaderGUI(self).mainloop()
+            except TclError:
+                DownloaderShell(self).run()
+        else:
+            DownloaderShell(self).run()
+
+class DownloaderShell(object):
+    def __init__(self, dataserver):
+        self._ds = dataserver
+
+    def _simple_interactive_menu(self, *options):
+        print('-'*75)
+        spc = (68 - sum(len(o) for o in options))//(len(options)-1)*' '
+        print('    ' + spc.join(options))
+        #w = 76/len(options)
+        #fmt = '  ' + ('%-'+str(w)+'s')*(len(options)-1) + '%s'
+        #print fmt % options
+        print('-'*75)
+
+    def run(self):
+        print('NLTK Downloader')
+        while True:
+            self._simple_interactive_menu(
+                'd) Download', 'l) List', 'u) Update', 'c) Config', 'h) Help', 'q) Quit')
+            user_input = input('Downloader> ').strip()
+            if not user_input: print(); continue
+            command = user_input.lower().split()[0]
+            args = user_input.split()[1:]
+            try:
+                if command == 'l':
+                    print()
+                    self._ds.list(self._ds.download_dir, header=False,
+                                  more_prompt=True)
+                elif command == 'h':
+                    self._simple_interactive_help()
+                elif command == 'c':
+                    self._simple_interactive_config()
+                elif command in ('q', 'x'):
+                    return
+                elif command == 'd':
+                    self._simple_interactive_download(args)
+                elif command == 'u':
+                    self._simple_interactive_update()
+                else:
+                    print('Command %r unrecognized' % user_input)
+            except HTTPError as e:
+                print('Error reading from server: %s'%e)
+            except URLError as e:
+                print('Error connecting to server: %s'%e.reason)
+            # try checking if user_input is a package name, &
+            # downloading it?
+            print()
+
+    def _simple_interactive_download(self, args):
+        if args:
+            for arg in args:
+                try: self._ds.download(arg, prefix='    ')
+                except (IOError, ValueError) as e: print(e)
+        else:
+            while True:
+                print()
+                print('Download which package (l=list; x=cancel)?')
+                user_input = input('  Identifier> ')
+                if user_input.lower()=='l':
+                    self._ds.list(self._ds.download_dir, header=False,
+                                  more_prompt=True, skip_installed=True)
+                    continue
+                elif user_input.lower() in ('x', 'q', ''):
+                    return
+                elif user_input:
+                    for id in user_input.split():
+                        try: self._ds.download(id, prefix='    ')
+                        except (IOError, ValueError) as e: print(e)
+                    break
+
+    def _simple_interactive_update(self):
+        while True:
+            stale_packages = []
+            stale = partial = False
+            for info in sorted(getattr(self._ds, 'packages')(), key=str):
+                if self._ds.status(info) == self._ds.STALE:
+                    stale_packages.append((info.id, info.name))
+
+            print()
+            if stale_packages:
+                print('Will update following packages (o=ok; x=cancel)')
+                for pid, pname in stale_packages:
+                    name = textwrap.fill('-'*27 + (pname),
+                                     75, subsequent_indent=27*' ')[27:]
+                    print('  [ ] %s %s' % (pid.ljust(20, '.'), name))
+                print()
+
+                user_input = input('  Identifier> ')
+                if user_input.lower()=='o':
+                    for pid, pname in stale_packages:
+                        try: self._ds.download(pid, prefix='    ')
+                        except (IOError, ValueError) as e: print(e)
+                    break
+                elif user_input.lower() in ('x', 'q', ''):
+                    return
+            else:
+                print('Nothing to update.')
+                return
+
+    def _simple_interactive_help(self):
+        print()
+        print('Commands:')
+        print('  d) Download a package or collection     u) Update out of date packages')
+        print('  l) List packages & collections          h) Help')
+        print('  c) View & Modify Configuration          q) Quit')
+
+    def _show_config(self):
+        print()
+        print('Data Server:')
+        print('  - URL: <%s>' % self._ds.url)
+        print(('  - %d Package Collections Available' %
+               len(self._ds.collections())))
+        print(('  - %d Individual Packages Available' %
+               len(self._ds.packages())))
+        print()
+        print('Local Machine:')
+        print('  - Data directory: %s' % self._ds.download_dir)
+
+    def _simple_interactive_config(self):
+        self._show_config()
+        while True:
+            print()
+            self._simple_interactive_menu(
+                's) Show Config', 'u) Set Server URL',
+                'd) Set Data Dir', 'm) Main Menu')
+            user_input = input('Config> ').strip().lower()
+            if user_input == 's':
+                self._show_config()
+            elif user_input == 'd':
+                new_dl_dir = input('  New Directory> ').strip()
+                if new_dl_dir in ('', 'x', 'q', 'X', 'Q'):
+                    print('  Cancelled!')
+                elif os.path.isdir(new_dl_dir):
+                    self._ds.download_dir = new_dl_dir
+                else:
+                    print(('Directory %r not found!  Create it first.' %
+                           new_dl_dir))
+            elif user_input == 'u':
+                new_url = input('  New URL> ').strip()
+                if new_url in ('', 'x', 'q', 'X', 'Q'):
+                    print('  Cancelled!')
+                else:
+                    if not new_url.startswith(('http://', 'https://')):
+                        new_url = 'http://'+new_url
+                    try: self._ds.url = new_url
+                    except Exception as e:
+                        print('Error reading <%r>:\n  %s' % (new_url, e))
+            elif user_input == 'm':
+                break
+
+class DownloaderGUI(object):
+    """
+    Graphical interface for downloading packages from the NLTK data
+    server.
+    """
+
+    #/////////////////////////////////////////////////////////////////
+    # Column Configuration
+    #/////////////////////////////////////////////////////////////////
+
+    COLUMNS = ['', 'Identifier', 'Name', 'Size', 'Status',
+               'Unzipped Size',
+               'Copyright', 'Contact', 'License', 'Author',
+               'Subdir', 'Checksum']
+    """A list of the names of columns.  This controls the order in
+       which the columns will appear.  If this is edited, then
+       ``_package_to_columns()`` may need to be edited to match."""
+
+    COLUMN_WEIGHTS = {'': 0, 'Name': 5, 'Size': 0, 'Status': 0}
+    """A dictionary specifying how columns should be resized when the
+       table is resized.  Columns with weight 0 will not be resized at
+       all; and columns with high weight will be resized more.
+       Default weight (for columns not explicitly listed) is 1."""
+
+    COLUMN_WIDTHS = {'':1, 'Identifier':20, 'Name':45,
+                     'Size': 10, 'Unzipped Size': 10,
+                     'Status': 12}
+    """A dictionary specifying how wide each column should be, in
+       characters.  The default width (for columns not explicitly
+       listed) is specified by ``DEFAULT_COLUMN_WIDTH``."""
+
+    DEFAULT_COLUMN_WIDTH = 30
+    """The default width for columns that are not explicitly listed
+       in ``COLUMN_WIDTHS``."""
+
+    INITIAL_COLUMNS = ['', 'Identifier', 'Name', 'Size', 'Status']
+    """The set of columns that should be displayed by default."""
+
+    # Perform a few import-time sanity checks to make sure that the
+    # column configuration variables are defined consistently:
+    for c in COLUMN_WEIGHTS: assert c in COLUMNS
+    for c in COLUMN_WIDTHS: assert c in COLUMNS
+    for c in INITIAL_COLUMNS: assert c in COLUMNS
+
+    #/////////////////////////////////////////////////////////////////
+    # Color Configuration
+    #/////////////////////////////////////////////////////////////////
+
+    _BACKDROP_COLOR = ('#000', '#ccc')
+
+    _ROW_COLOR = {Downloader.INSTALLED: ('#afa', '#080'),
+                  Downloader.PARTIAL: ('#ffa', '#880'),
+                  Downloader.STALE: ('#faa', '#800'),
+                  Downloader.NOT_INSTALLED: ('#fff', '#888')}
+
+    _MARK_COLOR = ('#000', '#ccc')
+
+    #_FRONT_TAB_COLOR = ('#ccf', '#008')
+    #_BACK_TAB_COLOR = ('#88a', '#448')
+    _FRONT_TAB_COLOR = ('#fff', '#45c')
+    _BACK_TAB_COLOR = ('#aaa', '#67a')
+
+    _PROGRESS_COLOR = ('#f00', '#aaa')
+
+    _TAB_FONT = 'helvetica -16 bold'
+
+    #/////////////////////////////////////////////////////////////////
+    # Constructor
+    #/////////////////////////////////////////////////////////////////
+
+    def __init__(self, dataserver, use_threads=True):
+        self._ds = dataserver
+        self._use_threads = use_threads
+
+        # For the threaded downloader:
+        self._download_lock = threading.Lock()
+        self._download_msg_queue = []
+        self._download_abort_queue = []
+        self._downloading = False
+
+        # For tkinter after callbacks:
+        self._afterid = {}
+
+        # A message log.
+        self._log_messages = []
+        self._log_indent = 0
+        self._log('NLTK Downloader Started!')
+
+        # Create the main window.
+        top = self.top = Tk()
+        top.geometry('+50+50')
+        top.title('NLTK Downloader')
+        top.configure(background=self._BACKDROP_COLOR[1])
+
+        # Set up some bindings now, in case anything goes wrong.
+        top.bind('<Control-q>', self.destroy)
+        top.bind('<Control-x>', self.destroy)
+        self._destroyed = False
+
+        self._column_vars = {}
+
+        # Initialize the GUI.
+        self._init_widgets()
+        self._init_menu()
+        try:
+            self._fill_table()
+        except HTTPError as e:
+            showerror('Error reading from server', e)
+        except URLError as e:
+            showerror('Error connecting to server', e.reason)
+
+        self._show_info()
+        self._select_columns()
+        self._table.select(0)
+
+        # Make sure we get notified when we're destroyed, so we can
+        # cancel any download in progress.
+        self._table.bind('<Destroy>', self._destroy)
+
+    def _log(self, msg):
+        self._log_messages.append('%s %s%s' % (time.ctime(),
+                                     ' | '*self._log_indent, msg))
+
+    #/////////////////////////////////////////////////////////////////
+    # Internals
+    #/////////////////////////////////////////////////////////////////
+
+    def _init_widgets(self):
+        # Create the top-level frame structures
+        f1 = Frame(self.top, relief='raised', border=2, padx=8, pady=0)
+        f1.pack(side='top', expand=True, fill='both')
+        f1.grid_rowconfigure(2, weight=1)
+        f1.grid_columnconfigure(0, weight=1)
+        Frame(f1, height=8).grid(column=0, row=0) # spacer
+        tabframe = Frame(f1)
+        tabframe.grid(column=0, row=1, sticky='news')
+        tableframe = Frame(f1)
+        tableframe.grid(column=0, row=2, sticky='news')
+        buttonframe = Frame(f1)
+        buttonframe.grid(column=0, row=3, sticky='news')
+        Frame(f1, height=8).grid(column=0, row=4) # spacer
+        infoframe = Frame(f1)
+        infoframe.grid(column=0, row=5, sticky='news')
+        Frame(f1, height=8).grid(column=0, row=6) # spacer
+        progressframe = Frame(self.top, padx=3, pady=3,
+                              background=self._BACKDROP_COLOR[1])
+        progressframe.pack(side='bottom', fill='x')
+        self.top['border'] = 0
+        self.top['highlightthickness'] = 0
+
+        # Create the tabs
+        self._tab_names = ['Collections', 'Corpora',
+                           'Models', 'All Packages',]
+        self._tabs = {}
+        for i, tab in enumerate(self._tab_names):
+            label = Label(tabframe, text=tab, font=self._TAB_FONT)
+            label.pack(side='left', padx=((i+1)%2)*10)
+            label.bind('<Button-1>', self._select_tab)
+            self._tabs[tab.lower()] = label
+
+        # Create the table.
+        column_weights = [self.COLUMN_WEIGHTS.get(column, 1)
+                          for column in self.COLUMNS]
+        self._table = Table(tableframe, self.COLUMNS,
+                            column_weights=column_weights,
+                            highlightthickness=0, listbox_height=16,
+                            reprfunc=self._table_reprfunc)
+        self._table.columnconfig(0, foreground=self._MARK_COLOR[0]) # marked
+        for i, column in enumerate(self.COLUMNS):
+            width = self.COLUMN_WIDTHS.get(column, self.DEFAULT_COLUMN_WIDTH)
+            self._table.columnconfig(i, width=width)
+        self._table.pack(expand=True, fill='both')
+        self._table.focus()
+        self._table.bind_to_listboxes('<Double-Button-1>',
+                                      self._download)
+        self._table.bind('<space>', self._table_mark)
+        self._table.bind('<Return>', self._download)
+        self._table.bind('<Left>', self._prev_tab)
+        self._table.bind('<Right>', self._next_tab)
+        self._table.bind('<Control-a>', self._mark_all)
+
+        # Create entry boxes for URL & download_dir
+        infoframe.grid_columnconfigure(1, weight=1)
+
+        info = [('url', 'Server Index:', self._set_url),
+                ('download_dir','Download Directory:',self._set_download_dir)]
+        self._info = {}
+        for (i, (key, label, callback)) in enumerate(info):
+            Label(infoframe, text=label).grid(column=0, row=i, sticky='e')
+            entry = Entry(infoframe, font='courier', relief='groove',
+                          disabledforeground='black')
+            self._info[key] = (entry, callback)
+            entry.bind('<Return>', self._info_save)
+            entry.bind('<Button-1>', lambda e,key=key: self._info_edit(key))
+            entry.grid(column=1, row=i, sticky='ew')
+
+        # If the user edits url or download_dir, and then clicks outside
+        # the entry box, then save their results.
+        self.top.bind('<Button-1>', self._info_save)
+
+        # Create Download & Refresh buttons.
+        self._download_button = Button(
+            buttonframe, text='Download', command=self._download, width=8)
+        self._download_button.pack(side='left')
+        self._refresh_button = Button(
+            buttonframe, text='Refresh', command=self._refresh, width=8)
+        self._refresh_button.pack(side='right')
+
+        # Create Progress bar
+        self._progresslabel = Label(progressframe, text='',
+                                    foreground=self._BACKDROP_COLOR[0],
+                                    background=self._BACKDROP_COLOR[1])
+        self._progressbar = Canvas(progressframe, width=200, height=16,
+                                   background=self._PROGRESS_COLOR[1],
+                                   relief='sunken', border=1)
+        self._init_progressbar()
+        self._progressbar.pack(side='right')
+        self._progresslabel.pack(side='left')
+
+    def _init_menu(self):
+        menubar = Menu(self.top)
+
+        filemenu = Menu(menubar, tearoff=0)
+        filemenu.add_command(label='Download', underline=0,
+                             command=self._download, accelerator='Return')
+        filemenu.add_separator()
+        filemenu.add_command(label='Change Server Index', underline=7,
+                             command=lambda: self._info_edit('url'))
+        filemenu.add_command(label='Change Download Directory', underline=0,
+                             command=lambda: self._info_edit('download_dir'))
+        filemenu.add_separator()
+        filemenu.add_command(label='Show Log', underline=5,
+                             command=self._show_log)
+        filemenu.add_separator()
+        filemenu.add_command(label='Exit', underline=1,
+                             command=self.destroy, accelerator='Ctrl-x')
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
+
+        # Create a menu to control which columns of the table are
+        # shown.  n.b.: we never hide the first two columns (mark and
+        # identifier).
+        viewmenu = Menu(menubar, tearoff=0)
+        for column in self._table.column_names[2:]:
+            var = IntVar(self.top)
+            assert column not in self._column_vars
+            self._column_vars[column] = var
+            if column in self.INITIAL_COLUMNS: var.set(1)
+            viewmenu.add_checkbutton(label=column, underline=0, variable=var,
+                                     command=self._select_columns)
+        menubar.add_cascade(label='View', underline=0, menu=viewmenu)
+
+        # Create a sort menu
+        # [xx] this should be selectbuttons; and it should include
+        # reversed sorts as options.
+        sortmenu = Menu(menubar, tearoff=0)
+        for column in self._table.column_names[1:]:
+            sortmenu.add_command(label='Sort by %s' % column,
+                      command=(lambda c=column:
+                               self._table.sort_by(c, 'ascending')))
+        sortmenu.add_separator()
+        #sortmenu.add_command(label='Descending Sort:')
+        for column in self._table.column_names[1:]:
+            sortmenu.add_command(label='Reverse sort by %s' % column,
+                      command=(lambda c=column:
+                               self._table.sort_by(c, 'descending')))
+        menubar.add_cascade(label='Sort', underline=0, menu=sortmenu)
+
+        helpmenu = Menu(menubar, tearoff=0)
+        helpmenu.add_command(label='About', underline=0,
+                             command=self.about)
+        helpmenu.add_command(label='Instructions', underline=0,
+                             command=self.help, accelerator='F1')
+        menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
+        self.top.bind('<F1>', self.help)
+
+        self.top.config(menu=menubar)
+
+    def _select_columns(self):
+        for (column, var) in self._column_vars.items():
+            if var.get():
+                self._table.show_column(column)
+            else:
+                self._table.hide_column(column)
+
+    def _refresh(self):
+        self._ds.clear_status_cache()
+        try:
+            self._fill_table()
+        except HTTPError as e:
+            showerror('Error reading from server', e)
+        except URLError as e:
+            showerror('Error connecting to server', e.reason)
+        self._table.select(0)
+
+    def _info_edit(self, info_key):
+        self._info_save() # just in case.
+        (entry, callback) = self._info[info_key]
+        entry['state'] = 'normal'
+        entry['relief'] = 'sunken'
+        entry.focus()
+
+    def _info_save(self, e=None):
+        focus = self._table
+        for entry, callback in self._info.values():
+            if entry['state'] == 'disabled': continue
+            if e is not None and e.widget is entry and e.keysym != 'Return':
+                focus = entry
+            else:
+                entry['state'] = 'disabled'
+                entry['relief'] = 'groove'
+                callback(entry.get())
+        focus.focus()
+
+    def _table_reprfunc(self, row, col, val):
+        if self._table.column_names[col].endswith('Size'):
+            if isinstance(val, string_types): return '  %s' % val
+            elif val < 1024**2: return '  %.1f KB' % (val/1024.**1)
+            elif val < 1024**3: return '  %.1f MB' % (val/1024.**2)
+            else: return '  %.1f GB' % (val/1024.**3)
+
+        if col in (0, ''): return str(val)
+        else: return '  %s' % val
+
+    def _set_url(self, url):
+        if url == self._ds.url: return
+        try:
+            self._ds.url = url
+            self._fill_table()
+        except IOError as e:
+            showerror('Error Setting Server Index', str(e))
+        self._show_info()
+
+
+    def _set_download_dir(self, download_dir):
+        if self._ds.download_dir == download_dir: return
+        # check if the dir exists, and if not, ask if we should create it?
+
+        # Clear our status cache, & re-check what's installed
+        self._ds.download_dir = download_dir
+        try:
+            self._fill_table()
+        except HTTPError as e:
+            showerror('Error reading from server', e)
+        except URLError as e:
+            showerror('Error connecting to server', e.reason)
+        self._show_info()
+
+    def _show_info(self):
+        print('showing info', self._ds.url)
+        for entry,cb in self._info.values():
+            entry['state'] = 'normal'
+            entry.delete(0, 'end')
+        self._info['url'][0].insert(0, self._ds.url)
+        self._info['download_dir'][0].insert(0, self._ds.download_dir)
+        for entry,cb in self._info.values():
+            entry['state'] = 'disabled'
+
+    def _prev_tab(self, *e):
+        for i, tab in enumerate(self._tab_names):
+            if tab.lower() == self._tab and i > 0:
+                self._tab = self._tab_names[i-1].lower()
+                try:
+                    return self._fill_table()
+                except HTTPError as e:
+                    showerror('Error reading from server', e)
+                except URLError as e:
+                    showerror('Error connecting to server', e.reason)
+
+    def _next_tab(self, *e):
+        for i, tab in enumerate(self._tab_names):
+            if tab.lower() == self._tab and i < (len(self._tabs)-1):
+                self._tab = self._tab_names[i+1].lower()
+                try:
+                    return self._fill_table()
+                except HTTPError as e:
+                    showerror('Error reading from server', e)
+                except URLError as e:
+                    showerror('Error connecting to server', e.reason)
+
+    def _select_tab(self, event):
+        self._tab = event.widget['text'].lower()
+        try:
+            self._fill_table()
+        except HTTPError as e:
+            showerror('Error reading from server', e)
+        except URLError as e:
+            showerror('Error connecting to server', e.reason)
+
+    _tab = 'collections'
+    #_tab = 'corpora'
+    _rows = None
+    def _fill_table(self):
+        selected_row = self._table.selected_row()
+        self._table.clear()
+        if self._tab == 'all packages':
+            items = self._ds.packages()
+        elif self._tab == 'corpora':
+            items = self._ds.corpora()
+        elif self._tab == 'models':
+            items = self._ds.models()
+        elif self._tab == 'collections':
+            items = self._ds.collections()
+        else:
+            assert 0, 'bad tab value %r' % self._tab
+        rows = [self._package_to_columns(item) for item in items]
+        self._table.extend(rows)
+
+        # Highlight the active tab.
+        for tab, label in self._tabs.items():
+            if tab == self._tab:
+                label.configure(foreground=self._FRONT_TAB_COLOR[0],
+                                background=self._FRONT_TAB_COLOR[1])
+            else:
+                label.configure(foreground=self._BACK_TAB_COLOR[0],
+                                background=self._BACK_TAB_COLOR[1])
+
+        self._table.sort_by('Identifier', order='ascending')
+        self._color_table()
+        self._table.select(selected_row)
+
+        # This is a hack, because the scrollbar isn't updating its
+        # position right -- I'm not sure what the underlying cause is
+        # though.  (This is on OS X w/ python 2.5)  The length of
+        # delay that's necessary seems to depend on how fast the
+        # computer is. :-/
+        self.top.after(150, self._table._scrollbar.set,
+                       *self._table._mlb.yview())
+        self.top.after(300, self._table._scrollbar.set,
+                       *self._table._mlb.yview())
+
+    def _update_table_status(self):
+        for row_num in range(len(self._table)):
+            status = self._ds.status(self._table[row_num, 'Identifier'])
+            self._table[row_num, 'Status'] = status
+        self._color_table()
+
+    def _download(self, *e):
+        # If we're using threads, then delegate to the threaded
+        # downloader instead.
+        if self._use_threads:
+            return self._download_threaded(*e)
+
+        marked = [self._table[row, 'Identifier']
+                  for row in range(len(self._table))
+                  if self._table[row, 0] != '']
+        selection = self._table.selected_row()
+        if not marked and selection is not None:
+            marked = [self._table[selection, 'Identifier']]
+
+        download_iter = self._ds.incr_download(marked, self._ds.download_dir)
+        self._log_indent = 0
+        self._download_cb(download_iter, marked)
+
+    _DL_DELAY=10
+    def _download_cb(self, download_iter, ids):
+        try: msg = next(download_iter)
+        except StopIteration:
+            #self._fill_table(sort=False)
+            self._update_table_status()
+            afterid = self.top.after(10, self._show_progress, 0)
+            self._afterid['_download_cb'] = afterid
+            return
+
+        def show(s):
+            self._progresslabel['text'] = s
+            self._log(s)
+        if isinstance(msg, ProgressMessage):
+            self._show_progress(msg.progress)
+        elif isinstance(msg, ErrorMessage):
+            show(msg.message)
+            if msg.package is not None:
+                self._select(msg.package.id)
+            self._show_progress(None)
+            return # halt progress.
+        elif isinstance(msg, StartCollectionMessage):
+            show('Downloading collection %s' % msg.collection.id)
+            self._log_indent += 1
+        elif isinstance(msg, StartPackageMessage):
+            show('Downloading package %s' % msg.package.id)
+        elif isinstance(msg, UpToDateMessage):
+            show('Package %s is up-to-date!' % msg.package.id)
+        #elif isinstance(msg, StaleMessage):
+        #    show('Package %s is out-of-date or corrupt' % msg.package.id)
+        elif isinstance(msg, FinishDownloadMessage):
+            show('Finished downloading %r.' % msg.package.id)
+        elif isinstance(msg, StartUnzipMessage):
+            show('Unzipping %s' % msg.package.filename)
+        elif isinstance(msg, FinishCollectionMessage):
+            self._log_indent -= 1
+            show('Finished downloading collection %r.' % msg.collection.id)
+            self._clear_mark(msg.collection.id)
+        elif isinstance(msg, FinishPackageMessage):
+            self._clear_mark(msg.package.id)
+        afterid = self.top.after(self._DL_DELAY, self._download_cb,
+                                 download_iter, ids)
+        self._afterid['_download_cb'] = afterid
+
+    def _select(self, id):
+        for row in range(len(self._table)):
+            if self._table[row, 'Identifier'] == id:
+                self._table.select(row)
+                return
+
+    def _color_table(self):
+        # Color rows according to status.
+        for row in range(len(self._table)):
+            bg, sbg = self._ROW_COLOR[self._table[row, 'Status']]
+            fg, sfg = ('black', 'white')
+            self._table.rowconfig(row, foreground=fg, selectforeground=sfg,
+                                  background=bg, selectbackground=sbg)
+            # Color the marked column
+            self._table.itemconfigure(row, 0,
+                                      foreground=self._MARK_COLOR[0],
+                                      background=self._MARK_COLOR[1])
+
+
+    def _clear_mark(self, id):
+        for row in range(len(self._table)):
+            if self._table[row, 'Identifier'] == id:
+                self._table[row, 0] = ''
+
+    def _mark_all(self, *e):
+        for row in range(len(self._table)):
+            self._table[row,0] = 'X'
+
+    def _table_mark(self, *e):
+        selection = self._table.selected_row()
+        if selection >= 0:
+            if self._table[selection][0] != '':
+                self._table[selection,0] = ''
+            else:
+                self._table[selection,0] = 'X'
+        self._table.select(delta=1)
+
+    def _show_log(self):
+        text = '\n'.join(self._log_messages)
+        ShowText(self.top, 'NLTK Downloader Log', text)
+
+    def _package_to_columns(self, pkg):
+        """
+        Given a package, return a list of values describing that
+        package, one for each column in ``self.COLUMNS``.
+        """
+        row = []
+        for column_index, column_name in enumerate(self.COLUMNS):
+            if column_index == 0: # Mark:
+                row.append('')
+            elif column_name == 'Identifier':
+                row.append(pkg.id)
+            elif column_name == 'Status':
+                row.append(self._ds.status(pkg))
+            else:
+                attr = column_name.lower().replace(' ', '_')
+                row.append(getattr(pkg, attr, 'n/a'))
+        return row
+
+    #/////////////////////////////////////////////////////////////////
+    # External Interface
+    #/////////////////////////////////////////////////////////////////
+
+    def destroy(self, *e):
+        if self._destroyed: return
+        self.top.destroy()
+        self._destroyed = True
+
+    def _destroy(self, *e):
+        if self.top is not None:
+            for afterid in self._afterid.values():
+                self.top.after_cancel(afterid)
+
+        # Abort any download in progress.
+        if self._downloading and self._use_threads:
+            self._abort_download()
+
+        # Make sure the garbage collector destroys these now;
+        # otherwise, they may get destroyed when we're not in the main
+        # thread, which would make Tkinter unhappy.
+        self._column_vars.clear()
+
+    def mainloop(self, *args, **kwargs):
+        self.top.mainloop(*args, **kwargs)
+
+    #/////////////////////////////////////////////////////////////////
+    # HELP
+    #/////////////////////////////////////////////////////////////////
+
+    HELP = textwrap.dedent("""\
+    This tool can be used to download a variety of corpora and models
+    that can be used with NLTK.  Each corpus or model is distributed
+    in a single zip file, known as a \"package file.\"  You can
+    download packages individually, or you can download pre-defined
+    collections of packages.
+
+    When you download a package, it will be saved to the \"download
+    directory.\"  A default download directory is chosen when you run
+    the downloader; but you may also select a different download
+    directory.  On Windows, the default download directory is
+    \"package.\"
+
+    The NLTK downloader can be used to download a variety of corpora,
+    models, and other data packages.
+
+    Keyboard shortcuts::
+      [return]\t Download
+      [up]\t Select previous package
+      [down]\t Select next package
+      [left]\t Select previous tab
+      [right]\t Select next tab
+    """)
+
+    def help(self, *e):
+        # The default font's not very legible; try using 'fixed' instead.
+        try:
+            ShowText(self.top, 'Help: NLTK Downloader',
+                     self.HELP.strip(), width=75, font='fixed')
+        except Exception:
+            ShowText(self.top, 'Help: NLTK Downloader',
+                     self.HELP.strip(), width=75)
+
+    def about(self, *e):
+        ABOUT = ("NLTK Downloader\n"+
+                 "Written by Edward Loper")
+        TITLE = 'About: NLTK Downloader'
+        try:
+            from six.moves.tkinter_messagebox import Message
+            Message(message=ABOUT, title=TITLE).show()
+        except ImportError:
+            ShowText(self.top, TITLE, ABOUT)
+
+    #/////////////////////////////////////////////////////////////////
+    # Progress Bar
+    #/////////////////////////////////////////////////////////////////
+
+    _gradient_width = 5
+    def _init_progressbar(self):
+        c = self._progressbar
+        width, height = int(c['width']), int(c['height'])
+        for i in range(0, (int(c['width'])*2)//self._gradient_width):
+            c.create_line(i*self._gradient_width+20, -20,
+                          i*self._gradient_width-height-20, height+20,
+                          width=self._gradient_width,
+                          fill='#%02x0000' % (80 + abs(i%6-3)*12))
+        c.addtag_all('gradient')
+        c.itemconfig('gradient', state='hidden')
+
+        # This is used to display progress
+        c.addtag_withtag('redbox', c.create_rectangle(
+            0, 0, 0, 0, fill=self._PROGRESS_COLOR[0]))
+
+    def _show_progress(self, percent):
+        c = self._progressbar
+        if percent is None:
+            c.coords('redbox', 0, 0, 0, 0)
+            c.itemconfig('gradient', state='hidden')
+        else:
+            width, height = int(c['width']), int(c['height'])
+            x = percent * int(width) // 100 + 1
+            c.coords('redbox', 0, 0, x, height+1)
+
+    def _progress_alive(self):
+        c = self._progressbar
+        if not self._downloading:
+            c.itemconfig('gradient', state='hidden')
+        else:
+            c.itemconfig('gradient', state='normal')
+            x1, y1, x2, y2 = c.bbox('gradient')
+            if x1 <= -100:
+                c.move('gradient', (self._gradient_width*6)-4, 0)
+            else:
+                c.move('gradient', -4, 0)
+            afterid = self.top.after(200, self._progress_alive)
+            self._afterid['_progress_alive'] = afterid
+
+    #/////////////////////////////////////////////////////////////////
+    # Threaded downloader
+    #/////////////////////////////////////////////////////////////////
+
+    def _download_threaded(self, *e):
+        # If the user tries to start a new download while we're already
+        # downloading something, then abort the current download instead.
+        if self._downloading:
+            self._abort_download()
+            return
+
+        # Change the 'download' button to an 'abort' button.
+        self._download_button['text'] = 'Cancel'
+
+        marked = [self._table[row, 'Identifier']
+                  for row in range(len(self._table))
+                  if self._table[row, 0] != '']
+        selection = self._table.selected_row()
+        if not marked and selection is not None:
+            marked = [self._table[selection, 'Identifier']]
+
+        # Create a new data server object for the download operation,
+        # just in case the user modifies our data server during the
+        # download (e.g., clicking 'refresh' or editing the index url).
+        ds = Downloader(self._ds.url, self._ds.download_dir)
+
+        # Start downloading in a separate thread.
+        assert self._download_msg_queue == []
+        assert self._download_abort_queue == []
+        self._DownloadThread(ds, marked, self._download_lock,
+                             self._download_msg_queue,
+                             self._download_abort_queue).start()
+
+        # Monitor the download message queue & display its progress.
+        self._log_indent = 0
+        self._downloading = True
+        self._monitor_message_queue()
+
+        # Display an indication that we're still alive and well by
+        # cycling the progress bar.
+        self._progress_alive()
+
+    def _abort_download(self):
+        if self._downloading:
+            self._download_lock.acquire()
+            self._download_abort_queue.append('abort')
+            self._download_lock.release()
+
+    class _DownloadThread(threading.Thread):
+        def __init__(self, data_server, items, lock, message_queue, abort):
+            self.data_server = data_server
+            self.items = items
+            self.lock = lock
+            self.message_queue = message_queue
+            self.abort = abort
+            threading.Thread.__init__(self)
+
+        def run(self):
+            for msg in self.data_server.incr_download(self.items):
+                self.lock.acquire()
+                self.message_queue.append(msg)
+                # Check if we've been told to kill ourselves:
+                if self.abort:
+                    self.message_queue.append('aborted')
+                    self.lock.release()
+                    return
+                self.lock.release()
+            self.lock.acquire()
+            self.message_queue.append('finished')
+            self.lock.release()
+
+    _MONITOR_QUEUE_DELAY=100
+    def _monitor_message_queue(self):
+        def show(s):
+            self._progresslabel['text'] = s
+            self._log(s)
+
+        # Try to acquire the lock; if it's busy, then just try again later.
+        if not self._download_lock.acquire(False):
+            afterid = self.top.after(self._MONITOR_QUEUE_DELAY,
+                                     self._monitor_message_queue)
+            self._afterid['_monitor_message_queue'] = afterid
+            return
+        for msg in self._download_msg_queue:
+
+            # Done downloading?
+            if msg == 'finished' or msg == 'aborted':
+                #self._fill_table(sort=False)
+                self._update_table_status()
+                self._downloading = False
+                self._download_button['text'] = 'Download'
+                del self._download_msg_queue[:]
+                del self._download_abort_queue[:]
+                self._download_lock.release()
+                if msg == 'aborted':
+                    show('Download aborted!')
+                    self._show_progress(None)
+                else:
+                    afterid = self.top.after(100, self._show_progress, None)
+                    self._afterid['_monitor_message_queue'] = afterid
+                return
+
+            # All other messages
+            elif isinstance(msg, ProgressMessage):
+                self._show_progress(msg.progress)
+            elif isinstance(msg, ErrorMessage):
+                show(msg.message)
+                if msg.package is not None:
+                    self._select(msg.package.id)
+                self._show_progress(None)
+                self._downloading = False
+                return # halt progress.
+            elif isinstance(msg, StartCollectionMessage):
+                show('Downloading collection %r' % msg.collection.id)
+                self._log_indent += 1
+            elif isinstance(msg, StartPackageMessage):
+                self._ds.clear_status_cache(msg.package.id)
+                show('Downloading package %r' % msg.package.id)
+            elif isinstance(msg, UpToDateMessage):
+                show('Package %s is up-to-date!' % msg.package.id)
+            #elif isinstance(msg, StaleMessage):
+            #    show('Package %s is out-of-date or corrupt; updating it' %
+            #         msg.package.id)
+            elif isinstance(msg, FinishDownloadMessage):
+                show('Finished downloading %r.' % msg.package.id)
+            elif isinstance(msg, StartUnzipMessage):
+                show('Unzipping %s' % msg.package.filename)
+            elif isinstance(msg, FinishUnzipMessage):
+                show('Finished installing %s' % msg.package.id)
+            elif isinstance(msg, FinishCollectionMessage):
+                self._log_indent -= 1
+                show('Finished downloading collection %r.' % msg.collection.id)
+                self._clear_mark(msg.collection.id)
+            elif isinstance(msg, FinishPackageMessage):
+                self._update_table_status()
+                self._clear_mark(msg.package.id)
+
+        # Let the user know when we're aborting a download (but
+        # waiting for a good point to abort it, so we don't end up
+        # with a partially unzipped package or anything like that).
+        if self._download_abort_queue:
+            self._progresslabel['text'] = 'Aborting download...'
+
+        # Clear the message queue and then release the lock
+        del self._download_msg_queue[:]
+        self._download_lock.release()
+
+        # Check the queue again after MONITOR_QUEUE_DELAY msec.
+        afterid = self.top.after(self._MONITOR_QUEUE_DELAY,
+                                 self._monitor_message_queue)
+        self._afterid['_monitor_message_queue'] = afterid
+
+######################################################################
+# Helper Functions
+######################################################################
+# [xx] It may make sense to move these to nltk.internals.
+
+def md5_hexdigest(file):
+    """
+    Calculate and return the MD5 checksum for a given file.
+    ``file`` may either be a filename or an open stream.
+    """
+    if isinstance(file, string_types):
+        with open(file, 'rb') as infile:
+            return _md5_hexdigest(infile)
+    return _md5_hexdigest(file)
+
+def _md5_hexdigest(fp):
+    md5_digest = md5()
+    while True:
+        block = fp.read(1024*16)  # 16k blocks
+        if not block: break
+        md5_digest.update(block)
+    return md5_digest.hexdigest()
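+
+# Editor's sketch (not part of the original module): a minimal helper showing
+# how md5_hexdigest() is typically used to verify a downloaded package file
+# against the checksum recorded in the package index.  Both arguments are
+# assumptions supplied by the caller.
+def _example_verify_checksum(path, expected_md5):
+    """Return True if the file at ``path`` has the MD5 hex digest ``expected_md5``."""
+    return md5_hexdigest(path) == expected_md5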
+
+
+# change this to periodically yield progress messages?
+# [xx] get rid of topdir parameter -- we should be checking
+# this when we build the index, anyway.
+def unzip(filename, root, verbose=True):
+    """
+    Extract the contents of the zip file ``filename`` into the
+    directory ``root``.
+    """
+    for message in _unzip_iter(filename, root, verbose):
+        if isinstance(message, ErrorMessage):
+            raise Exception(message)
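+
+# Editor's sketch (not part of the original module): manual use of unzip(),
+# mirroring what the downloader does after fetching a package file.  The two
+# path arguments are illustrative assumptions.
+def _example_unzip_package(package_zip, corpora_dir):
+    """Extract ``package_zip`` into ``corpora_dir``, creating it if needed."""
+    if not os.path.exists(corpora_dir):
+        os.makedirs(corpora_dir)
+    unzip(package_zip, corpora_dir, verbose=False)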
+
+def _unzip_iter(filename, root, verbose=True):
+    if verbose:
+        sys.stdout.write('Unzipping %s' % os.path.split(filename)[1])
+        sys.stdout.flush()
+
+    try: zf = zipfile.ZipFile(filename)
+    except zipfile.error as e:
+        yield ErrorMessage(filename, 'Error with downloaded zip file')
+        return
+    except Exception as e:
+        yield ErrorMessage(filename, e)
+        return
+
+    # Get lists of directories & files
+    namelist = zf.namelist()
+    dirlist = set()
+    for x in namelist:
+        if x.endswith('/'):
+            dirlist.add(x)
+        else:
+            dirlist.add(x.rsplit('/',1)[0] + '/')
+    filelist = [x for x in namelist if not x.endswith('/')]
+
+    # Create the target directory if it doesn't exist
+    if not os.path.exists(root):
+        os.mkdir(root)
+
+    # Create the directory structure
+    for dirname in sorted(dirlist):
+        pieces = dirname[:-1].split('/')
+        for i in range(len(pieces)):
+            dirpath = os.path.join(root, *pieces[:i+1])
+            if not os.path.exists(dirpath):
+                os.mkdir(dirpath)
+
+    # Extract files.
+    for i, filename in enumerate(filelist):
+        filepath = os.path.join(root, *filename.split('/'))
+
+        try:
+            with open(filepath, 'wb') as dstfile, zf.open(filename) as srcfile:
+                shutil.copyfileobj(srcfile, dstfile)
+        except Exception as e:
+            yield ErrorMessage(filename, e)
+            return
+
+        if verbose and (i*10//len(filelist) > (i-1)*10//len(filelist)):
+            sys.stdout.write('.')
+            sys.stdout.flush()
+    if verbose:
+        print()
+
+######################################################################
+# Index Builder
+######################################################################
+# This may move to a different file sometime.
+import subprocess, zipfile
+
+def build_index(root, base_url):
+    """
+    Create a new data.xml index file, by combining the xml description
+    files for various packages and collections.  ``root`` should be the
+    path to a directory containing the package xml and zip files; and
+    the collection xml files.  The ``root`` directory is expected to
+    have the following subdirectories::
+
+      root/
+        packages/ .................. subdirectory for packages
+          corpora/ ................. zip & xml files for corpora
+          grammars/ ................ zip & xml files for grammars
+          taggers/ ................. zip & xml files for taggers
+          tokenizers/ .............. zip & xml files for tokenizers
+          etc.
+        collections/ ............... xml files for collections
+
+    For each package, there should be two files: ``package.zip``
+    (where *package* is the package name)
+    which contains the package itself as a compressed zip file; and
+    ``package.xml``, which is an xml description of the package.  The
+    zipfile ``package.zip`` should expand to a single subdirectory
+    named ``package/``.  The base filename ``package`` must match
+    the identifier given in the package's xml file.
+
+    For each collection, there should be a single file ``collection.xml``
+    describing the collection, where *collection* is the name of the collection.
+
+    All identifiers (for both packages and collections) must be unique.
+    """
+    # Find all packages.
+    packages = []
+    for pkg_xml, zf, subdir in _find_packages(os.path.join(root, 'packages')):
+        zipstat = os.stat(zf.filename)
+        url = '%s/%s/%s' % (base_url, subdir, os.path.split(zf.filename)[1])
+        unzipped_size = sum(zf_info.file_size for zf_info in zf.infolist())
+
+        # Fill in several fields of the package xml with calculated values.
+        pkg_xml.set('unzipped_size', '%s' % unzipped_size)
+        pkg_xml.set('size', '%s' % zipstat.st_size)
+        pkg_xml.set('checksum', '%s' % md5_hexdigest(zf.filename))
+        pkg_xml.set('subdir', subdir)
+        #pkg_xml.set('svn_revision', _svn_revision(zf.filename))
+        if not pkg_xml.get('url'):
+            pkg_xml.set('url', url)
+
+        # Record the package.
+        packages.append(pkg_xml)
+
+    # Find all collections
+    collections = list(_find_collections(os.path.join(root, 'collections')))
+
+    # Check that all UIDs are unique
+    uids = set()
+    for item in packages+collections:
+        if item.get('id') in uids:
+            raise ValueError('Duplicate UID: %s' % item.get('id'))
+        uids.add(item.get('id'))
+
+    # Put it all together
+    top_elt = ElementTree.Element('nltk_data')
+    top_elt.append(ElementTree.Element('packages'))
+    for package in packages: top_elt[0].append(package)
+    top_elt.append(ElementTree.Element('collections'))
+    for collection in collections: top_elt[1].append(collection)
+
+    _indent_xml(top_elt)
+    return top_elt
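+
+# Editor's sketch (not part of the original module): serializing the element
+# tree returned by build_index() to a data.xml index file.  The output path
+# is an illustrative assumption.
+def _example_write_index(root, base_url, out_path='data.xml'):
+    """Build an index for ``root`` and write it to ``out_path``."""
+    index = build_index(root, base_url)
+    ElementTree.ElementTree(index).write(out_path, encoding='utf-8')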
+
+def _indent_xml(xml, prefix=''):
+    """
+    Helper for ``build_index()``: Given an XML ``ElementTree``, modify its
+    (and its descendants') ``text`` and ``tail`` attributes to generate an
+    indented tree, where each nested element is indented by 2 spaces with
+    respect to its parent.
+    """
+    if len(xml) > 0:
+        xml.text = (xml.text or '').strip() + '\n' + prefix + '  '
+        for child in xml:
+            _indent_xml(child, prefix+'  ')
+        for child in xml[:-1]:
+            child.tail = (child.tail or '').strip() + '\n' + prefix + '  '
+        xml[-1].tail = (xml[-1].tail or '').strip() + '\n' + prefix
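+
+# Editor's note (illustrative only): what _indent_xml() does to a small tree.
+# After the call, serializing the element yields a two-space-indented layout:
+#
+#     elt = ElementTree.Element('packages')
+#     ElementTree.SubElement(elt, 'package', id='punkt')
+#     _indent_xml(elt)
+#     ElementTree.tostring(elt)  # b'<packages>\n  <package id="punkt" />\n</packages>'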
+
+def _check_package(pkg_xml, zipfilename, zf):
+    """
+    Helper for ``build_index()``: Perform some checks to make sure that
+    the given package is consistent.
+    """
+    # The filename must match the id given in the XML file.
+    uid = os.path.splitext(os.path.split(zipfilename)[1])[0]
+    if pkg_xml.get('id') != uid:
+        raise ValueError('package identifier mismatch (%s vs %s)' %
+                         (pkg_xml.get('id'), uid))
+
+    # Zip file must expand to a subdir whose name matches uid.
+    if sum( (name!=uid and not name.startswith(uid+'/'))
+            for name in zf.namelist() ):
+        raise ValueError('Zipfile %s.zip does not expand to a single '
+                         'subdirectory %s/' % (uid, uid))
+
+# update for git?
+def _svn_revision(filename):
+    """
+    Helper for ``build_index()``: Calculate the subversion revision
+    number for a given file (by using ``subprocess`` to run ``svn``).
+    """
+    p = subprocess.Popen(['svn', 'status', '-v', filename],
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.PIPE)
+    (stdout, stderr) = p.communicate()
+    if p.returncode != 0 or stderr or not stdout:
+        raise ValueError('Error determining svn_revision for %s: %s' %
+                         (os.path.split(filename)[1], textwrap.fill(stderr)))
+    return stdout.split()[2]
+
+def _find_collections(root):
+    """
+    Helper for ``build_index()``: Yield ElementTree.Element objects, each
+    holding the xml for a single package collection found under ``root``.
+    """
+    for dirname, subdirs, files in os.walk(root):
+        for filename in files:
+            if filename.endswith('.xml'):
+                xmlfile = os.path.join(dirname, filename)
+                yield ElementTree.parse(xmlfile).getroot()
+
+def _find_packages(root):
+    """
+    Helper for ``build_index()``: Yield tuples ``(pkg_xml, zf, subdir)``,
+    where:
+      - ``pkg_xml`` is an ``ElementTree.Element`` holding the xml for a
+        package
+      - ``zf`` is a ``zipfile.ZipFile`` for the package's contents.
+      - ``subdir`` is the subdirectory (relative to ``root``) where
+        the package was found (e.g. 'corpora' or 'grammars').
+    """
+    from nltk.corpus.reader.util import _path_from
+    # Find all packages.
+    for dirname, subdirs, files in os.walk(root):
+        relpath = '/'.join(_path_from(root, dirname))
+        for filename in files:
+            if filename.endswith('.xml'):
+                xmlfilename = os.path.join(dirname, filename)
+                zipfilename = xmlfilename[:-4]+'.zip'
+                try: zf = zipfile.ZipFile(zipfilename)
+                except Exception as e:
+                    raise ValueError('Error reading file %r!\n%s' %
+                                     (zipfilename, e))
+                try: pkg_xml = ElementTree.parse(xmlfilename).getroot()
+                except Exception as e:
+                    raise ValueError('Error reading file %r!\n%s' %
+                                     (xmlfilename, e))
+
+                # Check that the UID matches the filename
+                uid = os.path.split(xmlfilename[:-4])[1]
+                if pkg_xml.get('id') != uid:
+                    raise ValueError('package identifier mismatch (%s '
+                                     'vs %s)' % (pkg_xml.get('id'), uid))
+
+                # Check that the zipfile expands to a subdir whose
+                # name matches the uid.
+                if sum( (name!=uid and not name.startswith(uid+'/'))
+                        for name in zf.namelist() ):
+                    raise ValueError('Zipfile %s.zip does not expand to a '
+                                     'single subdirectory %s/' % (uid, uid))
+
+                yield pkg_xml, zf, relpath
+        # Don't recurse into svn subdirectories:
+        try: subdirs.remove('.svn')
+        except ValueError: pass
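+
+# Editor's note -- how this generator is consumed (the source directory below
+# is an illustrative assumption); each package comes back with its xml
+# metadata, its zipfile, and the subdirectory it was found in:
+#
+#     for pkg_xml, zf, subdir in _find_packages('/tmp/nltk_data_src/packages'):
+#         print(pkg_xml.get('id'), subdir, zf.filename)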
+
+######################################################################
+# Main:
+######################################################################
+
+# There should be a command-line interface
+
+# Aliases
+_downloader = Downloader()
+download = _downloader.download
+
+def download_shell():
+    DownloaderShell(_downloader).run()
+
+def download_gui():
+    DownloaderGUI(_downloader).mainloop()
+
+def update():
+    _downloader.update()
+
+if __name__ == '__main__':
+    from optparse import OptionParser
+    parser = OptionParser()
+    parser.add_option("-d", "--dir", dest="dir",
+        help="download package to directory DIR", metavar="DIR")
+    parser.add_option("-q", "--quiet", dest="quiet", action="store_true",
+        default=False, help="work quietly")
+    parser.add_option("-f", "--force", dest="force", action="store_true",
+        default=False, help="download even if already installed")
+    parser.add_option("-e", "--exit-on-error", dest="halt_on_error", action="store_true",
+        default=False, help="exit if an error occurs")
+    parser.add_option("-u", "--url", dest="server_index_url",
+        default=os.environ.get('NLTK_DOWNLOAD_URL'),
+        help="download server index url")
+
+    (options, args) = parser.parse_args()
+
+    downloader = Downloader(server_index_url = options.server_index_url)
+
+    if args:
+        for pkg_id in args:
+            rv = downloader.download(info_or_id=pkg_id, download_dir=options.dir,
+                quiet=options.quiet, force=options.force,
+                halt_on_error=options.halt_on_error)
+            if rv is False and options.halt_on_error:
+                break
+    else:
+        downloader.download(download_dir=options.dir,
+            quiet=options.quiet, force=options.force,
+            halt_on_error=options.halt_on_error)
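+
+# Editor's note -- example invocations of the command-line interface defined
+# above (package ids and the target directory are illustrative):
+#
+#     python downloader.py punkt stopwords -d /tmp/nltk_data
+#     python downloader.py --quiet --force punkt
+#     python downloader.py -u http://example.com/index.xml punkt
+#     python downloader.py        # no ids: download() is called without an id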
diff --git a/nlp_resource_data/nltk/downloader.pyc b/nlp_resource_data/nltk/downloader.pyc
new file mode 100755 (executable)
index 0000000..9510b13
Binary files /dev/null and b/nlp_resource_data/nltk/downloader.pyc differ
diff --git a/nlp_resource_data/nltk/draw/__init__.py b/nlp_resource_data/nltk/draw/__init__.py
new file mode 100755 (executable)
index 0000000..fdc6678
--- /dev/null
@@ -0,0 +1,27 @@
+# Natural Language Toolkit: graphical representations package
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+# Import Tkinter-based modules if Tkinter is installed
+try:
+    from six.moves import tkinter
+except ImportError:
+    import warnings
+    warnings.warn("nltk.draw package not loaded "
+                  "(please install Tkinter library).")
+else:
+    from nltk.draw.cfg import ProductionList, CFGEditor, CFGDemo
+    from nltk.draw.tree import (TreeSegmentWidget, tree_to_treesegment,
+                      TreeWidget, TreeView, draw_trees)
+    from nltk.draw.table import Table
+
+from nltk.draw.dispersion import dispersion_plot
+
+# skip doctests from this package
+def setup_module(module):
+    from nose import SkipTest
+    raise SkipTest("nltk.draw examples are not doctests")
diff --git a/nlp_resource_data/nltk/draw/__init__.pyc b/nlp_resource_data/nltk/draw/__init__.pyc
new file mode 100755 (executable)
index 0000000..94f0ee4
Binary files /dev/null and b/nlp_resource_data/nltk/draw/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/draw/cfg.py b/nlp_resource_data/nltk/draw/cfg.py
new file mode 100755 (executable)
index 0000000..3038f9f
--- /dev/null
@@ -0,0 +1,774 @@
+# Natural Language Toolkit: CFG visualization
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Visualization tools for CFGs.
+"""
+
+# Idea for a nice demo:
+#   - 3 panes: grammar, treelet, working area
+#     - grammar is a list of productions
+#     - when you select a production, the treelet that it licenses appears
+#       in the treelet area
+#     - the working area has the text on the bottom, and S at top.  When
+#       you select a production, it shows (ghosted) the locations where
+#       that production's treelet could be attached to either the text
+#       or the tree rooted at S.
+#     - the user can drag the treelet onto one of those (or click on them?)
+#     - the user can delete pieces of the tree from the working area
+#       (right click?)
+#     - connecting top to bottom? drag one NP onto another?
+#
+# +-------------------------------------------------------------+
+# | S -> NP VP   |                 S                            |
+# |[NP -> Det N ]|                / \                           |
+# |     ...      |              NP  VP                          |
+# | N -> 'dog'   |                                              |
+# | N -> 'cat'   |                                              |
+# |     ...      |                                              |
+# +--------------+                                              |
+# |      NP      |                      Det     N               |
+# |     /  \     |                       |      |               |
+# |   Det   N    |  the    cat    saw   the    dog              |
+# |              |                                              |
+# +--------------+----------------------------------------------+
+#
+# Operations:
+#   - connect a new treelet -- drag or click shadow
+#   - delete a treelet -- right click
+#     - if only connected to top, delete everything below
+#     - if only connected to bottom, delete everything above
+#   - connect top & bottom -- drag a leaf to a root or a root to a leaf
+#   - disconnect top & bottom -- right click
+#     - if connected to top & bottom, then disconnect
+
+import re
+
+from six import string_types
+from six.moves.tkinter import (Button, Canvas, Entry, Frame, IntVar, Label,
+                               Scrollbar, Text, Tk, Toplevel)
+
+from nltk.grammar import (CFG, _read_cfg_production,
+                          Nonterminal, nonterminals)
+from nltk.tree import Tree
+from nltk.draw.tree import TreeSegmentWidget, tree_to_treesegment
+from nltk.draw.util import (CanvasFrame, ColorizedList, ShowText,
+                            SymbolWidget, TextWidget)
+
+######################################################################
+# Production List
+######################################################################
+
+class ProductionList(ColorizedList):
+    ARROW = SymbolWidget.SYMBOLS['rightarrow']
+
+    def _init_colortags(self, textwidget, options):
+        textwidget.tag_config('terminal', foreground='#006000')
+        textwidget.tag_config('arrow', font='symbol', underline='0')
+        textwidget.tag_config('nonterminal', foreground='blue',
+                              font=('helvetica', -12, 'bold'))
+
+    def _item_repr(self, item):
+        contents = []
+        contents.append(('%s\t' % item.lhs(), 'nonterminal'))
+        contents.append((self.ARROW, 'arrow'))
+        for elt in item.rhs():
+            if isinstance(elt, Nonterminal):
+                contents.append((' %s' % elt.symbol(), 'nonterminal'))
+            else:
+                contents.append((' %r' % elt, 'terminal'))
+        return contents
+
+######################################################################
+# CFG Editor
+######################################################################
+
+_CFGEditor_HELP = """
+
+The CFG Editor can be used to create or modify context free grammars.
+A context free grammar consists of a start symbol and a list of
+productions.  The start symbol is specified by the text entry field in
+the upper right hand corner of the editor; and the list of productions
+is specified in the main text editing box.
+
+Every non-blank line specifies a single production.  Each production
+has the form "LHS -> RHS," where LHS is a single nonterminal, and RHS
+is a list of nonterminals and terminals.
+
+Each nonterminal must be a single word, such as S or NP or NP_subj.
+Currently, nonterminals must consist of alphanumeric characters and
+underscores (_).  Nonterminals are colored blue.  If you place the
+mouse over any nonterminal, then all occurrences of that nonterminal
+will be highlighted.
+
+Terminals must be surrounded by single quotes (') or double
+quotes(\").  For example, "dog" and "New York" are terminals.
+Currently, the string within the quotes must consist of alphanumeric
+characters, underscores, and spaces.
+
+To enter a new production, go to a blank line, and type a nonterminal,
+followed by an arrow (->), followed by a sequence of terminals and
+nonterminals.  Note that "->" (dash + greater-than) is automatically
+converted to an arrow symbol.  When you move your cursor to a
+different line, your production will automatically be colorized.  If
+there are any errors, they will be highlighted in red.
+
+Note that the order of the productions is significant for some
+algorithms.  To re-order the productions, use cut and paste to move
+them.
+
+Use the buttons at the bottom of the window when you are done editing
+the CFG:
+  - Ok: apply the new CFG, and exit the editor.
+  - Apply: apply the new CFG, and do not exit the editor.
+  - Reset: revert to the original CFG, and do not exit the editor.
+  - Cancel: revert to the original CFG, and exit the editor.
+
+"""
+
+class CFGEditor(object):
+    """
+    A dialog window for creating and editing context free grammars.
+    ``CFGEditor`` imposes the following restrictions:
+
+    - All nonterminals must be strings consisting of word
+      characters.
+    - All terminals must be strings consisting of word characters
+      and space characters.
+    """
+    # Regular expressions used by _analyze_line.  Precompile them, so
+    # we can process the text faster.
+    ARROW = SymbolWidget.SYMBOLS['rightarrow']
+    _LHS_RE = re.compile(r"(^\s*\w+\s*)(->|("+ARROW+"))")
+    _ARROW_RE = re.compile(r"\s*(->|("+ARROW+r"))\s*")
+    _PRODUCTION_RE = re.compile(r"(^\s*\w+\s*)" +              # LHS
+                                "(->|("+ARROW+r"))\s*" +        # arrow
+                                r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$") # RHS
+    _TOKEN_RE = re.compile("\\w+|->|'[\\w ]+'|\"[\\w ]+\"|("+ARROW+")")
+    _BOLD = ('helvetica', -12, 'bold')
+
+    def __init__(self, parent, cfg=None, set_cfg_callback=None):
+        self._parent = parent
+        if cfg is not None: self._cfg = cfg
+        else: self._cfg = CFG(Nonterminal('S'), [])
+        self._set_cfg_callback = set_cfg_callback
+
+        self._highlight_matching_nonterminals = 1
+
+        # Create the top-level window.
+        self._top = Toplevel(parent)
+        self._init_bindings()
+
+        self._init_startframe()
+        self._startframe.pack(side='top', fill='x', expand=0)
+        self._init_prodframe()
+        self._prodframe.pack(side='top', fill='both', expand=1)
+        self._init_buttons()
+        self._buttonframe.pack(side='bottom', fill='x', expand=0)
+
+        self._textwidget.focus()
+
+    def _init_startframe(self):
+        frame = self._startframe = Frame(self._top)
+        self._start = Entry(frame)
+        self._start.pack(side='right')
+        Label(frame, text='Start Symbol:').pack(side='right')
+        Label(frame, text='Productions:').pack(side='left')
+        self._start.insert(0, self._cfg.start().symbol())
+
+    def _init_buttons(self):
+        frame = self._buttonframe = Frame(self._top)
+        Button(frame, text='Ok', command=self._ok,
+               underline=0, takefocus=0).pack(side='left')
+        Button(frame, text='Apply', command=self._apply,
+               underline=0, takefocus=0).pack(side='left')
+        Button(frame, text='Reset', command=self._reset,
+               underline=0, takefocus=0,).pack(side='left')
+        Button(frame, text='Cancel', command=self._cancel,
+               underline=0, takefocus=0).pack(side='left')
+        Button(frame, text='Help', command=self._help,
+               underline=0, takefocus=0).pack(side='right')
+
+    def _init_bindings(self):
+        self._top.title('CFG Editor')
+        self._top.bind('<Control-q>', self._cancel)
+        self._top.bind('<Alt-q>', self._cancel)
+        self._top.bind('<Control-d>', self._cancel)
+        #self._top.bind('<Control-x>', self._cancel)
+        self._top.bind('<Alt-x>', self._cancel)
+        self._top.bind('<Escape>', self._cancel)
+        #self._top.bind('<Control-c>', self._cancel)
+        self._top.bind('<Alt-c>', self._cancel)
+
+        self._top.bind('<Control-o>', self._ok)
+        self._top.bind('<Alt-o>', self._ok)
+        self._top.bind('<Control-a>', self._apply)
+        self._top.bind('<Alt-a>', self._apply)
+        self._top.bind('<Control-r>', self._reset)
+        self._top.bind('<Alt-r>', self._reset)
+        self._top.bind('<Control-h>', self._help)
+        self._top.bind('<Alt-h>', self._help)
+        self._top.bind('<F1>', self._help)
+
+    def _init_prodframe(self):
+        self._prodframe = Frame(self._top)
+
+        # Create the basic Text widget & scrollbar.
+        self._textwidget = Text(self._prodframe, background='#e0e0e0',
+                                exportselection=1)
+        self._textscroll = Scrollbar(self._prodframe, takefocus=0,
+                                     orient='vertical')
+        self._textwidget.config(yscrollcommand = self._textscroll.set)
+        self._textscroll.config(command=self._textwidget.yview)
+        self._textscroll.pack(side='right', fill='y')
+        self._textwidget.pack(expand=1, fill='both', side='left')
+
+        # Initialize the colorization tags.  Each nonterminal gets its
+        # own tag, so they aren't listed here.
+        self._textwidget.tag_config('terminal', foreground='#006000')
+        self._textwidget.tag_config('arrow', font='symbol')
+        self._textwidget.tag_config('error', background='red')
+
+        # Keep track of what line they're on.  We use that to remember
+        # to re-analyze a line whenever they leave it.
+        self._linenum = 0
+
+        # Expand "->" to an arrow.
+        self._top.bind('>', self._replace_arrows)
+
+        # Re-colorize lines when appropriate.
+        self._top.bind('<<Paste>>', self._analyze)
+        self._top.bind('<KeyPress>', self._check_analyze)
+        self._top.bind('<ButtonPress>', self._check_analyze)
+
+        # Tab cycles focus. (why doesn't this work??)
+        def cycle(e, textwidget=self._textwidget):
+            textwidget.tk_focusNext().focus()
+        self._textwidget.bind('<Tab>', cycle)
+
+        prod_tuples = [(p.lhs(),[p.rhs()]) for p in self._cfg.productions()]
+        for i in range(len(prod_tuples)-1,0,-1):
+            if (prod_tuples[i][0] == prod_tuples[i-1][0]):
+                if () in prod_tuples[i][1]: continue
+                if () in prod_tuples[i-1][1]: continue
+                print(prod_tuples[i-1][1])
+                print(prod_tuples[i][1])
+                prod_tuples[i-1][1].extend(prod_tuples[i][1])
+                del prod_tuples[i]
+
+        for lhs, rhss in prod_tuples:
+            print(lhs, rhss)
+            s = '%s ->' % lhs
+            for rhs in rhss:
+                for elt in rhs:
+                    if isinstance(elt, Nonterminal): s += ' %s' % elt
+                    else: s += ' %r' % elt
+                s += ' |'
+            s = s[:-2] + '\n'
+            self._textwidget.insert('end', s)
+
+        self._analyze()
+
+#         # Add the productions to the text widget, and colorize them.
+#         prod_by_lhs = {}
+#         for prod in self._cfg.productions():
+#             if len(prod.rhs()) > 0:
+#                 prod_by_lhs.setdefault(prod.lhs(),[]).append(prod)
+#         for (lhs, prods) in prod_by_lhs.items():
+#             self._textwidget.insert('end', '%s ->' % lhs)
+#             self._textwidget.insert('end', self._rhs(prods[0]))
+#             for prod in prods[1:]:
+#                 print '\t|'+self._rhs(prod),
+#                 self._textwidget.insert('end', '\t|'+self._rhs(prod))
+#             print
+#             self._textwidget.insert('end', '\n')
+#         for prod in self._cfg.productions():
+#             if len(prod.rhs()) == 0:
+#                 self._textwidget.insert('end', '%s' % prod)
+#         self._analyze()
+
+#     def _rhs(self, prod):
+#         s = ''
+#         for elt in prod.rhs():
+#             if isinstance(elt, Nonterminal): s += ' %s' % elt.symbol()
+#             else: s += ' %r' % elt
+#         return s
+
+    def _clear_tags(self, linenum):
+        """
+        Remove all tags (except ``arrow`` and ``sel``) from the given
+        line of the text widget used for editing the productions.
+        """
+        start = '%d.0'%linenum
+        end = '%d.end'%linenum
+        for tag in self._textwidget.tag_names():
+            if tag not in ('arrow', 'sel'):
+                self._textwidget.tag_remove(tag, start, end)
+
+    def _check_analyze(self, *e):
+        """
+        Check if we've moved to a new line.  If we have, then remove
+        all colorization from the line we moved to, and re-colorize
+        the line that we moved from.
+        """
+        linenum = int(self._textwidget.index('insert').split('.')[0])
+        if linenum != self._linenum:
+            self._clear_tags(linenum)
+            self._analyze_line(self._linenum)
+            self._linenum = linenum
+
+    def _replace_arrows(self, *e):
+        """
+        Replace any ``'->'`` text strings with arrows (char \\256, in
+        symbol font).  This searches the whole buffer, but is fast
+        enough to be done anytime they press '>'.
+        """
+        arrow = '1.0'
+        while True:
+            arrow = self._textwidget.search('->', arrow, 'end+1char')
+            if arrow == '': break
+            self._textwidget.delete(arrow, arrow+'+2char')
+            self._textwidget.insert(arrow, self.ARROW, 'arrow')
+            self._textwidget.insert(arrow, '\t')
+
+        arrow = '1.0'
+        while True:
+            arrow = self._textwidget.search(self.ARROW, arrow+'+1char',
+                                            'end+1char')
+            if arrow == '': break
+            self._textwidget.tag_add('arrow', arrow, arrow+'+1char')
+
+    def _analyze_token(self, match, linenum):
+        """
+        Given a line number and a regexp match for a token on that
+        line, colorize the token.  Note that the regexp match gives us
+        the token's text, start index (on the line), and end index (on
+        the line).
+        """
+        # What type of token is it?
+        if match.group()[0] in "'\"": tag = 'terminal'
+        elif match.group() in ('->', self.ARROW): tag = 'arrow'
+        else:
+            # If it's a nonterminal, then set up new bindings, so we
+            # can highlight all instances of that nonterminal when we
+            # put the mouse over it.
+            tag = 'nonterminal_'+match.group()
+            if tag not in self._textwidget.tag_names():
+                self._init_nonterminal_tag(tag)
+
+        start = '%d.%d' % (linenum, match.start())
+        end = '%d.%d' % (linenum, match.end())
+        self._textwidget.tag_add(tag, start, end)
+
+    def _init_nonterminal_tag(self, tag, foreground='blue'):
+        self._textwidget.tag_config(tag, foreground=foreground,
+                                    font=CFGEditor._BOLD)
+        if not self._highlight_matching_nonterminals:
+            return
+        def enter(e, textwidget=self._textwidget, tag=tag):
+            textwidget.tag_config(tag, background='#80ff80')
+        def leave(e, textwidget=self._textwidget, tag=tag):
+            textwidget.tag_config(tag, background='')
+        self._textwidget.tag_bind(tag, '<Enter>', enter)
+        self._textwidget.tag_bind(tag, '<Leave>', leave)
+
+    def _analyze_line(self, linenum):
+        """
+        Colorize a given line.
+        """
+        # Get rid of any tags that were previously on the line.
+        self._clear_tags(linenum)
+
+        # Get the line's text string.
+        line = self._textwidget.get(repr(linenum)+'.0', repr(linenum)+'.end')
+
+        # If it's a valid production, then colorize each token.
+        if CFGEditor._PRODUCTION_RE.match(line):
+            # It's valid; Use _TOKEN_RE to tokenize the production,
+            # and call analyze_token on each token.
+            def analyze_token(match, self=self, linenum=linenum):
+                self._analyze_token(match, linenum)
+                return ''
+            CFGEditor._TOKEN_RE.sub(analyze_token, line)
+        elif line.strip() != '':
+            # It's invalid; show the user where the error is.
+            self._mark_error(linenum, line)
+
+    def _mark_error(self, linenum, line):
+        """
+        Mark the location of an error in a line.
+        """
+        arrowmatch = CFGEditor._ARROW_RE.search(line)
+        if not arrowmatch:
+            # If there's no arrow at all, highlight the whole line.
+            start = '%d.0' % linenum
+            end = '%d.end' % linenum
+        elif not CFGEditor._LHS_RE.match(line):
+            # Otherwise, if the LHS is bad, highlight it.
+            start = '%d.0' % linenum
+            end = '%d.%d' % (linenum, arrowmatch.start())
+        else:
+            # Otherwise, highlight the RHS.
+            start = '%d.%d' % (linenum, arrowmatch.end())
+            end = '%d.end' % linenum
+
+        # If we're highlighting 0 chars, highlight the whole line.
+        if self._textwidget.compare(start, '==', end):
+            start = '%d.0' % linenum
+            end = '%d.end' % linenum
+        self._textwidget.tag_add('error', start, end)
+
+    def _analyze(self, *e):
+        """
+        Replace ``->`` with arrows, and colorize the entire buffer.
+        """
+        self._replace_arrows()
+        numlines = int(self._textwidget.index('end').split('.')[0])
+        for linenum in range(1, numlines+1):  # line numbers start at 1.
+            self._analyze_line(linenum)
+
+    def _parse_productions(self):
+        """
+        Parse the current contents of the textwidget buffer, to create
+        a list of productions.
+        """
+        productions = []
+
+        # Get the text, normalize it, and split it into lines.
+        text = self._textwidget.get('1.0', 'end')
+        text = re.sub(self.ARROW, '->', text)
+        text = re.sub('\t', ' ', text)
+        lines = text.split('\n')
+
+        # Convert each line to a CFG production
+        for line in lines:
+            line = line.strip()
+            if line=='': continue
+            productions += _read_cfg_production(line)
+            #if line.strip() == '': continue
+            #if not CFGEditor._PRODUCTION_RE.match(line):
+            #    raise ValueError('Bad production string %r' % line)
+            #
+            #(lhs_str, rhs_str) = line.split('->')
+            #lhs = Nonterminal(lhs_str.strip())
+            #rhs = []
+            #def parse_token(match, rhs=rhs):
+            #    token = match.group()
+            #    if token[0] in "'\"": rhs.append(token[1:-1])
+            #    else: rhs.append(Nonterminal(token))
+            #    return ''
+            #CFGEditor._TOKEN_RE.sub(parse_token, rhs_str)
+            #
+            #productions.append(Production(lhs, *rhs))
+
+        return productions
+
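+    # A reading aid (not executed): each non-blank line of the buffer, e.g.
+    # "S -> NP VP" or "N -> 'dog'", is handed to _read_cfg_production(), which
+    # may yield one or more productions; blank lines are simply skipped.
+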
+    def _destroy(self, *e):
+        if self._top is None: return
+        self._top.destroy()
+        self._top = None
+
+    def _ok(self, *e):
+        self._apply()
+        self._destroy()
+
+    def _apply(self, *e):
+        productions = self._parse_productions()
+        start = Nonterminal(self._start.get())
+        cfg = CFG(start, productions)
+        if self._set_cfg_callback is not None:
+            self._set_cfg_callback(cfg)
+
+    def _reset(self, *e):
+        self._textwidget.delete('1.0', 'end')
+        for production in self._cfg.productions():
+            self._textwidget.insert('end', '%s\n' % production)
+        self._analyze()
+        if self._set_cfg_callback is not None:
+            self._set_cfg_callback(self._cfg)
+
+    def _cancel(self, *e):
+        try: self._reset()
+        except: pass
+        self._destroy()
+
+    def _help(self, *e):
+        # The default font's not very legible; try using 'fixed' instead.
+        try:
+            ShowText(self._parent, 'Help: Chart Parser Demo',
+                     (_CFGEditor_HELP).strip(), width=75, font='fixed')
+        except:
+            ShowText(self._parent, 'Help: Chart Parser Demo',
+                     (_CFGEditor_HELP).strip(), width=75)
+
+######################################################################
+# New Demo (built tree based on cfg)
+######################################################################
+
+class CFGDemo(object):
+    def __init__(self, grammar, text):
+        self._grammar = grammar
+        self._text = text
+
+        # Set up the main window.
+        self._top = Tk()
+        self._top.title('Context Free Grammar Demo')
+
+        # Base font size
+        self._size = IntVar(self._top)
+        self._size.set(12) # = medium
+
+        # Set up the key bindings
+        self._init_bindings(self._top)
+
+        # Create the basic frames
+        frame1 = Frame(self._top)
+        frame1.pack(side='left', fill='y', expand=0)
+        self._init_menubar(self._top)
+        self._init_buttons(self._top)
+        self._init_grammar(frame1)
+        self._init_treelet(frame1)
+        self._init_workspace(self._top)
+
+    #//////////////////////////////////////////////////
+    # Initialization
+    #//////////////////////////////////////////////////
+
+    def _init_bindings(self, top):
+        top.bind('<Control-q>', self.destroy)
+
+    def _init_menubar(self, parent): pass
+
+    def _init_buttons(self, parent): pass
+
+    def _init_grammar(self, parent):
+        self._prodlist = ProductionList(parent, self._grammar, width=20)
+        self._prodlist.pack(side='top', fill='both', expand=1)
+        self._prodlist.focus()
+        self._prodlist.add_callback('select', self._selectprod_cb)
+        self._prodlist.add_callback('move', self._selectprod_cb)
+
+    def _init_treelet(self, parent):
+        self._treelet_canvas = Canvas(parent, background='white')
+        self._treelet_canvas.pack(side='bottom', fill='x')
+        self._treelet = None
+
+    def _init_workspace(self, parent):
+        self._workspace = CanvasFrame(parent, background='white')
+        self._workspace.pack(side='right', fill='both', expand=1)
+        self._tree = None
+        self.reset_workspace()
+
+    #//////////////////////////////////////////////////
+    # Workspace
+    #//////////////////////////////////////////////////
+
+    def reset_workspace(self):
+        c = self._workspace.canvas()
+        fontsize = int(self._size.get())
+        node_font = ('helvetica', -(fontsize+4), 'bold')
+        leaf_font = ('helvetica', -(fontsize+2))
+
+        # Remove the old tree
+        if self._tree is not None:
+            self._workspace.remove_widget(self._tree)
+
+        # The root of the tree.
+        start = self._grammar.start().symbol()
+        rootnode = TextWidget(c, start, font=node_font, draggable=1)
+
+        # The leaves of the tree.
+        leaves = []
+        for word in self._text:
+            leaves.append(TextWidget(c, word, font=leaf_font, draggable=1))
+
+        # Put it all together into one tree
+        self._tree = TreeSegmentWidget(c, rootnode, leaves,
+                                       color='white')
+
+        # Add it to the workspace.
+        self._workspace.add_widget(self._tree)
+
+        # Move the leaves to the bottom of the workspace.
+        for leaf in leaves: leaf.move(0,100)
+
+        #self._nodes = {start:1}
+        #self._leaves = dict([(l,1) for l in leaves])
+
+    def workspace_markprod(self, production):
+        pass
+
+    def _markproduction(self, prod, tree=None):
+        if tree is None: tree = self._tree
+        for i in range(len(tree.subtrees())-len(prod.rhs())):
+            if tree['color', i] == 'white':
+                self._markproduction  # FIXME: bare reference; no-op as written (call and arguments missing)
+
+            for j, node in enumerate(prod.rhs()):
+                widget = tree.subtrees()[i+j]
+                if (isinstance(node, Nonterminal) and
+                    isinstance(widget, TreeSegmentWidget) and
+                    node.symbol() == widget.label().text()):
+                    pass # matching nonterminal
+                elif (isinstance(node, string_types) and
+                      isinstance(widget, TextWidget) and
+                      node == widget.text()):
+                    pass # matching terminal
+                else: break
+            else:
+                # Everything matched!
+                print('MATCH AT', i)
+
+    #//////////////////////////////////////////////////
+    # Grammar
+    #//////////////////////////////////////////////////
+
+    def _selectprod_cb(self, production):
+        canvas = self._treelet_canvas
+
+        self._prodlist.highlight(production)
+        if self._treelet is not None: self._treelet.destroy()
+
+        # Convert the production to a tree.
+        rhs = production.rhs()
+        for (i, elt) in enumerate(rhs):
+            if isinstance(elt, Nonterminal): elt = Tree(elt)
+        tree = Tree(production.lhs().symbol(), *rhs)
+
+        # Draw the tree in the treelet area.
+        fontsize = int(self._size.get())
+        node_font = ('helvetica', -(fontsize+4), 'bold')
+        leaf_font = ('helvetica', -(fontsize+2))
+        self._treelet = tree_to_treesegment(canvas, tree,
+                                            node_font=node_font,
+                                            leaf_font=leaf_font)
+        self._treelet['draggable'] = 1
+
+        # Center the treelet.
+        (x1, y1, x2, y2) = self._treelet.bbox()
+        w, h = int(canvas['width']), int(canvas['height'])
+        self._treelet.move((w-x1-x2)/2, (h-y1-y2)/2)
+
+        # Mark the places where we can add it to the workspace.
+        self._markproduction(production)
+
+    def destroy(self, *args):
+        self._top.destroy()
+
+    def mainloop(self, *args, **kwargs):
+        self._top.mainloop(*args, **kwargs)
+
+def demo2():
+    from nltk import Nonterminal, Production, CFG
+    nonterminals = 'S VP NP PP P N Name V Det'
+    (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s)
+                                           for s in nonterminals.split()]
+    productions = (
+        # Syntactic Productions
+        Production(S, [NP, VP]),
+        Production(NP, [Det, N]),
+        Production(NP, [NP, PP]),
+        Production(VP, [VP, PP]),
+        Production(VP, [V, NP, PP]),
+        Production(VP, [V, NP]),
+        Production(PP, [P, NP]),
+        Production(PP, []),
+
+        Production(PP, ['up', 'over', NP]),
+
+        # Lexical Productions
+        Production(NP, ['I']),   Production(Det, ['the']),
+        Production(Det, ['a']),  Production(N, ['man']),
+        Production(V, ['saw']),  Production(P, ['in']),
+        Production(P, ['with']), Production(N, ['park']),
+        Production(N, ['dog']),  Production(N, ['statue']),
+        Production(Det, ['my']),
+        )
+    grammar = CFG(S, productions)
+
+    text = 'I saw a man in the park'.split()
+    d=CFGDemo(grammar, text)
+    d.mainloop()
+
+######################################################################
+# Old Demo
+######################################################################
+
+def demo():
+    from nltk import Nonterminal, CFG
+    nonterminals = 'S VP NP PP P N Name V Det'
+    (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s)
+                                           for s in nonterminals.split()]
+
+    grammar = CFG.fromstring("""
+    S -> NP VP
+    PP -> P NP
+    NP -> Det N
+    NP -> NP PP
+    VP -> V NP
+    VP -> VP PP
+    Det -> 'a'
+    Det -> 'the'
+    Det -> 'my'
+    NP -> 'I'
+    N -> 'dog'
+    N -> 'man'
+    N -> 'park'
+    N -> 'statue'
+    V -> 'saw'
+    P -> 'in'
+    P -> 'up'
+    P -> 'over'
+    P -> 'with'
+    """)
+
+    def cb(grammar): print(grammar)
+    top = Tk()
+    editor = CFGEditor(top, grammar, cb)
+    Label(top, text='\nTesting CFG Editor\n').pack()
+    Button(top, text='Quit', command=top.destroy).pack()
+    top.mainloop()
+
+def demo3():
+    from nltk import Production
+    (S, VP, NP, PP, P, N, Name, V, Det) = \
+        nonterminals('S, VP, NP, PP, P, N, Name, V, Det')
+
+    productions = (
+        # Syntactic Productions
+        Production(S, [NP, VP]),
+        Production(NP, [Det, N]),
+        Production(NP, [NP, PP]),
+        Production(VP, [VP, PP]),
+        Production(VP, [V, NP, PP]),
+        Production(VP, [V, NP]),
+        Production(PP, [P, NP]),
+        Production(PP, []),
+
+        Production(PP, ['up', 'over', NP]),
+
+        # Lexical Productions
+        Production(NP, ['I']),   Production(Det, ['the']),
+        Production(Det, ['a']),  Production(N, ['man']),
+        Production(V, ['saw']),  Production(P, ['in']),
+        Production(P, ['with']), Production(N, ['park']),
+        Production(N, ['dog']),  Production(N, ['statue']),
+        Production(Det, ['my']),
+        )
+
+    t = Tk()
+    def destroy(e, t=t): t.destroy()
+    t.bind('q', destroy)
+    p = ProductionList(t, productions)
+    p.pack(expand=1, fill='both')
+    p.add_callback('select', p.markonly)
+    p.add_callback('move', p.markonly)
+    p.focus()
+    p.mark(productions[2])
+    p.mark(productions[8])
+
+if __name__ == '__main__': demo()
diff --git a/nlp_resource_data/nltk/draw/cfg.pyc b/nlp_resource_data/nltk/draw/cfg.pyc
new file mode 100755 (executable)
index 0000000..15f6bd5
Binary files /dev/null and b/nlp_resource_data/nltk/draw/cfg.pyc differ
diff --git a/nlp_resource_data/nltk/draw/dispersion.py b/nlp_resource_data/nltk/draw/dispersion.py
new file mode 100755 (executable)
index 0000000..5f3a568
--- /dev/null
@@ -0,0 +1,58 @@
+# Natural Language Toolkit: Dispersion Plots
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A utility for displaying lexical dispersion.
+"""
+
+def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
+    """
+    Generate a lexical dispersion plot.
+
+    :param text: The source text
+    :type text: list(str) or enum(str)
+    :param words: The target words
+    :type words: list of str
+    :param ignore_case: flag to set if case should be ignored when searching text
+    :type ignore_case: bool
+    """
+
+    try:
+        from matplotlib import pylab
+    except ImportError:
+        raise ValueError('The plot function requires matplotlib to be installed. '
+                         'See http://matplotlib.org/')
+
+    text = list(text)
+    words.reverse()
+
+    if ignore_case:
+        words_to_comp = list(map(str.lower, words))
+        text_to_comp = list(map(str.lower, text))
+    else:
+        words_to_comp = words
+        text_to_comp = text
+
+    points = [(x,y) for x in range(len(text_to_comp))
+                    for y in range(len(words_to_comp))
+                    if text_to_comp[x] == words_to_comp[y]]
+    if points:
+        x, y = list(zip(*points))
+    else:
+        x = y = ()
+    pylab.plot(x, y, "b|", scalex=.1)
+    pylab.yticks(list(range(len(words))), words, color="b")
+    pylab.ylim(-1, len(words))
+    pylab.title(title)
+    pylab.xlabel("Word Offset")
+    pylab.show()
+
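+# Illustrative call (a sketch, not part of the module API): case-insensitive
+# lookup over a plain token list; requires matplotlib, like the function itself.
+#
+#     tokens = 'The quick fox and the lazy dog saw the fox'.split()
+#     dispersion_plot(tokens, ['the', 'fox'], ignore_case=True)
+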
+if __name__ == '__main__':
+    import nltk.compat
+    from nltk.corpus import gutenberg
+    words = ['Elinor', 'Marianne', 'Edward', 'Willoughby']
+    dispersion_plot(gutenberg.words('austen-sense.txt'), words)
diff --git a/nlp_resource_data/nltk/draw/dispersion.pyc b/nlp_resource_data/nltk/draw/dispersion.pyc
new file mode 100755 (executable)
index 0000000..9e7801c
Binary files /dev/null and b/nlp_resource_data/nltk/draw/dispersion.pyc differ
diff --git a/nlp_resource_data/nltk/draw/table.py b/nlp_resource_data/nltk/draw/table.py
new file mode 100755 (executable)
index 0000000..7894f8e
--- /dev/null
@@ -0,0 +1,1098 @@
+# Natural Language Toolkit: Table widget
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Tkinter widgets for displaying multi-column listboxes and tables.
+"""
+
+from __future__ import division
+
+
+import operator
+
+from six.moves.tkinter import (Frame, Label, Listbox, Scrollbar, Tk)
+
+
+######################################################################
+# Multi-Column Listbox
+######################################################################
+
+class MultiListbox(Frame):
+    """
+    A multi-column listbox, where the current selection applies to an
+    entire row.  Based on the MultiListbox Tkinter widget
+    recipe from the Python Cookbook (http://code.activestate.com/recipes/52266/)
+
+    For the most part, ``MultiListbox`` methods delegate to its
+    contained listboxes.  For any methods that do not have docstrings,
+    see ``Tkinter.Listbox`` for a description of what that method does.
+    """
+    #/////////////////////////////////////////////////////////////////
+    # Configuration
+    #/////////////////////////////////////////////////////////////////
+
+    #: Default configuration values for the frame.
+    FRAME_CONFIG = dict(background='#888',
+                        takefocus=True,
+                        highlightthickness=1)
+
+    #: Default configurations for the column labels.
+    LABEL_CONFIG = dict(borderwidth=1, relief='raised',
+                        font='helvetica -16 bold',
+                      background='#444', foreground='white')
+
+    #: Default configuration for the column listboxes.
+    LISTBOX_CONFIG = dict(borderwidth=1,
+                          selectborderwidth=0,
+                          highlightthickness=0,
+                          exportselection=False,
+                          selectbackground='#888',
+                          activestyle='none',
+                          takefocus=False)
+
+    #/////////////////////////////////////////////////////////////////
+    # Constructor
+    #/////////////////////////////////////////////////////////////////
+
+    def __init__(self, master, columns, column_weights=None, cnf={}, **kw):
+        """
+        Construct a new multi-column listbox widget.
+
+        :param master: The widget that should contain the new
+            multi-column listbox.
+
+        :param columns: Specifies what columns should be included in
+            the new multi-column listbox.  If ``columns`` is an integer,
+            then it is the number of columns to include.  If it is
+            a list, then its length indicates the number of columns
+            to include; and each element of the list will be used as
+            a label for the corresponding column.
+
+        :param cnf, kw: Configuration parameters for this widget.
+            Use ``label_*`` to configure all labels; and ``listbox_*``
+            to configure all listboxes.  E.g.:
+
+                >>> mlb = MultiListbox(master, 5, label_foreground='red')
+        """
+        # If columns was specified as an int, convert it to a list.
+        if isinstance(columns, int):
+            columns = list(range(columns))
+            include_labels = False
+        else:
+            include_labels = True
+
+        if len(columns) == 0:
+            raise ValueError("Expected at least one column")
+
+        # Instance variables
+        self._column_names = tuple(columns)
+        self._listboxes = []
+        self._labels = []
+
+        # Pick a default value for column_weights, if none was specified.
+        if column_weights is None:
+            column_weights = [1] * len(columns)
+        elif len(column_weights) != len(columns):
+            raise ValueError('Expected one column_weight for each column')
+        self._column_weights = column_weights
+
+        # Configure our widgets.
+        Frame.__init__(self, master, **self.FRAME_CONFIG)
+        self.grid_rowconfigure(1, weight=1)
+        for i, label in enumerate(self._column_names):
+            self.grid_columnconfigure(i, weight=column_weights[i])
+
+            # Create a label for the column
+            if include_labels:
+                l = Label(self, text=label, **self.LABEL_CONFIG)
+                self._labels.append(l)
+                l.grid(column=i, row=0, sticky='news', padx=0, pady=0)
+                l.column_index = i
+
+            # Create a listbox for the column
+            lb = Listbox(self, **self.LISTBOX_CONFIG)
+            self._listboxes.append(lb)
+            lb.grid(column=i, row=1, sticky='news', padx=0, pady=0)
+            lb.column_index = i
+
+            # Clicking or dragging selects:
+            lb.bind('<Button-1>', self._select)
+            lb.bind('<B1-Motion>', self._select)
+            # Scroll wheel scrolls:
+            lb.bind('<Button-4>', lambda e: self._scroll(-1))
+            lb.bind('<Button-5>', lambda e: self._scroll(+1))
+            lb.bind('<MouseWheel>', lambda e: self._scroll(e.delta))
+            # Button 2 can be used to scan:
+            lb.bind('<Button-2>', lambda e: self.scan_mark(e.x, e.y))
+            lb.bind('<B2-Motion>', lambda e: self.scan_dragto(e.x, e.y))
+            # Dragging outside the window has no effect (disable
+            # the default listbox behavior, which scrolls):
+            lb.bind('<B1-Leave>', lambda e: 'break')
+            # Columns can be resized by dragging their labels:
+            if include_labels:
+                l.bind('<Button-1>', self._resize_column)
+
+        # Columns can be resized by dragging them.  (This binding is
+        # used if they click on the grid between columns:)
+        self.bind('<Button-1>', self._resize_column)
+
+        # Set up key bindings for the widget:
+        self.bind('<Up>', lambda e: self.select(delta=-1))
+        self.bind('<Down>', lambda e: self.select(delta=1))
+        self.bind('<Prior>', lambda e: self.select(delta=-self._pagesize()))
+        self.bind('<Next>', lambda e: self.select(delta=self._pagesize()))
+
+        # Configuration customizations
+        self.configure(cnf, **kw)
+
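+    # Illustrative usage sketch (not part of the widget; assumes a Tk root and
+    # three-column rows passed as tuples; names and data are invented):
+    #
+    #     root = Tk()
+    #     mlb = MultiListbox(root, ['Rank', 'Word', 'Count'])
+    #     mlb.insert('end', (1, 'the', 1204), (2, 'of', 971))
+    #     mlb.pack(expand=True, fill='both')
+    #     mlb.select(0)
+    #     root.mainloop()
+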
+    #/////////////////////////////////////////////////////////////////
+    # Column Resizing
+    #/////////////////////////////////////////////////////////////////
+
+    def _resize_column(self, event):
+        """
+        Callback used to resize a column of the table.  Return ``True``
+        if the column is actually getting resized (if the user clicked
+        on the far left or far right 5 pixels of a label); and
+        ``False`` otherwise.
+        """
+        # If we're already waiting for a button release, then ignore
+        # the new button press.
+        if event.widget.bind('<ButtonRelease>'):
+            return False
+
+        # Decide which column (if any) to resize.
+        self._resize_column_index = None
+        if event.widget is self:
+            for i, lb in enumerate(self._listboxes):
+                if abs(event.x-(lb.winfo_x()+lb.winfo_width())) < 10:
+                    self._resize_column_index = i
+        elif event.x > (event.widget.winfo_width()-5):
+            self._resize_column_index = event.widget.column_index
+        elif event.x < 5 and event.widget.column_index != 0:
+            self._resize_column_index = event.widget.column_index-1
+
+        # Bind callbacks that are used to resize it.
+        if self._resize_column_index is not None:
+            event.widget.bind('<Motion>', self._resize_column_motion_cb)
+            event.widget.bind('<ButtonRelease-%d>' % event.num,
+                              self._resize_column_buttonrelease_cb)
+            return True
+        else:
+            return False
+
+    def _resize_column_motion_cb(self, event):
+        lb = self._listboxes[self._resize_column_index]
+        charwidth = lb.winfo_width() / lb['width']
+
+        x1 = event.x + event.widget.winfo_x()
+        x2 = lb.winfo_x() + lb.winfo_width()
+
+        lb['width'] = max(3, lb['width'] + (x1-x2) // charwidth)
+
+    def _resize_column_buttonrelease_cb(self, event):
+        event.widget.unbind('<ButtonRelease-%d>' % event.num)
+        event.widget.unbind('<Motion>')
+
+    #/////////////////////////////////////////////////////////////////
+    # Properties
+    #/////////////////////////////////////////////////////////////////
+
+    @property
+    def column_names(self):
+        """
+        A tuple containing the names of the columns used by this
+        multi-column listbox.
+        """
+        return self._column_names
+
+    @property
+    def column_labels(self):
+        """
+        A tuple containing the ``Tkinter.Label`` widgets used to
+        display the label of each column.  If this multi-column
+        listbox was created without labels, then this will be an empty
+        tuple.  These widgets will all be augmented with a
+        ``column_index`` attribute, which can be used to determine
+        which column they correspond to.  This can be convenient,
+        e.g., when defining callbacks for bound events.
+        """
+        return tuple(self._labels)
+
+    @property
+    def listboxes(self):
+        """
+        A tuple containing the ``Tkinter.Listbox`` widgets used to
+        display individual columns.  These widgets will all be
+        augmented with a ``column_index`` attribute, which can be used
+        to determine which column they correspond to.  This can be
+        convenient, e.g., when defining callbacks for bound events.
+        """
+        return tuple(self._listboxes)
+
+    #/////////////////////////////////////////////////////////////////
+    # Mouse & Keyboard Callback Functions
+    #/////////////////////////////////////////////////////////////////
+
+    def _select(self, e):
+        i = e.widget.nearest(e.y)
+        self.selection_clear(0, 'end')
+        self.selection_set(i)
+        self.activate(i)
+        self.focus()
+
+    def _scroll(self, delta):
+        for lb in self._listboxes:
+            lb.yview_scroll(delta, 'unit')
+        return 'break'
+
+    def _pagesize(self):
+        """:return: The number of rows that makes up one page"""
+        return int(self.index('@0,1000000')) - int(self.index('@0,0'))
+
+    #/////////////////////////////////////////////////////////////////
+    # Row selection
+    #/////////////////////////////////////////////////////////////////
+
+    def select(self, index=None, delta=None, see=True):
+        """
+        Set the selected row.  If ``index`` is specified, then select
+        row ``index``.  Otherwise, if ``delta`` is specified, then move
+        the current selection by ``delta`` (negative numbers for up,
+        positive numbers for down).  This will not move the selection
+        past the top or the bottom of the list.
+
+        :param see: If true, then call ``self.see()`` with the newly
+            selected index, to ensure that it is visible.
+        """
+        if (index is not None) and (delta is not None):
+            raise ValueError('specify index or delta, but not both')
+
+        # If delta was given, then calculate index.
+        if delta is not None:
+            if len(self.curselection()) == 0:
+                index = -1 + delta
+            else:
+                index = int(self.curselection()[0]) + delta
+
+        # Clear all selected rows.
+        self.selection_clear(0, 'end')
+
+        # Select the specified index
+        if index is not None:
+            index = min(max(index, 0), self.size()-1)
+            #self.activate(index)
+            self.selection_set(index)
+            if see: self.see(index)
+
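+    # For example (sketch): ``select(0)`` selects the first row, while
+    # ``select(delta=1)`` moves the current selection down one row, stopping
+    # at the bottom of the list.
+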
+    #/////////////////////////////////////////////////////////////////
+    # Configuration
+    #/////////////////////////////////////////////////////////////////
+
+    def configure(self, cnf={}, **kw):
+        """
+        Configure this widget.  Use ``label_*`` to configure all
+        labels; and ``listbox_*`` to configure all listboxes.  E.g.:
+
+                >>> mlb = MultiListbox(master, 5)
+                >>> mlb.configure(label_foreground='red')
+                >>> mlb.configure(listbox_foreground='red')
+        """
+        cnf = dict(list(cnf.items()) + list(kw.items()))
+        for (key, val) in list(cnf.items()):
+            if key.startswith('label_') or key.startswith('label-'):
+                for label in self._labels:
+                    label.configure({key[6:]: val})
+            elif key.startswith('listbox_') or key.startswith('listbox-'):
+                for listbox in self._listboxes:
+                    listbox.configure({key[8:]: val})
+            else:
+                Frame.configure(self, {key:val})
+
+    def __setitem__(self, key, val):
+        """
+        Configure this widget.  This is equivalent to
+        ``self.configure({key: val})``.  See ``configure()``.
+        """
+        self.configure({key:val})
+
+    def rowconfigure(self, row_index, cnf={}, **kw):
+        """
+        Configure all table cells in the given row.  Valid keyword
+        arguments are: ``background``, ``bg``, ``foreground``, ``fg``,
+        ``selectbackground``, ``selectforeground``.
+        """
+        for lb in self._listboxes: lb.itemconfigure(row_index, cnf, **kw)
+
+    def columnconfigure(self, col_index, cnf={}, **kw):
+        """
+        Configure all table cells in the given column.  Valid keyword
+        arguments are: ``background``, ``bg``, ``foreground``, ``fg``,
+        ``selectbackground``, ``selectforeground``.
+        """
+        lb = self._listboxes[col_index]
+
+        cnf = dict(list(cnf.items()) + list(kw.items()))
+        for (key, val) in list(cnf.items()):
+            if key in ('background', 'bg', 'foreground', 'fg',
+                       'selectbackground', 'selectforeground'):
+                for i in range(lb.size()): lb.itemconfigure(i, {key:val})
+            else:
+                lb.configure({key:val})
+
+    def itemconfigure(self, row_index, col_index, cnf=None, **kw):
+        """
+        Configure the table cell at the given row and column.  Valid
+        keyword arguments are: ``background``, ``bg``, ``foreground``,
+        ``fg``, ``selectbackground``, ``selectforeground``.
+        """
+        lb = self._listboxes[col_index]
+        return lb.itemconfigure(row_index, cnf, **kw)
+
+    #/////////////////////////////////////////////////////////////////
+    # Value Access
+    #/////////////////////////////////////////////////////////////////
+
+    def insert(self, index, *rows):
+        """
+        Insert the given row or rows into the table, at the given
+        index.  Each row value should be a tuple of cell values, one
+        for each column in the row.  Index may be an integer or any of
+        the special strings (such as ``'end'``) accepted by
+        ``Tkinter.Listbox``.
+        """
+        for elt in rows:
+            if len(elt) != len(self._column_names):
+                raise ValueError('rows should be tuples whose length '
+                                 'is equal to the number of columns')
+        for (lb,elts) in zip(self._listboxes, list(zip(*rows))):
+            lb.insert(index, *elts)
+
+    def get(self, first, last=None):
+        """
+        Return the value(s) of the specified row(s).  If ``last`` is
+        not specified, then return a single row value; otherwise,
+        return a list of row values.  Each row value is a tuple of
+        cell values, one for each column in the row.
+        """
+        values = [lb.get(first, last) for lb in self._listboxes]
+        if last:
+            return [tuple(row) for row in zip(*values)]
+        else:
+            return tuple(values)
+
+    def bbox(self, row, col):
+        """
+        Return the bounding box for the given table cell, relative to
+        this widget's top-left corner.  The bounding box is a tuple
+        of integers ``(left, top, width, height)``.
+        """
+        dx, dy, _, _ = self.grid_bbox(row=0, column=col)
+        x, y, w, h = self._listboxes[col].bbox(row)
+        return int(x)+int(dx), int(y)+int(dy), int(w), int(h)
+
+    #/////////////////////////////////////////////////////////////////
+    # Hide/Show Columns
+    #/////////////////////////////////////////////////////////////////
+
+    def hide_column(self, col_index):
+        """
+        Hide the given column.  The column's state is still
+        maintained: its values will still be returned by ``get()``, and
+        you must supply its values when calling ``insert()``.  It is
+        safe to call this on a column that is already hidden.
+
+        :see: ``show_column()``
+        """
+        if self._labels:
+            self._labels[col_index].grid_forget()
+        self.listboxes[col_index].grid_forget()
+        self.grid_columnconfigure(col_index, weight=0)
+
+    def show_column(self, col_index):
+        """
+        Display a column that has been hidden using ``hide_column()``.
+        It is safe to call this on a column that is not hidden.
+        """
+        weight = self._column_weights[col_index]
+        if self._labels:
+            self._labels[col_index].grid(column=col_index, row=0,
+                                         sticky='news', padx=0, pady=0)
+        self._listboxes[col_index].grid(column=col_index, row=1,
+                                        sticky='news', padx=0, pady=0)
+        self.grid_columnconfigure(col_index, weight=weight)
+
+    #/////////////////////////////////////////////////////////////////
+    # Binding Methods
+    #/////////////////////////////////////////////////////////////////
+
+    def bind_to_labels(self, sequence=None, func=None, add=None):
+        """
+        Add a binding to each ``Tkinter.Label`` widget in this
+        multi-column listbox that will call ``func`` in response to the
+        event sequence.
+
+        :return: A list of the identifiers of replaced binding
+            functions (if any), allowing for their deletion (to
+            prevent a memory leak).
+        """
+        return [label.bind(sequence, func, add)
+                for label in self.column_labels]
+
+    def bind_to_listboxes(self, sequence=None, func=None, add=None):
+        """
+        Add a binding to each ``Tkinter.Listbox`` widget in this
+        multi-column listbox that will call ``func`` in response to the
+        event sequence.
+
+        :return: A list of the identifiers of replaced binding
+            functions (if any), allowing for their deletion (to
+            prevent a memory leak).
+        """
+        return [listbox.bind(sequence, func, add)
+                for listbox in self.listboxes]
+
+    def bind_to_columns(self, sequence=None, func=None, add=None):
+        """
+        Add a binding to each ``Tkinter.Label`` and ``Tkinter.Listbox``
+        widget in this multi-column listbox that will call ``func`` in
+        response to the event sequence.
+
+        :return: A list of the identifiers of replaced binding
+            functions (if any), allowing for their deletion (to
+            prevent a memory leak).
+        """
+        return (self.bind_to_labels(sequence, func, add) +
+                self.bind_to_listboxes(sequence, func, add))
+
+    #/////////////////////////////////////////////////////////////////
+    # Simple Delegation
+    #/////////////////////////////////////////////////////////////////
+
+    # These methods delegate to the first listbox:
+    def curselection(self, *args, **kwargs):
+        return self._listboxes[0].curselection(*args, **kwargs)
+    def selection_includes(self, *args, **kwargs):
+        return self._listboxes[0].selection_includes(*args, **kwargs)
+    def itemcget(self, *args, **kwargs):
+        return self._listboxes[0].itemcget(*args, **kwargs)
+    def size(self, *args, **kwargs):
+        return self._listboxes[0].size(*args, **kwargs)
+    def index(self, *args, **kwargs):
+        return self._listboxes[0].index(*args, **kwargs)
+    def nearest(self, *args, **kwargs):
+        return self._listboxes[0].nearest(*args, **kwargs)
+
+    # These methods delegate to each listbox (and return None):
+    def activate(self, *args, **kwargs):
+        for lb in self._listboxes: lb.activate(*args, **kwargs)
+    def delete(self, *args, **kwargs):
+        for lb in self._listboxes: lb.delete(*args, **kwargs)
+    def scan_mark(self, *args, **kwargs):
+        for lb in self._listboxes: lb.scan_mark(*args, **kwargs)
+    def scan_dragto(self, *args, **kwargs):
+        for lb in self._listboxes: lb.scan_dragto(*args, **kwargs)
+    def see(self, *args, **kwargs):
+        for lb in self._listboxes: lb.see(*args, **kwargs)
+    def selection_anchor(self, *args, **kwargs):
+        for lb in self._listboxes: lb.selection_anchor(*args, **kwargs)
+    def selection_clear(self, *args, **kwargs):
+        for lb in self._listboxes: lb.selection_clear(*args, **kwargs)
+    def selection_set(self, *args, **kwargs):
+        for lb in self._listboxes: lb.selection_set(*args, **kwargs)
+    def yview(self, *args, **kwargs):
+        for lb in self._listboxes: v = lb.yview(*args, **kwargs)
+        return v # if called with no arguments
+    def yview_moveto(self, *args, **kwargs):
+        for lb in self._listboxes: lb.yview_moveto(*args, **kwargs)
+    def yview_scroll(self, *args, **kwargs):
+        for lb in self._listboxes: lb.yview_scroll(*args, **kwargs)
+
+    #/////////////////////////////////////////////////////////////////
+    # Aliases
+    #/////////////////////////////////////////////////////////////////
+
+    itemconfig = itemconfigure
+    rowconfig = rowconfigure
+    columnconfig = columnconfigure
+    select_anchor = selection_anchor
+    select_clear = selection_clear
+    select_includes = selection_includes
+    select_set = selection_set
+
+    #/////////////////////////////////////////////////////////////////
+    # These listbox methods are not defined for multi-listbox
+    #/////////////////////////////////////////////////////////////////
+    # def xview(self, *what): pass
+    # def xview_moveto(self, fraction): pass
+    # def xview_scroll(self, number, what): pass
+
+######################################################################
+# Table
+######################################################################
+
+class Table(object):
+    """
+    A display widget for a table of values, based on a ``MultiListbox``
+    widget.  For many purposes, ``Table`` can be treated as a
+    list-of-lists.  E.g., table[i] is a list of the values for row i;
+    and table.append(row) adds a new row with the given list of
+    values.  Individual cells can be accessed using table[i,j], which
+    refers to the j-th column of the i-th row.  This can be used to
+    both read and write values from the table.  E.g.:
+
+        >>> table[i,j] = 'hello'
+
+    The column (j) can be given either as an index number, or as a
+    column name.  E.g., the following prints the value in the 3rd row
+    for the 'First Name' column:
+
+        >>> print(table[3, 'First Name'])
+        John
+
+    You can configure the colors for individual rows, columns, or
+    cells using ``rowconfig()``, ``columnconfig()``, and ``itemconfig()``.
+    The color configuration for each row will be preserved if the
+    table is modified; however, when new rows are added, any color
+    configurations that have been made for *columns* will not be
+    applied to the new row.
+
+    Note: Although ``Table`` acts like a widget in some ways (e.g., it
+    defines ``grid()``, ``pack()``, and ``bind()``), it is not itself a
+    widget; it just contains one.  This is because widgets need to
+    define ``__getitem__()``, ``__setitem__()``, and ``__nonzero__()`` in
+    a way that's incompatible with the fact that ``Table`` behaves as a
+    list-of-lists.
+
+    :ivar _mlb: The multi-column listbox used to display this table's data.
+    :ivar _rows: A list-of-lists used to hold the cell values of this
+        table.  Each element of _rows is a row value, i.e., a list of
+        cell values, one for each column in the row.
+    """
+    def __init__(self, master, column_names, rows=None,
+                 column_weights=None,
+                 scrollbar=True, click_to_sort=True,
+                 reprfunc=None, cnf={}, **kw):
+        """
+        Construct a new Table widget.
+
+        :type master: Tkinter.Widget
+        :param master: The widget that should contain the new table.
+        :type column_names: list(str)
+        :param column_names: A list of names for the columns; these
+            names will be used to create labels for each column;
+            and can be used as an index when reading or writing
+            cell values from the table.
+        :type rows: list(list)
+        :param rows: A list of row values used to initialize the table.
+            Each row value should be a tuple of cell values, one for
+            each column in the row.
+        :type scrollbar: bool
+        :param scrollbar: If true, then create a scrollbar for the
+            new table widget.
+        :type click_to_sort: bool
+        :param click_to_sort: If true, then create bindings that will
+            sort the table's rows by a given column's values if the
+            user clicks on that column's label.
+        :type reprfunc: function
+        :param reprfunc: If specified, then use this function to
+            convert each table cell value to a string suitable for
+            display.  ``reprfunc`` has the following signature:
+            reprfunc(row_index, col_index, cell_value) -> str
+            (Note that the column is specified by index, not by name.)
+        :param cnf, kw: Configuration parameters for this widget's
+            contained ``MultiListbox``.  See ``MultiListbox.__init__()``
+            for details.
+        """
+        self._num_columns = len(column_names)
+        self._reprfunc = reprfunc
+        self._frame = Frame(master)
+
+        self._column_name_to_index = dict((c,i) for (i,c) in
+                                          enumerate(column_names))
+
+        # Make a copy of the rows & check that it's valid.
+        if rows is None: self._rows = []
+        else: self._rows = [[v for v in row] for row in rows]
+        for row in self._rows: self._checkrow(row)
+
+        # Create our multi-list box.
+        self._mlb = MultiListbox(self._frame, column_names,
+                                 column_weights, cnf, **kw)
+        self._mlb.pack(side='left', expand=True, fill='both')
+
+        # Optional scrollbar
+        if scrollbar:
+            sb = Scrollbar(self._frame, orient='vertical',
+                           command=self._mlb.yview)
+            self._mlb.listboxes[0]['yscrollcommand'] = sb.set
+            #for listbox in self._mlb.listboxes:
+            #    listbox['yscrollcommand'] = sb.set
+            sb.pack(side='right', fill='y')
+            self._scrollbar = sb
+
+        # Set up sorting
+        self._sortkey = None
+        if click_to_sort:
+            for i, l in enumerate(self._mlb.column_labels):
+                l.bind('<Button-1>', self._sort)
+
+        # Fill in our multi-list box.
+        self._fill_table()
+
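+    # Illustrative usage sketch (assumes a Tk root; the column names and row
+    # values below are invented for the example):
+    #
+    #     root = Tk()
+    #     table = Table(root, ['First Name', 'Last Name', 'Age'],
+    #                   rows=[['Jane', 'Doe', 38], ['John', 'Smith', 41]])
+    #     table.pack(expand=True, fill='both')
+    #     table.sort_by('Age', order='descending')
+    #     print(table[0, 'First Name'])   # row 0 after the sort
+    #     root.mainloop()
+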
+    #/////////////////////////////////////////////////////////////////
+    #{ Widget-like Methods
+    #/////////////////////////////////////////////////////////////////
+    # These all just delegate to either our frame or our MLB.
+
+    def pack(self, *args, **kwargs):
+        """Position this table's main frame widget in its parent
+        widget.  See ``Tkinter.Frame.pack()`` for more info."""
+        self._frame.pack(*args, **kwargs)
+
+    def grid(self, *args, **kwargs):
+        """Position this table's main frame widget in its parent
+        widget.  See ``Tkinter.Frame.grid()`` for more info."""
+        self._frame.grid(*args, **kwargs)
+
+    def focus(self):
+        """Direct (keyboard) input foxus to this widget."""
+        self._mlb.focus()
+
+    def bind(self, sequence=None, func=None, add=None):
+        """Add a binding to this table's main frame that will call
+        ``func`` in response to the event sequence."""
+        self._mlb.bind(sequence, func, add)
+
+    def rowconfigure(self, row_index, cnf={}, **kw):
+        """:see: ``MultiListbox.rowconfigure()``"""
+        self._mlb.rowconfigure(row_index, cnf, **kw)
+
+    def columnconfigure(self, col_index, cnf={}, **kw):
+        """:see: ``MultiListbox.columnconfigure()``"""
+        col_index = self.column_index(col_index)
+        self._mlb.columnconfigure(col_index, cnf, **kw)
+
+    def itemconfigure(self, row_index, col_index, cnf=None, **kw):
+        """:see: ``MultiListbox.itemconfigure()``"""
+        col_index = self.column_index(col_index)
+        return self._mlb.itemconfigure(row_index, col_index, cnf, **kw)
+
+    def bind_to_labels(self, sequence=None, func=None, add=None):
+        """:see: ``MultiListbox.bind_to_labels()``"""
+        return self._mlb.bind_to_labels(sequence, func, add)
+
+    def bind_to_listboxes(self, sequence=None, func=None, add=None):
+        """:see: ``MultiListbox.bind_to_listboxes()``"""
+        return self._mlb.bind_to_listboxes(sequence, func, add)
+
+    def bind_to_columns(self, sequence=None, func=None, add=None):
+        """:see: ``MultiListbox.bind_to_columns()``"""
+        return self._mlb.bind_to_columns(sequence, func, add)
+
+    rowconfig = rowconfigure
+    columnconfig = columnconfigure
+    itemconfig = itemconfigure
+
+    #/////////////////////////////////////////////////////////////////
+    #{ Table as list-of-lists
+    #/////////////////////////////////////////////////////////////////
+
+    def insert(self, row_index, rowvalue):
+        """
+        Insert a new row into the table, so that its row index will be
+        ``row_index``.  If the table contains any rows whose row index
+        is greater than or equal to ``row_index``, then they will be
+        shifted down.
+
+        :param rowvalue: A tuple of cell values, one for each column
+            in the new row.
+        """
+        self._checkrow(rowvalue)
+        self._rows.insert(row_index, rowvalue)
+        if self._reprfunc is not None:
+            rowvalue = [self._reprfunc(row_index,j,v)
+                        for (j,v) in enumerate(rowvalue)]
+        self._mlb.insert(row_index, rowvalue)
+        if self._DEBUG: self._check_table_vs_mlb()
+
+    def extend(self, rowvalues):
+        """
+        Add new rows at the end of the table.
+
+        :param rowvalues: A list of row values used to initialize the
+            table.  Each row value should be a tuple of cell values,
+            one for each column in the row.
+        """
+        for rowvalue in rowvalues: self.append(rowvalue)
+        if self._DEBUG: self._check_table_vs_mlb()
+
+    def append(self, rowvalue):
+        """
+        Add a new row to the end of the table.
+
+        :param rowvalue: A tuple of cell values, one for each column
+            in the new row.
+        """
+        self.insert(len(self._rows), rowvalue)
+        if self._DEBUG: self._check_table_vs_mlb()
+
+    def clear(self):
+        """
+        Delete all rows in this table.
+        """
+        self._rows = []
+        self._mlb.delete(0, 'end')
+        if self._DEBUG: self._check_table_vs_mlb()
+
+    def __getitem__(self, index):
+        """
+        Return the value of a row or a cell in this table.  If
+        ``index`` is an integer, then the row value for the ``index``th
+        row.  This row value consists of a tuple of cell values, one
+        for each column in the row.  If ``index`` is a tuple of two
+        integers, ``(i,j)``, then return the value of the cell in the
+        ``i``th row and the ``j``th column.
+        """
+        if isinstance(index, slice):
+            raise ValueError('Slicing not supported')
+        elif isinstance(index, tuple) and len(index)==2:
+            return self._rows[index[0]][self.column_index(index[1])]
+        else:
+            return tuple(self._rows[index])
+
+    def __setitem__(self, index, val):
+        """
+        Replace the value of a row or a cell in this table with
+        ``val``.
+
+        If ``index`` is an integer, then ``val`` should be a row value
+        (i.e., a tuple of cell values, one for each column).  In this
+        case, the values of the ``index``th row of the table will be
+        replaced with the values in ``val``.
+
+        If ``index`` is a tuple of integers, ``(i,j)``, then replace the
+        value of the cell in the ``i``th row and ``j``th column with
+        ``val``.
+        """
+        if isinstance(index, slice):
+            raise ValueError('Slicing not supported')
+
+
+        # table[i,j] = val
+        elif isinstance(index, tuple) and len(index)==2:
+            i, j = index[0], self.column_index(index[1])
+            config_cookie = self._save_config_info([i])
+            self._rows[i][j] = val
+            if self._reprfunc is not None:
+                val = self._reprfunc(i, j, val)
+            self._mlb.listboxes[j].insert(i, val)
+            self._mlb.listboxes[j].delete(i+1)
+            self._restore_config_info(config_cookie)
+
+        # table[i] = val
+        else:
+            config_cookie = self._save_config_info([index])
+            self._checkrow(val)
+            self._rows[index] = list(val)
+            if self._reprfunc is not None:
+                val = [self._reprfunc(index,j,v) for (j,v) in enumerate(val)]
+            self._mlb.insert(index, val)
+            self._mlb.delete(index+1)
+            self._restore_config_info(config_cookie)
+
+    def __delitem__(self, row_index):
+        """
+        Delete the ``row_index``th row from this table.
+        """
+        if isinstance(row_index, slice):
+            raise ValueError('Slicing not supported')
+        if isinstance(row_index, tuple) and len(row_index)==2:
+            raise ValueError('Cannot delete a single cell!')
+        del self._rows[row_index]
+        self._mlb.delete(row_index)
+        if self._DEBUG: self._check_table_vs_mlb()
+
+    def __len__(self):
+        """
+        :return: the number of rows in this table.
+        """
+        return len(self._rows)
+
+    def _checkrow(self, rowvalue):
+        """
+        Helper function: check that a given row value has the correct
+        number of elements; and if not, raise an exception.
+        """
+        if len(rowvalue) != self._num_columns:
+            raise ValueError('Row %r has %d columns; expected %d' %
+                             (rowvalue, len(rowvalue), self._num_columns))
+
+    #/////////////////////////////////////////////////////////////////
+    # Columns
+    #/////////////////////////////////////////////////////////////////
+
+    @property
+    def column_names(self):
+        """A list of the names of the columns in this table."""
+        return self._mlb.column_names
+
+    def column_index(self, i):
+        """
+        If ``i`` is a valid column index integer, then return it as is.
+        Otherwise, check if ``i`` is used as the name for any column;
+        if so, return that column's index.  Otherwise, raise a
+        ``KeyError`` exception.
+        """
+        if isinstance(i, int) and 0 <= i < self._num_columns:
+            return i
+        else:
+            # This raises a key error if the column is not found.
+            return self._column_name_to_index[i]
+
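+    # For example (sketch): with columns ['First Name', 'Last Name', 'Age'],
+    # ``column_index('Last Name')`` returns 1, and ``column_index(2)`` is
+    # passed through unchanged.
+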
+    def hide_column(self, column_index):
+        """:see: ``MultiListbox.hide_column()``"""
+        self._mlb.hide_column(self.column_index(column_index))
+
+    def show_column(self, column_index):
+        """:see: ``MultiListbox.show_column()``"""
+        self._mlb.show_column(self.column_index(column_index))
+
+    #/////////////////////////////////////////////////////////////////
+    # Selection
+    #/////////////////////////////////////////////////////////////////
+
+    def selected_row(self):
+        """
+        Return the index of the currently selected row, or None if
+        no row is selected.  To get the row value itself, use
+        ``table[table.selected_row()]``.
+        """
+        sel = self._mlb.curselection()
+        if sel: return int(sel[0])
+        else: return None
+
+    def select(self, index=None, delta=None, see=True):
+        """:see: ``MultiListbox.select()``"""
+        self._mlb.select(index, delta, see)
+
+    #/////////////////////////////////////////////////////////////////
+    # Sorting
+    #/////////////////////////////////////////////////////////////////
+
+    def sort_by(self, column_index, order='toggle'):
+        """
+        Sort the rows in this table, using the specified column's
+        values as a sort key.
+
+        :param column_index: Specifies which column to sort, using
+            either a column index (int) or a column's label name
+            (str).
+
+        :param order: Specifies whether to sort the values in
+            ascending or descending order:
+
+              - ``'ascending'``: Sort from least to greatest.
+              - ``'descending'``: Sort from greatest to least.
+              - ``'toggle'``: If the most recent call to ``sort_by()``
+                sorted the table by the same column (``column_index``),
+                then reverse the rows; otherwise sort in ascending
+                order.
+        """
+        if order not in ('ascending', 'descending', 'toggle'):
+            raise ValueError('sort_by(): order should be "ascending", '
+                             '"descending", or "toggle".')
+        column_index = self.column_index(column_index)
+        config_cookie = self._save_config_info(index_by_id=True)
+
+        # Sort the rows.
+        if order == 'toggle' and column_index == self._sortkey:
+            self._rows.reverse()
+        else:
+            self._rows.sort(key=operator.itemgetter(column_index),
+                            reverse=(order=='descending'))
+            self._sortkey = column_index
+
+        # Redraw the table.
+        self._fill_table()
+        self._restore_config_info(config_cookie, index_by_id=True, see=True)
+        if self._DEBUG: self._check_table_vs_mlb()
+
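+    # For example (sketch): ``sort_by('Count')`` sorts ascending on the first
+    # call; a second call on the same column with the default ``order='toggle'``
+    # simply reverses the current row order.
+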
+    def _sort(self, event):
+        """Event handler for clicking on a column label -- sort by
+        that column."""
+        column_index = event.widget.column_index
+
+        # If they click on the far-left or far-right of a column's
+        # label, then resize rather than sorting.
+        if self._mlb._resize_column(event):
+            return 'continue'
+
+        # Otherwise, sort.
+        else:
+            self.sort_by(column_index)
+            return 'continue'
+
+    #/////////////////////////////////////////////////////////////////
+    #{ Table Drawing Helpers
+    #/////////////////////////////////////////////////////////////////
+
+    def _fill_table(self, save_config=True):
+        """
+        Re-draw the table from scratch, by clearing out the table's
+        multi-column listbox; and then filling it in with values from
+        ``self._rows``.  Note that any cell-, row-, or column-specific
+        color configuration that has been done will be lost.  The
+        selection will also be lost -- i.e., no row will be selected
+        after this call completes.
+        """
+        self._mlb.delete(0, 'end')
+        for i, row in enumerate(self._rows):
+            if self._reprfunc is not None:
+                row = [self._reprfunc(i,j,v) for (j,v) in enumerate(row)]
+            self._mlb.insert('end', row)
+
+    def _get_itemconfig(self, r, c):
+        return dict( (k, self._mlb.itemconfig(r, c, k)[-1])
+                     for k in ('foreground', 'selectforeground',
+                               'background', 'selectbackground') )
+
+    def _save_config_info(self, row_indices=None, index_by_id=False):
+        """
+        Return a 'cookie' containing information about which row is
+        selected, and what color configurations have been applied.
+        This information can then be re-applied to the table (after
+        making modifications) using ``_restore_config_info()``.  Color
+        configuration information will be saved for any rows in
+        ``row_indices``, or in the entire table, if
+        ``row_indices=None``.  If ``index_by_id=True``, then the cookie
+        will associate rows with their configuration information based
+        on the rows' python id.  This is useful when performing
+        operations that re-arrange the rows (e.g. ``sort``).  If
+        ``index_by_id=False``, then it is assumed that all rows will be
+        in the same order when ``_restore_config_info()`` is called.
+        """
+        # Default value for row_indices is all rows.
+        if row_indices is None:
+            row_indices = list(range(len(self._rows)))
+
+        # Look up our current selection.
+        selection = self.selected_row()
+        if index_by_id and selection is not None:
+            selection = id(self._rows[selection])
+
+        # Look up the color configuration info for each row.
+        if index_by_id:
+            config = dict((id(self._rows[r]), [self._get_itemconfig(r, c)
+                                        for c in range(self._num_columns)])
+                          for r in row_indices)
+        else:
+            config = dict((r, [self._get_itemconfig(r, c)
+                               for c in range(self._num_columns)])
+                          for r in row_indices)
+
+        return selection, config
+
+    def _restore_config_info(self, cookie, index_by_id=False, see=False):
+        """
+        Restore selection & color configuration information that was
+        saved using ``_save_config_info``.
+        """
+        selection, config = cookie
+
+        # Clear the selection.
+        if selection is None:
+            self._mlb.selection_clear(0, 'end')
+
+        # Restore selection & color config
+        if index_by_id:
+            for r, row in enumerate(self._rows):
+                if id(row) in config:
+                    for c in range(self._num_columns):
+                        self._mlb.itemconfigure(r, c, config[id(row)][c])
+                if id(row) == selection:
+                    self._mlb.select(r, see=see)
+        else:
+            if selection is not None:
+                self._mlb.select(selection, see=see)
+            for r in config:
+                for c in range(self._num_columns):
+                    self._mlb.itemconfigure(r, c, config[r][c])
+
+    #/////////////////////////////////////////////////////////////////
+    # Debugging (Invariant Checker)
+    #/////////////////////////////////////////////////////////////////
+
+    _DEBUG = False
+    """If true, then run ``_check_table_vs_mlb()`` after any operation
+       that modifies the table."""
+
+    def _check_table_vs_mlb(self):
+        """
+        Verify that the contents of the table's ``_rows`` variable match
+        the contents of its multi-listbox (``_mlb``).  This is just
+        included for debugging purposes, to make sure that the
+        list-modifying operations are working correctly.
+        """
+        for col in self._mlb.listboxes:
+            assert len(self) == col.size()
+        for row in self:
+            assert len(row) == self._num_columns
+        assert self._num_columns == len(self._mlb.column_names)
+        #assert self._column_names == self._mlb.column_names
+        for i, row in enumerate(self):
+            for j, cell in enumerate(row):
+                if self._reprfunc is not None:
+                    cell = self._reprfunc(i, j, cell)
+                assert self._mlb.get(i)[j] == cell
+
+######################################################################
+# Demo/Test Function
+######################################################################
+
+# update this to use new WordNet API
+def demo():
+    root = Tk()
+    root.bind('<Control-q>', lambda e: root.destroy())
+
+    table = Table(root, 'Word Synset Hypernym Hyponym'.split(),
+                  column_weights=[0, 1, 1, 1],
+                  reprfunc=(lambda i,j,s: '  %s' % s))
+    table.pack(expand=True, fill='both')
+
+    from nltk.corpus import wordnet
+    from nltk.corpus import brown
+    for word, pos in sorted(set(brown.tagged_words()[:500])):
+        if pos[0] != 'N': continue
+        word = word.lower()
+        for synset in wordnet.synsets(word):
+            try:
+                hyper_def = synset.hypernyms()[0].definition()
+            except IndexError:
+                hyper_def = '*none*'
+            try:
+                hypo_def = synset.hyponyms()[0].definition()
+            except IndexError:
+                hypo_def = '*none*'
+            table.append([word,
+                          synset.definition(),
+                          hyper_def,
+                          hypo_def])
+
+    table.columnconfig('Word', background='#afa')
+    table.columnconfig('Synset', background='#efe')
+    table.columnconfig('Hypernym', background='#fee')
+    table.columnconfig('Hyponym', background='#ffe')
+    for row in range(len(table)):
+        for column in ('Hypernym', 'Hyponym'):
+            if table[row, column] == '*none*':
+                table.itemconfig(row, column, foreground='#666',
+                                 selectforeground='#666')
+    root.mainloop()
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/draw/table.pyc b/nlp_resource_data/nltk/draw/table.pyc
new file mode 100755 (executable)
index 0000000..f70e348
Binary files /dev/null and b/nlp_resource_data/nltk/draw/table.pyc differ
diff --git a/nlp_resource_data/nltk/draw/tree.py b/nlp_resource_data/nltk/draw/tree.py
new file mode 100755 (executable)
index 0000000..f421d13
--- /dev/null
@@ -0,0 +1,963 @@
+# Natural Language Toolkit: Graphical Representations for Trees
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Graphically display a Tree.
+"""
+
+from six.moves.tkinter import IntVar, Menu, Tk
+
+from nltk.util import in_idle
+from nltk.tree import Tree
+from nltk.draw.util import (CanvasFrame, CanvasWidget, BoxWidget,
+                            TextWidget, ParenWidget, OvalWidget)
+
+##//////////////////////////////////////////////////////
+##  Tree Segment
+##//////////////////////////////////////////////////////
+
+class TreeSegmentWidget(CanvasWidget):
+    """
+    A canvas widget that displays a single segment of a hierarchical
+    tree.  Each ``TreeSegmentWidget`` connects a single "node widget"
+    to a sequence of zero or more "subtree widgets".  By default, the
+    bottom of the node is connected to the top of each subtree by a
+    single line.  However, if the ``roof`` attribute is set, then a
+    single triangular "roof" will connect the node to all of its
+    children.
+
+    Attributes:
+      - ``roof``: What sort of connection to draw between the node and
+        its subtrees.  If ``roof`` is true, draw a single triangular
+        "roof" over the subtrees.  If ``roof`` is false, draw a line
+        between each subtree and the node.  Default value is false.
+      - ``xspace``: The amount of horizontal space to leave between
+        subtrees when managing this widget.  Default value is 10.
+      - ``yspace``: The amount of space to place between the node and
+        its children when managing this widget.  Default value is 15.
+      - ``color``: The color of the lines connecting the node to its
+        subtrees; and of the outline of the triangular roof.  Default
+        value is ``'#006060'``.
+      - ``fill``: The fill color for the triangular roof.  Default
+        value is ``''`` (no fill).
+      - ``width``: The width of the lines connecting the node to its
+        subtrees; and of the outline of the triangular roof.  Default
+        value is 1.
+      - ``orientation``: Determines whether the tree branches downwards
+        or rightwards.  Possible values are ``'horizontal'`` and
+        ``'vertical'``.  The default value is ``'vertical'`` (i.e.,
+        branch downwards).
+      - ``draggable``: whether the widget can be dragged by the user.
+    """
+    def __init__(self, canvas, label, subtrees, **attribs):
+        """
+        :type label: CanvasWidget
+        :type subtrees: list(CanvasWidget)
+        """
+        self._label = label
+        self._subtrees = subtrees
+
+        # Attributes
+        self._horizontal = 0
+        self._roof = 0
+        self._xspace = 10
+        self._yspace = 15
+        self._ordered = False
+
+        # Create canvas objects.
+        self._lines = [canvas.create_line(0,0,0,0, fill='#006060')
+                       for c in subtrees]
+        self._polygon = canvas.create_polygon(0,0, fill='', state='hidden',
+                                              outline='#006060')
+
+        # Register child widgets (label + subtrees)
+        self._add_child_widget(label)
+        for subtree in subtrees:
+            self._add_child_widget(subtree)
+
+        # Are we currently managing?
+        self._managing = False
+
+        CanvasWidget.__init__(self, canvas, **attribs)
+
+    def __setitem__(self, attr, value):
+        canvas = self.canvas()
+        if attr == 'roof':
+            self._roof = value
+            if self._roof:
+                for l in self._lines: canvas.itemconfig(l, state='hidden')
+                canvas.itemconfig(self._polygon, state='normal')
+            else:
+                for l in self._lines: canvas.itemconfig(l, state='normal')
+                canvas.itemconfig(self._polygon, state='hidden')
+        elif attr == 'orientation':
+            if value == 'horizontal': self._horizontal = 1
+            elif value == 'vertical': self._horizontal = 0
+            else:
+                raise ValueError('orientation must be horizontal or vertical')
+        elif attr == 'color':
+            for l in self._lines: canvas.itemconfig(l, fill=value)
+            canvas.itemconfig(self._polygon, outline=value)
+        elif isinstance(attr, tuple) and attr[0] == 'color':
+            # Set the color of an individual line.
+            l = self._lines[int(attr[1])]
+            canvas.itemconfig(l, fill=value)
+        elif attr == 'fill':
+            canvas.itemconfig(self._polygon, fill=value)
+        elif attr == 'width':
+            canvas.itemconfig(self._polygon, {attr:value})
+            for l in self._lines: canvas.itemconfig(l, {attr:value})
+        elif attr in ('xspace', 'yspace'):
+            if attr == 'xspace': self._xspace = value
+            elif attr == 'yspace': self._yspace = value
+            self.update(self._label)
+        elif attr == 'ordered':
+            self._ordered = value
+        else:
+            CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr == 'roof': return self._roof
+        elif attr == 'width':
+            return self.canvas().itemcget(self._polygon, attr)
+        elif attr == 'color':
+            return self.canvas().itemcget(self._polygon, 'outline')
+        elif isinstance(attr, tuple) and attr[0] == 'color':
+            l = self._lines[int(attr[1])]
+            return self.canvas().itemcget(l, 'fill')
+        elif attr == 'xspace': return self._xspace
+        elif attr == 'yspace': return self._yspace
+        elif attr == 'orientation':
+            if self._horizontal: return 'horizontal'
+            else: return 'vertical'
+        elif attr == 'ordered':
+            return self._ordered
+        else:
+            return CanvasWidget.__getitem__(self, attr)
+
+    def label(self):
+        return self._label
+
+    def subtrees(self):
+        return self._subtrees[:]
+
+    def set_label(self, label):
+        """
+        Set the node label to ``label``.
+        """
+        self._remove_child_widget(self._label)
+        self._add_child_widget(label)
+        self._label = label
+        self.update(self._label)
+
+    def replace_child(self, oldchild, newchild):
+        """
+        Replace the child ``oldchild`` with ``newchild``.
+        """
+        index = self._subtrees.index(oldchild)
+        self._subtrees[index] = newchild
+        self._remove_child_widget(oldchild)
+        self._add_child_widget(newchild)
+        self.update(newchild)
+
+    def remove_child(self, child):
+        index = self._subtrees.index(child)
+        del self._subtrees[index]
+        self._remove_child_widget(child)
+        self.canvas().delete(self._lines.pop())
+        self.update(self._label)
+
+    def insert_child(self, index, child):
+        canvas = self.canvas()
+        self._subtrees.insert(index, child)
+        self._add_child_widget(child)
+        self._lines.append(canvas.create_line(0,0,0,0, fill='#006060'))
+        self.update(self._label)
+
+    # but.. lines???
+
+    def _tags(self):
+        if self._roof:
+            return [self._polygon]
+        else:
+            return self._lines
+
+    def _subtree_top(self, child):
+        if isinstance(child, TreeSegmentWidget):
+            bbox = child.label().bbox()
+        else:
+            bbox = child.bbox()
+        if self._horizontal:
+            return (bbox[0], (bbox[1]+bbox[3])/2.0)
+        else:
+            return ((bbox[0]+bbox[2])/2.0, bbox[1])
+
+    def _node_bottom(self):
+        bbox = self._label.bbox()
+        if self._horizontal:
+            return (bbox[2], (bbox[1]+bbox[3])/2.0)
+        else:
+            return ((bbox[0]+bbox[2])/2.0, bbox[3])
+
+    def _update(self, child):
+        if len(self._subtrees) == 0: return
+        if self._label.bbox() is None: return # [XX] ???
+
+        # Which lines need to be redrawn?
+        if child is self._label: need_update = self._subtrees
+        else: need_update = [child]
+
+        if self._ordered and not self._managing:
+            need_update = self._maintain_order(child)
+
+        # Update the polygon.
+        (nodex, nodey) = self._node_bottom()
+        (xmin, ymin, xmax, ymax) = self._subtrees[0].bbox()
+        for subtree in self._subtrees[1:]:
+            bbox = subtree.bbox()
+            xmin = min(xmin, bbox[0])
+            ymin = min(ymin, bbox[1])
+            xmax = max(xmax, bbox[2])
+            ymax = max(ymax, bbox[3])
+
+        if self._horizontal:
+            self.canvas().coords(self._polygon, nodex, nodey, xmin,
+                                 ymin, xmin, ymax, nodex, nodey)
+        else:
+            self.canvas().coords(self._polygon, nodex, nodey, xmin,
+                                 ymin, xmax, ymin, nodex, nodey)
+
+        # Redraw all lines that need it.
+        for subtree in need_update:
+            (nodex, nodey) = self._node_bottom()
+            line = self._lines[self._subtrees.index(subtree)]
+            (subtreex, subtreey) = self._subtree_top(subtree)
+            self.canvas().coords(line, nodex, nodey, subtreex, subtreey)
+
+    def _maintain_order(self, child):
+        if self._horizontal:
+            return self._maintain_order_horizontal(child)
+        else:
+            return self._maintain_order_vertical(child)
+
+    def _maintain_order_vertical(self, child):
+        (left, top, right, bot) = child.bbox()
+
+        if child is self._label:
+            # Check all the leaves
+            for subtree in self._subtrees:
+                (x1, y1, x2, y2) = subtree.bbox()
+                if bot+self._yspace > y1:
+                    subtree.move(0,bot+self._yspace-y1)
+
+            return self._subtrees
+        else:
+            moved = [child]
+            index = self._subtrees.index(child)
+
+            # Check leaves to our right.
+            x = right + self._xspace
+            for i in range(index+1, len(self._subtrees)):
+                (x1, y1, x2, y2) = self._subtrees[i].bbox()
+                if x > x1:
+                    self._subtrees[i].move(x-x1, 0)
+                    x += x2-x1 + self._xspace
+                    moved.append(self._subtrees[i])
+
+            # Check leaves to our left.
+            x = left - self._xspace
+            for i in range(index-1, -1, -1):
+                (x1, y1, x2, y2) = self._subtrees[i].bbox()
+                if x < x2:
+                    self._subtrees[i].move(x-x2, 0)
+                    x -= x2-x1 + self._xspace
+                    moved.append(self._subtrees[i])
+
+            # Check the node
+            (x1, y1, x2, y2) = self._label.bbox()
+            if y2 > top-self._yspace:
+                self._label.move(0, top-self._yspace-y2)
+                moved = self._subtrees
+
+        # Return a list of the nodes we moved
+        return moved
+
+    def _maintain_order_horizontal(self, child):
+        (left, top, right, bot) = child.bbox()
+
+        if child is self._label:
+            # Check all the leaves
+            for subtree in self._subtrees:
+                (x1, y1, x2, y2) = subtree.bbox()
+                if right+self._xspace > x1:
+                    subtree.move(right+self._xspace-x1)
+
+            return self._subtrees
+        else:
+            moved = [child]
+            index = self._subtrees.index(child)
+
+            # Check leaves below us.
+            y = bot + self._yspace
+            for i in range(index+1, len(self._subtrees)):
+                (x1, y1, x2, y2) = self._subtrees[i].bbox()
+                if y > y1:
+                    self._subtrees[i].move(0, y-y1)
+                    y += y2-y1 + self._yspace
+                    moved.append(self._subtrees[i])
+
+            # Check leaves above us
+            y = top - self._yspace
+            for i in range(index-1, -1, -1):
+                (x1, y1, x2, y2) = self._subtrees[i].bbox()
+                if y < y2:
+                    self._subtrees[i].move(0, y-y2)
+                    y -= y2-y1 + self._yspace
+                    moved.append(self._subtrees[i])
+
+            # Check the node
+            (x1, y1, x2, y2) = self._label.bbox()
+            if x2 > left-self._xspace:
+                self._label.move(left-self._xspace-x2, 0)
+                moved = self._subtrees
+
+        # Return a list of the nodes we moved
+        return moved
+
+    def _manage_horizontal(self):
+        (nodex, nodey) = self._node_bottom()
+
+        # Put the subtrees in a line.
+        y = 20
+        for subtree in self._subtrees:
+            subtree_bbox = subtree.bbox()
+            dx = nodex - subtree_bbox[0] + self._xspace
+            dy = y - subtree_bbox[1]
+            subtree.move(dx, dy)
+            y += subtree_bbox[3] - subtree_bbox[1] + self._yspace
+
+        # Find the center of their tops.
+        center = 0.0
+        for subtree in self._subtrees:
+            center += self._subtree_top(subtree)[1]
+        center /= len(self._subtrees)
+
+        # Center the subtrees with the node.
+        for subtree in self._subtrees:
+            subtree.move(0, nodey-center)
+
+    def _manage_vertical(self):
+        (nodex, nodey) = self._node_bottom()
+
+        # Put the subtrees in a line.
+        x = 0
+        for subtree in self._subtrees:
+            subtree_bbox = subtree.bbox()
+            dy = nodey - subtree_bbox[1] + self._yspace
+            dx = x - subtree_bbox[0]
+            subtree.move(dx, dy)
+            x += subtree_bbox[2] - subtree_bbox[0] + self._xspace
+
+        # Find the center of their tops.
+        center = 0.0
+        for subtree in self._subtrees:
+            center += self._subtree_top(subtree)[0]/len(self._subtrees)
+
+        # Center the subtrees with the node.
+        for subtree in self._subtrees:
+            subtree.move(nodex-center, 0)
+
+    def _manage(self):
+        self._managing = True
+        (nodex, nodey) = self._node_bottom()
+        if len(self._subtrees) == 0: return
+
+        if self._horizontal: self._manage_horizontal()
+        else: self._manage_vertical()
+
+        # Update lines to subtrees.
+        for subtree in self._subtrees:
+            self._update(subtree)
+
+        self._managing = False
+
+    def __repr__(self):
+        return '[TreeSeg %s: %s]' % (self._label, self._subtrees)
+
+def _tree_to_treeseg(canvas, t, make_node, make_leaf,
+                     tree_attribs, node_attribs,
+                     leaf_attribs, loc_attribs):
+    if isinstance(t, Tree):
+        label = make_node(canvas, t.label(), **node_attribs)
+        subtrees = [_tree_to_treeseg(canvas, child, make_node, make_leaf,
+                                     tree_attribs, node_attribs,
+                                     leaf_attribs, loc_attribs)
+                    for child in t]
+        return TreeSegmentWidget(canvas, label, subtrees, **tree_attribs)
+    else:
+        return make_leaf(canvas, t, **leaf_attribs)
+
+def tree_to_treesegment(canvas, t, make_node=TextWidget,
+                        make_leaf=TextWidget, **attribs):
+    """
+    Convert a Tree into a ``TreeSegmentWidget``.
+
+    :param make_node: A ``CanvasWidget`` constructor or a function that
+        creates ``CanvasWidgets``.  ``make_node`` is used to convert
+        the Tree's nodes into ``CanvasWidgets``.  If no constructor
+        is specified, then ``TextWidget`` will be used.
+    :param make_leaf: A ``CanvasWidget`` constructor or a function that
+        creates ``CanvasWidgets``.  ``make_leaf`` is used to convert
+        the Tree's leaves into ``CanvasWidgets``.  If no constructor
+        is specified, then ``TextWidget`` will be used.
+    :param attribs: Attributes for the canvas widgets that make up the
+        returned ``TreeSegmentWidget``.  Any attribute beginning with
+        ``'tree_'`` will be passed to all ``TreeSegmentWidgets`` (with
+        the ``'tree_'`` prefix removed).  Any attribute beginning with
+        ``'node_'`` will be passed to all nodes.  Any attribute
+        beginning with ``'leaf_'`` will be passed to all leaves.  And
+        any attribute beginning with ``'loc_'`` will be passed to all
+        text locations (for Trees).
+    """
+    # Process attribs.
+    tree_attribs = {}
+    node_attribs = {}
+    leaf_attribs = {}
+    loc_attribs = {}
+
+    for (key, value) in list(attribs.items()):
+        if key[:5] == 'tree_': tree_attribs[key[5:]] = value
+        elif key[:5] == 'node_': node_attribs[key[5:]] = value
+        elif key[:5] == 'leaf_': leaf_attribs[key[5:]] = value
+        elif key[:4] == 'loc_': loc_attribs[key[4:]] = value
+        else: raise ValueError('Bad attribute: %s' % key)
+    return _tree_to_treeseg(canvas, t, make_node, make_leaf,
+                                tree_attribs, node_attribs,
+                                leaf_attribs, loc_attribs)
+
+##//////////////////////////////////////////////////////
+##  Tree Widget
+##//////////////////////////////////////////////////////
+
+class TreeWidget(CanvasWidget):
+    """
+    A canvas widget that displays a single Tree.
+    ``TreeWidget`` manages a group of ``TreeSegmentWidgets`` that are
+    used to display a Tree.
+
+    Attributes:
+
+      - ``node_attr``: Sets the attribute ``attr`` on all of the
+        node widgets for this ``TreeWidget``.
+      - ``leaf_attr``: Sets the attribute ``attr`` on all of the
+        leaf widgets for this ``TreeWidget``.
+      - ``loc_attr``: Sets the attribute ``attr`` on all of the
+        location widgets for this ``TreeWidget`` (if it was built from
+        a Tree).  Note that a location widget is a ``TextWidget``.
+
+      - ``xspace``: The amount of horizontal space to leave between
+        subtrees when managing this widget.  Default value is 10.
+      - ``yspace``: The amount of space to place between the node and
+        its children when managing this widget.  Default value is 15.
+
+      - ``line_color``: The color of the lines connecting each expanded
+        node to its subtrees.
+      - ``roof_color``: The color of the outline of the triangular roof
+        for collapsed trees.
+      - ``roof_fill``: The fill color for the triangular roof for
+        collapsed trees.
+      - ``line_width``: The width of the lines connecting each
+        expanded node to its subtrees, and of the outline of the
+        roof for collapsed trees.
+
+      - ``orientation``: Determines whether the tree branches downwards
+        or rightwards.  Possible values are ``'horizontal'`` and
+        ``'vertical'``.  The default value is ``'vertical'`` (i.e.,
+        branch downwards).
+
+      - ``shapeable``: whether the subtrees can be independently
+        dragged by the user.  This property simply sets the
+        ``draggable`` attribute on all of the ``TreeWidget``'s tree
+        segments.
+      - ``draggable``: whether the widget can be dragged by the user.
+    """
+    def __init__(self, canvas, t, make_node=TextWidget,
+                 make_leaf=TextWidget, **attribs):
+        # Node & leaf canvas widget constructors
+        self._make_node = make_node
+        self._make_leaf = make_leaf
+        self._tree = t
+
+        # Attributes.
+        self._nodeattribs = {}
+        self._leafattribs = {}
+        self._locattribs = {'color': '#008000'}
+        self._line_color = '#008080'
+        self._line_width = 1
+        self._roof_color = '#008080'
+        self._roof_fill = '#c0c0c0'
+        self._shapeable = False
+        self._xspace = 10
+        self._yspace = 10
+        self._orientation = 'vertical'
+        self._ordered = False
+
+        # Build trees.
+        self._keys = {} # treeseg -> key
+        self._expanded_trees = {}
+        self._collapsed_trees = {}
+        self._nodes = []
+        self._leaves = []
+        #self._locs = []
+        self._make_collapsed_trees(canvas, t, ())
+        self._treeseg = self._make_expanded_tree(canvas, t, ())
+        self._add_child_widget(self._treeseg)
+
+        CanvasWidget.__init__(self, canvas, **attribs)
+
+    def expanded_tree(self, *path_to_tree):
+        """
+        Return the ``TreeSegmentWidget`` for the specified subtree.
+
+        :param path_to_tree: A list of indices i1, i2, ..., in, where
+            the desired widget is the widget corresponding to
+            ``tree[i1][i2]...[in]``.
+            For the root, the path is ``()``.
+        """
+        return self._expanded_trees[path_to_tree]
+
+    def collapsed_tree(self, *path_to_tree):
+        """
+        Return the ``TreeSegmentWidget`` for the specified subtree.
+
+        :param path_to_tree: A list of indices i1, i2, ..., in, where
+            the desired widget is the widget corresponding to
+            ``tree[i1][i2]...[in]``.
+            For the root, the path is ``()``.
+        """
+        return self._collapsed_trees[path_to_tree]
+
+    def bind_click_trees(self, callback, button=1):
+        """
+        Add a binding to all tree segments.
+        """
+        for tseg in list(self._expanded_trees.values()):
+            tseg.bind_click(callback, button)
+        for tseg in list(self._collapsed_trees.values()):
+            tseg.bind_click(callback, button)
+
+    def bind_drag_trees(self, callback, button=1):
+        """
+        Add a binding to all tree segments.
+        """
+        for tseg in list(self._expanded_trees.values()):
+            tseg.bind_drag(callback, button)
+        for tseg in list(self._collapsed_trees.values()):
+            tseg.bind_drag(callback, button)
+
+    def bind_click_leaves(self, callback, button=1):
+        """
+        Add a binding to all leaves.
+        """
+        for leaf in self._leaves: leaf.bind_click(callback, button)
+
+    def bind_drag_leaves(self, callback, button=1):
+        """
+        Add a binding to all leaves.
+        """
+        for leaf in self._leaves: leaf.bind_drag(callback, button)
+
+    def bind_click_nodes(self, callback, button=1):
+        """
+        Add a binding to all nodes.
+        """
+        for node in self._nodes: node.bind_click(callback, button)
+
+    def bind_drag_nodes(self, callback, button=1):
+        """
+        Add a binding to all nodes.
+        """
+        for node in self._nodes: node.bind_drag(callback, button)
+
+    def _make_collapsed_trees(self, canvas, t, key):
+        if not isinstance(t, Tree): return
+        make_node = self._make_node
+        make_leaf = self._make_leaf
+
+        node = make_node(canvas, t.label(), **self._nodeattribs)
+        self._nodes.append(node)
+        leaves = [make_leaf(canvas, l, **self._leafattribs)
+                  for l in t.leaves()]
+        self._leaves += leaves
+        treeseg = TreeSegmentWidget(canvas, node, leaves, roof=1,
+                                    color=self._roof_color,
+                                    fill=self._roof_fill,
+                                    width=self._line_width)
+
+        self._collapsed_trees[key] = treeseg
+        self._keys[treeseg] = key
+        #self._add_child_widget(treeseg)
+        treeseg.hide()
+
+        # Build trees for children.
+        for i in range(len(t)):
+            child = t[i]
+            self._make_collapsed_trees(canvas, child, key + (i,))
+
+    def _make_expanded_tree(self, canvas, t, key):
+        make_node = self._make_node
+        make_leaf = self._make_leaf
+
+        if isinstance(t, Tree):
+            node = make_node(canvas, t.label(), **self._nodeattribs)
+            self._nodes.append(node)
+            children = t
+            subtrees = [self._make_expanded_tree(canvas, children[i], key+(i,))
+                        for i in range(len(children))]
+            treeseg = TreeSegmentWidget(canvas, node, subtrees,
+                                        color=self._line_color,
+                                        width=self._line_width)
+            self._expanded_trees[key] = treeseg
+            self._keys[treeseg] = key
+            return treeseg
+        else:
+            leaf = make_leaf(canvas, t, **self._leafattribs)
+            self._leaves.append(leaf)
+            return leaf
+
+    def __setitem__(self, attr, value):
+        if attr[:5] == 'node_':
+            for node in self._nodes: node[attr[5:]] = value
+        elif attr[:5] == 'leaf_':
+            for leaf in self._leaves: leaf[attr[5:]] = value
+        elif attr == 'line_color':
+            self._line_color = value
+            for tseg in list(self._expanded_trees.values()): tseg['color'] = value
+        elif attr == 'line_width':
+            self._line_width = value
+            for tseg in list(self._expanded_trees.values()): tseg['width'] = value
+            for tseg in list(self._collapsed_trees.values()): tseg['width'] = value
+        elif attr == 'roof_color':
+            self._roof_color = value
+            for tseg in list(self._collapsed_trees.values()): tseg['color'] = value
+        elif attr == 'roof_fill':
+            self._roof_fill = value
+            for tseg in list(self._collapsed_trees.values()): tseg['fill'] = value
+        elif attr == 'shapeable':
+            self._shapeable = value
+            for tseg in list(self._expanded_trees.values()):
+                tseg['draggable'] = value
+            for tseg in list(self._collapsed_trees.values()):
+                tseg['draggable'] = value
+            for leaf in self._leaves: leaf['draggable'] = value
+        elif attr == 'xspace':
+            self._xspace = value
+            for tseg in list(self._expanded_trees.values()):
+                tseg['xspace'] = value
+            for tseg in list(self._collapsed_trees.values()):
+                tseg['xspace'] = value
+            self.manage()
+        elif attr == 'yspace':
+            self._yspace = value
+            for tseg in list(self._expanded_trees.values()):
+                tseg['yspace'] = value
+            for tseg in list(self._collapsed_trees.values()):
+                tseg['yspace'] = value
+            self.manage()
+        elif attr == 'orientation':
+            self._orientation = value
+            for tseg in list(self._expanded_trees.values()):
+                tseg['orientation'] = value
+            for tseg in list(self._collapsed_trees.values()):
+                tseg['orientation'] = value
+            self.manage()
+        elif attr == 'ordered':
+            self._ordered = value
+            for tseg in list(self._expanded_trees.values()):
+                tseg['ordered'] = value
+            for tseg in list(self._collapsed_trees.values()):
+                tseg['ordered'] = value
+        else: CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr[:5] == 'node_':
+            return self._nodeattribs.get(attr[5:], None)
+        elif attr[:5] == 'leaf_':
+            return self._leafattribs.get(attr[5:], None)
+        elif attr[:4] == 'loc_':
+            return self._locattribs.get(attr[4:], None)
+        elif attr == 'line_color': return self._line_color
+        elif attr == 'line_width': return self._line_width
+        elif attr == 'roof_color': return self._roof_color
+        elif attr == 'roof_fill': return self._roof_fill
+        elif attr == 'shapeable': return self._shapeable
+        elif attr == 'xspace': return self._xspace
+        elif attr == 'yspace': return self._yspace
+        elif attr == 'orientation': return self._orientation
+        else: return CanvasWidget.__getitem__(self, attr)
+
+    def _tags(self): return []
+
+    def _manage(self):
+        segs = list(self._expanded_trees.values()) + list(self._collapsed_trees.values())
+        for tseg in segs:
+            if tseg.hidden():
+                tseg.show()
+                tseg.manage()
+                tseg.hide()
+
+    def toggle_collapsed(self, treeseg):
+        """
+        Collapse/expand a tree.
+        """
+        old_treeseg = treeseg
+        if old_treeseg['roof']:
+            new_treeseg = self._expanded_trees[self._keys[old_treeseg]]
+        else:
+            new_treeseg = self._collapsed_trees[self._keys[old_treeseg]]
+
+        # Replace the old tree with the new tree.
+        if old_treeseg.parent() is self:
+            self._remove_child_widget(old_treeseg)
+            self._add_child_widget(new_treeseg)
+            self._treeseg = new_treeseg
+        else:
+            old_treeseg.parent().replace_child(old_treeseg, new_treeseg)
+
+        # Move the new tree to where the old tree was.  Show it first,
+        # so we can find its bounding box.
+        new_treeseg.show()
+        (newx, newy) = new_treeseg.label().bbox()[:2]
+        (oldx, oldy) = old_treeseg.label().bbox()[:2]
+        new_treeseg.move(oldx-newx, oldy-newy)
+
+        # Hide the old tree
+        old_treeseg.hide()
+
+        # We could do parent.manage() here instead, if we wanted.
+        new_treeseg.parent().update(new_treeseg)
+
+##//////////////////////////////////////////////////////
+##  draw_trees
+##//////////////////////////////////////////////////////
+
+class TreeView(object):
+    def __init__(self, *trees):
+        from math import sqrt, ceil
+
+        self._trees = trees
+
+        self._top = Tk()
+        self._top.title('NLTK')
+        self._top.bind('<Control-x>', self.destroy)
+        self._top.bind('<Control-q>', self.destroy)
+
+        cf = self._cframe = CanvasFrame(self._top)
+        self._top.bind('<Control-p>', self._cframe.print_to_file)
+
+        # Size is variable.
+        self._size = IntVar(self._top)
+        self._size.set(12)
+        bold = ('helvetica', -self._size.get(), 'bold')
+        helv = ('helvetica', -self._size.get())
+
+        # Lay the trees out in a square.
+        self._width = int(ceil(sqrt(len(trees))))
+        self._widgets = []
+        for i in range(len(trees)):
+            widget = TreeWidget(cf.canvas(), trees[i], node_font=bold,
+                                leaf_color='#008040', node_color='#004080',
+                                roof_color='#004040', roof_fill='white',
+                                line_color='#004040', draggable=1,
+                                leaf_font=helv)
+            widget.bind_click_trees(widget.toggle_collapsed)
+            self._widgets.append(widget)
+            cf.add_widget(widget, 0, 0)
+
+        self._layout()
+        self._cframe.pack(expand=1, fill='both')
+        self._init_menubar()
+
+    def _layout(self):
+        i = x = y = ymax = 0
+        width = self._width
+        for i in range(len(self._widgets)):
+            widget = self._widgets[i]
+            (oldx, oldy) = widget.bbox()[:2]
+            if i % width == 0:
+                y = ymax
+                x = 0
+            widget.move(x-oldx, y-oldy)
+            x = widget.bbox()[2] + 10
+            ymax = max(ymax, widget.bbox()[3] + 10)
+
+    def _init_menubar(self):
+        menubar = Menu(self._top)
+
+        filemenu = Menu(menubar, tearoff=0)
+        filemenu.add_command(label='Print to Postscript', underline=0,
+                             command=self._cframe.print_to_file,
+                             accelerator='Ctrl-p')
+        filemenu.add_command(label='Exit', underline=1,
+                             command=self.destroy, accelerator='Ctrl-x')
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
+
+        zoommenu = Menu(menubar, tearoff=0)
+        zoommenu.add_radiobutton(label='Tiny', variable=self._size,
+                                 underline=0, value=10, command=self.resize)
+        zoommenu.add_radiobutton(label='Small', variable=self._size,
+                                 underline=0, value=12, command=self.resize)
+        zoommenu.add_radiobutton(label='Medium', variable=self._size,
+                                 underline=0, value=14, command=self.resize)
+        zoommenu.add_radiobutton(label='Large', variable=self._size,
+                                 underline=0, value=28, command=self.resize)
+        zoommenu.add_radiobutton(label='Huge', variable=self._size,
+                                 underline=0, value=50, command=self.resize)
+        menubar.add_cascade(label='Zoom', underline=0, menu=zoommenu)
+
+        self._top.config(menu=menubar)
+
+    def resize(self, *e):
+        bold = ('helvetica', -self._size.get(), 'bold')
+        helv = ('helvetica', -self._size.get())
+        xspace = self._size.get()
+        yspace = self._size.get()
+        for widget in self._widgets:
+            widget['node_font'] = bold
+            widget['leaf_font'] = helv
+            widget['xspace'] = xspace
+            widget['yspace'] = yspace
+            if self._size.get() < 20: widget['line_width'] = 1
+            elif self._size.get() < 30: widget['line_width'] = 2
+            else: widget['line_width'] = 3
+        self._layout()
+
+    def destroy(self, *e):
+        if self._top is None: return
+        self._top.destroy()
+        self._top = None
+
+    def mainloop(self, *args, **kwargs):
+        """
+        Enter the Tkinter mainloop.  This function must be called if
+        this demo is created from a non-interactive program (e.g.
+        from a script); otherwise, the demo will close as soon as
+        the script completes.
+        """
+        if in_idle(): return
+        self._top.mainloop(*args, **kwargs)
+
+def draw_trees(*trees):
+    """
+    Open a new window containing a graphical diagram of the given
+    trees.
+
+    :rtype: None
+    """
+    TreeView(*trees).mainloop()
+    return
+
+##//////////////////////////////////////////////////////
+##  Demo Code
+##//////////////////////////////////////////////////////
+
+def demo():
+    import random
+    def fill(cw):
+        cw['fill'] = '#%06d' % random.randint(0,999999)
+
+    cf = CanvasFrame(width=550, height=450, closeenough=2)
+
+    t = Tree.fromstring('''
+    (S (NP the very big cat)
+       (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))''')
+
+    tc = TreeWidget(cf.canvas(), t, draggable=1,
+                    node_font=('helvetica', -14, 'bold'),
+                    leaf_font=('helvetica', -12, 'italic'),
+                    roof_fill='white', roof_color='black',
+                    leaf_color='green4', node_color='blue2')
+    cf.add_widget(tc,10,10)
+
+    def boxit(canvas, text):
+        big = ('helvetica', -16, 'bold')
+        return BoxWidget(canvas, TextWidget(canvas, text,
+                                            font=big), fill='green')
+    def ovalit(canvas, text):
+        return OvalWidget(canvas, TextWidget(canvas, text),
+                          fill='cyan')
+
+    treetok = Tree.fromstring('(S (NP this tree) (VP (V is) (AdjP shapeable)))')
+    tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)
+
+    def color(node):
+        node['color'] = '#%04d00' % random.randint(0,9999)
+    def color2(treeseg):
+        treeseg.label()['fill'] = '#%06d' % random.randint(0,9999)
+        treeseg.label().child()['color'] = 'white'
+
+    tc.bind_click_trees(tc.toggle_collapsed)
+    tc2.bind_click_trees(tc2.toggle_collapsed)
+    tc.bind_click_nodes(color, 3)
+    tc2.expanded_tree(1).bind_click(color2, 3)
+    tc2.expanded_tree().bind_click(color2, 3)
+
+    paren = ParenWidget(cf.canvas(), tc2)
+    cf.add_widget(paren, tc.bbox()[2]+10, 10)
+
+    tree3 = Tree.fromstring('''
+    (S (NP this tree) (AUX was)
+       (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))''')
+    tc3 = tree_to_treesegment(cf.canvas(), tree3, tree_color='green4',
+                              tree_xspace=2, tree_width=2)
+    tc3['draggable'] = 1
+    cf.add_widget(tc3, 10, tc.bbox()[3]+10)
+
+    def orientswitch(treewidget):
+        if treewidget['orientation'] == 'horizontal':
+            treewidget.expanded_tree(1,1).subtrees()[0].set_text('vertical')
+            treewidget.collapsed_tree(1,1).subtrees()[0].set_text('vertical')
+            treewidget.collapsed_tree(1).subtrees()[1].set_text('vertical')
+            treewidget.collapsed_tree().subtrees()[3].set_text('vertical')
+            treewidget['orientation'] = 'vertical'
+        else:
+            treewidget.expanded_tree(1,1).subtrees()[0].set_text('horizontal')
+            treewidget.collapsed_tree(1,1).subtrees()[0].set_text('horizontal')
+            treewidget.collapsed_tree(1).subtrees()[1].set_text('horizontal')
+            treewidget.collapsed_tree().subtrees()[3].set_text('horizontal')
+            treewidget['orientation'] = 'horizontal'
+
+    text = """
+Try clicking, right clicking, and dragging
+different elements of each of the trees.
+The top-left tree is a TreeWidget built from
+a Tree.  The top-right is a TreeWidget built
+from a Tree, using non-default widget
+constructors for the nodes & leaves (BoxWidget
+and OvalWidget).  The bottom-left tree is
+built from tree_to_treesegment."""
+    twidget = TextWidget(cf.canvas(), text.strip())
+    textbox = BoxWidget(cf.canvas(), twidget, fill='white', draggable=1)
+    cf.add_widget(textbox, tc3.bbox()[2]+10, tc2.bbox()[3]+10)
+
+    tree4 = Tree.fromstring('(S (NP this tree) (VP (V is) (Adj horizontal)))')
+    tc4 = TreeWidget(cf.canvas(), tree4, draggable=1,
+                     line_color='brown2', roof_color='brown2',
+                     node_font=('helvetica', -12, 'bold'),
+                     node_color='brown4', orientation='horizontal')
+    tc4.manage()
+    cf.add_widget(tc4, tc3.bbox()[2]+10, textbox.bbox()[3]+10)
+    tc4.bind_click(orientswitch)
+    tc4.bind_click_trees(tc4.toggle_collapsed, 3)
+
+    # Run mainloop
+    cf.mainloop()
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/draw/tree.pyc b/nlp_resource_data/nltk/draw/tree.pyc
new file mode 100755 (executable)
index 0000000..9be2391
Binary files /dev/null and b/nlp_resource_data/nltk/draw/tree.pyc differ
diff --git a/nlp_resource_data/nltk/draw/util.py b/nlp_resource_data/nltk/draw/util.py
new file mode 100755 (executable)
index 0000000..e4006bc
--- /dev/null
@@ -0,0 +1,2356 @@
+# Natural Language Toolkit: Drawing utilities
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Tools for graphically displaying and interacting with the objects and
+processing classes defined by the Toolkit.  These tools are primarily
+intended to help students visualize the objects that they create.
+
+The graphical tools are typically built using "canvas widgets", each
+of which encapsulates the graphical elements and bindings used to
+display a complex object on a Tkinter ``Canvas``.  For example, NLTK
+defines canvas widgets for displaying trees and directed graphs, as
+well as a number of simpler widgets.  These canvas widgets make it
+easier to build new graphical tools and demos.  See the class
+documentation for ``CanvasWidget`` for more information.
+
+The ``nltk.draw`` module defines the abstract ``CanvasWidget`` base
+class, and a number of simple canvas widgets.  The remaining canvas
+widgets are defined by submodules, such as ``nltk.draw.tree``.
+
+The ``nltk.draw`` module also defines ``CanvasFrame``, which
+encapsulates a ``Canvas`` and its scrollbars.  It uses a
+``ScrollWatcherWidget`` to ensure that all canvas widgets contained on
+its canvas are within the scroll region.
+
+Acknowledgements: Many of the ideas behind the canvas widget system
+are derived from ``CLIG``, a Tk-based grapher for linguistic data
+structures.  For more information, see the CLIG
+homepage (http://www.ags.uni-sb.de/~konrad/clig.html).
+
+"""
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+from six.moves.tkinter import (Button, Canvas, Entry, Frame, Label, Menu,
+                               Menubutton, Scrollbar, StringVar, Text, Tk,
+                               Toplevel, Widget, RAISED)
+from six.moves.tkinter_tkfiledialog import asksaveasfilename
+
+from nltk.util import in_idle
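+
+# Illustrative sketch of the typical pattern described above (compare
+# nltk.draw.tree.demo()):
+#     cf = CanvasFrame(width=550, height=450)
+#     cf.add_widget(TextWidget(cf.canvas(), 'Hello'), 10, 10)
+#     cf.mainloop()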
+
+##//////////////////////////////////////////////////////
+##  CanvasWidget
+##//////////////////////////////////////////////////////
+
+
+@add_metaclass(ABCMeta)
+class CanvasWidget(object):
+    """
+    A collection of graphical elements and bindings used to display a
+    complex object on a Tkinter ``Canvas``.  A canvas widget is
+    responsible for managing the ``Canvas`` tags and callback bindings
+    necessary to display and interact with the object.  Canvas widgets
+    are often organized into hierarchies, where parent canvas widgets
+    control aspects of their child widgets.
+
+    Each canvas widget is bound to a single ``Canvas``.  This ``Canvas``
+    is specified as the first argument to the ``CanvasWidget``'s
+    constructor.
+
+    Attributes.  Each canvas widget can support a variety of
+    "attributes", which control how the canvas widget is displayed.
+    Some typical example attributes are ``color``, ``font``, and
+    ``radius``.  Each attribute has a default value.  This default
+    value can be overridden in the constructor, using keyword
+    arguments of the form ``attribute=value``:
+
+        >>> from nltk.draw.util import TextWidget
+        >>> cn = TextWidget(c, 'test', color='red')
+
+    Attribute values can also be changed after a canvas widget has
+    been constructed, using the ``__setitem__`` operator:
+
+        >>> cn['font'] = 'times'
+
+    The current value of an attribute value can be queried using the
+    ``__getitem__`` operator:
+
+        >>> cn['color']
+        red
+
+    For a list of the attributes supported by a type of canvas widget,
+    see its class documentation.
+
+    Interaction.  The attribute ``'draggable'`` controls whether the
+    user can drag a canvas widget around the canvas.  By default,
+    canvas widgets are not draggable.
+
+    ``CanvasWidget`` provides callback support for two types of user
+    interaction: clicking and dragging.  The method ``bind_click``
+    registers a callback function that is called whenever the canvas
+    widget is clicked.  The method ``bind_drag`` registers a callback
+    function that is called after the canvas widget is dragged.  If
+    the user clicks or drags a canvas widget with no registered
+    callback function, then the interaction event will propagate to
+    its parent.  For each canvas widget, only one callback function
+    may be registered for an interaction event.  Callback functions
+    can be deregistered with the ``unbind_click`` and ``unbind_drag``
+    methods.
+
+    Subclassing.  ``CanvasWidget`` is an abstract class.  Subclasses
+    are required to implement the following methods:
+
+      - ``__init__``: Builds a new canvas widget.  It must perform the
+        following three tasks (in order):
+          - Create any new graphical elements.
+          - Call ``_add_child_widget`` on each child widget.
+          - Call the ``CanvasWidget`` constructor.
+      - ``_tags``: Returns a list of the canvas tags for all graphical
+        elements managed by this canvas widget, not including
+        graphical elements managed by its child widgets.
+      - ``_manage``: Arranges the child widgets of this canvas widget.
+        This is typically only called when the canvas widget is
+        created.
+      - ``_update``: Update this canvas widget in response to a
+        change in a single child.
+
+    For a ``CanvasWidget`` with no child widgets, the default
+    definitions for ``_manage`` and ``_update`` may be used.
+
+    If a subclass defines any attributes, then it should implement
+    ``__getitem__`` and ``__setitem__``.  If either of these methods is
+    called with an unknown attribute, then they should propagate the
+    request to ``CanvasWidget``.
+
+    Most subclasses implement a number of additional methods that
+    modify the ``CanvasWidget`` in some way.  These methods must call
+    ``parent.update(self)`` after making any changes to the canvas
+    widget's graphical elements.  The canvas widget must also call
+    ``parent.update(self)`` after changing any attribute value that
+    affects the shape or position of the canvas widget's graphical
+    elements.
+
+    :type __canvas: Tkinter.Canvas
+    :ivar __canvas: This ``CanvasWidget``'s canvas.
+
+    :type __parent: CanvasWidget or None
+    :ivar __parent: This ``CanvasWidget``'s hierarchical parent widget.
+    :type __children: list(CanvasWidget)
+    :ivar __children: This ``CanvasWidget``'s hierarchical child widgets.
+
+    :type __updating: bool
+    :ivar __updating: Is this canvas widget currently performing an
+        update?  If it is, then it will ignore any new update requests
+        from child widgets.
+
+    :type __draggable: bool
+    :ivar __draggable: Is this canvas widget draggable?
+    :type __press: event
+    :ivar __press: The ButtonPress event that we're currently handling.
+    :type __drag_x: int
+    :ivar __drag_x: Where it's been moved to (to find dx)
+    :type __drag_y: int
+    :ivar __drag_y: Where it's been moved to (to find dy)
+    :type __callbacks: dictionary
+    :ivar __callbacks: Registered callbacks.  Currently, four keys are
+        used: ``1``, ``2``, ``3``, and ``'drag'``.  The values are
+        callback functions.  Each callback function takes a single
+        argument, which is the ``CanvasWidget`` that triggered the
+        callback.
+    """
+    def __init__(self, canvas, parent=None, **attribs):
+        """
+        Create a new canvas widget.  This constructor should only be
+        called by subclass constructors; and it should be called only
+        "after" the subclass has constructed all graphical canvas
+        objects and registered all child widgets.
+
+        :param canvas: This canvas widget's canvas.
+        :type canvas: Tkinter.Canvas
+        :param parent: This canvas widget's hierarchical parent.
+        :type parent: CanvasWidget
+        :param attribs: The new canvas widget's attributes.
+        """
+        if self.__class__ == CanvasWidget:
+            raise TypeError('CanvasWidget is an abstract base class')
+
+        if not isinstance(canvas, Canvas):
+            raise TypeError('Expected a canvas!')
+
+        self.__canvas = canvas
+        self.__parent = parent
+
+        # If the subclass constructor called _add_child_widget, then
+        # self.__children will already exist.
+        if not hasattr(self, '_CanvasWidget__children'): self.__children = []
+
+        # Is this widget hidden?
+        self.__hidden = 0
+
+        # Update control (prevents infinite loops)
+        self.__updating = 0
+
+        # Button-press and drag callback handling.
+        self.__press = None
+        self.__drag_x = self.__drag_y = 0
+        self.__callbacks = {}
+        self.__draggable = 0
+
+        # Set up attributes.
+        for (attr, value) in list(attribs.items()): self[attr] = value
+
+        # Manage this canvas widget
+        self._manage()
+
+        # Register any new bindings
+        for tag in self._tags():
+            self.__canvas.tag_bind(tag, '<ButtonPress-1>',
+                                   self.__press_cb)
+            self.__canvas.tag_bind(tag, '<ButtonPress-2>',
+                                   self.__press_cb)
+            self.__canvas.tag_bind(tag, '<ButtonPress-3>',
+                                   self.__press_cb)
+
+    ##//////////////////////////////////////////////////////
+    ##  Inherited methods.
+    ##//////////////////////////////////////////////////////
+
+    def bbox(self):
+        """
+        :return: A bounding box for this ``CanvasWidget``. The bounding
+            box is a tuple of four coordinates, *(xmin, ymin, xmax, ymax)*,
+            for a rectangle which encloses all of the canvas
+            widget's graphical elements.  Bounding box coordinates are
+            specified with respect to the coordinate space of the ``Canvas``.
+        :rtype: tuple(int, int, int, int)
+        """
+        if self.__hidden: return (0,0,0,0)
+        if len(self.tags()) == 0: raise ValueError('No tags')
+        return self.__canvas.bbox(*self.tags())
+
+    def width(self):
+        """
+        :return: The width of this canvas widget's bounding box, in
+            its ``Canvas``'s coordinate space.
+        :rtype: int
+        """
+        if len(self.tags()) == 0: raise ValueError('No tags')
+        bbox = self.__canvas.bbox(*self.tags())
+        return bbox[2]-bbox[0]
+
+    def height(self):
+        """
+        :return: The height of this canvas widget's bounding box, in
+            its ``Canvas``'s coordinate space.
+        :rtype: int
+        """
+        if len(self.tags()) == 0: raise ValueError('No tags')
+        bbox = self.__canvas.bbox(*self.tags())
+        return bbox[3]-bbox[1]
+
+    def parent(self):
+        """
+        :return: The hierarchical parent of this canvas widget.
+            ``self`` is considered a subpart of its parent for
+            purposes of user interaction.
+        :rtype: CanvasWidget or None
+        """
+        return self.__parent
+
+    def child_widgets(self):
+        """
+        :return: A list of the hierarchical children of this canvas
+            widget.  These children are considered part of ``self``
+            for purposes of user interaction.
+        :rtype: list of CanvasWidget
+        """
+        return self.__children
+
+    def canvas(self):
+        """
+        :return: The canvas that this canvas widget is bound to.
+        :rtype: Tkinter.Canvas
+        """
+        return self.__canvas
+
+    def move(self, dx, dy):
+        """
+        Move this canvas widget by a given distance.  In particular,
+        shift the canvas widget right by ``dx`` pixels, and down by
+        ``dy`` pixels.  Both ``dx`` and ``dy`` may be negative, resulting
+        in leftward or upward movement.
+
+        :type dx: int
+        :param dx: The number of pixels to move this canvas widget
+            rightwards.
+        :type dy: int
+        :param dy: The number of pixels to move this canvas widget
+            downwards.
+        :rtype: None
+        """
+        if dx == dy == 0: return
+        for tag in self.tags():
+            self.__canvas.move(tag, dx, dy)
+        if self.__parent: self.__parent.update(self)
+
+    def moveto(self, x, y, anchor='NW'):
+        """
+        Move this canvas widget to the given location.  In particular,
+        shift the canvas widget such that the corner or side of the
+        bounding box specified by ``anchor`` is at location (``x``,
+        ``y``).
+
+        :param x,y: The location that the canvas widget should be moved
+            to.
+        :param anchor: The corner or side of the canvas widget that
+            should be moved to the specified location.  ``'N'``
+            specifies the top center; ``'NE'`` specifies the top right
+            corner; etc.
+        """
+        x1,y1,x2,y2 = self.bbox()
+        if anchor == 'NW': self.move(x-x1,        y-y1)
+        if anchor == 'N':  self.move(x-x1/2-x2/2, y-y1)
+        if anchor == 'NE': self.move(x-x2,        y-y1)
+        if anchor == 'E':  self.move(x-x2,        y-y1/2-y2/2)
+        if anchor == 'SE': self.move(x-x2,        y-y2)
+        if anchor == 'S':  self.move(x-x1/2-x2/2, y-y2)
+        if anchor == 'SW': self.move(x-x1,        y-y2)
+        if anchor == 'W':  self.move(x-x1,        y-y1/2-y2/2)
+
+    def destroy(self):
+        """
+        Remove this ``CanvasWidget`` from its ``Canvas``.  After a
+        ``CanvasWidget`` has been destroyed, it should not be accessed.
+
+        Note that you only need to destroy a top-level
+        ``CanvasWidget``; its child widgets will be destroyed
+        automatically.  If you destroy a non-top-level
+        ``CanvasWidget``, then the entire top-level widget will be
+        destroyed.
+
+        :rtype: None
+        """
+        if self.__parent is not None:
+            self.__parent.destroy()
+            return
+
+        for tag in self.tags():
+            self.__canvas.tag_unbind(tag, '<ButtonPress-1>')
+            self.__canvas.tag_unbind(tag, '<ButtonPress-2>')
+            self.__canvas.tag_unbind(tag, '<ButtonPress-3>')
+        self.__canvas.delete(*self.tags())
+        self.__canvas = None
+
+    def update(self, child):
+        """
+        Update the graphical display of this canvas widget, and all of
+        its ancestors, in response to a change in one of this canvas
+        widget's children.
+
+        :param child: The child widget that changed.
+        :type child: CanvasWidget
+        """
+        if self.__hidden or child.__hidden: return
+        # If we're already updating, then do nothing.  This prevents
+        # infinite loops when _update modifies its children.
+        if self.__updating: return
+        self.__updating = 1
+
+        # Update this CanvasWidget.
+        self._update(child)
+
+        # Propagate update request to the parent.
+        if self.__parent: self.__parent.update(self)
+
+        # We're done updating.
+        self.__updating = 0
+
+    def manage(self):
+        """
+        Arrange this canvas widget and all of its descendants.
+
+        :rtype: None
+        """
+        if self.__hidden: return
+        for child in self.__children: child.manage()
+        self._manage()
+
+    def tags(self):
+        """
+        :return: a list of the canvas tags for all graphical
+            elements managed by this canvas widget, including
+            graphical elements managed by its child widgets.
+        :rtype: list of int
+        """
+        if self.__canvas is None:
+            raise ValueError('Attempt to access a destroyed canvas widget')
+        tags = []
+        tags += self._tags()
+        for child in self.__children:
+            tags += child.tags()
+        return tags
+
+    def __setitem__(self, attr, value):
+        """
+        Set the value of the attribute ``attr`` to ``value``.  See the
+        class documentation for a list of attributes supported by this
+        canvas widget.
+
+        :rtype: None
+        """
+        if attr == 'draggable':
+            self.__draggable = value
+        else:
+            raise ValueError('Unknown attribute %r' % attr)
+
+    def __getitem__(self, attr):
+        """
+        :return: the value of the attribute ``attr``.  See the class
+            documentation for a list of attributes supported by this
+            canvas widget.
+        :rtype: (any)
+        """
+        if attr == 'draggable':
+            return self.__draggable
+        else:
+            raise ValueError('Unknown attribute %r' % attr)
+
+    def __repr__(self):
+        """
+        :return: a string representation of this canvas widget.
+        :rtype: str
+        """
+        return '<%s>' % self.__class__.__name__
+
+    def hide(self):
+        """
+        Temporarily hide this canvas widget.
+
+        :rtype: None
+        """
+        self.__hidden = 1
+        for tag in self.tags():
+            self.__canvas.itemconfig(tag, state='hidden')
+
+    def show(self):
+        """
+        Show a hidden canvas widget.
+
+        :rtype: None
+        """
+        self.__hidden = 0
+        for tag in self.tags():
+            self.__canvas.itemconfig(tag, state='normal')
+
+    def hidden(self):
+        """
+        :return: True if this canvas widget is hidden.
+        :rtype: bool
+        """
+        return self.__hidden
+
+    ##//////////////////////////////////////////////////////
+    ##  Callback interface
+    ##//////////////////////////////////////////////////////
+
+    def bind_click(self, callback, button=1):
+        """
+        Register a new callback that will be called whenever this
+        ``CanvasWidget`` is clicked on.
+
+        :type callback: function
+        :param callback: The callback function that will be called
+            whenever this ``CanvasWidget`` is clicked.  This function
+            will be called with this ``CanvasWidget`` as its argument.
+        :type button: int
+        :param button: Which button the user should use to click on
+            this ``CanvasWidget``.  Typically, this should be 1 (left
+            button), 3 (right button), or 2 (middle button).
+        """
+        self.__callbacks[button] = callback
+
+    def bind_drag(self, callback):
+        """
+        Register a new callback that will be called after this
+        ``CanvasWidget`` is dragged.  This implicitly makes this
+        ``CanvasWidget`` draggable.
+
+        :type callback: function
+        :param callback: The callback function that will be called
+            whenever this ``CanvasWidget`` is dragged.  This function
+            will be called with this ``CanvasWidget`` as its argument.
+        """
+        self.__draggable = 1
+        self.__callbacks['drag'] = callback
+
+    def unbind_click(self, button=1):
+        """
+        Remove a callback that was registered with ``bind_click``.
+
+        :type button: int
+        :param button: The button whose click callback should be
+            removed.  Typically, this is 1 (left button), 3 (right
+            button), or 2 (middle button).
+        """
+        try: del self.__callbacks[button]
+        except KeyError: pass
+
+    def unbind_drag(self):
+        """
+        Remove a callback that was registered with ``bind_drag``.
+        """
+        try: del self.__callbacks['drag']
+        except KeyError: pass
+
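+    # A minimal usage sketch for the callback interface above (not
+    # executed; assumes a Tk display and the TextWidget class defined
+    # later in this module):
+    #
+    #     canvas = Canvas(Tk(), width=200, height=100)
+    #     canvas.pack()
+    #     label = TextWidget(canvas, 'click or drag me', draggable=1)
+    #     label.bind_click(lambda w: print('clicked %r' % w), button=1)
+    #     label.bind_drag(lambda w: print('dragged to %s' % (w.bbox(),)))
+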
+    ##//////////////////////////////////////////////////////
+    ##  Callback internals
+    ##//////////////////////////////////////////////////////
+
+    def __press_cb(self, event):
+        """
+        Handle a button-press event:
+          - record the button press event in ``self.__press``
+          - register a button-release callback.
+          - if this CanvasWidget or any of its ancestors are
+            draggable, then register the appropriate motion callback.
+        """
+        # If we're already waiting for a button release, then ignore
+        # this new button press.
+        if (self.__canvas.bind('<ButtonRelease-1>') or
+            self.__canvas.bind('<ButtonRelease-2>') or
+            self.__canvas.bind('<ButtonRelease-3>')):
+            return
+
+        # Unbind motion (just in case; this shouldn't be necessary)
+        self.__canvas.unbind('<Motion>')
+
+        # Record the button press event.
+        self.__press = event
+
+        # If any ancestor is draggable, set up a motion callback.
+        # (Only if they pressed button number 1)
+        if event.num == 1:
+            widget = self
+            while widget is not None:
+                if widget['draggable']:
+                    widget.__start_drag(event)
+                    break
+                widget = widget.parent()
+
+        # Set up the button release callback.
+        self.__canvas.bind('<ButtonRelease-%d>' % event.num,
+                          self.__release_cb)
+
+    def __start_drag(self, event):
+        """
+        Begin dragging this object:
+          - register a motion callback
+          - record the drag coordinates
+        """
+        self.__canvas.bind('<Motion>', self.__motion_cb)
+        self.__drag_x = event.x
+        self.__drag_y = event.y
+
+    def __motion_cb(self, event):
+        """
+        Handle a motion event:
+          - move this object to the new location
+          - record the new drag coordinates
+        """
+        self.move(event.x-self.__drag_x, event.y-self.__drag_y)
+        self.__drag_x = event.x
+        self.__drag_y = event.y
+
+    def __release_cb(self, event):
+        """
+        Handle a release callback:
+          - unregister motion & button release callbacks.
+          - decide whether they clicked, dragged, or cancelled
+          - call the appropriate handler.
+        """
+        # Unbind the button release & motion callbacks.
+        self.__canvas.unbind('<ButtonRelease-%d>' % event.num)
+        self.__canvas.unbind('<Motion>')
+
+        # Is it a click or a drag?
+        if (event.time - self.__press.time < 100 and
+            abs(event.x-self.__press.x) + abs(event.y-self.__press.y) < 5):
+            # Move it back, if we were dragging.
+            if self.__draggable and event.num == 1:
+                self.move(self.__press.x - self.__drag_x,
+                          self.__press.y - self.__drag_y)
+            self.__click(event.num)
+        elif event.num == 1:
+            self.__drag()
+
+        self.__press = None
+
+    def __drag(self):
+        """
+        If this ``CanvasWidget`` has a drag callback, then call it;
+        otherwise, find the closest ancestor with a drag callback, and
+        call it.  If no ancestors have a drag callback, do nothing.
+        """
+        if self.__draggable:
+            if 'drag' in self.__callbacks:
+                cb = self.__callbacks['drag']
+                try:
+                    cb(self)
+                except Exception:
+                    print('Error in drag callback for %r' % self)
+        elif self.__parent is not None:
+            self.__parent.__drag()
+
+    def __click(self, button):
+        """
+        If this ``CanvasWidget`` has a click callback, then call it;
+        otherwise, find the closest ancestor with a click callback, and
+        call it.  If no ancestors have a click callback, do nothing.
+        """
+        if button in self.__callbacks:
+            cb = self.__callbacks[button]
+            #try:
+            cb(self)
+            #except:
+            #    print 'Error in click callback for %r' % self
+            #    raise
+        elif self.__parent is not None:
+            self.__parent.__click(button)
+
+    ##//////////////////////////////////////////////////////
+    ##  Child/parent Handling
+    ##//////////////////////////////////////////////////////
+
+    def _add_child_widget(self, child):
+        """
+        Register a hierarchical child widget.  The child will be
+        considered part of this canvas widget for purposes of user
+        interaction.  ``_add_child_widget`` has two direct effects:
+          - It sets ``child``'s parent to this canvas widget.
+          - It adds ``child`` to the list of canvas widgets returned by
+            the ``child_widgets`` member function.
+
+        :param child: The new child widget.  ``child`` must not already
+            have a parent.
+        :type child: CanvasWidget
+        """
+        if not hasattr(self, '_CanvasWidget__children'): self.__children = []
+        if child.__parent is not None:
+            raise ValueError('%s already has a parent' % child)
+        child.__parent = self
+        self.__children.append(child)
+
+    def _remove_child_widget(self, child):
+        """
+        Remove a hierarchical child widget.  This child will no longer
+        be considered part of this canvas widget for purposes of user
+        interaction.  ``_remove_child_widget`` has two direct effects:
+          - It sets ``child``'s parent to None.
+          - It removes ``child`` from the list of canvas widgets
+            returned by the ``child_widgets`` member function.
+
+        :param child: The child widget to remove.  ``child`` must be a
+            child of this canvas widget.
+        :type child: CanvasWidget
+        """
+        self.__children.remove(child)
+        child.__parent = None
+
+    ##//////////////////////////////////////////////////////
+    ##  Defined by subclass
+    ##//////////////////////////////////////////////////////
+
+    @abstractmethod
+    def _tags(self):
+        """
+        :return: a list of canvas tags for all graphical elements
+            managed by this canvas widget, not including graphical
+            elements managed by its child widgets.
+        :rtype: list of int
+        """
+
+    def _manage(self):
+        """
+        Arrange the child widgets of this canvas widget.  This method
+        is called when the canvas widget is initially created.  It is
+        also called if the user calls the ``manage`` method on this
+        canvas widget or any of its ancestors.
+
+        :rtype: None
+        """
+
+    def _update(self, child):
+        """
+        Update this canvas widget in response to a change in one of
+        its children.
+
+        :param child: The child that changed.
+        :type child: CanvasWidget
+        :rtype: None
+        """
+
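+# Illustrative sketch of the subclass contract described above (a
+# hypothetical widget, not part of this module): a widget that manages a
+# single rectangle only needs to report its canvas tag; ``_manage`` and
+# ``_update`` may be left with their default no-op behavior because the
+# widget has no children.
+#
+#     class RectangleWidget(CanvasWidget):
+#         def __init__(self, canvas, x1, y1, x2, y2, **attribs):
+#             self._rect = canvas.create_rectangle(x1, y1, x2, y2)
+#             CanvasWidget.__init__(self, canvas, **attribs)
+#         def _tags(self):
+#             return [self._rect]
+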
+##//////////////////////////////////////////////////////
+##  Basic widgets.
+##//////////////////////////////////////////////////////
+
+class TextWidget(CanvasWidget):
+    """
+    A canvas widget that displays a single string of text.
+
+    Attributes:
+      - ``color``: the color of the text.
+      - ``font``: the font used to display the text.
+      - ``justify``: justification for multi-line texts.  Valid values
+        are ``left``, ``center``, and ``right``.
+      - ``width``: the width of the text.  If the text is wider than
+        this width, it will be line-wrapped at whitespace.
+      - ``draggable``: whether the text can be dragged by the user.
+    """
+    def __init__(self, canvas, text, **attribs):
+        """
+        Create a new text widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :type text: str
+        :param text: The string of text to display.
+        :param attribs: The new canvas widget's attributes.
+        """
+        self._text = text
+        self._tag = canvas.create_text(1, 1, text=text)
+        CanvasWidget.__init__(self, canvas, **attribs)
+
+    def __setitem__(self, attr, value):
+        if attr in ('color', 'font', 'justify', 'width'):
+            if attr == 'color': attr = 'fill'
+            self.canvas().itemconfig(self._tag, {attr:value})
+        else:
+            CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr == 'width':
+            return int(self.canvas().itemcget(self._tag, attr))
+        elif attr in ('color', 'font', 'justify'):
+            if attr == 'color': attr = 'fill'
+            return self.canvas().itemcget(self._tag, attr)
+        else:
+            return CanvasWidget.__getitem__(self, attr)
+
+    def _tags(self): return [self._tag]
+
+    def text(self):
+        """
+        :return: The text displayed by this text widget.
+        :rtype: str
+        """
+        return self.canvas().itemcget(self._tag, 'text')
+
+    def set_text(self, text):
+        """
+        Change the text that is displayed by this text widget.
+
+        :type text: str
+        :param text: The string of text to display.
+        :rtype: None
+        """
+        self.canvas().itemconfig(self._tag, text=text)
+        if self.parent() is not None:
+            self.parent().update(self)
+
+    def __repr__(self):
+        return '[Text: %r]' % self._text
+
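+# Illustrative sketch (not executed; assumes a Tk display): placing a
+# draggable ``TextWidget`` on a canvas.
+#
+#     canvas = Canvas(Tk(), width=200, height=100)
+#     canvas.pack()
+#     hello = TextWidget(canvas, 'Hello World', color='blue', draggable=1)
+#     hello.moveto(50, 40)
+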
+class SymbolWidget(TextWidget):
+    """
+    A canvas widget that displays special symbols, such as the
+    negation sign and the exists operator.  Symbols are specified by
+    name.  Currently, the following symbol names are defined: ``neg``,
+    ``disj``, ``conj``, ``lambda``, ``merge``, ``forall``, ``exists``,
+    ``subseteq``, ``subset``, ``notsubset``, ``emptyset``, ``imp``,
+    ``rightarrow``, ``equal``, ``notequal``, ``intersection``,
+    ``union``, ``epsilon``.
+
+    Attributes:
+
+    - ``color``: the color of the text.
+    - ``draggable``: whether the text can be dragged by the user.
+
+    :cvar SYMBOLS: A dictionary mapping from symbols to the character
+        in the ``symbol`` font used to render them.
+    """
+    SYMBOLS = {'neg':'\330', 'disj':'\332', 'conj': '\331',
+               'lambda': '\154', 'merge': '\304',
+               'forall': '\042', 'exists': '\044',
+               'subseteq': '\315', 'subset': '\314',
+               'notsubset': '\313', 'emptyset': '\306',
+               'imp': '\336', 'rightarrow': chr(222), #'\256',
+               'equal': '\75', 'notequal': '\271',
+               'intersection': '\307', 'union': '\310',
+               'epsilon': 'e',
+               }
+
+    def __init__(self, canvas, symbol, **attribs):
+        """
+        Create a new symbol widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :type symbol: str
+        :param symbol: The name of the symbol to display.
+        :param attribs: The new canvas widget's attributes.
+        """
+        attribs['font'] = 'symbol'
+        TextWidget.__init__(self, canvas, '', **attribs)
+        self.set_symbol(symbol)
+
+    def symbol(self):
+        """
+        :return: the name of the symbol that is displayed by this
+            symbol widget.
+        :rtype: str
+        """
+        return self._symbol
+
+    def set_symbol(self, symbol):
+        """
+        Change the symbol that is displayed by this symbol widget.
+
+        :type symbol: str
+        :param symbol: The name of the symbol to display.
+        """
+        if symbol not in SymbolWidget.SYMBOLS:
+            raise ValueError('Unknown symbol: %s' % symbol)
+        self._symbol = symbol
+        self.set_text(SymbolWidget.SYMBOLS[symbol])
+
+    def __repr__(self):
+        return '[Symbol: %r]' % self._symbol
+
+    @staticmethod
+    def symbolsheet(size=20):
+        """
+        Open a new Tkinter window that displays the entire alphabet
+        for the symbol font.  This is useful for constructing the
+        ``SymbolWidget.SYMBOLS`` dictionary.
+        """
+        top = Tk()
+        def destroy(e, top=top): top.destroy()
+        top.bind('q', destroy)
+        Button(top, text='Quit', command=top.destroy).pack(side='bottom')
+        text = Text(top, font=('helvetica', -size), width=20, height=30)
+        text.pack(side='left')
+        sb=Scrollbar(top, command=text.yview)
+        text['yscrollcommand']=sb.set
+        sb.pack(side='right', fill='y')
+        text.tag_config('symbol', font=('symbol', -size))
+        for i in range(256):
+            if i in (0,10): continue # null and newline
+            for k,v in list(SymbolWidget.SYMBOLS.items()):
+                if v == chr(i):
+                    text.insert('end', '%-10s\t' % k)
+                    break
+            else:
+                text.insert('end', '%-10d  \t' % i)
+            text.insert('end', '[%s]\n' % chr(i), 'symbol')
+        top.mainloop()
+
+
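+# Illustrative sketch (not executed; assumes a Tk display): rendering a
+# named symbol, and browsing the symbol font to find new entries for
+# ``SymbolWidget.SYMBOLS``.
+#
+#     canvas = Canvas(Tk(), width=100, height=60)
+#     canvas.pack()
+#     forall = SymbolWidget(canvas, 'forall', color='red')
+#     forall.moveto(20, 15)
+#     SymbolWidget.symbolsheet(size=24)
+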
+class AbstractContainerWidget(CanvasWidget):
+    """
+    An abstract class for canvas widgets that contain a single child,
+    such as ``BoxWidget`` and ``OvalWidget``.  Subclasses must define
+    a constructor, which should create any new graphical elements and
+    then call the ``AbstractContainerWidget`` constructor.  Subclasses
+    must also define the ``_update`` method and the ``_tags`` method;
+    and any subclasses that define attributes should define
+    ``__setitem__`` and ``__getitem__``.
+    """
+    def __init__(self, canvas, child, **attribs):
+        """
+        Create a new container widget.  This constructor should only
+        be called by subclass constructors.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :param child: The container's child widget.  ``child`` must not
+            have a parent.
+        :type child: CanvasWidget
+        :param attribs: The new canvas widget's attributes.
+        """
+        self._child = child
+        self._add_child_widget(child)
+        CanvasWidget.__init__(self, canvas, **attribs)
+
+    def _manage(self):
+        self._update(self._child)
+
+    def child(self):
+        """
+        :return: The child widget contained by this container widget.
+        :rtype: CanvasWidget
+        """
+        return self._child
+
+    def set_child(self, child):
+        """
+        Change the child widget contained by this container widget.
+
+        :param child: The new child widget.  ``child`` must not have a
+            parent.
+        :type child: CanvasWidget
+        :rtype: None
+        """
+        self._remove_child_widget(self._child)
+        self._add_child_widget(child)
+        self._child = child
+        self.update(child)
+
+    def __repr__(self):
+        name = self.__class__.__name__
+        if name[-6:] == 'Widget': name = name[:-6]
+        return '[%s: %r]' % (name, self._child)
+
+class BoxWidget(AbstractContainerWidget):
+    """
+    A canvas widget that places a box around a child widget.
+
+    Attributes:
+      - ``fill``: The color used to fill the interior of the box.
+      - ``outline``: The color used to draw the outline of the box.
+      - ``width``: The width of the outline of the box.
+      - ``margin``: The number of pixels of space left between the child
+        and the box.
+      - ``draggable``: whether the text can be dragged by the user.
+    """
+    def __init__(self, canvas, child, **attribs):
+        """
+        Create a new box widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :param child: The child widget.  ``child`` must not have a
+            parent.
+        :type child: CanvasWidget
+        :param attribs: The new canvas widget's attributes.
+        """
+        self._child = child
+        self._margin = 1
+        self._box = canvas.create_rectangle(1,1,1,1)
+        canvas.tag_lower(self._box)
+        AbstractContainerWidget.__init__(self, canvas, child, **attribs)
+
+    def __setitem__(self, attr, value):
+        if attr == 'margin': self._margin = value
+        elif attr in ('outline', 'fill', 'width'):
+            self.canvas().itemconfig(self._box, {attr:value})
+        else:
+            CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr == 'margin': return self._margin
+        elif attr == 'width':
+            return float(self.canvas().itemcget(self._box, attr))
+        elif attr in ('outline', 'fill', 'width'):
+            return self.canvas().itemcget(self._box, attr)
+        else:
+            return CanvasWidget.__getitem__(self, attr)
+
+    def _update(self, child):
+        (x1, y1, x2, y2) = child.bbox()
+        margin = self._margin + self['width']/2
+        self.canvas().coords(self._box, x1-margin, y1-margin,
+                             x2+margin, y2+margin)
+
+    def _tags(self): return [self._box]
+
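+# Illustrative sketch (not executed; assumes a Tk display): drawing a box
+# with a margin around a text label.
+#
+#     canvas = Canvas(Tk(), width=200, height=100)
+#     canvas.pack()
+#     label = TextWidget(canvas, 'boxed text')
+#     box = BoxWidget(canvas, label, fill='lightyellow', margin=5)
+#     box.moveto(30, 30)
+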
+class OvalWidget(AbstractContainerWidget):
+    """
+    A canvas widget that places an oval around a child widget.
+
+    Attributes:
+      - ``fill``: The color used to fill the interior of the oval.
+      - ``outline``: The color used to draw the outline of the oval.
+      - ``width``: The width of the outline of the oval.
+      - ``margin``: The number of pixels of space left between the child
+        and the oval.
+      - ``draggable``: whether the text can be dragged by the user.
+      - ``double``: If true, then a double-oval is drawn.
+      - ``circle``: If true, then the oval is constrained to be a circle
+        (the child's bounding box is treated as square).
+    """
+    def __init__(self, canvas, child, **attribs):
+        """
+        Create a new oval widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :param child: The child widget.  ``child`` must not have a
+            parent.
+        :type child: CanvasWidget
+        :param attribs: The new canvas widget's attributes.
+        """
+        self._child = child
+        self._margin = 1
+        self._oval = canvas.create_oval(1,1,1,1)
+        self._circle = attribs.pop('circle', False)
+        self._double = attribs.pop('double', False)
+        if self._double:
+            self._oval2 = canvas.create_oval(1,1,1,1)
+        else:
+            self._oval2 = None
+        canvas.tag_lower(self._oval)
+        AbstractContainerWidget.__init__(self, canvas, child, **attribs)
+
+    def __setitem__(self, attr, value):
+        c = self.canvas()
+        if attr == 'margin': self._margin = value
+        elif attr == 'double':
+            if value==True and self._oval2 is None:
+                # Copy attributes & position from self._oval.
+                x1, y1, x2, y2 = c.bbox(self._oval)
+                w = self['width']*2
+                self._oval2 = c.create_oval(x1-w, y1-w, x2+w, y2+w,
+                                outline=c.itemcget(self._oval, 'outline'),
+                                width=c.itemcget(self._oval, 'width'))
+                c.tag_lower(self._oval2)
+            if value==False and self._oval2 is not None:
+                c.delete(self._oval2)
+                self._oval2 = None
+        elif attr in ('outline', 'fill', 'width'):
+            c.itemconfig(self._oval, {attr:value})
+            if self._oval2 is not None and attr != 'fill':
+                c.itemconfig(self._oval2, {attr: value})
+        else:
+            CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr == 'margin': return self._margin
+        elif attr == 'double': return self._oval2 is not None
+        elif attr == 'width':
+            return float(self.canvas().itemcget(self._oval, attr))
+        elif attr in ('outline', 'fill', 'width'):
+            return self.canvas().itemcget(self._oval, attr)
+        else:
+            return CanvasWidget.__getitem__(self, attr)
+
+    # The ratio between inscribed & circumscribed ovals
+    RATIO = 1.4142135623730949
+
+    def _update(self, child):
+        R = OvalWidget.RATIO
+        (x1, y1, x2, y2) = child.bbox()
+        margin = self._margin
+
+        # If we're a circle, pretend our contents are square.
+        if self._circle:
+            dx, dy = abs(x1-x2), abs(y1-y2)
+            if dx > dy:
+                y = (y1+y2)/2
+                y1, y2 = y-dx/2, y+dx/2
+            elif dy > dx:
+                x = (x1+x2)/2
+                x1, x2 = x-dy/2, x+dy/2
+
+        # Find the four corners.
+        left = int(( x1*(1+R) + x2*(1-R) ) / 2)
+        right = left + int((x2-x1)*R)
+        top = int(( y1*(1+R) + y2*(1-R) ) / 2)
+        bot = top + int((y2-y1)*R)
+        self.canvas().coords(self._oval, left-margin, top-margin,
+                             right+margin, bot+margin)
+        if self._oval2 is not None:
+            self.canvas().coords(self._oval2, left-margin+2, top-margin+2,
+                                 right+margin-2, bot+margin-2)
+
+    def _tags(self):
+        if self._oval2 is None:
+            return [self._oval]
+        else:
+            return [self._oval, self._oval2]
+
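+# Illustrative sketch (not executed; assumes a Tk display): the ``circle``
+# and ``double`` attributes can be combined, e.g. to draw the kind of
+# double circle used for final states in automaton diagrams.
+#
+#     canvas = Canvas(Tk(), width=200, height=120)
+#     canvas.pack()
+#     state = OvalWidget(canvas, TextWidget(canvas, 'q0'),
+#                        circle=True, double=True, margin=4)
+#     state.moveto(40, 30)
+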
+class ParenWidget(AbstractContainerWidget):
+    """
+    A canvas widget that places a pair of parentheses around a child
+    widget.
+
+    Attributes:
+      - ``color``: The color used to draw the parentheses.
+      - ``width``: The width of the parentheses.
+      - ``draggable``: whether the text can be dragged by the user.
+    """
+    def __init__(self, canvas, child, **attribs):
+        """
+        Create a new parenthesis widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :param child: The child widget.  ``child`` must not have a
+            parent.
+        :type child: CanvasWidget
+        :param attribs: The new canvas widget's attributes.
+        """
+        self._child = child
+        self._oparen = canvas.create_arc(1,1,1,1, style='arc',
+                                         start=90, extent=180)
+        self._cparen = canvas.create_arc(1,1,1,1, style='arc',
+                                         start=-90, extent=180)
+        AbstractContainerWidget.__init__(self, canvas, child, **attribs)
+
+    def __setitem__(self, attr, value):
+        if attr == 'color':
+            self.canvas().itemconfig(self._oparen, outline=value)
+            self.canvas().itemconfig(self._cparen, outline=value)
+        elif attr == 'width':
+            self.canvas().itemconfig(self._oparen, width=value)
+            self.canvas().itemconfig(self._cparen, width=value)
+        else:
+            CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr == 'color':
+            return self.canvas().itemcget(self._oparen, 'outline')
+        elif attr == 'width':
+            return self.canvas().itemcget(self._oparen, 'width')
+        else:
+            return CanvasWidget.__getitem__(self, attr)
+
+    def _update(self, child):
+        (x1, y1, x2, y2) = child.bbox()
+        width = max((y2-y1)/6, 4)
+        self.canvas().coords(self._oparen, x1-width, y1, x1+width, y2)
+        self.canvas().coords(self._cparen, x2-width, y1, x2+width, y2)
+
+    def _tags(self): return [self._oparen, self._cparen]
+
+class BracketWidget(AbstractContainerWidget):
+    """
+    A canvas widget that places a pair of brackets around a child
+    widget.
+
+    Attributes:
+      - ``color``: The color used to draw the brackets.
+      - ``width``: The width of the brackets.
+      - ``draggable``: whether the text can be dragged by the user.
+    """
+    def __init__(self, canvas, child, **attribs):
+        """
+        Create a new bracket widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :param child: The child widget.  ``child`` must not have a
+            parent.
+        :type child: CanvasWidget
+        :param attribs: The new canvas widget's attributes.
+        """
+        self._child = child
+        self._obrack = canvas.create_line(1,1,1,1,1,1,1,1)
+        self._cbrack = canvas.create_line(1,1,1,1,1,1,1,1)
+        AbstractContainerWidget.__init__(self, canvas, child, **attribs)
+
+    def __setitem__(self, attr, value):
+        if attr == 'color':
+            self.canvas().itemconfig(self._obrack, fill=value)
+            self.canvas().itemconfig(self._cbrack, fill=value)
+        elif attr == 'width':
+            self.canvas().itemconfig(self._obrack, width=value)
+            self.canvas().itemconfig(self._cbrack, width=value)
+        else:
+            CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr == 'color':
+            return self.canvas().itemcget(self._obrack, 'fill')
+        elif attr == 'width':
+            return self.canvas().itemcget(self._obrack, 'width')
+        else:
+            return CanvasWidget.__getitem__(self, attr)
+
+    def _update(self, child):
+        (x1, y1, x2, y2) = child.bbox()
+        width = max((y2-y1)/8, 2)
+        self.canvas().coords(self._obrack, x1, y1, x1-width, y1,
+                             x1-width, y2, x1, y2)
+        self.canvas().coords(self._cbrack, x2, y1, x2+width, y1,
+                             x2+width, y2, x2, y2)
+
+    def _tags(self): return [self._obrack, self._cbrack]
+
+class SequenceWidget(CanvasWidget):
+    """
+    A canvas widget that keeps a list of canvas widgets in a
+    horizontal line.
+
+    Attributes:
+      - ``align``: The vertical alignment of the children.  Possible
+        values are ``'top'``, ``'center'``, and ``'bottom'``.  By
+        default, children are center-aligned.
+      - ``space``: The amount of horizontal space to place between
+        children.  By default, one pixel of space is used.
+      - ``ordered``: If true, then keep the children in their
+        original order.
+    """
+    def __init__(self, canvas, *children, **attribs):
+        """
+        Create a new sequence widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :param children: The widgets that should be aligned
+            horizontally.  Each child must not have a parent.
+        :type children: list(CanvasWidget)
+        :param attribs: The new canvas widget's attributes.
+        """
+        self._align = 'center'
+        self._space = 1
+        self._ordered = False
+        self._children = list(children)
+        for child in children: self._add_child_widget(child)
+        CanvasWidget.__init__(self, canvas, **attribs)
+
+    def __setitem__(self, attr, value):
+        if attr == 'align':
+            if value not in ('top', 'bottom', 'center'):
+                raise ValueError('Bad alignment: %r' % value)
+            self._align = value
+        elif attr == 'space': self._space = value
+        elif attr == 'ordered': self._ordered = value
+        else: CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr == 'align': return self._align
+        elif attr == 'space': return self._space
+        elif attr == 'ordered': return self._ordered
+        else: return CanvasWidget.__getitem__(self, attr)
+
+    def _tags(self): return []
+
+    def _yalign(self, top, bot):
+        if self._align == 'top': return top
+        if self._align == 'bottom': return bot
+        if self._align == 'center': return (top+bot)/2
+
+    def _update(self, child):
+        # Align all children with child.
+        (left, top, right, bot) = child.bbox()
+        y = self._yalign(top, bot)
+        for c in self._children:
+            (x1, y1, x2, y2) = c.bbox()
+            c.move(0, y-self._yalign(y1,y2))
+
+        if self._ordered and len(self._children) > 1:
+            index = self._children.index(child)
+
+            x = right + self._space
+            for i in range(index+1, len(self._children)):
+                (x1, y1, x2, y2) = self._children[i].bbox()
+                if x > x1:
+                    self._children[i].move(x-x1, 0)
+                    x += x2-x1 + self._space
+
+            x = left - self._space
+            for i in range(index-1, -1, -1):
+                (x1, y1, x2, y2) = self._children[i].bbox()
+                if x < x2:
+                    self._children[i].move(x-x2, 0)
+                    x -= x2-x1 + self._space
+
+    def _manage(self):
+        if len(self._children) == 0: return
+        child = self._children[0]
+
+        # Align all children with child.
+        (left, top, right, bot) = child.bbox()
+        y = self._yalign(top, bot)
+
+        index = self._children.index(child)
+
+        # Line up children to the right of child.
+        x = right + self._space
+        for i in range(index+1, len(self._children)):
+            (x1, y1, x2, y2) = self._children[i].bbox()
+            self._children[i].move(x-x1, y-self._yalign(y1,y2))
+            x += x2-x1 + self._space
+
+        # Line up children to the left of child.
+        x = left - self._space
+        for i in range(index-1, -1, -1):
+            (x1, y1, x2, y2) = self._children[i].bbox()
+            self._children[i].move(x-x2, y-self._yalign(y1,y2))
+            x -= x2-x1 + self._space
+
+    def __repr__(self):
+        return '[Sequence: ' + repr(self._children)[1:-1]+']'
+
+    # Provide an alias for the child_widgets() member.
+    children = CanvasWidget.child_widgets
+
+    def replace_child(self, oldchild, newchild):
+        """
+        Replace the child canvas widget ``oldchild`` with ``newchild``.
+        ``newchild`` must not have a parent.  ``oldchild``'s parent will
+        be set to None.
+
+        :type oldchild: CanvasWidget
+        :param oldchild: The child canvas widget to remove.
+        :type newchild: CanvasWidget
+        :param newchild: The canvas widget that should replace
+            ``oldchild``.
+        """
+        index = self._children.index(oldchild)
+        self._children[index] = newchild
+        self._remove_child_widget(oldchild)
+        self._add_child_widget(newchild)
+        self.update(newchild)
+
+    def remove_child(self, child):
+        """
+        Remove the given child canvas widget.  ``child``'s parent will
+        be set to None.
+
+        :type child: CanvasWidget
+        :param child: The child canvas widget to remove.
+        """
+        index = self._children.index(child)
+        del self._children[index]
+        self._remove_child_widget(child)
+        if len(self._children) > 0:
+            self.update(self._children[0])
+
+    def insert_child(self, index, child):
+        """
+        Insert a child canvas widget before a given index.
+
+        :type child: CanvasWidget
+        :param child: The canvas widget that should be inserted.
+        :type index: int
+        :param index: The index where the child widget should be
+            inserted.  In particular, the index of ``child`` will be
+            ``index``; and the index of any children whose indices were
+            greater than or equal to ``index`` before ``child`` was
+            inserted will be incremented by one.
+        """
+        self._children.insert(index, child)
+        self._add_child_widget(child)
+
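+# Illustrative sketch (not executed; assumes a Tk display): laying out a
+# row of bottom-aligned text widgets with 10 pixels of spacing.
+#
+#     canvas = Canvas(Tk(), width=300, height=100)
+#     canvas.pack()
+#     words = [TextWidget(canvas, w) for w in 'the quick brown fox'.split()]
+#     row = SequenceWidget(canvas, *words, align='bottom', space=10)
+#     row.moveto(10, 40)
+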
+class StackWidget(CanvasWidget):
+    """
+    A canvas widget that keeps a list of canvas widgets in a vertical
+    line.
+
+    Attributes:
+      - ``align``: The horizontal alignment of the children.  Possible
+        values are ``'left'``, ``'center'``, and ``'right'``.  By
+        default, children are center-aligned.
+      - ``space``: The amount of vertical space to place between
+        children.  By default, one pixel of space is used.
+      - ``ordered``: If true, then keep the children in their
+        original order.
+    """
+    def __init__(self, canvas, *children, **attribs):
+        """
+        Create a new stack widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :param children: The widgets that should be aligned
+            vertically.  Each child must not have a parent.
+        :type children: list(CanvasWidget)
+        :param attribs: The new canvas widget's attributes.
+        """
+        self._align = 'center'
+        self._space = 1
+        self._ordered = False
+        self._children = list(children)
+        for child in children: self._add_child_widget(child)
+        CanvasWidget.__init__(self, canvas, **attribs)
+
+    def __setitem__(self, attr, value):
+        if attr == 'align':
+            if value not in ('left', 'right', 'center'):
+                raise ValueError('Bad alignment: %r' % value)
+            self._align = value
+        elif attr == 'space': self._space = value
+        elif attr == 'ordered': self._ordered = value
+        else: CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr == 'align': return self._align
+        elif attr == 'space': return self._space
+        elif attr == 'ordered': return self._ordered
+        else: return CanvasWidget.__getitem__(self, attr)
+
+    def _tags(self): return []
+
+    def _xalign(self, left, right):
+        if self._align == 'left': return left
+        if self._align == 'right': return right
+        if self._align == 'center': return (left+right)/2
+
+    def _update(self, child):
+        # Align all children with child.
+        (left, top, right, bot) = child.bbox()
+        x = self._xalign(left, right)
+        for c in self._children:
+            (x1, y1, x2, y2) = c.bbox()
+            c.move(x-self._xalign(x1,x2), 0)
+
+        if self._ordered and len(self._children) > 1:
+            index = self._children.index(child)
+
+            y = bot + self._space
+            for i in range(index+1, len(self._children)):
+                (x1, y1, x2, y2) = self._children[i].bbox()
+                if y > y1:
+                    self._children[i].move(0, y-y1)
+                    y += y2-y1 + self._space
+
+            y = top - self._space
+            for i in range(index-1, -1, -1):
+                (x1, y1, x2, y2) = self._children[i].bbox()
+                if y < y2:
+                    self._children[i].move(0, y-y2)
+                    y -= y2-y1 + self._space
+
+    def _manage(self):
+        if len(self._children) == 0: return
+        child = self._children[0]
+
+        # Align all children with child.
+        (left, top, right, bot) = child.bbox()
+        x = self._xalign(left, right)
+
+        index = self._children.index(child)
+
+        # Line up children below the child.
+        y = bot + self._space
+        for i in range(index+1, len(self._children)):
+            (x1, y1, x2, y2) = self._children[i].bbox()
+            self._children[i].move(x-self._xalign(x1,x2), y-y1)
+            y += y2-y1 + self._space
+
+        # Line up children above the child.
+        y = top - self._space
+        for i in range(index-1, -1, -1):
+            (x1, y1, x2, y2) = self._children[i].bbox()
+            self._children[i].move(x-self._xalign(x1,x2), y-y2)
+            y -= y2-y1 + self._space
+
+    def __repr__(self):
+        return '[Stack: ' + repr(self._children)[1:-1]+']'
+
+    # Provide an alias for the child_widgets() member.
+    children = CanvasWidget.child_widgets
+
+    def replace_child(self, oldchild, newchild):
+        """
+        Replace the child canvas widget ``oldchild`` with ``newchild``.
+        ``newchild`` must not have a parent.  ``oldchild``'s parent will
+        be set to None.
+
+        :type oldchild: CanvasWidget
+        :param oldchild: The child canvas widget to remove.
+        :type newchild: CanvasWidget
+        :param newchild: The canvas widget that should replace
+            ``oldchild``.
+        """
+        index = self._children.index(oldchild)
+        self._children[index] = newchild
+        self._remove_child_widget(oldchild)
+        self._add_child_widget(newchild)
+        self.update(newchild)
+
+    def remove_child(self, child):
+        """
+        Remove the given child canvas widget.  ``child``'s parent will
+        be set to None.
+
+        :type child: CanvasWidget
+        :param child: The child canvas widget to remove.
+        """
+        index = self._children.index(child)
+        del self._children[index]
+        self._remove_child_widget(child)
+        if len(self._children) > 0:
+            self.update(self._children[0])
+
+    def insert_child(self, index, child):
+        """
+        Insert a child canvas widget before a given index.
+
+        :type child: CanvasWidget
+        :param child: The canvas widget that should be inserted.
+        :type index: int
+        :param index: The index where the child widget should be
+            inserted.  In particular, the index of ``child`` will be
+            ``index``; and the index of any children whose indices were
+            greater than or equal to ``index`` before ``child`` was
+            inserted will be incremented by one.
+        """
+        self._children.insert(index, child)
+        self._add_child_widget(child)
+
+class SpaceWidget(CanvasWidget):
+    """
+    A canvas widget that takes up space but does not display
+    anything.  A ``SpaceWidget`` can be used to add space between
+    elements.  Each space widget is characterized by a width and a
+    height.  If you wish to only create horizontal space, then use a
+    height of zero; and if you wish to only create vertical space, use
+    a width of zero.
+    """
+    def __init__(self, canvas, width, height, **attribs):
+        """
+        Create a new space widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :type width: int
+        :param width: The width of the new space widget.
+        :type height: int
+        :param height: The height of the new space widget.
+        :param attribs: The new canvas widget's attributes.
+        """
+        # For some reason, Tk reports the bounding box of a line item as
+        # a few pixels larger than the line itself; shrink the requested
+        # size slightly so that width() and height() come out close to
+        # the values that were asked for.
+        if width > 4: width -= 4
+        if height > 4: height -= 4
+        self._tag = canvas.create_line(1, 1, width, height, fill='')
+        CanvasWidget.__init__(self, canvas, **attribs)
+
+    # note: width() and height() are already defined by CanvasWidget.
+    def set_width(self, width):
+        """
+        Change the width of this space widget.
+
+        :param width: The new width.
+        :type width: int
+        :rtype: None
+        """
+        [x1, y1, x2, y2] = self.bbox()
+        self.canvas().coords(self._tag, x1, y1, x1+width, y2)
+
+    def set_height(self, height):
+        """
+        Change the height of this space widget.
+
+        :param height: The new height.
+        :type height: int
+        :rtype: None
+        """
+        [x1, y1, x2, y2] = self.bbox()
+        self.canvas().coords(self._tag, x1, y1, x2, y1+height)
+
+    def _tags(self): return [self._tag]
+
+    def __repr__(self): return '[Space]'
+
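+# Illustrative sketch (not executed; assumes a Tk display): stacking
+# widgets vertically, using a ``SpaceWidget`` to leave a 10-pixel gap.
+#
+#     canvas = Canvas(Tk(), width=200, height=150)
+#     canvas.pack()
+#     column = StackWidget(canvas,
+#                          TextWidget(canvas, 'header'),
+#                          SpaceWidget(canvas, 0, 10),
+#                          TextWidget(canvas, 'body'),
+#                          align='left')
+#     column.moveto(20, 20)
+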
+class ScrollWatcherWidget(CanvasWidget):
+    """
+    A special canvas widget that adjusts its ``Canvas``'s scrollregion
+    to always include the bounding boxes of all of its children.  The
+    scroll-watcher widget will only increase the size of the
+    ``Canvas``'s scrollregion; it will never decrease it.
+    """
+    def __init__(self, canvas, *children, **attribs):
+        """
+        Create a new scroll-watcher widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :type children: list(CanvasWidget)
+        :param children: The canvas widgets watched by the
+            scroll-watcher.  The scroll-watcher will ensure that these
+            canvas widgets are always contained in their canvas's
+            scrollregion.
+        :param attribs: The new canvas widget's attributes.
+        """
+        for child in children: self._add_child_widget(child)
+        CanvasWidget.__init__(self, canvas, **attribs)
+
+    def add_child(self, canvaswidget):
+        """
+        Add a new canvas widget to the scroll-watcher.  The
+        scroll-watcher will ensure that the new canvas widget is
+        always contained in its canvas's scrollregion.
+
+        :param canvaswidget: The new canvas widget.
+        :type canvaswidget: CanvasWidget
+        :rtype: None
+        """
+        self._add_child_widget(canvaswidget)
+        self.update(canvaswidget)
+
+    def remove_child(self, canvaswidget):
+        """
+        Remove a canvas widget from the scroll-watcher.  The
+        scroll-watcher will no longer ensure that this canvas
+        widget is always contained in its canvas's scrollregion.
+
+        :param canvaswidget: The canvas widget to remove.
+        :type canvaswidget: CanvasWidget
+        :rtype: None
+        """
+        self._remove_child_widget(canvaswidget)
+
+    def _tags(self): return []
+
+    def _update(self, child):
+        self._adjust_scrollregion()
+
+    def _adjust_scrollregion(self):
+        """
+        Adjust the scrollregion of this scroll-watcher's ``Canvas`` to
+        include the bounding boxes of all of its children.
+        """
+        bbox = self.bbox()
+        canvas = self.canvas()
+        scrollregion = [int(n) for n in canvas['scrollregion'].split()]
+        if len(scrollregion) != 4: return
+        if (bbox[0] < scrollregion[0] or bbox[1] < scrollregion[1] or
+            bbox[2] > scrollregion[2] or bbox[3] > scrollregion[3]):
+            scrollregion = '%d %d %d %d' % (
+                min(bbox[0], scrollregion[0]),
+                min(bbox[1], scrollregion[1]),
+                max(bbox[2], scrollregion[2]),
+                max(bbox[3], scrollregion[3]))
+            canvas['scrollregion'] = scrollregion
+
+##//////////////////////////////////////////////////////
+##  Canvas Frame
+##//////////////////////////////////////////////////////
+
+class CanvasFrame(object):
+    """
+    A ``Tkinter`` frame containing a canvas and scrollbars.
+    ``CanvasFrame`` uses a ``ScrollWatcherWidget`` to ensure that all of
+    the canvas widgets contained on its canvas are within its
+    scrollregion.  In order for ``CanvasFrame`` to make these checks,
+    all canvas widgets must be registered with ``add_widget`` when they
+    are added to the canvas; and destroyed with ``destroy_widget`` when
+    they are no longer needed.
+
+    If a ``CanvasFrame`` is created with no parent, then it will create
+    its own main window, including a "Done" button and a "Print"
+    button.
+    """
+    def __init__(self, parent=None, **kw):
+        """
+        Create a new ``CanvasFrame``.
+
+        :type parent: Tkinter.BaseWidget or Tkinter.Tk
+        :param parent: The parent ``Tkinter`` widget.  If no parent is
+            specified, then ``CanvasFrame`` will create a new main
+            window.
+        :param kw: Keyword arguments for the new ``Canvas``.  See the
+            documentation for ``Tkinter.Canvas`` for more information.
+        """
+        # If no parent was given, set up a top-level window.
+        if parent is None:
+            self._parent = Tk()
+            self._parent.title('NLTK')
+            self._parent.bind('<Control-p>', lambda e: self.print_to_file())
+            self._parent.bind('<Control-x>', self.destroy)
+            self._parent.bind('<Control-q>', self.destroy)
+        else:
+            self._parent = parent
+
+        # Create a frame for the canvas & scrollbars
+        self._frame = frame = Frame(self._parent)
+        self._canvas = canvas = Canvas(frame, **kw)
+        xscrollbar = Scrollbar(self._frame, orient='horizontal')
+        yscrollbar = Scrollbar(self._frame, orient='vertical')
+        xscrollbar['command'] = canvas.xview
+        yscrollbar['command'] = canvas.yview
+        canvas['xscrollcommand'] = xscrollbar.set
+        canvas['yscrollcommand'] = yscrollbar.set
+        yscrollbar.pack(fill='y', side='right')
+        xscrollbar.pack(fill='x', side='bottom')
+        canvas.pack(expand=1, fill='both', side='left')
+
+        # Set initial scroll region.
+        scrollregion = '0 0 %s %s' % (canvas['width'], canvas['height'])
+        canvas['scrollregion'] = scrollregion
+
+        self._scrollwatcher = ScrollWatcherWidget(canvas)
+
+        # If no parent was given, pack the frame, and add a menu.
+        if parent is None:
+            self.pack(expand=1, fill='both')
+            self._init_menubar()
+
+    def _init_menubar(self):
+        menubar = Menu(self._parent)
+
+        filemenu = Menu(menubar, tearoff=0)
+        filemenu.add_command(label='Print to Postscript', underline=0,
+                             command=self.print_to_file, accelerator='Ctrl-p')
+        filemenu.add_command(label='Exit', underline=1,
+                             command=self.destroy, accelerator='Ctrl-x')
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
+
+        self._parent.config(menu=menubar)
+
+    def print_to_file(self, filename=None):
+        """
+        Print the contents of this ``CanvasFrame`` to a postscript
+        file.  If no filename is given, then prompt the user for one.
+
+        :param filename: The name of the file to print the tree to.
+        :type filename: str
+        :rtype: None
+        """
+        if filename is None:
+            ftypes = [('Postscript files', '.ps'),
+                      ('All files', '*')]
+            filename = asksaveasfilename(filetypes=ftypes,
+                                         defaultextension='.ps')
+            if not filename: return
+        (x0, y0, w, h) = self.scrollregion()
+        postscript = self._canvas.postscript(x=x0, y=y0,
+                                width=w+2, height=h+2,
+                                pagewidth=w+2, # points = 1/72 inch
+                                pageheight=h+2, # points = 1/72 inch
+                                pagex=0, pagey=0)
+        # workaround for bug in Tk font handling
+        postscript = postscript.replace(' 0 scalefont ', ' 9 scalefont ')
+        with open(filename, 'wb') as f:
+            f.write(postscript.encode('utf8'))
+
+    def scrollregion(self):
+        """
+        :return: The current scroll region for the canvas managed by
+            this ``CanvasFrame``.
+        :rtype: 4-tuple of int
+        """
+        (x1, y1, x2, y2) = self._canvas['scrollregion'].split()
+        return (int(x1), int(y1), int(x2), int(y2))
+
+    def canvas(self):
+        """
+        :return: The canvas managed by this ``CanvasFrame``.
+        :rtype: Tkinter.Canvas
+        """
+        return self._canvas
+
+    def add_widget(self, canvaswidget, x=None, y=None):
+        """
+        Register a canvas widget with this ``CanvasFrame``.  The
+        ``CanvasFrame`` will ensure that this canvas widget is always
+        within the ``Canvas``'s scrollregion.  If no coordinates are
+        given for the canvas widget, then the ``CanvasFrame`` will
+        attempt to find a clear area of the canvas for it.
+
+        :type canvaswidget: CanvasWidget
+        :param canvaswidget: The new canvas widget.  ``canvaswidget``
+            must have been created on this ``CanvasFrame``'s canvas.
+        :type x: int
+        :param x: The initial x coordinate for the upper left hand
+            corner of ``canvaswidget``, in the canvas's coordinate
+            space.
+        :type y: int
+        :param y: The initial y coordinate for the upper left hand
+            corner of ``canvaswidget``, in the canvas's coordinate
+            space.
+        """
+        if x is None or y is None:
+            (x, y) = self._find_room(canvaswidget, x, y)
+
+        # Move to (x,y)
+        (x1,y1,x2,y2) = canvaswidget.bbox()
+        canvaswidget.move(x-x1,y-y1)
+
+        # Register with scrollwatcher.
+        self._scrollwatcher.add_child(canvaswidget)
+
+    def _find_room(self, widget, desired_x, desired_y):
+        """
+        Try to find a space for a given widget.
+        """
+        (left, top, right, bot) = self.scrollregion()
+        w = widget.width()
+        h = widget.height()
+
+        if w >= (right-left): return (0,0)
+        if h >= (bot-top): return (0,0)
+
+        # Move the widget out of the way, for now.
+        (x1,y1,x2,y2) = widget.bbox()
+        widget.move(left-x2-50, top-y2-50)
+
+        if desired_x is not None:
+            x = desired_x
+            for y in range(top, bot-h, int((bot-top-h)/10)):
+                if not self._canvas.find_overlapping(x-5, y-5, x+w+5, y+h+5):
+                    return (x,y)
+
+        if desired_y is not None:
+            y = desired_y
+            for x in range(left, right-w, int((right-left-w)/10)):
+                if not self._canvas.find_overlapping(x-5, y-5, x+w+5, y+h+5):
+                    return (x,y)
+
+        for y in range(top, bot-h, int((bot-top-h)/10)):
+            for x in range(left, right-w, int((right-left-w)/10)):
+                if not self._canvas.find_overlapping(x-5, y-5, x+w+5, y+h+5):
+                    return (x,y)
+        return (0,0)
+
+    def destroy_widget(self, canvaswidget):
+        """
+        Remove a canvas widget from this ``CanvasFrame``.  This
+        deregisters the canvas widget, and destroys it.
+        """
+        self.remove_widget(canvaswidget)
+        canvaswidget.destroy()
+
+    def remove_widget(self, canvaswidget):
+        # Deregister with scrollwatcher.
+        self._scrollwatcher.remove_child(canvaswidget)
+
+    def pack(self, cnf={}, **kw):
+        """
+        Pack this ``CanvasFrame``.  See the documentation for
+        ``Tkinter.Pack`` for more information.
+        """
+        self._frame.pack(cnf, **kw)
+        # Adjust to be big enough for kids?
+
+    def destroy(self, *e):
+        """
+        Destroy this ``CanvasFrame``.  If this ``CanvasFrame`` created a
+        top-level window, then this will close that window.
+        """
+        if self._parent is None: return
+        self._parent.destroy()
+        self._parent = None
+
+    def mainloop(self, *args, **kwargs):
+        """
+        Enter the Tkinter mainloop.  This function must be called if
+        this frame is created from a non-interactive program (e.g.
+        from a script); otherwise, the frame will close as soon as
+        the script completes.
+        """
+        if in_idle(): return
+        self._parent.mainloop(*args, **kwargs)
+
+##//////////////////////////////////////////////////////
+##  Text display
+##//////////////////////////////////////////////////////
+
+class ShowText(object):
+    """
+    A ``Tkinter`` window used to display a text.  ``ShowText`` is
+    typically used by graphical tools to display help text, or similar
+    information.
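+
+    A minimal usage sketch (GUI-only, so marked to be skipped by doctest):
+
+        >>> ShowText(None, 'Help', 'Some help text.').mainloop()  # doctest: +SKIP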
+    """
+    def __init__(self, root, title, text, width=None, height=None,
+                 **textbox_options):
+        if width is None or height is None:
+            (width, height) = self.find_dimensions(text, width, height)
+
+        # Create the main window.
+        if root is None:
+            self._top = top = Tk()
+        else:
+            self._top = top = Toplevel(root)
+        top.title(title)
+
+        b = Button(top, text='Ok', command=self.destroy)
+        b.pack(side='bottom')
+
+        tbf = Frame(top)
+        tbf.pack(expand=1, fill='both')
+        scrollbar = Scrollbar(tbf, orient='vertical')
+        scrollbar.pack(side='right', fill='y')
+        textbox = Text(tbf, wrap='word', width=width,
+                       height=height, **textbox_options)
+        textbox.insert('end', text)
+        textbox['state'] = 'disabled'
+        textbox.pack(side='left', expand=1, fill='both')
+        scrollbar['command'] = textbox.yview
+        textbox['yscrollcommand'] = scrollbar.set
+
+        # Make it easy to close the window.
+        top.bind('q', self.destroy)
+        top.bind('x', self.destroy)
+        top.bind('c', self.destroy)
+        top.bind('<Return>', self.destroy)
+        top.bind('<Escape>', self.destroy)
+
+        # Focus the scrollbar, so they can use up/down, etc.
+        scrollbar.focus()
+
+    def find_dimensions(self, text, width, height):
+        lines = text.split('\n')
+        if width is None:
+            maxwidth = max(len(line) for line in lines)
+            width = min(maxwidth, 80)
+
+        # Now, find height.
+        height = 0
+        for line in lines:
+            while len(line) > width:
+                brk = line[:width].rfind(' ')
+                line = line[brk:]
+                height += 1
+            height += 1
+        height = min(height, 25)
+
+        return (width, height)
+
+    def destroy(self, *e):
+        if self._top is None: return
+        self._top.destroy()
+        self._top = None
+
+    def mainloop(self, *args, **kwargs):
+        """
+        Enter the Tkinter mainloop.  This function must be called if
+        this window is created from a non-interactive program (e.g.
+        from a script); otherwise, the window will close as soon as
+        the script completes.
+        """
+        if in_idle(): return
+        self._top.mainloop(*args, **kwargs)
+
+##//////////////////////////////////////////////////////
+##  Entry dialog
+##//////////////////////////////////////////////////////
+
+class EntryDialog(object):
+    """
+    A dialog box for entering a single line of text.  If a
+    ``set_callback`` is given, it is called with the entered text
+    whenever the user presses Ok or Apply.
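+
+    A minimal usage sketch (GUI-only; ``root`` is assumed to be an
+    existing ``Tk`` instance):
+
+        >>> d = EntryDialog(root, original_text='cat',        # doctest: +SKIP
+        ...                 instructions='Enter a word:', title='Edit')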
+    """
+    def __init__(self, parent, original_text='', instructions='',
+                 set_callback=None, title=None):
+        self._parent = parent
+        self._original_text = original_text
+        self._set_callback = set_callback
+
+        width = int(max(30, len(original_text)*3/2))
+        self._top = Toplevel(parent)
+
+        if title: self._top.title(title)
+
+        # The text entry box.
+        entryframe = Frame(self._top)
+        entryframe.pack(expand=1, fill='both', padx=5, pady=5, ipady=10)
+        if instructions:
+            l = Label(entryframe, text=instructions)
+            l.pack(side='top', anchor='w', padx=30)
+        self._entry = Entry(entryframe, width=width)
+        self._entry.pack(expand=1, fill='x', padx=30)
+        self._entry.insert(0, original_text)
+
+        # A divider
+        divider = Frame(self._top, borderwidth=1, relief='sunken')
+        divider.pack(fill='x', ipady=1, padx=10)
+
+        # The buttons.
+        buttons = Frame(self._top)
+        buttons.pack(expand=0, fill='x', padx=5, pady=5)
+        b = Button(buttons, text='Cancel', command=self._cancel, width=8)
+        b.pack(side='right', padx=5)
+        b = Button(buttons, text='Ok', command=self._ok,
+                   width=8, default='active')
+        b.pack(side='left', padx=5)
+        b = Button(buttons, text='Apply', command=self._apply, width=8)
+        b.pack(side='left')
+
+        self._top.bind('<Return>', self._ok)
+        self._top.bind('<Control-q>', self._cancel)
+        self._top.bind('<Escape>', self._cancel)
+
+        self._entry.focus()
+
+    def _reset(self, *e):
+        self._entry.delete(0,'end')
+        self._entry.insert(0, self._original_text)
+        if self._set_callback:
+            self._set_callback(self._original_text)
+
+    def _cancel(self, *e):
+        try: self._reset()
+        except Exception: pass
+        self._destroy()
+
+    def _ok(self, *e):
+        self._apply()
+        self._destroy()
+
+    def _apply(self, *e):
+        if self._set_callback:
+            self._set_callback(self._entry.get())
+
+    def _destroy(self, *e):
+        if self._top is None: return
+        self._top.destroy()
+        self._top = None
+
+##//////////////////////////////////////////////////////
+##  Colorized List
+##//////////////////////////////////////////////////////
+
+class ColorizedList(object):
+    """
+    An abstract base class for displaying a colorized list of items.
+    Subclasses should define:
+      - ``_init_colortags``, which sets up Text color tags that
+        will be used by the list.
+      - ``_item_repr``, which returns a list of (text,colortag)
+        tuples that make up the colorized representation of the
+        item.
+    :note: Typically, you will want to register a callback for
+        ``'select'`` that calls ``mark`` on the given item.
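+
+    A minimal subclass sketch (the tag name ``'word'`` and the plain
+    one-colour rendering are illustrative only):
+
+        >>> class WordList(ColorizedList):                   # doctest: +SKIP
+        ...     def _init_colortags(self, textwidget, options):
+        ...         textwidget.tag_config('word', foreground='black')
+        ...     def _item_repr(self, item):
+        ...         return [('%s' % item, 'word')]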
+    """
+    def __init__(self, parent, items=[], **options):
+        """
+        Construct a new list.
+
+        :param parent: The Tk widget that contains the colorized list
+        :param items: The initial contents of the colorized list.
+        :param options: Options for the ``Text`` widget that is used
+            to display the list (e.g., ``background``).
+        """
+        self._parent = parent
+        self._callbacks = {}
+
+        # Which items are marked?
+        self._marks = {}
+
+        # Initialize the Tkinter frames.
+        self._init_itemframe(options.copy())
+
+        # Set up key & mouse bindings.
+        self._textwidget.bind('<KeyPress>', self._keypress)
+        self._textwidget.bind('<ButtonPress>', self._buttonpress)
+
+        # Fill in the given CFG's items.
+        self._items = None
+        self.set(items)
+
+    #////////////////////////////////////////////////////////////
+    # Abstract methods
+    #////////////////////////////////////////////////////////////
+    @abstractmethod
+    def _init_colortags(self, textwidget, options):
+        """
+        Set up any colortags that will be used by this colorized list.
+        E.g.:
+            >>> textwidget.tag_config('terminal', foreground='black')
+        """
+
+    @abstractmethod
+    def _item_repr(self, item):
+        """
+        Return a list of (text, colortag) tuples that make up the
+        colorized representation of the item.  Colorized
+        representations may not span multiple lines.  I.e., the text
+        strings returned may not contain newline characters.
+        """
+
+    #////////////////////////////////////////////////////////////
+    # Item Access
+    #////////////////////////////////////////////////////////////
+
+    def get(self, index=None):
+        """
+        :return: If ``index`` is None, a list of the items contained
+            by this list; otherwise, the item at the given index.
+        """
+        if index is None:
+            return self._items[:]
+        else:
+            return self._items[index]
+
+    def set(self, items):
+        """
+        Modify the list of items contained by this list.
+        """
+        items = list(items)
+        if self._items == items: return
+        self._items = list(items)
+
+        self._textwidget['state'] = 'normal'
+        self._textwidget.delete('1.0', 'end')
+        for item in items:
+            for (text, colortag) in self._item_repr(item):
+                assert '\n' not in text, 'item repr may not contain newline'
+                self._textwidget.insert('end', text, colortag)
+            self._textwidget.insert('end', '\n')
+        # Remove the final newline
+        self._textwidget.delete('end-1char', 'end')
+        self._textwidget.mark_set('insert', '1.0')
+        self._textwidget['state'] = 'disabled'
+        # Clear all marks
+        self._marks.clear()
+
+    def unmark(self, item=None):
+        """
+        Remove highlighting from the given item; or from every item,
+        if no item is given.
+        :raise ValueError: If ``item`` is not contained in the list.
+        :raise KeyError: If ``item`` is not marked.
+        """
+        if item is None:
+            self._marks.clear()
+            self._textwidget.tag_remove('highlight', '1.0', 'end+1char')
+        else:
+            index = self._items.index(item)
+            del self._marks[item]
+            (start, end) = ('%d.0' % (index+1), '%d.0' % (index+2))
+            self._textwidget.tag_remove('highlight', start, end)
+
+    def mark(self, item):
+        """
+        Highlight the given item.
+        :raise ValueError: If ``item`` is not contained in the list.
+        """
+        self._marks[item] = 1
+        index = self._items.index(item)
+        (start, end) = ('%d.0' % (index+1), '%d.0' % (index+2))
+        self._textwidget.tag_add('highlight', start, end)
+
+    def markonly(self, item):
+        """
+        Remove any current highlighting, and mark the given item.
+        :raise ValueError: If ``item`` is not contained in the list.
+        """
+        self.unmark()
+        self.mark(item)
+
+    def view(self, item):
+        """
+        Adjust the view such that the given item is visible.  If
+        the item is already visible, then do nothing.
+        """
+        index = self._items.index(item)
+        self._textwidget.see('%d.0' % (index+1))
+
+    #////////////////////////////////////////////////////////////
+    # Callbacks
+    #////////////////////////////////////////////////////////////
+
+    def add_callback(self, event, func):
+        """
+        Register a callback function with the list.  This function
+        will be called whenever the given event occurs.
+
+        :param event: The event that will trigger the callback
+            function.  Valid events are: click1, click2, click3,
+            space, return, select, up, down, next, prior, move
+        :param func: The function that should be called when
+            the event occurs.  ``func`` will be called with a
+            single item as its argument.  (The item selected
+            or the item moved to).
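+
+        For example, following the note in the class docstring
+        (``clist`` is assumed to be an instance of a concrete subclass):
+
+            >>> clist.add_callback('select', clist.markonly)   # doctest: +SKIP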
+        """
+        if event == 'select': events = ['click1', 'space', 'return']
+        elif event == 'move': events = ['up', 'down', 'next', 'prior']
+        else: events = [event]
+
+        for e in events:
+            self._callbacks.setdefault(e,{})[func] = 1
+
+    def remove_callback(self, event, func=None):
+        """
+        Deregister a callback function.  If ``func`` is None, then
+        all callbacks are removed for the given event; and if ``event``
+        is None, then the removal is applied to every event.
+        """
+        if event is None: events = list(self._callbacks.keys())
+        elif event == 'select': events = ['click1', 'space', 'return']
+        elif event == 'move': events = ['up', 'down', 'next', 'prior']
+        else: events = [event]
+
+        for e in events:
+            if func is None: del self._callbacks[e]
+            else:
+                try: del self._callbacks[e][func]
+                except KeyError: pass
+
+    #////////////////////////////////////////////////////////////
+    # Tkinter Methods
+    #////////////////////////////////////////////////////////////
+
+    def pack(self, cnf={}, **kw):
+#        "@include: Tkinter.Pack.pack"
+        self._itemframe.pack(cnf, **kw)
+
+    def grid(self, cnf={}, **kw):
+#        "@include: Tkinter.Grid.grid"
+        self._itemframe.grid(cnf, **kw)
+
+    def focus(self):
+#        "@include: Tkinter.Widget.focus"
+        self._textwidget.focus()
+
+    #////////////////////////////////////////////////////////////
+    # Internal Methods
+    #////////////////////////////////////////////////////////////
+
+    def _init_itemframe(self, options):
+        self._itemframe = Frame(self._parent)
+
+        # Create the basic Text widget & scrollbar.
+        options.setdefault('background', '#e0e0e0')
+        self._textwidget = Text(self._itemframe, **options)
+        self._textscroll = Scrollbar(self._itemframe, takefocus=0,
+                                     orient='vertical')
+        self._textwidget.config(yscrollcommand = self._textscroll.set)
+        self._textscroll.config(command=self._textwidget.yview)
+        self._textscroll.pack(side='right', fill='y')
+        self._textwidget.pack(expand=1, fill='both', side='left')
+
+        # Initialize the colorization tags
+        self._textwidget.tag_config('highlight', background='#e0ffff',
+                                    border='1', relief='raised')
+        self._init_colortags(self._textwidget, options)
+
+        # How do I want to mark keyboard selection?
+        self._textwidget.tag_config('sel', foreground='')
+        self._textwidget.tag_config('sel', foreground='', background='',
+                                    border='', underline=1)
+        self._textwidget.tag_lower('highlight', 'sel')
+
+    def _fire_callback(self, event, itemnum):
+        if event not in self._callbacks: return
+        if 0 <= itemnum < len(self._items):
+            item = self._items[itemnum]
+        else:
+            item = None
+        for cb_func in list(self._callbacks[event].keys()):
+            cb_func(item)
+
+    def _buttonpress(self, event):
+        clickloc = '@%d,%d' % (event.x,event.y)
+        insert_point = self._textwidget.index(clickloc)
+        itemnum = int(insert_point.split('.')[0])-1
+        self._fire_callback('click%d' % event.num, itemnum)
+
+    def _keypress(self, event):
+        if event.keysym == 'Return' or event.keysym == 'space':
+            insert_point = self._textwidget.index('insert')
+            itemnum = int(insert_point.split('.')[0])-1
+            self._fire_callback(event.keysym.lower(), itemnum)
+            return
+        elif event.keysym == 'Down': delta='+1line'
+        elif event.keysym == 'Up': delta='-1line'
+        elif event.keysym == 'Next': delta='+10lines'
+        elif event.keysym == 'Prior': delta='-10lines'
+        else: return 'continue'
+
+        self._textwidget.mark_set('insert', 'insert'+delta)
+        self._textwidget.see('insert')
+        self._textwidget.tag_remove('sel', '1.0', 'end+1char')
+        self._textwidget.tag_add('sel', 'insert linestart', 'insert lineend')
+
+        insert_point = self._textwidget.index('insert')
+        itemnum = int(insert_point.split('.')[0])-1
+        self._fire_callback(event.keysym.lower(), itemnum)
+
+        return 'break'
+
+##//////////////////////////////////////////////////////
+##  Improved OptionMenu
+##//////////////////////////////////////////////////////
+
+class MutableOptionMenu(Menubutton):
+    def __init__(self, master, values, **options):
+        self._callback = options.get('command')
+        if 'command' in options: del options['command']
+
+        # Create a variable
+        self._variable = variable = StringVar()
+        if len(values) > 0:
+            variable.set(values[0])
+
+        kw = {"borderwidth": 2, "textvariable": variable,
+              "indicatoron": 1, "relief": RAISED, "anchor": "c",
+              "highlightthickness": 2}
+        kw.update(options)
+        Widget.__init__(self, master, "menubutton", kw)
+        self.widgetName = 'tk_optionMenu'
+        self._menu = Menu(self, name="menu", tearoff=0,)
+        self.menuname = self._menu._w
+
+        self._values = []
+        for value in values: self.add(value)
+
+        self["menu"] = self._menu
+
+    def add(self, value):
+        if value in self._values: return
+        def set(value=value): self.set(value)
+        self._menu.add_command(label=value, command=set)
+        self._values.append(value)
+
+    def set(self, value):
+        self._variable.set(value)
+        if self._callback:
+            self._callback(value)
+
+    def remove(self, value):
+        # list.index raises ValueError if value is missing: let it propagate.
+        i = self._values.index(value)
+        del self._values[i]
+        self._menu.delete(i, i)
+
+    def __getitem__(self, name):
+        if name == 'menu':
+            return self._menu
+        return Widget.__getitem__(self, name)
+
+    def destroy(self):
+        """Destroy this widget and the associated menu."""
+        Menubutton.destroy(self)
+        self._menu = None
+
+##//////////////////////////////////////////////////////
+##  Test code.
+##//////////////////////////////////////////////////////
+
+def demo():
+    """
+    A simple demonstration showing how to use canvas widgets.
+    """
+    def fill(cw):
+        from random import randint
+        cw['fill'] = '#00%04d' % randint(0,9999)
+    def color(cw):
+        from random import randint
+        cw['color'] = '#ff%04d' % randint(0,9999)
+
+    cf = CanvasFrame(closeenough=10, width=300, height=300)
+    c = cf.canvas()
+    ct3 = TextWidget(c, 'hiya there', draggable=1)
+    ct2 = TextWidget(c, 'o  o\n||\n___\n  U', draggable=1, justify='center')
+    co = OvalWidget(c, ct2, outline='red')
+    ct = TextWidget(c, 'o  o\n||\n\\___/', draggable=1, justify='center')
+    cp = ParenWidget(c, ct, color='red')
+    cb = BoxWidget(c, cp, fill='cyan', draggable=1, width=3, margin=10)
+    equation = SequenceWidget(c,
+                              SymbolWidget(c, 'forall'), TextWidget(c, 'x'),
+                              SymbolWidget(c, 'exists'), TextWidget(c, 'y: '),
+                              TextWidget(c, 'x'), SymbolWidget(c, 'notequal'),
+                              TextWidget(c, 'y'))
+    space = SpaceWidget(c, 0, 30)
+    cstack = StackWidget(c, cb, ct3, space, co, equation, align='center')
+    foo = TextWidget(c, 'try clicking\nand dragging',
+                     draggable=1, justify='center')
+    cs = SequenceWidget(c, cstack, foo)
+    zz = BracketWidget(c, cs, color='green4', width=3)
+    cf.add_widget(zz, 60, 30)
+
+    cb.bind_click(fill)
+    ct.bind_click(color)
+    co.bind_click(fill)
+    ct2.bind_click(color)
+    ct3.bind_click(color)
+
+    cf.mainloop()
+    #ShowText(None, 'title', ((('this is text'*150)+'\n')*5))
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/draw/util.pyc b/nlp_resource_data/nltk/draw/util.pyc
new file mode 100755 (executable)
index 0000000..18f1ae9
Binary files /dev/null and b/nlp_resource_data/nltk/draw/util.pyc differ
diff --git a/nlp_resource_data/nltk/featstruct.py b/nlp_resource_data/nltk/featstruct.py
new file mode 100755 (executable)
index 0000000..7795286
--- /dev/null
@@ -0,0 +1,2502 @@
+# Natural Language Toolkit: Feature Structures
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>,
+#         Rob Speer,
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+"""
+Basic data classes for representing feature structures, and for
+performing basic operations on those feature structures.  A feature
+structure is a mapping from feature identifiers to feature values,
+where each feature value is either a basic value (such as a string or
+an integer), or a nested feature structure.  There are two types of
+feature structure, implemented by two subclasses of ``FeatStruct``:
+
+    - feature dictionaries, implemented by ``FeatDict``, act like
+      Python dictionaries.  Feature identifiers may be strings or
+      instances of the ``Feature`` class.
+    - feature lists, implemented by ``FeatList``, act like Python
+      lists.  Feature identifiers are integers.
+
+Feature structures are typically used to represent partial information
+about objects.  A feature identifier that is not mapped to a value
+stands for a feature whose value is unknown (*not* a feature without
+a value).  Two feature structures that represent (potentially
+overlapping) information about the same object can be combined by
+unification.  When two inconsistent feature structures are unified,
+the unification fails and returns None.
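+
+A brief illustration of a failed unification:
+
+    >>> from nltk.featstruct import FeatStruct
+    >>> fs1 = FeatStruct(number='singular')
+    >>> fs2 = FeatStruct(number='plural')
+    >>> print(fs1.unify(fs2))
+    None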
+
+Features can be specified using "feature paths", or tuples of feature
+identifiers that specify a path through the nested feature structures to
+a value.  Feature structures may contain reentrant feature values.  A
+"reentrant feature value" is a single feature value that can be
+accessed via multiple feature paths.  Unification preserves the
+reentrance relations imposed by both of the unified feature
+structures.  In the feature structure resulting from unification, any
+modifications to a reentrant feature value will be visible using any
+of its feature paths.
+
+Feature structure variables are encoded using the ``nltk.sem.Variable``
+class.  The variables' values are tracked using a bindings
+dictionary, which maps variables to their values.  When two feature
+structures are unified, a fresh bindings dictionary is created to
+track their values; and before unification completes, all bound
+variables are replaced by their values.  Thus, the bindings
+dictionaries are usually strictly internal to the unification process.
+However, it is possible to track the bindings of variables if you
+choose to, by supplying your own initial bindings dictionary to the
+``unify()`` function.
+
+When unbound variables are unified with one another, they become
+aliased.  This is encoded by binding one variable to the other.
+
+Lightweight Feature Structures
+==============================
+Many of the functions defined by ``nltk.featstruct`` can be applied
+directly to simple Python dictionaries and lists, rather than to
+full-fledged ``FeatDict`` and ``FeatList`` objects.  In other words,
+Python ``dicts`` and ``lists`` can be used as "light-weight" feature
+structures.
+
+    >>> from nltk.featstruct import unify
+    >>> unify(dict(x=1, y=dict()), dict(a='a', y=dict(b='b')))  # doctest: +SKIP
+    {'y': {'b': 'b'}, 'x': 1, 'a': 'a'}
+
+However, you should keep in mind the following caveats:
+
+  - Python dictionaries & lists ignore reentrance when checking for
+    equality between values.  But two FeatStructs with different
+    reentrances are considered nonequal, even if all their base
+    values are equal.
+
+  - FeatStructs can be easily frozen, allowing them to be used as
+    keys in hash tables.  Python dictionaries and lists can not.
+
+  - FeatStructs display reentrance in their string representations;
+    Python dictionaries and lists do not.
+
+  - FeatStructs may *not* be mixed with Python dictionaries and lists
+    (e.g., when performing unification).
+
+  - FeatStructs provide a number of useful methods, such as ``walk()``
+    and ``cyclic()``, which are not available for Python dicts and lists.
+
+In general, if your feature structures will contain any reentrances,
+or if you plan to use them as dictionary keys, it is strongly
+recommended that you use full-fledged ``FeatStruct`` objects.
+"""
+from __future__ import print_function, unicode_literals, division
+
+import re
+import copy
+from functools import total_ordering
+
+from six import integer_types, string_types
+
+from nltk.internals import read_str, raise_unorderable_types
+from nltk.sem.logic import (Variable, Expression, SubstituteBindingsI,
+                            LogicParser, LogicalExpressionException)
+from nltk.compat import python_2_unicode_compatible, unicode_repr
+
+######################################################################
+# Feature Structure
+######################################################################
+
+@total_ordering
+class FeatStruct(SubstituteBindingsI):
+    """
+    A mapping from feature identifiers to feature values, where each
+    feature value is either a basic value (such as a string or an
+    integer), or a nested feature structure.  There are two types of
+    feature structure:
+
+      - feature dictionaries, implemented by ``FeatDict``, act like
+        Python dictionaries.  Feature identifiers may be strings or
+        instances of the ``Feature`` class.
+      - feature lists, implemented by ``FeatList``, act like Python
+        lists.  Feature identifiers are integers.
+
+    Feature structures may be indexed using either simple feature
+    identifiers or 'feature paths.'  A feature path is a sequence
+    of feature identifiers that stand for a corresponding sequence of
+    indexing operations.  In particular, ``fstruct[(f1,f2,...,fn)]`` is
+    equivalent to ``fstruct[f1][f2]...[fn]``.
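+
+    For example, indexing with the path ``('agr', 'number')`` reaches
+    the nested value (printing keeps the output version-neutral):
+
+        >>> from nltk.featstruct import FeatStruct
+        >>> fs = FeatStruct(agr=FeatStruct(number='plural'))
+        >>> print(fs[('agr', 'number')])
+        plural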
+
+    Feature structures may contain reentrant feature structures.  A
+    "reentrant feature structure" is a single feature structure
+    object that can be accessed via multiple feature paths.  Feature
+    structures may also be cyclic.  A feature structure is "cyclic"
+    if there is any feature path from the feature structure to itself.
+
+    Two feature structures are considered equal if they assign the
+    same values to all features, and have the same reentrancies.
+
+    By default, feature structures are mutable.  They may be made
+    immutable with the ``freeze()`` method.  Once they have been
+    frozen, they may be hashed, and thus used as dictionary keys.
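+
+    For example, a frozen feature structure can be used as a dictionary key:
+
+        >>> fs = FeatStruct(number='singular')
+        >>> fs.freeze()
+        >>> fs.frozen()
+        True
+        >>> d = {fs: 'frozen structures are hashable'}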
+    """
+
+    _frozen = False
+    """:ivar: A flag indicating whether this feature structure is
+       frozen or not.  Once this flag is set, it should never be
+       un-set; and no further modification should be made to this
+       feature structure."""
+
+    ##////////////////////////////////////////////////////////////
+    #{ Constructor
+    ##////////////////////////////////////////////////////////////
+
+    def __new__(cls, features=None, **morefeatures):
+        """
+        Construct and return a new feature structure.  If this
+        constructor is called directly, then the returned feature
+        structure will be an instance of either the ``FeatDict`` class
+        or the ``FeatList`` class.
+
+        :param features: The initial feature values for this feature
+            structure:
+              - FeatStruct(string) -> FeatStructReader().read(string)
+              - FeatStruct(mapping) -> FeatDict(mapping)
+              - FeatStruct(sequence) -> FeatList(sequence)
+              - FeatStruct() -> FeatDict()
+        :param morefeatures: If ``features`` is a mapping or None,
+            then ``morefeatures`` provides additional features for the
+            ``FeatDict`` constructor.
+        """
+        # If the FeatStruct constructor is called directly, then decide
+        # whether to create a FeatDict or a FeatList, based on the
+        # contents of the `features` argument.
+        if cls is FeatStruct:
+            if features is None:
+                return FeatDict.__new__(FeatDict, **morefeatures)
+            elif _is_mapping(features):
+                return FeatDict.__new__(FeatDict, features, **morefeatures)
+            elif morefeatures:
+                raise TypeError('Keyword arguments may only be specified '
+                                'if features is None or is a mapping.')
+            if isinstance(features, string_types):
+                if FeatStructReader._START_FDICT_RE.match(features):
+                    return FeatDict.__new__(FeatDict, features, **morefeatures)
+                else:
+                    return FeatList.__new__(FeatList, features, **morefeatures)
+            elif _is_sequence(features):
+                return FeatList.__new__(FeatList, features)
+            else:
+                raise TypeError('Expected string or mapping or sequence')
+
+        # Otherwise, construct the object as normal.
+        else:
+            return super(FeatStruct, cls).__new__(cls, features,
+                                                  **morefeatures)
+
+    ##////////////////////////////////////////////////////////////
+    #{ Uniform Accessor Methods
+    ##////////////////////////////////////////////////////////////
+    # These helper functions allow the methods defined by FeatStruct
+    # to treat all feature structures as mappings, even if they're
+    # really lists.  (Lists are treated as mappings from ints to vals)
+
+    def _keys(self):
+        """Return an iterable of the feature identifiers used by this
+        FeatStruct."""
+        raise NotImplementedError() # Implemented by subclasses.
+
+    def _values(self):
+        """Return an iterable of the feature values directly defined
+        by this FeatStruct."""
+        raise NotImplementedError() # Implemented by subclasses.
+
+    def _items(self):
+        """Return an iterable of (fid,fval) pairs, where fid is a
+        feature identifier and fval is the corresponding feature
+        value, for all features defined by this FeatStruct."""
+        raise NotImplementedError() # Implemented by subclasses.
+
+    ##////////////////////////////////////////////////////////////
+    #{ Equality & Hashing
+    ##////////////////////////////////////////////////////////////
+
+    def equal_values(self, other, check_reentrance=False):
+        """
+        Return True if ``self`` and ``other`` assign the same value
+        to every feature.  In particular, return true if
+        ``self[p]==other[p]`` for every feature path *p* such
+        that ``self[p]`` or ``other[p]`` is a base value (i.e.,
+        not a nested feature structure).
+
+        :param check_reentrance: If True, then also return False if
+            there is any difference between the reentrances of ``self``
+            and ``other``.
+        :note: the ``==`` is equivalent to ``equal_values()`` with
+            ``check_reentrance=True``.
+        """
+        return self._equal(other, check_reentrance, set(), set(), set())
+
+    def __eq__(self, other):
+        """
+        Return true if ``self`` and ``other`` are both feature structures,
+        assign the same values to all features, and contain the same
+        reentrances.  I.e., return
+        ``self.equal_values(other, check_reentrance=True)``.
+
+        :see: ``equal_values()``
+        """
+        return self._equal(other, True, set(), set(), set())
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        if not isinstance(other, FeatStruct):
+            # raise_unorderable_types("<", self, other)
+            # Sometimes feature values can be pure strings,
+            # so we need to be able to compare with non-featstructs:
+            return self.__class__.__name__ < other.__class__.__name__
+        else:
+            return len(self) < len(other)
+
+    def __hash__(self):
+        """
+        If this feature structure is frozen, return its hash value;
+        otherwise, raise ``TypeError``.
+        """
+        if not self._frozen:
+            raise TypeError('FeatStructs must be frozen before they '
+                            'can be hashed.')
+        try: return self._hash
+        except AttributeError:
+            self._hash = self._calculate_hashvalue(set())
+            return self._hash
+
+    def _equal(self, other, check_reentrance, visited_self,
+               visited_other, visited_pairs):
+        """
+        Return True iff self and other have equal values.
+
+        :param visited_self: A set containing the ids of all ``self``
+            feature structures we've already visited.
+        :param visited_other: A set containing the ids of all ``other``
+            feature structures we've already visited.
+        :param visited_pairs: A set containing ``(selfid, otherid)`` pairs
+            for all pairs of feature structures we've already visited.
+        """
+        # If we're the same object, then we're equal.
+        if self is other: return True
+
+        # If we have different classes, we're definitely not equal.
+        if self.__class__ != other.__class__: return False
+
+        # If we define different features, we're definitely not equal.
+        # (Perform len test first because it's faster -- we should
+        # do profiling to see if this actually helps)
+        if len(self) != len(other): return False
+        if set(self._keys()) != set(other._keys()): return False
+
+        # If we're checking reentrance, then any time we revisit a
+        # structure, make sure that it was paired with the same
+        # feature structure that it is now.  Note: if check_reentrance,
+        # then visited_pairs will never contain two pairs whose first
+        # values are equal, or two pairs whose second values are equal.
+        if check_reentrance:
+            if id(self) in visited_self or id(other) in visited_other:
+                return (id(self), id(other)) in visited_pairs
+
+        # If we're not checking reentrance, then we still need to deal
+        # with cycles.  If we encounter the same (self, other) pair a
+        # second time, then we won't learn anything more by examining
+        # their children a second time, so just return true.
+        else:
+            if (id(self), id(other)) in visited_pairs:
+                return True
+
+        # Keep track of which nodes we've visited.
+        visited_self.add(id(self))
+        visited_other.add(id(other))
+        visited_pairs.add( (id(self), id(other)) )
+
+        # Now we have to check all values.  If any of them don't match,
+        # then return false.
+        for (fname, self_fval) in self._items():
+            other_fval = other[fname]
+            if isinstance(self_fval, FeatStruct):
+                if not self_fval._equal(other_fval, check_reentrance,
+                                        visited_self, visited_other,
+                                        visited_pairs):
+                    return False
+            else:
+                if self_fval != other_fval: return False
+
+        # Everything matched up; return true.
+        return True
+
+    def _calculate_hashvalue(self, visited):
+        """
+        Return a hash value for this feature structure.
+
+        :require: ``self`` must be frozen.
+        :param visited: A set containing the ids of all feature
+            structures we've already visited while hashing.
+        """
+        if id(self) in visited: return 1
+        visited.add(id(self))
+
+        hashval = 5831
+        for (fname, fval) in sorted(self._items()):
+            hashval *= 37
+            hashval += hash(fname)
+            hashval *= 37
+            if isinstance(fval, FeatStruct):
+                hashval += fval._calculate_hashvalue(visited)
+            else:
+                hashval += hash(fval)
+            # Convert to a 32 bit int.
+            hashval = int(hashval & 0x7fffffff)
+        return hashval
+
+    ##////////////////////////////////////////////////////////////
+    #{ Freezing
+    ##////////////////////////////////////////////////////////////
+
+    #: Error message used by mutating methods when called on a frozen
+    #: feature structure.
+    _FROZEN_ERROR = "Frozen FeatStructs may not be modified."
+
+    def freeze(self):
+        """
+        Make this feature structure, and any feature structures it
+        contains, immutable.  Note: this method does not attempt to
+        'freeze' any feature value that is not a ``FeatStruct``; it
+        is recommended that you use only immutable feature values.
+        """
+        if self._frozen: return
+        self._freeze(set())
+
+    def frozen(self):
+        """
+        Return True if this feature structure is immutable.  Feature
+        structures can be made immutable with the ``freeze()`` method.
+        Immutable feature structures may not be made mutable again,
+        but new mutable copies can be produced with the ``copy()`` method.
+        """
+        return self._frozen
+
+    def _freeze(self, visited):
+        """
+        Make this feature structure, and any feature structure it
+        contains, immutable.
+
+        :param visited: A set containing the ids of all feature
+            structures we've already visited while freezing.
+        """
+        if id(self) in visited: return
+        visited.add(id(self))
+        self._frozen = True
+        for (fname, fval) in sorted(self._items()):
+            if isinstance(fval, FeatStruct):
+                fval._freeze(visited)
+
+    ##////////////////////////////////////////////////////////////
+    #{ Copying
+    ##////////////////////////////////////////////////////////////
+
+    def copy(self, deep=True):
+        """
+        Return a new copy of ``self``.  The new copy will not be frozen.
+
+        :param deep: If true, create a deep copy; if false, create
+            a shallow copy.
+        """
+        if deep:
+            return copy.deepcopy(self)
+        else:
+            return self.__class__(self)
+
+    # Subclasses should define __deepcopy__ to ensure that the new
+    # copy will not be frozen.
+    def __deepcopy__(self, memo):
+        raise NotImplementedError() # Implemented by subclasses.
+
+    ##////////////////////////////////////////////////////////////
+    #{ Structural Information
+    ##////////////////////////////////////////////////////////////
+
+    def cyclic(self):
+        """
+        Return True if this feature structure contains itself.
+        """
+        return self._find_reentrances({})[id(self)]
+
+    def walk(self):
+        """
+        Return an iterator that generates this feature structure, and
+        each feature structure it contains.  Each feature structure will
+        be generated exactly once.
+        """
+        return self._walk(set())
+
+    def _walk(self, visited):
+        """
+        Return an iterator that generates this feature structure, and
+        each feature structure it contains.
+
+        :param visited: A set containing the ids of all feature
+            structures we've already visited while walking.
+        """
+        if id(self) in visited: return
+        visited.add(id(self))
+        yield self
+        for fval in self._values():
+            if isinstance(fval, FeatStruct):
+                for elt in fval._walk(visited):
+                    yield elt
+
+    # Walk through the feature tree.  The first time we see a feature
+    # value, map it to False (not reentrant).  If we see a feature
+    # value more than once, then map it to True (reentrant).
+    def _find_reentrances(self, reentrances):
+        """
+        Return a dictionary that maps from the ``id`` of each feature
+        structure contained in ``self`` (including ``self``) to a
+        boolean value, indicating whether it is reentrant or not.
+        """
+        if id(self) in reentrances:
+            # We've seen it more than once.
+            reentrances[id(self)] = True
+        else:
+            # This is the first time we've seen it.
+            reentrances[id(self)] = False
+
+            # Recurse to contained feature structures.
+            for fval in self._values():
+                if isinstance(fval, FeatStruct):
+                    fval._find_reentrances(reentrances)
+
+        return reentrances
+
+    ##////////////////////////////////////////////////////////////
+    #{ Variables & Bindings
+    ##////////////////////////////////////////////////////////////
+
+    def substitute_bindings(self, bindings):
+        """:see: ``nltk.featstruct.substitute_bindings()``"""
+        return substitute_bindings(self, bindings)
+
+    def retract_bindings(self, bindings):
+        """:see: ``nltk.featstruct.retract_bindings()``"""
+        return retract_bindings(self, bindings)
+
+    def variables(self):
+        """:see: ``nltk.featstruct.find_variables()``"""
+        return find_variables(self)
+
+    def rename_variables(self, vars=None, used_vars=(), new_vars=None):
+        """:see: ``nltk.featstruct.rename_variables()``"""
+        return rename_variables(self, vars, used_vars, new_vars)
+
+    def remove_variables(self):
+        """
+        Return the feature structure that is obtained by deleting
+        any feature whose value is a ``Variable``.
+
+        :rtype: FeatStruct
+        """
+        return remove_variables(self)
+
+    ##////////////////////////////////////////////////////////////
+    #{ Unification
+    ##////////////////////////////////////////////////////////////
+
+    def unify(self, other, bindings=None, trace=False,
+              fail=None, rename_vars=True):
+        return unify(self, other, bindings, trace, fail, rename_vars)
+
+    def subsumes(self, other):
+        """
+        Return True if ``self`` subsumes ``other``.  I.e., return true
+        if unifying ``self`` with ``other`` would result in a feature
+        structure equal to ``other``.
+        """
+        return subsumes(self, other)
+
+    ##////////////////////////////////////////////////////////////
+    #{ String Representations
+    ##////////////////////////////////////////////////////////////
+
+    def __repr__(self):
+        """
+        Display a single-line representation of this feature structure,
+        suitable for embedding in other representations.
+        """
+        return self._repr(self._find_reentrances({}), {})
+
+    def _repr(self, reentrances, reentrance_ids):
+        """
+        Return a string representation of this feature structure.
+
+        :param reentrances: A dictionary that maps from the ``id`` of
+            each feature value in self to a boolean value, indicating
+            whether that value is reentrant or not.
+        :param reentrance_ids: A dictionary mapping from each ``id``
+            of a feature value to a unique identifier.  This is modified
+            by ``repr``: the first time a reentrant feature value is
+            displayed, an identifier is added to ``reentrance_ids`` for it.
+        """
+        raise NotImplementedError()
+
+# Mutation: disable if frozen.
+_FROZEN_ERROR = "Frozen FeatStructs may not be modified."
+_FROZEN_NOTICE = "\n%sIf self is frozen, raise ValueError."
+def _check_frozen(method, indent=''):
+    """
+    Given a method function, return a new method function that first
+    checks if ``self._frozen`` is true; and if so, raises ``ValueError``
+    with an appropriate message.  Otherwise, call the method and return
+    its result.
+    """
+    def wrapped(self, *args, **kwargs):
+        if self._frozen: raise ValueError(_FROZEN_ERROR)
+        else: return method(self, *args, **kwargs)
+    wrapped.__name__ = method.__name__
+    wrapped.__doc__ = (method.__doc__ or '') + (_FROZEN_NOTICE % indent)
+    return wrapped
+
+
+######################################################################
+# Feature Dictionary
+######################################################################
+
+@python_2_unicode_compatible
+class FeatDict(FeatStruct, dict):
+    """
+    A feature structure that acts like a Python dictionary.  I.e., a
+    mapping from feature identifiers to feature values, where a feature
+    identifier can be a string or a ``Feature``; and where a feature value
+    can be either a basic value (such as a string or an integer), or a nested
+    feature structure.  A feature identifier for a ``FeatDict`` is
+    sometimes called a "feature name".
+
+    Two feature dicts are considered equal if they assign the same
+    values to all features, and have the same reentrances.
+
+    :see: ``FeatStruct`` for information about feature paths, reentrance,
+        cyclic feature structures, mutability, freezing, and hashing.
+    """
+    def __init__(self, features=None, **morefeatures):
+        """
+        Create a new feature dictionary, with the specified features.
+
+        :param features: The initial value for this feature
+            dictionary.  If ``features`` is a ``FeatStruct``, then its
+            features are copied (shallow copy).  If ``features`` is a
+            dict, then a feature is created for each item, mapping its
+            key to its value.  If ``features`` is a string, then it is
+            processed using ``FeatStructReader``.  If ``features`` is a list of
+            tuples ``(name, val)``, then a feature is created for each tuple.
+        :param morefeatures: Additional features for the new feature
+            dictionary.  If a feature is listed under both ``features`` and
+            ``morefeatures``, then the value from ``morefeatures`` will be
+            used.
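+
+        A brief illustration of the precedence rule described above:
+
+            >>> from nltk.featstruct import FeatDict
+            >>> FeatDict({'a': 1}, a=2)['a']
+            2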
+        """
+        if isinstance(features, string_types):
+            FeatStructReader().fromstring(features, self)
+            self.update(**morefeatures)
+        else:
+            # update() checks the types of features.
+            self.update(features, **morefeatures)
+
+    #////////////////////////////////////////////////////////////
+    #{ Dict methods
+    #////////////////////////////////////////////////////////////
+    _INDEX_ERROR = str("Expected feature name or path.  Got %r.")
+
+    def __getitem__(self, name_or_path):
+        """If the feature with the given name or path exists, return
+        its value; otherwise, raise ``KeyError``."""
+        if isinstance(name_or_path, (string_types, Feature)):
+            return dict.__getitem__(self, name_or_path)
+        elif isinstance(name_or_path, tuple):
+            try:
+                val = self
+                for fid in name_or_path:
+                    if not isinstance(val, FeatStruct):
+                        raise KeyError # path contains base value
+                    val = val[fid]
+                return val
+            except (KeyError, IndexError):
+                raise KeyError(name_or_path)
+        else:
+            raise TypeError(self._INDEX_ERROR % name_or_path)
+
+    def get(self, name_or_path, default=None):
+        """If the feature with the given name or path exists, return its
+        value; otherwise, return ``default``."""
+        try: return self[name_or_path]
+        except KeyError: return default
+
+    def __contains__(self, name_or_path):
+        """Return true if a feature with the given name or path exists."""
+        try: self[name_or_path]; return True
+        except KeyError: return False
+
+    def has_key(self, name_or_path):
+        """Return true if a feature with the given name or path exists."""
+        return name_or_path in self
+
+    def __delitem__(self, name_or_path):
+        """If the feature with the given name or path exists, delete
+        its value; otherwise, raise ``KeyError``."""
+        if self._frozen: raise ValueError(_FROZEN_ERROR)
+        if isinstance(name_or_path, (string_types, Feature)):
+            return dict.__delitem__(self, name_or_path)
+        elif isinstance(name_or_path, tuple):
+            if len(name_or_path) == 0:
+                raise ValueError("The path () can not be set")
+            else:
+                parent = self[name_or_path[:-1]]
+                if not isinstance(parent, FeatStruct):
+                    raise KeyError(name_or_path) # path contains base value
+                del parent[name_or_path[-1]]
+        else:
+            raise TypeError(self._INDEX_ERROR % name_or_path)
+
+    def __setitem__(self, name_or_path, value):
+        """Set the value for the feature with the given name or path
+        to ``value``.  If ``name_or_path`` is an invalid path, raise
+        ``KeyError``."""
+        if self._frozen: raise ValueError(_FROZEN_ERROR)
+        if isinstance(name_or_path, (string_types, Feature)):
+            return dict.__setitem__(self, name_or_path, value)
+        elif isinstance(name_or_path, tuple):
+            if len(name_or_path) == 0:
+                raise ValueError("The path () can not be set")
+            else:
+                parent = self[name_or_path[:-1]]
+                if not isinstance(parent, FeatStruct):
+                    raise KeyError(name_or_path) # path contains base value
+                parent[name_or_path[-1]] = value
+        else:
+            raise TypeError(self._INDEX_ERROR % name_or_path)
+
+    clear = _check_frozen(dict.clear)
+    pop = _check_frozen(dict.pop)
+    popitem = _check_frozen(dict.popitem)
+    setdefault = _check_frozen(dict.setdefault)
+
+    def update(self, features=None, **morefeatures):
+        if self._frozen: raise ValueError(_FROZEN_ERROR)
+        if features is None:
+            items = ()
+        elif hasattr(features, 'items') and callable(features.items):
+            items = features.items()
+        elif hasattr(features, '__iter__'):
+            items = features
+        else:
+            raise ValueError('Expected mapping or list of tuples')
+
+        for key, val in items:
+            if not isinstance(key, (string_types, Feature)):
+                raise TypeError('Feature names must be strings')
+            self[key] = val
+        for key, val in morefeatures.items():
+            if not isinstance(key, (string_types, Feature)):
+                raise TypeError('Feature names must be strings')
+            self[key] = val
+
+    ##////////////////////////////////////////////////////////////
+    #{ Copying
+    ##////////////////////////////////////////////////////////////
+
+    def __deepcopy__(self, memo):
+        memo[id(self)] = selfcopy = self.__class__()
+        for (key, val) in self._items():
+            selfcopy[copy.deepcopy(key,memo)] = copy.deepcopy(val,memo)
+        return selfcopy
+
+    ##////////////////////////////////////////////////////////////
+    #{ Uniform Accessor Methods
+    ##////////////////////////////////////////////////////////////
+
+    def _keys(self): return self.keys()
+    def _values(self): return self.values()
+    def _items(self): return self.items()
+
+    ##////////////////////////////////////////////////////////////
+    #{ String Representations
+    ##////////////////////////////////////////////////////////////
+
+    def __str__(self):
+        """
+        Display a multi-line representation of this feature dictionary
+        as an FVM (feature value matrix).
+        """
+        return '\n'.join(self._str(self._find_reentrances({}), {}))
+
+    def _repr(self, reentrances, reentrance_ids):
+        segments = []
+        prefix = ''
+        suffix = ''
+
+        # If this is the first time we've seen a reentrant structure,
+        # then assign it a unique identifier.
+        if reentrances[id(self)]:
+            assert id(self) not in reentrance_ids
+            reentrance_ids[id(self)] = repr(len(reentrance_ids)+1)
+
+        # sorting note: keys are unique strings, so we'll never fall
+        # through to comparing values.
+        for (fname, fval) in sorted(self.items()):
+            display = getattr(fname, 'display', None)
+            if id(fval) in reentrance_ids:
+                segments.append('%s->(%s)' %
+                                (fname, reentrance_ids[id(fval)]))
+            elif (display == 'prefix' and not prefix and
+                  isinstance(fval, (Variable, string_types))):
+                prefix = '%s' % fval
+            elif display == 'slash' and not suffix:
+                if isinstance(fval, Variable):
+                    suffix = '/%s' % fval.name
+                else:
+                    suffix = '/%s' % unicode_repr(fval)
+            elif isinstance(fval, Variable):
+                segments.append('%s=%s' % (fname, fval.name))
+            elif fval is True:
+                segments.append('+%s' % fname)
+            elif fval is False:
+                segments.append('-%s' % fname)
+            elif isinstance(fval, Expression):
+                segments.append('%s=<%s>' % (fname, fval))
+            elif not isinstance(fval, FeatStruct):
+                segments.append('%s=%s' % (fname, unicode_repr(fval)))
+            else:
+                fval_repr = fval._repr(reentrances, reentrance_ids)
+                segments.append('%s=%s' % (fname, fval_repr))
+        # If it's reentrant, then add on an identifier tag.
+        if reentrances[id(self)]:
+            prefix = '(%s)%s' % (reentrance_ids[id(self)], prefix)
+        return '%s[%s]%s' % (prefix, ', '.join(segments), suffix)
+
+    def _str(self, reentrances, reentrance_ids):
+        """
+        :return: A list of lines composing a string representation of
+            this feature dictionary.
+        :param reentrances: A dictionary that maps from the ``id`` of
+            each feature value in self to a boolean value, indicating
+            whether that value is reentrant or not.
+        :param reentrance_ids: A dictionary mapping from each ``id``
+            of a feature value to a unique identifier.  This is modified
+            by ``repr``: the first time a reentrant feature value is
+            displayed, an identifier is added to ``reentrance_ids`` for
+            it.
+        """
+        # If this is the first time we've seen a reentrant structure,
+        # then tack on an id string.
+        if reentrances[id(self)]:
+            assert id(self) not in reentrance_ids
+            reentrance_ids[id(self)] = repr(len(reentrance_ids)+1)
+
+        # Special case: empty feature dict.
+        if len(self) == 0:
+            if reentrances[id(self)]:
+                return ['(%s) []' % reentrance_ids[id(self)]]
+            else:
+                return ['[]']
+
+        # What's the longest feature name?  Use this to align names.
+        maxfnamelen = max(len("%s" % k) for k in self.keys())
+
+        lines = []
+        # sorting note: keys are unique strings, so we'll never fall
+        # through to comparing values.
+        for (fname, fval) in sorted(self.items()):
+            fname = ("%s" % fname).ljust(maxfnamelen)
+            if isinstance(fval, Variable):
+                lines.append('%s = %s' % (fname,fval.name))
+
+            elif isinstance(fval, Expression):
+                lines.append('%s = <%s>' % (fname, fval))
+
+            elif isinstance(fval, FeatList):
+                fval_repr = fval._repr(reentrances, reentrance_ids)
+                lines.append('%s = %s' % (fname, unicode_repr(fval_repr)))
+
+            elif not isinstance(fval, FeatDict):
+                # It's not a nested feature structure -- just print it.
+                lines.append('%s = %s' % (fname, unicode_repr(fval)))
+
+            elif id(fval) in reentrance_ids:
+                # It's a feature structure we've seen before -- print
+                # the reentrance id.
+                lines.append('%s -> (%s)' % (fname, reentrance_ids[id(fval)]))
+
+            else:
+                # It's a new feature structure.  Separate it from
+                # other values by a blank line.
+                if lines and lines[-1] != '': lines.append('')
+
+                # Recursively print the feature's value (fval).
+                fval_lines = fval._str(reentrances, reentrance_ids)
+
+                # Indent each line to make room for fname.
+                fval_lines = [(' '*(maxfnamelen+3))+l for l in fval_lines]
+
+                # Pick which line we'll display fname on, & splice it in.
+                nameline = (len(fval_lines)-1) // 2
+                fval_lines[nameline] = (
+                        fname+' ='+fval_lines[nameline][maxfnamelen+2:])
+
+                # Add the feature structure to the output.
+                lines += fval_lines
+
+                # Separate FeatStructs by a blank line.
+                lines.append('')
+
+        # Get rid of any excess blank lines.
+        if lines[-1] == '': lines.pop()
+
+        # Add brackets around everything.
+        maxlen = max(len(line) for line in lines)
+        lines = ['[ %s%s ]' % (line, ' '*(maxlen-len(line))) for line in lines]
+
+        # If it's reentrant, then add on an identifier tag.
+        if reentrances[id(self)]:
+            idstr = '(%s) ' % reentrance_ids[id(self)]
+            lines = [(' '*len(idstr))+l for l in lines]
+            idline = (len(lines)-1) // 2
+            lines[idline] = idstr + lines[idline][len(idstr):]
+
+        return lines
+
+
+######################################################################
+# Feature List
+######################################################################
+
+class FeatList(FeatStruct, list):
+    """
+    A list of feature values, where each feature value is either a
+    basic value (such as a string or an integer), or a nested feature
+    structure.
+
+    Feature lists may contain reentrant feature values.  A "reentrant
+    feature value" is a single feature value that can be accessed via
+    multiple feature paths.  Feature lists may also be cyclic.
+
+    Two feature lists are considered equal if they assign the same
+    values to all features, and have the same reentrances.
+
+    :see: ``FeatStruct`` for information about feature paths, reentrance,
+        cyclic feature structures, mutability, freezing, and hashing.
+    """
+    def __init__(self, features=()):
+        """
+        Create a new feature list, with the specified features.
+
+        :param features: The initial list of features for this feature
+            list.  If ``features`` is a string, then it is parsed using
+            ``FeatStructReader``.  Otherwise, it should be a sequence
+            of basic values and nested feature structures.
+        """
+        if isinstance(features, string_types):
+            FeatStructReader().fromstring(features, self)
+        else:
+            list.__init__(self, features)
+
+    #////////////////////////////////////////////////////////////
+    #{ List methods
+    #////////////////////////////////////////////////////////////
+    _INDEX_ERROR = "Expected int or feature path.  Got %r."
+
+    def __getitem__(self, name_or_path):
+        if isinstance(name_or_path, integer_types):
+            return list.__getitem__(self, name_or_path)
+        elif isinstance(name_or_path, tuple):
+            try:
+                val = self
+                for fid in name_or_path:
+                    if not isinstance(val, FeatStruct):
+                        raise KeyError # path contains base value
+                    val = val[fid]
+                return val
+            except (KeyError, IndexError):
+                raise KeyError(name_or_path)
+        else:
+            raise TypeError(self._INDEX_ERROR % name_or_path)
+
+    def __delitem__(self, name_or_path):
+        """If the feature with the given name or path exists, delete
+        its value; otherwise, raise ``KeyError``."""
+        if self._frozen: raise ValueError(_FROZEN_ERROR)
+        if isinstance(name_or_path, (integer_types, slice)):
+            return list.__delitem__(self, name_or_path)
+        elif isinstance(name_or_path, tuple):
+            if len(name_or_path) == 0:
+                raise ValueError("The path () can not be set")
+            else:
+                parent = self[name_or_path[:-1]]
+                if not isinstance(parent, FeatStruct):
+                    raise KeyError(name_or_path) # path contains base value
+                del parent[name_or_path[-1]]
+        else:
+            raise TypeError(self._INDEX_ERROR % name_or_path)
+
+    def __setitem__(self, name_or_path, value):
+        """Set the value for the feature with the given name or path
+        to ``value``.  If ``name_or_path`` is an invalid path, raise
+        ``KeyError``."""
+        if self._frozen: raise ValueError(_FROZEN_ERROR)
+        if isinstance(name_or_path, (integer_types, slice)):
+            return list.__setitem__(self, name_or_path, value)
+        elif isinstance(name_or_path, tuple):
+            if len(name_or_path) == 0:
+                raise ValueError("The path () can not be set")
+            else:
+                parent = self[name_or_path[:-1]]
+                if not isinstance(parent, FeatStruct):
+                    raise KeyError(name_or_path) # path contains base value
+                parent[name_or_path[-1]] = value
+        else:
+            raise TypeError(self._INDEX_ERROR % name_or_path)
+
+#    __delslice__ = _check_frozen(list.__delslice__, '               ')
+#    __setslice__ = _check_frozen(list.__setslice__, '               ')
+    __iadd__ = _check_frozen(list.__iadd__)
+    __imul__ = _check_frozen(list.__imul__)
+    append = _check_frozen(list.append)
+    extend = _check_frozen(list.extend)
+    insert = _check_frozen(list.insert)
+    pop = _check_frozen(list.pop)
+    remove = _check_frozen(list.remove)
+    reverse = _check_frozen(list.reverse)
+    sort = _check_frozen(list.sort)
+
+    ##////////////////////////////////////////////////////////////
+    #{ Copying
+    ##////////////////////////////////////////////////////////////
+
+    def __deepcopy__(self, memo):
+        memo[id(self)] = selfcopy = self.__class__()
+        selfcopy.extend(copy.deepcopy(fval,memo) for fval in self)
+        return selfcopy
+
+    ##////////////////////////////////////////////////////////////
+    #{ Uniform Accessor Methods
+    ##////////////////////////////////////////////////////////////
+
+    def _keys(self): return list(range(len(self)))
+    def _values(self): return self
+    def _items(self): return enumerate(self)
+
+    ##////////////////////////////////////////////////////////////
+    #{ String Representations
+    ##////////////////////////////////////////////////////////////
+
+    # Special handling for: reentrances, variables, expressions.
+    def _repr(self, reentrances, reentrance_ids):
+        # If this is the first time we've seen a reentrant structure,
+        # then assign it a unique identifier.
+        if reentrances[id(self)]:
+            assert id(self) not in reentrance_ids
+            reentrance_ids[id(self)] = repr(len(reentrance_ids)+1)
+            prefix = '(%s)' % reentrance_ids[id(self)]
+        else:
+            prefix = ''
+
+        segments = []
+        for fval in self:
+            if id(fval) in reentrance_ids:
+                segments.append('->(%s)' % reentrance_ids[id(fval)])
+            elif isinstance(fval, Variable):
+                segments.append(fval.name)
+            elif isinstance(fval, Expression):
+                segments.append('%s' % fval)
+            elif isinstance(fval, FeatStruct):
+                segments.append(fval._repr(reentrances, reentrance_ids))
+            else:
+                segments.append('%s' % unicode_repr(fval))
+
+        return '%s[%s]' % (prefix, ', '.join(segments))
+
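+# Illustrative sketch: FeatList accepts both integer indices and feature
+# paths, and can be built from the bracketed string syntax (expected
+# behaviour, assuming the default FeatStructReader):
+#
+#     >>> fl = FeatList('[1, 2, [A=a]]')
+#     >>> fl[2]['A']
+#     'a'
+#     >>> fl[(2, 'A')]
+#     'a'
+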
+######################################################################
+# Variables & Bindings
+######################################################################
+
+def substitute_bindings(fstruct, bindings, fs_class='default'):
+    """
+    Return the feature structure that is obtained by replacing each
+    variable bound by ``bindings`` with its binding.  If a variable is
+    aliased to a bound variable, then it will be replaced by that
+    variable's value.  If a variable is aliased to an unbound
+    variable, then it will be replaced by that variable.
+
+    :type bindings: dict(Variable -> any)
+    :param bindings: A dictionary mapping from variables to values.
+    """
+    if fs_class == 'default': fs_class = _default_fs_class(fstruct)
+    fstruct = copy.deepcopy(fstruct)
+    _substitute_bindings(fstruct, bindings, fs_class, set())
+    return fstruct
+
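+# Illustrative sketch of substitute_bindings() (expected behaviour; the
+# example values are invented):
+#
+#     >>> from nltk.sem.logic import Variable
+#     >>> fs = FeatStruct('[A=?x, B=[C=?x]]')
+#     >>> substitute_bindings(fs, {Variable('?x'): 'cat'})
+#     [A='cat', B=[C='cat']]
+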
+def _substitute_bindings(fstruct, bindings, fs_class, visited):
+    # Visit each node only once:
+    if id(fstruct) in visited: return
+    visited.add(id(fstruct))
+
+    if _is_mapping(fstruct): items = fstruct.items()
+    elif _is_sequence(fstruct): items = enumerate(fstruct)
+    else: raise ValueError('Expected mapping or sequence')
+    for (fname, fval) in items:
+        while (isinstance(fval, Variable) and fval in bindings):
+            fval = fstruct[fname] = bindings[fval]
+        if isinstance(fval, fs_class):
+            _substitute_bindings(fval, bindings, fs_class, visited)
+        elif isinstance(fval, SubstituteBindingsI):
+            fstruct[fname] = fval.substitute_bindings(bindings)
+
+def retract_bindings(fstruct, bindings, fs_class='default'):
+    """
+    Return the feature structure that is obtained by replacing each
+    feature structure value that is bound by ``bindings`` with the
+    variable that binds it.  A feature structure value must be
+    identical to a bound value (i.e., have equal id) to be replaced.
+
+    ``bindings`` is modified to point to this new feature structure,
+    rather than the original feature structure.  Feature structure
+    values in ``bindings`` may be modified if they are contained in
+    ``fstruct``.
+    """
+    if fs_class == 'default': fs_class = _default_fs_class(fstruct)
+    (fstruct, new_bindings) = copy.deepcopy((fstruct, bindings))
+    bindings.update(new_bindings)
+    inv_bindings = dict((id(val),var) for (var,val) in bindings.items())
+    _retract_bindings(fstruct, inv_bindings, fs_class, set())
+    return fstruct
+
+def _retract_bindings(fstruct, inv_bindings, fs_class, visited):
+    # Visit each node only once:
+    if id(fstruct) in visited: return
+    visited.add(id(fstruct))
+
+    if _is_mapping(fstruct): items = fstruct.items()
+    elif _is_sequence(fstruct): items = enumerate(fstruct)
+    else: raise ValueError('Expected mapping or sequence')
+    for (fname, fval) in items:
+        if isinstance(fval, fs_class):
+            if id(fval) in inv_bindings:
+                fstruct[fname] = inv_bindings[id(fval)]
+            _retract_bindings(fval, inv_bindings, fs_class, visited)
+
+
+def find_variables(fstruct, fs_class='default'):
+    """
+    :return: The set of variables used by this feature structure.
+    :rtype: set(Variable)
+    """
+    if fs_class == 'default': fs_class = _default_fs_class(fstruct)
+    return _variables(fstruct, set(), fs_class, set())
+
+def _variables(fstruct, vars, fs_class, visited):
+    # Visit each node only once:
+    if id(fstruct) in visited: return
+    visited.add(id(fstruct))
+    if _is_mapping(fstruct): items = fstruct.items()
+    elif _is_sequence(fstruct): items = enumerate(fstruct)
+    else: raise ValueError('Expected mapping or sequence')
+    for (fname, fval) in items:
+        if isinstance(fval, Variable):
+            vars.add(fval)
+        elif isinstance(fval, fs_class):
+            _variables(fval, vars, fs_class, visited)
+        elif isinstance(fval, SubstituteBindingsI):
+            vars.update(fval.variables())
+    return vars
+
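+# Illustrative sketch of find_variables() (expected behaviour; variable
+# names are invented):
+#
+#     >>> sorted(v.name for v in find_variables(FeatStruct('[A=?x, B=[C=?y]]')))
+#     ['?x', '?y']
+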
+def rename_variables(fstruct, vars=None, used_vars=(), new_vars=None,
+                     fs_class='default'):
+    """
+    Return the feature structure that is obtained by replacing
+    any of this feature structure's variables that are in ``vars``
+    with new variables.  The names for these new variables will be
+    names that are not used by any variable in ``vars``, or in
+    ``used_vars``, or in this feature structure.
+
+    :type vars: set
+    :param vars: The set of variables that should be renamed.
+        If not specified, ``find_variables(fstruct)`` is used; i.e., all
+        variables will be given new names.
+    :type used_vars: set
+    :param used_vars: A set of variables whose names should not be
+        used by the new variables.
+    :type new_vars: dict(Variable -> Variable)
+    :param new_vars: A dictionary that is used to hold the mapping
+        from old variables to new variables.  For each variable *v*
+        in this feature structure:
+
+        - If ``new_vars`` maps *v* to *v'*, then *v* will be
+          replaced by *v'*.
+        - If ``new_vars`` does not contain *v*, but ``vars``
+          does contain *v*, then a new entry will be added to
+          ``new_vars``, mapping *v* to the new variable that is used
+          to replace it.
+
+    To consistently rename the variables in a set of feature
+    structures, simply apply rename_variables to each one, using
+    the same dictionary:
+
+        >>> from nltk.featstruct import FeatStruct
+        >>> fstruct1 = FeatStruct('[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]')
+        >>> fstruct2 = FeatStruct('[subj=[agr=[number=?z,gender=?y]], obj=[agr=[number=?z,gender=?y]]]')
+        >>> new_vars = {}  # Maps old vars to alpha-renamed vars
+        >>> fstruct1.rename_variables(new_vars=new_vars)
+        [obj=[agr=[gender=?y2]], subj=[agr=[gender=?y2]]]
+        >>> fstruct2.rename_variables(new_vars=new_vars)
+        [obj=[agr=[gender=?y2, number=?z2]], subj=[agr=[gender=?y2, number=?z2]]]
+
+    If new_vars is not specified, then an empty dictionary is used.
+    """
+    if fs_class == 'default': fs_class = _default_fs_class(fstruct)
+
+    # Default values:
+    if new_vars is None: new_vars = {}
+    if vars is None: vars = find_variables(fstruct, fs_class)
+    else: vars = set(vars)
+
+    # Add our own variables to used_vars.
+    used_vars = find_variables(fstruct, fs_class).union(used_vars)
+
+    # Copy ourselves, and rename variables in the copy.
+    return _rename_variables(copy.deepcopy(fstruct), vars, used_vars,
+                             new_vars, fs_class, set())
+
+def _rename_variables(fstruct, vars, used_vars, new_vars, fs_class, visited):
+    if id(fstruct) in visited: return
+    visited.add(id(fstruct))
+    if _is_mapping(fstruct): items = fstruct.items()
+    elif _is_sequence(fstruct): items = enumerate(fstruct)
+    else: raise ValueError('Expected mapping or sequence')
+    for (fname, fval) in items:
+        if isinstance(fval, Variable):
+            # If it's in new_vars, then rebind it.
+            if fval in new_vars:
+                fstruct[fname] = new_vars[fval]
+            # If it's in vars, pick a new name for it.
+            elif fval in vars:
+                new_vars[fval] = _rename_variable(fval, used_vars)
+                fstruct[fname] = new_vars[fval]
+                used_vars.add(new_vars[fval])
+        elif isinstance(fval, fs_class):
+            _rename_variables(fval, vars, used_vars, new_vars,
+                              fs_class, visited)
+        elif isinstance(fval, SubstituteBindingsI):
+            # Pick new names for any variables in `vars`
+            for var in fval.variables():
+                if var in vars and var not in new_vars:
+                    new_vars[var] = _rename_variable(var, used_vars)
+                    used_vars.add(new_vars[var])
+            # Replace all variables in `new_vars`.
+            fstruct[fname] = fval.substitute_bindings(new_vars)
+    return fstruct
+
+def _rename_variable(var, used_vars):
+    name, n = re.sub(r'\d+$', '', var.name), 2
+    if not name: name = '?'
+    while Variable('%s%s' % (name, n)) in used_vars: n += 1
+    return Variable('%s%s' % (name, n))
+
+def remove_variables(fstruct, fs_class='default'):
+    """
+    :rtype: FeatStruct
+    :return: The feature structure that is obtained by deleting
+        all features whose values are ``Variables``.
+    """
+    if fs_class == 'default': fs_class = _default_fs_class(fstruct)
+    return _remove_variables(copy.deepcopy(fstruct), fs_class, set())
+
+def _remove_variables(fstruct, fs_class, visited):
+    if id(fstruct) in visited:
+        return
+    visited.add(id(fstruct))
+
+    if _is_mapping(fstruct):
+        items = list(fstruct.items())
+    elif _is_sequence(fstruct):
+        items = list(enumerate(fstruct))
+    else:
+        raise ValueError('Expected mapping or sequence')
+
+    for (fname, fval) in items:
+        if isinstance(fval, Variable):
+            del fstruct[fname]
+        elif isinstance(fval, fs_class):
+            _remove_variables(fval, fs_class, visited)
+    return fstruct
+
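+# Illustrative sketch of remove_variables() (expected behaviour; the
+# example structure is invented):
+#
+#     >>> remove_variables(FeatStruct('[A=?x, B=[C=?y, D=d]]'))
+#     [B=[D='d']]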
+
+######################################################################
+# Unification
+######################################################################
+
+@python_2_unicode_compatible
+class _UnificationFailure(object):
+    def __repr__(self):
+        return 'nltk.featstruct.UnificationFailure'
+
+UnificationFailure = _UnificationFailure()
+"""A unique value used to indicate unification failure.  It can be
+   returned by ``Feature.unify_base_values()`` or by custom ``fail()``
+   functions to indicate that unification should fail."""
+
+# The basic unification algorithm:
+#   1. Make copies of self and other (preserving reentrance)
+#   2. Destructively unify self and other
+#   3. Apply forward pointers, to preserve reentrance.
+#   4. Replace bound variables with their values.
+def unify(fstruct1, fstruct2, bindings=None, trace=False,
+          fail=None, rename_vars=True, fs_class='default'):
+    """
+    Unify ``fstruct1`` with ``fstruct2``, and return the resulting feature
+    structure.  This unified feature structure is the minimal
+    feature structure that contains all feature value assignments from both
+    ``fstruct1`` and ``fstruct2``, and that preserves all reentrancies.
+
+    If no such feature structure exists (because ``fstruct1`` and
+    ``fstruct2`` specify incompatible values for some feature), then
+    unification fails, and ``unify`` returns None.
+
+    Bound variables are replaced by their values.  Aliased
+    variables are replaced by their representative variable
+    (if unbound) or the value of their representative variable
+    (if bound).  I.e., if variable *v* is in ``bindings``,
+    then *v* is replaced by ``bindings[v]``.  This will
+    be repeated until the variable is replaced by an unbound
+    variable or a non-variable value.
+
+    Unbound variables are bound when they are unified with
+    values; and aliased when they are unified with variables.
+    I.e., if variable *v* is not in ``bindings``, and is
+    unified with a variable or value *x*, then
+    ``bindings[v]`` is set to *x*.
+
+    If ``bindings`` is unspecified, then all variables are
+    assumed to be unbound.  I.e., ``bindings`` defaults to an
+    empty dict.
+
+        >>> from nltk.featstruct import FeatStruct
+        >>> FeatStruct('[a=?x]').unify(FeatStruct('[b=?x]'))
+        [a=?x, b=?x2]
+
+    :type bindings: dict(Variable -> any)
+    :param bindings: A set of variable bindings to be used and
+        updated during unification.
+    :type trace: bool
+    :param trace: If true, generate trace output.
+    :type rename_vars: bool
+    :param rename_vars: If True, then rename any variables in
+        ``fstruct2`` that are also used in ``fstruct1``, in order to
+        avoid collisions on variable names.
+    """
+    # Decide which class(es) will be treated as feature structures,
+    # for the purposes of unification.
+    if fs_class == 'default':
+        fs_class = _default_fs_class(fstruct1)
+        if _default_fs_class(fstruct2) != fs_class:
+            raise ValueError("Mixing FeatStruct objects with Python "
+                             "dicts and lists is not supported.")
+    assert isinstance(fstruct1, fs_class)
+    assert isinstance(fstruct2, fs_class)
+
+    # If bindings are unspecified, use an empty set of bindings.
+    user_bindings = (bindings is not None)
+    if bindings is None: bindings = {}
+
+    # Make copies of fstruct1 and fstruct2 (since the unification
+    # algorithm is destructive). Do it all at once, to preserve
+    # reentrance links between fstruct1 and fstruct2.  Copy bindings
+    # as well, in case there are any bound vars that contain parts
+    # of fstruct1 or fstruct2.
+    (fstruct1copy, fstruct2copy, bindings_copy) = (
+        copy.deepcopy((fstruct1, fstruct2, bindings)))
+
+    # Copy the bindings back to the original bindings dict.
+    bindings.update(bindings_copy)
+
+    if rename_vars:
+        vars1 = find_variables(fstruct1copy, fs_class)
+        vars2 = find_variables(fstruct2copy, fs_class)
+        _rename_variables(fstruct2copy, vars1, vars2, {}, fs_class, set())
+
+    # Do the actual unification.  If it fails, return None.
+    forward = {}
+    if trace: _trace_unify_start((), fstruct1copy, fstruct2copy)
+    try: result = _destructively_unify(fstruct1copy, fstruct2copy, bindings,
+                                       forward, trace, fail, fs_class, ())
+    except _UnificationFailureError: return None
+
+    # _destructively_unify might return UnificationFailure, e.g. if we
+    # tried to unify a mapping with a sequence.
+    if result is UnificationFailure:
+        if fail is None: return None
+        else: return fail(fstruct1copy, fstruct2copy, ())
+
+    # Replace any feature structure that has a forward pointer
+    # with the target of its forward pointer.
+    result = _apply_forwards(result, forward, fs_class, set())
+    if user_bindings: _apply_forwards_to_bindings(forward, bindings)
+
+    # Replace bound vars with values.
+    _resolve_aliases(bindings)
+    _substitute_bindings(result, bindings, fs_class, set())
+
+    # Return the result.
+    if trace: _trace_unify_succeed((), result)
+    if trace: _trace_bindings((), bindings)
+    return result
+
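+# Illustrative sketch of unify() with a caller-supplied bindings dict
+# (expected behaviour; the example structures are invented):
+#
+#     >>> from nltk.sem.logic import Variable
+#     >>> bindings = {}
+#     >>> unify(FeatStruct('[A=?x, B=b]'), FeatStruct('[A=a]'), bindings)
+#     [A='a', B='b']
+#     >>> bindings[Variable('?x')]
+#     'a'
+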
+class _UnificationFailureError(Exception):
+    """An exception that is used by ``_destructively_unify`` to abort
+    unification when a failure is encountered."""
+
+def _destructively_unify(fstruct1, fstruct2, bindings, forward,
+                         trace, fail, fs_class, path):
+    """
+    Attempt to unify ``fstruct1`` and ``fstruct2`` by modifying them
+    in-place.  If the unification succeeds, then ``fstruct1`` will
+    contain the unified value, the value of ``fstruct2`` is undefined,
+    and forward[id(fstruct2)] is set to fstruct1.  If the unification
+    fails, then a _UnificationFailureError is raised, and the
+    values of ``fstruct1`` and ``fstruct2`` are undefined.
+
+    :param bindings: A dictionary mapping variables to values.
+    :param forward: A dictionary mapping feature structure ids
+        to replacement structures.  When two feature structures
+        are merged, a mapping from one to the other will be added
+        to the forward dictionary; and changes will be made only
+        to the target of the forward dictionary.
+        ``_destructively_unify`` will always 'follow' any links
+        in the forward dictionary for fstruct1 and fstruct2 before
+        actually unifying them.
+    :param trace: If true, generate trace output
+    :param path: The feature path that led us to this unification
+        step.  Used for trace output.
+    """
+    # If fstruct1 is already identical to fstruct2, we're done.
+    # Note: this, together with the forward pointers, ensures
+    # that unification will terminate even for cyclic structures.
+    if fstruct1 is fstruct2:
+        if trace: _trace_unify_identity(path, fstruct1)
+        return fstruct1
+
+    # Set fstruct2's forward pointer to point to fstruct1; this makes
+    # fstruct1 the canonical copy for fstruct2.  Note that we need to
+    # do this before we recurse into any child structures, in case
+    # they're cyclic.
+    forward[id(fstruct2)] = fstruct1
+
+    # Unifying two mappings:
+    if _is_mapping(fstruct1) and _is_mapping(fstruct2):
+        for fname in fstruct1:
+            if getattr(fname, 'default', None) is not None:
+                fstruct2.setdefault(fname, fname.default)
+        for fname in fstruct2:
+            if getattr(fname, 'default', None) is not None:
+                fstruct1.setdefault(fname, fname.default)
+
+        # Unify any values that are defined in both fstruct1 and
+        # fstruct2.  Copy any values that are defined in fstruct2 but
+        # not in fstruct1 to fstruct1.  Note: sorting fstruct2's
+        # features isn't actually necessary; but we do it to give
+        # deterministic behavior, e.g. for tracing.
+        for fname, fval2 in sorted(fstruct2.items()):
+            if fname in fstruct1:
+                fstruct1[fname] = _unify_feature_values(
+                    fname, fstruct1[fname], fval2, bindings,
+                    forward, trace, fail, fs_class, path+(fname,))
+            else:
+                fstruct1[fname] = fval2
+
+        return fstruct1 # Contains the unified value.
+
+    # Unifying two sequences:
+    elif _is_sequence(fstruct1) and _is_sequence(fstruct2):
+        # If the lengths don't match, fail.
+        if len(fstruct1) != len(fstruct2):
+            return UnificationFailure
+
+        # Unify corresponding values in fstruct1 and fstruct2.
+        for findex in range(len(fstruct1)):
+            fstruct1[findex] = _unify_feature_values(
+                findex, fstruct1[findex], fstruct2[findex], bindings,
+                forward, trace, fail, fs_class, path+(findex,))
+
+        return fstruct1 # Contains the unified value.
+
+    # Unifying sequence & mapping: fail.  The failure function
+    # doesn't get a chance to recover in this case.
+    elif ((_is_sequence(fstruct1) or _is_mapping(fstruct1)) and
+          (_is_sequence(fstruct2) or _is_mapping(fstruct2))):
+        return UnificationFailure
+
+    # Unifying anything else: not allowed!
+    raise TypeError('Expected mappings or sequences')
+
+def _unify_feature_values(fname, fval1, fval2, bindings, forward,
+                          trace, fail, fs_class, fpath):
+    """
+    Attempt to unify ``fval1`` and ``fval2``, and return the
+    resulting unified value.  The method of unification will depend on
+    the types of ``fval1`` and ``fval2``:
+
+      1. If they're both feature structures, then destructively
+         unify them (see ``_destructively_unify()``).
+      2. If they're both unbound variables, then alias one variable
+         to the other (by setting bindings[v2]=v1).
+      3. If one is an unbound variable, and the other is a value,
+         then bind the unbound variable to the value.
+      4. If one is a feature structure, and the other is a base value,
+         then fail.
+      5. If they're both base values, then unify them.  By default,
+         this will succeed if they are equal, and fail otherwise.
+    """
+    if trace: _trace_unify_start(fpath, fval1, fval2)
+
+    # Look up the "canonical" copy of fval1 and fval2
+    while id(fval1) in forward: fval1 = forward[id(fval1)]
+    while id(fval2) in forward: fval2 = forward[id(fval2)]
+
+    # If fval1 or fval2 is a bound variable, then
+    # replace it by the variable's bound value.  This
+    # includes aliased variables, which are encoded as
+    # variables bound to other variables.
+    fvar1 = fvar2 = None
+    while isinstance(fval1, Variable) and fval1 in bindings:
+        fvar1 = fval1
+        fval1 = bindings[fval1]
+    while isinstance(fval2, Variable) and fval2 in bindings:
+        fvar2 = fval2
+        fval2 = bindings[fval2]
+
+    # Case 1: Two feature structures (recursive case)
+    if isinstance(fval1, fs_class) and isinstance(fval2, fs_class):
+        result = _destructively_unify(fval1, fval2, bindings, forward,
+                                      trace, fail, fs_class, fpath)
+
+    # Case 2: Two unbound variables (create alias)
+    elif (isinstance(fval1, Variable) and
+          isinstance(fval2, Variable)):
+        if fval1 != fval2: bindings[fval2] = fval1
+        result = fval1
+
+    # Case 3: An unbound variable and a value (bind)
+    elif isinstance(fval1, Variable):
+        bindings[fval1] = fval2
+        result = fval1
+    elif isinstance(fval2, Variable):
+        bindings[fval2] = fval1
+        result = fval2
+
+    # Case 4: A feature structure & a base value (fail)
+    elif isinstance(fval1, fs_class) or isinstance(fval2, fs_class):
+        result = UnificationFailure
+
+    # Case 5: Two base values
+    else:
+        # Case 5a: Feature defines a custom unification method for base values
+        if isinstance(fname, Feature):
+            result = fname.unify_base_values(fval1, fval2, bindings)
+        # Case 5b: Feature value defines custom unification method
+        elif isinstance(fval1, CustomFeatureValue):
+            result = fval1.unify(fval2)
+            # Sanity check: unify value should be symmetric
+            if (isinstance(fval2, CustomFeatureValue) and
+                result != fval2.unify(fval1)):
+                raise AssertionError(
+                    'CustomFeatureValue objects %r and %r disagree '
+                    'about unification value: %r vs. %r' %
+                    (fval1, fval2, result, fval2.unify(fval1)))
+        elif isinstance(fval2, CustomFeatureValue):
+            result = fval2.unify(fval1)
+        # Case 5c: Simple values -- check if they're equal.
+        else:
+            if fval1 == fval2:
+                result = fval1
+            else:
+                result = UnificationFailure
+
+        # If either value was a bound variable, then update the
+        # bindings.  (This is really only necessary if fname is a
+        # Feature or if either value is a CustomFeatureValue.)
+        if result is not UnificationFailure:
+            if fvar1 is not None:
+                bindings[fvar1] = result
+                result = fvar1
+            if fvar2 is not None and fvar2 != fvar1:
+                bindings[fvar2] = result
+                result = fvar2
+
+    # If unification failed, call the failure function; it
+    # might decide to continue anyway.
+    if result is UnificationFailure:
+        if fail is not None: result = fail(fval1, fval2, fpath)
+        if trace: _trace_unify_fail(fpath[:-1], result)
+        if result is UnificationFailure:
+            raise _UnificationFailureError
+
+    # Normalize the result.
+    if isinstance(result, fs_class):
+        result = _apply_forwards(result, forward, fs_class, set())
+
+    if trace: _trace_unify_succeed(fpath, result)
+    if trace and isinstance(result, fs_class):
+        _trace_bindings(fpath, bindings)
+
+    return result
+
+def _apply_forwards_to_bindings(forward, bindings):
+    """
+    Replace any feature structure that has a forward pointer with
+    the target of its forward pointer (to preserve reentrancy).
+    """
+    for (var, value) in bindings.items():
+        while id(value) in forward:
+            value = forward[id(value)]
+        bindings[var] = value
+
+def _apply_forwards(fstruct, forward, fs_class, visited):
+    """
+    Replace any feature structure that has a forward pointer with
+    the target of its forward pointer (to preserve reentrancy).
+    """
+    # Follow our own forward pointers (if any)
+    while id(fstruct) in forward: fstruct = forward[id(fstruct)]
+
+    # Visit each node only once:
+    if id(fstruct) in visited: return
+    visited.add(id(fstruct))
+
+    if _is_mapping(fstruct): items = fstruct.items()
+    elif _is_sequence(fstruct): items = enumerate(fstruct)
+    else: raise ValueError('Expected mapping or sequence')
+    for fname, fval in items:
+        if isinstance(fval, fs_class):
+            # Replace w/ forwarded value.
+            while id(fval) in forward:
+                fval = forward[id(fval)]
+            fstruct[fname] = fval
+            # Recurse to child.
+            _apply_forwards(fval, forward, fs_class, visited)
+
+    return fstruct
+
+def _resolve_aliases(bindings):
+    """
+    Replace any bound aliased vars with their binding; and replace
+    any unbound aliased vars with their representative var.
+    """
+    for (var, value) in bindings.items():
+        while isinstance(value, Variable) and value in bindings:
+            value = bindings[var] = bindings[value]
+
+def _trace_unify_start(path, fval1, fval2):
+    if path == ():
+        print('\nUnification trace:')
+    else:
+        fullname = '.'.join("%s" % n for n in path)
+        print('  '+'|   '*(len(path)-1)+'|')
+        print('  '+'|   '*(len(path)-1)+'| Unify feature: %s' % fullname)
+    print('  '+'|   '*len(path)+' / '+_trace_valrepr(fval1))
+    print('  '+'|   '*len(path)+'|\\ '+_trace_valrepr(fval2))
+def _trace_unify_identity(path, fval1):
+    print('  '+'|   '*len(path)+'|')
+    print('  '+'|   '*len(path)+'| (identical objects)')
+    print('  '+'|   '*len(path)+'|')
+    print('  '+'|   '*len(path)+'+-->'+unicode_repr(fval1))
+def _trace_unify_fail(path, result):
+    if result is UnificationFailure: resume = ''
+    else: resume = ' (nonfatal)'
+    print('  '+'|   '*len(path)+'|   |')
+    print('  '+'X   '*len(path)+'X   X <-- FAIL'+resume)
+def _trace_unify_succeed(path, fval1):
+    # Print the result.
+    print('  '+'|   '*len(path)+'|')
+    print('  '+'|   '*len(path)+'+-->'+unicode_repr(fval1))
+def _trace_bindings(path, bindings):
+    # Print the bindings (if any).
+    if len(bindings) > 0:
+        binditems = sorted(bindings.items(), key=lambda v:v[0].name)
+        bindstr = '{%s}' % ', '.join(
+            '%s: %s' % (var, _trace_valrepr(val))
+            for (var, val) in binditems)
+        print('  '+'|   '*len(path)+'    Bindings: '+bindstr)
+def _trace_valrepr(val):
+    if isinstance(val, Variable):
+        return '%s' % val
+    else:
+        return '%s' % unicode_repr(val)
+
+def subsumes(fstruct1, fstruct2):
+    """
+    Return True if ``fstruct1`` subsumes ``fstruct2``.  I.e., return
+    True if unifying ``fstruct1`` with ``fstruct2`` would result in a
+    feature structure equal to ``fstruct2``.
+
+    :rtype: bool
+    """
+    return fstruct2 == unify(fstruct1, fstruct2)
+
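+# Illustrative sketch of subsumes(): a more general structure subsumes a
+# more specific one, but not vice versa (expected behaviour):
+#
+#     >>> subsumes(FeatStruct('[A=a]'), FeatStruct('[A=a, B=b]'))
+#     True
+#     >>> subsumes(FeatStruct('[A=a, B=b]'), FeatStruct('[A=a]'))
+#     False
+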
+def conflicts(fstruct1, fstruct2, trace=0):
+    """
+    Return a list of the feature paths of all features which are
+    assigned incompatible values by ``fstruct1`` and ``fstruct2``.
+
+    :rtype: list(tuple)
+    """
+    conflict_list = []
+    def add_conflict(fval1, fval2, path):
+        conflict_list.append(path)
+        return fval1
+    unify(fstruct1, fstruct2, fail=add_conflict, trace=trace)
+    return conflict_list
+
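+# Illustrative sketch of conflicts(): each conflict is reported as the
+# feature path where unification would fail (expected behaviour; the
+# example structures are invented):
+#
+#     >>> fs1 = FeatStruct('[agr=[num=sg], cat=NP]')
+#     >>> fs2 = FeatStruct('[agr=[num=pl], cat=NP]')
+#     >>> conflicts(fs1, fs2)
+#     [('agr', 'num')]
+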
+######################################################################
+# Helper Functions
+######################################################################
+
+def _is_mapping(v):
+    return hasattr(v, '__contains__') and hasattr(v, 'keys')
+
+def _is_sequence(v):
+    return (hasattr(v, '__iter__') and hasattr(v, '__len__') and
+            not isinstance(v, string_types))
+
+def _default_fs_class(obj):
+    if isinstance(obj, FeatStruct): return FeatStruct
+    if isinstance(obj, (dict, list)): return (dict, list)
+    else:
+        raise ValueError('To unify objects of type %s, you must specify '
+                         'fs_class explicitly.' % obj.__class__.__name__)
+######################################################################
+# FeatureValueSet & FeatureValueTuple
+######################################################################
+
+class SubstituteBindingsSequence(SubstituteBindingsI):
+    """
+    A mixin class for sequence classes that distributes variables() and
+    substitute_bindings() over the object's elements.
+    """
+    def variables(self):
+        return ([elt for elt in self if isinstance(elt, Variable)] +
+                sum([list(elt.variables()) for elt in self
+                     if isinstance(elt, SubstituteBindingsI)], []))
+
+    def substitute_bindings(self, bindings):
+        return self.__class__([self.subst(v, bindings) for v in self])
+
+    def subst(self, v, bindings):
+        if isinstance(v, SubstituteBindingsI):
+            return v.substitute_bindings(bindings)
+        else:
+            return bindings.get(v, v)
+
+@python_2_unicode_compatible
+class FeatureValueTuple(SubstituteBindingsSequence, tuple):
+    """
+    A base feature value that is a tuple of other base feature values.
+    FeatureValueTuple implements ``SubstituteBindingsI``, so any
+    variable substitutions will be propagated to the elements
+    contained by the tuple.  A ``FeatureValueTuple`` is immutable.
+    """
+    def __repr__(self): # [xx] really use %s here?
+        if len(self) == 0: return '()'
+        return '(%s)' % ', '.join('%s' % (b,) for b in self)
+
+
+@python_2_unicode_compatible
+class FeatureValueSet(SubstituteBindingsSequence, frozenset):
+    """
+    A base feature value that is a set of other base feature values.
+    FeatureValueSet implements ``SubstituteBindingsI``, so any
+    variable substitutions will be propagated to the elements
+    contained by the set.  A ``FeatureValueSet`` is immutable.
+    """
+    def __repr__(self): # [xx] really use %s here?
+        if len(self) == 0: return '{/}' # distinguish from dict.
+        # n.b., we sort the string reprs of our elements, to ensure
+        # that our own repr is deterministic.
+        return '{%s}' % ', '.join(sorted('%s' % (b,) for b in self))
+    __str__ = __repr__
+
+@python_2_unicode_compatible
+class FeatureValueUnion(SubstituteBindingsSequence, frozenset):
+    """
+    A base feature value that represents the union of two or more
+    ``FeatureValueSet`` or ``Variable``.
+    """
+    def __new__(cls, values):
+        # If values contains FeatureValueUnions, then collapse them.
+        values = _flatten(values, FeatureValueUnion)
+
+        # If the resulting list contains no variables, then
+        # use a simple FeatureValueSet instead.
+        if sum(isinstance(v, Variable) for v in values) == 0:
+            values = _flatten(values, FeatureValueSet)
+            return FeatureValueSet(values)
+
+        # If we contain a single variable, return that variable.
+        if len(values) == 1:
+            return list(values)[0]
+
+        # Otherwise, build the FeatureValueUnion.
+        return frozenset.__new__(cls, values)
+
+    def __repr__(self):
+        # n.b., we sort the string reprs of our elements, to ensure
+        # that our own repr is deterministic.  also, note that len(self)
+        # is guaranteed to be 2 or more.
+        return '{%s}' % '+'.join(sorted('%s' % (b,) for b in self))
+
+@python_2_unicode_compatible
+class FeatureValueConcat(SubstituteBindingsSequence, tuple):
+    """
+    A base feature value that represents the concatenation of two or
+    more ``FeatureValueTuple`` or ``Variable``.
+    """
+    def __new__(cls, values):
+        # If values contains FeatureValueConcats, then collapse them.
+        values = _flatten(values, FeatureValueConcat)
+
+        # If the resulting list contains no variables, then
+        # use a simple FeatureValueTuple instead.
+        if sum(isinstance(v, Variable) for v in values) == 0:
+            values = _flatten(values, FeatureValueTuple)
+            return FeatureValueTuple(values)
+
+        # If we contain a single variable, return that variable.
+        if len(values) == 1:
+            return list(values)[0]
+
+        # Otherwise, build the FeatureValueConcat.
+        return tuple.__new__(cls, values)
+
+    def __repr__(self):
+        # n.b.: len(self) is guaranteed to be 2 or more.
+        return '(%s)' % '+'.join('%s' % (b,) for b in self)
+
+
+def _flatten(lst, cls):
+    """
+    Helper function -- return a copy of list, with all elements of
+    type ``cls`` spliced in rather than appended in.
+    """
+    result = []
+    for elt in lst:
+        if isinstance(elt, cls): result.extend(elt)
+        else: result.append(elt)
+    return result
+
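+# Illustrative sketch: in the string syntax read by FeatStructReader,
+# round brackets produce a FeatureValueTuple and curly brackets a
+# FeatureValueSet (expected behaviour; the feature names are invented):
+#
+#     >>> fs = FeatStruct('[vals=(1, 2, 3), tags={a, b}]')
+#     >>> fs['vals']
+#     (1, 2, 3)
+#     >>> fs['tags']
+#     {a, b}
+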
+######################################################################
+# Specialized Features
+######################################################################
+
+@total_ordering
+@python_2_unicode_compatible
+class Feature(object):
+    """
+    A feature identifier that is specialized to carry additional
+    constraints, default values, etc.
+    """
+    def __init__(self, name, default=None, display=None):
+        assert display in (None, 'prefix', 'slash')
+
+        self._name = name # [xx] rename to .identifier?
+        self._default = default # [xx] not implemented yet.
+        self._display = display
+
+        if self._display == 'prefix':
+            self._sortkey = (-1, self._name)
+        elif self._display == 'slash':
+            self._sortkey = (1, self._name)
+        else:
+            self._sortkey = (0, self._name)
+
+    @property
+    def name(self):
+        """The name of this feature."""
+        return self._name
+
+    @property
+    def default(self):
+        """Default value for this feature."""
+        return self._default
+
+    @property
+    def display(self):
+        """Custom display location: can be prefix, or slash."""
+        return self._display
+
+    def __repr__(self):
+        return '*%s*' % self.name
+
+    def __lt__(self, other):
+        if isinstance(other, string_types):
+            return True
+        if not isinstance(other, Feature):
+            raise_unorderable_types("<", self, other)
+        return self._sortkey < other._sortkey
+
+    def __eq__(self, other):
+        return type(self) == type(other) and self._name == other._name
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __hash__(self):
+        return hash(self._name)
+
+    #////////////////////////////////////////////////////////////
+    # These can be overridden by subclasses:
+    #////////////////////////////////////////////////////////////
+
+    def read_value(self, s, position, reentrances, parser):
+        return parser.read_value(s, position, reentrances)
+
+    def unify_base_values(self, fval1, fval2, bindings):
+        """
+        If possible, return a single unified value.  If not, return
+        the value ``UnificationFailure``.
+        """
+        if fval1 == fval2: return fval1
+        else: return UnificationFailure
+
+
+class SlashFeature(Feature):
+    def read_value(self, s, position, reentrances, parser):
+        return parser.read_partial(s, position, reentrances)
+
+class RangeFeature(Feature):
+    RANGE_RE = re.compile(r'(-?\d+):(-?\d+)')
+    def read_value(self, s, position, reentrances, parser):
+        m = self.RANGE_RE.match(s, position)
+        if not m: raise ValueError('range', position)
+        return (int(m.group(1)), int(m.group(2))), m.end()
+
+    def unify_base_values(self, fval1, fval2, bindings):
+        if fval1 is None: return fval2
+        if fval2 is None: return fval1
+        rng = max(fval1[0], fval2[0]), min(fval1[1], fval2[1])
+        if rng[1] < rng[0]: return UnificationFailure
+        return rng
+
+SLASH = SlashFeature('slash', default=False, display='slash')
+TYPE = Feature('type', display='prefix')
+
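+# Illustrative sketch: a Feature with display='prefix' (such as TYPE) is
+# written before the opening bracket in the string syntax (expected
+# behaviour; the example values are invented):
+#
+#     >>> fs = FeatStruct('NP[num=sg]')
+#     >>> fs[TYPE]
+#     'NP'
+#     >>> fs['num']
+#     'sg'
+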
+######################################################################
+# Specialized Feature Values
+######################################################################
+
+@total_ordering
+class CustomFeatureValue(object):
+    """
+    An abstract base class for base values that define a custom
+    unification method.  The custom unification method of
+    ``CustomFeatureValue`` will be used during unification if:
+
+      - The ``CustomFeatureValue`` is unified with another base value.
+      - The ``CustomFeatureValue`` is not the value of a customized
+        ``Feature`` (which defines its own unification method).
+
+    If two ``CustomFeatureValue`` objects are unified with one another
+    during feature structure unification, then the unified base values
+    they return *must* be equal; otherwise, an ``AssertionError`` will
+    be raised.
+
+    Subclasses must define ``unify()``, ``__eq__()`` and ``__lt__()``.
+    Subclasses may also wish to define ``__hash__()``.
+    """
+    def unify(self, other):
+        """
+        If this base value unifies with ``other``, then return the
+        unified value.  Otherwise, return ``UnificationFailure``.
+        """
+        raise NotImplementedError('abstract base class')
+
+    def __eq__(self, other):
+        raise NotImplementedError('abstract base class')
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        raise NotImplementedError('abstract base class')
+
+    def __hash__(self):
+        raise TypeError('%s objects are unhashable' % self.__class__.__name__)
+
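+# Illustrative sketch of a CustomFeatureValue subclass (the class name and
+# semantics below are invented for the example): a value that unifies only
+# with an equal value of the same type.
+#
+#     class ExactValue(CustomFeatureValue):
+#         def __init__(self, value):
+#             self.value = value
+#         def unify(self, other):
+#             # Succeed only on an equal ExactValue; otherwise fail.
+#             if isinstance(other, ExactValue) and other.value == self.value:
+#                 return self
+#             return UnificationFailure
+#         def __eq__(self, other):
+#             return isinstance(other, ExactValue) and other.value == self.value
+#         def __lt__(self, other):
+#             return isinstance(other, ExactValue) and self.value < other.value
+#         def __hash__(self):
+#             return hash(self.value)
+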
+######################################################################
+# Feature Structure Reader
+######################################################################
+
+class FeatStructReader(object):
+    def __init__(self, features=(SLASH, TYPE), fdict_class=FeatStruct,
+                 flist_class=FeatList, logic_parser=None):
+        self._features = dict((f.name,f) for f in features)
+        self._fdict_class = fdict_class
+        self._flist_class = flist_class
+        self._prefix_feature = None
+        self._slash_feature = None
+        for feature in features:
+            if feature.display == 'slash':
+                if self._slash_feature:
+                    raise ValueError('Multiple features w/ display=slash')
+                self._slash_feature = feature
+            if feature.display == 'prefix':
+                if self._prefix_feature:
+                    raise ValueError('Multiple features w/ display=prefix')
+                self._prefix_feature = feature
+        self._features_with_defaults = [feature for feature in features
+                                        if feature.default is not None]
+        if logic_parser is None:
+            logic_parser = LogicParser()
+        self._logic_parser = logic_parser
+
+    def fromstring(self, s, fstruct=None):
+        """
+        Convert a string representation of a feature structure (as
+        displayed by repr) into a ``FeatStruct``.  This process
+        imposes the following restrictions on the string
+        representation:
+
+        - Feature names cannot contain any of the following:
+          whitespace, parentheses, quote marks, equals signs,
+          dashes, commas, and square brackets.  Feature names may
+          not begin with plus signs or minus signs.
+        - Only the following basic feature values are supported:
+          strings, integers, variables, None, and unquoted
+          alphanumeric strings.
+        - For reentrant values, the first mention must specify
+          a reentrance identifier and a value; and any subsequent
+          mentions must use arrows (``'->'``) to reference the
+          reentrance identifier.
+        """
+        s = s.strip()
+        value, position = self.read_partial(s, 0, {}, fstruct)
+        if position != len(s):
+            self._error(s, 'end of string', position)
+        return value
+
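+    # Illustrative sketch: reentrant values are written once with an
+    # identifier and referenced with "->" (expected behaviour; the
+    # example string is invented):
+    #
+    #     >>> fs = FeatStructReader().fromstring('[A=(1)[B=b], C->(1)]')
+    #     >>> fs['A'] is fs['C']
+    #     True
+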
+    _START_FSTRUCT_RE = re.compile(r'\s*(?:\((\d+)\)\s*)?(\??[\w-]+)?(\[)')
+    _END_FSTRUCT_RE = re.compile(r'\s*]\s*')
+    _SLASH_RE = re.compile(r'/')
+    _FEATURE_NAME_RE = re.compile(r'\s*([+-]?)([^\s\(\)<>"\'\-=\[\],]+)\s*')
+    _REENTRANCE_RE = re.compile(r'\s*->\s*')
+    _TARGET_RE = re.compile(r'\s*\((\d+)\)\s*')
+    _ASSIGN_RE = re.compile(r'\s*=\s*')
+    _COMMA_RE = re.compile(r'\s*,\s*')
+    _BARE_PREFIX_RE = re.compile(r'\s*(?:\((\d+)\)\s*)?(\??[\w-]+\s*)()')
+    # This one is used to distinguish fdicts from flists:
+    _START_FDICT_RE = re.compile(r'(%s)|(%s\s*(%s\s*(=|->)|[+-]%s|\]))' % (
+        _BARE_PREFIX_RE.pattern, _START_FSTRUCT_RE.pattern,
+        _FEATURE_NAME_RE.pattern, _FEATURE_NAME_RE.pattern))
+
+    def read_partial(self, s, position=0, reentrances=None, fstruct=None):
+        """
+        Helper function that reads in a feature structure.
+
+        :param s: The string to read.
+        :param position: The position in the string to start parsing.
+        :param reentrances: A dictionary from reentrance ids to values.
+            Defaults to an empty dictionary.
+        :return: A tuple (val, pos) of the feature structure created by
+            parsing and the position where the parsed feature structure ends.
+        :rtype: tuple(FeatStruct, int)
+        """
+        if reentrances is None: reentrances = {}
+        try:
+            return self._read_partial(s, position, reentrances, fstruct)
+        except ValueError as e:
+            if len(e.args) != 2: raise
+            self._error(s, *e.args)
+
+    def _read_partial(self, s, position, reentrances, fstruct=None):
+        # Create the new feature structure
+        if fstruct is None:
+            if self._START_FDICT_RE.match(s, position):
+                fstruct = self._fdict_class()
+            else:
+                fstruct = self._flist_class()
+
+        # Read up to the open bracket.
+        match = self._START_FSTRUCT_RE.match(s, position)
+        if not match:
+            match = self._BARE_PREFIX_RE.match(s, position)
+            if not match:
+                raise ValueError('open bracket or identifier', position)
+        position = match.end()
+
+        # If there was an identifier, record it.
+        if match.group(1):
+            identifier = match.group(1)
+            if identifier in reentrances:
+                raise ValueError('new identifier', match.start(1))
+            reentrances[identifier] = fstruct
+
+        if isinstance(fstruct, FeatDict):
+            fstruct.clear()
+            return self._read_partial_featdict(s, position, match,
+                                                reentrances, fstruct)
+        else:
+            del fstruct[:]
+            return self._read_partial_featlist(s, position, match,
+                                                reentrances, fstruct)
+
+    def _read_partial_featlist(self, s, position, match,
+                                reentrances, fstruct):
+        # Prefix features are not allowed:
+        if match.group(2): raise ValueError('open bracket')
+        # Bare prefixes are not allowed:
+        if not match.group(3): raise ValueError('open bracket')
+
+        # Build a list of the features defined by the structure.
+        while position < len(s):
+            # Check for the close bracket.
+            match = self._END_FSTRUCT_RE.match(s, position)
+            if match is not None:
+                return fstruct, match.end()
+
+            # Reentrances have the form "-> (target)"
+            match = self._REENTRANCE_RE.match(s, position)
+            if match:
+                position = match.end()
+                match = self._TARGET_RE.match(s, position)
+                if not match: raise ValueError('identifier', position)
+                target = match.group(1)
+                if target not in reentrances:
+                    raise ValueError('bound identifier', position)
+                position = match.end()
+                fstruct.append(reentrances[target])
+
+            # Anything else is a value.
+            else:
+                value, position = (
+                    self._read_value(0, s, position, reentrances))
+                fstruct.append(value)
+
+            # If there's a close bracket, handle it at the top of the loop.
+            if self._END_FSTRUCT_RE.match(s, position):
+                continue
+
+            # Otherwise, there should be a comma
+            match = self._COMMA_RE.match(s, position)
+            if match is None: raise ValueError('comma', position)
+            position = match.end()
+
+        # We never saw a close bracket.
+        raise ValueError('close bracket', position)
+
+    def _read_partial_featdict(self, s, position, match,
+                                reentrances, fstruct):
+        # If there was a prefix feature, record it.
+        if match.group(2):
+            if self._prefix_feature is None:
+                raise ValueError('open bracket or identifier', match.start(2))
+            prefixval = match.group(2).strip()
+            if prefixval.startswith('?'):
+                prefixval = Variable(prefixval)
+            fstruct[self._prefix_feature] = prefixval
+
+        # If group 3 is empty, then we just have a bare prefix, so
+        # we're done.
+        if not match.group(3):
+            return self._finalize(s, match.end(), reentrances, fstruct)
+
+        # Build a list of the features defined by the structure.
+        # Each feature has one of the four following forms:
+        #     name = value
+        #     name -> (target)
+        #     +name
+        #     -name
+        while position < len(s):
+            # Use these variables to hold info about each feature:
+            name = value = None
+
+            # Check for the close bracket.
+            match = self._END_FSTRUCT_RE.match(s, position)
+            if match is not None:
+                return self._finalize(s, match.end(), reentrances, fstruct)
+
+            # Get the feature name's name
+            match = self._FEATURE_NAME_RE.match(s, position)
+            if match is None: raise ValueError('feature name', position)
+            name = match.group(2)
+            position = match.end()
+
+            # Check if it's a special feature.
+            if name[0] == '*' and name[-1] == '*':
+                name = self._features.get(name[1:-1])
+                if name is None:
+                    raise ValueError('known special feature', match.start(2))
+
+            # Check if this feature has a value already.
+            if name in fstruct:
+                raise ValueError('new name', match.start(2))
+
+            # Boolean value ("+name" or "-name")
+            if match.group(1) == '+': value = True
+            if match.group(1) == '-': value = False
+
+            # Reentrance link ("-> (target)")
+            if value is None:
+                match = self._REENTRANCE_RE.match(s, position)
+                if match is not None:
+                    position = match.end()
+                    match = self._TARGET_RE.match(s, position)
+                    if not match:
+                        raise ValueError('identifier', position)
+                    target = match.group(1)
+                    if target not in reentrances:
+                        raise ValueError('bound identifier', position)
+                    position = match.end()
+                    value = reentrances[target]
+
+            # Assignment ("= value").
+            if value is None:
+                match = self._ASSIGN_RE.match(s, position)
+                if match:
+                    position = match.end()
+                    value, position = (
+                        self._read_value(name, s, position, reentrances))
+                # None of the above: error.
+                else:
+                    raise ValueError('equals sign', position)
+
+            # Store the value.
+            fstruct[name] = value
+
+            # If there's a close bracket, handle it at the top of the loop.
+            if self._END_FSTRUCT_RE.match(s, position):
+                continue
+
+            # Otherwise, there should be a comma
+            match = self._COMMA_RE.match(s, position)
+            if match is None: raise ValueError('comma', position)
+            position = match.end()
+
+        # We never saw a close bracket.
+        raise ValueError('close bracket', position)
+
+    def _finalize(self, s, pos, reentrances, fstruct):
+        """
+        Called when we see the close brace -- checks for a slash feature,
+        and adds in default values.
+        """
+        # Add the slash feature (if any)
+        match = self._SLASH_RE.match(s, pos)
+        if match:
+            name = self._slash_feature
+            v, pos = self._read_value(name, s, match.end(), reentrances)
+            fstruct[name] = v
+        ## Add any default features.  -- handle in unification instead?
+        #for feature in self._features_with_defaults:
+        #    fstruct.setdefault(feature, feature.default)
+        # Return the value.
+        return fstruct, pos
+
+    def _read_value(self, name, s, position, reentrances):
+        if isinstance(name, Feature):
+            return name.read_value(s, position, reentrances, self)
+        else:
+            return self.read_value(s, position, reentrances)
+
+    def read_value(self, s, position, reentrances):
+        for (handler, regexp) in self.VALUE_HANDLERS:
+            match = regexp.match(s, position)
+            if match:
+                handler_func = getattr(self, handler)
+                return handler_func(s, position, reentrances, match)
+        raise ValueError('value', position)
+
+    def _error(self, s, expected, position):
+        lines = s.split('\n')
+        while position > len(lines[0]):
+            position -= len(lines.pop(0))+1 # +1 for the newline.
+        estr = ('Error parsing feature structure\n    ' +
+                lines[0] + '\n    ' + ' '*position + '^ ' +
+                'Expected %s' % expected)
+        raise ValueError(estr)
+
+    #////////////////////////////////////////////////////////////
+    #{ Value Readers
+    #////////////////////////////////////////////////////////////
+
+    #: A table indicating how feature values should be processed.  Each
+    #: entry in the table is a pair (handler, regexp).  The first entry
+    #: with a matching regexp will have its handler called.  Handlers
+    #: should have the following signature::
+    #:
+    #:    def handler(s, position, reentrances, match): ...
+    #:
+    #: and should return a tuple (value, position), where position is
+    #: the string position where the value ended.  (n.b.: order is
+    #: important here!)
+    VALUE_HANDLERS = [
+        ('read_fstruct_value', _START_FSTRUCT_RE),
+        ('read_var_value', re.compile(r'\?[a-zA-Z_][a-zA-Z0-9_]*')),
+        ('read_str_value', re.compile("[uU]?[rR]?(['\"])")),
+        ('read_int_value', re.compile(r'-?\d+')),
+        ('read_sym_value', re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*')),
+        ('read_app_value', re.compile(r'<(app)\((\?[a-z][a-z]*)\s*,'
+                                       r'\s*(\?[a-z][a-z]*)\)>')),
+#       ('read_logic_value', re.compile(r'<([^>]*)>')),
+        #lazily match any character after '<' until we hit a '>' not preceded by '-'
+        ('read_logic_value', re.compile(r'<(.*?)(?<!-)>')),
+        ('read_set_value', re.compile(r'{')),
+        ('read_tuple_value', re.compile(r'\(')),
+        ]
+
+    def read_fstruct_value(self, s, position, reentrances, match):
+        return self.read_partial(s, position, reentrances)
+
+    def read_str_value(self, s, position, reentrances, match):
+        return read_str(s, position)
+
+    def read_int_value(self, s, position, reentrances, match):
+        return int(match.group()), match.end()
+
+    # Note: the '?' is included in the variable name.
+    def read_var_value(self, s, position, reentrances, match):
+        return Variable(match.group()), match.end()
+
+    _SYM_CONSTS = {'None':None, 'True':True, 'False':False}
+    def read_sym_value(self, s, position, reentrances, match):
+        val, end = match.group(), match.end()
+        return self._SYM_CONSTS.get(val, val), end
+
+    def read_app_value(self, s, position, reentrances, match):
+        """Mainly included for backwards compat."""
+        return self._logic_parser.parse('%s(%s)' % match.group(2,3)), match.end()
+
+    def read_logic_value(self, s, position, reentrances, match):
+        try:
+            try:
+                expr = self._logic_parser.parse(match.group(1))
+            except LogicalExpressionException:
+                raise ValueError()
+            return expr, match.end()
+        except ValueError:
+            raise ValueError('logic expression', match.start(1))
+
+    def read_tuple_value(self, s, position, reentrances, match):
+        return self._read_seq_value(s, position, reentrances, match, ')',
+                                     FeatureValueTuple, FeatureValueConcat)
+
+    def read_set_value(self, s, position, reentrances, match):
+        return self._read_seq_value(s, position, reentrances, match, '}',
+                                     FeatureValueSet, FeatureValueUnion)
+
+    def _read_seq_value(self, s, position, reentrances, match,
+                         close_paren, seq_class, plus_class):
+        """
+        Helper function used by read_tuple_value and read_set_value.
+        """
+        cp = re.escape(close_paren)
+        position = match.end()
+        # Special syntax for empty tuples:
+        m = re.compile(r'\s*/?\s*%s' % cp).match(s, position)
+        if m: return seq_class(), m.end()
+        # Read values:
+        values = []
+        seen_plus = False
+        while True:
+            # Close paren: return value.
+            m = re.compile(r'\s*%s' % cp).match(s, position)
+            if m:
+                if seen_plus: return plus_class(values), m.end()
+                else: return seq_class(values), m.end()
+
+            # Read the next value.
+            val, position = self.read_value(s, position, reentrances)
+            values.append(val)
+
+            # Comma or looking at close paren
+            m = re.compile(r'\s*(,|\+|(?=%s))\s*' % cp).match(s, position)
+            if not m: raise ValueError("',' or '+' or '%s'" % cp, position)
+            if m.group(1) == '+': seen_plus = True
+            position = m.end()
+
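+# --- Editor's usage sketch (not part of upstream NLTK) ---------------------
+# FeatStruct delegates string parsing to the reader above, so this shows the
+# VALUE_HANDLERS dispatch in action: an integer, a quoted string and a
+# '?'-variable are each picked up by their matching handler.  The helper name
+# is an illustrative assumption and the function is never called at import time.
+def _editor_sketch_value_handlers():
+    fs = FeatStruct("[num=3, name='Kim', agr=?x]")
+    assert fs['num'] == 3                    # handled by read_int_value
+    assert fs['name'] == 'Kim'               # handled by read_str_value
+    assert fs['agr'] == Variable('?x')       # read_var_value keeps the '?' in the name
+    return fs
+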
+######################################################################
+#{ Demo
+######################################################################
+
+def display_unification(fs1, fs2, indent='  '):
+    # Print the two input feature structures, side by side.
+    fs1_lines = ("%s" % fs1).split('\n')
+    fs2_lines = ("%s" % fs2).split('\n')
+    if len(fs1_lines) > len(fs2_lines):
+        blankline = '['+' '*(len(fs2_lines[0])-2)+']'
+        fs2_lines += [blankline]*len(fs1_lines)
+    else:
+        blankline = '['+' '*(len(fs1_lines[0])-2)+']'
+        fs1_lines += [blankline]*len(fs2_lines)
+    for (fs1_line, fs2_line) in zip(fs1_lines, fs2_lines):
+        print(indent + fs1_line + '   ' + fs2_line)
+    print(indent+'-'*len(fs1_lines[0])+'   '+'-'*len(fs2_lines[0]))
+
+    linelen = len(fs1_lines[0])*2+3
+    print(indent+'|               |'.center(linelen))
+    print(indent+'+-----UNIFY-----+'.center(linelen))
+    print(indent+'|'.center(linelen))
+    print(indent+'V'.center(linelen))
+
+    bindings = {}
+
+    result = fs1.unify(fs2, bindings)
+    if result is None:
+        print(indent+'(FAILED)'.center(linelen))
+    else:
+        print('\n'.join(indent+l.center(linelen)
+                         for l in ("%s" % result).split('\n')))
+        if bindings:  # unify() fills this dict with any variable bindings it makes
+            print(repr(bindings).center(linelen))
+    return result
+
+def interactive_demo(trace=False):
+    import random, sys
+
+    HELP = '''
+    1-%d: Select the corresponding feature structure
+    q: Quit
+    t: Turn tracing on or off
+    l: List all feature structures
+    ?: Help
+    '''
+
+    print('''
+    This demo will repeatedly present you with a list of feature
+    structures, and ask you to choose two for unification.  Whenever a
+    new feature structure is generated, it is added to the list of
+    choices that you can pick from.  However, since this can be a
+    large number of feature structures, the demo will only print out a
+    random subset for you to choose between at a given time.  If you
+    want to see the complete lists, type "l".  For a list of valid
+    commands, type "?".
+    ''')
+    print('Press "Enter" to continue...')
+    sys.stdin.readline()
+
+    fstruct_strings = [
+        '[agr=[number=sing, gender=masc]]',
+        '[agr=[gender=masc, person=3]]',
+        '[agr=[gender=fem, person=3]]',
+        '[subj=[agr=(1)[]], agr->(1)]',
+        '[obj=?x]', '[subj=?x]',
+        '[/=None]', '[/=NP]',
+        '[cat=NP]', '[cat=VP]', '[cat=PP]',
+        '[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]',
+        '[gender=masc, agr=?C]',
+        '[gender=?S, agr=[gender=?S,person=3]]'
+        ]
+
+    all_fstructs = [(i, FeatStruct(fstruct_strings[i]))
+                    for i in range(len(fstruct_strings))]
+
+    def list_fstructs(fstructs):
+        for i, fstruct in fstructs:
+            print()
+            lines = ("%s" % fstruct).split('\n')
+            print('%3d: %s' % (i+1, lines[0]))
+            for line in lines[1:]: print('     '+line)
+        print()
+
+
+    while True:
+        # Pick 5 feature structures at random from the master list.
+        MAX_CHOICES = 5
+        if len(all_fstructs) > MAX_CHOICES:
+            fstructs = sorted(random.sample(all_fstructs, MAX_CHOICES))
+        else:
+            fstructs = all_fstructs
+
+        print('_'*75)
+
+        print('Choose two feature structures to unify:')
+        list_fstructs(fstructs)
+
+        selected = [None,None]
+        for (nth,i) in (('First',0), ('Second',1)):
+            while selected[i] is None:
+                print(('%s feature structure (1-%d,q,t,l,?): '
+                       % (nth, len(all_fstructs))), end=' ')
+                try:
+                    input = sys.stdin.readline().strip()
+                    if input in ('q', 'Q', 'x', 'X'): return
+                    if input in ('t', 'T'):
+                        trace = not trace
+                        print('   Trace = %s' % trace)
+                        continue
+                    if input in ('h', 'H', '?'):
+                        print(HELP % len(fstructs)); continue
+                    if input in ('l', 'L'):
+                        list_fstructs(all_fstructs); continue
+                    num = int(input)-1
+                    selected[i] = all_fstructs[num][1]
+                    print()
+                except (ValueError, IndexError):
+                    print('Bad feature structure number')
+                    continue
+
+        if trace:
+            result = selected[0].unify(selected[1], trace=1)
+        else:
+            result = display_unification(selected[0], selected[1])
+        if result is not None:
+            for i, fstruct in all_fstructs:
+                if repr(result) == repr(fstruct): break
+            else:
+                all_fstructs.append((len(all_fstructs), result))
+
+        print('\nType "Enter" to continue unifying; or "q" to quit.')
+        input = sys.stdin.readline().strip()
+        if input in ('q', 'Q', 'x', 'X'): return
+
+def demo(trace=False):
+    """
+    Just for testing
+    """
+    #import random
+
+    # processor breaks with values like '3rd'
+    fstruct_strings = [
+        '[agr=[number=sing, gender=masc]]',
+        '[agr=[gender=masc, person=3]]',
+        '[agr=[gender=fem, person=3]]',
+        '[subj=[agr=(1)[]], agr->(1)]',
+        '[obj=?x]', '[subj=?x]',
+        '[/=None]', '[/=NP]',
+        '[cat=NP]', '[cat=VP]', '[cat=PP]',
+        '[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]',
+        '[gender=masc, agr=?C]',
+        '[gender=?S, agr=[gender=?S,person=3]]'
+    ]
+    all_fstructs = [FeatStruct(fss) for fss in fstruct_strings]
+    #MAX_CHOICES = 5
+    #if len(all_fstructs) > MAX_CHOICES:
+        #fstructs = random.sample(all_fstructs, MAX_CHOICES)
+        #fstructs.sort()
+    #else:
+        #fstructs = all_fstructs
+
+    for fs1 in all_fstructs:
+        for fs2 in all_fstructs:
+            print("\n*******************\nfs1 is:\n%s\n\nfs2 is:\n%s\n\nresult is:\n%s" % (fs1, fs2, unify(fs1, fs2)))
+
+
+if __name__ == '__main__':
+    demo()
+
+__all__ = ['FeatStruct', 'FeatDict', 'FeatList', 'unify', 'subsumes', 'conflicts',
+           'Feature', 'SlashFeature', 'RangeFeature', 'SLASH', 'TYPE',
+           'FeatStructReader']
diff --git a/nlp_resource_data/nltk/featstruct.pyc b/nlp_resource_data/nltk/featstruct.pyc
new file mode 100755 (executable)
index 0000000..08e80ca
Binary files /dev/null and b/nlp_resource_data/nltk/featstruct.pyc differ
diff --git a/nlp_resource_data/nltk/grammar.py b/nlp_resource_data/nltk/grammar.py
new file mode 100755 (executable)
index 0000000..6c09500
--- /dev/null
@@ -0,0 +1,1532 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Context Free Grammars
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+#         Jason Narad <jason.narad@gmail.com>
+#         Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+"""
+Basic data classes for representing context free grammars.  A
+"grammar" specifies which trees can represent the structure of a
+given text.  Each of these trees is called a "parse tree" for the
+text (or simply a "parse").  In a "context free" grammar, the set of
+parse trees for any piece of a text can depend only on that piece, and
+not on the rest of the text (i.e., the piece's context).  Context free
+grammars are often used to find possible syntactic structures for
+sentences.  In this context, the leaves of a parse tree are word
+tokens; and the node values are phrasal categories, such as ``NP``
+and ``VP``.
+
+The ``CFG`` class is used to encode context free grammars.  Each
+``CFG`` consists of a start symbol and a set of productions.
+The "start symbol" specifies the root node value for parse trees.  For example,
+the start symbol for syntactic parsing is usually ``S``.  Start
+symbols are encoded using the ``Nonterminal`` class, which is discussed
+below.
+
+A Grammar's "productions" specify what parent-child relationships a parse
+tree can contain.  Each production specifies that a particular
+node can be the parent of a particular set of children.  For example,
+the production ``<S> -> <NP> <VP>`` specifies that an ``S`` node can
+be the parent of an ``NP`` node and a ``VP`` node.
+
+Grammar productions are implemented by the ``Production`` class.
+Each ``Production`` consists of a left hand side and a right hand
+side.  The "left hand side" is a ``Nonterminal`` that specifies the
+node type for a potential parent; and the "right hand side" is a list
+that specifies allowable children for that parent.  This list
+consists of ``Nonterminals`` and text types: each ``Nonterminal``
+indicates that the corresponding child may be a ``TreeToken`` with the
+specified node type; and each text type indicates that the
+corresponding child may be a ``Token`` with that type.
+
+The ``Nonterminal`` class is used to distinguish node values from leaf
+values.  This prevents the grammar from accidentally using a leaf
+value (such as the English word "A") as the node of a subtree.  Within
+a ``CFG``, all node values are wrapped in the ``Nonterminal``
+class. Note, however, that the trees that are specified by the grammar do
+*not* include these ``Nonterminal`` wrappers.
+
+Grammars can also be given a more procedural interpretation.  According to
+this interpretation, a Grammar specifies any tree structure *tree* that
+can be produced by the following procedure:
+
+| Set tree to the start symbol
+| Repeat until tree contains no more nonterminal leaves:
+|   Choose a production prod whose left hand side
+|     lhs is a nonterminal leaf of tree.
+|   Replace the nonterminal leaf with a subtree, whose node
+|     value is the value wrapped by the nonterminal lhs, and
+|     whose children are the right hand side of prod.
+
+The operation of replacing the left hand side (*lhs*) of a production
+with the right hand side (*rhs*) in a tree (*tree*) is known as
+"expanding" *lhs* to *rhs* in *tree*.
+"""
+from __future__ import print_function, unicode_literals, division
+
+import re
+from functools import total_ordering
+
+from six import string_types
+
+from nltk.util import transitive_closure, invert_graph
+from nltk.compat import python_2_unicode_compatible, unicode_repr
+from nltk.internals import raise_unorderable_types
+
+from nltk.probability import ImmutableProbabilisticMixIn
+from nltk.featstruct import FeatStruct, FeatDict, FeatStructReader, SLASH, TYPE
+
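+# --- Editor's sketch (not part of upstream NLTK) ----------------------------
+# A minimal illustration of the "expansion" procedure described in the module
+# docstring: starting from the start symbol, each nonterminal leaf is replaced
+# by the right-hand side of one of its productions until only terminals remain.
+# The helper name, the use of random.choice and the step cut-off are
+# illustrative assumptions; it also assumes every nonterminal has at least one
+# production.  It is never called at import time.
+def _editor_sketch_expand(grammar, max_steps=20):
+    import random
+    symbols = [grammar.start()]
+    for _ in range(max_steps):
+        if all(is_terminal(sym) for sym in symbols):
+            break
+        expanded = []
+        for sym in symbols:
+            if is_nonterminal(sym):
+                production = random.choice(grammar.productions(lhs=sym))
+                expanded.extend(production.rhs())
+            else:
+                expanded.append(sym)
+        symbols = expanded
+    return symbols
+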
+#################################################################
+# Nonterminal
+#################################################################
+
+@total_ordering
+@python_2_unicode_compatible
+class Nonterminal(object):
+    """
+    A non-terminal symbol for a context free grammar.  ``Nonterminal``
+    is a wrapper class for node values; it is used by ``Production``
+    objects to distinguish node values from leaf values.
+    The node value that is wrapped by a ``Nonterminal`` is known as its
+    "symbol".  Symbols are typically strings representing phrasal
+    categories (such as ``"NP"`` or ``"VP"``).  However, more complex
+    symbol types are sometimes used (e.g., for lexicalized grammars).
+    Since symbols are node values, they must be immutable and
+    hashable.  Two ``Nonterminals`` are considered equal if their
+    symbols are equal.
+
+    :see: ``CFG``, ``Production``
+    :type _symbol: any
+    :ivar _symbol: The node value corresponding to this
+        ``Nonterminal``.  This value must be immutable and hashable.
+    """
+    def __init__(self, symbol):
+        """
+        Construct a new non-terminal from the given symbol.
+
+        :type symbol: any
+        :param symbol: The node value corresponding to this
+            ``Nonterminal``.  This value must be immutable and
+            hashable.
+        """
+        self._symbol = symbol
+        self._hash = hash(symbol)
+
+    def symbol(self):
+        """
+        Return the node value corresponding to this ``Nonterminal``.
+
+        :rtype: (any)
+        """
+        return self._symbol
+
+    def __eq__(self, other):
+        """
+        Return True if this non-terminal is equal to ``other``.  In
+        particular, return True if ``other`` is a ``Nonterminal``
+        and this non-terminal's symbol is equal to ``other`` 's symbol.
+
+        :rtype: bool
+        """
+        return type(self) == type(other) and self._symbol == other._symbol
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        if not isinstance(other, Nonterminal):
+            raise_unorderable_types("<", self, other)
+        return self._symbol < other._symbol
+
+    def __hash__(self):
+        return self._hash
+
+    def __repr__(self):
+        """
+        Return a string representation for this ``Nonterminal``.
+
+        :rtype: str
+        """
+        if isinstance(self._symbol, string_types):
+            return '%s' % self._symbol
+        else:
+            return '%s' % unicode_repr(self._symbol)
+
+    def __str__(self):
+        """
+        Return a string representation for this ``Nonterminal``.
+
+        :rtype: str
+        """
+        if isinstance(self._symbol, string_types):
+            return '%s' % self._symbol
+        else:
+            return '%s' % unicode_repr(self._symbol)
+
+    def __div__(self, rhs):
+        """
+        Return a new nonterminal whose symbol is ``A/B``, where ``A`` is
+        the symbol for this nonterminal, and ``B`` is the symbol for rhs.
+
+        :param rhs: The nonterminal used to form the right hand side
+            of the new nonterminal.
+        :type rhs: Nonterminal
+        :rtype: Nonterminal
+        """
+        return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))
+
+
+    def __truediv__(self, rhs):
+        """
+        Return a new nonterminal whose symbol is ``A/B``, where ``A`` is
+        the symbol for this nonterminal, and ``B`` is the symbol for rhs.
+        This function allows use of the slash ``/`` operator with
+        the future import of division.
+
+        :param rhs: The nonterminal used to form the right hand side
+            of the new nonterminal.
+        :type rhs: Nonterminal
+        :rtype: Nonterminal
+        """
+        return self.__div__(rhs)
+
+def nonterminals(symbols):
+    """
+    Given a string containing a list of symbol names, return a list of
+    ``Nonterminals`` constructed from those symbols.
+
+    :param symbols: The symbol name string.  This string can be
+        delimited by either spaces or commas.
+    :type symbols: str
+    :return: A list of ``Nonterminals`` constructed from the symbol
+        names given in ``symbols``.  The ``Nonterminals`` are sorted
+        in the same order as the symbol names.
+    :rtype: list(Nonterminal)
+    """
+    if ',' in symbols: symbol_list = symbols.split(',')
+    else: symbol_list = symbols.split()
+    return [Nonterminal(s.strip()) for s in symbol_list]
+
+class FeatStructNonterminal(FeatDict, Nonterminal):
+    """A feature structure that's also a nonterminal.  It acts as its
+    own symbol, and automatically freezes itself when hashed."""
+    def __hash__(self):
+        self.freeze()
+        return FeatStruct.__hash__(self)
+    def symbol(self):
+        return self
+
+def is_nonterminal(item):
+    """
+    :return: True if the item is a ``Nonterminal``.
+    :rtype: bool
+    """
+    return isinstance(item, Nonterminal)
+
+
+#################################################################
+# Terminals
+#################################################################
+
+def is_terminal(item):
+    """
+    Return True if the item is a terminal, which currently is
+    if it is hashable and not a ``Nonterminal``.
+
+    :rtype: bool
+    """
+    return hasattr(item, '__hash__') and not isinstance(item, Nonterminal)
+
+
+#################################################################
+# Productions
+#################################################################
+
+@total_ordering
+@python_2_unicode_compatible
+class Production(object):
+    """
+    A grammar production.  Each production maps a single symbol
+    on the "left-hand side" to a sequence of symbols on the
+    "right-hand side".  (In the case of context-free productions,
+    the left-hand side must be a ``Nonterminal``, and the right-hand
+    side is a sequence of terminals and ``Nonterminals``.)
+    "terminals" can be any immutable hashable object that is
+    not a ``Nonterminal``.  Typically, terminals are strings
+    representing words, such as ``"dog"`` or ``"under"``.
+
+    :see: ``CFG``
+    :see: ``DependencyGrammar``
+    :see: ``Nonterminal``
+    :type _lhs: Nonterminal
+    :ivar _lhs: The left-hand side of the production.
+    :type _rhs: tuple(Nonterminal, terminal)
+    :ivar _rhs: The right-hand side of the production.
+    """
+
+    def __init__(self, lhs, rhs):
+        """
+        Construct a new ``Production``.
+
+        :param lhs: The left-hand side of the new ``Production``.
+        :type lhs: Nonterminal
+        :param rhs: The right-hand side of the new ``Production``.
+        :type rhs: sequence(Nonterminal and terminal)
+        """
+        if isinstance(rhs, string_types):
+            raise TypeError('production right hand side should be a list, '
+                            'not a string')
+        self._lhs = lhs
+        self._rhs = tuple(rhs)
+        self._hash = hash((self._lhs, self._rhs))
+
+    def lhs(self):
+        """
+        Return the left-hand side of this ``Production``.
+
+        :rtype: Nonterminal
+        """
+        return self._lhs
+
+    def rhs(self):
+        """
+        Return the right-hand side of this ``Production``.
+
+        :rtype: sequence(Nonterminal and terminal)
+        """
+        return self._rhs
+
+    def __len__(self):
+        """
+        Return the length of the right-hand side.
+
+        :rtype: int
+        """
+        return len(self._rhs)
+
+    def is_nonlexical(self):
+        """
+        Return True if the right-hand side only contains ``Nonterminals``
+
+        :rtype: bool
+        """
+        return all(is_nonterminal(n) for n in self._rhs)
+
+    def is_lexical(self):
+        """
+        Return True if the right-hand side contains at least one terminal token.
+
+        :rtype: bool
+        """
+        return not self.is_nonlexical()
+
+    def __str__(self):
+        """
+        Return a verbose string representation of the ``Production``.
+
+        :rtype: str
+        """
+        result = '%s -> ' % unicode_repr(self._lhs)
+        result += " ".join(unicode_repr(el) for el in self._rhs)
+        return result
+
+    def __repr__(self):
+        """
+        Return a concise string representation of the ``Production``.
+
+        :rtype: str
+        """
+        return '%s' % self
+
+    def __eq__(self, other):
+        """
+        Return True if this ``Production`` is equal to ``other``.
+
+        :rtype: bool
+        """
+        return (type(self) == type(other) and
+                self._lhs == other._lhs and
+                self._rhs == other._rhs)
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        if not isinstance(other, Production):
+            raise_unorderable_types("<", self, other)
+        return (self._lhs, self._rhs) < (other._lhs, other._rhs)
+
+    def __hash__(self):
+        """
+        Return a hash value for the ``Production``.
+
+        :rtype: int
+        """
+        return self._hash
+
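+# --- Editor's usage sketch (not part of upstream NLTK) ----------------------
+# A small example of building productions by hand, as described in the class
+# docstring above: a nonterminal on the left, a mix of nonterminals and
+# terminal strings on the right.  The helper name is an illustrative assumption.
+def _editor_sketch_productions():
+    S, NP, VP = nonterminals('S, NP, VP')
+    branching = Production(S, [NP, VP])          # S -> NP VP
+    lexical = Production(NP, ['Kim'])            # NP -> 'Kim'
+    assert branching.is_nonlexical() and lexical.is_lexical()
+    return branching.lhs(), branching.rhs()
+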
+
+@python_2_unicode_compatible
+class DependencyProduction(Production):
+    """
+    A dependency grammar production.  Each production maps a single
+    head word to an unordered list of one or more modifier words.
+    """
+    def __str__(self):
+        """
+        Return a verbose string representation of the ``DependencyProduction``.
+
+        :rtype: str
+        """
+        result = '\'%s\' ->' % (self._lhs,)
+        for elt in self._rhs:
+            result += ' \'%s\'' % (elt,)
+        return result
+
+
+@python_2_unicode_compatible
+class ProbabilisticProduction(Production, ImmutableProbabilisticMixIn):
+    """
+    A probabilistic context free grammar production.
+    A PCFG ``ProbabilisticProduction`` is essentially just a ``Production`` that
+    has an associated probability, which represents how likely it is that
+    this production will be used.  In particular, the probability of a
+    ``ProbabilisticProduction`` records the likelihood that its right-hand side is
+    the correct instantiation for any given occurrence of its left-hand side.
+
+    :see: ``Production``
+    """
+    def __init__(self, lhs, rhs, **prob):
+        """
+        Construct a new ``ProbabilisticProduction``.
+
+        :param lhs: The left-hand side of the new ``ProbabilisticProduction``.
+        :type lhs: Nonterminal
+        :param rhs: The right-hand side of the new ``ProbabilisticProduction``.
+        :type rhs: sequence(Nonterminal and terminal)
+        :param prob: Probability parameters of the new ``ProbabilisticProduction``.
+        """
+        ImmutableProbabilisticMixIn.__init__(self, **prob)
+        Production.__init__(self, lhs, rhs)
+
+    def __str__(self):
+        return Production.__unicode__(self) + \
+            (' [1.0]' if (self.prob() == 1.0) else ' [%g]' % self.prob())
+
+    def __eq__(self, other):
+        return (type(self) == type(other) and
+                self._lhs == other._lhs and
+                self._rhs == other._rhs and
+                self.prob() == other.prob())
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __hash__(self):
+        return hash((self._lhs, self._rhs, self.prob()))
+
+#################################################################
+# Grammars
+#################################################################
+
+@python_2_unicode_compatible
+class CFG(object):
+    """
+    A context-free grammar.  A grammar consists of a start state and
+    a set of productions.  The set of terminals and nonterminals is
+    implicitly specified by the productions.
+
+    If you need efficient key-based access to productions, you
+    can use a subclass to implement it.
+    """
+    def __init__(self, start, productions, calculate_leftcorners=True):
+        """
+        Create a new context-free grammar, from the given start state
+        and set of ``Production``s.
+
+        :param start: The start symbol
+        :type start: Nonterminal
+        :param productions: The list of productions that defines the grammar
+        :type productions: list(Production)
+        :param calculate_leftcorners: False if we don't want to calculate the
+            leftcorner relation. In that case, some optimized chart parsers won't work.
+        :type calculate_leftcorners: bool
+        """
+        if not is_nonterminal(start):
+            raise TypeError("start should be a Nonterminal object,"
+                            " not a %s" % type(start).__name__)
+
+        self._start = start
+        self._productions = productions
+        self._categories = set(prod.lhs() for prod in productions)
+        self._calculate_indexes()
+        self._calculate_grammar_forms()
+        if calculate_leftcorners:
+            self._calculate_leftcorners()
+
+    def _calculate_indexes(self):
+        self._lhs_index = {}
+        self._rhs_index = {}
+        self._empty_index = {}
+        self._lexical_index = {}
+        for prod in self._productions:
+            # Left hand side.
+            lhs = prod._lhs
+            if lhs not in self._lhs_index:
+                self._lhs_index[lhs] = []
+            self._lhs_index[lhs].append(prod)
+            if prod._rhs:
+                # First item in right hand side.
+                rhs0 = prod._rhs[0]
+                if rhs0 not in self._rhs_index:
+                    self._rhs_index[rhs0] = []
+                self._rhs_index[rhs0].append(prod)
+            else:
+                # The right hand side is empty.
+                self._empty_index[prod.lhs()] = prod
+            # Lexical tokens in the right hand side.
+            for token in prod._rhs:
+                if is_terminal(token):
+                    self._lexical_index.setdefault(token, set()).add(prod)
+
+    def _calculate_leftcorners(self):
+        # Calculate leftcorner relations, for use in optimized parsing.
+        self._immediate_leftcorner_categories = dict((cat, set([cat])) for cat in self._categories)
+        self._immediate_leftcorner_words = dict((cat, set()) for cat in self._categories)
+        for prod in self.productions():
+            if len(prod) > 0:
+                cat, left = prod.lhs(), prod.rhs()[0]
+                if is_nonterminal(left):
+                    self._immediate_leftcorner_categories[cat].add(left)
+                else:
+                    self._immediate_leftcorner_words[cat].add(left)
+
+        lc = transitive_closure(self._immediate_leftcorner_categories, reflexive=True)
+        self._leftcorners = lc
+        self._leftcorner_parents = invert_graph(lc)
+
+        nr_leftcorner_categories = sum(map(len, self._immediate_leftcorner_categories.values()))
+        nr_leftcorner_words = sum(map(len, self._immediate_leftcorner_words.values()))
+        if nr_leftcorner_words > nr_leftcorner_categories > 10000:
+            # If the grammar is big, the leftcorner-word dictionary will be too large.
+            # In that case it is better to calculate the relation on demand.
+            self._leftcorner_words = None
+            return
+
+        self._leftcorner_words = {}
+        for cat in self._leftcorners:
+            lefts = self._leftcorners[cat]
+            lc = self._leftcorner_words[cat] = set()
+            for left in lefts:
+                lc.update(self._immediate_leftcorner_words.get(left, set()))
+
+    @classmethod
+    def fromstring(cls, input, encoding=None):
+        """
+        Return the ``CFG`` corresponding to the input string(s).
+
+        :param input: a grammar, either in the form of a string or as a list of strings.
+        """
+        start, productions = read_grammar(input, standard_nonterm_parser,
+                                          encoding=encoding)
+        return CFG(start, productions)
+
+    def start(self):
+        """
+        Return the start symbol of the grammar
+
+        :rtype: Nonterminal
+        """
+        return self._start
+
+    # tricky to balance readability and efficiency here!
+    # can't use set operations as they don't preserve ordering
+    def productions(self, lhs=None, rhs=None, empty=False):
+        """
+        Return the grammar productions, filtered by the left-hand side
+        or the first item in the right-hand side.
+
+        :param lhs: Only return productions with the given left-hand side.
+        :param rhs: Only return productions with the given first item
+            in the right-hand side.
+        :param empty: Only return productions with an empty right-hand side.
+        :return: A list of productions matching the given constraints.
+        :rtype: list(Production)
+        """
+        if rhs and empty:
+            raise ValueError("You cannot select empty and non-empty "
+                             "productions at the same time.")
+
+        # no constraints so return everything
+        if not lhs and not rhs:
+            if not empty:
+                return self._productions
+            else:
+                return self._empty_index.values()
+
+        # only lhs specified so look up its index
+        elif lhs and not rhs:
+            if not empty:
+                return self._lhs_index.get(lhs, [])
+            elif lhs in self._empty_index:
+                return [self._empty_index[lhs]]
+            else:
+                return []
+
+        # only rhs specified so look up its index
+        elif rhs and not lhs:
+            return self._rhs_index.get(rhs, [])
+
+        # intersect
+        else:
+            return [prod for prod in self._lhs_index.get(lhs, [])
+                    if prod in self._rhs_index.get(rhs, [])]
+
+    def leftcorners(self, cat):
+        """
+        Return the set of all nonterminals that the given nonterminal
+        can start with, including itself.
+
+        This is the reflexive, transitive closure of the immediate
+        leftcorner relation:  (A > B)  iff  (A -> B beta)
+
+        :param cat: the parent of the leftcorners
+        :type cat: Nonterminal
+        :return: the set of all leftcorners
+        :rtype: set(Nonterminal)
+        """
+        return self._leftcorners.get(cat, set([cat]))
+
+    def is_leftcorner(self, cat, left):
+        """
+        True if left is a leftcorner of cat, where left can be a
+        terminal or a nonterminal.
+
+        :param cat: the parent of the leftcorner
+        :type cat: Nonterminal
+        :param left: the suggested leftcorner
+        :type left: Terminal or Nonterminal
+        :rtype: bool
+        """
+        if is_nonterminal(left):
+            return left in self.leftcorners(cat)
+        elif self._leftcorner_words:
+            return left in self._leftcorner_words.get(cat, set())
+        else:
+            return any(left in self._immediate_leftcorner_words.get(parent, set())
+                       for parent in self.leftcorners(cat))
+
+    def leftcorner_parents(self, cat):
+        """
+        Return the set of all nonterminals for which the given category
+        is a left corner. This is the inverse of the leftcorner relation.
+
+        :param cat: the suggested leftcorner
+        :type cat: Nonterminal
+        :return: the set of all parents to the leftcorner
+        :rtype: set(Nonterminal)
+        """
+        return self._leftcorner_parents.get(cat, set([cat]))
+
+    def check_coverage(self, tokens):
+        """
+        Check whether the grammar rules cover the given list of tokens.
+        If not, then raise an exception.
+
+        :type tokens: list(str)
+        """
+        missing = [tok for tok in tokens
+                   if not self._lexical_index.get(tok)]
+        if missing:
+            missing = ', '.join('%r' % (w,) for w in missing)
+            raise ValueError("Grammar does not cover some of the "
+                             "input words: %r." % missing)
+
+    def _calculate_grammar_forms(self):
+        """
+        Pre-calculate which form(s) the grammar is in.
+        """
+        prods = self._productions
+        self._is_lexical = all(p.is_lexical() for p in prods)
+        self._is_nonlexical = all(p.is_nonlexical() for p in prods
+                                  if len(p) != 1)
+        self._min_len = min(len(p) for p in prods)
+        self._max_len = max(len(p) for p in prods)
+        self._all_unary_are_lexical = all(p.is_lexical() for p in prods
+                                          if len(p) == 1)
+
+    def is_lexical(self):
+        """
+        Return True if all productions are lexicalised.
+        """
+        return self._is_lexical
+
+    def is_nonlexical(self):
+        """
+        Return True if all lexical rules are "preterminals", that is,
+        unary rules which can be separated in a preprocessing step.
+
+        This means that all productions are of the forms
+        A -> B1 ... Bn (n>=0), or A -> "s".
+
+        Note: is_lexical() and is_nonlexical() are not opposites.
+        There are grammars which are neither, and grammars which are both.
+        """
+        return self._is_nonlexical
+
+    def min_len(self):
+        """
+        Return the right-hand side length of the shortest grammar production.
+        """
+        return self._min_len
+
+    def max_len(self):
+        """
+        Return the right-hand side length of the longest grammar production.
+        """
+        return self._max_len
+
+    def is_nonempty(self):
+        """
+        Return True if there are no empty productions.
+        """
+        return self._min_len > 0
+
+    def is_binarised(self):
+        """
+        Return True if all productions are at most binary.
+        Note that there can still be empty and unary productions.
+        """
+        return self._max_len <= 2
+
+    def is_flexible_chomsky_normal_form(self):
+        """
+        Return True if all productions are of the forms
+        A -> B C, A -> B, or A -> "s".
+        """
+        return self.is_nonempty() and self.is_nonlexical() and self.is_binarised()
+
+    def is_chomsky_normal_form(self):
+        """
+        Return True if the grammar is of Chomsky Normal Form, i.e. all productions
+        are of the form A -> B C, or A -> "s".
+        """
+        return (self.is_flexible_chomsky_normal_form() and
+                self._all_unary_are_lexical)
+
+    def __repr__(self):
+        return '<Grammar with %d productions>' % len(self._productions)
+
+    def __str__(self):
+        result = 'Grammar with %d productions' % len(self._productions)
+        result += ' (start state = %r)' % self._start
+        for production in self._productions:
+            result += '\n    %s' % production
+        return result
+
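+# --- Editor's usage sketch (not part of upstream NLTK) ----------------------
+# A toy grammar exercising CFG.fromstring, production filtering and the
+# leftcorner relation documented above.  The grammar text and the helper name
+# are illustrative assumptions only.
+def _editor_sketch_cfg():
+    grammar = CFG.fromstring("""
+        S -> NP VP
+        NP -> Det N | 'Kim'
+        VP -> V NP
+        Det -> 'the'
+        N -> 'dog'
+        V -> 'saw'
+    """)
+    np = Nonterminal('NP')
+    assert len(grammar.productions(lhs=np)) == 2           # NP -> Det N | 'Kim'
+    assert grammar.is_leftcorner(Nonterminal('S'), np)     # S can start with NP
+    grammar.check_coverage(['Kim', 'saw', 'the', 'dog'])   # raises if a word is unknown
+    return grammar
+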
+
+class FeatureGrammar(CFG):
+    """
+    A feature-based grammar.  This is equivalent to a
+    ``CFG`` whose nonterminals are all
+    ``FeatStructNonterminal``.
+
+    A grammar consists of a start state and a set of
+    productions.  The set of terminals and nonterminals
+    is implicitly specified by the productions.
+    """
+    def __init__(self, start, productions):
+        """
+        Create a new feature-based grammar, from the given start
+        state and set of ``Productions``.
+
+        :param start: The start symbol
+        :type start: FeatStructNonterminal
+        :param productions: The list of productions that defines the grammar
+        :type productions: list(Production)
+        """
+        CFG.__init__(self, start, productions)
+
+    # The difference with CFG is that the productions are
+    # indexed on the TYPE feature of the nonterminals.
+    # This is calculated by the method _get_type_if_possible().
+
+    def _calculate_indexes(self):
+        self._lhs_index = {}
+        self._rhs_index = {}
+        self._empty_index = {}
+        self._empty_productions = []
+        self._lexical_index = {}
+        for prod in self._productions:
+            # Left hand side.
+            lhs = self._get_type_if_possible(prod._lhs)
+            if lhs not in self._lhs_index:
+                self._lhs_index[lhs] = []
+            self._lhs_index[lhs].append(prod)
+            if prod._rhs:
+                # First item in right hand side.
+                rhs0 = self._get_type_if_possible(prod._rhs[0])
+                if rhs0 not in self._rhs_index:
+                    self._rhs_index[rhs0] = []
+                self._rhs_index[rhs0].append(prod)
+            else:
+                # The right hand side is empty.
+                if lhs not in self._empty_index:
+                    self._empty_index[lhs] = []
+                self._empty_index[lhs].append(prod)
+                self._empty_productions.append(prod)
+            # Lexical tokens in the right hand side.
+            for token in prod._rhs:
+                if is_terminal(token):
+                    self._lexical_index.setdefault(token, set()).add(prod)
+
+    @classmethod
+    def fromstring(cls, input, features=None, logic_parser=None, fstruct_reader=None,
+               encoding=None):
+        """
+        Return a feature structure based ``FeatureGrammar``.
+
+        :param input: a grammar, either in the form of a string or else
+            as a list of strings.
+        :param features: a tuple of features (default: SLASH, TYPE)
+        :param logic_parser: a parser for lambda-expressions
+            (default: ``LogicParser()``)
+        :param fstruct_reader: a feature structure parser
+            (only used if ``features`` and ``logic_parser`` are None)
+        """
+        if features is None:
+            features = (SLASH, TYPE)
+
+        if fstruct_reader is None:
+            fstruct_reader = FeatStructReader(features, FeatStructNonterminal,
+                                              logic_parser=logic_parser)
+        elif logic_parser is not None:
+            raise Exception('\'logic_parser\' and \'fstruct_reader\' must '
+                            'not both be set')
+
+        start, productions = read_grammar(input, fstruct_reader.read_partial,
+                                          encoding=encoding)
+        return FeatureGrammar(start, productions)
+
+
+    def productions(self, lhs=None, rhs=None, empty=False):
+        """
+        Return the grammar productions, filtered by the left-hand side
+        or the first item in the right-hand side.
+
+        :param lhs: Only return productions with the given left-hand side.
+        :param rhs: Only return productions with the given first item
+            in the right-hand side.
+        :param empty: Only return productions with an empty right-hand side.
+        :rtype: list(Production)
+        """
+        if rhs and empty:
+            raise ValueError("You cannot select empty and non-empty "
+                             "productions at the same time.")
+
+        # no constraints so return everything
+        if not lhs and not rhs:
+            if empty:
+                return self._empty_productions
+            else:
+                return self._productions
+
+        # only lhs specified so look up its index
+        elif lhs and not rhs:
+            if empty:
+                return self._empty_index.get(self._get_type_if_possible(lhs), [])
+            else:
+                return self._lhs_index.get(self._get_type_if_possible(lhs), [])
+
+        # only rhs specified so look up its index
+        elif rhs and not lhs:
+            return self._rhs_index.get(self._get_type_if_possible(rhs), [])
+
+        # intersect
+        else:
+            return [prod for prod in self._lhs_index.get(self._get_type_if_possible(lhs), [])
+                    if prod in self._rhs_index.get(self._get_type_if_possible(rhs), [])]
+
+    def leftcorners(self, cat):
+        """
+        Return the set of all words that the given category can start with.
+        Also called the "first set" in compiler construction.
+        """
+        raise NotImplementedError("Not implemented yet")
+
+    def leftcorner_parents(self, cat):
+        """
+        Return the set of all categories for which the given category
+        is a left corner.
+        """
+        raise NotImplementedError("Not implemented yet")
+
+    def _get_type_if_possible(self, item):
+        """
+        Helper function which returns the ``TYPE`` feature of the ``item``,
+        if it exists, otherwise it returns the ``item`` itself
+        """
+        if isinstance(item, dict) and TYPE in item:
+            return FeatureValueType(item[TYPE])
+        else:
+            return item
+
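+# --- Editor's usage sketch (not part of upstream NLTK) ----------------------
+# A tiny feature grammar showing how FeatureGrammar.fromstring reads
+# FeatStructNonterminals with shared feature variables.  The grammar text and
+# the helper name are illustrative assumptions only.
+def _editor_sketch_feature_grammar():
+    fgrammar = FeatureGrammar.fromstring("""
+        % start S
+        S -> NP[NUM=?n] VP[NUM=?n]
+        NP[NUM=sg] -> 'Kim'
+        VP[NUM=sg] -> 'sleeps'
+    """)
+    # The start symbol and productions are FeatStructNonterminal-based, and the
+    # production indexes are keyed on the TYPE feature of each left-hand side.
+    return fgrammar.start(), fgrammar.productions()
+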
+@total_ordering
+@python_2_unicode_compatible
+class FeatureValueType(object):
+    """
+    A helper class for ``FeatureGrammars``, designed to be different
+    from ordinary strings.  This is to stop the ``FeatStruct``
+    ``FOO[]`` from comparing equal to the terminal "FOO".
+    """
+    def __init__(self, value):
+        self._value = value
+        self._hash = hash(value)
+
+    def __repr__(self):
+        return '<%s>' % self._value
+
+    def __eq__(self, other):
+        return type(self) == type(other) and self._value == other._value
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        if not isinstance(other, FeatureValueType):
+            raise_unorderable_types("<", self, other)
+        return self._value < other._value
+
+    def __hash__(self):
+        return self._hash
+
+
+@python_2_unicode_compatible
+class DependencyGrammar(object):
+    """
+    A dependency grammar.  A DependencyGrammar consists of a set of
+    productions.  Each production specifies a head/modifier relationship
+    between a pair of words.
+    """
+    def __init__(self, productions):
+        """
+        Create a new dependency grammar, from the set of ``Productions``.
+
+        :param productions: The list of productions that defines the grammar
+        :type productions: list(Production)
+        """
+        self._productions = productions
+
+    @classmethod
+    def fromstring(cls, input):
+        productions = []
+        for linenum, line in enumerate(input.split('\n')):
+            line = line.strip()
+            if line.startswith('#') or line=='': continue
+            try: productions += _read_dependency_production(line)
+            except ValueError:
+                raise ValueError('Unable to parse line %s: %s' % (linenum, line))
+        if len(productions) == 0:
+            raise ValueError('No productions found!')
+        return DependencyGrammar(productions)
+
+    def contains(self, head, mod):
+        """
+        :param head: A head word.
+        :type head: str
+        :param mod: A mod word, to test as a modifier of 'head'.
+        :type mod: str
+
+        :return: true if this ``DependencyGrammar`` contains a
+            ``DependencyProduction`` mapping 'head' to 'mod'.
+        :rtype: bool
+        """
+        for production in self._productions:
+            for possibleMod in production._rhs:
+                if(production._lhs == head and possibleMod == mod):
+                    return True
+        return False
+
+    def __contains__(self, head, mod):
+        """
+        Return True if this ``DependencyGrammar`` contains a
+        ``DependencyProduction`` mapping 'head' to 'mod'.
+
+        :param head: A head word.
+        :type head: str
+        :param mod: A mod word, to test as a modifier of 'head'.
+        :type mod: str
+        :rtype: bool
+        """
+        for production in self._productions:
+            for possibleMod in production._rhs:
+                if(production._lhs == head and possibleMod == mod):
+                    return True
+        return False
+
+    #   # should be rewritten, the set comp won't work in all comparisons
+    # def contains_exactly(self, head, modlist):
+    #   for production in self._productions:
+    #       if(len(production._rhs) == len(modlist)):
+    #           if(production._lhs == head):
+    #               set1 = Set(production._rhs)
+    #               set2 = Set(modlist)
+    #               if(set1 == set2):
+    #                   return True
+    #   return False
+
+
+    def __str__(self):
+        """
+        Return a verbose string representation of the ``DependencyGrammar``
+
+        :rtype: str
+        """
+        str = 'Dependency grammar with %d productions' % len(self._productions)
+        for production in self._productions:
+            str += '\n  %s' % production
+        return str
+
+    def __repr__(self):
+        """
+        Return a concise string representation of the ``DependencyGrammar``
+        """
+        return 'Dependency grammar with %d productions' % len(self._productions)
+
+
+@python_2_unicode_compatible
+class ProbabilisticDependencyGrammar(object):
+    """
+
+    """
+
+    def __init__(self, productions, events, tags):
+        self._productions = productions
+        self._events = events
+        self._tags = tags
+
+    def contains(self, head, mod):
+        """
+        Return True if this ``DependencyGrammar`` contains a
+        ``DependencyProduction`` mapping 'head' to 'mod'.
+
+        :param head: A head word.
+        :type head: str
+        :param mod: A mod word, to test as a modifier of 'head'.
+        :type mod: str
+        :rtype: bool
+        """
+        for production in self._productions:
+            for possibleMod in production._rhs:
+                if(production._lhs == head and possibleMod == mod):
+                    return True
+        return False
+
+    def __str__(self):
+        """
+        Return a verbose string representation of the ``ProbabilisticDependencyGrammar``
+
+        :rtype: str
+        """
+        str = 'Statistical dependency grammar with %d productions' % len(self._productions)
+        for production in self._productions:
+            str += '\n  %s' % production
+        str += '\nEvents:'
+        for event in self._events:
+            str += '\n  %d:%s' % (self._events[event], event)
+        str += '\nTags:'
+        for tag_word in self._tags:
+            str += '\n %s:\t(%s)' % (tag_word, self._tags[tag_word])
+        return str
+
+    def __repr__(self):
+        """
+        Return a concise string representation of the ``ProbabilisticDependencyGrammar``
+        """
+        return 'Statistical Dependency grammar with %d productions' % len(self._productions)
+
+
+class PCFG(CFG):
+    """
+    A probabilistic context-free grammar.  A PCFG consists of a
+    start state and a set of productions with probabilities.  The set of
+    terminals and nonterminals is implicitly specified by the productions.
+
+    PCFG productions use the ``ProbabilisticProduction`` class.
+    ``PCFGs`` impose the constraint that the set of productions with
+    any given left-hand-side must have probabilities that sum to 1
+    (allowing for a small margin of error).
+
+    If you need efficient key-based access to productions, you can use
+    a subclass to implement it.
+
+    :type EPSILON: float
+    :cvar EPSILON: The acceptable margin of error for checking that
+        productions with a given left-hand side have probabilities
+        that sum to 1.
+    """
+    EPSILON = 0.01
+
+    def __init__(self, start, productions, calculate_leftcorners=True):
+        """
+        Create a new context-free grammar, from the given start state
+        and set of ``ProbabilisticProductions``.
+
+        :param start: The start symbol
+        :type start: Nonterminal
+        :param productions: The list of productions that defines the grammar
+        :type productions: list(Production)
+        :raise ValueError: if the set of productions with any left-hand-side
+            do not have probabilities that sum to a value within
+            EPSILON of 1.
+        :param calculate_leftcorners: False if we don't want to calculate the
+            leftcorner relation. In that case, some optimized chart parsers won't work.
+        :type calculate_leftcorners: bool
+        """
+        CFG.__init__(self, start, productions, calculate_leftcorners)
+
+        # Make sure that the probabilities sum to one.
+        probs = {}
+        for production in productions:
+            probs[production.lhs()] = (probs.get(production.lhs(), 0) +
+                                       production.prob())
+        for (lhs, p) in probs.items():
+            if not ((1-PCFG.EPSILON) < p <
+                    (1+PCFG.EPSILON)):
+                raise ValueError("Productions for %r do not sum to 1" % lhs)
+
+
+    @classmethod
+    def fromstring(cls, input, encoding=None):
+        """
+        Return the ``PCFG`` corresponding to the
+        input string(s).
+
+        :param input: a grammar, either in the form of a string or else
+             as a list of strings.
+        """
+        start, productions = read_grammar(input, standard_nonterm_parser,
+                                          probabilistic=True, encoding=encoding)
+        return PCFG(start, productions)
+
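+# --- Editor's usage sketch (not part of upstream NLTK) ----------------------
+# A toy PCFG whose per-nonterminal probabilities sum to 1, as required by the
+# EPSILON check in the constructor above.  The grammar text and the helper
+# name are illustrative assumptions only.
+def _editor_sketch_pcfg():
+    toy = PCFG.fromstring("""
+        S -> NP VP [1.0]
+        NP -> 'Kim' [0.7] | 'Sandy' [0.3]
+        VP -> 'sleeps' [1.0]
+    """)
+    for production in toy.productions(lhs=Nonterminal('NP')):
+        print(production, production.prob())
+    return toy
+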
+
+#################################################################
+# Inducing Grammars
+#################################################################
+
+# Contributed by Nathan Bodenstab <bodenstab@cslu.ogi.edu>
+
+def induce_pcfg(start, productions):
+    """
+    Induce a PCFG grammar from a list of productions.
+
+    The probability of a production A -> B C in a PCFG is:
+
+    |                count(A -> B C)
+    |  P(B, C | A) = ---------------       where \* is any right hand side
+    |                 count(A -> \*)
+
+    :param start: The start symbol
+    :type start: Nonterminal
+    :param productions: The list of productions that defines the grammar
+    :type productions: list(Production)
+    """
+    # Production count: the number of times a given production occurs
+    pcount = {}
+
+    # LHS-count: counts the number of times a given lhs occurs
+    lcount = {}
+
+    for prod in productions:
+        lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1
+        pcount[prod]       = pcount.get(prod,       0) + 1
+
+    prods = [ProbabilisticProduction(p.lhs(), p.rhs(),
+                                prob=pcount[p] / lcount[p.lhs()])
+             for p in pcount]
+    return PCFG(start, prods)
+
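+# --- Editor's usage sketch (not part of upstream NLTK) ----------------------
+# Illustrates the count-based estimate documented above: S -> NP VP occurs
+# twice and S -> VP once, so the induced probabilities are 2/3 and 1/3.
+# The helper name and toy counts are illustrative assumptions only.
+def _editor_sketch_induce_pcfg():
+    S, NP, VP = nonterminals('S, NP, VP')
+    observed = [Production(S, [NP, VP]),
+                Production(S, [NP, VP]),
+                Production(S, [VP])]
+    pcfg = induce_pcfg(S, observed)
+    for production in pcfg.productions():
+        print(production)      # e.g. S -> NP VP [0.666667], S -> VP [0.333333]
+    return pcfg
+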
+
+#################################################################
+# Helper functions for reading productions
+#################################################################
+
+def _read_cfg_production(input):
+    """
+    Return a list of context-free ``Productions``.
+    """
+    return _read_production(input, standard_nonterm_parser)
+
+def _read_pcfg_production(input):
+    """
+    Return a list of PCFG ``ProbabilisticProductions``.
+    """
+    return _read_production(input, standard_nonterm_parser, probabilistic=True)
+
+def _read_fcfg_production(input, fstruct_reader):
+    """
+    Return a list of feature-based ``Productions``.
+    """
+    return _read_production(input, fstruct_reader)
+
+
+# Parsing generic grammars
+
+_ARROW_RE = re.compile(r'\s* -> \s*', re.VERBOSE)
+_PROBABILITY_RE = re.compile(r'( \[ [\d\.]+ \] ) \s*', re.VERBOSE)
+_TERMINAL_RE = re.compile(r'( "[^"]+" | \'[^\']+\' ) \s*', re.VERBOSE)
+_DISJUNCTION_RE = re.compile(r'\| \s*', re.VERBOSE)
+
+def _read_production(line, nonterm_parser, probabilistic=False):
+    """
+    Parse a grammar rule, given as a string, and return
+    a list of productions.
+    """
+    pos = 0
+
+    # Parse the left-hand side.
+    lhs, pos = nonterm_parser(line, pos)
+
+    # Skip over the arrow.
+    m = _ARROW_RE.match(line, pos)
+    if not m: raise ValueError('Expected an arrow')
+    pos = m.end()
+
+    # Parse the right hand side.
+    probabilities = [0.0]
+    rhsides = [[]]
+    while pos < len(line):
+        # Probability.
+        m = _PROBABILITY_RE.match(line, pos)
+        if probabilistic and m:
+            pos = m.end()
+            probabilities[-1] = float(m.group(1)[1:-1])
+            if probabilities[-1] > 1.0:
+                raise ValueError('Production probability %f, '
+                                 'should not be greater than 1.0' %
+                                 (probabilities[-1],))
+
+        # String -- add terminal.
+        elif line[pos] in "\'\"":
+            m = _TERMINAL_RE.match(line, pos)
+            if not m: raise ValueError('Unterminated string')
+            rhsides[-1].append(m.group(1)[1:-1])
+            pos = m.end()
+
+        # Vertical bar -- start new rhside.
+        elif line[pos] == '|':
+            m = _DISJUNCTION_RE.match(line, pos)
+            probabilities.append(0.0)
+            rhsides.append([])
+            pos = m.end()
+
+        # Anything else -- nonterminal.
+        else:
+            nonterm, pos = nonterm_parser(line, pos)
+            rhsides[-1].append(nonterm)
+
+    if probabilistic:
+        return [ProbabilisticProduction(lhs, rhs, prob=probability)
+                for (rhs, probability) in zip(rhsides, probabilities)]
+    else:
+        return [Production(lhs, rhs) for rhs in rhsides]
+
+
+#################################################################
+# Reading Phrase Structure Grammars
+#################################################################
+
+def read_grammar(input, nonterm_parser, probabilistic=False, encoding=None):
+    """
+    Return a pair consisting of a starting category and a list of
+    ``Productions``.
+
+    :param input: a grammar, either in the form of a string or else
+        as a list of strings.
+    :param nonterm_parser: a function for parsing nonterminals.
+        It should take a ``(string, position)`` as argument and
+        return a ``(nonterminal, position)`` as result.
+    :param probabilistic: are the grammar rules probabilistic?
+    :type probabilistic: bool
+    :param encoding: the encoding of the grammar, if it is a binary string
+    :type encoding: str
+    """
+    if encoding is not None:
+        input = input.decode(encoding)
+    if isinstance(input, string_types):
+        lines = input.split('\n')
+    else:
+        lines = input
+
+    start = None
+    productions = []
+    continue_line = ''
+    for linenum, line in enumerate(lines):
+        line = continue_line + line.strip()
+        if line.startswith('#') or line=='': continue
+        if line.endswith('\\'):
+            continue_line = line[:-1].rstrip()+' '
+            continue
+        continue_line = ''
+        try:
+            if line[0] == '%':
+                directive, args = line[1:].split(None, 1)
+                if directive == 'start':
+                    start, pos = nonterm_parser(args, 0)
+                    if pos != len(args):
+                        raise ValueError('Bad argument to start directive')
+                else:
+                    raise ValueError('Bad directive')
+            else:
+                # expand out the disjunctions on the RHS
+                productions += _read_production(line, nonterm_parser, probabilistic)
+        except ValueError as e:
+            raise ValueError('Unable to parse line %s: %s\n%s' %
+                             (linenum+1, line, e))
+
+    if not productions:
+        raise ValueError('No productions found!')
+    if not start:
+        start = productions[0].lhs()
+    return (start, productions)
+
+_STANDARD_NONTERM_RE = re.compile(r'( [\w/][\w/^<>-]* ) \s*', re.VERBOSE)
+
+def standard_nonterm_parser(string, pos):
+    m = _STANDARD_NONTERM_RE.match(string, pos)
+    if not m: raise ValueError('Expected a nonterminal, found: '
+                               + string[pos:])
+    return (Nonterminal(m.group(1)), m.end())
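+
+# Illustrative note: standard_nonterm_parser('NP VP', 0) returns
+# (Nonterminal('NP'), 3), i.e. the parsed nonterminal plus the offset just
+# past its trailing whitespace.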
+
+
+#################################################################
+# Reading Dependency Grammars
+#################################################################
+
+_READ_DG_RE = re.compile(r'''^\s*                # leading whitespace
+                              ('[^']+')\s*        # single-quoted lhs
+                              (?:[-=]+>)\s*        # arrow
+                              (?:(                 # rhs:
+                                   "[^"]+"         # doubled-quoted terminal
+                                 | '[^']+'         # single-quoted terminal
+                                 | \|              # disjunction
+                                 )
+                                 \s*)              # trailing space
+                                 *$''',            # zero or more copies
+                             re.VERBOSE)
+_SPLIT_DG_RE = re.compile(r'''('[^']'|[-=]+>|"[^"]+"|'[^']+'|\|)''')
+
+def _read_dependency_production(s):
+    if not _READ_DG_RE.match(s):
+        raise ValueError('Bad production string')
+    pieces = _SPLIT_DG_RE.split(s)
+    pieces = [p for i,p in enumerate(pieces) if i%2==1]
+    lhside = pieces[0].strip('\'\"')
+    rhsides = [[]]
+    for piece in pieces[2:]:
+        if piece == '|':
+            rhsides.append([])
+        else:
+            rhsides[-1].append(piece.strip('\'\"'))
+    return [DependencyProduction(lhside, rhside) for rhside in rhsides]
+
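+# Illustrative example: _read_dependency_production("'scratch' -> 'cats' | 'walls'")
+# returns the two DependencyProductions 'scratch' -> 'cats' and
+# 'scratch' -> 'walls'.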
+
+#################################################################
+# Demonstration
+#################################################################
+
+def cfg_demo():
+    """
+    A demonstration showing how ``CFGs`` can be created and used.
+    """
+
+    from nltk import nonterminals, Production, CFG
+
+    # Create some nonterminals
+    S, NP, VP, PP = nonterminals('S, NP, VP, PP')
+    N, V, P, Det = nonterminals('N, V, P, Det')
+    VP_slash_NP = VP/NP
+
+    print('Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP/NP])
+    print('    S.symbol() =>', repr(S.symbol()))
+    print()
+
+    print(Production(S, [NP]))
+
+    # Create some Grammar Productions
+    grammar = CFG.fromstring("""
+      S -> NP VP
+      PP -> P NP
+      NP -> Det N | NP PP
+      VP -> V NP | VP PP
+      Det -> 'a' | 'the'
+      N -> 'dog' | 'cat'
+      V -> 'chased' | 'sat'
+      P -> 'on' | 'in'
+    """)
+
+    print('A Grammar:', repr(grammar))
+    print('    grammar.start()       =>', repr(grammar.start()))
+    print('    grammar.productions() =>', end=' ')
+    # Use .replace(...) to line-wrap the output.
+    print(repr(grammar.productions()).replace(',', ',\n'+' '*25))
+    print()
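+
+    # A sketch of how such a grammar is typically consumed (any nltk.parse
+    # parser will do), e.g.:
+    #
+    #     from nltk.parse import ChartParser
+    #     for tree in ChartParser(grammar).parse('the dog chased a cat'.split()):
+    #         print(tree)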
+
+toy_pcfg1 = PCFG.fromstring("""
+    S -> NP VP [1.0]
+    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
+    Det -> 'the' [0.8] | 'my' [0.2]
+    N -> 'man' [0.5] | 'telescope' [0.5]
+    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
+    V -> 'ate' [0.35] | 'saw' [0.65]
+    PP -> P NP [1.0]
+    P -> 'with' [0.61] | 'under' [0.39]
+    """)
+
+toy_pcfg2 = PCFG.fromstring("""
+    S    -> NP VP         [1.0]
+    VP   -> V NP          [.59]
+    VP   -> V             [.40]
+    VP   -> VP PP         [.01]
+    NP   -> Det N         [.41]
+    NP   -> Name          [.28]
+    NP   -> NP PP         [.31]
+    PP   -> P NP          [1.0]
+    V    -> 'saw'         [.21]
+    V    -> 'ate'         [.51]
+    V    -> 'ran'         [.28]
+    N    -> 'boy'         [.11]
+    N    -> 'cookie'      [.12]
+    N    -> 'table'       [.13]
+    N    -> 'telescope'   [.14]
+    N    -> 'hill'        [.5]
+    Name -> 'Jack'        [.52]
+    Name -> 'Bob'         [.48]
+    P    -> 'with'        [.61]
+    P    -> 'under'       [.39]
+    Det  -> 'the'         [.41]
+    Det  -> 'a'           [.31]
+    Det  -> 'my'          [.28]
+    """)
+
+def pcfg_demo():
+    """
+    A demonstration showing how a ``PCFG`` can be created and used.
+    """
+
+    from nltk.corpus import treebank
+    from nltk import treetransforms
+    from nltk import induce_pcfg
+    from nltk.parse import pchart
+
+    pcfg_prods = toy_pcfg1.productions()
+
+    pcfg_prod = pcfg_prods[2]
+    print('A PCFG production:', repr(pcfg_prod))
+    print('    pcfg_prod.lhs()  =>', repr(pcfg_prod.lhs()))
+    print('    pcfg_prod.rhs()  =>', repr(pcfg_prod.rhs()))
+    print('    pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
+    print()
+
+    grammar = toy_pcfg2
+    print('A PCFG grammar:', repr(grammar))
+    print('    grammar.start()       =>', repr(grammar.start()))
+    print('    grammar.productions() =>', end=' ')
+    # Use .replace(...) to line-wrap the output.
+    print(repr(grammar.productions()).replace(',', ',\n'+' '*26))
+    print()
+
+    # extract productions from three trees and induce the PCFG
+    print("Induce PCFG grammar from treebank data:")
+
+    productions = []
+    item = treebank._fileids[0]
+    for tree in treebank.parsed_sents(item)[:3]:
+        # perform optional tree transformations, e.g.:
+        tree.collapse_unary(collapsePOS = False)
+        tree.chomsky_normal_form(horzMarkov = 2)
+
+        productions += tree.productions()
+
+    S = Nonterminal('S')
+    grammar = induce_pcfg(S, productions)
+    print(grammar)
+    print()
+
+    print("Parse sentence using induced grammar:")
+
+    parser = pchart.InsideChartParser(grammar)
+    parser.trace(3)
+
+    # doesn't work as tokens are different:
+    #sent = treebank.tokenized('wsj_0001.mrg')[0]
+
+    sent = treebank.parsed_sents(item)[0].leaves()
+    print(sent)
+    for parse in parser.parse(sent):
+        print(parse)
+
+def fcfg_demo():
+    import nltk.data
+    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
+    print(g)
+    print()
+
+def dg_demo():
+    """
+    A demonstration showing the creation and inspection of a
+    ``DependencyGrammar``.
+    """
+    grammar = DependencyGrammar.fromstring("""
+    'scratch' -> 'cats' | 'walls'
+    'walls' -> 'the'
+    'cats' -> 'the'
+    """)
+    print(grammar)
+
+def sdg_demo():
+    """
+    A demonstration of how to read a string representation of
+    a CoNLL format dependency tree.
+    """
+    from nltk.parse import DependencyGraph
+
+    dg = DependencyGraph("""
+    1   Ze                ze                Pron  Pron  per|3|evofmv|nom                 2   su      _  _
+    2   had               heb               V     V     trans|ovt|1of2of3|ev             0   ROOT    _  _
+    3   met               met               Prep  Prep  voor                             8   mod     _  _
+    4   haar              haar              Pron  Pron  bez|3|ev|neut|attr               5   det     _  _
+    5   moeder            moeder            N     N     soort|ev|neut                    3   obj1    _  _
+    6   kunnen            kan               V     V     hulp|ott|1of2of3|mv              2   vc      _  _
+    7   gaan              ga                V     V     hulp|inf                         6   vc      _  _
+    8   winkelen          winkel            V     V     intrans|inf                      11  cnj     _  _
+    9   ,                 ,                 Punc  Punc  komma                            8   punct   _  _
+    10  zwemmen           zwem              V     V     intrans|inf                      11  cnj     _  _
+    11  of                of                Conj  Conj  neven                            7   vc      _  _
+    12  terrassen         terras            N     N     soort|mv|neut                    11  cnj     _  _
+    13  .                 .                 Punc  Punc  punt                             12  punct   _  _
+    """)
+    tree = dg.tree()
+    print(tree.pprint())
+
+def demo():
+    cfg_demo()
+    pcfg_demo()
+    fcfg_demo()
+    dg_demo()
+    sdg_demo()
+
+if __name__ == '__main__':
+    demo()
+
+__all__ = ['Nonterminal', 'nonterminals',
+           'CFG', 'Production',
+           'PCFG', 'ProbabilisticProduction',
+           'DependencyGrammar', 'DependencyProduction',
+           'ProbabilisticDependencyGrammar',
+           'induce_pcfg', 'read_grammar']
diff --git a/nlp_resource_data/nltk/grammar.pyc b/nlp_resource_data/nltk/grammar.pyc
new file mode 100755 (executable)
index 0000000..82d8d7a
Binary files /dev/null and b/nlp_resource_data/nltk/grammar.pyc differ
diff --git a/nlp_resource_data/nltk/help.py b/nlp_resource_data/nltk/help.py
new file mode 100755 (executable)
index 0000000..2d2f516
--- /dev/null
@@ -0,0 +1,56 @@
+# Natural Language Toolkit (NLTK) Help
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Provide structured access to documentation.
+"""
+from __future__ import print_function
+
+import re
+from textwrap import wrap
+
+from nltk.data import load
+
+def brown_tagset(tagpattern=None):
+    _format_tagset("brown_tagset", tagpattern)
+
+def claws5_tagset(tagpattern=None):
+    _format_tagset("claws5_tagset", tagpattern)
+
+def upenn_tagset(tagpattern=None):
+    _format_tagset("upenn_tagset", tagpattern)
+
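+# Illustrative usage (assumes the 'tagsets' data package is installed,
+# e.g. via nltk.download('tagsets')):
+#
+#     import nltk
+#     nltk.help.upenn_tagset('NN.*')   # describe every NN* tag
+#     nltk.help.brown_tagset('VB')     # describe the Brown VB tag
+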
+#####################################################################
+# UTILITIES
+#####################################################################
+
+def _print_entries(tags, tagdict):
+    for tag in tags:
+        entry = tagdict[tag]
+        defn = [tag + ": " + entry[0]]
+        examples = wrap(entry[1], width=75, initial_indent='    ', subsequent_indent='    ')
+        print("\n".join(defn + examples))
+
+def _format_tagset(tagset, tagpattern=None):
+    tagdict = load("help/tagsets/" + tagset + ".pickle")
+    if not tagpattern:
+        _print_entries(sorted(tagdict), tagdict)
+    elif tagpattern in tagdict:
+        _print_entries([tagpattern], tagdict)
+    else:
+        tagpattern = re.compile(tagpattern)
+        tags = [tag for tag in sorted(tagdict) if tagpattern.match(tag)]
+        if tags:
+            _print_entries(tags, tagdict)
+        else:
+            print("No matching tags found.")
+
+if __name__ == '__main__':
+    brown_tagset(r'NN.*')
+    upenn_tagset(r'.*\$')
+    claws5_tagset('UNDEFINED')
+    brown_tagset(r'NN')
diff --git a/nlp_resource_data/nltk/help.pyc b/nlp_resource_data/nltk/help.pyc
new file mode 100755 (executable)
index 0000000..cac98ea
Binary files /dev/null and b/nlp_resource_data/nltk/help.pyc differ
diff --git a/nlp_resource_data/nltk/inference/__init__.py b/nlp_resource_data/nltk/inference/__init__.py
new file mode 100755 (executable)
index 0000000..94581e5
--- /dev/null
@@ -0,0 +1,20 @@
+# Natural Language Toolkit: Inference
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Dan Garrette <dhgarrette@gmail.com>
+#         Ewan Klein <ewan@inf.ed.ac.uk>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Classes and interfaces for theorem proving and model building.
+"""
+
+from nltk.inference.api import ParallelProverBuilder, ParallelProverBuilderCommand
+from nltk.inference.mace import Mace, MaceCommand
+from nltk.inference.prover9 import Prover9, Prover9Command
+from nltk.inference.resolution import ResolutionProver, ResolutionProverCommand
+from nltk.inference.tableau import TableauProver, TableauProverCommand
+from nltk.inference.discourse import (ReadingCommand, CfgReadingCommand,
+                       DrtGlueReadingCommand, DiscourseTester)
diff --git a/nlp_resource_data/nltk/inference/__init__.pyc b/nlp_resource_data/nltk/inference/__init__.pyc
new file mode 100755 (executable)
index 0000000..0be4dc4
Binary files /dev/null and b/nlp_resource_data/nltk/inference/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/inference/api.py b/nlp_resource_data/nltk/inference/api.py
new file mode 100755 (executable)
index 0000000..ca03a96
--- /dev/null
@@ -0,0 +1,597 @@
+# Natural Language Toolkit: Theorem Prover and Model Builder Interfaces
+#
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+#         Dan Garrette <dhgarrette@gmail.com>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Interfaces and base classes for theorem provers and model builders.
+
+``Prover`` is a standard interface for a theorem prover which tries to prove a goal from a
+list of assumptions.
+
+``ModelBuilder`` is a standard interface for a model builder. Given just a set of assumptions,
+the model builder tries to build a model for the assumptions. Given a set of assumptions and a
+goal *G*, the model builder tries to find a counter-model, in the sense of a model that will satisfy
+the assumptions plus the negation of *G*.
+"""
+from __future__ import print_function
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
+import threading
+import time
+
+
+@add_metaclass(ABCMeta)
+class Prover(object):
+    """
+    Interface for trying to prove a goal from assumptions.  Both the goal and
+    the assumptions are constrained to be formulas of ``logic.Expression``.
+    """
+    def prove(self, goal=None, assumptions=None, verbose=False):
+        """
+        :return: Whether the proof was successful or not.
+        :rtype: bool
+        """
+        return self._prove(goal, assumptions, verbose)[0]
+
+    @abstractmethod
+    def _prove(self, goal=None, assumptions=None, verbose=False):
+        """
+        :return: Whether the proof was successful or not, along with the proof
+        :rtype: tuple: (bool, str)
+        """
+
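+# Illustrative use of the ``Prover`` interface (a sketch; it assumes the
+# external Prover9 binary is installed where NLTK can find it):
+#
+#     from nltk.sem import Expression
+#     from nltk.inference import Prover9
+#     a = Expression.fromstring('man(socrates)')
+#     b = Expression.fromstring('all x.(man(x) -> mortal(x))')
+#     Prover9().prove(Expression.fromstring('mortal(socrates)'), [a, b])  # True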
+
+@add_metaclass(ABCMeta)
+class ModelBuilder(object):
+    """
+    Interface for trying to build a model of a set of formulas.
+    Open formulas are assumed to be universally quantified.
+    Both the goal and the assumptions are constrained to be formulas
+    of ``logic.Expression``.
+    """
+    def build_model(self, goal=None, assumptions=None, verbose=False):
+        """
+        Perform the actual model building.
+        :return: Whether a model was generated
+        :rtype: bool
+        """
+        return self._build_model(goal, assumptions, verbose)[0]
+
+    @abstractmethod
+    def _build_model(self, goal=None, assumptions=None, verbose=False):
+        """
+        Perform the actual model building.
+        :return: Whether a model was generated, and the model itself
+        :rtype: tuple(bool, sem.Valuation)
+        """
+
+
+@add_metaclass(ABCMeta)
+class TheoremToolCommand(object):
+    """
+    This class holds a goal and a list of assumptions to be used in proving
+    or model building.
+    """
+    @abstractmethod
+    def add_assumptions(self, new_assumptions):
+        """
+        Add new assumptions to the assumption list.
+
+        :param new_assumptions: new assumptions
+        :type new_assumptions: list(sem.Expression)
+        """
+
+    @abstractmethod
+    def retract_assumptions(self, retracted, debug=False):
+        """
+        Retract assumptions from the assumption list.
+
+        :param debug: If True, give a warning when ``retracted`` is not present in
+            the assumptions list.
+        :type debug: bool
+        :param retracted: assumptions to be retracted
+        :type retracted: list(sem.Expression)
+        """
+
+    @abstractmethod
+    def assumptions(self):
+        """
+        List the current assumptions.
+
+        :return: list of ``Expression``
+        """
+
+    @abstractmethod
+    def goal(self):
+        """
+        Return the goal
+
+        :return: ``Expression``
+        """
+
+    @abstractmethod
+    def print_assumptions(self):
+        """
+        Print the list of the current assumptions.
+        """
+
+
+class ProverCommand(TheoremToolCommand):
+    """
+    This class holds a ``Prover``, a goal, and a list of assumptions.  When
+    prove() is called, the ``Prover`` is executed with the goal and assumptions.
+    """
+    @abstractmethod
+    def prove(self, verbose=False):
+        """
+        Perform the actual proof.
+        """
+
+    @abstractmethod
+    def proof(self, simplify=True):
+        """
+        Return the proof string
+        :param simplify: bool simplify the proof?
+        :return: str
+        """
+
+    @abstractmethod
+    def get_prover(self):
+        """
+        Return the prover object
+        :return: ``Prover``
+        """
+
+
+class ModelBuilderCommand(TheoremToolCommand):
+    """
+    This class holds a ``ModelBuilder``, a goal, and a list of assumptions.
+    When build_model() is called, the ``ModelBuilder`` is executed with the goal
+    and assumptions.
+    """
+    @abstractmethod
+    def build_model(self, verbose=False):
+        """
+        Perform the actual model building.
+        :return: A model if one is generated; None otherwise.
+        :rtype: sem.Valuation
+        """
+
+    @abstractmethod
+    def model(self, format=None):
+        """
+        Return a string representation of the model
+
+        :param format: str the format in which to display the model
+        :return: str
+        """
+
+    @abstractmethod
+    def get_model_builder(self):
+        """
+        Return the model builder object
+        :return: ``ModelBuilder``
+        """
+
+
+class BaseTheoremToolCommand(TheoremToolCommand):
+    """
+    This class holds a goal and a list of assumptions to be used in proving
+    or model building.
+    """
+    def __init__(self, goal=None, assumptions=None):
+        """
+        :param goal: Input expression to prove
+        :type goal: sem.Expression
+        :param assumptions: Input expressions to use as assumptions in
+            the proof.
+        :type assumptions: list(sem.Expression)
+        """
+        self._goal = goal
+
+        if not assumptions:
+            self._assumptions = []
+        else:
+            self._assumptions = list(assumptions)
+
+        self._result = None
+        """A holder for the result, to prevent unnecessary re-proving"""
+
+    def add_assumptions(self, new_assumptions):
+        """
+        Add new assumptions to the assumption list.
+
+        :param new_assumptions: new assumptions
+        :type new_assumptions: list(sem.Expression)
+        """
+        self._assumptions.extend(new_assumptions)
+        self._result = None
+
+    def retract_assumptions(self, retracted, debug=False):
+        """
+        Retract assumptions from the assumption list.
+
+        :param debug: If True, give a warning when ``retracted`` is not present in
+            the assumptions list.
+        :type debug: bool
+        :param retracted: assumptions to be retracted
+        :type retracted: list(sem.Expression)
+        """
+        retracted = set(retracted)
+        result_list = list(filter(lambda a: a not in retracted, self._assumptions))
+        if debug and result_list == self._assumptions:
+            print(Warning("Assumptions list has not been changed:"))
+            self.print_assumptions()
+
+        self._assumptions = result_list
+
+        self._result = None
+
+    def assumptions(self):
+        """
+        List the current assumptions.
+
+        :return: list of ``Expression``
+        """
+        return self._assumptions
+
+    def goal(self):
+        """
+        Return the goal
+
+        :return: ``Expression``
+        """
+        return self._goal
+
+    def print_assumptions(self):
+        """
+        Print the list of the current assumptions.
+        """
+        for a in self.assumptions():
+            print(a)
+
+
+class BaseProverCommand(BaseTheoremToolCommand, ProverCommand):
+    """
+    This class holds a ``Prover``, a goal, and a list of assumptions.  When
+    prove() is called, the ``Prover`` is executed with the goal and assumptions.
+    """
+    def __init__(self, prover, goal=None, assumptions=None):
+        """
+        :param prover: The theorem tool to execute with the assumptions
+        :type prover: Prover
+        :see: ``BaseTheoremToolCommand``
+        """
+        self._prover = prover
+        """The theorem tool to execute with the assumptions"""
+
+        BaseTheoremToolCommand.__init__(self, goal, assumptions)
+
+        self._proof = None
+
+    def prove(self, verbose=False):
+        """
+        Perform the actual proof.  Store the result to prevent unnecessary
+        re-proving.
+        """
+        if self._result is None:
+            self._result, self._proof = self._prover._prove(self.goal(),
+                                                            self.assumptions(),
+                                                            verbose)
+        return self._result
+
+    def proof(self, simplify=True):
+        """
+        Return the proof string
+        :param simplify: bool simplify the proof?
+        :return: str
+        """
+        if self._result is None:
+            raise LookupError("You have to call prove() first to get a proof!")
+        else:
+            return self.decorate_proof(self._proof, simplify)
+
+    def decorate_proof(self, proof_string, simplify=True):
+        """
+        Modify and return the proof string
+        :param proof_string: str the proof to decorate
+        :param simplify: bool simplify the proof?
+        :return: str
+        """
+        return proof_string
+
+    def get_prover(self):
+        return self._prover
+
+
+class BaseModelBuilderCommand(BaseTheoremToolCommand, ModelBuilderCommand):
+    """
+    This class holds a ``ModelBuilder``, a goal, and a list of assumptions.  When
+    build_model() is called, the ``ModelBuilder`` is executed with the goal and
+    assumptions.
+    """
+    def __init__(self, modelbuilder, goal=None, assumptions=None):
+        """
+        :param modelbuilder: The theorem tool to execute with the assumptions
+        :type modelbuilder: ModelBuilder
+        :see: ``BaseTheoremToolCommand``
+        """
+        self._modelbuilder = modelbuilder
+        """The theorem tool to execute with the assumptions"""
+
+        BaseTheoremToolCommand.__init__(self, goal, assumptions)
+
+        self._model = None
+
+    def build_model(self, verbose=False):
+        """
+        Attempt to build a model.  Store the result to prevent unnecessary
+        re-building.
+        """
+        if self._result is None:
+            self._result, self._model = \
+                    self._modelbuilder._build_model(self.goal(),
+                                                    self.assumptions(),
+                                                    verbose)
+        return self._result
+
+    def model(self, format=None):
+        """
+        Return a string representation of the model
+
+        :param format: str the format in which to display the model
+        :return: str
+        """
+        if self._result is None:
+            raise LookupError('You have to call build_model() first to '
+                              'get a model!')
+        else:
+            return self._decorate_model(self._model, format)
+
+    def _decorate_model(self, valuation_str, format=None):
+        """
+        :param valuation_str: str with the model builder's output
+        :param format: str indicating the format for displaying
+        :return: str
+        """
+        return valuation_str
+
+    def get_model_builder(self):
+        return self._modelbuilder
+
+
+class TheoremToolCommandDecorator(TheoremToolCommand):
+    """
+    A base decorator for the ``ProverCommandDecorator`` and
+    ``ModelBuilderCommandDecorator`` classes from which decorators can extend.
+    """
+    def __init__(self, command):
+        """
+        :param command: ``TheoremToolCommand`` to decorate
+        """
+        self._command = command
+
+        # The decorator has its own versions of 'result' different from the
+        # underlying command
+        self._result = None
+
+    def assumptions(self):
+        return self._command.assumptions()
+
+    def goal(self):
+        return self._command.goal()
+
+    def add_assumptions(self, new_assumptions):
+        self._command.add_assumptions(new_assumptions)
+        self._result = None
+
+    def retract_assumptions(self, retracted, debug=False):
+        self._command.retract_assumptions(retracted, debug)
+        self._result = None
+
+    def print_assumptions(self):
+        self._command.print_assumptions()
+
+
+class ProverCommandDecorator(TheoremToolCommandDecorator, ProverCommand):
+    """
+    A base decorator for the ``ProverCommand`` class from which other
+    prover command decorators can extend.
+    """
+    def __init__(self, proverCommand):
+        """
+        :param proverCommand: ``ProverCommand`` to decorate
+        """
+        TheoremToolCommandDecorator.__init__(self, proverCommand)
+
+        # The decorator has its own versions of 'result' and 'proof'
+        # because they may be different from the underlying command
+        self._proof = None
+
+    def prove(self, verbose=False):
+        if self._result is None:
+            prover = self.get_prover()
+            self._result, self._proof = prover._prove(self.goal(),
+                                                      self.assumptions(),
+                                                      verbose)
+        return self._result
+
+    def proof(self, simplify=True):
+        """
+        Return the proof string
+        :param simplify: bool simplify the proof?
+        :return: str
+        """
+        if self._result is None:
+            raise LookupError("You have to call prove() first to get a proof!")
+        else:
+            return self.decorate_proof(self._proof, simplify)
+
+    def decorate_proof(self, proof_string, simplify=True):
+        """
+        Modify and return the proof string
+        :param proof_string: str the proof to decorate
+        :param simplify: bool simplify the proof?
+        :return: str
+        """
+        return self._command.decorate_proof(proof_string, simplify)
+
+    def get_prover(self):
+        return self._command.get_prover()
+
+
+class ModelBuilderCommandDecorator(TheoremToolCommandDecorator, ModelBuilderCommand):
+    """
+    A base decorator for the ``ModelBuilderCommand`` class from which other
+    prover command decorators can extend.
+    """
+    def __init__(self, modelBuilderCommand):
+        """
+        :param modelBuilderCommand: ``ModelBuilderCommand`` to decorate
+        """
+        TheoremToolCommandDecorator.__init__(self, modelBuilderCommand)
+
+        # The decorator has its own versions of 'result' and 'valuation'
+        # because they may be different from the underlying command
+        self._model = None
+
+    def build_model(self, verbose=False):
+        """
+        Attempt to build a model.  Store the result to prevent unnecessary
+        re-building.
+        """
+        if self._result is None:
+            modelbuilder = self.get_model_builder()
+            self._result, self._model = \
+                            modelbuilder._build_model(self.goal(),
+                                                      self.assumptions(),
+                                                      verbose)
+        return self._result
+
+    def model(self, format=None):
+        """
+        Return a string representation of the model
+
+        :param format: str the format in which to display the model
+        :return: str
+        """
+        if self._result is None:
+            raise LookupError('You have to call build_model() first to '
+                              'get a model!')
+        else:
+            return self._decorate_model(self._model, format)
+
+    def _decorate_model(self, valuation_str, format=None):
+        """
+        Modify and return the model string
+        :param valuation_str: str with the model builder's output
+        :param format: str indicating the format for displaying
+        :return: str
+        """
+        return self._command._decorate_model(valuation_str, format)
+
+    def get_model_builder(self):
+        return self._command.get_model_builder()
+
+
+class ParallelProverBuilder(Prover, ModelBuilder):
+    """
+    This class stores both a prover and a model builder and when either
+    prove() or build_model() is called, then both theorem tools are run in
+    parallel.  Whichever finishes first, the prover or the model builder, is the
+    result that will be used.
+    """
+    def __init__(self, prover, modelbuilder):
+        self._prover = prover
+        self._modelbuilder = modelbuilder
+
+    def _prove(self, goal=None, assumptions=None, verbose=False):
+        return self._run(goal, assumptions, verbose), ''
+
+    def _build_model(self, goal=None, assumptions=None, verbose=False):
+        return not self._run(goal, assumptions, verbose), ''
+
+    def _run(self, goal, assumptions, verbose):
+        # Set up two threads, Prover and ModelBuilder, to run in parallel
+        tp_thread = TheoremToolThread(lambda: self._prover.prove(goal, assumptions, verbose), verbose, 'TP')
+        mb_thread = TheoremToolThread(lambda: self._modelbuilder.build_model(goal, assumptions, verbose), verbose, 'MB')
+
+        tp_thread.start()
+        mb_thread.start()
+
+        while tp_thread.is_alive() and mb_thread.is_alive():
+            # wait until either the prover or the model builder is done
+            pass
+
+        if tp_thread.result is not None:
+            return tp_thread.result
+        elif mb_thread.result is not None:
+            return not mb_thread.result
+        else:
+            return None
+
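+# A sketch of how the race is typically wired up (assumes both the Prover9
+# and Mace4 binaries are available):
+#
+#     from nltk.inference.prover9 import Prover9
+#     from nltk.inference.mace import Mace
+#     ppb = ParallelProverBuilder(Prover9(), Mace())
+#     ppb.prove(goal, assumptions)   # for some Expression goal / assumption list;
+#                                    # whichever tool finishes first decides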
+
+class ParallelProverBuilderCommand(BaseProverCommand, BaseModelBuilderCommand):
+    """
+    This command stores both a prover and a model builder and when either
+    prove() or build_model() is called, then both theorem tools are run in
+    parallel.  Whichever finishes first, the prover or the model builder, is the
+    result that will be used.
+
+    Because the theorem prover result is the opposite of the model builder
+    result, we will treat self._result as meaning "proof found/no model found".
+    """
+    def __init__(self, prover, modelbuilder, goal=None, assumptions=None):
+        BaseProverCommand.__init__(self, prover, goal, assumptions)
+        BaseModelBuilderCommand.__init__(self, modelbuilder, goal, assumptions)
+
+    def prove(self, verbose=False):
+        return self._run(verbose)
+
+    def build_model(self, verbose=False):
+        return not self._run(verbose)
+
+    def _run(self, verbose):
+        # Set up two threads, Prover and ModelBuilder, to run in parallel
+        tp_thread = TheoremToolThread(lambda: BaseProverCommand.prove(self, verbose), verbose, 'TP')
+        mb_thread = TheoremToolThread(lambda: BaseModelBuilderCommand.build_model(self, verbose), verbose, 'MB')
+
+        tp_thread.start()
+        mb_thread.start()
+
+        while tp_thread.is_alive() and mb_thread.is_alive():
+            # wait until either the prover or the model builder is done
+            pass
+
+        if tp_thread.result is not None:
+            self._result = tp_thread.result
+        elif mb_thread.result is not None:
+            self._result = not mb_thread.result
+        return self._result
+
+
+class TheoremToolThread(threading.Thread):
+    def __init__(self, command, verbose, name=None):
+        threading.Thread.__init__(self)
+        self._command = command
+        self._result = None
+        self._verbose = verbose
+        self._name = name
+
+    def run(self):
+        try:
+            self._result = self._command()
+            if self._verbose:
+                print('Thread %s finished with result %s at %s' % \
+                      (self._name, self._result, time.localtime(time.time())))
+        except Exception as e:
+            print(e)
+            print('Thread %s completed abnormally' % (self._name))
+
+    @property
+    def result(self): return self._result
diff --git a/nlp_resource_data/nltk/inference/api.pyc b/nlp_resource_data/nltk/inference/api.pyc
new file mode 100755 (executable)
index 0000000..6f587f2
Binary files /dev/null and b/nlp_resource_data/nltk/inference/api.pyc differ
diff --git a/nlp_resource_data/nltk/inference/discourse.py b/nlp_resource_data/nltk/inference/discourse.py
new file mode 100755 (executable)
index 0000000..a04d360
--- /dev/null
@@ -0,0 +1,612 @@
+# Natural Language Toolkit: Discourse Processing
+#
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+#         Dan Garrette <dhgarrette@gmail.com>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Module for incrementally developing simple discourses, and checking for semantic ambiguity,
+consistency and informativeness.
+
+Many of the ideas are based on the CURT family of programs of Blackburn and Bos
+(see http://homepages.inf.ed.ac.uk/jbos/comsem/book1.html).
+
+Consistency checking is carried out by using the ``mace`` module to call the Mace4 model builder.
+Informativeness checking is carried out with a call to ``Prover.prove()`` from
+the ``inference`` module.
+
+``DiscourseTester`` is a constructor for discourses.
+The basic data structure is a list of sentences, stored as ``self._sentences``. Each sentence in the list
+is assigned a "sentence ID" (``sid``) of the form ``s``\ *i*. For example::
+
+    s0: A boxer walks
+    s1: Every boxer chases a girl
+
+Each sentence can be ambiguous between a number of readings, each of which receives a
+"reading ID" (``rid``) of the form ``s``\ *i* -``r``\ *j*. For example::
+
+    s0 readings:
+
+    s0-r1: some x.(boxer(x) & walk(x))
+    s0-r0: some x.(boxerdog(x) & walk(x))
+
+A "thread" is a list of readings, represented as a list of ``rid``\ s.
+Each thread receives a "thread ID" (``tid``) of the form ``d``\ *i*.
+For example::
+
+    d0: ['s0-r0', 's1-r0']
+
+The set of all threads for a discourse is the Cartesian product of all the readings of the sequences of sentences.
+(This is not intended to scale beyond very short discourses!) The method ``readings(filter=True)`` will only show
+those threads which are consistent (taking into account any background assumptions).
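+
+A typical session looks roughly like this (a sketch only; the consistency
+and informativeness checks shell out to the external Mace4 and Prover9
+binaries)::
+
+    dt = DiscourseTester(['A boxer walks', 'Every boxer chases a girl'])
+    dt.readings()               # readings per sentence (s0-r0, s0-r1, ...)
+    dt.readings(threaded=True)  # threads d0, d1, ...
+    dt.models('d1')             # ask Mace4 for a model of thread d1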
+"""
+from __future__ import print_function
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+import os
+
+from operator import and_, add
+from functools import reduce
+
+from nltk.data import show_cfg
+from nltk.tag import RegexpTagger
+from nltk.parse import load_parser
+from nltk.parse.malt import MaltParser
+from nltk.sem.drt import resolve_anaphora, AnaphoraResolutionException
+from nltk.sem.glue import DrtGlue
+from nltk.sem.logic import Expression
+
+from nltk.inference.mace import MaceCommand
+from nltk.inference.prover9 import Prover9Command
+
+
+@add_metaclass(ABCMeta)
+class ReadingCommand(object):
+    @abstractmethod
+    def parse_to_readings(self, sentence):
+        """
+        :param sentence: the sentence to read
+        :type sentence: str
+        """
+
+    def process_thread(self, sentence_readings):
+        """
+        This method should be used to handle dependencies between readings such
+        as resolving anaphora.
+
+        :param sentence_readings: readings to process
+        :type sentence_readings: list(Expression)
+        :return: the list of readings after processing
+        :rtype: list(Expression)
+        """
+        return sentence_readings
+
+    @abstractmethod
+    def combine_readings(self, readings):
+        """
+        :param readings: readings to combine
+        :type readings: list(Expression)
+        :return: one combined reading
+        :rtype: Expression
+        """
+
+    @abstractmethod
+    def to_fol(self, expression):
+        """
+        Convert this expression into a First-Order Logic expression.
+
+        :param expression: an expression
+        :type expression: Expression
+        :return: a FOL version of the input expression
+        :rtype: Expression
+        """
+
+
+class CfgReadingCommand(ReadingCommand):
+    def __init__(self, gramfile=None):
+        """
+        :param gramfile: name of file where grammar can be loaded
+        :type gramfile: str
+        """
+        self._gramfile = (gramfile if gramfile else 'grammars/book_grammars/discourse.fcfg')
+        self._parser = load_parser(self._gramfile)
+
+    def parse_to_readings(self, sentence):
+        """:see: ReadingCommand.parse_to_readings()"""
+        from nltk.sem import root_semrep
+        tokens = sentence.split()
+        trees = self._parser.parse(tokens)
+        return [root_semrep(tree) for tree in trees]
+
+    def combine_readings(self, readings):
+        """:see: ReadingCommand.combine_readings()"""
+        return reduce(and_, readings)
+
+    def to_fol(self, expression):
+        """:see: ReadingCommand.to_fol()"""
+        return expression
+
+
+class DrtGlueReadingCommand(ReadingCommand):
+    def __init__(self, semtype_file=None, remove_duplicates=False,
+                 depparser=None):
+        """
+        :param semtype_file: name of file where grammar can be loaded
+        :param remove_duplicates: should duplicates be removed?
+        :param depparser: the dependency parser
+        """
+        if semtype_file is None:
+            semtype_file = os.path.join('grammars', 'sample_grammars','drt_glue.semtype')
+        self._glue = DrtGlue(semtype_file=semtype_file,
+                             remove_duplicates=remove_duplicates,
+                             depparser=depparser)
+
+    def parse_to_readings(self, sentence):
+        """:see: ReadingCommand.parse_to_readings()"""
+        return self._glue.parse_to_meaning(sentence)
+
+    def process_thread(self, sentence_readings):
+        """:see: ReadingCommand.process_thread()"""
+        try:
+            return [self.combine_readings(sentence_readings)]
+        except AnaphoraResolutionException:
+            return []
+
+    def combine_readings(self, readings):
+        """:see: ReadingCommand.combine_readings()"""
+        thread_reading = reduce(add, readings)
+        return resolve_anaphora(thread_reading.simplify())
+
+    def to_fol(self, expression):
+        """:see: ReadingCommand.to_fol()"""
+        return expression.fol()
+
+
+class DiscourseTester(object):
+    """
+    Check properties of an ongoing discourse.
+    """
+    def __init__(self, input, reading_command=None, background=None):
+        """
+        Initialize a ``DiscourseTester``.
+
+        :param input: the discourse sentences
+        :type input: list of str
+        :param background: Formulas which express background assumptions
+        :type background: list(Expression)
+        """
+        self._input = input
+        self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(input)])
+        self._models = None
+        self._readings = {}
+        self._reading_command = (reading_command if reading_command else CfgReadingCommand())
+        self._threads = {}
+        self._filtered_threads = {}
+        if background is not None:
+            from nltk.sem.logic import Expression
+            for e in background:
+                assert isinstance(e, Expression)
+            self._background = background
+        else:
+            self._background = []
+
+    ###############################
+    # Sentences
+    ###############################
+
+    def sentences(self):
+        """
+        Display the list of sentences in the current discourse.
+        """
+        for id in sorted(self._sentences):
+            print("%s: %s" % (id, self._sentences[id]))
+
+    def add_sentence(self, sentence, informchk=False, consistchk=False,):
+        """
+        Add a sentence to the current discourse.
+
+        Updates ``self._input`` and ``self._sentences``.
+        :param sentence: An input sentence
+        :type sentence: str
+        :param informchk: if ``True``, check that the result of adding the sentence is thread-informative. Updates ``self._readings``.
+        :param consistchk: if ``True``, check that the result of adding the sentence is thread-consistent. Updates ``self._readings``.
+
+        """
+        # check whether the new sentence is informative (i.e. not entailed by the previous discourse)
+        if informchk:
+            self.readings(verbose=False)
+            for tid in sorted(self._threads):
+                assumptions = [reading for (rid, reading) in self.expand_threads(tid)]
+                assumptions += self._background
+                for sent_reading in self._get_readings(sentence):
+                    tp = Prover9Command(goal=sent_reading, assumptions=assumptions)
+                    if tp.prove():
+                        print("Sentence '%s' under reading '%s':" % (sentence, str(sent_reading)))
+                        print("Not informative relative to thread '%s'" % tid)
+
+        self._input.append(sentence)
+        self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(self._input)])
+        # check whether adding the new sentence to the discourse preserves
+        # consistency (i.e. a model can be found for the combined set of
+        # assumptions)
+        if consistchk:
+            self.readings(verbose=False)
+            self.models(show=False)
+
+    def retract_sentence(self, sentence, verbose=True):
+        """
+        Remove a sentence from the current discourse.
+
+        Updates ``self._input``, ``self._sentences`` and ``self._readings``.
+        :param sentence: An input sentence
+        :type sentence: str
+        :param verbose: If ``True``,  report on the updated list of sentences.
+        """
+        try:
+            self._input.remove(sentence)
+        except ValueError:
+            print("Retraction failed. The sentence '%s' is not part of the current discourse:" % sentence)
+            self.sentences()
+            return None
+        self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(self._input)])
+        self.readings(verbose=False)
+        if verbose:
+            print("Current sentences are ")
+            self.sentences()
+
+    def grammar(self):
+        """
+        Print out the grammar in use for parsing input sentences
+        """
+        show_cfg(self._reading_command._gramfile)
+
+    ###############################
+    # Readings and Threads
+    ###############################
+
+    def _get_readings(self, sentence):
+        """
+        Build a list of semantic readings for a sentence.
+
+        :rtype: list(Expression)
+        """
+        return self._reading_command.parse_to_readings(sentence)
+
+    def _construct_readings(self):
+        """
+        Use ``self._sentences`` to construct a value for ``self._readings``.
+        """
+        # re-initialize self._readings in case we have retracted a sentence
+        self._readings = {}
+        for sid in sorted(self._sentences):
+            sentence = self._sentences[sid]
+            readings = self._get_readings(sentence)
+            self._readings[sid] = dict([("%s-r%s" % (sid, rid), reading.simplify())
+                                                        for rid, reading in enumerate(sorted(readings, key=str))])
+
+    def _construct_threads(self):
+        """
+        Use ``self._readings`` to construct a value for ``self._threads``
+        and use the model builder to construct a value for ``self._filtered_threads``
+        """
+        thread_list = [[]]
+        for sid in sorted(self._readings):
+            thread_list = self.multiply(thread_list, sorted(self._readings[sid]))
+        self._threads = dict([("d%s" % tid, thread) for tid, thread in enumerate(thread_list)])
+        # re-initialize the filtered threads
+        self._filtered_threads = {}
+        # keep the same ids, but only include threads which get models
+        consistency_checked = self._check_consistency(self._threads)
+        for (tid, thread) in self._threads.items():
+            if (tid, True) in consistency_checked:
+                self._filtered_threads[tid] = thread
+
+    def _show_readings(self, sentence=None):
+        """
+        Print out the readings for the discourse (or a single sentence).
+        """
+        if sentence is not None:
+            print("The sentence '%s' has these readings:" % sentence)
+            for r in [str(reading) for reading in (self._get_readings(sentence))]:
+                print("    %s" % r)
+        else:
+            for sid in sorted(self._readings):
+                print()
+                print('%s readings:' % sid)
+                print() #'-' * 30
+                for rid in sorted(self._readings[sid]):
+                    lf = self._readings[sid][rid]
+                    print("%s: %s" % (rid, lf.normalize()))
+
+    def _show_threads(self, filter=False, show_thread_readings=False):
+        """
+        Print out the value of ``self._threads`` or ``self._filtered_threads``
+        """
+        threads = (self._filtered_threads if filter else self._threads)
+        for tid in sorted(threads):
+            if show_thread_readings:
+                readings = [self._readings[rid.split('-')[0]][rid]
+                            for rid in self._threads[tid]]
+                try:
+                    thread_reading = ": %s" % \
+                              self._reading_command.combine_readings(readings).normalize()
+                except Exception as e:
+                    thread_reading = ': INVALID: %s' % e.__class__.__name__
+            else:
+                thread_reading = ''
+
+            print("%s:" % tid, self._threads[tid], thread_reading)
+
+
+    def readings(self, sentence=None, threaded=False, verbose=True,
+                 filter=False, show_thread_readings=False):
+        """
+        Construct and show the readings of the discourse (or of a single sentence).
+
+        :param sentence: test just this sentence
+        :type sentence: str
+        :param threaded: if ``True``, print out each thread ID and the corresponding thread.
+        :param filter: if ``True``, only print out consistent thread IDs and threads.
+        """
+        self._construct_readings()
+        self._construct_threads()
+
+        # if we are filtering or showing thread readings, show threads
+        if filter or show_thread_readings:
+            threaded = True
+
+        if verbose:
+            if not threaded:
+                self._show_readings(sentence=sentence)
+            else:
+                self._show_threads(filter=filter,
+                                   show_thread_readings=show_thread_readings)
+
+    def expand_threads(self, thread_id, threads=None):
+        """
+        Given a thread ID, find the list of ``logic.Expression`` objects corresponding to the reading IDs in that thread.
+
+        :param thread_id: thread ID
+        :type thread_id: str
+        :param threads: a mapping from thread IDs to lists of reading IDs
+        :type threads: dict
+        :return: A list of pairs ``(rid, reading)`` where reading is the ``logic.Expression`` associated with a reading ID
+        :rtype: list of tuple
+        """
+        if threads is None:
+            threads = self._threads
+        return [(rid, self._readings[sid][rid]) for rid in threads[thread_id] for sid in rid.split('-')[:1]]
+
+
+    ###############################
+    # Models and Background
+    ###############################
+
+    def _check_consistency(self, threads, show=False, verbose=False):
+        results = []
+        for tid in sorted(threads):
+            assumptions = [reading for (rid, reading) in self.expand_threads(tid, threads=threads)]
+            assumptions = list(map(self._reading_command.to_fol, self._reading_command.process_thread(assumptions)))
+            if assumptions:
+                assumptions += self._background
+                # if Mace4 finds a model, it always seems to find it quickly
+                mb = MaceCommand(None, assumptions, max_models=20)
+                modelfound = mb.build_model()
+            else:
+                modelfound = False
+            results.append((tid, modelfound))
+            if show:
+                spacer(80)
+                print("Model for Discourse Thread %s" % tid)
+                spacer(80)
+                if verbose:
+                    for a in assumptions:
+                        print(a)
+                    spacer(80)
+                if modelfound:
+                    print(mb.model(format='cooked'))
+                else:
+                    print("No model found!\n")
+        return results
+
+    def models(self, thread_id=None, show=True, verbose=False):
+        """
+        Call Mace4 to build a model for each current discourse thread.
+
+        :param thread_id: thread ID
+        :type thread_id: str
+        :param show: If ``True``, display the model that has been found.
+        """
+        self._construct_readings()
+        self._construct_threads()
+        threads = ({thread_id: self._threads[thread_id]} if thread_id else self._threads)
+
+        for (tid, modelfound) in self._check_consistency(threads, show=show, verbose=verbose):
+            idlist = [rid for rid in threads[tid]]
+
+            if not modelfound:
+                print("Inconsistent discourse: %s %s:" % (tid, idlist))
+                for rid, reading in self.expand_threads(tid):
+                    print("    %s: %s" % (rid, reading.normalize()))
+                print()
+            else:
+                print("Consistent discourse: %s %s:" % (tid, idlist))
+                for rid, reading in self.expand_threads(tid):
+                    print("    %s: %s" % (rid, reading.normalize()))
+                print()
+
+    def add_background(self, background, verbose=False):
+        """
+        Add a list of background assumptions for reasoning about the discourse.
+
+        When called, this method also updates the discourse model's set of readings and threads.
+        :param background: Formulas which contain background information
+        :type background: list(Expression)
+        """
+        from nltk.sem.logic import Expression
+        for (count, e) in enumerate(background):
+            assert isinstance(e, Expression)
+            if verbose:
+                print("Adding assumption %s to background" % count)
+            self._background.append(e)
+
+        #update the state
+        self._construct_readings()
+        self._construct_threads()
+
+    def background(self):
+        """
+        Show the current background assumptions.
+        """
+        for e in self._background:
+            print(str(e))
+
+    ###############################
+    # Misc
+    ###############################
+
+    @staticmethod
+    def multiply(discourse, readings):
+        """
+        Multiply every thread in ``discourse`` by every reading in ``readings``.
+
+        Given discourse = [['A'], ['B']], readings = ['a', 'b', 'c'] , returns
+        [['A', 'a'], ['A', 'b'], ['A', 'c'], ['B', 'a'], ['B', 'b'], ['B', 'c']]
+
+        :param discourse: the current list of readings
+        :type discourse: list of lists
+        :param readings: an additional list of readings
+        :type readings: list(Expression)
+        :rtype: A list of lists
+        """
+        result = []
+        for sublist in discourse:
+            for r in readings:
+                new = []
+                new += sublist
+                new.append(r)
+                result.append(new)
+        return result
+
+#multiply = DiscourseTester.multiply
+#L1 = [['A'], ['B']]
+#L2 = ['a', 'b', 'c']
+#print multiply(L1,L2)
+
+
+def load_fol(s):
+    """
+    Temporarily duplicated from ``nltk.sem.util``.
+    Convert a  file of first order formulas into a list of ``Expression`` objects.
+
+    :param s: the contents of the file
+    :type s: str
+    :return: a list of parsed formulas.
+    :rtype: list(Expression)
+    """
+    statements = []
+    for linenum, line in enumerate(s.splitlines()):
+        line = line.strip()
+        if line.startswith('#') or line == '':
+            continue
+        try:
+            statements.append(Expression.fromstring(line))
+        except Exception:
+            raise ValueError('Unable to parse line %s: %s' % (linenum, line))
+    return statements
+
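+# Illustrative example: load_fol("all x.(boxer(x) -> person(x))") returns a
+# one-element list containing the parsed Expression; blank lines and lines
+# starting with '#' are skipped.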
+
+###############################
+# Demo
+###############################
+def discourse_demo(reading_command=None):
+    """
+    Illustrate the various methods of ``DiscourseTester``
+    """
+    dt = DiscourseTester(['A boxer walks', 'Every boxer chases a girl'],
+                         reading_command)
+    dt.models()
+    print()
+    # dt.grammar()
+    print()
+    dt.sentences()
+    print()
+    dt.readings()
+    print()
+    dt.readings(threaded=True)
+    print()
+    dt.models('d1')
+    dt.add_sentence('John is a boxer')
+    print()
+    dt.sentences()
+    print()
+    dt.readings(threaded=True)
+    print()
+    dt = DiscourseTester(['A student dances', 'Every student is a person'],
+                         reading_command)
+    print()
+    dt.add_sentence('No person dances', consistchk=True)
+    print()
+    dt.readings()
+    print()
+    dt.retract_sentence('No person dances', verbose=True)
+    print()
+    dt.models()
+    print()
+    dt.readings('A person dances')
+    print()
+    dt.add_sentence('A person dances', informchk=True)
+    dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer',
+                          'Vincent is married', 'Fido barks'],
+                         reading_command)
+    dt.readings(filter=True)
+    import nltk.data
+    background_file = os.path.join('grammars', 'book_grammars', 'background.fol')
+    background = nltk.data.load(background_file)
+
+    print()
+    dt.add_background(background, verbose=False)
+    dt.background()
+    print()
+    dt.readings(filter=True)
+    print()
+    dt.models()
+
+
+def drt_discourse_demo(reading_command=None):
+    """
+    Illustrate the various methods of ``DiscourseTester``
+    """
+    dt = DiscourseTester(['every dog chases a boy', 'he runs'],
+                         reading_command)
+    dt.models()
+    print()
+    dt.sentences()
+    print()
+    dt.readings()
+    print()
+    dt.readings(show_thread_readings=True)
+    print()
+    dt.readings(filter=True, show_thread_readings=True)
+
+
+def spacer(num=30):
+    print('-' * num)
+
+
+def demo():
+    discourse_demo()
+
+    tagger = RegexpTagger([('^(chases|runs)$', 'VB'),
+                           ('^(a)$', 'ex_quant'),
+                           ('^(every)$', 'univ_quant'),
+                           ('^(dog|boy)$', 'NN'),
+                           ('^(he)$', 'PRP')])
+    depparser = MaltParser(tagger=tagger)
+    drt_discourse_demo(DrtGlueReadingCommand(remove_duplicates=False,
+                                             depparser=depparser))
+
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/inference/discourse.pyc b/nlp_resource_data/nltk/inference/discourse.pyc
new file mode 100755 (executable)
index 0000000..64f9d18
Binary files /dev/null and b/nlp_resource_data/nltk/inference/discourse.pyc differ
diff --git a/nlp_resource_data/nltk/inference/mace.py b/nlp_resource_data/nltk/inference/mace.py
new file mode 100755 (executable)
index 0000000..83e841b
--- /dev/null
@@ -0,0 +1,311 @@
+# Natural Language Toolkit: Interface to the Mace4 Model Builder
+#
+# Author: Dan Garrette <dhgarrette@gmail.com>
+#         Ewan Klein <ewan@inf.ed.ac.uk>
+
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A model builder that makes use of the external 'Mace4' package.
+"""
+from __future__ import print_function
+
+from nltk.sem.logic import is_indvar
+from nltk.sem import Valuation, Expression
+
+from nltk.inference.api import ModelBuilder, BaseModelBuilderCommand
+from nltk.inference.prover9 import Prover9CommandParent, Prover9Parent
+
+
+class MaceCommand(Prover9CommandParent, BaseModelBuilderCommand):
+    """
+    A ``MaceCommand`` specific to the ``Mace`` model builder.  It contains
+    a print_assumptions() method that is used to print the list
+    of assumptions in multiple formats.
+    """
+    _interpformat_bin = None
+
+    def __init__(self, goal=None, assumptions=None, max_models=500, model_builder=None):
+        """
+        :param goal: Input expression to prove
+        :type goal: sem.Expression
+        :param assumptions: Input expressions to use as assumptions in
+            the proof.
+        :type assumptions: list(sem.Expression)
+        :param max_models: The maximum model (domain) size that Mace will try
+            before simply returning false. (Use 0 for no maximum.)
+        :type max_models: int
+        """
+        if model_builder is not None:
+            assert isinstance(model_builder, Mace)
+        else:
+            model_builder = Mace(max_models)
+
+        BaseModelBuilderCommand.__init__(self, model_builder, goal, assumptions)
+
+    @property
+    def valuation(self):
+        return self.model('valuation')
+
+    def _convert2val(self, valuation_str):
+        """
+        Transform the output file into an NLTK-style Valuation.
+
+        :return: A model if one is generated; None otherwise.
+        :rtype: sem.Valuation
+        """
+        valuation_standard_format = self._transform_output(valuation_str, 'standard')
+
+        val = []
+        for line in valuation_standard_format.splitlines(False):
+            l = line.strip()
+
+            if l.startswith('interpretation'):
+                # find the number of entities in the model
+                num_entities = int(l[l.index('(')+1:l.index(',')].strip())
+
+            elif l.startswith('function') and l.find('_') == -1:
+                # replace the integer identifier with a corresponding alphabetic character
+                name = l[l.index('(')+1:l.index(',')].strip()
+                if is_indvar(name):
+                    name = name.upper()
+                value = int(l[l.index('[')+1:l.index(']')].strip())
+                val.append((name, MaceCommand._make_model_var(value)))
+
+            elif l.startswith('relation'):
+                l = l[l.index('(')+1:]
+                if '(' in l:
+                    #relation is not nullary
+                    name = l[:l.index('(')].strip()
+                    values = [int(v.strip()) for v in l[l.index('[')+1:l.index(']')].split(',')]
+                    val.append((name, MaceCommand._make_relation_set(num_entities, values)))
+                else:
+                    #relation is nullary
+                    name = l[:l.index(',')].strip()
+                    value = int(l[l.index('[')+1:l.index(']')].strip())
+                    val.append((name, value == 1))
+
+        return Valuation(val)
+
+    @staticmethod
+    def _make_relation_set(num_entities, values):
+        """
+        Convert a Mace4-style relation table into a dictionary.
+
+        :param num_entities: the number of entities in the model; determines the row length in the table.
+        :type num_entities: int
+        :param values: a list of 1's and 0's that represent whether a relation holds in a Mace4 model.
+        :type values: list of int
+        """
+        r = set()
+        for position in [pos for (pos,v) in enumerate(values) if v == 1]:
+            r.add(tuple(MaceCommand._make_relation_tuple(position, values, num_entities)))
+        return r
+
+    @staticmethod
+    def _make_relation_tuple(position, values, num_entities):
+        if len(values) == 1:
+            return []
+        else:
+            sublist_size = len(values) // num_entities
+            sublist_start = position // sublist_size
+            sublist_position = int(position % sublist_size)
+
+            sublist = values[sublist_start*sublist_size:(sublist_start+1)*sublist_size]
+            return [MaceCommand._make_model_var(sublist_start)] + \
+                   MaceCommand._make_relation_tuple(sublist_position,
+                                                    sublist,
+                                                    num_entities)
+
+    @staticmethod
+    def _make_model_var(value):
+        """
+        Pick an alphabetic character as identifier for an entity in the model.
+
+        :param value: where to index into the list of characters
+        :type value: int
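+
+        Illustrative mapping: 0 -> 'a', 2 -> 'c'; values past 'z' are
+        intended to wrap around with a numeric suffix (e.g. 26 -> 'a1').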
+        """
+        letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n',
+                  'o','p','q','r','s','t','u','v','w','x','y','z'][value % 26]
+        num = value // 26
+        return (letter + str(num) if num > 0 else letter)
+
+    def _decorate_model(self, valuation_str, format):
+        """
+        Print out a Mace4 model using any Mace4 ``interpformat`` format.
+        See http://www.cs.unm.edu/~mccune/mace4/manual/ for details.
+
+        :param valuation_str: str with the model builder's output
+        :param format: str indicating the format for displaying models.
+            If None or empty, the raw model string is returned unchanged.
+        :return: str
+        """
+        if not format:
+            return valuation_str
+        elif format == 'valuation':
+            return self._convert2val(valuation_str)
+        else:
+            return self._transform_output(valuation_str, format)
+
+    def _transform_output(self, valuation_str, format):
+        """
+        Transform the output file into any Mace4 ``interpformat`` format.
+
+        :param format: Output format for displaying models.
+        :type format: str
+        """
+        if format in ['standard', 'standard2', 'portable', 'tabular',
+                      'raw', 'cooked', 'xml', 'tex']:
+            return self._call_interpformat(valuation_str, [format])[0]
+        else:
+            raise LookupError("The specified format does not exist")
+
+    def _call_interpformat(self, input_str, args=[], verbose=False):
+        """
+        Call the ``interpformat`` binary with the given input.
+
+        :param input_str: A string whose contents are used as stdin.
+        :param args: A list of command-line arguments.
+        :return: A tuple (stdout, returncode)
+        :see: ``config_prover9``
+        """
+        if self._interpformat_bin is None:
+            self._interpformat_bin = self._modelbuilder._find_binary(
+                                                'interpformat', verbose)
+
+        return self._modelbuilder._call(input_str, self._interpformat_bin,
+                                        args, verbose)
+
+
+class Mace(Prover9Parent, ModelBuilder):
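+    """
+    A model builder backed by the external ``mace4`` binary, which is assumed
+    to be installed and locatable (e.g. via the PROVER9 environment variable
+    or the standard search paths in ``Prover9Parent``).
+    """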
+    _mace4_bin = None
+
+    def __init__(self, end_size=500):
+        self._end_size = end_size
+        """The maximum model size that Mace will try before
+           simply returning false. (Use -1 for no maximum.)"""
+
+    def _build_model(self, goal=None, assumptions=None, verbose=False):
+        """
+        Use Mace4 to build a first order model.
+
+        :return: ``True`` if a model was found (i.e. Mace returns value of 0),
+        else ``False``
+        """
+        if not assumptions:
+            assumptions = []
+
+        stdout, returncode = self._call_mace4(self.prover9_input(goal, assumptions),
+                                              verbose=verbose)
+        return (returncode == 0, stdout)
+
+    def _call_mace4(self, input_str, args=[], verbose=False):
+        """
+        Call the ``mace4`` binary with the given input.
+
+        :param input_str: A string whose contents are used as stdin.
+        :param args: A list of command-line arguments.
+        :return: A tuple (stdout, returncode)
+        :see: ``config_prover9``
+        """
+        if self._mace4_bin is None:
+            self._mace4_bin = self._find_binary('mace4', verbose)
+
+        updated_input_str = ''
+        if self._end_size > 0:
+            updated_input_str += 'assign(end_size, %d).\n\n' % self._end_size
+        updated_input_str += input_str
+
+        return self._call(updated_input_str, self._mace4_bin, args, verbose)
+
+
+def spacer(num=30):
+    print('-' * num)
+
+def decode_result(found):
+    """
+    Decode the result of model_found()
+
+    :param found: The output of model_found()
+    :type found: bool
+    """
+    return {True: 'Countermodel found', False: 'No countermodel found', None: 'None'}[found]
+
+def test_model_found(arguments):
+    """
+    Try some proofs and exhibit the results.
+    """
+    for (goal, assumptions) in arguments:
+        g = Expression.fromstring(goal)
+        alist = [Expression.fromstring(a) for a in assumptions]
+        m = MaceCommand(g, assumptions=alist, max_models=50)
+        found = m.build_model()
+        for a in alist:
+            print('   %s' % a)
+        print('|- %s: %s\n' % (g, decode_result(found)))
+
+
+def test_build_model(arguments):
+    """
+    Try to build a ``nltk.sem.Valuation``.
+    """
+    g = Expression.fromstring('all x.man(x)')
+    alist = [Expression.fromstring(a) for a in ['man(John)',
+                                   'man(Socrates)',
+                                   'man(Bill)',
+                                   'some x.(-(x = John) & man(x) & sees(John,x))',
+                                   'some x.(-(x = Bill) & man(x))',
+                                   'all x.some y.(man(x) -> gives(Socrates,x,y))']]
+
+    m = MaceCommand(g, assumptions=alist)
+    m.build_model()
+    spacer()
+    print("Assumptions and Goal")
+    spacer()
+    for a in alist:
+        print('   %s' % a)
+    print('|- %s: %s\n' % (g, decode_result(m.build_model())))
+    spacer()
+    #print m.model('standard')
+    #print m.model('cooked')
+    print("Valuation")
+    spacer()
+    print(m.valuation, '\n')
+
+def test_transform_output(argument_pair):
+    """
+    Transform the model into various Mace4 ``interpformat`` formats.
+    """
+    g = Expression.fromstring(argument_pair[0])
+    alist = [Expression.fromstring(a) for a in argument_pair[1]]
+    m = MaceCommand(g, assumptions=alist)
+    m.build_model()
+    for a in alist:
+        print('   %s' % a)
+    print('|- %s: %s\n' % (g, m.build_model()))
+    for format in ['standard', 'portable', 'xml', 'cooked']:
+        spacer()
+        print("Using '%s' format" % format)
+        spacer()
+        print(m.model(format=format))
+
+def test_make_relation_set():
+    print(MaceCommand._make_relation_set(num_entities=3, values=[1,0,1]) == set([('c',), ('a',)]))
+    print(MaceCommand._make_relation_set(num_entities=3, values=[0,0,0,0,0,0,1,0,0]) == set([('c', 'a')]))
+    print(MaceCommand._make_relation_set(num_entities=2, values=[0,0,1,0,0,0,1,0]) == set([('a', 'b', 'a'), ('b', 'b', 'a')]))
+
+arguments = [
+    ('mortal(Socrates)', ['all x.(man(x) -> mortal(x))', 'man(Socrates)']),
+    ('(not mortal(Socrates))', ['all x.(man(x) -> mortal(x))', 'man(Socrates)'])
+]
+
+def demo():
+    test_model_found(arguments)
+    test_build_model(arguments)
+    test_transform_output(arguments[1])
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/inference/mace.pyc b/nlp_resource_data/nltk/inference/mace.pyc
new file mode 100755 (executable)
index 0000000..d01e8e4
Binary files /dev/null and b/nlp_resource_data/nltk/inference/mace.pyc differ
diff --git a/nlp_resource_data/nltk/inference/nonmonotonic.py b/nlp_resource_data/nltk/inference/nonmonotonic.py
new file mode 100755 (executable)
index 0000000..60c9cd8
--- /dev/null
@@ -0,0 +1,509 @@
+# Natural Language Toolkit: Nonmonotonic Reasoning
+#
+# Author: Daniel H. Garrette <dhgarrette@gmail.com>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org>
+# For license information, see LICENSE.TXT
+
+"""
+A module to perform nonmonotonic reasoning.  The ideas and demonstrations in
+this module are based on "Logical Foundations of Artificial Intelligence" by
+Michael R. Genesereth and Nils J. Nilsson.
+"""
+from __future__ import print_function, unicode_literals
+
+from nltk.inference.prover9 import Prover9, Prover9Command
+from collections import defaultdict
+from functools import reduce
+
+from nltk.sem.logic import (VariableExpression, EqualityExpression,
+                            ApplicationExpression, Expression,
+                            AbstractVariableExpression, AllExpression,
+                            BooleanExpression, NegatedExpression,
+                            ExistsExpression, Variable, ImpExpression,
+                            AndExpression, unique_variable, operator)
+
+from nltk.inference.api import Prover, ProverCommandDecorator
+from nltk.compat import python_2_unicode_compatible
+
+class ProverParseError(Exception): pass
+
+def get_domain(goal, assumptions):
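+    """
+    Return the set of individual constants occurring in the assumptions and
+    the (negated) goal.  For example, with the goal '-walk(Bill)' and the
+    single assumption 'walk(Socrates)', the domain is {Socrates, Bill}.
+    """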
+    if goal is None:
+        all_expressions = assumptions
+    else:
+        all_expressions = assumptions + [-goal]
+    return reduce(operator.or_, (a.constants() for a in all_expressions), set())
+
+class ClosedDomainProver(ProverCommandDecorator):
+    """
+    This is a prover decorator that adds domain closure assumptions before
+    proving.
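+
+    For example (illustrative), if the domain consists of Socrates and Bill,
+    'exists x.walk(x)' is rewritten as '(walk(Socrates) | walk(Bill))' and
+    'all x.walk(x)' as '(walk(Socrates) & walk(Bill))'.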
+    """
+    def assumptions(self):
+        assumptions = [a for a in self._command.assumptions()]
+        goal = self._command.goal()
+        domain = get_domain(goal, assumptions)
+        return [self.replace_quants(ex, domain) for ex in assumptions]
+
+    def goal(self):
+        goal = self._command.goal()
+        domain = get_domain(goal, self._command.assumptions())
+        return self.replace_quants(goal, domain)
+
+    def replace_quants(self, ex, domain):
+        """
+        Apply the closed domain assumption to the expression
+         - Domain = union([e.free()|e.constants() for e in all_expressions])
+         - translate "exists x.P" to "(z=d1 | z=d2 | ... ) & P.replace(x,z)" OR
+                     "P.replace(x, d1) | P.replace(x, d2) | ..."
+         - translate "all x.P" to "P.replace(x, d1) & P.replace(x, d2) & ..."
+        :param ex: ``Expression``
+        :param domain: set of {Variable}s
+        :return: ``Expression``
+        """
+        if isinstance(ex, AllExpression):
+            conjuncts = [ex.term.replace(ex.variable, VariableExpression(d))
+                         for d in domain]
+            conjuncts = [self.replace_quants(c, domain) for c in conjuncts]
+            return reduce(lambda x,y: x&y, conjuncts)
+        elif isinstance(ex, BooleanExpression):
+            return ex.__class__(self.replace_quants(ex.first, domain),
+                                self.replace_quants(ex.second, domain) )
+        elif isinstance(ex, NegatedExpression):
+            return -self.replace_quants(ex.term, domain)
+        elif isinstance(ex, ExistsExpression):
+            disjuncts = [ex.term.replace(ex.variable, VariableExpression(d))
+                         for d in domain]
+            disjuncts = [self.replace_quants(d, domain) for d in disjuncts]
+            return reduce(lambda x,y: x|y, disjuncts)
+        else:
+            return ex
+
+class UniqueNamesProver(ProverCommandDecorator):
+    """
+    This is a prover decorator that adds unique names assumptions before
+    proving.
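+
+    For example (illustrative), given the assumptions 'man(Socrates)' and
+    'man(Bill)', the decorator adds '-(Socrates = Bill)' unless the equality
+    of the two names can be proved from the premises.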
+    """
+    def assumptions(self):
+        """
+         - Domain = union([e.free()|e.constants() for e in all_expressions])
+         - if "d1 = d2" cannot be proven from the premises, then add "d1 != d2"
+        """
+        assumptions = self._command.assumptions()
+
+        domain = list(get_domain(self._command.goal(), assumptions))
+
+        #build a dictionary of obvious equalities
+        eq_sets = SetHolder()
+        for a in assumptions:
+            if isinstance(a, EqualityExpression):
+                av = a.first.variable
+                bv = a.second.variable
+                #put 'a' and 'b' in the same set
+                eq_sets[av].add(bv)
+
+        new_assumptions = []
+        for i,a in enumerate(domain):
+            for b in domain[i+1:]:
+                #if a and b are not already in the same equality set
+                if b not in eq_sets[a]:
+                    newEqEx = EqualityExpression(VariableExpression(a),
+                                                 VariableExpression(b))
+                    if Prover9().prove(newEqEx, assumptions):
+                        #we can prove that the names are the same entity.
+                        #remember that they are equal so we don't re-check.
+                        eq_sets[a].add(b)
+                    else:
+                        #we can't prove it, so assume unique names
+                        new_assumptions.append(-newEqEx)
+
+        return assumptions + new_assumptions
+
+class SetHolder(list):
+    """
+    A list of sets of Variables.
+    """
+    def __getitem__(self, item):
+        """
+        :param item: ``Variable``
+        :return: the set containing 'item'
+        """
+        assert isinstance(item, Variable)
+        for s in self:
+            if item in s:
+                return s
+        #item is not found in any existing set.  so create a new set
+        new = set([item])
+        self.append(new)
+        return new
+
+class ClosedWorldProver(ProverCommandDecorator):
+    """
+    This is a prover decorator that completes predicates before proving.
+
+    If the assumptions contain "P(A)", then "all x.(P(x) -> (x=A))" is the completion of "P".
+    If the assumptions contain "all x.(ostrich(x) -> bird(x))", then "all x.(bird(x) -> ostrich(x))" is the completion of "bird".
+    If the assumptions do not contain anything that is "P", then "all x.-P(x)" is the completion of "P".
+
+    walk(Socrates)
+    Socrates != Bill
+    + all x.(walk(x) -> (x=Socrates))
+    ----------------
+    -walk(Bill)
+
+    see(Socrates, John)
+    see(John, Mary)
+    Socrates != John
+    John != Mary
+    + all x.all y.(see(x,y) -> ((x=Socrates & y=John) | (x=John & y=Mary)))
+    ----------------
+    -see(Socrates, Mary)
+
+    all x.(ostrich(x) -> bird(x))
+    bird(Tweety)
+    -ostrich(Sam)
+    Sam != Tweety
+    + all x.(bird(x) -> (ostrich(x) | x=Tweety))
+    + all x.-ostrich(x)
+    -------------------
+    -bird(Sam)
+    """
+    def assumptions(self):
+        assumptions = self._command.assumptions()
+
+        predicates = self._make_predicate_dict(assumptions)
+
+        new_assumptions = []
+        for p in predicates:
+            predHolder = predicates[p]
+            new_sig = self._make_unique_signature(predHolder)
+            new_sig_exs = [VariableExpression(v) for v in new_sig]
+
+            disjuncts = []
+
+            #Turn the signatures into disjuncts
+            for sig in predHolder.signatures:
+                equality_exs = []
+                for v1,v2 in zip(new_sig_exs, sig):
+                    equality_exs.append(EqualityExpression(v1,v2))
+                disjuncts.append(reduce(lambda x,y: x&y, equality_exs))
+
+            #Turn the properties into disjuncts
+            for prop in predHolder.properties:
+                #replace variables from the signature with new sig variables
+                bindings = {}
+                for v1,v2 in zip(new_sig_exs, prop[0]):
+                    bindings[v2] = v1
+                disjuncts.append(prop[1].substitute_bindings(bindings))
+
+            #make the assumption
+            if disjuncts:
+                #disjuncts exist, so make an implication
+                antecedent = self._make_antecedent(p, new_sig)
+                consequent = reduce(lambda x,y: x|y, disjuncts)
+                accum = ImpExpression(antecedent, consequent)
+            else:
+                #nothing has property 'p'
+                accum = NegatedExpression(self._make_antecedent(p, new_sig))
+
+            #quantify the implication
+            for new_sig_var in new_sig[::-1]:
+                accum = AllExpression(new_sig_var, accum)
+            new_assumptions.append(accum)
+
+        return assumptions + new_assumptions
+
+    def _make_unique_signature(self, predHolder):
+        """
+        This method figures out how many arguments the predicate takes and
+        returns a tuple containing that number of unique variables.
+        """
+        return tuple(unique_variable() for i in range(predHolder.signature_len))
+
+    def _make_antecedent(self, predicate, signature):
+        """
+        Return an application expression with 'predicate' as the predicate
+        and 'signature' as the list of arguments.
+        """
+        antecedent = predicate
+        for v in signature:
+            antecedent = antecedent(VariableExpression(v))
+        return antecedent
+
+    def _make_predicate_dict(self, assumptions):
+        """
+        Create a dictionary of predicates from the assumptions.
+
+        :param assumptions: a list of ``Expression``s
+        :return: dict mapping ``AbstractVariableExpression`` to ``PredHolder``
+        """
+        predicates = defaultdict(PredHolder)
+        for a in assumptions:
+            self._map_predicates(a, predicates)
+        return predicates
+
+    def _map_predicates(self, expression, predDict):
+        if isinstance(expression, ApplicationExpression):
+            func, args = expression.uncurry()
+            if isinstance(func, AbstractVariableExpression):
+                predDict[func].append_sig(tuple(args))
+        elif isinstance(expression, AndExpression):
+            self._map_predicates(expression.first, predDict)
+            self._map_predicates(expression.second, predDict)
+        elif isinstance(expression, AllExpression):
+            #collect all the universally quantified variables
+            sig = [expression.variable]
+            term = expression.term
+            while isinstance(term, AllExpression):
+                sig.append(term.variable)
+                term = term.term
+            if isinstance(term, ImpExpression):
+                if isinstance(term.first, ApplicationExpression) and \
+                   isinstance(term.second, ApplicationExpression):
+                    func1, args1 = term.first.uncurry()
+                    func2, args2 = term.second.uncurry()
+                    if isinstance(func1, AbstractVariableExpression) and \
+                       isinstance(func2, AbstractVariableExpression) and \
+                       sig == [v.variable for v in args1] and \
+                       sig == [v.variable for v in args2]:
+                        predDict[func2].append_prop((tuple(sig), term.first))
+                        predDict[func1].validate_sig_len(sig)
+
+@python_2_unicode_compatible
+class PredHolder(object):
+    """
+    This class will be used by a dictionary that will store information
+    about predicates to be used by the ``ClosedWorldProver``.
+
+    The 'signatures' property is a list of tuples defining signatures for
+    which the predicate is true.  For instance, 'see(john, mary)' would
+    result in the signature '(john,mary)' for 'see'.
+
+    The 'properties' property is a list of pairs such that the first
+    element of the pair is a tuple of variables and the second element is an
+    expression of those variables that makes the predicate true.  For instance,
+    'all x.all y.(see(x,y) -> know(x,y))' would result in "((x,y),('see(x,y)'))"
+    for 'know'.
+    """
+    def __init__(self):
+        self.signatures = []
+        self.properties = []
+        self.signature_len = None
+
+    def append_sig(self, new_sig):
+        self.validate_sig_len(new_sig)
+        self.signatures.append(new_sig)
+
+    def append_prop(self, new_prop):
+        self.validate_sig_len(new_prop[0])
+        self.properties.append(new_prop)
+
+    def validate_sig_len(self, new_sig):
+        if self.signature_len is None:
+            self.signature_len = len(new_sig)
+        elif self.signature_len != len(new_sig):
+            raise Exception("Signature lengths do not match")
+
+    def __str__(self):
+        return '(%s,%s,%s)' % (self.signatures, self.properties,
+                               self.signature_len)
+
+    def __repr__(self):
+        return "%s" % self
+
+def closed_domain_demo():
+    lexpr = Expression.fromstring
+
+    p1 = lexpr(r'exists x.walk(x)')
+    p2 = lexpr(r'man(Socrates)')
+    c = lexpr(r'walk(Socrates)')
+    prover = Prover9Command(c, [p1,p2])
+    print(prover.prove())
+    cdp = ClosedDomainProver(prover)
+    print('assumptions:')
+    for a in cdp.assumptions(): print('   ', a)
+    print('goal:', cdp.goal())
+    print(cdp.prove())
+
+    p1 = lexpr(r'exists x.walk(x)')
+    p2 = lexpr(r'man(Socrates)')
+    p3 = lexpr(r'-walk(Bill)')
+    c = lexpr(r'walk(Socrates)')
+    prover = Prover9Command(c, [p1,p2,p3])
+    print(prover.prove())
+    cdp = ClosedDomainProver(prover)
+    print('assumptions:')
+    for a in cdp.assumptions(): print('   ', a)
+    print('goal:', cdp.goal())
+    print(cdp.prove())
+
+    p1 = lexpr(r'walk(Socrates)')
+    p2 = lexpr(r'walk(Bill)')
+    c = lexpr(r'all x.walk(x)')
+    prover = Prover9Command(c, [p1,p2])
+    print(prover.prove())
+    cdp = ClosedDomainProver(prover)
+    print('assumptions:')
+    for a in cdp.assumptions(): print('   ', a)
+    print('goal:', cdp.goal())
+    print(cdp.prove())
+
+    p1 = lexpr(r'girl(mary)')
+    p2 = lexpr(r'dog(rover)')
+    p3 = lexpr(r'all x.(girl(x) -> -dog(x))')
+    p4 = lexpr(r'all x.(dog(x) -> -girl(x))')
+    p5 = lexpr(r'chase(mary, rover)')
+    c = lexpr(r'exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))')
+    prover = Prover9Command(c, [p1,p2,p3,p4,p5])
+    print(prover.prove())
+    cdp = ClosedDomainProver(prover)
+    print('assumptions:')
+    for a in cdp.assumptions(): print('   ', a)
+    print('goal:', cdp.goal())
+    print(cdp.prove())
+
+def unique_names_demo():
+    lexpr = Expression.fromstring
+
+    p1 = lexpr(r'man(Socrates)')
+    p2 = lexpr(r'man(Bill)')
+    c = lexpr(r'exists x.exists y.(x != y)')
+    prover = Prover9Command(c, [p1,p2])
+    print(prover.prove())
+    unp = UniqueNamesProver(prover)
+    print('assumptions:')
+    for a in unp.assumptions(): print('   ', a)
+    print('goal:', unp.goal())
+    print(unp.prove())
+
+    p1 = lexpr(r'all x.(walk(x) -> (x = Socrates))')
+    p2 = lexpr(r'Bill = William')
+    p3 = lexpr(r'Bill = Billy')
+    c = lexpr(r'-walk(William)')
+    prover = Prover9Command(c, [p1,p2,p3])
+    print(prover.prove())
+    unp = UniqueNamesProver(prover)
+    print('assumptions:')
+    for a in unp.assumptions(): print('   ', a)
+    print('goal:', unp.goal())
+    print(unp.prove())
+
+def closed_world_demo():
+    lexpr = Expression.fromstring
+
+    p1 = lexpr(r'walk(Socrates)')
+    p2 = lexpr(r'(Socrates != Bill)')
+    c = lexpr(r'-walk(Bill)')
+    prover = Prover9Command(c, [p1,p2])
+    print(prover.prove())
+    cwp = ClosedWorldProver(prover)
+    print('assumptions:')
+    for a in cwp.assumptions(): print('   ', a)
+    print('goal:', cwp.goal())
+    print(cwp.prove())
+
+    p1 = lexpr(r'see(Socrates, John)')
+    p2 = lexpr(r'see(John, Mary)')
+    p3 = lexpr(r'(Socrates != John)')
+    p4 = lexpr(r'(John != Mary)')
+    c = lexpr(r'-see(Socrates, Mary)')
+    prover = Prover9Command(c, [p1,p2,p3,p4])
+    print(prover.prove())
+    cwp = ClosedWorldProver(prover)
+    print('assumptions:')
+    for a in cwp.assumptions(): print('   ', a)
+    print('goal:', cwp.goal())
+    print(cwp.prove())
+
+    p1 = lexpr(r'all x.(ostrich(x) -> bird(x))')
+    p2 = lexpr(r'bird(Tweety)')
+    p3 = lexpr(r'-ostrich(Sam)')
+    p4 = lexpr(r'Sam != Tweety')
+    c = lexpr(r'-bird(Sam)')
+    prover = Prover9Command(c, [p1,p2,p3,p4])
+    print(prover.prove())
+    cwp = ClosedWorldProver(prover)
+    print('assumptions:')
+    for a in cwp.assumptions(): print('   ', a)
+    print('goal:', cwp.goal())
+    print(cwp.prove())
+
+def combination_prover_demo():
+    lexpr = Expression.fromstring
+
+    p1 = lexpr(r'see(Socrates, John)')
+    p2 = lexpr(r'see(John, Mary)')
+    c = lexpr(r'-see(Socrates, Mary)')
+    prover = Prover9Command(c, [p1,p2])
+    print(prover.prove())
+    command = ClosedDomainProver(
+                  UniqueNamesProver(
+                      ClosedWorldProver(prover)))
+    for a in command.assumptions(): print(a)
+    print(command.prove())
+
+def default_reasoning_demo():
+    lexpr = Expression.fromstring
+
+    premises = []
+
+    #define taxonomy
+    premises.append(lexpr(r'all x.(elephant(x)        -> animal(x))'))
+    premises.append(lexpr(r'all x.(bird(x)            -> animal(x))'))
+    premises.append(lexpr(r'all x.(dove(x)            -> bird(x))'))
+    premises.append(lexpr(r'all x.(ostrich(x)         -> bird(x))'))
+    premises.append(lexpr(r'all x.(flying_ostrich(x)  -> ostrich(x))'))
+
+    #default properties
+    premises.append(lexpr(r'all x.((animal(x)  & -Ab1(x)) -> -fly(x))')) #normal animals don't fly
+    premises.append(lexpr(r'all x.((bird(x)    & -Ab2(x)) -> fly(x))')) #normal birds fly
+    premises.append(lexpr(r'all x.((ostrich(x) & -Ab3(x)) -> -fly(x))')) #normal ostriches don't fly
+
+    #specify abnormal entities
+    premises.append(lexpr(r'all x.(bird(x)           -> Ab1(x))')) #flight
+    premises.append(lexpr(r'all x.(ostrich(x)        -> Ab2(x))')) #non-flying bird
+    premises.append(lexpr(r'all x.(flying_ostrich(x) -> Ab3(x))')) #flying ostrich
+
+    #define entities
+    premises.append(lexpr(r'elephant(E)'))
+    premises.append(lexpr(r'dove(D)'))
+    premises.append(lexpr(r'ostrich(O)'))
+
+    #print the assumptions
+    prover = Prover9Command(None, premises)
+    command = UniqueNamesProver(ClosedWorldProver(prover))
+    for a in command.assumptions(): print(a)
+
+    print_proof('-fly(E)', premises)
+    print_proof('fly(D)', premises)
+    print_proof('-fly(O)', premises)
+
+def print_proof(goal, premises):
+    lexpr = Expression.fromstring
+    prover = Prover9Command(lexpr(goal), premises)
+    command = UniqueNamesProver(ClosedWorldProver(prover))
+    print(goal, prover.prove(), command.prove())
+
+def demo():
+    closed_domain_demo()
+    unique_names_demo()
+    closed_world_demo()
+    combination_prover_demo()
+    default_reasoning_demo()
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/inference/nonmonotonic.pyc b/nlp_resource_data/nltk/inference/nonmonotonic.pyc
new file mode 100755 (executable)
index 0000000..bdd8449
Binary files /dev/null and b/nlp_resource_data/nltk/inference/nonmonotonic.pyc differ
diff --git a/nlp_resource_data/nltk/inference/prover9.py b/nlp_resource_data/nltk/inference/prover9.py
new file mode 100755 (executable)
index 0000000..cfeeb1e
--- /dev/null
@@ -0,0 +1,431 @@
+# Natural Language Toolkit: Interface to the Prover9 Theorem Prover
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Dan Garrette <dhgarrette@gmail.com>
+#         Ewan Klein <ewan@inf.ed.ac.uk>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+"""
+A theorem prover that makes use of the external 'Prover9' package.
+"""
+from __future__ import print_function
+
+import os
+import subprocess
+
+import nltk
+from nltk.sem.logic import Expression, ExistsExpression, AllExpression, \
+    NegatedExpression, AndExpression, IffExpression, OrExpression, \
+    EqualityExpression, ImpExpression
+from nltk.inference.api import BaseProverCommand, Prover
+
+#
+# Following is not yet used. Return code for 2 actually realized as 512.
+#
+p9_return_codes = {
+    0: True,
+    1:  "(FATAL)",      #A fatal error occurred (user's syntax error).
+    2: False,           # (SOS_EMPTY) Prover9 ran out of things to do
+                        #   (sos list exhausted).
+    3: "(MAX_MEGS)",    # The max_megs (memory limit) parameter was exceeded.
+    4: "(MAX_SECONDS)", # The max_seconds parameter was exceeded.
+    5: "(MAX_GIVEN)",   # The max_given parameter was exceeded.
+    6: "(MAX_KEPT)",    # The max_kept parameter was exceeded.
+    7: "(ACTION)",      # A Prover9 action terminated the search.
+    101: "(SIGSEGV)",   # Prover9 crashed, most probably due to a bug.
+ }
+
+
+class Prover9CommandParent(object):
+    """
+    A common base class used by both ``Prover9Command`` and ``MaceCommand``,
+    which is responsible for maintaining a goal and a set of assumptions,
+    and generating prover9-style input files from them.
+    """
+    def print_assumptions(self, output_format='nltk'):
+        """
+        Print the list of the current assumptions.
+        """
+        if output_format.lower() == 'nltk':
+            for a in self.assumptions():
+                print(a)
+        elif output_format.lower() == 'prover9':
+            for a in convert_to_prover9(self.assumptions()):
+                print(a)
+        else:
+            raise NameError("Unrecognized value for 'output_format': %s" %
+                            output_format)
+
+class Prover9Command(Prover9CommandParent, BaseProverCommand):
+    """
+    A ``ProverCommand`` specific to the ``Prover9`` prover.  It contains
+    a print_assumptions() method that is used to print the list
+    of assumptions in multiple formats.
+    """
+    def __init__(self, goal=None, assumptions=None, timeout=60, prover=None):
+        """
+        :param goal: Input expression to prove
+        :type goal: sem.Expression
+        :param assumptions: Input expressions to use as assumptions in
+            the proof.
+        :type assumptions: list(sem.Expression)
+        :param timeout: number of seconds before timeout; set to 0 for
+            no timeout.
+        :type timeout: int
+        :param prover: a prover.  If not set, one will be created.
+        :type prover: Prover9
+        """
+        if not assumptions:
+            assumptions = []
+
+        if prover is not None:
+            assert isinstance(prover, Prover9)
+        else:
+            prover = Prover9(timeout)
+
+        BaseProverCommand.__init__(self, prover, goal, assumptions)
+
+    def decorate_proof(self, proof_string, simplify=True):
+        """
+        :see BaseProverCommand.decorate_proof()
+        """
+        if simplify:
+            return self._prover._call_prooftrans(proof_string, ['striplabels'])[0].rstrip()
+        else:
+            return proof_string.rstrip()
+
+
+class Prover9Parent(object):
+    """
+    A common class extended by both ``Prover9`` and ``Mace <mace.Mace>``.
+    It contains the functionality required to convert NLTK-style
+    expressions into Prover9-style expressions.
+    """
+
+    _binary_location = None
+
+    def config_prover9(self, binary_location, verbose=False):
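+        """
+        Configure the location of the Prover9/Mace4 executables.
+
+        :param binary_location: the path to the ``prover9`` binary (or the
+            directory containing it), or None to fall back to the standard
+            search locations and the PROVER9 environment variable.
+        """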
+        if binary_location is None:
+            self._binary_location = None
+            self._prover9_bin = None
+        else:
+            name = 'prover9'
+            self._prover9_bin = nltk.internals.find_binary(
+                                  name,
+                                  path_to_bin=binary_location,
+                                  env_vars=['PROVER9'],
+                                  url='http://www.cs.unm.edu/~mccune/prover9/',
+                                  binary_names=[name, name + '.exe'],
+                                  verbose=verbose)
+            self._binary_location = self._prover9_bin.rsplit(os.path.sep, 1)[0]
+
+    def prover9_input(self, goal, assumptions):
+        """
+        :return: The input string that should be provided to the
+        prover9 binary.  This string is formed from the goal and
+        assumptions of this object.
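+
+        For example (illustrative), a goal 'mortal(Socrates)' with the single
+        assumption 'all x.(man(x) -> mortal(x))' produces roughly::
+
+            formulas(assumptions).
+                all x (man(x) -> mortal(x)).
+            end_of_list.
+
+            formulas(goals).
+                mortal(Socrates).
+            end_of_list.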
+        """
+        s = ''
+
+        if assumptions:
+            s += 'formulas(assumptions).\n'
+            for p9_assumption in convert_to_prover9(assumptions):
+                s += '    %s.\n' % p9_assumption
+            s += 'end_of_list.\n\n'
+
+        if goal:
+            s += 'formulas(goals).\n'
+            s += '    %s.\n' % convert_to_prover9(goal)
+            s += 'end_of_list.\n\n'
+
+        return s
+
+    def binary_locations(self):
+        """
+        A list of directories that should be searched for the prover9
+        executables.  This list is used by ``config_prover9`` when searching
+        for the prover9 executables.
+        """
+        return ['/usr/local/bin/prover9',
+                '/usr/local/bin/prover9/bin',
+                '/usr/local/bin',
+                '/usr/bin',
+                '/usr/local/prover9',
+                '/usr/local/share/prover9']
+
+    def _find_binary(self, name, verbose=False):
+        binary_locations = self.binary_locations()
+        if self._binary_location is not None:
+            binary_locations += [self._binary_location]
+        return nltk.internals.find_binary(name,
+            searchpath=binary_locations,
+            env_vars=['PROVER9'],
+            url='http://www.cs.unm.edu/~mccune/prover9/',
+            binary_names=[name, name + '.exe'],
+            verbose=verbose)
+
+    def _call(self, input_str, binary, args=[], verbose=False):
+        """
+        Call the binary with the given input.
+
+        :param input_str: A string whose contents are used as stdin.
+        :param binary: The location of the binary to call
+        :param args: A list of command-line arguments.
+        :return: A tuple (stdout, returncode)
+        :see: ``config_prover9``
+        """
+        if verbose:
+            print('Calling:', binary)
+            print('Args:', args)
+            print('Input:\n', input_str, '\n')
+
+        # Call prover9 via a subprocess
+        cmd = [binary] + args
+        try:
+            input_str = input_str.encode("utf8")
+        except AttributeError:
+            pass
+        p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                             stderr=subprocess.STDOUT,
+                             stdin=subprocess.PIPE)
+        (stdout, stderr) = p.communicate(input=input_str)
+
+        if verbose:
+            print('Return code:', p.returncode)
+            if stdout: print('stdout:\n', stdout, '\n')
+            if stderr: print('stderr:\n', stderr, '\n')
+
+        return (stdout.decode("utf-8"), p.returncode)
+
+
+def convert_to_prover9(input):
+    """
+    Convert a ``logic.Expression`` to Prover9 format.
+    """
+    if isinstance(input, list):
+        result = []
+        for s in input:
+            try:
+                result.append(_convert_to_prover9(s.simplify()))
+            except Exception:
+                print('input %s cannot be converted to Prover9 input syntax' % s)
+                raise
+        return result
+    else:
+        try:
+            return _convert_to_prover9(input.simplify())
+        except Exception:
+            print('input %s cannot be converted to Prover9 input syntax' % input)
+            raise
+
+def _convert_to_prover9(expression):
+    """
+    Convert ``logic.Expression`` to Prover9 formatted string.
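+
+    For example (illustrative), 'all x.(man(x) -> mortal(x))' is rendered as
+    'all x (man(x) -> mortal(x))' and '-walk(Bill)' as '-(walk(Bill))'.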
+    """
+    if isinstance(expression, ExistsExpression):
+        return 'exists ' + str(expression.variable) + ' ' + _convert_to_prover9(expression.term)
+    elif isinstance(expression, AllExpression):
+        return 'all ' + str(expression.variable) + ' ' + _convert_to_prover9(expression.term)
+    elif isinstance(expression, NegatedExpression):
+        return '-(' + _convert_to_prover9(expression.term) + ')'
+    elif isinstance(expression, AndExpression):
+        return '(' + _convert_to_prover9(expression.first) + ' & ' + \
+                     _convert_to_prover9(expression.second) + ')'
+    elif isinstance(expression, OrExpression):
+        return '(' + _convert_to_prover9(expression.first) + ' | ' + \
+                     _convert_to_prover9(expression.second) + ')'
+    elif isinstance(expression, ImpExpression):
+        return '(' + _convert_to_prover9(expression.first) + ' -> ' + \
+                     _convert_to_prover9(expression.second) + ')'
+    elif isinstance(expression, IffExpression):
+        return '(' + _convert_to_prover9(expression.first) + ' <-> ' + \
+                     _convert_to_prover9(expression.second) + ')'
+    elif isinstance(expression, EqualityExpression):
+        return '(' + _convert_to_prover9(expression.first) + ' = ' + \
+                     _convert_to_prover9(expression.second) + ')'
+    else:
+        return str(expression)
+
+
+class Prover9(Prover9Parent, Prover):
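+    """
+    A prover backed by the external ``prover9`` binary.  Expressions are
+    converted to Prover9 syntax, fed to the binary on stdin, and the return
+    code is used to decide whether a proof was found.
+    """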
+    _prover9_bin = None
+    _prooftrans_bin = None
+
+    def __init__(self, timeout=60):
+        self._timeout = timeout
+        """The timeout value for prover9.  If a proof can not be found
+           in this amount of time, then prover9 will return false.
+           (Use 0 for no timeout.)"""
+
+    def _prove(self, goal=None, assumptions=None, verbose=False):
+        """
+        Use Prover9 to prove a theorem.
+        :return: A pair whose first element is a boolean indicating if the
+        proof was successful (i.e. returns value of 0) and whose second element
+        is the output of the prover.
+        """
+        if not assumptions:
+            assumptions = []
+
+        stdout, returncode = self._call_prover9(self.prover9_input(goal, assumptions),
+                                                verbose=verbose)
+        return (returncode == 0, stdout)
+
+    def prover9_input(self, goal, assumptions):
+        """
+        :see: Prover9Parent.prover9_input
+        """
+        s = 'clear(auto_denials).\n' #only one proof required
+        return s + Prover9Parent.prover9_input(self, goal, assumptions)
+
+    def _call_prover9(self, input_str, args=[], verbose=False):
+        """
+        Call the ``prover9`` binary with the given input.
+
+        :param input_str: A string whose contents are used as stdin.
+        :param args: A list of command-line arguments.
+        :return: A tuple (stdout, returncode)
+        :see: ``config_prover9``
+        """
+        if self._prover9_bin is None:
+            self._prover9_bin = self._find_binary('prover9', verbose)
+
+        updated_input_str = ''
+        if self._timeout > 0:
+            updated_input_str += 'assign(max_seconds, %d).\n\n' % self._timeout
+        updated_input_str += input_str
+
+        stdout, returncode = self._call(updated_input_str, self._prover9_bin, args, verbose)
+
+        if returncode not in [0,2]:
+            errormsgprefix = '%%ERROR:'
+            if errormsgprefix in stdout:
+                msgstart = stdout.index(errormsgprefix)
+                errormsg = stdout[msgstart:].strip()
+            else:
+                errormsg = None
+            if returncode in [3,4,5,6]:
+                raise Prover9LimitExceededException(returncode, errormsg)
+            else:
+                raise Prover9FatalException(returncode, errormsg)
+
+        return stdout, returncode
+
+    def _call_prooftrans(self, input_str, args=[], verbose=False):
+        """
+        Call the ``prooftrans`` binary with the given input.
+
+        :param input_str: A string whose contents are used as stdin.
+        :param args: A list of command-line arguments.
+        :return: A tuple (stdout, returncode)
+        :see: ``config_prover9``
+        """
+        if self._prooftrans_bin is None:
+            self._prooftrans_bin = self._find_binary('prooftrans', verbose)
+
+        return self._call(input_str, self._prooftrans_bin, args, verbose)
+
+
+class Prover9Exception(Exception):
+    def __init__(self, returncode, message):
+        msg = p9_return_codes[returncode]
+        if message:
+            msg += '\n%s' % message
+        Exception.__init__(self, msg)
+
+class Prover9FatalException(Prover9Exception):
+    pass
+
+class Prover9LimitExceededException(Prover9Exception):
+    pass
+
+
+
+######################################################################
+#{ Tests and Demos
+######################################################################
+
+def test_config():
+
+    a = Expression.fromstring('(walk(j) & sing(j))')
+    g = Expression.fromstring('walk(j)')
+    p = Prover9Command(g, assumptions=[a])
+    p._executable_path = None
+    p.prover9_search=[]
+    p.prove()
+    #config_prover9('/usr/local/bin')
+    print(p.prove())
+    print(p.proof())
+
+def test_convert_to_prover9(expr):
+    """
+    Test that parsing works OK.
+    """
+    for t in expr:
+        e = Expression.fromstring(t)
+        print(convert_to_prover9(e))
+
+def test_prove(arguments):
+    """
+    Try some proofs and exhibit the results.
+    """
+    for (goal, assumptions) in arguments:
+        g = Expression.fromstring(goal)
+        alist = [Expression.fromstring(a) for a in assumptions]
+        p = Prover9Command(g, assumptions=alist).prove()
+        for a in alist:
+            print('   %s' % a)
+        print('|- %s: %s\n' % (g, p))
+
+arguments = [
+    ('(man(x) <-> (not (not man(x))))', []),
+    ('(not (man(x) & (not man(x))))', []),
+    ('(man(x) | (not man(x)))', []),
+    ('(man(x) & (not man(x)))', []),
+    ('(man(x) -> man(x))', []),
+    ('(not (man(x) & (not man(x))))', []),
+    ('(man(x) | (not man(x)))', []),
+    ('(man(x) -> man(x))', []),
+    ('(man(x) <-> man(x))', []),
+    ('(not (man(x) <-> (not man(x))))', []),
+    ('mortal(Socrates)', ['all x.(man(x) -> mortal(x))', 'man(Socrates)']),
+    ('((all x.(man(x) -> walks(x)) & man(Socrates)) -> some y.walks(y))', []),
+    ('(all x.man(x) -> all x.man(x))', []),
+    ('some x.all y.sees(x,y)', []),
+    ('some e3.(walk(e3) & subj(e3, mary))',
+        ['some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))']),
+    ('some x e1.(see(e1) & subj(e1, x) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))',
+       ['some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))'])
+]
+
+expressions = [r'some x y.sees(x,y)',
+               r'some x.(man(x) & walks(x))',
+               r'\x.(man(x) & walks(x))',
+               r'\x y.sees(x,y)',
+               r'walks(john)',
+               r'\x.big(x, \y.mouse(y))',
+               r'(walks(x) & (runs(x) & (threes(x) & fours(x))))',
+               r'(walks(x) -> runs(x))',
+               r'some x.(PRO(x) & sees(John, x))',
+               r'some x.(man(x) & (not walks(x)))',
+               r'all x.(man(x) -> walks(x))']
+
+def spacer(num=45):
+    print('-' * num)
+
+def demo():
+    print("Testing configuration")
+    spacer()
+    test_config()
+    print()
+    print("Testing conversion to Prover9 format")
+    spacer()
+    test_convert_to_prover9(expressions)
+    print()
+    print("Testing proofs")
+    spacer()
+    test_prove(arguments)
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/inference/prover9.pyc b/nlp_resource_data/nltk/inference/prover9.pyc
new file mode 100755 (executable)
index 0000000..4916a77
Binary files /dev/null and b/nlp_resource_data/nltk/inference/prover9.pyc differ
diff --git a/nlp_resource_data/nltk/inference/resolution.py b/nlp_resource_data/nltk/inference/resolution.py
new file mode 100755 (executable)
index 0000000..eb38d73
--- /dev/null
@@ -0,0 +1,687 @@
+# Natural Language Toolkit: First-order Resolution-based Theorem Prover
+#
+# Author: Dan Garrette <dhgarrette@gmail.com>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org>
+# For license information, see LICENSE.TXT
+
+"""
+Module for a resolution-based First Order theorem prover.
+"""
+from __future__ import print_function, unicode_literals
+
+import operator
+from collections import defaultdict
+from functools import reduce
+
+from nltk.sem import skolemize
+from nltk.sem.logic import (VariableExpression, EqualityExpression,
+                            ApplicationExpression, Expression,
+                            NegatedExpression, Variable,
+                            AndExpression, unique_variable, OrExpression,
+                            is_indvar, IndividualVariableExpression)
+
+from nltk.inference.api import Prover, BaseProverCommand
+from nltk.compat import python_2_unicode_compatible
+
+class ProverParseError(Exception): pass
+
+class ResolutionProver(Prover):
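+    """
+    A resolution-based first-order prover.  The negated goal and the
+    assumptions are converted into clauses, which are then repeatedly
+    resolved against one another until an empty clause is derived (success)
+    or no new clauses can be produced.
+    """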
+    ANSWER_KEY = 'ANSWER'
+    _assume_false=True
+
+    def _prove(self, goal=None, assumptions=None, verbose=False):
+        """
+        :param goal: Input expression to prove
+        :type goal: sem.Expression
+        :param assumptions: Input expressions to use as assumptions in the proof
+        :type assumptions: list(sem.Expression)
+        """
+        if not assumptions:
+            assumptions = []
+
+        result = None
+        try:
+            clauses = []
+            if goal:
+                clauses.extend(clausify(-goal))
+            for a in assumptions:
+                clauses.extend(clausify(a))
+            result, clauses = self._attempt_proof(clauses)
+            if verbose:
+                print(ResolutionProverCommand._decorate_clauses(clauses))
+        except RuntimeError as e:
+            if self._assume_false and str(e).startswith('maximum recursion depth exceeded'):
+                result = False
+                clauses = []
+            else:
+                if verbose:
+                    print(e)
+                else:
+                    raise e
+        return (result, clauses)
+
+    def _attempt_proof(self, clauses):
+        #map indices to lists of indices, to store attempted unifications
+        tried = defaultdict(list)
+
+        i = 0
+        while i < len(clauses):
+            if not clauses[i].is_tautology():
+                #since we try clauses in order, we should start after the last
+                #index tried
+                if tried[i]:
+                    j = tried[i][-1] + 1
+                else:
+                    j = i + 1 #nothing tried yet for 'i', so start with the next
+
+                while j < len(clauses):
+                    #don't: 1) unify a clause with itself,
+                    #       2) use tautologies
+                    if i != j and j and not clauses[j].is_tautology():
+                        tried[i].append(j)
+                        newclauses = clauses[i].unify(clauses[j])
+                        if newclauses:
+                            for newclause in newclauses:
+                                newclause._parents = (i+1, j+1)
+                                clauses.append(newclause)
+                                if not len(newclause): #if there's an empty clause
+                                    return (True, clauses)
+                            i=-1 #since we added a new clause, restart from the top
+                            break
+                    j += 1
+            i += 1
+        return (False, clauses)
+
+class ResolutionProverCommand(BaseProverCommand):
+    def __init__(self, goal=None, assumptions=None, prover=None):
+        """
+        :param goal: Input expression to prove
+        :type goal: sem.Expression
+        :param assumptions: Input expressions to use as assumptions in
+            the proof.
+        :type assumptions: list(sem.Expression)
+        """
+        if prover is not None:
+            assert isinstance(prover, ResolutionProver)
+        else:
+            prover = ResolutionProver()
+
+        BaseProverCommand.__init__(self, prover, goal, assumptions)
+        self._clauses = None
+
+    def prove(self, verbose=False):
+        """
+        Perform the actual proof.  Store the result to prevent unnecessary
+        re-proving.
+        """
+        if self._result is None:
+            self._result, clauses = self._prover._prove(self.goal(),
+                                                        self.assumptions(),
+                                                        verbose)
+            self._clauses = clauses
+            self._proof = ResolutionProverCommand._decorate_clauses(clauses)
+        return self._result
+
+    def find_answers(self, verbose=False):
+        self.prove(verbose)
+
+        answers = set()
+        answer_ex = VariableExpression(Variable(ResolutionProver.ANSWER_KEY))
+        for clause in self._clauses:
+            for term in clause:
+                if isinstance(term, ApplicationExpression) and\
+                   term.function == answer_ex and\
+                   not isinstance(term.argument, IndividualVariableExpression):
+                    answers.add(term.argument)
+        return answers
+
+    @staticmethod
+    def _decorate_clauses(clauses):
+        """
+        Decorate the proof output.
+        """
+        out = ''
+        max_clause_len = max([len(str(clause)) for clause in clauses])
+        max_seq_len = len(str(len(clauses)))
+        for i in range(len(clauses)):
+            parents = 'A'
+            taut = ''
+            if clauses[i].is_tautology():
+                taut = 'Tautology'
+            if clauses[i]._parents:
+                parents = str(clauses[i]._parents)
+            parents = ' '*(max_clause_len-len(str(clauses[i]))+1) + parents
+            seq = ' '*(max_seq_len-len(str(i+1))) + str(i+1)
+            out += '[%s] %s %s %s\n' % (seq, clauses[i], parents, taut)
+        return out
+
+@python_2_unicode_compatible
+class Clause(list):
+    def __init__(self, data):
+        list.__init__(self, data)
+        self._is_tautology = None
+        self._parents = None
+
+    def unify(self, other, bindings=None, used=None, skipped=None, debug=False):
+        """
+        Attempt to unify this Clause with the other, returning a list of the
+        resulting unified Clauses.
+
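+        For example (illustrative), unifying the clause {-man(x), walks(x)}
+        with the clause {man(Socrates)} can produce the resolvent
+        {walks(Socrates)}.
+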
+        :param other: ``Clause`` with which to unify
+        :param bindings: ``BindingDict`` containing bindings that should be used
+        during the unification
+        :param used: tuple of two lists of atoms.  The first lists the
+        atoms from 'self' that were successfully unified with atoms from
+        'other'.  The second lists the atoms from 'other' that were successfully
+        unified with atoms from 'self'.
+        :param skipped: tuple of two ``Clause`` objects.  The first is a list of all
+        the atoms from the 'self' Clause that have not been unified with
+        anything on the path.  The second is same thing for the 'other' Clause.
+        :param debug: bool indicating whether debug statements should print
+        :return: list containing all the resulting ``Clause`` objects that could be
+        obtained by unification
+        """
+        if bindings is None: bindings = BindingDict()
+        if used is None: used = ([],[])
+        if skipped is None: skipped = ([],[])
+        if isinstance(debug, bool): debug = DebugObject(debug)
+
+        newclauses = _iterate_first(self, other, bindings, used, skipped, _complete_unify_path, debug)
+
+        #remove subsumed clauses.  make a list of all indices of subsumed
+        #clauses, and then remove them from the list
+        subsumed = []
+        for i, c1 in enumerate(newclauses):
+            if i not in subsumed:
+                for j, c2 in enumerate(newclauses):
+                    if i!=j and j not in subsumed and c1.subsumes(c2):
+                        subsumed.append(j)
+        result = []
+        for i in range(len(newclauses)):
+            if i not in subsumed:
+                result.append(newclauses[i])
+
+        return result
+
+    def isSubsetOf(self, other):
+        """
+        Return True iff every term in 'self' is a term in 'other'.
+
+        :param other: ``Clause``
+        :return: bool
+        """
+        for a in self:
+            if a not in other:
+                return False
+        return True
+
+    def subsumes(self, other):
+        """
+        Return True iff 'self' subsumes 'other', that is, if there is a
+        substitution such that every term in 'self' can be unified with a term
+        in 'other'.
+
+        :param other: ``Clause``
+        :return: bool
+        """
+        negatedother = []
+        for atom in other:
+            if isinstance(atom, NegatedExpression):
+                negatedother.append(atom.term)
+            else:
+                negatedother.append(-atom)
+
+        negatedotherClause = Clause(negatedother)
+
+        bindings = BindingDict()
+        used = ([],[])
+        skipped = ([],[])
+        debug = DebugObject(False)
+
+        return len(_iterate_first(self, negatedotherClause, bindings, used,
+                                      skipped, _subsumes_finalize,
+                                      debug)) > 0
+
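+    # Illustrative sketch of subsumes() (not part of the original module): a
+    # unit clause {P(x)} subsumes any clause containing an instance of P, e.g.
+    #
+    #     Clause([Expression.fromstring('P(x)')]).subsumes(
+    #         Clause([Expression.fromstring('P(John)'),
+    #                 Expression.fromstring('Q(y)')]))   # -> True
+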
+    def __getslice__(self, start, end):
+        return Clause(list.__getslice__(self, start, end))
+
+    def __sub__(self, other):
+        return Clause([a for a in self if a not in other])
+
+    def __add__(self, other):
+        return Clause(list.__add__(self, other))
+
+    def is_tautology(self):
+        """
+        Self is a tautology if it contains ground terms P and -P.  The ground
+        term, P, must be an exact match, i.e., matched without unification.
+        """
+        if self._is_tautology is not None:
+            return self._is_tautology
+        for i,a in enumerate(self):
+            if not isinstance(a, EqualityExpression):
+                j = len(self)-1
+                while j > i:
+                    b = self[j]
+                    if isinstance(a, NegatedExpression):
+                        if a.term == b:
+                            self._is_tautology = True
+                            return True
+                    elif isinstance(b, NegatedExpression):
+                        if a == b.term:
+                            self._is_tautology = True
+                            return True
+                    j -= 1
+        self._is_tautology = False
+        return False
+
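+    # Illustrative sketch of is_tautology() (not part of the original module):
+    #
+    #     Clause([Expression.fromstring('P(x)'),
+    #             Expression.fromstring('-P(x)')]).is_tautology()   # -> True
+    #     Clause([Expression.fromstring('P(x)'),
+    #             Expression.fromstring('-P(y)')]).is_tautology()   # -> False
+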
+    def free(self):
+        return reduce(operator.or_, ((atom.free() | atom.constants()) for atom in self))
+
+    def replace(self, variable, expression):
+        """
+        Replace every instance of variable with expression across every atom
+        in the clause
+
+        :param variable: ``Variable``
+        :param expression: ``Expression``
+        """
+        return Clause([atom.replace(variable, expression) for atom in self])
+
+    def substitute_bindings(self, bindings):
+        """
+        Apply the given bindings to every atom in the clause.
+
+        :param bindings: ``BindingDict`` mapping variables to the expressions
+        to which they are bound
+        :return: ``Clause``
+        """
+        return Clause([atom.substitute_bindings(bindings) for atom in self])
+
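+    # Illustrative sketch of replace()/substitute_bindings() (not part of the
+    # original module; assumes Variable from nltk.sem.logic):
+    #
+    #     c = Clause([Expression.fromstring('P(x)')])
+    #     c.replace(Variable('x'), Expression.fromstring('John'))  # {P(John)}
+    #     c.substitute_bindings(BindingDict([(Variable('x'),
+    #         Expression.fromstring('John'))]))                    # {P(John)}
+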
+    def __str__(self):
+        return '{' + ', '.join("%s" % item for item in self) + '}'
+
+    def __repr__(self):
+        return "%s" % self
+
+def _iterate_first(first, second, bindings, used, skipped, finalize_method, debug):
+    """
+    This method facilitates movement through the terms of 'self'
+    """
+    debug.line('unify(%s,%s) %s'%(first, second, bindings))
+
+    if not len(first) or not len(second): #if no more recursions can be performed
+        return finalize_method(first, second, bindings, used, skipped, debug)
+    else:
+        #explore this 'self' atom
+        result = _iterate_second(first, second, bindings, used, skipped, finalize_method, debug+1)
+
+        #skip this possible 'self' atom
+        newskipped = (skipped[0]+[first[0]], skipped[1])
+        result += _iterate_first(first[1:], second, bindings, used, newskipped, finalize_method, debug+1)
+
+        try:
+            newbindings, newused, unused = _unify_terms(first[0], second[0], bindings, used)
+            #Unification found, so progress with this line of unification
+            #put skipped and unused terms back into play for later unification.
+            newfirst = first[1:] + skipped[0] + unused[0]
+            newsecond = second[1:] + skipped[1] + unused[1]
+            result += _iterate_first(newfirst, newsecond, newbindings, newused, ([],[]), finalize_method, debug+1)
+        except BindingException:
+            # the atoms could not be unified, so abandon this line of unification
+            pass
+
+        return result
+
+def _iterate_second(first, second, bindings, used, skipped, finalize_method, debug):
+    """
+    This method facilitates movement through the terms of 'other'
+    """
+    debug.line('unify(%s,%s) %s'%(first, second, bindings))
+
+    if not len(first) or not len(second): #if no more recursions can be performed
+        return finalize_method(first, second, bindings, used, skipped, debug)
+    else:
+        #skip this possible pairing and move to the next
+        newskipped = (skipped[0], skipped[1]+[second[0]])
+        result = _iterate_second(first, second[1:], bindings, used, newskipped, finalize_method, debug+1)
+
+        try:
+            newbindings, newused, unused = _unify_terms(first[0], second[0], bindings, used)
+            #Unification found, so progress with this line of unification
+            #put skipped and unused terms back into play for later unification.
+            newfirst = first[1:] + skipped[0] + unused[0]
+            newsecond = second[1:] + skipped[1] + unused[1]
+            result += _iterate_second(newfirst, newsecond, newbindings, newused, ([],[]), finalize_method, debug+1)
+        except BindingException:
+            # the atoms could not be unified, so abandon this line of unification
+            pass
+
+        return result
+
+def _unify_terms(a, b, bindings=None, used=None):
+    """
+    This method attempts to unify two terms.  Two expressions are unifiable
+    if there exists a substitution function S such that S(a) == S(-b).
+
+    :param a: ``Expression``
+    :param b: ``Expression``
+    :param bindings: ``BindingDict`` a starting set of bindings with which
+    the unification must be consistent
+    :param used: tuple of two lists of atoms that have already been unified
+    :return: tuple of the new ``BindingDict``, the updated 'used' tuple, and a
+    tuple of any atoms set aside (unused) for later unification
+    :raise ``BindingException``: If the terms cannot be unified
+    """
+    assert isinstance(a, Expression)
+    assert isinstance(b, Expression)
+
+    if bindings is None: bindings = BindingDict()
+    if used is None: used = ([],[])
+
+    # Use resolution
+    if isinstance(a, NegatedExpression) and isinstance(b, ApplicationExpression):
+        newbindings = most_general_unification(a.term, b, bindings)
+        newused = (used[0]+[a], used[1]+[b])
+        unused = ([],[])
+    elif isinstance(a, ApplicationExpression) and isinstance(b, NegatedExpression):
+        newbindings = most_general_unification(a, b.term, bindings)
+        newused = (used[0]+[a], used[1]+[b])
+        unused = ([],[])
+
+    # Use demodulation
+    elif isinstance(a, EqualityExpression):
+        newbindings = BindingDict([(a.first.variable, a.second)])
+        newused = (used[0]+[a], used[1])
+        unused = ([],[b])
+    elif isinstance(b, EqualityExpression):
+        newbindings = BindingDict([(b.first.variable, b.second)])
+        newused = (used[0], used[1]+[b])
+        unused = ([a],[])
+
+    else:
+        raise BindingException((a, b))
+
+    return newbindings, newused, unused
+
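+# Illustrative sketch of _unify_terms() (not part of the original module):
+# resolving a negated literal against a matching positive literal yields the
+# bindings required, e.g.
+#
+#     _unify_terms(Expression.fromstring('-P(x)'),
+#                  Expression.fromstring('P(John)'))
+#     # -> bindings {x: John}, plus the 'used'/'unused' bookkeeping tuples
+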
+def _complete_unify_path(first, second, bindings, used, skipped, debug):
+    if used[0] or used[1]: #if bindings were made along the path
+        newclause = Clause(skipped[0] + skipped[1] + first + second)
+        debug.line('  -> New Clause: %s' % newclause)
+        return [newclause.substitute_bindings(bindings)]
+    else: #no bindings made means no unification occurred.  so no result
+        debug.line('  -> End')
+        return []
+
+def _subsumes_finalize(first, second, bindings, used, skipped, debug):
+    if not len(skipped[0]) and not len(first):
+        #If there are no skipped terms and no terms left in 'first', then
+        #all of the terms in the original 'self' were unified with terms
+        #in 'other'.  Therefore, there exists a binding (this one) such that
+        #every term in self can be unified with a term in other, which
+        #is the definition of subsumption.
+        return [True]
+    else:
+        return []
+
+def clausify(expression):
+    """
+    Skolemize, clausify, and standardize the variables apart.
+    """
+    clause_list = []
+    for clause in _clausify(skolemize(expression)):
+        for free in clause.free():
+            if is_indvar(free.name):
+                newvar = VariableExpression(unique_variable())
+                clause = clause.replace(free, newvar)
+        clause_list.append(clause)
+    return clause_list
+
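+# Illustrative sketch of clausify() (not part of the original module; free
+# variables in the result are renamed apart, so exact names will differ):
+#
+#     clausify(Expression.fromstring('(P(x) & Q(x)) | R(x)'))
+#     # -> two clauses, roughly [{P(x), R(x)}, {Q(x), R(x)}]
+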
+def _clausify(expression):
+    """
+    :param expression: a skolemized expression in CNF
+    """
+    if isinstance(expression, AndExpression):
+        return _clausify(expression.first) + _clausify(expression.second)
+    elif isinstance(expression, OrExpression):
+        first = _clausify(expression.first)
+        second = _clausify(expression.second)
+        assert len(first) == 1
+        assert len(second) == 1
+        return [first[0] + second[0]]
+    elif isinstance(expression, EqualityExpression):
+        return [Clause([expression])]
+    elif isinstance(expression, ApplicationExpression):
+        return [Clause([expression])]
+    elif isinstance(expression, NegatedExpression):
+        if isinstance(expression.term, ApplicationExpression):
+            return [Clause([expression])]
+        elif isinstance(expression.term, EqualityExpression):
+            return [Clause([expression])]
+    raise ProverParseError()
+
+
+@python_2_unicode_compatible
+class BindingDict(object):
+    def __init__(self, binding_list=None):
+        """
+        :param binding_list: list of (``AbstractVariableExpression``, ``AtomicExpression``) to initialize the dictionary
+        """
+        self.d = {}
+
+        if binding_list:
+            for (v, b) in binding_list:
+                self[v] = b
+
+    def __setitem__(self, variable, binding):
+        """
+        A binding is consistent with the dict if its variable is not already bound, OR if it
+        is already bound to the same value.
+
+        :param variable: ``Variable`` The variable to bind
+        :param binding: ``Expression`` The atomic to which 'variable' should be bound
+        :raise BindingException: If the variable cannot be bound in this dictionary
+        """
+        assert isinstance(variable, Variable)
+        assert isinstance(binding, Expression)
+
+        try:
+            existing = self[variable]
+        except KeyError:
+            existing = None
+
+        if not existing or binding == existing:
+            self.d[variable] = binding
+        elif isinstance(binding, IndividualVariableExpression):
+            # Since variable is already bound, try to bind binding to variable
+            try:
+                existing = self[binding.variable]
+            except KeyError:
+                existing = None
+
+            binding2 = VariableExpression(variable)
+
+            if not existing or binding2 == existing:
+                self.d[binding.variable] = binding2
+            else:
+                raise BindingException('Variable %s already bound to another '
+                                       'value' % (variable))
+        else:
+            raise BindingException('Variable %s already bound to another '
+                                   'value' % (variable))
+
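+    # Illustrative sketch of the consistency rule (not part of the original
+    # module):
+    #
+    #     bd = BindingDict()
+    #     bd[Variable('x')] = Expression.fromstring('John')
+    #     bd[Variable('x')] = Expression.fromstring('John')  # consistent: ok
+    #     bd[Variable('x')] = Expression.fromstring('Bill')  # BindingException
+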
+    def __getitem__(self, variable):
+        """
+        Return the expression to which 'variable' is bound
+        """
+        assert isinstance(variable, Variable)
+
+        intermediate = self.d[variable]
+        while intermediate:
+            try:
+                intermediate = self.d[intermediate]
+            except KeyError:
+                return intermediate
+
+    def __contains__(self, item):
+        return item in self.d
+
+    def __add__(self, other):
+        """
+        :param other: ``BindingDict`` The dict with which to combine self
+        :return: ``BindingDict`` A new dict containing all the elements of both parameters
+        :raise BindingException: If the parameter dictionaries are not consistent with each other
+        """
+        try:
+            combined = BindingDict()
+            for v in self.d:
+                combined[v] = self.d[v]
+            for v in other.d:
+                combined[v] = other.d[v]
+            return combined
+        except BindingException:
+            raise BindingException("Attempting to add two contradicting "
+                                   "BindingDicts: '%s' and '%s'"
+                                   % (self, other))
+
+    def __len__(self):
+        return len(self.d)
+
+    def __str__(self):
+        data_str = ', '.join('%s: %s' % (v, self.d[v]) for v in sorted(self.d.keys()))
+        return '{' + data_str + '}'
+
+    def __repr__(self):
+        return "%s" % self
+
+
+def most_general_unification(a, b, bindings=None):
+    """
+    Find the most general unification of the two given expressions
+
+    :param a: ``Expression``
+    :param b: ``Expression``
+    :param bindings: ``BindingDict`` a starting set of bindings with which the
+                     unification must be consistent
+    :return: a list of bindings
+    :raise BindingException: if the Expressions cannot be unified
+    """
+    if bindings is None:
+        bindings = BindingDict()
+
+    if a == b:
+        return bindings
+    elif isinstance(a, IndividualVariableExpression):
+        return _mgu_var(a, b, bindings)
+    elif isinstance(b, IndividualVariableExpression):
+        return _mgu_var(b, a, bindings)
+    elif isinstance(a, ApplicationExpression) and\
+         isinstance(b, ApplicationExpression):
+        return most_general_unification(a.function, b.function, bindings) +\
+               most_general_unification(a.argument, b.argument, bindings)
+    raise BindingException((a, b))
+
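+# Illustrative sketch of most_general_unification() (not part of the original
+# module):
+#
+#     most_general_unification(Expression.fromstring('P(x)'),
+#                              Expression.fromstring('P(John)'))
+#     # -> a BindingDict such as {x: John}
+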
+def _mgu_var(var, expression, bindings):
+    if var.variable in expression.free()|expression.constants():
+        raise BindingException((var, expression))
+    else:
+        return BindingDict([(var.variable, expression)]) + bindings
+
+
+class BindingException(Exception):
+    def __init__(self, arg):
+        if isinstance(arg, tuple):
+            Exception.__init__(self, "'%s' cannot be bound to '%s'" % arg)
+        else:
+            Exception.__init__(self, arg)
+
+class UnificationException(Exception):
+    def __init__(self, a, b):
+        Exception.__init__(self, "'%s' cannot unify with '%s'" % (a,b))
+
+
+class DebugObject(object):
+    def __init__(self, enabled=True, indent=0):
+        self.enabled = enabled
+        self.indent = indent
+
+    def __add__(self, i):
+        return DebugObject(self.enabled, self.indent+i)
+
+    def line(self, line):
+        if self.enabled:
+            print('    '*self.indent + line)
+
+
+def testResolutionProver():
+    resolution_test(r'man(x)')
+    resolution_test(r'(man(x) -> man(x))')
+    resolution_test(r'(man(x) -> --man(x))')
+    resolution_test(r'-(man(x) and -man(x))')
+    resolution_test(r'(man(x) or -man(x))')
+    resolution_test(r'(man(x) -> man(x))')
+    resolution_test(r'-(man(x) and -man(x))')
+    resolution_test(r'(man(x) or -man(x))')
+    resolution_test(r'(man(x) -> man(x))')
+    resolution_test(r'(man(x) iff man(x))')
+    resolution_test(r'-(man(x) iff -man(x))')
+    resolution_test('all x.man(x)')
+    resolution_test('-all x.some y.F(x,y) & some x.all y.(-F(x,y))')
+    resolution_test('some x.all y.sees(x,y)')
+
+    p1 = Expression.fromstring(r'all x.(man(x) -> mortal(x))')
+    p2 = Expression.fromstring(r'man(Socrates)')
+    c = Expression.fromstring(r'mortal(Socrates)')
+    print('%s, %s |- %s: %s' % (p1, p2, c, ResolutionProver().prove(c, [p1,p2])))
+
+    p1 = Expression.fromstring(r'all x.(man(x) -> walks(x))')
+    p2 = Expression.fromstring(r'man(John)')
+    c = Expression.fromstring(r'some y.walks(y)')
+    print('%s, %s |- %s: %s' % (p1, p2, c, ResolutionProver().prove(c, [p1,p2])))
+
+    p = Expression.fromstring(r'some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))')
+    c = Expression.fromstring(r'some e0.walk(e0,mary)')
+    print('%s |- %s: %s' % (p, c, ResolutionProver().prove(c, [p])))
+
+def resolution_test(e):
+    f = Expression.fromstring(e)
+    t = ResolutionProver().prove(f)
+    print('|- %s: %s' % (f, t))
+
+def test_clausify():
+    lexpr = Expression.fromstring
+
+    print(clausify(lexpr('P(x) | Q(x)')))
+    print(clausify(lexpr('(P(x) & Q(x)) | R(x)')))
+    print(clausify(lexpr('P(x) | (Q(x) & R(x))')))
+    print(clausify(lexpr('(P(x) & Q(x)) | (R(x) & S(x))')))
+
+    print(clausify(lexpr('P(x) | Q(x) | R(x)')))
+    print(clausify(lexpr('P(x) | (Q(x) & R(x)) | S(x)')))
+
+    print(clausify(lexpr('exists x.P(x) | Q(x)')))
+
+    print(clausify(lexpr('-(-P(x) & Q(x))')))
+    print(clausify(lexpr('P(x) <-> Q(x)')))
+    print(clausify(lexpr('-(P(x) <-> Q(x))')))
+    print(clausify(lexpr('-(all x.P(x))')))
+    print(clausify(lexpr('-(some x.P(x))')))
+
+    print(clausify(lexpr('some x.P(x)')))
+    print(clausify(lexpr('some x.all y.P(x,y)')))
+    print(clausify(lexpr('all y.some x.P(x,y)')))
+    print(clausify(lexpr('all z.all y.some x.P(x,y,z)')))
+    print(clausify(lexpr('all x.(all y.P(x,y) -> -all y.(Q(x,y) -> R(x,y)))')))
+
+
+def demo():
+    test_clausify()
+    print()
+    testResolutionProver()
+    print()
+
+    p = Expression.fromstring('man(x)')
+    print(ResolutionProverCommand(p, [p]).prove())
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/inference/resolution.pyc b/nlp_resource_data/nltk/inference/resolution.pyc
new file mode 100755 (executable)
index 0000000..b2c1ad5
Binary files /dev/null and b/nlp_resource_data/nltk/inference/resolution.pyc differ
diff --git a/nlp_resource_data/nltk/inference/tableau.py b/nlp_resource_data/nltk/inference/tableau.py
new file mode 100755 (executable)
index 0000000..02e769c
--- /dev/null
@@ -0,0 +1,607 @@
+# Natural Language Toolkit: First-Order Tableau Theorem Prover
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Dan Garrette <dhgarrette@gmail.com>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Module for a tableau-based First Order theorem prover.
+"""
+from __future__ import print_function, unicode_literals
+
+from nltk.internals import Counter
+
+from nltk.sem.logic import (VariableExpression, EqualityExpression,
+                            ApplicationExpression, Expression,
+                            AbstractVariableExpression, AllExpression,
+                            NegatedExpression,
+                            ExistsExpression, Variable, ImpExpression,
+                            AndExpression, unique_variable,
+                            LambdaExpression, IffExpression,
+                            OrExpression, FunctionVariableExpression)
+
+from nltk.inference.api import Prover, BaseProverCommand
+
+_counter = Counter()
+
+class ProverParseError(Exception): pass
+
+class TableauProver(Prover):
+    _assume_false=False
+
+    def _prove(self, goal=None, assumptions=None, verbose=False):
+        if not assumptions:
+            assumptions = []
+
+        result = None
+        try:
+            agenda = Agenda()
+            if goal:
+                agenda.put(-goal)
+            agenda.put_all(assumptions)
+            debugger = Debug(verbose)
+            result = self._attempt_proof(agenda, set(), set(), debugger)
+        except RuntimeError as e:
+            if self._assume_false and str(e).startswith('maximum recursion depth exceeded'):
+                result = False
+            else:
+                if verbose:
+                    print(e)
+                else:
+                    raise e
+        return (result, '\n'.join(debugger.lines))
+
+    def _attempt_proof(self, agenda, accessible_vars, atoms, debug):
+        (current, context), category = agenda.pop_first()
+
+        #if there's nothing left in the agenda, and we haven't closed the path
+        if not current:
+            debug.line('AGENDA EMPTY')
+            return False
+
+        proof_method = { Categories.ATOM:     self._attempt_proof_atom,
+                         Categories.PROP:     self._attempt_proof_prop,
+                         Categories.N_ATOM:   self._attempt_proof_n_atom,
+                         Categories.N_PROP:   self._attempt_proof_n_prop,
+                         Categories.APP:      self._attempt_proof_app,
+                         Categories.N_APP:    self._attempt_proof_n_app,
+                         Categories.N_EQ:     self._attempt_proof_n_eq,
+                         Categories.D_NEG:    self._attempt_proof_d_neg,
+                         Categories.N_ALL:    self._attempt_proof_n_all,
+                         Categories.N_EXISTS: self._attempt_proof_n_some,
+                         Categories.AND:      self._attempt_proof_and,
+                         Categories.N_OR:     self._attempt_proof_n_or,
+                         Categories.N_IMP:    self._attempt_proof_n_imp,
+                         Categories.OR:       self._attempt_proof_or,
+                         Categories.IMP:      self._attempt_proof_imp,
+                         Categories.N_AND:    self._attempt_proof_n_and,
+                         Categories.IFF:      self._attempt_proof_iff,
+                         Categories.N_IFF:    self._attempt_proof_n_iff,
+                         Categories.EQ:       self._attempt_proof_eq,
+                         Categories.EXISTS:   self._attempt_proof_some,
+                         Categories.ALL:      self._attempt_proof_all,
+                        }[category]
+
+        debug.line((current, context))
+        return proof_method(current, context, agenda, accessible_vars, atoms, debug)
+
+    def _attempt_proof_atom(self, current, context, agenda, accessible_vars, atoms, debug):
+        # Check if the branch is closed.  Return 'True' if it is
+        if (current, True) in atoms:
+            debug.line('CLOSED', 1)
+            return True
+
+        if context:
+            if isinstance(context.term, NegatedExpression):
+                current = current.negate()
+            agenda.put(context(current).simplify())
+            return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+        else:
+            # mark all AllExpressions in the agenda as 'not exhausted', since we are (potentially) adding new accessible vars
+            agenda.mark_alls_fresh()
+            return self._attempt_proof(agenda, accessible_vars|set(current.args), atoms|set([(current, False)]), debug+1)
+
+    def _attempt_proof_n_atom(self, current, context, agenda, accessible_vars, atoms, debug):
+        # Check if the branch is closed.  Return 'True' if it is
+        if (current.term, False) in atoms:
+            debug.line('CLOSED', 1)
+            return True
+
+        if context:
+            if isinstance(context.term, NegatedExpression):
+                current = current.negate()
+            agenda.put(context(current).simplify())
+            return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+        else:
+            # mark all AllExpressions in the agenda as 'not exhausted', since we are (potentially) adding new accessible vars
+            agenda.mark_alls_fresh()
+            return self._attempt_proof(agenda, accessible_vars|set(current.term.args), atoms|set([(current.term, True)]), debug+1)
+
+    def _attempt_proof_prop(self, current, context, agenda, accessible_vars, atoms, debug):
+        # Check if the branch is closed.  Return 'True' if it is
+        if (current, True) in atoms:
+            debug.line('CLOSED', 1)
+            return True
+
+        # mark all AllExpressions in the agenda as 'not exhausted', since we are (potentially) adding new accessible vars
+        agenda.mark_alls_fresh()
+        return self._attempt_proof(agenda, accessible_vars, atoms|set([(current, False)]), debug+1)
+
+    def _attempt_proof_n_prop(self, current, context, agenda, accessible_vars, atoms, debug):
+        # Check if the branch is closed.  Return 'True' if it is
+        if (current.term, False) in atoms:
+            debug.line('CLOSED', 1)
+            return True
+
+        # mark all AllExpressions in the agenda as 'not exhausted', since we are (potentially) adding new accessible vars
+        agenda.mark_alls_fresh()
+        return self._attempt_proof(agenda, accessible_vars, atoms|set([(current.term, True)]), debug+1)
+
+    def _attempt_proof_app(self, current, context, agenda, accessible_vars, atoms, debug):
+        f, args = current.uncurry()
+        for i, arg in enumerate(args):
+            if not TableauProver.is_atom(arg):
+                ctx = f
+                nv = Variable('X%s' % _counter.get())
+                for j,a in enumerate(args):
+                    ctx = (ctx(VariableExpression(nv)) if i == j else ctx(a))
+                if context:
+                    ctx = context(ctx).simplify()
+                ctx = LambdaExpression(nv, ctx)
+                agenda.put(arg, ctx)
+                return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+        raise Exception('If this method is called, there must be a non-atomic argument')
+
+    def _attempt_proof_n_app(self, current, context, agenda, accessible_vars, atoms, debug):
+        f, args = current.term.uncurry()
+        for i, arg in enumerate(args):
+            if not TableauProver.is_atom(arg):
+                ctx = f
+                nv = Variable('X%s' % _counter.get())
+                for j,a in enumerate(args):
+                    ctx = (ctx(VariableExpression(nv)) if i == j else ctx(a))
+                if context:
+                    #combine new context with existing
+                    ctx = context(ctx).simplify()
+                ctx = LambdaExpression(nv, -ctx)
+                agenda.put(-arg, ctx)
+                return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+        raise Exception('If this method is called, there must be a non-atomic argument')
+
+    def _attempt_proof_n_eq(self, current, context, agenda, accessible_vars, atoms, debug):
+        ###########################################################################
+        # Since 'current' is of type '~(a=b)', the path is closed if 'a' == 'b'
+        ###########################################################################
+        if current.term.first == current.term.second:
+            debug.line('CLOSED', 1)
+            return True
+
+        agenda[Categories.N_EQ].add((current,context))
+        current._exhausted = True
+        return self._attempt_proof(agenda, accessible_vars|set([current.term.first, current.term.second]), atoms, debug+1)
+
+    def _attempt_proof_d_neg(self, current, context, agenda, accessible_vars, atoms, debug):
+        agenda.put(current.term.term, context)
+        return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+
+    def _attempt_proof_n_all(self, current, context, agenda, accessible_vars, atoms, debug):
+        agenda[Categories.EXISTS].add((ExistsExpression(current.term.variable, -current.term.term), context))
+        return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+
+    def _attempt_proof_n_some(self, current, context, agenda, accessible_vars, atoms, debug):
+        agenda[Categories.ALL].add((AllExpression(current.term.variable, -current.term.term), context))
+        return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+
+    def _attempt_proof_and(self, current, context, agenda, accessible_vars, atoms, debug):
+        agenda.put(current.first, context)
+        agenda.put(current.second, context)
+        return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+
+    def _attempt_proof_n_or(self, current, context, agenda, accessible_vars, atoms, debug):
+        agenda.put(-current.term.first, context)
+        agenda.put(-current.term.second, context)
+        return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+
+    def _attempt_proof_n_imp(self, current, context, agenda, accessible_vars, atoms, debug):
+        agenda.put(current.term.first, context)
+        agenda.put(-current.term.second, context)
+        return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+
+    def _attempt_proof_or(self, current, context, agenda, accessible_vars, atoms, debug):
+        new_agenda = agenda.clone()
+        agenda.put(current.first, context)
+        new_agenda.put(current.second, context)
+        return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) and \
+                self._attempt_proof(new_agenda, accessible_vars, atoms, debug+1)
+
+    def _attempt_proof_imp(self, current, context, agenda, accessible_vars, atoms, debug):
+        new_agenda = agenda.clone()
+        agenda.put(-current.first, context)
+        new_agenda.put(current.second, context)
+        return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) and \
+                self._attempt_proof(new_agenda, accessible_vars, atoms, debug+1)
+
+    def _attempt_proof_n_and(self, current, context, agenda, accessible_vars, atoms, debug):
+        new_agenda = agenda.clone()
+        agenda.put(-current.term.first, context)
+        new_agenda.put(-current.term.second, context)
+        return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) and \
+                self._attempt_proof(new_agenda, accessible_vars, atoms, debug+1)
+
+    def _attempt_proof_iff(self, current, context, agenda, accessible_vars, atoms, debug):
+        new_agenda = agenda.clone()
+        agenda.put(current.first, context)
+        agenda.put(current.second, context)
+        new_agenda.put(-current.first, context)
+        new_agenda.put(-current.second, context)
+        return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) and \
+                self._attempt_proof(new_agenda, accessible_vars, atoms, debug+1)
+
+    def _attempt_proof_n_iff(self, current, context, agenda, accessible_vars, atoms, debug):
+        new_agenda = agenda.clone()
+        agenda.put(current.term.first, context)
+        agenda.put(-current.term.second, context)
+        new_agenda.put(-current.term.first, context)
+        new_agenda.put(current.term.second, context)
+        return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) and \
+                self._attempt_proof(new_agenda, accessible_vars, atoms, debug+1)
+
+    def _attempt_proof_eq(self, current, context, agenda, accessible_vars, atoms, debug):
+        #########################################################################
+        # Since 'current' is of the form '(a = b)', replace ALL free instances
+        # of 'a' with 'b'
+        #########################################################################
+        agenda.put_atoms(atoms)
+        agenda.replace_all(current.first, current.second)
+        accessible_vars.discard(current.first)
+        agenda.mark_neqs_fresh()
+        return self._attempt_proof(agenda, accessible_vars, set(), debug+1)
+
+    def _attempt_proof_some(self, current, context, agenda, accessible_vars, atoms, debug):
+        new_unique_variable = VariableExpression(unique_variable())
+        agenda.put(current.term.replace(current.variable, new_unique_variable), context)
+        agenda.mark_alls_fresh()
+        return self._attempt_proof(agenda, accessible_vars|set([new_unique_variable]), atoms, debug+1)
+
+    def _attempt_proof_all(self, current, context, agenda, accessible_vars, atoms, debug):
+        try:
+            current._used_vars
+        except AttributeError:
+            current._used_vars = set()
+
+        #if there are accessible_vars on the path
+        if accessible_vars:
+            # get the set of bound variables that have not been used by this AllExpression
+            bv_available = accessible_vars - current._used_vars
+
+            if bv_available:
+                variable_to_use = list(bv_available)[0]
+                debug.line('--> Using \'%s\'' % variable_to_use, 2)
+                current._used_vars |= set([variable_to_use])
+                agenda.put(current.term.replace(current.variable, variable_to_use), context)
+                agenda[Categories.ALL].add((current,context))
+                return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+
+            else:
+                #no more available variables to substitute
+                debug.line('--> Variables Exhausted', 2)
+                current._exhausted = True
+                agenda[Categories.ALL].add((current,context))
+                return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+
+        else:
+            new_unique_variable = VariableExpression(unique_variable())
+            debug.line('--> Using \'%s\'' % new_unique_variable, 2)
+            current._used_vars |= set([new_unique_variable])
+            agenda.put(current.term.replace(current.variable, new_unique_variable), context)
+            agenda[Categories.ALL].add((current,context))
+            agenda.mark_alls_fresh()
+            return self._attempt_proof(agenda, accessible_vars|set([new_unique_variable]), atoms, debug+1)
+
+    @staticmethod
+    def is_atom(e):
+        if isinstance(e, NegatedExpression):
+            e = e.term
+
+        if isinstance(e, ApplicationExpression):
+            for arg in e.args:
+                if not TableauProver.is_atom(arg):
+                    return False
+            return True
+        elif isinstance(e, AbstractVariableExpression) or \
+             isinstance(e, LambdaExpression):
+            return True
+        else:
+            return False
+
+
+class TableauProverCommand(BaseProverCommand):
+    def __init__(self, goal=None, assumptions=None, prover=None):
+        """
+        :param goal: Input expression to prove
+        :type goal: sem.Expression
+        :param assumptions: Input expressions to use as assumptions in
+            the proof.
+        :type assumptions: list(sem.Expression)
+        """
+        if prover is not None:
+            assert isinstance(prover, TableauProver)
+        else:
+            prover = TableauProver()
+
+        BaseProverCommand.__init__(self, prover, goal, assumptions)
+
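+# Illustrative sketch of TableauProverCommand (not part of the original
+# module), mirroring the ResolutionProverCommand demo in resolution.py:
+#
+#     p = Expression.fromstring('man(x)')
+#     TableauProverCommand(p, [p]).prove()   # -> True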
+
+class Agenda(object):
+    def __init__(self):
+        self.sets = tuple(set() for i in range(21))
+
+    def clone(self):
+        new_agenda = Agenda()
+        set_list = [s.copy() for s in self.sets]
+
+        new_allExs = set()
+        for allEx,_ in set_list[Categories.ALL]:
+            new_allEx = AllExpression(allEx.variable, allEx.term)
+            try:
+                new_allEx._used_vars = set(used for used in allEx._used_vars)
+            except AttributeError:
+                new_allEx._used_vars = set()
+            new_allExs.add((new_allEx,None))
+        set_list[Categories.ALL] = new_allExs
+
+        set_list[Categories.N_EQ] = set((NegatedExpression(n_eq.term),ctx)
+                                        for (n_eq,ctx) in set_list[Categories.N_EQ])
+
+        new_agenda.sets = tuple(set_list)
+        return new_agenda
+
+    def __getitem__(self, index):
+        return self.sets[index]
+
+    def put(self, expression, context=None):
+        if isinstance(expression, AllExpression):
+            ex_to_add = AllExpression(expression.variable, expression.term)
+            try:
+                ex_to_add._used_vars = set(used for used in expression._used_vars)
+            except AttributeError:
+                ex_to_add._used_vars = set()
+        else:
+            ex_to_add = expression
+        self.sets[self._categorize_expression(ex_to_add)].add((ex_to_add, context))
+
+    def put_all(self, expressions):
+        for expression in expressions:
+            self.put(expression)
+
+    def put_atoms(self, atoms):
+        for atom, neg in atoms:
+            if neg:
+                self[Categories.N_ATOM].add((-atom,None))
+            else:
+                self[Categories.ATOM].add((atom,None))
+
+    def pop_first(self):
+        """ Pop the first expression that appears in the agenda """
+        for i,s in enumerate(self.sets):
+            if s:
+                if i in [Categories.N_EQ, Categories.ALL]:
+                    for ex in s:
+                        try:
+                            if not ex[0]._exhausted:
+                                s.remove(ex)
+                                return (ex, i)
+                        except AttributeError:
+                            s.remove(ex)
+                            return (ex, i)
+                else:
+                    return (s.pop(), i)
+        return ((None, None), None)
+
+    def replace_all(self, old, new):
+        for s in self.sets:
+            for ex,ctx in s:
+                ex.replace(old.variable, new)
+                if ctx is not None:
+                    ctx.replace(old.variable, new)
+
+    def mark_alls_fresh(self):
+        for u,_ in self.sets[Categories.ALL]:
+            u._exhausted = False
+
+    def mark_neqs_fresh(self):
+        for neq,_ in self.sets[Categories.N_EQ]:
+            neq._exhausted = False
+
+    def _categorize_expression(self, current):
+        if isinstance(current, NegatedExpression):
+            return self._categorize_NegatedExpression(current)
+        elif isinstance(current, FunctionVariableExpression):
+            return Categories.PROP
+        elif TableauProver.is_atom(current):
+            return Categories.ATOM
+        elif isinstance(current, AllExpression):
+            return Categories.ALL
+        elif isinstance(current, AndExpression):
+            return Categories.AND
+        elif isinstance(current, OrExpression):
+            return Categories.OR
+        elif isinstance(current, ImpExpression):
+            return Categories.IMP
+        elif isinstance(current, IffExpression):
+            return Categories.IFF
+        elif isinstance(current, EqualityExpression):
+            return Categories.EQ
+        elif isinstance(current, ExistsExpression):
+            return Categories.EXISTS
+        elif isinstance(current, ApplicationExpression):
+            return Categories.APP
+        else:
+            raise ProverParseError("cannot categorize %s" % \
+                                   current.__class__.__name__)
+
+    def _categorize_NegatedExpression(self, current):
+        negated = current.term
+
+        if isinstance(negated, NegatedExpression):
+            return Categories.D_NEG
+        elif isinstance(negated, FunctionVariableExpression):
+            return Categories.N_PROP
+        elif TableauProver.is_atom(negated):
+            return Categories.N_ATOM
+        elif isinstance(negated, AllExpression):
+            return Categories.N_ALL
+        elif isinstance(negated, AndExpression):
+            return Categories.N_AND
+        elif isinstance(negated, OrExpression):
+            return Categories.N_OR
+        elif isinstance(negated, ImpExpression):
+            return Categories.N_IMP
+        elif isinstance(negated, IffExpression):
+            return Categories.N_IFF
+        elif isinstance(negated, EqualityExpression):
+            return Categories.N_EQ
+        elif isinstance(negated, ExistsExpression):
+            return Categories.N_EXISTS
+        elif isinstance(negated, ApplicationExpression):
+            return Categories.N_APP
+        else:
+            raise ProverParseError("cannot categorize %s" % \
+                                   negated.__class__.__name__)
+
+
+class Debug(object):
+    def __init__(self, verbose, indent=0, lines=None):
+        self.verbose = verbose
+        self.indent = indent
+
+        if not lines: lines = []
+        self.lines = lines
+
+    def __add__(self, increment):
+        return Debug(self.verbose, self.indent + increment, self.lines)
+
+    def line(self, data, indent=0):
+        if isinstance(data, tuple):
+            ex, ctx = data
+            if ctx:
+                data = '%s, %s' % (ex, ctx)
+            else:
+                data = '%s' % ex
+
+            if isinstance(ex, AllExpression):
+                try:
+                    used_vars = "[%s]" % (",".join("%s" % ve.variable.name for ve in ex._used_vars))
+                    data += ':   %s' % used_vars
+                except AttributeError:
+                    data += ':   []'
+
+        newline = '%s%s' % ('   '*(self.indent+indent), data)
+        self.lines.append(newline)
+
+        if self.verbose:
+            print(newline)
+
+
+class Categories(object):
+    ATOM     = 0
+    PROP     = 1
+    N_ATOM   = 2
+    N_PROP   = 3
+    APP      = 4
+    N_APP    = 5
+    N_EQ     = 6
+    D_NEG    = 7
+    N_ALL    = 8
+    N_EXISTS = 9
+    AND      = 10
+    N_OR     = 11
+    N_IMP    = 12
+    OR       = 13
+    IMP      = 14
+    N_AND    = 15
+    IFF      = 16
+    N_IFF    = 17
+    EQ       = 18
+    EXISTS   = 19
+    ALL      = 20
+
+
+def testTableauProver():
+    tableau_test('P | -P')
+    tableau_test('P & -P')
+    tableau_test('Q', ['P', '(P -> Q)'])
+    tableau_test('man(x)')
+    tableau_test('(man(x) -> man(x))')
+    tableau_test('(man(x) -> --man(x))')
+    tableau_test('-(man(x) and -man(x))')
+    tableau_test('(man(x) or -man(x))')
+    tableau_test('(man(x) -> man(x))')
+    tableau_test('-(man(x) and -man(x))')
+    tableau_test('(man(x) or -man(x))')
+    tableau_test('(man(x) -> man(x))')
+    tableau_test('(man(x) iff man(x))')
+    tableau_test('-(man(x) iff -man(x))')
+    tableau_test('all x.man(x)')
+    tableau_test('all x.all y.((x = y) -> (y = x))')
+    tableau_test('all x.all y.all z.(((x = y) & (y = z)) -> (x = z))')
+#    tableau_test('-all x.some y.F(x,y) & some x.all y.(-F(x,y))')
+#    tableau_test('some x.all y.sees(x,y)')
+
+    p1 = 'all x.(man(x) -> mortal(x))'
+    p2 = 'man(Socrates)'
+    c = 'mortal(Socrates)'
+    tableau_test(c, [p1, p2])
+
+    p1 = 'all x.(man(x) -> walks(x))'
+    p2 = 'man(John)'
+    c = 'some y.walks(y)'
+    tableau_test(c, [p1, p2])
+
+    p = '((x = y) & walks(y))'
+    c = 'walks(x)'
+    tableau_test(c, [p])
+
+    p = '((x = y) & ((y = z) & (z = w)))'
+    c = '(x = w)'
+    tableau_test(c, [p])
+
+    p = 'some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))'
+    c = 'some e0.walk(e0,mary)'
+    tableau_test(c, [p])
+
+    c = '(exists x.exists z3.((x = Mary) & ((z3 = John) & sees(z3,x))) <-> exists x.exists z4.((x = John) & ((z4 = Mary) & sees(x,z4))))'
+    tableau_test(c)
+
+#    p = 'some e1.some e2.((believe e1 john e2) and (walk e2 mary))'
+#    c = 'some x.some e3.some e4.((believe e3 x e4) and (walk e4 mary))'
+#    tableau_test(c, [p])
+
+
+def testHigherOrderTableauProver():
+    tableau_test('believe(j, -lie(b))', ['believe(j, -lie(b) & -cheat(b))'])
+    tableau_test('believe(j, lie(b) & cheat(b))', ['believe(j, lie(b))'])
+    tableau_test('believe(j, lie(b))', ['lie(b)']) #how do we capture that John believes all things that are true
+    tableau_test('believe(j, know(b, cheat(b)))', ['believe(j, know(b, lie(b)) & know(b, steals(b) & cheat(b)))'])
+    tableau_test('P(Q(y), R(y) & R(z))', ['P(Q(x) & Q(y), R(y) & R(z))'])
+
+    tableau_test('believe(j, cheat(b) & lie(b))', ['believe(j, lie(b) & cheat(b))'])
+    tableau_test('believe(j, -cheat(b) & -lie(b))', ['believe(j, -lie(b) & -cheat(b))'])
+
+
+def tableau_test(c, ps=None, verbose=False):
+    pc = Expression.fromstring(c)
+    pps = ([Expression.fromstring(p) for p in ps] if ps else [])
+    if not ps:
+        ps = []
+    print('%s |- %s: %s' % (', '.join(ps), pc, TableauProver().prove(pc, pps, verbose=verbose)))
+
+def demo():
+    testTableauProver()
+    testHigherOrderTableauProver()
+
+if __name__ == '__main__':
+    demo()
+
diff --git a/nlp_resource_data/nltk/inference/tableau.pyc b/nlp_resource_data/nltk/inference/tableau.pyc
new file mode 100755 (executable)
index 0000000..324960f
Binary files /dev/null and b/nlp_resource_data/nltk/inference/tableau.pyc differ
diff --git a/nlp_resource_data/nltk/internals.py b/nlp_resource_data/nltk/internals.py
new file mode 100755 (executable)
index 0000000..84e28cf
--- /dev/null
@@ -0,0 +1,982 @@
+# Natural Language Toolkit: Internal utility functions
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+#         Nitin Madnani <nmadnani@ets.org>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function
+
+import subprocess
+import os
+import fnmatch
+import re
+import warnings
+import textwrap
+import types
+import sys
+import stat
+import locale
+
+# Use the c version of ElementTree, which is faster, if possible:
+try:
+    from xml.etree import cElementTree as ElementTree
+except ImportError:
+    from xml.etree import ElementTree
+
+from six import string_types
+
+from nltk import __file__
+from nltk import compat
+
+##########################################################################
+# Java Via Command-Line
+##########################################################################
+
+_java_bin = None
+_java_options = []
+# [xx] add classpath option to config_java?
+def config_java(bin=None, options=None, verbose=False):
+    """
+    Configure nltk's java interface, by letting nltk know where it can
+    find the Java binary, and what extra options (if any) should be
+    passed to Java when it is run.
+
+    :param bin: The full path to the Java binary.  If not specified,
+        then nltk will search the system for a Java binary; and if
+        one is not found, it will raise a ``LookupError`` exception.
+    :type bin: str
+    :param options: A list of options that should be passed to the
+        Java binary when it is called.  A common value is
+        ``'-Xmx512m'``, which tells Java binary to increase
+        the maximum heap size to 512 megabytes.  If no options are
+        specified, then do not modify the options list.
+    :type options: list(str)
+    """
+    global _java_bin, _java_options
+    _java_bin = find_binary('java', bin, env_vars=['JAVAHOME', 'JAVA_HOME'], verbose=verbose, binary_names=['java.exe'])
+
+    if options is not None:
+        if isinstance(options, string_types):
+            options = options.split()
+        _java_options = list(options)
+
+def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None,
+         blocking=True):
+    """
+    Execute the given java command, by opening a subprocess that calls
+    Java.  If java has not yet been configured, it will be configured
+    by calling ``config_java()`` with no arguments.
+
+    :param cmd: The java command that should be called, formatted as
+        a list of strings.  Typically, the first string will be the name
+        of the java class; and the remaining strings will be arguments
+        for that java class.
+    :type cmd: list(str)
+
+    :param classpath: A ``':'`` separated list of directories, JAR
+        archives, and ZIP archives to search for class files.
+    :type classpath: str
+
+    :param stdin, stdout, stderr: Specify the executed programs'
+        standard input, standard output and standard error file
+        handles, respectively.  Valid values are ``subprocess.PIPE``,
+        an existing file descriptor (a positive integer), an existing
+        file object, and None.  ``subprocess.PIPE`` indicates that a
+        new pipe to the child should be created.  With None, no
+        redirection will occur; the child's file handles will be
+        inherited from the parent.  Additionally, stderr can be
+        ``subprocess.STDOUT``, which indicates that the stderr data
+        from the applications should be captured into the same file
+        handle as for stdout.
+
+    :param blocking: If ``false``, then return immediately after
+        spawning the subprocess.  In this case, the return value is
+        the ``Popen`` object, and not a ``(stdout, stderr)`` tuple.
+
+    :return: If ``blocking=True``, then return a tuple ``(stdout,
+        stderr)``, containing the stdout and stderr outputs generated
+        by the java command if the ``stdout`` and ``stderr`` parameters
+        were set to ``subprocess.PIPE``; or None otherwise.  If
+        ``blocking=False``, then return a ``subprocess.Popen`` object.
+
+    :raise OSError: If the java command returns a nonzero return code.
+    """
+    if stdin == 'pipe': stdin = subprocess.PIPE
+    if stdout == 'pipe': stdout = subprocess.PIPE
+    if stderr == 'pipe': stderr = subprocess.PIPE
+    if isinstance(cmd, string_types):
+        raise TypeError('cmd should be a list of strings')
+
+    # Make sure we know where a java binary is.
+    if _java_bin is None:
+        config_java()
+
+    # Set up the classpath.
+    if isinstance(classpath, string_types):
+        classpaths=[classpath]
+    else:
+        classpaths=list(classpath)
+    classpath=os.path.pathsep.join(classpaths)
+
+    # Construct the full command string.
+    cmd = list(cmd)
+    cmd = ['-cp', classpath] + cmd
+    cmd = [_java_bin] + _java_options + cmd
+
+    # Call java via a subprocess
+    p = subprocess.Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr)
+    if not blocking: return p
+    (stdout, stderr) = p.communicate()
+
+    # Check the return code.
+    if p.returncode != 0:
+        print(_decode_stdoutdata(stderr))
+        raise OSError('Java command failed : ' + str(cmd))
+
+    return (stdout, stderr)
+
+if 0:
+    #config_java(options='-Xmx512m')
+    # Write:
+    #java('weka.classifiers.bayes.NaiveBayes',
+    #     ['-d', '/tmp/names.model', '-t', '/tmp/train.arff'],
+    #     classpath='/Users/edloper/Desktop/weka/weka.jar')
+    # Read:
+    (a,b) = java(['weka.classifiers.bayes.NaiveBayes',
+                  '-l', '/tmp/names.model', '-T', '/tmp/test.arff',
+                  '-p', '0'],#, '-distribution'],
+                 classpath='/Users/edloper/Desktop/weka/weka.jar')
+
+
+######################################################################
+# Parsing
+######################################################################
+
+class ReadError(ValueError):
+    """
+    Exception raised by read_* functions when they fail.
+    :param position: The index in the input string where an error occurred.
+    :param expected: What was expected when an error occurred.
+    """
+    def __init__(self, expected, position):
+        ValueError.__init__(self, expected, position)
+        self.expected = expected
+        self.position = position
+    def __str__(self):
+        return 'Expected %s at %s' % (self.expected, self.position)
+
+_STRING_START_RE = re.compile(r"[uU]?[rR]?(\"\"\"|\'\'\'|\"|\')")
+def read_str(s, start_position):
+    """
+    If a Python string literal begins at the specified position in the
+    given string, then return a tuple ``(val, end_position)``
+    containing the value of the string literal and the position where
+    it ends.  Otherwise, raise a ``ReadError``.
+
+    :param s: A string that will be checked to see whether it contains a
+        Python string literal.
+    :type s: str
+
+    :param start_position: The specified beginning position of the string ``s``
+        to begin regex matching.
+    :type start_position: int
+
+    :return: A tuple containing the matched string literal evaluated as a
+        string and the end position of the string literal.
+    :rtype: tuple(str, int)
+
+    :raise ReadError: If the ``_STRING_START_RE`` regex doesn't return a
+        match in ``s`` at ``start_position``, i.e., open quote. If the
+        ``_STRING_END_RE`` regex doesn't return a match in ``s`` at the
+        end of the first match, i.e., close quote.
+    :raise ValueError: If an invalid string (i.e., contains an invalid
+        escape sequence) is passed into the ``eval``.
+
+    :Example:
+    >>> from nltk.internals import read_str
+    >>> read_str('"Hello", World!', 0)
+    ('Hello', 7)
+
+    """
+    # Read the open quote, and any modifiers.
+    m = _STRING_START_RE.match(s, start_position)
+    if not m: raise ReadError('open quote', start_position)
+    quotemark = m.group(1)
+
+    # Find the close quote.
+    _STRING_END_RE = re.compile(r'\\|%s' % quotemark)
+    position = m.end()
+    while True:
+        match = _STRING_END_RE.search(s, position)
+        if not match: raise ReadError('close quote', position)
+        if match.group(0) == '\\': position = match.end()+1
+        else: break
+
+    # Process it, using eval.  Strings with invalid escape sequences
+    # might raise ValueError.
+    try:
+        return eval(s[start_position:match.end()]), match.end()
+    except ValueError as e:
+        raise ReadError('invalid string (%s)' % e, start_position)
+
+_READ_INT_RE = re.compile(r'-?\d+')
+def read_int(s, start_position):
+    """
+    If an integer begins at the specified position in the given
+    string, then return a tuple ``(val, end_position)`` containing the
+    value of the integer and the position where it ends.  Otherwise,
+    raise a ``ReadError``.
+
+    :param s: A string that will be checked to see whether it contains a
+        Python integer.
+    :type s: str
+
+    :param start_position: The specified beginning position of the string ``s``
+        to begin regex matching.
+    :type start_position: int
+
+    :return: A tuple containing the matched integer casted to an int,
+        and the end position of the int in ``s``.
+    :rtype: tuple(int, int)
+
+    :raise ReadError: If the ``_READ_INT_RE`` regex doesn't return a
+        match in ``s`` at ``start_position``.
+
+    :Example:
+    >>> from nltk.internals import read_int
+    >>> read_int('42 is the answer', 0)
+    (42, 2)
+
+    """
+    m = _READ_INT_RE.match(s, start_position)
+    if not m: raise ReadError('integer', start_position)
+    return int(m.group()), m.end()
+
+_READ_NUMBER_VALUE = re.compile(r'-?(\d*)([.]?\d*)?')
+def read_number(s, start_position):
+    """
+    If an integer or float begins at the specified position in the
+    given string, then return a tuple ``(val, end_position)``
+    containing the value of the number and the position where it ends.
+    Otherwise, raise a ``ReadError``.
+
+    :param s: A string that will be checked to see whether it contains a
+        Python number.
+    :type s: str
+
+    :param start_position: The specified beginning position of the string ``s``
+        to begin regex matching.
+    :type start_position: int
+
+    :return: A tuple containing the matched number casted to a ``float``,
+        and the end position of the number in ``s``.
+    :rtype: tuple(float, int)
+
+    :raise ReadError: If the ``_READ_NUMBER_VALUE`` regex doesn't return a
+        match in ``s`` at ``start_position``.
+
+    :Example:
+    >>> from nltk.internals import read_number
+    >>> read_number('Pi is 3.14159', 6)
+    (3.14159, 13)
+
+    """
+    m = _READ_NUMBER_VALUE.match(s, start_position)
+    if not m or not (m.group(1) or m.group(2)):
+        raise ReadError('number', start_position)
+    if m.group(2): return float(m.group()), m.end()
+    else: return int(m.group()), m.end()
+
+
+
+######################################################################
+# Check if a method has been overridden
+######################################################################
+
+def overridden(method):
+    """
+    :return: True if ``method`` overrides some method with the same
+    name in a base class.  This is typically used when defining
+    abstract base classes or interfaces, to allow subclasses to define
+    either of two related methods:
+
+        >>> class EaterI:
+        ...     '''Subclass must define eat() or batch_eat().'''
+        ...     def eat(self, food):
+        ...         if overridden(self.batch_eat):
+        ...             return self.batch_eat([food])[0]
+        ...         else:
+        ...             raise NotImplementedError()
+        ...     def batch_eat(self, foods):
+        ...         return [self.eat(food) for food in foods]
+
+    :type method: instance method
+    """
+    # [xx] breaks on classic classes!
+    if isinstance(method, types.MethodType) and compat.get_im_class(method) is not None:
+        name = method.__name__
+        funcs = [cls.__dict__[name]
+                 for cls in _mro(compat.get_im_class(method))
+                 if name in cls.__dict__]
+        return len(funcs) > 1
+    else:
+        raise TypeError('Expected an instance method.')
+
+def _mro(cls):
+    """
+    Return the method resolution order for ``cls`` -- i.e., a list
+    containing ``cls`` and all its base classes, in the order in which
+    they would be checked by ``getattr``.  For new-style classes, this
+    is just cls.__mro__.  For classic classes, this can be obtained by
+    a depth-first left-to-right traversal of ``__bases__``.
+    """
+    if isinstance(cls, type):
+        return cls.__mro__
+    else:
+        mro = [cls]
+        for base in cls.__bases__: mro.extend(_mro(base))
+        return mro
+
+######################################################################
+# Deprecation decorator & base class
+######################################################################
+# [xx] dedent msg first if it comes from  a docstring.
+
+def _add_epytext_field(obj, field, message):
+    """Add an epytext @field to a given object's docstring."""
+    indent = ''
+    # If we already have a docstring, then add a blank line to separate
+    # it from the new field, and check its indentation.
+    if obj.__doc__:
+        obj.__doc__ = obj.__doc__.rstrip()+'\n\n'
+        indents = re.findall(r'(?<=\n)[ ]+(?!\s)', obj.__doc__.expandtabs())
+        if indents: indent = min(indents)
+    # If we don't have a docstring, add an empty one.
+    else:
+        obj.__doc__ = ''
+
+    obj.__doc__ += textwrap.fill('@%s: %s' % (field, message),
+                                 initial_indent=indent,
+                                 subsequent_indent=indent+'    ')
+
+def deprecated(message):
+    """
+    A decorator used to mark functions as deprecated.  This will cause
+    a warning to be printed when the function is used.  Usage:
+
+        >>> from nltk.internals import deprecated
+        >>> @deprecated('Use foo() instead')
+        ... def bar(x):
+        ...     print(x/10)
+
+    """
+
+    def decorator(func):
+        msg = ("Function %s() has been deprecated.  %s"
+               % (func.__name__, message))
+        msg = '\n' + textwrap.fill(msg, initial_indent='  ',
+                                   subsequent_indent='  ')
+        def newFunc(*args, **kwargs):
+            warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
+            return func(*args, **kwargs)
+
+        # Copy the old function's name, docstring, & dict
+        newFunc.__dict__.update(func.__dict__)
+        newFunc.__name__ = func.__name__
+        newFunc.__doc__ = func.__doc__
+        newFunc.__deprecated__ = True
+        # Add a @deprecated field to the docstring.
+        _add_epytext_field(newFunc, 'deprecated', message)
+        return newFunc
+    return decorator
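
A minimal sketch of the decorator at call time, assuming the vendored package is importable as ``nltk``; the ``bar`` function is invented for illustration:

    >>> import warnings
    >>> from nltk.internals import deprecated
    >>> @deprecated('Use foo() instead')
    ... def bar(x):
    ...     return x * 2
    >>> with warnings.catch_warnings(record=True) as caught:
    ...     warnings.simplefilter('always')
    ...     result = bar(21)
    >>> result
    42
    >>> caught[0].category is DeprecationWarning
    True
    >>> bar.__deprecated__
    True
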
+
+class Deprecated(object):
+    """
+    A base class used to mark deprecated classes.  A typical usage is to
+    alert users that the name of a class has changed:
+
+        >>> from nltk.internals import Deprecated
+        >>> class NewClassName(object):
+        ...     pass # All logic goes here.
+        ...
+        >>> class OldClassName(Deprecated, NewClassName):
+        ...     "Use NewClassName instead."
+
+    The docstring of the deprecated class will be used in the
+    deprecation warning message.
+    """
+    def __new__(cls, *args, **kwargs):
+        # Figure out which class is the deprecated one.
+        dep_cls = None
+        for base in _mro(cls):
+            if Deprecated in base.__bases__:
+                dep_cls = base; break
+        assert dep_cls, 'Unable to determine which base is deprecated.'
+
+        # Construct an appropriate warning.
+        doc = (dep_cls.__doc__ or '').strip()
+        # If there's a @deprecated field, strip off the field marker.
+        doc = re.sub(r'\A\s*@deprecated:', r'', doc)
+        # Strip off any indentation.
+        doc = re.sub(r'(?m)^\s*', '', doc)
+        # Construct a 'name' string.
+        name = 'Class %s' % dep_cls.__name__
+        if cls != dep_cls:
+            name += ' (base class for %s)' % cls.__name__
+        # Put it all together.
+        msg = '%s has been deprecated.  %s' % (name, doc)
+        # Wrap it.
+        msg = '\n' + textwrap.fill(msg, initial_indent='    ',
+                                   subsequent_indent='    ')
+        warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
+        # Do the actual work of __new__.
+        return object.__new__(cls)
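
A minimal sketch of instantiating a deprecated class, reusing the illustrative class names from the docstring above:

    >>> import warnings
    >>> from nltk.internals import Deprecated
    >>> class NewClassName(object):
    ...     pass
    >>> class OldClassName(Deprecated, NewClassName):
    ...     "Use NewClassName instead."
    >>> with warnings.catch_warnings(record=True) as caught:
    ...     warnings.simplefilter('always')
    ...     obj = OldClassName()
    >>> isinstance(obj, NewClassName)
    True
    >>> 'OldClassName has been deprecated' in str(caught[0].message)
    True
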
+
+##########################################################################
+# COUNTER, FOR UNIQUE NAMING
+##########################################################################
+
+class Counter:
+    """
+    A counter that auto-increments each time its value is read.
+    """
+    def __init__(self, initial_value=0):
+        self._value = initial_value
+    def get(self):
+        self._value += 1
+        return self._value
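
A minimal sketch of the counter's behaviour:

    >>> from nltk.internals import Counter
    >>> c = Counter()
    >>> c.get(), c.get(), c.get()
    (1, 2, 3)
    >>> Counter(10).get()
    11
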
+
+##########################################################################
+# Search for files/binaries
+##########################################################################
+
+def find_file_iter(filename, env_vars=(), searchpath=(),
+    file_names=None, url=None, verbose=False, finding_dir=False):
+    """
+    Search for a file to be used by nltk.
+
+    :param filename: The name or path of the file.
+    :param env_vars: A list of environment variable names to check.
+    :param file_names: A list of alternative file names to check.
+    :param searchpath: List of directories to search.
+    :param url: URL presented to user for download help.
+    :param verbose: Whether or not to print path when a file is found.
+    """
+    file_names = [filename] + (file_names or [])
+    assert isinstance(filename, string_types)
+    assert not isinstance(file_names, string_types)
+    assert not isinstance(searchpath, string_types)
+    if isinstance(env_vars, string_types):
+        env_vars = env_vars.split()
+    yielded = False
+
+    # File exists, no magic
+    for alternative in file_names:
+        path_to_file = os.path.join(filename, alternative)
+        if os.path.isfile(path_to_file):
+            if verbose:
+                print('[Found %s: %s]' % (filename, path_to_file))
+            yielded = True
+            yield path_to_file
+        # Check the bare alternatives
+        if os.path.isfile(alternative):
+            if verbose:
+                print('[Found %s: %s]' % (filename, alternative))
+            yielded = True
+            yield alternative
+        # Check if the alternative is inside a 'file' directory
+        path_to_file = os.path.join(filename, 'file', alternative)
+        if os.path.isfile(path_to_file):
+            if verbose:
+                print('[Found %s: %s]' % (filename, path_to_file))
+            yielded = True
+            yield path_to_file
+
+    # Check environment variables
+    for env_var in env_vars:
+        if env_var in os.environ:
+            if finding_dir: # This is to find a directory instead of a file
+                yielded = True
+                yield os.environ[env_var]
+
+            for env_dir in os.environ[env_var].split(os.pathsep):
+                # Check if the environment variable contains a direct path to the bin
+                if os.path.isfile(env_dir):
+                    if verbose:
+                        print('[Found %s: %s]'%(filename, env_dir))
+                    yielded = True
+                    yield env_dir
+                # Check if the possible bin names exist inside the environment variable directories
+                for alternative in file_names:
+                    path_to_file = os.path.join(env_dir, alternative)
+                    if os.path.isfile(path_to_file):
+                        if verbose:
+                            print('[Found %s: %s]'%(filename, path_to_file))
+                        yielded = True
+                        yield path_to_file
+                    # Check if the alternative is inside a 'file' directory
+                    # path_to_file = os.path.join(env_dir, 'file', alternative)
+
+                    # Check if the alternative is inside a 'bin' directory
+                    path_to_file = os.path.join(env_dir, 'bin', alternative)
+
+                    if os.path.isfile(path_to_file):
+                        if verbose:
+                            print('[Found %s: %s]' % (filename, path_to_file))
+                        yielded = True
+                        yield path_to_file
+
+    # Check the path list.
+    for directory in searchpath:
+        for alternative in file_names:
+            path_to_file = os.path.join(directory, alternative)
+            if os.path.isfile(path_to_file):
+                yielded = True
+                yield path_to_file
+
+    # If we're on a POSIX system, then try using the 'which' command
+    # to find the file.
+    if os.name == 'posix':
+        for alternative in file_names:
+            try:
+                p = subprocess.Popen(['which', alternative],
+                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                stdout, stderr = p.communicate()
+                path = _decode_stdoutdata(stdout).strip()
+                if path.endswith(alternative) and os.path.exists(path):
+                    if verbose:
+                        print('[Found %s: %s]' % (filename, path))
+                    yielded = True
+                    yield path
+            except (KeyboardInterrupt, SystemExit, OSError):
+                raise
+            except:
+                pass
+
+    if not yielded:
+        msg = ("NLTK was unable to find the %s file!" "\nUse software specific "
+               "configuration paramaters" % filename)
+        if env_vars: msg += ' or set the %s environment variable' % env_vars[0]
+        msg += '.'
+        if searchpath:
+            msg += '\n\n  Searched in:'
+            msg += ''.join('\n    - %s' % d for d in searchpath)
+        if url: msg += ('\n\n  For more information on %s, see:\n    <%s>' %
+                        (filename, url))
+        div = '='*75
+        raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
+
+
+def find_file(filename, env_vars=(), searchpath=(),
+        file_names=None, url=None, verbose=False):
+    return next(find_file_iter(filename, env_vars, searchpath,
+                               file_names, url, verbose))
+
+
+def find_dir(filename, env_vars=(), searchpath=(),
+        file_names=None, url=None, verbose=False):
+    return next(find_file_iter(filename, env_vars, searchpath,
+                               file_names, url, verbose, finding_dir=True))
+
+
+def find_binary_iter(name, path_to_bin=None, env_vars=(), searchpath=(),
+                binary_names=None, url=None, verbose=False):
+    """
+    Search for a binary to be used by nltk.
+
+    :param name: The name or path of the file.
+    :param path_to_bin: The user-supplied binary location (deprecated)
+    :param env_vars: A list of environment variable names to check.
+    :param binary_names: A list of alternative binary names to check.
+    :param searchpath: List of directories to search.
+    :param url: URL presented to user for download help.
+    :param verbose: Whether or not to print path when a file is found.
+    """
+    for file in find_file_iter(path_to_bin or name, env_vars, searchpath,
+                               binary_names, url, verbose):
+        yield file
+
+def find_binary(name, path_to_bin=None, env_vars=(), searchpath=(),
+                binary_names=None, url=None, verbose=False):
+    return next(find_binary_iter(name, path_to_bin, env_vars, searchpath,
+                                 binary_names, url, verbose))
+
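
A hedged sketch of the calling convention for the binary finders; the binary name, environment variable, and URL below are placeholders that depend on the local installation, so the call is skipped under doctest:

    >>> from nltk.internals import find_binary
    >>> java = find_binary('java', env_vars=['JAVA_HOME'],
    ...                    url='http://www.java.com/')  # doctest: +SKIP
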
+def find_jar_iter(name_pattern, path_to_jar=None, env_vars=(),
+        searchpath=(), url=None, verbose=False, is_regex=False):
+    """
+    Search for a jar that is used by nltk.
+
+    :param name_pattern: The name of the jar file
+    :param path_to_jar: The user-supplied jar location, or None.
+    :param env_vars: A list of environment variable names to check
+                     in addition to the CLASSPATH variable which is
+                     checked by default.
+    :param searchpath: List of directories to search.
+    :param is_regex: Whether name is a regular expression.
+    """
+
+    assert isinstance(name_pattern, string_types)
+    assert not isinstance(searchpath, string_types)
+    if isinstance(env_vars, string_types):
+        env_vars = env_vars.split()
+    yielded = False
+
+    # Make sure we check the CLASSPATH first
+    env_vars = ['CLASSPATH'] + list(env_vars)
+
+    # If an explicit location was given, then check it, and yield it if
+    # it's present; otherwise, complain.
+    if path_to_jar is not None:
+        if os.path.isfile(path_to_jar):
+            yielded = True
+            yield path_to_jar
+        else:
+            raise LookupError('Could not find %s jar file at %s' %
+                            (name_pattern, path_to_jar))
+
+    # Check environment variables
+    for env_var in env_vars:
+        if env_var in os.environ:
+            if env_var == 'CLASSPATH':
+                classpath = os.environ['CLASSPATH']
+                for cp in classpath.split(os.path.pathsep):
+                    if os.path.isfile(cp):
+                        filename=os.path.basename(cp)
+                        if is_regex and re.match(name_pattern, filename) or \
+                                (not is_regex and filename == name_pattern):
+                            if verbose:
+                                print('[Found %s: %s]' % (name_pattern, cp))
+                            yielded = True
+                            yield cp
+                    # The case where the user put a directory containing the jar file in the classpath
+                    if os.path.isdir(cp):
+                        if not is_regex:
+                            if os.path.isfile(os.path.join(cp,name_pattern)):
+                                if verbose:
+                                    print('[Found %s: %s]' % (name_pattern, cp))
+                                yielded = True
+                                yield os.path.join(cp,name_pattern)
+                        else:
+                            # Look for file using regular expression
+                            for file_name in os.listdir(cp):
+                                if re.match(name_pattern,file_name):
+                                    if verbose:
+                                        print('[Found %s: %s]' % (name_pattern, os.path.join(cp,file_name)))
+                                    yielded = True
+                                    yield os.path.join(cp,file_name)
+
+            else:
+                jar_env = os.environ[env_var]
+                jar_iter = ((os.path.join(jar_env, path_to_jar) for path_to_jar in os.listdir(jar_env))
+                            if os.path.isdir(jar_env) else (jar_env,))
+                for path_to_jar in jar_iter:
+                    if os.path.isfile(path_to_jar):
+                        filename=os.path.basename(path_to_jar)
+                        if is_regex and re.match(name_pattern, filename) or \
+                                (not is_regex and filename == name_pattern):
+                            if verbose:
+                                print('[Found %s: %s]' % (name_pattern, path_to_jar))
+                            yielded = True
+                            yield path_to_jar
+
+    # Check the path list.
+    for directory in searchpath:
+        if is_regex:
+            for filename in os.listdir(directory):
+                path_to_jar = os.path.join(directory, filename)
+                if os.path.isfile(path_to_jar):
+                    if re.match(name_pattern, filename):
+                        if verbose:
+                            print('[Found %s: %s]' % (filename, path_to_jar))
+                        yielded = True
+                        yield path_to_jar
+        else:
+            path_to_jar = os.path.join(directory, name_pattern)
+            if os.path.isfile(path_to_jar):
+                if verbose:
+                    print('[Found %s: %s]' % (name_pattern, path_to_jar))
+                yielded = True
+                yield path_to_jar
+
+    if not yielded:
+        # If nothing was found, raise an error
+        msg = ("NLTK was unable to find %s!" % name_pattern)
+        if env_vars: msg += ' Set the %s environment variable' % env_vars[0]
+        msg = textwrap.fill(msg+'.', initial_indent='  ',
+                            subsequent_indent='  ')
+        if searchpath:
+            msg += '\n\n  Searched in:'
+            msg += ''.join('\n    - %s' % d for d in searchpath)
+        if url:
+            msg += ('\n\n  For more information on %s, see:\n    <%s>' %
+                    (name_pattern, url))
+        div = '='*75
+        raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
+
+def find_jar(name_pattern, path_to_jar=None, env_vars=(),
+        searchpath=(), url=None, verbose=False, is_regex=False):
+    return next(find_jar_iter(name_pattern, path_to_jar, env_vars,
+                         searchpath, url, verbose, is_regex))
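
The jar finders follow the same convention; the jar name, environment variable, and search path here are placeholders only:

    >>> from nltk.internals import find_jar
    >>> jar = find_jar('some-tool.jar', env_vars=['SOME_TOOL_HOME'],
    ...                searchpath=['/usr/share/java'])  # doctest: +SKIP
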
+
+
+def find_jars_within_path(path_to_jars):
+       return [os.path.join(root, filename)
+                       for root, dirnames, filenames in os.walk(path_to_jars)
+                       for filename in fnmatch.filter(filenames, '*.jar')]
+
+def _decode_stdoutdata(stdoutdata):
+    """ Convert data read from stdout/stderr to unicode """
+    if not isinstance(stdoutdata, bytes):
+        return stdoutdata
+
+    encoding = getattr(sys.__stdout__, "encoding", locale.getpreferredencoding())
+    if encoding is None:
+        return stdoutdata.decode()
+    return stdoutdata.decode(encoding)
+
+##########################################################################
+# Import Stdlib Module
+##########################################################################
+
+def import_from_stdlib(module):
+    """
+    When python is run from within the nltk/ directory tree, the
+    current directory is included at the beginning of the search path.
+    Unfortunately, that means that modules within nltk can sometimes
+    shadow standard library modules.  As an example, the stdlib
+    'inspect' module will attempt to import the stdlib 'tokenize'
+    module, but will end up importing NLTK's 'tokenize' module
+    instead (causing the import to fail).
+    """
+    old_path = sys.path
+    sys.path = [d for d in sys.path if d not in ('', '.')]
+    m = __import__(module)
+    sys.path = old_path
+    return m
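
For example, the standard-library ``json`` module can be loaded this way even when a local module of the same name would otherwise shadow it (a minimal sketch):

    >>> from nltk.internals import import_from_stdlib
    >>> json = import_from_stdlib('json')
    >>> json.dumps([1, 2])
    '[1, 2]'
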
+
+
+##########################################################################
+# Wrapper for ElementTree Elements
+##########################################################################
+
+@compat.python_2_unicode_compatible
+class ElementWrapper(object):
+    """
+    A wrapper around ElementTree Element objects whose main purpose is
+    to provide nicer __repr__ and __str__ methods.  In addition, any
+    of the wrapped Element's methods that return other Element objects
+    are overridden to wrap those values before returning them.
+
+    This makes Elements more convenient to work with in
+    interactive sessions and doctests, at the expense of some
+    efficiency.
+    """
+
+    # Prevent double-wrapping:
+    def __new__(cls, etree):
+        """
+        Create and return a wrapper around a given Element object.
+        If ``etree`` is an ``ElementWrapper``, then ``etree`` is
+        returned as-is.
+        """
+        if isinstance(etree, ElementWrapper):
+            return etree
+        else:
+            return object.__new__(ElementWrapper)
+
+    def __init__(self, etree):
+        r"""
+        Initialize a new Element wrapper for ``etree``.
+
+        If ``etree`` is a string, then it will be converted to an
+        Element object using ``ElementTree.fromstring()`` first:
+
+            >>> ElementWrapper("<test></test>")
+            <Element "<?xml version='1.0' encoding='utf8'?>\n<test />">
+
+        """
+        if isinstance(etree, string_types):
+            etree = ElementTree.fromstring(etree)
+        self.__dict__['_etree'] = etree
+
+    def unwrap(self):
+        """
+        Return the Element object wrapped by this wrapper.
+        """
+        return self._etree
+
+    ##////////////////////////////////////////////////////////////
+    #{ String Representation
+    ##////////////////////////////////////////////////////////////
+
+    def __repr__(self):
+        s = ElementTree.tostring(self._etree, encoding='utf8').decode('utf8')
+        if len(s) > 60:
+            e = s.rfind('<')
+            if (len(s)-e) > 30: e = -20
+            s = '%s...%s' % (s[:30], s[e:])
+        return '<Element %r>' % s
+
+    def __str__(self):
+        """
+        :return: the result of applying ``ElementTree.tostring()`` to
+        the wrapped Element object.
+        """
+        return ElementTree.tostring(self._etree, encoding='utf8').decode('utf8').rstrip()
+
+    ##////////////////////////////////////////////////////////////
+    #{ Element interface Delegation (pass-through)
+    ##////////////////////////////////////////////////////////////
+
+    def __getattr__(self, attrib):
+        return getattr(self._etree, attrib)
+
+    def __setattr__(self, attr, value):
+        return setattr(self._etree, attr, value)
+
+    def __delattr__(self, attr):
+        return delattr(self._etree, attr)
+
+    def __setitem__(self, index, element):
+        self._etree[index] = element
+
+    def __delitem__(self, index):
+        del self._etree[index]
+
+    def __setslice__(self, start, stop, elements):
+        self._etree[start:stop] = elements
+
+    def __delslice__(self, start, stop):
+        del self._etree[start:stop]
+
+    def __len__(self):
+        return len(self._etree)
+
+    ##////////////////////////////////////////////////////////////
+    #{ Element interface Delegation (wrap result)
+    ##////////////////////////////////////////////////////////////
+
+    def __getitem__(self, index):
+        return ElementWrapper(self._etree[index])
+
+    def __getslice__(self, start, stop):
+        return [ElementWrapper(elt) for elt in self._etree[start:stop]]
+
+    def getchildren(self):
+        return [ElementWrapper(elt) for elt in self._etree]
+
+    def getiterator(self, tag=None):
+        return (ElementWrapper(elt)
+                for elt in self._etree.getiterator(tag))
+
+    def makeelement(self, tag, attrib):
+        return ElementWrapper(self._etree.makeelement(tag, attrib))
+
+    def find(self, path):
+        elt = self._etree.find(path)
+        if elt is None: return elt
+        else: return ElementWrapper(elt)
+
+    def findall(self, path):
+        return [ElementWrapper(elt) for elt in self._etree.findall(path)]
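
A minimal interactive sketch of the wrapper, assuming the vendored package is importable as ``nltk``:

    >>> from nltk.internals import ElementWrapper
    >>> w = ElementWrapper("<root><a>1</a><a>2</a></root>")
    >>> len(w)
    2
    >>> [child.tag for child in w.getchildren()]
    ['a', 'a']
    >>> w.find('a').text
    '1'
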
+
+######################################################################
+# Helper for Handling Slicing
+######################################################################
+
+def slice_bounds(sequence, slice_obj, allow_step=False):
+    """
+    Given a slice, return the corresponding (start, stop) bounds,
+    taking into account None indices and negative indices.  The
+    following guarantees are made for the returned start and stop values:
+
+      - 0 <= start <= len(sequence)
+      - 0 <= stop <= len(sequence)
+      - start <= stop
+
+    :raise ValueError: If ``slice_obj.step`` is not None.
+    :param allow_step: If true, then the slice object may have a
+        non-None step.  If it does, then return a tuple
+        (start, stop, step).
+    """
+    start, stop = (slice_obj.start, slice_obj.stop)
+
+    # If allow_step is true, then include the step in our return
+    # value tuple.
+    if allow_step:
+        step = slice_obj.step
+        if step is None: step = 1
+        # Use a recursive call without allow_step to find the slice
+        # bounds.  If step is negative, then the roles of start and
+        # stop (in terms of default values, etc), are swapped.
+        if step < 0:
+            start, stop = slice_bounds(sequence, slice(stop, start))
+        else:
+            start, stop = slice_bounds(sequence, slice(start, stop))
+        return start, stop, step
+
+    # Otherwise, make sure that no non-default step value is used.
+    elif slice_obj.step not in (None, 1):
+        raise ValueError('slices with steps are not supported by %s' %
+                         sequence.__class__.__name__)
+
+    # Supply default offsets.
+    if start is None: start = 0
+    if stop is None: stop = len(sequence)
+
+    # Handle negative indices.
+    if start < 0: start = max(0, len(sequence)+start)
+    if stop < 0: stop = max(0, len(sequence)+stop)
+
+    # Make sure stop doesn't go past the end of the list.  Note that
+    # we avoid calculating len(sequence) if possible, because for lazy
+    # sequences, calculating the length of a sequence can be expensive.
+    if stop > 0:
+        try: sequence[stop-1]
+        except IndexError: stop = len(sequence)
+
+    # Make sure start isn't past stop.
+    start = min(start, stop)
+
+    # That's all folks!
+    return start, stop
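
Concretely, a minimal sketch with the bounds hand-checked against the rules above:

    >>> from nltk.internals import slice_bounds
    >>> seq = ['a', 'b', 'c', 'd']
    >>> slice_bounds(seq, slice(1, None))
    (1, 4)
    >>> slice_bounds(seq, slice(-3, 10))
    (1, 4)
    >>> slice_bounds(seq, slice(None, None, -1), allow_step=True)
    (0, 4, -1)
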
+
+######################################################################
+# Permission Checking
+######################################################################
+
+def is_writable(path):
+    # Ensure that it exists.
+    if not os.path.exists(path):
+        return False
+
+    # If we're on a posix system, check its permissions.
+    if hasattr(os, 'getuid'):
+        statdata = os.stat(path)
+        perm = stat.S_IMODE(statdata.st_mode)
+        # is it world-writable?
+        if (perm & 0o002):
+            return True
+        # do we own it?
+        elif statdata.st_uid == os.getuid() and (perm & 0o200):
+            return True
+        # are we in a group that can write to it?
+        elif (statdata.st_gid in [os.getgid()] + os.getgroups()) \
+            and (perm & 0o020):
+            return True
+        # otherwise, we can't write to it.
+        else:
+            return False
+
+    # Otherwise, we'll assume it's writable.
+    # [xx] should we do other checks on other platforms?
+    return True
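
A minimal sketch on a typical POSIX setup, using a throw-away temporary directory:

    >>> import os, tempfile
    >>> from nltk.internals import is_writable
    >>> path = tempfile.mkdtemp()
    >>> is_writable(path)
    True
    >>> is_writable(os.path.join(path, 'no_such_entry'))
    False
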
+
+######################################################################
+# NLTK Error reporting
+######################################################################
+
+def raise_unorderable_types(ordering, a, b):
+    raise TypeError("unorderable types: %s() %s %s()" % (type(a).__name__, ordering, type(b).__name__))
diff --git a/nlp_resource_data/nltk/internals.pyc b/nlp_resource_data/nltk/internals.pyc
new file mode 100755 (executable)
index 0000000..62fbf52
Binary files /dev/null and b/nlp_resource_data/nltk/internals.pyc differ
diff --git a/nlp_resource_data/nltk/jsontags.py b/nlp_resource_data/nltk/jsontags.py
new file mode 100755 (executable)
index 0000000..3f74b12
--- /dev/null
@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: JSON Encoder/Decoder Helpers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Xu <xxu@student.unimelb.edu.au>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Register JSON tags, so the nltk data loader knows what module and class to look for.
+
+NLTK uses simple '!' tags to mark the types of objects, but the fully-qualified
+"tag:nltk.org,2011:" prefix is also accepted in case anyone ends up
+using it.
+"""
+
+import json
+
+json_tags = {}
+
+TAG_PREFIX = '!'
+
+def register_tag(cls):
+    """
+    Decorates a class to register its json tag.
+    """
+    json_tags[TAG_PREFIX+getattr(cls, 'json_tag')] = cls
+    return cls
+
+class JSONTaggedEncoder(json.JSONEncoder):
+    def default(self, obj):
+        obj_tag = getattr(obj, 'json_tag', None)
+        if obj_tag is None:
+            return super(JSONTaggedEncoder, self).default(obj)
+        obj_tag = TAG_PREFIX + obj_tag
+        obj = obj.encode_json_obj()
+        return {obj_tag: obj}
+
+class JSONTaggedDecoder(json.JSONDecoder):
+    def decode(self, s):
+        return self.decode_obj(super(JSONTaggedDecoder, self).decode(s))
+
+    @classmethod
+    def decode_obj(cls, obj):
+        # Decode nested objects first.
+        if isinstance(obj, dict):
+            obj = dict((key, cls.decode_obj(val)) for (key, val) in obj.items())
+        elif isinstance(obj, list):
+            obj = list(cls.decode_obj(val) for val in obj)
+        # Check if we have a tagged object.
+        if not isinstance(obj, dict) or len(obj) != 1:
+            return obj
+        obj_tag = next(iter(obj.keys()))
+        if not obj_tag.startswith('!'):
+            return obj
+        if obj_tag not in json_tags:
+            raise ValueError('Unknown tag', obj_tag)
+        obj_cls = json_tags[obj_tag]
+        return obj_cls.decode_json_obj(obj[obj_tag])
+
+__all__ = ['register_tag', 'json_tags',
+           'JSONTaggedEncoder', 'JSONTaggedDecoder']
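
A hedged round-trip sketch of the tagging machinery; the ``Point`` class below is invented purely for illustration and is not part of NLTK:

    >>> import json
    >>> from nltk.jsontags import register_tag, JSONTaggedEncoder, JSONTaggedDecoder
    >>> @register_tag
    ... class Point(object):
    ...     json_tag = 'Point'
    ...     def __init__(self, x, y):
    ...         self.x, self.y = x, y
    ...     def encode_json_obj(self):
    ...         return [self.x, self.y]
    ...     @classmethod
    ...     def decode_json_obj(cls, obj):
    ...         return cls(*obj)
    >>> s = json.dumps(Point(1, 2), cls=JSONTaggedEncoder)
    >>> s
    '{"!Point": [1, 2]}'
    >>> p = json.loads(s, cls=JSONTaggedDecoder)
    >>> (p.x, p.y)
    (1, 2)
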
diff --git a/nlp_resource_data/nltk/jsontags.pyc b/nlp_resource_data/nltk/jsontags.pyc
new file mode 100755 (executable)
index 0000000..c455355
Binary files /dev/null and b/nlp_resource_data/nltk/jsontags.pyc differ
diff --git a/nlp_resource_data/nltk/lazyimport.py b/nlp_resource_data/nltk/lazyimport.py
new file mode 100755 (executable)
index 0000000..2c58878
--- /dev/null
@@ -0,0 +1,142 @@
+# This module is from mx/DateTime/LazyModule.py and is
+# distributed under the terms of the eGenix.com Public License Agreement
+# http://www.egenix.com/products/eGenix.com-Public-License-1.1.0.pdf
+
+""" Helper to enable simple lazy module import.
+
+    'Lazy' means the actual import is deferred until an attribute is
+    requested from the module's namespace. This has the advantage of
+    allowing all imports to be done at the top of a script (in a
+    prominent and visible place) without having a great impact
+    on startup time.
+
+    Copyright (c) 1999-2005, Marc-Andre Lemburg; mailto:mal@lemburg.com
+    See the documentation for further information on copyrights,
+    or contact the author. All Rights Reserved.
+"""
+from __future__ import print_function
+
+### Constants
+
+_debug = 0
+
+###
+
+class LazyModule:
+
+    """ Lazy module class.
+
+        Lazy modules are imported into the given namespaces whenever a
+        non-special attribute (there are some attributes like __doc__
+        that class instances handle without calling __getattr__) is
+        requested. The module is then registered under the given name
+        in locals usually replacing the import wrapper instance. The
+        import itself is done using globals as global namespace.
+
+        Example of creating a lazy load module:
+
+        ISO = LazyModule('ISO',locals(),globals())
+
+        Later, requesting an attribute from ISO will load the module
+        automatically into the locals() namespace, overriding the
+        LazyModule instance:
+
+        t = ISO.Week(1998,1,1)
+
+    """
+    # Flag which indicates whether the LazyModule is initialized or not
+    __lazymodule_init = 0
+
+    # Name of the module to load
+    __lazymodule_name = ''
+
+    # Flag which indicates whether the module was loaded or not
+    __lazymodule_loaded = 0
+
+    # Locals dictionary where to register the module
+    __lazymodule_locals = None
+
+    # Globals dictionary to use for the module import
+    __lazymodule_globals = None
+
+    def __init__(self, name, locals, globals=None):
+
+        """ Create a LazyModule instance wrapping module name.
+
+            The module will later on be registered in locals under the
+            given module name.
+
+            globals is optional and defaults to locals.
+
+        """
+        self.__lazymodule_locals = locals
+        if globals is None:
+            globals = locals
+        self.__lazymodule_globals = globals
+        mainname = globals.get('__name__', '')
+        if mainname:
+            self.__name__ = mainname + '.' + name
+            self.__lazymodule_name = name
+        else:
+            self.__name__ = self.__lazymodule_name = name
+        self.__lazymodule_init = 1
+
+    def __lazymodule_import(self):
+
+        """ Import the module now.
+        """
+        # Load and register module
+        name = self.__lazymodule_name
+        if self.__lazymodule_loaded:
+            return self.__lazymodule_locals[name]
+        if _debug:
+            print('LazyModule: Loading module %r' % name)
+        self.__lazymodule_locals[name] \
+             = module \
+             = __import__(name,
+                          self.__lazymodule_locals,
+                          self.__lazymodule_globals,
+                          '*')
+
+        # Fill namespace with all symbols from original module to
+        # provide faster access.
+        self.__dict__.update(module.__dict__)
+
+        # Set import flag
+        self.__dict__['__lazymodule_loaded'] = 1
+
+        if _debug:
+            print('LazyModule: Module %r loaded' % name)
+        return module
+
+    def __getattr__(self, name):
+
+        """ Import the module on demand and get the attribute.
+        """
+        if self.__lazymodule_loaded:
+            raise AttributeError(name)
+        if _debug:
+            print('LazyModule: ' \
+                  'Module load triggered by attribute %r read access' % name)
+        module = self.__lazymodule_import()
+        return getattr(module, name)
+
+    def __setattr__(self, name, value):
+
+        """ Import the module on demand and set the attribute.
+        """
+        if not self.__lazymodule_init:
+            self.__dict__[name] = value
+            return
+        if self.__lazymodule_loaded:
+            self.__lazymodule_locals[self.__lazymodule_name] = value
+            self.__dict__[name] = value
+            return
+        if _debug:
+            print('LazyModule: ' \
+                  'Module load triggered by attribute %r write access' % name)
+        module = self.__lazymodule_import()
+        setattr(module, name, value)
+
+    def __repr__(self):
+        return "<LazyModule '%s'>" % self.__name__
diff --git a/nlp_resource_data/nltk/lazyimport.pyc b/nlp_resource_data/nltk/lazyimport.pyc
new file mode 100755 (executable)
index 0000000..bba5094
Binary files /dev/null and b/nlp_resource_data/nltk/lazyimport.pyc differ
diff --git a/nlp_resource_data/nltk/metrics/__init__.py b/nlp_resource_data/nltk/metrics/__init__.py
new file mode 100755 (executable)
index 0000000..2205cc6
--- /dev/null
@@ -0,0 +1,30 @@
+# Natural Language Toolkit: Metrics
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+"""
+NLTK Metrics
+
+Classes and methods for scoring processing modules.
+"""
+
+from nltk.metrics.scores import          (accuracy, precision, recall, f_measure,
+                                          log_likelihood, approxrand)
+from nltk.metrics.confusionmatrix import ConfusionMatrix
+from nltk.metrics.distance        import (edit_distance, binary_distance,
+                                          jaccard_distance, masi_distance,
+                                          interval_distance, custom_distance,
+                                          presence, fractional_presence)
+from nltk.metrics.paice           import Paice
+from nltk.metrics.segmentation    import windowdiff, ghd, pk
+from nltk.metrics.agreement       import AnnotationTask
+from nltk.metrics.association     import (NgramAssocMeasures, BigramAssocMeasures,
+                                          TrigramAssocMeasures, ContingencyMeasures)
+from nltk.metrics.spearman        import (spearman_correlation, ranks_from_sequence,
+                                          ranks_from_scores)
+from nltk.metrics.aline           import align
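
A brief sketch of two of the re-exported helpers, assuming the vendored package is importable as ``nltk``:

    >>> from nltk.metrics import edit_distance, accuracy
    >>> edit_distance('intention', 'execution')
    5
    >>> accuracy(['A', 'B', 'B'], ['A', 'B', 'C'])
    0.6666666666666666
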
diff --git a/nlp_resource_data/nltk/metrics/__init__.pyc b/nlp_resource_data/nltk/metrics/__init__.pyc
new file mode 100755 (executable)
index 0000000..419ee04
Binary files /dev/null and b/nlp_resource_data/nltk/metrics/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/metrics/agreement.py b/nlp_resource_data/nltk/metrics/agreement.py
new file mode 100755 (executable)
index 0000000..887ad76
--- /dev/null
@@ -0,0 +1,421 @@
+# Natural Language Toolkit: Agreement Metrics
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Tom Lippincott <tom@cs.columbia.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+"""
+Implementations of inter-annotator agreement coefficients surveyed by Artstein
+and Poesio (2007), Inter-Coder Agreement for Computational Linguistics.
+
+An agreement coefficient calculates the amount that annotators agreed on label
+assignments beyond what is expected by chance.
+
+In defining the AnnotationTask class, we use naming conventions similar to the
+paper's terminology.  There are three types of objects in an annotation task:
+
+    the coders (variables "c" and "C")
+    the items to be annotated (variables "i" and "I")
+    the potential categories to be assigned (variables "k" and "K")
+
+Additionally, it is often the case that we don't want to treat two different
+labels as complete disagreement, and so the AnnotationTask constructor can also
+take a distance metric as a final argument.  Distance metrics are simply
+functions that take two arguments, and return a value between 0.0 and 1.0
+indicating the distance between them.  If not supplied, the default is binary
+comparison between the arguments.
+
+The simplest way to initialize an AnnotationTask is with a list of triples,
+each containing a coder's assignment for one object in the task:
+
+    task = AnnotationTask(data=[('c1', '1', 'v1'),('c2', '1', 'v1'),...])
+
+Note that the data list needs to contain the same number of triples for each
+individual coder, containing category values for the same set of items.
+
+Alpha (Krippendorff 1980)
+Kappa (Cohen 1960)
+S (Bennett, Albert and Goldstein 1954)
+Pi (Scott 1955)
+
+
+TODO: Describe handling of multiple coders and missing data
+
+Expected results from the Artstein and Poesio survey paper:
+
+    >>> from nltk.metrics.agreement import AnnotationTask
+    >>> import os.path
+    >>> t = AnnotationTask(data=[x.split() for x in open(os.path.join(os.path.dirname(__file__), "artstein_poesio_example.txt"))])
+    >>> t.avg_Ao()
+    0.88
+    >>> t.pi()
+    0.7995322418977615...
+    >>> t.S()
+    0.8199999999999998...
+
+    This would have returned a wrong value (0.0) in @785fb79 as coders are in
+    the wrong order. Subsequently, all values for pi(), S(), and kappa() would
+    have been wrong as they are computed with avg_Ao().
+    >>> t2 = AnnotationTask(data=[('b','1','stat'),('a','1','stat')])
+    >>> t2.avg_Ao()
+    1.0
+
+    The following, of course, also works.
+    >>> t3 = AnnotationTask(data=[('a','1','othr'),('b','1','othr')])
+    >>> t3.avg_Ao()
+    1.0
+
+"""
+from __future__ import print_function, unicode_literals, division
+
+import logging
+from itertools import groupby
+from operator import itemgetter
+
+from six import iteritems
+
+from nltk.probability import FreqDist, ConditionalFreqDist
+from nltk.internals import deprecated
+from nltk.compat import python_2_unicode_compatible
+
+from nltk.metrics.distance import binary_distance
+
+log = logging.getLogger(__file__)
+
+@python_2_unicode_compatible
+class AnnotationTask(object):
+    """Represents an annotation task, i.e. people assign labels to items.
+
+    Notation tries to match notation in Artstein and Poesio (2007).
+
+    In general, coders and items can be represented as any hashable object.
+    Integers, for example, are fine, though strings are more readable.
+    Labels must support the distance functions applied to them, so e.g.
+    a string-edit-distance makes no sense if your labels are integers,
+    whereas interval distance needs numeric values.  A notable case of this
+    is the MASI metric, which requires Python sets.
+    """
+
+    def __init__(self, data=None, distance=binary_distance):
+        """Initialize an annotation task.
+
+        The data argument can be None (to create an empty annotation task) or a sequence of 3-tuples,
+        each representing a coder's labeling of an item:
+            (coder,item,label)
+
+        The distance argument is a function taking two arguments (labels) and producing a numerical distance.
+        The distance from a label to itself should be zero:
+            distance(l,l) = 0
+        """
+        self.distance = distance
+        self.I = set()
+        self.K = set()
+        self.C = set()
+        self.data = []
+        if data is not None:
+            self.load_array(data)
+
+    def __str__(self):
+        return "\r\n".join(map(lambda x:"%s\t%s\t%s" %
+                               (x['coder'], x['item'].replace('_', "\t"),
+                                ",".join(x['labels'])), self.data))
+
+    def load_array(self, array):
+        """Load an sequence of annotation results, appending to any data already loaded.
+
+        The argument is a sequence of 3-tuples, each representing a coder's labeling of an item:
+            (coder,item,label)
+        """
+        for coder, item, labels in array:
+            self.C.add(coder)
+            self.K.add(labels)
+            self.I.add(item)
+            self.data.append({'coder':coder, 'labels':labels, 'item':item})
+
+    def agr(self, cA, cB, i, data=None):
+        """Agreement between two coders on a given item
+
+        """
+        data = data or self.data
+        # cfedermann: we don't know what combination of coder/item will come
+        # first in x; to avoid StopIteration problems due to assuming an order
+        # cA,cB, we allow either for k1 and then look up the missing as k2.
+        k1 = next((x for x in data if x['coder'] in (cA,cB) and x['item']==i))
+        if k1['coder'] == cA:
+            k2 = next((x for x in data if x['coder']==cB and x['item']==i))
+        else:
+            k2 = next((x for x in data if x['coder']==cA and x['item']==i))
+
+        ret = 1.0 - float(self.distance(k1['labels'], k2['labels']))
+        log.debug("Observed agreement between %s and %s on %s: %f",
+                      cA, cB, i, ret)
+        log.debug("Distance between \"%r\" and \"%r\": %f",
+                      k1['labels'], k2['labels'], 1.0 - ret)
+        return ret
+
+    def Nk(self, k):
+        return float(sum(1 for x in self.data if x['labels'] == k))
+
+    def Nik(self, i, k):
+        return float(sum(1 for x in self.data if x['item'] == i and x['labels'] == k))
+
+    def Nck(self, c, k):
+        return float(sum(1 for x in self.data if x['coder'] == c and x['labels'] == k))
+
+    @deprecated('Use Nk, Nik or Nck instead')
+    def N(self, k=None, i=None, c=None):
+        """Implements the "n-notation" used in Artstein and Poesio (2007)
+
+        """
+        if k is not None and i is None and c is None:
+            ret = self.Nk(k)
+        elif k is not None and i is not None and c is None:
+            ret = self.Nik(i, k)
+        elif k is not None and c is not None and i is None:
+            ret = self.Nck(c, k)
+        else:
+            raise ValueError("You must pass either i or c, not both! (k=%r,i=%r,c=%r)" % (k, i, c))
+        log.debug("Count on N[%s,%s,%s]: %d", k, i, c, ret)
+        return ret
+
+    def _grouped_data(self, field, data=None):
+        data = data or self.data
+        return groupby(sorted(data, key=itemgetter(field)), itemgetter(field))
+
+    def Ao(self, cA, cB):
+        """Observed agreement between two coders on all items.
+
+        """
+        data = self._grouped_data('item', (x for x in self.data if x['coder'] in (cA, cB)))
+        ret = sum(self.agr(cA, cB, item, item_data) for item, item_data in data) / len(self.I)
+        log.debug("Observed agreement between %s and %s: %f", cA, cB, ret)
+        return ret
+
+    def _pairwise_average(self, function):
+        """
+        Calculates the average of function results for each coder pair
+        """
+        total = 0
+        n = 0
+        s = self.C.copy()
+        for cA in self.C:
+            s.remove(cA)
+            for cB in s:
+                total += function(cA, cB)
+                n += 1
+        ret = total / n
+        return ret
+
+    def avg_Ao(self):
+        """Average observed agreement across all coders and items.
+
+        """
+        ret = self._pairwise_average(self.Ao)
+        log.debug("Average observed agreement: %f", ret)
+        return ret
+
+    def Do_alpha(self):
+        """The observed disagreement for the alpha coefficient.
+
+        The alpha coefficient, unlike the other metrics, uses this rather than
+        observed agreement.
+        """
+        total = 0.0
+        for i, itemdata in self._grouped_data('item'):
+            label_freqs = FreqDist(x['labels'] for x in itemdata)
+
+            for j, nj in iteritems(label_freqs):
+                for l, nl in iteritems(label_freqs):
+                    total += float(nj * nl) * self.distance(l, j)
+        ret = (1.0 / (len(self.I) * len(self.C) * (len(self.C) - 1))) * total
+        log.debug("Observed disagreement: %f", ret)
+        return ret
+
+    def Do_Kw_pairwise(self,cA,cB,max_distance=1.0):
+        """The observed disagreement for the weighted kappa coefficient.
+
+        """
+        total = 0.0
+        data = (x for x in self.data if x['coder'] in (cA, cB))
+        for i, itemdata in self._grouped_data('item', data):
+            # we should have two items; distance doesn't care which comes first
+            total += self.distance(next(itemdata)['labels'],
+                                   next(itemdata)['labels'])
+
+        ret = total / (len(self.I) * max_distance)
+        log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret)
+        return ret
+
+    def Do_Kw(self, max_distance=1.0):
+        """Averaged over all labelers
+
+        """
+        ret = self._pairwise_average(lambda cA, cB: self.Do_Kw_pairwise(cA, cB, max_distance))
+        log.debug("Observed disagreement: %f", ret)
+        return ret
+
+    # Agreement Coefficients
+    def S(self):
+        """Bennett, Albert and Goldstein 1954
+
+        """
+        Ae = 1.0 / len(self.K)
+        ret = (self.avg_Ao() - Ae) / (1.0 - Ae)
+        return ret
+
+    def pi(self):
+        """Scott 1955; here, multi-pi.
+        Equivalent to K from Siegel and Castellan (1988).
+
+        """
+        total = 0.0
+        label_freqs = FreqDist(x['labels'] for x in self.data)
+        for k, f in iteritems(label_freqs):
+            total += f ** 2
+        Ae = total / ((len(self.I) * len(self.C)) ** 2)
+        return (self.avg_Ao() - Ae) / (1 - Ae)
+
+    def Ae_kappa(self, cA, cB):
+        Ae = 0.0
+        nitems = float(len(self.I))
+        label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
+        for k in label_freqs.conditions():
+            Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
+        return Ae
+
+    def kappa_pairwise(self, cA, cB):
+        """
+
+        """
+        Ae = self.Ae_kappa(cA, cB)
+        ret = (self.Ao(cA, cB) - Ae) / (1.0 - Ae)
+        log.debug("Expected agreement between %s and %s: %f", cA, cB, Ae)
+        return ret
+
+    def kappa(self):
+        """Cohen 1960
+        Averages naively over kappas for each coder pair.
+
+        """
+        return self._pairwise_average(self.kappa_pairwise)
+
+    def multi_kappa(self):
+        """Davies and Fleiss 1982
+        Averages over observed and expected agreements for each coder pair.
+
+        """
+        Ae = self._pairwise_average(self.Ae_kappa)
+        return (self.avg_Ao() - Ae) / (1.0 - Ae)
+
+    def alpha(self):
+        """Krippendorff 1980
+
+        """
+        # check for degenerate cases
+        if len(self.K)==0:
+            raise ValueError("Cannot calculate alpha, no data present!")
+        if len(self.K) == 1:
+            log.debug("Only one annotation value, allpha returning 1.")
+            return 1
+        if len(self.C)==1 and len(self.I) == 1:
+            raise ValueError("Cannot calculate alpha, only one coder and item present!")
+
+        De = 0.0
+
+        label_freqs = FreqDist(x['labels'] for x in self.data)
+        for j in self.K:
+            nj = label_freqs[j]
+            for l in self.K:
+                De += float(nj * label_freqs[l]) * self.distance(j, l)
+        try:
+            De = (1.0 / (len(self.I) * len(self.C) * (len(self.I) * len(self.C) - 1))) * De
+            log.debug("Expected disagreement: %f", De)
+            ret = 1.0 - (self.Do_alpha() / De)
+        except ZeroDivisionError:
+            raise ValueError("Cannot calculate alpha, expected disagreement zero, check the distance function!")
+        return ret
+
+    def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
+        """Cohen 1968
+
+        """
+        total = 0.0
+        label_freqs = ConditionalFreqDist((x['coder'], x['labels'])
+                for x in self.data
+                if x['coder'] in (cA, cB))
+        for j in self.K:
+            for l in self.K:
+                total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
+        De = total / (max_distance * pow(len(self.I), 2))
+        log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
+        Do = self.Do_Kw_pairwise(cA, cB)
+        ret = 1.0 - (Do / De)
+        return ret
+
+    def weighted_kappa(self, max_distance=1.0):
+        """Cohen 1968
+
+        """
+        return self._pairwise_average(lambda cA, cB: self.weighted_kappa_pairwise(cA, cB, max_distance))
+
+
+if __name__ == '__main__':
+
+    import re
+    import optparse
+    from nltk.metrics import distance
+
+    # process command-line arguments
+    parser = optparse.OptionParser()
+    parser.add_option("-d", "--distance", dest="distance", default="binary_distance",
+                      help="distance metric to use")
+    parser.add_option("-a", "--agreement", dest="agreement", default="kappa",
+                      help="agreement coefficient to calculate")
+    parser.add_option("-e", "--exclude", dest="exclude", action="append",
+                      default=[], help="coder names to exclude (may be specified multiple times)")
+    parser.add_option("-i", "--include", dest="include", action="append", default=[],
+                      help="coder names to include, same format as exclude")
+    parser.add_option("-f", "--file", dest="file",
+                      help="file to read labelings from, each line with three columns: 'labeler item labels'")
+    parser.add_option("-v", "--verbose", dest="verbose", default='0',
+                      help="how much debugging to print on stderr (0-4)")
+    parser.add_option("-c", "--columnsep", dest="columnsep", default="\t",
+                      help="char/string that separates the three columns in the file, defaults to tab")
+    parser.add_option("-l", "--labelsep", dest="labelsep", default=",",
+                      help="char/string that separates labels (if labelers can assign more than one), defaults to comma")
+    parser.add_option("-p", "--presence", dest="presence", default=None,
+                      help="convert each labeling into 1 or 0, based on presence of LABEL")
+    parser.add_option("-T", "--thorough", dest="thorough", default=False, action="store_true",
+                      help="calculate agreement for every subset of the annotators")
+    (options, remainder) = parser.parse_args()
+
+    if not options.file:
+        parser.print_help()
+        exit()
+
+    logging.basicConfig(level=50 - 10 * int(options.verbose))
+
+    # read in data from the specified file
+    data = []
+    with open(options.file, 'r') as infile:
+        for l in infile:
+            toks = l.split(options.columnsep)
+            coder, object_, labels = toks[0], str(toks[1:-1]), frozenset(toks[-1].strip().split(options.labelsep))
+            if ((options.include == options.exclude) or
+                (len(options.include) > 0 and coder in options.include) or
+                (len(options.exclude) > 0 and coder not in options.exclude)):
+                data.append((coder, object_, labels))
+
+    if options.presence:
+        task = AnnotationTask(data, getattr(distance, options.distance)(options.presence))
+    else:
+        task = AnnotationTask(data, getattr(distance, options.distance))
+
+    if options.thorough:
+        pass
+    else:
+        print(getattr(task, options.agreement)())
+
+    logging.shutdown()
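
A minimal sketch of the coefficients on a toy two-coder, two-item task; the values follow directly from the formulas above:

    >>> from nltk.metrics.agreement import AnnotationTask
    >>> t = AnnotationTask(data=[('c1', '1', 'v1'), ('c2', '1', 'v1'),
    ...                          ('c1', '2', 'v1'), ('c2', '2', 'v2')])
    >>> t.avg_Ao()
    0.5
    >>> t.S()
    0.0
    >>> t.kappa()
    0.0
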
diff --git a/nlp_resource_data/nltk/metrics/agreement.pyc b/nlp_resource_data/nltk/metrics/agreement.pyc
new file mode 100755 (executable)
index 0000000..c6c35ae
Binary files /dev/null and b/nlp_resource_data/nltk/metrics/agreement.pyc differ
diff --git a/nlp_resource_data/nltk/metrics/aline.py b/nlp_resource_data/nltk/metrics/aline.py
new file mode 100755 (executable)
index 0000000..aa1da0d
--- /dev/null
@@ -0,0 +1,607 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: ALINE
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Greg Kondrak <gkondrak@ualberta.ca>
+#         Geoff Bacon <bacon@berkeley.edu> (Python port)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+ALINE
+http://webdocs.cs.ualberta.ca/~kondrak/
+Copyright 2002 by Grzegorz Kondrak.
+
+ALINE is an algorithm for aligning phonetic sequences, described in [1].
+This module is a port of Kondrak's (2002) ALINE. It provides functions for
+phonetic sequence alignment and similarity analysis. These are useful in
+historical linguistics, sociolinguistics and synchronic phonology.
+
+ALINE has parameters that can be tuned for desired output. These parameters are:
+- C_skip, C_sub, C_exp, C_vwl
+- Salience weights
+- Segmental features
+
+In this implementation, some parameters have been changed from their default
+values as described in [1], in order to replicate published results. All changes
+are noted in comments.
+
+Example usage
+-------------
+
+# Get optimal alignment of two phonetic sequences
+
+>>> align('θin', 'tenwis') # doctest: +SKIP
+[[('θ', 't'), ('i', 'e'), ('n', 'n'), ('-', 'w'), ('-', 'i'), ('-', 's')]]
+
+[1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation,
+University of Toronto.
+"""
+
+from __future__ import unicode_literals
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+# === Constants ===
+
+inf = float('inf')
+
+# Default values for maximum similarity scores (Kondrak 2002: 54)
+C_skip = 10 # Indels
+C_sub  = 35  # Substitutions
+C_exp  = 45  # Expansions/compressions
+C_vwl  = 5  # Vowel/consonant relative weight (decreased from 10)
+
+consonants = ['B', 'N', 'R', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
+              'n', 'p', 'q', 'r', 's', 't', 'v', 'x', 'z', 'ç', 'ð', 'ħ',
+              'ŋ', 'ɖ', 'ɟ', 'ɢ', 'ɣ', 'ɦ', 'ɬ', 'ɮ', 'ɰ', 'ɱ', 'ɲ', 'ɳ', 'ɴ',
+              'ɸ', 'ɹ', 'ɻ', 'ɽ', 'ɾ', 'ʀ', 'ʁ', 'ʂ', 'ʃ', 'ʈ', 'ʋ', 'ʐ ', 'ʒ',
+              'ʔ', 'ʕ', 'ʙ', 'ʝ', 'β', 'θ', 'χ', 'ʐ', 'w']
+
+# Relevant features for comparing consonants and vowels
+R_c = ['aspirated', 'lateral', 'manner', 'nasal', 'place', 'retroflex',
+       'syllabic', 'voice']
+# 'high' taken out of R_v because same as manner
+R_v = ['back', 'lateral', 'long', 'manner', 'nasal', 'place',
+       'retroflex', 'round', 'syllabic', 'voice']
+
+# Flattened feature matrix (Kondrak 2002: 56)
+similarity_matrix = {
+   #place
+   'bilabial': 1.0, 'labiodental': 0.95, 'dental': 0.9,
+   'alveolar': 0.85, 'retroflex': 0.8, 'palato-alveolar': 0.75,
+   'palatal': 0.7, 'velar': 0.6, 'uvular': 0.5, 'pharyngeal': 0.3,
+   'glottal': 0.1, 'labiovelar': 1.0, 'vowel': -1.0, # added 'vowel'
+   #manner
+   'stop': 1.0, 'affricate': 0.9, 'fricative': 0.85, # increased fricative from 0.8
+   'trill': 0.7, 'tap': 0.65, 'approximant': 0.6, 'high vowel': 0.4,
+   'mid vowel': 0.2, 'low vowel': 0.0, 'vowel2': 0.5, # added vowel
+   #high
+   'high': 1.0, 'mid': 0.5, 'low': 0.0,
+   #back
+   'front': 1.0, 'central': 0.5, 'back': 0.0,
+   #binary features
+   'plus': 1.0, 'minus': 0.0
+}
+
+# Relative weights of phonetic features (Kondrak 2002: 55)
+salience = {
+   'syllabic': 5,
+   'place': 40,
+   'manner': 50,
+   'voice': 5, # decreased from 10
+   'nasal': 20, # increased from 10
+   'retroflex': 10,
+   'lateral': 10,
+   'aspirated': 5,
+   'long': 0, # decreased from 1
+   'high': 3, # decreased from 5
+   'back': 2, # decreased from 5
+   'round': 2 # decreased from 5
+}
+
+# (Kondrak 2002: 59-60)
+feature_matrix = {
+# Consonants
+'p': {'place': 'bilabial', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'b': {'place': 'bilabial', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'t': {'place': 'alveolar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'d': {'place': 'alveolar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʈ': {'place': 'retroflex', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɖ': {'place': 'retroflex', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'c': {'place': 'palatal', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɟ': {'place': 'palatal', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'k': {'place': 'velar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'g': {'place': 'velar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'q': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɢ': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʔ': {'place': 'glottal', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'m': {'place': 'bilabial', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɱ': {'place': 'labiodental', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'n': {'place': 'alveolar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɳ': {'place': 'retroflex', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɲ': {'place': 'palatal', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ŋ': {'place': 'velar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɴ': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'N': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʙ': {'place': 'bilabial', 'manner': 'trill', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'B': {'place': 'bilabial', 'manner': 'trill', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'r': {'place': 'alveolar', 'manner': 'trill', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʀ': {'place': 'uvular', 'manner': 'trill', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'R': {'place': 'uvular', 'manner': 'trill', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɾ': {'place': 'alveolar', 'manner': 'tap', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɽ': {'place': 'retroflex', 'manner': 'tap', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɸ': {'place': 'bilabial', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'β': {'place': 'bilabial', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'f': {'place': 'labiodental', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'v': {'place': 'labiodental', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'θ': {'place': 'dental', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ð': {'place': 'dental', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'s': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'z': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʃ': {'place': 'palato-alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʒ': {'place': 'palato-alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʂ': {'place': 'retroflex', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʐ': {'place': 'retroflex', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ç': {'place': 'palatal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʝ': {'place': 'palatal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'x': {'place': 'velar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɣ': {'place': 'velar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'χ': {'place': 'uvular', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʁ': {'place': 'uvular', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ħ': {'place': 'pharyngeal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʕ': {'place': 'pharyngeal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'h': {'place': 'glottal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɦ': {'place': 'glottal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɬ': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'plus', 'aspirated': 'minus'},
+
+'ɮ': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'plus', 'aspirated': 'minus'},
+
+'ʋ': {'place': 'labiodental', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɹ': {'place': 'alveolar', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɻ': {'place': 'retroflex', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'j': {'place': 'palatal', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɰ': {'place': 'velar', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'l': {'place': 'alveolar', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'plus', 'aspirated': 'minus'},
+
+'w': {'place': 'labiovelar', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+# Vowels
+
+'i': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'front','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'y': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'front','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'e': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'front','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'E': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'front','round': 'minus', 'long': 'plus', 'aspirated': 'minus'},
+
+'ø': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'front','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'ɛ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'front','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'œ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'front','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'æ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'low',
+'back': 'front','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'a': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'low',
+'back': 'front','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'A': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'low',
+'back': 'front','round': 'minus', 'long': 'plus', 'aspirated': 'minus'},
+
+'ɨ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'central','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'ʉ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'central','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'ə': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'central','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'u': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'back','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'U': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'back','round': 'plus', 'long': 'plus', 'aspirated': 'minus'},
+
+'o': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'back','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'O': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'back','round': 'plus', 'long': 'plus', 'aspirated': 'minus'},
+
+'ɔ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'back','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'ɒ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'low',
+'back': 'back','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'I': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'front','round': 'minus', 'long': 'plus', 'aspirated': 'minus'},
+
+}
+
+# === Algorithm ===
+
+def align(str1, str2, epsilon=0):
+    """
+    Compute the alignment of two phonetic strings.
+
+    :type str1, str2: str
+    :param str1, str2: Two strings to be aligned
+    :type epsilon: float (0.0 to 1.0)
+    :param epsilon: Adjusts threshold similarity score for near-optimal alignments
+
+    :rtype: list(list(tuple(str, str)))
+    :return: Alignment(s) of str1 and str2
+
+    (Kondrak 2002: 51)
+    """
+    if np is None:
+        raise ImportError('You need numpy in order to use the align function')
+
+    assert 0.0 <= epsilon <= 1.0, "Epsilon must be between 0.0 and 1.0."
+    m = len(str1)
+    n = len(str2)
+    # This includes Kondrak's initialization of row 0 and column 0 to all 0s.
+    S = np.zeros((m+1, n+1), dtype=float)
+
+    # Expansions are not allowed when i <= 1 or j <= 1, since they would
+    # reach past the start of a string and break the array and string
+    # indices; set those cases to -inf so they are never chosen.
+    for i in range(1, m+1):
+        for j in range(1, n+1):
+            edit1 = S[i-1, j] + sigma_skip(str1[i-1])
+            edit2 = S[i, j-1] + sigma_skip(str2[j-1])
+            edit3 = S[i-1, j-1] + sigma_sub(str1[i-1], str2[j-1])
+            if i > 1:
+                edit4 = S[i-2, j-1] + sigma_exp(str2[j-1], str1[i-2:i])
+            else:
+                edit4 = -inf
+            if j > 1:
+                edit5 = S[i-1, j-2] + sigma_exp(str1[i-1], str2[j-2:j])
+            else:
+                edit5 = -inf
+            S[i, j] = max(edit1, edit2, edit3, edit4, edit5, 0)
+
+    T = (1-epsilon)*np.amax(S) # Threshold score for near-optimal alignments
+
+    alignments = []
+    for i in range(1, m+1):
+        for j in range(1, n+1):
+            if S[i,j] >= T:
+                alignments.append(_retrieve(i, j, 0, S, T, str1, str2, []))
+    return alignments
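+
+# Hedged usage sketch (not part of the original module): with the default
+# epsilon=0 only optimal alignments are returned; a larger epsilon lowers the
+# threshold T = (1 - epsilon) * max(S), so near-optimal alignments whose
+# scores still reach T are returned as well.
+#
+#     >>> align('θin', 'tenwis', epsilon=0.5)       # doctest: +SKIP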
+
+def _retrieve(i, j, s, S, T, str1, str2, out):
+    """
+    Retrieve the path through the similarity matrix S starting at (i, j).
+
+    :rtype: list(tuple(str, str))
+    :return: Alignment of str1 and str2
+    """
+    if S[i, j] == 0:
+        return out
+    else:
+        if j > 1 and S[i-1, j-2] + sigma_exp(str1[i-1], str2[j-2:j]) + s >= T:
+            out.insert(0, (str1[i-1], str2[j-2:j]))
+            _retrieve(i-1, j-2, s+sigma_exp(str1[i-1], str2[j-2:j]), S, T, str1, str2, out)
+        elif i > 1 and S[i-2, j-1] + sigma_exp(str2[j-1], str1[i-2:i]) + s >= T:
+            out.insert(0, (str1[i-2:i], str2[j-1]))
+            _retrieve(i-2, j-1, s+sigma_exp(str2[j-1], str1[i-2:i]), S, T, str1, str2, out)
+        elif S[i, j-1] + sigma_skip(str2[j-1]) + s >= T:
+            out.insert(0, ('-', str2[j-1]))
+            _retrieve(i, j-1, s+sigma_skip(str2[j-1]), S, T, str1, str2, out)
+        elif S[i-1, j] + sigma_skip(str1[i-1]) + s >= T:
+            out.insert(0, (str1[i-1], '-'))
+            _retrieve(i-1, j, s+sigma_skip(str1[i-1]), S, T, str1, str2, out)
+        elif S[i-1, j-1] + sigma_sub(str1[i-1], str2[j-1]) + s >= T:
+            out.insert(0, (str1[i-1], str2[j-1]))
+            _retrieve(i-1, j-1, s+sigma_sub(str1[i-1], str2[j-1]), S, T, str1, str2, out)
+    return out
+
+def sigma_skip(p):
+    """
+    Returns score of an indel of P.
+
+    (Kondrak 2002: 54)
+    """
+    return C_skip
+
+def sigma_sub(p, q):
+    """
+    Returns score of a substitution of P with Q.
+
+    (Kondrak 2002: 54)
+    """
+    return C_sub - delta(p, q) - V(p) - V(q)
+
+def sigma_exp(p, q):
+    """
+    Returns score of an expansion/compression.
+
+    (Kondrak 2002: 54)
+    """
+    q1 = q[0]
+    q2 = q[1]
+    return C_exp - delta(p, q1) - delta(p, q2) - V(p) - max(V(q1), V(q2))
+
+def delta(p, q):
+    """
+    Return the weighted sum of feature differences between P and Q.
+
+    (Kondrak 2002: 54)
+    """
+    features = R(p, q)
+    total = 0
+    for f in features:
+        total += diff(p, q, f) * salience[f]
+    return total
+
+def diff(p, q, f):
+    """
+    Returns difference between phonetic segments P and Q for feature F.
+
+    (Kondrak 2002: 52, 54)
+    """
+    p_features, q_features = feature_matrix[p], feature_matrix[q]
+    return abs(similarity_matrix[p_features[f]] - similarity_matrix[q_features[f]])
+
+def R(p, q):
+    """
+    Return the relevant features for segment comparison.
+
+    (Kondrak 2002: 54)
+    """
+    if p in consonants or q in consonants:
+        return R_c
+    return R_v
+
+def V(p):
+    """
+    Return the vowel weight C_vwl if P is a vowel, and 0 otherwise.
+
+    (Kondrak 2002: 54)
+    """
+    if p in consonants:
+        return 0
+    return C_vwl
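+
+# Illustrative check of the scoring functions above (not part of the original
+# module; the values follow directly from the constants and feature matrix):
+#
+#     >>> sigma_skip('p')
+#     10
+#     >>> delta('p', 'b')       # only 'voice' differs: 1.0 * salience['voice']
+#     5.0
+#     >>> sigma_sub('p', 'b')   # C_sub - delta - V('p') - V('b') = 35 - 5 - 0 - 0
+#     30.0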
+
+# === Test ===
+
+def demo():
+    """
+    A demonstration of the result of aligning phonetic sequences
+    used in Kondrak's (2002) dissertation.
+    """
+    data = [pair.split(',') for pair in cognate_data.split('\n')]
+    for pair in data:
+        alignment = align(pair[0], pair[1])[0]
+        alignment = ['({}, {})'.format(a[0], a[1]) for a in alignment]
+        alignment = ' '.join(alignment)
+        print('{} ~ {} : {}'.format(pair[0], pair[1], alignment))
+
+cognate_data = """jo,ʒə
+tu,ty
+nosotros,nu
+kjen,ki
+ke,kwa
+todos,tu
+una,ən
+dos,dø
+tres,trwa
+ombre,om
+arbol,arbrə
+pluma,plym
+kabeθa,kap
+boka,buʃ
+pje,pje
+koraθon,kœr
+ber,vwar
+benir,vənir
+deθir,dir
+pobre,povrə
+ðis,dIzes
+ðæt,das
+wat,vas
+nat,nixt
+loŋ,laŋ
+mæn,man
+fleʃ,flajʃ
+bləd,blyt
+feðər,fEdər
+hær,hAr
+ir,Or
+aj,awgə
+nowz,nAzə
+mawθ,munt
+təŋ,tsuŋə
+fut,fys
+nij,knI
+hænd,hant
+hart,herts
+livər,lEbər
+ænd,ante
+æt,ad
+blow,flAre
+ir,awris
+ijt,edere
+fiʃ,piʃkis
+flow,fluere
+staɾ,stella
+ful,plenus
+græs,gramen
+hart,kordis
+horn,korny
+aj,ego
+nij,genU
+məðər,mAter
+mawntən,mons
+nejm,nomen
+njuw,nowus
+wən,unus
+rawnd,rotundus
+sow,suere
+sit,sedere
+θrij,tres
+tuwθ,dentis
+θin,tenwis
+kinwawa,kenuaʔ
+nina,nenah
+napewa,napɛw
+wapimini,wapemen
+namesa,namɛʔs
+okimawa,okemaw
+ʃiʃipa,seʔsep
+ahkohkwa,ahkɛh
+pematesiweni,pematesewen
+asenja,aʔsɛn"""
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/metrics/aline.pyc b/nlp_resource_data/nltk/metrics/aline.pyc
new file mode 100755 (executable)
index 0000000..174df77
Binary files /dev/null and b/nlp_resource_data/nltk/metrics/aline.pyc differ
diff --git a/nlp_resource_data/nltk/metrics/association.py b/nlp_resource_data/nltk/metrics/association.py
new file mode 100755 (executable)
index 0000000..3e012f6
--- /dev/null
@@ -0,0 +1,414 @@
+# Natural Language Toolkit: Ngram Association Measures
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Joel Nothman <jnothman@student.usyd.edu.au>
+# URL: <http://nltk.org>
+# For license information, see LICENSE.TXT
+
+"""
+Provides scoring functions for a number of association measures through a
+generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
+``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
+"""
+
+from __future__ import division
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+import math as _math
+from functools import reduce
+_log2 = lambda x: _math.log(x, 2.0)
+_ln = _math.log
+
+_product = lambda s: reduce(lambda x, y: x * y, s)
+
+_SMALL = 1e-20
+
+try:
+    from scipy.stats import fisher_exact
+except ImportError:
+    def fisher_exact(*_args, **_kwargs):
+        raise NotImplementedError
+
+### Indices to marginals arguments:
+
+NGRAM = 0
+"""Marginals index for the ngram count"""
+
+UNIGRAMS = -2
+"""Marginals index for a tuple of each unigram count"""
+
+TOTAL = -1
+"""Marginals index for the number of words in the data"""
+
+
+@add_metaclass(ABCMeta)
+class NgramAssocMeasures(object):
+    """
+    An abstract class defining a collection of generic association measures.
+    Each public method returns a score, taking the following arguments::
+
+        score_fn(count_of_ngram,
+                 (count_of_n-1gram_1, ..., count_of_n-1gram_j),
+                 (count_of_n-2gram_1, ..., count_of_n-2gram_k),
+                 ...,
+                 (count_of_1gram_1, ..., count_of_1gram_n),
+                 count_of_total_words)
+
+    See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``
+
+    Inheriting classes should define a property _n, and a method _contingency
+    which calculates contingency values from marginals in order for all
+    association measures defined here to be usable.
+    """
+
+    _n = 0
+
+    @staticmethod
+    @abstractmethod
+    def _contingency(*marginals):
+        """Calculates values of a contingency table from marginal values."""
+        raise NotImplementedError("The contingency table is not available "
+                                  "in the general ngram case")
+
+    @staticmethod
+    @abstractmethod
+    def _marginals(*contingency):
+        """Calculates values of contingency table marginals from its values."""
+        raise NotImplementedError("The contingency table is not available "
+                                  "in the general ngram case")
+
+    @classmethod
+    def _expected_values(cls, cont):
+        """Calculates expected values for a contingency table."""
+        n_all = sum(cont)
+        bits = [1 << i for i in range(cls._n)]
+
+        # For each contingency table cell
+        for i in range(len(cont)):
+            # Yield the expected value
+            yield (_product(sum(cont[x] for x in range(2 ** cls._n)
+                                if (x & j) == (i & j))
+                            for j in bits) /
+                   (n_all ** (cls._n - 1)))
+
+    @staticmethod
+    def raw_freq(*marginals):
+        """Scores ngrams by their frequency"""
+        return marginals[NGRAM] / marginals[TOTAL]
+
+    @classmethod
+    def student_t(cls, *marginals):
+        """Scores ngrams using Student's t test with independence hypothesis
+        for unigrams, as in Manning and Schutze 5.3.1.
+        """
+        return ((marginals[NGRAM] -
+                  _product(marginals[UNIGRAMS]) /
+                  (marginals[TOTAL] ** (cls._n - 1))) /
+                (marginals[NGRAM] + _SMALL) ** .5)
+
+    @classmethod
+    def chi_sq(cls, *marginals):
+        """Scores ngrams using Pearson's chi-square as in Manning and Schutze
+        5.3.3.
+        """
+        cont = cls._contingency(*marginals)
+        exps = cls._expected_values(cont)
+        return sum((obs - exp) ** 2 / (exp + _SMALL)
+                   for obs, exp in zip(cont, exps))
+
+    @staticmethod
+    def mi_like(*marginals, **kwargs):
+        """Scores ngrams using a variant of mutual information. The keyword
+        argument power sets an exponent (default 3) for the numerator. No
+        logarithm of the result is calculated.
+        """
+        return (marginals[NGRAM] ** kwargs.get('power', 3) /
+                _product(marginals[UNIGRAMS]))
+
+    @classmethod
+    def pmi(cls, *marginals):
+        """Scores ngrams by pointwise mutual information, as in Manning and
+        Schutze 5.4.
+        """
+        return (_log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) -
+                _log2(_product(marginals[UNIGRAMS])))
+
+    @classmethod
+    def likelihood_ratio(cls, *marginals):
+        """Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4.
+        """
+        cont = cls._contingency(*marginals)
+        return (cls._n *
+                sum(obs * _ln(obs / (exp + _SMALL) + _SMALL)
+                    for obs, exp in zip(cont, cls._expected_values(cont))))
+
+    @classmethod
+    def poisson_stirling(cls, *marginals):
+        """Scores ngrams using the Poisson-Stirling measure."""
+        exp = (_product(marginals[UNIGRAMS]) /
+               (marginals[TOTAL] ** (cls._n - 1)))
+        return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1)
+
+    @classmethod
+    def jaccard(cls, *marginals):
+        """Scores ngrams using the Jaccard index."""
+        cont = cls._contingency(*marginals)
+        return cont[0] / sum(cont[:-1])
+
+
+class BigramAssocMeasures(NgramAssocMeasures):
+    """
+    A collection of bigram association measures. Each association measure
+    is provided as a function with three arguments::
+
+        bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)
+
+    The arguments constitute the marginals of a contingency table, counting
+    the occurrences of particular events in a corpus. The letter i in the
+    suffix refers to the appearance of the word in question, while x indicates
+    the appearance of any word. Thus, for example:
+
+        n_ii counts (w1, w2), i.e. the bigram being scored
+        n_ix counts (w1, *)
+        n_xi counts (*, w2)
+        n_xx counts (*, *), i.e. any bigram
+
+    This may be shown with respect to a contingency table::
+
+                w1    ~w1
+             ------ ------
+         w2 | n_ii | n_oi | = n_xi
+             ------ ------
+        ~w2 | n_io | n_oo |
+             ------ ------
+             = n_ix        TOTAL = n_xx
+    """
+
+    _n = 2
+
+    @staticmethod
+    def _contingency(n_ii, n_ix_xi_tuple, n_xx):
+        """Calculates values of a bigram contingency table from marginal values."""
+        (n_ix, n_xi) = n_ix_xi_tuple
+        n_oi = n_xi - n_ii
+        n_io = n_ix - n_ii
+        return (n_ii, n_oi, n_io, n_xx - n_ii - n_oi - n_io)
+
+    @staticmethod
+    def _marginals(n_ii, n_oi, n_io, n_oo):
+        """Calculates values of contingency table marginals from its values."""
+        return (n_ii, (n_oi + n_ii, n_io + n_ii), n_oo + n_oi + n_io + n_ii)
+
+    @staticmethod
+    def _expected_values(cont):
+        """Calculates expected values for a contingency table."""
+        n_xx = sum(cont)
+        # For each contingency table cell
+        for i in range(4):
+            yield (cont[i] + cont[i ^ 1]) * (cont[i] + cont[i ^ 2]) / n_xx
+
+    @classmethod
+    def phi_sq(cls, *marginals):
+        """Scores bigrams using phi-square, the square of the Pearson correlation
+        coefficient.
+        """
+        n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
+
+        return ((n_ii*n_oo - n_io*n_oi)**2 /
+                ((n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)))
+
+    @classmethod
+    def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx):
+        """Scores bigrams using chi-square, i.e. phi-sq multiplied by the number
+        of bigrams, as in Manning and Schutze 5.3.3.
+        """
+        (n_ix, n_xi) = n_ix_xi_tuple
+        return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx)
+
+    @classmethod
+    def fisher(cls, *marginals):
+        """Scores bigrams using Fisher's Exact Test (Pedersen 1996).  Less
+        sensitive to small counts than PMI or Chi Sq, but also more expensive
+        to compute. Requires scipy.
+        """
+
+        n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
+
+        (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
+        return pvalue
+
+    @staticmethod
+    def dice(n_ii, n_ix_xi_tuple, n_xx):
+        """Scores bigrams using Dice's coefficient."""
+        (n_ix, n_xi) = n_ix_xi_tuple
+        return 2 * n_ii / (n_ix + n_xi)
+
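+# Hedged usage sketch (hypothetical counts, not part of the original module):
+# the bigram marginals are (n_ii, (n_ix, n_xi), n_xx), e.g. a bigram seen 10
+# times whose words occur 30 and 40 times in a 10,000-word corpus.
+#
+#     >>> bam = BigramAssocMeasures
+#     >>> bam.raw_freq(10, (30, 40), 10000)          # doctest: +SKIP
+#     >>> bam.pmi(10, (30, 40), 10000)               # doctest: +SKIP
+#     >>> bam.likelihood_ratio(10, (30, 40), 10000)  # doctest: +SKIP
+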
+
+class TrigramAssocMeasures(NgramAssocMeasures):
+    """
+    A collection of trigram association measures. Each association measure
+    is provided as a function with four arguments::
+
+        trigram_score_fn(n_iii,
+                         (n_iix, n_ixi, n_xii),
+                         (n_ixx, n_xix, n_xxi),
+                         n_xxx)
+
+    The arguments constitute the marginals of a contingency table, counting
+    the occurrences of particular events in a corpus. The letter i in the
+    suffix refers to the appearance of the word in question, while x indicates
+    the appearance of any word. Thus, for example:
+    n_iii counts (w1, w2, w3), i.e. the trigram being scored
+    n_ixx counts (w1, *, *)
+    n_xxx counts (*, *, *), i.e. any trigram
+    """
+
+    _n = 3
+
+    @staticmethod
+    def _contingency(n_iii, n_iix_tuple, n_ixx_tuple, n_xxx):
+        """Calculates values of a trigram contingency table (or cube) from
+        marginal values.
+        >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
+        (1, 0, 0, 0, 0, 72, 0, 1927)
+        """
+        (n_iix, n_ixi, n_xii) = n_iix_tuple
+        (n_ixx, n_xix, n_xxi) = n_ixx_tuple
+        n_oii = n_xii - n_iii
+        n_ioi = n_ixi - n_iii
+        n_iio = n_iix - n_iii
+        n_ooi = n_xxi - n_iii - n_oii - n_ioi
+        n_oio = n_xix - n_iii - n_oii - n_iio
+        n_ioo = n_ixx - n_iii - n_ioi - n_iio
+        n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo
+
+        return (n_iii, n_oii, n_ioi, n_ooi,
+                n_iio, n_oio, n_ioo, n_ooo)
+
+    @staticmethod
+    def _marginals(*contingency):
+        """Calculates values of contingency table marginals from its values.
+        >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
+        (1, (1, 1, 1), (1, 73, 1), 2000)
+        """
+        n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = contingency
+        return (n_iii,
+                (n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii),
+                (n_iii + n_ioi + n_iio + n_ioo,
+                 n_iii + n_oii + n_iio + n_oio,
+                 n_iii + n_oii + n_ioi + n_ooi),
+                sum(contingency))
+
+
+class QuadgramAssocMeasures(NgramAssocMeasures):
+    """
+    A collection of quadgram association measures. Each association measure
+    is provided as a function with five arguments::
+
+        quadgram_score_fn(n_iiii,
+                        (n_iiix, n_iixi, n_ixii, n_xiii),
+                        (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
+                        (n_ixxx, n_xixx, n_xxix, n_xxxi),
+                        n_all)
+
+    The arguments constitute the marginals of a contingency table, counting
+    the occurrences of particular events in a corpus. The letter i in the
+    suffix refers to the appearance of the word in question, while x indicates
+    the appearance of any word. Thus, for example:
+    n_iiii counts (w1, w2, w3, w4), i.e. the quadgram being scored
+    n_ixxi counts (w1, *, *, w4)
+    n_xxxx counts (*, *, *, *), i.e. any quadgram
+    """
+
+    _n = 4
+
+    @staticmethod
+    def _contingency(n_iiii, n_iiix_tuple, n_iixx_tuple, n_ixxx_tuple, n_xxxx):
+        """Calculates values of a quadgram contingency table from
+        marginal values.
+        """
+        (n_iiix, n_iixi, n_ixii, n_xiii) = n_iiix_tuple
+        (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix) = n_iixx_tuple
+        (n_ixxx, n_xixx, n_xxix, n_xxxi) = n_ixxx_tuple
+        n_oiii = n_xiii - n_iiii
+        n_ioii = n_ixii - n_iiii
+        n_iioi = n_iixi - n_iiii
+        n_ooii = n_xxii - n_iiii - n_oiii - n_ioii
+        n_oioi = n_xixi - n_iiii - n_oiii - n_iioi
+        n_iooi = n_ixxi - n_iiii - n_ioii - n_iioi
+        n_oooi = n_xxxi - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_iooi - n_oioi
+        n_iiio = n_iiix - n_iiii
+        n_oiio = n_xiix - n_iiii - n_oiii - n_iiio
+        n_ioio = n_ixix - n_iiii - n_ioii - n_iiio
+        n_ooio = n_xxix - n_iiii - n_oiii - n_ioii - n_iiio - n_ooii - n_ioio - n_oiio
+        n_iioo = n_iixx - n_iiii - n_iioi - n_iiio
+        n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo
+        n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio
+        n_oooo = n_xxxx - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_oioi - n_iooi - \
+                 n_oooi - n_iiio - n_oiio - n_ioio - n_ooio - n_iioo - n_oioo - n_iooo
+
+        return (n_iiii, n_oiii, n_ioii, n_ooii, n_iioi,
+                n_oioi, n_iooi, n_oooi, n_iiio, n_oiio,
+                n_ioio, n_ooio, n_iioo, n_oioo, n_iooo, n_oooo)
+
+    @staticmethod
+    def _marginals(*contingency):
+        """Calculates values of contingency table marginals from its values.
+        QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
+        (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
+        """
+        n_iiii, n_oiii, n_ioii, n_ooii, n_iioi, n_oioi, n_iooi, n_oooi, n_iiio, n_oiio, n_ioio, n_ooio, \
+        n_iioo, n_oioo, n_iooo, n_oooo = contingency
+
+        n_iiix = n_iiii + n_iiio
+        n_iixi = n_iiii + n_iioi
+        n_ixii = n_iiii + n_ioii
+        n_xiii = n_iiii + n_oiii
+
+        n_iixx = n_iiii + n_iioi + n_iiio + n_iioo
+        n_ixix = n_iiii + n_ioii + n_iiio + n_ioio
+        n_ixxi = n_iiii + n_ioii + n_iioi + n_iooi
+        n_xixi = n_iiii + n_oiii + n_iioi + n_oioi
+        n_xxii = n_iiii + n_oiii + n_ioii + n_ooii
+        n_xiix = n_iiii + n_oiii + n_iiio + n_oiio
+
+        n_ixxx = n_iiii + n_ioii + n_iioi + n_iiio + n_iooi + n_iioo + n_ioio + n_iooo
+        n_xixx = n_iiii + n_oiii + n_iioi + n_iiio + n_oioi + n_oiio + n_iioo + n_oioo
+        n_xxix = n_iiii + n_oiii + n_ioii + n_iiio + n_ooii + n_ioio + n_oiio + n_ooio
+        n_xxxi = n_iiii + n_oiii + n_ioii + n_iioi + n_ooii + n_iooi + n_oioi + n_oooi
+
+        n_all = sum(contingency)
+
+        return (n_iiii,
+                (n_iiix, n_iixi, n_ixii, n_xiii),
+                (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
+                (n_ixxx, n_xixx, n_xxix, n_xxxi),
+                n_all)
+
+
+class ContingencyMeasures(object):
+    """Wraps NgramAssocMeasures classes such that the arguments of association
+    measures are contingency table values rather than marginals.
+    """
+
+    def __init__(self, measures):
+        """Constructs a ContingencyMeasures given a NgramAssocMeasures class"""
+        self.__class__.__name__ = 'Contingency' + measures.__class__.__name__
+        for k in dir(measures):
+            if k.startswith('__'):
+                continue
+            v = getattr(measures, k)
+            if not k.startswith('_'):
+                v = self._make_contingency_fn(measures, v)
+            setattr(self, k, v)
+
+    @staticmethod
+    def _make_contingency_fn(measures, old_fn):
+        """From an association measure function, produces a new function which
+        accepts contingency table values as its arguments.
+        """
+        def res(*contingency):
+            return old_fn(*measures._marginals(*contingency))
+        res.__doc__ = old_fn.__doc__
+        res.__name__ = old_fn.__name__
+        return res
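+
+# Hedged usage sketch (hypothetical counts, not part of the original module):
+# the wrapped measures take the four contingency cells (n_ii, n_oi, n_io, n_oo)
+# instead of the marginals.
+#
+#     >>> cont_bigram = ContingencyMeasures(BigramAssocMeasures())
+#     >>> cont_bigram.pmi(10, 20, 30, 9940)          # doctest: +SKIP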
diff --git a/nlp_resource_data/nltk/metrics/association.pyc b/nlp_resource_data/nltk/metrics/association.pyc
new file mode 100755 (executable)
index 0000000..aa1f11b
Binary files /dev/null and b/nlp_resource_data/nltk/metrics/association.pyc differ
diff --git a/nlp_resource_data/nltk/metrics/confusionmatrix.py b/nlp_resource_data/nltk/metrics/confusionmatrix.py
new file mode 100755 (executable)
index 0000000..611d82a
--- /dev/null
@@ -0,0 +1,206 @@
+# Natural Language Toolkit: Confusion Matrices
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+from nltk.probability import FreqDist
+from nltk.compat import python_2_unicode_compatible
+
+@python_2_unicode_compatible
+class ConfusionMatrix(object):
+    """
+    The confusion matrix between a list of reference values and a
+    corresponding list of test values.  Entry *[r,t]* of this
+    matrix is a count of the number of times that the reference value
+    *r* corresponds to the test value *t*.  E.g.:
+
+        >>> from nltk.metrics import ConfusionMatrix
+        >>> ref  = 'DET NN VB DET JJ NN NN IN DET NN'.split()
+        >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
+        >>> cm = ConfusionMatrix(ref, test)
+        >>> print(cm['NN', 'NN'])
+        3
+
+    Note that the diagonal entries *Ri=Tj* of this matrix
+    correspond to correct values, while the off-diagonal entries
+    correspond to incorrect values.
+    """
+
+    def __init__(self, reference, test, sort_by_count=False):
+        """
+        Construct a new confusion matrix from a list of reference
+        values and a corresponding list of test values.
+
+        :type reference: list
+        :param reference: An ordered list of reference values.
+        :type test: list
+        :param test: A list of values to compare against the
+            corresponding reference values.
+        :raise ValueError: If ``reference`` and ``test`` do not have
+            the same length.
+        """
+        if len(reference) != len(test):
+            raise ValueError('Lists must have the same length.')
+
+        # Get a list of all values.
+        if sort_by_count:
+            ref_fdist = FreqDist(reference)
+            test_fdist = FreqDist(test)
+            def key(v): return -(ref_fdist[v]+test_fdist[v])
+            values = sorted(set(reference+test), key=key)
+        else:
+            values = sorted(set(reference+test))
+
+        # Construct a value->index dictionary
+        indices = dict((val,i) for (i,val) in enumerate(values))
+
+        # Make a confusion matrix table.
+        confusion = [[0 for val in values] for val in values]
+        max_conf = 0 # Maximum confusion
+        for w,g in zip(reference, test):
+            confusion[indices[w]][indices[g]] += 1
+            max_conf = max(max_conf, confusion[indices[w]][indices[g]])
+
+        #: A list of all values in ``reference`` or ``test``.
+        self._values = values
+        #: A dictionary mapping values in ``self._values`` to their indices.
+        self._indices = indices
+        #: The confusion matrix itself (as a list of lists of counts).
+        self._confusion = confusion
+        #: The greatest count in ``self._confusion`` (used for printing).
+        self._max_conf = max_conf
+        #: The total number of values in the confusion matrix.
+        self._total = len(reference)
+        #: The number of correct (on-diagonal) values in the matrix.
+        self._correct = sum(confusion[i][i] for i in range(len(values)))
+
+    def __getitem__(self, li_lj_tuple):
+        """
+        :return: The number of times that value ``li`` was expected and
+        value ``lj`` was given.
+        :rtype: int
+        """
+        (li, lj) = li_lj_tuple
+        i = self._indices[li]
+        j = self._indices[lj]
+        return self._confusion[i][j]
+
+    def __repr__(self):
+        return '<ConfusionMatrix: %s/%s correct>' % (self._correct,
+                                                     self._total)
+
+    def __str__(self):
+        return self.pretty_format()
+
+    def pretty_format(self, show_percents=False, values_in_chart=True,
+           truncate=None, sort_by_count=False):
+        """
+        :return: A multi-line string representation of this confusion matrix.
+        :type truncate: int
+        :param truncate: If specified, then only show the specified
+            number of values.  Any sorting (e.g., sort_by_count)
+            will be performed before truncation.
+        :param sort_by_count: If true, then sort by the count of each
+            label in the reference data.  I.e., labels that occur more
+            frequently in the reference label will be towards the left
+            edge of the matrix, and labels that occur less frequently
+            will be towards the right edge.
+
+        @todo: add marginals?
+        """
+        confusion = self._confusion
+
+        values = self._values
+        if sort_by_count:
+            values = sorted(values, key=lambda v:
+                            -sum(self._confusion[self._indices[v]]))
+
+        if truncate:
+            values = values[:truncate]
+
+        if values_in_chart:
+            value_strings = ["%s" % val for val in values]
+        else:
+            value_strings = [str(n+1) for n in range(len(values))]
+
+        # Construct a format string for row values
+        valuelen = max(len(val) for val in value_strings)
+        value_format = '%' + repr(valuelen) + 's | '
+        # Construct a format string for matrix entries
+        if show_percents:
+            entrylen = 6
+            entry_format = '%5.1f%%'
+            zerostr = '     .'
+        else:
+            entrylen = len(repr(self._max_conf))
+            entry_format = '%' + repr(entrylen) + 'd'
+            zerostr = ' '*(entrylen-1) + '.'
+
+        # Write the column values.
+        s = ''
+        for i in range(valuelen):
+            s += (' '*valuelen)+' |'
+            for val in value_strings:
+                if i >= valuelen-len(val):
+                    s += val[i-valuelen+len(val)].rjust(entrylen+1)
+                else:
+                    s += ' '*(entrylen+1)
+            s += ' |\n'
+
+        # Write a dividing line
+        s += '%s-+-%s+\n' % ('-'*valuelen, '-'*((entrylen+1)*len(values)))
+
+        # Write the entries.
+        for val, li in zip(value_strings, values):
+            i = self._indices[li]
+            s += value_format % val
+            for lj in values:
+                j = self._indices[lj]
+                if confusion[i][j] == 0:
+                    s += zerostr
+                elif show_percents:
+                    s += entry_format % (100.0*confusion[i][j]/self._total)
+                else:
+                    s += entry_format % confusion[i][j]
+                if i == j:
+                    prevspace = s.rfind(' ')
+                    s = s[:prevspace] + '<' + s[prevspace+1:] + '>'
+                else: s += ' '
+            s += '|\n'
+
+        # Write a dividing line
+        s += '%s-+-%s+\n' % ('-'*valuelen, '-'*((entrylen+1)*len(values)))
+
+        # Write a key
+        s += '(row = reference; col = test)\n'
+        if not values_in_chart:
+            s += 'Value key:\n'
+            for i, value in enumerate(values):
+                s += '%6d: %s\n' % (i+1, value)
+
+        return s
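+
+    # Hedged, illustrative call of the options above (not part of the original
+    # module), using the ``cm`` built in the class docstring: show cell
+    # percentages, keep the three most frequent reference labels, and order
+    # the columns by frequency.
+    #
+    #     >>> print(cm.pretty_format(show_percents=True, truncate=3,
+    #     ...                        sort_by_count=True))    # doctest: +SKIP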
+
+    def key(self):
+        values = self._values
+        key_text = 'Value key:\n'
+        indexlen = len(repr(len(values)-1))
+        key_format = '  %'+repr(indexlen)+'d: %s\n'
+        for i in range(len(values)):
+            key_text += key_format % (i, values[i])
+
+        return key_text
+
+def demo():
+    reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
+    test    = 'DET VB VB DET NN NN NN IN DET NN'.split()
+    print('Reference =', reference)
+    print('Test    =', test)
+    print('Confusion matrix:')
+    print(ConfusionMatrix(reference, test))
+    print(ConfusionMatrix(reference, test).pretty_format(sort_by_count=True))
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/metrics/confusionmatrix.pyc b/nlp_resource_data/nltk/metrics/confusionmatrix.pyc
new file mode 100755 (executable)
index 0000000..decee4f
Binary files /dev/null and b/nlp_resource_data/nltk/metrics/confusionmatrix.pyc differ
diff --git a/nlp_resource_data/nltk/metrics/distance.py b/nlp_resource_data/nltk/metrics/distance.py
new file mode 100755 (executable)
index 0000000..e8957bf
--- /dev/null
@@ -0,0 +1,207 @@
+# Natural Language Toolkit: Distance Metrics
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+#         Tom Lippincott <tom@cs.columbia.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+"""
+Distance Metrics.
+
+Compute the distance between two items (usually strings).
+As metrics, they must satisfy the following three requirements:
+
+1. d(a, a) = 0
+2. d(a, b) >= 0
+3. d(a, c) <= d(a, b) + d(b, c)
+"""
+
+from __future__ import print_function
+from __future__ import division
+
+
+def _edit_dist_init(len1, len2):
+    lev = []
+    for i in range(len1):
+        lev.append([0] * len2)  # initialize 2D array to zero
+    for i in range(len1):
+        lev[i][0] = i           # column 0: 0,1,2,3,4,...
+    for j in range(len2):
+        lev[0][j] = j           # row 0: 0,1,2,3,4,...
+    return lev
+
+
+def _edit_dist_step(lev, i, j, s1, s2, substitution_cost=1, transpositions=False):
+    c1 = s1[i - 1]
+    c2 = s2[j - 1]
+
+    # skipping a character in s1
+    a = lev[i - 1][j] + 1
+    # skipping a character in s2
+    b = lev[i][j - 1] + 1
+    # substitution
+    c = lev[i - 1][j - 1] + (substitution_cost if c1 != c2 else 0)
+
+    # transposition
+    d = c + 1  # never picked by default
+    if transpositions and i > 1 and j > 1:
+        if s1[i - 2] == c2 and s2[j - 2] == c1:
+            d = lev[i - 2][j - 2] + 1
+
+    # pick the cheapest
+    lev[i][j] = min(a, b, c, d)
+
+
+def edit_distance(s1, s2, substitution_cost=1, transpositions=False):
+    """
+    Calculate the Levenshtein edit-distance between two strings.
+    The edit distance is the number of characters that need to be
+    substituted, inserted, or deleted, to transform s1 into s2.  For
+    example, transforming "rain" to "shine" requires three steps,
+    consisting of two substitutions and one insertion:
+    "rain" -> "sain" -> "shin" -> "shine".  These operations could have
+    been done in other orders, but at least three steps are needed.
+
+    Allows specifying the cost of substitution edits (e.g., "a" -> "b"),
+    because sometimes it makes sense to assign greater penalties to substitutions.
+
+    This also optionally allows transposition edits (e.g., "ab" -> "ba"),
+    though this is disabled by default.
+
+    :param s1, s2: The strings to be analysed
+    :param substitution_cost: The cost of performing a substitution edit
+    :param transpositions: Whether to allow transposition edits
+    :type s1: str
+    :type s2: str
+    :type substitution_cost: int
+    :type transpositions: bool
+    :rtype: int
+    """
+    # set up a 2-D array
+    len1 = len(s1)
+    len2 = len(s2)
+    lev = _edit_dist_init(len1 + 1, len2 + 1)
+
+    # iterate over the array
+    for i in range(len1):
+        for j in range(len2):
+            _edit_dist_step(lev, i + 1, j + 1, s1, s2,
+                            substitution_cost=substitution_cost, transpositions=transpositions)
+    return lev[len1][len2]
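+
+# Illustrative examples (not part of the original module); the values follow
+# from the algorithm above:
+#
+#     >>> edit_distance("rain", "shine")
+#     3
+#     >>> edit_distance("abcdef", "acbdef")
+#     2
+#     >>> edit_distance("abcdef", "acbdef", transpositions=True)
+#     1
+#     >>> edit_distance("rain", "shine", substitution_cost=2)
+#     5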
+
+
+def binary_distance(label1, label2):
+    """Simple equality test.
+
+    0.0 if the labels are identical, 1.0 if they are different.
+
+    >>> from nltk.metrics import binary_distance
+    >>> binary_distance(1,1)
+    0.0
+
+    >>> binary_distance(1,3)
+    1.0
+    """
+
+    return 0.0 if label1 == label2 else 1.0
+
+
+def jaccard_distance(label1, label2):
+    """Distance metric comparing set-similarity.
+
+    """
+    return (len(label1.union(label2)) - len(label1.intersection(label2)))/len(label1.union(label2))
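+
+# Illustrative example (not part of the original module): the union has five
+# members and the intersection two, giving (5 - 2) / 5.
+#
+#     >>> jaccard_distance(set([1, 2, 3, 4]), set([3, 4, 5]))
+#     0.6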
+
+
+def masi_distance(label1, label2):
+    """Distance metric that takes into account partial agreement when multiple
+    labels are assigned.
+
+    >>> from nltk.metrics import masi_distance
+    >>> masi_distance(set([1, 2]), set([1, 2, 3, 4]))
+    0.335
+
+    Passonneau 2006, Measuring Agreement on Set-Valued Items (MASI)
+    for Semantic and Pragmatic Annotation.
+    """
+
+    len_intersection = len(label1.intersection(label2))
+    len_union = len(label1.union(label2))
+    len_label1 = len(label1)
+    len_label2 = len(label2)
+    if len_label1 == len_label2 and len_label1 == len_intersection:
+        m = 1
+    elif len_intersection == min(len_label1, len_label2):
+        m = 0.67
+    elif len_intersection > 0:
+        m = 0.33
+    else:
+        m = 0
+
+    return (1 - (len_intersection / float(len_union))) * m
+
+
+def interval_distance(label1,label2):
+    """Krippendorff's interval distance metric
+
+    >>> from nltk.metrics import interval_distance
+    >>> interval_distance(1,10)
+    81
+
+    Krippendorff 1980, Content Analysis: An Introduction to its Methodology
+    """
+
+    try:
+        return pow(label1 - label2, 2)
+#        return pow(list(label1)[0]-list(label2)[0],2)
+    except TypeError:
+        print("non-numeric labels not supported with interval distance")
+
+
+def presence(label):
+    """Higher-order function to test presence of a given label
+    """
+
+    return lambda x, y: 1.0 * ((label in x) == (label in y))
+
+
+def fractional_presence(label):
+    return lambda x, y:\
+        abs(((1.0 / len(x)) - (1.0 / len(y)))) * (label in x and label in y) \
+        or 0.0 * (label not in x and label not in y) \
+        or abs((1.0 / len(x))) * (label in x and label not in y) \
+        or ((1.0 / len(y))) * (label not in x and label in y)
+
+
+def custom_distance(file):
+    data = {}
+    with open(file, 'r') as infile:
+        for l in infile:
+            labelA, labelB, dist = l.strip().split("\t")
+            labelA = frozenset([labelA])
+            labelB = frozenset([labelB])
+            data[frozenset([labelA,labelB])] = float(dist)
+    return lambda x,y:data[frozenset([x,y])]
+
+
+def demo():
+    edit_distance_examples = [
+        ("rain", "shine"), ("abcdef", "acbdef"), ("language", "lnaguaeg"),
+        ("language", "lnaugage"), ("language", "lngauage")]
+    for s1, s2 in edit_distance_examples:
+        print("Edit distance between '%s' and '%s':" % (s1, s2), edit_distance(s1, s2))
+    for s1, s2 in edit_distance_examples:
+        print("Edit distance with transpositions between '%s' and '%s':" % (s1, s2), edit_distance(s1, s2, transpositions=True))
+
+    s1 = set([1, 2, 3, 4])
+    s2 = set([3, 4, 5])
+    print("s1:", s1)
+    print("s2:", s2)
+    print("Binary distance:", binary_distance(s1, s2))
+    print("Jaccard distance:", jaccard_distance(s1, s2))
+    print("MASI distance:", masi_distance(s1, s2))
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/metrics/distance.pyc b/nlp_resource_data/nltk/metrics/distance.pyc
new file mode 100755 (executable)
index 0000000..a81fe5d
Binary files /dev/null and b/nlp_resource_data/nltk/metrics/distance.pyc differ
diff --git a/nlp_resource_data/nltk/metrics/paice.py b/nlp_resource_data/nltk/metrics/paice.py
new file mode 100755 (executable)
index 0000000..d23e4b5
--- /dev/null
@@ -0,0 +1,381 @@
+# Natural Language Toolkit: Agreement Metrics
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Lauri Hallila <laurihallila@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+"""Counts Paice's performance statistics for evaluating stemming algorithms.
+
+What is required:
+ - A dictionary of words grouped by their real lemmas
+ - A dictionary of words grouped by stems from a stemming algorithm
+
+When these are given, Understemming Index (UI), Overstemming Index (OI),
+Stemming Weight (SW) and Error-rate relative to truncation (ERRT) are counted.
+
+References:
+Chris D. Paice (1994). An evaluation method for stemming algorithms.
+In Proceedings of SIGIR, 42--50.
+"""
+
+from math import sqrt
+
+
+def get_words_from_dictionary(lemmas):
+    '''
+    Get original set of words used for analysis.
+
+    :param lemmas: A dictionary where keys are lemmas and values are sets
+    or lists of words corresponding to that lemma.
+    :type lemmas: dict(str): list(str)
+    :return: Set of words that exist as values in the dictionary
+    :rtype: set(str)
+    '''
+    words = set()
+    for lemma in lemmas:
+        words.update(set(lemmas[lemma]))
+    return words
+
+
+def _truncate(words, cutlength):
+    '''Group words by stems defined by truncating them at given length.
+
+    :param words: Set of words used for analysis
+    :param cutlength: Words are stemmed by cutting at this length.
+    :type words: set(str) or list(str)
+    :type cutlength: int
+    :return: Dictionary where keys are stems and values are sets of words
+    corresponding to that stem.
+    :rtype: dict(str): set(str)
+    '''
+    stems = {}
+    for word in words:
+        stem = word[:cutlength]
+        try:
+            stems[stem].update([word])
+        except KeyError:
+            stems[stem] = set([word])
+    return stems
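+
+# Illustrative example (not part of the original module): truncating at
+# length 4 groups these words under the single stem 'conn' (set ordering in
+# the output is arbitrary).
+#
+#     >>> _truncate(['connect', 'connected', 'connection'], 4)  # doctest: +SKIP
+#     {'conn': {'connect', 'connected', 'connection'}}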
+
+
+# Reference: http://en.wikipedia.org/wiki/Line-line_intersection
+def _count_intersection(l1, l2):
+    '''Compute the intersection of two line segments defined by coordinate pairs.
+
+    :param l1: Tuple of two coordinate pairs defining the first line segment
+    :param l2: Tuple of two coordinate pairs defining the second line segment
+    :type l1: tuple(tuple(float, float), tuple(float, float))
+    :type l2: tuple(tuple(float, float), tuple(float, float))
+    :return: Coordinates of the intersection
+    :rtype: tuple(float, float)
+    '''
+    x1, y1 = l1[0]
+    x2, y2 = l1[1]
+    x3, y3 = l2[0]
+    x4, y4 = l2[1]
+
+    denominator = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4)
+
+    if denominator == 0.0: # lines are parallel
+        if x1 == x2 == x3 == x4 == 0.0:
+            # When lines are parallel, they must be on the y-axis.
+            # We can ignore x-axis because we stop counting the
+            # truncation line when we get there.
+            # There are no other options as UI (x-axis) grows and
+            # OI (y-axis) diminishes when we go along the truncation line.
+            return (0.0, y4)
+
+    x = ((x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4)) / denominator
+    y = ((x1 * y2 - y1 * x2) * (y3 - y4) - (y1 - y2) * (x3 * y4 - y3 * x4)) / denominator
+    return (x, y)
+
+
+def _get_derivative(coordinates):
+    '''Get derivative of the line from (0,0) to given coordinates.
+
+    :param coordinates: A coordinate pair
+    :type coordinates: tuple(float, float)
+    :return: Derivative; inf if x is zero
+    :rtype: float
+    '''
+    try:
+        return coordinates[1] / coordinates[0]
+    except ZeroDivisionError:
+        return float('inf')
+
+
+def _calculate_cut(lemmawords, stems):
+    '''Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.
+
+    :param lemmawords: Set or list of words corresponding to certain lemma.
+    :param stems: A dictionary where keys are stems and values are sets
+    or lists of words corresponding to that stem.
+    :type lemmawords: set(str) or list(str)
+    :type stems: dict(str): set(str)
+    :return: Number of understemmed and overstemmed pairs contributed by words
+    existing in both lemmawords and stems.
+    :rtype: tuple(float, float)
+    '''
+    umt, wmt = 0.0, 0.0
+    for stem in stems:
+        cut = set(lemmawords) & set(stems[stem])
+        if cut:
+            cutcount = len(cut)
+            stemcount = len(stems[stem])
+            # Unachieved merge total
+            umt += cutcount * (len(lemmawords) - cutcount)
+            # Wrongly merged total
+            wmt += cutcount * (stemcount - cutcount)
+    return (umt, wmt)
+
+
+def _calculate(lemmas, stems):
+    '''Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.
+
+    :param lemmas: A dictionary where keys are lemmas and values are sets
+    or lists of words corresponding to that lemma.
+    :param stems: A dictionary where keys are stems and values are sets
+    or lists of words corresponding to that stem.
+    :type lemmas: dict(str): list(str)
+    :type stems: dict(str): set(str)
+    :return: Global unachieved merge total (gumt),
+    global desired merge total (gdmt),
+    global wrongly merged total (gwmt) and
+    global desired non-merge total (gdnt).
+    :rtype: tuple(float, float, float, float)
+    '''
+
+    n = sum(len(lemmas[word]) for word in lemmas)
+
+    gdmt, gdnt, gumt, gwmt = (0.0, 0.0, 0.0, 0.0)
+
+    for lemma in lemmas:
+        lemmacount = len(lemmas[lemma])
+
+        # Desired merge total
+        gdmt += lemmacount * (lemmacount - 1)
+
+        # Desired non-merge total
+        gdnt += lemmacount * (n - lemmacount)
+
+        # For each (lemma, stem) pair with common words, count how many
+        # pairs are understemmed and overstemmed.
+        umt, wmt = _calculate_cut(lemmas[lemma], stems)
+
+        # Add to the global unachieved and wrongly-merged totals
+        gumt += umt
+        gwmt += wmt
+
+    # Each object is counted twice, so divide by two
+    return (gumt / 2, gdmt / 2, gwmt / 2, gdnt / 2)
+
+
+def _indexes(gumt, gdmt, gwmt, gdnt):
+    '''Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).
+
+    :param gumt, gdmt, gwmt, gdnt: Global unachieved merge total (gumt),
+    global desired merge total (gdmt),
+    global wrongly merged total (gwmt) and
+    global desired non-merge total (gdnt).
+    :type gumt, gdmt, gwmt, gdnt: float
+    :return: Understemming Index (UI),
+    Overstemming Index (OI) and
+    Stemming Weight (SW).
+    :rtype: tuple(float, float, float)
+    '''
+    # Calculate Understemming Index (UI),
+    # Overstemming Index (OI) and Stemming Weight (SW)
+    try:
+        ui = gumt / gdmt
+    except ZeroDivisionError:
+        # If GDMT (max merge total) is 0, define UI as 0
+        ui = 0.0
+    try:
+        oi = gwmt / gdnt
+    except ZeroDivisionError:
+        # If GDNT (max non-merge total) is 0, define OI as 0
+        oi = 0.0
+    try:
+        sw = oi / ui
+    except ZeroDivisionError:
+        if oi == 0.0:
+            # OI and UI are 0, define SW as 'not a number'
+            sw = float('nan')
+        else:
+            # UI is 0, define SW as infinity
+            sw = float('inf')
+    return (ui, oi, sw)
+
+
+class Paice(object):
+    '''Class for storing lemmas, stems and evaluation metrics.'''
+    def __init__(self, lemmas, stems):
+        '''
+        :param lemmas: A dictionary where keys are lemmas and values are sets
+        or lists of words corresponding to that lemma.
+        :param stems: A dictionary where keys are stems and values are sets
+        or lists of words corresponding to that stem.
+        :type lemmas: dict(str): list(str)
+        :type stems: dict(str): set(str)
+        '''
+        self.lemmas = lemmas
+        self.stems = stems
+        self.coords = []
+        self.gumt, self.gdmt, self.gwmt, self.gdnt = (None, None, None, None)
+        self.ui, self.oi, self.sw = (None, None, None)
+        self.errt = None
+        self.update()
+
+    def __str__(self):
+        text = ['Global Unachieved Merge Total (GUMT): %s\n' % self.gumt]
+        text.append('Global Desired Merge Total (GDMT): %s\n' % self.gdmt)
+        text.append('Global Wrongly-Merged Total (GWMT): %s\n' % self.gwmt)
+        text.append('Global Desired Non-merge Total (GDNT): %s\n' % self.gdnt)
+        text.append('Understemming Index (GUMT / GDMT): %s\n' % self.ui)
+        text.append('Overstemming Index (GWMT / GDNT): %s\n' % self.oi)
+        text.append('Stemming Weight (OI / UI): %s\n' % self.sw)
+        text.append('Error-Rate Relative to Truncation (ERRT): %s\r\n' % self.errt)
+        coordinates = ' '.join(['(%s, %s)' % item for item in self.coords])
+        text.append('Truncation line: %s' % coordinates)
+        return ''.join(text)
+
+    def _get_truncation_indexes(self, words, cutlength):
+        '''Count (UI, OI) when stemming is done by truncating words at \'cutlength\'.
+
+        :param words: Words used for the analysis
+        :param cutlength: Words are stemmed by cutting them at this length
+        :type words: set(str) or list(str)
+        :type cutlength: int
+        :return: Understemming and overstemming indexes
+        :rtype: tuple(float, float)
+        '''
+
+        truncated = _truncate(words, cutlength)
+        gumt, gdmt, gwmt, gdnt = _calculate(self.lemmas, truncated)
+        ui, oi = _indexes(gumt, gdmt, gwmt, gdnt)[:2]
+        return (ui, oi)
+
+    def _get_truncation_coordinates(self, cutlength=0):
+        '''Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.
+
+        :param cutlength: Optional parameter to start counting from (ui, oi)
+        coordinates obtained by stemming at this length. Useful for speeding up
+        the calculations when you know the approximate location of the
+        intersection.
+        :type cutlength: int
+        :return: List of coordinate pairs that define the truncation line
+        :rtype: list(tuple(float, float))
+        '''
+        words = get_words_from_dictionary(self.lemmas)
+        maxlength = max(len(word) for word in words)
+
+        # Truncate words from different points until (0, 0) - (ui, oi) segment crosses the truncation line
+        coords = []
+        while cutlength <= maxlength:
+            # Get (UI, OI) pair of current truncation point
+            pair = self._get_truncation_indexes(words, cutlength)
+
+            # Store only new coordinates so we'll have an actual
+            # line segment when counting the intersection point
+            if pair not in coords:
+                coords.append(pair)
+            if pair == (0.0, 0.0):
+                # Stop counting if the truncation line goes through the origin;
+                # the distance from the origin to the truncation line is 0
+                return coords
+            if len(coords) >= 2 and pair[0] > 0.0:
+                derivative1 = _get_derivative(coords[-2])
+                derivative2 = _get_derivative(coords[-1])
+                # Derivative of the truncation line is a decreasing value;
+                # when it passes Stemming Weight, we've found the segment
+                # of truncation line intersecting with (0, 0) - (ui, oi) segment
+                if derivative1 >= self.sw >= derivative2:
+                    return coords
+            cutlength += 1
+        return coords
+
+    def _errt(self):
+        '''Count Error-Rate Relative to Truncation (ERRT).
+
+        :return: ERRT, length of the line from the origin to (UI, OI) divided by
+        the length of the line from the origin to the point defined by the same
+        line when extended until the truncation line.
+        :rtype: float
+        '''
+        # Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line
+        self.coords = self._get_truncation_coordinates()
+        if (0.0, 0.0) in self.coords:
+            # Truncation line goes through the origin, so ERRT cannot be counted
+            if (self.ui, self.oi) != (0.0, 0.0):
+                return float('inf')
+            else:
+                return float('nan')
+        if (self.ui, self.oi) == (0.0, 0.0):
+            # (ui, oi) is the origin; define errt as 0.0
+            return 0.0
+        # Count the intersection point
+        # Note that (self.ui, self.oi) cannot be (0.0, 0.0) and self.coords has different coordinates
+        # so we have actual line segments instead of a line segment and a point
+        intersection = _count_intersection(((0, 0), (self.ui, self.oi)),
+                                           self.coords[-2:]
+                                           )
+        # Count OP (length of the line from the origin to (ui, oi))
+        op = sqrt(self.ui ** 2 + self.oi ** 2)
+        # Count OT (length of the line from the origin to the truncation line that goes through (ui, oi))
+        ot = sqrt(intersection[0] ** 2 + intersection[1] ** 2)
+        # OP / OT tells how well the stemming algorithm works compared to just truncating words
+        return op / ot
+
+    def update(self):
+        '''Update statistics after lemmas and stems have been set.'''
+        self.gumt, self.gdmt, self.gwmt, self.gdnt = _calculate(self.lemmas, self.stems)
+        self.ui, self.oi, self.sw = _indexes(self.gumt, self.gdmt, self.gwmt, self.gdnt)
+        self.errt = self._errt()
+
+
+def demo():
+    '''Demonstration of the module.'''
+    # Some words with their real lemmas
+    lemmas = {'kneel': ['kneel', 'knelt'],
+              'range': ['range', 'ranged'],
+              'ring': ['ring', 'rang', 'rung']
+              }
+    # Same words with stems from a stemming algorithm
+    stems = {'kneel': ['kneel'],
+             'knelt': ['knelt'],
+             'rang': ['rang', 'range', 'ranged'],
+             'ring': ['ring'],
+             'rung': ['rung']
+             }
+    print('Words grouped by their lemmas:')
+    for lemma in sorted(lemmas):
+        print('%s => %s' % (lemma, ' '.join(lemmas[lemma])))
+    print()
+    print('Same words grouped by a stemming algorithm:')
+    for stem in sorted(stems):
+        print('%s => %s' % (stem, ' '.join(stems[stem])))
+    print()
+    p = Paice(lemmas, stems)
+    print(p)
+    print()
+    # Let's "change" results from a stemming algorithm
+    stems = {'kneel': ['kneel'],
+             'knelt': ['knelt'],
+             'rang': ['rang'],
+             'range': ['range', 'ranged'],
+             'ring': ['ring'],
+             'rung': ['rung']
+             }
+    print('Counting stats after changing stemming results:')
+    for stem in sorted(stems):
+        print('%s => %s' % (stem, ' '.join(stems[stem])))
+    print()
+    p.stems = stems
+    p.update()
+    print(p)
+
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/metrics/paice.pyc b/nlp_resource_data/nltk/metrics/paice.pyc
new file mode 100755 (executable)
index 0000000..bbcfd69
Binary files /dev/null and b/nlp_resource_data/nltk/metrics/paice.pyc differ
diff --git a/nlp_resource_data/nltk/metrics/scores.py b/nlp_resource_data/nltk/metrics/scores.py
new file mode 100755 (executable)
index 0000000..ad78cc8
--- /dev/null
@@ -0,0 +1,228 @@
+# Natural Language Toolkit: Evaluation
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, division
+
+from math import fabs
+import operator
+from random import shuffle
+from functools import reduce
+
+from six.moves import range, zip
+
+try:
+    from scipy.stats.stats import betai
+except ImportError:
+    betai = None
+
+from nltk.util import LazyConcatenation, LazyMap
+
+def accuracy(reference, test):
+    """
+    Given a list of reference values and a corresponding list of test
+    values, return the fraction of corresponding values that are
+    equal.  In particular, return the fraction of indices
+    ``0<=i<len(test)`` such that ``test[i] == reference[i]``.
+
+    :type reference: list
+    :param reference: An ordered list of reference values.
+    :type test: list
+    :param test: A list of values to compare against the corresponding
+        reference values.
+    :raise ValueError: If ``reference`` and ``test`` do not have the
+        same length.
+    """
+    if len(reference) != len(test):
+        raise ValueError("Lists must have the same length.")
+    return sum(x == y for x, y in zip(reference, test)) / len(test)
+
+def precision(reference, test):
+    """
+    Given a set of reference values and a set of test values, return
+    the fraction of test values that appear in the reference set.
+    In particular, return card(``reference`` intersection ``test``)/card(``test``).
+    If ``test`` is empty, then return None.
+
+    :type reference: set
+    :param reference: A set of reference values.
+    :type test: set
+    :param test: A set of values to compare against the reference set.
+    :rtype: float or None
+    """
+    if (not hasattr(reference, 'intersection') or
+        not hasattr(test, 'intersection')):
+        raise TypeError('reference and test should be sets')
+
+    if len(test) == 0:
+        return None
+    else:
+        return len(reference.intersection(test)) / len(test)
+
+def recall(reference, test):
+    """
+    Given a set of reference values and a set of test values, return
+    the fraction of reference values that appear in the test set.
+    In particular, return card(``reference`` intersection ``test``)/card(``reference``).
+    If ``reference`` is empty, then return None.
+
+    :type reference: set
+    :param reference: A set of reference values.
+    :type test: set
+    :param test: A set of values to compare against the reference set.
+    :rtype: float or None
+    """
+    if (not hasattr(reference, 'intersection') or
+        not hasattr(test, 'intersection')):
+        raise TypeError('reference and test should be sets')
+
+    if len(reference) == 0:
+        return None
+    else:
+        return len(reference.intersection(test)) / len(reference)
+
+def f_measure(reference, test, alpha=0.5):
+    """
+    Given a set of reference values and a set of test values, return
+    the f-measure of the test values, when compared against the
+    reference values.  The f-measure is the harmonic mean of the
+    ``precision`` and ``recall``, weighted by ``alpha``.  In particular,
+    given the precision *p* and recall *r* defined by:
+
+    - *p* = card(``reference`` intersection ``test``)/card(``test``)
+    - *r* = card(``reference`` intersection ``test``)/card(``reference``)
+
+    The f-measure is:
+
+    - *1/(alpha/p + (1-alpha)/r)*
+
+    If either ``reference`` or ``test`` is empty, then ``f_measure``
+    returns None.
+
+    :type reference: set
+    :param reference: A set of reference values.
+    :type test: set
+    :param test: A set of values to compare against the reference set.
+    :rtype: float or None
+    """
+    p = precision(reference, test)
+    r = recall(reference, test)
+    if p is None or r is None:
+        return None
+    if p == 0 or r == 0:
+        return 0
+    return 1.0 / (alpha / p + (1-alpha) / r)
+
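+# Illustrative sketch (not part of the upstream module): with two of the three
+# reference items retrieved plus one spurious item, precision and recall are
+# both 2/3, so the balanced (alpha=0.5) f-measure is 2/3 as well.
+#
+#     >>> round(f_measure({'a', 'b', 'c'}, {'b', 'c', 'd'}), 4)
+#     0.6667
+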
+def log_likelihood(reference, test):
+    """
+    Given a list of reference values and a corresponding list of test
+    probability distributions, return the average log likelihood of
+    the reference values, given the probability distributions.
+
+    :param reference: A list of reference values
+    :type reference: list
+    :param test: A list of probability distributions over values to
+        compare against the corresponding reference values.
+    :type test: list(ProbDistI)
+    """
+    if len(reference) != len(test):
+        raise ValueError("Lists must have the same length.")
+
+    # Return the average value of dist.logprob(val).
+    total_likelihood = sum(dist.logprob(val)
+                            for (val, dist) in zip(reference, test))
+    return total_likelihood / len(reference)
+
+def approxrand(a, b, **kwargs):
+    """
+    Returns an approximate significance level between two lists of
+    independently generated test values.
+
+    Approximate randomization calculates significance by randomly drawing
+    from a sample of the possible permutations. At the limit of the number
+    of possible permutations, the significance level is exact. The
+    approximate significance level is the proportion of shuffles in which the
+    statistic of the permuted lists is at least as large as the actual
+    statistic of the unpermuted argument lists.
+
+    :return: a tuple containing an approximate significance level, the count
+             of the number of times the pseudo-statistic varied from the
+             actual statistic, and the number of shuffles
+    :rtype: tuple
+    :param a: a list of test values
+    :type a: list
+    :param b: another list of independently generated test values
+    :type b: list
+    """
+    shuffles = kwargs.get('shuffles', 999)
+    # there's no point in trying to shuffle beyond all possible permutations
+    shuffles = \
+        min(shuffles, reduce(operator.mul, range(1, len(a) + len(b) + 1)))
+    stat = kwargs.get('statistic', lambda lst: sum(lst) / len(lst))
+    verbose = kwargs.get('verbose', False)
+
+    if verbose:
+        print('shuffles: %d' % shuffles)
+
+    actual_stat = fabs(stat(a) - stat(b))
+
+    if verbose:
+        print('actual statistic: %f' % actual_stat)
+        print('-' * 60)
+
+    c = 1e-100
+    lst = LazyConcatenation([a, b])
+    indices = list(range(len(a) + len(b)))
+
+    for i in range(shuffles):
+        if verbose and i % 10 == 0:
+            print('shuffle: %d' % i)
+
+        shuffle(indices)
+
+        pseudo_stat_a = stat(LazyMap(lambda i: lst[i], indices[:len(a)]))
+        pseudo_stat_b = stat(LazyMap(lambda i: lst[i], indices[len(a):]))
+        pseudo_stat = fabs(pseudo_stat_a - pseudo_stat_b)
+
+        if pseudo_stat >= actual_stat:
+            c += 1
+
+        if verbose and i % 10 == 0:
+            print('pseudo-statistic: %f' % pseudo_stat)
+            print('significance: %f' % ((c + 1) / (i + 1)))
+            print('-' * 60)
+
+    significance = (c + 1) / (shuffles + 1)
+
+    if verbose:
+        print('significance: %f' % significance)
+        if betai:
+            for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
+                print("prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi)))
+
+    return (significance, c, shuffles)
+
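+# Illustrative sketch (not part of the upstream module): comparing the means of
+# two small samples with a limited number of shuffles. The exact value depends
+# on the random shuffles, so only its range is checked here.
+#
+#     >>> sig, count, n_shuffles = approxrand([9, 8, 7, 6], [1, 2, 3, 4], shuffles=99)
+#     >>> 0.0 < sig <= 1.0
+#     True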
+
+def demo():
+    print('-'*75)
+    reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
+    test    = 'DET VB VB DET NN NN NN IN DET NN'.split()
+    print('Reference =', reference)
+    print('Test    =', test)
+    print('Accuracy:', accuracy(reference, test))
+
+    print('-'*75)
+    reference_set = set(reference)
+    test_set = set(test)
+    print('Reference =', reference_set)
+    print('Test =   ', test_set)
+    print('Precision:', precision(reference_set, test_set))
+    print('   Recall:', recall(reference_set, test_set))
+    print('F-Measure:', f_measure(reference_set, test_set))
+    print('-'*75)
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/metrics/scores.pyc b/nlp_resource_data/nltk/metrics/scores.pyc
new file mode 100755 (executable)
index 0000000..394df8b
Binary files /dev/null and b/nlp_resource_data/nltk/metrics/scores.pyc differ
diff --git a/nlp_resource_data/nltk/metrics/segmentation.py b/nlp_resource_data/nltk/metrics/segmentation.py
new file mode 100755 (executable)
index 0000000..668f68e
--- /dev/null
@@ -0,0 +1,231 @@
+# Natural Language Toolkit: Text Segmentation Metrics
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+#         David Doukhan <david.doukhan@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+
+
+"""
+Text Segmentation Metrics
+
+1. Windowdiff
+
+Pevzner, L., and Hearst, M., A Critique and Improvement of
+  an Evaluation Metric for Text Segmentation,
+Computational Linguistics 28, 19-36
+
+
+2. Generalized Hamming Distance
+
+Bookstein A., Kulyukin V.A., Raita T.
+Generalized Hamming Distance
+Information Retrieval 5, 2002, pp 353-375
+
+Baseline implementation in C++
+http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html
+
+Study describing benefits of Generalized Hamming Distance Versus
+WindowDiff for evaluating text segmentation tasks
+Bestgen, Y.  Quel indice pour mesurer l'efficacite en segmentation de textes ?
+TALN 2009
+
+
+3. Pk text segmentation metric
+
+Beeferman D., Berger A., Lafferty J. (1999)
+Statistical Models for Text Segmentation
+Machine Learning, 34, 177-210
+"""
+
+try:
+    import numpy as np
+except ImportError:
+    pass
+
+from six.moves import range
+
+
+def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
+    """
+    Compute the windowdiff score for a pair of segmentations.  A
+    segmentation is any sequence over a vocabulary of two items
+    (e.g. "0", "1"), where the specified boundary value is used to
+    mark the edge of a segmentation.
+
+        >>> s1 = "000100000010"
+        >>> s2 = "000010000100"
+        >>> s3 = "100000010000"
+        >>> '%.2f' % windowdiff(s1, s1, 3)
+        '0.00'
+        >>> '%.2f' % windowdiff(s1, s2, 3)
+        '0.30'
+        >>> '%.2f' % windowdiff(s2, s3, 3)
+        '0.80'
+
+    :param seg1: a segmentation
+    :type seg1: str or list
+    :param seg2: a segmentation
+    :type seg2: str or list
+    :param k: window width
+    :type k: int
+    :param boundary: boundary value
+    :type boundary: str or int or bool
+    :param weighted: use the weighted variant of windowdiff
+    :type weighted: boolean
+    :rtype: float
+    """
+
+    if len(seg1) != len(seg2):
+        raise ValueError("Segmentations have unequal length")
+    if k > len(seg1):
+        raise ValueError("Window width k should be smaller or equal than segmentation lengths")
+    wd = 0
+    for i in range(len(seg1) - k + 1):
+        ndiff = abs(seg1[i:i+k].count(boundary) - seg2[i:i+k].count(boundary))
+        if weighted:
+            wd += ndiff
+        else:
+            wd += min(1, ndiff)
+    return wd / (len(seg1) - k + 1.)
+
+
+
+# Generalized Hamming Distance
+
+def _init_mat(nrows, ncols, ins_cost, del_cost):
+    mat = np.empty((nrows, ncols))
+    mat[0, :] = ins_cost * np.arange(ncols)
+    mat[:, 0] = del_cost * np.arange(nrows)
+    return mat
+
+
+def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff):
+    for i, rowi in enumerate(rowv):
+        for j, colj in enumerate(colv):
+            shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j]
+            if rowi == colj:
+                # boundaries are at the same location, no transformation required
+                tcost = mat[i, j]
+            elif rowi > colj:
+                # boundary match through a deletion
+                tcost = del_cost + mat[i, j + 1]
+            else:
+                # boundary match through an insertion
+                tcost = ins_cost + mat[i + 1, j]
+            mat[i + 1, j + 1] = min(tcost, shift_cost)
+
+
+def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary='1'):
+    """
+    Compute the Generalized Hamming Distance for a reference and a hypothetical
+    segmentation, corresponding to the cost related to the transformation
+    of the hypothetical segmentation into the reference segmentation
+    through boundary insertion, deletion and shift operations.
+
+    A segmentation is any sequence over a vocabulary of two items
+    (e.g. "0", "1"), where the specified boundary value is used to
+    mark the edge of a segmentation.
+
+    Recommended parameter values are a shift_cost_coeff of 2, with ins_cost
+    and del_cost both equal to the mean segment length in the reference
+    segmentation.
+
+        >>> # Same examples as Kulyukin C++ implementation
+        >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
+        0.5
+        >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
+        2.0
+        >>> ghd('011', '110', 1.0, 1.0, 0.5)
+        1.0
+        >>> ghd('1', '0', 1.0, 1.0, 0.5)
+        1.0
+        >>> ghd('111', '000', 1.0, 1.0, 0.5)
+        3.0
+        >>> ghd('000', '111', 1.0, 2.0, 0.5)
+        6.0
+
+    :param ref: the reference segmentation
+    :type ref: str or list
+    :param hyp: the hypothetical segmentation
+    :type hyp: str or list
+    :param ins_cost: insertion cost
+    :type ins_cost: float
+    :param del_cost: deletion cost
+    :type del_cost: float
+    :param shift_cost_coeff: constant used to compute the cost of a shift.
+    shift cost = shift_cost_coeff * |i - j| where i and j are
+    the positions indicating the shift
+    :type shift_cost_coeff: float
+    :param boundary: boundary value
+    :type boundary: str or int or bool
+    :rtype: float
+    """
+
+    ref_idx = [i for (i, val) in enumerate(ref) if val == boundary]
+    hyp_idx = [i for (i, val) in enumerate(hyp) if val == boundary]
+
+    nref_bound = len(ref_idx)
+    nhyp_bound = len(hyp_idx)
+
+    if nref_bound == 0 and nhyp_bound == 0:
+        return 0.0
+    elif nref_bound > 0 and nhyp_bound == 0:
+        return nref_bound * ins_cost
+    elif nref_bound == 0 and nhyp_bound > 0:
+        return nhyp_bound * del_cost
+
+    mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost)
+    _ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff)
+    return mat[-1, -1]
+
+
+# Beeferman's Pk text segmentation evaluation metric
+
+def pk(ref, hyp, k=None, boundary='1'):
+    """
+    Compute the Pk metric for a pair of segmentations.  A segmentation
+    is any sequence over a vocabulary of two items (e.g. "0", "1"),
+    where the specified boundary value is used to mark the edge of a
+    segmentation.
+
+    >>> '%.2f' % pk('0100'*100, '1'*400, 2)
+    '0.50'
+    >>> '%.2f' % pk('0100'*100, '0'*400, 2)
+    '0.50'
+    >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
+    '0.00'
+
+    :param ref: the reference segmentation
+    :type ref: str or list
+    :param hyp: the segmentation to evaluate
+    :type hyp: str or list
+    :param k: window size; if None, set to half of the average reference segment length
+    :type k: int
+    :param boundary: boundary value
+    :type boundary: str or int or bool
+    :rtype: float
+    """
+
+    if k is None:
+        k = int(round(len(ref) / (ref.count(boundary) * 2.)))
+
+    err = 0
+    for i in range(len(ref) - k + 1):
+        r = ref[i:i+k].count(boundary) > 0
+        h = hyp[i:i+k].count(boundary) > 0
+        if r != h:
+            err += 1
+    return err / (len(ref) - k + 1.)
+
+
+# skip doctests if numpy is not installed
+def setup_module(module):
+    from nose import SkipTest
+    try:
+        import numpy
+    except ImportError:
+        raise SkipTest("numpy is required for nltk.metrics.segmentation")
diff --git a/nlp_resource_data/nltk/metrics/segmentation.pyc b/nlp_resource_data/nltk/metrics/segmentation.pyc
new file mode 100755 (executable)
index 0000000..839321d
Binary files /dev/null and b/nlp_resource_data/nltk/metrics/segmentation.pyc differ
diff --git a/nlp_resource_data/nltk/metrics/spearman.py b/nlp_resource_data/nltk/metrics/spearman.py
new file mode 100755 (executable)
index 0000000..07b158e
--- /dev/null
@@ -0,0 +1,69 @@
+# Natural Language Toolkit: Spearman Rank Correlation
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Joel Nothman <jnothman@student.usyd.edu.au>
+# URL: <http://nltk.org>
+# For license information, see LICENSE.TXT
+from __future__ import division
+
+"""
+Tools for comparing ranked lists.
+"""
+
+def _rank_dists(ranks1, ranks2):
+    """Finds the difference between the values in ranks1 and ranks2 for keys
+    present in both dicts. If the arguments are not dicts, they are converted
+    from (key, rank) sequences.
+    """
+    ranks1 = dict(ranks1)
+    ranks2 = dict(ranks2)
+    for k in ranks1:
+        try:
+            yield k, ranks1[k] - ranks2[k]
+        except KeyError:
+            pass
+
+
+def spearman_correlation(ranks1, ranks2):
+    """Returns the Spearman correlation coefficient for two rankings, which
+    should be dicts or sequences of (key, rank). The coefficient ranges from
+    -1.0 (ranks are opposite) to 1.0 (ranks are identical), and is only
+    calculated for keys in both rankings (for meaningful results, remove keys
+    present in only one list before ranking)."""
+    n = 0
+    res = 0
+    for k, d in _rank_dists(ranks1, ranks2):
+        res += d * d
+        n += 1
+    try:
+        return 1 - (6 * res / (n * (n*n - 1)))
+    except ZeroDivisionError:
+        # Result is undefined if only one item is ranked
+        return 0.0
+
+
+def ranks_from_sequence(seq):
+    """Given a sequence, yields each element with an increasing rank, suitable
+    for use as an argument to ``spearman_correlation``.
+    """
+    return ((k, i) for i, k in enumerate(seq))
+
+
+def ranks_from_scores(scores, rank_gap=1e-15):
+    """Given a sequence of (key, score) tuples, yields each key with an
+    increasing rank, tying with previous key's rank if the difference between
+    their scores is less than rank_gap. Suitable for use as an argument to
+    ``spearman_correlation``.
+    """
+    prev_score = None
+    rank = 0
+    for i, (key, score) in enumerate(scores):
+        try:
+            if abs(score - prev_score) > rank_gap:
+                rank = i
+        except TypeError:
+            pass
+
+        yield key, rank
+        prev_score = score
+
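+
+# Illustrative sketch (not part of the upstream module): identical rankings
+# yield a coefficient of 1.0, fully reversed rankings yield -1.0.
+#
+#     >>> seq = ['a', 'b', 'c', 'd']
+#     >>> spearman_correlation(ranks_from_sequence(seq), ranks_from_sequence(seq))
+#     1.0
+#     >>> spearman_correlation(ranks_from_sequence(seq),
+#     ...                      ranks_from_sequence(reversed(seq)))
+#     -1.0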
diff --git a/nlp_resource_data/nltk/metrics/spearman.pyc b/nlp_resource_data/nltk/metrics/spearman.pyc
new file mode 100755 (executable)
index 0000000..4be8dee
Binary files /dev/null and b/nlp_resource_data/nltk/metrics/spearman.pyc differ
diff --git a/nlp_resource_data/nltk/misc/__init__.py b/nlp_resource_data/nltk/misc/__init__.py
new file mode 100755 (executable)
index 0000000..e03dc4e
--- /dev/null
@@ -0,0 +1,11 @@
+# Natural Language Toolkit: Miscellaneous modules
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from nltk.misc.chomsky import generate_chomsky
+from nltk.misc.wordfinder import word_finder
+from nltk.misc.minimalset import MinimalSet
+from nltk.misc.babelfish import babelize_shell
diff --git a/nlp_resource_data/nltk/misc/__init__.pyc b/nlp_resource_data/nltk/misc/__init__.pyc
new file mode 100755 (executable)
index 0000000..5619b02
Binary files /dev/null and b/nlp_resource_data/nltk/misc/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/misc/babelfish.py b/nlp_resource_data/nltk/misc/babelfish.py
new file mode 100755 (executable)
index 0000000..481a508
--- /dev/null
@@ -0,0 +1,10 @@
+"""
+This module previously provided an interface to the Babelfish online
+translation service. That service is no longer available, so the
+module is kept in the NLTK source code only to provide a clearer error
+message for people following the NLTK Book 2.0.
+"""
+from __future__ import print_function
+
+def babelize_shell():
+    print("Babelfish online translation service is no longer available.")
diff --git a/nlp_resource_data/nltk/misc/babelfish.pyc b/nlp_resource_data/nltk/misc/babelfish.pyc
new file mode 100755 (executable)
index 0000000..0b25c4e
Binary files /dev/null and b/nlp_resource_data/nltk/misc/babelfish.pyc differ
diff --git a/nlp_resource_data/nltk/misc/chomsky.py b/nlp_resource_data/nltk/misc/chomsky.py
new file mode 100755 (executable)
index 0000000..9cfb5c2
--- /dev/null
@@ -0,0 +1,135 @@
+# Chomsky random text generator, version 1.1, Raymond Hettinger, 2005/09/13
+# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/440546
+
+"""
+CHOMSKY is an aid to writing linguistic papers in the style
+of the great master.  It is based on selected phrases taken
+from actual books and articles written by Noam Chomsky.
+Upon request, it assembles the phrases in the elegant
+stylistic patterns that Chomsky is noted for.
+To generate n sentences of linguistic wisdom, type
+
+    (CHOMSKY n)  -- for example
+    (CHOMSKY 5) generates half a screen of linguistic truth.
+"""
+from __future__ import print_function
+
+leadins = """To characterize a linguistic level L,
+    On the other hand,
+    This suggests that
+    It appears that
+    Furthermore,
+    We will bring evidence in favor of the following thesis:
+    To provide a constituent structure for T(Z,K),
+    From C1, it follows that
+    For any transformation which is sufficiently diversified in \
+application to be of any interest,
+    Analogously,
+    Clearly,
+    Note that
+    Of course,
+    Suppose, for instance, that
+    Thus
+    With this clarification,
+    Conversely,
+    We have already seen that
+    By combining adjunctions and certain deformations,
+    I suggested that these results would follow from the assumption that
+    If the position of the trace in (99c) were only relatively \
+inaccessible to movement,
+    However, this assumption is not correct, since
+    Comparing these examples with their parasitic gap counterparts in \
+(96) and (97), we see that
+    In the discussion of resumptive pronouns following (81),
+    So far,
+    Nevertheless,
+    For one thing,
+    Summarizing, then, we assume that
+    A consequence of the approach just outlined is that
+    Presumably,
+    On our assumptions,
+    It may be, then, that
+    It must be emphasized, once again, that
+    Let us continue to suppose that
+    Notice, incidentally, that """
+# List of LEADINs to buy time.
+
+subjects = """ the notion of level of grammaticalness
+    a case of semigrammaticalness of a different sort
+    most of the methodological work in modern linguistics
+    a subset of English sentences interesting on quite independent grounds
+    the natural general principle that will subsume this case
+    an important property of these three types of EC
+    any associated supporting element
+    the appearance of parasitic gaps in domains relatively inaccessible \
+to ordinary extraction
+    the speaker-hearer's linguistic intuition
+    the descriptive power of the base component
+    the earlier discussion of deviance
+    this analysis of a formative as a pair of sets of features
+    this selectionally introduced contextual feature
+    a descriptively adequate grammar
+    the fundamental error of regarding functional notions as categorial
+    relational information
+    the systematic use of complex symbols
+    the theory of syntactic features developed earlier"""
+# List of SUBJECTs chosen for maximum professorial macho.
+
+verbs = """can be defined in such a way as to impose
+    delimits
+    suffices to account for
+    cannot be arbitrary in
+    is not subject to
+    does not readily tolerate
+    raises serious doubts about
+    is not quite equivalent to
+    does not affect the structure of
+    may remedy and, at the same time, eliminate
+    is not to be considered in determining
+    is to be regarded as
+    is unspecified with respect to
+    is, apparently, determined by
+    is necessary to impose an interpretation on
+    appears to correlate rather closely with
+    is rather different from"""
+# List of VERBs chosen for autorecursive obfuscation.
+
+objects = """ problems of phonemic and morphological analysis.
+    a corpus of utterance tokens upon which conformity has been defined \
+by the paired utterance test.
+    the traditional practice of grammarians.
+    the levels of acceptability from fairly high (e.g. (99a)) to virtual \
+gibberish (e.g. (98d)).
+    a stipulation to place the constructions into these various categories.
+    a descriptive fact.
+    a parasitic gap construction.
+    the extended c-command discussed in connection with (34).
+    the ultimate standard that determines the accuracy of any proposed grammar.
+    the system of base rules exclusive of the lexicon.
+    irrelevant intervening contexts in selectional rules.
+    nondistinctness in the sense of distinctive feature theory.
+    a general convention regarding the forms of the grammar.
+    an abstract underlying order.
+    an important distinction in language use.
+    the requirement that branching is not tolerated within the dominance \
+scope of a complex symbol.
+    the strong generative capacity of the theory."""
+# List of OBJECTs selected for profound sententiousness.
+
+import textwrap, random
+from itertools import chain, islice
+
+from six.moves import zip
+
+
+def generate_chomsky(times=5, line_length=72):
+    parts = []
+    for part in (leadins, subjects, verbs, objects):
+        phraselist = list(map(str.strip, part.splitlines()))
+        random.shuffle(phraselist)
+        parts.append(phraselist)
+    output = chain(*islice(zip(*parts), 0, times))
+    print(textwrap.fill(" ".join(output), line_length))
+
+if __name__ == '__main__':
+    generate_chomsky()
diff --git a/nlp_resource_data/nltk/misc/chomsky.pyc b/nlp_resource_data/nltk/misc/chomsky.pyc
new file mode 100755 (executable)
index 0000000..4cd5a7e
Binary files /dev/null and b/nlp_resource_data/nltk/misc/chomsky.pyc differ
diff --git a/nlp_resource_data/nltk/misc/minimalset.py b/nlp_resource_data/nltk/misc/minimalset.py
new file mode 100755 (executable)
index 0000000..1299f19
--- /dev/null
@@ -0,0 +1,83 @@
+# Natural Language Toolkit: Minimal Sets
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org>
+# For license information, see LICENSE.TXT
+
+from collections import defaultdict
+
+class MinimalSet(object):
+    """
+    Find contexts where more than one possible target value can
+    appear.  E.g. if targets are word-initial letters, and contexts
+    are the remainders of words, then we would like to find cases like
+    "fat" vs "cat", and "training" vs "draining".  If targets are
+    parts-of-speech and contexts are words, then we would like to find
+    cases like wind (noun) 'air in rapid motion', vs wind (verb)
+    'coil, wrap'.
+    """
+    def __init__(self, parameters=None):
+        """
+        Create a new minimal set.
+
+        :param parameters: The (context, target, display) tuples for the item
+        :type parameters: list(tuple(str, str, str))
+        """
+        self._targets = set()  # the contrastive information
+        self._contexts = set() # what we are controlling for
+        self._seen = defaultdict(set)  # to record what we have seen
+        self._displays = {}    # what we will display
+
+        if parameters:
+            for context, target, display in parameters:
+                self.add(context, target, display)
+
+    def add(self, context, target, display):
+        """
+        Add a new item to the minimal set, having the specified
+        context, target, and display form.
+
+        :param context: The context in which the item of interest appears
+        :type context: str
+        :param target: The item of interest
+        :type target: str
+        :param display: The information to be reported for each item
+        :type display: str
+        """
+        # Store the set of targets that occurred in this context
+        self._seen[context].add(target)
+
+        # Keep track of which contexts and targets we have seen
+        self._contexts.add(context)
+        self._targets.add(target)
+
+        # For a given context and target, store the display form
+        self._displays[(context, target)] = display
+
+    def contexts(self, minimum=2):
+        """
+        Determine which contexts occurred with enough distinct targets.
+
+        :param minimum: the minimum number of distinct target forms
+        :type minimum: int
+        :rtype: list
+        """
+        return [c for c in self._contexts if len(self._seen[c]) >= minimum]
+
+    def display(self, context, target, default=""):
+        if (context, target) in self._displays:
+            return self._displays[(context, target)]
+        else:
+            return default
+
+    def display_all(self, context):
+        result = []
+        for target in self._targets:
+            x = self.display(context, target)
+            if x: result.append(x)
+        return result
+
+    def targets(self):
+        return self._targets
+
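+
+# Illustrative sketch (not part of the upstream module): word-initial letters
+# as targets, with the remainder of each word as the context.
+#
+#     >>> ms = MinimalSet([('_at', 'c', 'cat'), ('_at', 'f', 'fat'), ('_og', 'd', 'dog')])
+#     >>> ms.contexts()
+#     ['_at']
+#     >>> sorted(ms.display_all('_at'))
+#     ['cat', 'fat']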
diff --git a/nlp_resource_data/nltk/misc/minimalset.pyc b/nlp_resource_data/nltk/misc/minimalset.pyc
new file mode 100755 (executable)
index 0000000..35253fa
Binary files /dev/null and b/nlp_resource_data/nltk/misc/minimalset.pyc differ
diff --git a/nlp_resource_data/nltk/misc/sort.py b/nlp_resource_data/nltk/misc/sort.py
new file mode 100755 (executable)
index 0000000..8e1dd38
--- /dev/null
@@ -0,0 +1,157 @@
+# Natural Language Toolkit: List Sorting
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+This module provides a variety of list sorting algorithms, to
+illustrate the many different algorithms (recipes) for solving a
+problem, and how to analyze algorithms experimentally.
+"""
+from __future__ import print_function, division
+
+# These algorithms are taken from:
+# Levitin (2004) The Design and Analysis of Algorithms
+
+##################################################################
+# Selection Sort
+##################################################################
+
+def selection(a):
+    """
+    Selection Sort: scan the list to find its smallest element, then
+    swap it with the first element.  The remainder of the list is one
+    element smaller; apply the same method to this list, and so on.
+    """
+    count = 0
+
+    for i in range(len(a) - 1):
+        min = i
+
+        for j in range(i+1, len(a)):
+            if a[j] < a[min]:
+                min = j
+
+            count += 1
+
+        a[min],a[i] = a[i],a[min]
+
+    return count
+
+##################################################################
+# Bubble Sort
+##################################################################
+
+def bubble(a):
+    """
+    Bubble Sort: compare adjacent elements of the list left-to-right,
+    and swap them if they are out of order.  After one pass through
+    the list swapping adjacent items, the largest item will be in
+    the rightmost position.  The remainder is one element smaller;
+    apply the same method to this list, and so on.
+    """
+    count = 0
+    for i in range(len(a)-1):
+        for j in range(len(a)-i-1):
+            if a[j+1] < a[j]:
+                a[j],a[j+1] = a[j+1],a[j]
+                count += 1
+    return count
+
+
+##################################################################
+# Merge Sort
+##################################################################
+
+def _merge_lists(b, c):
+    count = 0
+    i = j = 0
+    a = []
+    while (i < len(b) and j < len(c)):
+        count += 1
+        if b[i] <= c[j]:
+            a.append(b[i])
+            i += 1
+        else:
+            a.append(c[j])
+            j += 1
+    if i == len(b):
+        a += c[j:]
+    else:
+        a += b[i:]
+    return a, count
+
+def merge(a):
+    """
+    Merge Sort: split the list in half, and sort each half, then
+    combine the sorted halves.
+    """
+    count = 0
+    if len(a) > 1:
+        midpoint = len(a) // 2
+        b = a[:midpoint]
+        c = a[midpoint:]
+        count_b = merge(b)
+        count_c = merge(c)
+        result, count_a = _merge_lists(b, c)
+        a[:] = result # copy the result back into a.
+        count = count_a + count_b + count_c
+    return count
+
+##################################################################
+# Quick Sort
+##################################################################
+
+def _partition(a, l, r):
+    p = a[l]; i = l; j = r+1
+    count = 0
+    while True:
+        while i < r:
+            i += 1
+            if a[i] >= p: break
+        while j > l:
+            j -= 1
+            if j < l or a[j] <= p: break
+        a[i],a[j] = a[j],a[i]               # swap
+        count += 1
+        if i >= j: break
+    a[i],a[j] = a[j],a[i]                   # undo last swap
+    a[l],a[j] = a[j],a[l]
+    return j, count
+
+def _quick(a, l, r):
+    count = 0
+    if l<r:
+        s, count = _partition(a, l, r)
+        count += _quick(a, l, s-1)
+        count += _quick(a, s+1, r)
+    return count
+
+def quick(a):
+    return _quick(a, 0, len(a)-1)
+
+##################################################################
+# Demonstration
+##################################################################
+
+def demo():
+    from random import shuffle
+
+    for size in (10, 20, 50, 100, 200, 500, 1000):
+        a = list(range(size))
+
+        # various sort methods
+        shuffle(a); count_selection = selection(a)
+        shuffle(a); count_bubble    = bubble(a)
+        shuffle(a); count_merge     = merge(a)
+        shuffle(a); count_quick     = quick(a)
+
+        print((("size=%5d:  selection=%8d,  bubble=%8d,  "
+                "merge=%6d,  quick=%6d") %
+               (size, count_selection, count_bubble,
+                count_merge, count_quick)))
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/misc/sort.pyc b/nlp_resource_data/nltk/misc/sort.pyc
new file mode 100755 (executable)
index 0000000..ed561a1
Binary files /dev/null and b/nlp_resource_data/nltk/misc/sort.pyc differ
diff --git a/nlp_resource_data/nltk/misc/wordfinder.py b/nlp_resource_data/nltk/misc/wordfinder.py
new file mode 100755 (executable)
index 0000000..c2d5449
--- /dev/null
@@ -0,0 +1,129 @@
+# Natural Language Toolkit: Word Finder
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+# Simplified from PHP version by Robert Klein <brathna@gmail.com>
+# http://fswordfinder.sourceforge.net/
+from __future__ import print_function
+
+import random
+
+
+# reverse a word with probability 0.5
+def revword(word):
+    if random.randint(1,2) == 1:
+        return word[::-1]
+    return word
+
+# try to insert word at position x,y; direction encoded in xf,yf
+def step(word, x, xf, y, yf, grid):
+    for i in range(len(word)):
+        if grid[xf(i)][yf(i)] != "" and grid[xf(i)][yf(i)] != word[i]:
+            return False
+    for i in range(len(word)):
+        grid[xf(i)][yf(i)] = word[i]
+    return True
+
+# try to insert word at position x,y, in direction dir
+def check(word, dir, x, y, grid, rows, cols):
+    if dir==1:
+        if x-len(word)<0 or y-len(word)<0:
+            return False
+        return step(word, x, lambda i:x-i, y, lambda i:y-i, grid)
+    elif dir==2:
+        if x-len(word)<0:
+            return False
+        return step(word, x, lambda i:x-i, y, lambda i:y, grid)
+    elif dir==3:
+        if x-len(word)<0 or y+(len(word)-1)>=cols:
+            return False
+        return step(word, x, lambda i:x-i, y, lambda i:y+i, grid)
+    elif dir==4:
+        if y-len(word)<0:
+            return False
+        return step(word, x, lambda i:x, y, lambda i:y-i, grid)
+
+def wordfinder(words, rows=20, cols=20, attempts=50,
+               alph='ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
+    """
+    Attempt to arrange words into a letter-grid with the specified
+    number of rows and columns.  Try each word in several positions
+    and directions, until it can be fitted into the grid, or the
+    maximum number of allowable attempts is exceeded.  Returns a tuple
+    consisting of the grid and the words that were successfully
+    placed.
+
+    :param words: the list of words to be put into the grid
+    :type words: list
+    :param rows: the number of rows in the grid
+    :type rows: int
+    :param cols: the number of columns in the grid
+    :type cols: int
+    :param attempts: the number of times to attempt placing a word
+    :type attempts: int
+    :param alph: the alphabet, to be used for filling blank cells
+    :type alph: str
+    :rtype: tuple
+    """
+
+    # place longer words first
+    words = sorted(words, key=len, reverse=True)
+
+    grid = []  # the letter grid
+    used = []  # the words we used
+
+    # initialize the grid
+    for i in range(rows):
+        grid.append([""] * cols)
+
+    # try to place each word
+    for word in words:
+        word = word.strip().upper()  # normalize
+        save = word                  # keep a record of the word
+        word = revword(word)
+        for attempt in range(attempts):
+            r = random.randint(0, len(word))
+            dir = random.choice([1,2,3,4])
+            x = random.randint(0,rows)
+            y = random.randint(0,cols)
+            if   dir==1: x+=r; y+=r
+            elif dir==2: x+=r
+            elif dir==3: x+=r; y-=r
+            elif dir==4: y+=r
+            if 0<=x<rows and 0<=y<cols:
+                if check(word, dir, x, y, grid, rows, cols):
+#                   used.append((save, dir, x, y, word))
+                    used.append(save)
+                    break
+
+    # Fill up the remaining spaces
+    for i in range(rows):
+        for j in range(cols):
+            if grid[i][j] == '':
+                grid[i][j] = random.choice(alph)
+
+    return grid, used
+
+def word_finder():
+    from nltk.corpus import words
+    wordlist = words.words()
+    random.shuffle(wordlist)
+    wordlist = wordlist[:200]
+    wordlist = [w for w in wordlist if 3 <= len(w) <= 12]
+    grid, used = wordfinder(wordlist)
+
+    print("Word Finder\n")
+    for i in range(len(grid)):
+        for j in range(len(grid[i])):
+            print(grid[i][j], end=' ')
+        print()
+    print()
+
+    for i in range(len(used)):
+        print("%d:" % (i+1), used[i])
+
+if __name__ == '__main__':
+    word_finder()
diff --git a/nlp_resource_data/nltk/misc/wordfinder.pyc b/nlp_resource_data/nltk/misc/wordfinder.pyc
new file mode 100755 (executable)
index 0000000..5f2b23c
Binary files /dev/null and b/nlp_resource_data/nltk/misc/wordfinder.pyc differ
diff --git a/nlp_resource_data/nltk/parse/__init__.py b/nlp_resource_data/nltk/parse/__init__.py
new file mode 100755 (executable)
index 0000000..66441de
--- /dev/null
@@ -0,0 +1,81 @@
+# Natural Language Toolkit: Parsers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+"""
+NLTK Parsers
+
+Classes and interfaces for producing tree structures that represent
+the internal organization of a text.  This task is known as "parsing"
+the text, and the resulting tree structures are called the text's
+"parses".  Typically, the text is a single sentence, and the tree
+structure represents the syntactic structure of the sentence.
+However, parsers can also be used in other domains.  For example,
+parsers can be used to derive the morphological structure of the
+morphemes that make up a word, or to derive the discourse structure
+for a set of utterances.
+
+Sometimes, a single piece of text can be represented by more than one
+tree structure.  Texts represented by more than one tree structure are
+called "ambiguous" texts.  Note that there are actually two ways in
+which a text can be ambiguous:
+
+    - The text has multiple correct parses.
+    - There is not enough information to decide which of several
+      candidate parses is correct.
+
+However, the parser module does *not* distinguish these two types of
+ambiguity.
+
+The parser module defines ``ParserI``, a standard interface for parsing
+texts; and two simple implementations of that interface,
+``ShiftReduceParser`` and ``RecursiveDescentParser``.  It also contains
+three sub-modules for specialized kinds of parsing:
+
+  - ``nltk.parser.chart`` defines chart parsing, which uses dynamic
+    programming to efficiently parse texts.
+  - ``nltk.parser.probabilistic`` defines probabilistic parsing, which
+    associates a probability with each parse.
+"""
+
+from nltk.parse.api import ParserI
+from nltk.parse.chart import (ChartParser, SteppingChartParser, TopDownChartParser,
+                              BottomUpChartParser, BottomUpLeftCornerChartParser,
+                              LeftCornerChartParser)
+from nltk.parse.featurechart import (FeatureChartParser, FeatureTopDownChartParser,
+                                     FeatureBottomUpChartParser,
+                                     FeatureBottomUpLeftCornerChartParser)
+from nltk.parse.earleychart import (IncrementalChartParser, EarleyChartParser,
+                                    IncrementalTopDownChartParser,
+                                    IncrementalBottomUpChartParser,
+                                    IncrementalBottomUpLeftCornerChartParser,
+                                    IncrementalLeftCornerChartParser,
+                                    FeatureIncrementalChartParser,
+                                    FeatureEarleyChartParser,
+                                    FeatureIncrementalTopDownChartParser,
+                                    FeatureIncrementalBottomUpChartParser,
+                                    FeatureIncrementalBottomUpLeftCornerChartParser)
+from nltk.parse.pchart import (BottomUpProbabilisticChartParser, InsideChartParser,
+                               RandomChartParser, UnsortedChartParser,
+                               LongestChartParser)
+from nltk.parse.recursivedescent import (RecursiveDescentParser,
+                                         SteppingRecursiveDescentParser)
+from nltk.parse.shiftreduce import (ShiftReduceParser, SteppingShiftReduceParser)
+from nltk.parse.util import load_parser, TestGrammar, extract_test_sentences
+from nltk.parse.viterbi import ViterbiParser
+from nltk.parse.dependencygraph import DependencyGraph
+from nltk.parse.projectivedependencyparser import (ProjectiveDependencyParser,
+                                                   ProbabilisticProjectiveDependencyParser)
+from nltk.parse.nonprojectivedependencyparser import (NonprojectiveDependencyParser,
+                                                      NaiveBayesDependencyScorer,
+                                                      ProbabilisticNonprojectiveParser)
+from nltk.parse.malt import MaltParser
+from nltk.parse.evaluate import DependencyEvaluator
+from nltk.parse.transitionparser import TransitionParser
+from nltk.parse.bllip import BllipParser
+from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser
diff --git a/nlp_resource_data/nltk/parse/__init__.pyc b/nlp_resource_data/nltk/parse/__init__.pyc
new file mode 100755 (executable)
index 0000000..18bb47c
Binary files /dev/null and b/nlp_resource_data/nltk/parse/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/parse/api.py b/nlp_resource_data/nltk/parse/api.py
new file mode 100755 (executable)
index 0000000..6ddd9aa
--- /dev/null
@@ -0,0 +1,66 @@
+# Natural Language Toolkit: Parser API
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+import itertools
+
+from nltk.internals import overridden
+
+class ParserI(object):
+    """
+    A processing class for deriving trees that represent possible
+    structures for a sequence of tokens.  These tree structures are
+    known as "parses".  Typically, parsers are used to derive syntax
+    trees for sentences.  But parsers can also be used to derive other
+    kinds of tree structure, such as morphological trees and discourse
+    structures.
+
+    Subclasses must define:
+      - at least one of: ``parse()``, ``parse_sents()``.
+
+    Subclasses may define:
+      - ``grammar()``
+    """
+    def grammar(self):
+        """
+        :return: The grammar used by this parser.
+        """
+        raise NotImplementedError()
+
+    def parse(self, sent, *args, **kwargs):
+        """
+        :return: An iterator that generates parse trees for the sentence.
+        When possible, the trees are sorted from most likely to least likely.
+
+        :param sent: The sentence to be parsed
+        :type sent: list(str)
+        :rtype: iter(Tree)
+        """
+        if overridden(self.parse_sents):
+            return next(self.parse_sents([sent], *args, **kwargs))
+        elif overridden(self.parse_one):
+            return (tree for tree in [self.parse_one(sent, *args, **kwargs)] if tree is not None)
+        elif overridden(self.parse_all):
+            return iter(self.parse_all(sent, *args, **kwargs))
+        else:
+            raise NotImplementedError()
+
+    def parse_sents(self, sents, *args, **kwargs):
+        """
+        Apply ``self.parse()`` to each element of ``sents``.
+        :rtype: iter(iter(Tree))
+        """
+        return (self.parse(sent, *args, **kwargs) for sent in sents)
+
+    def parse_all(self, sent, *args, **kwargs):
+        """:rtype: list(Tree)"""
+        return list(self.parse(sent, *args, **kwargs))
+
+    def parse_one(self, sent, *args, **kwargs):
+        """:rtype: Tree or None"""
+        return next(self.parse(sent, *args, **kwargs), None)
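+
+
+# --- Editor's illustrative sketch (not part of the upstream NLTK module) ---
+# A minimal ``ParserI`` subclass only needs to override ``parse()``; the
+# default ``parse_sents()``, ``parse_all()`` and ``parse_one()`` defined above
+# then work automatically.  ``EchoParser`` is a hypothetical toy class used
+# purely for illustration.
+
+def _parseri_usage_sketch():
+    from nltk.tree import Tree
+
+    class EchoParser(ParserI):
+        """Toy parser that wraps every sentence in a flat S tree."""
+        def parse(self, sent, *args, **kwargs):
+            yield Tree('S', list(sent))
+
+    parser = EchoParser()
+    # parse_one() is inherited from ParserI and returns the first tree (or None).
+    print(parser.parse_one('the dog barks'.split()))   # (S the dog barks)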
diff --git a/nlp_resource_data/nltk/parse/api.pyc b/nlp_resource_data/nltk/parse/api.pyc
new file mode 100755 (executable)
index 0000000..886afbb
Binary files /dev/null and b/nlp_resource_data/nltk/parse/api.pyc differ
diff --git a/nlp_resource_data/nltk/parse/bllip.py b/nlp_resource_data/nltk/parse/bllip.py
new file mode 100755 (executable)
index 0000000..06d0051
--- /dev/null
@@ -0,0 +1,282 @@
+# Natural Language Toolkit: Interface to BLLIP Parser
+#
+# Author: David McClosky <dmcc@bigasterisk.com>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import print_function
+
+from nltk.parse.api import ParserI
+from nltk.tree import Tree
+
+"""
+Interface for parsing with BLLIP Parser. Requires the Python
+bllipparser module. BllipParser objects can be constructed with the
+``BllipParser.from_unified_model_dir`` class method or manually using the
+``BllipParser`` constructor. The former is generally easier if you have
+a BLLIP Parser unified model directory -- a basic model can be obtained
+from NLTK's downloader. More unified parsing models can be obtained with
+BLLIP Parser's ModelFetcher (run ``python -m bllipparser.ModelFetcher``
+or see docs for ``bllipparser.ModelFetcher.download_and_install_model``).
+
+Basic usage::
+
+    # download and install a basic unified parsing model (Wall Street Journal)
+    # sudo python -m nltk.downloader bllip_wsj_no_aux
+
+    >>> from nltk.data import find
+    >>> model_dir = find('models/bllip_wsj_no_aux').path
+    >>> bllip = BllipParser.from_unified_model_dir(model_dir)
+
+    # 1-best parsing
+    >>> sentence1 = 'British left waffles on Falklands .'.split()
+    >>> top_parse = bllip.parse_one(sentence1)
+    >>> print(top_parse)
+    (S1
+      (S
+        (NP (JJ British) (NN left))
+        (VP (VBZ waffles) (PP (IN on) (NP (NNP Falklands))))
+        (. .)))
+
+    # n-best parsing
+    >>> sentence2 = 'Time flies'.split()
+    >>> all_parses = bllip.parse_all(sentence2)
+    >>> print(len(all_parses))
+    50
+    >>> print(all_parses[0])
+    (S1 (S (NP (NNP Time)) (VP (VBZ flies))))
+
+    # incorporating external tagging constraints (None means unconstrained tag)
+    >>> constrained1 = bllip.tagged_parse([('Time', 'VB'), ('flies', 'NNS')])
+    >>> print(next(constrained1))
+    (S1 (NP (VB Time) (NNS flies)))
+    >>> constrained2 = bllip.tagged_parse([('Time', 'NN'), ('flies', None)])
+    >>> print(next(constrained2))
+    (S1 (NP (NN Time) (VBZ flies)))
+
+References
+----------
+
+- Charniak, Eugene. "A maximum-entropy-inspired parser." Proceedings of
+  the 1st North American chapter of the Association for Computational
+  Linguistics conference. Association for Computational Linguistics,
+  2000.
+
+- Charniak, Eugene, and Mark Johnson. "Coarse-to-fine n-best parsing
+  and MaxEnt discriminative reranking." Proceedings of the 43rd Annual
+  Meeting on Association for Computational Linguistics. Association
+  for Computational Linguistics, 2005.
+
+Known issues
+------------
+
+Note that BLLIP Parser is not currently threadsafe. Since this module
+uses a SWIG interface, it is potentially unsafe to create multiple
+``BllipParser`` objects in the same process. BLLIP Parser currently
+has issues with non-ASCII text and will raise an error if given any.
+
+See http://pypi.python.org/pypi/bllipparser/ for more information
+on BLLIP Parser's Python interface.
+"""
+
+__all__ = ['BllipParser']
+
+# this block allows this module to be imported even if bllipparser isn't
+# available
+try:
+    from bllipparser import RerankingParser
+    from bllipparser.RerankingParser import get_unified_model_parameters
+
+    def _ensure_bllip_import_or_error():
+        pass
+except ImportError as ie:
+    def _ensure_bllip_import_or_error(ie=ie):
+        raise ImportError("Couldn't import bllipparser module: %s" % ie)
+
+def _ensure_ascii(words):
+    try:
+        for i, word in enumerate(words):
+            # encode() raises UnicodeEncodeError on Python 3 (str) and
+            # UnicodeDecodeError on Python 2 (byte str) for non-ASCII input;
+            # both are subclasses of UnicodeError.
+            word.encode('ascii')
+    except UnicodeError:
+        raise ValueError("Token %d (%r) is non-ASCII. BLLIP Parser "
+                         "currently doesn't support non-ASCII inputs." %
+                         (i, word))
+
+def _scored_parse_to_nltk_tree(scored_parse):
+    return Tree.fromstring(str(scored_parse.ptb_parse))
+
+class BllipParser(ParserI):
+    """
+    Interface for parsing with BLLIP Parser. BllipParser objects can be
+    constructed with the ``BllipParser.from_unified_model_dir`` class
+    method or manually using the ``BllipParser`` constructor.
+    """
+    def __init__(self, parser_model=None, reranker_features=None,
+                 reranker_weights=None, parser_options=None,
+                 reranker_options=None):
+        """
+        Load a BLLIP Parser model from scratch. You'll typically want to
+        use the ``from_unified_model_dir()`` class method to construct
+        this object.
+
+        :param parser_model: Path to parser model directory
+        :type parser_model: str
+
+        :param reranker_features: Path to the reranker model's features file
+        :type reranker_features: str
+
+        :param reranker_weights: Path to the reranker model's weights file
+        :type reranker_weights: str
+
+        :param parser_options: optional dictionary of parser options, see
+        ``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
+        for more information.
+        :type parser_options: dict(str)
+
+        :param reranker_options: optional dictionary of reranker options, see
+        ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
+        for more information.
+        :type reranker_options: dict(str)
+        """
+        _ensure_bllip_import_or_error()
+
+        parser_options = parser_options or {}
+        reranker_options = reranker_options or {}
+
+        self.rrp = RerankingParser()
+        self.rrp.load_parser_model(parser_model, **parser_options)
+        if reranker_features and reranker_weights:
+            self.rrp.load_reranker_model(features_filename=reranker_features,
+                                         weights_filename=reranker_weights,
+                                         **reranker_options)
+
+    def parse(self, sentence):
+        """
+        Use BLLIP Parser to parse a sentence. Takes a sentence as a list
+        of words; it will be automatically tagged with this BLLIP Parser
+        instance's tagger.
+
+        :return: An iterator that generates parse trees for the sentence
+        from most likely to least likely.
+
+        :param sentence: The sentence to be parsed
+        :type sentence: list(str)
+        :rtype: iter(Tree)
+        """
+        _ensure_ascii(sentence)
+        nbest_list = self.rrp.parse(sentence)
+        for scored_parse in nbest_list:
+            yield _scored_parse_to_nltk_tree(scored_parse)
+
+    def tagged_parse(self, word_and_tag_pairs):
+        """
+        Use BLLIP to parse a sentence. Takes a sentence as a list of
+        (word, tag) tuples; the sentence must have already been tokenized
+        and tagged. BLLIP will attempt to use the tags provided but may
+        use others if it can't come up with a complete parse subject
+        to those constraints. You may also specify a tag as ``None``
+        to leave a token's tag unconstrained.
+
+        :return: An iterator that generates parse trees for the sentence
+        from most likely to least likely.
+
+        :param word_and_tag_pairs: Input sentence to parse as (word, tag) pairs
+        :type word_and_tag_pairs: list(tuple(str, str))
+        :rtype: iter(Tree)
+        """
+        words = []
+        tag_map = {}
+        for i, (word, tag) in enumerate(word_and_tag_pairs):
+            words.append(word)
+            if tag is not None:
+                tag_map[i] = tag
+
+        _ensure_ascii(words)
+        nbest_list = self.rrp.parse_tagged(words, tag_map)
+        for scored_parse in nbest_list:
+            yield _scored_parse_to_nltk_tree(scored_parse)
+
+    @classmethod
+    def from_unified_model_dir(this_class, model_dir, parser_options=None,
+                               reranker_options=None):
+        """
+        Create a ``BllipParser`` object from a unified parsing model
+        directory. Unified parsing model directories are a standardized
+        way of storing BLLIP parser and reranker models together on disk.
+        See ``bllipparser.RerankingParser.get_unified_model_parameters()``
+        for more information about unified model directories.
+
+        :return: A ``BllipParser`` object using the parser and reranker
+        models in the model directory.
+
+        :param model_dir: Path to the unified model directory.
+        :type model_dir: str
+        :param parser_options: optional dictionary of parser options, see
+        ``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
+        for more information.
+        :type parser_options: dict(str)
+        :param reranker_options: optional dictionary of reranker options, see
+        ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
+        for more information.
+        :type reranker_options: dict(str)
+        :rtype: BllipParser
+        """
+        (parser_model_dir, reranker_features_filename,
+         reranker_weights_filename) = get_unified_model_parameters(model_dir)
+        return this_class(parser_model_dir, reranker_features_filename,
+                          reranker_weights_filename, parser_options,
+                          reranker_options)
+
+def demo():
+    """This assumes the Python module bllipparser is installed."""
+
+    # download and install a basic unified parsing model (Wall Street Journal)
+    # sudo python -m nltk.downloader bllip_wsj_no_aux
+
+    from nltk.data import find
+    model_dir = find('models/bllip_wsj_no_aux').path
+
+    print('Loading BLLIP Parsing models...')
+    # the easiest way to get started is to use a unified model
+    bllip = BllipParser.from_unified_model_dir(model_dir)
+    print('Done.')
+
+    sentence1 = 'British left waffles on Falklands .'.split()
+    sentence2 = 'I saw the man with the telescope .'.split()
+    # this sentence is known to fail under the WSJ parsing model
+    fail1 = '# ! ? : -'.split()
+    for sentence in (sentence1, sentence2, fail1):
+        print('Sentence: %r' % ' '.join(sentence))
+        try:
+            tree = next(bllip.parse(sentence))
+            print(tree)
+        except StopIteration:
+            print("(parse failed)")
+
+    # n-best parsing demo
+    for i, parse in enumerate(bllip.parse(sentence1)):
+        print('parse %d:\n%s' % (i, parse))
+
+    # using external POS tag constraints
+    print("forcing 'tree' to be 'NN':",
+          next(bllip.tagged_parse([('A', None), ('tree', 'NN')])))
+    print("forcing 'A' to be 'DT' and 'tree' to be 'NNP':",
+          next(bllip.tagged_parse([('A', 'DT'), ('tree', 'NNP')])))
+    # constraints don't have to make sense... (though on more complicated
+    # sentences, they may cause the parse to fail)
+    print("forcing 'A' to be 'NNP':",
+          next(bllip.tagged_parse([('A', 'NNP'), ('tree', None)])))
+
+def setup_module(module):
+    from nose import SkipTest
+
+    try:
+        _ensure_bllip_import_or_error()
+    except ImportError:
+        raise SkipTest('doctests from nltk.parse.bllip are skipped because '
+                       'the bllipparser module is not installed')
+
+
diff --git a/nlp_resource_data/nltk/parse/bllip.pyc b/nlp_resource_data/nltk/parse/bllip.pyc
new file mode 100755 (executable)
index 0000000..50d5d0c
Binary files /dev/null and b/nlp_resource_data/nltk/parse/bllip.pyc differ
diff --git a/nlp_resource_data/nltk/parse/chart.py b/nlp_resource_data/nltk/parse/chart.py
new file mode 100755 (executable)
index 0000000..79c3bd4
--- /dev/null
@@ -0,0 +1,1682 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: A Chart Parser
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+#         Jean Mark Gawron <gawron@mail.sdsu.edu>
+#         Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Data classes and parser implementations for "chart parsers", which
+use dynamic programming to efficiently parse a text.  A chart
+parser derives parse trees for a text by iteratively adding "edges"
+to a "chart."  Each edge represents a hypothesis about the tree
+structure for a subsequence of the text.  The chart is a
+"blackboard" for composing and combining these hypotheses.
+
+When a chart parser begins parsing a text, it creates a new (empty)
+chart, spanning the text.  It then incrementally adds new edges to the
+chart.  A set of "chart rules" specifies the conditions under which
+new edges should be added to the chart.  Once the chart reaches a
+stage where none of the chart rules adds any new edges, parsing is
+complete.
+
+Charts are encoded with the ``Chart`` class, and edges are encoded with
+the ``TreeEdge`` and ``LeafEdge`` classes.  The chart parser module
+defines two chart parsers:
+
+  - ``ChartParser`` is a simple and flexible chart parser.  Given a
+    set of chart rules, it will apply those rules to the chart until
+    no more edges are added.
+
+  - ``SteppingChartParser`` is a subclass of ``ChartParser`` that can
+    be used to step through the parsing process.
+"""
+from __future__ import print_function, division, unicode_literals
+
+import itertools
+import re
+import warnings
+from functools import total_ordering
+
+from six.moves import range
+
+from nltk.tree import Tree
+from nltk.grammar import PCFG, is_nonterminal, is_terminal
+from nltk.util import OrderedDict
+from nltk.internals import raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible, unicode_repr
+
+from nltk.parse.api import ParserI
+
+
+########################################################################
+##  Edges
+########################################################################
+
+@total_ordering
+class EdgeI(object):
+    """
+    A hypothesis about the structure of part of a sentence.
+    Each edge records the fact that a structure is (partially)
+    consistent with the sentence.  An edge contains:
+
+    - A span, indicating what part of the sentence is
+      consistent with the hypothesized structure.
+    - A left-hand side, specifying what kind of structure is
+      hypothesized.
+    - A right-hand side, specifying the contents of the
+      hypothesized structure.
+    - A dot position, indicating how much of the hypothesized
+      structure is consistent with the sentence.
+
+    Every edge is either complete or incomplete:
+
+    - An edge is complete if its structure is fully consistent
+      with the sentence.
+    - An edge is incomplete if its structure is partially
+      consistent with the sentence.  For every incomplete edge, the
+      span specifies a possible prefix for the edge's structure.
+
+    There are two kinds of edge:
+
+    - A ``TreeEdge`` records which trees have been found to
+      be (partially) consistent with the text.
+    - A ``LeafEdge`` records the tokens occurring in the text.
+
+    The ``EdgeI`` interface provides a common interface to both types
+    of edge, allowing chart parsers to treat them in a uniform manner.
+    """
+    def __init__(self):
+        if self.__class__ == EdgeI:
+            raise TypeError('Edge is an abstract interface')
+
+    #////////////////////////////////////////////////////////////
+    # Span
+    #////////////////////////////////////////////////////////////
+
+    def span(self):
+        """
+        Return a tuple ``(s, e)``, where ``tokens[s:e]`` is the
+        portion of the sentence that is consistent with this
+        edge's structure.
+
+        :rtype: tuple(int, int)
+        """
+        raise NotImplementedError()
+
+    def start(self):
+        """
+        Return the start index of this edge's span.
+
+        :rtype: int
+        """
+        raise NotImplementedError()
+
+    def end(self):
+        """
+        Return the end index of this edge's span.
+
+        :rtype: int
+        """
+        raise NotImplementedError()
+
+    def length(self):
+        """
+        Return the length of this edge's span.
+
+        :rtype: int
+        """
+        raise NotImplementedError()
+
+    #////////////////////////////////////////////////////////////
+    # Left Hand Side
+    #////////////////////////////////////////////////////////////
+
+    def lhs(self):
+        """
+        Return this edge's left-hand side, which specifies what kind
+        of structure is hypothesized by this edge.
+
+        :see: ``TreeEdge`` and ``LeafEdge`` for a description of
+            the left-hand side values for each edge type.
+        """
+        raise NotImplementedError()
+
+    #////////////////////////////////////////////////////////////
+    # Right Hand Side
+    #////////////////////////////////////////////////////////////
+
+    def rhs(self):
+        """
+        Return this edge's right-hand side, which specifies
+        the content of the structure hypothesized by this edge.
+
+        :see: ``TreeEdge`` and ``LeafEdge`` for a description of
+            the right-hand side values for each edge type.
+        """
+        raise NotImplementedError()
+
+    def dot(self):
+        """
+        Return this edge's dot position, which indicates how much of
+        the hypothesized structure is consistent with the
+        sentence.  In particular, ``self.rhs[:dot]`` is consistent
+        with ``tokens[self.start():self.end()]``.
+
+        :rtype: int
+        """
+        raise NotImplementedError()
+
+    def nextsym(self):
+        """
+        Return the element of this edge's right-hand side that
+        immediately follows its dot.
+
+        :rtype: Nonterminal or terminal or None
+        """
+        raise NotImplementedError()
+
+    def is_complete(self):
+        """
+        Return True if this edge's structure is fully consistent
+        with the text.
+
+        :rtype: bool
+        """
+        raise NotImplementedError()
+
+    def is_incomplete(self):
+        """
+        Return True if this edge's structure is partially consistent
+        with the text.
+
+        :rtype: bool
+        """
+        raise NotImplementedError()
+
+    #////////////////////////////////////////////////////////////
+    # Comparisons & hashing
+    #////////////////////////////////////////////////////////////
+
+    def __eq__(self, other):
+        return (self.__class__ is other.__class__ and
+                self._comparison_key == other._comparison_key)
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        if not isinstance(other, EdgeI):
+            raise_unorderable_types("<", self, other)
+        if self.__class__ is other.__class__:
+            return self._comparison_key < other._comparison_key
+        else:
+            return self.__class__.__name__ < other.__class__.__name__
+
+    def __hash__(self):
+        try:
+            return self._hash
+        except AttributeError:
+            self._hash = hash(self._comparison_key)
+            return self._hash
+
+
+@python_2_unicode_compatible
+class TreeEdge(EdgeI):
+    """
+    An edge that records the fact that a tree is (partially)
+    consistent with the sentence.  A tree edge consists of:
+
+    - A span, indicating what part of the sentence is
+      consistent with the hypothesized tree.
+    - A left-hand side, specifying the hypothesized tree's node
+      value.
+    - A right-hand side, specifying the hypothesized tree's
+      children.  Each element of the right-hand side is either a
+      terminal, specifying a token with that terminal as its leaf
+      value; or a nonterminal, specifying a subtree with that
+      nonterminal's symbol as its node value.
+    - A dot position, indicating which children are consistent
+      with part of the sentence.  In particular, if ``dot`` is the
+      dot position, ``rhs`` is the right-hand side, ``(start,end)``
+      is the span, and ``sentence`` is the list of tokens in the
+      sentence, then ``tokens[start:end]`` can be spanned by the
+      children specified by ``rhs[:dot]``.
+
+    For more information about edges, see the ``EdgeI`` interface.
+    """
+    def __init__(self, span, lhs, rhs, dot=0):
+        """
+        Construct a new ``TreeEdge``.
+
+        :type span: tuple(int, int)
+        :param span: A tuple ``(s, e)``, where ``tokens[s:e]`` is the
+            portion of the sentence that is consistent with the new
+            edge's structure.
+        :type lhs: Nonterminal
+        :param lhs: The new edge's left-hand side, specifying the
+            hypothesized tree's node value.
+        :type rhs: list(Nonterminal and str)
+        :param rhs: The new edge's right-hand side, specifying the
+            hypothesized tree's children.
+        :type dot: int
+        :param dot: The position of the new edge's dot.  This position
+            specifies what prefix of the production's right hand side
+            is consistent with the text.  In particular, if
+            ``sentence`` is the list of tokens in the sentence, then
+            ``tokens[span[0]:span[1]]`` can be spanned by the
+            children specified by ``rhs[:dot]``.
+        """
+        self._span = span
+        self._lhs = lhs
+        rhs = tuple(rhs)
+        self._rhs = rhs
+        self._dot = dot
+        self._comparison_key = (span, lhs, rhs, dot)
+
+    @staticmethod
+    def from_production(production, index):
+        """
+        Return a new ``TreeEdge`` formed from the given production.
+        The new edge's left-hand side and right-hand side will
+        be taken from ``production``; its span will be
+        ``(index,index)``; and its dot position will be ``0``.
+
+        :rtype: TreeEdge
+        """
+        return TreeEdge(span=(index, index), lhs=production.lhs(),
+                        rhs=production.rhs(), dot=0)
+
+    def move_dot_forward(self, new_end):
+        """
+        Return a new ``TreeEdge`` formed from this edge.
+        The new edge's dot position is increased by ``1``,
+        and its end index will be replaced by ``new_end``.
+
+        :param new_end: The new end index.
+        :type new_end: int
+        :rtype: TreeEdge
+        """
+        return TreeEdge(span=(self._span[0], new_end),
+                        lhs=self._lhs, rhs=self._rhs,
+                        dot=self._dot+1)
+
+    # Accessors
+    def lhs(self): return self._lhs
+    def span(self): return self._span
+    def start(self): return self._span[0]
+    def end(self): return self._span[1]
+    def length(self): return self._span[1] - self._span[0]
+    def rhs(self): return self._rhs
+    def dot(self): return self._dot
+    def is_complete(self): return self._dot == len(self._rhs)
+    def is_incomplete(self): return self._dot != len(self._rhs)
+    def nextsym(self):
+        if self._dot >= len(self._rhs): return None
+        else: return self._rhs[self._dot]
+
+    # String representation
+    def __str__(self):
+        str = '[%s:%s] ' % (self._span[0], self._span[1])
+        str += '%-2r ->' % (self._lhs,)
+
+        for i in range(len(self._rhs)):
+            if i == self._dot: str += ' *'
+            str += ' %s' % unicode_repr(self._rhs[i])
+        if len(self._rhs) == self._dot: str += ' *'
+        return str
+
+    def __repr__(self):
+        return '[Edge: %s]' % self
+
+
+@python_2_unicode_compatible
+class LeafEdge(EdgeI):
+    """
+    An edge that records the fact that a leaf value is consistent with
+    a word in the sentence.  A leaf edge consists of:
+
+    - An index, indicating the position of the word.
+    - A leaf, specifying the word's content.
+
+    A leaf edge's left-hand side is its leaf value, and its right hand
+    side is ``()``.  Its span is ``[index, index+1]``, and its dot
+    position is ``0``.
+    """
+    def __init__(self, leaf, index):
+        """
+        Construct a new ``LeafEdge``.
+
+        :param leaf: The new edge's leaf value, specifying the word
+            that is recorded by this edge.
+        :param index: The new edge's index, specifying the position of
+            the word that is recorded by this edge.
+        """
+        self._leaf = leaf
+        self._index = index
+        self._comparison_key = (leaf, index)
+
+    # Accessors
+    def lhs(self): return self._leaf
+    def span(self): return (self._index, self._index+1)
+    def start(self): return self._index
+    def end(self): return self._index+1
+    def length(self): return 1
+    def rhs(self): return ()
+    def dot(self): return 0
+    def is_complete(self): return True
+    def is_incomplete(self): return False
+    def nextsym(self): return None
+
+    # String representations
+    def __str__(self):
+        return '[%s:%s] %s' % (self._index, self._index+1, unicode_repr(self._leaf))
+    def __repr__(self):
+        return '[Edge: %s]' % (self)
+
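+# --- Editor's illustrative sketch (not part of the upstream NLTK module) ---
+# How a ``TreeEdge`` is typically created from a production and then advanced
+# with ``move_dot_forward()``.  The NP/Det/N grammar fragment is hypothetical.
+
+def _edge_usage_sketch():
+    from nltk.grammar import Nonterminal, Production
+
+    NP, Det, N = Nonterminal('NP'), Nonterminal('Det'), Nonterminal('N')
+    prod = Production(NP, [Det, N])
+
+    edge = TreeEdge.from_production(prod, index=0)
+    print(edge)                 # [0:0] NP -> * Det N
+    print(edge.nextsym())       # Det  (the symbol after the dot)
+
+    edge = edge.move_dot_forward(1)
+    print(edge)                 # [0:1] NP -> Det * N
+    print(edge.is_complete())   # False (the dot is not at the end)
+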
+########################################################################
+##  Chart
+########################################################################
+
+class Chart(object):
+    """
+    A blackboard for hypotheses about the syntactic constituents of a
+    sentence.  A chart contains a set of edges, and each edge encodes
+    a single hypothesis about the structure of some portion of the
+    sentence.
+
+    The ``select`` method can be used to select a specific collection
+    of edges.  For example ``chart.select(is_complete=True, start=0)``
+    yields all complete edges whose start indices are 0.  To ensure
+    the efficiency of these selection operations, ``Chart`` dynamically
+    creates and maintains an index for each set of attributes that
+    have been selected on.
+
+    In order to reconstruct the trees that are represented by an edge,
+    the chart associates each edge with a set of child pointer lists.
+    A child pointer list is a list of the edges that license an
+    edge's right-hand side.
+
+    :ivar _tokens: The sentence that the chart covers.
+    :ivar _num_leaves: The number of tokens.
+    :ivar _edges: A list of the edges in the chart
+    :ivar _edge_to_cpls: A dictionary mapping each edge to a set
+        of child pointer lists that are associated with that edge.
+    :ivar _indexes: A dictionary mapping tuples of edge attributes
+        to indices, where each index maps the corresponding edge
+        attribute values to lists of edges.
+    """
+    def __init__(self, tokens):
+        """
+        Construct a new chart. The chart is initialized with the
+        leaf edges corresponding to the terminal leaves.
+
+        :type tokens: list
+        :param tokens: The sentence that this chart will be used to parse.
+        """
+        # Record the sentence token and the sentence length.
+        self._tokens = tuple(tokens)
+        self._num_leaves = len(self._tokens)
+
+        # Initialise the chart.
+        self.initialize()
+
+    def initialize(self):
+        """
+        Clear the chart.
+        """
+        # A list of edges contained in this chart.
+        self._edges = []
+
+        # The set of child pointer lists associated with each edge.
+        self._edge_to_cpls = {}
+
+        # Indexes mapping attribute values to lists of edges
+        # (used by select()).
+        self._indexes = {}
+
+    #////////////////////////////////////////////////////////////
+    # Sentence Access
+    #////////////////////////////////////////////////////////////
+
+    def num_leaves(self):
+        """
+        Return the number of words in this chart's sentence.
+
+        :rtype: int
+        """
+        return self._num_leaves
+
+    def leaf(self, index):
+        """
+        Return the leaf value of the word at the given index.
+
+        :rtype: str
+        """
+        return self._tokens[index]
+
+    def leaves(self):
+        """
+        Return a list of the leaf values of each word in the
+        chart's sentence.
+
+        :rtype: list(str)
+        """
+        return self._tokens
+
+    #////////////////////////////////////////////////////////////
+    # Edge access
+    #////////////////////////////////////////////////////////////
+
+    def edges(self):
+        """
+        Return a list of all edges in this chart.  New edges
+        that are added to the chart after the call to edges()
+        will *not* be contained in this list.
+
+        :rtype: list(EdgeI)
+        :see: ``iteredges``, ``select``
+        """
+        return self._edges[:]
+
+    def iteredges(self):
+        """
+        Return an iterator over the edges in this chart.  It is
+        not guaranteed that new edges which are added to the
+        chart before the iterator is exhausted will also be generated.
+
+        :rtype: iter(EdgeI)
+        :see: ``edges``, ``select``
+        """
+        return iter(self._edges)
+
+    # Iterating over the chart yields its edges.
+    __iter__ = iteredges
+
+    def num_edges(self):
+        """
+        Return the number of edges contained in this chart.
+
+        :rtype: int
+        """
+        return len(self._edge_to_cpls)
+
+    def select(self, **restrictions):
+        """
+        Return an iterator over the edges in this chart.  Any
+        new edges that are added to the chart before the iterator
+        is exhausted will also be generated.  ``restrictions``
+        can be used to restrict the set of edges that will be
+        generated.
+
+        :param span: Only generate edges ``e`` where ``e.span()==span``
+        :param start: Only generate edges ``e`` where ``e.start()==start``
+        :param end: Only generate edges ``e`` where ``e.end()==end``
+        :param length: Only generate edges ``e`` where ``e.length()==length``
+        :param lhs: Only generate edges ``e`` where ``e.lhs()==lhs``
+        :param rhs: Only generate edges ``e`` where ``e.rhs()==rhs``
+        :param nextsym: Only generate edges ``e`` where
+            ``e.nextsym()==nextsym``
+        :param dot: Only generate edges ``e`` where ``e.dot()==dot``
+        :param is_complete: Only generate edges ``e`` where
+            ``e.is_complete()==is_complete``
+        :param is_incomplete: Only generate edges ``e`` where
+            ``e.is_incomplete()==is_incomplete``
+        :rtype: iter(EdgeI)
+        """
+        # If there are no restrictions, then return all edges.
+        if restrictions=={}: return iter(self._edges)
+
+        # Find the index corresponding to the given restrictions.
+        restr_keys = sorted(restrictions.keys())
+        restr_keys = tuple(restr_keys)
+
+        # If it doesn't exist, then create it.
+        if restr_keys not in self._indexes:
+            self._add_index(restr_keys)
+
+        vals = tuple(restrictions[key] for key in restr_keys)
+        return iter(self._indexes[restr_keys].get(vals, []))
+
+    def _add_index(self, restr_keys):
+        """
+        A helper function for ``select``, which creates a new index for
+        a given set of attributes (aka restriction keys).
+        """
+        # Make sure it's a valid index.
+        for key in restr_keys:
+            if not hasattr(EdgeI, key):
+                raise ValueError('Bad restriction: %s' % key)
+
+        # Create the index.
+        index = self._indexes[restr_keys] = {}
+
+        # Add all existing edges to the index.
+        for edge in self._edges:
+            vals = tuple(getattr(edge, key)() for key in restr_keys)
+            index.setdefault(vals, []).append(edge)
+
+    def _register_with_indexes(self, edge):
+        """
+        A helper function for ``insert``, which registers the new
+        edge with all existing indexes.
+        """
+        for (restr_keys, index) in self._indexes.items():
+            vals = tuple(getattr(edge, key)() for key in restr_keys)
+            index.setdefault(vals, []).append(edge)
+
+    #////////////////////////////////////////////////////////////
+    # Edge Insertion
+    #////////////////////////////////////////////////////////////
+
+    def insert_with_backpointer(self, new_edge, previous_edge, child_edge):
+        """
+        Add a new edge to the chart, using a pointer to the previous edge.
+        """
+        cpls = self.child_pointer_lists(previous_edge)
+        new_cpls = [cpl+(child_edge,) for cpl in cpls]
+        return self.insert(new_edge, *new_cpls)
+
+    def insert(self, edge, *child_pointer_lists):
+        """
+        Add a new edge to the chart, and return True if this operation
+        modified the chart.  In particular, return true iff the chart
+        did not already contain ``edge``, or if it did not already associate
+        ``child_pointer_lists`` with ``edge``.
+
+        :type edge: EdgeI
+        :param edge: The new edge
+        :type child_pointer_lists: sequence of tuple(EdgeI)
+        :param child_pointer_lists: A sequence of lists of the edges that
+            were used to form this edge.  This list is used to reconstruct
+            the trees (or partial trees) that are associated with ``edge``.
+        :rtype: bool
+        """
+        # Is it a new edge?
+        if edge not in self._edge_to_cpls:
+            # Add it to the list of edges.
+            self._append_edge(edge)
+            # Register with indexes.
+            self._register_with_indexes(edge)
+
+        # Get the set of child pointer lists for this edge.
+        cpls = self._edge_to_cpls.setdefault(edge, OrderedDict())
+        chart_was_modified = False
+        for child_pointer_list in child_pointer_lists:
+            child_pointer_list = tuple(child_pointer_list)
+            if child_pointer_list not in cpls:
+                # It's a new CPL; register it, and return true.
+                cpls[child_pointer_list] = True
+                chart_was_modified = True
+        return chart_was_modified
+
+    def _append_edge(self, edge):
+        self._edges.append(edge)
+
+    #////////////////////////////////////////////////////////////
+    # Tree extraction & child pointer lists
+    #////////////////////////////////////////////////////////////
+
+    def parses(self, root, tree_class=Tree):
+        """
+        Return an iterator of the complete tree structures that span
+        the entire chart, and whose root node is ``root``.
+        """
+        for edge in self.select(start=0, end=self._num_leaves, lhs=root):
+            for tree in self.trees(edge, tree_class=tree_class, complete=True):
+                yield tree
+
+    def trees(self, edge, tree_class=Tree, complete=False):
+        """
+        Return an iterator of the tree structures that are associated
+        with ``edge``.
+
+        If ``edge`` is incomplete, then the unexpanded children will be
+        encoded as childless subtrees, whose node value is the
+        corresponding terminal or nonterminal.
+
+        :rtype: list(Tree)
+        :note: If two trees share a common subtree, then the same
+            Tree may be used to encode that subtree in
+            both trees.  If you need to eliminate this subtree
+            sharing, then create a deep copy of each tree.
+        """
+        return iter(self._trees(edge, complete, memo={}, tree_class=tree_class))
+
+    def _trees(self, edge, complete, memo, tree_class):
+        """
+        A helper function for ``trees``.
+
+        :param memo: A dictionary used to record the trees that we've
+            generated for each edge, so that when we see an edge more
+            than once, we can reuse the same trees.
+        """
+        # If we've seen this edge before, then reuse our old answer.
+        if edge in memo:
+            return memo[edge]
+
+        # when we're reading trees off the chart, don't use incomplete edges
+        if complete and edge.is_incomplete():
+            return []
+
+        # Leaf edges.
+        if isinstance(edge, LeafEdge):
+            leaf = self._tokens[edge.start()]
+            memo[edge] = [leaf]
+            return [leaf]
+
+        # Until we're done computing the trees for edge, set
+        # memo[edge] to be empty.  This has the effect of filtering
+        # out any cyclic trees (i.e., trees that contain themselves as
+        # descendants), because if we reach this edge via a cycle,
+        # then it will appear that the edge doesn't generate any trees.
+        memo[edge] = []
+        trees = []
+        lhs = edge.lhs().symbol()
+
+        # Each child pointer list can be used to form trees.
+        for cpl in self.child_pointer_lists(edge):
+            # Get the set of child choices for each child pointer.
+            # child_choices[i] is the set of choices for the tree's
+            # ith child.
+            child_choices = [self._trees(cp, complete, memo, tree_class)
+                             for cp in cpl]
+
+            # For each combination of children, add a tree.
+            for children in itertools.product(*child_choices):
+                trees.append(tree_class(lhs, children))
+
+        # If the edge is incomplete, then extend it with "partial trees":
+        if edge.is_incomplete():
+            unexpanded = [tree_class(elt,[])
+                          for elt in edge.rhs()[edge.dot():]]
+            for tree in trees:
+                tree.extend(unexpanded)
+
+        # Update the memoization dictionary.
+        memo[edge] = trees
+
+        # Return the list of trees.
+        return trees
+
+    def child_pointer_lists(self, edge):
+        """
+        Return the set of child pointer lists for the given edge.
+        Each child pointer list is a list of edges that have
+        been used to form this edge.
+
+        :rtype: list(list(EdgeI))
+        """
+        # Make a copy, in case they modify it.
+        return self._edge_to_cpls.get(edge, {}).keys()
+
+    #////////////////////////////////////////////////////////////
+    # Display
+    #////////////////////////////////////////////////////////////
+    def pretty_format_edge(self, edge, width=None):
+        """
+        Return a pretty-printed string representation of a given edge
+        in this chart.
+
+        :rtype: str
+        :param width: The number of characters allotted to each
+            index in the sentence.
+        """
+        if width is None: width = 50 // (self.num_leaves()+1)
+        (start, end) = (edge.start(), edge.end())
+
+        str = '|' + ('.'+' '*(width-1))*start
+
+        # Zero-width edges are "#" if complete, ">" if incomplete
+        if start == end:
+            if edge.is_complete(): str += '#'
+            else: str += '>'
+
+        # Spanning complete edges are "[===]"; Other edges are
+        # "[---]" if complete, "[--->" if incomplete
+        elif edge.is_complete() and edge.span() == (0,self._num_leaves):
+            str += '['+('='*width)*(end-start-1) + '='*(width-1)+']'
+        elif edge.is_complete():
+            str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+']'
+        else:
+            str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+'>'
+
+        str += (' '*(width-1)+'.')*(self._num_leaves-end)
+        return str + '| %s' % edge
+
+    def pretty_format_leaves(self, width=None):
+        """
+        Return a pretty-printed string representation of this
+        chart's leaves.  This string can be used as a header
+        for calls to ``pretty_format_edge``.
+        """
+        if width is None: width = 50 // (self.num_leaves()+1)
+
+        if self._tokens is not None and width>1:
+            header = '|.'
+            for tok in self._tokens:
+                header += tok[:width-1].center(width-1)+'.'
+            header += '|'
+        else:
+            header = ''
+
+        return header
+
+    def pretty_format(self, width=None):
+        """
+        Return a pretty-printed string representation of this chart.
+
+        :param width: The number of characters allotted to each
+            index in the sentence.
+        :rtype: str
+        """
+        if width is None: width = 50 // (self.num_leaves()+1)
+        # sort edges: primary key=length, secondary key=start index.
+        # (and filter out the token edges)
+        edges = sorted([(e.length(), e.start(), e) for e in self])
+        edges = [e for (_,_,e) in edges]
+
+        return (self.pretty_format_leaves(width) + '\n' +
+                '\n'.join(self.pretty_format_edge(edge, width) for edge in edges))
+
+    #////////////////////////////////////////////////////////////
+    # Display: Dot (AT&T Graphviz)
+    #////////////////////////////////////////////////////////////
+
+    def dot_digraph(self):
+        # Header
+        s = 'digraph nltk_chart {\n'
+        #s += '  size="5,5";\n'
+        s += '  rankdir=LR;\n'
+        s += '  node [height=0.1,width=0.1];\n'
+        s += '  node [style=filled, color="lightgray"];\n'
+
+        # Set up the nodes
+        for y in range(self.num_edges(), -1, -1):
+            if y == 0:
+                s += '  node [style=filled, color="black"];\n'
+            for x in range(self.num_leaves()+1):
+                if y == 0 or (x <= self._edges[y-1].start() or
+                              x >= self._edges[y-1].end()):
+                    s += '  %04d.%04d [label=""];\n' % (x,y)
+
+        # Add a spacer
+        s += '  x [style=invis]; x->0000.0000 [style=invis];\n'
+
+        # Declare ranks.
+        for x in range(self.num_leaves()+1):
+            s += '  {rank=same;'
+            for y in range(self.num_edges()+1):
+                if y == 0 or (x <= self._edges[y-1].start() or
+                              x >= self._edges[y-1].end()):
+                    s += ' %04d.%04d' % (x,y)
+            s += '}\n'
+
+        # Add the leaves
+        s += '  edge [style=invis, weight=100];\n'
+        s += '  node [shape=plaintext]\n'
+        s += '  0000.0000'
+        for x in range(self.num_leaves()):
+            s += '->%s->%04d.0000' % (self.leaf(x), x+1)
+        s += ';\n\n'
+
+        # Add the edges
+        s += '  edge [style=solid, weight=1];\n'
+        for y, edge in enumerate(self):
+            for x in range(edge.start()):
+                s += ('  %04d.%04d -> %04d.%04d [style="invis"];\n' %
+                      (x, y+1, x+1, y+1))
+            s += ('  %04d.%04d -> %04d.%04d [label="%s"];\n' %
+                  (edge.start(), y+1, edge.end(), y+1, edge))
+            for x in range(edge.end(), self.num_leaves()):
+                s += ('  %04d.%04d -> %04d.%04d [style="invis"];\n' %
+                      (x, y+1, x+1, y+1))
+        s += '}\n'
+        return s
+
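+# --- Editor's illustrative sketch (not part of the upstream NLTK module) ---
+# Building a ``Chart`` by hand and querying it with ``select()``.  The first
+# call with a given set of restrictions builds an index; later calls with the
+# same restrictions reuse it.  The three-word sentence is hypothetical.
+
+def _chart_usage_sketch():
+    tokens = 'the dog barks'.split()
+    chart = Chart(tokens)
+
+    # Insert a leaf edge for every token (this is what LeafInitRule does).
+    for index, token in enumerate(tokens):
+        chart.insert(LeafEdge(token, index), ())
+
+    print(chart.num_edges())                  # 3
+    for edge in chart.select(is_complete=True, start=0):
+        print(edge)                           # [0:1] 'the'
+    print(chart.pretty_format())              # ASCII rendering of the chart
+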
+########################################################################
+##  Chart Rules
+########################################################################
+
+class ChartRuleI(object):
+    """
+    A rule that specifies what new edges are licensed by any given set
+    of existing edges.  Each chart rule expects a fixed number of
+    edges, as indicated by the class variable ``NUM_EDGES``.  In
+    particular:
+
+    - A chart rule with ``NUM_EDGES=0`` specifies what new edges are
+      licensed, regardless of existing edges.
+    - A chart rule with ``NUM_EDGES=1`` specifies what new edges are
+      licensed by a single existing edge.
+    - A chart rule with ``NUM_EDGES=2`` specifies what new edges are
+      licensed by a pair of existing edges.
+
+    :type NUM_EDGES: int
+    :cvar NUM_EDGES: The number of existing edges that this rule uses
+        to license new edges.  Typically, this number ranges from zero
+        to two.
+    """
+    def apply(self, chart, grammar, *edges):
+        """
+        Return a generator that will add edges licensed by this rule
+        and the given edges to the chart, one at a time.  Each
+        time the generator is resumed, it will either add a new
+        edge and yield that edge; or return.
+
+        :type edges: list(EdgeI)
+        :param edges: A set of existing edges.  The number of edges
+            that should be passed to ``apply()`` is specified by the
+            ``NUM_EDGES`` class variable.
+        :rtype: iter(EdgeI)
+        """
+        raise NotImplementedError()
+
+    def apply_everywhere(self, chart, grammar):
+        """
+        Return a generator that will add all edges licensed by
+        this rule, given the edges that are currently in the
+        chart, one at a time.  Each time the generator is resumed,
+        it will either add a new edge and yield that edge; or return.
+
+        :rtype: iter(EdgeI)
+        """
+        raise NotImplementedError()
+
+
+@python_2_unicode_compatible
+class AbstractChartRule(ChartRuleI):
+    """
+    An abstract base class for chart rules.  ``AbstractChartRule``
+    provides:
+
+    - A default implementation for ``apply``.
+    - A default implementation for ``apply_everywhere``
+      (currently, this implementation assumes that ``NUM_EDGES`` <= 3).
+    - A default implementation for ``__str__``, which returns a
+      name based on the rule's class name.
+    """
+
+    # Subclasses must define apply.
+    def apply(self, chart, grammar, *edges):
+        raise NotImplementedError()
+
+    # Default: loop through the given number of edges, and call
+    # self.apply() for each set of edges.
+    def apply_everywhere(self, chart, grammar):
+        if self.NUM_EDGES == 0:
+            for new_edge in self.apply(chart, grammar):
+                yield new_edge
+
+        elif self.NUM_EDGES == 1:
+            for e1 in chart:
+                for new_edge in self.apply(chart, grammar, e1):
+                    yield new_edge
+
+        elif self.NUM_EDGES == 2:
+            for e1 in chart:
+                for e2 in chart:
+                    for new_edge in self.apply(chart, grammar, e1, e2):
+                        yield new_edge
+
+        elif self.NUM_EDGES == 3:
+            for e1 in chart:
+                for e2 in chart:
+                    for e3 in chart:
+                        for new_edge in self.apply(chart,grammar,e1,e2,e3):
+                            yield new_edge
+
+        else:
+            raise AssertionError('NUM_EDGES>3 is not currently supported')
+
+    # Default: return a name based on the class name.
+    def __str__(self):
+        # Add spaces between InitialCapsWords.
+        return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
+
+#////////////////////////////////////////////////////////////
+# Fundamental Rule
+#////////////////////////////////////////////////////////////
+
+class FundamentalRule(AbstractChartRule):
+    """
+    A rule that joins two adjacent edges to form a single combined
+    edge.  In particular, this rule specifies that any pair of edges
+
+    - ``[A -> alpha \* B beta][i:j]``
+    - ``[B -> gamma \*][j:k]``
+
+    licenses the edge:
+
+    - ``[A -> alpha B \* beta][i:k]``
+    """
+    NUM_EDGES = 2
+    def apply(self, chart, grammar, left_edge, right_edge):
+        # Make sure the rule is applicable.
+        if not (left_edge.is_incomplete() and
+                right_edge.is_complete() and
+                left_edge.end() == right_edge.start() and
+                left_edge.nextsym() == right_edge.lhs()):
+            return
+
+        # Construct the new edge.
+        new_edge = left_edge.move_dot_forward(right_edge.end())
+
+        # Insert it into the chart.
+        if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
+            yield new_edge
+
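+# --- Editor's illustrative sketch (not part of the upstream NLTK module) ---
+# Applying the fundamental rule to one pair of adjacent edges by hand.  The
+# NP/Det/N fragment and the two-word sentence are hypothetical; the grammar
+# argument is unused by this rule, so None is passed.
+
+def _fundamental_rule_sketch():
+    from nltk.grammar import Nonterminal
+
+    NP, Det, N = Nonterminal('NP'), Nonterminal('Det'), Nonterminal('N')
+    chart = Chart('the dog'.split())
+
+    # [0:0] NP -> * Det N   (incomplete, waiting for a Det starting at 0)
+    left = TreeEdge(span=(0, 0), lhs=NP, rhs=[Det, N], dot=0)
+    # [0:1] Det -> 'the' *  (complete Det covering the first token)
+    right = TreeEdge(span=(0, 1), lhs=Det, rhs=['the'], dot=1)
+    chart.insert(left, ())
+    chart.insert(right, ())
+
+    for new_edge in FundamentalRule().apply(chart, None, left, right):
+        print(new_edge)        # [0:1] NP -> Det * N
+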
+class SingleEdgeFundamentalRule(FundamentalRule):
+    """
+    A rule that joins a given edge with adjacent edges in the chart,
+    to form combined edges.  In particular, this rule specifies that
+    either of the edges:
+
+    - ``[A -> alpha \* B beta][i:j]``
+    - ``[B -> gamma \*][j:k]``
+
+    licenses the edge:
+
+    - ``[A -> alpha B \* beta][i:k]``
+
+    if the other edge is already in the chart.
+
+    :note: This is basically ``FundamentalRule``, with one edge left
+        unspecified.
+    """
+    NUM_EDGES = 1
+
+    def apply(self, chart, grammar, edge):
+        if edge.is_incomplete():
+            for new_edge in self._apply_incomplete(chart, grammar, edge):
+                yield new_edge
+        else:
+            for new_edge in self._apply_complete(chart, grammar, edge):
+                yield new_edge
+
+    def _apply_complete(self, chart, grammar, right_edge):
+        for left_edge in chart.select(end=right_edge.start(),
+                                      is_complete=False,
+                                      nextsym=right_edge.lhs()):
+            new_edge = left_edge.move_dot_forward(right_edge.end())
+            if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
+                yield new_edge
+
+    def _apply_incomplete(self, chart, grammar, left_edge):
+        for right_edge in chart.select(start=left_edge.end(),
+                                       is_complete=True,
+                                       lhs=left_edge.nextsym()):
+            new_edge = left_edge.move_dot_forward(right_edge.end())
+            if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
+                yield new_edge
+
+#////////////////////////////////////////////////////////////
+# Inserting Terminal Leafs
+#////////////////////////////////////////////////////////////
+
+class LeafInitRule(AbstractChartRule):
+    NUM_EDGES=0
+    def apply(self, chart, grammar):
+        for index in range(chart.num_leaves()):
+            new_edge = LeafEdge(chart.leaf(index), index)
+            if chart.insert(new_edge, ()):
+                yield new_edge
+
+#////////////////////////////////////////////////////////////
+# Top-Down Prediction
+#////////////////////////////////////////////////////////////
+
+class TopDownInitRule(AbstractChartRule):
+    """
+    A rule licensing edges corresponding to the grammar productions for
+    the grammar's start symbol.  In particular, this rule specifies that
+    ``[S -> \* alpha][0:0]`` is licensed for each grammar production
+    ``S -> alpha``, where ``S`` is the grammar's start symbol.
+    """
+    NUM_EDGES = 0
+    def apply(self, chart, grammar):
+        for prod in grammar.productions(lhs=grammar.start()):
+            new_edge = TreeEdge.from_production(prod, 0)
+            if chart.insert(new_edge, ()):
+                yield new_edge
+
+class TopDownPredictRule(AbstractChartRule):
+    """
+    A rule licensing edges corresponding to the grammar productions
+    for the nonterminal following an incomplete edge's dot.  In
+    particular, this rule specifies that
+    ``[A -> alpha \* B beta][i:j]`` licenses the edge
+    ``[B -> \* gamma][j:j]`` for each grammar production ``B -> gamma``.
+
+    :note: This rule corresponds to the Predictor Rule in Earley parsing.
+    """
+    NUM_EDGES = 1
+    def apply(self, chart, grammar, edge):
+        if edge.is_complete(): return
+        for prod in grammar.productions(lhs=edge.nextsym()):
+            new_edge = TreeEdge.from_production(prod, edge.end())
+            if chart.insert(new_edge, ()):
+                yield new_edge
+
+class CachedTopDownPredictRule(TopDownPredictRule):
+    """
+    A cached version of ``TopDownPredictRule``.  After the first time
+    this rule is applied to an edge with a given ``end`` and ``next``,
+    it will not generate any more edges for edges with that ``end`` and
+    ``next``.
+
+    If ``chart`` or ``grammar`` are changed, then the cache is flushed.
+    """
+    def __init__(self):
+        TopDownPredictRule.__init__(self)
+        self._done = {}
+
+    def apply(self, chart, grammar, edge):
+        if edge.is_complete(): return
+        nextsym, index = edge.nextsym(), edge.end()
+        if not is_nonterminal(nextsym): return
+
+        # If we've already applied this rule to an edge with the same
+        # next & end, and the chart & grammar have not changed, then
+        # just return (no new edges to add).
+        done = self._done.get((nextsym, index), (None,None))
+        if done[0] is chart and done[1] is grammar: return
+
+        # Add all the edges indicated by the top down expand rule.
+        for prod in grammar.productions(lhs=nextsym):
+            # If the left corner in the predicted production is
+            # leaf, it must match with the input.
+            if prod.rhs():
+                first = prod.rhs()[0]
+                if is_terminal(first):
+                    if index >= chart.num_leaves() or first != chart.leaf(index): continue
+
+            new_edge = TreeEdge.from_production(prod, index)
+            if chart.insert(new_edge, ()):
+                yield new_edge
+
+        # Record the fact that we've applied this rule.
+        self._done[nextsym, index] = (chart, grammar)
+
+#////////////////////////////////////////////////////////////
+# Bottom-Up Prediction
+#////////////////////////////////////////////////////////////
+
+class BottomUpPredictRule(AbstractChartRule):
+    """
+    A rule licensing any edge corresponding to a production whose
+    right-hand side begins with a complete edge's left-hand side.  In
+    particular, this rule specifies that ``[A -> alpha \*]`` licenses
+    the edge ``[B -> \* A beta]`` for each grammar production ``B -> A beta``.
+    """
+    NUM_EDGES = 1
+    def apply(self, chart, grammar, edge):
+        if edge.is_incomplete(): return
+        for prod in grammar.productions(rhs=edge.lhs()):
+            new_edge = TreeEdge.from_production(prod, edge.start())
+            if chart.insert(new_edge, ()):
+                yield new_edge
+
+class BottomUpPredictCombineRule(BottomUpPredictRule):
+    """
+    A rule licensing any edge corresponding to a production whose
+    right-hand side begins with a complete edge's left-hand side.  In
+    particular, this rule specifies that ``[A -> alpha \*]``
+    licenses the edge ``[B -> A \* beta]`` for each grammar
+    production ``B -> A beta``.
+
+    :note: This is like ``BottomUpPredictRule``, but it also applies
+        the ``FundamentalRule`` to the resulting edge.
+    """
+    NUM_EDGES = 1
+    def apply(self, chart, grammar, edge):
+        if edge.is_incomplete(): return
+        for prod in grammar.productions(rhs=edge.lhs()):
+            new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1)
+            if chart.insert(new_edge, (edge,)):
+                yield new_edge
+
+class EmptyPredictRule(AbstractChartRule):
+    """
+    A rule that inserts all empty productions as passive edges,
+    in every position in the chart.
+    """
+    NUM_EDGES = 0
+    def apply(self, chart, grammar):
+        for prod in grammar.productions(empty=True):
+            for index in range(chart.num_leaves() + 1):
+                new_edge = TreeEdge.from_production(prod, index)
+                if chart.insert(new_edge, ()):
+                    yield new_edge
+
+
+########################################################################
+##  Filtered Bottom Up
+########################################################################
+
+class FilteredSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
+    def _apply_complete(self, chart, grammar, right_edge):
+        end = right_edge.end()
+        nexttoken = end < chart.num_leaves() and chart.leaf(end)
+        for left_edge in chart.select(end=right_edge.start(),
+                                      is_complete=False,
+                                      nextsym=right_edge.lhs()):
+            if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()):
+                new_edge = left_edge.move_dot_forward(right_edge.end())
+                if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
+                    yield new_edge
+
+    def _apply_incomplete(self, chart, grammar, left_edge):
+        for right_edge in chart.select(start=left_edge.end(),
+                                       is_complete=True,
+                                       lhs=left_edge.nextsym()):
+            end = right_edge.end()
+            nexttoken = end < chart.num_leaves() and chart.leaf(end)
+            if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()):
+                new_edge = left_edge.move_dot_forward(right_edge.end())
+                if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
+                    yield new_edge
+
+class FilteredBottomUpPredictCombineRule(BottomUpPredictCombineRule):
+    def apply(self, chart, grammar, edge):
+        if edge.is_incomplete():
+            return
+
+        end = edge.end()
+        nexttoken = end < chart.num_leaves() and chart.leaf(end)
+        for prod in grammar.productions(rhs=edge.lhs()):
+            if _bottomup_filter(grammar, nexttoken, prod.rhs()):
+                new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1)
+                if chart.insert(new_edge, (edge,)):
+                    yield new_edge
+
+def _bottomup_filter(grammar, nexttoken, rhs, dot=0):
+    if len(rhs) <= dot + 1:
+        return True
+    _next = rhs[dot + 1]
+    if is_terminal(_next):
+        return nexttoken == _next
+    else:
+        return grammar.is_leftcorner(_next, nexttoken)
+
+
+########################################################################
+##  Generic Chart Parser
+########################################################################
+
+TD_STRATEGY = [LeafInitRule(),
+               TopDownInitRule(),
+               CachedTopDownPredictRule(),
+               SingleEdgeFundamentalRule()]
+BU_STRATEGY = [LeafInitRule(),
+               EmptyPredictRule(),
+               BottomUpPredictRule(),
+               SingleEdgeFundamentalRule()]
+BU_LC_STRATEGY = [LeafInitRule(),
+                  EmptyPredictRule(),
+                  BottomUpPredictCombineRule(),
+                  SingleEdgeFundamentalRule()]
+
+LC_STRATEGY = [LeafInitRule(),
+               FilteredBottomUpPredictCombineRule(),
+               FilteredSingleEdgeFundamentalRule()]
+
+class ChartParser(ParserI):
+    """
+    A generic chart parser.  A "strategy", or list of
+    ``ChartRuleI`` instances, is used to decide what edges to add to
+    the chart.  In particular, ``ChartParser`` uses the following
+    algorithm to parse texts:
+
+    | Until no new edges are added:
+    |   For each *rule* in *strategy*:
+    |     Apply *rule* to any applicable edges in the chart.
+    | Return any complete parses in the chart
+    """
+    def __init__(self, grammar, strategy=BU_LC_STRATEGY, trace=0,
+                 trace_chart_width=50, use_agenda=True, chart_class=Chart):
+        """
+        Create a new chart parser, that uses ``grammar`` to parse
+        texts.
+
+        :type grammar: CFG
+        :param grammar: The grammar used to parse texts.
+        :type strategy: list(ChartRuleI)
+        :param strategy: A list of rules that should be used to decide
+            what edges to add to the chart (the bottom-up left-corner
+            strategy, ``BU_LC_STRATEGY``, by default).
+        :type trace: int
+        :param trace: The level of tracing that should be used when
+            parsing a text.  ``0`` will generate no tracing output;
+            and higher numbers will produce more verbose tracing
+            output.
+        :type trace_chart_width: int
+        :param trace_chart_width: The default total width reserved for
+            the chart in trace output.  The remainder of each line will
+            be used to display edges.
+        :type use_agenda: bool
+        :param use_agenda: Use an optimized agenda-based algorithm,
+            if possible.
+        :param chart_class: The class that should be used to create
+            the parse charts.
+        """
+        self._grammar = grammar
+        self._strategy = strategy
+        self._trace = trace
+        self._trace_chart_width = trace_chart_width
+        # If the strategy only consists of axioms (NUM_EDGES==0) and
+        # inference rules (NUM_EDGES==1), we can use an agenda-based algorithm:
+        self._use_agenda = use_agenda
+        self._chart_class = chart_class
+
+        self._axioms = []
+        self._inference_rules = []
+        for rule in strategy:
+            if rule.NUM_EDGES == 0:
+                self._axioms.append(rule)
+            elif rule.NUM_EDGES == 1:
+                self._inference_rules.append(rule)
+            else:
+                self._use_agenda = False
+
+    def grammar(self):
+        return self._grammar
+
+    def _trace_new_edges(self, chart, rule, new_edges, trace, edge_width):
+        if not trace: return
+        print_rule_header = trace > 1
+        for edge in new_edges:
+            if print_rule_header:
+                print('%s:' % rule)
+                print_rule_header = False
+            print(chart.pretty_format_edge(edge, edge_width))
+
+    def chart_parse(self, tokens, trace=None):
+        """
+        Return the final parse ``Chart`` from which all possible
+        parse trees can be extracted.
+
+        :param tokens: The sentence to be parsed
+        :type tokens: list(str)
+        :rtype: Chart
+        """
+        if trace is None: trace = self._trace
+        trace_new_edges = self._trace_new_edges
+
+        tokens = list(tokens)
+        self._grammar.check_coverage(tokens)
+        chart = self._chart_class(tokens)
+        grammar = self._grammar
+
+        # Width, for printing trace edges.
+        trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
+        if trace: print(chart.pretty_format_leaves(trace_edge_width))
+
+        if self._use_agenda:
+            # Use an agenda-based algorithm.
+            for axiom in self._axioms:
+                new_edges = list(axiom.apply(chart, grammar))
+                trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width)
+
+            inference_rules = self._inference_rules
+            agenda = chart.edges()
+            # We reverse the initial agenda, since it is a stack
+            # but chart.edges() functions as a queue.
+            agenda.reverse()
+            while agenda:
+                edge = agenda.pop()
+                for rule in inference_rules:
+                    new_edges = list(rule.apply(chart, grammar, edge))
+                    if trace:
+                        trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
+                    agenda += new_edges
+
+        else:
+            # Do not use an agenda-based algorithm.
+            edges_added = True
+            while edges_added:
+                edges_added = False
+                for rule in self._strategy:
+                    new_edges = list(rule.apply_everywhere(chart, grammar))
+                    edges_added = len(new_edges)
+                    trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
+
+        # Return the final chart.
+        return chart
+
+    def parse(self, tokens, tree_class=Tree):
+        chart = self.chart_parse(tokens)
+        return iter(chart.parses(self._grammar.start(), tree_class=tree_class))
+
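+# Usage sketch (relies only on names defined in this module; ``demo_grammar``
+# is defined near the bottom of the file).  Shown as comments so that the
+# module has no import-time side effects:
+#
+#     from nltk.parse.chart import ChartParser, BU_LC_STRATEGY, demo_grammar
+#
+#     parser = ChartParser(demo_grammar(), strategy=BU_LC_STRATEGY)
+#     for tree in parser.parse('I saw John with a dog'.split()):
+#         print(tree)
+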
+class TopDownChartParser(ChartParser):
+    """
+    A ``ChartParser`` using a top-down parsing strategy.
+    See ``ChartParser`` for more information.
+    """
+    def __init__(self, grammar, **parser_args):
+        ChartParser.__init__(self, grammar, TD_STRATEGY, **parser_args)
+
+class BottomUpChartParser(ChartParser):
+    """
+    A ``ChartParser`` using a bottom-up parsing strategy.
+    See ``ChartParser`` for more information.
+    """
+    def __init__(self, grammar, **parser_args):
+        if isinstance(grammar, PCFG):
+            warnings.warn("BottomUpChartParser only works for CFG, "
+                          "use BottomUpProbabilisticChartParser instead",
+                          category=DeprecationWarning)
+        ChartParser.__init__(self, grammar, BU_STRATEGY, **parser_args)
+
+class BottomUpLeftCornerChartParser(ChartParser):
+    """
+    A ``ChartParser`` using a bottom-up left-corner parsing strategy.
+    This strategy is often more efficient than standard bottom-up.
+    See ``ChartParser`` for more information.
+    """
+    def __init__(self, grammar, **parser_args):
+        ChartParser.__init__(self, grammar, BU_LC_STRATEGY, **parser_args)
+
+class LeftCornerChartParser(ChartParser):
+    def __init__(self, grammar, **parser_args):
+        if not grammar.is_nonempty():
+            raise ValueError("LeftCornerParser only works for grammars "
+                             "without empty productions.")
+        ChartParser.__init__(self, grammar, LC_STRATEGY, **parser_args)
+
+########################################################################
+##  Stepping Chart Parser
+########################################################################
+
+class SteppingChartParser(ChartParser):
+    """
+    A ``ChartParser`` that allows you to step through the parsing
+    process, adding a single edge at a time.  It also allows you to
+    change the parser's strategy or grammar midway through parsing a
+    text.
+
+    The ``initialize`` method is used to start parsing a text.  ``step``
+    adds a single edge to the chart.  ``set_strategy`` changes the
+    strategy used by the chart parser.  ``parses`` returns the set of
+    parses that has been found by the chart parser.
+
+    :ivar _restart: Records whether the parser's strategy, grammar,
+        or chart has been changed.  If so, then ``step`` must restart
+        the parsing algorithm.
+    """
+    def __init__(self, grammar, strategy=[], trace=0):
+        self._chart = None
+        self._current_chartrule = None
+        self._restart = False
+        ChartParser.__init__(self, grammar, strategy, trace)
+
+    #////////////////////////////////////////////////////////////
+    # Initialization
+    #////////////////////////////////////////////////////////////
+
+    def initialize(self, tokens):
+        "Begin parsing the given tokens."
+        self._chart = Chart(list(tokens))
+        self._restart = True
+
+    #////////////////////////////////////////////////////////////
+    # Stepping
+    #////////////////////////////////////////////////////////////
+
+    def step(self):
+        """
+        Return a generator that adds edges to the chart, one at a
+        time.  Each time the generator is resumed, it adds a single
+        edge and yields that edge.  If no more edges can be added,
+        then it yields None.
+
+        If the parser's strategy, grammar, or chart is changed, then
+        the generator will continue adding edges using the new
+        strategy, grammar, or chart.
+
+        Note that this generator never terminates, since the grammar
+        or strategy might be changed to values that would add new
+        edges.  Instead, it yields None when no more edges can be
+        added with the current strategy and grammar.
+        """
+        if self._chart is None:
+            raise ValueError('Parser must be initialized first')
+        while True:
+            self._restart = False
+            w = 50 // (self._chart.num_leaves()+1)
+
+            for e in self._parse():
+                if self._trace > 1: print(self._current_chartrule)
+                if self._trace > 0: print(self._chart.pretty_format_edge(e,w))
+                yield e
+                if self._restart: break
+            else:
+                yield None # No more edges.
+
+    def _parse(self):
+        """
+        A generator that implements the actual parsing algorithm.
+        ``step`` iterates through this generator, and restarts it
+        whenever the parser's strategy, grammar, or chart is modified.
+        """
+        chart = self._chart
+        grammar = self._grammar
+        edges_added = 1
+        while edges_added > 0:
+            edges_added = 0
+            for rule in self._strategy:
+                self._current_chartrule = rule
+                for e in rule.apply_everywhere(chart, grammar):
+                    edges_added += 1
+                    yield e
+
+    #////////////////////////////////////////////////////////////
+    # Accessors
+    #////////////////////////////////////////////////////////////
+
+    def strategy(self):
+        "Return the strategy used by this parser."
+        return self._strategy
+
+    def grammar(self):
+        "Return the grammar used by this parser."
+        return self._grammar
+
+    def chart(self):
+        "Return the chart that is used by this parser."
+        return self._chart
+
+    def current_chartrule(self):
+        "Return the chart rule used to generate the most recent edge."
+        return self._current_chartrule
+
+    def parses(self, tree_class=Tree):
+        "Return the parse trees currently contained in the chart."
+        return self._chart.parses(self._grammar.start(), tree_class)
+
+    #////////////////////////////////////////////////////////////
+    # Parser modification
+    #////////////////////////////////////////////////////////////
+
+    def set_strategy(self, strategy):
+        """
+        Change the strategy that the parser uses to decide which edges
+        to add to the chart.
+
+        :type strategy: list(ChartRuleI)
+        :param strategy: A list of rules that should be used to decide
+            what edges to add to the chart.
+        """
+        if strategy == self._strategy: return
+        self._strategy = strategy[:] # Make a copy.
+        self._restart = True
+
+    def set_grammar(self, grammar):
+        "Change the grammar used by the parser."
+        if grammar is self._grammar: return
+        self._grammar = grammar
+        self._restart = True
+
+    def set_chart(self, chart):
+        "Load a given chart into the chart parser."
+        if chart is self._chart: return
+        self._chart = chart
+        self._restart = True
+
+    #////////////////////////////////////////////////////////////
+    # Standard parser methods
+    #////////////////////////////////////////////////////////////
+
+    def parse(self, tokens, tree_class=Tree):
+        tokens = list(tokens)
+        self._grammar.check_coverage(tokens)
+
+        # Initialize ourselves.
+        self.initialize(tokens)
+
+        # Step until no more edges are generated.
+        for e in self.step():
+            if e is None: break
+
+        # Return an iterator of complete parses.
+        return self.parses(tree_class=tree_class)
+
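+# Usage sketch of the stepping parser (again using names defined in this
+# module): initialization is separated from edge generation, so the strategy
+# can be swapped between steps; demo() below does the same on a larger scale:
+#
+#     cp = SteppingChartParser(demo_grammar(), strategy=TD_STRATEGY)
+#     cp.initialize('I saw John with a dog'.split())
+#     stepper = cp.step()
+#     edge = next(stepper)           # adds a single edge to the chart
+#     cp.set_strategy(BU_STRATEGY)   # switch strategies mid-parse
+#     for edge in stepper:
+#         if edge is None:           # no more edges can be added
+#             break
+#     trees = list(cp.parses())
+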
+########################################################################
+##  Demo Code
+########################################################################
+
+def demo_grammar():
+    from nltk.grammar import CFG
+    return CFG.fromstring("""
+S  -> NP VP
+PP -> "with" NP
+NP -> NP PP
+VP -> VP PP
+VP -> Verb NP
+VP -> Verb
+NP -> Det Noun
+NP -> "John"
+NP -> "I"
+Det -> "the"
+Det -> "my"
+Det -> "a"
+Noun -> "dog"
+Noun -> "cookie"
+Verb -> "ate"
+Verb -> "saw"
+Prep -> "with"
+Prep -> "under"
+""")
+
+def demo(choice=None,
+         print_times=True, print_grammar=False,
+         print_trees=True, trace=2,
+         sent='I saw John with a dog with my cookie', numparses=5):
+    """
+    A demonstration of the chart parsers.
+    """
+    import sys, time
+    from nltk import nonterminals, Production, CFG
+
+    # The grammar for ChartParser and SteppingChartParser:
+    grammar = demo_grammar()
+    if print_grammar:
+        print("* Grammar")
+        print(grammar)
+
+    # Tokenize the sample sentence.
+    print("* Sentence:")
+    print(sent)
+    tokens = sent.split()
+    print(tokens)
+    print()
+
+    # Ask the user which parser to test,
+    # if the parser wasn't provided as an argument
+    if choice is None:
+        print('  1: Top-down chart parser')
+        print('  2: Bottom-up chart parser')
+        print('  3: Bottom-up left-corner chart parser')
+        print('  4: Left-corner chart parser with bottom-up filter')
+        print('  5: Stepping chart parser (alternating top-down & bottom-up)')
+        print('  6: All parsers')
+        print('\nWhich parser (1-6)? ', end=' ')
+        choice = sys.stdin.readline().strip()
+        print()
+
+    choice = str(choice)
+    if choice not in "123456":
+        print('Bad parser number')
+        return
+
+    # Keep track of how long each parser takes.
+    times = {}
+
+    strategies = {'1': ('Top-down', TD_STRATEGY),
+                  '2': ('Bottom-up', BU_STRATEGY),
+                  '3': ('Bottom-up left-corner', BU_LC_STRATEGY),
+                  '4': ('Filtered left-corner', LC_STRATEGY)}
+    choices = []
+    if choice in strategies: choices = [choice]
+    if choice=='6': choices = "1234"
+
+    # Run the requested chart parser(s), except the stepping parser.
+    for strategy in choices:
+        print("* Strategy: " + strategies[strategy][0])
+        print()
+        cp = ChartParser(grammar, strategies[strategy][1], trace=trace)
+        t = time.time()
+        chart = cp.chart_parse(tokens)
+        parses = list(chart.parses(grammar.start()))
+
+        times[strategies[strategy][0]] = time.time()-t
+        print("Nr edges in chart:", len(chart.edges()))
+        if numparses:
+            assert len(parses)==numparses, 'Not all parses found'
+        if print_trees:
+            for tree in parses: print(tree)
+        else:
+            print("Nr trees:", len(parses))
+        print()
+
+    # Run the stepping parser, if requested.
+    if choice in "56":
+        print("* Strategy: Stepping (top-down vs bottom-up)")
+        print()
+        t = time.time()
+        cp = SteppingChartParser(grammar, trace=trace)
+        cp.initialize(tokens)
+        for i in range(5):
+            print('*** SWITCH TO TOP DOWN')
+            cp.set_strategy(TD_STRATEGY)
+            for j, e in enumerate(cp.step()):
+                if j>20 or e is None: break
+            print('*** SWITCH TO BOTTOM UP')
+            cp.set_strategy(BU_STRATEGY)
+            for j, e in enumerate(cp.step()):
+                if j>20 or e is None: break
+        times['Stepping'] = time.time()-t
+        print("Nr edges in chart:", len(cp.chart().edges()))
+        if numparses:
+            assert len(list(cp.parses()))==numparses, 'Not all parses found'
+        if print_trees:
+            for tree in cp.parses(): print(tree)
+        else:
+            print("Nr trees:", len(list(cp.parses())))
+        print()
+
+    # Print the times of all parsers:
+    if not (print_times and times): return
+    print("* Parsing times")
+    print()
+    maxlen = max(len(key) for key in times)
+    format = '%' + repr(maxlen) + 's parser: %6.3fsec'
+    times_items = times.items()
+    for (parser, t) in sorted(times_items, key=lambda a:a[1]):
+        print(format % (parser, t))
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/parse/chart.pyc b/nlp_resource_data/nltk/parse/chart.pyc
new file mode 100755 (executable)
index 0000000..5d502cf
Binary files /dev/null and b/nlp_resource_data/nltk/parse/chart.pyc differ
diff --git a/nlp_resource_data/nltk/parse/corenlp.py b/nlp_resource_data/nltk/parse/corenlp.py
new file mode 100755 (executable)
index 0000000..f6043ef
--- /dev/null
@@ -0,0 +1,716 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Interface to the CoreNLP REST API.
+#
+# Copyright (C) 2001-2016 NLTK Project
+# Author: Dmitrijs Milajevs <dimazest@gmail.com>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import unicode_literals
+
+import re
+import json
+import time
+import socket
+
+from nltk.internals import find_jar_iter, config_java, java, _java_options
+
+from nltk.parse.api import ParserI
+from nltk.tokenize.api import TokenizerI
+from nltk.parse.dependencygraph import DependencyGraph
+from nltk.tree import Tree
+
+_stanford_url = 'http://stanfordnlp.github.io/CoreNLP/'
+
+
+class CoreNLPServerError(EnvironmentError):
+    """Exceptions associated with the Core NLP server."""
+
+
+def try_port(port=0):
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    sock.bind(('', port))
+
+    p = sock.getsockname()[1]
+    sock.close()
+
+    return p
+
+
+class CoreNLPServer(object):
+
+    _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar'
+    _JAR = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar'
+
+    def __init__(
+        self, path_to_jar=None, path_to_models_jar=None, verbose=False,
+        java_options=None, corenlp_options=None, port=None,
+    ):
+
+        if corenlp_options is None:
+            corenlp_options = [
+                '-preload', 'tokenize,ssplit,pos,lemma,parse,depparse',
+            ]
+
+        jars = list(find_jar_iter(
+            self._JAR,
+            path_to_jar,
+            env_vars=('CORENLP', ),
+            searchpath=(),
+            url=_stanford_url,
+            verbose=verbose,
+            is_regex=True,
+        ))
+
+        # find the most recent code and model jar
+        stanford_jar = max(
+            jars,
+            key=lambda model_name: re.match(self._JAR, model_name)
+        )
+
+        if port is None:
+            try:
+                port = try_port(9000)
+            except socket.error:
+                port = try_port()
+                corenlp_options.append(str(port))
+        else:
+            try_port(port)
+
+        self.url = 'http://localhost:{}'.format(port)
+
+        model_jar = max(
+            find_jar_iter(
+                self._MODEL_JAR_PATTERN,
+                path_to_models_jar,
+                env_vars=('CORENLP_MODELS', ),
+                searchpath=(),
+                url=_stanford_url,
+                verbose=verbose,
+                is_regex=True,
+            ),
+            key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name)
+        )
+
+        self.verbose = verbose
+
+        self._classpath = stanford_jar, model_jar
+
+        self.corenlp_options = corenlp_options
+        self.java_options = java_options or ['-mx2g']
+
+    def start(self):
+        import requests
+
+        cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer']
+
+        if self.corenlp_options:
+            cmd.extend(self.corenlp_options)
+
+        # Configure java.
+        default_options = ' '.join(_java_options)
+        config_java(options=self.java_options, verbose=self.verbose)
+
+        try:
+            # TODO: it's probably a bad idea to pipe stdout, as it will
+            #       accumulate when lots of text is being parsed.
+            self.popen = java(
+                cmd,
+                classpath=self._classpath,
+                blocking=False,
+                stdout='pipe',
+                stderr='pipe',
+            )
+        finally:
+            # Return java configurations to their default values.
+            config_java(options=default_options, verbose=self.verbose)
+
+        # Check that the server is still running.
+        returncode = self.popen.poll()
+        if returncode is not None:
+            _, stderrdata = self.popen.communicate()
+            raise CoreNLPServerError(
+                returncode,
+                'Could not start the server. '
+                'The error was: {}'.format(stderrdata.decode('ascii'))
+            )
+
+        for i in range(30):
+            try:
+                response = requests.get(requests.compat.urljoin(self.url, 'live'))
+            except requests.exceptions.ConnectionError:
+                time.sleep(1)
+            else:
+                if response.ok:
+                    break
+        else:
+            raise CoreNLPServerError(
+                'Could not connect to the server.'
+            )
+
+        for i in range(60):
+            try:
+                response = requests.get(requests.compat.urljoin(self.url, 'ready'))
+            except requests.exceptions.ConnectionError:
+                time.sleep(1)
+            else:
+                if response.ok:
+                    break
+        else:
+            raise CoreNLPServerError(
+                'The server is not ready.'
+            )
+
+    def stop(self):
+        self.popen.terminate()
+        self.popen.wait()
+
+    def __enter__(self):
+        self.start()
+
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.stop()
+        return False
+
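+# Usage sketch (assumes the CoreNLP jars can be located, e.g. via the CORENLP
+# and CORENLP_MODELS environment variables, and that the ``requests`` package
+# is installed).  The server is a context manager, so it is stopped
+# automatically when the block is left:
+#
+#     with CoreNLPServer() as server:
+#         parser = CoreNLPParser(url=server.url)
+#         tree = next(parser.raw_parse('The quick brown fox jumps over the lazy dog.'))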
+
+class GenericCoreNLPParser(ParserI, TokenizerI):
+    """Interface to the CoreNLP Parser."""
+
+    def __init__(self, url='http://localhost:9000', encoding='utf8'):
+        import requests
+
+        self.url = url
+        self.encoding = encoding
+
+        self.session = requests.Session()
+
+    def parse_sents(self, sentences, *args, **kwargs):
+        """Parse multiple sentences.
+
+        Takes multiple sentences as a list where each sentence is a list of
+        words. Each sentence will be automatically tagged with this
+        CoreNLPParser instance's tagger.
+
+        If whitespace exists inside a token, the token will be treated as
+        several tokens.
+
+        :param sentences: Input sentences to parse
+        :type sentences: list(list(str))
+        :rtype: iter(iter(Tree))
+        """
+        # Converting list(list(str)) -> list(str)
+        sentences = (' '.join(words) for words in sentences)
+        return self.raw_parse_sents(sentences, *args, **kwargs)
+
+    def raw_parse(self, sentence, properties=None, *args, **kwargs):
+        """Parse a sentence.
+
+        Takes a sentence as a string; before parsing, it will be automatically
+        tokenized and tagged by the CoreNLP Parser.
+
+        :param sentence: Input sentence to parse
+        :type sentence: str
+        :rtype: iter(Tree)
+        """
+        default_properties = {
+            'tokenize.whitespace': 'false',
+        }
+        default_properties.update(properties or {})
+
+        return next(
+            self.raw_parse_sents(
+                [sentence],
+                properties=default_properties,
+                *args,
+                **kwargs
+            )
+        )
+
+    def api_call(self, data, properties=None):
+        default_properties = {
+            'outputFormat': 'json',
+            'annotators': 'tokenize,pos,lemma,ssplit,{parser_annotator}'.format(
+                parser_annotator=self.parser_annotator,
+            ),
+        }
+
+        default_properties.update(properties or {})
+
+        response = self.session.post(
+            self.url,
+            params={
+                'properties': json.dumps(default_properties),
+            },
+            data=data.encode(self.encoding),
+            timeout=60,
+        )
+
+        response.raise_for_status()
+
+        return response.json()
+
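+    # Sketch of calling ``api_call`` directly with custom properties (assumes
+    # a CoreNLP server is reachable at ``self.url``; the exact JSON layout of
+    # the response depends on the CoreNLP version):
+    #
+    #     parser = CoreNLPParser(url='http://localhost:9000')
+    #     result = parser.api_call(
+    #         'The quick brown fox jumps over the lazy dog.',
+    #         properties={'annotators': 'tokenize,ssplit,pos'},
+    #     )
+    #     result['sentences'][0]['tokens'][0]['word']   # -> 'The'
+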
+    def raw_parse_sents(
+        self,
+        sentences,
+        verbose=False,
+        properties=None,
+        *args,
+        **kwargs
+    ):
+        """Parse multiple sentences.
+
+        Takes multiple sentences as a list of strings. Each sentence will be
+        automatically tokenized and tagged.
+
+        :param sentences: Input sentences to parse.
+        :type sentences: list(str)
+        :rtype: iter(iter(Tree))
+
+        """
+        default_properties = {
+            # Only splits on '\n', never inside the sentence.
+            'ssplit.ssplit.eolonly': 'true',
+        }
+
+        default_properties.update(properties or {})
+
+        """
+        for sentence in sentences:
+            parsed_data = self.api_call(sentence, properties=default_properties)
+
+            assert len(parsed_data['sentences']) == 1
+
+            for parse in parsed_data['sentences']:
+                tree = self.make_tree(parse)
+                yield iter([tree])
+        """
+        parsed_data = self.api_call('\n'.join(sentences), properties=default_properties)
+        for parsed_sent in parsed_data['sentences']:
+            tree = self.make_tree(parsed_sent)
+            yield iter([tree])
+
+
+    def parse_text(self, text, *args, **kwargs):
+        """Parse a piece of text.
+
+        The text might contain several sentences which will be split by CoreNLP.
+
+        :param str text: text to be split.
+        :returns: an iterable of syntactic structures.  # TODO: should it be an iterable of iterables?
+
+        """
+        parsed_data = self.api_call(text, *args, **kwargs)
+
+        for parse in parsed_data['sentences']:
+            yield self.make_tree(parse)
+
+    def tokenize(self, text, properties=None):
+        """Tokenize a string of text.
+
+        >>> parser = CoreNLPParser(url='http://localhost:9000')
+
+        >>> text = 'Good muffins cost $3.88\\nin New York.  Please buy me\\ntwo of them.\\nThanks.'
+        >>> list(parser.tokenize(text))
+        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
+
+        >>> s = "The colour of the wall is blue."
+        >>> list(
+        ...     parser.tokenize(
+        ...         'The colour of the wall is blue.',
+        ...             properties={'tokenize.options': 'americanize=true'},
+        ...     )
+        ... )
+        ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
+
+        """
+        default_properties = {
+            'annotators': 'tokenize,ssplit',
+        }
+
+        default_properties.update(properties or {})
+
+        result = self.api_call(text, properties=default_properties)
+
+        for sentence in result['sentences']:
+            for token in sentence['tokens']:
+                yield token['originalText'] or token['word']
+
+
+class CoreNLPParser(GenericCoreNLPParser):
+    """
+    >>> parser = CoreNLPParser(url='http://localhost:9000')
+
+    >>> next(
+    ...     parser.raw_parse('The quick brown fox jumps over the lazy dog.')
+    ... ).pretty_print()  # doctest: +NORMALIZE_WHITESPACE
+                         ROOT
+                          |
+                          S
+           _______________|__________________________
+          |                         VP               |
+          |                _________|___             |
+          |               |             PP           |
+          |               |     ________|___         |
+          NP              |    |            NP       |
+      ____|__________     |    |     _______|____    |
+     DT   JJ    JJ   NN  VBZ   IN   DT      JJ   NN  .
+     |    |     |    |    |    |    |       |    |   |
+    The quick brown fox jumps over the     lazy dog  .
+
+    >>> (parse_fox, ), (parse_wolf, ) = parser.raw_parse_sents(
+    ...     [
+    ...         'The quick brown fox jumps over the lazy dog.',
+    ...         'The quick grey wolf jumps over the lazy fox.',
+    ...     ]
+    ... )
+
+    >>> parse_fox.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
+                         ROOT
+                          |
+                          S
+           _______________|__________________________
+          |                         VP               |
+          |                _________|___             |
+          |               |             PP           |
+          |               |     ________|___         |
+          NP              |    |            NP       |
+      ____|__________     |    |     _______|____    |
+     DT   JJ    JJ   NN  VBZ   IN   DT      JJ   NN  .
+     |    |     |    |    |    |    |       |    |   |
+    The quick brown fox jumps over the     lazy dog  .
+
+    >>> parse_wolf.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
+                         ROOT
+                          |
+                          S
+           _______________|__________________________
+          |                         VP               |
+          |                _________|___             |
+          |               |             PP           |
+          |               |     ________|___         |
+          NP              |    |            NP       |
+      ____|_________      |    |     _______|____    |
+     DT   JJ   JJ   NN   VBZ   IN   DT      JJ   NN  .
+     |    |    |    |     |    |    |       |    |   |
+    The quick grey wolf jumps over the     lazy fox  .
+
+    >>> (parse_dog, ), (parse_friends, ) = parser.parse_sents(
+    ...     [
+    ...         "I 'm a dog".split(),
+    ...         "This is my friends ' cat ( the tabby )".split(),
+    ...     ]
+    ... )
+
+    >>> parse_dog.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
+            ROOT
+             |
+             S
+      _______|____
+     |            VP
+     |    ________|___
+     NP  |            NP
+     |   |         ___|___
+    PRP VBP       DT      NN
+     |   |        |       |
+     I   'm       a      dog
+
+    >>> parse_friends.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
+         ROOT
+          |
+          S
+      ____|___________
+     |                VP
+     |     ___________|_____________
+     |    |                         NP
+     |    |                  _______|_________
+     |    |                 NP               PRN
+     |    |            _____|_______      ____|______________
+     NP   |           NP            |    |        NP         |
+     |    |     ______|_________    |    |     ___|____      |
+     DT  VBZ  PRP$   NNS       POS  NN -LRB-  DT       NN  -RRB-
+     |    |    |      |         |   |    |    |        |     |
+    This  is   my  friends      '  cat -LRB- the     tabby -RRB-
+
+    >>> parse_john, parse_mary, = parser.parse_text(
+    ...     'John loves Mary. Mary walks.'
+    ... )
+
+    >>> parse_john.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
+          ROOT
+           |
+           S
+      _____|_____________
+     |          VP       |
+     |      ____|___     |
+     NP    |        NP   |
+     |     |        |    |
+    NNP   VBZ      NNP   .
+     |     |        |    |
+    John loves     Mary  .
+
+    >>> parse_mary.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
+          ROOT
+           |
+           S
+      _____|____
+     NP    VP   |
+     |     |    |
+    NNP   VBZ   .
+     |     |    |
+    Mary walks  .
+
+    Special cases
+    -------------
+
+    >>> next(
+    ...     parser.raw_parse(
+    ...         'NASIRIYA, Iraq—Iraqi doctors who treated former prisoner of war '
+    ...         'Jessica Lynch have angrily dismissed claims made in her biography '
+    ...         'that she was raped by her Iraqi captors.'
+    ...     )
+    ... ).height()
+    20
+
+    >>> next(
+    ...     parser.raw_parse(
+    ...         "The broader Standard & Poor's 500 Index <.SPX> was 0.46 points lower, or "
+    ...         '0.05 percent, at 997.02.'
+    ...     )
+    ... ).height()
+    9
+
+    """
+
+    _OUTPUT_FORMAT = 'penn'
+    parser_annotator = 'parse'
+
+    def make_tree(self, result):
+        return Tree.fromstring(result['parse'])
+
+
+class CoreNLPDependencyParser(GenericCoreNLPParser):
+    """Dependency parser.
+
+    >>> dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
+
+    >>> parse, = dep_parser.raw_parse(
+    ...     'The quick brown fox jumps over the lazy dog.'
+    ... )
+    >>> print(parse.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
+    The     DT      4       det
+    quick   JJ      4       amod
+    brown   JJ      4       amod
+    fox     NN      5       nsubj
+    jumps   VBZ     0       ROOT
+    over    IN      9       case
+    the     DT      9       det
+    lazy    JJ      9       amod
+    dog     NN      5       nmod
+    .       .       5       punct
+
+    >>> print(parse.tree())  # doctest: +NORMALIZE_WHITESPACE
+    (jumps (fox The quick brown) (dog over the lazy) .)
+
+    >>> for governor, dep, dependent in parse.triples():
+    ...     print(governor, dep, dependent)  # doctest: +NORMALIZE_WHITESPACE
+        ('jumps', 'VBZ') nsubj ('fox', 'NN')
+        ('fox', 'NN') det ('The', 'DT')
+        ('fox', 'NN') amod ('quick', 'JJ')
+        ('fox', 'NN') amod ('brown', 'JJ')
+        ('jumps', 'VBZ') nmod ('dog', 'NN')
+        ('dog', 'NN') case ('over', 'IN')
+        ('dog', 'NN') det ('the', 'DT')
+        ('dog', 'NN') amod ('lazy', 'JJ')
+        ('jumps', 'VBZ') punct ('.', '.')
+
+    >>> (parse_fox, ), (parse_dog, ) = dep_parser.raw_parse_sents(
+    ...     [
+    ...         'The quick brown fox jumps over the lazy dog.',
+    ...         'The quick grey wolf jumps over the lazy fox.',
+    ...     ]
+    ... )
+    >>> print(parse_fox.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
+    The DT      4       det
+    quick       JJ      4       amod
+    brown       JJ      4       amod
+    fox NN      5       nsubj
+    jumps       VBZ     0       ROOT
+    over        IN      9       case
+    the DT      9       det
+    lazy        JJ      9       amod
+    dog NN      5       nmod
+    .   .       5       punct
+
+    >>> print(parse_dog.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
+    The DT      4       det
+    quick       JJ      4       amod
+    grey        JJ      4       amod
+    wolf        NN      5       nsubj
+    jumps       VBZ     0       ROOT
+    over        IN      9       case
+    the DT      9       det
+    lazy        JJ      9       amod
+    fox NN      5       nmod
+    .   .       5       punct
+
+    >>> (parse_dog, ), (parse_friends, ) = dep_parser.parse_sents(
+    ...     [
+    ...         "I 'm a dog".split(),
+    ...         "This is my friends ' cat ( the tabby )".split(),
+    ...     ]
+    ... )
+    >>> print(parse_dog.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
+    I   PRP     4       nsubj
+    'm  VBP     4       cop
+    a   DT      4       det
+    dog NN      0       ROOT
+
+    >>> print(parse_friends.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
+    This        DT      6       nsubj
+    is  VBZ     6       cop
+    my  PRP$    4       nmod:poss
+    friends     NNS     6       nmod:poss
+    '   POS     4       case
+    cat NN      0       ROOT
+    -LRB-       -LRB-   9       punct
+    the DT      9       det
+    tabby       NN      6       appos
+    -RRB-       -RRB-   9       punct
+
+    >>> parse_john, parse_mary, = dep_parser.parse_text(
+    ...     'John loves Mary. Mary walks.'
+    ... )
+
+    >>> print(parse_john.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
+    John        NNP     2       nsubj
+    loves       VBZ     0       ROOT
+    Mary        NNP     2       dobj
+    .   .       2       punct
+
+    >>> print(parse_mary.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
+    Mary        NNP     2       nsubj
+    walks       VBZ     0       ROOT
+    .   .       2       punct
+
+    Special cases
+    -------------
+
+    Non-breaking space inside of a token.
+
+    >>> len(
+    ...     next(
+    ...         dep_parser.raw_parse(
+    ...             'Anhalt said children typically treat a 20-ounce soda bottle as one '
+    ...             'serving, while it actually contains 2 1/2 servings.'
+    ...         )
+    ...     ).nodes
+    ... )
+    21
+
+    Phone numbers.
+
+    >>> len(
+    ...     next(
+    ...         dep_parser.raw_parse('This is not going to crash: 01 111 555.')
+    ...     ).nodes
+    ... )
+    10
+
+    >>> print(
+    ...     next(
+    ...         dep_parser.raw_parse('The underscore _ should not simply disappear.')
+    ...     ).to_conll(4)
+    ... )  # doctest: +NORMALIZE_WHITESPACE
+    The         DT  3   det
+    underscore  VBP 3   amod
+    _           NN  7   nsubj
+    should      MD  7   aux
+    not         RB  7   neg
+    simply      RB  7   advmod
+    disappear   VB  0   ROOT
+    .           .   7   punct
+
+    >>> print(
+    ...     '\\n'.join(
+    ...         next(
+    ...             dep_parser.raw_parse(
+    ...                 'for all of its insights into the dream world of teen life , and its electronic expression through '
+    ...                 'cyber culture , the film gives no quarter to anyone seeking to pull a cohesive story out of its 2 '
+    ...                 '1/2-hour running time .'
+    ...             )
+    ...         ).to_conll(4).split('\\n')[-8:]
+    ...     )
+    ... )
+    its        PRP$    40      nmod:poss
+    2 1/2     CD      40      nummod
+    -  :       40      punct
+    hour       NN      31      nmod
+    running    VBG     42      amod
+    time       NN      40      dep
+    .  .       24      punct
+    <BLANKLINE>
+
+    """
+
+    _OUTPUT_FORMAT = 'conll2007'
+    parser_annotator = 'depparse'
+
+    def make_tree(self, result):
+
+        return DependencyGraph(
+            (
+                ' '.join(n_items[1:])  # NLTK expects an iterable of strings...
+                for n_items in sorted(transform(result))
+            ),
+            cell_separator=' ',  # To make sure that a non-breaking space is kept inside of a token.
+        )
+
+
+def transform(sentence):
+    for dependency in sentence['basicDependencies']:
+
+        dependent_index = dependency['dependent']
+        token = sentence['tokens'][dependent_index - 1]
+
+        # Return values that we don't know as '_'. Also, consider tag and ctag
+        # to be equal.
+        yield (
+            dependent_index,
+            '_',
+            token['word'],
+            token['lemma'],
+            token['pos'],
+            token['pos'],
+            '_',
+            str(dependency['governor']),
+            dependency['dep'],
+            '_',
+            '_',
+        )
+
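+# Illustration of the JSON shape that ``transform`` expects for one sentence
+# (a minimal sketch; field names follow the code above, the values are
+# hypothetical and real CoreNLP tokens carry many more fields):
+#
+#     sentence = {
+#         'basicDependencies': [
+#             {'dep': 'ROOT', 'governor': 0, 'dependent': 2},
+#             {'dep': 'nsubj', 'governor': 2, 'dependent': 1},
+#             {'dep': 'dobj', 'governor': 2, 'dependent': 3},
+#         ],
+#         'tokens': [
+#             {'word': 'John', 'lemma': 'John', 'pos': 'NNP'},
+#             {'word': 'loves', 'lemma': 'love', 'pos': 'VBZ'},
+#             {'word': 'Mary', 'lemma': 'Mary', 'pos': 'NNP'},
+#         ],
+#     }
+#     sorted(transform(sentence))[0]
+#     # -> (1, '_', 'John', 'John', 'NNP', 'NNP', '_', '2', 'nsubj', '_', '_')
+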
+
+def setup_module(module):
+    from nose import SkipTest
+
+    global server
+    try:
+        server = CoreNLPServer(port=9000)
+    except LookupError as e:
+        raise SkipTest('Could not instantiate CoreNLPServer.')
+
+    try:
+        server.start()
+    except CoreNLPServerError as e:
+        raise SkipTest(
+            'Skipping CoreNLP tests because the server could not be started. '
+            'Make sure that the 9000 port is free. '
+            '{}'.format(e.strerror)
+        )
+
+
+def teardown_module(module):
+    server.stop()
diff --git a/nlp_resource_data/nltk/parse/corenlp.pyc b/nlp_resource_data/nltk/parse/corenlp.pyc
new file mode 100755 (executable)
index 0000000..b9fe849
Binary files /dev/null and b/nlp_resource_data/nltk/parse/corenlp.pyc differ
diff --git a/nlp_resource_data/nltk/parse/dependencygraph.py b/nlp_resource_data/nltk/parse/dependencygraph.py
new file mode 100755 (executable)
index 0000000..6fadad9
--- /dev/null
@@ -0,0 +1,760 @@
+# Natural Language Toolkit: Dependency Grammars
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Jason Narad <jason.narad@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com> (modifications)
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+"""
+Tools for reading and writing dependency trees.
+The input is assumed to be in Malt-TAB format
+(http://stp.lingfil.uu.se/~nivre/research/MaltXML.html).
+"""
+from __future__ import print_function, unicode_literals
+
+from collections import defaultdict
+from itertools import chain
+from pprint import pformat
+import subprocess
+import warnings
+
+from six import string_types
+
+from nltk.tree import Tree
+from nltk.compat import python_2_unicode_compatible
+
+
+#################################################################
+# DependencyGraph Class
+#################################################################
+
+
+@python_2_unicode_compatible
+class DependencyGraph(object):
+    """
+    A container for the nodes and labelled edges of a dependency structure.
+    """
+
+    def __init__(self, tree_str=None, cell_extractor=None, zero_based=False, cell_separator=None, top_relation_label='ROOT'):
+        """Dependency graph.
+
+        We place a dummy `TOP` node with the index 0, since the root node is
+        often assigned 0 as its head. This also means that the indexing of the
+        nodes corresponds directly to the Malt-TAB format, which starts at 1.
+
+        If ``zero_based`` is True, the input is assumed to be Malt-TAB-like
+        with node numbers starting at 0 and the root node assigned -1 (as
+        produced by, e.g., zpar).
+
+        :param str cell_separator: the cell separator. If not provided, cells
+        are split by whitespace.
+
+        :param str top_relation_label: the label by which the top relation is
+        identified, for example, `ROOT`, `null` or `TOP`.
+
+        """
+        self.nodes = defaultdict(lambda:  {'address': None,
+                                           'word': None,
+                                           'lemma': None,
+                                           'ctag': None,
+                                           'tag': None,
+                                           'feats': None,
+                                           'head': None,
+                                           'deps': defaultdict(list),
+                                           'rel': None,
+                                           })
+
+        self.nodes[0].update(
+            {
+                'ctag': 'TOP',
+                'tag': 'TOP',
+                'address': 0,
+            }
+        )
+
+        self.root = None
+
+        if tree_str:
+            self._parse(
+                tree_str,
+                cell_extractor=cell_extractor,
+                zero_based=zero_based,
+                cell_separator=cell_separator,
+                top_relation_label=top_relation_label,
+            )
+
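+    # Construction sketch: a graph can be built directly from Malt-TAB style
+    # text, one token per line.  Using the 4-cell format (word, tag, head, rel):
+    #
+    #     dg = DependencyGraph(
+    #         'John\tNNP\t2\tnsubj\n'
+    #         'loves\tVBZ\t0\tROOT\n'
+    #         'Mary\tNNP\t2\tdobj\n'
+    #     )
+    #     print(dg.tree())   # -> (loves John Mary)
+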
+    def remove_by_address(self, address):
+        """
+        Removes the node with the given address.  References
+        to this node in others will still exist.
+        """
+        del self.nodes[address]
+
+    def redirect_arcs(self, originals, redirect):
+        """
+        Redirects arcs to any of the nodes in the originals list
+        to the redirect node address.
+        """
+        for node in self.nodes.values():
+            new_deps = []
+            for dep in node['deps']:
+                if dep in originals:
+                    new_deps.append(redirect)
+                else:
+                    new_deps.append(dep)
+            node['deps'] = new_deps
+
+    def add_arc(self, head_address, mod_address):
+        """
+        Adds an arc from the node specified by head_address to the
+        node specified by the mod address.
+        """
+        relation = self.nodes[mod_address]['rel']
+        self.nodes[head_address]['deps'].setdefault(relation, [])
+        self.nodes[head_address]['deps'][relation].append(mod_address)
+        #self.nodes[head_address]['deps'].append(mod_address)
+
+
+    def connect_graph(self):
+        """
+        Fully connects all non-root nodes.  All nodes are set to be dependents
+        of the root node.
+        """
+        for node1 in self.nodes.values():
+            for node2 in self.nodes.values():
+                if node1['address'] != node2['address'] and node2['rel'] != 'TOP':
+                    relation = node2['rel']
+                    node1['deps'].setdefault(relation, [])
+                    node1['deps'][relation].append(node2['address'])
+                    #node1['deps'].append(node2['address'])
+
+    def get_by_address(self, node_address):
+        """Return the node with the given address."""
+        return self.nodes[node_address]
+
+    def contains_address(self, node_address):
+        """
+        Returns true if the graph contains a node with the given node
+        address, false otherwise.
+        """
+        return node_address in self.nodes
+
+    def to_dot(self):
+        """Return a dot representation suitable for using with Graphviz.
+
+        >>> dg = DependencyGraph(
+        ...     'John N 2\\n'
+        ...     'loves V 0\\n'
+        ...     'Mary N 2'
+        ... )
+        >>> print(dg.to_dot())
+        digraph G{
+        edge [dir=forward]
+        node [shape=plaintext]
+        <BLANKLINE>
+        0 [label="0 (None)"]
+        0 -> 2 [label="ROOT"]
+        1 [label="1 (John)"]
+        2 [label="2 (loves)"]
+        2 -> 1 [label=""]
+        2 -> 3 [label=""]
+        3 [label="3 (Mary)"]
+        }
+
+        """
+        # Start the digraph specification
+        s = 'digraph G{\n'
+        s += 'edge [dir=forward]\n'
+        s += 'node [shape=plaintext]\n'
+
+        # Draw the remaining nodes
+        for node in sorted(self.nodes.values(), key=lambda v: v['address']):
+            s += '\n%s [label="%s (%s)"]' % (node['address'], node['address'], node['word'])
+            for rel, deps in node['deps'].items():
+                for dep in deps:
+                    if rel is not None:
+                        s += '\n%s -> %s [label="%s"]' % (node['address'], dep, rel)
+                    else:
+                        s += '\n%s -> %s ' % (node['address'], dep)
+        s += "\n}"
+
+        return s
+
+    def _repr_svg_(self):
+        """Show SVG representation of the transducer (IPython magic).
+
+        >>> dg = DependencyGraph(
+        ...     'John N 2\\n'
+        ...     'loves V 0\\n'
+        ...     'Mary N 2'
+        ... )
+        >>> dg._repr_svg_().split('\\n')[0]
+        '<?xml version="1.0" encoding="UTF-8" standalone="no"?>'
+
+        """
+        dot_string = self.to_dot()
+
+        try:
+            process = subprocess.Popen(
+                ['dot', '-Tsvg'],
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                universal_newlines=True,
+            )
+        except OSError:
+            raise Exception('Cannot find the dot binary from Graphviz package')
+        out, err = process.communicate(dot_string)
+        if err:
+            raise Exception(
+                'Cannot create svg representation by running dot from string: {}'
+                ''.format(dot_string))
+        return out
+
+    def __str__(self):
+        return pformat(self.nodes)
+
+    def __repr__(self):
+        return "<DependencyGraph with {0} nodes>".format(len(self.nodes))
+
+    @staticmethod
+    def load(filename, zero_based=False, cell_separator=None, top_relation_label='ROOT'):
+        """
+        :param filename: a name of a file in Malt-TAB format
+        :param zero_based: nodes in the input file are numbered starting from 0
+        rather than 1 (as produced by, e.g., zpar)
+        :param str cell_separator: the cell separator. If not provided, cells
+        are split by whitespace.
+        :param str top_relation_label: the label by which the top relation is
+        identified, for example, `ROOT`, `null` or `TOP`.
+
+        :return: a list of DependencyGraphs
+
+        """
+        with open(filename) as infile:
+            return [
+                DependencyGraph(
+                    tree_str,
+                    zero_based=zero_based,
+                    cell_separator=cell_separator,
+                    top_relation_label=top_relation_label,
+                )
+                for tree_str in infile.read().split('\n\n')
+            ]
+
+    def left_children(self, node_index):
+        """
+        Returns the number of left children under the node specified
+        by the given address.
+        """
+        children = chain.from_iterable(self.nodes[node_index]['deps'].values())
+        index = self.nodes[node_index]['address']
+        return sum(1 for c in children if c < index)
+
+    def right_children(self, node_index):
+        """
+        Returns the number of right children under the node specified
+        by the given address.
+        """
+        children = chain.from_iterable(self.nodes[node_index]['deps'].values())
+        index = self.nodes[node_index]['address']
+        return sum(1 for c in children if c > index)
+
+    def add_node(self, node):
+        if not self.contains_address(node['address']):
+            self.nodes[node['address']].update(node)
+
+    def _parse(self, input_, cell_extractor=None, zero_based=False, cell_separator=None, top_relation_label='ROOT'):
+        """Parse a sentence.
+
+        :param extractor: a function that given a tuple of cells returns a
+        7-tuple, where the values are ``word, lemma, ctag, tag, feats, head,
+        rel``.
+
+        :param str cell_separator: the cell separator. If not provided, cells
+        are split by whitespace.
+
+        :param str top_relation_label: the label by which the top relation is
+        identified, for example, `ROOT`, `null` or `TOP`.
+
+        """
+
+        def extract_3_cells(cells, index):
+            word, tag, head = cells
+            return index, word, word, tag, tag, '', head, ''
+
+        def extract_4_cells(cells, index):
+            word, tag, head, rel = cells
+            return index, word, word, tag, tag, '', head, rel
+
+        def extract_7_cells(cells, index):
+            line_index, word, lemma, tag, _, head, rel = cells
+            try:
+                index = int(line_index)
+            except ValueError:
+                # index can't be parsed as an integer, use default
+                pass
+            return index, word, lemma, tag, tag, '', head, rel
+
+        def extract_10_cells(cells, index):
+            line_index, word, lemma, ctag, tag, feats, head, rel, _, _ = cells
+            try:
+                index = int(line_index)
+            except ValueError:
+                # index can't be parsed as an integer, use default
+                pass
+            return index, word, lemma, ctag, tag, feats, head, rel
+
+        extractors = {
+            3: extract_3_cells,
+            4: extract_4_cells,
+            7: extract_7_cells,
+            10: extract_10_cells,
+        }
+
+        if isinstance(input_, string_types):
+            input_ = (line for line in input_.split('\n'))
+
+        lines = (l.rstrip() for l in input_)
+        lines = (l for l in lines if l)
+
+        cell_number = None
+        for index, line in enumerate(lines, start=1):
+            cells = line.split(cell_separator)
+            if cell_number is None:
+                cell_number = len(cells)
+            else:
+                assert cell_number == len(cells)
+
+            if cell_extractor is None:
+                try:
+                    cell_extractor = extractors[cell_number]
+                except KeyError:
+                    raise ValueError(
+                        'Number of tab-delimited fields ({0}) not supported by '
+                        'CoNLL(10) or Malt-Tab(4) format'.format(cell_number)
+                    )
+
+            try:
+                index, word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells, index)
+            except (TypeError, ValueError):
+                # cell_extractor doesn't take 2 arguments or doesn't return 8
+                # values; assume the cell_extractor is an older external
+                # extractor and doesn't accept or return an index.
+                word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells)
+
+            if head == '_':
+                continue
+
+            head = int(head)
+            if zero_based:
+                head += 1
+
+            self.nodes[index].update(
+                {
+                    'address': index,
+                    'word': word,
+                    'lemma': lemma,
+                    'ctag': ctag,
+                    'tag': tag,
+                    'feats': feats,
+                    'head': head,
+                    'rel': rel,
+                }
+            )
+
+            # Make sure that the fake root node has labeled dependencies.
+            if (cell_number == 3) and (head == 0):
+                rel = top_relation_label
+            self.nodes[head]['deps'][rel].append(index)
+
+        if self.nodes[0]['deps'][top_relation_label]:
+            root_address = self.nodes[0]['deps'][top_relation_label][0]
+            self.root = self.nodes[root_address]
+            self.top_relation_label = top_relation_label
+        else:
+            warnings.warn(
+                "The graph doesn't contain a node "
+                "that depends on the root element."
+            )
+
+    def _word(self, node, filter=True):
+        w = node['word']
+        if filter:
+            if w != ',':
+                return w
+        return w
+
+    def _tree(self, i):
+        """ Turn dependency graphs into NLTK trees.
+
+        :param int i: index of a node
+        :return: either a word (if the indexed node is a leaf) or a ``Tree``.
+        """
+        node = self.get_by_address(i)
+        word = node['word']
+        deps = sorted(chain.from_iterable(node['deps'].values()))
+
+        if deps:
+            return Tree(word, [self._tree(dep) for dep in deps])
+        else:
+            return word
+
+    def tree(self):
+        """
+        Starting with the ``root`` node, build a dependency tree using the NLTK
+        ``Tree`` constructor. Dependency labels are omitted.
+        """
+        node = self.root
+
+        word = node['word']
+        deps = sorted(chain.from_iterable(node['deps'].values()))
+        return Tree(word, [self._tree(dep) for dep in deps])
+
+    def triples(self, node=None):
+        """
+        Extract dependency triples of the form:
+        ((head word, head tag), rel, (dep word, dep tag))
+        """
+
+        if not node:
+            node = self.root
+
+        head = (node['word'], node['ctag'])
+        for i in sorted(chain.from_iterable(node['deps'].values())):
+            dep = self.get_by_address(i)
+            yield (head, dep['rel'], (dep['word'], dep['ctag']))
+            for triple in self.triples(node=dep):
+                yield triple
+
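+    # Illustrative usage sketch for ``triples()``, again assuming the
+    # ``treebank_data`` sample below: each item pairs a head (word, ctag) with
+    # a relation label and a dependent (word, ctag).
+    #
+    #     dg = DependencyGraph(treebank_data)
+    #     for head, rel, dep in dg.triples():
+    #         print(head, rel, dep)
+    #
+    # The first triple yielded is (('will', 'MD'), 'SUB', ('Vinken', 'NNP')).
+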
+    def _hd(self, i):
+        try:
+            return self.nodes[i]['head']
+        except IndexError:
+            return None
+
+    def _rel(self, i):
+        try:
+            return self.nodes[i]['rel']
+        except IndexError:
+            return None
+
+    def contains_cycle(self):
+        """Check whether the graph contains a cycle.
+
+        :return: ``False`` if the graph is acyclic; otherwise a list of the
+            node addresses that form a cycle.
+
+        >>> dg = DependencyGraph(treebank_data)
+        >>> dg.contains_cycle()
+        False
+
+        >>> cyclic_dg = DependencyGraph()
+        >>> top = {'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0}
+        >>> child1 = {'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1}
+        >>> child2 = {'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2}
+        >>> child3 = {'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3}
+        >>> child4 = {'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4}
+        >>> cyclic_dg.nodes = {
+        ...     0: top,
+        ...     1: child1,
+        ...     2: child2,
+        ...     3: child3,
+        ...     4: child4,
+        ... }
+        >>> cyclic_dg.root = top
+
+        >>> cyclic_dg.contains_cycle()
+        [3, 1, 2, 4]
+
+        """
+        distances = {}
+
+        for node in self.nodes.values():
+            for dep in node['deps']:
+                key = tuple([node['address'], dep])
+                distances[key] = 1
+
+        for _ in self.nodes:
+            new_entries = {}
+
+            for pair1 in distances:
+                for pair2 in distances:
+                    if pair1[1] == pair2[0]:
+                        key = tuple([pair1[0], pair2[1]])
+                        new_entries[key] = distances[pair1] + distances[pair2]
+
+            for pair in new_entries:
+                distances[pair] = new_entries[pair]
+                if pair[0] == pair[1]:
+                    path = self.get_cycle_path(self.get_by_address(pair[0]), pair[0])
+                    return path
+
+        return False
+
+    def get_cycle_path(self, curr_node, goal_node_index):
+        """Return the list of node addresses on a path from ``curr_node`` back
+        to ``goal_node_index``, or an empty list if no such path exists."""
+        for dep in curr_node['deps']:
+            if dep == goal_node_index:
+                return [curr_node['address']]
+        for dep in curr_node['deps']:
+            path = self.get_cycle_path(self.get_by_address(dep), goal_node_index)
+            if len(path) > 0:
+                path.insert(0, curr_node['address'])
+                return path
+        return []
+
+    def to_conll(self, style):
+        """
+        The dependency graph in CoNLL format.
+
+        :param style: the style to use for the format (3, 4, 10 columns)
+        :type style: int
+        :rtype: str
+        """
+
+        if style == 3:
+            template = '{word}\t{tag}\t{head}\n'
+        elif style == 4:
+            template = '{word}\t{tag}\t{head}\t{rel}\n'
+        elif style == 10:
+            template = '{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n'
+        else:
+            raise ValueError(
+                'Unsupported style ({0}): expected 3, 4 or 10 '
+                'tab-delimited columns'.format(style)
+            )
+
+        return ''.join(
+            template.format(i=i, **node)
+            for i, node in sorted(self.nodes.items())
+            if node['tag'] != 'TOP'
+        )
+
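+    # Illustrative usage sketch for ``to_conll()`` with the 4-column style
+    # (assuming ``treebank_data``): each token becomes one
+    # "word<TAB>tag<TAB>head<TAB>rel" line.
+    #
+    #     dg = DependencyGraph(treebank_data)
+    #     print(dg.to_conll(4))   # first line: "Pierre\tNNP\t2\tNMOD"
+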
+    def nx_graph(self):
+        """Convert the data in a ``nodelist`` into a networkx labeled directed graph."""
+        import networkx
+
+        nx_nodelist = list(range(1, len(self.nodes)))
+        nx_edgelist = [
+            (n, self._hd(n), self._rel(n))
+            for n in nx_nodelist if self._hd(n)
+        ]
+        self.nx_labels = {}
+        for n in nx_nodelist:
+            self.nx_labels[n] = self.nodes[n]['word']
+
+        g = networkx.MultiDiGraph()
+        g.add_nodes_from(nx_nodelist)
+        g.add_edges_from(nx_edgelist)
+
+        return g
+
+
+class DependencyGraphError(Exception):
+    """Dependency graph exception."""
+
+
+def demo():
+    malt_demo()
+    conll_demo()
+    conll_file_demo()
+    cycle_finding_demo()
+
+
+def malt_demo(nx=False):
+    """
+    A demonstration of the result of reading a dependency
+    version of the first sentence of the Penn Treebank.
+    """
+    dg = DependencyGraph("""Pierre  NNP     2       NMOD
+Vinken  NNP     8       SUB
+,       ,       2       P
+61      CD      5       NMOD
+years   NNS     6       AMOD
+old     JJ      2       NMOD
+,       ,       2       P
+will    MD      0       ROOT
+join    VB      8       VC
+the     DT      11      NMOD
+board   NN      9       OBJ
+as      IN      9       VMOD
+a       DT      15      NMOD
+nonexecutive    JJ      15      NMOD
+director        NN      12      PMOD
+Nov.    NNP     9       VMOD
+29      CD      16      NMOD
+.       .       9       VMOD
+""")
+    tree = dg.tree()
+    tree.pprint()
+    if nx:
+        # currently doesn't work
+        import networkx
+        from matplotlib import pylab
+
+        g = dg.nx_graph()
+        g.info()
+        pos = networkx.spring_layout(g, dim=1)
+        networkx.draw_networkx_nodes(g, pos, node_size=50)
+        # networkx.draw_networkx_edges(g, pos, edge_color='k', width=8)
+        networkx.draw_networkx_labels(g, pos, dg.nx_labels)
+        pylab.xticks([])
+        pylab.yticks([])
+        pylab.savefig('tree.png')
+        pylab.show()
+
+
+def conll_demo():
+    """
+    A demonstration of how to read a string representation of
+    a CoNLL format dependency tree.
+    """
+    dg = DependencyGraph(conll_data1)
+    tree = dg.tree()
+    tree.pprint()
+    print(dg)
+    print(dg.to_conll(4))
+
+
+def conll_file_demo():
+    print('Mass conll_read demo...')
+    graphs = [DependencyGraph(entry)
+              for entry in conll_data2.split('\n\n') if entry]
+    for graph in graphs:
+        tree = graph.tree()
+        print('\n')
+        tree.pprint()
+
+
+def cycle_finding_demo():
+    dg = DependencyGraph(treebank_data)
+    print(dg.contains_cycle())
+    cyclic_dg = DependencyGraph()
+    cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0})
+    cyclic_dg.add_node({'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1})
+    cyclic_dg.add_node({'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2})
+    cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3})
+    cyclic_dg.add_node({'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4})
+    print(cyclic_dg.contains_cycle())
+
+treebank_data = """Pierre  NNP     2       NMOD
+Vinken  NNP     8       SUB
+,       ,       2       P
+61      CD      5       NMOD
+years   NNS     6       AMOD
+old     JJ      2       NMOD
+,       ,       2       P
+will    MD      0       ROOT
+join    VB      8       VC
+the     DT      11      NMOD
+board   NN      9       OBJ
+as      IN      9       VMOD
+a       DT      15      NMOD
+nonexecutive    JJ      15      NMOD
+director        NN      12      PMOD
+Nov.    NNP     9       VMOD
+29      CD      16      NMOD
+.       .       9       VMOD
+"""
+
+conll_data1 = """
+1   Ze                ze                Pron  Pron  per|3|evofmv|nom                 2   su      _  _
+2   had               heb               V     V     trans|ovt|1of2of3|ev             0   ROOT    _  _
+3   met               met               Prep  Prep  voor                             8   mod     _  _
+4   haar              haar              Pron  Pron  bez|3|ev|neut|attr               5   det     _  _
+5   moeder            moeder            N     N     soort|ev|neut                    3   obj1    _  _
+6   kunnen            kan               V     V     hulp|ott|1of2of3|mv              2   vc      _  _
+7   gaan              ga                V     V     hulp|inf                         6   vc      _  _
+8   winkelen          winkel            V     V     intrans|inf                      11  cnj     _  _
+9   ,                 ,                 Punc  Punc  komma                            8   punct   _  _
+10  zwemmen           zwem              V     V     intrans|inf                      11  cnj     _  _
+11  of                of                Conj  Conj  neven                            7   vc      _  _
+12  terrassen         terras            N     N     soort|mv|neut                    11  cnj     _  _
+13  .                 .                 Punc  Punc  punt                             12  punct   _  _
+"""
+
+conll_data2 = """1   Cathy             Cathy             N     N     eigen|ev|neut                    2   su      _  _
+2   zag               zie               V     V     trans|ovt|1of2of3|ev             0   ROOT    _  _
+3   hen               hen               Pron  Pron  per|3|mv|datofacc                2   obj1    _  _
+4   wild              wild              Adj   Adj   attr|stell|onverv                5   mod     _  _
+5   zwaaien           zwaai             N     N     soort|mv|neut                    2   vc      _  _
+6   .                 .                 Punc  Punc  punt                             5   punct   _  _
+
+1   Ze                ze                Pron  Pron  per|3|evofmv|nom                 2   su      _  _
+2   had               heb               V     V     trans|ovt|1of2of3|ev             0   ROOT    _  _
+3   met               met               Prep  Prep  voor                             8   mod     _  _
+4   haar              haar              Pron  Pron  bez|3|ev|neut|attr               5   det     _  _
+5   moeder            moeder            N     N     soort|ev|neut                    3   obj1    _  _
+6   kunnen            kan               V     V     hulp|ott|1of2of3|mv              2   vc      _  _
+7   gaan              ga                V     V     hulp|inf                         6   vc      _  _
+8   winkelen          winkel            V     V     intrans|inf                      11  cnj     _  _
+9   ,                 ,                 Punc  Punc  komma                            8   punct   _  _
+10  zwemmen           zwem              V     V     intrans|inf                      11  cnj     _  _
+11  of                of                Conj  Conj  neven                            7   vc      _  _
+12  terrassen         terras            N     N     soort|mv|neut                    11  cnj     _  _
+13  .                 .                 Punc  Punc  punt                             12  punct   _  _
+
+1   Dat               dat               Pron  Pron  aanw|neut|attr                   2   det     _  _
+2   werkwoord         werkwoord         N     N     soort|ev|neut                    6   obj1    _  _
+3   had               heb               V     V     hulp|ovt|1of2of3|ev              0   ROOT    _  _
+4   ze                ze                Pron  Pron  per|3|evofmv|nom                 6   su      _  _
+5   zelf              zelf              Pron  Pron  aanw|neut|attr|wzelf             3   predm   _  _
+6   uitgevonden       vind              V     V     trans|verldw|onverv              3   vc      _  _
+7   .                 .                 Punc  Punc  punt                             6   punct   _  _
+
+1   Het               het               Pron  Pron  onbep|neut|zelfst                2   su      _  _
+2   hoorde            hoor              V     V     trans|ovt|1of2of3|ev             0   ROOT    _  _
+3   bij               bij               Prep  Prep  voor                             2   ld      _  _
+4   de                de                Art   Art   bep|zijdofmv|neut                6   det     _  _
+5   warme             warm              Adj   Adj   attr|stell|vervneut              6   mod     _  _
+6   zomerdag          zomerdag          N     N     soort|ev|neut                    3   obj1    _  _
+7   die               die               Pron  Pron  betr|neut|zelfst                 6   mod     _  _
+8   ze                ze                Pron  Pron  per|3|evofmv|nom                 12  su      _  _
+9   ginds             ginds             Adv   Adv   gew|aanw                         12  mod     _  _
+10  achter            achter            Adv   Adv   gew|geenfunc|stell|onverv        12  svp     _  _
+11  had               heb               V     V     hulp|ovt|1of2of3|ev              7   body    _  _
+12  gelaten           laat              V     V     trans|verldw|onverv              11  vc      _  _
+13  .                 .                 Punc  Punc  punt                             12  punct   _  _
+
+1   Ze                ze                Pron  Pron  per|3|evofmv|nom                 2   su      _  _
+2   hadden            heb               V     V     trans|ovt|1of2of3|mv             0   ROOT    _  _
+3   languit           languit           Adv   Adv   gew|geenfunc|stell|onverv        11  mod     _  _
+4   naast             naast             Prep  Prep  voor                             11  mod     _  _
+5   elkaar            elkaar            Pron  Pron  rec|neut                         4   obj1    _  _
+6   op                op                Prep  Prep  voor                             11  ld      _  _
+7   de                de                Art   Art   bep|zijdofmv|neut                8   det     _  _
+8   strandstoelen     strandstoel       N     N     soort|mv|neut                    6   obj1    _  _
+9   kunnen            kan               V     V     hulp|inf                         2   vc      _  _
+10  gaan              ga                V     V     hulp|inf                         9   vc      _  _
+11  liggen            lig               V     V     intrans|inf                      10  vc      _  _
+12  .                 .                 Punc  Punc  punt                             11  punct   _  _
+
+1   Zij               zij               Pron  Pron  per|3|evofmv|nom                 2   su      _  _
+2   zou               zal               V     V     hulp|ovt|1of2of3|ev              7   cnj     _  _
+3   mams              mams              N     N     soort|ev|neut                    4   det     _  _
+4   rug               rug               N     N     soort|ev|neut                    5   obj1    _  _
+5   ingewreven        wrijf             V     V     trans|verldw|onverv              6   vc      _  _
+6   hebben            heb               V     V     hulp|inf                         2   vc      _  _
+7   en                en                Conj  Conj  neven                            0   ROOT    _  _
+8   mam               mam               V     V     trans|ovt|1of2of3|ev             7   cnj     _  _
+9   de                de                Art   Art   bep|zijdofmv|neut                10  det     _  _
+10  hare              hare              Pron  Pron  bez|3|ev|neut|attr               8   obj1    _  _
+11  .                 .                 Punc  Punc  punt                             10  punct   _  _
+
+1   Of                of                Conj  Conj  onder|metfin                     0   ROOT    _  _
+2   ze                ze                Pron  Pron  per|3|evofmv|nom                 3   su      _  _
+3   had               heb               V     V     hulp|ovt|1of2of3|ev              0   ROOT    _  _
+4   gewoon            gewoon            Adj   Adj   adv|stell|onverv                 10  mod     _  _
+5   met               met               Prep  Prep  voor                             10  mod     _  _
+6   haar              haar              Pron  Pron  bez|3|ev|neut|attr               7   det     _  _
+7   vriendinnen       vriendin          N     N     soort|mv|neut                    5   obj1    _  _
+8   rond              rond              Adv   Adv   deelv                            10  svp     _  _
+9   kunnen            kan               V     V     hulp|inf                         3   vc      _  _
+10  slenteren         slenter           V     V     intrans|inf                      9   vc      _  _
+11  in                in                Prep  Prep  voor                             10  mod     _  _
+12  de                de                Art   Art   bep|zijdofmv|neut                13  det     _  _
+13  buurt             buurt             N     N     soort|ev|neut                    11  obj1    _  _
+14  van               van               Prep  Prep  voor                             13  mod     _  _
+15  Trafalgar_Square  Trafalgar_Square  MWU   N_N   eigen|ev|neut_eigen|ev|neut      14  obj1    _  _
+16  .                 .                 Punc  Punc  punt                             15  punct   _  _
+"""
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/parse/dependencygraph.pyc b/nlp_resource_data/nltk/parse/dependencygraph.pyc
new file mode 100755 (executable)
index 0000000..eb0a12a
Binary files /dev/null and b/nlp_resource_data/nltk/parse/dependencygraph.pyc differ
diff --git a/nlp_resource_data/nltk/parse/earleychart.py b/nlp_resource_data/nltk/parse/earleychart.py
new file mode 100755 (executable)
index 0000000..5955b50
--- /dev/null
@@ -0,0 +1,452 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: An Incremental Earley Chart Parser
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
+#         Rob Speer <rspeer@mit.edu>
+#         Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+#         Jean Mark Gawron <gawron@mail.sdsu.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Data classes and parser implementations for *incremental* chart
+parsers, which use dynamic programming to efficiently parse a text.
+A "chart parser" derives parse trees for a text by iteratively adding
+\"edges\" to a \"chart\".  Each "edge" represents a hypothesis about the tree
+structure for a subsequence of the text.  The "chart" is a
+\"blackboard\" for composing and combining these hypotheses.
+
+A parser is "incremental", if it guarantees that for all i, j where i < j,
+all edges ending at i are built before any edges ending at j.
+This is appealing for, say, speech recognizer hypothesis filtering.
+
+The main parser class is ``EarleyChartParser``, which is a top-down
+algorithm, originally formulated by Jay Earley (1970).
+"""
+from __future__ import print_function, division
+
+from six.moves import range
+
+from nltk.parse.chart import (Chart, ChartParser, EdgeI, LeafEdge, LeafInitRule,
+                              BottomUpPredictRule, BottomUpPredictCombineRule,
+                              TopDownInitRule, SingleEdgeFundamentalRule,
+                              EmptyPredictRule,
+                              CachedTopDownPredictRule,
+                              FilteredSingleEdgeFundamentalRule,
+                              FilteredBottomUpPredictCombineRule)
+from nltk.parse.featurechart import (FeatureChart, FeatureChartParser,
+                                     FeatureTopDownInitRule,
+                                     FeatureTopDownPredictRule,
+                                     FeatureEmptyPredictRule,
+                                     FeatureBottomUpPredictRule,
+                                     FeatureBottomUpPredictCombineRule,
+                                     FeatureSingleEdgeFundamentalRule)
+
+#////////////////////////////////////////////////////////////
+# Incremental Chart
+#////////////////////////////////////////////////////////////
+
+class IncrementalChart(Chart):
+    def initialize(self):
+        # A sequence of edge lists contained in this chart.
+        self._edgelists = tuple([] for x in self._positions())
+
+        # The set of child pointer lists associated with each edge.
+        self._edge_to_cpls = {}
+
+        # Indexes mapping attribute values to lists of edges
+        # (used by select()).
+        self._indexes = {}
+
+    def edges(self):
+        return list(self.iteredges())
+
+    def iteredges(self):
+        return (edge for edgelist in self._edgelists for edge in edgelist)
+
+    def select(self, end, **restrictions):
+        edgelist = self._edgelists[end]
+
+        # If there are no restrictions, then return all edges.
+        if restrictions=={}: return iter(edgelist)
+
+        # Find the index corresponding to the given restrictions.
+        restr_keys = sorted(restrictions.keys())
+        restr_keys = tuple(restr_keys)
+
+        # If it doesn't exist, then create it.
+        if restr_keys not in self._indexes:
+            self._add_index(restr_keys)
+
+        vals = tuple(restrictions[key] for key in restr_keys)
+        return iter(self._indexes[restr_keys][end].get(vals, []))
+
+    def _add_index(self, restr_keys):
+        # Make sure it's a valid index.
+        for key in restr_keys:
+            if not hasattr(EdgeI, key):
+                raise ValueError('Bad restriction: %s' % key)
+
+        # Create the index.
+        index = self._indexes[restr_keys] = tuple({} for x in self._positions())
+
+        # Add all existing edges to the index.
+        for end, edgelist in enumerate(self._edgelists):
+            this_index = index[end]
+            for edge in edgelist:
+                vals = tuple(getattr(edge, key)() for key in restr_keys)
+                this_index.setdefault(vals, []).append(edge)
+
+    def _register_with_indexes(self, edge):
+        end = edge.end()
+        for (restr_keys, index) in self._indexes.items():
+            vals = tuple(getattr(edge, key)() for key in restr_keys)
+            index[end].setdefault(vals, []).append(edge)
+
+    def _append_edge(self, edge):
+        self._edgelists[edge.end()].append(edge)
+
+    def _positions(self):
+        return range(self.num_leaves() + 1)
+
+
+class FeatureIncrementalChart(IncrementalChart, FeatureChart):
+    def select(self, end, **restrictions):
+        edgelist = self._edgelists[end]
+
+        # If there are no restrictions, then return all edges.
+        if restrictions=={}: return iter(edgelist)
+
+        # Find the index corresponding to the given restrictions.
+        restr_keys = sorted(restrictions.keys())
+        restr_keys = tuple(restr_keys)
+
+        # If it doesn't exist, then create it.
+        if restr_keys not in self._indexes:
+            self._add_index(restr_keys)
+
+        vals = tuple(self._get_type_if_possible(restrictions[key])
+                     for key in restr_keys)
+        return iter(self._indexes[restr_keys][end].get(vals, []))
+
+    def _add_index(self, restr_keys):
+        # Make sure it's a valid index.
+        for key in restr_keys:
+            if not hasattr(EdgeI, key):
+                raise ValueError('Bad restriction: %s' % key)
+
+        # Create the index.
+        index = self._indexes[restr_keys] = tuple({} for x in self._positions())
+
+        # Add all existing edges to the index.
+        for end, edgelist in enumerate(self._edgelists):
+            this_index = index[end]
+            for edge in edgelist:
+                vals = tuple(self._get_type_if_possible(getattr(edge, key)())
+                             for key in restr_keys)
+                this_index.setdefault(vals, []).append(edge)
+
+    def _register_with_indexes(self, edge):
+        end = edge.end()
+        for (restr_keys, index) in self._indexes.items():
+            vals = tuple(self._get_type_if_possible(getattr(edge, key)())
+                         for key in restr_keys)
+            index[end].setdefault(vals, []).append(edge)
+
+#////////////////////////////////////////////////////////////
+# Incremental CFG Rules
+#////////////////////////////////////////////////////////////
+
+class CompleteFundamentalRule(SingleEdgeFundamentalRule):
+    def _apply_incomplete(self, chart, grammar, left_edge):
+        end = left_edge.end()
+        # When the chart is incremental, we only have to look for
+        # empty complete edges here.
+        for right_edge in chart.select(start=end, end=end,
+                                       is_complete=True,
+                                       lhs=left_edge.nextsym()):
+            new_edge = left_edge.move_dot_forward(right_edge.end())
+            if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
+                yield new_edge
+
+class CompleterRule(CompleteFundamentalRule):
+    _fundamental_rule = CompleteFundamentalRule()
+    def apply(self, chart, grammar, edge):
+        if not isinstance(edge, LeafEdge):
+            for new_edge in self._fundamental_rule.apply(chart, grammar, edge):
+                yield new_edge
+
+class ScannerRule(CompleteFundamentalRule):
+    _fundamental_rule = CompleteFundamentalRule()
+    def apply(self, chart, grammar, edge):
+        if isinstance(edge, LeafEdge):
+            for new_edge in self._fundamental_rule.apply(chart, grammar, edge):
+                yield new_edge
+
+class PredictorRule(CachedTopDownPredictRule):
+    pass
+
+class FilteredCompleteFundamentalRule(FilteredSingleEdgeFundamentalRule):
+    def apply(self, chart, grammar, edge):
+        # Since the Filtered rule only works for grammars without empty productions,
+        # we only have to bother with complete edges here.
+        if edge.is_complete():
+            for new_edge in self._apply_complete(chart, grammar, edge):
+                yield new_edge
+
+#////////////////////////////////////////////////////////////
+# Incremental FCFG Rules
+#////////////////////////////////////////////////////////////
+
+class FeatureCompleteFundamentalRule(FeatureSingleEdgeFundamentalRule):
+    def _apply_incomplete(self, chart, grammar, left_edge):
+        fr = self._fundamental_rule
+        end = left_edge.end()
+        # When the chart is incremental, we only have to look for
+        # empty complete edges here.
+        for right_edge in chart.select(start=end, end=end,
+                                       is_complete=True,
+                                       lhs=left_edge.nextsym()):
+            for new_edge in fr.apply(chart, grammar, left_edge, right_edge):
+                yield new_edge
+
+class FeatureCompleterRule(CompleterRule):
+    _fundamental_rule = FeatureCompleteFundamentalRule()
+
+class FeatureScannerRule(ScannerRule):
+    _fundamental_rule = FeatureCompleteFundamentalRule()
+
+class FeaturePredictorRule(FeatureTopDownPredictRule):
+    pass
+
+#////////////////////////////////////////////////////////////
+# Incremental CFG Chart Parsers
+#////////////////////////////////////////////////////////////
+
+EARLEY_STRATEGY = [LeafInitRule(),
+                   TopDownInitRule(),
+                   CompleterRule(),
+                   ScannerRule(),
+                   PredictorRule()]
+TD_INCREMENTAL_STRATEGY = [LeafInitRule(),
+                           TopDownInitRule(),
+                           CachedTopDownPredictRule(),
+                           CompleteFundamentalRule()]
+BU_INCREMENTAL_STRATEGY = [LeafInitRule(),
+                           EmptyPredictRule(),
+                           BottomUpPredictRule(),
+                           CompleteFundamentalRule()]
+BU_LC_INCREMENTAL_STRATEGY = [LeafInitRule(),
+                              EmptyPredictRule(),
+                              BottomUpPredictCombineRule(),
+                              CompleteFundamentalRule()]
+
+LC_INCREMENTAL_STRATEGY = [LeafInitRule(),
+                           FilteredBottomUpPredictCombineRule(),
+                           FilteredCompleteFundamentalRule()]
+
+class IncrementalChartParser(ChartParser):
+    """
+    An *incremental* chart parser implementing Jay Earley's
+    parsing algorithm:
+
+    | For each index end in [0, 1, ..., N]:
+    |   For each edge such that edge.end = end:
+    |     If edge is incomplete and edge.next is not a part of speech:
+    |       Apply PredictorRule to edge
+    |     If edge is incomplete and edge.next is a part of speech:
+    |       Apply ScannerRule to edge
+    |     If edge is complete:
+    |       Apply CompleterRule to edge
+    | Return any complete parses in the chart
+    """
+    def __init__(self, grammar, strategy=BU_LC_INCREMENTAL_STRATEGY,
+                 trace=0, trace_chart_width=50,
+                 chart_class=IncrementalChart):
+        """
+        Create a new Earley chart parser, that uses ``grammar`` to
+        parse texts.
+
+        :type grammar: CFG
+        :param grammar: The grammar used to parse texts.
+        :type trace: int
+        :param trace: The level of tracing that should be used when
+            parsing a text.  ``0`` will generate no tracing output;
+            and higher numbers will produce more verbose tracing
+            output.
+        :type trace_chart_width: int
+        :param trace_chart_width: The default total width reserved for
+            the chart in trace output.  The remainder of each line will
+            be used to display edges.
+        :param chart_class: The class that should be used to create
+            the charts used by this parser.
+        """
+        self._grammar = grammar
+        self._trace = trace
+        self._trace_chart_width = trace_chart_width
+        self._chart_class = chart_class
+
+        self._axioms = []
+        self._inference_rules = []
+        for rule in strategy:
+            if rule.NUM_EDGES == 0:
+                self._axioms.append(rule)
+            elif rule.NUM_EDGES == 1:
+                self._inference_rules.append(rule)
+            else:
+                raise ValueError("Incremental inference rules must have "
+                                 "NUM_EDGES == 0 or 1")
+
+    def chart_parse(self, tokens, trace=None):
+        if trace is None: trace = self._trace
+        trace_new_edges = self._trace_new_edges
+
+        tokens = list(tokens)
+        self._grammar.check_coverage(tokens)
+        chart = self._chart_class(tokens)
+        grammar = self._grammar
+
+        # Width, for printing trace edges.
+        trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
+        if trace: print(chart.pretty_format_leaves(trace_edge_width))
+
+        for axiom in self._axioms:
+            new_edges = list(axiom.apply(chart, grammar))
+            trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width)
+
+        inference_rules = self._inference_rules
+        for end in range(chart.num_leaves()+1):
+            if trace > 1: print("\n* Processing queue:", end, "\n")
+            agenda = list(chart.select(end=end))
+            while agenda:
+                edge = agenda.pop()
+                for rule in inference_rules:
+                    new_edges = list(rule.apply(chart, grammar, edge))
+                    trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
+                    for new_edge in new_edges:
+                        if new_edge.end()==end:
+                            agenda.append(new_edge)
+
+        return chart
+
+class EarleyChartParser(IncrementalChartParser):
+    def __init__(self, grammar, **parser_args):
+        IncrementalChartParser.__init__(self, grammar, EARLEY_STRATEGY, **parser_args)
+
+class IncrementalTopDownChartParser(IncrementalChartParser):
+    def __init__(self, grammar, **parser_args):
+        IncrementalChartParser.__init__(self, grammar, TD_INCREMENTAL_STRATEGY, **parser_args)
+
+class IncrementalBottomUpChartParser(IncrementalChartParser):
+    def __init__(self, grammar, **parser_args):
+        IncrementalChartParser.__init__(self, grammar, BU_INCREMENTAL_STRATEGY, **parser_args)
+
+class IncrementalBottomUpLeftCornerChartParser(IncrementalChartParser):
+    def __init__(self, grammar, **parser_args):
+        IncrementalChartParser.__init__(self, grammar, BU_LC_INCREMENTAL_STRATEGY, **parser_args)
+
+class IncrementalLeftCornerChartParser(IncrementalChartParser):
+    def __init__(self, grammar, **parser_args):
+        if not grammar.is_nonempty():
+            raise ValueError("IncrementalLeftCornerParser only works for grammars "
+                             "without empty productions.")
+        IncrementalChartParser.__init__(self, grammar, LC_INCREMENTAL_STRATEGY, **parser_args)
+
+#////////////////////////////////////////////////////////////
+# Incremental FCFG Chart Parsers
+#////////////////////////////////////////////////////////////
+
+EARLEY_FEATURE_STRATEGY = [LeafInitRule(),
+                           FeatureTopDownInitRule(),
+                           FeatureCompleterRule(),
+                           FeatureScannerRule(),
+                           FeaturePredictorRule()]
+TD_INCREMENTAL_FEATURE_STRATEGY = [LeafInitRule(),
+                                   FeatureTopDownInitRule(),
+                                   FeatureTopDownPredictRule(),
+                                   FeatureCompleteFundamentalRule()]
+BU_INCREMENTAL_FEATURE_STRATEGY = [LeafInitRule(),
+                                   FeatureEmptyPredictRule(),
+                                   FeatureBottomUpPredictRule(),
+                                   FeatureCompleteFundamentalRule()]
+BU_LC_INCREMENTAL_FEATURE_STRATEGY = [LeafInitRule(),
+                                      FeatureEmptyPredictRule(),
+                                      FeatureBottomUpPredictCombineRule(),
+                                      FeatureCompleteFundamentalRule()]
+
+class FeatureIncrementalChartParser(IncrementalChartParser, FeatureChartParser):
+    def __init__(self, grammar,
+                 strategy=BU_LC_INCREMENTAL_FEATURE_STRATEGY,
+                 trace_chart_width=20,
+                 chart_class=FeatureIncrementalChart,
+                 **parser_args):
+        IncrementalChartParser.__init__(self, grammar,
+                                        strategy=strategy,
+                                        trace_chart_width=trace_chart_width,
+                                        chart_class=chart_class,
+                                        **parser_args)
+
+class FeatureEarleyChartParser(FeatureIncrementalChartParser):
+    def __init__(self, grammar, **parser_args):
+        FeatureIncrementalChartParser.__init__(self, grammar, EARLEY_FEATURE_STRATEGY, **parser_args)
+
+class FeatureIncrementalTopDownChartParser(FeatureIncrementalChartParser):
+    def __init__(self, grammar, **parser_args):
+        FeatureIncrementalChartParser.__init__(self, grammar, TD_INCREMENTAL_FEATURE_STRATEGY, **parser_args)
+
+class FeatureIncrementalBottomUpChartParser(FeatureIncrementalChartParser):
+    def __init__(self, grammar, **parser_args):
+        FeatureIncrementalChartParser.__init__(self, grammar, BU_INCREMENTAL_FEATURE_STRATEGY, **parser_args)
+
+class FeatureIncrementalBottomUpLeftCornerChartParser(FeatureIncrementalChartParser):
+    def __init__(self, grammar, **parser_args):
+        FeatureIncrementalChartParser.__init__(self, grammar, BU_LC_INCREMENTAL_FEATURE_STRATEGY, **parser_args)
+
+
+#////////////////////////////////////////////////////////////
+# Demonstration
+#////////////////////////////////////////////////////////////
+
+def demo(print_times=True, print_grammar=False,
+         print_trees=True, trace=2,
+         sent='I saw John with a dog with my cookie', numparses=5):
+    """
+    A demonstration of the Earley parsers.
+    """
+    import sys, time
+    from nltk.parse.chart import demo_grammar
+
+    # The grammar for ChartParser and SteppingChartParser:
+    grammar = demo_grammar()
+    if print_grammar:
+        print("* Grammar")
+        print(grammar)
+
+    # Tokenize the sample sentence.
+    print("* Sentence:")
+    print(sent)
+    tokens = sent.split()
+    print(tokens)
+    print()
+
+    # Do the parsing.
+    earley = EarleyChartParser(grammar, trace=trace)
+    t = time.clock()
+    chart = earley.chart_parse(tokens)
+    parses = list(chart.parses(grammar.start()))
+    t = time.clock()-t
+
+    # Print results.
+    if numparses:
+        assert len(parses)==numparses, 'Not all parses found'
+    if print_trees:
+        for tree in parses: print(tree)
+    else:
+        print("Nr trees:", len(parses))
+    if print_times:
+        print("Time:", t)
+
+if __name__ == '__main__': demo()
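+
+# Illustrative usage sketch: the incremental parsers above implement the
+# standard nltk parser interface, so an ``EarleyChartParser`` can be driven
+# roughly as follows (toy grammar chosen only for illustration):
+#
+#     from nltk import CFG
+#     grammar = CFG.fromstring("""
+#         S -> NP VP
+#         NP -> 'I' | Det N
+#         Det -> 'a'
+#         N -> 'dog'
+#         VP -> V NP
+#         V -> 'saw'
+#     """)
+#     parser = EarleyChartParser(grammar)
+#     for tree in parser.parse('I saw a dog'.split()):
+#         tree.pprint()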
diff --git a/nlp_resource_data/nltk/parse/earleychart.pyc b/nlp_resource_data/nltk/parse/earleychart.pyc
new file mode 100755 (executable)
index 0000000..fc81b0e
Binary files /dev/null and b/nlp_resource_data/nltk/parse/earleychart.pyc differ
diff --git a/nlp_resource_data/nltk/parse/evaluate.py b/nlp_resource_data/nltk/parse/evaluate.py
new file mode 100755 (executable)
index 0000000..0d101bf
--- /dev/null
@@ -0,0 +1,129 @@
+# Natural Language Toolkit: evaluation of dependency parser
+#
+# Author: Long Duong <longdt219@gmail.com>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import division
+
+import unicodedata
+
+
+class DependencyEvaluator(object):
+    """
+    Class for measuring labelled and unlabelled attachment score for
+    dependency parsing. Note that the evaluation ignores punctuation.
+
+    >>> from nltk.parse import DependencyGraph, DependencyEvaluator
+
+    >>> gold_sent = DependencyGraph(\"""
+    ... Pierre  NNP     2       NMOD
+    ... Vinken  NNP     8       SUB
+    ... ,       ,       2       P
+    ... 61      CD      5       NMOD
+    ... years   NNS     6       AMOD
+    ... old     JJ      2       NMOD
+    ... ,       ,       2       P
+    ... will    MD      0       ROOT
+    ... join    VB      8       VC
+    ... the     DT      11      NMOD
+    ... board   NN      9       OBJ
+    ... as      IN      9       VMOD
+    ... a       DT      15      NMOD
+    ... nonexecutive    JJ      15      NMOD
+    ... director        NN      12      PMOD
+    ... Nov.    NNP     9       VMOD
+    ... 29      CD      16      NMOD
+    ... .       .       9       VMOD
+    ... \""")
+
+    >>> parsed_sent = DependencyGraph(\"""
+    ... Pierre  NNP     8       NMOD
+    ... Vinken  NNP     1       SUB
+    ... ,       ,       3       P
+    ... 61      CD      6       NMOD
+    ... years   NNS     6       AMOD
+    ... old     JJ      2       NMOD
+    ... ,       ,       3       AMOD
+    ... will    MD      0       ROOT
+    ... join    VB      8       VC
+    ... the     DT      11      AMOD
+    ... board   NN      9       OBJECT
+    ... as      IN      9       NMOD
+    ... a       DT      15      NMOD
+    ... nonexecutive    JJ      15      NMOD
+    ... director        NN      12      PMOD
+    ... Nov.    NNP     9       VMOD
+    ... 29      CD      16      NMOD
+    ... .       .       9       VMOD
+    ... \""")
+
+    >>> de = DependencyEvaluator([parsed_sent],[gold_sent])
+    >>> uas, las = de.eval()
+    >>> uas
+    0.8...
+    >>> abs(las - 0.6) < 0.00001
+    True
+    """
+
+    def __init__(self, parsed_sents, gold_sents):
+        """
+        :param parsed_sents: the parsed sentences, as output by a parser
+        :type parsed_sents: list(DependencyGraph)
+        :param gold_sents: the corresponding gold-standard sentences
+        :type gold_sents: list(DependencyGraph)
+        """
+        self._parsed_sents = parsed_sents
+        self._gold_sents = gold_sents
+
+    def _remove_punct(self, inStr):
+        """
+        Remove punctuation from a Unicode string.
+
+        :param inStr: the input string
+        :return: the string with all punctuation characters removed
+        """
+        punc_cat = set(["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"])
+        return "".join(x for x in inStr if unicodedata.category(x) not in punc_cat)
+
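+    # Illustrative sketch: ``_remove_punct`` drops every character whose
+    # Unicode category is a punctuation class, e.g.
+    #
+    #     self._remove_punct(u"Nov. 29 ,")   # -> u"Nov 29 "
+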
+    def eval(self):
+        """
+        Return the Unlabeled Attachment Score (UAS) and the Labeled
+        Attachment Score (LAS), in that order.
+
+        :return: tuple(float, float)
+        """
+        if len(self._parsed_sents) != len(self._gold_sents):
+            raise ValueError("Number of parsed sentences differs from number of gold sentences.")
+
+        corr = 0
+        corrL = 0
+        total = 0
+
+        for i in range(len(self._parsed_sents)):
+            parsed_sent_nodes = self._parsed_sents[i].nodes
+            gold_sent_nodes = self._gold_sents[i].nodes
+
+            if (len(parsed_sent_nodes) != len(gold_sent_nodes)):
+                raise ValueError("Sentences must have equal length.")
+
+            for parsed_node_address, parsed_node in parsed_sent_nodes.items():
+                gold_node = gold_sent_nodes[parsed_node_address]
+
+                if parsed_node["word"] is None:
+                    continue
+                if parsed_node["word"] != gold_node["word"]:
+                    raise ValueError("Sentence sequence is not matched.")
+
+                # Ignore if word is punctuation by default
+                # if (parsed_sent[j]["word"] in string.punctuation):
+                if self._remove_punct(parsed_node["word"]) == "":
+                    continue
+
+                total += 1
+                if parsed_node["head"] == gold_node["head"]:
+                    corr += 1
+                    if parsed_node["rel"] == gold_node["rel"]:
+                        corrL += 1
+
+        return corr / total, corrL / total  # (UAS, LAS)
+
+
diff --git a/nlp_resource_data/nltk/parse/evaluate.pyc b/nlp_resource_data/nltk/parse/evaluate.pyc
new file mode 100755 (executable)
index 0000000..ee55343
Binary files /dev/null and b/nlp_resource_data/nltk/parse/evaluate.pyc differ
diff --git a/nlp_resource_data/nltk/parse/featurechart.py b/nlp_resource_data/nltk/parse/featurechart.py
new file mode 100755 (executable)
index 0000000..eafd0bf
--- /dev/null
@@ -0,0 +1,580 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Chart Parser for Feature-Based Grammars
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Rob Speer <rspeer@mit.edu>
+#         Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Extension of chart parsing implementation to handle grammars with
+feature structures as nodes.
+"""
+from __future__ import print_function, unicode_literals
+
+from six.moves import range
+
+from nltk.compat import python_2_unicode_compatible
+from nltk.featstruct import FeatStruct, unify, TYPE, find_variables
+from nltk.sem import logic
+from nltk.tree import Tree
+from nltk.grammar import (Nonterminal, Production, CFG,
+                          FeatStructNonterminal, is_nonterminal,
+                          is_terminal)
+from nltk.parse.chart import (TreeEdge, Chart, ChartParser, EdgeI,
+                              FundamentalRule, LeafInitRule,
+                              EmptyPredictRule, BottomUpPredictRule,
+                              SingleEdgeFundamentalRule,
+                              BottomUpPredictCombineRule,
+                              CachedTopDownPredictRule,
+                              TopDownInitRule)
+
+#////////////////////////////////////////////////////////////
+# Tree Edge
+#////////////////////////////////////////////////////////////
+
+@python_2_unicode_compatible
+class FeatureTreeEdge(TreeEdge):
+    """
+    A specialized tree edge that allows shared variable bindings
+    between nonterminals on the left-hand side and right-hand side.
+
+    Each ``FeatureTreeEdge`` contains a set of ``bindings``, i.e., a
+    dictionary mapping from variables to values.  If the edge is not
+    complete, then these bindings are simply stored.  However, if the
+    edge is complete, then the constructor applies these bindings to
+    every nonterminal in the edge whose symbol implements the
+    interface ``SubstituteBindingsI``.
+    """
+    def __init__(self, span, lhs, rhs, dot=0, bindings=None):
+        """
+        Construct a new edge.  If the edge is incomplete (i.e., if
+        ``dot<len(rhs)``), then store the bindings as-is.  If the edge
+        is complete (i.e., if ``dot==len(rhs)``), then apply the
+        bindings to all nonterminals in ``lhs`` and ``rhs``, and then
+        clear the bindings.  See ``TreeEdge`` for a description of
+        the other arguments.
+        """
+        if bindings is None: bindings = {}
+
+        # If the edge is complete, then substitute in the bindings,
+        # and then throw them away.  (If we didn't throw them away, we
+        # might think that 2 complete edges are different just because
+        # they have different bindings, even though all bindings have
+        # already been applied.)
+        if dot == len(rhs) and bindings:
+            lhs = self._bind(lhs, bindings)
+            rhs = [self._bind(elt, bindings) for elt in rhs]
+            bindings = {}
+
+        # Initialize the edge.
+        TreeEdge.__init__(self, span, lhs, rhs, dot)
+        self._bindings = bindings
+        self._comparison_key = (self._comparison_key, tuple(sorted(bindings.items())))
+
+    @staticmethod
+    def from_production(production, index):
+        """
+        :return: A new ``TreeEdge`` formed from the given production.
+            The new edge's left-hand side and right-hand side will
+            be taken from ``production``; its span will be
+            ``(index,index)``; and its dot position will be ``0``.
+        :rtype: TreeEdge
+        """
+        return FeatureTreeEdge(span=(index, index), lhs=production.lhs(),
+                               rhs=production.rhs(), dot=0)
+
+    def move_dot_forward(self, new_end, bindings=None):
+        """
+        :return: A new ``FeatureTreeEdge`` formed from this edge.
+            The new edge's dot position is increased by ``1``,
+            and its end index will be replaced by ``new_end``.
+        :rtype: FeatureTreeEdge
+        :param new_end: The new end index.
+        :type new_end: int
+        :param bindings: Bindings for the new edge.
+        :type bindings: dict
+        """
+        return FeatureTreeEdge(span=(self._span[0], new_end),
+                               lhs=self._lhs, rhs=self._rhs,
+                               dot=self._dot+1, bindings=bindings)
+
+    def _bind(self, nt, bindings):
+        if not isinstance(nt, FeatStructNonterminal): return nt
+        return nt.substitute_bindings(bindings)
+
+    def next_with_bindings(self):
+        return self._bind(self.nextsym(), self._bindings)
+
+    def bindings(self):
+        """
+        Return a copy of this edge's bindings dictionary.
+        """
+        return self._bindings.copy()
+
+    def variables(self):
+        """
+        :return: The set of variables used by this edge.
+        :rtype: set(Variable)
+        """
+        return find_variables([self._lhs] + list(self._rhs) +
+                              list(self._bindings.keys()) +
+                              list(self._bindings.values()),
+                              fs_class=FeatStruct)
+
+    def __str__(self):
+        if self.is_complete():
+            return TreeEdge.__unicode__(self)
+        else:
+            bindings = '{%s}' % ', '.join('%s: %r' % item for item in
+                                           sorted(self._bindings.items()))
+            return '%s %s' % (TreeEdge.__unicode__(self), bindings)
+
+
+#////////////////////////////////////////////////////////////
+# A specialized Chart for feature grammars
+#////////////////////////////////////////////////////////////
+
+# TODO: subsumes check when adding new edges
+
+class FeatureChart(Chart):
+    """
+    A Chart for feature grammars.
+    :see: ``Chart`` for more information.
+    """
+
+    def select(self, **restrictions):
+        """
+        Returns an iterator over the edges in this chart.
+        See ``Chart.select`` for more information about the
+        ``restrictions`` on the edges.
+        """
+        # If there are no restrictions, then return all edges.
+        if restrictions=={}: return iter(self._edges)
+
+        # Find the index corresponding to the given restrictions.
+        restr_keys = sorted(restrictions.keys())
+        restr_keys = tuple(restr_keys)
+
+        # If it doesn't exist, then create it.
+        if restr_keys not in self._indexes:
+            self._add_index(restr_keys)
+
+        vals = tuple(self._get_type_if_possible(restrictions[key])
+                     for key in restr_keys)
+        return iter(self._indexes[restr_keys].get(vals, []))
+
+    def _add_index(self, restr_keys):
+        """
+        A helper function for ``select``, which creates a new index for
+        a given set of attributes (aka restriction keys).
+        """
+        # Make sure it's a valid index.
+        for key in restr_keys:
+            if not hasattr(EdgeI, key):
+                raise ValueError('Bad restriction: %s' % key)
+
+        # Create the index.
+        index = self._indexes[restr_keys] = {}
+
+        # Add all existing edges to the index.
+        for edge in self._edges:
+            vals = tuple(self._get_type_if_possible(getattr(edge, key)())
+                         for key in restr_keys)
+            index.setdefault(vals, []).append(edge)
+
+    def _register_with_indexes(self, edge):
+        """
+        A helper function for ``insert``, which registers the new
+        edge with all existing indexes.
+        """
+        for (restr_keys, index) in self._indexes.items():
+            vals = tuple(self._get_type_if_possible(getattr(edge, key)())
+                         for key in restr_keys)
+            index.setdefault(vals, []).append(edge)
+
+    def _get_type_if_possible(self, item):
+        """
+        Helper function which returns the ``TYPE`` feature of the ``item``,
+        if it exists, otherwise it returns the ``item`` itself
+        """
+        if isinstance(item, dict) and TYPE in item:
+            return item[TYPE]
+        else:
+            return item
+
+    def parses(self, start, tree_class=Tree):
+        for edge in self.select(start=0, end=self._num_leaves):
+            if ((isinstance(edge, FeatureTreeEdge)) and
+                (edge.lhs()[TYPE] == start[TYPE]) and
+                (unify(edge.lhs(), start, rename_vars=True))
+                ):
+                for tree in self.trees(edge, complete=True, tree_class=tree_class):
+                    yield tree
+
+
+#////////////////////////////////////////////////////////////
+# Fundamental Rule
+#////////////////////////////////////////////////////////////
+
+class FeatureFundamentalRule(FundamentalRule):
+    """
+    A specialized version of the fundamental rule that operates on
+    nonterminals whose symbols are ``FeatStructNonterminal``s.  Rather
+    than simply comparing the nonterminals for equality, they are
+    unified.  Variable bindings from these unifications are collected
+    and stored in the chart using a ``FeatureTreeEdge``.  When a
+    complete edge is generated, these bindings are applied to all
+    nonterminals in the edge.
+
+    The fundamental rule states that:
+
+    - ``[A -> alpha \* B1 beta][i:j]``
+    - ``[B2 -> gamma \*][j:k]``
+
+    licenses the edge:
+
+    - ``[A -> alpha B3 \* beta][i:k]``
+
+    assuming that B1 and B2 can be unified to generate B3.
+    """
+    def apply(self, chart, grammar, left_edge, right_edge):
+        # Make sure the rule is applicable.
+        if not (left_edge.end() == right_edge.start() and
+                left_edge.is_incomplete() and
+                right_edge.is_complete() and
+                isinstance(left_edge, FeatureTreeEdge)):
+            return
+        found = right_edge.lhs()
+        nextsym = left_edge.nextsym()
+        if isinstance(right_edge, FeatureTreeEdge):
+            if not is_nonterminal(nextsym): return
+            if left_edge.nextsym()[TYPE] != right_edge.lhs()[TYPE]: return
+            # Create a copy of the bindings.
+            bindings = left_edge.bindings()
+            # We rename vars here, because we don't want variables
+            # from the two different productions to match.
+            found = found.rename_variables(used_vars=left_edge.variables())
+            # Unify B1 (left_edge.nextsym) with B2 (right_edge.lhs) to
+            # generate B3 (result).
+            result = unify(nextsym, found, bindings, rename_vars=False)
+            if result is None: return
+        else:
+            if nextsym != found: return
+            # Create a copy of the bindings.
+            bindings = left_edge.bindings()
+
+        # Construct the new edge.
+        new_edge = left_edge.move_dot_forward(right_edge.end(), bindings)
+
+        # Add it to the chart, with appropriate child pointers.
+        if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
+            yield new_edge
+
+class FeatureSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
+    """
+    A specialized version of the completer / single edge fundamental rule
+    that operates on nonterminals whose symbols are ``FeatStructNonterminal``s.
+    Rather than simply comparing the nonterminals for equality, they are
+    unified.
+    """
+    _fundamental_rule = FeatureFundamentalRule()
+
+    def _apply_complete(self, chart, grammar, right_edge):
+        fr = self._fundamental_rule
+        for left_edge in chart.select(end=right_edge.start(),
+                                      is_complete=False,
+                                      nextsym=right_edge.lhs()):
+            for new_edge in fr.apply(chart, grammar, left_edge, right_edge):
+                yield new_edge
+
+    def _apply_incomplete(self, chart, grammar, left_edge):
+        fr = self._fundamental_rule
+        for right_edge in chart.select(start=left_edge.end(),
+                                       is_complete=True,
+                                       lhs=left_edge.nextsym()):
+            for new_edge in fr.apply(chart, grammar, left_edge, right_edge):
+                yield new_edge
+
+
+#////////////////////////////////////////////////////////////
+# Top-Down Prediction
+#////////////////////////////////////////////////////////////
+
+class FeatureTopDownInitRule(TopDownInitRule):
+    def apply(self, chart, grammar):
+        for prod in grammar.productions(lhs=grammar.start()):
+            new_edge = FeatureTreeEdge.from_production(prod, 0)
+            if chart.insert(new_edge, ()):
+                yield new_edge
+
+class FeatureTopDownPredictRule(CachedTopDownPredictRule):
+    """
+    A specialized version of the (cached) top down predict rule that operates
+    on nonterminals whose symbols are ``FeatStructNonterminal``s.  Rather
+    than simply comparing the nonterminals for equality, they are
+    unified.
+
+    The top down expand rule states that:
+
+    - ``[A -> alpha \* B1 beta][i:j]``
+
+    licenses the edge:
+
+    - ``[B2 -> \* gamma][j:j]``
+
+    for each grammar production ``B2 -> gamma``, assuming that B1
+    and B2 can be unified.
+    """
+    def apply(self, chart, grammar, edge):
+        if edge.is_complete(): return
+        nextsym, index = edge.nextsym(), edge.end()
+        if not is_nonterminal(nextsym): return
+
+        # If we've already applied this rule to an edge with the same
+        # next & end, and the chart & grammar have not changed, then
+        # just return (no new edges to add).
+        nextsym_with_bindings = edge.next_with_bindings()
+        done = self._done.get((nextsym_with_bindings, index), (None, None))
+        if done[0] is chart and done[1] is grammar:
+            return
+
+        for prod in grammar.productions(lhs=nextsym):
+            # If the left corner in the predicted production is
+            # leaf, it must match with the input.
+            if prod.rhs():
+                first = prod.rhs()[0]
+                if is_terminal(first):
+                    if index >= chart.num_leaves(): continue
+                    if first != chart.leaf(index): continue
+
+            # We rename vars here, because we don't want variables
+            # from the two different productions to match.
+            if unify(prod.lhs(), nextsym_with_bindings, rename_vars=True):
+                new_edge = FeatureTreeEdge.from_production(prod, edge.end())
+                if chart.insert(new_edge, ()):
+                    yield new_edge
+
+        # Record the fact that we've applied this rule.
+        self._done[nextsym_with_bindings, index] = (chart, grammar)
+
+
+#////////////////////////////////////////////////////////////
+# Bottom-Up Prediction
+#////////////////////////////////////////////////////////////
+
+class FeatureBottomUpPredictRule(BottomUpPredictRule):
+    def apply(self, chart, grammar, edge):
+        if edge.is_incomplete(): return
+        for prod in grammar.productions(rhs=edge.lhs()):
+            if isinstance(edge, FeatureTreeEdge):
+                _next = prod.rhs()[0]
+                if not is_nonterminal(_next): continue
+
+            new_edge = FeatureTreeEdge.from_production(prod, edge.start())
+            if chart.insert(new_edge, ()):
+                yield new_edge
+
+class FeatureBottomUpPredictCombineRule(BottomUpPredictCombineRule):
+    def apply(self, chart, grammar, edge):
+        if edge.is_incomplete(): return
+        found = edge.lhs()
+        for prod in grammar.productions(rhs=found):
+            bindings = {}
+            if isinstance(edge, FeatureTreeEdge):
+                _next = prod.rhs()[0]
+                if not is_nonterminal(_next): continue
+
+                # We rename vars here, because we don't want variables
+                # from the two different productions to match.
+                used_vars = find_variables((prod.lhs(),) + prod.rhs(),
+                                           fs_class=FeatStruct)
+                found = found.rename_variables(used_vars=used_vars)
+
+                result = unify(_next, found, bindings, rename_vars=False)
+                if result is None: continue
+
+            new_edge = (FeatureTreeEdge.from_production(prod, edge.start())
+                        .move_dot_forward(edge.end(), bindings))
+            if chart.insert(new_edge, (edge,)):
+                yield new_edge
+
+class FeatureEmptyPredictRule(EmptyPredictRule):
+    def apply(self, chart, grammar):
+        for prod in grammar.productions(empty=True):
+            for index in range(chart.num_leaves() + 1):
+                new_edge = FeatureTreeEdge.from_production(prod, index)
+                if chart.insert(new_edge, ()):
+                    yield new_edge
+
+
+#////////////////////////////////////////////////////////////
+# Feature Chart Parser
+#////////////////////////////////////////////////////////////
+
+TD_FEATURE_STRATEGY = [LeafInitRule(),
+                       FeatureTopDownInitRule(),
+                       FeatureTopDownPredictRule(),
+                       FeatureSingleEdgeFundamentalRule()]
+BU_FEATURE_STRATEGY = [LeafInitRule(),
+                       FeatureEmptyPredictRule(),
+                       FeatureBottomUpPredictRule(),
+                       FeatureSingleEdgeFundamentalRule()]
+BU_LC_FEATURE_STRATEGY = [LeafInitRule(),
+                          FeatureEmptyPredictRule(),
+                          FeatureBottomUpPredictCombineRule(),
+                          FeatureSingleEdgeFundamentalRule()]
+
+class FeatureChartParser(ChartParser):
+    def __init__(self, grammar,
+                 strategy=BU_LC_FEATURE_STRATEGY,
+                 trace_chart_width=20,
+                 chart_class=FeatureChart,
+                 **parser_args):
+        ChartParser.__init__(self, grammar,
+                             strategy=strategy,
+                             trace_chart_width=trace_chart_width,
+                             chart_class=chart_class,
+                             **parser_args)
+
+class FeatureTopDownChartParser(FeatureChartParser):
+    def __init__(self, grammar, **parser_args):
+        FeatureChartParser.__init__(self, grammar, TD_FEATURE_STRATEGY, **parser_args)
+
+class FeatureBottomUpChartParser(FeatureChartParser):
+    def __init__(self, grammar, **parser_args):
+        FeatureChartParser.__init__(self, grammar, BU_FEATURE_STRATEGY, **parser_args)
+
+class FeatureBottomUpLeftCornerChartParser(FeatureChartParser):
+    def __init__(self, grammar, **parser_args):
+        FeatureChartParser.__init__(self, grammar, BU_LC_FEATURE_STRATEGY, **parser_args)
+
+
+#////////////////////////////////////////////////////////////
+# Instantiate Variable Chart
+#////////////////////////////////////////////////////////////
+
+class InstantiateVarsChart(FeatureChart):
+    """
+    A specialized chart that 'instantiates' variables whose names
+    start with '@', by replacing them with unique new variables.
+    In particular, whenever a complete edge is added to the chart, any
+    variables in the edge's ``lhs`` whose names start with '@' will be
+    replaced by unique new ``Variable``s.
+    """
+    def __init__(self, tokens):
+        FeatureChart.__init__(self, tokens)
+
+    def initialize(self):
+        self._instantiated = set()
+        FeatureChart.initialize(self)
+
+    def insert(self, edge, child_pointer_list):
+        if edge in self._instantiated: return False
+        self.instantiate_edge(edge)
+        return FeatureChart.insert(self, edge, child_pointer_list)
+
+    def instantiate_edge(self, edge):
+        """
+        If the edge is a ``FeatureTreeEdge``, and it is complete,
+        then instantiate all variables whose names start with '@',
+        by replacing them with unique new variables.
+
+        Note that instantiation is done in-place, since the
+        parsing algorithms might already hold a reference to
+        the edge for future use.
+        """
+        # If the edge is a leaf, or is not complete, or is
+        # already in the chart, then just return it as-is.
+        if not isinstance(edge, FeatureTreeEdge): return
+        if not edge.is_complete(): return
+        if edge in self._edge_to_cpls: return
+
+        # Get a list of variables that need to be instantiated.
+        # If there are none, then return as-is.
+        inst_vars = self.inst_vars(edge)
+        if not inst_vars: return
+
+        # Instantiate the edge!
+        self._instantiated.add(edge)
+        edge._lhs = edge.lhs().substitute_bindings(inst_vars)
+
+    def inst_vars(self, edge):
+        return dict((var, logic.unique_variable())
+                    for var in edge.lhs().variables()
+                    if var.name.startswith('@'))
+
+
+#////////////////////////////////////////////////////////////
+# Demo
+#////////////////////////////////////////////////////////////
+
+def demo_grammar():
+    from nltk.grammar import FeatureGrammar
+    return FeatureGrammar.fromstring("""
+S  -> NP VP
+PP -> Prep NP
+NP -> NP PP
+VP -> VP PP
+VP -> Verb NP
+VP -> Verb
+NP -> Det[pl=?x] Noun[pl=?x]
+NP -> "John"
+NP -> "I"
+Det -> "the"
+Det -> "my"
+Det[-pl] -> "a"
+Noun[-pl] -> "dog"
+Noun[-pl] -> "cookie"
+Verb -> "ate"
+Verb -> "saw"
+Prep -> "with"
+Prep -> "under"
+""")
+
+def demo(print_times=True, print_grammar=True,
+         print_trees=True, print_sentence=True,
+         trace=1,
+         parser=FeatureChartParser,
+         sent='I saw John with a dog with my cookie'):
+    import sys, time
+    print()
+    grammar = demo_grammar()
+    if print_grammar:
+        print(grammar)
+        print()
+    print("*", parser.__name__)
+    if print_sentence:
+        print("Sentence:", sent)
+    tokens = sent.split()
+    t = time.clock()
+    cp = parser(grammar, trace=trace)
+    chart = cp.chart_parse(tokens)
+    trees = list(chart.parses(grammar.start()))
+    if print_times:
+        print("Time: %s" % (time.clock() - t))
+    if print_trees:
+        for tree in trees: print(tree)
+    else:
+        print("Nr trees:", len(trees))
+
+def run_profile():
+    import profile
+    profile.run('for i in range(1): demo()', '/tmp/profile.out')
+    import pstats
+    p = pstats.Stats('/tmp/profile.out')
+    p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
+    p.strip_dirs().sort_stats('cum', 'time').print_stats(60)
+
+if __name__ == '__main__':
+    from nltk.data import load
+    demo()
+    print()
+    grammar = load('grammars/book_grammars/feat0.fcfg')
+    cp = FeatureChartParser(grammar, trace=2)
+    sent = 'Kim likes children'
+    tokens = sent.split()
+    trees = cp.parse(tokens)
+    for tree in trees:
+        print(tree)
diff --git a/nlp_resource_data/nltk/parse/featurechart.pyc b/nlp_resource_data/nltk/parse/featurechart.pyc
new file mode 100755 (executable)
index 0000000..7addf46
Binary files /dev/null and b/nlp_resource_data/nltk/parse/featurechart.pyc differ
diff --git a/nlp_resource_data/nltk/parse/generate.py b/nlp_resource_data/nltk/parse/generate.py
new file mode 100755 (executable)
index 0000000..8326f5d
--- /dev/null
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Generating from a CFG
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+from __future__ import print_function
+
+import itertools
+import sys
+from nltk.grammar import Nonterminal
+
+
+def generate(grammar, start=None, depth=None, n=None):
+    """
+    Generates an iterator of all sentences from a CFG.
+
+    :param grammar: The Grammar used to generate sentences.
+    :param start: The Nonterminal from which to start generating sentences.
+    :param depth: The maximal depth of the generated tree.
+    :param n: The maximum number of sentences to return.
+    :return: An iterator of lists of terminal tokens.
+    """
+    if not start:
+        start = grammar.start()
+    if depth is None:
+        depth = sys.maxsize
+
+    sentences = _generate_all(grammar, [start], depth)
+
+    if n:
+        sentences = itertools.islice(sentences, n)
+
+    return sentences
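+
+# A small usage sketch (the toy grammar below is made up purely for
+# illustration):
+#
+#     >>> from nltk.grammar import CFG
+#     >>> from nltk.parse.generate import generate
+#     >>> toy = CFG.fromstring("S -> 'a' S | 'b'")
+#     >>> list(generate(toy, depth=4))
+#     [['a', 'a', 'b'], ['a', 'b'], ['b']]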
+
+
+def _generate_all(grammar, items, depth):
+    if items:
+        try:
+            for frag1 in _generate_one(grammar, items[0], depth):
+                for frag2 in _generate_all(grammar, items[1:], depth):
+                    yield frag1 + frag2
+        except RuntimeError as _error:
+            if "maximum recursion depth exceeded" in str(_error):
+                # Helpful error message while still showing the recursion stack.
+                raise RuntimeError("The grammar has rule(s) that yield infinite recursion!!")
+            else:
+                raise
+    else:
+        yield []
+
+
+def _generate_one(grammar, item, depth):
+    if depth > 0:
+        if isinstance(item, Nonterminal):
+            for prod in grammar.productions(lhs=item):
+                for frag in _generate_all(grammar, prod.rhs(), depth-1):
+                    yield frag
+        else:
+            yield [item]
+
+demo_grammar = """
+  S -> NP VP
+  NP -> Det N
+  PP -> P NP
+  VP -> 'slept' | 'saw' NP | 'walked' PP
+  Det -> 'the' | 'a'
+  N -> 'man' | 'park' | 'dog'
+  P -> 'in' | 'with'
+"""
+
+
+def demo(N=23):
+    from nltk.grammar import CFG
+
+    print('Generating the first %d sentences for demo grammar:' % (N,))
+    print(demo_grammar)
+    grammar = CFG.fromstring(demo_grammar)
+    for n, sent in enumerate(generate(grammar, n=N), 1):
+        print('%3d. %s' % (n, ' '.join(sent)))
+
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/parse/generate.pyc b/nlp_resource_data/nltk/parse/generate.pyc
new file mode 100755 (executable)
index 0000000..abe1f7a
Binary files /dev/null and b/nlp_resource_data/nltk/parse/generate.pyc differ
diff --git a/nlp_resource_data/nltk/parse/malt.py b/nlp_resource_data/nltk/parse/malt.py
new file mode 100755 (executable)
index 0000000..68bb396
--- /dev/null
@@ -0,0 +1,356 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Interface to MaltParser
+#
+# Author: Dan Garrette <dhgarrette@gmail.com>
+# Contributor: Liling Tan, Mustufain, osamamukhtar11
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from six import text_type
+import os
+import sys
+import tempfile
+import subprocess
+import inspect
+
+from nltk.data import ZipFilePathPointer
+from nltk.internals import find_dir, find_file, find_jars_within_path
+
+from nltk.parse.api import ParserI
+from nltk.parse.dependencygraph import DependencyGraph
+from nltk.parse.util import taggedsents_to_conll
+
+
+def malt_regex_tagger():
+    from nltk.tag import RegexpTagger
+    _tagger = RegexpTagger(
+    [(r'\.$','.'), (r'\,$',','), (r'\?$','?'),    # fullstop, comma, Qmark
+    (r'\($','('), (r'\)$',')'),             # round brackets
+    (r'\[$','['), (r'\]$',']'),             # square brackets
+    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),        # cardinal numbers
+    (r'(The|the|A|a|An|an)$', 'DT'),        # articles
+    (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'), # pronouns
+    (r'(His|his|Her|her|Its|its)$', 'PRP$'),    # possessive
+    (r'(my|Your|your|Yours|yours)$', 'PRP$'),   # possessive
+    (r'(on|On|in|In|at|At|since|Since)$', 'IN'),# time prepositions
+    (r'(for|For|ago|Ago|before|Before)$', 'IN'),# time prepositions
+    (r'(till|Till|until|Until)$', 'IN'),        # time prepositions
+    (r'(by|By|beside|Beside)$', 'IN'),          # space prepositions
+    (r'(under|Under|below|Below)$', 'IN'),      # space prepositions
+    (r'(over|Over|above|Above)$', 'IN'),        # space prepositions
+    (r'(across|Across|through|Through)$', 'IN'),# space prepositions
+    (r'(into|Into|towards|Towards)$', 'IN'),    # space prepositions
+    (r'(onto|Onto|from|From)$', 'IN'),          # space prepositions
+    (r'.*able$', 'JJ'), # adjectives
+    (r'.*ness$', 'NN'), # nouns formed from adjectives
+    (r'.*ly$', 'RB'),   # adverbs
+    (r'.*s$', 'NNS'),   # plural nouns
+    (r'.*ing$', 'VBG'), # gerunds
+    (r'.*ed$', 'VBD'),  # past tense verbs
+    (r'.*', 'NN'),      # nouns (default)
+    ])
+    return _tagger.tag
+
+
+def find_maltparser(parser_dirname):
+    """
+    Find the MaltParser .jar file and its dependencies.
+    """
+    if os.path.exists(parser_dirname): # If a full path is given.
+        _malt_dir = parser_dirname
+    else: # Try to find path to maltparser directory in environment variables.
+        _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))
+    # Check that the found directory contains all the necessary .jar files.
+    _malt_jars = set(find_jars_within_path(_malt_dir))
+    _jars = set(os.path.split(jar)[1] for jar in _malt_jars)
+    malt_dependencies = set(['log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'])
+
+    assert malt_dependencies.issubset(_jars)
+    assert any(filter(lambda i: i.startswith('maltparser-') and i.endswith('.jar'), _jars))
+    return list(_malt_jars)
+
+
+def find_malt_model(model_filename):
+    """
+    Find a pre-trained MaltParser model.
+    """
+    if model_filename is None:
+        return 'malt_temp.mco'
+    elif os.path.exists(model_filename): # If a full path is given.
+        return model_filename
+    else: # Try to find path to malt model in environment variables.
+        return find_file(model_filename, env_vars=('MALT_MODEL',), verbose=False)
+
+
+class MaltParser(ParserI):
+    """
+    A class for dependency parsing with MaltParser. The constructor takes:
+    - the path to a maltparser directory
+    - (optionally) the path to a pre-trained MaltParser .mco model file
+    - (optionally) the tagger to use for POS tagging before parsing
+    - (optionally) additional Java arguments
+
+    Example:
+        >>> from nltk.parse import malt
+        >>> # With MALT_PARSER and MALT_MODEL environment set.
+        >>> mp = malt.MaltParser('maltparser-1.7.2', 'engmalt.linear-1.7.mco') # doctest: +SKIP
+        >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
+        (shot I (elephant an) (in (pajamas my)) .)
+        >>> # Without MALT_PARSER and MALT_MODEL environment.
+        >>> mp = malt.MaltParser('/home/user/maltparser-1.7.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP
+        >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
+        (shot I (elephant an) (in (pajamas my)) .)
+    """
+    def __init__(self, parser_dirname, model_filename=None, tagger=None, additional_java_args=None):
+        """
+        An interface for parsing with the Malt Parser.
+
+        :param parser_dirname: The path to the maltparser directory that
+        contains the maltparser-1.x.jar
+        :type parser_dirname: str
+        :param model_filename: The name of the pre-trained model with .mco file
+        extension. If provided, training will not be required.
+        (see http://www.maltparser.org/mco/mco.html and
+        see http://www.patful.com/chalk/node/185)
+        :type model_filename: str
+        :param tagger: The tagger used to POS tag the raw string before
+        formatting to CONLL format. It should behave like `nltk.pos_tag`
+        :type tagger: function
+        :param additional_java_args: Additional Java arguments to pass when
+        calling MaltParser, usually heap-size limits,
+        e.g. `additional_java_args=['-Xmx1024m']`
+        (see http://goo.gl/mpDBvQ)
+        :type additional_java_args: list
+        """
+
+        # Find all the necessary jar files for MaltParser.
+        self.malt_jars = find_maltparser(parser_dirname)
+        # Initialize additional java arguments.
+        self.additional_java_args = additional_java_args if \
+                        additional_java_args is not None else []
+        # Initialize model.
+        self.model = find_malt_model(model_filename)
+        self._trained = self.model != 'malt_temp.mco'
+        # Set the working_dir parameter, i.e. MaltParser's `-w` option.
+        self.working_dir = tempfile.gettempdir()
+        # Initialize POS tagger.
+        self.tagger = tagger if tagger is not None else malt_regex_tagger()
+
+    def parse_tagged_sents(self, sentences, verbose=False, top_relation_label='null'):
+        """
+        Use MaltParser to parse multiple POS tagged sentences. Takes multiple
+        sentences where each sentence is a list of (word, tag) tuples.
+        The sentences must have already been tokenized and tagged.
+
+        :param sentences: Input sentences to parse
+        :type sentences: list(list(tuple(str, str)))
+        :return: iter(iter(``DependencyGraph``)) the dependency graph
+        representation of each sentence
+        """
+        if not self._trained:
+            raise Exception("Parser has not been trained. Call train() first.")
+
+        with tempfile.NamedTemporaryFile(prefix='malt_input.conll.',
+              dir=self.working_dir, mode='w', delete=False) as input_file:
+              with tempfile.NamedTemporaryFile(prefix='malt_output.conll.',
+                     dir=self.working_dir, mode='w', delete=False) as output_file:
+                # Convert list of sentences to CONLL format.
+                for line in taggedsents_to_conll(sentences):
+                    input_file.write(text_type(line))
+                input_file.close()
+
+                # Generate command to run maltparser.
+                cmd = self.generate_malt_command(input_file.name,
+                                output_file.name, mode="parse")
+
+                # This is a MaltParser quirk: it needs to be run from the
+                # directory where the model file lives, otherwise it fails
+                # with missing .jar or -w working_dir problems.
+                _current_path = os.getcwd() # Remembers the current path.
+                try: # Change to modelfile path
+                    os.chdir(os.path.split(self.model)[0])
+                except OSError:
+                    pass
+                ret = self._execute(cmd, verbose) # Run command.
+                os.chdir(_current_path) # Change back to current path.
+
+                if ret != 0:
+                    raise Exception("MaltParser parsing (%s) failed with exit "
+                            "code %d" % (' '.join(cmd), ret))
+
+                # Must return iter(iter(DependencyGraph))
+                with open(output_file.name) as infile:
+                    for tree_str in infile.read().split('\n\n'):
+                        yield iter([DependencyGraph(tree_str, top_relation_label=top_relation_label)])
+
+        os.remove(input_file.name)
+        os.remove(output_file.name)
+
+    def parse_sents(self, sentences, verbose=False, top_relation_label='null'):
+        """
+        Use MaltParser to parse multiple sentences.
+        Takes a list of sentences, where each sentence is a list of words.
+        Each sentence will be automatically tagged with this
+        MaltParser instance's tagger.
+
+        :param sentences: Input sentences to parse
+        :type sentences: list(list(str))
+        :return: iter(DependencyGraph)
+        """
+        tagged_sentences = (self.tagger(sentence) for sentence in sentences)
+        return self.parse_tagged_sents(tagged_sentences, verbose, top_relation_label=top_relation_label)
+
+    def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
+        """
+        This function generates the maltparser command used at the terminal.
+
+        :param inputfilename: path to the input file
+        :type inputfilename: str
+        :param outputfilename: path to the output file
+        :type outputfilename: str
+        """
+
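+        # For illustration (a sketch; the actual classpath depends on the local
+        # MaltParser installation), a "parse" invocation looks roughly like:
+        #   ['java', '-cp', '<jar1>:<jar2>', 'org.maltparser.Malt',
+        #    '-c', 'engmalt.linear-1.7.mco', '-i', '<input>', '-o', '<output>', '-m', 'parse']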
+        cmd = ['java']
+        cmd += self.additional_java_args  # Add additional java arguments.
+        # Join classpaths with ";" on Windows and with ":" on Linux/Mac.
+        classpaths_separator = ';' if sys.platform.startswith('win') else ':'
+        cmd += ['-cp', classpaths_separator.join(self.malt_jars)]  # Add classpaths for jars.
+        cmd += ['org.maltparser.Malt']  # Add the main class.
+
+        # Add the model file.
+        if os.path.exists(self.model):  # when parsing
+            cmd += ['-c', os.path.split(self.model)[-1]]
+        else:  # when learning
+            cmd += ['-c', self.model]
+
+        cmd += ['-i', inputfilename]
+        if mode == 'parse':
+            cmd += ['-o', outputfilename]
+        cmd += ['-m', mode]  # The mode used to generate parses.
+        return cmd
+
+    @staticmethod
+    def _execute(cmd, verbose=False):
+        output = None if verbose else subprocess.PIPE
+        p = subprocess.Popen(cmd, stdout=output, stderr=output)
+        return p.wait()
+
+    def train(self, depgraphs, verbose=False):
+        """
+        Train MaltParser from a list of ``DependencyGraph`` objects
+
+        :param depgraphs: list of ``DependencyGraph`` objects for training input data
+        :type depgraphs: list(DependencyGraph)
+        """
+
+        # Write the conll_str to malt_train.conll file in /tmp/
+        with tempfile.NamedTemporaryFile(prefix='malt_train.conll.',
+             dir=self.working_dir, mode='w', delete=False) as input_file:
+            input_str = ('\n'.join(dg.to_conll(10) for dg in depgraphs))
+            input_file.write(text_type(input_str))
+        # Trains the model with the malt_train.conll
+        self.train_from_file(input_file.name, verbose=verbose)
+        # Removes the malt_train.conll once training finishes.
+        os.remove(input_file.name)
+
+    def train_from_file(self, conll_file, verbose=False):
+        """
+        Train MaltParser from a file
+        :param conll_file: str for the filename of the training input data
+        :type conll_file: str
+        """
+
+        # If conll_file is a ZipFilePathPointer,
+        # then we need to do some extra massaging
+        if isinstance(conll_file, ZipFilePathPointer):
+            with tempfile.NamedTemporaryFile(prefix='malt_train.conll.',
+            dir=self.working_dir, mode='w', delete=False) as input_file:
+                with conll_file.open() as conll_input_file:
+                    conll_str = conll_input_file.read()
+                    input_file.write(text_type(conll_str))
+                return self.train_from_file(input_file.name, verbose=verbose)
+
+        # Generate command to run maltparser.
+        cmd = self.generate_malt_command(conll_file, mode="learn")
+        ret = self._execute(cmd, verbose)
+        if ret != 0:
+            raise Exception("MaltParser training (%s) failed with exit "
+                    "code %d" % (' '.join(cmd), ret))
+        self._trained = True
+
+
+if __name__ == '__main__':
+    '''
+    A demonstration function to show how NLTK users can use the MaltParser API.
+
+    >>> from nltk import pos_tag
+    >>> assert 'MALT_PARSER' in os.environ, str(
+    ... "Please set MALT_PARSER in your global environment, e.g.:\n"
+    ... "$ export MALT_PARSER='/home/user/maltparser-1.7.2/'")
+    >>>
+    >>> assert 'MALT_MODEL' in os.environ, str(
+    ... "Please set MALT_MODEL in your global environment, e.g.:\n"
+    ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'")
+    >>>
+    >>> _dg1_str = str("1    John    _    NNP   _    _    2    SUBJ    _    _\n"
+    ...             "2    sees    _    VB    _    _    0    ROOT    _    _\n"
+    ...             "3    a       _    DT    _    _    4    SPEC    _    _\n"
+    ...             "4    dog     _    NN    _    _    2    OBJ     _    _\n"
+    ...             "5    .     _    .    _    _    2    PUNCT     _    _\n")
+    >>>
+    >>>
+    >>> _dg2_str  = str("1    John    _    NNP   _    _    2    SUBJ    _    _\n"
+    ...             "2    walks   _    VB    _    _    0    ROOT    _    _\n"
+    ...             "3    .     _    .    _    _    2    PUNCT     _    _\n")
+    >>> dg1 = DependencyGraph(_dg1_str)
+    >>> dg2 = DependencyGraph(_dg2_str)
+    >>> # Initialize a MaltParser object
+    >>> parser_dirname = 'maltparser-1.7.2'
+    >>> mp = MaltParser(parser_dirname=parser_dirname)
+    >>>
+    >>> # Trains a model.
+    >>> mp.train([dg1,dg2], verbose=False)
+    >>> sent1 = ['John','sees','Mary', '.']
+    >>> sent2 = ['John', 'walks', 'a', 'dog', '.']
+    >>>
+    >>> # Parse a single sentence.
+    >>> parsed_sent1 = mp.parse_one(sent1)
+    >>> parsed_sent2 = mp.parse_one(sent2)
+    >>> print (parsed_sent1.tree())
+    (sees John Mary .)
+    >>> print (parsed_sent2.tree())
+    (walks John (dog a) .)
+    >>>
+    >>> # Parsing multiple sentences.
+    >>> sentences = [sent1,sent2]
+    >>> parsed_sents = mp.parse_sents(sentences)
+    >>> print(next(next(parsed_sents)).tree())
+    (sees John Mary .)
+    >>> print(next(next(parsed_sents)).tree())
+    (walks John (dog a) .)
+    >>>
+    >>> # Initialize a MaltParser object with an English pre-trained model.
+    >>> parser_dirname = 'maltparser-1.7.2'
+    >>> model_name = 'engmalt.linear-1.7.mco'
+    >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)
+    >>> sent1 = 'I shot an elephant in my pajamas .'.split()
+    >>> sent2 = 'Time flies like banana .'.split()
+    >>> # Parse a single sentence.
+    >>> print(mp.parse_one(sent1).tree())
+    (shot I (elephant an) (in (pajamas my)) .)
+    >>> # Parsing multiple sentences
+    >>> sentences = [sent1,sent2]
+    >>> parsed_sents = mp.parse_sents(sentences)
+    >>> print(next(next(parsed_sents)).tree())
+    (shot I (elephant an) (in (pajamas my)) .)
+    >>> print(next(next(parsed_sents)).tree())
+    (flies Time (like banana) .)
+    '''
+    import doctest
+    doctest.testmod()
diff --git a/nlp_resource_data/nltk/parse/malt.pyc b/nlp_resource_data/nltk/parse/malt.pyc
new file mode 100755 (executable)
index 0000000..241bbc7
Binary files /dev/null and b/nlp_resource_data/nltk/parse/malt.pyc differ
diff --git a/nlp_resource_data/nltk/parse/nonprojectivedependencyparser.py b/nlp_resource_data/nltk/parse/nonprojectivedependencyparser.py
new file mode 100755 (executable)
index 0000000..5adcd2c
--- /dev/null
@@ -0,0 +1,783 @@
+# Natural Language Toolkit: Dependency Grammars
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Jason Narad <jason.narad@gmail.com>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+from __future__ import print_function
+
+import math
+import logging
+
+from six.moves import range
+
+from nltk.parse.dependencygraph import DependencyGraph
+
+logger = logging.getLogger(__name__)
+
+#################################################################
+# DependencyScorerI - Interface for Graph-Edge Weight Calculation
+#################################################################
+
+
+class DependencyScorerI(object):
+    """
+    A scorer for calculating the weights on the edges of a weighted
+    dependency graph.  This is used by a
+    ``ProbabilisticNonprojectiveParser`` to initialize the edge
+    weights of a ``DependencyGraph``.  While typically this would be done
+    by training a binary classifier, any class that can return a
+    multidimensional list representation of the edge weights can
+    implement this interface.  As such, it has no necessary
+    fields.
+    """
+
+    def __init__(self):
+        if self.__class__ == DependencyScorerI:
+            raise TypeError('DependencyScorerI is an abstract interface')
+
+    def train(self, graphs):
+        """
+        :type graphs: list(DependencyGraph)
+        :param graphs: A list of dependency graphs to train the scorer.
+        Typically the edges present in the graphs can be used as
+        positive training examples, and the edges not present as negative
+        examples.
+        """
+        raise NotImplementedError()
+
+    def score(self, graph):
+        """
+        :type graph: DependencyGraph
+        :param graph: A dependency graph whose set of edges need to be
+        scored.
+        :rtype: A three-dimensional list of numbers.
+        :return: The scores are returned in a three-dimensional list, such
+        that the outer dimension refers to the head, and the
+        inner dimension refers to the dependencies.  For instance,
+        scores[0][1] would reference the list of scores corresponding to
+        arcs from node 0 to node 1.  The node's 'address' field can be used
+        to determine its number identification.
+
+        For further illustration, a score list corresponding to Fig.2 of
+        Keith Hall's 'K-best Spanning Tree Parsing' paper:
+              scores = [[[], [5],  [1],  [1]],
+                       [[], [],   [11], [4]],
+                       [[], [10], [],   [5]],
+                       [[], [8],  [8],  []]]
+        When used in conjunction with a MaxEntClassifier, each score would
+        correspond to the confidence of a particular edge being classified
+        with the positive training examples.
+        """
+        raise NotImplementedError()
+
+#################################################################
+# NaiveBayesDependencyScorer
+#################################################################
+
+
+class NaiveBayesDependencyScorer(DependencyScorerI):
+    """
+    A dependency scorer built around a probabilistic classifier.  In this
+    particular class that classifier is a ``NaiveBayesClassifier``.
+    It uses head-word, head-tag, child-word, and child-tag features
+    for classification.
+
+    >>> from nltk.parse.dependencygraph import DependencyGraph, conll_data2
+
+    >>> graphs = [DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry]
+    >>> npp = ProbabilisticNonprojectiveParser()
+    >>> npp.train(graphs, NaiveBayesDependencyScorer())
+    >>> parses = npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc'])
+    >>> len(list(parses))
+    1
+
+    """
+
+    def __init__(self):
+        pass  # Do nothing without throwing error
+
+    def train(self, graphs):
+        """
+        Trains a ``NaiveBayesClassifier`` using the edges present in
+        graphs list as positive examples, the edges not present as
+        negative examples.  Uses a feature vector of head-word,
+        head-tag, child-word, and child-tag.
+
+        :type graphs: list(DependencyGraph)
+        :param graphs: A list of dependency graphs to train the scorer.
+        """
+
+        from nltk.classify import NaiveBayesClassifier
+
+        # Create labeled training examples
+        labeled_examples = []
+        for graph in graphs:
+            for head_node in graph.nodes.values():
+                for child_index, child_node in graph.nodes.items():
+                    if child_index in head_node['deps']:
+                        label = "T"
+                    else:
+                        label = "F"
+                    labeled_examples.append(
+                        (
+                            dict(
+                                a=head_node['word'],
+                                b=head_node['tag'],
+                                c=child_node['word'],
+                                d=child_node['tag'],
+                            ),
+                            label,
+                        )
+                    )
+
+        self.classifier = NaiveBayesClassifier.train(labeled_examples)
+
+    def score(self, graph):
+        """
+        Converts the graph into a feature-based representation of
+        each edge, and then assigns a score to each based on the
+        confidence of the classifier in assigning it to the
+        positive label.  Scores are returned in a multidimensional list.
+
+        :type graph: DependencyGraph
+        :param graph: A dependency graph to score.
+        :rtype: three-dimensional list
+        :return: Edge scores for the graph parameter.
+        """
+        # Convert graph to feature representation
+        edges = []
+        for head_node in graph.nodes.values():
+            for child_node in graph.nodes.values():
+                edges.append(
+                    (
+                        dict(
+                            a=head_node['word'],
+                            b=head_node['tag'],
+                            c=child_node['word'],
+                            d=child_node['tag'],
+                        )
+                    )
+                )
+
+        # Score edges
+        edge_scores = []
+        row = []
+        count = 0
+        for pdist in self.classifier.prob_classify_many(edges):
+            logger.debug('%.4f %.4f', pdist.prob('T'), pdist.prob('F'))
+            # smoothing in case the probability = 0
+            row.append([math.log(pdist.prob("T")+0.00000000001)])
+            count += 1
+            if count == len(graph.nodes):
+                edge_scores.append(row)
+                row = []
+                count = 0
+        return edge_scores
+
+
+#################################################################
+# A Scorer for Demo Purposes
+#################################################################
+# A short class used to show the parsing example from the paper.
+class DemoScorer(DependencyScorerI):
+    def train(self, graphs):
+        print('Training...')
+
+    def score(self, graph):
+        # scores for Keith Hall 'K-best Spanning Tree Parsing' paper
+        return [[[], [5],  [1],  [1]],
+                [[], [],   [11], [4]],
+                [[], [10], [],   [5]],
+                [[], [8],  [8],  []]]
+
+#################################################################
+# Non-Projective Probabilistic Parsing
+#################################################################
+
+
+class ProbabilisticNonprojectiveParser(object):
+    """A probabilistic non-projective dependency parser.
+
+    Nonprojective dependencies allow for "crossing branches" in the parse tree,
+    which is necessary for representing particular linguistic phenomena, or even
+    typical parses in some languages.  This parser follows the MST parsing
+    algorithm, outlined in McDonald (2005), which likens the search for the best
+    non-projective parse to finding the maximum spanning tree in a weighted
+    directed graph.
+
+    >>> class Scorer(DependencyScorerI):
+    ...     def train(self, graphs):
+    ...         pass
+    ...
+    ...     def score(self, graph):
+    ...         return [
+    ...             [[], [5],  [1],  [1]],
+    ...             [[], [],   [11], [4]],
+    ...             [[], [10], [],   [5]],
+    ...             [[], [8],  [8],  []],
+    ...         ]
+
+
+    >>> npp = ProbabilisticNonprojectiveParser()
+    >>> npp.train([], Scorer())
+
+    >>> parses = npp.parse(['v1', 'v2', 'v3'], [None, None, None])
+    >>> len(list(parses))
+    1
+
+    Rule based example
+    ------------------
+
+    >>> from nltk.grammar import DependencyGrammar
+
+    >>> grammar = DependencyGrammar.fromstring('''
+    ... 'taught' -> 'play' | 'man'
+    ... 'man' -> 'the' | 'in'
+    ... 'in' -> 'corner'
+    ... 'corner' -> 'the'
+    ... 'play' -> 'golf' | 'dachshund' | 'to'
+    ... 'dachshund' -> 'his'
+    ... ''')
+
+    >>> ndp = NonprojectiveDependencyParser(grammar)
+    >>> parses = ndp.parse(['the', 'man', 'in', 'the', 'corner', 'taught', 'his', 'dachshund', 'to', 'play', 'golf'])
+    >>> len(list(parses))
+    4
+
+    """
+    def __init__(self):
+        """
+        Creates a new non-projective parser.
+        """
+        logging.debug('initializing prob. nonprojective...')
+
+    def train(self, graphs, dependency_scorer):
+        """
+        Trains a ``DependencyScorerI`` from a set of ``DependencyGraph`` objects,
+        and establishes this as the parser's scorer.  This is used to
+        initialize the scores on a ``DependencyGraph`` during the parsing
+        procedure.
+
+        :type graphs: list(DependencyGraph)
+        :param graphs: A list of dependency graphs to train the scorer.
+        :type dependency_scorer: DependencyScorerI
+        :param dependency_scorer: A scorer which implements the
+            ``DependencyScorerI`` interface.
+        """
+        self._scorer = dependency_scorer
+        self._scorer.train(graphs)
+
+    def initialize_edge_scores(self, graph):
+        """
+        Assigns a score to every edge in the ``DependencyGraph`` graph.
+        These scores are generated via the parser's scorer which
+        was assigned during the training process.
+
+        :type graph: DependencyGraph
+        :param graph: A dependency graph to assign scores to.
+        """
+        self.scores = self._scorer.score(graph)
+
+    def collapse_nodes(self, new_node, cycle_path, g_graph, b_graph, c_graph):
+        """
+        Takes a list of nodes that have been identified to belong to a cycle,
+        and collapses them into one larger node.  The arcs of all nodes in
+        the graph must be updated to account for this.
+
+        :type new_node: Node.
+        :param new_node: A Node (Dictionary) to collapse the cycle nodes into.
+        :type cycle_path: A list of integers.
+        :param cycle_path: A list of node addresses, each of which is in the cycle.
+        :type g_graph, b_graph, c_graph: DependencyGraph
+        :param g_graph, b_graph, c_graph: Graphs which need to be updated.
+        """
+        logger.debug('Collapsing nodes...')
+        # Collapse all cycle nodes into v_n+1 in G_Graph
+        for cycle_node_index in cycle_path:
+            g_graph.remove_by_address(cycle_node_index)
+        g_graph.add_node(new_node)
+        g_graph.redirect_arcs(cycle_path, new_node['address'])
+
+    def update_edge_scores(self, new_node, cycle_path):
+        """
+        Updates the edge scores to reflect a collapse operation into
+        new_node.
+
+        :type new_node: A Node.
+        :param new_node: The node which cycle nodes are collapsed into.
+        :type cycle_path: A list of integers.
+        :param cycle_path: A list of node addresses that belong to the cycle.
+        """
+        logger.debug('cycle %s', cycle_path)
+
+        cycle_path = self.compute_original_indexes(cycle_path)
+
+        logger.debug('old cycle %s', cycle_path)
+        logger.debug('Prior to update: %s', self.scores)
+
+        for i, row in enumerate(self.scores):
+            for j, column in enumerate(self.scores[i]):
+                logger.debug(self.scores[i][j])
+                if (
+                    j in cycle_path
+                    and i not in cycle_path
+                    and self.scores[i][j]
+                ):
+                    subtract_val = self.compute_max_subtract_score(j, cycle_path)
+
+                    logger.debug('%s - %s', self.scores[i][j], subtract_val)
+
+                    new_vals = []
+                    for cur_val in self.scores[i][j]:
+                        new_vals.append(cur_val - subtract_val)
+
+                    self.scores[i][j] = new_vals
+
+        for i, row in enumerate(self.scores):
+            for j, cell in enumerate(self.scores[i]):
+                if i in cycle_path and j in cycle_path:
+                    self.scores[i][j] = []
+
+        logger.debug('After update: %s', self.scores)
+
+    def compute_original_indexes(self, new_indexes):
+        """
+        As nodes are collapsed into others, they are replaced
+        by the new node in the graph, but it's still necessary
+        to keep track of what these original nodes were.  This
+        takes a list of node addresses and replaces any collapsed
+        node addresses with their original addresses.
+
+        :type new_indexes: A list of integers.
+        :param new_indexes: A list of node addresses to check for
+        subsumed nodes.
+        """
+        swapped = True
+        while swapped:
+            originals = []
+            swapped = False
+            for new_index in new_indexes:
+                if new_index in self.inner_nodes:
+                    for old_val in self.inner_nodes[new_index]:
+                        if old_val not in originals:
+                            originals.append(old_val)
+                            swapped = True
+                else:
+                    originals.append(new_index)
+            new_indexes = originals
+        return new_indexes
+
+    def compute_max_subtract_score(self, column_index, cycle_indexes):
+        """
+        When updating scores, the score of the highest-weighted incoming
+        arc is subtracted upon collapse.  This returns the correct
+        amount to subtract from that edge.
+
+        :type column_index: integer.
+        :param column_index: An index representing the column of incoming arcs
+        to a particular node being updated
+        :type cycle_indexes: A list of integers.
+        :param cycle_indexes: Only arcs from cycle nodes are considered.  This
+        is a list of such nodes' addresses.
+        """
+        max_score = -100000
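+        # Worked example (using the DemoScorer matrix above): with
+        # column_index=1 and cycle_indexes=[2, 3], the candidates are
+        # scores[2][1] == [10] and scores[3][1] == [8], so 10 is returned.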
+        for row_index in cycle_indexes:
+            for subtract_val in self.scores[row_index][column_index]:
+                if subtract_val > max_score:
+                    max_score = subtract_val
+        return max_score
+
+    def best_incoming_arc(self, node_index):
+        """
+        Returns the source of the best incoming arc to the
+        node with address: node_index
+
+        :type node_index: integer.
+        :param node_index: The address of the 'destination' node,
+        the node that is arced to.
+        """
+        originals = self.compute_original_indexes([node_index])
+        logger.debug('originals: %s', originals)
+
+        max_arc = None
+        max_score = None
+        for row_index in range(len(self.scores)):
+            for col_index in range(len(self.scores[row_index])):
+                # print self.scores[row_index][col_index]
+                if col_index in originals and (max_score is None or self.scores[row_index][col_index] > max_score):
+                    max_score = self.scores[row_index][col_index]
+                    max_arc = row_index
+                    logger.debug('%s, %s', row_index, col_index)
+
+        logger.debug(max_score)
+
+        for key in self.inner_nodes:
+            replaced_nodes = self.inner_nodes[key]
+            if max_arc in replaced_nodes:
+                return key
+
+        return max_arc
+
+    def original_best_arc(self, node_index):
+        originals = self.compute_original_indexes([node_index])
+        max_arc = None
+        max_score = None
+        max_orig = None
+        for row_index in range(len(self.scores)):
+            for col_index in range(len(self.scores[row_index])):
+                if col_index in originals and (max_score is None or self.scores[row_index][col_index] > max_score):
+                    max_score = self.scores[row_index][col_index]
+                    max_arc = row_index
+                    max_orig = col_index
+        return [max_arc, max_orig]
+
+    def parse(self, tokens, tags):
+        """
+        Parses a list of tokens in accordance to the MST parsing algorithm
+        for non-projective dependency parses.  Assumes that the tokens to
+        be parsed have already been tagged and those tags are provided.  Various
+        scoring methods can be used by implementing the ``DependencyScorerI``
+        interface and passing it to the training algorithm.
+
+        :type tokens: list(str)
+        :param tokens: A list of words or punctuation to be parsed.
+        :type tags: list(str)
+        :param tags: A list of tags corresponding by index to the words in the tokens list.
+        :return: An iterator of non-projective parses.
+        :rtype: iter(DependencyGraph)
+        """
+        self.inner_nodes = {}
+
+        # Initialize g_graph
+        g_graph = DependencyGraph()
+        for index, token in enumerate(tokens):
+            g_graph.nodes[index + 1].update(
+                {
+                    'word': token,
+                    'tag': tags[index],
+                    'rel': 'NTOP',
+                    'address': index + 1,
+                }
+            )
+        #print (g_graph.nodes)
+
+
+        # Fully connect non-root nodes in g_graph
+        g_graph.connect_graph()
+        original_graph = DependencyGraph()
+        for index, token in enumerate(tokens):
+            original_graph.nodes[index + 1].update(
+                {
+                    'word': token,
+                    'tag': tags[index],
+                    'rel': 'NTOP',
+                    'address': index+1,
+                }
+            )
+
+        b_graph = DependencyGraph()
+        c_graph = DependencyGraph()
+
+        for index, token in enumerate(tokens):
+            c_graph.nodes[index + 1].update(
+                {
+                    'word': token,
+                    'tag': tags[index],
+                    'rel': 'NTOP',
+                    'address': index + 1,
+                }
+            )
+
+        # Assign initial scores to g_graph edges
+        self.initialize_edge_scores(g_graph)
+        logger.debug(self.scores)
+        # Initialize a list of unvisited vertices (by node address)
+        unvisited_vertices = [
+            vertex['address'] for vertex in c_graph.nodes.values()
+        ]
+        # Iterate over unvisited vertices
+        nr_vertices = len(tokens)
+        betas = {}
+        while unvisited_vertices:
+            # Mark current node as visited
+            current_vertex = unvisited_vertices.pop(0)
+            logger.debug('current_vertex: %s', current_vertex)
+            # Get corresponding node n_i to vertex v_i
+            current_node = g_graph.get_by_address(current_vertex)
+            logger.debug('current_node: %s', current_node)
+            # Get best in-edge node b for current node
+            best_in_edge = self.best_incoming_arc(current_vertex)
+            betas[current_vertex] = self.original_best_arc(current_vertex)
+            logger.debug('best in arc: %s --> %s', best_in_edge, current_vertex)
+            # b_graph = Union(b_graph, b)
+            for new_vertex in [current_vertex, best_in_edge]:
+                b_graph.nodes[new_vertex].update(
+                    {
+                        'word': 'TEMP',
+                        'rel': 'NTOP',
+                        'address': new_vertex,
+                    }
+                )
+            b_graph.add_arc(best_in_edge, current_vertex)
+            # Beta(current node) = b  - stored for parse recovery
+            # If b_graph contains a cycle, collapse it
+            cycle_path = b_graph.contains_cycle()
+            if cycle_path:
+                # Create a new node v_n+1 with address = len(nodes) + 1
+                new_node = {
+                    'word': 'NONE',
+                    'rel': 'NTOP',
+                    'address': nr_vertices + 1,
+                }
+                # c_graph = Union(c_graph, v_n+1)
+                c_graph.add_node(new_node)
+                # Collapse all nodes in cycle C into v_n+1
+                self.update_edge_scores(new_node, cycle_path)
+                self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
+                for cycle_index in cycle_path:
+                    c_graph.add_arc(new_node['address'], cycle_index)
+                    # self.replaced_by[cycle_index] = new_node['address']
+
+                self.inner_nodes[new_node['address']] = cycle_path
+
+                # Add v_n+1 to list of unvisited vertices
+                unvisited_vertices.insert(0, nr_vertices + 1)
+
+                # increment # of nodes counter
+                nr_vertices += 1
+
+                # Remove cycle nodes from b_graph; B = B - cycle c
+                for cycle_node_address in cycle_path:
+                    b_graph.remove_by_address(cycle_node_address)
+
+            logger.debug('g_graph: %s', g_graph)
+            logger.debug('b_graph: %s', b_graph)
+            logger.debug('c_graph: %s', c_graph)
+            logger.debug('Betas: %s', betas)
+            logger.debug('replaced nodes %s', self.inner_nodes)
+
+        # Recover parse tree
+        logger.debug('Final scores: %s', self.scores)
+
+        logger.debug('Recovering parse...')
+        for i in range(len(tokens) + 1, nr_vertices + 1):
+            betas[betas[i][1]] = betas[i]
+
+        logger.debug('Betas: %s', betas)
+        for node in original_graph.nodes.values():
+            # TODO: It's dangerous to assume that deps is a dictionary
+            # because it's a default dictionary. Ideally, here we should not
+            # be concerned with how dependencies are stored inside of a
+            # dependency graph.
+            node['deps'] = {}
+        for i in range(1, len(tokens) + 1):
+            original_graph.add_arc(betas[i][0], betas[i][1])
+
+        logger.debug('Done.')
+        yield original_graph
+
+#################################################################
+# Rule-based Non-Projective Parser
+#################################################################
+
+
+class NonprojectiveDependencyParser(object):
+    """
+    A non-projective, rule-based, dependency parser.  This parser
+    will return the set of all possible non-projective parses based on
+    the word-to-word relations defined in the parser's dependency
+    grammar, and will allow the branches of the parse tree to cross
+    in order to capture a variety of linguistic phenomena that a
+    projective parser will not.
+    """
+
+    def __init__(self, dependency_grammar):
+        """
+        Creates a new ``NonprojectiveDependencyParser``.
+
+        :param dependency_grammar: a grammar of word-to-word relations.
+        :type dependency_grammar: DependencyGrammar
+        """
+        self._grammar = dependency_grammar
+
+    def parse(self, tokens):
+        """
+        Parses the input tokens with respect to the parser's grammar.  Parsing
+        is accomplished by representing the search-space of possible parses as
+        a fully-connected directed graph.  Arcs that would lead to ungrammatical
+        parses are removed and a lattice is constructed of length n, where n is
+        the number of input tokens, to represent all possible grammatical
+        traversals.  All possible paths through the lattice are then enumerated
+        to produce the set of non-projective parses.
+
+        :param tokens: A list of tokens to parse.
+        :type tokens: list(str)
+        :return: An iterator of non-projective parses.
+        :rtype: iter(DependencyGraph)
+        """
+        # Create graph representation of tokens
+        self._graph = DependencyGraph()
+
+        for index, token in enumerate(tokens):
+            self._graph.nodes[index] = {
+                'word': token,
+                'deps': [],
+                'rel': 'NTOP',
+                'address': index,
+            }
+
+        for head_node in self._graph.nodes.values():
+            deps = []
+            for dep_node in self._graph.nodes.values():
+                if (
+                    self._grammar.contains(head_node['word'], dep_node['word'])
+                    and head_node['word'] != dep_node['word']
+                ):
+                    deps.append(dep_node['address'])
+            head_node['deps'] = deps
+
+        # Create lattice of possible heads
+        roots = []
+        possible_heads = []
+        for i, word in enumerate(tokens):
+            heads = []
+            for j, head in enumerate(tokens):
+                if (i != j) and self._grammar.contains(head, word):
+                    heads.append(j)
+            if len(heads) == 0:
+                roots.append(i)
+            possible_heads.append(heads)
+
+        # Set roots to attempt
+        if len(roots) < 2:
+            if len(roots) == 0:
+                for i in range(len(tokens)):
+                    roots.append(i)
+
+            # Traverse lattice
+            analyses = []
+            for root in roots:
+                stack = []
+                analysis = [[] for i in range(len(possible_heads))]
+            i = 0
+            forward = True
+            while i >= 0:
+                if forward:
+                    if len(possible_heads[i]) == 1:
+                        analysis[i] = possible_heads[i][0]
+                    elif len(possible_heads[i]) == 0:
+                        analysis[i] = -1
+                    else:
+                        head = possible_heads[i].pop()
+                        analysis[i] = head
+                        stack.append([i, head])
+                if not forward:
+                    index_on_stack = False
+                    for stack_item in stack:
+                        if stack_item[0] == i:
+                            index_on_stack = True
+                    orig_length = len(possible_heads[i])
+
+                    if index_on_stack and orig_length == 0:
+                        for j in range(len(stack) - 1, -1, -1):
+                            stack_item = stack[j]
+                            if stack_item[0] == i:
+                                possible_heads[i].append(stack.pop(j)[1])
+
+                    elif index_on_stack and orig_length > 0:
+                        head = possible_heads[i].pop()
+                        analysis[i] = head
+                        stack.append([i, head])
+                        forward = True
+
+                if i + 1 == len(possible_heads):
+                    analyses.append(analysis[:])
+                    forward = False
+                if forward:
+                    i += 1
+                else:
+                    i -= 1
+
+        # Filter parses
+        # ensure 1 root; everything has 1 head
+        for analysis in analyses:
+            if analysis.count(-1) > 1:
+                # there are several root elements!
+                continue
+
+            graph = DependencyGraph()
+            graph.root = graph.nodes[analysis.index(-1) + 1]
+
+            for address, (token, head_index) in enumerate(zip(tokens, analysis), start=1):
+                head_address = head_index + 1
+
+                node = graph.nodes[address]
+                node.update(
+                    {
+                        'word': token,
+                        'address': address,
+                    }
+                )
+
+                if head_address == 0:
+                    rel = 'ROOT'
+                else:
+                    rel = ''
+                graph.nodes[head_index + 1]['deps'][rel].append(address)
+
+            # TODO: check for cycles
+            yield graph
+
+
+#################################################################
+# Demos
+#################################################################
+
+def demo():
+    # hall_demo()
+    nonprojective_conll_parse_demo()
+    rule_based_demo()
+
+
+def hall_demo():
+    npp = ProbabilisticNonprojectiveParser()
+    npp.train([], DemoScorer())
+    for parse_graph in npp.parse(['v1', 'v2', 'v3'], [None, None, None]):
+        print(parse_graph)
+
+
+def nonprojective_conll_parse_demo():
+    from nltk.parse.dependencygraph import conll_data2
+
+    graphs = [
+        DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry
+    ]
+    npp = ProbabilisticNonprojectiveParser()
+    npp.train(graphs, NaiveBayesDependencyScorer())
+    for parse_graph in npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc']):
+        print(parse_graph)
+
+
+def rule_based_demo():
+    from nltk.grammar import DependencyGrammar
+
+    grammar = DependencyGrammar.fromstring("""
+    'taught' -> 'play' | 'man'
+    'man' -> 'the' | 'in'
+    'in' -> 'corner'
+    'corner' -> 'the'
+    'play' -> 'golf' | 'dachshund' | 'to'
+    'dachshund' -> 'his'
+    """)
+    print(grammar)
+    ndp = NonprojectiveDependencyParser(grammar)
+    graphs = ndp.parse(['the', 'man', 'in', 'the', 'corner', 'taught', 'his', 'dachshund', 'to', 'play', 'golf'])
+    print('Graphs:')
+    for graph in graphs:
+        print(graph)
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/parse/nonprojectivedependencyparser.pyc b/nlp_resource_data/nltk/parse/nonprojectivedependencyparser.pyc
new file mode 100755 (executable)
index 0000000..d862974
Binary files /dev/null and b/nlp_resource_data/nltk/parse/nonprojectivedependencyparser.pyc differ
diff --git a/nlp_resource_data/nltk/parse/pchart.py b/nlp_resource_data/nltk/parse/pchart.py
new file mode 100755 (executable)
index 0000000..288d8c2
--- /dev/null
@@ -0,0 +1,522 @@
+# Natural Language Toolkit: Probabilistic Chart Parsers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Classes and interfaces for associating probabilities with tree
+structures that represent the internal organization of a text.  The
+probabilistic parser module defines ``BottomUpProbabilisticChartParser``.
+
+``BottomUpProbabilisticChartParser`` is an abstract class that implements
+a bottom-up chart parser for ``PCFG`` grammars.  It maintains a queue of edges,
+and adds them to the chart one at a time.  The ordering of this queue
+is based on the probabilities associated with the edges, allowing the
+parser to expand more likely edges before less likely ones.  Each
+subclass implements a different queue ordering, producing different
+search strategies.  Currently the following subclasses are defined:
+
+  - ``InsideChartParser`` searches edges in decreasing order of
+    their trees' inside probabilities.
+  - ``RandomChartParser`` searches edges in random order.
+  - ``LongestChartParser`` searches edges in decreasing order of their
+    location's length.
+
+The ``BottomUpProbabilisticChartParser`` constructor has an optional
+argument beam_size.  If non-zero, this controls the size of the beam
+(aka the edge queue).  This option is most useful with InsideChartParser.
+"""
+from __future__ import print_function, unicode_literals
+
+##//////////////////////////////////////////////////////
+##  Bottom-Up PCFG Chart Parser
+##//////////////////////////////////////////////////////
+
+# [XX] This might not be implemented quite right -- it would be better
+# to associate probabilities with child pointer lists.
+
+from functools import reduce
+from nltk.tree import Tree, ProbabilisticTree
+from nltk.grammar import Nonterminal, PCFG
+
+from nltk.parse.api import ParserI
+from nltk.parse.chart import Chart, LeafEdge, TreeEdge, AbstractChartRule
+from nltk.compat import python_2_unicode_compatible
+
+# Probabilistic edges
+class ProbabilisticLeafEdge(LeafEdge):
+    def prob(self): return 1.0
+
+class ProbabilisticTreeEdge(TreeEdge):
+    def __init__(self, prob, *args, **kwargs):
+        TreeEdge.__init__(self, *args, **kwargs)
+        self._prob = prob
+        # two edges with different probabilities are not equal.
+        self._comparison_key = (self._comparison_key, prob)
+
+    def prob(self): return self._prob
+
+    @staticmethod
+    def from_production(production, index, p):
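+        # For example, given the production ``NP -> Det N [0.5]`` and index 3,
+        # this builds a zero-width edge spanning (3, 3) with lhs ``NP``, dot
+        # position 0 and probability 0.5, which the fundamental rule below can
+        # then extend across the chart.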
+        return ProbabilisticTreeEdge(p, (index, index), production.lhs(),
+                                     production.rhs(), 0)
+
+# Rules using probabilistic edges
+class ProbabilisticBottomUpInitRule(AbstractChartRule):
+    NUM_EDGES=0
+    def apply(self, chart, grammar):
+        for index in range(chart.num_leaves()):
+            new_edge = ProbabilisticLeafEdge(chart.leaf(index), index)
+            if chart.insert(new_edge, ()):
+                yield new_edge
+
+class ProbabilisticBottomUpPredictRule(AbstractChartRule):
+    NUM_EDGES=1
+    def apply(self, chart, grammar, edge):
+        if edge.is_incomplete(): return
+        for prod in grammar.productions():
+            if edge.lhs() == prod.rhs()[0]:
+                new_edge = ProbabilisticTreeEdge.from_production(prod, edge.start(), prod.prob())
+                if chart.insert(new_edge, ()):
+                    yield new_edge
+
+class ProbabilisticFundamentalRule(AbstractChartRule):
+    NUM_EDGES=2
+    def apply(self, chart, grammar, left_edge, right_edge):
+        # Make sure the rule is applicable.
+        if not (left_edge.end() == right_edge.start() and
+                left_edge.nextsym() == right_edge.lhs() and
+                left_edge.is_incomplete() and right_edge.is_complete()):
+            return
+
+        # Construct the new edge.
+        p = left_edge.prob() * right_edge.prob()
+        new_edge = ProbabilisticTreeEdge(p,
+                            span=(left_edge.start(), right_edge.end()),
+                            lhs=left_edge.lhs(), rhs=left_edge.rhs(),
+                            dot=left_edge.dot()+1)
+
+        # Add it to the chart, with appropriate child pointers.
+        changed_chart = False
+        for cpl1 in chart.child_pointer_lists(left_edge):
+            if chart.insert(new_edge, cpl1+(right_edge,)):
+                changed_chart = True
+
+        # If we changed the chart, then generate the edge.
+        if changed_chart: yield new_edge
+
+@python_2_unicode_compatible
+class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule):
+    NUM_EDGES=1
+
+    _fundamental_rule = ProbabilisticFundamentalRule()
+
+    def apply(self, chart, grammar, edge1):
+        fr = self._fundamental_rule
+        if edge1.is_incomplete():
+            # edge1 = left_edge; edge2 = right_edge
+            for edge2 in chart.select(start=edge1.end(), is_complete=True,
+                                     lhs=edge1.nextsym()):
+                for new_edge in fr.apply(chart, grammar, edge1, edge2):
+                    yield new_edge
+        else:
+            # edge2 = left_edge; edge1 = right_edge
+            for edge2 in chart.select(end=edge1.start(), is_complete=False,
+                                      nextsym=edge1.lhs()):
+                for new_edge in fr.apply(chart, grammar, edge2, edge1):
+                    yield new_edge
+
+    def __str__(self):
+        return 'Fundamental Rule'
+
+class BottomUpProbabilisticChartParser(ParserI):
+    """
+    An abstract bottom-up parser for ``PCFG`` grammars that uses a ``Chart`` to
+    record partial results.  ``BottomUpProbabilisticChartParser`` maintains
+    a queue of edges that can be added to the chart.  This queue is
+    initialized with edges for each token in the text that is being
+    parsed.  ``BottomUpProbabilisticChartParser`` inserts these edges into
+    the chart one at a time, starting with the most likely edges, and
+    proceeding to less likely edges.  For each edge that is added to
+    the chart, it may become possible to insert additional edges into
+    the chart; these are added to the queue.  This process continues
+    until enough complete parses have been generated, or until the
+    queue is empty.
+
+    The sorting order for the queue is not specified by
+    ``BottomUpProbabilisticChartParser``.  Different sorting orders will
+    result in different search strategies.  The sorting order for the
+    queue is defined by the method ``sort_queue``; subclasses are required
+    to provide a definition for this method.
+
+    :type _grammar: PCFG
+    :ivar _grammar: The grammar used to parse sentences.
+    :type _trace: int
+    :ivar _trace: The level of tracing output that should be generated
+        when parsing a text.
+    """
+    def __init__(self, grammar, beam_size=0, trace=0):
+        """
+        Create a new ``BottomUpProbabilisticChartParser``, that uses
+        ``grammar`` to parse texts.
+
+        :type grammar: PCFG
+        :param grammar: The grammar used to parse texts.
+        :type beam_size: int
+        :param beam_size: The maximum length for the parser's edge queue.
+        :type trace: int
+        :param trace: The level of tracing that should be used when
+            parsing a text.  ``0`` will generate no tracing output;
+            and higher numbers will produce more verbose tracing
+            output.
+        """
+        if not isinstance(grammar, PCFG):
+            raise ValueError("The grammar must be probabilistic PCFG")
+        self._grammar = grammar
+        self.beam_size = beam_size
+        self._trace = trace
+
+    def grammar(self):
+        return self._grammar
+
+    def trace(self, trace=2):
+        """
+        Set the level of tracing output that should be generated when
+        parsing a text.
+
+        :type trace: int
+        :param trace: The trace level.  A trace level of ``0`` will
+            generate no tracing output; and higher trace levels will
+            produce more verbose tracing output.
+        :rtype: None
+        """
+        self._trace = trace
+
+    # TODO: change this to conform more with the standard ChartParser
+    def parse(self, tokens):
+        self._grammar.check_coverage(tokens)
+        chart = Chart(list(tokens))
+        grammar = self._grammar
+
+        # Chart parser rules.
+        bu_init = ProbabilisticBottomUpInitRule()
+        bu = ProbabilisticBottomUpPredictRule()
+        fr = SingleEdgeProbabilisticFundamentalRule()
+
+        # Our queue
+        queue = []
+
+        # Initialize the chart.
+        for edge in bu_init.apply(chart, grammar):
+            if self._trace > 1:
+                print('  %-50s [%s]' % (chart.pretty_format_edge(edge,width=2),
+                                        edge.prob()))
+            queue.append(edge)
+
+        while len(queue) > 0:
+            # Re-sort the queue.
+            self.sort_queue(queue, chart)
+
+            # Prune the queue to the correct size if a beam was defined
+            if self.beam_size:
+                self._prune(queue, chart)
+
+            # Get the best edge.
+            edge = queue.pop()
+            if self._trace > 0:
+                print('  %-50s [%s]' % (chart.pretty_format_edge(edge,width=2),
+                                        edge.prob()))
+
+            # Apply BU & FR to it.
+            queue.extend(bu.apply(chart, grammar, edge))
+            queue.extend(fr.apply(chart, grammar, edge))
+
+        # Get a list of complete parses.
+        parses = list(chart.parses(grammar.start(), ProbabilisticTree))
+
+        # Assign probabilities to the trees.
+        prod_probs = {}
+        for prod in grammar.productions():
+            prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
+        for parse in parses:
+            self._setprob(parse, prod_probs)
+
+        # Sort by probability
+        parses.sort(reverse=True, key=lambda tree: tree.prob())
+
+        return iter(parses)
+
+    def _setprob(self, tree, prod_probs):
+        if tree.prob() is not None: return
+
+        # Get the prob of the CFG production.
+        lhs = Nonterminal(tree.label())
+        rhs = []
+        for child in tree:
+            if isinstance(child, Tree):
+                rhs.append(Nonterminal(child.label()))
+            else:
+                rhs.append(child)
+        prob = prod_probs[lhs, tuple(rhs)]
+
+        # Get the probs of children.
+        for child in tree:
+            if isinstance(child, Tree):
+                self._setprob(child, prod_probs)
+                prob *= child.prob()
+
+        tree.set_prob(prob)
+
+    def sort_queue(self, queue, chart):
+        """
+        Sort the given queue of ``Edge`` objects, placing the edge that should
+        be tried first at the beginning of the queue.  This method
+        will be called after each ``Edge`` is added to the queue.
+
+        :param queue: The queue of ``Edge`` objects to sort.  Each edge in
+            this queue is an edge that could be added to the chart by
+            the fundamental rule; but that has not yet been added.
+        :type queue: list(Edge)
+        :param chart: The chart being used to parse the text.  This
+            chart can be used to provide extra information for sorting
+            the queue.
+        :type chart: Chart
+        :rtype: None
+        """
+        raise NotImplementedError()
+
+    def _prune(self, queue, chart):
+        """ Discard items in the queue if the queue is longer than the beam."""
+        if len(queue) > self.beam_size:
+            split = len(queue)-self.beam_size
+            if self._trace > 2:
+                for edge in queue[:split]:
+                    print('  %-50s [DISCARDED]' % chart.pretty_format_edge(edge,2))
+            del queue[:split]
+
+class InsideChartParser(BottomUpProbabilisticChartParser):
+    """
+    A bottom-up parser for ``PCFG`` grammars that tries edges in descending
+    order of the inside probabilities of their trees.  The "inside
+    probability" of a tree is simply the
+    probability of the entire tree, ignoring its context.  In
+    particular, the inside probability of a tree generated by
+    production *p* with children *c[1], c[2], ..., c[n]* is
+    *P(p)P(c[1])P(c[2])...P(c[n])*; and the inside
+    probability of a token is 1 if it is present in the text, and 0 if
+    it is absent.
+
+    This sorting order results in a type of lowest-cost-first search
+    strategy.
+    """
+    # Inherit constructor.
+    def sort_queue(self, queue, chart):
+        """
+        Sort the given queue of edges, in descending order of the
+        inside probabilities of the edges' trees.
+
+        :param queue: The queue of ``Edge`` objects to sort.  Each edge in
+            this queue is an edge that could be added to the chart by
+            the fundamental rule; but that has not yet been added.
+        :type queue: list(Edge)
+        :param chart: The chart being used to parse the text.  This
+            chart can be used to provide extra information for sorting
+            the queue.
+        :type chart: Chart
+        :rtype: None
+        """
+        queue.sort(key=lambda edge: edge.prob())
+
+# Eventually, this will become some sort of inside-outside parser:
+# class InsideOutsideParser(BottomUpProbabilisticChartParser):
+#     def __init__(self, grammar, trace=0):
+#         # Inherit docs.
+#         BottomUpProbabilisticChartParser.__init__(self, grammar, trace)
+#
+#         # Find the best path from S to each nonterminal
+#         bestp = {}
+#         for production in grammar.productions(): bestp[production.lhs()]=0
+#         bestp[grammar.start()] = 1.0
+#
+#         for i in range(len(grammar.productions())):
+#             for production in grammar.productions():
+#                 lhs = production.lhs()
+#                 for elt in production.rhs():
+#                     bestp[elt] = max(bestp[lhs]*production.prob(),
+#                                      bestp.get(elt,0))
+#
+#         self._bestp = bestp
+#         for (k,v) in self._bestp.items(): print k,v
+#
+#     def _sortkey(self, edge):
+#         return edge.structure()[PROB] * self._bestp[edge.lhs()]
+#
+#     def sort_queue(self, queue, chart):
+#         queue.sort(key=self._sortkey)
+
+import random
+class RandomChartParser(BottomUpProbabilisticChartParser):
+    """
+    A bottom-up parser for ``PCFG`` grammars that tries edges in random order.
+    This sorting order results in a random search strategy.
+    """
+    # Inherit constructor
+    def sort_queue(self, queue, chart):
+        i = random.randint(0, len(queue)-1)
+        (queue[-1], queue[i]) = (queue[i], queue[-1])
+
+class UnsortedChartParser(BottomUpProbabilisticChartParser):
+    """
+    A bottom-up parser for ``PCFG`` grammars that tries edges in whatever order.
+    """
+    # Inherit constructor
+    def sort_queue(self, queue, chart): return
+
+class LongestChartParser(BottomUpProbabilisticChartParser):
+    """
+    A bottom-up parser for ``PCFG`` grammars that tries longer edges before
+    shorter ones.  This sorting order results in a type of best-first
+    search strategy.
+    """
+    # Inherit constructor
+    def sort_queue(self, queue, chart):
+        queue.sort(key=lambda edge: edge.length())
+
+##//////////////////////////////////////////////////////
+##  Test Code
+##//////////////////////////////////////////////////////
+
+def demo(choice=None, draw_parses=None, print_parses=None):
+    """
+    A demonstration of the probabilistic parsers.  The user is
+    prompted to select which demo to run, and how many parses should
+    be found; and then each parser is run on the same demo, and a
+    summary of the results is displayed.
+    """
+    import sys, time
+    from nltk import tokenize
+    from nltk.parse import pchart
+
+    # Define two demos.  Each demo has a sentence and a grammar.
+    toy_pcfg1 = PCFG.fromstring("""
+    S -> NP VP [1.0]
+    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
+    Det -> 'the' [0.8] | 'my' [0.2]
+    N -> 'man' [0.5] | 'telescope' [0.5]
+    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
+    V -> 'ate' [0.35] | 'saw' [0.65]
+    PP -> P NP [1.0]
+    P -> 'with' [0.61] | 'under' [0.39]
+    """)
+
+    toy_pcfg2 = PCFG.fromstring("""
+    S    -> NP VP         [1.0]
+    VP   -> V NP          [.59]
+    VP   -> V             [.40]
+    VP   -> VP PP         [.01]
+    NP   -> Det N         [.41]
+    NP   -> Name          [.28]
+    NP   -> NP PP         [.31]
+    PP   -> P NP          [1.0]
+    V    -> 'saw'         [.21]
+    V    -> 'ate'         [.51]
+    V    -> 'ran'         [.28]
+    N    -> 'boy'         [.11]
+    N    -> 'cookie'      [.12]
+    N    -> 'table'       [.13]
+    N    -> 'telescope'   [.14]
+    N    -> 'hill'        [.5]
+    Name -> 'Jack'        [.52]
+    Name -> 'Bob'         [.48]
+    P    -> 'with'        [.61]
+    P    -> 'under'       [.39]
+    Det  -> 'the'         [.41]
+    Det  -> 'a'           [.31]
+    Det  -> 'my'          [.28]
+    """)
+
+    demos = [('I saw John with my telescope', toy_pcfg1),
+             ('the boy saw Jack with Bob under the table with a telescope',
+              toy_pcfg2)]
+
+    if choice is None:
+        # Ask the user which demo they want to use.
+        print()
+        for i in range(len(demos)):
+            print('%3s: %s' % (i+1, demos[i][0]))
+            print('     %r' % demos[i][1])
+            print()
+        print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
+        choice = int(sys.stdin.readline().strip())-1
+    try:
+        sent, grammar = demos[choice]
+    except:
+        print('Bad sentence number')
+        return
+
+    # Tokenize the sentence.
+    tokens = sent.split()
+
+    # Define a list of parsers.  We'll use all parsers.
+    parsers = [
+        pchart.InsideChartParser(grammar),
+        pchart.RandomChartParser(grammar),
+        pchart.UnsortedChartParser(grammar),
+        pchart.LongestChartParser(grammar),
+        pchart.InsideChartParser(grammar, beam_size = len(tokens)+1)   # was BeamParser
+        ]
+
+    # Run the parsers on the tokenized sentence.
+    times = []
+    average_p = []
+    num_parses = []
+    all_parses = {}
+    for parser in parsers:
+        print('\ns: %s\nparser: %s\ngrammar: %s' % (sent,parser,grammar))
+        parser.trace(3)
+        t = time.time()
+        parses = list(parser.parse(tokens))
+        times.append(time.time()-t)
+        p = (reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses) if parses else 0)
+        average_p.append(p)
+        num_parses.append(len(parses))
+        for p in parses: all_parses[p.freeze()] = 1
+
+    # Print some summary statistics
+    print()
+    print('       Parser      Beam | Time (secs)   # Parses   Average P(parse)')
+    print('------------------------+------------------------------------------')
+    for i in range(len(parsers)):
+        print('%18s %4d |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
+                                             parsers[i].beam_size,
+                                             times[i],num_parses[i],average_p[i]))
+    parses = all_parses.keys()
+    if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
+    else: p = 0
+    print('------------------------+------------------------------------------')
+    print('%18s      |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p))
+
+    if draw_parses is None:
+        # Ask the user if we should draw the parses.
+        print()
+        print('Draw parses (y/n)? ', end=' ')
+        draw_parses = sys.stdin.readline().strip().lower().startswith('y')
+    if draw_parses:
+        from nltk.draw.tree import draw_trees
+        print('  please wait...')
+        draw_trees(*parses)
+
+    if print_parses is None:
+        # Ask the user if we should print the parses.
+        print()
+        print('Print parses (y/n)? ', end=' ')
+        print_parses = sys.stdin.readline().strip().lower().startswith('y')
+    if print_parses:
+        for parse in parses:
+            print(parse)
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/parse/pchart.pyc b/nlp_resource_data/nltk/parse/pchart.pyc
new file mode 100755 (executable)
index 0000000..27d60e6
Binary files /dev/null and b/nlp_resource_data/nltk/parse/pchart.pyc differ
diff --git a/nlp_resource_data/nltk/parse/projectivedependencyparser.py b/nlp_resource_data/nltk/parse/projectivedependencyparser.py
new file mode 100755 (executable)
index 0000000..273851d
--- /dev/null
@@ -0,0 +1,579 @@
+# Natural Language Toolkit: Dependency Grammars
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Jason Narad <jason.narad@gmail.com>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+from __future__ import print_function, unicode_literals
+
+from collections import defaultdict
+from itertools import chain
+from functools import total_ordering
+
+from nltk.grammar import (DependencyProduction, DependencyGrammar,
+                          ProbabilisticDependencyGrammar)
+from nltk.parse.dependencygraph import DependencyGraph
+from nltk.internals import raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible
+
+#################################################################
+# Dependency Span
+#################################################################
+
+@total_ordering
+@python_2_unicode_compatible
+class DependencySpan(object):
+    """
+    A contiguous span over some part of the input string representing
+    dependency (head -> modifier) relationships amongst words.  An atomic
+    span corresponds to only one word so it isn't a 'span' in the conventional
+    sense, as its _start_index = _end_index = _head_index for concatenation
+    purposes.  All other spans are assumed to have arcs between all nodes
+    within the start and end indexes of the span, and one head index corresponding
+    to the head word for the entire span.  This is the same as the root node if
+    the dependency structure were depicted as a graph.
+    """
+    def __init__(self, start_index, end_index, head_index, arcs, tags):
+        self._start_index = start_index
+        self._end_index = end_index
+        self._head_index = head_index
+        self._arcs = arcs
+        self._tags = tags
+        self._comparison_key = (start_index, end_index, head_index, tuple(arcs))
+        self._hash = hash(self._comparison_key)
+
+    def head_index(self):
+        """
+        :return: A value indexing the head of the entire ``DependencySpan``.
+        :rtype: int
+        """
+        return self._head_index
+
+    def __repr__(self):
+        """
+        :return: A concise string representation of the ``DependencySpan``.
+        :rtype: str.
+        """
+        return 'Span %d-%d; Head Index: %d' % (self._start_index, self._end_index, self._head_index)
+
+    def __str__(self):
+        """
+        :return: A verbose string representation of the ``DependencySpan``.
+        :rtype: str
+        """
+        str = 'Span %d-%d; Head Index: %d' % (self._start_index, self._end_index, self._head_index)
+        for i in range(len(self._arcs)):
+            str += '\n%d <- %d, %s' % (i, self._arcs[i], self._tags[i])
+        return str
+
+    def __eq__(self, other):
+        return (type(self) == type(other) and
+                self._comparison_key == other._comparison_key)
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        if not isinstance(other, DependencySpan):
+            raise_unorderable_types("<", self, other)
+        return self._comparison_key < other._comparison_key
+
+    def __hash__(self):
+        """
+        :return: The hash value of this ``DependencySpan``.
+        """
+        return self._hash
+
+#################################################################
+# Chart Cell
+#################################################################
+
+@python_2_unicode_compatible
+class ChartCell(object):
+    """
+    A cell from the parse chart formed when performing the CYK algorithm.
+    Each cell keeps track of its x and y coordinates (though this will probably
+    be discarded), and a list of spans serving as the cell's entries.
+    """
+    def __init__(self, x, y):
+        """
+        :param x: This cell's x coordinate.
+        :type x: int.
+        :param y: This cell's y coordinate.
+        :type y: int.
+        """
+        self._x = x
+        self._y = y
+        self._entries = set([])
+
+    def add(self, span):
+        """
+        Appends the given span to the list of spans
+        representing the chart cell's entries.
+
+        :param span: The span to add.
+        :type span: DependencySpan
+        """
+        self._entries.add(span)
+
+    def __str__(self):
+        """
+        :return: A verbose string representation of this ``ChartCell``.
+        :rtype: str.
+        """
+        return 'CC[%d,%d]: %s' % (self._x, self._y, self._entries)
+
+    def __repr__(self):
+        """
+        :return: A concise string representation of this ``ChartCell``.
+        :rtype: str.
+        """
+        return '%s' % self
+
+
+#################################################################
+# Parsing  with Dependency Grammars
+#################################################################
+
+
+class ProjectiveDependencyParser(object):
+    """
+    A projective, rule-based, dependency parser.  A ProjectiveDependencyParser
+    is created with a DependencyGrammar, a set of productions specifying
+    word-to-word dependency relations.  The parse() method will then
+    return the set of all parses, in tree representation, for a given input
+    sequence of tokens.  Each parse must meet the requirements of both
+    the grammar and the projectivity constraint which specifies that the
+    branches of the dependency tree are not allowed to cross.  Alternatively,
+    this can be understood as stating that each parent node and its children
+    in the parse tree form a continuous substring of the input sequence.
+    """
+
+    def __init__(self, dependency_grammar):
+        """
+        Create a new ProjectiveDependencyParser, from a word-to-word
+        dependency grammar ``DependencyGrammar``.
+
+        :param dependency_grammar: A word-to-word relation dependency grammar.
+        :type dependency_grammar: DependencyGrammar
+        """
+        self._grammar = dependency_grammar
+
+    def parse(self, tokens):
+        """
+        Performs a projective dependency parse on the list of tokens using
+        a chart-based, span-concatenation algorithm similar to Eisner (1996).
+
+        :param tokens: The list of input tokens.
+        :type tokens: list(str)
+        :return: An iterator over parse trees.
+        :rtype: iter(Tree)
+        """
+        self._tokens = list(tokens)
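+        # Note on indexing: chart[i][j] collects the spans covering tokens
+        # j .. i-1 (the row index is the span's end, the column index its
+        # start), so complete analyses are read from chart[len(tokens)][0]
+        # below.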
+        chart = []
+        for i in range(0, len(self._tokens) + 1):
+            chart.append([])
+            for j in range(0, len(self._tokens) + 1):
+                chart[i].append(ChartCell(i,j))
+                if i==j+1:
+                    chart[i][j].add(DependencySpan(i-1,i,i-1,[-1], ['null']))
+
+        for i in range(1,len(self._tokens)+1):
+            for j in range(i-2,-1,-1):
+                for k in range(i-1,j,-1):
+                    for span1 in chart[k][j]._entries:
+                        for span2 in chart[i][k]._entries:
+                            for newspan in self.concatenate(span1, span2):
+                                chart[i][j].add(newspan)
+
+        for parse in chart[len(self._tokens)][0]._entries:
+            conll_format = ""
+#            malt_format = ""
+            for i in range(len(tokens)):
+#                malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null')
+                #conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'null', '-', '-')
+                # Modified to comply with the new DependencyGraph requirement that there be at least a ROOT element.
+                conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'ROOT', '-', '-')
+            dg = DependencyGraph(conll_format)
+#           if self.meets_arity(dg):
+            yield dg.tree()
+
+
+    def concatenate(self, span1, span2):
+        """
+        Concatenates the two spans in whichever way possible.  This
+        includes rightward concatenation (from the leftmost word of the
+        leftmost span to the rightmost word of the rightmost span) and
+        leftward concatenation (vice-versa) between adjacent spans.  Unlike
+        Eisner's presentation of span concatenation, these spans do not
+        share or pivot on a particular word/word-index.
+
+        :return: A list of new spans formed through concatenation.
+        :rtype: list(DependencySpan)
+        """
+        spans = []
+        if span1._start_index == span2._start_index:
+            print('Error: Mismatched spans - replace this with thrown error')
+        if span1._start_index > span2._start_index:
+            temp_span = span1
+            span1 = span2
+            span2 = temp_span
+        # adjacent rightward covered concatenation
+        new_arcs = span1._arcs + span2._arcs
+        new_tags = span1._tags + span2._tags
+        if self._grammar.contains(self._tokens[span1._head_index], self._tokens[span2._head_index]):
+#           print 'Performing rightward cover %d to %d' % (span1._head_index, span2._head_index)
+            new_arcs[span2._head_index - span1._start_index] = span1._head_index
+            spans.append(DependencySpan(span1._start_index, span2._end_index, span1._head_index, new_arcs, new_tags))
+        # adjacent leftward covered concatenation
+        new_arcs = span1._arcs + span2._arcs
+        if self._grammar.contains(self._tokens[span2._head_index], self._tokens[span1._head_index]):
+#           print 'performing leftward cover %d to %d' % (span2._head_index, span1._head_index)
+            new_arcs[span1._head_index - span1._start_index] = span2._head_index
+            spans.append(DependencySpan(span1._start_index, span2._end_index, span2._head_index, new_arcs, new_tags))
+        return spans
+
+
+#################################################################
+# Parsing  with Probabilistic Dependency Grammars
+#################################################################
+
+
+class ProbabilisticProjectiveDependencyParser(object):
+    """A probabilistic, projective dependency parser.
+
+    This parser returns the most probable projective parse derived from the
+    probabilistic dependency grammar derived from the train() method.  The
+    probabilistic model is an implementation of Eisner's (1996) Model C, which
+    conditions on head-word, head-tag, child-word, and child-tag.  The decoding
+    uses a bottom-up chart-based span concatenation algorithm that's identical
+    to the one utilized by the rule-based projective parser.
+
+    Usage example
+    -------------
+    >>> from nltk.parse.dependencygraph import conll_data2
+
+    >>> graphs = [
+    ... DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry
+    ... ]
+
+    >>> ppdp = ProbabilisticProjectiveDependencyParser()
+    >>> ppdp.train(graphs)
+
+    >>> sent = ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.']
+    >>> list(ppdp.parse(sent))
+    [Tree('zag', ['Cathy', 'hen', Tree('zwaaien', ['wild', '.'])])]
+
+    """
+
+    def __init__(self):
+        """
+        Create a new probabilistic dependency parser.  No additional
+        operations are necessary.
+        """
+
+    def parse(self, tokens):
+        """
+        Parses the list of tokens subject to the projectivity constraint
+        and the productions in the parser's grammar.  This uses a method
+        similar to the span-concatenation algorithm defined in Eisner (1996).
+        It returns the most probable parse derived from the parser's
+        probabilistic dependency grammar.
+        """
+        self._tokens = list(tokens)
+        chart = []
+        for i in range(0, len(self._tokens) + 1):
+            chart.append([])
+            for j in range(0, len(self._tokens) + 1):
+                chart[i].append(ChartCell(i,j))
+                if i==j+1:
+                    if tokens[i-1] in self._grammar._tags:
+                        for tag in self._grammar._tags[tokens[i-1]]:
+                            chart[i][j].add(DependencySpan(i-1,i,i-1,[-1], [tag]))
+                    else:
+                        print('No tag found for input token \'%s\', parse is impossible.' % tokens[i-1])
+                        return []
+        for i in range(1,len(self._tokens)+1):
+            for j in range(i-2,-1,-1):
+                for k in range(i-1,j,-1):
+                    for span1 in chart[k][j]._entries:
+                        for span2 in chart[i][k]._entries:
+                            for newspan in self.concatenate(span1, span2):
+                                chart[i][j].add(newspan)
+        trees = []
+        max_parse = None
+        max_score = 0
+        for parse in chart[len(self._tokens)][0]._entries:
+            conll_format = ""
+            malt_format = ""
+            for i in range(len(tokens)):
+                malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null')
+                #conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'null', '-', '-')
+                # Modified to comply with the recent change in DependencyGraph requiring a ROOT element.
+                conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'ROOT', '-', '-')
+            dg = DependencyGraph(conll_format)
+            score = self.compute_prob(dg)            
+            trees.append((score, dg.tree()))
+        trees.sort()
+        return (tree for (score, tree) in trees)
+
+
+    def concatenate(self, span1, span2):
+        """
+        Concatenates the two spans in whichever way possible.  This
+        includes rightward concatenation (from the leftmost word of the
+        leftmost span to the rightmost word of the rightmost span) and
+        leftward concatenation (vice-versa) between adjacent spans.  Unlike
+        Eisner's presentation of span concatenation, these spans do not
+        share or pivot on a particular word/word-index.
+
+        :return: A list of new spans formed through concatenation.
+        :rtype: list(DependencySpan)
+        """
+        spans = []
+        if span1._start_index == span2._start_index:
+            print('Error: Mismatched spans - replace this with thrown error')
+        if span1._start_index > span2._start_index:
+            temp_span = span1
+            span1 = span2
+            span2 = temp_span
+        # adjacent rightward covered concatenation
+        new_arcs = span1._arcs + span2._arcs
+        new_tags = span1._tags + span2._tags
+        if self._grammar.contains(self._tokens[span1._head_index], self._tokens[span2._head_index]):
+            new_arcs[span2._head_index - span1._start_index] = span1._head_index
+            spans.append(DependencySpan(span1._start_index, span2._end_index, span1._head_index, new_arcs, new_tags))
+        # adjacent leftward covered concatenation
+        new_arcs = span1._arcs + span2._arcs
+        new_tags = span1._tags + span2._tags
+        if self._grammar.contains(self._tokens[span2._head_index], self._tokens[span1._head_index]):
+            new_arcs[span1._head_index - span1._start_index] = span2._head_index
+            spans.append(DependencySpan(span1._start_index, span2._end_index, span2._head_index, new_arcs, new_tags))
+        return spans
+
+    def train(self, graphs):
+        """
+        Trains a ProbabilisticDependencyGrammar based on the list of input
+        DependencyGraphs.  This model is an implementation of Eisner's (1996)
+        Model C, which derives its statistics from head-word, head-tag,
+        child-word, and child-tag relationships.
+
+        :param graphs: A list of dependency graphs to train from.
+        :type graphs: list(DependencyGraph)
+        """
+        productions = []
+        events = defaultdict(int)
+        tags = {}
+        for dg in graphs:
+            for node_index in range(1, len(dg.nodes)):
+                #children = dg.nodes[node_index]['deps']
+                children = list(chain(*dg.nodes[node_index]['deps'].values()))
+                
+                nr_left_children = dg.left_children(node_index)
+                nr_right_children = dg.right_children(node_index)
+                nr_children = nr_left_children + nr_right_children
+                for child_index in range(0 - (nr_left_children + 1), nr_right_children + 2):
+                    head_word = dg.nodes[node_index]['word']
+                    head_tag = dg.nodes[node_index]['tag']
+                    if head_word in tags:
+                        tags[head_word].add(head_tag)
+                    else:
+                        tags[head_word] = set([head_tag])
+                    child = 'STOP'
+                    child_tag = 'STOP'
+                    prev_word = 'START'
+                    prev_tag = 'START'
+                    if child_index < 0:
+                        array_index = child_index + nr_left_children
+                        if array_index >= 0:
+                            child = dg.nodes[children[array_index]]['word']
+                            child_tag = dg.nodes[children[array_index]]['tag']
+                        if child_index != -1:
+                            prev_word = dg.nodes[children[array_index + 1]]['word']
+                            prev_tag = dg.nodes[children[array_index + 1]]['tag']
+                        if child != 'STOP':
+                            productions.append(DependencyProduction(head_word, [child]))
+                        head_event = '(head (%s %s) (mods (%s, %s, %s) left))' % (child, child_tag, prev_tag, head_word, head_tag)
+                        mod_event = '(mods (%s, %s, %s) left))' % (prev_tag, head_word, head_tag)
+                        events[head_event] += 1
+                        events[mod_event] += 1
+                    elif child_index > 0:
+                        array_index = child_index + nr_left_children - 1
+                        if array_index < nr_children:
+                            child = dg.nodes[children[array_index]]['word']
+                            child_tag = dg.nodes[children[array_index]]['tag']
+                        if child_index != 1:
+                            prev_word = dg.nodes[children[array_index - 1]]['word']
+                            prev_tag =  dg.nodes[children[array_index - 1]]['tag']
+                        if child != 'STOP':
+                            productions.append(DependencyProduction(head_word, [child]))
+                        head_event = '(head (%s %s) (mods (%s, %s, %s) right))' % (child, child_tag, prev_tag, head_word, head_tag)
+                        mod_event = '(mods (%s, %s, %s) right))' % (prev_tag, head_word, head_tag)
+                        events[head_event] += 1
+                        events[mod_event] += 1
+        self._grammar = ProbabilisticDependencyGrammar(productions, events, tags)
+
+    def compute_prob(self, dg):
+        """
+        Computes the probability of a dependency graph based
+        on the parser's probability model (defined by the parser's
+        statistical dependency grammar).
+
+        :param dg: A dependency graph to score.
+        :type dg: DependencyGraph
+        :return: The probability of the dependency graph.
+        :rtype: float
+        """
+        prob = 1.0
+        for node_index in range(1, len(dg.nodes)):
+            #children = dg.nodes[node_index]['deps']
+            children = list(chain(*dg.nodes[node_index]['deps'].values()))
+            
+            nr_left_children = dg.left_children(node_index)
+            nr_right_children = dg.right_children(node_index)
+            nr_children = nr_left_children + nr_right_children
+            for child_index in range(0 - (nr_left_children + 1), nr_right_children + 2):
+                head_word = dg.nodes[node_index]['word']
+                head_tag = dg.nodes[node_index]['tag']
+                child = 'STOP'
+                child_tag = 'STOP'
+                prev_word = 'START'
+                prev_tag = 'START'
+                if child_index < 0:
+                    array_index = child_index + nr_left_children
+                    if array_index >= 0:
+                        child = dg.nodes[children[array_index]]['word']
+                        child_tag = dg.nodes[children[array_index]]['tag']
+                    if child_index != -1:
+                        prev_word = dg.nodes[children[array_index + 1]]['word']
+                        prev_tag = dg.nodes[children[array_index + 1]]['tag']
+                    head_event = '(head (%s %s) (mods (%s, %s, %s) left))' % (child, child_tag, prev_tag, head_word, head_tag)
+                    mod_event = '(mods (%s, %s, %s) left))' % (prev_tag, head_word, head_tag)
+                    h_count = self._grammar._events[head_event]
+                    m_count = self._grammar._events[mod_event]
+                    
+                    # If the grammar does not cover this event
+                    if m_count != 0:
+                        prob *= (h_count / m_count)
+                    else:
+                        prob = 0.00000001  # Very small number  
+                    
+                elif child_index > 0:
+                    array_index = child_index + nr_left_children - 1
+                    if array_index < nr_children:
+                        child = dg.nodes[children[array_index]]['word']
+                        child_tag = dg.nodes[children[array_index]]['tag']
+                    if child_index != 1:
+                        prev_word = dg.nodes[children[array_index - 1]]['word']
+                        prev_tag = dg.nodes[children[array_index - 1]]['tag']
+                    head_event = '(head (%s %s) (mods (%s, %s, %s) right))' % (child, child_tag, prev_tag, head_word, head_tag)
+                    mod_event = '(mods (%s, %s, %s) right))' % (prev_tag, head_word, head_tag)
+                    h_count = self._grammar._events[head_event]
+                    m_count = self._grammar._events[mod_event]
+
+                    if m_count != 0:
+                        prob *= (h_count / m_count)
+                    else:
+                        prob = 0.00000001  # Very small number  
+
+        return prob
+
+
+#################################################################
+# Demos
+#################################################################
+
+def demo():
+    projective_rule_parse_demo()
+#    arity_parse_demo()
+    projective_prob_parse_demo()
+
+
+def projective_rule_parse_demo():
+    """
+    A demonstration showing the creation and use of a
+    ``DependencyGrammar`` to perform a projective dependency
+    parse.
+    """
+    grammar = DependencyGrammar.fromstring("""
+    'scratch' -> 'cats' | 'walls'
+    'walls' -> 'the'
+    'cats' -> 'the'
+    """)
+    print(grammar)
+    pdp = ProjectiveDependencyParser(grammar)
+    trees = pdp.parse(['the', 'cats', 'scratch', 'the', 'walls'])
+    for tree in trees:
+        print(tree)
+
+def arity_parse_demo():
+    """
+    A demonstration showing the creation of a ``DependencyGrammar``
+    in which a specific number of modifiers is listed for a given
+    head.  This can further constrain the number of possible parses
+    created by a ``ProjectiveDependencyParser``.
+    """
+    print()
+    print('A grammar with no arity constraints. Each DependencyProduction')
+    print('specifies a relationship between one head word and only one')
+    print('modifier word.')
+    grammar = DependencyGrammar.fromstring("""
+    'fell' -> 'price' | 'stock'
+    'price' -> 'of' | 'the'
+    'of' -> 'stock'
+    'stock' -> 'the'
+    """)
+    print(grammar)
+
+    print()
+    print('For the sentence \'The price of the stock fell\', this grammar')
+    print('will produce the following three parses:')
+    pdp = ProjectiveDependencyParser(grammar)
+    trees = pdp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])
+    for tree in trees:
+        print(tree)
+
+    print()
+    print('By contrast, the following grammar contains a ')
+    print('DependencyProduction that specifies a relationship')
+    print('between a single head word, \'price\', and two modifier')
+    print('words, \'of\' and \'the\'.')
+    grammar = DependencyGrammar.fromstring("""
+    'fell' -> 'price' | 'stock'
+    'price' -> 'of' 'the'
+    'of' -> 'stock'
+    'stock' -> 'the'
+    """)
+    print(grammar)
+
+    print()
+    print('This constrains the number of possible parses to just one:') # unimplemented, soon to replace
+    pdp = ProjectiveDependencyParser(grammar)
+    trees = pdp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])
+    for tree in trees:
+        print(tree)
+
+
+def projective_prob_parse_demo():
+    """
+    A demo showing the training and use of a projective
+    dependency parser.
+    """
+    from nltk.parse.dependencygraph import conll_data2
+
+    graphs = [DependencyGraph(entry)
+              for entry in conll_data2.split('\n\n') if entry]
+    ppdp = ProbabilisticProjectiveDependencyParser()
+    print('Training Probabilistic Projective Dependency Parser...')
+    ppdp.train(graphs)
+    
+    sent = ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.']
+    print('Parsing \'', " ".join(sent), '\'...')
+    print('Parse:')
+    for tree in ppdp.parse(sent):
+        print(tree)
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/parse/projectivedependencyparser.pyc b/nlp_resource_data/nltk/parse/projectivedependencyparser.pyc
new file mode 100755 (executable)
index 0000000..5a83c10
Binary files /dev/null and b/nlp_resource_data/nltk/parse/projectivedependencyparser.pyc differ
diff --git a/nlp_resource_data/nltk/parse/recursivedescent.py b/nlp_resource_data/nltk/parse/recursivedescent.py
new file mode 100755 (executable)
index 0000000..a84a12f
--- /dev/null
@@ -0,0 +1,655 @@
+# Natural Language Toolkit: Recursive Descent Parser
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+
+from nltk.grammar import Nonterminal
+from nltk.tree import Tree, ImmutableTree
+from nltk.compat import unicode_repr
+
+from nltk.parse.api import ParserI
+
+##//////////////////////////////////////////////////////
+##  Recursive Descent Parser
+##//////////////////////////////////////////////////////
+class RecursiveDescentParser(ParserI):
+    """
+    A simple top-down CFG parser that parses texts by recursively
+    expanding the fringe of a Tree, and matching it against a
+    text.
+
+    ``RecursiveDescentParser`` uses a list of tree locations called a
+    "frontier" to remember which subtrees have not yet been expanded
+    and which leaves have not yet been matched against the text.  Each
+    tree location consists of a list of child indices specifying the
+    path from the root of the tree to a subtree or a leaf; see the
+    reference documentation for Tree for more information
+    about tree locations.
+
+    When the parser begins parsing a text, it constructs a tree
+    containing only the start symbol, and a frontier containing the
+    location of the tree's root node.  It then extends the tree to
+    cover the text, using the following recursive procedure:
+
+      - If the frontier is empty, and the text is covered by the tree,
+        then return the tree as a possible parse.
+      - If the frontier is empty, and the text is not covered by the
+        tree, then return no parses.
+      - If the first element of the frontier is a subtree, then
+        use CFG productions to "expand" it.  For each applicable
+        production, add the expanded subtree's children to the
+        frontier, and recursively find all parses that can be
+        generated by the new tree and frontier.
+      - If the first element of the frontier is a token, then "match"
+        it against the next token from the text.  Remove the token
+        from the frontier, and recursively find all parses that can be
+        generated by the new tree and frontier.
+
+    :see: ``nltk.grammar``
+    """
+    def __init__(self, grammar, trace=0):
+        """
+        Create a new ``RecursiveDescentParser``, that uses ``grammar``
+        to parse texts.
+
+        :type grammar: CFG
+        :param grammar: The grammar used to parse texts.
+        :type trace: int
+        :param trace: The level of tracing that should be used when
+            parsing a text.  ``0`` will generate no tracing output;
+            and higher numbers will produce more verbose tracing
+            output.
+        """
+        self._grammar = grammar
+        self._trace = trace
+
+    def grammar(self):
+        return self._grammar
+
+    def parse(self, tokens):
+        # Inherit docs from ParserI
+
+        tokens = list(tokens)
+        self._grammar.check_coverage(tokens)
+
+        # Start a recursive descent parse, with an initial tree
+        # containing just the start symbol.
+        start = self._grammar.start().symbol()
+        initial_tree = Tree(start, [])
+        frontier = [()]
+        if self._trace:
+            self._trace_start(initial_tree, frontier, tokens)
+        return self._parse(tokens, initial_tree, frontier)
+
+    def _parse(self, remaining_text, tree, frontier):
+        """
+        Recursively expand and match each element of ``tree``
+        specified by ``frontier``, to cover ``remaining_text``.  Return
+        a list of all parses found.
+
+        :return: An iterator of all parses that can be generated by
+            matching and expanding the elements of ``tree``
+            specified by ``frontier``.
+        :rtype: iter(Tree)
+        :type tree: Tree
+        :param tree: A partial structure for the text that is
+            currently being parsed.  The elements of ``tree``
+            that are specified by ``frontier`` have not yet been
+            expanded or matched.
+        :type remaining_text: list(str)
+        :param remaining_text: The portion of the text that is not yet
+            covered by ``tree``.
+        :type frontier: list(tuple(int))
+        :param frontier: A list of the locations within ``tree`` of
+            all subtrees that have not yet been expanded, and all
+            leaves that have not yet been matched.  This list is sorted
+            in left-to-right order of location within the tree.
+        """
+
+        # If the tree covers the text, and there's nothing left to
+        # expand, then we've found a complete parse; return it.
+        if len(remaining_text) == 0 and len(frontier) == 0:
+            if self._trace:
+                self._trace_succeed(tree, frontier)
+            yield tree
+
+        # If there's still text, but nothing left to expand, we failed.
+        elif len(frontier) == 0:
+            if self._trace:
+                self._trace_backtrack(tree, frontier)
+
+        # If the next element on the frontier is a tree, expand it.
+        elif isinstance(tree[frontier[0]], Tree):
+            for result in self._expand(remaining_text, tree, frontier):
+                yield result
+
+        # If the next element on the frontier is a token, match it.
+        else:
+            for result in self._match(remaining_text, tree, frontier):
+                yield result
+
+    def _match(self, rtext, tree, frontier):
+        """
+        :rtype: iter(Tree)
+        :return: an iterator of all parses that can be generated by
+            matching the first element of ``frontier`` against the
+            first token in ``rtext``.  In particular, if the first
+            element of ``frontier`` has the same type as the first
+            token in ``rtext``, then substitute the token into
+            ``tree``; and return all parses that can be generated by
+            matching and expanding the remaining elements of
+            ``frontier``.  If the first element of ``frontier`` does not
+            have the same type as the first token in ``rtext``, then
+            return an empty list.
+
+        :type tree: Tree
+        :param tree: A partial structure for the text that is
+            currently being parsed.  The elements of ``tree``
+            that are specified by ``frontier`` have not yet been
+            expanded or matched.
+        :type rtext: list(str)
+        :param rtext: The portion of the text that is not yet
+            covered by ``tree``.
+        :type frontier: list of tuple of int
+        :param frontier: A list of the locations within ``tree`` of
+            all subtrees that have not yet been expanded, and all
+            leaves that have not yet been matched.
+        """
+
+        tree_leaf = tree[frontier[0]]
+        if (len(rtext) > 0 and tree_leaf == rtext[0]):
+            # If it's a terminal that matches rtext[0], then substitute
+            # in the token, and continue parsing.
+            newtree = tree.copy(deep=True)
+            newtree[frontier[0]] = rtext[0]
+            if self._trace:
+                self._trace_match(newtree, frontier[1:], rtext[0])
+            for result in self._parse(rtext[1:], newtree, frontier[1:]):
+                yield result
+        else:
+            # If it's a non-matching terminal, fail.
+            if self._trace:
+                self._trace_backtrack(tree, frontier, rtext[:1])
+
+    def _expand(self, remaining_text, tree, frontier, production=None):
+        """
+        :rtype: iter(Tree)
+        :return: An iterator of all parses that can be generated by
+            expanding the first element of ``frontier`` with
+            ``production``.  In particular, if the first element of
+            ``frontier`` is a subtree whose node type is equal to
+            ``production``'s left hand side, then add a child to that
+            subtree for each element of ``production``'s right hand
+            side; and return all parses that can be generated by
+            matching and expanding the remaining elements of
+            ``frontier``.  If the first element of ``frontier`` is not a
+            subtree whose node type is equal to ``production``'s left
+            hand side, then return an empty list.  If ``production`` is
+            not specified, then return a list of all parses that can
+            be generated by expanding the first element of ``frontier``
+            with *any* CFG production.
+
+        :type tree: Tree
+        :param tree: A partial structure for the text that is
+            currently being parsed.  The elements of ``tree``
+            that are specified by ``frontier`` have not yet been
+            expanded or matched.
+        :type remaining_text: list(str)
+        :param remaining_text: The portion of the text that is not yet
+            covered by ``tree``.
+        :type frontier: list(tuple(int))
+        :param frontier: A list of the locations within ``tree`` of
+            all subtrees that have not yet been expanded, and all
+            leaves that have not yet been matched.
+        """
+
+        if production is None: productions = self._grammar.productions()
+        else: productions = [production]
+
+        for production in productions:
+            lhs = production.lhs().symbol()
+            if lhs == tree[frontier[0]].label():
+                subtree = self._production_to_tree(production)
+                if frontier[0] == ():
+                    newtree = subtree
+                else:
+                    newtree = tree.copy(deep=True)
+                    newtree[frontier[0]] = subtree
+                new_frontier = [frontier[0]+(i,) for i in
+                                range(len(production.rhs()))]
+                if self._trace:
+                    self._trace_expand(newtree, new_frontier, production)
+                for result in self._parse(remaining_text, newtree,
+                                          new_frontier + frontier[1:]):
+                    yield result
+
+    def _production_to_tree(self, production):
+        """
+        :rtype: Tree
+        :return: The Tree that is licensed by ``production``.
+            In particular, given the production ``[lhs -> elt[1] ... elt[n]]``
+            return a tree that has a node ``lhs.symbol``, and
+            ``n`` children.  For each nonterminal element
+            ``elt[i]`` in the production, the tree token has a
+            childless subtree with node value ``elt[i].symbol``; and
+            for each terminal element ``elt[j]``, the tree token has
+            a leaf token with type ``elt[j]``.
+
+        :param production: The CFG production that licenses the tree
+            token that should be returned.
+        :type production: Production
+        """
+        children = []
+        for elt in production.rhs():
+            if isinstance(elt, Nonterminal):
+                children.append(Tree(elt.symbol(), []))
+            else:
+                # This will be matched.
+                children.append(elt)
+        return Tree(production.lhs().symbol(), children)
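+
+    # For example, given the production NP -> Det N this builds
+    # Tree('NP', [Tree('Det', []), Tree('N', [])]), and given the lexical
+    # production NP -> 'I' it builds Tree('NP', ['I']).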
+
+    def trace(self, trace=2):
+        """
+        Set the level of tracing output that should be generated when
+        parsing a text.
+
+        :type trace: int
+        :param trace: The trace level.  A trace level of ``0`` will
+            generate no tracing output; and higher trace levels will
+            produce more verbose tracing output.
+        :rtype: None
+        """
+        self._trace = trace
+
+    def _trace_fringe(self, tree, treeloc=None):
+        """
+        Print trace output displaying the fringe of ``tree``.  The
+        fringe of ``tree`` consists of all of its leaves and all of
+        its childless subtrees.
+
+        :rtype: None
+        """
+
+        if treeloc == (): print("*", end=' ')
+        if isinstance(tree, Tree):
+            if len(tree) == 0:
+                print(unicode_repr(Nonterminal(tree.label())), end=' ')
+            for i in range(len(tree)):
+                if treeloc is not None and i == treeloc[0]:
+                    self._trace_fringe(tree[i], treeloc[1:])
+                else:
+                    self._trace_fringe(tree[i])
+        else:
+            print(unicode_repr(tree), end=' ')
+
+    def _trace_tree(self, tree, frontier, operation):
+        """
+        Print trace output displaying the parser's current state.
+
+        :param operation: A character identifying the operation that
+            generated the current state.
+        :rtype: None
+        """
+        if self._trace == 2: print('  %c [' % operation, end=' ')
+        else: print('    [', end=' ')
+        if len(frontier) > 0: self._trace_fringe(tree, frontier[0])
+        else: self._trace_fringe(tree)
+        print(']')
+
+    def _trace_start(self, tree, frontier, text):
+        print('Parsing %r' % " ".join(text))
+        if self._trace > 2: print('Start:')
+        if self._trace > 1: self._trace_tree(tree, frontier, ' ')
+
+    def _trace_expand(self, tree, frontier, production):
+        if self._trace > 2: print('Expand: %s' % production)
+        if self._trace > 1: self._trace_tree(tree, frontier, 'E')
+
+    def _trace_match(self, tree, frontier, tok):
+        if self._trace > 2: print('Match: %r' % tok)
+        if self._trace > 1: self._trace_tree(tree, frontier, 'M')
+
+    def _trace_succeed(self, tree, frontier):
+        if self._trace > 2: print('GOOD PARSE:')
+        if self._trace == 1: print('Found a parse:\n%s' % tree)
+        if self._trace > 1: self._trace_tree(tree, frontier, '+')
+
+    def _trace_backtrack(self, tree, frontier, toks=None):
+        if self._trace > 2:
+            if toks: print('Backtrack: %r match failed' % toks[0])
+            else: print('Backtrack')
+
+##//////////////////////////////////////////////////////
+##  Stepping Recursive Descent Parser
+##//////////////////////////////////////////////////////
+class SteppingRecursiveDescentParser(RecursiveDescentParser):
+    """
+    A ``RecursiveDescentParser`` that allows you to step through the
+    parsing process, performing a single operation at a time.
+
+    The ``initialize`` method is used to start parsing a text.
+    ``expand`` expands the first element on the frontier using a single
+    CFG production, and ``match`` matches the first element on the
+    frontier against the next text token. ``backtrack`` undoes the most
+    recent expand or match operation.  ``step`` performs a single
+    expand, match, or backtrack operation.  ``parses`` returns the set
+    of parses that have been found by the parser.
+
+    :ivar _history: A list of ``(rtext, tree, frontier)`` triples,
+        containing the previous states of the parser.  This history is
+        used to implement the ``backtrack`` operation.
+    :ivar _tried_e: A record of all productions that have been tried
+        for a given tree.  This record is used by ``expand`` to perform
+        the next untried production.
+    :ivar _tried_m: A record of what tokens have been matched for a
+        given tree.  This record is used by ``step`` to decide whether
+        or not to match a token.
+    :see: ``nltk.grammar``
+    """
+    def __init__(self, grammar, trace=0):
+        super(SteppingRecursiveDescentParser, self).__init__(grammar, trace)
+        self._rtext = None
+        self._tree = None
+        self._frontier = [()]
+        self._tried_e = {}
+        self._tried_m = {}
+        self._history = []
+        self._parses = []
+
+    # [XX] TEMPORARY HACK WARNING!  This should be replaced with
+    # something nicer when we get the chance.
+    def _freeze(self, tree):
+        c = tree.copy()
+#        for pos in c.treepositions('leaves'):
+#            c[pos] = c[pos].freeze()
+        return ImmutableTree.convert(c)
+
+    def parse(self, tokens):
+        tokens = list(tokens)
+        self.initialize(tokens)
+        while self.step() is not None:
+            pass
+        return self.parses()
+
+    def initialize(self, tokens):
+        """
+        Start parsing a given text.  This sets the parser's tree to
+        the start symbol, its frontier to the root node, and its
+        remaining text to ``tokens``.
+        """
+
+        self._rtext = tokens
+        start = self._grammar.start().symbol()
+        self._tree = Tree(start, [])
+        self._frontier = [()]
+        self._tried_e = {}
+        self._tried_m = {}
+        self._history = []
+        self._parses = []
+        if self._trace:
+            self._trace_start(self._tree, self._frontier, self._rtext)
+
+    def remaining_text(self):
+        """
+        :return: The portion of the text that is not yet covered by the
+            tree.
+        :rtype: list(str)
+        """
+        return self._rtext
+
+    def frontier(self):
+        """
+        :return: A list of the tree locations of all subtrees that
+            have not yet been expanded, and all leaves that have not
+            yet been matched.
+        :rtype: list(tuple(int))
+        """
+        return self._frontier
+
+    def tree(self):
+        """
+        :return: A partial structure for the text that is
+            currently being parsed.  The elements specified by the
+            frontier have not yet been expanded or matched.
+        :rtype: Tree
+        """
+        return self._tree
+
+    def step(self):
+        """
+        Perform a single parsing operation.  If an untried match is
+        possible, then perform the match, and return the matched
+        token.  If an untried expansion is possible, then perform the
+        expansion, and return the production that it is based on.  If
+        backtracking is possible, then backtrack, and return True.
+        Otherwise, return None.
+
+        :return: None if no operation was performed; a token if a match
+            was performed; a production if an expansion was performed;
+            and True if a backtrack operation was performed.
+        :rtype: Production or str or bool or None
+        """
+        # Try matching (if we haven't already)
+        if self.untried_match():
+            token = self.match()
+            if token is not None: return token
+
+        # Try expanding.
+        production = self.expand()
+        if production is not None: return production
+
+        # Try backtracking
+        if self.backtrack():
+            self._trace_backtrack(self._tree, self._frontier)
+            return True
+
+        # Nothing left to do.
+        return None
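+
+    # A typical driver loop (a sketch; the ``parse`` method above does exactly
+    # this):
+    #
+    #     parser.initialize(tokens)
+    #     while parser.step() is not None:
+    #         pass
+    #     for tree in parser.parses():
+    #         print(tree)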
+
+    def expand(self, production=None):
+        """
+        Expand the first element of the frontier.  In particular, if
+        the first element of the frontier is a subtree whose node type
+        is equal to ``production``'s left hand side, then add a child
+        to that subtree for each element of ``production``'s right hand
+        side.  If ``production`` is not specified, then use the first
+        untried expandable production.  If all expandable productions
+        have been tried, do nothing.
+
+        :return: The production used to expand the frontier, if an
+           expansion was performed.  If no expansion was performed,
+           return None.
+        :rtype: Production or None
+        """
+
+        # Make sure we *can* expand.
+        if len(self._frontier) == 0:
+            return None
+        if not isinstance(self._tree[self._frontier[0]], Tree):
+            return None
+
+        # If they didn't specify a production, check all untried ones.
+        if production is None:
+            productions = self.untried_expandable_productions()
+        else: productions = [production]
+
+        parses = []
+        for prod in productions:
+            # Record that we've tried this production now.
+            self._tried_e.setdefault(self._freeze(self._tree), []).append(prod)
+
+            # Try expanding.
+            for _result in self._expand(self._rtext, self._tree, self._frontier, prod):
+                return prod
+
+        # We didn't expand anything.
+        return None
+
+    def match(self):
+        """
+        Match the first element of the frontier.  In particular, if
+        the first element of the frontier has the same type as the
+        next text token, then substitute the text token into the tree.
+
+        :return: The token matched, if a match operation was
+            performed.  If no match was performed, return None
+        :rtype: str or None
+        """
+
+        # Record that we've tried matching this token.
+        tok = self._rtext[0]
+        self._tried_m.setdefault(self._freeze(self._tree), []).append(tok)
+
+        # Make sure we *can* match.
+        if len(self._frontier) == 0:
+            return None
+        if isinstance(self._tree[self._frontier[0]], Tree):
+            return None
+
+        for _result in self._match(self._rtext, self._tree, self._frontier):
+            # Return the token we just matched.
+            return self._history[-1][0][0]
+        return None
+
+    def backtrack(self):
+        """
+        Return the parser to its state before the most recent
+        match or expand operation.  Calling ``backtrack`` repeatedly returns
+        the parser to successively earlier states.  If no match or
+        expand operations have been performed, ``backtrack`` will make no
+        changes.
+
+        :return: True if an operation was successfully undone.
+        :rtype: bool
+        """
+        if len(self._history) == 0: return False
+        (self._rtext, self._tree, self._frontier) = self._history.pop()
+        return True
+
+    def expandable_productions(self):
+        """
+        :return: A list of all the productions for which expansions
+            are available for the current parser state.
+        :rtype: list(Production)
+        """
+        # Make sure we *can* expand.
+        if len(self._frontier) == 0: return []
+        frontier_child = self._tree[self._frontier[0]]
+        if (len(self._frontier) == 0 or
+            not isinstance(frontier_child, Tree)):
+            return []
+
+        return [p for p in self._grammar.productions()
+                if p.lhs().symbol() == frontier_child.label()]
+
+    def untried_expandable_productions(self):
+        """
+        :return: A list of all the untried productions for which
+            expansions are available for the current parser state.
+        :rtype: list(Production)
+        """
+
+        tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
+        return [p for p in self.expandable_productions()
+                if p not in tried_expansions]
+
+    def untried_match(self):
+        """
+        :return: Whether the first element of the frontier is a token
+            that has not yet been matched.
+        :rtype: bool
+        """
+
+        if len(self._rtext) == 0: return False
+        tried_matches = self._tried_m.get(self._freeze(self._tree), [])
+        return (self._rtext[0] not in tried_matches)
+
+    def currently_complete(self):
+        """
+        :return: Whether the parser's current state represents a
+            complete parse.
+        :rtype: bool
+        """
+        return (len(self._frontier) == 0 and len(self._rtext) == 0)
+
+    def _parse(self, remaining_text, tree, frontier):
+        """
+        A stub version of ``_parse`` that sets the parsers current
+        state to the given arguments.  In ``RecursiveDescentParser``,
+        the ``_parse`` method is used to recursively continue parsing a
+        text.  ``SteppingRecursiveDescentParser`` overrides it to
+        capture these recursive calls.  It records the parser's old
+        state in the history (to allow for backtracking), and updates
+        the parser's new state using the given arguments.  Finally, it
+        returns ``[1]``, which is used by ``match`` and ``expand`` to
+        detect whether their operations were successful.
+
+        :return: ``[1]``
+        :rtype: list of int
+        """
+        self._history.append( (self._rtext, self._tree, self._frontier) )
+        self._rtext = remaining_text
+        self._tree = tree
+        self._frontier = frontier
+
+        # Is it a good parse?  If so, record it.
+        if (len(frontier) == 0 and len(remaining_text) == 0):
+            self._parses.append(tree)
+            self._trace_succeed(self._tree, self._frontier)
+
+        return [1]
+
+    def parses(self):
+        """
+        :return: An iterator of the parses that have been found by this
+            parser so far.
+        :rtype: iter(Tree)
+        """
+        return iter(self._parses)
+
+    def set_grammar(self, grammar):
+        """
+        Change the grammar used to parse texts.
+
+        :param grammar: The new grammar.
+        :type grammar: CFG
+        """
+        self._grammar = grammar
+
+##//////////////////////////////////////////////////////
+##  Demonstration Code
+##//////////////////////////////////////////////////////
+
+def demo():
+    """
+    A demonstration of the recursive descent parser.
+    """
+
+    from nltk import parse, CFG
+
+    grammar = CFG.fromstring("""
+    S -> NP VP
+    NP -> Det N | Det N PP
+    VP -> V NP | V NP PP
+    PP -> P NP
+    NP -> 'I'
+    N -> 'man' | 'park' | 'telescope' | 'dog'
+    Det -> 'the' | 'a'
+    P -> 'in' | 'with'
+    V -> 'saw'
+    """)
+
+    for prod in grammar.productions():
+        print(prod)
+
+    sent = 'I saw a man in the park'.split()
+    parser = parse.RecursiveDescentParser(grammar, trace=2)
+    for p in parser.parse(sent):
+        print(p)
+
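+# An illustrative sketch (this helper is not part of the original NLTK
+# module): it drives ``SteppingRecursiveDescentParser`` one operation at a
+# time instead of calling ``parse`` directly, using a small toy grammar.
+def stepping_demo():
+    from nltk import CFG
+
+    grammar = CFG.fromstring("""
+    S -> NP VP
+    NP -> Det N
+    VP -> V NP
+    Det -> 'the' | 'a'
+    N -> 'man' | 'dog'
+    V -> 'saw'
+    """)
+    parser = SteppingRecursiveDescentParser(grammar, trace=0)
+    parser.initialize('the dog saw a man'.split())
+    # Each call to step() performs a single match, expand, or backtrack
+    # operation and returns None once nothing is left to try.
+    while parser.step() is not None:
+        pass
+    for tree in parser.parses():
+        print(tree)
+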
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/parse/recursivedescent.pyc b/nlp_resource_data/nltk/parse/recursivedescent.pyc
new file mode 100755 (executable)
index 0000000..714523d
Binary files /dev/null and b/nlp_resource_data/nltk/parse/recursivedescent.pyc differ
diff --git a/nlp_resource_data/nltk/parse/shiftreduce.py b/nlp_resource_data/nltk/parse/shiftreduce.py
new file mode 100755 (executable)
index 0000000..7fc8289
--- /dev/null
@@ -0,0 +1,458 @@
+# Natural Language Toolkit: Shift-Reduce Parser
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+
+from nltk.grammar import Nonterminal
+from nltk.tree import Tree
+from nltk.compat import unicode_repr
+
+from nltk.parse.api import ParserI
+
+##//////////////////////////////////////////////////////
+##  Shift/Reduce Parser
+##//////////////////////////////////////////////////////
+class ShiftReduceParser(ParserI):
+    """
+    A simple bottom-up CFG parser that uses two operations, "shift"
+    and "reduce", to find a single parse for a text.
+
+    ``ShiftReduceParser`` maintains a stack, which records the
+    structure of a portion of the text.  This stack is a list of
+    strings and Trees that collectively cover a portion of
+    the text.  For example, while parsing the sentence "the dog saw
+    the man" with a typical grammar, ``ShiftReduceParser`` will produce
+    the following stack, which covers "the dog saw"::
+
+       [(NP: (Det: 'the') (N: 'dog')), (V: 'saw')]
+
+    ``ShiftReduceParser`` attempts to extend the stack to cover the
+    entire text, and to combine the stack elements into a single tree,
+    producing a complete parse for the sentence.
+
+    Initially, the stack is empty.  It is extended to cover the text,
+    from left to right, by repeatedly applying two operations:
+
+      - "shift" moves a token from the beginning of the text to the
+        end of the stack.
+      - "reduce" uses a CFG production to combine the rightmost stack
+        elements into a single Tree.
+
+    Often, more than one operation can be performed on a given stack.
+    In this case, ``ShiftReduceParser`` uses the following heuristics
+    to decide which operation to perform:
+
+      - Only shift if no reductions are available.
+      - If multiple reductions are available, then apply the reduction
+        whose CFG production is listed earliest in the grammar.
+
+    Note that these heuristics are not guaranteed to choose an
+    operation that leads to a parse of the text.  Also, if multiple
+    parses exist, ``ShiftReduceParser`` will return at most one of
+    them.
+
+    :see: ``nltk.grammar``
+    """
+    def __init__(self, grammar, trace=0):
+        """
+        Create a new ``ShiftReduceParser``, that uses ``grammar`` to
+        parse texts.
+
+        :type grammar: Grammar
+        :param grammar: The grammar used to parse texts.
+        :type trace: int
+        :param trace: The level of tracing that should be used when
+            parsing a text.  ``0`` will generate no tracing output;
+            and higher numbers will produce more verbose tracing
+            output.
+        """
+        self._grammar = grammar
+        self._trace = trace
+        self._check_grammar()
+
+    def grammar(self):
+        return self._grammar
+
+    def parse(self, tokens):
+        tokens = list(tokens)
+        self._grammar.check_coverage(tokens)
+
+        # initialize the stack.
+        stack = []
+        remaining_text = tokens
+
+        # Trace output.
+        if self._trace:
+            print('Parsing %r' % " ".join(tokens))
+            self._trace_stack(stack, remaining_text)
+
+        # iterate through the text, pushing the token onto
+        # the stack, then reducing the stack.
+        while len(remaining_text) > 0:
+            self._shift(stack, remaining_text)
+            while self._reduce(stack, remaining_text): pass
+
+        # Did we reduce everything?
+        if len(stack) == 1: 
+            # Did we end up with the right category?
+            if stack[0].label() == self._grammar.start().symbol():
+                yield stack[0]
+
+    def _shift(self, stack, remaining_text):
+        """
+        Move a token from the beginning of ``remaining_text`` to the
+        end of ``stack``.
+
+        :type stack: list(str and Tree)
+        :param stack: A list of strings and Trees, encoding
+            the structure of the text that has been parsed so far.
+        :type remaining_text: list(str)
+        :param remaining_text: The portion of the text that is not yet
+            covered by ``stack``.
+        :rtype: None
+        """
+        stack.append(remaining_text[0])
+        remaining_text.remove(remaining_text[0])
+        if self._trace: self._trace_shift(stack, remaining_text)
+
+    def _match_rhs(self, rhs, rightmost_stack):
+        """
+        :rtype: bool
+        :return: true if the right hand side of a CFG production
+            matches the rightmost elements of the stack.  ``rhs``
+            matches ``rightmost_stack`` if they are the same length,
+            and each element of ``rhs`` matches the corresponding
+            element of ``rightmost_stack``.  A nonterminal element of
+            ``rhs`` matches any Tree whose node value is equal
+            to the nonterminal's symbol.  A terminal element of ``rhs``
+            matches any string whose type is equal to the terminal.
+        :type rhs: list(terminal and Nonterminal)
+        :param rhs: The right hand side of a CFG production.
+        :type rightmost_stack: list(string and Tree)
+        :param rightmost_stack: The rightmost elements of the parser's
+            stack.
+        """
+
+        if len(rightmost_stack) != len(rhs): return False
+        for i in range(len(rightmost_stack)):
+            if isinstance(rightmost_stack[i], Tree):
+                if not isinstance(rhs[i], Nonterminal): return False
+                if rightmost_stack[i].label() != rhs[i].symbol(): return False
+            else:
+                if isinstance(rhs[i], Nonterminal): return False
+                if rightmost_stack[i] != rhs[i]: return False
+        return True
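+
+    # For example, the rhs (Det, N) of the production NP -> Det N matches a
+    # rightmost stack of [Tree('Det', ['the']), Tree('N', ['dog'])], and the
+    # rhs ('saw',) of V -> 'saw' matches the single stack entry 'saw'.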
+
+    def _reduce(self, stack, remaining_text, production=None):
+        """
+        Find a CFG production whose right hand side matches the
+        rightmost stack elements; and combine those stack elements
+        into a single Tree, with the node specified by the
+        production's left-hand side.  If more than one CFG production
+        matches the stack, then use the production that is listed
+        earliest in the grammar.  The new Tree replaces the
+        elements in the stack.
+
+        :rtype: Production or None
+        :return: If a reduction is performed, then return the CFG
+            production that the reduction is based on; otherwise,
+            return None.
+        :type stack: list(string and Tree)
+        :param stack: A list of strings and Trees, encoding
+            the structure of the text that has been parsed so far.
+        :type remaining_text: list(str)
+        :param remaining_text: The portion of the text that is not yet
+            covered by ``stack``.
+        """
+        if production is None:
+            productions = self._grammar.productions()
+        else:
+            productions = [production]
+
+        # Try each production, in order.
+        for production in productions:
+            rhslen = len(production.rhs())
+
+            # check if the RHS of a production matches the top of the stack
+            if self._match_rhs(production.rhs(), stack[-rhslen:]):
+
+                # combine the tree to reflect the reduction
+                tree = Tree(production.lhs().symbol(), stack[-rhslen:])
+                stack[-rhslen:] = [tree]
+
+                # We reduced something
+                if self._trace:
+                    self._trace_reduce(stack, production, remaining_text)
+                return production
+
+        # We didn't reduce anything
+        return None
+
+    def trace(self, trace=2):
+        """
+        Set the level of tracing output that should be generated when
+        parsing a text.
+
+        :type trace: int
+        :param trace: The trace level.  A trace level of ``0`` will
+            generate no tracing output; and higher trace levels will
+            produce more verbose tracing output.
+        :rtype: None
+        """
+        # 1: just show shifts.
+        # 2: show shifts & reduces
+        # 3: display which tokens & productions are shifted/reduced
+        self._trace = trace
+
+    def _trace_stack(self, stack, remaining_text, marker=' '):
+        """
+        Print trace output displaying the given stack and text.
+
+        :rtype: None
+        :param marker: A character that is printed to the left of the
+            stack.  This is used with trace level 2 to print 'S'
+            before shifted stacks and 'R' before reduced stacks.
+        """
+        s = '  '+marker+' [ '
+        for elt in stack:
+            if isinstance(elt, Tree):
+                s += unicode_repr(Nonterminal(elt.label())) + ' '
+            else:
+                s += unicode_repr(elt) + ' '
+        s += '* ' + ' '.join(remaining_text) + ']'
+        print(s)
+
+    def _trace_shift(self, stack, remaining_text):
+        """
+        Print trace output displaying that a token has been shifted.
+
+        :rtype: None
+        """
+        if self._trace > 2: print('Shift %r:' % stack[-1])
+        if self._trace == 2: self._trace_stack(stack, remaining_text, 'S')
+        elif self._trace > 0: self._trace_stack(stack, remaining_text)
+
+    def _trace_reduce(self, stack, production, remaining_text):
+        """
+        Print trace output displaying that ``production`` was used to
+        reduce ``stack``.
+
+        :rtype: None
+        """
+        if self._trace > 2:
+            rhs = " ".join(production.rhs())
+            print('Reduce %r <- %s' % (production.lhs(), rhs))
+        if self._trace == 2: self._trace_stack(stack, remaining_text, 'R')
+        elif self._trace > 1: self._trace_stack(stack, remaining_text)
+
+    def _check_grammar(self):
+        """
+        Check to make sure that all of the CFG productions are
+        potentially useful.  If any productions can never be used,
+        then print a warning.
+
+        :rtype: None
+        """
+        productions = self._grammar.productions()
+
+        # Any production whose RHS is an extension of another production's RHS
+        # will never be used.
+        for i in range(len(productions)):
+            for j in range(i+1, len(productions)):
+                rhs1 = productions[i].rhs()
+                rhs2 = productions[j].rhs()
+                if rhs1[:len(rhs2)] == rhs2:
+                    print('Warning: %r will never be used' % productions[i])
+
+##//////////////////////////////////////////////////////
+##  Stepping Shift/Reduce Parser
+##//////////////////////////////////////////////////////
+class SteppingShiftReduceParser(ShiftReduceParser):
+    """
+    A ``ShiftReduceParser`` that allows you to step through the parsing
+    process, performing a single operation at a time.  It also allows
+    you to change the parser's grammar midway through parsing a text.
+
+    The ``initialize`` method is used to start parsing a text.
+    ``shift`` performs a single shift operation, and ``reduce`` performs
+    a single reduce operation.  ``step`` will perform a single reduce
+    operation if possible; otherwise, it will perform a single shift
+    operation.  ``parses`` returns the set of parses that have been
+    found by the parser.
+
+    :ivar _history: A list of ``(stack, remaining_text)`` pairs,
+        containing all of the previous states of the parser.  This
+        history is used to implement the ``undo`` operation.
+    :see: ``nltk.grammar``
+    """
+    def __init__(self, grammar, trace=0):
+        super(SteppingShiftReduceParser, self).__init__(grammar, trace)
+        self._stack = None
+        self._remaining_text = None
+        self._history = []
+
+    def parse(self, tokens):
+        tokens = list(tokens)
+        self.initialize(tokens)
+        while self.step():
+            pass
+        return self.parses()
+
+    def stack(self):
+        """
+        :return: The parser's stack.
+        :rtype: list(str and Tree)
+        """
+        return self._stack
+
+    def remaining_text(self):
+        """
+        :return: The portion of the text that is not yet covered by the
+            stack.
+        :rtype: list(str)
+        """
+        return self._remaining_text
+
+    def initialize(self, tokens):
+        """
+        Start parsing a given text.  This sets the parser's stack to
+        ``[]`` and sets its remaining text to ``tokens``.
+        """
+        self._stack = []
+        self._remaining_text = tokens
+        self._history = []
+
+    def step(self):
+        """
+        Perform a single parsing operation.  If a reduction is
+        possible, then perform that reduction, and return the
+        production that it is based on.  Otherwise, if a shift is
+        possible, then perform it, and return True.  Otherwise,
+        return False.
+
+        :return: False if no operation was performed; True if a shift was
+            performed; and the CFG production used to reduce if a
+            reduction was performed.
+        :rtype: Production or bool
+        """
+        return self.reduce() or self.shift()
+
+    def shift(self):
+        """
+        Move a token from the beginning of the remaining text to the
+        end of the stack.  If there are no more tokens in the
+        remaining text, then do nothing.
+
+        :return: True if the shift operation was successful.
+        :rtype: bool
+        """
+        if len(self._remaining_text) == 0: return False
+        self._history.append( (self._stack[:], self._remaining_text[:]) )
+        self._shift(self._stack, self._remaining_text)
+        return True
+
+    def reduce(self, production=None):
+        """
+        Use ``production`` to combine the rightmost stack elements into
+        a single Tree.  If ``production`` does not match the
+        rightmost stack elements, then do nothing.
+
+        :return: The production used to reduce the stack, if a
+            reduction was performed.  If no reduction was performed,
+            return None.
+
+        :rtype: Production or None
+        """
+        self._history.append( (self._stack[:], self._remaining_text[:]) )
+        return_val = self._reduce(self._stack, self._remaining_text,
+                                  production)
+
+        if not return_val: self._history.pop()
+        return return_val
+
+    def undo(self):
+        """
+        Return the parser to its state before the most recent
+        shift or reduce operation.  Calling ``undo`` repeatedly returns
+        the parser to successively earlier states.  If no shift or
+        reduce operations have been performed, ``undo`` will make no
+        changes.
+
+        :return: True if an operation was successfully undone.
+        :rtype: bool
+        """
+        if len(self._history) == 0: return False
+        (self._stack, self._remaining_text) = self._history.pop()
+        return True
+
+    def reducible_productions(self):
+        """
+        :return: A list of the productions for which reductions are
+            available for the current parser state.
+        :rtype: list(Production)
+        """
+        productions = []
+        for production in self._grammar.productions():
+            rhslen = len(production.rhs())
+            if self._match_rhs(production.rhs(), self._stack[-rhslen:]):
+                productions.append(production)
+        return productions
+
+    def parses(self):
+        """
+        :return: An iterator of the parses that have been found by this
+            parser so far.
+        :rtype: iter(Tree)
+        """
+        if (len(self._remaining_text) == 0 and
+            len(self._stack) == 1 and
+            self._stack[0].label() == self._grammar.start().symbol()
+            ):
+            yield self._stack[0]
+
+# copied from nltk.parser
+
+    def set_grammar(self, grammar):
+        """
+        Change the grammar used to parse texts.
+
+        :param grammar: The new grammar.
+        :type grammar: CFG
+        """
+        self._grammar = grammar
+
+##//////////////////////////////////////////////////////
+##  Demonstration Code
+##//////////////////////////////////////////////////////
+
+def demo():
+    """
+    A demonstration of the shift-reduce parser.
+    """
+
+    from nltk import parse, CFG
+
+    grammar = CFG.fromstring("""
+    S -> NP VP
+    NP -> Det N | Det N PP
+    VP -> V NP | V NP PP
+    PP -> P NP
+    NP -> 'I'
+    N -> 'man' | 'park' | 'telescope' | 'dog'
+    Det -> 'the' | 'a'
+    P -> 'in' | 'with'
+    V -> 'saw'
+    """)
+
+    sent = 'I saw a man in the park'.split()
+
+    parser = parse.ShiftReduceParser(grammar, trace=2)
+    for p in parser.parse(sent):
+        print(p)
+
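+# An illustrative sketch (this helper is not part of the original NLTK
+# module): it drives ``SteppingShiftReduceParser`` one shift or reduce
+# operation at a time instead of calling ``parse`` directly.
+def stepping_demo():
+    from nltk import CFG
+
+    grammar = CFG.fromstring("""
+    S -> NP VP
+    NP -> Det N
+    VP -> V NP
+    Det -> 'the' | 'a'
+    N -> 'man' | 'dog'
+    V -> 'saw'
+    """)
+    parser = SteppingShiftReduceParser(grammar, trace=0)
+    parser.initialize('the dog saw a man'.split())
+    # step() performs a reduction when one is available and shifts otherwise;
+    # it returns a false value once no further operation applies.
+    while parser.step():
+        pass
+    for tree in parser.parses():
+        print(tree)
+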
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/parse/shiftreduce.pyc b/nlp_resource_data/nltk/parse/shiftreduce.pyc
new file mode 100755 (executable)
index 0000000..6d91abc
Binary files /dev/null and b/nlp_resource_data/nltk/parse/shiftreduce.pyc differ
diff --git a/nlp_resource_data/nltk/parse/stanford.py b/nlp_resource_data/nltk/parse/stanford.py
new file mode 100755 (executable)
index 0000000..34939a9
--- /dev/null
@@ -0,0 +1,412 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Interface to the Stanford Parser
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Xu <xxu@student.unimelb.edu.au>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import unicode_literals
+
+import tempfile
+import os
+import re
+import warnings
+from subprocess import PIPE
+from io import StringIO
+
+from six import text_type
+
+from nltk.internals import find_jar, find_jar_iter, config_java, java, _java_options, find_jars_within_path
+
+from nltk.parse.api import ParserI
+from nltk.parse.dependencygraph import DependencyGraph
+from nltk.tree import Tree
+
+_stanford_url = 'https://nlp.stanford.edu/software/lex-parser.shtml'
+
+class GenericStanfordParser(ParserI):
+    """Interface to the Stanford Parser"""
+
+    _MODEL_JAR_PATTERN = r'stanford-parser-(\d+)(\.(\d+))+-models\.jar'
+    _JAR = r'stanford-parser\.jar'
+    _MAIN_CLASS = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
+
+    _USE_STDIN = False
+    _DOUBLE_SPACED_OUTPUT = False
+
+    def __init__(self, path_to_jar=None, path_to_models_jar=None,
+                 model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
+                 encoding='utf8', verbose=False,
+                 java_options='-mx1000m', corenlp_options=''):
+
+        # find the most recent code and model jar
+        stanford_jar = max(
+            find_jar_iter(
+                self._JAR, path_to_jar,
+                env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'),
+                searchpath=(), url=_stanford_url,
+                verbose=verbose, is_regex=True
+            ),
+            key=lambda model_path: os.path.dirname(model_path)
+        )
+
+        model_jar=max(
+            find_jar_iter(
+                self._MODEL_JAR_PATTERN, path_to_models_jar,
+                env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'),
+                searchpath=(), url=_stanford_url,
+                verbose=verbose, is_regex=True
+            ),
+            key=lambda model_path: os.path.dirname(model_path)
+        )
+
+
+        #self._classpath = (stanford_jar, model_jar)
+
+        # Adding logging jar files to classpath
+        stanford_dir = os.path.split(stanford_jar)[0]
+        self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir))
+
+        self.model_path = model_path
+        self._encoding = encoding
+        self.corenlp_options = corenlp_options
+        self.java_options = java_options
+
+    def _parse_trees_output(self, output_):
+        res = []
+        cur_lines = []
+        cur_trees = []
+        blank = False
+        for line in output_.splitlines(False):
+            if line == '':
+                if blank:
+                    res.append(iter(cur_trees))
+                    cur_trees = []
+                    blank = False
+                elif self._DOUBLE_SPACED_OUTPUT:
+                    cur_trees.append(self._make_tree('\n'.join(cur_lines)))
+                    cur_lines = []
+                    blank = True
+                else:
+                    res.append(iter([self._make_tree('\n'.join(cur_lines))]))
+                    cur_lines = []
+            else:
+                cur_lines.append(line)
+                blank = False
+        return iter(res)
+
+    def parse_sents(self, sentences, verbose=False):
+        """
+        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
+        list where each sentence is a list of words.
+        Each sentence will be automatically tagged with this StanfordParser instance's
+        tagger.
+        If whitespace exists inside a token, then the token will be treated as
+        separate tokens.
+
+        :param sentences: Input sentences to parse
+        :type sentences: list(list(str))
+        :rtype: iter(iter(Tree))
+        """
+        cmd = [
+            self._MAIN_CLASS,
+            '-model', self.model_path,
+            '-sentences', 'newline',
+            '-outputFormat', self._OUTPUT_FORMAT,
+            '-tokenized',
+            '-escaper', 'edu.stanford.nlp.process.PTBEscapingProcessor',
+        ]
+        return self._parse_trees_output(self._execute(
+            cmd, '\n'.join(' '.join(sentence) for sentence in sentences), verbose))
+
+    def raw_parse(self, sentence, verbose=False):
+        """
+        Use StanfordParser to parse a sentence. Takes a sentence as a string;
+        before parsing, it will be automatically tokenized and tagged by
+        the Stanford Parser.
+
+        :param sentence: Input sentence to parse
+        :type sentence: str
+        :rtype: iter(Tree)
+        """
+        return next(self.raw_parse_sents([sentence], verbose))
+
+    def raw_parse_sents(self, sentences, verbose=False):
+        """
+        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
+        list of strings.
+        Each sentence will be automatically tokenized and tagged by the Stanford Parser.
+
+        :param sentences: Input sentences to parse
+        :type sentences: list(str)
+        :rtype: iter(iter(Tree))
+        """
+        cmd = [
+            self._MAIN_CLASS,
+            '-model', self.model_path,
+            '-sentences', 'newline',
+            '-outputFormat', self._OUTPUT_FORMAT,
+        ]
+        return self._parse_trees_output(self._execute(cmd, '\n'.join(sentences), verbose))
+
+    def tagged_parse(self, sentence, verbose=False):
+        """
+        Use StanfordParser to parse a sentence. Takes a sentence as a list of
+        (word, tag) tuples; the sentence must have already been tokenized and
+        tagged.
+
+        :param sentence: Input sentence to parse
+        :type sentence: list(tuple(str, str))
+        :rtype: iter(Tree)
+        """
+        return next(self.tagged_parse_sents([sentence], verbose))
+
+    def tagged_parse_sents(self, sentences, verbose=False):
+        """
+        Use StanfordParser to parse multiple sentences. Takes multiple sentences
+        where each sentence is a list of (word, tag) tuples.
+        The sentences must have already been tokenized and tagged.
+
+        :param sentences: Input sentences to parse
+        :type sentences: list(list(tuple(str, str)))
+        :rtype: iter(iter(Tree))
+        """
+        tag_separator = '/'
+        cmd = [
+            self._MAIN_CLASS,
+            '-model', self.model_path,
+            '-sentences', 'newline',
+            '-outputFormat', self._OUTPUT_FORMAT,
+            '-tokenized',
+            '-tagSeparator', tag_separator,
+            '-tokenizerFactory', 'edu.stanford.nlp.process.WhitespaceTokenizer',
+            '-tokenizerMethod', 'newCoreLabelTokenizerFactory',
+        ]
+        # We don't need to escape slashes as "splitting is done on the last instance of the character in the token"
+        return self._parse_trees_output(self._execute(
+            cmd, '\n'.join(' '.join(tag_separator.join(tagged) for tagged in sentence) for sentence in sentences), verbose))
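+
+    # For example, the tagged sentence [("The", "DT"), ("dog", "NN")] is sent
+    # to the parser as the single input line "The/DT dog/NN".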
+
+    def _execute(self, cmd, input_, verbose=False):
+        encoding = self._encoding
+        cmd.extend(['-encoding', encoding])
+        if self.corenlp_options:
+            cmd.append(self.corenlp_options)
+
+        default_options = ' '.join(_java_options)
+
+        # Configure java.
+        config_java(options=self.java_options, verbose=verbose)
+
+        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
+        with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
+            # Write the actual sentences to the temporary input file
+            if isinstance(input_, text_type) and encoding:
+                input_ = input_.encode(encoding)
+            input_file.write(input_)
+            input_file.flush()
+
+            # Run the tagger and get the output.
+            if self._USE_STDIN:
+                input_file.seek(0)
+                stdout, stderr = java(cmd, classpath=self._classpath,
+                                      stdin=input_file, stdout=PIPE, stderr=PIPE)
+            else:
+                cmd.append(input_file.name)
+                stdout, stderr = java(cmd, classpath=self._classpath,
+                                      stdout=PIPE, stderr=PIPE)
+
+            stdout = stdout.replace(b'\xc2\xa0',b' ')
+            stdout = stdout.replace(b'\x00\xa0',b' ')
+            stdout = stdout.decode(encoding)
+
+        os.unlink(input_file.name)
+
+        # Return java configurations to their default values.
+        config_java(options=default_options, verbose=False)
+
+        return stdout
+
+class StanfordParser(GenericStanfordParser):
+    """
+    >>> parser=StanfordParser(
+    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
+    ... )
+
+    >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
+    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
+    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]
+
+    >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
+    ...     "the quick brown fox jumps over the lazy dog",
+    ...     "the quick grey wolf jumps over the lazy fox"
+    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
+    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
+    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
+    [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP',
+    [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),
+    Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]
+
+    >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
+    ...     "I 'm a dog".split(),
+    ...     "This is my friends ' cat ( the tabby )".split(),
+    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
+    Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
+    [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
+    Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', ['-LRB-']),
+    Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', ['-RRB-'])])])])])])]
+
+    >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
+    ...     (
+    ...         ("The", "DT"),
+    ...         ("quick", "JJ"),
+    ...         ("brown", "JJ"),
+    ...         ("fox", "NN"),
+    ...         ("jumped", "VBD"),
+    ...         ("over", "IN"),
+    ...         ("the", "DT"),
+    ...         ("lazy", "JJ"),
+    ...         ("dog", "NN"),
+    ...         (".", "."),
+    ...     ),
+    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
+    Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
+    [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
+    """
+
+    _OUTPUT_FORMAT = 'penn'
+
+    def _make_tree(self, result):
+        return Tree.fromstring(result)
+
+
+class StanfordDependencyParser(GenericStanfordParser):
+
+    """
+    >>> dep_parser=StanfordDependencyParser(
+    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
+    ... )
+
+    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
+    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]
+
+    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
+    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
+    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
+    ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
+    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
+
+    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
+    ...     "The quick brown fox jumps over the lazy dog.",
+    ...     "The quick grey wolf jumps over the lazy fox."
+    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),
+    Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]
+
+    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
+    ...     "I 'm a dog".split(),
+    ...     "This is my friends ' cat ( the tabby )".split(),
+    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]
+
+    >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents((
+    ...     (
+    ...         ("The", "DT"),
+    ...         ("quick", "JJ"),
+    ...         ("brown", "JJ"),
+    ...         ("fox", "NN"),
+    ...         ("jumped", "VBD"),
+    ...         ("over", "IN"),
+    ...         ("the", "DT"),
+    ...         ("lazy", "JJ"),
+    ...         ("dog", "NN"),
+    ...         (".", "."),
+    ...     ),
+    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE
+    [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
+    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
+    ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
+    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
+
+    """
+
+    _OUTPUT_FORMAT = 'conll2007'
+
+    def _make_tree(self, result):
+        return DependencyGraph(result, top_relation_label='root')
+
+
+class StanfordNeuralDependencyParser(GenericStanfordParser):
+    '''
+    >>> from nltk.parse.stanford import StanfordNeuralDependencyParser
+    >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx3g')
+
+    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
+    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]
+
+    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
+    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det',
+    (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'),
+    u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')),
+    ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det',
+    (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'),
+    u'punct', (u'.', u'.'))]]
+
+    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
+    ...     "The quick brown fox jumps over the lazy dog.",
+    ...     "The quick grey wolf jumps over the lazy fox."
+    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over',
+    'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']),
+    Tree('fox', ['over', 'the', 'lazy']), '.'])]
+
+    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
+    ...     "I 'm a dog".split(),
+    ...     "This is my friends ' cat ( the tabby )".split(),
+    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',
+    ['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
+    '''
+
+    _OUTPUT_FORMAT = 'conll'
+    _MAIN_CLASS = 'edu.stanford.nlp.pipeline.StanfordCoreNLP'
+    _JAR = r'stanford-corenlp-(\d+)(\.(\d+))+\.jar'
+    _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)(\.(\d+))+-models\.jar'
+    _USE_STDIN = True
+    _DOUBLE_SPACED_OUTPUT = True
+
+    def __init__(self, *args, **kwargs):
+        super(StanfordNeuralDependencyParser, self).__init__(*args, **kwargs)
+        self.corenlp_options += '-annotators tokenize,ssplit,pos,depparse'
+
+    def tagged_parse_sents(self, sentences, verbose=False):
+        '''
+        Currently unimplemented because the neural dependency parser (and
+        the StanfordCoreNLP pipeline class) doesn't support passing in pre-
+        tagged tokens.
+        '''
+        raise NotImplementedError(
+            'tagged_parse[_sents] is not supported by '
+            'StanfordNeuralDependencyParser; use '
+            'parse[_sents] or raw_parse[_sents] instead.'
+        )
+
+    def _make_tree(self, result):
+        return DependencyGraph(result, top_relation_label='ROOT')
+
+
+def setup_module(module):
+    from nose import SkipTest
+
+    try:
+        StanfordParser(
+            model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
+        )
+        StanfordNeuralDependencyParser()
+    except LookupError:
+        raise SkipTest('doctests from nltk.parse.stanford are skipped because one of the stanford parser or CoreNLP jars doesn\'t exist')
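+
+
+# Usage sketch (not from the original module; the jar locations below are
+# placeholders and are assumed to point at a local Stanford Parser download):
+#
+#     import os
+#     os.environ['STANFORD_PARSER'] = '/path/to/stanford-parser-dir'
+#     os.environ['STANFORD_MODELS'] = '/path/to/stanford-parser-dir'
+#     parser = StanfordParser(
+#         model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
+#     for tree in parser.raw_parse("the quick brown fox jumps over the lazy dog"):
+#         print(tree)
+#
+# Alternatively, the jar paths can be passed explicitly via the
+# ``path_to_jar`` and ``path_to_models_jar`` constructor arguments.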
diff --git a/nlp_resource_data/nltk/parse/stanford.pyc b/nlp_resource_data/nltk/parse/stanford.pyc
new file mode 100755 (executable)
index 0000000..bf3b11b
Binary files /dev/null and b/nlp_resource_data/nltk/parse/stanford.pyc differ
diff --git a/nlp_resource_data/nltk/parse/transitionparser.py b/nlp_resource_data/nltk/parse/transitionparser.py
new file mode 100755 (executable)
index 0000000..cad2261
--- /dev/null
@@ -0,0 +1,774 @@
+# Natural Language Toolkit: Arc-Standard and Arc-eager Transition Based Parsers
+#
+# Author: Long Duong <longdt219@gmail.com>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tempfile
+import pickle
+
+from os import remove
+from copy import deepcopy
+from operator import itemgetter
+try:
+    from numpy import array
+    from scipy import sparse
+    from sklearn.datasets import load_svmlight_file
+    from sklearn import svm
+except ImportError:
+    pass
+
+from nltk.parse import ParserI, DependencyGraph, DependencyEvaluator
+
+
+
+class Configuration(object):
+    """
+    Class for holding a configuration, which is the partial analysis of the input sentence.
+    The transition-based parser aims at finding a set of operators that transfer the initial
+    configuration to the terminal configuration.
+
+    The configuration includes:
+        - Stack: for storing partially processed words
+        - Buffer: for storing remaining input words
+        - Set of arcs: for storing the partially built dependency tree
+
+    This class also provides a method to represent a configuration as a list of features.
+    """
+
+    def __init__(self, dep_graph):
+        """
+        :param dep_graph: the representation of an input in the form of a dependency
+            graph, where the dependencies are not yet specified.
+        :type dep_graph: DependencyGraph
+        """
+        # dep_graph.nodes contain list of token for a sentence
+        self.stack = [0]  # The root element
+        self.buffer = list(range(1, len(dep_graph.nodes)))  # The rest is in the buffer
+        self.arcs = []  # empty set of arc
+        self._tokens = dep_graph.nodes
+        self._max_address = len(self.buffer)
+
+    def __str__(self):
+        return 'Stack : ' + \
+            str(self.stack) + '  Buffer : ' + str(self.buffer) + '   Arcs : ' + str(self.arcs)
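+
+    # For example, for a three-word sentence the initial configuration has
+    # stack [0] (the artificial root node), buffer [1, 2, 3] and no arcs, and
+    # prints as "Stack : [0]  Buffer : [1, 2, 3]   Arcs : []".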
+
+    def _check_informative(self, feat, flag=False):
+        """
+        Check whether a feature is informative.
+        The flag controls whether "_" is treated as informative or not.
+        """
+        if feat is None:
+            return False
+        if feat == '':
+            return False
+        if flag is False:
+            if feat == '_':
+                return False
+        return True
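+
+    # For example, _check_informative('') is False, _check_informative('_') is
+    # False by default but True when flag=True, and any other non-empty value
+    # is considered informative.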
+
+    def extract_features(self):
+        """
+        Extract the set of features for the current configuration. Implements the standard features
+        described in Table 3.2 (page 31) of the book "Dependency Parsing" by Sandra Kubler, Ryan McDonald,
+        and Joakim Nivre.
+        Please note that these features are very basic.
+        :return: list(str)
+        """
+        result = []
+        # TODO: a richer feature set could be used here for better
+        # performance.
+        if len(self.stack) > 0:
+            # Stack 0
+            stack_idx0 = self.stack[len(self.stack) - 1]
+            token = self._tokens[stack_idx0]
+            if self._check_informative(token['word'], True):
+                result.append('STK_0_FORM_' + token['word'])
+            if 'lemma' in token and self._check_informative(token['lemma']):
+                result.append('STK_0_LEMMA_' + token['lemma'])
+            if self._check_informative(token['tag']):
+                result.append('STK_0_POS_' + token['tag'])
+            if 'feats' in token and self._check_informative(token['feats']):
+                feats = token['feats'].split("|")
+                for feat in feats:
+                    result.append('STK_0_FEATS_' + feat)
+            # Stack 1
+            if len(self.stack) > 1:
+                stack_idx1 = self.stack[len(self.stack) - 2]
+                token = self._tokens[stack_idx1]
+                if self._check_informative(token['tag']):
+                    result.append('STK_1_POS_' + token['tag'])
+
+            # Left most, right most dependency of stack[0]
+            left_most = 1000000
+            right_most = -1
+            dep_left_most = ''
+            dep_right_most = ''
+            for (wi, r, wj) in self.arcs:
+                if wi == stack_idx0:
+                    if (wj > wi) and (wj > right_most):
+                        right_most = wj
+                        dep_right_most = r
+                    if (wj < wi) and (wj < left_most):
+                        left_most = wj
+                        dep_left_most = r
+            if self._check_informative(dep_left_most):
+                result.append('STK_0_LDEP_' + dep_left_most)
+            if self._check_informative(dep_right_most):
+                result.append('STK_0_RDEP_' + dep_right_most)
+
+        # Check Buffered 0
+        if len(self.buffer) > 0:
+            # Buffer 0
+            buffer_idx0 = self.buffer[0]
+            token = self._tokens[buffer_idx0]
+            if self._check_informative(token['word'], True):
+                result.append('BUF_0_FORM_' + token['word'])
+            if 'lemma' in token and self._check_informative(token['lemma']):
+                result.append('BUF_0_LEMMA_' + token['lemma'])
+            if self._check_informative(token['tag']):
+                result.append('BUF_0_POS_' + token['tag'])
+            if 'feats' in token and self._check_informative(token['feats']):
+                feats = token['feats'].split("|")
+                for feat in feats:
+                    result.append('BUF_0_FEATS_' + feat)
+            # Buffer 1
+            if len(self.buffer) > 1:
+                buffer_idx1 = self.buffer[1]
+                token = self._tokens[buffer_idx1]
+                if self._check_informative(token['word'], True):
+                    result.append('BUF_1_FORM_' + token['word'])
+                if self._check_informative(token['tag']):
+                    result.append('BUF_1_POS_' + token['tag'])
+            if len(self.buffer) > 2:
+                buffer_idx2 = self.buffer[2]
+                token = self._tokens[buffer_idx2]
+                if self._check_informative(token['tag']):
+                    result.append('BUF_2_POS_' + token['tag'])
+            if len(self.buffer) > 3:
+                buffer_idx3 = self.buffer[3]
+                token = self._tokens[buffer_idx3]
+                if self._check_informative(token['tag']):
+                    result.append('BUF_3_POS_' + token['tag'])
+            # Left-most, right-most dependency of buffer[0]
+            left_most = 1000000
+            right_most = -1
+            dep_left_most = ''
+            dep_right_most = ''
+            for (wi, r, wj) in self.arcs:
+                if wi == buffer_idx0:
+                    if (wj > wi) and (wj > right_most):
+                        right_most = wj
+                        dep_right_most = r
+                    if (wj < wi) and (wj < left_most):
+                        left_most = wj
+                        dep_left_most = r
+            if self._check_informative(dep_left_most):
+                result.append('BUF_0_LDEP_' + dep_left_most)
+            if self._check_informative(dep_right_most):
+                result.append('BUF_0_RDEP_' + dep_right_most)
+
+        return result
+
+
+class Transition(object):
+    """
+    This class defines the set of transitions that are applied to a configuration to obtain another configuration.
+    Note that the available transitions differ between parsing algorithms.
+    """
+    # Define set of transitions
+    LEFT_ARC = 'LEFTARC'
+    RIGHT_ARC = 'RIGHTARC'
+    SHIFT = 'SHIFT'
+    REDUCE = 'REDUCE'
+
+    def __init__(self, alg_option):
+        """
+        :param alg_option: the algorithm option of this parser. Currently the `arc-standard` and `arc-eager` algorithms are supported
+        :type alg_option: str
+        """
+        self._algo = alg_option
+        if alg_option not in [
+                TransitionParser.ARC_STANDARD,
+                TransitionParser.ARC_EAGER]:
+            raise ValueError(" Currently we only support %s and %s " %
+                                        (TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER))
+
+    def left_arc(self, conf, relation):
+        """
+        Note that the left-arc operation is the same for arc-standard and arc-eager except for its precondition
+            :param conf: the current configuration, which is modified in place
+            :return: -1 if the precondition is not satisfied, None otherwise
+        """
+        if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0):
+            return -1
+        if conf.buffer[0] == 0:
+            # here is the Root element
+            return -1
+
+        idx_wi = conf.stack[len(conf.stack) - 1]
+
+        flag = True
+        if self._algo == TransitionParser.ARC_EAGER:
+            for (idx_parent, r, idx_child) in conf.arcs:
+                if idx_child == idx_wi:
+                    flag = False
+
+        if flag:
+            conf.stack.pop()
+            idx_wj = conf.buffer[0]
+            conf.arcs.append((idx_wj, relation, idx_wi))
+        else:
+            return -1
+
+    def right_arc(self, conf, relation):
+        """
+        Note that the right-arc operation is DIFFERENT for arc-standard and arc-eager
+            :param conf: the current configuration, which is modified in place
+            :return: -1 if the precondition is not satisfied, None otherwise
+        """
+        if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0):
+            return -1
+        if self._algo == TransitionParser.ARC_STANDARD:
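+            # Pop wi from the stack, attach wj (the front of the buffer) to it,
+            # and put wi back at the front of the buffer in place of wj.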
+            idx_wi = conf.stack.pop()
+            idx_wj = conf.buffer[0]
+            conf.buffer[0] = idx_wi
+            conf.arcs.append((idx_wi, relation, idx_wj))
+        else:  # arc-eager
+            idx_wi = conf.stack[len(conf.stack) - 1]
+            idx_wj = conf.buffer.pop(0)
+            conf.stack.append(idx_wj)
+            conf.arcs.append((idx_wi, relation, idx_wj))
+
+    def reduce(self, conf):
+        """
+        Note that the reduce operation is only available for arc-eager
+            :param conf: the current configuration, which is modified in place
+            :return: -1 if the precondition is not satisfied, None otherwise
+        """
+
+        if self._algo != TransitionParser.ARC_EAGER:
+            return -1
+        if len(conf.stack) <= 0:
+            return -1
+
+        idx_wi = conf.stack[len(conf.stack) - 1]
+        flag = False
+        for (idx_parent, r, idx_child) in conf.arcs:
+            if idx_child == idx_wi:
+                flag = True
+        if flag:
+            conf.stack.pop()  # reduce it
+        else:
+            return -1
+
+    def shift(self, conf):
+        """
+        Note that the shift operation is the SAME for arc-standard and arc-eager
+            :param conf: the current configuration, which is modified in place
+            :return: -1 if the precondition is not satisfied, None otherwise
+        """
+        if len(conf.buffer) <= 0:
+            return -1
+        idx_wi = conf.buffer.pop(0)
+        conf.stack.append(idx_wi)
+
+
+class TransitionParser(ParserI):
+
+    """
+    Class for the transition-based parser. Implements two algorithms: "arc-standard" and "arc-eager".
+    """
+    ARC_STANDARD = 'arc-standard'
+    ARC_EAGER = 'arc-eager'
+
+    def __init__(self, algorithm):
+        """
+        :param algorithm: the algorithm option of this parser. Currently the `arc-standard` and `arc-eager` algorithms are supported
+        :type algorithm: str
+        """
+        if not(algorithm in [self.ARC_STANDARD, self.ARC_EAGER]):
+            raise ValueError(" Currently we only support %s and %s " %
+                                        (self.ARC_STANDARD, self.ARC_EAGER))
+        self._algorithm = algorithm
+
+        self._dictionary = {}
+        self._transition = {}
+        self._match_transition = {}
+
+    def _get_dep_relation(self, idx_parent, idx_child, depgraph):
+        p_node = depgraph.nodes[idx_parent]
+        c_node = depgraph.nodes[idx_child]
+
+        if c_node['word'] is None:
+            return None  # Root word
+
+        if c_node['head'] == p_node['address']:
+            return c_node['rel']
+        else:
+            return None
+
+    def _convert_to_binary_features(self, features):
+        """
+        :param features: list of feature strings to be converted to binary features
+        :type features: list(str)
+        :return: string of binary features in libsvm format, i.e. space-separated 'featureID:value' pairs
+        """
+        unsorted_result = []
+        for feature in features:
+            self._dictionary.setdefault(feature, len(self._dictionary))
+            unsorted_result.append(self._dictionary[feature])
+
+        # Default value of each feature is 1.0
+        return ' '.join(str(featureID) + ':1.0' for featureID in sorted(unsorted_result))
+
+    def _is_projective(self, depgraph):
+        arc_list = []
+        for key in depgraph.nodes:
+            node = depgraph.nodes[key]
+
+            if 'head' in node:
+                childIdx = node['address']
+                parentIdx = node['head']
+                if parentIdx is not None:
+                    arc_list.append((parentIdx, childIdx))
+
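+        # An arc (parent, child) is non-projective if any other arc links a word
+        # strictly inside the span between child and parent to a word outside it.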
+        for (parentIdx, childIdx) in arc_list:
+            # Ensure that childIdx < parentIdx
+            if childIdx > parentIdx:
+                temp = childIdx
+                childIdx = parentIdx
+                parentIdx = temp
+            for k in range(childIdx + 1, parentIdx):
+                for m in range(len(depgraph.nodes)):
+                    if (m < childIdx) or (m > parentIdx):
+                        if (k, m) in arc_list:
+                            return False
+                        if (m, k) in arc_list:
+                            return False
+        return True
+
+    def _write_to_file(self, key, binary_features, input_file):
+        """
+        Write the binary features to the input file and update the transition dictionary.
+        """
+        self._transition.setdefault(key, len(self._transition) + 1)
+        self._match_transition[self._transition[key]] = key
+
+        input_str = str(self._transition[key]) + ' ' + binary_features + '\n'
+        input_file.write(input_str.encode('utf-8'))
+
+    def _create_training_examples_arc_std(self, depgraphs, input_file):
+        """
+        Create the training examples in libsvm format and write them to the input_file.
+        Reference : Page 32, Chapter 3, Dependency Parsing by Sandra Kübler, Ryan McDonald and Joakim Nivre (2009)
+        """
+        operation = Transition(self.ARC_STANDARD)
+        count_proj = 0
+        training_seq = []
+
+        for depgraph in depgraphs:
+            if not self._is_projective(depgraph):
+                continue
+
+            count_proj += 1
+            conf = Configuration(depgraph)
+            while len(conf.buffer) > 0:
+                b0 = conf.buffer[0]
+                features = conf.extract_features()
+                binary_features = self._convert_to_binary_features(features)
+
+                if len(conf.stack) > 0:
+                    s0 = conf.stack[len(conf.stack) - 1]
+                    # Left-arc operation
+                    rel = self._get_dep_relation(b0, s0, depgraph)
+                    if rel is not None:
+                        key = Transition.LEFT_ARC + ':' + rel
+                        self._write_to_file(key, binary_features, input_file)
+                        operation.left_arc(conf, rel)
+                        training_seq.append(key)
+                        continue
+
+                    # Right-arc operation
+                    rel = self._get_dep_relation(s0, b0, depgraph)
+                    if rel is not None:
+                        precondition = True
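+                        # RIGHT-ARC is only allowed once every gold dependent of
+                        # b0 has already been attached; otherwise it would become
+                        # unreachable after b0 is removed.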
+                        # Get the max-index of buffer
+                        maxID = conf._max_address
+
+                        for w in range(maxID + 1):
+                            if w != b0:
+                                relw = self._get_dep_relation(b0, w, depgraph)
+                                if relw is not None:
+                                    if (b0, relw, w) not in conf.arcs:
+                                        precondition = False
+
+                        if precondition:
+                            key = Transition.RIGHT_ARC + ':' + rel
+                            self._write_to_file(
+                                key,
+                                binary_features,
+                                input_file)
+                            operation.right_arc(conf, rel)
+                            training_seq.append(key)
+                            continue
+
+                # Shift operation as the default
+                key = Transition.SHIFT
+                self._write_to_file(key, binary_features, input_file)
+                operation.shift(conf)
+                training_seq.append(key)
+
+        print(" Number of training examples : " + str(len(depgraphs)))
+        print(" Number of valid (projective) examples : " + str(count_proj))
+        return training_seq
+
+    def _create_training_examples_arc_eager(self, depgraphs, input_file):
+        """
+        Create the training examples in libsvm format and write them to the input_file.
+        Reference : 'A Dynamic Oracle for Arc-Eager Dependency Parsing' by Yoav Goldberg and Joakim Nivre
+        """
+        operation = Transition(self.ARC_EAGER)
+        countProj = 0
+        training_seq = []
+
+        for depgraph in depgraphs:
+            if not self._is_projective(depgraph):
+                continue
+
+            countProj += 1
+            conf = Configuration(depgraph)
+            while len(conf.buffer) > 0:
+                b0 = conf.buffer[0]
+                features = conf.extract_features()
+                binary_features = self._convert_to_binary_features(features)
+
+                if len(conf.stack) > 0:
+                    s0 = conf.stack[len(conf.stack) - 1]
+                    # Left-arc operation
+                    rel = self._get_dep_relation(b0, s0, depgraph)
+                    if rel is not None:
+                        key = Transition.LEFT_ARC + ':' + rel
+                        self._write_to_file(key, binary_features, input_file)
+                        operation.left_arc(conf, rel)
+                        training_seq.append(key)
+                        continue
+
+                    # Right-arc operation
+                    rel = self._get_dep_relation(s0, b0, depgraph)
+                    if rel is not None:
+                        key = Transition.RIGHT_ARC + ':' + rel
+                        self._write_to_file(key, binary_features, input_file)
+                        operation.right_arc(conf, rel)
+                        training_seq.append(key)
+                        continue
+
+                    # reduce operation
+                    flag = False
+                    for k in range(s0):
+                        if self._get_dep_relation(k, b0, depgraph) is not None:
+                            flag = True
+                        if self._get_dep_relation(b0, k, depgraph) is not None:
+                            flag = True
+                    if flag:
+                        key = Transition.REDUCE
+                        self._write_to_file(key, binary_features, input_file)
+                        operation.reduce(conf)
+                        training_seq.append(key)
+                        continue
+
+                # Shift operation as the default
+                key = Transition.SHIFT
+                self._write_to_file(key, binary_features, input_file)
+                operation.shift(conf)
+                training_seq.append(key)
+
+        print(" Number of training examples : " + str(len(depgraphs)))
+        print(" Number of valid (projective) examples : " + str(countProj))
+        return training_seq
+
+    def train(self, depgraphs, modelfile, verbose=True):
+        """
+        :param depgraphs : list of DependencyGraph as the training data
+        :type depgraphs : list(DependencyGraph)
+        :param modelfile : file name to save the trained model
+        :type modelfile : str
+        """
+
+        try:
+            input_file = tempfile.NamedTemporaryFile(
+                prefix='transition_parse.train',
+                dir=tempfile.gettempdir(),
+                delete=False)
+
+            if self._algorithm == self.ARC_STANDARD:
+                self._create_training_examples_arc_std(depgraphs, input_file)
+            else:
+                self._create_training_examples_arc_eager(depgraphs, input_file)
+
+            input_file.close()
+            # Using the temporary file to train the libsvm classifier
+            x_train, y_train = load_svmlight_file(input_file.name)
+            # The parameter is set according to the paper:
+            # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
+            # Todo : because of probability = True => very slow due to
+            # cross-validation. Need to improve the speed here
+            model = svm.SVC(
+                kernel='poly',
+                degree=2,
+                coef0=0,
+                gamma=0.2,
+                C=0.5,
+                verbose=verbose,
+                probability=True)
+
+            model.fit(x_train, y_train)
+            # Save the model to file name (as pickle)
+            pickle.dump(model, open(modelfile, 'wb'))
+        finally:
+            remove(input_file.name)
+
+    def parse(self, depgraphs, modelFile):
+        """
+        :param depgraphs: the list of test sentences, each represented as a dependency graph where the 'head' information is a dummy value
+        :type depgraphs: list(DependencyGraph)
+        :param modelFile: the model file
+        :type modelFile: str
+        :return: list(DependencyGraph) with the 'head' and 'rel' information filled in
+        """
+        result = []
+        # First load the model
+        model = pickle.load(open(modelFile, 'rb'))
+        operation = Transition(self._algorithm)
+
+        for depgraph in depgraphs:
+            conf = Configuration(depgraph)
+            while len(conf.buffer) > 0:
+                features = conf.extract_features()
+                col = []
+                row = []
+                data = []
+                for feature in features:
+                    if feature in self._dictionary:
+                        col.append(self._dictionary[feature])
+                        row.append(0)
+                        data.append(1.0)
+                np_col = array(sorted(col))  # NB : index must be sorted
+                np_row = array(row)
+                np_data = array(data)
+
+                x_test = sparse.csr_matrix((np_data, (np_row, np_col)), shape=(1, len(self._dictionary)))
+
+                # It would be best to use the decision function as follows, BUT it is not yet supported for sparse SVM.
+                # Using the decision function to build the votes array:
+                #dec_func = model.decision_function(x_test)[0]
+                #votes = {}
+                #k = 0
+                # for i in range(len(model.classes_)):
+                #    for j in range(i+1, len(model.classes_)):
+                #        #if  dec_func[k] > 0:
+                #            votes.setdefault(i,0)
+                #            votes[i] +=1
+                #        else:
+                #           votes.setdefault(j,0)
+                #           votes[j] +=1
+                #        k +=1
+                # Sort votes according to the values
+                #sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True)
+
+                # We will use predict_proba instead of decision_function
+                prob_dict = {}
+                pred_prob = model.predict_proba(x_test)[0]
+                for i in range(len(pred_prob)):
+                    prob_dict[i] = pred_prob[i]
+                sorted_Prob = sorted(
+                    prob_dict.items(),
+                    key=itemgetter(1),
+                    reverse=True)
+
+                # Note that SHIFT is always a valid operation
+                for (y_pred_idx, confidence) in sorted_Prob:
+                    #y_pred = model.predict(x_test)[0]
+                    # From the prediction match to the operation
+                    y_pred = model.classes_[y_pred_idx]
+
+                    if y_pred in self._match_transition:
+                        strTransition = self._match_transition[y_pred]
+                        baseTransition = strTransition.split(":")[0]
+
+                        if baseTransition == Transition.LEFT_ARC:
+                            if operation.left_arc(conf, strTransition.split(":")[1]) != -1:
+                                break
+                        elif baseTransition == Transition.RIGHT_ARC:
+                            if operation.right_arc(conf, strTransition.split(":")[1]) != -1:
+                                break
+                        elif baseTransition == Transition.REDUCE:
+                            if operation.reduce(conf) != -1:
+                                break
+                        elif baseTransition == Transition.SHIFT:
+                            if operation.shift(conf) != -1:
+                                break
+                    else:
+                        raise ValueError("The predicted transition is not recognized, expected errors")
+
+            # Finished applying operations; now build the dependency graph from conf.arcs
+
+            new_depgraph = deepcopy(depgraph)
+            for key in new_depgraph.nodes:
+                node = new_depgraph.nodes[key]
+                node['rel'] = ''
+                # By default, all tokens depend on the root
+                node['head'] = 0
+            for (head, rel, child) in conf.arcs:
+                c_node = new_depgraph.nodes[child]
+                c_node['head'] = head
+                c_node['rel'] = rel
+            result.append(new_depgraph)
+
+        return result
+
+
+def demo():
+    """
+    >>> from nltk.parse import DependencyGraph, DependencyEvaluator
+    >>> from nltk.parse.transitionparser import TransitionParser, Configuration, Transition
+    >>> gold_sent = DependencyGraph(\"""
+    ... Economic  JJ     2      ATT
+    ... news  NN     3       SBJ
+    ... has       VBD       0       ROOT
+    ... little      JJ      5       ATT
+    ... effect   NN     3       OBJ
+    ... on     IN      5       ATT
+    ... financial       JJ       8       ATT
+    ... markets    NNS      6       PC
+    ... .    .      3       PU
+    ... \""")
+
+    >>> conf = Configuration(gold_sent)
+
+    ###################### Check the Initial Feature ########################
+
+    >>> print(', '.join(conf.extract_features()))
+    STK_0_POS_TOP, BUF_0_FORM_Economic, BUF_0_LEMMA_Economic, BUF_0_POS_JJ, BUF_1_FORM_news, BUF_1_POS_NN, BUF_2_POS_VBD, BUF_3_POS_JJ
+
+    ###################### Check The Transition #######################
+    Check the Initialized Configuration
+    >>> print(conf)
+    Stack : [0]  Buffer : [1, 2, 3, 4, 5, 6, 7, 8, 9]   Arcs : []
+
+    A. Do some transition checks for ARC-STANDARD
+
+    >>> operation = Transition('arc-standard')
+    >>> operation.shift(conf)
+    >>> operation.left_arc(conf, "ATT")
+    >>> operation.shift(conf)
+    >>> operation.left_arc(conf,"SBJ")
+    >>> operation.shift(conf)
+    >>> operation.shift(conf)
+    >>> operation.left_arc(conf, "ATT")
+    >>> operation.shift(conf)
+    >>> operation.shift(conf)
+    >>> operation.shift(conf)
+    >>> operation.left_arc(conf, "ATT")
+
+    Middle Configuration and Features Check
+    >>> print(conf)
+    Stack : [0, 3, 5, 6]  Buffer : [8, 9]   Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7)]
+
+    >>> print(', '.join(conf.extract_features()))
+    STK_0_FORM_on, STK_0_LEMMA_on, STK_0_POS_IN, STK_1_POS_NN, BUF_0_FORM_markets, BUF_0_LEMMA_markets, BUF_0_POS_NNS, BUF_1_FORM_., BUF_1_POS_., BUF_0_LDEP_ATT
+
+    >>> operation.right_arc(conf, "PC")
+    >>> operation.right_arc(conf, "ATT")
+    >>> operation.right_arc(conf, "OBJ")
+    >>> operation.shift(conf)
+    >>> operation.right_arc(conf, "PU")
+    >>> operation.right_arc(conf, "ROOT")
+    >>> operation.shift(conf)
+
+    Terminated Configuration Check
+    >>> print(conf)
+    Stack : [0]  Buffer : []   Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7), (6, 'PC', 8), (5, 'ATT', 6), (3, 'OBJ', 5), (3, 'PU', 9), (0, 'ROOT', 3)]
+
+
+    B. Do some transition checks for ARC-EAGER
+
+    >>> conf = Configuration(gold_sent)
+    >>> operation = Transition('arc-eager')
+    >>> operation.shift(conf)
+    >>> operation.left_arc(conf,'ATT')
+    >>> operation.shift(conf)
+    >>> operation.left_arc(conf,'SBJ')
+    >>> operation.right_arc(conf,'ROOT')
+    >>> operation.shift(conf)
+    >>> operation.left_arc(conf,'ATT')
+    >>> operation.right_arc(conf,'OBJ')
+    >>> operation.right_arc(conf,'ATT')
+    >>> operation.shift(conf)
+    >>> operation.left_arc(conf,'ATT')
+    >>> operation.right_arc(conf,'PC')
+    >>> operation.reduce(conf)
+    >>> operation.reduce(conf)
+    >>> operation.reduce(conf)
+    >>> operation.right_arc(conf,'PU')
+    >>> print(conf)
+    Stack : [0, 3, 9]  Buffer : []   Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (0, 'ROOT', 3), (5, 'ATT', 4), (3, 'OBJ', 5), (5, 'ATT', 6), (8, 'ATT', 7), (6, 'PC', 8), (3, 'PU', 9)]
+
+    ###################### Check The Training Function #######################
+
+    A. Check the ARC-STANDARD training
+    >>> import tempfile
+    >>> import os
+    >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False)
+
+    >>> parser_std = TransitionParser('arc-standard')
+    >>> print(', '.join(parser_std._create_training_examples_arc_std([gold_sent], input_file)))
+     Number of training examples : 1
+     Number of valid (projective) examples : 1
+    SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, SHIFT, SHIFT, LEFTARC:ATT, SHIFT, SHIFT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, RIGHTARC:ATT, RIGHTARC:OBJ, SHIFT, RIGHTARC:PU, RIGHTARC:ROOT, SHIFT
+
+    >>> parser_std.train([gold_sent],'temp.arcstd.model', verbose=False)
+     Number of training examples : 1
+     Number of valid (projective) examples : 1
+    >>> remove(input_file.name)
+
+    B. Check the ARC-EAGER training
+
+    >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(),delete=False)
+    >>> parser_eager = TransitionParser('arc-eager')
+    >>> print(', '.join(parser_eager._create_training_examples_arc_eager([gold_sent], input_file)))
+     Number of training examples : 1
+     Number of valid (projective) examples : 1
+    SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, RIGHTARC:ROOT, SHIFT, LEFTARC:ATT, RIGHTARC:OBJ, RIGHTARC:ATT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, REDUCE, REDUCE, REDUCE, RIGHTARC:PU
+
+    >>> parser_eager.train([gold_sent],'temp.arceager.model', verbose=False)
+     Number of training examples : 1
+     Number of valid (projective) examples : 1
+
+    >>> remove(input_file.name)
+
+    ###################### Check The Parsing Function ########################
+
+    A. Check the ARC-STANDARD parser
+
+    >>> result = parser_std.parse([gold_sent], 'temp.arcstd.model')
+    >>> de = DependencyEvaluator(result, [gold_sent])
+    >>> de.eval() >= (0, 0)
+    True
+
+    B. Check the ARC-EAGER parser
+    >>> result = parser_eager.parse([gold_sent], 'temp.arceager.model')
+    >>> de = DependencyEvaluator(result, [gold_sent])
+    >>> de.eval() >= (0, 0)
+    True
+
+    Remove test temporary files
+    >>> remove('temp.arceager.model')
+    >>> remove('temp.arcstd.model')
+
+    Note that the result is very poor because there is only one training example.
+    """
+
diff --git a/nlp_resource_data/nltk/parse/transitionparser.pyc b/nlp_resource_data/nltk/parse/transitionparser.pyc
new file mode 100755 (executable)
index 0000000..98676f7
Binary files /dev/null and b/nlp_resource_data/nltk/parse/transitionparser.pyc differ
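For readers skimming this diff, the following is a minimal sketch of the train/parse workflow exposed by transitionparser.py above, restated outside of a doctest. It only uses calls that appear in the module's own doctest (TransitionParser, DependencyGraph, DependencyEvaluator); the CoNLL-style sentence string and the model filename are illustrative, and numpy, scipy and scikit-learn must be installed for training to work.

from nltk.parse import DependencyGraph, DependencyEvaluator
from nltk.parse.transitionparser import TransitionParser

# A single gold sentence in the word/tag/head/rel format accepted by DependencyGraph.
gold_sent = DependencyGraph("""
Economic   JJ   2  ATT
news       NN   3  SBJ
has        VBD  0  ROOT
little     JJ   5  ATT
effect     NN   3  OBJ
on         IN   5  ATT
financial  JJ   8  ATT
markets    NNS  6  PC
.          .    3  PU
""")

parser = TransitionParser('arc-eager')            # or 'arc-standard'
parser.train([gold_sent], 'temp.arceager.model', verbose=False)
parsed = parser.parse([gold_sent], 'temp.arceager.model')

# eval() returns a pair of attachment scores; with a single training
# sentence the numbers are meaningless, as the module itself notes.
print(DependencyEvaluator(parsed, [gold_sent]).eval())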
diff --git a/nlp_resource_data/nltk/parse/util.py b/nlp_resource_data/nltk/parse/util.py
new file mode 100755 (executable)
index 0000000..e8694b6
--- /dev/null
@@ -0,0 +1,231 @@
+# Natural Language Toolkit: Parser Utility Functions
+#
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+
+"""
+Utility functions for parsers.
+"""
+from __future__ import print_function
+
+from nltk.grammar import CFG, FeatureGrammar, PCFG
+from nltk.data import load
+
+from nltk.parse.chart import Chart, ChartParser
+from nltk.parse.pchart import InsideChartParser
+from nltk.parse.featurechart import FeatureChart, FeatureChartParser
+
+def load_parser(grammar_url, trace=0,
+                parser=None, chart_class=None,
+                beam_size=0, **load_args):
+    """
+    Load a grammar from a file, and build a parser based on that grammar.
+    The parser depends on the grammar format, and might also depend
+    on properties of the grammar itself.
+
+    The following grammar formats are currently supported:
+      - ``'cfg'``  (CFGs: ``CFG``)
+      - ``'pcfg'`` (probabilistic CFGs: ``PCFG``)
+      - ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``)
+
+    :type grammar_url: str
+    :param grammar_url: A URL specifying where the grammar is located.
+        The default protocol is ``"nltk:"``, which searches for the file
+        in the NLTK data package.
+    :type trace: int
+    :param trace: The level of tracing that should be used when
+        parsing a text.  ``0`` will generate no tracing output;
+        and higher numbers will produce more verbose tracing output.
+    :param parser: The class used for parsing; should be ``ChartParser``
+        or a subclass.
+        If None, the class depends on the grammar format.
+    :param chart_class: The class used for storing the chart;
+        should be ``Chart`` or a subclass.
+        Only used for CFGs and feature CFGs.
+        If None, the chart class depends on the grammar format.
+    :type beam_size: int
+    :param beam_size: The maximum length for the parser's edge queue.
+        Only used for probabilistic CFGs.
+    :param load_args: Keyword parameters used when loading the grammar.
+        See ``data.load`` for more information.
+    """
+    grammar = load(grammar_url, **load_args)
+    if not isinstance(grammar, CFG):
+        raise ValueError("The grammar must be a CFG, "
+                         "or a subclass thereof.")
+    if isinstance(grammar, PCFG):
+        if parser is None:
+            parser = InsideChartParser
+        return parser(grammar, trace=trace, beam_size=beam_size)
+
+    elif isinstance(grammar, FeatureGrammar):
+        if parser is None:
+            parser = FeatureChartParser
+        if chart_class is None:
+            chart_class = FeatureChart
+        return parser(grammar, trace=trace, chart_class=chart_class)
+
+    else: # Plain CFG.
+        if parser is None:
+            parser = ChartParser
+        if chart_class is None:
+            chart_class = Chart
+        return parser(grammar, trace=trace, chart_class=chart_class)
+
+def taggedsent_to_conll(sentence):
+       """
+       A function to convert a single POS-tagged sentence into CONLL format.
+       
+       >>> from nltk import word_tokenize, pos_tag
+       >>> text = "This is a foobar sentence."
+       >>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))):
+       ...     print(line, end="")
+        1      This    _       DT      DT      _       0       a       _       _
+        2      is      _       VBZ     VBZ     _       0       a       _       _
+        3      a       _       DT      DT      _       0       a       _       _
+        4      foobar  _       JJ      JJ      _       0       a       _       _
+        5      sentence        _       NN      NN      _       0       a       _       _
+        6      .               _       .       .       _       0       a       _       _
+       
+       :param sentence: A single input sentence to parse
+       :type sentence: list(tuple(str, str))
+       :rtype: iter(str) 
+       :return: a generator yielding a single sentence in CONLL format.
+       """
+       for (i, (word, tag)) in enumerate(sentence, start=1):
+               input_str = [str(i), word, '_', tag, tag, '_', '0', 'a', '_', '_']
+               input_str = "\t".join(input_str) + "\n"
+               yield input_str
+
+
+def taggedsents_to_conll(sentences):
+       """
+       A function to convert a POS-tagged document stream
+       (i.e. a list of sentences, each a list of (word, tag) tuples) into lines
+       in CONLL format. It yields one line per word and two newlines
+       at the end of each sentence.
+
+       >>> from nltk import word_tokenize, sent_tokenize, pos_tag
+       >>> text = "This is a foobar sentence. Is that right?"
+       >>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)]
+       >>> for line in taggedsents_to_conll(sentences):
+        ...     if line:
+       ...         print(line, end="")
+        1      This    _       DT      DT      _       0       a       _       _
+        2      is      _       VBZ     VBZ     _       0       a       _       _
+        3      a       _       DT      DT      _       0       a       _       _
+        4      foobar  _       JJ      JJ      _       0       a       _       _
+        5      sentence        _       NN      NN      _       0       a       _       _
+        6      .               _       .       .       _       0       a       _       _
+        <BLANKLINE>
+        <BLANKLINE>
+        1      Is      _       VBZ     VBZ     _       0       a       _       _
+        2      that    _       IN      IN      _       0       a       _       _
+        3      right   _       NN      NN      _       0       a       _       _
+        4      ?       _       .       .       _       0       a       _       _
+        <BLANKLINE>
+        <BLANKLINE>
+
+       :param sentences: Input sentences to parse
+       :type sentences: list(list(tuple(str, str)))
+       :rtype: iter(str) 
+       :return: a generator yielding sentences in CONLL format.
+       """
+       for sentence in sentences:
+               for input_str in taggedsent_to_conll(sentence):
+                       yield input_str
+               yield '\n\n'            
+
+######################################################################
+#{ Test Suites
+######################################################################
+
+class TestGrammar(object):
+    """
+    Unit tests for CFG.
+    """
+    def __init__(self, grammar, suite, accept=None, reject=None):
+        self.test_grammar = grammar
+
+        self.cp = load_parser(grammar, trace=0)
+        self.suite = suite
+        self._accept = accept
+        self._reject = reject
+
+
+    def run(self, show_trees=False):
+        """
+        Sentences in the test suite are divided into two classes:
+         - grammatical (``accept``) and
+         - ungrammatical (``reject``).
+        If a sentence should parse according to the grammar, the value of
+        ``trees`` will be a non-empty list. If a sentence should be rejected
+        according to the grammar, then the value of ``trees`` will be empty.
+        """
+        for test in self.suite:
+            accepted = rejected = False
+            print(test['doc'] + ":", end=' ')
+            for key in ['accept', 'reject']:
+                for sent in test[key]:
+                    tokens = sent.split()
+                    trees = list(self.cp.parse(tokens))
+                    if show_trees and trees:
+                        print()
+                        print(sent)
+                        for tree in trees:
+                            print(tree)
+                    if key == 'accept':
+                        if trees == []:
+                            raise ValueError("Sentence '%s' failed to parse'" % sent)
+                        else:
+                            accepted = True
+                    else:
+                        if trees:
+                            raise ValueError("Sentence '%s' received a parse'" % sent)
+                        else:
+                            rejected = True
+            if accepted and rejected:
+                print("All tests passed!")
+
+def extract_test_sentences(string, comment_chars="#%;", encoding=None):
+    """
+    Parses a string with one test sentence per line.
+    Lines can optionally begin with:
+      - a bool, saying if the sentence is grammatical or not, or
+      - an int, giving the number of parse trees it should have.
+    The result information is followed by a colon, and then the sentence.
+    Empty lines and lines beginning with a comment char are ignored.
+
+    :return: a list of (sentence, result) tuples,
+        where a sentence is a list of str,
+        and a result is None, bool, or int
+
+    :param comment_chars: ``str`` of possible comment characters.
+    :param encoding: the encoding of the string, if it is binary
+    """
+    if encoding is not None:
+        string = string.decode(encoding)
+    sentences = []
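+    # Each non-comment line is either "<bool or int>: <sentence>" or just "<sentence>".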
+    for sentence in string.split('\n'):
+        if sentence == '' or sentence[0] in comment_chars:
+            continue
+        split_info = sentence.split(':', 1)
+        result = None
+        if len(split_info) == 2:
+            if split_info[0] in ['True','true','False','false']:
+                result = split_info[0] in ['True','true']
+                sentence = split_info[1]
+            else:
+                result = int(split_info[0])
+                sentence = split_info[1]
+        tokens = sentence.split()
+        if tokens == []:
+            continue
+        sentences += [(tokens, result)]
+    return sentences
+
+# Prevent nose from collecting this helper as a test
+extract_test_sentences.__test__ = False
diff --git a/nlp_resource_data/nltk/parse/util.pyc b/nlp_resource_data/nltk/parse/util.pyc
new file mode 100755 (executable)
index 0000000..8564030
Binary files /dev/null and b/nlp_resource_data/nltk/parse/util.pyc differ
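As a quick illustration of the helpers in util.py above, the sketch below pushes two already-tagged sentences through taggedsents_to_conll and parses a small suite string with extract_test_sentences. The tagged tuples and the suite string are invented for the example; everything else is the module's own API.

from nltk.parse.util import taggedsents_to_conll, extract_test_sentences

# Pre-tagged input avoids any dependence on external tokenizer or tagger models.
tagged = [
    [('This', 'DT'), ('is', 'VBZ'), ('fine', 'JJ'), ('.', '.')],
    [('Is', 'VBZ'), ('that', 'IN'), ('right', 'JJ'), ('?', '.')],
]
for line in taggedsents_to_conll(tagged):
    print(line, end='')

suite = """
# True/False marks grammaticality; a number gives the expected parse count.
True: the dog barks
False: dog the barks
2: I saw the man with the telescope
"""
for tokens, expected in extract_test_sentences(suite):
    print(tokens, expected)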
diff --git a/nlp_resource_data/nltk/parse/viterbi.py b/nlp_resource_data/nltk/parse/viterbi.py
new file mode 100755 (executable)
index 0000000..dce5979
--- /dev/null
@@ -0,0 +1,401 @@
+# Natural Language Toolkit: Viterbi Probabilistic Parser
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+
+from functools import reduce
+from nltk.tree import Tree, ProbabilisticTree
+from nltk.compat import python_2_unicode_compatible
+
+from nltk.parse.api import ParserI
+
+##//////////////////////////////////////////////////////
+##  Viterbi PCFG Parser
+##//////////////////////////////////////////////////////
+
+@python_2_unicode_compatible
+class ViterbiParser(ParserI):
+    """
+    A bottom-up ``PCFG`` parser that uses dynamic programming to find
+    the single most likely parse for a text.  The ``ViterbiParser`` parser
+    parses texts by filling in a "most likely constituent table".
+    This table records the most probable tree representation for any
+    given span and node value.  In particular, it has an entry for
+    every start index, end index, and node value, recording the most
+    likely subtree that spans from the start index to the end index,
+    and has the given node value.
+
+    The ``ViterbiParser`` parser fills in this table incrementally.  It starts
+    by filling in all entries for constituents that span one element
+    of text (i.e., entries where the end index is one greater than the
+    start index).  After it has filled in all table entries for
+    constituents that span one element of text, it fills in the
+    entries for constituents that span two elements of text.  It
+    continues filling in the entries for constituents spanning larger
+    and larger portions of the text, until the entire table has been
+    filled.  Finally, it returns the table entry for a constituent
+    spanning the entire text, whose node value is the grammar's start
+    symbol.
+
+    In order to find the most likely constituent with a given span and
+    node value, the ``ViterbiParser`` parser considers all productions that
+    could produce that node value.  For each production, it finds all
+    children that collectively cover the span and have the node values
+    specified by the production's right hand side.  If the probability
+    of the tree formed by applying the production to the children is
+    greater than the probability of the current entry in the table,
+    then the table is updated with this new tree.
+
+    A pseudo-code description of the algorithm used by
+    ``ViterbiParser`` is:
+
+    | Create an empty most likely constituent table, *MLC*.
+    | For width in 1...len(text):
+    |   For start in 1...len(text)-width:
+    |     For prod in grammar.productions:
+    |       For each sequence of subtrees [t[1], t[2], ..., t[n]] in MLC,
+    |         where t[i].label()==prod.rhs[i],
+    |         and the sequence covers [start:start+width]:
+    |           old_p = MLC[start, start+width, prod.lhs]
+    |           new_p = P(t[1])P(t[2])...P(t[n])P(prod)
+    |           if new_p > old_p:
+    |             new_tree = Tree(prod.lhs, t[1], t[2], ..., t[n])
+    |             MLC[start, start+width, prod.lhs] = new_tree
+    | Return MLC[0, len(text), start_symbol]
+
+    :type _grammar: PCFG
+    :ivar _grammar: The grammar used to parse sentences.
+    :type _trace: int
+    :ivar _trace: The level of tracing output that should be generated
+        when parsing a text.
+    """
+    def __init__(self, grammar, trace=0):
+        """
+        Create a new ``ViterbiParser`` parser, that uses ``grammar`` to
+        parse texts.
+
+        :type grammar: PCFG
+        :param grammar: The grammar used to parse texts.
+        :type trace: int
+        :param trace: The level of tracing that should be used when
+            parsing a text.  ``0`` will generate no tracing output;
+            and higher numbers will produce more verbose tracing
+            output.
+        """
+        self._grammar = grammar
+        self._trace = trace
+
+    def grammar(self):
+        return self._grammar
+
+    def trace(self, trace=2):
+        """
+        Set the level of tracing output that should be generated when
+        parsing a text.
+
+        :type trace: int
+        :param trace: The trace level.  A trace level of ``0`` will
+            generate no tracing output; and higher trace levels will
+            produce more verbose tracing output.
+        :rtype: None
+        """
+        self._trace = trace
+
+    def parse(self, tokens):
+        # Inherit docs from ParserI
+
+        tokens = list(tokens)
+        self._grammar.check_coverage(tokens)
+
+        # The most likely constituent table.  This table specifies the
+        # most likely constituent for a given span and type.
+        # Constituents can be either Trees or tokens.  For Trees,
+        # the "type" is the Nonterminal for the tree's root node
+        # value.  For Tokens, the "type" is the token's type.
+        # The table is stored as a dictionary, since it is sparse.
+        constituents = {}
+
+        # Initialize the constituents dictionary with the words from
+        # the text.
+        if self._trace: print(('Inserting tokens into the most likely'+
+                               ' constituents table...'))
+        for index in range(len(tokens)):
+            token = tokens[index]
+            constituents[index,index+1,token] = token
+            if self._trace > 1:
+                self._trace_lexical_insertion(token, index, len(tokens))
+
+        # Consider each span of length 1, 2, ..., n; and add any trees
+        # that might cover that span to the constituents dictionary.
+        for length in range(1, len(tokens)+1):
+            if self._trace:
+                print(('Finding the most likely constituents'+
+                       ' spanning %d text elements...' % length))
+            for start in range(len(tokens)-length+1):
+                span = (start, start+length)
+                self._add_constituents_spanning(span, constituents,
+                                                tokens)
+
+        # Return the tree that spans the entire text and has the right category
+        tree = constituents.get((0, len(tokens), self._grammar.start()))
+        if tree is not None:
+            yield tree
+
+    def _add_constituents_spanning(self, span, constituents, tokens):
+        """
+        Find any constituents that might cover ``span``, and add them
+        to the most likely constituents table.
+
+        :rtype: None
+        :type span: tuple(int, int)
+        :param span: The section of the text for which we are
+            trying to find possible constituents.  The span is
+            specified as a pair of integers, where the first integer
+            is the index of the first token that should be included in
+            the constituent; and the second integer is the index of
+            the first token that should not be included in the
+            constituent.  I.e., the constituent should cover
+            ``text[span[0]:span[1]]``, where ``text`` is the text
+            that we are parsing.
+
+        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
+        :param constituents: The most likely constituents table.  This
+            table records the most probable tree representation for
+            any given span and node value.  In particular,
+            ``constituents(s,e,nv)`` is the most likely
+            ``ProbabilisticTree`` that covers ``text[s:e]``
+            and has a node value ``nv.symbol()``, where ``text``
+            is the text that we are parsing.  When
+            ``_add_constituents_spanning`` is called, ``constituents``
+            should contain all possible constituents that are shorter
+            than ``span``.
+
+        :type tokens: list of tokens
+        :param tokens: The text we are parsing.  This is only used for
+            trace output.
+        """
+        # Since some of the grammar productions may be unary, we need to
+        # repeatedly try all of the productions until none of them add any
+        # new constituents.
+        changed = True
+        while changed:
+            changed = False
+
+            # Find all instantiations of the grammar productions that
+            # cover the span.
+            instantiations = self._find_instantiations(span, constituents)
+
+            # For each production instantiation, add a new
+            # ProbabilisticTree whose probability is the product
+            # of the children's probabilities and the production's
+            # probability.
+            for (production, children) in instantiations:
+                subtrees = [c for c in children if isinstance(c, Tree)]
+                p = reduce(lambda pr,t:pr*t.prob(),
+                           subtrees, production.prob())
+                node = production.lhs().symbol()
+                tree = ProbabilisticTree(node, children, prob=p)
+
+                # If it's a new constituent, then add it to the
+                # constituents dictionary.
+                c = constituents.get((span[0], span[1], production.lhs()))
+                if self._trace > 1:
+                    if c is None or c != tree:
+                        if c is None or c.prob() < tree.prob():
+                            print('   Insert:', end=' ')
+                        else:
+                            print('  Discard:', end=' ')
+                        self._trace_production(production, p, span, len(tokens))
+                if c is None or c.prob() < tree.prob():
+                    constituents[span[0], span[1], production.lhs()] = tree
+                    changed = True
+
+    def _find_instantiations(self, span, constituents):
+        """
+        :return: a list of the production instantiations that cover a
+            given span of the text.  A "production instantiation" is
+            a tuple containing a production and a list of children,
+            where the production's right hand side matches the list of
+            children; and the children cover ``span``.
+        :rtype: list(tuple(Production, list(ProbabilisticTree or token)))
+
+        :type span: tuple(int, int)
+        :param span: The section of the text for which we are
+            trying to find production instantiations.  The span is
+            specified as a pair of integers, where the first integer
+            is the index of the first token that should be covered by
+            the production instantiation; and the second integer is
+            the index of the first token that should not be covered by
+            the production instantiation.
+        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
+        :param constituents: The most likely constituents table.  This
+            table records the most probable tree representation for
+            any given span and node value.  See the module
+            documentation for more information.
+        """
+        rv = []
+        for production in self._grammar.productions():
+            childlists = self._match_rhs(production.rhs(), span, constituents)
+
+            for childlist in childlists:
+                rv.append( (production, childlist) )
+        return rv
+
+    def _match_rhs(self, rhs, span, constituents):
+        """
+        :return: a set of all the lists of children that cover ``span``
+            and that match ``rhs``.
+        :rtype: list(list(ProbabilisticTree or token))
+
+        :type rhs: list(Nonterminal or any)
+        :param rhs: The list specifying what kinds of children need to
+            cover ``span``.  Each nonterminal in ``rhs`` specifies
+            that the corresponding child should be a tree whose node
+            value is that nonterminal's symbol.  Each terminal in ``rhs``
+            specifies that the corresponding child should be a token
+            whose type is that terminal.
+        :type span: tuple(int, int)
+        :param span: The section of the text for which we are
+            trying to find child lists.  The span is specified as a
+            pair of integers, where the first integer is the index of
+            the first token that should be covered by the child list;
+            and the second integer is the index of the first token
+            that should not be covered by the child list.
+        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
+        :param constituents: The most likely constituents table.  This
+            table records the most probable tree representation for
+            any given span and node value.  See the module
+            documentation for more information.
+        """
+        (start, end) = span
+
+        # Base case
+        if start >= end and rhs == (): return [[]]
+        if start >= end or rhs == (): return []
+
+        # Find everything that matches the 1st symbol of the RHS
+        childlists = []
+        for split in range(start, end+1):
+            l=constituents.get((start,split,rhs[0]))
+            if l is not None:
+                rights = self._match_rhs(rhs[1:], (split,end), constituents)
+                childlists += [[l]+r for r in rights]
+
+        return childlists
+
+    def _trace_production(self, production, p, span, width):
+        """
+        Print trace output indicating that a given production has been
+        applied at a given location.
+
+        :param production: The production that has been applied
+        :type production: Production
+        :param p: The probability of the tree produced by the production.
+        :type p: float
+        :param span: The span of the production
+        :type span: tuple
+        :rtype: None
+        """
+
+        str = '|' + '.' * span[0]
+        str += '=' * (span[1] - span[0])
+        str += '.' * (width - span[1]) + '| '
+        str += '%s' % production
+        if self._trace > 2: str = '%-40s %12.10f ' % (str, p)
+
+        print(str)
+
+    def _trace_lexical_insertion(self, token, index, width):
+        str = '   Insert: |' + '.' * index + '=' + '.' * (width-index-1) + '| '
+        str += '%s' % (token,)
+        print(str)
+
+    def __repr__(self):
+        return '<ViterbiParser for %r>' % self._grammar
+
+
+##//////////////////////////////////////////////////////
+##  Test Code
+##//////////////////////////////////////////////////////
+
+def demo():
+    """
+    A demonstration of the probabilistic parser.  The user is
+    prompted to select which demo to run; the parser is then run on
+    that demo, and a summary of the results is displayed.
+    """
+    import sys, time
+    from nltk import tokenize
+    from nltk.parse import ViterbiParser
+    from nltk.grammar import toy_pcfg1, toy_pcfg2
+
+    # Define two demos.  Each demo has a sentence and a grammar.
+    demos = [('I saw the man with my telescope', toy_pcfg1),
+             ('the boy saw Jack with Bob under the table with a telescope', toy_pcfg2)]
+
+    # Ask the user which demo they want to use.
+    print()
+    for i in range(len(demos)):
+        print('%3s: %s' % (i+1, demos[i][0]))
+        print('     %r' % demos[i][1])
+        print()
+    print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
+    try:
+        snum = int(sys.stdin.readline().strip())-1
+        sent, grammar = demos[snum]
+    except:
+        print('Bad sentence number')
+        return
+
+    # Tokenize the sentence.
+    tokens = sent.split()
+
+    parser = ViterbiParser(grammar)
+    all_parses = {}
+
+    print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent,parser,grammar))
+    parser.trace(3)
+    t = time.time()
+    parses = parser.parse_all(tokens)
+    time = time.time()-t
+    average = (reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
+               if parses else 0)
+    num_parses = len(parses)
+    for p in parses:
+        all_parses[p.freeze()] = 1
+
+    # Print some summary statistics
+    print()
+    print('Time (secs)   # Parses   Average P(parse)')
+    print('-----------------------------------------')
+    print('%11.4f%11d%19.14f' % (time, num_parses, average))
+    parses = all_parses.keys()
+    if parses:
+        p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
+    else: p = 0
+    print('------------------------------------------')
+    print('%11s%11d%19.14f' % ('n/a', len(parses), p))
+
+    # Ask the user if we should draw the parses.
+    print()
+    print('Draw parses (y/n)? ', end=' ')
+    if sys.stdin.readline().strip().lower().startswith('y'):
+        from nltk.draw.tree import draw_trees
+        print('  please wait...')
+        draw_trees(*parses)
+
+    # Ask the user if we should print the parses.
+    print()
+    print('Print parses (y/n)? ', end=' ')
+    if sys.stdin.readline().strip().lower().startswith('y'):
+        for parse in parses:
+            print(parse)
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/parse/viterbi.pyc b/nlp_resource_data/nltk/parse/viterbi.pyc
new file mode 100755 (executable)
index 0000000..a89c0d9
Binary files /dev/null and b/nlp_resource_data/nltk/parse/viterbi.pyc differ
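The demo() above is interactive; the snippet below is a minimal non-interactive sketch of the same ViterbiParser usage, reusing the toy_pcfg1 grammar and the first demo sentence from viterbi.py. Nothing here goes beyond the module's own API.

from nltk.grammar import toy_pcfg1
from nltk.parse.viterbi import ViterbiParser

tokens = 'I saw the man with my telescope'.split()

parser = ViterbiParser(toy_pcfg1)
# parse() yields at most one tree: the single most likely parse.
for tree in parser.parse(tokens):
    print(tree.prob())
    print(tree)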
diff --git a/nlp_resource_data/nltk/probability.py b/nlp_resource_data/nltk/probability.py
new file mode 100755 (executable)
index 0000000..0528c2b
--- /dev/null
@@ -0,0 +1,2401 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Probability and Statistics
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com> (additions)
+#         Trevor Cohn <tacohn@cs.mu.oz.au> (additions)
+#         Peter Ljunglöf <peter.ljunglof@heatherleaf.se> (additions)
+#         Liang Dong <ldong@clemson.edu> (additions)
+#         Geoffrey Sampson <sampson@cantab.net> (additions)
+#         Ilia Kurenkov <ilia.kurenkov@gmail.com> (additions)
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Classes for representing and processing probabilistic information.
+
+The ``FreqDist`` class is used to encode "frequency distributions",
+which count the number of times that each outcome of an experiment
+occurs.
+
+The ``ProbDistI`` class defines a standard interface for "probability
+distributions", which encode the probability of each outcome for an
+experiment.  There are two types of probability distribution:
+
+  - "derived probability distributions" are created from frequency
+    distributions.  They attempt to model the probability distribution
+    that generated the frequency distribution.
+  - "analytic probability distributions" are created directly from
+    parameters (such as variance).
+
+The ``ConditionalFreqDist`` class and ``ConditionalProbDistI`` interface
+are used to encode conditional distributions.  Conditional probability
+distributions can be derived or analytic; but currently the only
+implementation of the ``ConditionalProbDistI`` interface is
+``ConditionalProbDist``, a derived distribution.
+
+"""
+from __future__ import print_function, unicode_literals, division
+
+import math
+import random
+import warnings
+import array
+from operator import itemgetter
+from collections import defaultdict, Counter
+from functools import reduce
+from abc import ABCMeta, abstractmethod
+
+from six import itervalues, text_type, add_metaclass
+
+from nltk import compat
+from nltk.internals import raise_unorderable_types
+
+_NINF = float('-1e300')
+
+##//////////////////////////////////////////////////////
+##  Frequency Distributions
+##//////////////////////////////////////////////////////
+
+@compat.python_2_unicode_compatible
+class FreqDist(Counter):
+    """
+    A frequency distribution for the outcomes of an experiment.  A
+    frequency distribution records the number of times each outcome of
+    an experiment has occurred.  For example, a frequency distribution
+    could be used to record the frequency of each word type in a
+    document.  Formally, a frequency distribution can be defined as a
+    function mapping from each sample to the number of times that
+    sample occurred as an outcome.
+
+    Frequency distributions are generally constructed by running a
+    number of experiments, and incrementing the count for a sample
+    every time it is an outcome of an experiment.  For example, the
+    following code will produce a frequency distribution that encodes
+    how often each word occurs in a text:
+
+        >>> from nltk.tokenize import word_tokenize
+        >>> from nltk.probability import FreqDist
+        >>> sent = 'This is an example sentence'
+        >>> fdist = FreqDist()
+        >>> for word in word_tokenize(sent):
+        ...    fdist[word.lower()] += 1
+
+    An equivalent way to do this is with the initializer:
+
+        >>> fdist = FreqDist(word.lower() for word in word_tokenize(sent))
+
+    """
+
+    def __init__(self, samples=None):
+        """
+        Construct a new frequency distribution.  If ``samples`` is
+        given, then the frequency distribution will be initialized
+        with the count of each object in ``samples``; otherwise, it
+        will be initialized to be empty.
+
+        In particular, ``FreqDist()`` returns an empty frequency
+        distribution; and ``FreqDist(samples)`` first creates an empty
+        frequency distribution, and then calls ``update`` with the
+        list ``samples``.
+
+        :param samples: The samples to initialize the frequency
+            distribution with.
+        :type samples: Sequence
+        """
+        Counter.__init__(self, samples)
+
+        # Cached number of samples in this FreqDist
+        self._N = None
+
+    def N(self):
+        """
+        Return the total number of sample outcomes that have been
+        recorded by this FreqDist.  For the number of unique
+        sample values (or bins) with counts greater than zero, use
+        ``FreqDist.B()``.
+
+        :rtype: int
+        """
+        if self._N is None:
+            # Not already cached, or cache has been invalidated
+            self._N = sum(self.values())
+        return self._N
+
+    def __setitem__(self, key, val):
+        """
+        Override ``Counter.__setitem__()`` to invalidate the cached N
+        """
+        self._N = None
+        super(FreqDist, self).__setitem__(key, val)
+
+    def __delitem__(self, key):
+        """
+        Override ``Counter.__delitem__()`` to invalidate the cached N
+        """
+        self._N = None
+        super(FreqDist, self).__delitem__(key)
+
+    def update(self, *args, **kwargs):
+        """
+        Override ``Counter.update()`` to invalidate the cached N
+        """
+        self._N = None
+        super(FreqDist, self).update(*args, **kwargs)
+
+    def setdefault(self, key, val):
+        """
+        Override ``Counter.setdefault()`` to invalidate the cached N
+        """
+        self._N = None
+        super(FreqDist, self).setdefault(key, val)
+
+    def B(self):
+        """
+        Return the total number of sample values (or "bins") that
+        have counts greater than zero.  For the total
+        number of sample outcomes recorded, use ``FreqDist.N()``.
+        (FreqDist.B() is the same as len(FreqDist).)
+
+        :rtype: int
+        """
+        return len(self)
+
+    def hapaxes(self):
+        """
+        Return a list of all samples that occur once (hapax legomena)
+
+        :rtype: list
+        """
+        return [item for item in self if self[item] == 1]
+
+
+    def Nr(self, r, bins=None):
+        return self.r_Nr(bins)[r]
+
+    def r_Nr(self, bins=None):
+        """
+        Return the dictionary mapping r to Nr, the number of samples with frequency r, where Nr > 0.
+
+        :type bins: int
+        :param bins: The number of possible sample outcomes.  ``bins``
+            is used to calculate Nr(0).  In particular, Nr(0) is
+            ``bins-self.B()``.  If ``bins`` is not specified, it
+            defaults to ``self.B()`` (so Nr(0) will be 0).
+        :rtype: dict
+        """
+
+        _r_Nr = defaultdict(int)
+        for count in self.values():
+            _r_Nr[count] += 1
+
+        # Special case for Nr[0]:
+        _r_Nr[0] = bins - self.B() if bins is not None else 0
+
+        return _r_Nr
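+
+    # Illustrative sketch: for FreqDist('aabbbc') the counts are
+    # {'a': 2, 'b': 3, 'c': 1}, so r_Nr() gives Nr(1)=1, Nr(2)=1 and Nr(3)=1;
+    # with bins=5, Nr(0) == 2, since two of the five possible bins were
+    # never observed.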
+
+    def _cumulative_frequencies(self, samples):
+        """
+        Return the cumulative frequencies of the specified samples.
+        If no samples are specified, all counts are returned, starting
+        with the largest.
+
+        :param samples: the samples whose frequencies should be returned.
+        :type samples: any
+        :rtype: list(float)
+        """
+        cf = 0.0
+        for sample in samples:
+            cf += self[sample]
+            yield cf
+
+    # Note on nomenclature: although FreqDist stores counts and ProbDist stores
+    # probabilities, freq() returns a probability (a relative frequency).
+    def freq(self, sample):
+        """
+        Return the frequency of a given sample.  The frequency of a
+        sample is defined as the count of that sample divided by the
+        total number of sample outcomes that have been recorded by
+        this FreqDist.  The count of a sample is defined as the
+        number of times that sample outcome was recorded by this
+        FreqDist.  Frequencies are always real numbers in the range
+        [0, 1].
+
+        :param sample: the sample whose frequency
+               should be returned.
+        :type sample: any
+        :rtype: float
+        """
+        n = self.N()
+        if n == 0:
+            return 0
+        return self[sample] / n
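+
+    # Quick illustrative example: FreqDist('aab').freq('a') is 2/3, since 'a'
+    # accounts for two of the three recorded outcomes.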
+
+    def max(self):
+        """
+        Return the sample with the greatest number of outcomes in this
+        frequency distribution.  If two or more samples have the same
+        number of outcomes, return one of them; which sample is
+        returned is undefined.  If no outcomes have occurred in this
+        frequency distribution, a ValueError is raised.
+
+        :return: The sample with the maximum number of outcomes in this
+                frequency distribution.
+        :rtype: any
+        """
+        if len(self) == 0:
+            raise ValueError('A FreqDist must have at least one sample before max is defined.')
+        return self.most_common(1)[0][0]
+
+    def plot(self, *args, **kwargs):
+        """
+        Plot samples from the frequency distribution
+        displaying the most frequent sample first.  If an integer
+        parameter is supplied, stop after this many samples have been
+        plotted.  For a cumulative plot, specify cumulative=True.
+        (Requires Matplotlib to be installed.)
+
+        :param title: The title for the graph
+        :type title: str
+        :param cumulative: A flag to specify whether the plot is cumulative (default = False)
+        :type cumulative: bool
+        """
+        try:
+            from matplotlib import pylab
+        except ImportError:
+            raise ValueError('The plot function requires matplotlib to be installed. '
+                         'See http://matplotlib.org/')
+
+        if len(args) == 0:
+            args = [len(self)]
+        samples = [item for item, _ in self.most_common(*args)]
+
+        cumulative = _get_kwarg(kwargs, 'cumulative', False)
+        if cumulative:
+            freqs = list(self._cumulative_frequencies(samples))
+            ylabel = "Cumulative Counts"
+        else:
+            freqs = [self[sample] for sample in samples]
+            ylabel = "Counts"
+        # percents = [f * 100 for f in freqs]  only in ProbDist?
+
+        pylab.grid(True, color="silver")
+        if not "linewidth" in kwargs:
+            kwargs["linewidth"] = 2
+        if "title" in kwargs:
+            pylab.title(kwargs["title"])
+            del kwargs["title"]
+        pylab.plot(freqs, **kwargs)
+        pylab.xticks(range(len(samples)), [text_type(s) for s in samples], rotation=90)
+        pylab.xlabel("Samples")
+        pylab.ylabel(ylabel)
+        pylab.show()
+
+    def tabulate(self, *args, **kwargs):
+        """
+        Tabulate the given samples from the frequency distribution,
+        displaying the most frequent sample first.  If an integer
+        parameter is supplied, stop after this many samples have been
+        tabulated.
+
+        :param samples: The samples to tabulate (default is all samples)
+        :type samples: list
+        :param cumulative: A flag to specify whether the freqs are cumulative (default = False)
+        :type cumulative: bool
+        """
+        if len(args) == 0:
+            args = [len(self)]
+        samples = [item for item, _ in self.most_common(*args)]
+
+        cumulative = _get_kwarg(kwargs, 'cumulative', False)
+        if cumulative:
+            freqs = list(self._cumulative_frequencies(samples))
+        else:
+            freqs = [self[sample] for sample in samples]
+        # percents = [f * 100 for f in freqs]  only in ProbDist?
+
+        width = max(len("%s" % s) for s in samples)
+        width = max(width, max(len("%d" % f) for f in freqs))
+
+        for i in range(len(samples)):
+            print("%*s" % (width, samples[i]), end=' ')
+        print()
+        for i in range(len(samples)):
+            print("%*d" % (width, freqs[i]), end=' ')
+        print()
+
+    def copy(self):
+        """
+        Create a copy of this frequency distribution.
+
+        :rtype: FreqDist
+        """
+        return self.__class__(self)
+
+    # Mathematical operators
+
+    def __add__(self, other):
+        """
+        Add counts from two counters.
+
+        >>> FreqDist('abbb') + FreqDist('bcc')
+        FreqDist({'b': 4, 'c': 2, 'a': 1})
+
+        """
+        return self.__class__(super(FreqDist, self).__add__(other))
+
+    def __sub__(self, other):
+        """
+        Subtract counts, keeping only results with positive counts.
+
+        >>> FreqDist('abbbc') - FreqDist('bccd')
+        FreqDist({'b': 2, 'a': 1})
+
+        """
+        return self.__class__(super(FreqDist, self).__sub__(other))
+
+    def __or__(self, other):
+        """
+        Union is the maximum of value in either of the input counters.
+
+        >>> FreqDist('abbb') | FreqDist('bcc')
+        FreqDist({'b': 3, 'c': 2, 'a': 1})
+
+        """
+        return self.__class__(super(FreqDist, self).__or__(other))
+
+    def __and__(self, other):
+        """
+        Intersection is the minimum of corresponding counts.
+
+        >>> FreqDist('abbb') & FreqDist('bcc')
+        FreqDist({'b': 1})
+
+        """
+        return self.__class__(super(FreqDist, self).__and__(other))
+
+    def __le__(self, other):
+        if not isinstance(other, FreqDist):
+            raise_unorderable_types("<=", self, other)
+        return set(self).issubset(other) and all(self[key] <= other[key] for key in self)
+
+    def __ge__(self, other):
+        if not isinstance(other, FreqDist):
+            raise_unorderable_types(">=", self, other)
+        return set(other).issubset(self) and all(other[key] <= self[key] for key in other)
+
+    # @total_ordering doesn't work here, since the class inherits from a builtin class
+    __lt__ = lambda self, other: self <= other and not self == other
+    __gt__ = lambda self, other: self >= other and not self == other
+
+    def __repr__(self):
+        """
+        Return a string representation of this FreqDist.
+
+        :rtype: string
+        """
+        return self.pformat()
+
+    def pprint(self, maxlen=10, stream=None):
+        """
+        Print a string representation of this FreqDist to 'stream'
+
+        :param maxlen: The maximum number of items to print
+        :type maxlen: int
+        :param stream: The stream to print to. stdout by default
+        """
+        print(self.pformat(maxlen=maxlen), file=stream)
+
+    def pformat(self, maxlen=10):
+        """
+        Return a string representation of this FreqDist.
+
+        :param maxlen: The maximum number of items to display
+        :type maxlen: int
+        :rtype: string
+        """
+        items = ['{0!r}: {1!r}'.format(*item) for item in self.most_common(maxlen)]
+        if len(self) > maxlen:
+            items.append('...')
+        return 'FreqDist({{{0}}})'.format(', '.join(items))
+
+    def __str__(self):
+        """
+        Return a string representation of this FreqDist.
+
+        :rtype: string
+        """
+        return '<FreqDist with %d samples and %d outcomes>' % (len(self), self.N())
+
+
+##//////////////////////////////////////////////////////
+##  Probability Distributions
+##//////////////////////////////////////////////////////
+
+@add_metaclass(ABCMeta)
+class ProbDistI(object):
+    """
+    A probability distribution for the outcomes of an experiment.  A
+    probability distribution specifies how likely it is that an
+    experiment will have any given outcome.  For example, a
+    probability distribution could be used to predict the probability
+    that a token in a document will have a given type.  Formally, a
+    probability distribution can be defined as a function mapping from
+    samples to nonnegative real numbers, such that the sum of every
+    number in the function's range is 1.0.  A ``ProbDist`` is often
+    used to model the probability distribution of the experiment used
+    to generate a frequency distribution.
+    """
+    SUM_TO_ONE = True
+    """True if the probabilities of the samples in this probability
+       distribution will always sum to one."""
+
+    @abstractmethod
+    def __init__(self):
+        """
+        Classes inheriting from ProbDistI should implement __init__.
+        """
+
+    @abstractmethod
+    def prob(self, sample):
+        """
+        Return the probability for a given sample.  Probabilities
+        are always real numbers in the range [0, 1].
+
+        :param sample: The sample whose probability
+               should be returned.
+        :type sample: any
+        :rtype: float
+        """
+
+    def logprob(self, sample):
+        """
+        Return the base 2 logarithm of the probability for a given sample.
+
+        :param sample: The sample whose probability
+               should be returned.
+        :type sample: any
+        :rtype: float
+        """
+        # Default definition, in terms of prob()
+        p = self.prob(sample)
+        return (math.log(p, 2) if p != 0 else _NINF)
+
+    @abstractmethod
+    def max(self):
+        """
+        Return the sample with the greatest probability.  If two or
+        more samples have the same probability, return one of them;
+        which sample is returned is undefined.
+
+        :rtype: any
+        """
+
+    @abstractmethod
+    def samples(self):
+        """
+        Return a list of all samples that have nonzero probabilities.
+        Use ``prob`` to find the probability of each sample.
+
+        :rtype: list
+        """
+
+    # cf self.SUM_TO_ONE
+    def discount(self):
+        """
+        Return the ratio by which counts are discounted on average: c*/c
+
+        :rtype: float
+        """
+        return 0.0
+
+    # Subclasses should define more efficient implementations of this,
+    # where possible.
+    def generate(self):
+        """
+        Return a randomly selected sample from this probability distribution.
+        The probability of returning each sample ``samp`` is equal to
+        ``self.prob(samp)``.
+        """
+        p = random.random()
+        p_init = p
+        for sample in self.samples():
+            p -= self.prob(sample)
+            if p <= 0: return sample
+        # allow for some rounding error:
+        if p < .0001:
+            return sample
+        # we *should* never get here
+        if self.SUM_TO_ONE:
+            warnings.warn("Probability distribution %r sums to %r; generate()"
+                          " is returning an arbitrary sample." % (self, p_init-p))
+        return random.choice(list(self.samples()))
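+
+    # Illustrative expectation (a sketch, not a guarantee for any single run):
+    # for MLEProbDist(FreqDist('aab')), repeated calls to generate() should
+    # return 'a' roughly two thirds of the time and 'b' otherwise.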
+
+
+@compat.python_2_unicode_compatible
+class UniformProbDist(ProbDistI):
+    """
+    A probability distribution that assigns equal probability to each
+    sample in a given set; and a zero probability to all other
+    samples.
+    """
+    def __init__(self, samples):
+        """
+        Construct a new uniform probability distribution, that assigns
+        equal probability to each sample in ``samples``.
+
+        :param samples: The samples that should be given uniform
+            probability.
+        :type samples: list
+        :raise ValueError: If ``samples`` is empty.
+        """
+        if len(samples) == 0:
+            raise ValueError('A Uniform probability distribution must '+
+                             'have at least one sample.')
+        self._sampleset = set(samples)
+        self._prob = 1.0/len(self._sampleset)
+        self._samples = list(self._sampleset)
+
+    def prob(self, sample):
+        return (self._prob if sample in self._sampleset else 0)
+
+    def max(self):
+        return self._samples[0]
+
+    def samples(self):
+        return self._samples
+
+    def __repr__(self):
+        return '<UniformProbDist with %d samples>' % len(self._sampleset)
+
+
+@compat.python_2_unicode_compatible
+class RandomProbDist(ProbDistI):
+    """
+    Generates a random probability distribution over the given samples:
+    each sample is assigned a probability drawn uniformly at random from
+    (0, 1), and the probabilities are then normalized so that they sum to one.
+    """
+    def __init__(self, samples):
+        if len(samples) == 0:
+            raise ValueError('A probability distribution must '+
+                             'have at least one sample.')
+        self._probs = self.unirand(samples)
+        self._samples = list(self._probs.keys())
+
+    @classmethod
+    def unirand(cls, samples):
+        """
+        The key function that creates a randomized initial distribution
+        that still sums to 1.  The result is returned as a dictionary of
+        probability values, so that it can be passed to MutableProbDist
+        and used with the same syntax as UniformProbDist.
+        """
+        samples = set(samples)
+        randrow = [random.random() for i in range(len(samples))]
+        total = sum(randrow)
+        for i, x in enumerate(randrow):
+            randrow[i] = x/total
+
+        total = sum(randrow)
+        if total != 1:
+            # Any residual difference is a tiny floating-point rounding error,
+            # so it can be subtracted from one element without pushing any
+            # probability outside the range (0, 1).
+            randrow[-1] -= total - 1
+
+        return dict((s, randrow[i]) for i, s in enumerate(samples))
+
+    def prob(self, sample):
+        return self._probs.get(sample, 0)
+
+    def samples(self):
+        return self._samples
+
+    def __repr__(self):
+        return '<RandomUniformProbDist with %d samples>' %len(self._probs)
+
+
+@compat.python_2_unicode_compatible
+class DictionaryProbDist(ProbDistI):
+    """
+    A probability distribution whose probabilities are directly
+    specified by a given dictionary.  The given dictionary maps
+    samples to probabilities.
+    """
+    def __init__(self, prob_dict=None, log=False, normalize=False):
+        """
+        Construct a new probability distribution from the given
+        dictionary, which maps values to probabilities (or to log
+        probabilities, if ``log`` is true).  If ``normalize`` is
+        true, then the probability values are scaled by a constant
+        factor such that they sum to 1.
+
+        If called without arguments, the resulting probability
+        distribution assigns zero probability to all values.
+        """
+
+        self._prob_dict = (prob_dict.copy() if prob_dict is not None else {})
+        self._log = log
+
+        # Normalize the distribution, if requested.
+        if normalize:
+            if len(prob_dict) == 0:
+                raise ValueError('A DictionaryProbDist must have at least one sample ' +
+                             'before it can be normalized.')
+            if log:
+                value_sum = sum_logs(list(self._prob_dict.values()))
+                if value_sum <= _NINF:
+                    logp = math.log(1.0/len(prob_dict), 2)
+                    for x in prob_dict:
+                        self._prob_dict[x] = logp
+                else:
+                    for (x, p) in self._prob_dict.items():
+                        self._prob_dict[x] -= value_sum
+            else:
+                value_sum = sum(self._prob_dict.values())
+                if value_sum == 0:
+                    p = 1.0/len(prob_dict)
+                    for x in prob_dict:
+                        self._prob_dict[x] = p
+                else:
+                    norm_factor = 1.0/value_sum
+                    for (x, p) in self._prob_dict.items():
+                        self._prob_dict[x] *= norm_factor
+
+    def prob(self, sample):
+        if self._log:
+            return (2**(self._prob_dict[sample]) if sample in self._prob_dict else 0)
+        else:
+            return self._prob_dict.get(sample, 0)
+
+    def logprob(self, sample):
+        if self._log:
+            return self._prob_dict.get(sample, _NINF)
+        else:
+            if sample not in self._prob_dict: return _NINF
+            elif self._prob_dict[sample] == 0: return _NINF
+            else: return math.log(self._prob_dict[sample], 2)
+
+    def max(self):
+        if not hasattr(self, '_max'):
+            self._max = max((p,v) for (v,p) in self._prob_dict.items())[1]
+        return self._max
+    def samples(self):
+        return self._prob_dict.keys()
+    def __repr__(self):
+        return '<ProbDist with %d samples>' % len(self._prob_dict)
+
+
+@compat.python_2_unicode_compatible
+class MLEProbDist(ProbDistI):
+    """
+    The maximum likelihood estimate for the probability distribution
+    of the experiment used to generate a frequency distribution.  The
+    "maximum likelihood estimate" approximates the probability of
+    each sample as the frequency of that sample in the frequency
+    distribution.
+    """
+    def __init__(self, freqdist, bins=None):
+        """
+        Use the maximum likelihood estimate to create a probability
+        distribution for the experiment used to generate ``freqdist``.
+
+        :type freqdist: FreqDist
+        :param freqdist: The frequency distribution that the
+            probability estimates should be based on.
+        """
+        self._freqdist = freqdist
+
+    def freqdist(self):
+        """
+        Return the frequency distribution that this probability
+        distribution is based on.
+
+        :rtype: FreqDist
+        """
+        return self._freqdist
+
+    def prob(self, sample):
+        return self._freqdist.freq(sample)
+
+    def max(self):
+        return self._freqdist.max()
+
+    def samples(self):
+        return self._freqdist.keys()
+
+    def __repr__(self):
+        """
+        :rtype: str
+        :return: A string representation of this ``ProbDist``.
+        """
+        return '<MLEProbDist based on %d samples>' % self._freqdist.N()
+
+
+@compat.python_2_unicode_compatible
+class LidstoneProbDist(ProbDistI):
+    """
+    The Lidstone estimate for the probability distribution of the
+    experiment used to generate a frequency distribution.  The
+    "Lidstone estimate" is parameterized by a real number *gamma*,
+    which typically ranges from 0 to 1.  The Lidstone estimate
+    approximates the probability of a sample with count *c* from an
+    experiment with *N* outcomes and *B* bins as
+    ``(c+gamma)/(N+B*gamma)``.  This is equivalent to adding
+    *gamma* to the count for each bin, and taking the maximum
+    likelihood estimate of the resulting frequency distribution.
+    """
+    SUM_TO_ONE = False
+    def __init__(self, freqdist, gamma, bins=None):
+        """
+        Use the Lidstone estimate to create a probability distribution
+        for the experiment used to generate ``freqdist``.
+
+        :type freqdist: FreqDist
+        :param freqdist: The frequency distribution that the
+            probability estimates should be based on.
+        :type gamma: float
+        :param gamma: A real number used to parameterize the
+            estimate.  The Lidstone estimate is equivalent to adding
+            *gamma* to the count for each bin, and taking the
+            maximum likelihood estimate of the resulting frequency
+            distribution.
+        :type bins: int
+        :param bins: The number of sample values that can be generated
+            by the experiment that is described by the probability
+            distribution.  This value must be correctly set for the
+            probabilities of the sample values to sum to one.  If
+            ``bins`` is not specified, it defaults to ``freqdist.B()``.
+        """
+        if (bins == 0) or (bins is None and freqdist.N() == 0):
+            name = self.__class__.__name__[:-8]
+            raise ValueError('A %s probability distribution ' % name +
+                             'must have at least one bin.')
+        if (bins is not None) and (bins < freqdist.B()):
+            name = self.__class__.__name__[:-8]
+            raise ValueError('\nThe number of bins in a %s distribution ' % name +
+                             '(%d) must be greater than or equal to\n' % bins +
+                             'the number of bins in the FreqDist used ' +
+                             'to create it (%d).' % freqdist.B())
+
+        self._freqdist = freqdist
+        self._gamma = float(gamma)
+        self._N = self._freqdist.N()
+
+        if bins is None:
+            bins = freqdist.B()
+        self._bins = bins
+
+        self._divisor = self._N + bins * gamma
+        if self._divisor == 0.0:
+            # In extreme cases we force the probability to be 0,
+            # which it will be, since the count will be 0:
+            self._gamma = 0
+            self._divisor = 1
+
+    def freqdist(self):
+        """
+        Return the frequency distribution that this probability
+        distribution is based on.
+
+        :rtype: FreqDist
+        """
+        return self._freqdist
+
+    def prob(self, sample):
+        c = self._freqdist[sample]
+        return (c + self._gamma) / self._divisor
+
+    def max(self):
+        # For Lidstone distributions, probability is monotonic with
+        # frequency, so the most probable sample is the one that
+        # occurs most frequently.
+        return self._freqdist.max()
+
+    def samples(self):
+        return self._freqdist.keys()
+
+    def discount(self):
+        gb = self._gamma * self._bins
+        return gb / (self._N + gb)
+
+    def __repr__(self):
+        """
+        Return a string representation of this ``ProbDist``.
+
+        :rtype: str
+        """
+        return '<LidstoneProbDist based on %d samples>' % self._freqdist.N()
+
+
+@compat.python_2_unicode_compatible
+class LaplaceProbDist(LidstoneProbDist):
+    """
+    The Laplace estimate for the probability distribution of the
+    experiment used to generate a frequency distribution.  The
+    "Laplace estimate" approximates the probability of a sample with
+    count *c* from an experiment with *N* outcomes and *B* bins as
+    *(c+1)/(N+B)*.  This is equivalent to adding one to the count for
+    each bin, and taking the maximum likelihood estimate of the
+    resulting frequency distribution.
+    """
+    def __init__(self, freqdist, bins=None):
+        """
+        Use the Laplace estimate to create a probability distribution
+        for the experiment used to generate ``freqdist``.
+
+        :type freqdist: FreqDist
+        :param freqdist: The frequency distribution that the
+            probability estimates should be based on.
+        :type bins: int
+        :param bins: The number of sample values that can be generated
+            by the experiment that is described by the probability
+            distribution.  This value must be correctly set for the
+            probabilities of the sample values to sum to one.  If
+            ``bins`` is not specified, it defaults to ``freqdist.B()``.
+        """
+        LidstoneProbDist.__init__(self, freqdist, 1, bins)
+
+    def __repr__(self):
+        """
+        :rtype: str
+        :return: A string representation of this ``ProbDist``.
+        """
+        return '<LaplaceProbDist based on %d samples>' % self._freqdist.N()
+
+
+@compat.python_2_unicode_compatible
+class ELEProbDist(LidstoneProbDist):
+    """
+    The expected likelihood estimate for the probability distribution
+    of the experiment used to generate a frequency distribution.  The
+    "expected likelihood estimate" approximates the probability of a
+    sample with count *c* from an experiment with *N* outcomes and
+    *B* bins as *(c+0.5)/(N+B/2)*.  This is equivalent to adding 0.5
+    to the count for each bin, and taking the maximum likelihood
+    estimate of the resulting frequency distribution.
+    """
+    def __init__(self, freqdist, bins=None):
+        """
+        Use the expected likelihood estimate to create a probability
+        distribution for the experiment used to generate ``freqdist``.
+
+        :type freqdist: FreqDist
+        :param freqdist: The frequency distribution that the
+            probability estimates should be based on.
+        :type bins: int
+        :param bins: The number of sample values that can be generated
+            by the experiment that is described by the probability
+            distribution.  This value must be correctly set for the
+            probabilities of the sample values to sum to one.  If
+            ``bins`` is not specified, it defaults to ``freqdist.B()``.
+        """
+        LidstoneProbDist.__init__(self, freqdist, 0.5, bins)
+
+    def __repr__(self):
+        """
+        Return a string representation of this ``ProbDist``.
+
+        :rtype: str
+        """
+        return '<ELEProbDist based on %d samples>' % self._freqdist.N()
+
+
+@compat.python_2_unicode_compatible
+class HeldoutProbDist(ProbDistI):
+    """
+    The heldout estimate for the probability distribution of the
+    experiment used to generate two frequency distributions.  These
+    two frequency distributions are called the "heldout frequency
+    distribution" and the "base frequency distribution."  The
+    "heldout estimate" uses uses the "heldout frequency
+    distribution" to predict the probability of each sample, given its
+    frequency in the "base frequency distribution".
+
+    In particular, the heldout estimate approximates the probability
+    for a sample that occurs *r* times in the base distribution as
+    the average frequency in the heldout distribution of all samples
+    that occur *r* times in the base distribution.
+
+    This average frequency is *Tr[r]/(Nr[r].N)*, where:
+
+    - *Tr[r]* is the total count in the heldout distribution for
+      all samples that occur *r* times in the base distribution.
+    - *Nr[r]* is the number of samples that occur *r* times in
+      the base distribution.
+    - *N* is the number of outcomes recorded by the heldout
+      frequency distribution.
+
+    In order to increase the efficiency of the ``prob`` member
+    function, *Tr[r]/(Nr[r].N)* is precomputed for each value of *r*
+    when the ``HeldoutProbDist`` is created.
+
+    :type _estimate: list(float)
+    :ivar _estimate: A list mapping from *r*, the number of
+        times that a sample occurs in the base distribution, to the
+        probability estimate for that sample.  ``_estimate[r]`` is
+        calculated by finding the average frequency in the heldout
+        distribution of all samples that occur *r* times in the base
+        distribution.  In particular, ``_estimate[r]`` =
+        *Tr[r]/(Nr[r].N)*.
+    :type _max_r: int
+    :ivar _max_r: The maximum number of times that any sample occurs
+        in the base distribution.  ``_max_r`` is used to decide how
+        large ``_estimate`` must be.
+    """
+    SUM_TO_ONE = False
+    def __init__(self, base_fdist, heldout_fdist, bins=None):
+        """
+        Use the heldout estimate to create a probability distribution
+        for the experiment used to generate ``base_fdist`` and
+        ``heldout_fdist``.
+
+        :type base_fdist: FreqDist
+        :param base_fdist: The base frequency distribution.
+        :type heldout_fdist: FreqDist
+        :param heldout_fdist: The heldout frequency distribution.
+        :type bins: int
+        :param bins: The number of sample values that can be generated
+            by the experiment that is described by the probability
+            distribution.  This value must be correctly set for the
+            probabilities of the sample values to sum to one.  If
+            ``bins`` is not specified, it defaults to ``freqdist.B()``.
+        """
+
+        self._base_fdist = base_fdist
+        self._heldout_fdist = heldout_fdist
+
+        # The max number of times any sample occurs in base_fdist.
+        self._max_r = base_fdist[base_fdist.max()]
+
+        # Calculate Tr, Nr, and N.
+        Tr = self._calculate_Tr()
+        r_Nr = base_fdist.r_Nr(bins)
+        Nr = [r_Nr[r] for r in range(self._max_r+1)]
+        N = heldout_fdist.N()
+
+        # Use Tr, Nr, and N to compute the probability estimate for
+        # each value of r.
+        self._estimate = self._calculate_estimate(Tr, Nr, N)
+
+    def _calculate_Tr(self):
+        """
+        Return the list *Tr*, where *Tr[r]* is the total count in
+        ``heldout_fdist`` for all samples that occur *r*
+        times in ``base_fdist``.
+
+        :rtype: list(float)
+        """
+        Tr = [0.0] * (self._max_r+1)
+        for sample in self._heldout_fdist:
+            r = self._base_fdist[sample]
+            Tr[r] += self._heldout_fdist[sample]
+        return Tr
+
+    def _calculate_estimate(self, Tr, Nr, N):
+        """
+        Return the list *estimate*, where *estimate[r]* is the probability
+        estimate for any sample that occurs *r* times in the base frequency
+        distribution.  In particular, *estimate[r]* is *Tr[r]/(N[r].N)*.
+        In the special case that *N[r]=0*, *estimate[r]* will never be used;
+        so we define *estimate[r]=None* for those cases.
+
+        :rtype: list(float)
+        :type Tr: list(float)
+        :param Tr: the list *Tr*, where *Tr[r]* is the total count in
+            the heldout distribution for all samples that occur *r*
+            times in base distribution.
+        :type Nr: list(float)
+        :param Nr: The list *Nr*, where *Nr[r]* is the number of
+            samples that occur *r* times in the base distribution.
+        :type N: int
+        :param N: The total number of outcomes recorded by the heldout
+            frequency distribution.
+        """
+        estimate = []
+        for r in range(self._max_r+1):
+            if Nr[r] == 0: estimate.append(None)
+            else: estimate.append(Tr[r]/(Nr[r]*N))
+        return estimate
+
+    def base_fdist(self):
+        """
+        Return the base frequency distribution that this probability
+        distribution is based on.
+
+        :rtype: FreqDist
+        """
+        return self._base_fdist
+
+    def heldout_fdist(self):
+        """
+        Return the heldout frequency distribution that this
+        probability distribution is based on.
+
+        :rtype: FreqDist
+        """
+        return self._heldout_fdist
+
+    def samples(self):
+        return self._base_fdist.keys()
+
+    def prob(self, sample):
+        # Use our precomputed probability estimate.
+        r = self._base_fdist[sample]
+        return self._estimate[r]
+
+    def max(self):
+        # Note: the Heldout estimation is *not* necessarily monotonic;
+        # so this implementation is currently broken.  However, it
+        # should give the right answer *most* of the time. :)
+        return self._base_fdist.max()
+
+    def discount(self):
+        raise NotImplementedError()
+
+    def __repr__(self):
+        """
+        :rtype: str
+        :return: A string representation of this ``ProbDist``.
+        """
+        s = '<HeldoutProbDist: %d base samples; %d heldout samples>'
+        return s % (self._base_fdist.N(), self._heldout_fdist.N())
+
+
+@compat.python_2_unicode_compatible
+class CrossValidationProbDist(ProbDistI):
+    """
+    The cross-validation estimate for the probability distribution of
+    the experiment used to generate a set of frequency distributions.
+    The "cross-validation estimate" for the probability of a sample
+    is found by averaging the held-out estimates for the sample in
+    each pair of frequency distributions.
+    """
+    SUM_TO_ONE = False
+    def __init__(self, freqdists, bins):
+        """
+        Use the cross-validation estimate to create a probability
+        distribution for the experiment used to generate
+        ``freqdists``.
+
+        :type freqdists: list(FreqDist)
+        :param freqdists: A list of the frequency distributions
+            generated by the experiment.
+        :type bins: int
+        :param bins: The number of sample values that can be generated
+            by the experiment that is described by the probability
+            distribution.  This value must be correctly set for the
+            probabilities of the sample values to sum to one.  If
+            ``bins`` is not specified, it defaults to ``freqdist.B()``.
+        """
+        self._freqdists = freqdists
+
+        # Create a heldout probability distribution for each pair of
+        # frequency distributions in freqdists.
+        self._heldout_probdists = []
+        for fdist1 in freqdists:
+            for fdist2 in freqdists:
+                if fdist1 is not fdist2:
+                    probdist = HeldoutProbDist(fdist1, fdist2, bins)
+                    self._heldout_probdists.append(probdist)
+
+    def freqdists(self):
+        """
+        Return the list of frequency distributions that this ``ProbDist`` is based on.
+
+        :rtype: list(FreqDist)
+        """
+        return self._freqdists
+
+    def samples(self):
+        # [xx] nb: this is not too efficient
+        return set(sum([list(fd) for fd in self._freqdists], []))
+
+    def prob(self, sample):
+        # Find the average probability estimate returned by each
+        # heldout distribution.
+        prob = 0.0
+        for heldout_probdist in self._heldout_probdists:
+            prob += heldout_probdist.prob(sample)
+        return prob/len(self._heldout_probdists)
+
+    def discount(self):
+        raise NotImplementedError()
+
+    def __repr__(self):
+        """
+        Return a string representation of this ``ProbDist``.
+
+        :rtype: str
+        """
+        return '<CrossValidationProbDist: %d-way>' % len(self._freqdists)
+
+
+@compat.python_2_unicode_compatible
+class WittenBellProbDist(ProbDistI):
+    """
+    The Witten-Bell estimate of a probability distribution. This distribution
+    allocates uniform probability mass to as yet unseen events by using the
+    number of events that have only been seen once. The probability mass
+    reserved for unseen events is equal to *T / (N + T)*
+    where *T* is the number of observed event types and *N* is the total
+    number of observed events. This equates to the maximum likelihood estimate
+    of a new type event occurring. The remaining probability mass is discounted
+    such that all probability estimates sum to one, yielding:
+
+        - *p = T / Z (N + T)*, if count = 0
+        - *p = c / (N + T)*, otherwise
+    """
+
+    def __init__(self, freqdist, bins=None):
+        """
+        Creates a distribution of Witten-Bell probability estimates.  This
+        distribution allocates uniform probability mass to as yet unseen
+        events by using the number of events that have only been seen once. The
+        probability mass reserved for unseen events is equal to *T / (N + T)*
+        where *T* is the number of observed event types and *N* is the total
+        number of observed events. This equates to the maximum likelihood
+        estimate of a new type event occurring. The remaining probability mass
+        is discounted such that all probability estimates sum to one,
+        yielding:
+
+            - *p = T / Z (N + T)*, if count = 0
+            - *p = c / (N + T)*, otherwise
+
+        The parameters *T* and *N* are taken from the ``freqdist`` parameter
+        (the ``B()`` and ``N()`` values). The normalizing factor *Z* is
+        calculated using these values along with the ``bins`` parameter.
+
+        :param freqdist: The frequency counts upon which to base the
+            estimation.
+        :type freqdist: FreqDist
+        :param bins: The number of possible event types. This must be at least
+            as large as the number of bins in the ``freqdist``. If None, then
+            it's assumed to be equal to that of the ``freqdist``
+        :type bins: int
+        """
+        assert bins is None or bins >= freqdist.B(),\
+               'bins parameter must not be less than %d=freqdist.B()' % freqdist.B()
+        if bins is None:
+            bins = freqdist.B()
+        self._freqdist = freqdist
+        self._T = self._freqdist.B()
+        self._Z = bins - self._freqdist.B()
+        self._N = self._freqdist.N()
+        # self._P0 is P(0), precalculated for efficiency:
+        if self._N==0:
+            # if freqdist is empty, we approximate P(0) by a UniformProbDist:
+            self._P0 = 1.0 / self._Z
+        else:
+            self._P0 = self._T / (self._Z * (self._N + self._T))
+
+    def prob(self, sample):
+        # inherit docs from ProbDistI
+        c = self._freqdist[sample]
+        return (c / (self._N + self._T) if c != 0 else self._P0)
+
+    def max(self):
+        return self._freqdist.max()
+
+    def samples(self):
+        return self._freqdist.keys()
+
+    def freqdist(self):
+        return self._freqdist
+
+    def discount(self):
+        raise NotImplementedError()
+
+    def __repr__(self):
+        """
+        Return a string representation of this ``ProbDist``.
+
+        :rtype: str
+        """
+        return '<WittenBellProbDist based on %d samples>' % self._freqdist.N()
+
+
+##//////////////////////////////////////////////////////
+##  Good-Turing Probability Distributions
+##//////////////////////////////////////////////////////
+
+# Good-Turing frequency estimation was developed by Alan Turing and
+# his statistical assistant I.J. Good during their collaboration in
+# World War II.  It is a statistical technique for predicting the
+# probability of occurrence of objects belonging to an unknown number
+# of species, given past observations of such objects and their
+# species.  (In drawing balls from an urn, the 'objects' would be balls
+# and the 'species' would be the distinct colors of the balls, which are
+# finite but unknown in number.)
+#
+# The Good-Turing method calculates the probability mass to assign to
+# events with zero or low counts based on the number of events with
+# higher counts. It does so by using the adjusted count *c\**:
+#
+#     - *c\* = (c + 1) N(c + 1) / N(c)*   for c >= 1
+#     - *things with frequency zero in training* = N(1)  for c == 0
+#
+# where *c* is the original count and *N(i)* is the number of event types
+# observed with count *i*. We can think of the count of unseen events as
+# the count of events with frequency one (see Jurafsky & Martin,
+# 2nd Edition, p. 101).
+#
+# This method is problematic because the situation ``N(c+1) == 0``
+# is quite common in the original Good-Turing estimation; smoothing or
+# interpolation of *N(i)* values is essential in practice.
+#
+# Bill Gale and Geoffrey Sampson present a simple and effective approach,
+# Simple Good-Turing.  As a smoothing curve they simply use a power curve:
+#
+#     Nr = a*r^b (with b < -1 to give the appropriate hyperbolic
+#     relationship)
+#
+# They estimate a and b by simple linear regression technique on the
+# logarithmic form of the equation:
+#
+#     log Nr = a + b*log(r)
+#
+# However, they suggest that such a simple curve is probably only
+# appropriate for high values of r. For low values of r, they use the
+# measured Nr directly.  (see M&S, p.213)
+#
+# Gale and Sampson propose to use r while the difference between r and
+# r* is greater than 1.96 times the standard deviation, and to switch
+# to r* once it is less than or equal to that threshold:
+#
+#     |r - r*| > 1.96 * sqrt((r + 1)^2 (Nr+1 / Nr^2) (1 + Nr+1 / Nr))
+#
+# The 1.96 coefficient corresponds to a 0.05 significance criterion;
+# some implementations use a coefficient of 1.65 for a 0.1
+# significance criterion.
+#
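+# A small worked example of the adjusted count (numbers invented for
+# illustration): if N(1)=3 event types were seen once and N(2)=2 were seen
+# twice, then the adjusted count for c=1 is (1+1)*N(2)/N(1) = 2*2/3 ~= 1.33,
+# and the total probability mass reserved for unseen events is N(1)/N.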
+
+##//////////////////////////////////////////////////////
+##  Simple Good-Turing Probability Distributions
+##//////////////////////////////////////////////////////
+
+@compat.python_2_unicode_compatible
+class SimpleGoodTuringProbDist(ProbDistI):
+    """
+    SimpleGoodTuringProbDist approximates the relationship between frequency
+    and frequency of frequency by a straight line in log-log space, fitted
+    by linear regression.
+    Details of the Simple Good-Turing algorithm can be found in:
+
+    - "Good Turing smoothing without tears" (Gale & Sampson 1995),
+      Journal of Quantitative Linguistics, vol. 2, pp. 217-237.
+    - "Speech and Language Processing" (Jurafsky & Martin),
+      2nd Edition, Chapter 4.5, p. 103 (log(Nc) = a + b*log(c))
+    - http://www.grsampson.net/RGoodTur.html
+
+    Given a set of pairs (xi, yi), where xi denotes the frequency and
+    yi denotes the frequency of frequency, we want to minimize their
+    squared deviation. E(x) and E(y) represent the means of xi and yi.
+
+    - slope: b = sum((xi - E(x)) * (yi - E(y))) / sum((xi - E(x)) * (xi - E(x)))
+    - intercept: a = E(y) - b * E(x)
+    """
+    SUM_TO_ONE = False
+    def __init__(self, freqdist, bins=None):
+        """
+        :param freqdist: The frequency counts upon which to base the
+            estimation.
+        :type freqdist: FreqDist
+        :param bins: The number of possible event types. This must be
+            larger than the number of bins in the ``freqdist``. If None,
+            then it's assumed to be equal to ``freqdist``.B() + 1
+        :type bins: int
+        """
+        assert bins is None or bins > freqdist.B(),\
+               'bins parameter must not be less than %d=freqdist.B()+1' % (freqdist.B()+1)
+        if bins is None:
+            bins = freqdist.B() + 1
+        self._freqdist = freqdist
+        self._bins = bins
+        r, nr = self._r_Nr()
+        self.find_best_fit(r, nr)
+        self._switch(r, nr)
+        self._renormalize(r, nr)
+
+    def _r_Nr_non_zero(self):
+        r_Nr = self._freqdist.r_Nr()
+        del r_Nr[0]
+        return r_Nr
+
+    def _r_Nr(self):
+        """
+        Split the frequency distribution into two lists (r, Nr), where Nr(r) > 0.
+        """
+        nonzero = self._r_Nr_non_zero()
+
+        if not nonzero:
+            return [], []
+        return zip(*sorted(nonzero.items()))
+
+    def find_best_fit(self, r, nr):
+        """
+        Use simple linear regression to tune parameters self._slope and
+        self._intercept in the log-log space based on count and Nr(count)
+        (Work in log space to avoid floating point underflow.)
+        """
+        # For higher sample frequencies the data points become horizontal
+        # along line Nr=1. To create a more evident linear model in log-log
+        # space, we average positive Nr values with the surrounding zero
+        # values. (Church and Gale, 1991)
+
+        if not r or not nr:
+            # Empty r or nr?
+            return
+
+        zr = []
+        for j in range(len(r)):
+            i = (r[j-1] if j > 0 else 0)
+            k = (2 * r[j] - i if j == len(r) - 1 else r[j+1])
+            zr_ = 2.0 * nr[j] / (k - i)
+            zr.append(zr_)
+
+        log_r = [math.log(i) for i in r]
+        log_zr = [math.log(i) for i in zr]
+
+        xy_cov = x_var = 0.0
+        x_mean = sum(log_r) / len(log_r)
+        y_mean = sum(log_zr) / len(log_zr)
+        for (x, y) in zip(log_r, log_zr):
+            xy_cov += (x - x_mean) * (y - y_mean)
+            x_var += (x - x_mean)**2
+        self._slope = (xy_cov / x_var if x_var != 0 else 0.0)
+        if self._slope >= -1:
+            warnings.warn('SimpleGoodTuring did not find a proper best fit '
+                          'line for smoothing probabilities of occurrences. '
+                          'The probability estimates are likely to be '
+                          'unreliable.')
+        self._intercept = y_mean - self._slope * x_mean
+
+    def _switch(self, r, nr):
+        """
+        Calculate the r frontier where we must switch from Nr to Sr
+        when estimating E[Nr].
+        """
+        for i, r_ in enumerate(r):
+            if len(r) == i + 1 or r[i+1] != r_ + 1:
+                # We are at the end of r, or there is a gap in r
+                self._switch_at = r_
+                break
+
+            Sr = self.smoothedNr
+            smooth_r_star = (r_ + 1) * Sr(r_+1) / Sr(r_)
+            unsmooth_r_star = (r_ + 1) * nr[i+1] / nr[i]
+
+            std = math.sqrt(self._variance(r_, nr[i], nr[i+1]))
+            if abs(unsmooth_r_star-smooth_r_star) <= 1.96 * std:
+                self._switch_at = r_
+                break
+
+    def _variance(self, r, nr, nr_1):
+        r = float(r)
+        nr = float(nr)
+        nr_1 = float(nr_1)
+        return (r + 1.0)**2 * (nr_1 / nr**2) * (1.0 + nr_1 / nr)
+
+    def _renormalize(self, r, nr):
+        """
+        It is necessary to renormalize all the probability estimates to
+        ensure a proper probability distribution results. This can be done
+        by keeping the estimate of the probability mass for unseen items as
+        N(1)/N and renormalizing all the estimates for previously seen items
+        (as Gale and Sampson (1995) propose). (See M&S P.213, 1999)
+        """
+        prob_cov = 0.0
+        for r_, nr_ in zip(r, nr):
+            prob_cov  += nr_ * self._prob_measure(r_)
+        if prob_cov:
+            self._renormal = (1 - self._prob_measure(0)) / prob_cov
+
+    def smoothedNr(self, r):
+        """
+        Return the smoothed estimate of the number of samples with count r.
+
+        :param r: The count (frequency) whose smoothed Nr value is required.
+        :type r: int
+        :rtype: float
+        """
+
+        # Nr = a*r^b (with b < -1 to give the appropriate hyperbolic
+        # relationship)
+        # Estimate a and b by simple linear regression technique on
+        # the logarithmic form of the equation: log Nr = a + b*log(r)
+
+        return math.exp(self._intercept + self._slope * math.log(r))
+
+    def prob(self, sample):
+        """
+        Return the sample's probability.
+
+        :param sample: sample of the event
+        :type sample: str
+        :rtype: float
+        """
+        count = self._freqdist[sample]
+        p = self._prob_measure(count)
+        if count == 0:
+            if self._bins == self._freqdist.B():
+                p = 0.0
+            else:
+                p = p / (self._bins - self._freqdist.B())
+        else:
+            p = p * self._renormal
+        return p
+
+    def _prob_measure(self, count):
+        if count == 0 and self._freqdist.N() == 0 :
+            return 1.0
+        elif count == 0 and self._freqdist.N() != 0:
+            return self._freqdist.Nr(1) / self._freqdist.N()
+
+        if self._switch_at > count:
+            Er_1 = self._freqdist.Nr(count+1)
+            Er = self._freqdist.Nr(count)
+        else:
+            Er_1 = self.smoothedNr(count+1)
+            Er = self.smoothedNr(count)
+
+        r_star = (count + 1) * Er_1 / Er
+        return r_star / self._freqdist.N()
+
+    def check(self):
+        # Sanity check: the renormalized probability mass of the observed
+        # counts plus the mass reserved for unseen events should sum to
+        # (approximately) one.
+        prob_sum = self._prob_measure(0)
+        for r, nr in self._r_Nr_non_zero().items():
+            prob_sum += nr * self._prob_measure(r) * self._renormal
+        print("Probability Sum:", prob_sum)
+        # assert abs(prob_sum - 1.0) < 1e-6, "probability sum should be one!"
+
+    def discount(self):
+        """
+        Return the total probability mass transferred from the seen
+        samples to the unseen samples.
+        """
+        return self.smoothedNr(1) / self._freqdist.N()
+
+    def max(self):
+        return self._freqdist.max()
+
+    def samples(self):
+        return self._freqdist.keys()
+
+    def freqdist(self):
+        return self._freqdist
+
+    def __repr__(self):
+        """
+        Return a string representation of this ``ProbDist``.
+
+        :rtype: str
+        """
+        return '<SimpleGoodTuringProbDist based on %d samples>'\
+                % self._freqdist.N()
+
+
+class MutableProbDist(ProbDistI):
+    """
+    A mutable probdist where the probabilities may be easily modified. This
+    simply copies an existing probdist, storing the probability values in a
+    mutable dictionary and providing an update method.
+    """
+
+    def __init__(self, prob_dist, samples, store_logs=True):
+        """
+        Creates the mutable probdist based on the given prob_dist and using
+        the list of samples given. These values are stored as log
+        probabilities if the store_logs flag is set.
+
+        :param prob_dist: the distribution from which to garner the
+            probabilities
+        :type prob_dist: ProbDist
+        :param samples: the complete set of samples
+        :type samples: sequence of any
+        :param store_logs: whether to store the probabilities as logarithms
+        :type store_logs: bool
+        """
+        self._samples = samples
+        self._sample_dict = dict((samples[i], i) for i in range(len(samples)))
+        self._data = array.array(str("d"), [0.0]) * len(samples)
+        for i in range(len(samples)):
+            if store_logs:
+                self._data[i] = prob_dist.logprob(samples[i])
+            else:
+                self._data[i] = prob_dist.prob(samples[i])
+        self._logs = store_logs
+
+    def samples(self):
+        # inherit documentation
+        return self._samples
+
+    def prob(self, sample):
+        # inherit documentation
+        i = self._sample_dict.get(sample)
+        if i is None:
+            return 0.0
+        return (2**(self._data[i]) if self._logs else self._data[i])
+
+    def logprob(self, sample):
+        # inherit documentation
+        i = self._sample_dict.get(sample)
+        if i is None:
+            return float('-inf')
+        return (self._data[i] if self._logs else math.log(self._data[i], 2))
+
+    def update(self, sample, prob, log=True):
+        """
+        Update the probability for the given sample. This may cause the object
+        to stop being a valid probability distribution; the user must ensure
+        that the updated sample probabilities all lie between 0 and 1 and
+        sum to one.
+
+        :param sample: the sample for which to update the probability
+        :type sample: any
+        :param prob: the new probability
+        :type prob: float
+        :param log: is the probability already logged
+        :type log: bool
+        """
+        i = self._sample_dict.get(sample)
+        assert i is not None
+        if self._logs:
+            self._data[i] = (prob if log else math.log(prob, 2))
+        else:
+            self._data[i] = (2**(prob) if log else prob)
+
+##/////////////////////////////////////////////////////
+##  Kneser-Ney Probability Distribution
+##//////////////////////////////////////////////////////
+
+# This method for calculating probabilities was introduced in 1995 by Reinhard
+# Kneser and Hermann Ney. It was meant to improve the accuracy of language
+# models that use backing-off to deal with sparse data. The authors propose two
+# ways of doing so: a marginal distribution constraint on the back-off
+# distribution and a leave-one-out distribution. For a start, the first one is
+# implemented as a class below.
+#
+# The idea behind a back-off n-gram model is that we have a series of
+# frequency distributions for our n-grams so that in case we have not seen a
+# given n-gram during training (and as a result have a 0 probability for it) we
+# can 'back off' (hence the name!) and try testing whether we've seen the
+# n-1-gram part of the n-gram in training.
+#
+# The novelty of Kneser and Ney's approach was that they decided to fiddle
+# around with the way this latter, backed off probability was being calculated
+# whereas their peers seemed to focus on the primary probability.
+#
+# The implementation below uses one of the techniques described in their paper
+# titled "Improved backing-off for n-gram language modeling." In the same paper
+# another technique is introduced to attempt to smooth the back-off
+# distribution as well as the primary one. There is also a much-cited
+# modification of this method proposed by Chen and Goodman.
+#
+# In order for the implementation of Kneser-Ney to be more efficient, some
+# changes have been made to the original algorithm. Namely, the calculation of
+# the normalizing function gamma has been significantly simplified and
+# combined slightly differently with beta. None of these changes affect the
+# nature of the algorithm, but instead aim to cut out unnecessary calculations
+# and take advantage of storing and retrieving information in dictionaries
+# where possible.
+
+@compat.python_2_unicode_compatible
+class KneserNeyProbDist(ProbDistI):
+    """
+    Kneser-Ney estimate of a probability distribution. This is a version of
+    back-off that estimates how likely an n-gram is given that its (n-1)-gram
+    prefix was seen in training. It extends the ProbDistI interface and
+    requires a trigram FreqDist instance to train on. Optionally, a discount
+    value different from the default can be specified; the default discount
+    is 0.75.
+
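+    A small illustrative example over a toy trigram FreqDist:
+
+        >>> from nltk.probability import FreqDist, KneserNeyProbDist
+        >>> trigram_counts = FreqDist([('the', 'dog', 'ran'), ('the', 'dog', 'ran'),
+        ...                            ('the', 'cat', 'sat')])
+        >>> kn = KneserNeyProbDist(trigram_counts)
+        >>> kn.discount()
+        0.75
+        >>> round(kn.prob(('the', 'dog', 'ran')), 3)
+        0.625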
+    """
+    def __init__(self, freqdist, bins=None, discount=0.75):
+        """
+        :param freqdist: The trigram frequency distribution upon which to base
+            the estimation
+        :type freqdist: FreqDist
+        :param bins: Included for compatibility with nltk.tag.hmm
+        :type bins: int or float
+        :param discount: The discount applied when retrieving counts of
+            trigrams
+        :type discount: float (preferred, but can be set to int)
+        """
+
+        if not bins:
+            self._bins = freqdist.B()
+        else:
+            self._bins = bins
+        self._D = discount
+
+        # cache for probability calculation
+        self._cache = {}
+
+        # internal bigram and trigram frequency distributions
+        self._bigrams = defaultdict(int)
+        self._trigrams = freqdist
+
+        # helper dictionaries used to calculate probabilities
+        self._wordtypes_after = defaultdict(float)
+        self._trigrams_contain = defaultdict(float)
+        self._wordtypes_before = defaultdict(float)
+        for w0, w1, w2 in freqdist:
+            self._bigrams[(w0,w1)] += freqdist[(w0, w1, w2)]
+            self._wordtypes_after[(w0,w1)] += 1
+            self._trigrams_contain[w1] += 1
+            self._wordtypes_before[(w1,w2)] += 1
+
+    def prob(self, trigram):
+        # sample must be a triple
+        if len(trigram) != 3:
+            raise ValueError('Expected an iterable with 3 members.')
+        trigram = tuple(trigram)
+        w0, w1, w2 = trigram
+
+        if trigram in self._cache:
+            return self._cache[trigram]
+        else:
+            # if the sample trigram was seen during training
+            if trigram in self._trigrams:
+                prob = (self._trigrams[trigram]
+                        - self.discount())/self._bigrams[(w0, w1)]
+
+            # else if the 'rougher' environment was seen during training
+            elif (w0,w1) in self._bigrams and (w1,w2) in self._wordtypes_before:
+                aftr = self._wordtypes_after[(w0, w1)]
+                bfr = self._wordtypes_before[(w1, w2)]
+
+                # the probability left over from alphas
+                leftover_prob = ((aftr * self.discount())
+                                 / self._bigrams[(w0, w1)])
+
+                # the beta (including normalization)
+                beta = bfr / (self._trigrams_contain[w1] - aftr)
+
+                prob = leftover_prob * beta
+
+            # else the sample was completely unseen during training
+            else:
+                prob = 0.0
+
+            self._cache[trigram] = prob
+            return prob
+
+    def discount(self):
+        """
+        Return the value by which counts are discounted. By default set to 0.75.
+
+        :rtype: float
+        """
+        return self._D
+
+    def set_discount(self, discount):
+        """
+        Set the value by which counts are discounted to the value of discount.
+
+        :param discount: the new value to discount counts by
+        :type discount: float (preferred, but int possible)
+        :rtype: None
+        """
+        self._D = discount
+
+    def samples(self):
+        return self._trigrams.keys()
+
+    def max(self):
+        return self._trigrams.max()
+
+    def __repr__(self):
+        '''
+        Return a string representation of this ProbDist
+
+        :rtype: str
+        '''
+        return '<KneserNeyProbDist based on {0} trigrams>'.format(self._trigrams.N())
+
+##//////////////////////////////////////////////////////
+##  Probability Distribution Operations
+##//////////////////////////////////////////////////////
+
+def log_likelihood(test_pdist, actual_pdist):
+    if (not isinstance(test_pdist, ProbDistI) or
+        not isinstance(actual_pdist, ProbDistI)):
+        raise ValueError('expected a ProbDist.')
+    # Sum, over the samples of the actual distribution, of the actual
+    # probability times the base-2 log of the estimated probability.
+    return sum(actual_pdist.prob(s) * math.log(test_pdist.prob(s), 2)
+               for s in actual_pdist.samples())
+
+def entropy(pdist):
+    probs = (pdist.prob(s) for s in pdist.samples())
+    return -sum(p * math.log(p,2) for p in probs)
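+
+# For example (illustrative), a uniform distribution over four samples has
+# entropy -4 * (0.25 * log2(0.25)) = 2.0 bits.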
+
+##//////////////////////////////////////////////////////
+##  Conditional Distributions
+##//////////////////////////////////////////////////////
+
+@compat.python_2_unicode_compatible
+class ConditionalFreqDist(defaultdict):
+    """
+    A collection of frequency distributions for a single experiment
+    run under different conditions.  Conditional frequency
+    distributions are used to record the number of times each sample
+    occurred, given the condition under which the experiment was run.
+    For example, a conditional frequency distribution could be used to
+    record the frequency of each word (type) in a document, given its
+    length.  Formally, a conditional frequency distribution can be
+    defined as a function that maps from each condition to the
+    FreqDist for the experiment under that condition.
+
+    Conditional frequency distributions are typically constructed by
+    repeatedly running an experiment under a variety of conditions,
+    and incrementing the sample outcome counts for the appropriate
+    conditions.  For example, the following code will produce a
+    conditional frequency distribution that encodes how often each
+    word type occurs, given the length of that word type:
+
+        >>> from nltk.probability import ConditionalFreqDist
+        >>> from nltk.tokenize import word_tokenize
+        >>> sent = "the the the dog dog some other words that we do not care about"
+        >>> cfdist = ConditionalFreqDist()
+        >>> for word in word_tokenize(sent):
+        ...     condition = len(word)
+        ...     cfdist[condition][word] += 1
+
+    An equivalent way to do this is with the initializer:
+
+        >>> cfdist = ConditionalFreqDist((len(word), word) for word in word_tokenize(sent))
+
+    The frequency distribution for each condition is accessed using
+    the indexing operator:
+
+        >>> cfdist[3]
+        FreqDist({'the': 3, 'dog': 2, 'not': 1})
+        >>> cfdist[3].freq('the')
+        0.5
+        >>> cfdist[3]['dog']
+        2
+
+    When the indexing operator is used to access the frequency
+    distribution for a condition that has not been accessed before,
+    ``ConditionalFreqDist`` creates a new empty FreqDist for that
+    condition.
+
+    """
+    def __init__(self, cond_samples=None):
+        """
+        Construct a new empty conditional frequency distribution.  In
+        particular, the count for every sample, under every condition,
+        is zero.
+
+        :param cond_samples: The samples to initialize the conditional
+            frequency distribution with
+        :type cond_samples: Sequence of (condition, sample) tuples
+        """
+        defaultdict.__init__(self, FreqDist)
+
+        if cond_samples:
+            for (cond, sample) in cond_samples:
+                self[cond][sample] += 1
+
+    def __reduce__(self):
+        kv_pairs = ((cond, self[cond]) for cond in self.conditions())
+        return (self.__class__, (), None, None, kv_pairs)
+
+    def conditions(self):
+        """
+        Return a list of the conditions that have been accessed for
+        this ``ConditionalFreqDist``.  Use the indexing operator to
+        access the frequency distribution for a given condition.
+        Note that the frequency distributions for some conditions
+        may contain zero sample outcomes.
+
+        :rtype: list
+        """
+        return list(self.keys())
+
+    def N(self):
+        """
+        Return the total number of sample outcomes that have been
+        recorded by this ``ConditionalFreqDist``.
+
+        :rtype: int
+        """
+        return sum(fdist.N() for fdist in itervalues(self))
+
+    def plot(self, *args, **kwargs):
+        """
+        Plot the given samples from the conditional frequency distribution.
+        For a cumulative plot, specify cumulative=True.
+        (Requires Matplotlib to be installed.)
+
+        :param samples: The samples to plot
+        :type samples: list
+        :param title: The title for the graph
+        :type title: str
+        :param conditions: The conditions to plot (default is all)
+        :type conditions: list
+        """
+        try:
+            from matplotlib import pylab
+        except ImportError:
+            raise ValueError('The plot function requires matplotlib to be installed. '
+                             'See http://matplotlib.org/')
+
+        cumulative = _get_kwarg(kwargs, 'cumulative', False)
+        conditions = _get_kwarg(kwargs, 'conditions', sorted(self.conditions()))
+        title = _get_kwarg(kwargs, 'title', '')
+        samples = _get_kwarg(kwargs, 'samples',
+                             sorted(set(v for c in conditions for v in self[c])))  # this computation could be wasted
+        if not "linewidth" in kwargs:
+            kwargs["linewidth"] = 2
+
+        for condition in conditions:
+            if cumulative:
+                freqs = list(self[condition]._cumulative_frequencies(samples))
+                ylabel = "Cumulative Counts"
+                legend_loc = 'lower right'
+            else:
+                freqs = [self[condition][sample] for sample in samples]
+                ylabel = "Counts"
+                legend_loc = 'upper right'
+            # percents = [f * 100 for f in freqs] only in ConditionalProbDist?
+            kwargs['label'] = "%s" % condition
+            pylab.plot(freqs, *args, **kwargs)
+
+        pylab.legend(loc=legend_loc)
+        pylab.grid(True, color="silver")
+        pylab.xticks(range(len(samples)), [text_type(s) for s in samples], rotation=90)
+        if title:
+            pylab.title(title)
+        pylab.xlabel("Samples")
+        pylab.ylabel(ylabel)
+        pylab.show()
+
+    def tabulate(self, *args, **kwargs):
+        """
+        Tabulate the given samples from the conditional frequency distribution.
+
+        :param samples: The samples to tabulate
+        :type samples: list
+        :param conditions: The conditions to tabulate (default is all)
+        :type conditions: list
+        :param cumulative: A flag to specify whether the freqs are cumulative (default = False)
+        :type cumulative: bool
+        """
+
+        cumulative = _get_kwarg(kwargs, 'cumulative', False)
+        conditions = _get_kwarg(kwargs, 'conditions', sorted(self.conditions()))
+        samples = _get_kwarg(kwargs, 'samples',
+                             sorted(set(v for c in conditions for v in self[c])))  # this computation could be wasted
+
+        width = max(len("%s" % s) for s in samples)
+        freqs = dict()
+        for c in conditions:
+            if cumulative:
+                freqs[c] = list(self[c]._cumulative_frequencies(samples))
+            else:
+                freqs[c] = [self[c][sample] for sample in samples]
+            width = max(width, max(len("%d" % f) for f in freqs[c]))
+
+        condition_size = max(len("%s" % c) for c in conditions)
+        print(' ' * condition_size, end=' ')
+        for s in samples:
+            print("%*s" % (width, s), end=' ')
+        print()
+        for c in conditions:
+            print("%*s" % (condition_size, c), end=' ')
+            for f in freqs[c]:
+                print("%*d" % (width, f), end=' ')
+            print()
+
+    # Mathematical operators
+
+    def __add__(self, other):
+        """
+        Add counts from two ConditionalFreqDists.
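+
+        A small illustrative example with toy counts:
+
+            >>> from nltk.probability import ConditionalFreqDist
+            >>> cfd1 = ConditionalFreqDist([('a', 'x'), ('a', 'x')])
+            >>> cfd2 = ConditionalFreqDist([('a', 'x'), ('b', 'y')])
+            >>> (cfd1 + cfd2)['a']['x']
+            3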
+        """
+        if not isinstance(other, ConditionalFreqDist):
+            return NotImplemented
+        result = ConditionalFreqDist()
+        for cond in self.conditions():
+            newfreqdist = self[cond] + other[cond]
+            if newfreqdist:
+                result[cond] = newfreqdist
+        for cond in other.conditions():
+            if cond not in self.conditions():
+                for elem, count in other[cond].items():
+                    if count > 0:
+                        result[cond][elem] = count
+        return result
+
+    def __sub__(self, other):
+        """
+        Subtract counts, but keep only results with positive counts.
+        """
+        if not isinstance(other, ConditionalFreqDist):
+            return NotImplemented
+        result = ConditionalFreqDist()
+        for cond in self.conditions():
+            newfreqdist = self[cond] - other[cond]
+            if newfreqdist:
+                result[cond] = newfreqdist
+        for cond in other.conditions():
+            if cond not in self.conditions():
+                for elem, count in other[cond].items():
+                    if count < 0:
+                        result[cond][elem] = 0 - count
+        return result
+
+    def __or__(self, other):
+        """
+        Union is the maximum of the corresponding counts in the two input counters.
+        """
+        if not isinstance(other, ConditionalFreqDist):
+            return NotImplemented
+        result = ConditionalFreqDist()
+        for cond in self.conditions():
+            newfreqdist = self[cond] | other[cond]
+            if newfreqdist:
+                result[cond] = newfreqdist
+        for cond in other.conditions():
+            if cond not in self.conditions():
+                for elem, count in other[cond].items():
+                    if count > 0:
+                        result[cond][elem] = count
+        return result
+
+    def __and__(self, other):
+        """
+        Intersection is the minimum of corresponding counts.
+        """
+        if not isinstance(other, ConditionalFreqDist):
+            return NotImplemented
+        result = ConditionalFreqDist()
+        for cond in self.conditions():
+            newfreqdist = self[cond] & other[cond]
+            if newfreqdist:
+                result[cond] = newfreqdist
+        return result
+
+    # @total_ordering doesn't work here, since the class inherits from a builtin class
+    def __le__(self, other):
+        if not isinstance(other, ConditionalFreqDist):
+            raise_unorderable_types("<=", self, other)
+        return set(self.conditions()).issubset(other.conditions()) \
+               and all(self[c] <= other[c] for c in self.conditions())
+
+    def __lt__(self, other):
+        if not isinstance(other, ConditionalFreqDist):
+            raise_unorderable_types("<", self, other)
+        return self <= other and self != other
+
+    def __ge__(self, other):
+        if not isinstance(other, ConditionalFreqDist):
+            raise_unorderable_types(">=", self, other)
+        return other <= self
+
+    def __gt__(self, other):
+        if not isinstance(other, ConditionalFreqDist):
+            raise_unorderable_types(">", self, other)
+        return other < self
+
+    def __repr__(self):
+        """
+        Return a string representation of this ``ConditionalFreqDist``.
+
+        :rtype: str
+        """
+        return '<ConditionalFreqDist with %d conditions>' % len(self)
+
+
+@compat.python_2_unicode_compatible
+@add_metaclass(ABCMeta)
+class ConditionalProbDistI(dict):
+    """
+    A collection of probability distributions for a single experiment
+    run under different conditions.  Conditional probability
+    distributions are used to estimate the likelihood of each sample,
+    given the condition under which the experiment was run.  For
+    example, a conditional probability distribution could be used to
+    estimate the probability of each word type in a document, given
+    the length of the word type.  Formally, a conditional probability
+    distribution can be defined as a function that maps from each
+    condition to the ``ProbDist`` for the experiment under that
+    condition.
+    """
+    @abstractmethod
+    def __init__(self):
+        """
+        Classes inheriting from ConditionalProbDistI should implement __init__.
+        """
+
+    def conditions(self):
+        """
+        Return a list of the conditions that are represented by
+        this ``ConditionalProbDist``.  Use the indexing operator to
+        access the probability distribution for a given condition.
+
+        :rtype: list
+        """
+        return list(self.keys())
+
+    def __repr__(self):
+        """
+        Return a string representation of this ``ConditionalProbDist``.
+
+        :rtype: str
+        """
+        return '<%s with %d conditions>' % (type(self).__name__, len(self))
+
+
+class ConditionalProbDist(ConditionalProbDistI):
+    """
+    A conditional probability distribution modeling the experiments
+    that were used to generate a conditional frequency distribution.
+    A ConditionalProbDist is constructed from a
+    ``ConditionalFreqDist`` and a ``ProbDist`` factory:
+
+    - The ``ConditionalFreqDist`` specifies the frequency
+      distribution for each condition.
+    - The ``ProbDist`` factory is a function that takes a
+      condition's frequency distribution, and returns its
+      probability distribution.  A ``ProbDist`` class's name (such as
+      ``MLEProbDist`` or ``HeldoutProbDist``) can be used to specify
+      that class's constructor.
+
+    The first argument to the ``ProbDist`` factory is the frequency
+    distribution that it should model; and the remaining arguments are
+    specified by the ``factory_args`` parameter to the
+    ``ConditionalProbDist`` constructor.  For example, the following
+    code constructs a ``ConditionalProbDist``, where the probability
+    distribution for each condition is an ``ELEProbDist`` with 10 bins:
+
+        >>> from nltk.corpus import brown
+        >>> from nltk.probability import ConditionalFreqDist
+        >>> from nltk.probability import ConditionalProbDist, ELEProbDist
+        >>> cfdist = ConditionalFreqDist(brown.tagged_words()[:5000])
+        >>> cpdist = ConditionalProbDist(cfdist, ELEProbDist, 10)
+        >>> cpdist['passed'].max()
+        'VBD'
+        >>> cpdist['passed'].prob('VBD')
+        0.423...
+
+    """
+    def __init__(self, cfdist, probdist_factory,
+                 *factory_args, **factory_kw_args):
+        """
+        Construct a new conditional probability distribution, based on
+        the given conditional frequency distribution and ``ProbDist``
+        factory.
+
+        :type cfdist: ConditionalFreqDist
+        :param cfdist: The ``ConditionalFreqDist`` specifying the
+            frequency distribution for each condition.
+        :type probdist_factory: class or function
+        :param probdist_factory: The function or class that maps
+            a condition's frequency distribution to its probability
+            distribution.  The function is called with the frequency
+            distribution as its first argument,
+            ``factory_args`` as its remaining arguments, and
+            ``factory_kw_args`` as keyword arguments.
+        :type factory_args: (any)
+        :param factory_args: Extra arguments for ``probdist_factory``.
+            These arguments are usually used to specify extra
+            properties for the probability distributions of individual
+            conditions, such as the number of bins they contain.
+        :type factory_kw_args: (any)
+        :param factory_kw_args: Extra keyword arguments for ``probdist_factory``.
+        """
+        self._probdist_factory = probdist_factory
+        self._factory_args = factory_args
+        self._factory_kw_args = factory_kw_args
+
+        for condition in cfdist:
+            self[condition] = probdist_factory(cfdist[condition],
+                                               *factory_args, **factory_kw_args)
+
+    def __missing__(self, key):
+        self[key] = self._probdist_factory(FreqDist(),
+                                           *self._factory_args,
+                                           **self._factory_kw_args)
+        return self[key]
+
+class DictionaryConditionalProbDist(ConditionalProbDistI):
+    """
+    An alternative ConditionalProbDist that simply wraps a dictionary of
+    ProbDists rather than creating these from FreqDists.
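+
+    A small illustrative example, with hand-picked toy probabilities:
+
+        >>> from nltk.probability import DictionaryProbDist, DictionaryConditionalProbDist
+        >>> cpdist = DictionaryConditionalProbDist(
+        ...     {'run': DictionaryProbDist({'VB': 0.7, 'NN': 0.3})})
+        >>> cpdist['run'].prob('VB')
+        0.7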
+    """
+
+    def __init__(self, probdist_dict):
+        """
+        :param probdist_dict: a dictionary containing the probdists indexed
+            by the conditions
+        :type probdist_dict: dict any -> probdist
+        """
+        self.update(probdist_dict)
+
+    def __missing__(self, key):
+        self[key] = DictionaryProbDist()
+        return self[key]
+
+##//////////////////////////////////////////////////////
+## Adding in log-space.
+##//////////////////////////////////////////////////////
+
+# If the difference is bigger than this, then just take the bigger one:
+_ADD_LOGS_MAX_DIFF = math.log(1e-30, 2)
+
+def add_logs(logx, logy):
+    """
+    Given two numbers ``logx`` = *log(x)* and ``logy`` = *log(y)*, return
+    *log(x+y)*.  Conceptually, this is the same as returning
+    ``log(2**(logx)+2**(logy))``, but the actual implementation
+    avoids overflow errors that could result from direct computation.
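+
+    A quick illustrative check (base-2 logs, as used throughout this module):
+
+        >>> import math
+        >>> from nltk.probability import add_logs
+        >>> round(2 ** add_logs(math.log(0.25, 2), math.log(0.25, 2)), 3)
+        0.5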
+    """
+    if (logx < logy + _ADD_LOGS_MAX_DIFF):
+        return logy
+    if (logy < logx + _ADD_LOGS_MAX_DIFF):
+        return logx
+    base = min(logx, logy)
+    return base + math.log(2**(logx-base) + 2**(logy-base), 2)
+
+def sum_logs(logs):
+    return (reduce(add_logs, logs[1:], logs[0]) if len(logs) != 0 else _NINF)
+
+##//////////////////////////////////////////////////////
+##  Probabilistic Mix-in
+##//////////////////////////////////////////////////////
+
+class ProbabilisticMixIn(object):
+    """
+    A mix-in class to associate probabilities with other classes
+    (trees, rules, etc.).  To use the ``ProbabilisticMixIn`` class,
+    define a new class that derives from an existing class and from
+    ProbabilisticMixIn.  You will need to define a new constructor for
+    the new class, which explicitly calls the constructors of both its
+    parent classes.  For example:
+
+        >>> from nltk.probability import ProbabilisticMixIn
+        >>> class A:
+        ...     def __init__(self, x, y): self.data = (x,y)
+        ...
+        >>> class ProbabilisticA(A, ProbabilisticMixIn):
+        ...     def __init__(self, x, y, **prob_kwarg):
+        ...         A.__init__(self, x, y)
+        ...         ProbabilisticMixIn.__init__(self, **prob_kwarg)
+
+    See the documentation for the ProbabilisticMixIn
+    ``constructor<__init__>`` for information about the arguments it
+    expects.
+
+    You should generally also redefine the string representation
+    methods, the comparison methods, and the hashing method.
+    """
+    def __init__(self, **kwargs):
+        """
+        Initialize this object's probability.  This initializer should
+        be called by subclass constructors.  ``prob`` should generally be
+        the first argument for those constructors.
+
+        :param prob: The probability associated with the object.
+        :type prob: float
+        :param logprob: The log of the probability associated with
+            the object.
+        :type logprob: float
+        """
+        if 'prob' in kwargs:
+            if 'logprob' in kwargs:
+                raise TypeError('Must specify either prob or logprob '
+                                '(not both)')
+            else:
+                ProbabilisticMixIn.set_prob(self, kwargs['prob'])
+        elif 'logprob' in kwargs:
+            ProbabilisticMixIn.set_logprob(self, kwargs['logprob'])
+        else:
+            self.__prob = self.__logprob = None
+
+    def set_prob(self, prob):
+        """
+        Set the probability associated with this object to ``prob``.
+
+        :param prob: The new probability
+        :type prob: float
+        """
+        self.__prob = prob
+        self.__logprob = None
+
+    def set_logprob(self, logprob):
+        """
+        Set the log probability associated with this object to
+        ``logprob``.  I.e., set the probability associated with this
+        object to ``2**(logprob)``.
+
+        :param logprob: The new log probability
+        :type logprob: float
+        """
+        self.__logprob = logprob
+        self.__prob = None
+
+    def prob(self):
+        """
+        Return the probability associated with this object.
+
+        :rtype: float
+        """
+        if self.__prob is None:
+            if self.__logprob is None: return None
+            self.__prob = 2**(self.__logprob)
+        return self.__prob
+
+    def logprob(self):
+        """
+        Return ``log(p)``, where ``p`` is the probability associated
+        with this object.
+
+        :rtype: float
+        """
+        if self.__logprob is None:
+            if self.__prob is None: return None
+            self.__logprob = math.log(self.__prob, 2)
+        return self.__logprob
+
+class ImmutableProbabilisticMixIn(ProbabilisticMixIn):
+    def set_prob(self, prob):
+        raise ValueError('%s is immutable' % self.__class__.__name__)
+    def set_logprob(self, prob):
+        raise ValueError('%s is immutable' % self.__class__.__name__)
+
+## Helper function for processing keyword arguments
+
+def _get_kwarg(kwargs, key, default):
+    if key in kwargs:
+        arg = kwargs[key]
+        del kwargs[key]
+    else:
+        arg = default
+    return arg
+
+##//////////////////////////////////////////////////////
+##  Demonstration
+##//////////////////////////////////////////////////////
+
+def _create_rand_fdist(numsamples, numoutcomes):
+    """
+    Create a new frequency distribution, with random samples.  The
+    samples are numbers from 1 to ``numsamples``, and are generated by
+    summing two numbers, each of which has a uniform distribution.
+    """
+    import random
+    fdist = FreqDist()
+    for x in range(numoutcomes):
+        y = (random.randint(1, (1 + numsamples) // 2) +
+             random.randint(0, numsamples // 2))
+        fdist[y] += 1
+    return fdist
+
+def _create_sum_pdist(numsamples):
+    """
+    Return the true probability distribution for the experiment
+    ``_create_rand_fdist(numsamples, x)``.
+    """
+    fdist = FreqDist()
+    for x in range(1, (1 + numsamples) // 2 + 1):
+        for y in range(0, numsamples // 2 + 1):
+            fdist[x+y] += 1
+    return MLEProbDist(fdist)
+
+def demo(numsamples=6, numoutcomes=500):
+    """
+    A demonstration of frequency distributions and probability
+    distributions.  This demonstration creates three frequency
+    distributions by sampling a random process over ``numsamples``
+    possible samples; each frequency distribution records
+    ``numoutcomes`` outcomes.  These frequency distributions are
+    then used to build several probability distributions.  Finally, the
+    probability estimates of these distributions are compared to the
+    actual probability of each sample.
+
+    :type numsamples: int
+    :param numsamples: The number of samples to use in each demo
+        frequency distribution.
+    :type numoutcomes: int
+    :param numoutcomes: The total number of outcomes for each
+        demo frequency distribution.  These outcomes are divided into
+        ``numsamples`` bins.
+    :rtype: None
+    """
+
+    # Randomly sample a stochastic process three times.
+    fdist1 = _create_rand_fdist(numsamples, numoutcomes)
+    fdist2 = _create_rand_fdist(numsamples, numoutcomes)
+    fdist3 = _create_rand_fdist(numsamples, numoutcomes)
+
+    # Use our samples to create probability distributions.
+    pdists = [
+        MLEProbDist(fdist1),
+        LidstoneProbDist(fdist1, 0.5, numsamples),
+        HeldoutProbDist(fdist1, fdist2, numsamples),
+        HeldoutProbDist(fdist2, fdist1, numsamples),
+        CrossValidationProbDist([fdist1, fdist2, fdist3], numsamples),
+        SimpleGoodTuringProbDist(fdist1),
+        SimpleGoodTuringProbDist(fdist1, 7),
+        _create_sum_pdist(numsamples),
+    ]
+
+    # Find the probability of each sample.
+    vals = []
+    for n in range(1,numsamples+1):
+        vals.append(tuple([n, fdist1.freq(n)] +
+                          [pdist.prob(n) for pdist in pdists]))
+
+    # Print the results in a formatted table.
+    print(('%d samples (1-%d); %d outcomes were sampled for each FreqDist' %
+           (numsamples, numsamples, numoutcomes)))
+    print('='*9*(len(pdists)+2))
+    FORMATSTR = '      FreqDist '+ '%8s '*(len(pdists)-1) + '|  Actual'
+    print(FORMATSTR % tuple(repr(pdist)[1:9] for pdist in pdists[:-1]))
+    print('-'*9*(len(pdists)+2))
+    FORMATSTR = '%3d   %8.6f ' + '%8.6f '*(len(pdists)-1) + '| %8.6f'
+    for val in vals:
+        print(FORMATSTR % val)
+
+    # Print the totals for each column (should all be 1.0)
+    zvals = list(zip(*vals))
+    sums = [sum(val) for val in zvals[1:]]
+    print('-'*9*(len(pdists)+2))
+    FORMATSTR = 'Total ' + '%8.6f '*(len(pdists)) + '| %8.6f'
+    print(FORMATSTR % tuple(sums))
+    print('='*9*(len(pdists)+2))
+
+    # Display the distributions themselves, if they're short enough.
+    if len("%s" % fdist1) < 70:
+        print('  fdist1: %s' % fdist1)
+        print('  fdist2: %s' % fdist2)
+        print('  fdist3: %s' % fdist3)
+    print()
+
+    print('Generating:')
+    for pdist in pdists:
+        fdist = FreqDist(pdist.generate() for i in range(5000))
+        print('%20s %s' % (pdist.__class__.__name__[:20], ("%s" % fdist)[:55]))
+    print()
+
+def gt_demo():
+    from nltk import corpus
+    emma_words = corpus.gutenberg.words('austen-emma.txt')
+    fd = FreqDist(emma_words)
+    sgt = SimpleGoodTuringProbDist(fd)
+    print('%18s %8s  %14s' \
+        % ("word", "frequency", "SimpleGoodTuring"))
+    fd_keys_sorted = (key for key, value in sorted(fd.items(), key=lambda item: item[1], reverse=True))
+    for key in fd_keys_sorted:
+        print('%18s %8d  %14e' \
+            % (key, fd[key], sgt.prob(key)))
+
+if __name__ == '__main__':
+    demo(6, 10)
+    demo(5, 5000)
+    gt_demo()
+
+__all__ = ['ConditionalFreqDist', 'ConditionalProbDist',
+           'ConditionalProbDistI', 'CrossValidationProbDist',
+           'DictionaryConditionalProbDist', 'DictionaryProbDist', 'ELEProbDist',
+           'FreqDist', 'SimpleGoodTuringProbDist', 'HeldoutProbDist',
+           'ImmutableProbabilisticMixIn', 'LaplaceProbDist', 'LidstoneProbDist',
+           'MLEProbDist', 'MutableProbDist', 'KneserNeyProbDist', 'ProbDistI', 'ProbabilisticMixIn',
+           'UniformProbDist', 'WittenBellProbDist', 'add_logs',
+           'log_likelihood', 'sum_logs', 'entropy']
diff --git a/nlp_resource_data/nltk/probability.pyc b/nlp_resource_data/nltk/probability.pyc
new file mode 100755 (executable)
index 0000000..863dae5
Binary files /dev/null and b/nlp_resource_data/nltk/probability.pyc differ
diff --git a/nlp_resource_data/nltk/sem/__init__.py b/nlp_resource_data/nltk/sem/__init__.py
new file mode 100755 (executable)
index 0000000..7bad174
--- /dev/null
@@ -0,0 +1,61 @@
+# Natural Language Toolkit: Semantic Interpretation
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+NLTK Semantic Interpretation Package
+
+This package contains classes for representing semantic structure in
+formulas of first-order logic and for evaluating such formulas in
+set-theoretic models.
+
+    >>> from nltk.sem import logic
+    >>> logic._counter._value = 0
+
+The package has two main components:
+
+ - ``logic`` provides support for analyzing expressions of First
+   Order Logic (FOL).
+ - ``evaluate`` allows users to recursively determine truth in a
+   model for formulas of FOL.
+
+A model consists of a domain of discourse and a valuation function,
+which assigns values to non-logical constants. We assume that entities
+in the domain are represented as strings such as ``'b1'``, ``'g1'``,
+etc. A ``Valuation`` is initialized with a list of (symbol, value)
+pairs, where values are entities, sets of entities or sets of tuples
+of entities.
+The domain of discourse can be inferred from the valuation, and a model
+is then created with the domain and valuation as parameters.
+
+    >>> from nltk.sem import Valuation, Model
+    >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),
+    ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])),
+    ... ('dog', set(['d1'])),
+    ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))]
+    >>> val = Valuation(v)
+    >>> dom = val.domain
+    >>> m = Model(dom, val)
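+
+    Once a model is built, formulas can be evaluated against it under a
+    variable assignment (a short illustrative continuation of the example
+    above):
+
+    >>> from nltk.sem import Assignment
+    >>> g = Assignment(dom)
+    >>> m.evaluate('love(adam, betty)', g)
+    True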
+"""
+
+from nltk.sem.util import (parse_sents, interpret_sents, evaluate_sents,
+                           root_semrep)
+from nltk.sem.evaluate import (Valuation, Assignment, Model, Undefined,
+                               is_rel, set2rel, arity, read_valuation)
+from nltk.sem.logic import (boolean_ops, binding_ops, equality_preds,
+                           read_logic, Variable, Expression,
+                           ApplicationExpression, LogicalExpressionException)
+from nltk.sem.skolemize import skolemize
+from nltk.sem.lfg import FStructure
+from nltk.sem.relextract import (extract_rels, rtuple, clause)
+from nltk.sem.boxer import Boxer
+from nltk.sem.drt import DrtExpression, DRS
+
+# from nltk.sem.glue import Glue
+# from nltk.sem.hole import HoleSemantics
+# from nltk.sem.cooper_storage import CooperStore
+
+# don't import chat80 as its names are too generic
diff --git a/nlp_resource_data/nltk/sem/__init__.pyc b/nlp_resource_data/nltk/sem/__init__.pyc
new file mode 100755 (executable)
index 0000000..7cb5d8b
Binary files /dev/null and b/nlp_resource_data/nltk/sem/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/sem/boxer.py b/nlp_resource_data/nltk/sem/boxer.py
new file mode 100755 (executable)
index 0000000..a56017f
--- /dev/null
@@ -0,0 +1,1261 @@
+# Natural Language Toolkit: Interface to Boxer
+# <http://svn.ask.it.usyd.edu.au/trac/candc/wiki/boxer>
+#
+# Author: Dan Garrette <dhgarrette@gmail.com>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+An interface to Boxer.
+
+This interface relies on the latest development (subversion) version of
+C&C and Boxer.
+
+Usage:
+  Set the environment variable CANDC to the bin directory of your CandC installation.
+  The models directory should be in the CandC root directory.
+  For example:
+     /path/to/candc/
+        bin/
+            candc
+            boxer
+        models/
+            boxer/
+"""
+from __future__ import print_function, unicode_literals
+
+import os
+import re
+import operator
+import subprocess
+from optparse import OptionParser
+import tempfile
+from functools import reduce
+
+from nltk.internals import find_binary
+
+from nltk.sem.logic import (ExpectedMoreTokensException, LogicalExpressionException,
+                            UnexpectedTokenException, Variable)
+
+from nltk.sem.drt import (DRS, DrtApplicationExpression, DrtEqualityExpression,
+                          DrtNegatedExpression, DrtOrExpression, DrtParser,
+                          DrtProposition, DrtTokens, DrtVariableExpression)
+
+from nltk.compat import python_2_unicode_compatible
+
+class Boxer(object):
+    """
+    This class is an interface to Johan Bos's program Boxer, a wide-coverage
+    semantic parser that produces Discourse Representation Structures (DRSs).
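+
+    A minimal usage sketch (illustrative only; it assumes the C&C and Boxer
+    binaries are installed and discoverable, e.g. via the CANDC environment
+    variable)::
+
+        boxer = Boxer()
+        drs = boxer.interpret('John walks.')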
+    """
+
+    def __init__(self, boxer_drs_interpreter=None, elimeq=False, bin_dir=None, verbose=False, resolve=True):
+        """
+        :param boxer_drs_interpreter: A class that converts from the
+        ``AbstractBoxerDrs`` object hierarchy to a different object.  The
+        default is ``NltkDrtBoxerDrsInterpreter``, which converts to the NLTK
+        DRT hierarchy.
+        :param elimeq: When set to true, Boxer removes all equalities from the
+        DRSs and discourse referents standing in the equality relation are
+        unified, but only if this can be done in a meaning-preserving manner.
+        :param resolve: When set to true, Boxer will resolve all anaphoric DRSs and perform merge-reduction. 
+        Resolution follows Van der Sandt's theory of binding and accommodation.
+        """
+        if boxer_drs_interpreter is None:
+            boxer_drs_interpreter = NltkDrtBoxerDrsInterpreter()
+        self._boxer_drs_interpreter = boxer_drs_interpreter
+
+        self._resolve = resolve
+        self._elimeq = elimeq
+
+        self.set_bin_dir(bin_dir, verbose)
+
+    def set_bin_dir(self, bin_dir, verbose=False):
+        self._candc_bin = self._find_binary('candc', bin_dir, verbose)
+        self._candc_models_path = os.path.normpath(os.path.join(self._candc_bin[:-5], '../models'))
+        self._boxer_bin = self._find_binary('boxer', bin_dir, verbose)
+
+    def interpret(self, input, discourse_id=None, question=False, verbose=False):
+        """
+        Use Boxer to give a first order representation.
+
+        :param input: str Input sentence to parse
+        :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate.
+        :return: ``drt.DrtExpression``
+        """
+        discourse_ids = ([discourse_id] if discourse_id is not None else None)
+        d, = self.interpret_multi_sents([[input]], discourse_ids, question, verbose)
+        if not d:
+            raise Exception('Unable to interpret: "{0}"'.format(input))
+        return d
+
+    def interpret_multi(self, input, discourse_id=None, question=False, verbose=False):
+        """
+        Use Boxer to give a first order representation.
+
+        :param input: list of str Input sentences to parse as a single discourse
+        :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate.
+        :return: ``drt.DrtExpression``
+        """
+        discourse_ids = ([discourse_id] if discourse_id is not None else None)
+        d, = self.interpret_multi_sents([input], discourse_ids, question, verbose)
+        if not d:
+            raise Exception('Unable to interpret: "{0}"'.format(input))
+        return d
+
+    def interpret_sents(self, inputs, discourse_ids=None, question=False, verbose=False):
+        """
+        Use Boxer to give a first order representation.
+
+        :param inputs: list of str Input sentences to parse as individual discourses
+        :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
+        :return: list of ``drt.DrtExpression``
+        """
+        return self.interpret_multi_sents([[input] for input in inputs], discourse_ids, question, verbose)
+
+    def interpret_multi_sents(self, inputs, discourse_ids=None, question=False, verbose=False):
+        """
+        Use Boxer to give a first order representation.
+
+        :param inputs: list of list of str Input discourses to parse
+        :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
+        :return: ``drt.DrtExpression``
+        """
+        if discourse_ids is not None:
+            assert len(inputs) == len(discourse_ids)
+            assert reduce(operator.and_, (id is not None for id in discourse_ids))
+            use_disc_id = True
+        else:
+            discourse_ids = list(map(str, range(len(inputs))))
+            use_disc_id = False
+
+        candc_out = self._call_candc(inputs, discourse_ids, question, verbose=verbose)
+        boxer_out = self._call_boxer(candc_out, verbose=verbose)
+
+#        if 'ERROR: input file contains no ccg/2 terms.' in boxer_out:
+#            raise UnparseableInputException('Could not parse with candc: "%s"' % input_str)
+
+        drs_dict = self._parse_to_drs_dict(boxer_out, use_disc_id)
+        return [drs_dict.get(id, None) for id in discourse_ids]
+
+    def _call_candc(self, inputs, discourse_ids, question, verbose=False):
+        """
+        Call the ``candc`` binary with the given input.
+
+        :param inputs: list of list of str Input discourses to parse
+        :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
+        :return: stdout
+        """
+        args = ['--models', os.path.join(self._candc_models_path, ['boxer','questions'][question]),
+                '--candc-printer', 'boxer']
+        return self._call('\n'.join(sum((["<META>'{0}'".format(id)] + d for d,id in zip(inputs,discourse_ids)), [])), self._candc_bin, args, verbose)
+
+    def _call_boxer(self, candc_out, verbose=False):
+        """
+        Call the ``boxer`` binary with the given input.
+
+        :param candc_out: str output from C&C parser
+        :return: stdout
+        """
+        f = None
+        try:
+            fd, temp_filename = tempfile.mkstemp(prefix='boxer-', suffix='.in', text=True)
+            f = os.fdopen(fd, 'w')
+            f.write(candc_out)
+        finally:
+            if f: f.close()
+
+        args = ['--box', 'false',
+                '--semantics', 'drs',
+                #'--flat', 'false', # removed from boxer
+                '--resolve', ['false','true'][self._resolve],
+                '--elimeq', ['false','true'][self._elimeq],
+                '--format', 'prolog',
+                '--instantiate', 'true',
+                '--input', temp_filename]
+        stdout = self._call(None, self._boxer_bin, args, verbose)
+        os.remove(temp_filename)
+        return stdout
+
+    def _find_binary(self, name, bin_dir, verbose=False):
+        return find_binary(name,
+            path_to_bin=bin_dir,
+            env_vars=['CANDC'],
+            url='http://svn.ask.it.usyd.edu.au/trac/candc/',
+            binary_names=[name, name + '.exe'],
+            verbose=verbose)
+
+    def _call(self, input_str, binary, args=[], verbose=False):
+        """
+        Call the binary with the given input.
+
+        :param input_str: A string whose contents are used as stdin.
+        :param binary: The location of the binary to call
+        :param args: A list of command-line arguments.
+        :return: stdout
+        """
+        if verbose:
+            print('Calling:', binary)
+            print('Args:', args)
+            print('Input:', input_str)
+            print('Command:', binary + ' ' + ' '.join(args))
+
+        # Call via a subprocess
+        if input_str is None:
+            cmd = [binary] + args
+            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        else:
+            cmd = 'echo "{0}" | {1} {2}'.format(input_str, binary, ' '.join(args))
+            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
+        stdout, stderr = p.communicate()
+
+        if verbose:
+            print('Return code:', p.returncode)
+            if stdout: print('stdout:\n', stdout, '\n')
+            if stderr: print('stderr:\n', stderr, '\n')
+        if p.returncode != 0:
+            raise Exception('ERROR CALLING: {0} {1}\nReturncode: {2}\n{3}'.format(binary, ' '.join(args), p.returncode, stderr))
+
+        return stdout
+
+    def _parse_to_drs_dict(self, boxer_out, use_disc_id):
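+        # Boxer's prolog output is scanned line by line: each discourse is
+        # introduced by a line of the form "id(<discourse_id>,<drs_id>)."
+        # followed by a "sem(<drs_id>,[...],<drs term>)." line, from which the
+        # DRS term is extracted and parsed (format as assumed by the parsing
+        # logic below).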
+        lines = boxer_out.split('\n')
+        drs_dict = {}
+        i = 0
+        while i < len(lines):
+            line = lines[i]
+            if line.startswith('id('):
+                comma_idx = line.index(',')
+                discourse_id = line[3:comma_idx]
+                if discourse_id[0] == "'" and discourse_id[-1] == "'":
+                    discourse_id = discourse_id[1:-1]
+                drs_id = line[comma_idx+1:line.index(')')]
+                i += 1
+                line = lines[i]
+                assert line.startswith('sem({0},'.format(drs_id))
+                if line[-4:] == "').'":
+                    line = line[:-4] + ")."
+                assert line.endswith(').'), "can't parse line: {0}".format(line)
+
+                search_start = len('sem({0},['.format(drs_id))
+                brace_count = 1
+                drs_start = -1
+                for j, c in enumerate(line[search_start:]):
+                    if c == '[':
+                        brace_count += 1
+                    if c == ']':
+                        brace_count -= 1
+                        if brace_count == 0:
+                            drs_start = search_start + j + 1
+                            if line[drs_start:drs_start+3] == "','":
+                                drs_start = drs_start + 3
+                            else:
+                                drs_start = drs_start + 1
+                            break
+                assert drs_start > -1
+
+                drs_input = line[drs_start:-2].strip()
+                parsed = self._parse_drs(drs_input, discourse_id, use_disc_id)
+                drs_dict[discourse_id] = self._boxer_drs_interpreter.interpret(parsed)
+            i += 1
+        return drs_dict
+
+    def _parse_drs(self, drs_string, discourse_id, use_disc_id):
+        return BoxerOutputDrsParser([None,discourse_id][use_disc_id]).parse(drs_string)
+
+
+class BoxerOutputDrsParser(DrtParser):
+    def __init__(self, discourse_id=None):
+        """
+        This class is used to parse the Prolog DRS output from Boxer into a
+        hierarchy of python objects.
+        """
+        DrtParser.__init__(self)
+        self.discourse_id = discourse_id
+        self.sentence_id_offset = None
+        self.quote_chars = [("'", "'", "\\", False)]
+
+    def parse(self, data, signature=None):
+        return DrtParser.parse(self, data, signature)
+
+    def get_all_symbols(self):
+        return ['(', ')', ',', '[', ']', ':']
+
+    def handle(self, tok, context):
+        return self.handle_drs(tok)
+
+    def attempt_adjuncts(self, expression, context):
+        return expression
+
+    def parse_condition(self, indices):
+        """
+        Parse a DRS condition
+
+        :return: list of ``DrtExpression``
+        """
+        tok = self.token()
+        accum = self.handle_condition(tok, indices)
+        if accum is None:
+            raise UnexpectedTokenException(tok)
+        return accum
+
+    def handle_drs(self, tok):
+        if tok == 'drs':
+            return self.parse_drs()
+        elif tok in ['merge', 'smerge']:
+            return self._handle_binary_expression(self._make_merge_expression)(None, [])
+        elif tok in ['alfa']:
+            return self._handle_alfa(self._make_merge_expression)(None, [])
+
+    def handle_condition(self, tok, indices):
+        """
+        Handle a DRS condition
+
+        :param indices: list of int
+        :return: list of ``DrtExpression``
+        """
+        if tok == 'not':
+            return [self._handle_not()]
+
+        if tok == 'or':
+            conds = [self._handle_binary_expression(self._make_or_expression)]
+        elif tok == 'imp':
+            conds = [self._handle_binary_expression(self._make_imp_expression)]
+        elif tok == 'eq':
+            conds = [self._handle_eq()]
+        elif tok == 'prop':
+            conds = [self._handle_prop()]
+
+        elif tok == 'pred':
+            conds = [self._handle_pred()]
+        elif tok == 'named':
+            conds = [self._handle_named()]
+        elif tok == 'rel':
+            conds = [self._handle_rel()]
+        elif tok == 'timex':
+            conds = self._handle_timex()
+        elif tok == 'card':
+            conds = [self._handle_card()]
+
+        elif tok == 'whq':
+            conds = [self._handle_whq()]
+        elif tok == 'duplex':
+            conds = [self._handle_duplex()]
+
+        else:
+            conds = []
+
+        return sum([[cond(sent_index, word_indices) for cond in conds] for sent_index, word_indices in self._sent_and_word_indices(indices)], [])
+
+    def _handle_not(self):
+        self.assertToken(self.token(), '(')
+        drs = self.process_next_expression(None)
+        self.assertToken(self.token(), ')')
+        return BoxerNot(drs)
+
+    def _handle_pred(self):
+        #pred(_G3943, dog, n, 0)
+        self.assertToken(self.token(), '(')
+        variable = self.parse_variable()
+        self.assertToken(self.token(), ',')
+        name = self.token()
+        self.assertToken(self.token(), ',')
+        pos = self.token()
+        self.assertToken(self.token(), ',')
+        sense = int(self.token())
+        self.assertToken(self.token(), ')')
+
+        def _handle_pred_f(sent_index, word_indices):
+            return BoxerPred(self.discourse_id, sent_index, word_indices, variable, name, pos, sense)
+        return _handle_pred_f
+
+    def _handle_duplex(self):
+        #duplex(whq, drs(...), var, drs(...))
+        self.assertToken(self.token(), '(')
+        # self.assertToken(self.token(), '[')
+        ans_types = []
+        # while self.token(0) != ']':
+        #     cat = self.token()
+        #     self.assertToken(self.token(), ':')
+        #     if cat == 'des':
+        #         ans_types.append(self.token())
+        #     elif cat == 'num':
+        #         ans_types.append('number')
+        #         typ = self.token()
+        #         if typ == 'cou':
+        #             ans_types.append('count')
+        #         else:
+        #             ans_types.append(typ)
+        #     else:
+        #         ans_types.append(self.token())
+        # self.token() #swallow the ']'
+      
+        self.assertToken(self.token(), 'whq')
+        self.assertToken(self.token(), ',')
+        d1 = self.process_next_expression(None)
+        self.assertToken(self.token(), ',')
+        ref = self.parse_variable()
+        self.assertToken(self.token(), ',')
+        d2 = self.process_next_expression(None)
+        self.assertToken(self.token(), ')')
+        return lambda sent_index, word_indices: BoxerWhq(self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2)
+
+
+    def _handle_named(self):
+        #named(x0, john, per, 0)
+        self.assertToken(self.token(), '(')
+        variable = self.parse_variable()
+        self.assertToken(self.token(), ',')
+        name = self.token()
+        self.assertToken(self.token(), ',')
+        type = self.token()
+        self.assertToken(self.token(), ',')
+        sense = self.token() # as per boxer rev 2554
+        self.assertToken(self.token(), ')')
+        return lambda sent_index, word_indices: BoxerNamed(self.discourse_id, sent_index, word_indices, variable, name, type, sense)
+
+    def _handle_rel(self):
+        #rel(_G3993, _G3943, agent, 0)
+        self.assertToken(self.token(), '(')
+        var1 = self.parse_variable()
+        self.assertToken(self.token(), ',')
+        var2 = self.parse_variable()
+        self.assertToken(self.token(), ',')
+        rel = self.token()
+        self.assertToken(self.token(), ',')
+        sense = int(self.token())
+        self.assertToken(self.token(), ')')
+        return lambda sent_index, word_indices: BoxerRel(self.discourse_id, sent_index, word_indices, var1, var2, rel, sense)
+
+    def _handle_timex(self):
+        #timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX'))
+        self.assertToken(self.token(), '(')
+        arg = self.parse_variable()
+        self.assertToken(self.token(), ',')
+        new_conds = self._handle_time_expression(arg)
+        self.assertToken(self.token(), ')')
+        return new_conds
+
+    def _handle_time_expression(self, arg):
+        #date([]: (+), []:'XXXX', [1004]:'04', []:'XX')
+        tok = self.token()
+        self.assertToken(self.token(), '(')
+        if tok == 'date':
+            conds = self._handle_date(arg)
+        elif tok == 'time':
+            conds = self._handle_time(arg)
+        else:
+            return None
+        self.assertToken(self.token(), ')')
+        # cond=cond binds each condition eagerly, avoiding late-binding closures
+        return [lambda sent_index, word_indices: BoxerPred(self.discourse_id, sent_index, word_indices, arg, tok, 'n', 0)] + \
+               [lambda sent_index, word_indices, cond=cond: cond for cond in conds]
+
+    def _handle_date(self, arg):
+        #[]: (+), []:'XXXX', [1004]:'04', []:'XX'
+        conds = []
+        (sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list())
+        self.assertToken(self.token(), '(')
+        pol = self.token()
+        self.assertToken(self.token(), ')')
+        conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_pol_{0}'.format(pol), 'a', 0))
+        self.assertToken(self.token(), ',')
+
+        (sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list())
+        year = self.token()
+        if year != 'XXXX':
+            year = year.replace(':', '_')
+            conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_year_{0}'.format(year), 'a', 0))
+        self.assertToken(self.token(), ',')
+
+        (sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list())
+        month = self.token()
+        if month != 'XX':
+            conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_month_{0}'.format(month), 'a', 0))
+        self.assertToken(self.token(), ',')
+
+        (sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list())
+        day = self.token()
+        if day != 'XX':
+            conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_day_{0}'.format(day), 'a', 0))
+
+        return conds
+
+    def _handle_time(self, arg):
+        #time([1018]:'18', []:'XX', []:'XX')
+        conds = []
+        self._parse_index_list()
+        hour = self.token()
+        if hour != 'XX':
+            conds.append(self._make_atom('r_hour_2',arg,hour))
+        self.assertToken(self.token(), ',')
+
+        self._parse_index_list()
+        min = self.token()
+        if min != 'XX':
+            conds.append(self._make_atom('r_min_2',arg,min))
+        self.assertToken(self.token(), ',')
+
+        self._parse_index_list()
+        sec = self.token()
+        if sec != 'XX':
+            conds.append(self._make_atom('r_sec_2',arg,sec))
+
+        return conds
+
+    def _handle_card(self):
+        #card(_G18535, 28, ge)
+        self.assertToken(self.token(), '(')
+        variable = self.parse_variable()
+        self.assertToken(self.token(), ',')
+        value = self.token()
+        self.assertToken(self.token(), ',')
+        type = self.token()
+        self.assertToken(self.token(), ')')
+        return lambda sent_index, word_indices: BoxerCard(self.discourse_id, sent_index, word_indices, variable, value, type)
+
+    def _handle_prop(self):
+        #prop(_G15949, drs(...))
+        self.assertToken(self.token(), '(')
+        variable = self.parse_variable()
+        self.assertToken(self.token(), ',')
+        drs = self.process_next_expression(None)
+        self.assertToken(self.token(), ')')
+        return lambda sent_index, word_indices: BoxerProp(self.discourse_id, sent_index, word_indices, variable, drs)
+
+    def _parse_index_list(self):
+        #[1001,1002]:
+        indices = []
+        self.assertToken(self.token(), '[')
+        while self.token(0) != ']':
+            indices.append(self.parse_index())
+            if self.token(0) == ',':
+                self.token() #swallow ','
+        self.token() #swallow ']'
+        self.assertToken(self.token(), ':')
+        return indices
+
+    def parse_drs(self):
+        #drs([[1001]:_G3943],
+        #    [[1002]:pred(_G3943, dog, n, 0)]
+        #   )
+        self.assertToken(self.token(), '(')
+        self.assertToken(self.token(), '[')
+        refs = set()
+        while self.token(0) != ']':
+            indices = self._parse_index_list()
+            refs.add(self.parse_variable())
+            if self.token(0) == ',':
+                self.token() #swallow ','
+        self.token() #swallow ']'
+        self.assertToken(self.token(), ',')
+        self.assertToken(self.token(), '[')
+        conds = []
+        while self.token(0) != ']':
+            indices = self._parse_index_list()
+            conds.extend(self.parse_condition(indices))
+            if self.token(0) == ',':
+                self.token() #swallow ','
+        self.token() #swallow ']'
+        self.assertToken(self.token(), ')')
+        return BoxerDrs(list(refs), conds)
+
+    def _handle_binary_expression(self, make_callback):
+        self.assertToken(self.token(), '(')
+        drs1 = self.process_next_expression(None)
+        self.assertToken(self.token(), ',')
+        drs2 = self.process_next_expression(None)
+        self.assertToken(self.token(), ')')
+        return lambda sent_index, word_indices: make_callback(sent_index, word_indices, drs1, drs2)
+
+    def _handle_alfa(self, make_callback):
+        self.assertToken(self.token(), '(')
+        type = self.token()
+        self.assertToken(self.token(), ',')
+        drs1 = self.process_next_expression(None)
+        self.assertToken(self.token(), ',')
+        drs2 = self.process_next_expression(None)
+        self.assertToken(self.token(), ')')
+        return lambda sent_index, word_indices: make_callback(sent_index, word_indices, drs1, drs2)
+
+    def _handle_eq(self):
+        self.assertToken(self.token(), '(')
+        var1 = self.parse_variable()
+        self.assertToken(self.token(), ',')
+        var2 = self.parse_variable()
+        self.assertToken(self.token(), ')')
+        return lambda sent_index, word_indices: BoxerEq(self.discourse_id, sent_index, word_indices, var1, var2)
+
+
+    def _handle_whq(self):
+        self.assertToken(self.token(), '(')
+        self.assertToken(self.token(), '[')
+        ans_types = []
+        while self.token(0) != ']':
+            cat = self.token()
+            self.assertToken(self.token(), ':')
+            if cat == 'des':
+                ans_types.append(self.token())
+            elif cat == 'num':
+                ans_types.append('number')
+                typ = self.token()
+                if typ == 'cou':
+                    ans_types.append('count')
+                else:
+                    ans_types.append(typ)
+            else:
+                ans_types.append(self.token())
+        self.token() #swallow the ']'
+
+        self.assertToken(self.token(), ',')
+        d1 = self.process_next_expression(None)
+        self.assertToken(self.token(), ',')
+        ref = self.parse_variable()
+        self.assertToken(self.token(), ',')
+        d2 = self.process_next_expression(None)
+        self.assertToken(self.token(), ')')
+        return lambda sent_index, word_indices: BoxerWhq(self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2)
+
+    def _make_merge_expression(self, sent_index, word_indices, drs1, drs2):
+        return BoxerDrs(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
+
+    def _make_or_expression(self, sent_index, word_indices, drs1, drs2):
+        return BoxerOr(self.discourse_id, sent_index, word_indices, drs1, drs2)
+
+    def _make_imp_expression(self, sent_index, word_indices, drs1, drs2):
+        return BoxerDrs(drs1.refs, drs1.conds, drs2)
+
+    def parse_variable(self):
+        var = self.token()
+        assert re.match(r'^[exps]\d+$', var), var
+        return var
+
+    def parse_index(self):
+        return int(self.token())
+
+    def _sent_and_word_indices(self, indices):
+        """
+        :return: list of (sent_index, word_indices) tuples
+        """
+        sent_indices = set((i / 1000)-1 for i in indices if i>=0)
+        if sent_indices:
+            pairs = []
+            for sent_index in sent_indices:
+                word_indices = [(i % 1000)-1 for i in indices if sent_index == (i / 1000)-1]
+                pairs.append((sent_index, word_indices))
+            return pairs
+        else:
+            word_indices = [(i % 1000)-1 for i in indices]
+            return [(None, word_indices)]
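+    # Worked illustration (an assumption read off the arithmetic above, not from
+    # the Boxer documentation): token positions are packed as sentence*1000 + word,
+    # both 1-based, so indices [1001, 1002] map to [(0, [0, 1])] and [2003] maps
+    # to [(1, [2])]; when no sentence index can be recovered, the result is
+    # [(None, word_indices)].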
+
+
+class BoxerDrsParser(DrtParser):
+    """
+    Reparse the str form of subclasses of ``AbstractBoxerDrs``
+    """
+    def __init__(self, discourse_id=None):
+        DrtParser.__init__(self)
+        self.discourse_id = discourse_id
+
+    def get_all_symbols(self):
+        return [DrtTokens.OPEN, DrtTokens.CLOSE, DrtTokens.COMMA, DrtTokens.OPEN_BRACKET, DrtTokens.CLOSE_BRACKET]
+
+    def attempt_adjuncts(self, expression, context):
+        return expression
+
+    def handle(self, tok, context):
+        try:
+#             if tok == 'drs':
+#                 self.assertNextToken(DrtTokens.OPEN)
+#                 label = int(self.token())
+#                 self.assertNextToken(DrtTokens.COMMA)
+#                 refs = list(map(int, self.handle_refs()))
+#                 self.assertNextToken(DrtTokens.COMMA)
+#                 conds = self.handle_conds(None)
+#                 self.assertNextToken(DrtTokens.CLOSE)
+#                 return BoxerDrs(label, refs, conds)
+            if tok == 'pred':
+                self.assertNextToken(DrtTokens.OPEN)
+                disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
+                self.assertNextToken(DrtTokens.COMMA)
+                sent_id = self.nullableIntToken()
+                self.assertNextToken(DrtTokens.COMMA)
+                word_ids = list(map(int, self.handle_refs()))
+                self.assertNextToken(DrtTokens.COMMA)
+                variable = int(self.token())
+                self.assertNextToken(DrtTokens.COMMA)
+                name = self.token()
+                self.assertNextToken(DrtTokens.COMMA)
+                pos = self.token()
+                self.assertNextToken(DrtTokens.COMMA)
+                sense = int(self.token())
+                self.assertNextToken(DrtTokens.CLOSE)
+                return BoxerPred(disc_id, sent_id, word_ids, variable, name, pos, sense)
+            elif tok == 'named':
+                self.assertNextToken(DrtTokens.OPEN)
+                disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
+                self.assertNextToken(DrtTokens.COMMA)
+                sent_id = int(self.token())
+                self.assertNextToken(DrtTokens.COMMA)
+                word_ids = map(int, self.handle_refs())
+                self.assertNextToken(DrtTokens.COMMA)
+                variable = int(self.token())
+                self.assertNextToken(DrtTokens.COMMA)
+                name = self.token()
+                self.assertNextToken(DrtTokens.COMMA)
+                type = self.token()
+                self.assertNextToken(DrtTokens.COMMA)
+                sense = int(self.token())
+                self.assertNextToken(DrtTokens.CLOSE)
+                return BoxerNamed(disc_id, sent_id, word_ids, variable, name, type, sense)
+            elif tok == 'rel':
+                self.assertNextToken(DrtTokens.OPEN)
+                disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
+                self.assertNextToken(DrtTokens.COMMA)
+                sent_id = self.nullableIntToken()
+                self.assertNextToken(DrtTokens.COMMA)
+                word_ids = list(map(int, self.handle_refs()))
+                self.assertNextToken(DrtTokens.COMMA)
+                var1 = int(self.token())
+                self.assertNextToken(DrtTokens.COMMA)
+                var2 = int(self.token())
+                self.assertNextToken(DrtTokens.COMMA)
+                rel = self.token()
+                self.assertNextToken(DrtTokens.COMMA)
+                sense = int(self.token())
+                self.assertNextToken(DrtTokens.CLOSE)
+                return BoxerRel(disc_id, sent_id, word_ids, var1, var2, rel, sense)
+            elif tok == 'prop':
+                self.assertNextToken(DrtTokens.OPEN)
+                disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
+                self.assertNextToken(DrtTokens.COMMA)
+                sent_id = int(self.token())
+                self.assertNextToken(DrtTokens.COMMA)
+                word_ids = list(map(int, self.handle_refs()))
+                self.assertNextToken(DrtTokens.COMMA)
+                variable = int(self.token())
+                self.assertNextToken(DrtTokens.COMMA)
+                drs = self.process_next_expression(None)
+                self.assertNextToken(DrtTokens.CLOSE)
+                return BoxerProp(disc_id, sent_id, word_ids, variable, drs)
+            elif tok == 'not':
+                self.assertNextToken(DrtTokens.OPEN)
+                drs = self.process_next_expression(None)
+                self.assertNextToken(DrtTokens.CLOSE)
+                return BoxerNot(drs)
+            elif tok == 'imp':
+                self.assertNextToken(DrtTokens.OPEN)
+                drs1 = self.process_next_expression(None)
+                self.assertNextToken(DrtTokens.COMMA)
+                drs2 = self.process_next_expression(None)
+                self.assertNextToken(DrtTokens.CLOSE)
+                return BoxerDrs(drs1.refs, drs1.conds, drs2)
+            elif tok == 'or':
+                self.assertNextToken(DrtTokens.OPEN)
+                disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
+                self.assertNextToken(DrtTokens.COMMA)
+                sent_id = self.nullableIntToken()
+                self.assertNextToken(DrtTokens.COMMA)
+                word_ids = map(int, self.handle_refs())
+                self.assertNextToken(DrtTokens.COMMA)
+                drs1 = self.process_next_expression(None)
+                self.assertNextToken(DrtTokens.COMMA)
+                drs2 = self.process_next_expression(None)
+                self.assertNextToken(DrtTokens.CLOSE)
+                return BoxerOr(disc_id, sent_id, word_ids, drs1, drs2)
+            elif tok == 'eq':
+                self.assertNextToken(DrtTokens.OPEN)
+                disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
+                self.assertNextToken(DrtTokens.COMMA)
+                sent_id = self.nullableIntToken()
+                self.assertNextToken(DrtTokens.COMMA)
+                word_ids = list(map(int, self.handle_refs()))
+                self.assertNextToken(DrtTokens.COMMA)
+                var1 = int(self.token())
+                self.assertNextToken(DrtTokens.COMMA)
+                var2 = int(self.token())
+                self.assertNextToken(DrtTokens.CLOSE)
+                return BoxerEq(disc_id, sent_id, word_ids, var1, var2)
+            elif tok == 'card':
+                self.assertNextToken(DrtTokens.OPEN)
+                disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
+                self.assertNextToken(DrtTokens.COMMA)
+                sent_id = self.nullableIntToken()
+                self.assertNextToken(DrtTokens.COMMA)
+                word_ids = map(int, self.handle_refs())
+                self.assertNextToken(DrtTokens.COMMA)
+                var = int(self.token())
+                self.assertNextToken(DrtTokens.COMMA)
+                value = self.token()
+                self.assertNextToken(DrtTokens.COMMA)
+                type = self.token()
+                self.assertNextToken(DrtTokens.CLOSE)
+                return BoxerCard(disc_id, sent_id, word_ids, var, value, type)
+            elif tok == 'whq':
+                self.assertNextToken(DrtTokens.OPEN)
+                disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
+                self.assertNextToken(DrtTokens.COMMA)
+                sent_id = self.nullableIntToken()
+                self.assertNextToken(DrtTokens.COMMA)
+                word_ids = list(map(int, self.handle_refs()))
+                self.assertNextToken(DrtTokens.COMMA)
+                ans_types = self.handle_refs()
+                self.assertNextToken(DrtTokens.COMMA)
+                drs1 = self.process_next_expression(None)
+                self.assertNextToken(DrtTokens.COMMA)
+                var = int(self.token())
+                self.assertNextToken(DrtTokens.COMMA)
+                drs2 = self.process_next_expression(None)
+                self.assertNextToken(DrtTokens.CLOSE)
+                return BoxerWhq(disc_id, sent_id, word_ids, ans_types, drs1, var, drs2)
+        except Exception as e:
+            raise LogicalExpressionException(self._currentIndex, str(e))
+        assert False, repr(tok)
+
+    def nullableIntToken(self):
+        t = self.token()
+        return [None,int(t)][t != 'None']
+
+    def get_next_token_variable(self, description):
+        try:
+            return self.token()
+        except ExpectedMoreTokensException as e:
+            raise ExpectedMoreTokensException(e.index, 'Variable expected.')
+
+
+
+class AbstractBoxerDrs(object):
+    def variables(self):
+        """
+        :return: (set<variables>, set<events>, set<propositions>)
+        """
+        variables, events, propositions = self._variables()
+        return (variables - (events | propositions), events, propositions - events)
+
+    def variable_types(self):
+        vartypes = {}
+        for t,vars in zip(('z','e','p'), self.variables()):
+            for v in vars:
+                vartypes[v] = t
+        return vartypes
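+    # In other words (illustrative reading of the zip above): event variables
+    # come back typed 'e', proposition variables 'p', and the remaining
+    # discourse referents 'z'.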
+
+    def _variables(self):
+        """
+        :return: (set<variables>, set<events>, set<propositions>)
+        """
+        return (set(), set(), set())
+
+    def atoms(self):
+        return set()
+
+    def clean(self):
+        return self
+
+    def _clean_name(self, name):
+        return name.replace('-','_').replace("'", "_")
+
+    def renumber_sentences(self, f):
+        return self
+
+    def __hash__(self):
+        return hash("{0}".format(self))
+
+
+@python_2_unicode_compatible
+class BoxerDrs(AbstractBoxerDrs):
+    def __init__(self, refs, conds, consequent=None):
+        AbstractBoxerDrs.__init__(self)
+        self.refs = refs
+        self.conds = conds
+        self.consequent = consequent
+
+    def _variables(self):
+        variables = (set(), set(), set())
+        for cond in self.conds:
+            for s,v in zip(variables, cond._variables()):
+                s.update(v)
+        if self.consequent is not None:
+            for s,v in zip(variables, self.consequent._variables()):
+                s.update(v)
+        return variables
+
+    def atoms(self):
+        atoms = reduce(operator.or_, (cond.atoms() for cond in self.conds), set())
+        if self.consequent is not None:
+            atoms.update(self.consequent.atoms())
+        return atoms
+
+    def clean(self):
+        consequent = (self.consequent.clean() if self.consequent else None)
+        return BoxerDrs(self.refs, [c.clean() for c in self.conds], consequent)
+
+    def renumber_sentences(self, f):
+        consequent = (self.consequent.renumber_sentences(f) if self.consequent else None)
+        return BoxerDrs(self.refs, [c.renumber_sentences(f) for c in self.conds], consequent)
+
+    def __repr__(self):
+        s = 'drs([%s], [%s])' % (', '.join("%s" % r for r in self.refs),
+                                 ', '.join("%s" % c for c in self.conds))
+        if self.consequent is not None:
+            s = 'imp(%s, %s)' % (s, self.consequent)
+        return s
+
+    def __eq__(self, other):
+        return self.__class__ == other.__class__ and \
+               self.refs == other.refs and \
+               len(self.conds) == len(other.conds) and \
+               reduce(operator.and_, (c1==c2 for c1,c2 in zip(self.conds, other.conds))) and \
+               self.consequent == other.consequent
+
+    def __ne__(self, other):
+        return not self == other
+
+    __hash__ = AbstractBoxerDrs.__hash__
+
+
+@python_2_unicode_compatible
+class BoxerNot(AbstractBoxerDrs):
+    def __init__(self, drs):
+        AbstractBoxerDrs.__init__(self)
+        self.drs = drs
+
+    def _variables(self):
+        return self.drs._variables()
+
+    def atoms(self):
+        return self.drs.atoms()
+
+    def clean(self):
+        return BoxerNot(self.drs.clean())
+
+    def renumber_sentences(self, f):
+        return BoxerNot(self.drs.renumber_sentences(f))
+
+    def __repr__(self):
+        return 'not(%s)' % (self.drs)
+
+    def __eq__(self, other):
+        return self.__class__ == other.__class__ and self.drs == other.drs
+
+    def __ne__(self, other):
+        return not self == other
+
+    __hash__ = AbstractBoxerDrs.__hash__
+
+@python_2_unicode_compatible
+class BoxerIndexed(AbstractBoxerDrs):
+    def __init__(self, discourse_id, sent_index, word_indices):
+        AbstractBoxerDrs.__init__(self)
+        self.discourse_id = discourse_id
+        self.sent_index = sent_index
+        self.word_indices = word_indices
+
+    def atoms(self):
+        return set([self])
+
+    def __eq__(self, other):
+        return self.__class__ == other.__class__ and \
+               self.discourse_id == other.discourse_id and \
+               self.sent_index == other.sent_index and \
+               self.word_indices == other.word_indices and \
+               reduce(operator.and_, (s==o for s,o in zip(self, other)))
+
+    def __ne__(self, other):
+        return not self == other
+
+    __hash__ = AbstractBoxerDrs.__hash__
+
+    def __repr__(self):
+        s = '%s(%s, %s, [%s]' % (self._pred(), self.discourse_id,
+                                 self.sent_index, ', '.join("%s" % wi for wi in self.word_indices))
+        for v in self:
+            s += ', %s' % v
+        return s + ')'
+
+class BoxerPred(BoxerIndexed):
+    def __init__(self, discourse_id, sent_index, word_indices, var, name, pos, sense):
+        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
+        self.var = var
+        self.name = name
+        self.pos = pos
+        self.sense = sense
+
+    def _variables(self):
+        return (set([self.var]), set(), set())
+
+    def change_var(self, var):
+        return BoxerPred(self.discourse_id, self.sent_index, self.word_indices, var, self.name, self.pos, self.sense)
+
+    def clean(self):
+        return BoxerPred(self.discourse_id, self.sent_index, self.word_indices, self.var, self._clean_name(self.name), self.pos, self.sense)
+
+    def renumber_sentences(self, f):
+        new_sent_index = f(self.sent_index)
+        return BoxerPred(self.discourse_id, new_sent_index, self.word_indices, self.var, self.name, self.pos, self.sense)
+
+    def __iter__(self):
+        return iter((self.var, self.name, self.pos, self.sense))
+
+    def _pred(self):
+        return 'pred'
+
+class BoxerNamed(BoxerIndexed):
+    def __init__(self, discourse_id, sent_index, word_indices, var, name, type, sense):
+        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
+        self.var = var
+        self.name = name
+        self.type = type
+        self.sense = sense
+
+    def _variables(self):
+        return (set([self.var]), set(), set())
+
+    def change_var(self, var):
+        return BoxerNamed(self.discourse_id, self.sent_index, self.word_indices, var, self.name, self.type, self.sense)
+
+    def clean(self):
+        return BoxerNamed(self.discourse_id, self.sent_index, self.word_indices, self.var, self._clean_name(self.name), self.type, self.sense)
+
+    def renumber_sentences(self, f):
+        return BoxerNamed(self.discourse_id, f(self.sent_index), self.word_indices, self.var, self.name, self.type, self.sense)
+
+    def __iter__(self):
+        return iter((self.var, self.name, self.type, self.sense))
+
+    def _pred(self):
+        return 'named'
+
+class BoxerRel(BoxerIndexed):
+    def __init__(self, discourse_id, sent_index, word_indices, var1, var2, rel, sense):
+        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
+        self.var1 = var1
+        self.var2 = var2
+        self.rel = rel
+        self.sense = sense
+
+    def _variables(self):
+        return (set([self.var1, self.var2]), set(), set())
+
+    def clean(self):
+        return BoxerRel(self.discourse_id, self.sent_index, self.word_indices, self.var1, self.var2, self._clean_name(self.rel), self.sense)
+
+    def renumber_sentences(self, f):
+        return BoxerRel(self.discourse_id, f(self.sent_index), self.word_indices, self.var1, self.var2, self.rel, self.sense)
+
+    def __iter__(self):
+        return iter((self.var1, self.var2, self.rel, self.sense))
+
+    def _pred(self):
+        return 'rel'
+
+class BoxerProp(BoxerIndexed):
+    def __init__(self, discourse_id, sent_index, word_indices, var, drs):
+        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
+        self.var = var
+        self.drs = drs
+
+    def _variables(self):
+        return tuple(map(operator.or_, (set(), set(), set([self.var])), self.drs._variables()))
+
+    def referenced_labels(self):
+        return set([self.drs])
+
+    def atoms(self):
+        return self.drs.atoms()
+
+    def clean(self):
+        return BoxerProp(self.discourse_id, self.sent_index, self.word_indices, self.var, self.drs.clean())
+
+    def renumber_sentences(self, f):
+        return BoxerProp(self.discourse_id, f(self.sent_index), self.word_indices, self.var, self.drs.renumber_sentences(f))
+
+    def __iter__(self):
+        return iter((self.var, self.drs))
+
+    def _pred(self):
+        return 'prop'
+
+class BoxerEq(BoxerIndexed):
+    def __init__(self, discourse_id, sent_index, word_indices, var1, var2):
+        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
+        self.var1 = var1
+        self.var2 = var2
+
+    def _variables(self):
+        return (set([self.var1, self.var2]), set(), set())
+
+    def atoms(self):
+        return set()
+
+    def renumber_sentences(self, f):
+        return BoxerEq(self.discourse_id, f(self.sent_index), self.word_indices, self.var1, self.var2)
+
+    def __iter__(self):
+        return iter((self.var1, self.var2))
+
+    def _pred(self):
+        return 'eq'
+
+class BoxerCard(BoxerIndexed):
+    def __init__(self, discourse_id, sent_index, word_indices, var, value, type):
+        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
+        self.var = var
+        self.value = value
+        self.type = type
+
+    def _variables(self):
+        return (set([self.var]), set(), set())
+
+    def renumber_sentences(self, f):
+        return BoxerCard(self.discourse_id, f(self.sent_index), self.word_indices, self.var, self.value, self.type)
+
+    def __iter__(self):
+        return iter((self.var, self.value, self.type))
+
+    def _pred(self):
+        return 'card'
+
+class BoxerOr(BoxerIndexed):
+    def __init__(self, discourse_id, sent_index, word_indices, drs1, drs2):
+        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
+        self.drs1 = drs1
+        self.drs2 = drs2
+
+    def _variables(self):
+        return tuple(map(operator.or_, self.drs1._variables(), self.drs2._variables()))
+
+    def atoms(self):
+        return self.drs1.atoms() | self.drs2.atoms()
+
+    def clean(self):
+        return BoxerOr(self.discourse_id, self.sent_index, self.word_indices, self.drs1.clean(), self.drs2.clean())
+
+    def renumber_sentences(self, f):
+        return BoxerOr(self.discourse_id, f(self.sent_index), self.word_indices, self.drs1, self.drs2)
+
+    def __iter__(self):
+        return iter((self.drs1, self.drs2))
+
+    def _pred(self):
+        return 'or'
+
+class BoxerWhq(BoxerIndexed):
+    def __init__(self, discourse_id, sent_index, word_indices, ans_types, drs1, variable, drs2):
+        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
+        self.ans_types = ans_types
+        self.drs1 = drs1
+        self.variable = variable
+        self.drs2 = drs2
+
+    def _variables(self):
+        return tuple(map(operator.or_, (set([self.variable]), set(), set()), self.drs1._variables(), self.drs2._variables()))
+
+    def atoms(self):
+        return self.drs1.atoms() | self.drs2.atoms()
+
+    def clean(self):
+        return BoxerWhq(self.discourse_id, self.sent_index, self.word_indices, self.ans_types, self.drs1.clean(), self.variable, self.drs2.clean())
+
+    def renumber_sentences(self, f):
+        return BoxerWhq(self.discourse_id, f(self.sent_index), self.word_indices, self.ans_types, self.drs1, self.variable, self.drs2)
+
+    def __iter__(self):
+        return iter(('['+','.join(self.ans_types)+']', self.drs1, self.variable, self.drs2))
+
+    def _pred(self):
+        return 'whq'
+
+
+
+class PassthroughBoxerDrsInterpreter(object):
+    def interpret(self, ex):
+        return ex
+
+
+class NltkDrtBoxerDrsInterpreter(object):
+    def __init__(self, occur_index=False):
+        self._occur_index = occur_index
+
+    def interpret(self, ex):
+        """
+        :param ex: ``AbstractBoxerDrs``
+        :return: ``DrtExpression``
+        """
+        if isinstance(ex, BoxerDrs):
+            drs = DRS([Variable(r) for r in ex.refs], list(map(self.interpret, ex.conds)))
+            if ex.consequent is not None:
+                drs.consequent = self.interpret(ex.consequent)
+            return drs
+        elif isinstance(ex, BoxerNot):
+            return DrtNegatedExpression(self.interpret(ex.drs))
+        elif isinstance(ex, BoxerPred):
+            pred = self._add_occur_indexing('%s_%s' % (ex.pos, ex.name), ex)
+            return self._make_atom(pred, ex.var)
+        elif isinstance(ex, BoxerNamed):
+            pred = self._add_occur_indexing('ne_%s_%s' % (ex.type, ex.name), ex)
+            return self._make_atom(pred, ex.var)
+        elif isinstance(ex, BoxerRel):
+            pred = self._add_occur_indexing('%s' % (ex.rel), ex)
+            return self._make_atom(pred, ex.var1, ex.var2)
+        elif isinstance(ex, BoxerProp):
+            return DrtProposition(Variable(ex.var), self.interpret(ex.drs))
+        elif isinstance(ex, BoxerEq):
+            return DrtEqualityExpression(DrtVariableExpression(Variable(ex.var1)),
+                                         DrtVariableExpression(Variable(ex.var2)))
+        elif isinstance(ex, BoxerCard):
+            pred = self._add_occur_indexing('card_%s_%s' % (ex.type, ex.value), ex)
+            return self._make_atom(pred, ex.var)
+        elif isinstance(ex, BoxerOr):
+            return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2))
+        elif isinstance(ex, BoxerWhq):
+            drs1 = self.interpret(ex.drs1)
+            drs2 = self.interpret(ex.drs2)
+            return DRS(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
+        assert False, '%s: %s' % (ex.__class__.__name__, ex)
+
+    def _make_atom(self, pred, *args):
+        accum = DrtVariableExpression(Variable(pred))
+        for arg in args:
+            accum = DrtApplicationExpression(accum, DrtVariableExpression(Variable(arg)))
+        return accum
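+    # Sketch of the resulting shape (illustrative; the names are made up): a
+    # BoxerPred over variable 'x0' with name 'dog' and pos 'n' is interpreted as
+    # the DRT atom n_dog(x0), built by chaining DrtApplicationExpressions above.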
+
+    def _add_occur_indexing(self, base, ex):
+        if self._occur_index and ex.sent_index is not None:
+            if ex.discourse_id:
+                base += '_%s' % ex.discourse_id
+            base += '_s%s' % ex.sent_index
+            base += '_w%s' % sorted(ex.word_indices)[0]
+        return base
+
+
+class UnparseableInputException(Exception):
+    pass
+
+
+if __name__ == '__main__':
+    opts = OptionParser("usage: %prog TEXT [options]")
+    opts.add_option("--verbose", "-v", help="display verbose logs", action="store_true", default=False, dest="verbose")
+    opts.add_option("--fol", "-f", help="output FOL", action="store_true", default=False, dest="fol")
+    opts.add_option("--question", "-q", help="input is a question", action="store_true", default=False, dest="question")
+    opts.add_option("--occur", "-o", help="occurrence index", action="store_true", default=False, dest="occur_index")
+    (options, args) = opts.parse_args()
+
+    if len(args) != 1:
+        opts.error("incorrect number of arguments")
+
+    interpreter = NltkDrtBoxerDrsInterpreter(occur_index=options.occur_index)
+    drs = Boxer(interpreter).interpret_multi(args[0].split(r'\n'), question=options.question, verbose=options.verbose)
+    if drs is None:
+        print(None)
+    else:
+        drs = drs.simplify().eliminate_equality()
+        if options.fol:
+            print(drs.fol().normalize())
+        else:
+            drs.pretty_print()
diff --git a/nlp_resource_data/nltk/sem/boxer.pyc b/nlp_resource_data/nltk/sem/boxer.pyc
new file mode 100755 (executable)
index 0000000..c209430
Binary files /dev/null and b/nlp_resource_data/nltk/sem/boxer.pyc differ
diff --git a/nlp_resource_data/nltk/sem/chat80.py b/nlp_resource_data/nltk/sem/chat80.py
new file mode 100755 (executable)
index 0000000..180c50a
--- /dev/null
@@ -0,0 +1,782 @@
+# Natural Language Toolkit: Chat-80 KB Reader
+# See http://www.w3.org/TR/swbp-skos-core-guide/
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>,
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+"""
+Overview
+========
+
+Chat-80 was a natural language system which allowed the user to
+interrogate a Prolog knowledge base in the domain of world
+geography. It was developed in the early '80s by Warren and Pereira; see
+``http://www.aclweb.org/anthology/J82-3002.pdf`` for a description and
+``http://www.cis.upenn.edu/~pereira/oldies.html`` for the source
+files.
+
+This module contains functions to extract data from the Chat-80
+relation files ('the world database'), and convert them into a format
+that can be incorporated in the FOL models of
+``nltk.sem.evaluate``. The code assumes that the Prolog
+input files are available in the NLTK corpora directory.
+
+The Chat-80 World Database consists of the following files::
+
+    world0.pl
+    rivers.pl
+    cities.pl
+    countries.pl
+    contain.pl
+    borders.pl
+
+This module uses a slightly modified version of ``world0.pl``, in which
+a set of Prolog rules have been omitted. The modified file is named
+``world1.pl``. Currently, the file ``rivers.pl`` is not read in, since
+it uses a list rather than a string in the second field.
+
+Reading Chat-80 Files
+=====================
+
+Chat-80 relations are like tables in a relational database. The
+relation acts as the name of the table; the first argument acts as the
+'primary key'; and subsequent arguments are further fields in the
+table. In general, the name of the table provides a label for a unary
+predicate whose extension is all the primary keys. For example,
+relations in ``cities.pl`` are of the following form::
+
+   'city(athens,greece,1368).'
+
+Here, ``'athens'`` is the key, and will be mapped to a member of the
+unary predicate *city*.
+
+The fields in the table are mapped to binary predicates. The first
+argument of the predicate is the primary key, while the second
+argument is the data in the relevant field. Thus, in the above
+example, the third field is mapped to the binary predicate
+*population_of*, whose extension is a set of pairs such as
+``'(athens, 1368)'``.
+
+An exception to this general framework is required by the relations in
+the files ``borders.pl`` and ``contains.pl``. These contain facts of the
+following form::
+
+    'borders(albania,greece).'
+
+    'contains0(africa,central_africa).'
+
+We do not want to form a unary concept out of the element in
+the first field of these records, and we want the label of the binary
+relation just to be ``'border'``/``'contain'`` respectively.
+
+In order to drive the extraction process, we use 'relation metadata bundles'
+which are Python dictionaries such as the following::
+
+  city = {'label': 'city',
+          'closures': [],
+          'schema': ['city', 'country', 'population'],
+          'filename': 'cities.pl'}
+
+According to this, the file ``city['filename']`` contains a list of
+relational tuples (or more accurately, the corresponding strings in
+Prolog form) whose predicate symbol is ``city['label']`` and whose
+relational schema is ``city['schema']``. The notion of a ``closure`` is
+discussed in the next section.
+
+Concepts
+========
+In order to encapsulate the results of the extraction, a class of
+``Concept`` objects is introduced.  A ``Concept`` object has a number of
+attributes, in particular a ``prefLabel`` and ``extension``, which make
+it easier to inspect the output of the extraction. In addition, the
+``extension`` can be further processed: in the case of the ``'border'``
+relation, we check that the relation is symmetric, and in the case
+of the ``'contain'`` relation, we carry out the transitive
+closure. The closure properties associated with a concept are
+indicated in the relation metadata, as described earlier.
+
+The ``extension`` of a ``Concept`` object is then incorporated into a
+``Valuation`` object.
+
+Persistence
+===========
+The functions ``val_dump`` and ``val_load`` are provided to allow a
+valuation to be stored in a persistent database and re-loaded, rather
+than having to be re-computed each time.
+
+Individuals and Lexical Items
+=============================
+As well as deriving relations from the Chat-80 data, we also create a
+set of individual constants, one for each entity in the domain. The
+individual constants are string-identical to the entities. For
+example, given a data item such as ``'zloty'``, we add to the valuation
+a pair ``('zloty', 'zloty')``. In order to parse English sentences that
+refer to these entities, we also create a lexical item such as the
+following for each individual constant::
+
+   PropN[num=sg, sem=<\P.(P zloty)>] -> 'Zloty'
+
+The set of rules is written to the file ``chat_pnames.cfg`` in the
+current directory.
+
+"""
+from __future__ import print_function, unicode_literals
+
+import re
+import shelve
+import os
+import sys
+
+from six import string_types
+
+import nltk.data
+from nltk.compat import python_2_unicode_compatible
+
+###########################################################################
+# Chat-80 relation metadata bundles needed to build the valuation
+###########################################################################
+
+borders = {'rel_name': 'borders',
+           'closures': ['symmetric'],
+           'schema': ['region', 'border'],
+           'filename': 'borders.pl'}
+
+contains = {'rel_name': 'contains0',
+            'closures': ['transitive'],
+            'schema': ['region', 'contain'],
+            'filename': 'contain.pl'}
+
+city = {'rel_name': 'city',
+        'closures': [],
+        'schema': ['city', 'country', 'population'],
+        'filename': 'cities.pl'}
+
+country = {'rel_name': 'country',
+           'closures': [],
+           'schema': ['country', 'region', 'latitude', 'longitude',
+                      'area', 'population', 'capital', 'currency'],
+           'filename': 'countries.pl'}
+
+circle_of_lat = {'rel_name': 'circle_of_latitude',
+                 'closures': [],
+                 'schema': ['circle_of_latitude', 'degrees'],
+                 'filename': 'world1.pl'}
+
+circle_of_long = {'rel_name': 'circle_of_longitude',
+                 'closures': [],
+                 'schema': ['circle_of_longitude', 'degrees'],
+                 'filename': 'world1.pl'}
+
+continent = {'rel_name': 'continent',
+             'closures': [],
+             'schema': ['continent'],
+             'filename': 'world1.pl'}
+
+region = {'rel_name': 'in_continent',
+          'closures': [],
+          'schema': ['region', 'continent'],
+          'filename': 'world1.pl'}
+
+ocean = {'rel_name': 'ocean',
+         'closures': [],
+         'schema': ['ocean'],
+         'filename': 'world1.pl'}
+
+sea = {'rel_name': 'sea',
+       'closures': [],
+       'schema': ['sea'],
+       'filename': 'world1.pl'}
+
+
+
+items = ['borders', 'contains', 'city', 'country', 'circle_of_lat',
+         'circle_of_long', 'continent', 'region', 'ocean', 'sea']
+items = tuple(sorted(items))
+
+item_metadata = {
+    'borders': borders,
+    'contains': contains,
+    'city': city,
+    'country': country,
+    'circle_of_lat': circle_of_lat,
+    'circle_of_long': circle_of_long,
+    'continent': continent,
+    'region': region,
+    'ocean': ocean,
+    'sea': sea
+    }
+
+rels = item_metadata.values()
+
+not_unary = ['borders.pl', 'contain.pl']
+
+###########################################################################
+
+@python_2_unicode_compatible
+class Concept(object):
+    """
+    A Concept class, loosely based on SKOS
+    (http://www.w3.org/TR/swbp-skos-core-guide/).
+    """
+    def __init__(self, prefLabel, arity, altLabels=[], closures=[], extension=set()):
+        """
+        :param prefLabel: the preferred label for the concept
+        :type prefLabel: str
+        :param arity: the arity of the concept
+        :type arity: int
+        :param altLabels: other (related) labels
+        :type altLabels: list
+        :param closures: closure properties of the extension \
+            (list items can be ``symmetric``, ``reflexive``, ``transitive``)
+        :type closures: list
+        :param extension: the extensional value of the concept
+        :type extension: set
+        """
+        self.prefLabel = prefLabel
+        self.arity = arity
+        self.altLabels = altLabels
+        self.closures = closures
+        #keep _extension internally as a set
+        self._extension = extension
+        #public access is via a list (for slicing)
+        self.extension = sorted(list(extension))
+
+    def __str__(self):
+        #_extension = ''
+        #for element in sorted(self.extension):
+            #if isinstance(element, tuple):
+                #element = '(%s, %s)' % (element)
+            #_extension += element + ', '
+        #_extension = _extension[:-1]
+
+        return "Label = '%s'\nArity = %s\nExtension = %s" % \
+               (self.prefLabel, self.arity, self.extension)
+
+    def __repr__(self):
+        return "Concept('%s')" % self.prefLabel
+
+    def augment(self, data):
+        """
+        Add more data to the ``Concept``'s extension set.
+
+        :param data: a new semantic value
+        :type data: string or pair of strings
+        :rtype: set
+
+        """
+        self._extension.add(data)
+        self.extension = sorted(list(self._extension))
+        return self._extension
+
+
+    def _make_graph(self, s):
+        """
+        Convert a set of pairs into an adjacency linked list encoding of a graph.
+        """
+        g = {}
+        for (x, y) in s:
+            if x in g:
+                g[x].append(y)
+            else:
+                g[x] = [y]
+        return g
+
+    def _transclose(self, g):
+        """
+        Compute the transitive closure of a graph represented as a linked list.
+        """
+        for x in g:
+            for adjacent in g[x]:
+                # check that adjacent is a key
+                if adjacent in g:
+                    for y in g[adjacent]:
+                        if y not in g[x]:
+                            g[x].append(y)
+        return g
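+    # For example (illustrative input): _transclose({'a': ['b'], 'b': ['c']})
+    # adds the reachable node 'c' to 'a', returning {'a': ['b', 'c'], 'b': ['c']}.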
+
+    def _make_pairs(self, g):
+        """
+        Convert an adjacency linked list back into a set of pairs.
+        """
+        pairs = []
+        for node in g:
+            for adjacent in g[node]:
+                pairs.append((node, adjacent))
+        return set(pairs)
+
+
+    def close(self):
+        """
+        Close a binary relation in the ``Concept``'s extension set.
+
+        :return: a new extension for the ``Concept`` in which the
+                 relation is closed under a given property
+        """
+        from nltk.sem import is_rel
+        assert is_rel(self._extension)
+        if 'symmetric' in self.closures:
+            pairs = []
+            for (x, y) in self._extension:
+                pairs.append((y, x))
+            sym = set(pairs)
+            self._extension = self._extension.union(sym)
+        if 'transitive' in self.closures:
+            all = self._make_graph(self._extension)
+            closed = self._transclose(all)
+            trans = self._make_pairs(closed)
+            #print sorted(trans)
+            self._extension = self._extension.union(trans)
+        self.extension = sorted(list(self._extension))
+
+
+def clause2concepts(filename, rel_name, schema, closures=[]):
+    """
+    Convert a file of Prolog clauses into a list of ``Concept`` objects.
+
+    :param filename: filename containing the relations
+    :type filename: str
+    :param rel_name: name of the relation
+    :type rel_name: str
+    :param schema: the schema used in a set of relational tuples
+    :type schema: list
+    :param closures: closure properties for the extension of the concept
+    :type closures: list
+    :return: a list of ``Concept`` objects
+    :rtype: list
+    """
+    concepts = []
+    # position of the subject of a binary relation
+    subj = 0
+    # label of the 'primary key'
+    pkey = schema[0]
+    # fields other than the primary key
+    fields = schema[1:]
+
+    # convert a file into a list of lists
+    records = _str2records(filename, rel_name)
+
+    # add a unary concept corresponding to the set of entities
+    # in the primary key position
+    # relations in 'not_unary' are more like ordinary binary relations
+    if not filename in not_unary:
+        concepts.append(unary_concept(pkey, subj, records))
+
+    # add a binary concept for each non-key field
+    for field in fields:
+        obj = schema.index(field)
+        concepts.append(binary_concept(field, closures, subj, obj, records))
+
+    return concepts
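+# Illustrative usage (a sketch based on the 'city' metadata bundle defined above):
+#
+#     clause2concepts('cities.pl', 'city', ['city', 'country', 'population'])
+#
+# yields three Concepts: the unary 'city' concept plus the binary 'country_of'
+# and 'population_of' concepts derived from the non-key fields.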
+
+def cities2table(filename, rel_name, dbname, verbose=False, setup=False):
+    """
+    Convert a file of Prolog clauses into a database table.
+
+    This is not generic, since it doesn't allow arbitrary
+    schemas to be set as a parameter.
+
+    Intended usage::
+
+        cities2table('cities.pl', 'city', 'city.db', verbose=True, setup=True)
+
+    :param filename: filename containing the relations
+    :type filename: str
+    :param rel_name: name of the relation
+    :type rel_name: str
+    :param dbname: filename of persistent store
+    :type dbname: str
+    """
+    import sqlite3
+    records = _str2records(filename, rel_name)
+    connection =  sqlite3.connect(dbname)
+    cur = connection.cursor()
+    if setup:
+        cur.execute('''CREATE TABLE city_table
+        (City text, Country text, Population int)''')
+
+    table_name = "city_table"
+    for t in records:
+        cur.execute('insert into %s values (?,?,?)' % table_name, t)
+        if verbose:
+            print("inserting values into %s: " % table_name, t)
+    connection.commit()
+    if verbose:
+        print("Committing update to %s" % dbname)
+    cur.close()
+
+def sql_query(dbname, query):
+    """
+    Execute an SQL query over a database.
+    :param dbname: filename of persistent store
+    :type dbname: str
+    :param query: SQL query
+    :type query: str
+    """
+    import sqlite3
+    try:
+        path = nltk.data.find(dbname)
+        connection =  sqlite3.connect(str(path))
+        cur = connection.cursor()
+        return cur.execute(query)
+    except (ValueError, sqlite3.OperationalError):
+        import warnings
+        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
+        raise
+
+def _str2records(filename, rel):
+    """
+    Read a file into memory and convert each relation clause into a list.
+    """
+    recs = []
+    contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
+    for line in contents.splitlines():
+        if line.startswith(rel):
+            line = re.sub(rel+r'\(', '', line)
+            line = re.sub(r'\)\.$', '', line)
+            record = line.split(',')
+            recs.append(record)
+    return recs
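+# For instance (illustrative), the clause "city(athens,greece,1368)." from
+# cities.pl is reduced by _str2records to the record ['athens', 'greece', '1368'].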
+
+def unary_concept(label, subj, records):
+    """
+    Make a unary concept out of the primary key in a record.
+
+    A record is a list of entities in some relation, such as
+    ``['france', 'paris']``, where ``'france'`` is acting as the primary
+    key.
+
+    :param label: the preferred label for the concept
+    :type label: string
+    :param subj: position in the record of the subject of the predicate
+    :type subj: int
+    :param records: a list of records
+    :type records: list of lists
+    :return: ``Concept`` of arity 1
+    :rtype: Concept
+    """
+    c = Concept(label, arity=1, extension=set())
+    for record in records:
+        c.augment(record[subj])
+    return c
+
+def binary_concept(label, closures, subj, obj, records):
+    """
+    Make a binary concept out of the primary key and another field in a record.
+
+    A record is a list of entities in some relation, such as
+    ``['france', 'paris']``, where ``'france'`` is acting as the primary
+    key, and ``'paris'`` stands in the ``'capital_of'`` relation to
+    ``'france'``.
+
+    More generally, given a record such as ``['a', 'b', 'c']``, where
+    label is bound to ``'B'``, and ``obj`` bound to 1, the derived
+    binary concept will have label ``'B_of'``, and its extension will
+    be a set of pairs such as ``('a', 'b')``.
+
+
+    :param label: the base part of the preferred label for the concept
+    :type label: str
+    :param closures: closure properties for the extension of the concept
+    :type closures: list
+    :param subj: position in the record of the subject of the predicate
+    :type subj: int
+    :param obj: position in the record of the object of the predicate
+    :type obj: int
+    :param records: a list of records
+    :type records: list of lists
+    :return: ``Concept`` of arity 2
+    :rtype: Concept
+    """
+    if not label == 'border' and not label == 'contain':
+        label = label + '_of'
+    c = Concept(label, arity=2, closures=closures, extension=set())
+    for record in records:
+        c.augment((record[subj], record[obj]))
+    # close the concept's extension according to the properties in closures
+    c.close()
+    return c
+
+
+def process_bundle(rels):
+    """
+    Given a list of relation metadata bundles, make a corresponding
+    dictionary of concepts, indexed by the relation name.
+
+    :param rels: bundle of metadata needed for constructing a concept
+    :type rels: list(dict)
+    :return: a dictionary of concepts, indexed by the relation name.
+    :rtype: dict(str): Concept
+    """
+    concepts = {}
+    for rel in rels:
+        rel_name = rel['rel_name']
+        closures = rel['closures']
+        schema = rel['schema']
+        filename = rel['filename']
+
+        concept_list = clause2concepts(filename, rel_name, schema, closures)
+        for c in concept_list:
+            label = c.prefLabel
+            if (label in concepts):
+                for data in c.extension:
+                    concepts[label].augment(data)
+                concepts[label].close()
+            else:
+                concepts[label] = c
+    return concepts
+
+
+def make_valuation(concepts, read=False, lexicon=False):
+    """
+    Convert a list of ``Concept`` objects into a list of (label, extension) pairs;
+    optionally create a ``Valuation`` object.
+
+    :param concepts: a list of Concept objects to convert
+    :type concepts: list(Concept)
+    :param read: if ``True``, ``(symbol, set)`` pairs are read into a ``Valuation``
+    :type read: bool
+    :param lexicon: if ``True``, also write lexical rules for individual constants (implies ``read=True``)
+    :type lexicon: bool
+    :rtype: list or Valuation
+    """
+    vals = []
+
+    for c in concepts:
+        vals.append((c.prefLabel, c.extension))
+    if lexicon: read = True
+    if read:
+        from nltk.sem import Valuation
+        val = Valuation({})
+        val.update(vals)
+        # add labels for individuals
+        val = label_indivs(val, lexicon=lexicon)
+        return val
+    else:
+        return vals
+
+
+def val_dump(rels, db):
+    """
+    Make a ``Valuation`` from a list of relation metadata bundles and dump to
+    persistent database.
+
+    :param rels: bundle of metadata needed for constructing a concept
+    :type rels: list of dict
+    :param db: name of file to which data is written.
+               The suffix '.db' will be automatically appended.
+    :type db: str
+    """
+    concepts = process_bundle(rels).values()
+    valuation = make_valuation(concepts, read=True)
+    db_out = shelve.open(db, 'n')
+
+    db_out.update(valuation)
+
+    db_out.close()
+
+
+def val_load(db):
+    """
+    Load a ``Valuation`` from a persistent database.
+
+    :param db: name of file from which data is read.
+               The suffix '.db' should be omitted from the name.
+    :type db: str
+    """
+    dbname = db+".db"
+
+    if not os.access(dbname, os.R_OK):
+        sys.exit("Cannot read file: %s" % dbname)
+    else:
+        db_in = shelve.open(db)
+        from nltk.sem import Valuation
+        val = Valuation(db_in)
+#        val.read(db_in.items())
+        return val
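+# Hedged usage sketch ('chat80_valuation' is only a placeholder name):
+#
+#     val_dump(rels, 'chat80_valuation')        # writes chat80_valuation.db
+#     val = val_load('chat80_valuation')        # reloads it as a Valuation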
+
+
+#def alpha(str):
+    #"""
+    #Utility to filter out non-alphabetic constants.
+
+    #:param str: candidate constant
+    #:type str: string
+    #:rtype: bool
+    #"""
+    #try:
+        #int(str)
+        #return False
+    #except ValueError:
+        ## some unknown values in records are labeled '?'
+        #if not str == '?':
+            #return True
+
+
+def label_indivs(valuation, lexicon=False):
+    """
+    Assign individual constants to the individuals in the domain of a ``Valuation``.
+
+    Given a valuation with an entry of the form ``{'rel': {'a': True}}``,
+    add a new entry ``{'a': 'a'}``.
+
+    :type valuation: Valuation
+    :rtype: Valuation
+    """
+    # collect all the individuals into a domain
+    domain = valuation.domain
+    # convert the domain into a sorted list of alphabetic terms
+    # use the same string as a label
+    pairs = [(e, e) for e in domain]
+    if lexicon:
+        lex = make_lex(domain)
+        with open("chat_pnames.cfg", 'w') as outfile:
+            outfile.writelines(lex)
+    # read the pairs into the valuation
+    valuation.update(pairs)
+    return valuation
+
+def make_lex(symbols):
+    """
+    Create lexical CFG rules for each individual symbol.
+
+    Given a valuation with an entry of the form ``{'zloty': 'zloty'}``,
+    create a lexical rule for the proper name 'Zloty'.
+
+    :param symbols: a list of individual constants in the semantic representation
+    :type symbols: sequence -- set(str)
+    :rtype: list(str)
+    """
+    lex = []
+    header = """
+##################################################################
+# Lexical rules automatically generated by running 'chat80.py -x'.
+##################################################################
+
+"""
+    lex.append(header)
+    template = "PropN[num=sg, sem=<\P.(P %s)>] -> '%s'\n"
+
+    for s in symbols:
+        parts = s.split('_')
+        caps = [p.capitalize() for p in parts]
+        pname = '_'.join(caps)
+        rule = template % (s, pname)
+        lex.append(rule)
+    return lex
+
+
+###########################################################################
+# Interface function to emulate other corpus readers
+###########################################################################
+
+def concepts(items = items):
+    """
+    Build a list of concepts corresponding to the relation names in ``items``.
+
+    :param items: names of the Chat-80 relations to extract
+    :type items: list(str)
+    :return: the ``Concept`` objects which are extracted from the relations
+    :rtype: list(Concept)
+    """
+    if isinstance(items, string_types): items = (items,)
+
+    rels = [item_metadata[r] for r in items]
+
+    concept_map = process_bundle(rels)
+    return concept_map.values()
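+# Illustrative call (a sketch): concepts('city') builds the 'city', 'country_of'
+# and 'population_of' concepts from cities.pl, while concepts() with no argument
+# processes every metadata bundle listed in ``items``.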
+
+
+
+
+###########################################################################
+
+
+def main():
+    import sys
+    from optparse import OptionParser
+    description = \
+    """
+Extract data from the Chat-80 Prolog files and convert them into a
+Valuation object for use in the NLTK semantics package.
+    """
+
+    opts = OptionParser(description=description)
+    opts.set_defaults(verbose=True, lex=False, vocab=False)
+    opts.add_option("-s", "--store", dest="outdb",
+                    help="store a valuation in DB", metavar="DB")
+    opts.add_option("-l", "--load", dest="indb",
+                    help="load a stored valuation from DB", metavar="DB")
+    opts.add_option("-c", "--concepts", action="store_true",
+                    help="print concepts instead of a valuation")
+    opts.add_option("-r", "--relation", dest="label",
+                    help="print concept with label REL (check possible labels with '-v' option)", metavar="REL")
+    opts.add_option("-q", "--quiet", action="store_false", dest="verbose",
+                    help="don't print out progress info")
+    opts.add_option("-x", "--lex", action="store_true", dest="lex",
+                    help="write a file of lexical entries for country names, then exit")
+    opts.add_option("-v", "--vocab", action="store_true", dest="vocab",
+                        help="print out the vocabulary of concept labels and their arity, then exit")
+
+    (options, args) = opts.parse_args()
+    if options.outdb and options.indb:
+        opts.error("Options --store and --load are mutually exclusive")
+
+
+    if options.outdb:
+        # write the valuation to a persistent database
+        if options.verbose:
+            outdb = options.outdb+".db"
+            print("Dumping a valuation to %s" % outdb)
+        val_dump(rels, options.outdb)
+        sys.exit(0)
+    else:
+        # try to read in a valuation from a database
+        if options.indb is not None:
+            dbname = options.indb+".db"
+            if not os.access(dbname, os.R_OK):
+                sys.exit("Cannot read file: %s" % dbname)
+            else:
+                valuation = val_load(options.indb)
+        # we need to create the valuation from scratch
+        else:
+            # build some concepts
+            concept_map = process_bundle(rels)
+            concepts = concept_map.values()
+            # just print out the vocabulary
+            if options.vocab:
+                items = sorted([(c.arity, c.prefLabel) for c in concepts])
+                for (arity, label) in items:
+                    print(label, arity)
+                sys.exit(0)
+            # show all the concepts
+            if options.concepts:
+                for c in concepts:
+                    print(c)
+                    print()
+            if options.label:
+                print(concept_map[options.label])
+                sys.exit(0)
+            else:
+                # turn the concepts into a Valuation
+                if options.lex:
+                    if options.verbose:
+                        print("Writing out lexical rules")
+                    make_valuation(concepts, lexicon=True)
+                else:
+                    valuation = make_valuation(concepts, read=True)
+                    print(valuation)
+
+
+def sql_demo():
+    """
+    Print out every row from the 'city.db' database.
+    """
+    print()
+    print("Using SQL to extract rows from 'city.db' RDB.")
+    for row in sql_query('corpora/city_database/city.db', "SELECT * FROM city_table"):
+        print(row)
+
+
+if __name__ == '__main__':
+    main()
+    sql_demo()
diff --git a/nlp_resource_data/nltk/sem/chat80.pyc b/nlp_resource_data/nltk/sem/chat80.pyc
new file mode 100755 (executable)
index 0000000..6b771b9
Binary files /dev/null and b/nlp_resource_data/nltk/sem/chat80.pyc differ
diff --git a/nlp_resource_data/nltk/sem/cooper_storage.py b/nlp_resource_data/nltk/sem/cooper_storage.py
new file mode 100755 (executable)
index 0000000..f1a7aab
--- /dev/null
@@ -0,0 +1,118 @@
+# Natural Language Toolkit: Cooper storage for Quantifier Ambiguity
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function
+
+from nltk.sem.logic import LambdaExpression, ApplicationExpression, Variable
+from nltk.parse import load_parser
+from nltk.parse.featurechart import InstantiateVarsChart
+
+class CooperStore(object):
+    """
+    A container for handling quantifier ambiguity via Cooper storage.
+    """
+    def __init__(self, featstruct):
+        """
+        :param featstruct: The value of the ``sem`` node in a tree from
+            ``parse_with_bindops()``
+        :type featstruct: FeatStruct (with features ``core`` and ``store``)
+
+        """
+        self.featstruct = featstruct
+        self.readings = []
+        try:
+            self.core = featstruct['CORE']
+            self.store = featstruct['STORE']
+        except KeyError:
+            print("%s is not a Cooper storage structure" % featstruct)
+
+    def _permute(self, lst):
+        """
+        :param lst: the list to permute
+        :type lst: list
+        :return: an iterator over the permutations of the input list
+        :rtype: iter
+        """
+        remove = lambda lst0, index: lst0[:index] + lst0[index+1:]
+        if lst:
+            for index, x in enumerate(lst):
+                for y in self._permute(remove(lst, index)):
+                    yield (x,)+y
+        else: yield ()
+
+    def s_retrieve(self, trace=False):
+        """
+        Carry out S-Retrieval of the binding operators in the store, appending
+        one reading per quantifier scoping to ``self.readings``. Set
+        ``trace=True`` to print each application step. (A worked usage sketch
+        appears later in this module, just before ``demo()``.)
+
+        Each permutation of the store (i.e. list of binding operators) is
+        taken to be a possible scoping of quantifiers. We iterate through the
+        binding operators in each permutation, and successively apply them to
+        the current term, starting with the core semantic representation,
+        working from the inside out.
+
+        Binding operators are of the form::
+
+             bo(\P.all x.(man(x) -> P(x)),z1)
+        """
+        for perm, store_perm in enumerate(self._permute(self.store)):
+            if trace:
+                print("Permutation %s" % (perm+1))
+            term = self.core
+            for bindop in store_perm:
+                # we just want the arguments that are wrapped by the 'bo' predicate
+                quant, varex = tuple(bindop.args)
+                # use var to make an abstraction over the current term and then
+                # apply the quantifier to it
+                term = ApplicationExpression(quant, LambdaExpression(varex.variable, term))
+                if trace:
+                    print("  ", term)
+                term = term.simplify()
+            self.readings.append(term)
+
+
+def parse_with_bindops(sentence, grammar=None, trace=0):
+    """
+    Use a grammar with Binding Operators to parse a sentence.
+    """
+    if not grammar:
+        grammar = 'grammars/book_grammars/storage.fcfg'
+    parser = load_parser(grammar, trace=trace, chart_class=InstantiateVarsChart)
+    # Parse the sentence.
+    tokens = sentence.split()
+    return list(parser.parse(tokens))
+
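+# A worked sketch of S-Retrieval (assumes the book grammar
+# 'grammars/book_grammars/storage.fcfg' is installed in nltk_data; the exact
+# variable numbering of the readings depends on the chart parser):
+#
+#     >>> from nltk.sem import cooper_storage as cs
+#     >>> trees = cs.parse_with_bindops('every girl chases a dog')
+#     >>> semrep = cs.CooperStore(trees[0].label()['SEM'])
+#     >>> semrep.s_retrieve()
+#     >>> len(semrep.readings)     # one reading per quantifier scoping
+#     2
+#
+# One reading gives the existential quantifier wide scope (a single dog chased
+# by every girl); the other gives the universal quantifier wide scope.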
+
+def demo():
+    from nltk.sem import cooper_storage as cs
+    sentence = "every girl chases a dog"
+    #sentence = "a man gives a bone to every dog"
+    print()
+    print("Analyis of sentence '%s'" % sentence)
+    print("=" * 50)
+    trees = cs.parse_with_bindops(sentence, trace=0)
+    for tree in trees:
+        semrep = cs.CooperStore(tree.label()['SEM'])
+        print()
+        print("Binding operators:")
+        print("-" * 15)
+        for s in semrep.store:
+            print(s)
+        print()
+        print("Core:")
+        print("-" * 15)
+        print(semrep.core)
+        print()
+        print("S-Retrieval:")
+        print("-" * 15)
+        semrep.s_retrieve(trace=True)
+        print("Readings:")
+        print("-" * 15)
+
+        for i, reading in enumerate(semrep.readings):
+            print("%s: %s" % (i+1, reading))
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/sem/cooper_storage.pyc b/nlp_resource_data/nltk/sem/cooper_storage.pyc
new file mode 100755 (executable)
index 0000000..9d4f8a9
Binary files /dev/null and b/nlp_resource_data/nltk/sem/cooper_storage.pyc differ
diff --git a/nlp_resource_data/nltk/sem/drt.py b/nlp_resource_data/nltk/sem/drt.py
new file mode 100755 (executable)
index 0000000..bd64839
--- /dev/null
@@ -0,0 +1,1258 @@
+# Natural Language Toolkit: Discourse Representation Theory (DRT)
+#
+# Author: Dan Garrette <dhgarrette@gmail.com>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+
+import operator
+from functools import reduce
+from itertools import chain
+
+from six import string_types
+
+from nltk.compat import python_2_unicode_compatible
+from nltk.sem.logic import (APP, AbstractVariableExpression, AllExpression,
+                            AndExpression, ApplicationExpression, BinaryExpression,
+                            BooleanExpression, ConstantExpression, EqualityExpression,
+                            EventVariableExpression, ExistsExpression, Expression,
+                            FunctionVariableExpression, ImpExpression,
+                            IndividualVariableExpression, LambdaExpression, Tokens,
+                            LogicParser, NegatedExpression, OrExpression, Variable,
+                            is_eventvar, is_funcvar, is_indvar, unique_variable)
+
+# Import Tkinter-based modules if they are available
+try:
+    from six.moves.tkinter import Canvas, Tk
+    from six.moves.tkinter_font import Font
+    from nltk.util import in_idle
+
+except ImportError:
+    # No need to print a warning here, nltk.draw has already printed one.
+    pass
+
+class DrtTokens(Tokens):
+    DRS = 'DRS'
+    DRS_CONC = '+'
+    PRONOUN = 'PRO'
+    OPEN_BRACKET = '['
+    CLOSE_BRACKET = ']'
+    COLON = ':'
+
+    PUNCT = [DRS_CONC, OPEN_BRACKET, CLOSE_BRACKET, COLON]
+
+    SYMBOLS = Tokens.SYMBOLS + PUNCT
+
+    TOKENS = Tokens.TOKENS + [DRS] + PUNCT
+
+
+class DrtParser(LogicParser):
+    """A lambda calculus expression parser."""
+    def __init__(self):
+        LogicParser.__init__(self)
+
+        self.operator_precedence = dict(
+                               [(x,1) for x in DrtTokens.LAMBDA_LIST]             + \
+                               [(x,2) for x in DrtTokens.NOT_LIST]                + \
+                               [(APP,3)]                                          + \
+                               [(x,4) for x in DrtTokens.EQ_LIST+Tokens.NEQ_LIST] + \
+                               [(DrtTokens.COLON,5)]                              + \
+                               [(DrtTokens.DRS_CONC,6)]                           + \
+                               [(x,7) for x in DrtTokens.OR_LIST]                 + \
+                               [(x,8) for x in DrtTokens.IMP_LIST]                + \
+                               [(None,9)])
+
+    def get_all_symbols(self):
+        """This method exists to be overridden"""
+        return DrtTokens.SYMBOLS
+
+    def isvariable(self, tok):
+        return tok not in DrtTokens.TOKENS
+
+    def handle(self, tok, context):
+        """This method is intended to be overridden for logics that
+        use different operators or expressions"""
+        if tok in DrtTokens.NOT_LIST:
+            return self.handle_negation(tok, context)
+
+        elif tok in DrtTokens.LAMBDA_LIST:
+            return self.handle_lambda(tok, context)
+
+        elif tok == DrtTokens.OPEN:
+            if self.inRange(0) and self.token(0) == DrtTokens.OPEN_BRACKET:
+                return self.handle_DRS(tok, context)
+            else:
+                return self.handle_open(tok, context)
+
+        elif tok.upper() == DrtTokens.DRS:
+            self.assertNextToken(DrtTokens.OPEN)
+            return self.handle_DRS(tok, context)
+
+        elif self.isvariable(tok):
+            if self.inRange(0) and self.token(0) == DrtTokens.COLON:
+                return self.handle_prop(tok, context)
+            else:
+                return self.handle_variable(tok, context)
+
+    def make_NegatedExpression(self, expression):
+        return DrtNegatedExpression(expression)
+
+    def handle_DRS(self, tok, context):
+        # a DRS
+        refs = self.handle_refs()
+        if self.inRange(0) and self.token(0) == DrtTokens.COMMA: #if there is a comma (it's optional)
+            self.token() # swallow the comma
+        conds = self.handle_conds(context)
+        self.assertNextToken(DrtTokens.CLOSE)
+        return DRS(refs, conds, None)
+
+    def handle_refs(self):
+        self.assertNextToken(DrtTokens.OPEN_BRACKET)
+        refs = []
+        while self.inRange(0) and self.token(0) != DrtTokens.CLOSE_BRACKET:
+            # Support expressions like: DRS([x y],C) == DRS([x,y],C)
+            if refs and self.token(0) == DrtTokens.COMMA:
+                self.token() # swallow the comma
+            refs.append(self.get_next_token_variable('quantified'))
+        self.assertNextToken(DrtTokens.CLOSE_BRACKET)
+        return refs
+
+    def handle_conds(self, context):
+        self.assertNextToken(DrtTokens.OPEN_BRACKET)
+        conds = []
+        while self.inRange(0) and self.token(0) != DrtTokens.CLOSE_BRACKET:
+            # Support expressions like: DRS([x y],C) == DRS([x, y],C)
+            if conds and self.token(0) == DrtTokens.COMMA:
+                self.token() # swallow the comma
+            conds.append(self.process_next_expression(context))
+        self.assertNextToken(DrtTokens.CLOSE_BRACKET)
+        return conds
+
+    def handle_prop(self, tok, context):
+        variable = self.make_VariableExpression(tok)
+        self.assertNextToken(':')
+        drs = self.process_next_expression(DrtTokens.COLON)
+        return DrtProposition(variable, drs)
+
+    def make_EqualityExpression(self, first, second):
+        """This method serves as a hook for other logic parsers that
+        have different equality expression classes"""
+        return DrtEqualityExpression(first, second)
+
+    def get_BooleanExpression_factory(self, tok):
+        """This method serves as a hook for other logic parsers that
+        have different boolean operators"""
+        if tok == DrtTokens.DRS_CONC:
+            return lambda first, second: DrtConcatenation(first, second, None)
+        elif tok in DrtTokens.OR_LIST:
+            return DrtOrExpression
+        elif tok in DrtTokens.IMP_LIST:
+            def make_imp_expression(first, second):
+                if isinstance(first, DRS):
+                    return DRS(first.refs, first.conds, second)
+                if isinstance(first, DrtConcatenation):
+                    return DrtConcatenation(first.first, first.second, second)
+                raise Exception('Antecedent of implication must be a DRS')
+            return make_imp_expression
+        else:
+            return None
+
+    def make_BooleanExpression(self, factory, first, second):
+        return factory(first, second)
+
+    def make_ApplicationExpression(self, function, argument):
+        return DrtApplicationExpression(function, argument)
+
+    def make_VariableExpression(self, name):
+        return DrtVariableExpression(Variable(name))
+
+    def make_LambdaExpression(self, variables, term):
+        return DrtLambdaExpression(variables, term)
+
+
+class DrtExpression(object):
+    """
+    This is the base abstract DRT Expression from which every DRT
+    Expression extends.
+    """
+
+    _drt_parser = DrtParser()
+
+    @classmethod
+    def fromstring(cls, s):
+        return cls._drt_parser.parse(s)
+
+    def applyto(self, other):
+        return DrtApplicationExpression(self, other)
+
+    def __neg__(self):
+        return DrtNegatedExpression(self)
+
+    def __and__(self, other):
+        raise NotImplementedError()
+
+    def __or__(self, other):
+        assert isinstance(other, DrtExpression)
+        return DrtOrExpression(self, other)
+
+    def __gt__(self, other):
+        assert isinstance(other, DrtExpression)
+        if isinstance(self, DRS):
+            return DRS(self.refs, self.conds, other)
+        if isinstance(self, DrtConcatenation):
+            return DrtConcatenation(self.first, self.second, other)
+        raise Exception('Antecedent of implication must be a DRS')
+
+    def equiv(self, other, prover=None):
+        """
+        Check for logical equivalence.
+        Pass the expression (self <-> other) to the theorem prover.
+        If the prover says it is valid, then self and other are equal.
+
+        :param other: a ``DrtExpression`` to check equality against
+        :param prover: a ``nltk.inference.api.Prover``
+        """
+        assert isinstance(other, DrtExpression)
+
+        f1 = self.simplify().fol()
+        f2 = other.simplify().fol()
+        return f1.equiv(f2, prover)
+
+    @property
+    def type(self):
+        raise AttributeError("'%s' object has no attribute 'type'" %
+                             self.__class__.__name__)
+
+    def typecheck(self, signature=None):
+        raise NotImplementedError()
+
+    def __add__(self, other):
+        return DrtConcatenation(self, other, None)
+
+    def get_refs(self, recursive=False):
+        """
+        Return the list of discourse referents in this DRS.
+
+        :param recursive: also find discourse referents in subterms?
+        :type recursive: bool
+        :return: list of ``Variable`` objects
+        """
+        raise NotImplementedError()
+
+    def is_pronoun_function(self):
+        """ Is self of the form "PRO(x)"? """
+        return isinstance(self, DrtApplicationExpression) and \
+               isinstance(self.function, DrtAbstractVariableExpression) and \
+               self.function.variable.name == DrtTokens.PRONOUN and \
+               isinstance(self.argument, DrtIndividualVariableExpression)
+
+    def make_EqualityExpression(self, first, second):
+        return DrtEqualityExpression(first, second)
+
+    def make_VariableExpression(self, variable):
+        return DrtVariableExpression(variable)
+
+    def resolve_anaphora(self):
+        return resolve_anaphora(self)
+
+    def eliminate_equality(self):
+        return self.visit_structured(lambda e: e.eliminate_equality(),
+                                     self.__class__)
+
+    def pretty_format(self):
+        """
+        Build an ASCII-art rendering of the DRS.
+
+        :return: the pretty-printed string
+        """
+        return '\n'.join(self._pretty())
+
+    def pretty_print(self):
+        print(self.pretty_format())
+
+    def draw(self):
+        DrsDrawer(self).draw()
+
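+# A small illustrative sketch of the operator overloads above (outputs are the
+# strings produced by this module's ``__str__`` methods):
+#
+#     >>> a = DrtExpression.fromstring(r'([x],[man(x)])')
+#     >>> b = DrtExpression.fromstring(r'([y],[walks(y)])')
+#     >>> print(a + b)                  # concatenation
+#     (([x],[man(x)]) + ([y],[walks(y)]))
+#     >>> print((a + b).simplify())
+#     ([x,y],[man(x), walks(y)])
+#     >>> print(a > b)                  # implication; antecedent must be a DRS
+#     (([x],[man(x)]) -> ([y],[walks(y)]))
+#     >>> print(-a)                     # negation
+#     -([x],[man(x)])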
+
+@python_2_unicode_compatible
+class DRS(DrtExpression, Expression):
+    """A Discourse Representation Structure."""
+    def __init__(self, refs, conds, consequent=None):
+        """
+        :param refs: list of ``DrtIndividualVariableExpression`` for the
+            discourse referents
+        :param conds: list of ``Expression`` for the conditions
+        :param consequent: an optional consequent ``DrtExpression``, used when
+            the DRS encodes an implication
+        """
+        self.refs = refs
+        self.conds = conds
+        self.consequent = consequent
+
+    def replace(self, variable, expression, replace_bound=False, alpha_convert=True):
+        """Replace all instances of variable v with expression E in self,
+        where v is free in self."""
+        if variable in self.refs:
+            #if a bound variable is the thing being replaced
+            if not replace_bound:
+                return self
+            else:
+                i = self.refs.index(variable)
+                if self.consequent:
+                    consequent = self.consequent.replace(variable, expression, True, alpha_convert)
+                else:
+                    consequent = None
+                return DRS(self.refs[:i]+[expression.variable]+self.refs[i+1:],
+                           [cond.replace(variable, expression, True, alpha_convert)
+                            for cond in self.conds],
+                           consequent)
+        else:
+            if alpha_convert:
+                # any bound variable that appears in the expression must
+                # be alpha converted to avoid a conflict
+                for ref in (set(self.refs) & expression.free()):
+                    newvar = unique_variable(ref)
+                    newvarex = DrtVariableExpression(newvar)
+                    i = self.refs.index(ref)
+                    if self.consequent:
+                        consequent = self.consequent.replace(ref, newvarex, True, alpha_convert)
+                    else:
+                        consequent = None
+                    self = DRS(self.refs[:i]+[newvar]+self.refs[i+1:],
+                               [cond.replace(ref, newvarex, True, alpha_convert)
+                                for cond in self.conds],
+                               consequent)
+
+            #replace in the conditions
+            if self.consequent:
+                consequent = self.consequent.replace(variable, expression, replace_bound, alpha_convert)
+            else:
+                consequent = None
+            return DRS(self.refs,
+                       [cond.replace(variable, expression, replace_bound, alpha_convert)
+                        for cond in self.conds],
+                       consequent)
+
+    def free(self):
+        """:see: Expression.free()"""
+        conds_free = reduce(operator.or_, [c.free() for c in self.conds], set())
+        if self.consequent:
+            conds_free.update(self.consequent.free())
+        return conds_free - set(self.refs)
+
+    def get_refs(self, recursive=False):
+        """:see: AbstractExpression.get_refs()"""
+        if recursive:
+            conds_refs = self.refs + list(chain(*(c.get_refs(True) for c in self.conds)))
+            if self.consequent:
+                conds_refs.extend(self.consequent.get_refs(True))
+            return conds_refs
+        else:
+            return self.refs
+
+    def visit(self, function, combinator):
+        """:see: Expression.visit()"""
+        parts = list(map(function, self.conds))
+        if self.consequent:
+            parts.append(function(self.consequent))
+        return combinator(parts)
+
+    def visit_structured(self, function, combinator):
+        """:see: Expression.visit_structured()"""
+        consequent = (function(self.consequent) if self.consequent else None)
+        return combinator(self.refs, list(map(function, self.conds)), consequent)
+
+    def eliminate_equality(self):
+        drs = self
+        i = 0
+        while i < len(drs.conds):
+            cond = drs.conds[i]
+            if isinstance(cond, EqualityExpression) and \
+               isinstance(cond.first, AbstractVariableExpression) and \
+               isinstance(cond.second, AbstractVariableExpression):
+                drs = DRS(list(set(drs.refs)-set([cond.second.variable])),
+                          drs.conds[:i]+drs.conds[i+1:],
+                          drs.consequent)
+                if cond.second.variable != cond.first.variable:
+                    drs = drs.replace(cond.second.variable, cond.first, False, False)
+                    i = 0
+                i -= 1
+            i += 1
+
+        conds = []
+        for cond in drs.conds:
+            new_cond = cond.eliminate_equality()
+            new_cond_simp = new_cond.simplify()
+            if not isinstance(new_cond_simp, DRS) or \
+               new_cond_simp.refs or new_cond_simp.conds or \
+               new_cond_simp.consequent:
+                conds.append(new_cond)
+
+        consequent = (drs.consequent.eliminate_equality() if drs.consequent else None)
+        return DRS(drs.refs, conds, consequent)
+
+    def fol(self):
+        if self.consequent:
+            accum = None
+            if self.conds:
+                accum = reduce(AndExpression, [c.fol() for c in self.conds])
+
+            if accum:
+                accum = ImpExpression(accum, self.consequent.fol())
+            else:
+                accum = self.consequent.fol()
+
+            for ref in self.refs[::-1]:
+                accum = AllExpression(ref, accum)
+
+            return accum
+
+        else:
+            if not self.conds:
+                raise Exception("Cannot convert DRS with no conditions to FOL.")
+            accum = reduce(AndExpression, [c.fol() for c in self.conds])
+            for ref in map(Variable, self._order_ref_strings(self.refs)[::-1]):
+                accum = ExistsExpression(ref, accum)
+            return accum
+
+    def _pretty(self):
+        refs_line = ' '.join(self._order_ref_strings(self.refs))
+
+        cond_lines = [cond for cond_line in [filter(lambda s: s.strip(), cond._pretty())
+                                             for cond in self.conds]
+                      for cond in cond_line]
+        length = max([len(refs_line)] + list(map(len, cond_lines)))
+        drs = ([' _' + '_' * length            + '_ ',
+                '| ' + refs_line.ljust(length) + ' |',
+                '|-' + '-' * length            + '-|'] +
+               ['| ' + line.ljust(length)      + ' |' for line in cond_lines] +
+               ['|_' + '_' * length            + '_|'])
+        if self.consequent:
+            return DrtBinaryExpression._assemble_pretty(drs, DrtTokens.IMP,
+                                                        self.consequent._pretty())
+        return drs
+
+    def _order_ref_strings(self, refs):
+        strings = ["%s" % ref for ref in refs]
+        ind_vars = []
+        func_vars = []
+        event_vars = []
+        other_vars = []
+        for s in strings:
+            if is_indvar(s):
+                ind_vars.append(s)
+            elif is_funcvar(s):
+                func_vars.append(s)
+            elif is_eventvar(s):
+                event_vars.append(s)
+            else:
+                other_vars.append(s)
+        return sorted(other_vars) + \
+               sorted(event_vars, key=lambda v: int([v[2:],-1][len(v[2:]) == 0])) + \
+               sorted(func_vars, key=lambda v: (v[0], int([v[1:],-1][len(v[1:])==0]))) + \
+               sorted(ind_vars, key=lambda v: (v[0], int([v[1:],-1][len(v[1:])==0])))
+
+    def __eq__(self, other):
+        r"""Defines equality modulo alphabetic variance.
+        If we are comparing \x.M  and \y.N, then check equality of M and N[x/y]."""
+        if isinstance(other, DRS):
+            if len(self.refs) == len(other.refs):
+                converted_other = other
+                for (r1, r2) in zip(self.refs, converted_other.refs):
+                    varex = self.make_VariableExpression(r1)
+                    converted_other = converted_other.replace(r2, varex, True)
+                if self.consequent == converted_other.consequent and \
+                   len(self.conds) == len(converted_other.conds):
+                    for c1, c2 in zip(self.conds, converted_other.conds):
+                        if not (c1 == c2):
+                            return False
+                    return True
+        return False
+
+    def __ne__(self, other):
+        return not self == other
+
+    __hash__ = Expression.__hash__
+
+    def __str__(self):
+        drs = '([%s],[%s])' % (','.join(self._order_ref_strings(self.refs)),
+                               ', '.join("%s" % cond for cond in self.conds))
+        if self.consequent:
+            return DrtTokens.OPEN + drs + ' ' + DrtTokens.IMP + ' ' + \
+                   "%s" % self.consequent + DrtTokens.CLOSE
+        return drs
+
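+# Illustrative sketch for the DRS class above: alphabetic variants compare
+# equal, and ``fol()`` existentially closes the discourse referents.
+#
+#     >>> drs = DrtExpression.fromstring(r'([x,y],[sees(x,y)])')
+#     >>> print(drs.fol())
+#     exists x y.sees(x,y)
+#     >>> DrtExpression.fromstring(r'([x],[P(x)])') == DrtExpression.fromstring(r'([y],[P(y)])')
+#     True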
+
+def DrtVariableExpression(variable):
+    """
+    This is a factory method that instantiates and returns a subtype of
+    ``DrtAbstractVariableExpression`` appropriate for the given variable.
+    """
+    if is_indvar(variable.name):
+        return DrtIndividualVariableExpression(variable)
+    elif is_funcvar(variable.name):
+        return DrtFunctionVariableExpression(variable)
+    elif is_eventvar(variable.name):
+        return DrtEventVariableExpression(variable)
+    else:
+        return DrtConstantExpression(variable)
+
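+# The dispatch above follows the variable-naming conventions of
+# ``nltk.sem.logic``: a single lowercase letter other than 'e' (optionally
+# followed by digits) is an individual variable, 'e' optionally followed by
+# digits is an event variable, a single uppercase letter is a function
+# variable, and anything else is a constant. A quick illustrative check:
+#
+#     >>> DrtVariableExpression(Variable('x')).__class__.__name__
+#     'DrtIndividualVariableExpression'
+#     >>> DrtVariableExpression(Variable('e1')).__class__.__name__
+#     'DrtEventVariableExpression'
+#     >>> DrtVariableExpression(Variable('P')).__class__.__name__
+#     'DrtFunctionVariableExpression'
+#     >>> DrtVariableExpression(Variable('john')).__class__.__name__
+#     'DrtConstantExpression'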
+
+class DrtAbstractVariableExpression(DrtExpression, AbstractVariableExpression):
+    def fol(self):
+        return self
+
+    def get_refs(self, recursive=False):
+        """:see: AbstractExpression.get_refs()"""
+        return []
+
+    def _pretty(self):
+        s = "%s" % self
+        blank = ' '*len(s)
+        return [blank, blank, s, blank]
+
+    def eliminate_equality(self):
+        return self
+
+class DrtIndividualVariableExpression(DrtAbstractVariableExpression, IndividualVariableExpression):
+    pass
+
+class DrtFunctionVariableExpression(DrtAbstractVariableExpression, FunctionVariableExpression):
+    pass
+
+class DrtEventVariableExpression(DrtIndividualVariableExpression, EventVariableExpression):
+    pass
+
+class DrtConstantExpression(DrtAbstractVariableExpression, ConstantExpression):
+    pass
+
+
+@python_2_unicode_compatible
+class DrtProposition(DrtExpression, Expression):
+    def __init__(self, variable, drs):
+        self.variable = variable
+        self.drs = drs
+
+    def replace(self, variable, expression, replace_bound=False, alpha_convert=True):
+        if self.variable == variable:
+            assert isinstance(expression, DrtAbstractVariableExpression), "Can only replace a proposition label with a variable"
+            return DrtProposition(expression.variable, self.drs.replace(variable, expression, replace_bound, alpha_convert))
+        else:
+            return DrtProposition(self.variable, self.drs.replace(variable, expression, replace_bound, alpha_convert))
+
+    def eliminate_equality(self):
+        return DrtProposition(self.variable, self.drs.eliminate_equality())
+
+    def get_refs(self, recursive=False):
+        return (self.drs.get_refs(True) if recursive else [])
+
+    def __eq__(self, other):
+        return self.__class__ == other.__class__ and \
+               self.variable == other.variable and \
+               self.drs == other.drs
+
+    def __ne__(self, other):
+        return not self == other
+
+    __hash__ = Expression.__hash__
+
+    def fol(self):
+        return self.drs.fol()
+
+    def _pretty(self):
+        drs_s = self.drs._pretty()
+        blank = ' ' * len("%s" % self.variable)
+        return ([blank                + ' ' + line for line in drs_s[:1]] +
+                ["%s" % self.variable + ':' + line for line in drs_s[1:2]] +
+                [blank                + ' ' + line for line in drs_s[2:]])
+
+    def visit(self, function, combinator):
+        """:see: Expression.visit()"""
+        return combinator([function(self.drs)])
+
+    def visit_structured(self, function, combinator):
+        """:see: Expression.visit_structured()"""
+        return combinator(self.variable, function(self.drs))
+
+    def __str__(self):
+        return 'prop(%s, %s)' % (self.variable, self.drs)
+
+
+class DrtNegatedExpression(DrtExpression, NegatedExpression):
+    def fol(self):
+        return NegatedExpression(self.term.fol())
+
+    def get_refs(self, recursive=False):
+        """:see: AbstractExpression.get_refs()"""
+        return self.term.get_refs(recursive)
+
+    def _pretty(self):
+        term_lines = self.term._pretty()
+        return (['    ' + line for line in term_lines[:2]] +
+                ['__  ' + line for line in term_lines[2:3]] +
+                ['  | ' + line for line in term_lines[3:4]] +
+                ['    ' + line for line in term_lines[4:]])
+
+class DrtLambdaExpression(DrtExpression, LambdaExpression):
+    def alpha_convert(self, newvar):
+        """Rename all occurrences of the variable introduced by this variable
+        binder in the expression to ``newvar``.
+        :param newvar: ``Variable``, for the new variable
+        """
+        return self.__class__(newvar, self.term.replace(self.variable,
+                          DrtVariableExpression(newvar), True))
+
+    def fol(self):
+        return LambdaExpression(self.variable, self.term.fol())
+
+    def _pretty(self):
+        variables = [self.variable]
+        term = self.term
+        while term.__class__ == self.__class__:
+            variables.append(term.variable)
+            term = term.term
+        var_string = ' '.join("%s" % v for v in variables) + DrtTokens.DOT
+        term_lines = term._pretty()
+        blank = ' ' * len(var_string)
+        return (['    ' + blank      + line for line in term_lines[:1]] +
+                [' \  ' + blank      + line for line in term_lines[1:2]] +
+                [' /\ ' + var_string + line for line in term_lines[2:3]] +
+                ['    ' + blank      + line for line in term_lines[3:]])
+
+class DrtBinaryExpression(DrtExpression, BinaryExpression):
+    def get_refs(self, recursive=False):
+        """:see: AbstractExpression.get_refs()"""
+        return self.first.get_refs(True) + self.second.get_refs(True) if recursive else []
+
+    def _pretty(self):
+        return DrtBinaryExpression._assemble_pretty(self._pretty_subex(self.first), self.getOp(), self._pretty_subex(self.second))
+
+    @staticmethod
+    def _assemble_pretty(first_lines, op, second_lines):
+        max_lines = max(len(first_lines), len(second_lines))
+        first_lines = _pad_vertically(first_lines, max_lines)
+        second_lines = _pad_vertically(second_lines, max_lines)
+        blank = ' ' * len(op)
+        first_second_lines = list(zip(first_lines, second_lines))
+        return ([' ' + first_line + ' ' + blank + ' ' + second_line + ' ' for first_line, second_line in first_second_lines[:2]] +
+                ['(' + first_line + ' ' + op    + ' ' + second_line + ')' for first_line, second_line in first_second_lines[2:3]] +
+                [' ' + first_line + ' ' + blank + ' ' + second_line + ' ' for first_line, second_line in first_second_lines[3:]])
+
+    def _pretty_subex(self, subex):
+        return subex._pretty()
+
+class DrtBooleanExpression(DrtBinaryExpression, BooleanExpression):
+    pass
+
+class DrtOrExpression(DrtBooleanExpression, OrExpression):
+    def fol(self):
+        return OrExpression(self.first.fol(), self.second.fol())
+
+    def _pretty_subex(self, subex):
+        if isinstance(subex, DrtOrExpression):
+            return [line[1:-1] for line in subex._pretty()]
+        return DrtBooleanExpression._pretty_subex(self, subex)
+
+class DrtEqualityExpression(DrtBinaryExpression, EqualityExpression):
+    def fol(self):
+        return EqualityExpression(self.first.fol(), self.second.fol())
+
+@python_2_unicode_compatible
+class DrtConcatenation(DrtBooleanExpression):
+    """DRS of the form '(DRS + DRS)'"""
+    def __init__(self, first, second, consequent=None):
+        DrtBooleanExpression.__init__(self, first, second)
+        self.consequent = consequent
+
+    def replace(self, variable, expression, replace_bound=False, alpha_convert=True):
+        """Replace all instances of variable v with expression E in self,
+        where v is free in self."""
+        first = self.first
+        second = self.second
+        consequent = self.consequent
+
+        # If variable is bound
+        if variable in self.get_refs():
+            if replace_bound:
+                first  = first.replace(variable, expression, replace_bound, alpha_convert)
+                second = second.replace(variable, expression, replace_bound, alpha_convert)
+                if consequent:
+                    consequent = consequent.replace(variable, expression, replace_bound, alpha_convert)
+        else:
+            if alpha_convert:
+                # alpha convert every ref that is free in 'expression'
+                for ref in (set(self.get_refs(True)) & expression.free()):
+                    v = DrtVariableExpression(unique_variable(ref))
+                    first  = first.replace(ref, v, True, alpha_convert)
+                    second = second.replace(ref, v, True, alpha_convert)
+                    if consequent:
+                        consequent = consequent.replace(ref, v, True, alpha_convert)
+
+            first  = first.replace(variable, expression, replace_bound, alpha_convert)
+            second = second.replace(variable, expression, replace_bound, alpha_convert)
+            if consequent:
+                consequent = consequent.replace(variable, expression, replace_bound, alpha_convert)
+
+        return self.__class__(first, second, consequent)
+
+    def eliminate_equality(self):
+        #TODO: at some point.  for now, simplify.
+        drs = self.simplify()
+        assert not isinstance(drs, DrtConcatenation)
+        return drs.eliminate_equality()
+
+    def simplify(self):
+        first = self.first.simplify()
+        second = self.second.simplify()
+        consequent = (self.consequent.simplify() if self.consequent else None)
+
+        if isinstance(first, DRS) and isinstance(second, DRS):
+            # For any ref that is in both 'first' and 'second'
+            for ref in (set(first.get_refs(True)) & set(second.get_refs(True))):
+                # alpha convert the ref in 'second' to prevent collision
+                newvar = DrtVariableExpression(unique_variable(ref))
+                second = second.replace(ref, newvar, True)
+
+            return DRS(first.refs + second.refs, first.conds + second.conds, consequent)
+        else:
+            return self.__class__(first, second, consequent)
+
+    def get_refs(self, recursive=False):
+        """:see: AbstractExpression.get_refs()"""
+        refs = self.first.get_refs(recursive) + self.second.get_refs(recursive)
+        if self.consequent and recursive:
+            refs.extend(self.consequent.get_refs(True))
+        return refs
+
+    def getOp(self):
+        return DrtTokens.DRS_CONC
+
+    def __eq__(self, other):
+        r"""Defines equality modulo alphabetic variance.
+        If we are comparing \x.M  and \y.N, then check equality of M and N[x/y]."""
+        if isinstance(other, DrtConcatenation):
+            self_refs = self.get_refs()
+            other_refs = other.get_refs()
+            if len(self_refs) == len(other_refs):
+                converted_other = other
+                for (r1,r2) in zip(self_refs, other_refs):
+                    varex = self.make_VariableExpression(r1)
+                    converted_other = converted_other.replace(r2, varex, True)
+                return self.first == converted_other.first and \
+                        self.second == converted_other.second and \
+                        self.consequent == converted_other.consequent
+        return False
+
+    def __ne__(self, other):
+        return not self == other
+
+    __hash__ = DrtBooleanExpression.__hash__
+
+    def fol(self):
+        e = AndExpression(self.first.fol(), self.second.fol())
+        if self.consequent:
+            e = ImpExpression(e, self.consequent.fol())
+        return e
+
+    def _pretty(self):
+        drs = DrtBinaryExpression._assemble_pretty(self._pretty_subex(self.first),
+                                                   self.getOp(),
+                                                   self._pretty_subex(self.second))
+        if self.consequent:
+            drs = DrtBinaryExpression._assemble_pretty(drs, DrtTokens.IMP,
+                                                       self.consequent._pretty())
+        return drs
+
+    def _pretty_subex(self, subex):
+        if isinstance(subex, DrtConcatenation):
+            return [line[1:-1] for line in subex._pretty()]
+        return DrtBooleanExpression._pretty_subex(self, subex)
+
+    def visit(self, function, combinator):
+        """:see: Expression.visit()"""
+        if self.consequent:
+            return combinator([function(self.first), function(self.second), function(self.consequent)])
+        else:
+            return combinator([function(self.first), function(self.second)])
+
+    def __str__(self):
+        first = self._str_subex(self.first)
+        second = self._str_subex(self.second)
+        drs = Tokens.OPEN + first + ' ' + self.getOp() \
+                + ' ' + second + Tokens.CLOSE
+        if self.consequent:
+            return DrtTokens.OPEN + drs + ' ' + DrtTokens.IMP + ' ' + \
+                   "%s" % self.consequent + DrtTokens.CLOSE
+        return drs
+
+    def _str_subex(self, subex):
+        s = "%s" % subex
+        if isinstance(subex, DrtConcatenation) and subex.consequent is None:
+            return s[1:-1]
+        return s
+
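+# Sketch of how ``simplify`` merges two DRSs above, alpha-converting referents
+# of the second DRS that clash with the first (the fresh variable, shown here
+# as z1, is generated by ``unique_variable`` and may differ between runs):
+#
+#     >>> c = DrtExpression.fromstring(r'(([x],[walks(x)]) + ([x],[runs(x)]))')
+#     >>> print(c.simplify())
+#     ([x,z1],[walks(x), runs(z1)])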
+
+class DrtApplicationExpression(DrtExpression, ApplicationExpression):
+    def fol(self):
+        return ApplicationExpression(self.function.fol(), self.argument.fol())
+
+    def get_refs(self, recursive=False):
+        """:see: AbstractExpression.get_refs()"""
+        return (self.function.get_refs(True) + self.argument.get_refs(True)
+                if recursive else [])
+
+    def _pretty(self):
+        function, args = self.uncurry()
+        function_lines = function._pretty()
+        args_lines = [arg._pretty() for arg in args]
+        max_lines = max(map(len, [function_lines] + args_lines))
+        function_lines = _pad_vertically(function_lines, max_lines)
+        args_lines = [_pad_vertically(arg_lines, max_lines) for arg_lines in args_lines]
+        func_args_lines = list(zip(function_lines, list(zip(*args_lines))))
+        return ([func_line + ' ' + ' '.join(args_line) + ' ' for func_line, args_line in func_args_lines[:2]] +
+                [func_line + '(' + ','.join(args_line) + ')' for func_line, args_line in func_args_lines[2:3]] +
+                [func_line + ' ' + ' '.join(args_line) + ' ' for func_line, args_line in func_args_lines[3:]])
+
+
+def _pad_vertically(lines, max_lines):
+    pad_line = [' ' * len(lines[0])]
+    return lines + pad_line * (max_lines - len(lines))
+
+
+@python_2_unicode_compatible
+class PossibleAntecedents(list, DrtExpression, Expression):
+    def free(self):
+        """Set of free variables."""
+        return set(self)
+
+    def replace(self, variable, expression, replace_bound=False, alpha_convert=True):
+        """Replace all instances of variable v with expression E in self,
+        where v is free in self."""
+        result = PossibleAntecedents()
+        for item in self:
+            if item == variable:
+                result.append(expression)
+            else:
+                result.append(item)
+        return result
+
+    def _pretty(self):
+        s = "%s" % self
+        blank = ' ' * len(s)
+        return [blank, blank, s]
+
+    def __str__(self):
+        return '[' + ','.join("%s" % it for it in self) + ']'
+
+
+class AnaphoraResolutionException(Exception):
+    pass
+
+
+def resolve_anaphora(expression, trail=[]):
+    if isinstance(expression, ApplicationExpression):
+        if expression.is_pronoun_function():
+            possible_antecedents = PossibleAntecedents()
+            for ancestor in trail:
+                for ref in ancestor.get_refs():
+                    refex = expression.make_VariableExpression(ref)
+
+                    #==========================================================
+                    # Don't allow resolution to itself or other types
+                    #==========================================================
+                    if refex.__class__ == expression.argument.__class__ and \
+                       not (refex == expression.argument):
+                        possible_antecedents.append(refex)
+
+            if len(possible_antecedents) == 1:
+                resolution = possible_antecedents[0]
+            else:
+                resolution = possible_antecedents
+            return expression.make_EqualityExpression(expression.argument, resolution)
+        else:
+            r_function = resolve_anaphora(expression.function, trail + [expression])
+            r_argument = resolve_anaphora(expression.argument, trail + [expression])
+            return expression.__class__(r_function, r_argument)
+
+    elif isinstance(expression, DRS):
+        r_conds = []
+        for cond in expression.conds:
+            r_cond = resolve_anaphora(cond, trail + [expression])
+
+            # if the condition is of the form '(x = [])' then raise exception
+            if isinstance(r_cond, EqualityExpression):
+                if isinstance(r_cond.first, PossibleAntecedents):
+                    #Reverse the order so that the variable is on the left
+                    temp = r_cond.first
+                    r_cond.first = r_cond.second
+                    r_cond.second = temp
+                if isinstance(r_cond.second, PossibleAntecedents):
+                    if not r_cond.second:
+                        raise AnaphoraResolutionException("Variable '%s' does not "
+                                "resolve to anything." % r_cond.first)
+
+            r_conds.append(r_cond)
+        if expression.consequent:
+            consequent = resolve_anaphora(expression.consequent, trail + [expression])
+        else:
+            consequent = None
+        return expression.__class__(expression.refs, r_conds, consequent)
+
+    elif isinstance(expression, AbstractVariableExpression):
+        return expression
+
+    elif isinstance(expression, NegatedExpression):
+        return expression.__class__(resolve_anaphora(expression.term, trail + [expression]))
+
+    elif isinstance(expression, DrtConcatenation):
+        if expression.consequent:
+            consequent = resolve_anaphora(expression.consequent, trail + [expression])
+        else:
+            consequent = None
+        return expression.__class__(resolve_anaphora(expression.first, trail + [expression]),
+                                    resolve_anaphora(expression.second, trail + [expression]),
+                                    consequent)
+
+    elif isinstance(expression, BinaryExpression):
+        return expression.__class__(resolve_anaphora(expression.first, trail + [expression]),
+                                    resolve_anaphora(expression.second, trail + [expression]))
+
+    elif isinstance(expression, LambdaExpression):
+        return expression.__class__(expression.variable, resolve_anaphora(expression.term, trail + [expression]))
+
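+# A short illustrative run of the resolution above (also exercised by demo()
+# at the bottom of this file): the pronoun condition PRO(z) is replaced by an
+# equation between z and its possible antecedents.
+#
+#     >>> print(resolve_anaphora(DrtExpression.fromstring(
+#     ...     r'([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])')))
+#     ([x,y,z],[dog(x), cat(y), walks(z), (z = [x,y])])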
+
+class DrsDrawer(object):
+    BUFFER = 3     #Space between elements
+    TOPSPACE = 10  #Space above whole DRS
+    OUTERSPACE = 6 #Space to the left, right, and bottom of the whole DRS
+
+    def __init__(self, drs, size_canvas=True, canvas=None):
+        """
+        :param drs: ``DrtExpression``, The DRS to be drawn
+        :param size_canvas: bool, True if the canvas size should be the exact size of the DRS
+        :param canvas: ``Canvas`` The canvas on which to draw the DRS.  If none is given, create a new canvas.
+        """
+        master = None
+        if not canvas:
+            master = Tk()
+            master.title("DRT")
+
+            font = Font(family='helvetica', size=12)
+
+            if size_canvas:
+                canvas = Canvas(master, width=0, height=0)
+                canvas.font = font
+                self.canvas = canvas
+                (right, bottom) = self._visit(drs, self.OUTERSPACE, self.TOPSPACE)
+
+                width = max(right+self.OUTERSPACE, 100)
+                height = bottom+self.OUTERSPACE
+                canvas = Canvas(master, width=width, height=height)#, bg='white')
+            else:
+                canvas = Canvas(master, width=300, height=300)
+
+            canvas.pack()
+            canvas.font = font
+
+        self.canvas = canvas
+        self.drs = drs
+        self.master = master
+
+    def _get_text_height(self):
+        """Get the height of a line of text"""
+        return self.canvas.font.metrics("linespace")
+
+    def draw(self, x=OUTERSPACE, y=TOPSPACE):
+        """Draw the DRS"""
+        self._handle(self.drs, self._draw_command, x, y)
+
+        if self.master and not in_idle():
+            self.master.mainloop()
+        else:
+            return self._visit(self.drs, x, y)
+
+    def _visit(self, expression, x, y):
+        """
+        Return the bottom-rightmost point without actually drawing the item
+
+        :param expression: the item to visit
+        :param x: the left side of the current drawing area
+        :param y: the top of the current drawing area
+        :return: the bottom-rightmost point
+        """
+        return self._handle(expression, self._visit_command, x, y)
+
+    def _draw_command(self, item, x, y):
+        """
+        Draw the given item at the given location
+
+        :param item: the item to draw
+        :param x: the left side of the current drawing area
+        :param y: the top of the current drawing area
+        :return: the bottom-rightmost point
+        """
+        if isinstance(item, string_types):
+            self.canvas.create_text(x, y, anchor='nw', font=self.canvas.font, text=item)
+        elif isinstance(item, tuple):
+            # item is the lower-right of a box
+            (right, bottom) = item
+            self.canvas.create_rectangle(x, y, right, bottom)
+            horiz_line_y = y + self._get_text_height() + (self.BUFFER * 2) #the line separating refs from conds
+            self.canvas.create_line(x, horiz_line_y, right, horiz_line_y)
+
+        return self._visit_command(item, x, y)
+
+    def _visit_command(self, item, x, y):
+        """
+        Return the bottom-rightmost point without actually drawing the item
+
+        :param item: the item to visit
+        :param x: the left side of the current drawing area
+        :param y: the top of the current drawing area
+        :return: the bottom-rightmost point
+        """
+        if isinstance(item, string_types):
+            return (x + self.canvas.font.measure(item), y + self._get_text_height())
+        elif isinstance(item, tuple):
+            return item
+
+    def _handle(self, expression, command, x=0, y=0):
+        """
+        :param expression: the expression to handle
+        :param command: the function to apply, either _draw_command or _visit_command
+        :param x: the left side of the current drawing area
+        :param y: the top of the current drawing area
+        :return: the bottom-rightmost point
+        """
+        if command == self._visit_command:
+            #if we don't need to draw the item, then we can use the cached values
+            try:
+                #attempt to retrieve cached values
+                right = expression._drawing_width + x
+                bottom = expression._drawing_height + y
+                return (right, bottom)
+            except AttributeError:
+                #the values have not been cached yet, so compute them
+                pass
+
+        if isinstance(expression, DrtAbstractVariableExpression):
+            factory = self._handle_VariableExpression
+        elif isinstance(expression, DRS):
+            factory = self._handle_DRS
+        elif isinstance(expression, DrtNegatedExpression):
+            factory = self._handle_NegatedExpression
+        elif isinstance(expression, DrtLambdaExpression):
+            factory = self._handle_LambdaExpression
+        elif isinstance(expression, BinaryExpression):
+            factory = self._handle_BinaryExpression
+        elif isinstance(expression, DrtApplicationExpression):
+            factory = self._handle_ApplicationExpression
+        elif isinstance(expression, PossibleAntecedents):
+            factory = self._handle_VariableExpression
+        elif isinstance(expression, DrtProposition):
+            factory = self._handle_DrtProposition
+        else:
+            raise Exception(expression.__class__.__name__)
+
+        (right, bottom) = factory(expression, command, x, y)
+
+        #cache the values
+        expression._drawing_width = right - x
+        expression._drawing_height = bottom - y
+
+        return (right, bottom)
+
+    def _handle_VariableExpression(self, expression, command, x, y):
+        return command("%s" % expression, x, y)
+
+    def _handle_NegatedExpression(self, expression, command, x, y):
+        # Find the width of the negation symbol
+        right = self._visit_command(DrtTokens.NOT, x, y)[0]
+
+        # Handle term
+        (right, bottom) = self._handle(expression.term, command, right, y)
+
+        # Draw the negation symbol now that we know the y-coordinate
+        command(DrtTokens.NOT, x, self._get_centered_top(y, bottom - y, self._get_text_height()))
+
+        return (right, bottom)
+
+    def _handle_DRS(self, expression, command, x, y):
+        left = x + self.BUFFER #indent the left side
+        bottom = y + self.BUFFER #indent the top
+
+        # Handle Discourse Referents
+        if expression.refs:
+            refs = ' '.join("%s"%r for r in expression.refs)
+        else:
+            refs = '     '
+        (max_right, bottom) = command(refs, left, bottom)
+        bottom += (self.BUFFER * 2)
+
+        # Handle Conditions
+        if expression.conds:
+            for cond in expression.conds:
+                (right, bottom) = self._handle(cond, command, left, bottom)
+                max_right = max(max_right, right)
+                bottom += self.BUFFER
+        else:
+            bottom += self._get_text_height() + self.BUFFER
+
+        # Handle Box
+        max_right += self.BUFFER
+        return command((max_right, bottom), x, y)
+
+    def _handle_ApplicationExpression(self, expression, command, x, y):
+        function, args = expression.uncurry()
+        if not isinstance(function, DrtAbstractVariableExpression):
+            #It's not a predicate expression ("P(x,y)"), so leave arguments curried
+            function = expression.function
+            args = [expression.argument]
+
+        # Get the max bottom of any element on the line
+        function_bottom = self._visit(function, x, y)[1]
+        max_bottom = max([function_bottom] + [self._visit(arg, x, y)[1] for arg in args])
+
+        line_height = max_bottom - y
+
+        # Handle 'function'
+        function_drawing_top = self._get_centered_top(y, line_height, function._drawing_height)
+        right = self._handle(function, command, x, function_drawing_top)[0]
+
+        # Handle open paren
+        centred_string_top = self._get_centered_top(y, line_height, self._get_text_height())
+        right = command(DrtTokens.OPEN, right, centred_string_top)[0]
+
+        # Handle each arg
+        for (i,arg) in enumerate(args):
+            arg_drawing_top = self._get_centered_top(y, line_height, arg._drawing_height)
+            right = self._handle(arg, command, right, arg_drawing_top)[0]
+
+            if i+1 < len(args):
+                #since it's not the last arg, add a comma
+                right = command(DrtTokens.COMMA + ' ', right, centred_string_top)[0]
+
+        # Handle close paren
+        right = command(DrtTokens.CLOSE, right, centred_string_top)[0]
+
+        return (right, max_bottom)
+
+    def _handle_LambdaExpression(self, expression, command, x, y):
+        # Find the width of the lambda symbol and abstracted variables
+        variables = DrtTokens.LAMBDA + "%s" % expression.variable + DrtTokens.DOT
+        right = self._visit_command(variables, x, y)[0]
+
+        # Handle term
+        (right, bottom) = self._handle(expression.term, command, right, y)
+
+        # Handle variables now that we know the y-coordinate
+        command(variables, x, self._get_centered_top(y, bottom - y, self._get_text_height()))
+
+        return (right, bottom)
+
+    def _handle_BinaryExpression(self, expression, command, x, y):
+        # Get the full height of the line, based on the operands
+        first_height = self._visit(expression.first, 0, 0)[1]
+        second_height = self._visit(expression.second, 0, 0)[1]
+        line_height = max(first_height, second_height)
+
+        # Handle open paren
+        centred_string_top = self._get_centered_top(y, line_height, self._get_text_height())
+        right = command(DrtTokens.OPEN, x, centred_string_top)[0]
+
+        # Handle the first operand
+        first_height = expression.first._drawing_height
+        (right, first_bottom) = self._handle(expression.first, command, right, self._get_centered_top(y, line_height, first_height))
+
+        # Handle the operator
+        right = command(' %s ' % expression.getOp(), right, centred_string_top)[0]
+
+        # Handle the second operand
+        second_height = expression.second._drawing_height
+        (right, second_bottom) = self._handle(expression.second, command, right, self._get_centered_top(y, line_height, second_height))
+
+        # Handle close paren
+        right = command(DrtTokens.CLOSE, right, centred_string_top)[0]
+
+        return (right, max(first_bottom, second_bottom))
+
+    def _handle_DrtProposition(self, expression, command, x, y):
+        # Draw/measure the proposition label (the variable, rendered as text)
+        right = command("%s" % expression.variable, x, y)[0]
+
+        # Handle the embedded DRS (stored on DrtProposition as ``self.drs``)
+        (right, bottom) = self._handle(expression.drs, command, right, y)
+
+        return (right, bottom)
+
+    def _get_centered_top(self, top, full_height, item_height):
+        """Get the y-coordinate of the point that a figure should start at if
+        its height is 'item_height' and it needs to be centered in an area that
+        starts at 'top' and is 'full_height' tall."""
+        return top + (full_height - item_height) / 2
+
+
+def demo():
+    print('='*20 + 'TEST PARSE' + '='*20)
+    dexpr = DrtExpression.fromstring
+    print(dexpr(r'([x,y],[sees(x,y)])'))
+    print(dexpr(r'([x],[man(x), walks(x)])'))
+    print(dexpr(r'\x.\y.([],[sees(x,y)])'))
+    print(dexpr(r'\x.([],[walks(x)])(john)'))
+    print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))'))
+    print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))'))
+    print(dexpr(r'([x],[PRO(x), sees(John,x)])'))
+    print(dexpr(r'([x],[man(x), -([],[walks(x)])])'))
+    print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])'))
+
+    print('='*20 + 'Test fol()' + '='*20)
+    print(dexpr(r'([x,y],[sees(x,y)])').fol())
+
+    print('='*20 + 'Test alpha conversion and lambda expression equality' + '='*20)
+    e1 = dexpr(r'\x.([],[P(x)])')
+    print(e1)
+    e2 = e1.alpha_convert(Variable('z'))
+    print(e2)
+    print(e1 == e2)
+
+    print('='*20 + 'Test resolve_anaphora()' + '='*20)
+    print(resolve_anaphora(dexpr(r'([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])')))
+    print(resolve_anaphora(dexpr(r'([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])')))
+    print(resolve_anaphora(dexpr(r'(([x,y],[]) + ([],[PRO(x)]))')))
+
+    print('='*20 + 'Test pretty_print()' + '='*20)
+    dexpr(r"([],[])").pretty_print()
+    dexpr(r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])").pretty_print()
+    dexpr(r"([x,y],[x=y]) + ([z],[dog(z), walk(z)])").pretty_print()
+    dexpr(r"([],[([x],[]) | ([y],[]) | ([z],[dog(z), walk(z)])])").pretty_print()
+    dexpr(r"\P.\Q.(([x],[]) + P(x) + Q(x))(\x.([],[dog(x)]))").pretty_print()
+
+
+def test_draw():
+    try:
+        from six.moves.tkinter import Tk
+    except ImportError:
+        from nose import SkipTest
+        raise SkipTest("tkinter is required, but it's not available.")
+
+    expressions = [
+            r'x',
+            r'([],[])',
+            r'([x],[])',
+            r'([x],[man(x)])',
+
+            r'([x,y],[sees(x,y)])',
+            r'([x],[man(x), walks(x)])',
+            r'\x.([],[man(x), walks(x)])',
+            r'\x y.([],[sees(x,y)])',
+            r'([],[(([],[walks(x)]) + ([],[runs(x)]))])',
+
+            r'([x],[man(x), -([],[walks(x)])])',
+            r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])'
+            ]
+
+    for e in expressions:
+        d = DrtExpression.fromstring(e)
+        d.draw()
+
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/sem/drt.pyc b/nlp_resource_data/nltk/sem/drt.pyc
new file mode 100755 (executable)
index 0000000..9961ce7
Binary files /dev/null and b/nlp_resource_data/nltk/sem/drt.pyc differ
diff --git a/nlp_resource_data/nltk/sem/drt_glue_demo.py b/nlp_resource_data/nltk/sem/drt_glue_demo.py
new file mode 100755 (executable)
index 0000000..4fe4a47
--- /dev/null
@@ -0,0 +1,485 @@
+# Natural Language Toolkit: GUI Demo for Glue Semantics with Discourse
+#                           Representation Theory (DRT) as meaning language
+#
+# Author: Dan Garrette <dhgarrette@gmail.com>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+try:
+    from six.moves.tkinter import (Button, Frame, IntVar, Label, Listbox, Menu,
+                                   Scrollbar, Tk)
+    from six.moves.tkinter_font import Font
+    from nltk.draw.util import CanvasFrame, ShowText
+
+except ImportError:
+    """Ignore ImportError because tkinter might not be available."""
+
+from nltk.util import in_idle
+from nltk.tag import RegexpTagger
+from nltk.parse import MaltParser
+from nltk.sem.logic import Variable
+from nltk.sem.drt import DrsDrawer, DrtVariableExpression
+from nltk.sem.glue import DrtGlue
+
+
+class DrtGlueDemo(object):
+    def __init__(self, examples):
+        # Set up the main window.
+        self._top = Tk()
+        self._top.title('DRT Glue Demo')
+
+        # Set up key bindings.
+        self._init_bindings()
+
+        # Initialize the fonts.
+        self._init_fonts(self._top)
+
+        self._examples = examples
+        self._readingCache = [None for example in examples]
+
+        # The user can hide the grammar.
+        self._show_grammar = IntVar(self._top)
+        self._show_grammar.set(1)
+
+        # Set the data to None
+        self._curExample = -1
+        self._readings = []
+        self._drs = None
+        self._drsWidget = None
+        self._error = None
+
+        self._init_glue()
+
+        # Create the basic frames.
+        self._init_menubar(self._top)
+        self._init_buttons(self._top)
+        self._init_exampleListbox(self._top)
+        self._init_readingListbox(self._top)
+        self._init_canvas(self._top)
+
+        # Resize callback
+        self._canvas.bind('<Configure>', self._configure)
+
+    #########################################
+    ##  Initialization Helpers
+    #########################################
+
+    def _init_glue(self):
+        tagger = RegexpTagger(
+            [('^(David|Mary|John)$', 'NNP'),
+             ('^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$', 'VB'),
+             ('^(go|order|vanish|find|approach)$', 'VB'),
+             ('^(a)$', 'ex_quant'),
+             ('^(every)$', 'univ_quant'),
+             ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'),
+             ('^(big|gray|former)$', 'JJ'),
+             ('^(him|himself)$', 'PRP')
+        ])
+
+        depparser = MaltParser(tagger=tagger)
+        self._glue = DrtGlue(depparser=depparser, remove_duplicates=False)
+
+    def _init_fonts(self, root):
+        # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
+        self._sysfont = Font(font=Button()["font"])
+        root.option_add("*Font", self._sysfont)
+
+        # What's our font size? (default: same as sysfont)
+        self._size = IntVar(root)
+        self._size.set(self._sysfont.cget('size'))
+
+        self._boldfont = Font(family='helvetica', weight='bold',
+                                    size=self._size.get())
+        self._font = Font(family='helvetica',
+                                    size=self._size.get())
+        if self._size.get() < 0: big = self._size.get()-2
+        else: big = self._size.get()+2
+        self._bigfont = Font(family='helvetica', weight='bold',
+                                    size=big)
+
+    def _init_exampleListbox(self, parent):
+        self._exampleFrame = listframe = Frame(parent)
+        self._exampleFrame.pack(fill='both', side='left', padx=2)
+        self._exampleList_label = Label(self._exampleFrame, font=self._boldfont,
+                                     text='Examples')
+        self._exampleList_label.pack()
+        self._exampleList = Listbox(self._exampleFrame, selectmode='single',
+                                 relief='groove', background='white',
+                                 foreground='#909090', font=self._font,
+                                 selectforeground='#004040',
+                                 selectbackground='#c0f0c0')
+
+        self._exampleList.pack(side='right', fill='both', expand=1)
+
+        for example in self._examples:
+            self._exampleList.insert('end', ('  %s' % example))
+        self._exampleList.config(height=min(len(self._examples), 25), width=40)
+
+        # Add a scrollbar if there are more than 25 examples.
+        if len(self._examples) > 25:
+            listscroll = Scrollbar(self._exampleFrame,
+                                   orient='vertical')
+            self._exampleList.config(yscrollcommand = listscroll.set)
+            listscroll.config(command=self._exampleList.yview)
+            listscroll.pack(side='left', fill='y')
+
+        # If they select an example, apply it.
+        self._exampleList.bind('<<ListboxSelect>>', self._exampleList_select)
+
+    def _init_readingListbox(self, parent):
+        self._readingFrame = listframe = Frame(parent)
+        self._readingFrame.pack(fill='both', side='left', padx=2)
+        self._readingList_label = Label(self._readingFrame, font=self._boldfont,
+                                     text='Readings')
+        self._readingList_label.pack()
+        self._readingList = Listbox(self._readingFrame, selectmode='single',
+                                 relief='groove', background='white',
+                                 foreground='#909090', font=self._font,
+                                 selectforeground='#004040',
+                                 selectbackground='#c0f0c0')
+
+        self._readingList.pack(side='right', fill='both', expand=1)
+
+        # Always add a scrollbar for the readings list.
+        listscroll = Scrollbar(self._readingFrame,
+                               orient='vertical')
+        self._readingList.config(yscrollcommand = listscroll.set)
+        listscroll.config(command=self._readingList.yview)
+        listscroll.pack(side='right', fill='y')
+
+        self._populate_readingListbox()
+
+    def _populate_readingListbox(self):
+        # Populate the listbox with integers
+        self._readingList.delete(0, 'end')
+        for i in range(len(self._readings)):
+            self._readingList.insert('end', ('  %s' % (i+1)))
+        self._readingList.config(height=min(len(self._readings), 25), width=5)
+
+        # If they select a reading, apply it.
+        self._readingList.bind('<<ListboxSelect>>', self._readingList_select)
+
+    def _init_bindings(self):
+        # Key bindings are a good thing.
+        self._top.bind('<Control-q>', self.destroy)
+        self._top.bind('<Control-x>', self.destroy)
+        self._top.bind('<Escape>', self.destroy)
+        self._top.bind('n', self.next)
+        self._top.bind('<space>', self.next)
+        self._top.bind('p', self.prev)
+        self._top.bind('<BackSpace>', self.prev)
+
+    def _init_buttons(self, parent):
+        # Set up the frames.
+        self._buttonframe = buttonframe = Frame(parent)
+        buttonframe.pack(fill='none', side='bottom', padx=3, pady=2)
+        Button(buttonframe, text='Prev',
+               background='#90c0d0', foreground='black',
+               command=self.prev,).pack(side='left')
+        Button(buttonframe, text='Next',
+               background='#90c0d0', foreground='black',
+               command=self.next,).pack(side='left')
+
+    def _configure(self, event):
+        self._autostep = 0
+        (x1, y1, x2, y2) = self._cframe.scrollregion()
+        y2 = event.height - 6
+        self._canvas['scrollregion'] = '%d %d %d %d' % (x1,y1,x2,y2)
+        self._redraw()
+
+    def _init_canvas(self, parent):
+        self._cframe = CanvasFrame(parent, background='white',
+                                   #width=525, height=250,
+                                   closeenough=10,
+                                   border=2, relief='sunken')
+        self._cframe.pack(expand=1, fill='both', side='top', pady=2)
+        canvas = self._canvas = self._cframe.canvas()
+
+        # Initially, there's no tree or text
+        self._tree = None
+        self._textwidgets = []
+        self._textline = None
+
+    def _init_menubar(self, parent):
+        menubar = Menu(parent)
+
+        filemenu = Menu(menubar, tearoff=0)
+        filemenu.add_command(label='Exit', underline=1,
+                             command=self.destroy, accelerator='q')
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
+
+        actionmenu = Menu(menubar, tearoff=0)
+        actionmenu.add_command(label='Next', underline=0,
+                               command=self.next, accelerator='n, Space')
+        actionmenu.add_command(label='Previous', underline=0,
+                               command=self.prev, accelerator='p, Backspace')
+        menubar.add_cascade(label='Action', underline=0, menu=actionmenu)
+
+        optionmenu = Menu(menubar, tearoff=0)
+        optionmenu.add_checkbutton(label='Remove Duplicates', underline=0,
+                                   variable=self._glue.remove_duplicates,
+                                   command=self._toggle_remove_duplicates,
+                                   accelerator='r')
+        menubar.add_cascade(label='Options', underline=0, menu=optionmenu)
+
+        viewmenu = Menu(menubar, tearoff=0)
+        viewmenu.add_radiobutton(label='Tiny', variable=self._size,
+                                 underline=0, value=10, command=self.resize)
+        viewmenu.add_radiobutton(label='Small', variable=self._size,
+                                 underline=0, value=12, command=self.resize)
+        viewmenu.add_radiobutton(label='Medium', variable=self._size,
+                                 underline=0, value=14, command=self.resize)
+        viewmenu.add_radiobutton(label='Large', variable=self._size,
+                                 underline=0, value=18, command=self.resize)
+        viewmenu.add_radiobutton(label='Huge', variable=self._size,
+                                 underline=0, value=24, command=self.resize)
+        menubar.add_cascade(label='View', underline=0, menu=viewmenu)
+
+        helpmenu = Menu(menubar, tearoff=0)
+        helpmenu.add_command(label='About', underline=0,
+                             command=self.about)
+        menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
+
+        parent.config(menu=menubar)
+
+    #########################################
+    ##  Main draw procedure
+    #########################################
+
+    def _redraw(self):
+        canvas = self._canvas
+
+        # Delete the old DRS, widgets, etc.
+        if self._drsWidget is not None:
+            self._drsWidget.clear()
+
+        if self._drs:
+            self._drsWidget = DrsWidget( self._canvas, self._drs )
+            self._drsWidget.draw()
+
+        if self._error:
+            self._drsWidget = DrsWidget( self._canvas, self._error )
+            self._drsWidget.draw()
+
+    #########################################
+    ##  Button Callbacks
+    #########################################
+
+    def destroy(self, *e):
+        self._autostep = 0
+        if self._top is None: return
+        self._top.destroy()
+        self._top = None
+
+    def prev(self, *e):
+        selection = self._readingList.curselection()
+        readingListSize = self._readingList.size()
+
+        # there are readings
+        if readingListSize > 0:
+            # if one reading is currently selected
+            if len(selection) == 1:
+                index = int(selection[0])
+
+                # if it's on (or before) the first item
+                if index <= 0:
+                    self._select_previous_example()
+                else:
+                    self._readingList_store_selection(index-1)
+
+            else:
+                #select the last reading
+                self._readingList_store_selection(readingListSize-1)
+
+        else:
+            self._select_previous_example()
+
+
+    def _select_previous_example(self):
+        #if the current example is not the first example
+        if self._curExample > 0:
+            self._exampleList_store_selection(self._curExample-1)
+        else:
+            #go to the last example
+            self._exampleList_store_selection(len(self._examples)-1)
+
+    def next(self, *e):
+        selection = self._readingList.curselection()
+        readingListSize = self._readingList.size()
+
+        # if there are readings
+        if readingListSize > 0:
+            # if one reading is currently selected
+            if len(selection) == 1:
+                index = int(selection[0])
+
+                # if it's on (or past) the last item
+                if index >= (readingListSize-1):
+                    self._select_next_example()
+                else:
+                    self._readingList_store_selection(index+1)
+
+            else:
+                #select its first reading
+                self._readingList_store_selection(0)
+
+        else:
+            self._select_next_example()
+
+    def _select_next_example(self):
+        #if the current example is not the last example
+        if self._curExample < len(self._examples)-1:
+            self._exampleList_store_selection(self._curExample+1)
+        else:
+            #go to the first example
+            self._exampleList_store_selection(0)
+
+
+    def about(self, *e):
+        ABOUT = ("NLTK Discourse Representation Theory (DRT) Glue Semantics Demo\n"+
+                 "Written by Daniel H. Garrette")
+        TITLE = 'About: NLTK DRT Glue Demo'
+        try:
+            from six.moves.tkinter_messagebox import Message
+            Message(message=ABOUT, title=TITLE).show()
+        except:
+            ShowText(self._top, TITLE, ABOUT)
+
+    def postscript(self, *e):
+        self._autostep = 0
+        self._cframe.print_to_file()
+
+    def mainloop(self, *args, **kwargs):
+        """
+        Enter the Tkinter mainloop.  This function must be called if
+        this demo is created from a non-interactive program (e.g.
+        from a script); otherwise, the demo will close as soon as
+        the script completes.
+        """
+        if in_idle(): return
+        self._top.mainloop(*args, **kwargs)
+
+    def resize(self, size=None):
+        if size is not None: self._size.set(size)
+        size = self._size.get()
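+        # Note: Tk treats a negative font size as a size in pixels (positive sizes are points).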
+        self._font.configure(size=-(abs(size)))
+        self._boldfont.configure(size=-(abs(size)))
+        self._sysfont.configure(size=-(abs(size)))
+        self._bigfont.configure(size=-(abs(size+2)))
+        self._redraw()
+
+    def _toggle_remove_duplicates(self):
+        self._glue.remove_duplicates = not self._glue.remove_duplicates
+
+        self._exampleList.selection_clear(0, 'end')
+        self._readings = []
+        self._populate_readingListbox()
+        self._readingCache = [None for ex in self._examples]
+        self._curExample = -1
+        self._error = None
+
+        self._drs = None
+        self._redraw()
+
+
+    def _exampleList_select(self, event):
+        selection = self._exampleList.curselection()
+        if len(selection) != 1: return
+        self._exampleList_store_selection(int(selection[0]))
+
+    def _exampleList_store_selection(self, index):
+        self._curExample = index
+        example = self._examples[index]
+
+        self._exampleList.selection_clear(0, 'end')
+        if example:
+            cache = self._readingCache[index]
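+            # The cache entry is either a list of readings or the error that
+            # was raised when this example failed to parse.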
+            if cache:
+                if isinstance(cache, list):
+                    self._readings = cache
+                    self._error = None
+                else:
+                    self._readings = []
+                    self._error = cache
+            else:
+                try:
+                    self._readings = self._glue.parse_to_meaning(example)
+                    self._error = None
+                    self._readingCache[index] = self._readings
+                except Exception as e:
+                    self._readings = []
+                    self._error = DrtVariableExpression(Variable('Error: ' + str(e)))
+                    self._readingCache[index] = self._error
+
+                    #add a star to the end of the example
+                    self._exampleList.delete(index)
+                    self._exampleList.insert(index, ('  %s *' % example))
+                    self._exampleList.config(height=min(len(self._examples), 25), width=40)
+
+            self._populate_readingListbox()
+
+            self._exampleList.selection_set(index)
+
+            self._drs = None
+            self._redraw()
+
+
+    def _readingList_select(self, event):
+        selection = self._readingList.curselection()
+        if len(selection) != 1: return
+        self._readingList_store_selection(int(selection[0]))
+
+    def _readingList_store_selection(self, index):
+        reading = self._readings[index]
+
+        self._readingList.selection_clear(0, 'end')
+        if reading:
+            self._readingList.selection_set(index)
+
+            self._drs = reading.simplify().normalize().resolve_anaphora()
+
+            self._redraw()
+
+
+class DrsWidget(object):
+    def __init__(self, canvas, drs, **attribs):
+        self._drs = drs
+        self._canvas = canvas
+        canvas.font = Font(font=canvas.itemcget(canvas.create_text(0, 0, text=''), 'font'))
+        canvas._BUFFER = 3
+        self.bbox = (0, 0, 0, 0)
+
+    def draw(self):
+        (right, bottom) = DrsDrawer(self._drs, canvas=self._canvas).draw()
+        self.bbox = (0, 0, right+1, bottom+1)
+
+    def clear(self):
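+        # Erase the previous drawing by painting a white rectangle over its bounding box.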
+        self._canvas.create_rectangle(self.bbox, fill="white", width="0" )
+
+def demo():
+    examples = ['John walks',
+                'David sees Mary',
+                'David eats a sandwich',
+                'every man chases a dog',
+#                'every man believes a dog yawns',
+#                'John gives David a sandwich',
+                'John chases himself',
+#                'John persuades David to order a pizza',
+#                'John tries to go',
+#                'John tries to find a unicorn',
+#                'John seems to vanish',
+#                'a unicorn seems to approach',
+#                'every big cat leaves',
+#                'every gray cat leaves',
+#                'every big gray cat leaves',
+#                'a former senator leaves',
+#                'John likes a cat',
+#                'John likes every cat',
+#                'he walks',
+#                'John walks and he leaves'
+                ]
+    DrtGlueDemo(examples).mainloop()
+
+if __name__ == '__main__': demo()
diff --git a/nlp_resource_data/nltk/sem/drt_glue_demo.pyc b/nlp_resource_data/nltk/sem/drt_glue_demo.pyc
new file mode 100755 (executable)
index 0000000..d26406a
Binary files /dev/null and b/nlp_resource_data/nltk/sem/drt_glue_demo.pyc differ
diff --git a/nlp_resource_data/nltk/sem/evaluate.py b/nlp_resource_data/nltk/sem/evaluate.py
new file mode 100755 (executable)
index 0000000..f9cdb90
--- /dev/null
@@ -0,0 +1,790 @@
+# Natural Language Toolkit: Models for first-order languages with lambda
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>,
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+#TODO:
+    #- fix tracing
+    #- fix iterator-based approach to existentials
+
+"""
+This module provides data structures for representing first-order
+models.
+"""
+from __future__ import print_function, unicode_literals
+
+from pprint import pformat
+import inspect
+import textwrap
+import re
+import sys
+
+from six import string_types
+
+from nltk.decorators import decorator # this used in code that is commented out
+from nltk.compat import python_2_unicode_compatible
+
+from nltk.sem.logic import (AbstractVariableExpression, AllExpression, Expression,
+                            AndExpression, ApplicationExpression, EqualityExpression,
+                            ExistsExpression, IffExpression, ImpExpression,
+                            IndividualVariableExpression, LambdaExpression,
+                            NegatedExpression, OrExpression,
+                            Variable, is_indvar)
+
+
+class Error(Exception): pass
+
+class Undefined(Error):  pass
+
+def trace(f, *args, **kw):
+    if sys.version_info[0] >= 3:
+        argspec = inspect.getfullargspec(f)
+    else:
+        argspec = inspect.getargspec(f)
+    d = dict(zip(argspec[0], args))
+    if d.pop('trace', None):
+        print()
+        for item in d.items():
+            print("%s => %s" % item)
+    return f(*args, **kw)
+
+def is_rel(s):
+    """
+    Check whether a set represents a relation (of any arity).
+
+    :param s: a set containing tuples of str elements
+    :type s: set
+    :rtype: bool
+        """
+    # we have the empty relation, i.e. set()
+    if len(s) == 0:
+        return True
+    # all the elements are tuples of the same length
+    elif all(isinstance(el, tuple) for el in s) and len(set(len(el) for el in s)) == 1:
+        return True
+    else:
+        raise ValueError("Set %r contains sequences of different lengths" % s)
+
+def set2rel(s):
+    """
+    Convert a set containing individuals (strings or numbers) into a set of
+    unary tuples. Any tuples of strings already in the set are passed through
+    unchanged.
+
+    For example:
+      - set(['a', 'b']) => set([('a',), ('b',)])
+      - set([3, 27]) => set([('3',), ('27',)])
+
+    :type s: set
+    :rtype: set of tuple of str
+    """
+    new = set()
+    for elem in s:
+        if isinstance(elem, string_types):
+            new.add((elem,))
+        elif isinstance(elem, int):
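+            # The trailing comma builds a unary tuple, e.g. 3 -> ('3',).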
+            new.add((str(elem),))
+        else:
+            new.add(elem)
+    return new
+
+def arity(rel):
+    """
+    Check the arity of a relation.
+    :type rel: set of tuples
+    :rtype: int
+    """
+    if len(rel) == 0:
+        return 0
+    return len(list(rel)[0])
+
+
+@python_2_unicode_compatible
+class Valuation(dict):
+    """
+    A dictionary which represents a model-theoretic Valuation of non-logical constants.
+    Keys are strings representing the constants to be interpreted, and values correspond
+    to individuals (represented as strings) and n-ary relations (represented as sets of tuples
+    of strings).
+
+    An instance of ``Valuation`` will raise a KeyError exception (i.e.,
+    just behave like a standard  dictionary) if indexed with an expression that
+    is not in its list of symbols.
+    """
+    def __init__(self, xs):
+        """
+        :param xs: a list of (symbol, value) pairs.
+        """
+        super(Valuation, self).__init__()
+        for (sym, val) in xs:
+            if isinstance(val, string_types) or isinstance(val, bool):
+                self[sym] = val
+            elif isinstance(val, set):
+                self[sym] = set2rel(val)
+            else:
+                msg = textwrap.fill("Error in initializing Valuation. "
+                                    "Unrecognized value for symbol '%s':\n%s" % (sym, val), width=66)
+
+                raise ValueError(msg)
+
+    def __getitem__(self, key):
+        if key in self:
+            return dict.__getitem__(self, key)
+        else:
+            raise Undefined("Unknown expression: '%s'" % key)
+
+    def __str__(self):
+        return pformat(self)
+
+    @property
+    def domain(self):
+        """Set-theoretic domain of the value-space of a Valuation."""
+        dom = []
+        for val in self.values():
+            if isinstance(val, string_types):
+                dom.append(val)
+            elif not isinstance(val, bool):
+                dom.extend([elem for tuple_ in val for elem in tuple_ if elem is not None])
+        return set(dom)
+
+    @property
+    def symbols(self):
+        """The non-logical constants which the Valuation recognizes."""
+        return sorted(self.keys())
+
+    @classmethod
+    def fromstring(cls, s):
+        return read_valuation(s)
+
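+# Illustrative sketch (not part of the original module): a Valuation maps
+# constants to individuals and relations, e.g.
+#
+#     val = Valuation([('adam', 'b1'), ('girl', set(['g1', 'g2']))])
+#     val['girl']   # -> set([('g1',), ('g2',)])
+#     val.domain    # -> set(['b1', 'g1', 'g2'])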
+
+##########################################
+# REs used by the _read_valuation function
+##########################################
+_VAL_SPLIT_RE = re.compile(r'\s*=+>\s*')
+_ELEMENT_SPLIT_RE = re.compile(r'\s*,\s*')
+_TUPLES_RE = re.compile(r"""\s*
+                                (\([^)]+\))  # tuple-expression
+                                \s*""", re.VERBOSE)
+
+def _read_valuation_line(s):
+    """
+    Read a line in a valuation file.
+
+    Lines are expected to be of the form::
+
+      noosa => n
+      girl => {g1, g2}
+      chase => {(b1, g1), (b2, g1), (g1, d1), (g2, d2)}
+
+    :param s: input line
+    :type s: str
+    :return: a pair (symbol, value)
+    :rtype: tuple
+    """
+    pieces = _VAL_SPLIT_RE.split(s)
+    symbol = pieces[0]
+    value = pieces[1]
+    # check whether the value is meant to be a set
+    if value.startswith('{'):
+        value = value[1:-1]
+        tuple_strings = _TUPLES_RE.findall(value)
+        # are the set elements tuples?
+        if tuple_strings:
+            set_elements = []
+            for ts in tuple_strings:
+                ts = ts[1:-1]
+                element = tuple(_ELEMENT_SPLIT_RE.split(ts))
+                set_elements.append(element)
+        else:
+            set_elements = _ELEMENT_SPLIT_RE.split(value)
+        value = set(set_elements)
+    return symbol, value
+
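+# For example (illustrative), _read_valuation_line('girl => {g1, g2}') returns
+# ('girl', set(['g1', 'g2'])); read_valuation below applies the same parsing
+# line by line, skipping blank lines and '#' comments.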
+def read_valuation(s, encoding=None):
+    """
+    Convert a valuation string into a valuation.
+
+    :param s: a valuation string
+    :type s: str
+    :param encoding: the encoding of the input string, if it is binary
+    :type encoding: str
+    :return: a ``nltk.sem`` valuation
+    :rtype: Valuation
+    """
+    if encoding is not None:
+        s = s.decode(encoding)
+    statements = []
+    for linenum, line in enumerate(s.splitlines()):
+        line = line.strip()
+        if line.startswith('#') or line=='': continue
+        try:
+            statements.append(_read_valuation_line(line))
+        except ValueError:
+            raise ValueError('Unable to parse line %s: %s' % (linenum, line))
+    return Valuation(statements)
+
+
+@python_2_unicode_compatible
+class Assignment(dict):
+    """
+    A dictionary which represents an assignment of values to variables.
+
+    An assignment can only assign values from its domain.
+
+    If an unknown expression *a* is passed to a model *M*\ 's
+    interpretation function *i*, *i* will first check whether *M*\ 's
+    valuation assigns an interpretation to *a* as a constant, and if
+    this fails, *i* will delegate the interpretation of *a* to
+    *g*. *g* only assigns values to individual variables (i.e.,
+    members of the class ``IndividualVariableExpression`` in the ``logic``
+    module). If a variable is not assigned a value by *g*, it will raise
+    an ``Undefined`` exception.
+
+    A variable *Assignment* is a mapping from individual variables to
+    entities in the domain. Individual variables are usually indicated
+    with the letters ``'x'``, ``'y'``, ``'w'`` and ``'z'``, optionally
+    followed by an integer (e.g., ``'x0'``, ``'y332'``).  Assignments are
+    created using the ``Assignment`` constructor, which also takes the
+    domain as a parameter.
+
+        >>> from nltk.sem.evaluate import Assignment
+        >>> dom = set(['u1', 'u2', 'u3', 'u4'])
+        >>> g3 = Assignment(dom, [('x', 'u1'), ('y', 'u2')])
+        >>> g3 == {'x': 'u1', 'y': 'u2'}
+        True
+
+    There is also a ``print`` format for assignments which uses a notation
+    closer to that in logic textbooks:
+
+        >>> print(g3)
+        g[u1/x][u2/y]
+
+    It is also possible to update an assignment using the ``add`` method:
+
+        >>> dom = set(['u1', 'u2', 'u3', 'u4'])
+        >>> g4 = Assignment(dom)
+        >>> g4.add('x', 'u1')
+        {'x': 'u1'}
+
+    With no arguments, ``purge()`` is equivalent to ``clear()`` on a dictionary:
+
+        >>> g4.purge()
+        >>> g4
+        {}
+
+    :param domain: the domain of discourse
+    :type domain: set
+    :param assign: a list of (varname, value) associations
+    :type assign: list
+    """
+
+    def __init__(self, domain, assign=None):
+        super(Assignment, self).__init__()
+        self.domain = domain
+        if assign:
+            for (var, val) in assign:
+                assert val in self.domain,\
+                       "'%s' is not in the domain: %s" % (val, self.domain)
+                assert is_indvar(var),\
+                       "Wrong format for an Individual Variable: '%s'" % var
+                self[var] = val
+        self.variant = None
+        self._addvariant()
+
+    def __getitem__(self, key):
+        if key in self:
+            return dict.__getitem__(self, key)
+        else:
+            raise Undefined("Not recognized as a variable: '%s'" % key)
+
+    def copy(self):
+        new = Assignment(self.domain)
+        new.update(self)
+        return new
+
+    def purge(self, var=None):
+        """
+        Remove one or all keys (i.e. logic variables) from an
+        assignment, and update ``self.variant``.
+
+        :param var: a Variable acting as a key for the assignment.
+        """
+        if var:
+            del self[var]
+        else:
+            self.clear()
+        self._addvariant()
+        return None
+
+    def __str__(self):
+        """
+        Pretty printing for assignments. {'x': 'u'} appears as 'g[u/x]'
+        """
+        gstring = "g"
+        # Deterministic output for unit testing.
+        variant = sorted(self.variant)
+        for (val, var) in variant:
+            gstring += "[%s/%s]" % (val, var)
+        return gstring
+
+    def _addvariant(self):
+        """
+        Create a more pretty-printable version of the assignment.
+        """
+        list_ = []
+        for item in self.items():
+            pair = (item[1], item[0])
+            list_.append(pair)
+        self.variant = list_
+        return None
+
+    def add(self, var, val):
+        """
+        Add a new variable-value pair to the assignment, and update
+        ``self.variant``.
+
+        """
+        assert val in self.domain,\
+               "%s is not in the domain %s" % (val, self.domain)
+        assert is_indvar(var),\
+               "Wrong format for an Individual Variable: '%s'" % var
+        self[var] = val
+        self._addvariant()
+        return self
+
+
+@python_2_unicode_compatible
+class Model(object):
+    """
+    A first order model is a domain *D* of discourse and a valuation *V*.
+
+    A domain *D* is a set, and a valuation *V* is a map that associates
+    expressions with values in the model.
+    The domain of *V* should be a subset of *D*.
+
+    Construct a new ``Model``.
+
+    :type domain: set
+    :param domain: A set of entities representing the domain of discourse of the model.
+    :type valuation: Valuation
+    :param valuation: the valuation of the model.
+    """
+
+    def __init__(self, domain, valuation):
+        assert isinstance(domain, set)
+        self.domain = domain
+        self.valuation = valuation
+        if not domain.issuperset(valuation.domain):
+            raise Error("The valuation domain, %s, must be a subset of the model's domain, %s"\
+                  % (valuation.domain, domain))
+
+    def __repr__(self):
+        return "(%r, %r)" % (self.domain, self.valuation)
+
+    def __str__(self):
+        return "Domain = %s,\nValuation = \n%s" % (self.domain, self.valuation)
+
+    def evaluate(self, expr, g, trace=None):
+        """
+        Read input expressions, and provide a handler for ``satisfy``
+        that blocks further propagation of the ``Undefined`` error.
+        :param expr: An ``Expression`` of ``logic``.
+        :type g: Assignment
+        :param g: an assignment to individual variables.
+        :rtype: bool or 'Undefined'
+        """
+        try:
+            parsed = Expression.fromstring(expr)
+            value = self.satisfy(parsed, g, trace=trace)
+            if trace:
+                print()
+                print("'%s' evaluates to %s under M, %s" %  (expr, value, g))
+            return value
+        except Undefined:
+            if trace:
+                print()
+                print("'%s' is undefined under M, %s" %  (expr, g))
+            return 'Undefined'
+
+
+    def satisfy(self, parsed, g, trace=None):
+        """
+        Recursive interpretation function for a formula of first-order logic.
+
+        Raises an ``Undefined`` error when ``parsed`` is an atomic string
+        but is not a symbol or an individual variable.
+
+        :return: Returns a truth value or ``Undefined`` if ``parsed`` is\
+        complex, and calls the interpretation function ``i`` if ``parsed``\
+        is atomic.
+
+        :param parsed: An expression of ``logic``.
+        :type g: Assignment
+        :param g: an assignment to individual variables.
+        """
+
+        if isinstance(parsed, ApplicationExpression):
+            function, arguments = parsed.uncurry()
+            if isinstance(function, AbstractVariableExpression):
+                #It's a predicate expression ("P(x,y)"), so used uncurried arguments
+                funval = self.satisfy(function, g)
+                argvals = tuple(self.satisfy(arg, g) for arg in arguments)
+                return argvals in funval
+            else:
+                #It must be a lambda expression, so use curried form
+                funval = self.satisfy(parsed.function, g)
+                argval = self.satisfy(parsed.argument, g)
+                return funval[argval]
+        elif isinstance(parsed, NegatedExpression):
+            return not self.satisfy(parsed.term, g)
+        elif isinstance(parsed, AndExpression):
+            return self.satisfy(parsed.first, g) and \
+                   self.satisfy(parsed.second, g)
+        elif isinstance(parsed, OrExpression):
+            return self.satisfy(parsed.first, g) or \
+                   self.satisfy(parsed.second, g)
+        elif isinstance(parsed, ImpExpression):
+            return (not self.satisfy(parsed.first, g)) or \
+                   self.satisfy(parsed.second, g)
+        elif isinstance(parsed, IffExpression):
+            return self.satisfy(parsed.first, g) == \
+                   self.satisfy(parsed.second, g)
+        elif isinstance(parsed, EqualityExpression):
+            return self.satisfy(parsed.first, g) == \
+                   self.satisfy(parsed.second, g)
+        elif isinstance(parsed, AllExpression):
+            new_g = g.copy()
+            for u in self.domain:
+                new_g.add(parsed.variable.name, u)
+                if not self.satisfy(parsed.term, new_g):
+                    return False
+            return True
+        elif isinstance(parsed, ExistsExpression):
+            new_g = g.copy()
+            for u in self.domain:
+                new_g.add(parsed.variable.name, u)
+                if self.satisfy(parsed.term, new_g):
+                    return True
+            return False
+        elif isinstance(parsed, LambdaExpression):
+            cf = {}
+            var = parsed.variable.name
+            for u in self.domain:
+                val = self.satisfy(parsed.term, g.add(var, u))
+                # NB the dict would be a lot smaller if we do this:
+                # if val: cf[u] = val
+                # But then need to deal with cases where f(a) should yield
+                # a function rather than just False.
+                cf[u] = val
+            return cf
+        else:
+            return self.i(parsed, g, trace)
+
+    #@decorator(trace_eval)
+    def i(self, parsed, g, trace=False):
+        """
+        An interpretation function.
+
+        Assuming that ``parsed`` is atomic:
+
+        - if ``parsed`` is a non-logical constant, calls the valuation *V*
+        - else if ``parsed`` is an individual variable, calls assignment *g*
+        - else returns ``Undefined``.
+
+        :param parsed: an ``Expression`` of ``logic``.
+        :type g: Assignment
+        :param g: an assignment to individual variables.
+        :return: a semantic value
+        """
+        # If parsed is a propositional letter 'p', 'q', etc, it could be in valuation.symbols
+        # and also be an IndividualVariableExpression. We want to catch this first case.
+        # So there is a procedural consequence to the ordering of clauses here:
+        if parsed.variable.name in self.valuation.symbols:
+            return self.valuation[parsed.variable.name]
+        elif isinstance(parsed, IndividualVariableExpression):
+            return g[parsed.variable.name]
+
+        else:
+            raise Undefined("Can't find a value for %s" % parsed)
+
+    def satisfiers(self, parsed, varex, g, trace=None, nesting=0):
+        """
+        Generate the entities from the model's domain that satisfy an open formula.
+
+        :param parsed: an open formula
+        :type parsed: Expression
+        :param varex: the relevant free individual variable in ``parsed``.
+        :type varex: VariableExpression or str
+        :param g: a variable assignment
+        :type g:  Assignment
+        :return: a set of the entities that satisfy ``parsed``.
+        """
+
+        spacer = '   '
+        indent = spacer + (spacer * nesting)
+        candidates = []
+
+        if isinstance(varex, string_types):
+            var = Variable(varex)
+        else:
+            var = varex
+
+        if var in parsed.free():
+            if trace:
+                print()
+                print((spacer * nesting) + "Open formula is '%s' with assignment %s" % (parsed, g))
+            for u in self.domain:
+                new_g = g.copy()
+                new_g.add(var.name, u)
+                if trace and trace > 1:
+                    lowtrace = trace-1
+                else:
+                    lowtrace = 0
+                value = self.satisfy(parsed, new_g, lowtrace)
+
+                if trace:
+                    print(indent + "(trying assignment %s)" % new_g)
+
+                # parsed == False under g[u/var]?
+                if value == False:
+                    if trace:
+                        print(indent + "value of '%s' under %s is False" % (parsed, new_g))
+
+                # so g[u/var] is a satisfying assignment
+                else:
+                    candidates.append(u)
+                    if trace:
+                        print(indent + "value of '%s' under %s is %s" % (parsed, new_g, value))
+
+            result = set(candidates)
+        # var isn't free in parsed
+        else:
+            raise Undefined("%s is not free in %s" % (var.name, parsed))
+
+        return result
+
+
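+# Minimal usage sketch (illustrative only, not exercised by the demos below):
+#
+#     val = Valuation([('adam', 'b1'), ('girl', set(['g1'])),
+#                      ('love', set([('b1', 'g1')]))])
+#     m = Model(val.domain, val)
+#     g = Assignment(val.domain)
+#     m.evaluate('exists x. (girl(x) & love(adam, x))', g)   # -> True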
+
+
+
+#//////////////////////////////////////////////////////////////////////
+# Demo..
+#//////////////////////////////////////////////////////////////////////
+# number of spacer chars
+mult = 30
+
+# Demo 1: Propositional Logic
+#################
+def propdemo(trace=None):
+    """Example of a propositional model."""
+
+    global val1, dom1, m1, g1
+    val1 = Valuation([('P', True), ('Q', True), ('R', False)])
+    dom1 = set([])
+    m1 = Model(dom1, val1)
+    g1 = Assignment(dom1)
+
+    print()
+    print('*' * mult)
+    print("Propositional Formulas Demo")
+    print('*' * mult)
+    print('(Propositional constants treated as nullary predicates)')
+    print()
+    print("Model m1:\n", m1)
+    print('*' * mult)
+    sentences = [
+    '(P & Q)',
+    '(P & R)',
+    '- P',
+    '- R',
+    '- - P',
+    '- (P & R)',
+    '(P | R)',
+    '(R | P)',
+    '(R | R)',
+    '(- P | R)',
+    '(P | - P)',
+    '(P -> Q)',
+    '(P -> R)',
+    '(R -> P)',
+    '(P <-> P)',
+    '(R <-> R)',
+    '(P <-> R)',
+    ]
+
+    for sent in sentences:
+        if trace:
+            print()
+            m1.evaluate(sent, g1, trace)
+        else:
+            print("The value of '%s' is: %s" % (sent, m1.evaluate(sent, g1)))
+
+# Demo 2: FOL Model
+#############
+
+def folmodel(quiet=False, trace=None):
+    """Example of a first-order model."""
+
+    global val2, v2, dom2, m2, g2
+
+    v2 = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),\
+         ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1'])),
+         ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))]
+    val2 = Valuation(v2)
+    dom2 = val2.domain
+    m2 = Model(dom2, val2)
+    g2 = Assignment(dom2, [('x', 'b1'), ('y', 'g2')])
+
+    if not quiet:
+        print()
+        print('*' * mult)
+        print("Models Demo")
+        print("*" * mult)
+        print("Model m2:\n", "-" * 14,"\n", m2)
+        print("Variable assignment = ", g2)
+
+        exprs = ['adam', 'boy', 'love', 'walks', 'x', 'y', 'z']
+        parsed_exprs = [Expression.fromstring(e) for e in exprs]
+
+        print()
+        for parsed in parsed_exprs:
+            try:
+                print("The interpretation of '%s' in m2 is %s" % (parsed, m2.i(parsed, g2)))
+            except Undefined:
+                print("The interpretation of '%s' in m2 is Undefined" % parsed)
+
+
+        applications = [('boy', ('adam',)), ('walks', ('adam',)), ('love', ('adam', 'y')), ('love', ('y', 'adam'))]
+
+        for (fun, args) in applications:
+            try:
+                funval = m2.i(Expression.fromstring(fun), g2)
+                argsval = tuple(m2.i(Expression.fromstring(arg), g2) for arg in args)
+                print("%s(%s) evaluates to %s" % (fun, args, argsval in funval))
+            except Undefined:
+                print("%s(%s) evaluates to Undefined" % (fun, args))
+
+# Demo 3: FOL
+#########
+
+def foldemo(trace=None):
+    """
+    Interpretation of closed expressions in a first-order model.
+    """
+    folmodel(quiet=True)
+
+    print()
+    print('*' * mult)
+    print("FOL Formulas Demo")
+    print('*' * mult)
+
+    formulas = [
+    'love (adam, betty)',
+    '(adam = mia)',
+    '\\x. (boy(x) | girl(x))',
+    '\\x. boy(x)(adam)',
+    '\\x y. love(x, y)',
+    '\\x y. love(x, y)(adam)(betty)',
+    '\\x y. love(x, y)(adam, betty)',
+    '\\x y. (boy(x) & love(x, y))',
+    '\\x. exists y. (boy(x) & love(x, y))',
+    'exists z1. boy(z1)',
+    'exists x. (boy(x) &  -(x = adam))',
+    'exists x. (boy(x) & all y. love(y, x))',
+    'all x. (boy(x) | girl(x))',
+    'all x. (girl(x) -> exists y. boy(y) & love(x, y))',    #Every girl loves some boy.
+    'exists x. (boy(x) & all y. (girl(y) -> love(y, x)))',  #There is some boy that every girl loves.
+    'exists x. (boy(x) & all y. (girl(y) -> love(x, y)))',  #Some boy loves every girl.
+    'all x. (dog(x) -> - girl(x))',
+    'exists x. exists y. (love(x, y) & love(x, y))'
+    ]
+
+
+    for fmla in formulas:
+        g2.purge()
+        if trace:
+            m2.evaluate(fmla, g2, trace)
+        else:
+            print("The value of '%s' is: %s" % (fmla, m2.evaluate(fmla, g2)))
+
+
+# Demo 4: Satisfaction
+#############
+
+def satdemo(trace=None):
+    """Satisfiers of an open formula in a first order model."""
+
+    print()
+    print('*' * mult)
+    print("Satisfiers Demo")
+    print('*' * mult)
+
+    folmodel(quiet=True)
+
+    formulas = [
+               'boy(x)',
+               '(x = x)',
+               '(boy(x) | girl(x))',
+               '(boy(x) & girl(x))',
+               'love(adam, x)',
+               'love(x, adam)',
+               '-(x = adam)',
+               'exists z22. love(x, z22)',
+               'exists y. love(y, x)',
+               'all y. (girl(y) -> love(x, y))',
+               'all y. (girl(y) -> love(y, x))',
+               'all y. (girl(y) -> (boy(x) & love(y, x)))',
+               '(boy(x) & all y. (girl(y) -> love(x, y)))',
+               '(boy(x) & all y. (girl(y) -> love(y, x)))',
+               '(boy(x) & exists y. (girl(y) & love(y, x)))',
+               '(girl(x) -> dog(x))',
+               'all y. (dog(y) -> (x = y))',
+               'exists y. love(y, x)',
+               'exists y. (love(adam, y) & love(y, x))'
+                ]
+
+    if trace:
+        print(m2)
+
+    for fmla in formulas:
+        print(fmla)
+        Expression.fromstring(fmla)
+
+    parsed = [Expression.fromstring(fmla) for fmla in formulas]
+
+    for p in parsed:
+        g2.purge()
+        print("The satisfiers of '%s' are: %s" % (p, m2.satisfiers(p, 'x', g2, trace)))
+
+
+def demo(num=0, trace=None):
+    """
+    Run some demos.
+
+     - num = 1: propositional logic demo
+     - num = 2: first order model demo (only if trace is set)
+     - num = 3: first order sentences demo
+     - num = 4: satisfaction of open formulas demo
+     - any other value: run all the demos
+
+    :param trace: trace = 1, or trace = 2 for more verbose tracing
+    """
+    demos = {
+        1: propdemo,
+        2: folmodel,
+        3: foldemo,
+        4: satdemo}
+
+    try:
+        demos[num](trace=trace)
+    except KeyError:
+        for num in demos:
+            demos[num](trace=trace)
+
+
+if __name__ == "__main__":
+    demo(2, trace=0)
diff --git a/nlp_resource_data/nltk/sem/evaluate.pyc b/nlp_resource_data/nltk/sem/evaluate.pyc
new file mode 100755 (executable)
index 0000000..d580411
Binary files /dev/null and b/nlp_resource_data/nltk/sem/evaluate.pyc differ
diff --git a/nlp_resource_data/nltk/sem/glue.py b/nlp_resource_data/nltk/sem/glue.py
new file mode 100755 (executable)
index 0000000..765ff3f
--- /dev/null
@@ -0,0 +1,696 @@
+# Natural Language Toolkit: Glue Semantics
+#
+# Author: Dan Garrette <dhgarrette@gmail.com>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, division, unicode_literals
+
+import os
+from itertools import chain
+
+from six import string_types
+
+import nltk
+from nltk.internals import Counter
+from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, RegexpTagger
+from nltk.sem.logic import (Expression, Variable, VariableExpression,
+                            LambdaExpression, AbstractVariableExpression)
+from nltk.compat import python_2_unicode_compatible
+from nltk.sem import drt
+from nltk.sem import linearlogic
+
+SPEC_SEMTYPES = {'a'       : 'ex_quant',
+                 'an'      : 'ex_quant',
+                 'every'   : 'univ_quant',
+                 'the'     : 'def_art',
+                 'no'      : 'no_quant',
+                 'default' : 'ex_quant'}
+
+OPTIONAL_RELATIONSHIPS = ['nmod', 'vmod', 'punct']
+
+@python_2_unicode_compatible
+class GlueFormula(object):
+    def __init__(self, meaning, glue, indices=None):
+        if not indices:
+            indices = set()
+
+        if isinstance(meaning, string_types):
+            self.meaning = Expression.fromstring(meaning)
+        elif isinstance(meaning, Expression):
+            self.meaning = meaning
+        else:
+            raise RuntimeError('Meaning term neither string nor expression: %s, %s' % (meaning, meaning.__class__))
+
+        if isinstance(glue, string_types):
+            self.glue = linearlogic.LinearLogicParser().parse(glue)
+        elif isinstance(glue, linearlogic.Expression):
+            self.glue = glue
+        else:
+            raise RuntimeError('Glue term neither string nor expression: %s, %s' % (glue, glue.__class__))
+
+        self.indices = indices
+
+    def applyto(self, arg):
+        """ self = (\\x.(walk x), (subj -o f))
+            arg  = (john        ,  subj)
+            returns ((walk john),          f)
+        """
+        if self.indices & arg.indices: # if the sets are NOT disjoint
+            raise linearlogic.LinearLogicApplicationException("'%s' applied to '%s'.  Indices are not disjoint." % (self, arg))
+        else: # if the sets ARE disjoint
+            return_indices = (self.indices | arg.indices)
+
+        try:
+            return_glue = linearlogic.ApplicationExpression(self.glue, arg.glue, arg.indices)
+        except linearlogic.LinearLogicApplicationException:
+            raise linearlogic.LinearLogicApplicationException("'%s' applied to '%s'" % (self.simplify(), arg.simplify()))
+
+        arg_meaning_abstracted = arg.meaning
+        if return_indices:
+            for dep in self.glue.simplify().antecedent.dependencies[::-1]: # if self.glue is (A -o B), dep is in A.dependencies
+                arg_meaning_abstracted = self.make_LambdaExpression(Variable('v%s' % dep),
+                                                                    arg_meaning_abstracted)
+        return_meaning = self.meaning.applyto(arg_meaning_abstracted)
+
+        return self.__class__(return_meaning, return_glue, return_indices)
+
+    def make_VariableExpression(self, name):
+        return VariableExpression(name)
+
+    def make_LambdaExpression(self, variable, term):
+        return LambdaExpression(variable, term)
+
+    def lambda_abstract(self, other):
+        assert isinstance(other, GlueFormula)
+        assert isinstance(other.meaning, AbstractVariableExpression)
+        return self.__class__(self.make_LambdaExpression(other.meaning.variable,
+                                                         self.meaning),
+                              linearlogic.ImpExpression(other.glue, self.glue))
+
+    def compile(self, counter=None):
+        """From Iddo Lev's PhD Dissertation p108-109"""
+        if not counter:
+            counter = Counter()
+        (compiled_glue, new_forms) = self.glue.simplify().compile_pos(counter, self.__class__)
+        return new_forms + [self.__class__(self.meaning, compiled_glue, set([counter.get()]))]
+
+    def simplify(self):
+        return self.__class__(self.meaning.simplify(), self.glue.simplify(), self.indices)
+
+    def __eq__(self, other):
+        return self.__class__ == other.__class__ and self.meaning == other.meaning and self.glue == other.glue
+
+    def __ne__(self, other):
+        return not self == other
+
+    # sorting for use in doctests which must be deterministic
+    def __lt__(self, other):
+        return str(self) < str(other)
+
+    def __str__(self):
+        assert isinstance(self.indices, set)
+        accum = '%s : %s' % (self.meaning, self.glue)
+        if self.indices:
+            accum += ' : {' + ', '.join(str(index) for index in self.indices) + '}'
+        return accum
+
+    def __repr__(self):
+        return "%s" % self
+
+@python_2_unicode_compatible
+class GlueDict(dict):
+    def __init__(self, filename, encoding=None):
+        self.filename = filename
+        self.file_encoding = encoding
+        self.read_file()
+
+    def read_file(self, empty_first=True):
+        if empty_first:
+            self.clear()
+
+        try:
+            contents = nltk.data.load(self.filename, format='text', encoding=self.file_encoding)
+            # TODO: the above can't handle zip files, but this should anyway be fixed in nltk.data.load()
+        except LookupError as e:
+            try:
+                contents = nltk.data.load('file:' + self.filename, format='text', encoding=self.file_encoding)
+            except LookupError:
+                raise e
+        lines = contents.splitlines()
+
+        for line in lines:                          # example: 'n : (\\x.(<word> x), (v-or))'
+                                                    #     lambdacalc -^  linear logic -^
+            line = line.strip()                     # remove trailing newline
+            if not len(line): continue              # skip empty lines
+            if line[0] == '#': continue             # skip commented out lines
+
+            parts = line.split(' : ', 2)            # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']
+
+            glue_formulas = []
+            paren_count = 0
+            tuple_start = 0
+            tuple_comma = 0
+
+            relationships = None
+
+            if len(parts) > 1:
+                for (i, c) in enumerate(parts[1]):
+                    if c == '(':
+                        if paren_count == 0:             # if it's the first '(' of a tuple
+                            tuple_start = i+1           # then save the index
+                        paren_count += 1
+                    elif c == ')':
+                        paren_count -= 1
+                        if paren_count == 0:             # if it's the last ')' of a tuple
+                            meaning_term =  parts[1][tuple_start:tuple_comma]   # '\\x.(<word> x)'
+                            glue_term =     parts[1][tuple_comma+1:i]           # '(v-r)'
+                            glue_formulas.append([meaning_term, glue_term])     # add the GlueFormula to the list
+                    elif c == ',':
+                        if paren_count == 1:             # if it's a comma separating the parts of the tuple
+                            tuple_comma = i             # then save the index
+                    elif c == '#':                      # skip comments at the ends of lines
+                        if paren_count != 0:             # if the line hasn't parsed correctly so far
+                            raise RuntimeError('Formula syntax is incorrect for entry ' + line)
+                        break                           # break to the next line
+
+            if len(parts) > 2:                      #if there is a relationship entry at the end
+                rel_start = parts[2].index('[')+1
+                rel_end   = parts[2].index(']')
+                if rel_start == rel_end:
+                    relationships = frozenset()
+                else:
+                    relationships = frozenset(r.strip() for r in parts[2][rel_start:rel_end].split(','))
+
+            try:
+                start_inheritance = parts[0].index('(')
+                end_inheritance = parts[0].index(')')
+                sem = parts[0][:start_inheritance].strip()
+                supertype = parts[0][start_inheritance+1:end_inheritance]
+            except ValueError:
+                sem = parts[0].strip()
+                supertype = None
+
+            if sem not in self:
+                self[sem] = {}
+
+            if relationships is None: #if not specified for a specific relationship set
+                #add all relationship entries for parents
+                if supertype:
+                    for rels in self[supertype]:
+                        if rels not in self[sem]:
+                            self[sem][rels] = []
+                        glue = self[supertype][rels]
+                        self[sem][rels].extend(glue)
+                        self[sem][rels].extend(glue_formulas) # add the glue formulas to every rel entry
+                else:
+                    if None not in self[sem]:
+                        self[sem][None] = []
+                    self[sem][None].extend(glue_formulas) # add the glue formulas to every rel entry
+            else:
+                if relationships not in self[sem]:
+                    self[sem][relationships] = []
+                if supertype:
+                    self[sem][relationships].extend(self[supertype][relationships])
+                self[sem][relationships].extend(glue_formulas) # add the glue entry to the dictionary
+
+    def __str__(self):
+        accum = ''
+        for pos in self:
+            str_pos = "%s" % pos
+            for relset in self[pos]:
+                i = 1
+                for gf in self[pos][relset]:
+                    if i == 1:
+                        accum += str_pos + ': '
+                    else:
+                        accum += ' '*(len(str_pos)+2)
+                    accum += "%s" % gf
+                    if relset and i == len(self[pos][relset]):
+                        accum += ' : %s' % relset
+                    accum += '\n'
+                    i += 1
+        return accum
+
+    def to_glueformula_list(self, depgraph, node=None, counter=None, verbose=False):
+        if node is None:
+            # TODO: should it be depgraph.root? Is this code tested?
+            top = depgraph.nodes[0]
+            depList = list(chain(*top['deps'].values()))
+            root = depgraph.nodes[depList[0]]
+
+            return self.to_glueformula_list(depgraph, root, Counter(), verbose)
+
+        glueformulas = self.lookup(node, depgraph, counter)
+        for dep_idx in chain(*node['deps'].values()):
+            dep = depgraph.nodes[dep_idx]
+            glueformulas.extend(self.to_glueformula_list(depgraph, dep, counter, verbose))
+        return glueformulas
+
+    def lookup(self, node, depgraph, counter):
+        semtype_names = self.get_semtypes(node)
+
+        semtype = None
+        for name in semtype_names:
+            if name in self:
+                semtype = self[name]
+                break
+        if semtype is None:
+            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
+            return []
+
+        self.add_missing_dependencies(node, depgraph)
+
+        lookup = self._lookup_semtype_option(semtype, node, depgraph)
+
+        if not len(lookup):
+            raise KeyError(
+                "There is no GlueDict entry for sem type of '%s' "
+                "with tag '%s', and rel '%s'" %
+                (node['word'], node['tag'], node['rel'])
+                )
+
+        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
+
+    def add_missing_dependencies(self, node, depgraph):
+        rel = node['rel'].lower()
+
+        if rel == 'main':
+            headnode = depgraph.nodes[node['head']]
+            subj = self.lookup_unique('subj', headnode, depgraph)
+            relation = subj['rel']
+            node['deps'].setdefault(relation,[])
+            node['deps'][relation].append(subj['address'])
+            #node['deps'].append(subj['address'])
+
+    def _lookup_semtype_option(self, semtype, node, depgraph):
+        relationships = frozenset(
+            depgraph.nodes[dep]['rel'].lower()
+            for dep in chain(*node['deps'].values())
+            if depgraph.nodes[dep]['rel'].lower() not in OPTIONAL_RELATIONSHIPS
+        )
+
+        try:
+            lookup = semtype[relationships]
+        except KeyError:
+            # An exact match is not found, so find the best match where
+            # 'best' is defined as the glue entry whose relationship set has the
+            # most relations of any possible relationship set that is a subset
+            # of the actual depgraph
+            best_match = frozenset()
+            for relset_option in set(semtype)-set([None]):
+                if len(relset_option) > len(best_match) and \
+                   relset_option < relationships:
+                    best_match = relset_option
+            if not best_match:
+                if None in semtype:
+                    best_match = None
+                else:
+                    return None
+            lookup = semtype[best_match]
+
+        return lookup
+
+    def get_semtypes(self, node):
+        """
+        Based on the node, return a list of plausible semtypes in order of
+        plausibility.
+        """
+        rel = node['rel'].lower()
+        word = node['word'].lower()
+
+        if rel == 'spec':
+            if word in SPEC_SEMTYPES:
+                return [SPEC_SEMTYPES[word]]
+            else:
+                return [SPEC_SEMTYPES['default']]
+        elif rel in ['nmod', 'vmod']:
+            return [node['tag'], rel]
+        else:
+            return [node['tag']]
+
+    def get_glueformulas_from_semtype_entry(self, lookup, word, node, depgraph, counter):
+        glueformulas = []
+
+        glueFormulaFactory = self.get_GlueFormula_factory()
+        for meaning, glue in lookup:
+            gf = glueFormulaFactory(self.get_meaning_formula(meaning, word), glue)
+            if not len(glueformulas):
+                gf.word = word
+            else:
+                gf.word = '%s%s' % (word, len(glueformulas)+1)
+
+            gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get())
+
+            glueformulas.append(gf)
+        return glueformulas
+
+    def get_meaning_formula(self, generic, word):
+        """
+        :param generic: A meaning formula string containing the
+        parameter "<word>"
+        :param word: The actual word that will replace "<word>"
+        """
+        word = word.replace('.', '')
+        return generic.replace('<word>', word)
+
+    def initialize_labels(self, expr, node, depgraph, unique_index):
+        if isinstance(expr, linearlogic.AtomicExpression):
+            name = self.find_label_name(expr.name.lower(), node, depgraph, unique_index)
+            if name[0].isupper():
+                return linearlogic.VariableExpression(name)
+            else:
+                return linearlogic.ConstantExpression(name)
+        else:
+            return linearlogic.ImpExpression(
+                self.initialize_labels(expr.antecedent, node, depgraph, unique_index),
+                self.initialize_labels(expr.consequent, node, depgraph, unique_index)
+            )
+
+    def find_label_name(self, name, node, depgraph, unique_index):
+        try:
+            dot = name.index('.')
+
+            before_dot = name[:dot]
+            after_dot = name[dot+1:]
+            if before_dot == 'super':
+                return self.find_label_name(after_dot, depgraph.nodes[node['head']], depgraph, unique_index)
+            else:
+                return self.find_label_name(after_dot, self.lookup_unique(before_dot, node, depgraph), depgraph, unique_index)
+        except ValueError:
+            lbl = self.get_label(node)
+            if name == 'f':
+                return lbl
+            elif name == 'v':
+                return '%sv' % lbl
+            elif name == 'r':
+                return '%sr' % lbl
+            elif name == 'super':
+                return self.get_label(depgraph.nodes[node['head']])
+            elif name == 'var':
+                return '%s%s' % (lbl.upper(), unique_index)
+            elif name == 'a':
+                return self.get_label(self.lookup_unique('conja', node, depgraph))
+            elif name == 'b':
+                return self.get_label(self.lookup_unique('conjb', node, depgraph))
+            else:
+                return self.get_label(self.lookup_unique(name, node, depgraph))
+
+    def get_label(self, node):
+        """
+        Pick an alphabetic character as identifier for an entity in the model.
+
+        :param node: the dependency graph node whose address is used to
+            index into the list of characters
+        :type node: dict
+        """
+        value = node['address']
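+        # Editor's note: addresses start at 1, so address 1 maps to 'f',
+        # address 2 to 'g', and so on through the reordered alphabet below;
+        # addresses above 26 fall outside this lookup table.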
+
+        letter = ['f','g','h','i','j','k','l','m','n','o','p','q','r','s',
+                  't','u','v','w','x','y','z','a','b','c','d','e'][value-1]
+        num = int(value) // 26
+        if num > 0:
+            return letter + str(num)
+        else:
+            return letter
+
+    def lookup_unique(self, rel, node, depgraph):
+        """
+        Look up the dependency of 'node' whose relation is 'rel'.  There
+        should be exactly one such dependency.
+        """
+        deps = [
+            depgraph.nodes[dep]
+            for dep in chain(*node['deps'].values())
+            if depgraph.nodes[dep]['rel'].lower() == rel.lower()
+        ]
+
+        if len(deps) == 0:
+            raise KeyError("'%s' doesn't contain a feature '%s'" % (node['word'], rel))
+        elif len(deps) > 1:
+            raise KeyError("'%s' should only have one feature '%s'" % (node['word'], rel))
+        else:
+            return deps[0]
+
+    def get_GlueFormula_factory(self):
+        return GlueFormula
+
+
+class Glue(object):
+    def __init__(self, semtype_file=None, remove_duplicates=False,
+                 depparser=None, verbose=False):
+        self.verbose = verbose
+        self.remove_duplicates = remove_duplicates
+        self.depparser = depparser
+
+        from nltk import Prover9
+        self.prover = Prover9()
+
+        if semtype_file:
+            self.semtype_file = semtype_file
+        else:
+            self.semtype_file = os.path.join('grammars', 'sample_grammars','glue.semtype')
+
+    def train_depparser(self, depgraphs=None):
+        if depgraphs:
+            self.depparser.train(depgraphs)
+        else:
+            self.depparser.train_from_file(nltk.data.find(
+                os.path.join('grammars', 'sample_grammars',
+                             'glue_train.conll')))
+
+    def parse_to_meaning(self, sentence):
+        readings = []
+        for agenda in self.parse_to_compiled(sentence):
+            readings.extend(self.get_readings(agenda))
+        return readings
+
+    def get_readings(self, agenda):
+        readings = []
+        agenda_length = len(agenda)
+        atomics = dict()
+        nonatomics = dict()
+        while agenda: # is not empty
+            cur = agenda.pop()
+            glue_simp = cur.glue.simplify()
+            if isinstance(glue_simp, linearlogic.ImpExpression): # if cur.glue is non-atomic
+                for key in atomics:
+                    try:
+                        if isinstance(cur.glue, linearlogic.ApplicationExpression):
+                            bindings = cur.glue.bindings
+                        else:
+                            bindings = linearlogic.BindingDict()
+                        glue_simp.antecedent.unify(key, bindings)
+                        for atomic in atomics[key]:
+                            if not (cur.indices & atomic.indices): # if the sets of indices are disjoint
+                                try:
+                                    agenda.append(cur.applyto(atomic))
+                                except linearlogic.LinearLogicApplicationException:
+                                    pass
+                    except linearlogic.UnificationException:
+                        pass
+                try:
+                    nonatomics[glue_simp.antecedent].append(cur)
+                except KeyError:
+                    nonatomics[glue_simp.antecedent] = [cur]
+
+            else: # else cur.glue is atomic
+                for key in nonatomics:
+                    for nonatomic in nonatomics[key]:
+                        try:
+                            if isinstance(nonatomic.glue, linearlogic.ApplicationExpression):
+                                bindings = nonatomic.glue.bindings
+                            else:
+                                bindings = linearlogic.BindingDict()
+                            glue_simp.unify(key, bindings)
+                            if not (cur.indices & nonatomic.indices): # if the sets of indices are disjoint
+                                try:
+                                    agenda.append(nonatomic.applyto(cur))
+                                except linearlogic.LinearLogicApplicationException:
+                                    pass
+                        except linearlogic.UnificationException:
+                            pass
+                try:
+                    atomics[glue_simp].append(cur)
+                except KeyError:
+                    atomics[glue_simp] = [cur]
+
+        for entry in atomics:
+            for gf in atomics[entry]:
+                if len(gf.indices) == agenda_length:
+                    self._add_to_reading_list(gf, readings)
+        for entry in nonatomics:
+            for gf in nonatomics[entry]:
+                if len(gf.indices) == agenda_length:
+                    self._add_to_reading_list(gf, readings)
+        return readings
+
+    def _add_to_reading_list(self, glueformula, reading_list):
+        add_reading = True
+        if self.remove_duplicates:
+            for reading in reading_list:
+                try:
+                    if reading.equiv(glueformula.meaning, self.prover):
+                        add_reading = False
+                        break
+                except Exception as e:
+                    #if there is an exception, the syntax of the formula
+                    #may not be understandable by the prover, so don't
+                    #throw out the reading.
+                    print('Error when checking logical equality of statements', e)
+                    pass
+        if add_reading:
+            reading_list.append(glueformula.meaning)
+
+    def parse_to_compiled(self, sentence):
+        gfls = [self.depgraph_to_glue(dg) for dg in self.dep_parse(sentence)]
+        return [self.gfl_to_compiled(gfl) for gfl in gfls]
+
+    def dep_parse(self, sentence):
+        """
+        Return a dependency graph for the sentence.
+
+        :param sentence: the sentence to be parsed
+        :type sentence: list(str)
+        :rtype: DependencyGraph
+        """
+
+        #Lazy-initialize the depparser
+        if self.depparser is None:
+            from nltk.parse import MaltParser
+            self.depparser = MaltParser(tagger=self.get_pos_tagger())
+        if not self.depparser._trained:
+            self.train_depparser()
+        return self.depparser.parse(sentence, verbose=self.verbose)
+
+    def depgraph_to_glue(self, depgraph):
+        return self.get_glue_dict().to_glueformula_list(depgraph)
+
+    def get_glue_dict(self):
+        return GlueDict(self.semtype_file)
+
+    def gfl_to_compiled(self, gfl):
+        index_counter = Counter()
+        return_list = []
+        for gf in gfl:
+            return_list.extend(gf.compile(index_counter))
+
+        if self.verbose:
+            print('Compiled Glue Premises:')
+            for cgf in return_list:
+                print(cgf)
+
+        return return_list
+
+    def get_pos_tagger(self):
+        from nltk.corpus import brown
+        regexp_tagger = RegexpTagger(
+            [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
+             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
+             (r'.*able$', 'JJ'),                # adjectives
+             (r'.*ness$', 'NN'),                # nouns formed from adjectives
+             (r'.*ly$', 'RB'),                  # adverbs
+             (r'.*s$', 'NNS'),                  # plural nouns
+             (r'.*ing$', 'VBG'),                # gerunds
+             (r'.*ed$', 'VBD'),                 # past tense verbs
+             (r'.*', 'NN')                      # nouns (default)
+        ])
+        brown_train = brown.tagged_sents(categories='news')
+        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
+        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
+        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)
+
+        #Override particular words
+        main_tagger = RegexpTagger(
+            [(r'(A|a|An|an)$', 'ex_quant'),
+             (r'(Every|every|All|all)$', 'univ_quant')
+        ], backoff=trigram_tagger)
+
+        return main_tagger
+
+
+class DrtGlueFormula(GlueFormula):
+    def __init__(self, meaning, glue, indices=None):
+        if not indices:
+            indices = set()
+
+        if isinstance(meaning, string_types):
+            self.meaning = drt.DrtExpression.fromstring(meaning)
+        elif isinstance(meaning, drt.DrtExpression):
+            self.meaning = meaning
+        else:
+            raise RuntimeError('Meaning term neither a string nor an expression: %s, %s' % (meaning, meaning.__class__))
+
+        if isinstance(glue, string_types):
+            self.glue = linearlogic.LinearLogicParser().parse(glue)
+        elif isinstance(glue, linearlogic.Expression):
+            self.glue = glue
+        else:
+            raise RuntimeError('Glue term neither a string nor an expression: %s, %s' % (glue, glue.__class__))
+
+        self.indices = indices
+
+    def make_VariableExpression(self, name):
+        return drt.DrtVariableExpression(name)
+
+    def make_LambdaExpression(self, variable, term):
+        return drt.DrtLambdaExpression(variable, term)
+
+class DrtGlueDict(GlueDict):
+    def get_GlueFormula_factory(self):
+        return DrtGlueFormula
+
+class DrtGlue(Glue):
+    def __init__(self, semtype_file=None, remove_duplicates=False,
+                 depparser=None, verbose=False):
+        if not semtype_file:
+            semtype_file = os.path.join('grammars', 'sample_grammars','drt_glue.semtype')
+        Glue.__init__(self, semtype_file, remove_duplicates, depparser, verbose)
+
+    def get_glue_dict(self):
+        return DrtGlueDict(self.semtype_file)
+
+
+def demo(show_example=-1):
+    from nltk.parse import MaltParser
+    examples = ['David sees Mary',
+                'David eats a sandwich',
+                'every man chases a dog',
+                'every man believes a dog sleeps',
+                'John gives David a sandwich',
+                'John chases himself']
+#                'John persuades David to order a pizza',
+#                'John tries to go',
+#                'John tries to find a unicorn',
+#                'John seems to vanish',
+#                'a unicorn seems to approach',
+#                'every big cat leaves',
+#                'every gray cat leaves',
+#                'every big gray cat leaves',
+#                'a former senator leaves',
+
+    print('============== DEMO ==============')
+
+    tagger = RegexpTagger(
+        [('^(David|Mary|John)$', 'NNP'),
+         ('^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$', 'VB'),
+         ('^(go|order|vanish|find|approach)$', 'VB'),
+         ('^(a)$', 'ex_quant'),
+         ('^(every)$', 'univ_quant'),
+         ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'),
+         ('^(big|gray|former)$', 'JJ'),
+         ('^(him|himself)$', 'PRP')
+    ])
+
+    depparser = MaltParser(tagger=tagger)
+    glue = Glue(depparser=depparser, verbose=False)
+
+    for (i, sentence) in enumerate(examples):
+        if i==show_example or show_example==-1:
+            print('[[[Example %s]]]  %s' % (i, sentence))
+            for reading in glue.parse_to_meaning(sentence.split()):
+                print(reading.simplify())
+            print('')
+
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/sem/glue.pyc b/nlp_resource_data/nltk/sem/glue.pyc
new file mode 100755 (executable)
index 0000000..0206c35
Binary files /dev/null and b/nlp_resource_data/nltk/sem/glue.pyc differ
diff --git a/nlp_resource_data/nltk/sem/hole.py b/nlp_resource_data/nltk/sem/hole.py
new file mode 100755 (executable)
index 0000000..fe39369
--- /dev/null
@@ -0,0 +1,386 @@
+# Natural Language Toolkit: Logic
+#
+# Author:     Peter Wang
+# Updated by: Dan Garrette <dhgarrette@gmail.com>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org>
+# For license information, see LICENSE.TXT
+
+"""
+An implementation of the Hole Semantics model, following Blackburn and Bos,
+Representation and Inference for Natural Language (CSLI, 2005).
+
+The semantic representations are built by the grammar hole.fcfg.
+This module contains driver code to read in sentences and parse them
+according to a hole semantics grammar.
+
+After parsing, the semantic representation is in the form of an underspecified
+representation that is not easy to read.  We use a "plugging" algorithm to
+convert that representation into first-order logic formulas.
+"""
+from __future__ import print_function, unicode_literals
+
+from functools import reduce
+
+from six import itervalues
+
+from nltk import compat
+from nltk.parse import load_parser
+
+from nltk.sem.skolemize import skolemize
+from nltk.sem.logic import (AllExpression, AndExpression, ApplicationExpression,
+                            ExistsExpression, IffExpression, ImpExpression,
+                            LambdaExpression, NegatedExpression, OrExpression)
+
+
+# Note that in this code there may be multiple types of trees being referred to:
+#
+# 1. parse trees
+# 2. the underspecified representation
+# 3. first-order logic formula trees
+# 4. the search space when plugging (search tree)
+#
+
+class Constants(object):
+    ALL = 'ALL'
+    EXISTS = 'EXISTS'
+    NOT = 'NOT'
+    AND = 'AND'
+    OR = 'OR'
+    IMP = 'IMP'
+    IFF = 'IFF'
+    PRED = 'PRED'
+    LEQ = 'LEQ'
+    HOLE = 'HOLE'
+    LABEL = 'LABEL'
+
+    MAP = {ALL: lambda v, e: AllExpression(v.variable, e),
+           EXISTS: lambda v, e: ExistsExpression(v.variable, e),
+           NOT: NegatedExpression,
+           AND: AndExpression,
+           OR: OrExpression,
+           IMP: ImpExpression,
+           IFF: IffExpression,
+           PRED: ApplicationExpression}
+
+
+class HoleSemantics(object):
+    """
+    This class holds the broken-down components of a hole semantics, i.e. it
+    extracts the holes, labels, logic formula fragments and constraints out of
+    a big conjunction of such as produced by the hole semantics grammar.  It
+    then provides some operations on the semantics dealing with holes, labels
+    and finding legal ways to plug holes with labels.
+    """
+    def __init__(self, usr):
+        """
+        Constructor.  `usr' is a ``sem.Expression`` representing an
+        Underspecified Representation Structure (USR).  A USR has the following
+        special predicates:
+        ALL(l,v,n),
+        EXISTS(l,v,n),
+        AND(l,n,n),
+        OR(l,n,n),
+        IMP(l,n,n),
+        IFF(l,n,n),
+        PRED(l,v,n,v[,v]*) where the brackets and star indicate zero or more repetitions,
+        LEQ(n,n),
+        HOLE(n),
+        LABEL(n)
+        where l is the label of the node described by the predicate, n is either
+        a label or a hole, and v is a variable.
+        """
+        self.holes = set()
+        self.labels = set()
+        self.fragments = {}  # mapping of label -> formula fragment
+        self.constraints = set()  # set of Constraints
+        self._break_down(usr)
+        self.top_most_labels = self._find_top_most_labels()
+        self.top_hole = self._find_top_hole()
+
+    def is_node(self, x):
+        """
+        Return true if x is a node (label or hole) in this semantic
+        representation.
+        """
+        return x in (self.labels | self.holes)
+
+    def _break_down(self, usr):
+        """
+        Extract holes, labels, formula fragments and constraints from the hole
+        semantics underspecified representation (USR).
+        """
+        if isinstance(usr, AndExpression):
+            self._break_down(usr.first)
+            self._break_down(usr.second)
+        elif isinstance(usr, ApplicationExpression):
+            func, args = usr.uncurry()
+            if func.variable.name == Constants.LEQ:
+                self.constraints.add(Constraint(args[0], args[1]))
+            elif func.variable.name == Constants.HOLE:
+                self.holes.add(args[0])
+            elif func.variable.name == Constants.LABEL:
+                self.labels.add(args[0])
+            else:
+                label = args[0]
+                assert label not in self.fragments
+                self.fragments[label] = (func, args[1:])
+        else:
+            raise ValueError(usr.label())
+
+    def _find_top_nodes(self, node_list):
+        top_nodes = node_list.copy()
+        for f in itervalues(self.fragments):
+            # the label is the first argument of the predicate
+            args = f[1]
+            for arg in args:
+                if arg in node_list:
+                    top_nodes.discard(arg)
+        return top_nodes
+
+    def _find_top_most_labels(self):
+        """
+        Return the set of labels which are not referenced directly as part of
+        another formula fragment.  These will be the top-most labels for the
+        subtree that they are part of.
+        """
+        return self._find_top_nodes(self.labels)
+
+    def _find_top_hole(self):
+        """
+        Return the hole that will be the top of the formula tree.
+        """
+        top_holes = self._find_top_nodes(self.holes)
+        assert len(top_holes) == 1  # it must be unique
+        return top_holes.pop()
+
+    def pluggings(self):
+        """
+        Calculate and return all the legal pluggings (mappings of labels to
+        holes) of this semantics given the constraints.
+        """
+        record = []
+        self._plug_nodes([(self.top_hole, [])], self.top_most_labels, {}, record)
+        return record
+
+    def _plug_nodes(self, queue, potential_labels, plug_acc, record):
+        """
+        Plug the nodes in `queue' with the labels in `potential_labels'.
+
+        Each element of `queue' is a tuple of the node to plug and the list of
+        ancestor holes from the root of the graph to that node.
+
+        `potential_labels' is a set of the labels which are still available for
+        plugging.
+
+        `plug_acc' is the incomplete mapping of holes to labels made on the
+        current branch of the search tree so far.
+
+        `record' is a list of all the complete pluggings that we have found in
+        total so far.  It is the only parameter that is destructively updated.
+        """
+        if queue != []:
+            (node, ancestors) = queue[0]
+            if node in self.holes:
+                # The node is a hole, try to plug it.
+                self._plug_hole(node, ancestors, queue[1:], potential_labels, plug_acc, record)
+            else:
+                assert node in self.labels
+                # The node is a label.  Replace it in the queue by the holes and
+                # labels in the formula fragment named by that label.
+                args = self.fragments[node][1]
+                head = [(a, ancestors) for a in args if self.is_node(a)]
+                self._plug_nodes(head + queue[1:], potential_labels, plug_acc, record)
+        else:
+            raise Exception('queue empty')
+
+    def _plug_hole(self, hole, ancestors0, queue, potential_labels0,
+                   plug_acc0, record):
+        """
+        Try all possible ways of plugging a single hole.
+        See _plug_nodes for the meanings of the parameters.
+        """
+        # Add the current hole we're trying to plug into the list of ancestors.
+        assert hole not in ancestors0
+        ancestors = [hole] + ancestors0
+
+        # Try each potential label in this hole in turn.
+        for l in potential_labels0:
+            # Is the label valid in this hole?
+            if self._violates_constraints(l, ancestors):
+                continue
+
+            plug_acc = plug_acc0.copy()
+            plug_acc[hole] = l
+            potential_labels = potential_labels0.copy()
+            potential_labels.remove(l)
+
+            if len(potential_labels) == 0:
+                # No more potential labels.  That must mean all the holes have
+                # been filled, so we have found a legal plugging; remember it.
+                #
+                # Note that the queue might not be empty because there might
+                # be labels on there that point to formula fragments with
+                # no holes in them.  _sanity_check_plugging will make sure
+                # all holes are filled.
+                self._sanity_check_plugging(plug_acc, self.top_hole, [])
+                record.append(plug_acc)
+            else:
+                # Recursively try to fill in the rest of the holes in the
+                # queue.  The label we just plugged into the hole could have
+                # holes of its own, so add it to the end of the queue.  Putting it on
+                # the end of the queue gives us a breadth-first search, so that
+                # all the holes at level i of the formula tree are filled
+                # before filling level i+1.
+                # A depth-first search would work as well since the trees must
+                # be finite but the bookkeeping would be harder.
+                self._plug_nodes(queue + [(l, ancestors)], potential_labels, plug_acc, record)
+
+    def _violates_constraints(self, label, ancestors):
+        """
+        Return True if the `label' cannot be placed underneath the holes given
+        by the set `ancestors' because it would violate the constraints imposed
+        on it.
+        """
+        for c in self.constraints:
+            if c.lhs == label:
+                if c.rhs not in ancestors:
+                    return True
+        return False
+
+    def _sanity_check_plugging(self, plugging, node, ancestors):
+        """
+        Make sure that a given plugging is legal.  We recursively go through
+        each node and make sure that no constraints are violated.
+        We also check that all holes have been filled.
+        """
+        if node in self.holes:
+            ancestors = [node] + ancestors
+            label = plugging[node]
+        else:
+            label = node
+        assert label in self.labels
+        for c in self.constraints:
+            if c.lhs == label:
+                assert c.rhs in ancestors
+        args = self.fragments[label][1]
+        for arg in args:
+            if self.is_node(arg):
+                self._sanity_check_plugging(plugging, arg, [label] + ancestors)
+
+    def formula_tree(self, plugging):
+        """
+        Return the first-order logic formula tree for this underspecified
+        representation using the plugging given.
+        """
+        return self._formula_tree(plugging, self.top_hole)
+
+    def _formula_tree(self, plugging, node):
+        if node in plugging:
+            return self._formula_tree(plugging, plugging[node])
+        elif node in self.fragments:
+            pred, args = self.fragments[node]
+            children = [self._formula_tree(plugging, arg) for arg in args]
+            return reduce(Constants.MAP[pred.variable.name], children)
+        else:
+            return node
+
+
+@compat.python_2_unicode_compatible
+class Constraint(object):
+    """
+    This class represents a constraint of the form (L =< N),
+    where L is a label and N is a node (a label or a hole).
+    """
+    def __init__(self, lhs, rhs):
+        self.lhs = lhs
+        self.rhs = rhs
+
+    def __eq__(self, other):
+        if self.__class__ == other.__class__:
+            return self.lhs == other.lhs and self.rhs == other.rhs
+        else:
+            return False
+
+    def __ne__(self, other):
+        return not (self == other)
+
+    def __hash__(self):
+        return hash(repr(self))
+
+    def __repr__(self):
+        return '(%s < %s)' % (self.lhs, self.rhs)
+
+
+def hole_readings(sentence, grammar_filename=None, verbose=False):
+    if not grammar_filename:
+        grammar_filename = 'grammars/sample_grammars/hole.fcfg'
+
+    if verbose:
+        print('Reading grammar file', grammar_filename)
+
+    parser = load_parser(grammar_filename)
+
+    # Parse the sentence.
+    tokens = sentence.split()
+    trees = list(parser.parse(tokens))
+    if verbose:
+        print('Got %d different parses' % len(trees))
+
+    all_readings = []
+    for tree in trees:
+        # Get the semantic feature from the top of the parse tree.
+        sem = tree.label()['SEM'].simplify()
+
+        # Print the raw semantic representation.
+        if verbose:
+            print('Raw:       ', sem)
+
+        # Skolemize away all quantifiers.  All variables become unique.
+        while isinstance(sem, LambdaExpression):
+            sem = sem.term
+        skolemized = skolemize(sem)
+
+        if verbose:
+            print('Skolemized:', skolemized)
+
+        # Break the hole semantics representation down into its components
+        # i.e. holes, labels, formula fragments and constraints.
+        hole_sem = HoleSemantics(skolemized)
+
+        # Maybe show the details of the semantic representation.
+        if verbose:
+            print('Holes:       ', hole_sem.holes)
+            print('Labels:      ', hole_sem.labels)
+            print('Constraints: ', hole_sem.constraints)
+            print('Top hole:    ', hole_sem.top_hole)
+            print('Top labels:  ', hole_sem.top_most_labels)
+            print('Fragments:')
+            for l, f in hole_sem.fragments.items():
+                print('\t%s: %s' % (l, f))
+
+        # Find all the possible ways to plug the formulas together.
+        pluggings = hole_sem.pluggings()
+
+        # Build FOL formula trees using the pluggings.
+        readings = list(map(hole_sem.formula_tree, pluggings))
+
+        # Print out the formulas in a textual format.
+        if verbose:
+            for i, r in enumerate(readings):
+                print()
+                print('%d. %s' % (i, r))
+            print()
+
+        all_readings.extend(readings)
+
+    return all_readings
+
+
+if __name__ == '__main__':
+    for r in hole_readings('a dog barks'):
+        print(r)
+    print()
+    for r in hole_readings('every girl chases a dog'):
+        print(r)
diff --git a/nlp_resource_data/nltk/sem/hole.pyc b/nlp_resource_data/nltk/sem/hole.pyc
new file mode 100755 (executable)
index 0000000..6de7c4e
Binary files /dev/null and b/nlp_resource_data/nltk/sem/hole.pyc differ
diff --git a/nlp_resource_data/nltk/sem/lfg.py b/nlp_resource_data/nltk/sem/lfg.py
new file mode 100755 (executable)
index 0000000..85b3353
--- /dev/null
@@ -0,0 +1,210 @@
+# Natural Language Toolkit: Lexical Functional Grammar
+#
+# Author: Dan Garrette <dhgarrette@gmail.com>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, division, unicode_literals
+
+from itertools import chain
+
+from nltk.internals import Counter
+from nltk.compat import python_2_unicode_compatible
+
+
+@python_2_unicode_compatible
+class FStructure(dict):
+    def safeappend(self, key, item):
+        """
+        Append 'item' to the list at 'key'.  If no list exists for 'key', then
+        construct one.
+        """
+        if key not in self:
+            self[key] = []
+        self[key].append(item)
+
+    def __setitem__(self, key, value):
+        dict.__setitem__(self, key.lower(), value)
+
+    def __getitem__(self, key):
+        return dict.__getitem__(self, key.lower())
+
+    def __contains__(self, key):
+        return dict.__contains__(self, key.lower())
+
+    def to_glueformula_list(self, glue_dict):
+        depgraph = self.to_depgraph()
+        return glue_dict.to_glueformula_list(depgraph)
+
+    def to_depgraph(self, rel=None):
+        from nltk.parse.dependencygraph import DependencyGraph
+        depgraph = DependencyGraph()
+        nodes = depgraph.nodes
+
+        self._to_depgraph(nodes, 0, 'ROOT')
+
+        # Add all the dependencies for all the nodes
+        for address, node in nodes.items():
+            for n2 in (n for n in nodes.values() if n['rel'] != 'TOP'):
+                if n2['head'] == address:
+                    relation = n2['rel']
+                    node['deps'].setdefault(relation,[])
+                    node['deps'][relation].append(n2['address'])
+
+        depgraph.root = nodes[1]
+
+        return depgraph
+
+    def _to_depgraph(self, nodes, head, rel):
+        index = len(nodes)
+
+        nodes[index].update(
+            {
+                'address': index,
+                'word': self.pred[0],
+                'tag': self.pred[1],
+                'head': head,
+                'rel': rel,
+            }
+        )
+
+        for feature in sorted(self):
+            for item in sorted(self[feature]):
+                if isinstance(item, FStructure):
+                    item._to_depgraph(nodes, index, feature)
+                elif isinstance(item, tuple):
+                    new_index = len(nodes)
+                    nodes[new_index].update(
+                        {
+                            'address': new_index,
+                            'word': item[0],
+                            'tag': item[1],
+                            'head': index,
+                            'rel': feature,
+                        }
+                    )
+                elif isinstance(item, list):
+                    for n in item:
+                        n._to_depgraph(nodes, index, feature)
+                else:
+                    raise Exception('feature %s is not an FStruct, a list, or a tuple' % feature)
+
+    @staticmethod
+    def read_depgraph(depgraph):
+        return FStructure._read_depgraph(depgraph.root, depgraph)
+
+    @staticmethod
+    def _read_depgraph(node, depgraph, label_counter=None, parent=None):
+        if not label_counter:
+            label_counter = Counter()
+
+        if node['rel'].lower() in ['spec', 'punct']:
+            # the value of a 'spec' entry is a word, not an FStructure
+            return (node['word'], node['tag'])
+
+        else:
+            fstruct = FStructure()
+            fstruct.pred = None
+            fstruct.label = FStructure._make_label(label_counter.get())
+
+            fstruct.parent = parent
+
+            word, tag = node['word'], node['tag']
+            if tag[:2] == 'VB':
+                if tag[2:3] == 'D':
+                    fstruct.safeappend('tense', ('PAST', 'tense'))
+                fstruct.pred = (word, tag[:2])
+
+            if not fstruct.pred:
+                fstruct.pred = (word, tag)
+
+            children = [depgraph.nodes[idx] for idx in chain(*node['deps'].values())]
+            for child in children:
+                fstruct.safeappend(child['rel'], FStructure._read_depgraph(child, depgraph, label_counter, fstruct))
+
+            return fstruct
+
+    @staticmethod
+    def _make_label(value):
+        """
+        Pick an alphabetic character as identifier for an entity in the model.
+
+        :param value: where to index into the list of characters
+        :type value: int
+        """
+        letter = ['f','g','h','i','j','k','l','m','n','o','p','q','r','s',
+                  't','u','v','w','x','y','z','a','b','c','d','e'][value-1]
+        num = int(value) // 26
+        if num > 0:
+            return letter + str(num)
+        else:
+            return letter
+
+    def __repr__(self):
+        return self.__unicode__().replace('\n', '')
+
+    def __str__(self):
+        return self.pretty_format()
+
+    def pretty_format(self, indent=3):
+        try:
+            accum = '%s:[' % self.label
+        except AttributeError:
+            accum = '['
+        try:
+            accum += 'pred \'%s\'' % (self.pred[0])
+        except AttributeError:
+            pass
+
+        for feature in sorted(self):
+            for item in self[feature]:
+                if isinstance(item, FStructure):
+                    next_indent = indent+len(feature)+3+len(self.label)
+                    accum += '\n%s%s %s' % (' '*(indent), feature, item.pretty_format(next_indent))
+                elif isinstance(item, tuple):
+                    accum += '\n%s%s \'%s\'' % (' '*(indent), feature, item[0])
+                elif isinstance(item, list):
+                    accum += '\n%s%s {%s}' % (' '*(indent), feature, ('\n%s' % (' '*(indent+len(feature)+2))).join(item))
+                else: # ERROR
+                    raise Exception('feature %s is not an FStruct, a list, or a tuple' % feature)
+        return accum+']'
+
+
+
+def demo_read_depgraph():
+    from nltk.parse.dependencygraph import DependencyGraph
+    dg1 = DependencyGraph("""\
+Esso       NNP     2       SUB
+said       VBD     0       ROOT
+the        DT      5       NMOD
+Whiting    NNP     5       NMOD
+field      NN      6       SUB
+started    VBD     2       VMOD
+production NN      6       OBJ
+Tuesday    NNP     6       VMOD
+""")
+    dg2 = DependencyGraph("""\
+John    NNP     2       SUB
+sees    VBP     0       ROOT
+Mary    NNP     2       OBJ
+""")
+    dg3 = DependencyGraph("""\
+a       DT      2       SPEC
+man     NN      3       SUBJ
+walks   VB      0       ROOT
+""")
+    dg4 = DependencyGraph("""\
+every   DT      2       SPEC
+girl    NN      3       SUBJ
+chases  VB      0       ROOT
+a       DT      5       SPEC
+dog     NN      3       OBJ
+""")
+
+    depgraphs = [dg1,dg2,dg3,dg4]
+    for dg in depgraphs:
+        print(FStructure.read_depgraph(dg))
+
+if __name__ == '__main__':
+    demo_read_depgraph()
diff --git a/nlp_resource_data/nltk/sem/lfg.pyc b/nlp_resource_data/nltk/sem/lfg.pyc
new file mode 100755 (executable)
index 0000000..111b96c
Binary files /dev/null and b/nlp_resource_data/nltk/sem/lfg.pyc differ
diff --git a/nlp_resource_data/nltk/sem/linearlogic.py b/nlp_resource_data/nltk/sem/linearlogic.py
new file mode 100755 (executable)
index 0000000..38457a7
--- /dev/null
@@ -0,0 +1,451 @@
+# Natural Language Toolkit: Linear Logic
+#
+# Author: Dan Garrette <dhgarrette@gmail.com>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+
+from six import string_types
+
+from nltk.internals import Counter
+from nltk.compat import python_2_unicode_compatible
+from nltk.sem.logic import LogicParser, APP
+
+_counter = Counter()
+
+class Tokens(object):
+    #Punctuation
+    OPEN = '('
+    CLOSE = ')'
+
+    #Operations
+    IMP = '-o'
+
+    PUNCT = [OPEN, CLOSE]
+    TOKENS = PUNCT + [IMP]
+
+class LinearLogicParser(LogicParser):
+    """A linear logic expression parser."""
+    def __init__(self):
+        LogicParser.__init__(self)
+
+        self.operator_precedence = {APP: 1, Tokens.IMP: 2, None: 3}
+        self.right_associated_operations += [Tokens.IMP]
+
+    def get_all_symbols(self):
+        return Tokens.TOKENS
+
+    def handle(self, tok, context):
+        if tok not in Tokens.TOKENS:
+            return self.handle_variable(tok, context)
+        elif tok == Tokens.OPEN:
+            return self.handle_open(tok, context)
+
+    def get_BooleanExpression_factory(self, tok):
+        if tok == Tokens.IMP:
+            return ImpExpression
+        else:
+            return None
+
+    def make_BooleanExpression(self, factory, first, second):
+        return factory(first, second)
+
+    def attempt_ApplicationExpression(self, expression, context):
+        """Attempt to make an application expression.  If the next tokens
+        are an argument in parens, then the argument expression is a
+        function being applied to the arguments.  Otherwise, return the
+        argument expression."""
+        if self.has_priority(APP, context):
+            if self.inRange(0) and self.token(0) == Tokens.OPEN:
+                self.token() # swallow the open paren
+                argument = self.process_next_expression(APP)
+                self.assertNextToken(Tokens.CLOSE)
+                expression = ApplicationExpression(expression, argument, None)
+        return expression
+
+    def make_VariableExpression(self, name):
+        if name[0].isupper():
+            return VariableExpression(name)
+        else:
+            return ConstantExpression(name)
+
+
+@python_2_unicode_compatible
+class Expression(object):
+
+    _linear_logic_parser = LinearLogicParser()
+
+    @classmethod
+    def fromstring(cls, s):
+        return cls._linear_logic_parser.parse(s)
+
+    def applyto(self, other, other_indices=None):
+        return ApplicationExpression(self, other, other_indices)
+
+    def __call__(self, other):
+        return self.applyto(other)
+
+    def __repr__(self):
+        return '<%s %s>' % (self.__class__.__name__, self)
+
+
+@python_2_unicode_compatible
+class AtomicExpression(Expression):
+    def __init__(self, name, dependencies=None):
+        """
+        :param name: str for the constant name
+        :param dependencies: list of int for the indices on which this atom is dependent
+        """
+        assert isinstance(name, string_types)
+        self.name = name
+
+        if not dependencies:
+            dependencies = []
+        self.dependencies = dependencies
+
+    def simplify(self, bindings=None):
+        """
+        If 'self' is bound by 'bindings', return the atomic to which it is bound.
+        Otherwise, return self.
+
+        :param bindings: ``BindingDict`` A dictionary of bindings used to simplify
+        :return: ``AtomicExpression``
+        """
+        if bindings and self in bindings:
+            return bindings[self]
+        else:
+            return self
+
+    def compile_pos(self, index_counter, glueFormulaFactory):
+        """
+        From Iddo Lev's PhD Dissertation p108-109
+
+        :param index_counter: ``Counter`` for unique indices
+        :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas
+        :return: (``Expression``,set) for the compiled linear logic and any newly created glue formulas
+        """
+        self.dependencies = []
+        return (self, [])
+
+    def compile_neg(self, index_counter, glueFormulaFactory):
+        """
+        From Iddo Lev's PhD Dissertation p108-109
+
+        :param index_counter: ``Counter`` for unique indices
+        :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas
+        :return: (``Expression``,set) for the compiled linear logic and any newly created glue formulas
+        """
+        self.dependencies = []
+        return (self, [])
+
+    def initialize_labels(self, fstruct):
+        self.name = fstruct.initialize_label(self.name.lower())
+
+    def __eq__(self, other):
+        return self.__class__ == other.__class__ and self.name == other.name
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __str__(self):
+        accum = self.name
+        if self.dependencies:
+            accum += "%s" % self.dependencies
+        return accum
+
+    def __hash__(self):
+        return hash(self.name)
+
+class ConstantExpression(AtomicExpression):
+    def unify(self, other, bindings):
+        """
+        If 'other' is a constant, then it must be equal to 'self'.  If 'other' is a variable,
+        then it must not be bound to anything other than 'self'.
+
+        :param other: ``Expression``
+        :param bindings: ``BindingDict`` A dictionary of all current bindings
+        :return: ``BindingDict`` A new combined dictionary of 'bindings' and any new binding
+        :raise UnificationException: If 'self' and 'other' cannot be unified in the context of 'bindings'
+        """
+        assert isinstance(other, Expression)
+        if isinstance(other, VariableExpression):
+            try:
+                return bindings + BindingDict([(other, self)])
+            except VariableBindingException:
+                pass
+        elif self == other:
+            return bindings
+        raise UnificationException(self, other, bindings)
+
+class VariableExpression(AtomicExpression):
+    def unify(self, other, bindings):
+        """
+        'self' must not be bound to anything other than 'other'.
+
+        :param other: ``Expression``
+        :param bindings: ``BindingDict`` A dictionary of all current bindings
+        :return: ``BindingDict`` A new combined dictionary of 'bindings' and the new binding
+        :raise UnificationException: If 'self' and 'other' cannot be unified in the context of 'bindings'
+        """
+        assert isinstance(other, Expression)
+        try:
+            if self == other:
+                return bindings
+            else:
+                return bindings + BindingDict([(self, other)])
+        except VariableBindingException:
+            raise UnificationException(self, other, bindings)
+
+@python_2_unicode_compatible
+class ImpExpression(Expression):
+    def __init__(self, antecedent, consequent):
+        """
+        :param antecedent: ``Expression`` for the antecedent
+        :param consequent: ``Expression`` for the consequent
+        """
+        assert isinstance(antecedent, Expression)
+        assert isinstance(consequent, Expression)
+        self.antecedent = antecedent
+        self.consequent = consequent
+
+    def simplify(self, bindings=None):
+        return self.__class__(self.antecedent.simplify(bindings), self.consequent.simplify(bindings))
+
+    def unify(self, other, bindings):
+        """
+        Both the antecedent and consequent of 'self' and 'other' must unify.
+
+        :param other: ``ImpExpression``
+        :param bindings: ``BindingDict`` A dictionary of all current bindings
+        :return: ``BindingDict`` A new combined dictionary of 'bindings' and any new bindings
+        :raise UnificationException: If 'self' and 'other' cannot be unified in the context of 'bindings'
+        """
+        assert isinstance(other, ImpExpression)
+        try:
+            return bindings + self.antecedent.unify(other.antecedent, bindings) + self.consequent.unify(other.consequent, bindings)
+        except VariableBindingException:
+            raise UnificationException(self, other, bindings)
+
+    def compile_pos(self, index_counter, glueFormulaFactory):
+        """
+        From Iddo Lev's PhD Dissertation p108-109
+
+        :param index_counter: ``Counter`` for unique indices
+        :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas
+        :return: (``Expression``,set) for the compiled linear logic and any newly created glue formulas
+        """
+        (a, a_new) = self.antecedent.compile_neg(index_counter, glueFormulaFactory)
+        (c, c_new) = self.consequent.compile_pos(index_counter, glueFormulaFactory)
+        return (ImpExpression(a,c), a_new + c_new)
+
+    def compile_neg(self, index_counter, glueFormulaFactory):
+        """
+        From Iddo Lev's PhD Dissertation p108-109
+
+        :param index_counter: ``Counter`` for unique indices
+        :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas
+        :return: (``Expression``,list of ``GlueFormula``) for the compiled linear logic and any newly created glue formulas
+        """
+        (a, a_new) = self.antecedent.compile_pos(index_counter, glueFormulaFactory)
+        (c, c_new) = self.consequent.compile_neg(index_counter, glueFormulaFactory)
+        fresh_index = index_counter.get()
+        c.dependencies.append(fresh_index)
+        new_v = glueFormulaFactory('v%s' % fresh_index, a, set([fresh_index]))
+        return (c, a_new + c_new + [new_v])
+
+    def initialize_labels(self, fstruct):
+        self.antecedent.initialize_labels(fstruct)
+        self.consequent.initialize_labels(fstruct)
+
+    def __eq__(self, other):
+        return self.__class__ == other.__class__ and \
+                self.antecedent == other.antecedent and self.consequent == other.consequent
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __str__(self):
+        return "%s%s %s %s%s" % (
+            Tokens.OPEN, self.antecedent, Tokens.IMP, self.consequent, Tokens.CLOSE)
+
+    def __hash__(self):
+        return hash('%s%s%s' % (hash(self.antecedent), Tokens.IMP, hash(self.consequent)))
+
+@python_2_unicode_compatible
+class ApplicationExpression(Expression):
+    def __init__(self, function, argument, argument_indices=None):
+        """
+        :param function: ``Expression`` for the function
+        :param argument: ``Expression`` for the argument
+        :param argument_indices: set for the indices of the glue formula from which the argument came
+        :raise LinearLogicApplicationException: If 'function' cannot be applied to 'argument' given 'argument_indices'.
+        """
+        function_simp = function.simplify()
+        argument_simp = argument.simplify()
+
+        assert isinstance(function_simp, ImpExpression)
+        assert isinstance(argument_simp, Expression)
+
+        bindings = BindingDict()
+
+        try:
+            if isinstance(function, ApplicationExpression):
+                bindings += function.bindings
+            if isinstance(argument, ApplicationExpression):
+                bindings += argument.bindings
+            bindings += function_simp.antecedent.unify(argument_simp, bindings)
+        except UnificationException as e:
+            raise LinearLogicApplicationException('Cannot apply %s to %s. %s' % (function_simp, argument_simp, e))
+
+        # If you are running it on compiled premises, more conditions apply
+        if argument_indices:
+            # A.dependencies of (A -o (B -o C)) must be a proper subset of argument_indices
+            if not set(function_simp.antecedent.dependencies) < argument_indices:
+                raise LinearLogicApplicationException('Dependencies unfulfilled when attempting to apply Linear Logic formula %s to %s' % (function_simp, argument_simp))
+            if set(function_simp.antecedent.dependencies) == argument_indices:
+                raise LinearLogicApplicationException('Dependencies not a proper subset of indices when attempting to apply Linear Logic formula %s to %s' % (function_simp, argument_simp))
+
+        self.function = function
+        self.argument = argument
+        self.bindings = bindings
+
+    def simplify(self, bindings=None):
+        """
+        Since function is an implication, return its consequent.  There should be
+        no need to check that the application is valid since the checking is done
+        by the constructor.
+
+        :param bindings: ``BindingDict`` A dictionary of bindings used to simplify
+        :return: ``Expression``
+        """
+        if not bindings:
+            bindings = self.bindings
+
+        return self.function.simplify(bindings).consequent
+
+    def __eq__(self, other):
+        return self.__class__ == other.__class__ and \
+                self.function == other.function and self.argument == other.argument
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __str__(self):
+        return "%s" % self.function + Tokens.OPEN + "%s" % self.argument + Tokens.CLOSE
+
+    def __hash__(self):
+        return hash('%s%s%s' % (hash(self.function), Tokens.OPEN, hash(self.argument)))
+
+@python_2_unicode_compatible
+class BindingDict(object):
+    def __init__(self, bindings=None):
+        """
+        :param bindings:
+            list [(``VariableExpression``, ``AtomicExpression``)] to initialize the dictionary
+            dict {``VariableExpression``: ``AtomicExpression``} to initialize the dictionary
+        """
+        self.d = {}
+
+        if isinstance(bindings, dict):
+            bindings = bindings.items()
+
+        if bindings:
+            for (v, b) in bindings:
+                self[v] = b
+
+    def __setitem__(self, variable, binding):
+        """
+        A binding is consistent with the dict if its variable is not already bound, OR if its
+        variable is already bound to the same value.
+
+        :param variable: ``VariableExpression`` The variable to bind
+        :param binding: ``Expression`` The expression to which 'variable' should be bound
+        :raise VariableBindingException: If the variable cannot be bound in this dictionary
+        """
+        assert isinstance(variable, VariableExpression)
+        assert isinstance(binding, Expression)
+
+        assert variable != binding
+
+        existing = self.d.get(variable, None)
+
+        if not existing or binding == existing:
+            self.d[variable] = binding
+        else:
+            raise VariableBindingException('Variable %s already bound to another value' % (variable))
+
+    def __getitem__(self, variable):
+        """
+        Return the expression to which 'variable' is bound
+        """
+        assert isinstance(variable, VariableExpression)
+
+        intermediate = self.d[variable]
+        while intermediate:
+            try:
+                intermediate = self.d[intermediate]
+            except KeyError:
+                return intermediate
+
+    def __contains__(self, item):
+        return item in self.d
+
+    def __add__(self, other):
+        """
+        :param other: ``BindingDict`` The dict with which to combine self
+        :return: ``BindingDict`` A new dict containing all the elements of both parameters
+        :raise VariableBindingException: If the parameter dictionaries are not consistent with each other
+        """
+        try:
+            combined = BindingDict()
+            for v in self.d:
+                combined[v] = self.d[v]
+            for v in other.d:
+                combined[v] = other.d[v]
+            return combined
+        except VariableBindingException:
+            raise VariableBindingException('Attempting to add two contradicting'\
+                        ' VariableBindingsLists: %s, %s' % (self, other))
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __eq__(self, other):
+        if not isinstance(other, BindingDict):
+            raise TypeError
+        return self.d == other.d
+
+    def __str__(self):
+        return '{' + ', '.join('%s: %s' % (v, self.d[v]) for v in self.d) + '}'
+
+    def __repr__(self):
+        return 'BindingDict: %s' % self
+
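+# Illustrative sketch, not part of the original module: a BindingDict accepts a
+# binding only if the variable is unbound or already bound to the same value,
+# and '+' merges two dicts, raising VariableBindingException on a conflict.
+# Linear-logic variables are written in uppercase, as in the demo below.
+#     lexpr = Expression.fromstring
+#     d1 = BindingDict([(lexpr('H'), lexpr('f'))])
+#     d1 + BindingDict([(lexpr('H'), lexpr('f'))])   # fine: same binding
+#     d1 + BindingDict([(lexpr('H'), lexpr('g'))])   # raises VariableBindingException
+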
+class VariableBindingException(Exception):
+    pass
+
+class UnificationException(Exception):
+    def __init__(self, a, b, bindings):
+        Exception.__init__(self, 'Cannot unify %s with %s given %s' % (a, b, bindings))
+
+class LinearLogicApplicationException(Exception):
+    pass
+
+
+def demo():
+    lexpr = Expression.fromstring
+
+    print(lexpr(r'f'))
+    print(lexpr(r'(g -o f)'))
+    print(lexpr(r'((g -o G) -o G)'))
+    print(lexpr(r'g -o h -o f'))
+    print(lexpr(r'(g -o f)(g)').simplify())
+    print(lexpr(r'(H -o f)(g)').simplify())
+    print(lexpr(r'((g -o G) -o G)((g -o f))').simplify())
+    print(lexpr(r'(H -o H)((g -o f))').simplify())
+
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/sem/linearlogic.pyc b/nlp_resource_data/nltk/sem/linearlogic.pyc
new file mode 100755 (executable)
index 0000000..01b0ec1
Binary files /dev/null and b/nlp_resource_data/nltk/sem/linearlogic.pyc differ
diff --git a/nlp_resource_data/nltk/sem/logic.py b/nlp_resource_data/nltk/sem/logic.py
new file mode 100755 (executable)
index 0000000..1053802
--- /dev/null
@@ -0,0 +1,1910 @@
+# Natural Language Toolkit: Logic
+#
+# Author: Dan Garrette <dhgarrette@gmail.com>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org>
+# For license information, see LICENSE.TXT
+
+"""
+A version of first order predicate logic, built on
+top of the typed lambda calculus.
+"""
+from __future__ import print_function, unicode_literals
+
+import re
+import operator
+from collections import defaultdict
+from functools import reduce, total_ordering
+
+from six import string_types
+
+from nltk.util import Trie
+from nltk.internals import Counter
+from nltk.compat import python_2_unicode_compatible
+
+APP = 'APP'
+
+_counter = Counter()
+
+class Tokens(object):
+    LAMBDA = '\\';     LAMBDA_LIST = ['\\']
+
+    #Quantifiers
+    EXISTS = 'exists'; EXISTS_LIST = ['some', 'exists', 'exist']
+    ALL = 'all';       ALL_LIST = ['all', 'forall']
+
+    #Punctuation
+    DOT = '.'
+    OPEN = '('
+    CLOSE = ')'
+    COMMA = ','
+
+    #Operations
+    NOT = '-';         NOT_LIST = ['not', '-', '!']
+    AND = '&';         AND_LIST = ['and', '&', '^']
+    OR = '|';          OR_LIST = ['or', '|']
+    IMP = '->';        IMP_LIST = ['implies', '->', '=>']
+    IFF = '<->';       IFF_LIST = ['iff', '<->', '<=>']
+    EQ = '=';          EQ_LIST = ['=', '==']
+    NEQ = '!=';        NEQ_LIST = ['!=']
+
+    #Collections of tokens
+    BINOPS = AND_LIST + OR_LIST + IMP_LIST + IFF_LIST
+    QUANTS = EXISTS_LIST + ALL_LIST
+    PUNCT = [DOT, OPEN, CLOSE, COMMA]
+
+    TOKENS = BINOPS + EQ_LIST + NEQ_LIST + QUANTS + LAMBDA_LIST + PUNCT + NOT_LIST
+
+    #Special
+    SYMBOLS = [x for x in TOKENS if re.match(r'^[-\\.(),!&^|>=<]*$', x)]
+
+
+def boolean_ops():
+    """
+    Boolean operators
+    """
+    names =  ["negation", "conjunction", "disjunction", "implication", "equivalence"]
+    for pair in zip(names, [Tokens.NOT, Tokens.AND, Tokens.OR, Tokens.IMP, Tokens.IFF]):
+        print("%-15s\t%s" %  pair)
+
+def equality_preds():
+    """
+    Equality predicates
+    """
+    names =  ["equality", "inequality"]
+    for pair in zip(names, [Tokens.EQ, Tokens.NEQ]):
+        print("%-15s\t%s" %  pair)
+
+def binding_ops():
+    """
+    Binding operators
+    """
+    names =  ["existential", "universal", "lambda"]
+    for pair in zip(names, [Tokens.EXISTS, Tokens.ALL, Tokens.LAMBDA]):
+        print("%-15s\t%s" %  pair)
+
+
+@python_2_unicode_compatible
+class LogicParser(object):
+    """A lambda calculus expression parser."""
+
+    def __init__(self, type_check=False):
+        """
+        :param type_check: bool, should type checking be performed?
+        """
+        assert isinstance(type_check, bool)
+
+        self._currentIndex = 0
+        self._buffer = []
+        self.type_check = type_check
+
+        """A list of tuples of quote characters.  The 4-tuple is comprised
+        of the start character, the end character, the escape character, and
+        a boolean indicating whether the quotes should be included in the
+        result. Quotes are used to signify that a token should be treated as
+        atomic, ignoring any special characters within the token.  The escape
+        character allows the quote end character to be used within the quote.
+        If True, the boolean indicates that the final token should contain the
+        quote and escape characters.
+        This attribute exists to be overridden by subclasses."""
+        self.quote_chars = []
+
+        self.operator_precedence = dict(
+                           [(x,1) for x in Tokens.LAMBDA_LIST]             + \
+                           [(x,2) for x in Tokens.NOT_LIST]                + \
+                           [(APP,3)]                                       + \
+                           [(x,4) for x in Tokens.EQ_LIST+Tokens.NEQ_LIST] + \
+                           [(x,5) for x in Tokens.QUANTS]                  + \
+                           [(x,6) for x in Tokens.AND_LIST]                + \
+                           [(x,7) for x in Tokens.OR_LIST]                 + \
+                           [(x,8) for x in Tokens.IMP_LIST]                + \
+                           [(x,9) for x in Tokens.IFF_LIST]                + \
+                           [(None,10)])
+        self.right_associated_operations = [APP]
+
+    def parse(self, data, signature=None):
+        """
+        Parse the expression.
+
+        :param data: str for the input to be parsed
+        :param signature: ``dict<str, str>`` that maps variable names to type
+        strings
+        :returns: a parsed Expression
+        """
+        data = data.rstrip()
+
+        self._currentIndex = 0
+        self._buffer, mapping = self.process(data)
+
+        try:
+            result = self.process_next_expression(None)
+            if self.inRange(0):
+                raise UnexpectedTokenException(self._currentIndex+1, self.token(0))
+        except LogicalExpressionException as e:
+            msg = '%s\n%s\n%s^' % (e, data, ' '*mapping[e.index-1])
+            raise LogicalExpressionException(None, msg)
+
+        if self.type_check:
+            result.typecheck(signature)
+
+        return result
+
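+    # Illustrative usage sketch, not part of the original module:
+    #     LogicParser().parse(r'exists x.(dog(x) & bark(x))')
+    # returns an ExistsExpression.  With LogicParser(type_check=True), a
+    # signature such as {'dog': '<e,t>'} (keys and types here are only
+    # illustrative) can be passed to parse() to constrain the inferred types.
+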
+    def process(self, data):
+        """Split the data into tokens"""
+        out = []
+        mapping = {}
+        tokenTrie = Trie(self.get_all_symbols())
+        token = ''
+        data_idx = 0
+        token_start_idx = data_idx
+        while data_idx < len(data):
+            cur_data_idx = data_idx
+            quoted_token, data_idx = self.process_quoted_token(data_idx, data)
+            if quoted_token:
+                if not token:
+                    token_start_idx = cur_data_idx
+                token += quoted_token
+                continue
+
+            st = tokenTrie
+            c = data[data_idx]
+            symbol = ''
+            while c in st:
+                symbol += c
+                st = st[c]
+                if len(data)-data_idx > len(symbol):
+                    c = data[data_idx+len(symbol)]
+                else:
+                    break
+            if Trie.LEAF in st:
+                #token is a complete symbol
+                if token:
+                    mapping[len(out)] = token_start_idx
+                    out.append(token)
+                    token = ''
+                mapping[len(out)] = data_idx
+                out.append(symbol)
+                data_idx += len(symbol)
+            else:
+                if data[data_idx] in ' \t\n': #any whitespace
+                    if token:
+                        mapping[len(out)] = token_start_idx
+                        out.append(token)
+                        token = ''
+                else:
+                    if not token:
+                        token_start_idx = data_idx
+                    token += data[data_idx]
+                data_idx += 1
+        if token:
+            mapping[len(out)] = token_start_idx
+            out.append(token)
+        mapping[len(out)] = len(data)
+        mapping[len(out)+1] = len(data)+1
+        return out, mapping
+
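+    # Illustrative sketch, not part of the original module: process() splits on
+    # whitespace and on the longest matching symbol from get_all_symbols(), e.g.
+    #     LogicParser().process('all x.(dog(x) -> bark(x))')[0]
+    #     # ['all', 'x', '.', '(', 'dog', '(', 'x', ')', '->',
+    #     #  'bark', '(', 'x', ')', ')']
+    # The second return value maps token positions back to character offsets,
+    # which parse() uses to point at the error location in its messages.
+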
+    def process_quoted_token(self, data_idx, data):
+        token = ''
+        c = data[data_idx]
+        i = data_idx
+        for start, end, escape, incl_quotes in self.quote_chars:
+            if c == start:
+                if incl_quotes:
+                    token += c
+                i += 1
+                while data[i] != end:
+                    if data[i] == escape:
+                        if incl_quotes:
+                            token += data[i]
+                        i += 1
+                        if len(data) == i: #if there are no more chars
+                            raise LogicalExpressionException(None, "End of input reached.  "
+                                    "Escape character [%s] found at end."
+                                    % escape)
+                        token += data[i]
+                    else:
+                        token += data[i]
+                    i += 1
+                    if len(data) == i:
+                        raise LogicalExpressionException(None, "End of input reached.  "
+                                             "Expected: [%s]" % end)
+                if incl_quotes:
+                    token += data[i]
+                i += 1
+                if not token:
+                    raise LogicalExpressionException(None, 'Empty quoted token found')
+                break
+        return token, i
+
+    def get_all_symbols(self):
+        """This method exists to be overridden"""
+        return Tokens.SYMBOLS
+
+    def inRange(self, location):
+        """Return TRUE if the given location is within the buffer"""
+        return self._currentIndex+location < len(self._buffer)
+
+    def token(self, location=None):
+        """Get the next waiting token.  If a location is given, then
+        return the token at currentIndex+location without advancing
+        currentIndex; setting it gives lookahead/lookback capability."""
+        try:
+            if location is None:
+                tok = self._buffer[self._currentIndex]
+                self._currentIndex += 1
+            else:
+                tok = self._buffer[self._currentIndex+location]
+            return tok
+        except IndexError:
+            raise ExpectedMoreTokensException(self._currentIndex+1)
+
+    def isvariable(self, tok):
+        return tok not in Tokens.TOKENS
+
+    def process_next_expression(self, context):
+        """Parse the next complete expression from the stream and return it."""
+        try:
+            tok = self.token()
+        except ExpectedMoreTokensException:
+            raise ExpectedMoreTokensException(self._currentIndex+1, message='Expression expected.')
+
+        accum = self.handle(tok, context)
+
+        if not accum:
+            raise UnexpectedTokenException(self._currentIndex, tok, message='Expression expected.')
+
+        return self.attempt_adjuncts(accum, context)
+
+    def handle(self, tok, context):
+        """This method is intended to be overridden for logics that
+        use different operators or expressions"""
+        if self.isvariable(tok):
+            return self.handle_variable(tok, context)
+
+        elif tok in Tokens.NOT_LIST:
+            return self.handle_negation(tok, context)
+
+        elif tok in Tokens.LAMBDA_LIST:
+            return self.handle_lambda(tok, context)
+
+        elif tok in Tokens.QUANTS:
+            return self.handle_quant(tok, context)
+
+        elif tok == Tokens.OPEN:
+            return self.handle_open(tok, context)
+
+    def attempt_adjuncts(self, expression, context):
+        cur_idx = None
+        while cur_idx != self._currentIndex: #while adjuncts are added
+            cur_idx = self._currentIndex
+            expression = self.attempt_EqualityExpression(expression, context)
+            expression = self.attempt_ApplicationExpression(expression, context)
+            expression = self.attempt_BooleanExpression(expression, context)
+        return expression
+
+    def handle_negation(self, tok, context):
+        return self.make_NegatedExpression(self.process_next_expression(Tokens.NOT))
+
+    def make_NegatedExpression(self, expression):
+        return NegatedExpression(expression)
+
+    def handle_variable(self, tok, context):
+        #It's either: 1) a predicate expression: sees(x,y)
+        #             2) an application expression: P(x)
+        #             3) a solo variable: john OR x
+        accum = self.make_VariableExpression(tok)
+        if self.inRange(0) and self.token(0) == Tokens.OPEN:
+            #The predicate has arguments
+            if not isinstance(accum, FunctionVariableExpression) and \
+               not isinstance(accum, ConstantExpression):
+                raise LogicalExpressionException(self._currentIndex,
+                                     "'%s' is an illegal predicate name.  "
+                                     "Individual variables may not be used as "
+                                     "predicates." % tok)
+            self.token() #swallow the Open Paren
+
+            #curry the arguments
+            accum = self.make_ApplicationExpression(accum, self.process_next_expression(APP))
+            while self.inRange(0) and self.token(0) == Tokens.COMMA:
+                self.token() #swallow the comma
+                accum = self.make_ApplicationExpression(accum, self.process_next_expression(APP))
+            self.assertNextToken(Tokens.CLOSE)
+        return accum
+
+    def get_next_token_variable(self, description):
+        try:
+            tok = self.token()
+        except ExpectedMoreTokensException as e:
+            raise ExpectedMoreTokensException(e.index, 'Variable expected.')
+        if isinstance(self.make_VariableExpression(tok), ConstantExpression):
+            raise LogicalExpressionException(self._currentIndex,
+                                 "'%s' is an illegal variable name.  "
+                                 "Constants may not be %s." % (tok, description))
+        return Variable(tok)
+
+    def handle_lambda(self, tok, context):
+        # Expression is a lambda expression
+        if not self.inRange(0):
+            raise ExpectedMoreTokensException(self._currentIndex+2,
+                                              message="Variable and Expression expected following lambda operator.")
+        vars = [self.get_next_token_variable('abstracted')]
+        while True:
+            if not self.inRange(0) or (self.token(0) == Tokens.DOT and not self.inRange(1)):
+                raise ExpectedMoreTokensException(self._currentIndex+2, message="Expression expected.")
+            if not self.isvariable(self.token(0)):
+                break
+            # Support expressions like: \x y.M == \x.\y.M
+            vars.append(self.get_next_token_variable('abstracted'))
+        if self.inRange(0) and self.token(0) == Tokens.DOT:
+            self.token() #swallow the dot
+
+        accum = self.process_next_expression(tok)
+        while vars:
+            accum = self.make_LambdaExpression(vars.pop(), accum)
+        return accum
+
+    def handle_quant(self, tok, context):
+        # Expression is a quantified expression: some x.M
+        factory = self.get_QuantifiedExpression_factory(tok)
+
+        if not self.inRange(0):
+            raise ExpectedMoreTokensException(self._currentIndex+2,
+                                              message="Variable and Expression expected following quantifier '%s'." % tok)
+        vars = [self.get_next_token_variable('quantified')]
+        while True:
+            if not self.inRange(0) or (self.token(0) == Tokens.DOT and not self.inRange(1)):
+                raise ExpectedMoreTokensException(self._currentIndex+2, message="Expression expected.")
+            if not self.isvariable(self.token(0)):
+                break
+            # Support expressions like: some x y.M == some x.some y.M
+            vars.append(self.get_next_token_variable('quantified'))
+        if self.inRange(0) and self.token(0) == Tokens.DOT:
+            self.token() #swallow the dot
+
+        accum = self.process_next_expression(tok)
+        while vars:
+            accum = self.make_QuanifiedExpression(factory, vars.pop(), accum)
+        return accum
+
+    def get_QuantifiedExpression_factory(self, tok):
+        """This method serves as a hook for other logic parsers that
+        have different quantifiers"""
+        if tok in Tokens.EXISTS_LIST:
+            return ExistsExpression
+        elif tok in Tokens.ALL_LIST:
+            return AllExpression
+        else:
+            self.assertToken(tok, Tokens.QUANTS)
+
+    def make_QuanifiedExpression(self, factory, variable, term):
+        return factory(variable, term)
+
+    def handle_open(self, tok, context):
+        #Expression is in parens
+        accum = self.process_next_expression(None)
+        self.assertNextToken(Tokens.CLOSE)
+        return accum
+
+    def attempt_EqualityExpression(self, expression, context):
+        """Attempt to make an equality expression.  If the next token is an
+        equality operator, then an EqualityExpression will be returned.
+        Otherwise, the parameter will be returned."""
+        if self.inRange(0):
+            tok = self.token(0)
+            if tok in Tokens.EQ_LIST + Tokens.NEQ_LIST and self.has_priority(tok, context):
+                self.token() #swallow the "=" or "!="
+                expression = self.make_EqualityExpression(expression, self.process_next_expression(tok))
+                if tok in Tokens.NEQ_LIST:
+                    expression = self.make_NegatedExpression(expression)
+        return expression
+
+    def make_EqualityExpression(self, first, second):
+        """This method serves as a hook for other logic parsers that
+        have different equality expression classes"""
+        return EqualityExpression(first, second)
+
+    def attempt_BooleanExpression(self, expression, context):
+        """Attempt to make a boolean expression.  If the next token is a boolean
+        operator, then a BooleanExpression will be returned.  Otherwise, the
+        parameter will be returned."""
+        while self.inRange(0):
+            tok = self.token(0)
+            factory = self.get_BooleanExpression_factory(tok)
+            if factory and self.has_priority(tok, context):
+                self.token() #swallow the operator
+                expression = self.make_BooleanExpression(factory, expression,
+                                                         self.process_next_expression(tok))
+            else:
+                break
+        return expression
+
+    def get_BooleanExpression_factory(self, tok):
+        """This method serves as a hook for other logic parsers that
+        have different boolean operators"""
+        if tok in Tokens.AND_LIST:
+            return AndExpression
+        elif tok in Tokens.OR_LIST:
+            return OrExpression
+        elif tok in Tokens.IMP_LIST:
+            return ImpExpression
+        elif tok in Tokens.IFF_LIST:
+            return IffExpression
+        else:
+            return None
+
+    def make_BooleanExpression(self, factory, first, second):
+        return factory(first, second)
+
+    def attempt_ApplicationExpression(self, expression, context):
+        """Attempt to make an application expression.  The next tokens are
+        a list of arguments in parens, then the argument expression is a
+        function being applied to the arguments.  Otherwise, return the
+        argument expression."""
+        if self.has_priority(APP, context):
+            if self.inRange(0) and self.token(0) == Tokens.OPEN:
+                if not isinstance(expression, LambdaExpression) and \
+                   not isinstance(expression, ApplicationExpression) and \
+                   not isinstance(expression, FunctionVariableExpression) and \
+                   not isinstance(expression, ConstantExpression):
+                    raise LogicalExpressionException(self._currentIndex,
+                                         ("The function '%s" % expression) +
+                                         "' is not a Lambda Expression, an "
+                                         "Application Expression, or a "
+                                         "functional predicate, so it may "
+                                         "not take arguments.")
+                self.token() #swallow the open paren
+                #curry the arguments
+                accum = self.make_ApplicationExpression(expression, self.process_next_expression(APP))
+                while self.inRange(0) and self.token(0) == Tokens.COMMA:
+                    self.token() #swallow the comma
+                    accum = self.make_ApplicationExpression(accum, self.process_next_expression(APP))
+                self.assertNextToken(Tokens.CLOSE)
+                return accum
+        return expression
+
+    def make_ApplicationExpression(self, function, argument):
+        return ApplicationExpression(function, argument)
+
+    def make_VariableExpression(self, name):
+        return VariableExpression(Variable(name))
+
+    def make_LambdaExpression(self, variable, term):
+        return LambdaExpression(variable, term)
+
+    def has_priority(self, operation, context):
+        return self.operator_precedence[operation] < self.operator_precedence[context] or \
+               (operation in self.right_associated_operations and \
+                self.operator_precedence[operation] == self.operator_precedence[context])
+
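+    # Illustrative note, not part of the original module: an operation binds
+    # only when its precedence value is smaller than the current context's
+    # (a lower value binds more tightly), so with the defaults above
+    #     'p & q | r'               parses as  ((p & q) | r)
+    #     'all x.dog(x) & bark(x)'  parses as  ((all x.dog(x)) & bark(x)),
+    # i.e. the quantifier takes narrow scope unless parentheses widen it.
+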
+    def assertNextToken(self, expected):
+        try:
+            tok = self.token()
+        except ExpectedMoreTokensException as e:
+            raise ExpectedMoreTokensException(e.index, message="Expected token '%s'." % expected)
+
+        if isinstance(expected, list):
+            if tok not in expected:
+                raise UnexpectedTokenException(self._currentIndex, tok, expected)
+        else:
+            if tok != expected:
+                raise UnexpectedTokenException(self._currentIndex, tok, expected)
+
+    def assertToken(self, tok, expected):
+        if isinstance(expected, list):
+            if tok not in expected:
+                raise UnexpectedTokenException(self._currentIndex, tok, expected)
+        else:
+            if tok != expected:
+                raise UnexpectedTokenException(self._currentIndex, tok, expected)
+
+    def __repr__(self):
+        if self.inRange(0):
+            msg = 'Next token: ' + self.token(0)
+        else:
+            msg = 'No more tokens'
+        return '<' + self.__class__.__name__ + ': ' + msg + '>'
+
+
+def read_logic(s, logic_parser=None, encoding=None):
+    """
+    Convert a file of first order formulas into a list of ``Expression`` objects.
+
+    :param s: the contents of the file
+    :type s: str
+    :param logic_parser: The parser to be used to parse the logical expression
+    :type logic_parser: LogicParser
+    :param encoding: the encoding of the input string, if it is binary
+    :type encoding: str
+    :return: a list of parsed formulas.
+    :rtype: list(Expression)
+    """
+    if encoding is not None:
+        s = s.decode(encoding)
+    if logic_parser is None:
+        logic_parser = LogicParser()
+
+    statements = []
+    for linenum, line in enumerate(s.splitlines()):
+        line = line.strip()
+        if line.startswith('#') or line=='': continue
+        try:
+            statements.append(logic_parser.parse(line))
+        except LogicalExpressionException:
+            raise ValueError('Unable to parse line %s: %s' % (linenum, line))
+    return statements
+
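+# Illustrative sketch, not part of the original module: blank lines and lines
+# beginning with '#' are skipped; every other line must hold one formula.
+#     read_logic('# background\nwalk(john)\nall x.(dog(x) -> bark(x))\n')
+#     # [<ApplicationExpression walk(john)>,
+#     #  <AllExpression all x.(dog(x) -> bark(x))>]
+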
+
+@total_ordering
+@python_2_unicode_compatible
+class Variable(object):
+    def __init__(self, name):
+        """
+        :param name: the name of the variable
+        """
+        assert isinstance(name, string_types), "%s is not a string" % name
+        self.name = name
+
+    def __eq__(self, other):
+        return isinstance(other, Variable) and self.name == other.name
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        if not isinstance(other, Variable):
+            raise TypeError
+        return self.name < other.name
+
+    def substitute_bindings(self, bindings):
+        return bindings.get(self, self)
+
+    def __hash__(self):
+        return hash(self.name)
+
+    def __str__(self):
+        return self.name
+
+    def __repr__(self):
+        return "Variable('%s')" % self.name
+
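+# Illustrative sketch, not part of the original module: a Variable substitutes
+# to whatever the binding dict maps it to, or to itself when unbound.
+#     x, y = Variable('x'), Variable('y')
+#     x.substitute_bindings({x: y})   # Variable('y')
+#     x.substitute_bindings({})       # Variable('x')
+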
+
+def unique_variable(pattern=None, ignore=None):
+    """
+    Return a new, unique variable.
+
+    :param pattern: ``Variable`` that is being replaced.  The new variable must
+        be the same type.
+    :param ignore: a set of ``Variable`` objects that should not be returned from
+        this function.
+    :rtype: Variable
+    """
+    if pattern is not None:
+        if is_indvar(pattern.name):
+            prefix = 'z'
+        elif is_funcvar(pattern.name):
+            prefix = 'F'
+        elif is_eventvar(pattern.name):
+            prefix = 'e0'
+        else:
+            assert False, "Cannot generate a unique constant"
+    else:
+        prefix = 'z'
+
+    v = Variable("%s%s" % (prefix, _counter.get()))
+    while ignore is not None and v in ignore:
+        v = Variable("%s%s" % (prefix, _counter.get()))
+    return v
+
+def skolem_function(univ_scope=None):
+    """
+    Return a skolem function over the variables in univ_scope.
+
+    :param univ_scope: a set of ``Variable`` objects to which the skolem
+        function is applied
+    """
+    skolem = VariableExpression(Variable('F%s' % _counter.get()))
+    if univ_scope:
+        for v in list(univ_scope):
+            skolem = skolem(VariableExpression(v))
+    return skolem
+
+
+@python_2_unicode_compatible
+class Type(object):
+    def __repr__(self):
+        return "%s" % self
+
+    def __hash__(self):
+        return hash("%s" % self)
+
+    @classmethod
+    def fromstring(cls, s):
+        return read_type(s)
+
+@python_2_unicode_compatible
+class ComplexType(Type):
+    def __init__(self, first, second):
+        assert(isinstance(first, Type)), "%s is not a Type" % first
+        assert(isinstance(second, Type)), "%s is not a Type" % second
+        self.first = first
+        self.second = second
+
+    def __eq__(self, other):
+        return isinstance(other, ComplexType) and \
+               self.first == other.first and \
+               self.second == other.second
+
+    def __ne__(self, other):
+        return not self == other
+
+    __hash__ = Type.__hash__
+
+    def matches(self, other):
+        if isinstance(other, ComplexType):
+            return self.first.matches(other.first) and \
+                   self.second.matches(other.second)
+        else:
+            return self == ANY_TYPE
+
+    def resolve(self, other):
+        if other == ANY_TYPE:
+            return self
+        elif isinstance(other, ComplexType):
+            f = self.first.resolve(other.first)
+            s = self.second.resolve(other.second)
+            if f and s:
+                return ComplexType(f,s)
+            else:
+                return None
+        elif self == ANY_TYPE:
+            return other
+        else:
+            return None
+
+    def __str__(self):
+        if self == ANY_TYPE:
+            return "%s" % ANY_TYPE
+        else:
+            return '<%s,%s>' % (self.first, self.second)
+
+    def str(self):
+        if self == ANY_TYPE:
+            return ANY_TYPE.str()
+        else:
+            return '(%s -> %s)' % (self.first.str(), self.second.str())
+
+class BasicType(Type):
+    def __eq__(self, other):
+        return isinstance(other, BasicType) and ("%s" % self) == ("%s" % other)
+
+    def __ne__(self, other):
+        return not self == other
+
+    __hash__ = Type.__hash__
+
+    def matches(self, other):
+        return other == ANY_TYPE or self == other
+
+    def resolve(self, other):
+        if self.matches(other):
+            return self
+        else:
+            return None
+
+@python_2_unicode_compatible
+class EntityType(BasicType):
+    def __str__(self):
+        return 'e'
+
+    def str(self):
+        return 'IND'
+
+@python_2_unicode_compatible
+class TruthValueType(BasicType):
+    def __str__(self):
+        return 't'
+
+    def str(self):
+        return 'BOOL'
+
+@python_2_unicode_compatible
+class EventType(BasicType):
+    def __str__(self):
+        return 'v'
+
+    def str(self):
+        return 'EVENT'
+
+@python_2_unicode_compatible
+class AnyType(BasicType, ComplexType):
+    def __init__(self):
+        pass
+
+    @property
+    def first(self): return self
+
+    @property
+    def second(self): return self
+
+    def __eq__(self, other):
+        return isinstance(other, AnyType) or other.__eq__(self)
+
+    def __ne__(self, other):
+        return not self == other
+
+    __hash__ = Type.__hash__
+
+    def matches(self, other):
+        return True
+
+    def resolve(self, other):
+        return other
+
+    def __str__(self):
+        return '?'
+
+    def str(self):
+        return 'ANY'
+
+
+TRUTH_TYPE = TruthValueType()
+ENTITY_TYPE = EntityType()
+EVENT_TYPE = EventType()
+ANY_TYPE = AnyType()
+
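+# Illustrative sketch, not part of the original module: resolve() unifies a
+# type with another, treating ANY_TYPE ('?') as a wildcard.
+#     ANY_TYPE.resolve(ENTITY_TYPE)                 # e
+#     ComplexType(ANY_TYPE, TRUTH_TYPE).resolve(
+#         ComplexType(ENTITY_TYPE, ANY_TYPE))       # <e,t>
+#     ENTITY_TYPE.resolve(TRUTH_TYPE)               # None: no unifier exists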
+
+def read_type(type_string):
+    assert isinstance(type_string, string_types)
+    type_string = type_string.replace(' ', '') #remove spaces
+
+    if type_string[0] == '<':
+        assert type_string[-1] == '>'
+        paren_count = 0
+        for i,char in enumerate(type_string):
+            if char == '<':
+                paren_count += 1
+            elif char == '>':
+                paren_count -= 1
+                assert paren_count > 0
+            elif char == ',':
+                if paren_count == 1:
+                    break
+        return ComplexType(read_type(type_string[1  :i ]),
+                           read_type(type_string[i+1:-1]))
+    elif type_string[0] == "%s" % ENTITY_TYPE:
+        return ENTITY_TYPE
+    elif type_string[0] == "%s" % TRUTH_TYPE:
+        return TRUTH_TYPE
+    elif type_string[0] == "%s" % ANY_TYPE:
+        return ANY_TYPE
+    else:
+        raise LogicalExpressionException("Unexpected character: '%s'." % type_string[0])
+
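+# Illustrative sketch, not part of the original module: read_type() parses the
+# angle-bracket notation used throughout this module.
+#     read_type('e')          # ENTITY_TYPE
+#     read_type('<e,t>')      # ComplexType(ENTITY_TYPE, TRUTH_TYPE), printed <e,t>
+#     read_type('<e,<e,t>>')  # the type of a curried two-place predicate
+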
+
+class TypeException(Exception):
+    def __init__(self, msg):
+        super(TypeException, self).__init__(msg)
+
+class InconsistentTypeHierarchyException(TypeException):
+    def __init__(self, variable, expression=None):
+        if expression:
+            msg = "The variable '%s' was found in multiple places with different"\
+                " types in '%s'." % (variable, expression)
+        else:
+            msg = "The variable '%s' was found in multiple places with different"\
+                " types." % (variable)
+        super(InconsistentTypeHierarchyException, self).__init__(msg)
+
+class TypeResolutionException(TypeException):
+    def __init__(self, expression, other_type):
+        super(TypeResolutionException, self).__init__(
+            "The type of '%s', '%s', cannot be resolved with type '%s'" %
+            (expression, expression.type, other_type))
+
+class IllegalTypeException(TypeException):
+    def __init__(self, expression, other_type, allowed_type):
+        super(IllegalTypeException, self).__init__(
+            "Cannot set type of %s '%s' to '%s'; must match type '%s'." %
+            (expression.__class__.__name__, expression, other_type,
+            allowed_type))
+
+def typecheck(expressions, signature=None):
+    """
+    Ensure correct typing across a collection of ``Expression`` objects.
+    :param expressions: a collection of expressions
+    :param signature: dict that maps variable names to types (or string
+    representations of types)
+    """
+    #typecheck and create master signature
+    for expression in expressions:
+        signature = expression.typecheck(signature)
+    #apply master signature to all expressions
+    for expression in expressions[:-1]:
+        expression.typecheck(signature)
+    return signature
+
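+# Illustrative sketch, not part of the original module: typecheck() threads one
+# signature through several formulas and raises a TypeException subclass when a
+# name is used with incompatible types.
+#     sig = typecheck([Expression.fromstring('walk(john)'),
+#                      Expression.fromstring('walk(mary)')])
+#     # sig now maps 'walk' to a function type and 'john'/'mary' to entity type e
+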
+
+class SubstituteBindingsI(object):
+    """
+    An interface for classes that can perform substitutions for
+    variables.
+    """
+    def substitute_bindings(self, bindings):
+        """
+        :return: The object that is obtained by replacing
+            each variable bound by ``bindings`` with its values.
+            Aliases are already resolved. (maybe?)
+        :rtype: (any)
+        """
+        raise NotImplementedError()
+
+    def variables(self):
+        """
+        :return: A list of all variables in this object.
+        """
+        raise NotImplementedError()
+
+
+@python_2_unicode_compatible
+class Expression(SubstituteBindingsI):
+    """This is the base abstract object for all logical expressions"""
+
+    _logic_parser = LogicParser()
+    _type_checking_logic_parser = LogicParser(type_check=True)
+
+    @classmethod
+    def fromstring(cls, s, type_check=False, signature=None):
+        if type_check:
+            return cls._type_checking_logic_parser.parse(s, signature)
+        else:
+            return cls._logic_parser.parse(s, signature)
+
+    def __call__(self, other, *additional):
+        accum = self.applyto(other)
+        for a in additional:
+            accum = accum(a)
+        return accum
+
+    def applyto(self, other):
+        assert isinstance(other, Expression), "%s is not an Expression" % other
+        return ApplicationExpression(self, other)
+
+    def __neg__(self):
+        return NegatedExpression(self)
+
+    def negate(self):
+        """If this is a negated expression, remove the negation.
+        Otherwise add a negation."""
+        return -self
+
+    def __and__(self, other):
+        if not isinstance(other, Expression):
+            raise TypeError("%s is not an Expression" % other)
+        return AndExpression(self, other)
+
+    def __or__(self, other):
+        if not isinstance(other, Expression):
+            raise TypeError("%s is not an Expression" % other)
+        return OrExpression(self, other)
+
+    def __gt__(self, other):
+        if not isinstance(other, Expression):
+            raise TypeError("%s is not an Expression" % other)
+        return ImpExpression(self, other)
+
+    def __lt__(self, other):
+        if not isinstance(other, Expression):
+            raise TypeError("%s is not an Expression" % other)
+        return IffExpression(self, other)
+
+    def __eq__(self, other):
+        raise NotImplementedError()
+
+    def __ne__(self, other):
+        return not self == other
+
+    def equiv(self, other, prover=None):
+        """
+        Check for logical equivalence.
+        Pass the expression (self <-> other) to the theorem prover.
+        If the prover says it is valid, then the self and other are equal.
+
+        :param other: an ``Expression`` to check equality against
+        :param prover: a ``nltk.inference.api.Prover``
+        """
+        assert isinstance(other, Expression), "%s is not an Expression" % other
+
+        if prover is None:
+            from nltk.inference import Prover9
+            prover = Prover9()
+        bicond = IffExpression(self.simplify(), other.simplify())
+        return prover.prove(bicond)
+
+    def __hash__(self):
+        return hash(repr(self))
+
+    def substitute_bindings(self, bindings):
+        expr = self
+        for var in expr.variables():
+            if var in bindings:
+                val = bindings[var]
+                if isinstance(val, Variable):
+                    val = self.make_VariableExpression(val)
+                elif not isinstance(val, Expression):
+                    raise ValueError('Can not substitute a non-expression '
+                                     'value into an expression: %r' % (val,))
+                # Substitute bindings in the target value.
+                val = val.substitute_bindings(bindings)
+                # Replace var w/ the target value.
+                expr = expr.replace(var, val)
+        return expr.simplify()
+
+    def typecheck(self, signature=None):
+        """
+        Infer and check types.  Raise exceptions if necessary.
+
+        :param signature: dict that maps variable names to types (or string
+            representations of types)
+        :return: the signature, plus any additional type mappings
+        """
+        sig = defaultdict(list)
+        if signature:
+            for key in signature:
+                val = signature[key]
+                varEx = VariableExpression(Variable(key))
+                if isinstance(val, Type):
+                    varEx.type = val
+                else:
+                    varEx.type = read_type(val)
+                sig[key].append(varEx)
+
+        self._set_type(signature=sig)
+
+        return dict((key, sig[key][0].type) for key in sig)
+
+    def findtype(self, variable):
+        """
+        Find the type of the given variable as it is used in this expression.
+        For example, finding the type of "P" in "P(x) & Q(x,y)" yields "<e,t>"
+
+        :param variable: Variable
+        """
+        raise NotImplementedError()
+
+    def _set_type(self, other_type=ANY_TYPE, signature=None):
+        """
+        Set the type of this expression to be the given type.  Raise type
+        exceptions where applicable.
+
+        :param other_type: Type
+        :param signature: dict(str -> list(AbstractVariableExpression))
+        """
+        raise NotImplementedError()
+
+    def replace(self, variable, expression, replace_bound=False, alpha_convert=True):
+        """
+        Replace every instance of 'variable' with 'expression'
+        :param variable: ``Variable`` The variable to replace
+        :param expression: ``Expression`` The expression with which to replace it
+        :param replace_bound: bool Should bound variables be replaced?
+        :param alpha_convert: bool Alpha convert automatically to avoid name clashes?
+        """
+        assert isinstance(variable, Variable), "%s is not a Variable" % variable
+        assert isinstance(expression, Expression), "%s is not an Expression" % expression
+
+        return self.visit_structured(lambda e: e.replace(variable, expression,
+                                                         replace_bound, alpha_convert),
+                                     self.__class__)
+
+    def normalize(self, newvars=None):
+        """Rename auto-generated unique variables"""
+        def get_indiv_vars(e):
+            if isinstance(e, IndividualVariableExpression):
+                return set([e])
+            elif isinstance(e, AbstractVariableExpression):
+                return set()
+            else:
+                return e.visit(get_indiv_vars,
+                               lambda parts: reduce(operator.or_, parts, set()))
+
+        result = self
+        for i,e in enumerate(sorted(get_indiv_vars(self), key=lambda e: e.variable)):
+            if isinstance(e,EventVariableExpression):
+                newVar = e.__class__(Variable('e0%s' % (i+1)))
+            elif isinstance(e,IndividualVariableExpression):
+                newVar = e.__class__(Variable('z%s' % (i+1)))
+            else:
+                newVar = e
+            result = result.replace(e.variable, newVar, True)
+        return result
+
+    def visit(self, function, combinator):
+        """
+        Recursively visit subexpressions.  Apply 'function' to each
+        subexpression and pass the result of each function application
+        to the 'combinator' for aggregation:
+
+            return combinator(map(function, self.subexpressions))
+
+        Bound variables are neither applied upon by the function nor given to
+        the combinator.
+        :param function: ``Function<Expression,T>`` to call on each subexpression
+        :param combinator: ``Function<list<T>,R>`` to combine the results of the
+        function calls
+        :return: result of combination ``R``
+        """
+        raise NotImplementedError()
+
+    def visit_structured(self, function, combinator):
+        """
+        Recursively visit subexpressions.  Apply 'function' to each
+        subexpression and pass the result of each function application
+        to the 'combinator' for aggregation.  The combinator must have
+        the same signature as the constructor.  The function is not
+        applied to bound variables, but they are passed to the
+        combinator.
+        :param function: ``Function`` to call on each subexpression
+        :param combinator: ``Function`` with the same signature as the
+        constructor, to combine the results of the function calls
+        :return: result of combination
+        """
+        return self.visit(function, lambda parts: combinator(*parts))
+
+    def __repr__(self):
+        return '<%s %s>' % (self.__class__.__name__, self)
+
+    def __str__(self):
+        return self.str()
+
+    def variables(self):
+        """
+        Return a set of all the variables for binding substitution.
+        The variables returned include all free (non-bound) individual
+        variables and any variable starting with '?' or '@'.
+        :return: set of ``Variable`` objects
+        """
+        return self.free() | set(p for p in self.predicates()|self.constants()
+                                 if re.match('^[?@]', p.name))
+
+    def free(self):
+        """
+        Return a set of all the free (non-bound) variables.  This includes
+        both individual and predicate variables, but not constants.
+        :return: set of ``Variable`` objects
+        """
+        return self.visit(lambda e: e.free(),
+                          lambda parts: reduce(operator.or_, parts, set()))
+
+    def constants(self):
+        """
+        Return a set of individual constants (non-predicates).
+        :return: set of ``Variable`` objects
+        """
+        return self.visit(lambda e: e.constants(),
+                          lambda parts: reduce(operator.or_, parts, set()))
+
+    def predicates(self):
+        """
+        Return a set of predicates (constants, not variables).
+        :return: set of ``Variable`` objects
+        """
+        return self.visit(lambda e: e.predicates(),
+                          lambda parts: reduce(operator.or_, parts, set()))
+
+    def simplify(self):
+        """
+        :return: beta-converted version of this expression
+        """
+        return self.visit_structured(lambda e: e.simplify(), self.__class__)
+
+    def make_VariableExpression(self, variable):
+        return VariableExpression(variable)
+
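+# Illustrative sketch, not part of the original module: the operator overloads
+# on Expression let formulas be combined directly in Python.
+#     lexpr = Expression.fromstring
+#     lexpr('p') & lexpr('q')                          # AndExpression (p & q)
+#     -lexpr('p')                                      # NegatedExpression -p
+#     lexpr(r'\x.walk(x)')(lexpr('john')).simplify()   # beta-reduces to walk(john)
+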
+
+@python_2_unicode_compatible
+class ApplicationExpression(Expression):
+    r"""
+    This class is used to represent two related types of logical expressions.
+
+    The first is a Predicate Expression, such as "P(x,y)".  A predicate
+    expression is comprised of a ``FunctionVariableExpression`` or
+    ``ConstantExpression`` as the predicate and a list of Expressions as the
+    arguments.
+
+    The second is an application of one expression to another, such as
+    "(\x.dog(x))(fido)".
+
+    The reason Predicate Expressions are treated as Application Expressions is
+    that the Variable Expression predicate of the expression may be replaced
+    with another Expression, such as a LambdaExpression, which would mean that
+    the Predicate should be thought of as being applied to the arguments.
+
+    The logical expression reader will always curry arguments in an application expression.
+    So, "\x y.see(x,y)(john,mary)" will be represented internally as
+    "((\x y.(see(x))(y))(john))(mary)".  This simplifies the internals since
+    there will always be exactly one argument in an application.
+
+    The str() method will usually print the curried forms of application
+    expressions.  The one exception is when the application expression is
+    really a predicate expression (i.e., the underlying function is an
+    ``AbstractVariableExpression``).  This means that the example from above
+    will be returned as "(\x y.see(x,y)(john))(mary)".
+    """
+    def __init__(self, function, argument):
+        """
+        :param function: ``Expression``, for the function expression
+        :param argument: ``Expression``, for the argument
+        """
+        assert isinstance(function, Expression), "%s is not an Expression" % function
+        assert isinstance(argument, Expression), "%s is not an Expression" % argument
+        self.function = function
+        self.argument = argument
+
+    def simplify(self):
+        function = self.function.simplify()
+        argument = self.argument.simplify()
+        if isinstance(function, LambdaExpression):
+            return function.term.replace(function.variable, argument).simplify()
+        else:
+            return self.__class__(function, argument)
+
+    @property
+    def type(self):
+        if isinstance(self.function.type, ComplexType):
+            return self.function.type.second
+        else:
+            return ANY_TYPE
+
+    def _set_type(self, other_type=ANY_TYPE, signature=None):
+        """:see Expression._set_type()"""
+        assert isinstance(other_type, Type)
+
+        if signature is None:
+            signature = defaultdict(list)
+
+        self.argument._set_type(ANY_TYPE, signature)
+        try:
+            self.function._set_type(ComplexType(self.argument.type, other_type), signature)
+        except TypeResolutionException:
+            raise TypeException(
+                    "The function '%s' is of type '%s' and cannot be applied "
+                    "to '%s' of type '%s'.  Its argument must match type '%s'."
+                    % (self.function, self.function.type, self.argument,
+                       self.argument.type, self.function.type.first))
+
+    def findtype(self, variable):
+        """:see Expression.findtype()"""
+        assert isinstance(variable, Variable), "%s is not a Variable" % variable
+        if self.is_atom():
+            function, args = self.uncurry()
+        else:
+            #It's not a predicate expression ("P(x,y)"), so leave args curried
+            function = self.function
+            args = [self.argument]
+
+        found = [arg.findtype(variable) for arg in [function]+args]
+
+        unique = []
+        for f in found:
+            if f != ANY_TYPE:
+                if unique:
+                    for u in unique:
+                        if f.matches(u):
+                            break
+                else:
+                    unique.append(f)
+
+        if len(unique) == 1:
+            return list(unique)[0]
+        else:
+            return ANY_TYPE
+
+    def constants(self):
+        """:see: Expression.constants()"""
+        if isinstance(self.function, AbstractVariableExpression):
+            function_constants = set()
+        else:
+            function_constants = self.function.constants()
+        return function_constants | self.argument.constants()
+
+    def predicates(self):
+        """:see: Expression.predicates()"""
+        if isinstance(self.function, ConstantExpression):
+            function_preds = set([self.function.variable])
+        else:
+            function_preds = self.function.predicates()
+        return function_preds | self.argument.predicates()
+
+    def visit(self, function, combinator):
+        """:see: Expression.visit()"""
+        return combinator([function(self.function), function(self.argument)])
+
+    def __eq__(self, other):
+        return isinstance(other, ApplicationExpression) and \
+                self.function == other.function and \
+                self.argument == other.argument
+
+    def __ne__(self, other):
+        return not self == other
+
+    __hash__ = Expression.__hash__
+
+    def __str__(self):
+        # uncurry the arguments and find the base function
+        if self.is_atom():
+            function, args = self.uncurry()
+            arg_str = ','.join("%s" % arg for arg in args)
+        else:
+            #Leave arguments curried
+            function = self.function
+            arg_str = "%s" % self.argument
+
+        function_str = "%s" % function
+        parenthesize_function = False
+        if isinstance(function, LambdaExpression):
+            if isinstance(function.term, ApplicationExpression):
+                if not isinstance(function.term.function,
+                                  AbstractVariableExpression):
+                    parenthesize_function = True
+            elif not isinstance(function.term, BooleanExpression):
+                parenthesize_function = True
+        elif isinstance(function, ApplicationExpression):
+            parenthesize_function = True
+
+        if parenthesize_function:
+            function_str = Tokens.OPEN + function_str + Tokens.CLOSE
+
+        return function_str + Tokens.OPEN + arg_str + Tokens.CLOSE
+
+    def uncurry(self):
+        """
+        Uncurry this application expression
+
+        :return: A tuple (base-function, arg-list)
+        """
+        function = self.function
+        args = [self.argument]
+        while isinstance(function, ApplicationExpression):
+            #(\x.\y.sees(x,y)(john))(mary)
+            args.insert(0, function.argument)
+            function = function.function
+        return (function, args)
+
+    @property
+    def pred(self):
+        """
+        Return uncurried base-function.
+        If this is an atom, then the result will be a variable expression.
+        Otherwise, it will be a lambda expression.
+        """
+        return self.uncurry()[0]
+
+    @property
+    def args(self):
+        """
+        Return uncurried arg-list
+        """
+        return self.uncurry()[1]
+
+    def is_atom(self):
+        """
+        Is this expression an atom (as opposed to a lambda expression applied
+        to a term)?
+        """
+        return isinstance(self.pred, AbstractVariableExpression)
+
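+# Illustrative sketch, not part of the original module: uncurrying an atomic
+# application recovers the base predicate and its argument list.
+#     e = Expression.fromstring('see(john,mary)')
+#     e.uncurry()      # (see, [john, mary])
+#     e.pred, e.args   # see, [john, mary]
+#     e.is_atom()      # True: the base function is a constant, not a lambda
+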
+
+@total_ordering
+@python_2_unicode_compatible
+class AbstractVariableExpression(Expression):
+    """This class represents a variable to be used as a predicate or entity"""
+    def __init__(self, variable):
+        """
+        :param variable: ``Variable``, for the variable
+        """
+        assert isinstance(variable, Variable), "%s is not a Variable" % variable
+        self.variable = variable
+
+    def simplify(self):
+        return self
+
+    def replace(self, variable, expression, replace_bound=False, alpha_convert=True):
+        """:see: Expression.replace()"""
+        assert isinstance(variable, Variable), "%s is not an Variable" % variable
+        assert isinstance(expression, Expression), "%s is not an Expression" % expression
+        if self.variable == variable:
+            return expression
+        else:
+            return self
+
+    def _set_type(self, other_type=ANY_TYPE, signature=None):
+        """:see Expression._set_type()"""
+        assert isinstance(other_type, Type)
+
+        if signature is None:
+            signature = defaultdict(list)
+
+        resolution = other_type
+        for varEx in signature[self.variable.name]:
+            resolution = varEx.type.resolve(resolution)
+            if not resolution:
+                raise InconsistentTypeHierarchyException(self)
+
+        signature[self.variable.name].append(self)
+        for varEx in signature[self.variable.name]:
+            varEx.type = resolution
+
+    def findtype(self, variable):
+        """:see Expression.findtype()"""
+        assert isinstance(variable, Variable), "%s is not a Variable" % variable
+        if self.variable == variable:
+            return self.type
+        else:
+            return ANY_TYPE
+
+    def predicates(self):
+        """:see: Expression.predicates()"""
+        return set()
+
+    def __eq__(self, other):
+        """Allow equality between instances of ``AbstractVariableExpression``
+        subtypes."""
+        return isinstance(other, AbstractVariableExpression) and \
+               self.variable == other.variable
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        if not isinstance(other, AbstractVariableExpression):
+            raise TypeError
+        return self.variable < other.variable
+
+    __hash__ = Expression.__hash__
+
+    def __str__(self):
+        return "%s" % self.variable
+
+class IndividualVariableExpression(AbstractVariableExpression):
+    """This class represents variables that take the form of a single lowercase
+    character (other than 'e') followed by zero or more digits."""
+    def _set_type(self, other_type=ANY_TYPE, signature=None):
+        """:see Expression._set_type()"""
+        assert isinstance(other_type, Type)
+
+        if signature is None:
+            signature = defaultdict(list)
+
+        if not other_type.matches(ENTITY_TYPE):
+            raise IllegalTypeException(self, other_type, ENTITY_TYPE)
+
+        signature[self.variable.name].append(self)
+
+    def _get_type(self): return ENTITY_TYPE
+    type = property(_get_type, _set_type)
+
+    def free(self):
+        """:see: Expression.free()"""
+        return set([self.variable])
+
+    def constants(self):
+        """:see: Expression.constants()"""
+        return set()
+
+class FunctionVariableExpression(AbstractVariableExpression):
+    """This class represents variables that take the form of a single uppercase
+    character followed by zero or more digits."""
+    type = ANY_TYPE
+
+    def free(self):
+        """:see: Expression.free()"""
+        return set([self.variable])
+
+    def constants(self):
+        """:see: Expression.constants()"""
+        return set()
+
+class EventVariableExpression(IndividualVariableExpression):
+    """This class represents variables that take the form of a single lowercase
+    'e' character followed by zero or more digits."""
+    type = EVENT_TYPE
+
+class ConstantExpression(AbstractVariableExpression):
+    """This class represents variables that do not take the form of a single
+    character followed by zero or more digits."""
+    type = ENTITY_TYPE
+
+    def _set_type(self, other_type=ANY_TYPE, signature=None):
+        """:see Expression._set_type()"""
+        assert isinstance(other_type, Type)
+
+        if signature is None:
+            signature = defaultdict(list)
+
+        if other_type == ANY_TYPE:
+            #entity type by default, for individuals
+            resolution = ENTITY_TYPE
+        else:
+            resolution = other_type
+            if self.type != ENTITY_TYPE:
+                resolution = resolution.resolve(self.type)
+
+        for varEx in signature[self.variable.name]:
+            resolution = varEx.type.resolve(resolution)
+            if not resolution:
+                raise InconsistentTypeHierarchyException(self)
+
+        signature[self.variable.name].append(self)
+        for varEx in signature[self.variable.name]:
+            varEx.type = resolution
+
+    def free(self):
+        """:see: Expression.free()"""
+        return set()
+
+    def constants(self):
+        """:see: Expression.constants()"""
+        return set([self.variable])
+
+
+def VariableExpression(variable):
+    """
+    This is a factory method that instantiates and returns a subtype of
+    ``AbstractVariableExpression`` appropriate for the given variable.
+    """
+    assert isinstance(variable, Variable), "%s is not a Variable" % variable
+    if is_indvar(variable.name):
+        return IndividualVariableExpression(variable)
+    elif is_funcvar(variable.name):
+        return FunctionVariableExpression(variable)
+    elif is_eventvar(variable.name):
+        return EventVariableExpression(variable)
+    else:
+        return ConstantExpression(variable)
+
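+# A minimal usage sketch of the factory above (illustration only): the
+# subclass is chosen purely from the variable's name, via the is_indvar /
+# is_funcvar / is_eventvar helpers defined later in this module.
+#
+#     >>> type(VariableExpression(Variable('x1'))).__name__
+#     'IndividualVariableExpression'
+#     >>> type(VariableExpression(Variable('F'))).__name__
+#     'FunctionVariableExpression'
+#     >>> type(VariableExpression(Variable('e2'))).__name__
+#     'EventVariableExpression'
+#     >>> type(VariableExpression(Variable('john'))).__name__
+#     'ConstantExpression'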
+
+class VariableBinderExpression(Expression):
+    """This an abstract class for any Expression that binds a variable in an
+    Expression.  This includes LambdaExpressions and Quantified Expressions"""
+    def __init__(self, variable, term):
+        """
+        :param variable: ``Variable``, for the variable
+        :param term: ``Expression``, for the term
+        """
+        assert isinstance(variable, Variable), "%s is not a Variable" % variable
+        assert isinstance(term, Expression), "%s is not an Expression" % term
+        self.variable = variable
+        self.term = term
+
+    def replace(self, variable, expression, replace_bound=False, alpha_convert=True):
+        """:see: Expression.replace()"""
+        assert isinstance(variable, Variable), "%s is not a Variable" % variable
+        assert isinstance(expression, Expression), "%s is not an Expression" % expression
+        #if the bound variable is the thing being replaced
+        if self.variable == variable:
+            if replace_bound:
+                assert isinstance(expression, AbstractVariableExpression),\
+                       "%s is not a AbstractVariableExpression" % expression
+                return self.__class__(expression.variable,
+                                      self.term.replace(variable, expression, True, alpha_convert))
+            else:
+                return self
+        else:
+            # if the bound variable appears in the expression, then it must
+            # be alpha converted to avoid a conflict
+            if alpha_convert and self.variable in expression.free():
+                self = self.alpha_convert(unique_variable(pattern=self.variable))
+
+            #replace in the term
+            return self.__class__(self.variable,
+                                  self.term.replace(variable, expression, replace_bound, alpha_convert))
+
+    def alpha_convert(self, newvar):
+        """Rename all occurrences of the variable introduced by this variable
+        binder in the expression to ``newvar``.
+        :param newvar: ``Variable``, for the new variable
+        """
+        assert isinstance(newvar, Variable), "%s is not a Variable" % newvar
+        return self.__class__(newvar,
+                              self.term.replace(self.variable,
+                                                VariableExpression(newvar),
+                                                True))
+
+    def free(self):
+        """:see: Expression.free()"""
+        return self.term.free() - set([self.variable])
+
+    def findtype(self, variable):
+        """:see Expression.findtype()"""
+        assert isinstance(variable, Variable), "%s is not a Variable" % variable
+        if variable == self.variable:
+            return ANY_TYPE
+        else:
+            return self.term.findtype(variable)
+
+    def visit(self, function, combinator):
+        """:see: Expression.visit()"""
+        return combinator([function(self.term)])
+
+    def visit_structured(self, function, combinator):
+        """:see: Expression.visit_structured()"""
+        return combinator(self.variable, function(self.term))
+
+    def __eq__(self, other):
+        r"""Defines equality modulo alphabetic variance.  If we are comparing
+        \x.M  and \y.N, then check equality of M and N[x/y]."""
+        if isinstance(self, other.__class__) or \
+           isinstance(other, self.__class__):
+            if self.variable == other.variable:
+                return self.term == other.term
+            else:
+                # Comparing \x.M  and \y.N.  Relabel y in N with x and continue.
+                varex = VariableExpression(self.variable)
+                return self.term == other.term.replace(other.variable, varex)
+        else:
+            return False
+
+    def __ne__(self, other):
+        return not self == other
+
+    __hash__ = Expression.__hash__
+
+
+@python_2_unicode_compatible
+class LambdaExpression(VariableBinderExpression):
+    @property
+    def type(self):
+        return ComplexType(self.term.findtype(self.variable),
+                           self.term.type)
+
+    def _set_type(self, other_type=ANY_TYPE, signature=None):
+        """:see Expression._set_type()"""
+        assert isinstance(other_type, Type)
+
+        if signature is None:
+            signature = defaultdict(list)
+
+        self.term._set_type(other_type.second, signature)
+        if not self.type.resolve(other_type):
+            raise TypeResolutionException(self, other_type)
+
+    def __str__(self):
+        variables = [self.variable]
+        term = self.term
+        while term.__class__ == self.__class__:
+            variables.append(term.variable)
+            term = term.term
+        return Tokens.LAMBDA + ' '.join("%s" % v for v in variables) + \
+               Tokens.DOT + "%s" % term
+
+
+@python_2_unicode_compatible
+class QuantifiedExpression(VariableBinderExpression):
+    @property
+    def type(self): return TRUTH_TYPE
+
+    def _set_type(self, other_type=ANY_TYPE, signature=None):
+        """:see Expression._set_type()"""
+        assert isinstance(other_type, Type)
+
+        if signature is None:
+            signature = defaultdict(list)
+
+        if not other_type.matches(TRUTH_TYPE):
+            raise IllegalTypeException(self, other_type, TRUTH_TYPE)
+        self.term._set_type(TRUTH_TYPE, signature)
+
+    def __str__(self):
+        variables = [self.variable]
+        term = self.term
+        while term.__class__ == self.__class__:
+            variables.append(term.variable)
+            term = term.term
+        return self.getQuantifier() + ' ' + ' '.join("%s" % v for v in variables) + \
+               Tokens.DOT + "%s" % term
+
+class ExistsExpression(QuantifiedExpression):
+    def getQuantifier(self):
+        return Tokens.EXISTS
+
+class AllExpression(QuantifiedExpression):
+    def getQuantifier(self):
+        return Tokens.ALL
+
+
+@python_2_unicode_compatible
+class NegatedExpression(Expression):
+    def __init__(self, term):
+        assert isinstance(term, Expression), "%s is not an Expression" % term
+        self.term = term
+
+    @property
+    def type(self): return TRUTH_TYPE
+
+    def _set_type(self, other_type=ANY_TYPE, signature=None):
+        """:see Expression._set_type()"""
+        assert isinstance(other_type, Type)
+
+        if signature is None:
+            signature = defaultdict(list)
+
+        if not other_type.matches(TRUTH_TYPE):
+            raise IllegalTypeException(self, other_type, TRUTH_TYPE)
+        self.term._set_type(TRUTH_TYPE, signature)
+
+    def findtype(self, variable):
+        assert isinstance(variable, Variable), "%s is not a Variable" % variable
+        return self.term.findtype(variable)
+
+    def visit(self, function, combinator):
+        """:see: Expression.visit()"""
+        return combinator([function(self.term)])
+
+    def negate(self):
+        """:see: Expression.negate()"""
+        return self.term
+
+    def __eq__(self, other):
+        return isinstance(other, NegatedExpression) and self.term == other.term
+
+    def __ne__(self, other):
+        return not self == other
+
+    __hash__ = Expression.__hash__
+
+    def __str__(self):
+        return Tokens.NOT + "%s" % self.term
+
+
+@python_2_unicode_compatible
+class BinaryExpression(Expression):
+    def __init__(self, first, second):
+        assert isinstance(first, Expression), "%s is not an Expression" % first
+        assert isinstance(second, Expression), "%s is not an Expression" % second
+        self.first = first
+        self.second = second
+
+    @property
+    def type(self): return TRUTH_TYPE
+
+    def findtype(self, variable):
+        """:see Expression.findtype()"""
+        assert isinstance(variable, Variable), "%s is not a Variable" % variable
+        f = self.first.findtype(variable)
+        s = self.second.findtype(variable)
+        if f == s or s == ANY_TYPE:
+            return f
+        elif f == ANY_TYPE:
+            return s
+        else:
+            return ANY_TYPE
+
+    def visit(self, function, combinator):
+        """:see: Expression.visit()"""
+        return combinator([function(self.first), function(self.second)])
+
+    def __eq__(self, other):
+        return (isinstance(self, other.__class__) or \
+                isinstance(other, self.__class__)) and \
+               self.first == other.first and self.second == other.second
+
+    def __ne__(self, other):
+        return not self == other
+
+    __hash__ = Expression.__hash__
+
+    def __str__(self):
+        first = self._str_subex(self.first)
+        second = self._str_subex(self.second)
+        return Tokens.OPEN + first + ' ' + self.getOp() \
+                + ' ' + second + Tokens.CLOSE
+
+    def _str_subex(self, subex):
+        return "%s" % subex
+
+
+class BooleanExpression(BinaryExpression):
+    def _set_type(self, other_type=ANY_TYPE, signature=None):
+        """:see Expression._set_type()"""
+        assert isinstance(other_type, Type)
+
+        if signature is None:
+            signature = defaultdict(list)
+
+        if not other_type.matches(TRUTH_TYPE):
+            raise IllegalTypeException(self, other_type, TRUTH_TYPE)
+        self.first._set_type(TRUTH_TYPE, signature)
+        self.second._set_type(TRUTH_TYPE, signature)
+
+class AndExpression(BooleanExpression):
+    """This class represents conjunctions"""
+    def getOp(self):
+        return Tokens.AND
+
+    def _str_subex(self, subex):
+        s = "%s" % subex
+        if isinstance(subex, AndExpression):
+            return s[1:-1]
+        return s
+
+class OrExpression(BooleanExpression):
+    """This class represents disjunctions"""
+    def getOp(self):
+        return Tokens.OR
+
+    def _str_subex(self, subex):
+        s = "%s" % subex
+        if isinstance(subex, OrExpression):
+            return s[1:-1]
+        return s
+
+class ImpExpression(BooleanExpression):
+    """This class represents implications"""
+    def getOp(self):
+        return Tokens.IMP
+
+class IffExpression(BooleanExpression):
+    """This class represents biconditionals"""
+    def getOp(self):
+        return Tokens.IFF
+
+
+class EqualityExpression(BinaryExpression):
+    """This class represents equality expressions like "(x = y)"."""
+    def _set_type(self, other_type=ANY_TYPE, signature=None):
+        """:see Expression._set_type()"""
+        assert isinstance(other_type, Type)
+
+        if signature is None:
+            signature = defaultdict(list)
+
+        if not other_type.matches(TRUTH_TYPE):
+            raise IllegalTypeException(self, other_type, TRUTH_TYPE)
+        self.first._set_type(ENTITY_TYPE, signature)
+        self.second._set_type(ENTITY_TYPE, signature)
+
+    def getOp(self):
+        return Tokens.EQ
+
+
+### Utilities
+
+class LogicalExpressionException(Exception):
+    def __init__(self, index, message):
+        self.index = index
+        Exception.__init__(self, message)
+
+class UnexpectedTokenException(LogicalExpressionException):
+    def __init__(self, index, unexpected=None, expected=None, message=None):
+        if unexpected and expected:
+            msg = "Unexpected token: '%s'.  " \
+                  "Expected token '%s'." % (unexpected, expected)
+        elif unexpected:
+            msg = "Unexpected token: '%s'." % unexpected
+            if message:
+                msg += '  '+message
+        else:
+            msg = "Expected token '%s'." % expected
+        LogicalExpressionException.__init__(self, index, msg)
+
+class ExpectedMoreTokensException(LogicalExpressionException):
+    def __init__(self, index, message=None):
+        if not message:
+            message = 'More tokens expected.'
+        LogicalExpressionException.__init__(self, index, 'End of input found.  ' + message)
+
+
+def is_indvar(expr):
+    """
+    An individual variable must be a single lowercase character other than 'e',
+    followed by zero or more digits.
+
+    :param expr: str
+    :return: bool True if expr is of the correct form
+    """
+    assert isinstance(expr, string_types), "%s is not a string" % expr
+    return re.match(r'^[a-df-z]\d*$', expr) is not None
+
+def is_funcvar(expr):
+    """
+    A function variable must be a single uppercase character followed by
+    zero or more digits.
+
+    :param expr: str
+    :return: bool True if expr is of the correct form
+    """
+    assert isinstance(expr, string_types), "%s is not a string" % expr
+    return re.match(r'^[A-Z]\d*$', expr) is not None
+
+def is_eventvar(expr):
+    """
+    An event variable must be a single lowercase 'e' character followed by
+    zero or more digits.
+
+    :param expr: str
+    :return: bool True if expr is of the correct form
+    """
+    assert isinstance(expr, string_types), "%s is not a string" % expr
+    return re.match(r'^e\d*$', expr) is not None
+
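+# A quick illustration (informal, not a collected doctest) of the three name
+# classes recognised above:
+#
+#     >>> is_indvar('x1'), is_funcvar('F12'), is_eventvar('e2')
+#     (True, True, True)
+#     >>> is_indvar('e2'), is_indvar('john'), is_funcvar('f')
+#     (False, False, False)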
+
+def demo():
+    lexpr = Expression.fromstring
+    print('='*20 + 'Test reader' + '='*20)
+    print(lexpr(r'john'))
+    print(lexpr(r'man(x)'))
+    print(lexpr(r'-man(x)'))
+    print(lexpr(r'(man(x) & tall(x) & walks(x))'))
+    print(lexpr(r'exists x.(man(x) & tall(x) & walks(x))'))
+    print(lexpr(r'\x.man(x)'))
+    print(lexpr(r'\x.man(x)(john)'))
+    print(lexpr(r'\x y.sees(x,y)'))
+    print(lexpr(r'\x y.sees(x,y)(a,b)'))
+    print(lexpr(r'(\x.exists y.walks(x,y))(x)'))
+    print(lexpr(r'exists x.x = y'))
+    print(lexpr(r'exists x.(x = y)'))
+    print(lexpr('P(x) & x=y & P(y)'))
+    print(lexpr(r'\P Q.exists x.(P(x) & Q(x))'))
+    print(lexpr(r'man(x) <-> tall(x)'))
+
+    print('='*20 + 'Test simplify' + '='*20)
+    print(lexpr(r'\x.\y.sees(x,y)(john)(mary)').simplify())
+    print(lexpr(r'\x.\y.sees(x,y)(john, mary)').simplify())
+    print(lexpr(r'all x.(man(x) & (\x.exists y.walks(x,y))(x))').simplify())
+    print(lexpr(r'(\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x))(\x.bark(x))').simplify())
+
+    print('='*20 + 'Test alpha conversion and binder expression equality' + '='*20)
+    e1 = lexpr('exists x.P(x)')
+    print(e1)
+    e2 = e1.alpha_convert(Variable('z'))
+    print(e2)
+    print(e1 == e2)
+
+def demo_errors():
+    print('='*20 + 'Test reader errors' + '='*20)
+    demoException('(P(x) & Q(x)')
+    demoException('((P(x) &) & Q(x))')
+    demoException('P(x) -> ')
+    demoException('P(x')
+    demoException('P(x,')
+    demoException('P(x,)')
+    demoException('exists')
+    demoException('exists x.')
+    demoException('\\')
+    demoException('\\ x y.')
+    demoException('P(x)Q(x)')
+    demoException('(P(x)Q(x)')
+    demoException('exists x -> y')
+
+def demoException(s):
+    try:
+        Expression.fromstring(s)
+    except LogicalExpressionException as e:
+        print("%s: %s" % (e.__class__.__name__, e))
+
+def printtype(ex):
+    print("%s : %s" % (ex.str(), ex.type))
+
+if __name__ == '__main__':
+    demo()
+#    demo_errors()
diff --git a/nlp_resource_data/nltk/sem/logic.pyc b/nlp_resource_data/nltk/sem/logic.pyc
new file mode 100755 (executable)
index 0000000..e52d381
Binary files /dev/null and b/nlp_resource_data/nltk/sem/logic.pyc differ
diff --git a/nlp_resource_data/nltk/sem/relextract.py b/nlp_resource_data/nltk/sem/relextract.py
new file mode 100755 (executable)
index 0000000..a54b5aa
--- /dev/null
@@ -0,0 +1,476 @@
+# Natural Language Toolkit: Relation Extraction
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Code for extracting relational triples from the ieer and conll2002 corpora.
+
+Relations are stored internally as dictionaries ('reldicts').
+
+The two serialization outputs are "rtuple" and "clause".
+
+- An rtuple is a tuple of the form ``(subj, filler, obj)``,
+  where ``subj`` and ``obj`` are pairs of Named Entity mentions, and ``filler`` is the string of words
+  occurring between ``sub`` and ``obj`` (with no intervening NEs). Strings are printed via ``repr()`` to
+  circumvent locale variations in rendering utf-8 encoded strings.
+- A clause is an atom of the form ``relsym(subjsym, objsym)``,
+  where the relation, subject and object have been canonicalized to single strings.
+"""
+from __future__ import print_function
+
+# todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs?
+
+from collections import defaultdict
+import re
+
+from six.moves import html_entities
+
+# Dictionary that associates corpora with NE classes
+NE_CLASSES = {
+    'ieer': ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION',
+            'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'],
+    'conll2002': ['LOC', 'PER', 'ORG'],
+    'ace': ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION',
+            'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE', 'FACILITY', 'GPE'],
+    }
+
+# Allow abbreviated class labels
+short2long = dict(LOC = 'LOCATION', ORG = 'ORGANIZATION', PER = 'PERSON')
+long2short = dict(LOCATION ='LOC', ORGANIZATION = 'ORG', PERSON = 'PER')
+
+
+def _expand(type):
+    """
+    Expand an NE class name.
+    :type type: str
+    :rtype: str
+    """
+    try:
+        return short2long[type]
+    except KeyError:
+        return type
+
+def class_abbrev(type):
+    """
+    Abbreviate an NE class name.
+    :type type: str
+    :rtype: str
+    """
+    try:
+        return long2short[type]
+    except KeyError:
+        return type
+
+
+def _join(lst, sep=' ', untag=False):
+    """
+    Join a list into a string, turning tag tuples into tag strings or just words.
+    :param untag: if ``True``, omit the tag from tagged input strings.
+    :type lst: list
+    :rtype: str
+    """
+    try:
+        return sep.join(lst)
+    except TypeError:
+        if untag:
+            return sep.join(tup[0] for tup in lst)
+        from nltk.tag import tuple2str
+        return sep.join(tuple2str(tup) for tup in lst)
+
+def descape_entity(m, defs=html_entities.entitydefs):
+    """
+    Translate one entity to its ISO Latin value.
+    Inspired by example from effbot.org
+
+
+    """
+    #s = 'mcglashan_&amp;_sarrail'
+    #l = ['mcglashan', '&amp;', 'sarrail']
+    #pattern = re.compile("&(\w+?);")
+    #new = list2sym(l)
+    #s = pattern.sub(descape_entity, s)
+    #print s, new
+    try:
+        return defs[m.group(1)]
+
+    except KeyError:
+        return m.group(0) # use as is
+
+def list2sym(lst):
+    """
+    Convert a list of strings into a canonical symbol.
+    :type lst: list
+    :return: a Unicode string without whitespace
+    :rtype: unicode
+    """
+    sym = _join(lst, '_', untag=True)
+    sym = sym.lower()
+    ENT = re.compile("&(\w+?);")
+    sym = ENT.sub(descape_entity, sym)
+    sym = sym.replace('.', '')
+    return sym
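+
+# For example (illustration only), reusing the entity string mentioned in the
+# commented example inside descape_entity() above:
+#
+#     >>> list2sym(['McGlashan', '&amp;', 'Sarrail'])
+#     'mcglashan_&_sarrail'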
+
+def tree2semi_rel(tree):
+    """
+    Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).
+
+    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
+    identifies pairs whose first member is a list (possibly empty) of terminal
+    strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).
+
+    :param tree: a chunk tree
+    :return: a list of pairs (list(str), ``Tree``)
+    :rtype: list of tuple
+    """
+
+    from nltk.tree import Tree
+
+    semi_rels = []
+    semi_rel = [[], None]
+
+    for dtr in tree:
+        if not isinstance(dtr, Tree):
+            semi_rel[0].append(dtr)
+        else:
+            # dtr is a Tree
+            semi_rel[1] = dtr
+            semi_rels.append(semi_rel)
+            semi_rel = [[], None]
+    return semi_rels
+
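+# A small illustration (not a definitive recipe) with a hand-built chunk tree;
+# each NE subtree closes one (filler-words, Tree) pair:
+#
+#     >>> from nltk.tree import Tree
+#     >>> t = Tree('S', [Tree('PERSON', [('John', 'NNP')]),
+#     ...                ('works', 'VBZ'), ('for', 'IN'),
+#     ...                Tree('ORGANIZATION', [('IBM', 'NNP')])])
+#     >>> [(words, ne.label()) for words, ne in tree2semi_rel(t)]
+#     [([], 'PERSON'), ([('works', 'VBZ'), ('for', 'IN')], 'ORGANIZATION')]
+#
+# Note that semi_rel2reldict() below only yields reldicts while at least three
+# pairs remain (the third pair supplies the right context).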
+
+def semi_rel2reldict(pairs, window=5, trace=False):
+    """
+    Converts the pairs generated by ``tree2semi_rel`` into a 'reldict': a dictionary which
+    stores information about the subject and object NEs plus the filler between them.
+    Additionally, a left and right context of length <= window are captured (within
+    a given input sentence).
+
+    :param pairs: a pair of list(str) and ``Tree``, as generated by ``tree2semi_rel()``
+    :param window: a threshold for the number of items to include in the left and right context
+    :type window: int
+    :return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym', 'filler', 'objclass', 'objtext', 'objsym' and 'rcon'
+    :rtype: list(defaultdict)
+    """
+    result = []
+    while len(pairs) > 2:
+        reldict = defaultdict(str)
+        reldict['lcon'] = _join(pairs[0][0][-window:])
+        reldict['subjclass'] = pairs[0][1].label()
+        reldict['subjtext'] = _join(pairs[0][1].leaves())
+        reldict['subjsym'] = list2sym(pairs[0][1].leaves())
+        reldict['filler'] = _join(pairs[1][0])
+        reldict['untagged_filler'] = _join(pairs[1][0], untag=True)
+        reldict['objclass'] = pairs[1][1].label()
+        reldict['objtext'] = _join(pairs[1][1].leaves())
+        reldict['objsym'] = list2sym(pairs[1][1].leaves())
+        reldict['rcon'] = _join(pairs[2][0][:window])
+        if trace:
+            print("(%s(%s, %s)" % (reldict['untagged_filler'], reldict['subjclass'], reldict['objclass']))
+        result.append(reldict)
+        pairs = pairs[1:]
+    return result
+
+def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10):
+    """
+    Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern.
+
+    The parameters ``subjclass`` and ``objclass`` can be used to restrict the
+    Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION',
+    'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE').
+
+    :param subjclass: the class of the subject Named Entity.
+    :type subjclass: str
+    :param objclass: the class of the object Named Entity.
+    :type objclass: str
+    :param doc: input document
+    :type doc: ieer document or a list of chunk trees
+    :param corpus: name of the corpus to take as input; possible values are
+        'ieer', 'conll2002' and 'ace'
+    :type corpus: str
+    :param pattern: a regular expression for filtering the fillers of
+        retrieved triples.
+    :type pattern: SRE_Pattern
+    :param window: maximum number of tokens allowed in the filler; fillers that exceed this threshold are discarded
+    :type window: int
+    :return: see ``semi_rel2reldict()``
+    :rtype: list(defaultdict)
+    """
+
+    if subjclass and subjclass not in NE_CLASSES[corpus]:
+        if _expand(subjclass) in NE_CLASSES[corpus]:
+            subjclass = _expand(subjclass)
+        else:
+            raise ValueError("your value for the subject type has not been recognized: %s" % subjclass)
+    if objclass and objclass not in NE_CLASSES[corpus]:
+        if _expand(objclass) in NE_CLASSES[corpus]:
+            objclass = _expand(objclass)
+        else:
+            raise ValueError("your value for the object type has not been recognized: %s" % objclass)
+
+    if corpus == 'ace' or corpus == 'conll2002':
+        pairs = tree2semi_rel(doc)
+    elif corpus == 'ieer':
+        pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline)
+    else:
+        raise ValueError("corpus type not recognized")
+
+    reldicts = semi_rel2reldict(pairs)
+
+    relfilter = lambda x: (x['subjclass'] == subjclass and
+                           len(x['filler'].split()) <= window and
+                           (pattern is None or pattern.match(x['filler'])) and
+                           x['objclass'] == objclass)
+
+    return list(filter(relfilter, reldicts))
+
+
+def rtuple(reldict, lcon=False, rcon=False):
+    """
+    Pretty print the reldict as an rtuple.
+    :param reldict: a relation dictionary
+    :type reldict: defaultdict
+    """
+    items = [class_abbrev(reldict['subjclass']), reldict['subjtext'], reldict['filler'], class_abbrev(reldict['objclass']), reldict['objtext']]
+    format = '[%s: %r] %r [%s: %r]'
+    if lcon:
+        items = [reldict['lcon']] + items
+        format = '...%r)' + format
+    if rcon:
+        items.append(reldict['rcon'])
+        format = format + '(%r...'
+    printargs = tuple(items)
+    return format % printargs
+
+def clause(reldict, relsym):
+    """
+    Print the relation in clausal form.
+    :param reldict: a relation dictionary
+    :type reldict: defaultdict
+    :param relsym: a label for the relation
+    :type relsym: str
+    """
+    items = (relsym, reldict['subjsym'], reldict['objsym'])
+    return "%s(%r, %r)" % items
+
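+# A minimal formatting sketch (illustration only) with a hand-built reldict,
+# using just the keys these two helpers read:
+#
+#     >>> from collections import defaultdict
+#     >>> r = defaultdict(str)
+#     >>> r.update(subjclass='PERSON', subjtext='John', subjsym='john',
+#     ...          filler='works for', objclass='ORGANIZATION',
+#     ...          objtext='IBM', objsym='ibm')
+#     >>> rtuple(r)
+#     "[PER: 'John'] 'works for' [ORG: 'IBM']"
+#     >>> clause(r, relsym='WORKS_FOR')
+#     "WORKS_FOR('john', 'ibm')"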
+
+#######################################################
+# Demos of relation extraction with regular expressions
+#######################################################
+
+############################################
+# Example of in(ORG, LOC)
+############################################
+def in_demo(trace=0, sql=True):
+    """
+    Select pairs of organizations and locations whose mentions occur with an
+    intervening occurrence of the preposition "in".
+
+    If the sql parameter is set to True, then the entity pairs are loaded into
+    an in-memory database, and subsequently pulled out using an SQL "SELECT"
+    query.
+    """
+    from nltk.corpus import ieer
+    if sql:
+        try:
+            import sqlite3
+            connection =  sqlite3.connect(":memory:")
+            connection.text_factory = sqlite3.OptimizedUnicode
+            cur = connection.cursor()
+            cur.execute("""create table Locations
+            (OrgName text, LocationName text, DocID text)""")
+        except ImportError:
+            import warnings
+            warnings.warn("Cannot import sqlite; sql flag will be ignored.")
+
+
+    IN = re.compile(r'.*\bin\b(?!\b.+ing)')
+
+    print()
+    print("IEER: in(ORG, LOC) -- just the clauses:")
+    print("=" * 45)
+
+    for file in ieer.fileids():
+        for doc in ieer.parsed_docs(file):
+            if trace:
+                print(doc.docno)
+                print("=" * 15)
+            for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
+                print(clause(rel, relsym='IN'))
+                if sql:
+                    try:
+                        rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
+                        cur.execute("""insert into Locations
+                                    values (?, ?, ?)""", rtuple)
+                        connection.commit()
+                    except NameError:
+                        pass
+
+    if sql:
+        try:
+            cur.execute("""select OrgName from Locations
+                        where LocationName = 'Atlanta'""")
+            print()
+            print("Extract data from SQL table: ORGs in Atlanta")
+            print("-" * 15)
+            for row in cur:
+                print(row)
+        except NameError:
+            pass
+
+
+############################################
+# Example of has_role(PER, LOC)
+############################################
+
+def roles_demo(trace=0):
+    from nltk.corpus import ieer
+    roles = """
+    (.*(                   # assorted roles
+    analyst|
+    chair(wo)?man|
+    commissioner|
+    counsel|
+    director|
+    economist|
+    editor|
+    executive|
+    foreman|
+    governor|
+    head|
+    lawyer|
+    leader|
+    librarian).*)|
+    manager|
+    partner|
+    president|
+    producer|
+    professor|
+    researcher|
+    spokes(wo)?man|
+    writer|
+    ,\sof\sthe?\s*  # "X, of (the) Y"
+    """
+    ROLES = re.compile(roles, re.VERBOSE)
+
+    print()
+    print("IEER: has_role(PER, ORG) -- raw rtuples:")
+    print("=" * 45)
+
+    for file in ieer.fileids():
+        for doc in ieer.parsed_docs(file):
+            lcon = rcon = False
+            if trace:
+                print(doc.docno)
+                print("=" * 15)
+                lcon = rcon = True
+            for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
+                print(rtuple(rel, lcon=lcon, rcon=rcon))
+
+
+##############################################
+### Show what's in the IEER Headlines
+##############################################
+
+
+def ieer_headlines():
+
+    from nltk.corpus import ieer
+    from nltk.tree import Tree
+
+    print("IEER: First 20 Headlines")
+    print("=" * 45)
+
+    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
+    for tree in trees[:20]:
+        print()
+        print("%s:\n%s" % tree)
+
+
+
+#############################################
+## Dutch CONLL2002: take_on_role(PER, ORG)
+#############################################
+
+def conllned(trace=1):
+    """
+    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
+    from CoNLL 2002.
+    """
+
+    from nltk.corpus import conll2002
+
+    vnv = """
+    (
+    is/V|    # 3rd sing present and
+    was/V|   # past forms of the verb zijn ('be')
+    werd/V|  # and past
+    wordt/V  # and present forms of worden ('become')
+    )
+    .*       # followed by anything
+    van/Prep # followed by van ('of')
+    """
+    VAN = re.compile(vnv, re.VERBOSE)
+
+    print()
+    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
+    print("=" * 45)
+
+
+    for doc in conll2002.chunked_sents('ned.train'):
+        lcon = rcon = False
+        if trace:
+            lcon = rcon = True
+        for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN, window=10):
+            print(rtuple(rel, lcon=lcon, rcon=rcon))
+
+#############################################
+## Spanish CONLL2002: de(ORG, LOC)
+#############################################
+
+def conllesp():
+    from nltk.corpus import conll2002
+
+    de = """
+    .*
+    (
+    de/SP|
+    del/SP
+    )
+    """
+    DE = re.compile(de, re.VERBOSE)
+
+    print()
+    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
+    print("=" * 45)
+    rels = [rel for doc in conll2002.chunked_sents('esp.train')
+            for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
+    for r in rels[:10]: print(clause(r, relsym='DE'))
+    print()
+
+
+def ne_chunked():
+    print()
+    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
+    print("=" * 45)
+    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
+    rels = []
+    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
+        sent = nltk.ne_chunk(sent)
+        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
+        for rel in rels:
+            print('{0:<5}{1}'.format(i, rtuple(rel)))
+
+
+if __name__ == '__main__':
+    import nltk
+    from nltk.sem import relextract
+    in_demo(trace=0)
+    roles_demo(trace=0)
+    conllned()
+    conllesp()
+    ieer_headlines()
+    ne_chunked()
diff --git a/nlp_resource_data/nltk/sem/relextract.pyc b/nlp_resource_data/nltk/sem/relextract.pyc
new file mode 100755 (executable)
index 0000000..98fa9b2
Binary files /dev/null and b/nlp_resource_data/nltk/sem/relextract.pyc differ
diff --git a/nlp_resource_data/nltk/sem/skolemize.py b/nlp_resource_data/nltk/sem/skolemize.py
new file mode 100755 (executable)
index 0000000..1c5c03f
--- /dev/null
@@ -0,0 +1,101 @@
+# Natural Language Toolkit: Semantic Interpretation
+#
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from nltk.sem.logic import (AllExpression, AndExpression, ApplicationExpression,
+                            EqualityExpression, ExistsExpression, IffExpression,
+                            ImpExpression, NegatedExpression, OrExpression,
+                            VariableExpression, skolem_function, unique_variable)
+
+def skolemize(expression, univ_scope=None, used_variables=None):
+    """
+    Skolemize the expression and convert to conjunctive normal form (CNF)
+    """
+    if univ_scope is None:
+        univ_scope = set()
+    if used_variables is None:
+        used_variables = set()
+
+    if isinstance(expression, AllExpression):
+        term = skolemize(expression.term, univ_scope|set([expression.variable]), used_variables|set([expression.variable]))
+        return term.replace(expression.variable, VariableExpression(unique_variable(ignore=used_variables)))
+    elif isinstance(expression, AndExpression):
+        return skolemize(expression.first, univ_scope, used_variables) &\
+               skolemize(expression.second, univ_scope, used_variables)
+    elif isinstance(expression, OrExpression):
+        return to_cnf(skolemize(expression.first, univ_scope, used_variables),
+                      skolemize(expression.second, univ_scope, used_variables))
+    elif isinstance(expression, ImpExpression):
+        return to_cnf(skolemize(-expression.first, univ_scope, used_variables),
+                      skolemize(expression.second, univ_scope, used_variables))
+    elif isinstance(expression, IffExpression):
+        return to_cnf(skolemize(-expression.first, univ_scope, used_variables),
+                      skolemize(expression.second, univ_scope, used_variables)) &\
+               to_cnf(skolemize(expression.first, univ_scope, used_variables),
+                      skolemize(-expression.second, univ_scope, used_variables))
+    elif isinstance(expression, EqualityExpression):
+        return expression
+    elif isinstance(expression, NegatedExpression):
+        negated = expression.term
+        if isinstance(negated, AllExpression):
+            term = skolemize(-negated.term, univ_scope, used_variables|set([negated.variable]))
+            if univ_scope:
+                return term.replace(negated.variable, skolem_function(univ_scope))
+            else:
+                skolem_constant = VariableExpression(unique_variable(ignore=used_variables))
+                return term.replace(negated.variable, skolem_constant)
+        elif isinstance(negated, AndExpression):
+            return to_cnf(skolemize(-negated.first, univ_scope, used_variables),
+                          skolemize(-negated.second, univ_scope, used_variables))
+        elif isinstance(negated, OrExpression):
+            return skolemize(-negated.first, univ_scope, used_variables) &\
+                   skolemize(-negated.second, univ_scope, used_variables)
+        elif isinstance(negated, ImpExpression):
+            return skolemize(negated.first, univ_scope, used_variables) &\
+                   skolemize(-negated.second, univ_scope, used_variables)
+        elif isinstance(negated, IffExpression):
+            return to_cnf(skolemize(-negated.first, univ_scope, used_variables),
+                          skolemize(-negated.second, univ_scope, used_variables)) &\
+                   to_cnf(skolemize(negated.first, univ_scope, used_variables),
+                          skolemize(negated.second, univ_scope, used_variables))
+        elif isinstance(negated, EqualityExpression):
+            return expression
+        elif isinstance(negated, NegatedExpression):
+            return skolemize(negated.term, univ_scope, used_variables)
+        elif isinstance(negated, ExistsExpression):
+            term = skolemize(-negated.term, univ_scope|set([negated.variable]), used_variables|set([negated.variable]))
+            return term.replace(negated.variable, VariableExpression(unique_variable(ignore=used_variables)))
+        elif isinstance(negated, ApplicationExpression):
+            return expression
+        else:
+            raise Exception('\'%s\' cannot be skolemized' % expression)
+    elif isinstance(expression, ExistsExpression):
+        term = skolemize(expression.term, univ_scope, used_variables|set([expression.variable]))
+        if univ_scope:
+            return term.replace(expression.variable, skolem_function(univ_scope))
+        else:
+            skolem_constant = VariableExpression(unique_variable(ignore=used_variables))
+            return term.replace(expression.variable, skolem_constant)
+    elif isinstance(expression, ApplicationExpression):
+        return expression
+    else:
+        raise Exception('\'%s\' cannot be skolemized' % expression)
+
+def to_cnf(first, second):
+    """
+    Convert this split disjunction to conjunctive normal form (CNF)
+    """
+    if isinstance(first, AndExpression):
+        r_first = to_cnf(first.first, second)
+        r_second = to_cnf(first.second, second)
+        return r_first & r_second
+    elif isinstance(second, AndExpression):
+        r_first = to_cnf(first, second.first)
+        r_second = to_cnf(first, second.second)
+        return r_first & r_second
+    else:
+        return first | second
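+
+# A brief usage sketch (illustration only; the exact variable and Skolem
+# function names in the output come from unique_variable/skolem_function and
+# are not guaranteed):
+#
+#     >>> from nltk.sem.logic import Expression
+#     >>> e = Expression.fromstring('all x.exists y.loves(x, y)')
+#     >>> skolemize(e)                              # doctest: +SKIP
+#     loves(z1,F1(z1))
+#
+# An existential under a universal is replaced by a Skolem function of the
+# universally bound variable(s); a top-level existential is instead replaced
+# by a fresh (Skolem) constant.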
diff --git a/nlp_resource_data/nltk/sem/skolemize.pyc b/nlp_resource_data/nltk/sem/skolemize.pyc
new file mode 100755 (executable)
index 0000000..04bcb22
Binary files /dev/null and b/nlp_resource_data/nltk/sem/skolemize.pyc differ
diff --git a/nlp_resource_data/nltk/sem/util.py b/nlp_resource_data/nltk/sem/util.py
new file mode 100755 (executable)
index 0000000..edfcb0f
--- /dev/null
@@ -0,0 +1,249 @@
+# Natural Language Toolkit: Semantic Interpretation
+#
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Utility functions for batch-processing sentences: parsing and
+extraction of the semantic representation of the root node of the the
+syntax tree, followed by evaluation of the semantic representation in
+a first-order model.
+"""
+from __future__ import print_function, unicode_literals
+
+import codecs
+from nltk.sem import evaluate
+
+
+##############################################################
+## Utility functions for connecting parse output to semantics
+##############################################################
+
+def parse_sents(inputs, grammar, trace=0):
+    """
+    Convert input sentences into syntactic trees.
+
+    :param inputs: sentences to be parsed
+    :type inputs: list(str)
+    :param grammar: ``FeatureGrammar`` or name of feature-based grammar
+    :type grammar: nltk.grammar.FeatureGrammar
+    :rtype: list(list(nltk.tree.Tree))
+    :return: for each input sentence, the list of its parse ``Tree``s
+    """
+    # put imports here to avoid circular dependencies
+    from nltk.grammar import FeatureGrammar
+    from nltk.parse import FeatureChartParser, load_parser
+
+    if isinstance(grammar, FeatureGrammar):
+        cp = FeatureChartParser(grammar)
+    else:
+        cp = load_parser(grammar, trace=trace)
+    parses = []
+    for sent in inputs:
+        tokens = sent.split() # use a tokenizer?
+        syntrees = list(cp.parse(tokens))
+        parses.append(syntrees)
+    return parses
+
+def root_semrep(syntree, semkey='SEM'):
+    """
+    Find the semantic representation at the root of a tree.
+
+    :param syntree: a parse ``Tree``
+    :param semkey: the feature label to use for the root semantics in the tree
+    :return: the semantic representation at the root of a ``Tree``
+    :rtype: sem.Expression
+    """
+    from nltk.grammar import FeatStructNonterminal
+
+    node = syntree.label()
+    assert isinstance(node, FeatStructNonterminal)
+    try:
+        return node[semkey]
+    except KeyError:
+        print(node, end=' ')
+        print("has no specification for the feature %s" % semkey)
+        raise
+
+def interpret_sents(inputs, grammar, semkey='SEM', trace=0):
+    """
+    Add the semantic representation to each syntactic parse tree
+    of each input sentence.
+
+    :param inputs: a list of sentences
+    :type inputs: list(str)
+    :param grammar: ``FeatureGrammar`` or name of feature-based grammar
+    :type grammar: nltk.grammar.FeatureGrammar
+    :return: a mapping from sentences to lists of pairs (parse-tree, semantic-representations)
+    :rtype: list(list(tuple(nltk.tree.Tree, nltk.sem.logic.ConstantExpression)))
+    """
+    return [[(syn, root_semrep(syn, semkey)) for syn in syntrees]
+            for syntrees in parse_sents(inputs, grammar, trace=trace)]
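+
+# A quick sketch of typical usage (illustration only), with the sample grammar
+# that demo() below also loads from nltk_data:
+#
+#     reps = interpret_sents(['John sees Mary'],
+#                            'grammars/sample_grammars/sem2.fcfg')
+#     for syntree, semrep in reps[0]:
+#         print(semrep)    # a logic Expression, e.g. something like sees(john,mary)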
+
+def evaluate_sents(inputs, grammar, model, assignment, trace=0):
+    """
+    Add the truth-in-a-model value to each semantic representation
+    for each syntactic parse of each input sentence.
+
+    :param inputs: a list of sentences
+    :type inputs: list(str)
+    :param grammar: ``FeatureGrammar`` or name of feature-based grammar
+    :type grammar: nltk.grammar.FeatureGrammar
+    :return: a mapping from sentences to lists of triples (parse-tree, semantic-representations, evaluation-in-model)
+    :rtype: list(list(tuple(nltk.tree.Tree, nltk.sem.logic.ConstantExpression, bool or dict(str): bool)))
+    """
+    return [[(syn, sem, model.evaluate("%s" % sem, assignment, trace=trace))
+            for (syn, sem) in interpretations]
+            for interpretations in interpret_sents(inputs, grammar)]
+
+
+def demo_model0():
+    global m0, g0
+    #Initialize a valuation of non-logical constants
+    v = [('john', 'b1'),
+        ('mary', 'g1'),
+        ('suzie', 'g2'),
+        ('fido', 'd1'),
+        ('tess', 'd2'),
+        ('noosa', 'n'),
+        ('girl', set(['g1', 'g2'])),
+        ('boy', set(['b1', 'b2'])),
+        ('dog', set(['d1', 'd2'])),
+        ('bark', set(['d1', 'd2'])),
+        ('walk', set(['b1', 'g2', 'd1'])),
+        ('chase', set([('b1', 'g1'), ('b2', 'g1'), ('g1', 'd1'), ('g2', 'd2')])),
+        ('see', set([('b1', 'g1'), ('b2', 'd2'), ('g1', 'b1'),('d2', 'b1'), ('g2', 'n')])),
+        ('in', set([('b1', 'n'), ('b2', 'n'), ('d2', 'n')])),
+        ('with', set([('b1', 'g1'), ('g1', 'b1'), ('d1', 'b1'), ('b1', 'd1')]))
+     ]
+    #Read in the data from ``v``
+    val = evaluate.Valuation(v)
+    #Bind ``dom`` to the ``domain`` property of ``val``
+    dom = val.domain
+    #Initialize a model with parameters ``dom`` and ``val``.
+    m0 = evaluate.Model(dom, val)
+    #Initialize a variable assignment with parameter ``dom``
+    g0 = evaluate.Assignment(dom)
+
+
+def read_sents(filename, encoding='utf8'):
+    with codecs.open(filename, 'r', encoding) as fp:
+        sents = [l.rstrip() for l in fp]
+
+    # get rid of blank lines
+    sents = [l for l in sents if len(l) > 0]
+    sents = [l for l in sents if not l[0] == '#']
+    return sents
+
+def demo_legacy_grammar():
+    """
+    Check that interpret_sents() is compatible with legacy grammars that use
+    a lowercase 'sem' feature.
+
+    The legacy test grammar is defined inline below.
+
+    """
+    from nltk.grammar import FeatureGrammar
+
+    g = FeatureGrammar.fromstring("""
+    % start S
+    S[sem=<hello>] -> 'hello'
+    """)
+    print("Reading grammar: %s" % g)
+    print("*" * 20)
+    for reading in interpret_sents(['hello'], g, semkey='sem'):
+        syn, sem = reading[0]
+        print()
+        print("output: ", sem)
+
+def demo():
+    import sys
+    from optparse import OptionParser
+    description = \
+    """
+    Parse and evaluate some sentences.
+    """
+
+    opts = OptionParser(description=description)
+
+    opts.set_defaults(evaluate=True, beta=True, syntrace=0,
+                      semtrace=0, demo='default', grammar='', sentences='')
+
+    opts.add_option("-d", "--demo", dest="demo",
+                    help="choose demo D; omit this for the default demo, or specify 'chat80'", metavar="D")
+    opts.add_option("-g", "--gram", dest="grammar",
+                    help="read in grammar G", metavar="G")
+    opts.add_option("-m", "--model", dest="model",
+                        help="import model M (omit '.py' suffix)", metavar="M")
+    opts.add_option("-s", "--sentences", dest="sentences",
+                        help="read in a file of test sentences S", metavar="S")
+    opts.add_option("-e", "--no-eval", action="store_false", dest="evaluate",
+                    help="just do a syntactic analysis")
+    opts.add_option("-b", "--no-beta-reduction", action="store_false",
+                    dest="beta", help="don't carry out beta-reduction")
+    opts.add_option("-t", "--syntrace", action="count", dest="syntrace",
+                    help="set syntactic tracing on; requires '-e' option")
+    opts.add_option("-T", "--semtrace", action="count", dest="semtrace",
+                    help="set semantic tracing on")
+
+    (options, args) = opts.parse_args()
+
+    SPACER = '-' * 30
+
+    demo_model0()
+
+    sents = [
+    'Fido sees a boy with Mary',
+    'John sees Mary',
+    'every girl chases a dog',
+    'every boy chases a girl',
+    'John walks with a girl in Noosa',
+    'who walks']
+
+    gramfile = 'grammars/sample_grammars/sem2.fcfg'
+
+    if options.sentences:
+        sentsfile = options.sentences
+    if options.grammar:
+        gramfile = options.grammar
+    if options.model:
+        exec("import %s as model" % options.model)
+
+    if options.sentences:
+        sents = read_sents(sentsfile)
+
+    # Set model and assignment
+    model = m0
+    g = g0
+
+    if options.evaluate:
+        evaluations = \
+            evaluate_sents(sents, gramfile, model, g, trace=options.semtrace)
+    else:
+        semreps = \
+            interpret_sents(sents, gramfile, trace=options.syntrace)
+
+    for i, sent in enumerate(sents):
+        n = 1
+        print('\nSentence: %s' % sent)
+        print(SPACER)
+        if options.evaluate:
+
+            for (syntree, semrep, value) in evaluations[i]:
+                if isinstance(value, dict):
+                    value = set(value.keys())
+                print('%d:  %s' % (n, semrep))
+                print(value)
+                n += 1
+        else:
+
+            for (syntree, semrep) in semreps[i]:
+                print('%d:  %s' % (n, semrep))
+                n += 1
+
+if __name__ == "__main__":
+    demo()
+    demo_legacy_grammar()
diff --git a/nlp_resource_data/nltk/sem/util.pyc b/nlp_resource_data/nltk/sem/util.pyc
new file mode 100755 (executable)
index 0000000..76eedc9
Binary files /dev/null and b/nlp_resource_data/nltk/sem/util.pyc differ
diff --git a/nlp_resource_data/nltk/sentiment/__init__.py b/nlp_resource_data/nltk/sentiment/__init__.py
new file mode 100755 (executable)
index 0000000..6f879bf
--- /dev/null
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Sentiment Analysis
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+NLTK Sentiment Analysis Package
+
+"""
+from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer
+from nltk.sentiment.vader import SentimentIntensityAnalyzer
diff --git a/nlp_resource_data/nltk/sentiment/__init__.pyc b/nlp_resource_data/nltk/sentiment/__init__.pyc
new file mode 100755 (executable)
index 0000000..3bf1123
Binary files /dev/null and b/nlp_resource_data/nltk/sentiment/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/sentiment/sentiment_analyzer.py b/nlp_resource_data/nltk/sentiment/sentiment_analyzer.py
new file mode 100755 (executable)
index 0000000..4fd18fb
--- /dev/null
@@ -0,0 +1,229 @@
+# coding: utf-8
+#
+# Natural Language Toolkit: Sentiment Analyzer
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A SentimentAnalyzer is a tool to implement and facilitate Sentiment Analysis tasks
+using NLTK features and classifiers, especially for teaching and demonstrative
+purposes.
+"""
+
+from __future__ import print_function
+from collections import defaultdict
+
+from nltk.classify.util import apply_features, accuracy as eval_accuracy
+from nltk.collocations import BigramCollocationFinder
+from nltk.metrics import (BigramAssocMeasures, precision as eval_precision,
+    recall as eval_recall, f_measure as eval_f_measure)
+
+from nltk.probability import FreqDist
+
+from nltk.sentiment.util import save_file, timer
+
+class SentimentAnalyzer(object):
+    """
+    A Sentiment Analysis tool based on machine learning approaches.
+    """
+    def __init__(self, classifier=None):
+        self.feat_extractors = defaultdict(list)
+        self.classifier = classifier
+
+    def all_words(self, documents, labeled=None):
+        """
+        Return all words/tokens from the documents (with duplicates).
+        :param documents: a list of (words, label) tuples.
+        :param labeled: if `True`, assume that each document is represented by a
+            (words, label) tuple: (list(str), str). If `False`, each document is
+            considered as being a simple list of strings: list(str).
+        :rtype: list(str)
+        :return: A list of all words/tokens in `documents`.
+        """
+        all_words = []
+        if labeled is None:
+            labeled = documents and isinstance(documents[0], tuple)
+        if labeled == True:
+            for words, sentiment in documents:
+                all_words.extend(words)
+        elif labeled == False:
+            for words in documents:
+                all_words.extend(words)
+        return all_words
+
+    def apply_features(self, documents, labeled=None):
+        """
+        Apply all feature extractor functions to the documents. This is a wrapper
+        around `nltk.classify.util.apply_features`.
+
+        If `labeled=False`, return featuresets as:
+            [feature_func(doc) for doc in documents]
+        If `labeled=True`, return featuresets as:
+            [(feature_func(tok), label) for (tok, label) in toks]
+
+        :param documents: a list of documents. If `labeled=True`, the method expects
+            a list of (words, label) tuples.
+        :rtype: LazyMap
+        """
+        return apply_features(self.extract_features, documents, labeled)
+
+    def unigram_word_feats(self, words, top_n=None, min_freq=0):
+        """
+        Return most common top_n word features.
+
+        :param words: a list of words/tokens.
+        :param top_n: number of best words/tokens to use, sorted by frequency.
+        :rtype: list(str)
+        :return: A list of `top_n` words/tokens (with no duplicates) sorted by
+            frequency.
+        """
+        # Stopwords are not removed
+        unigram_feats_freqs = FreqDist(word for word in words)
+        return [w for w, f in unigram_feats_freqs.most_common(top_n)
+                if unigram_feats_freqs[w] > min_freq]
+
+    def bigram_collocation_feats(self, documents, top_n=None, min_freq=3,
+                                 assoc_measure=BigramAssocMeasures.pmi):
+        """
+        Return `top_n` bigram features (using `assoc_measure`).
+        Note that this method is based on bigram collocations measures, and not
+        on simple bigram frequency.
+
+        :param documents: a list (or iterable) of tokens.
+        :param top_n: number of best words/tokens to use, sorted by association
+            measure.
+        :param assoc_measure: bigram association measure to use as score function.
+        :param min_freq: the minimum number of occurrences of bigrams to take
+            into consideration.
+
+        :return: `top_n` ngrams scored by the given association measure.
+        """
+        finder = BigramCollocationFinder.from_documents(documents)
+        finder.apply_freq_filter(min_freq)
+        return finder.nbest(assoc_measure, top_n)
+
+    def classify(self, instance):
+        """
+        Classify a single instance applying the features that have already been
+        stored in the SentimentAnalyzer.
+
+        :param instance: a list (or iterable) of tokens.
+        :return: the classification result given by applying the classifier.
+        """
+        instance_feats = self.apply_features([instance], labeled=False)
+        return self.classifier.classify(instance_feats[0])
+
+    def add_feat_extractor(self, function, **kwargs):
+        """
+        Add a new function to extract features from a document. This function will
+        be used in extract_features().
+        Important: the kwargs passed here represent only additional parameters,
+        NOT the document to be processed. The document is always the first
+        parameter in the parameter list, and it is supplied later, inside
+        extract_features().
+
+        :param function: the extractor function to add to the list of feature extractors.
+        :param kwargs: additional parameters required by the `function` function.
+        """
+        self.feat_extractors[function].append(kwargs)
+
+    def extract_features(self, document):
+        """
+        Apply extractor functions (and their parameters) to the present document.
+        We pass `document` as the first parameter of the extractor functions.
+        If we want to use the same extractor function multiple times, we have to
+        add it to the extractors with `add_feat_extractor` using multiple sets of
+        parameters (one for each call of the extractor function).
+
+        :param document: the document that will be passed as argument to the
+            feature extractor functions.
+        :return: A dictionary of populated features extracted from the document.
+        :rtype: dict
+        """
+        all_features = {}
+        for extractor in self.feat_extractors:
+            for param_set in self.feat_extractors[extractor]:
+                feats = extractor(document, **param_set)
+            all_features.update(feats)
+        return all_features
+
+    def train(self, trainer, training_set, save_classifier=None, **kwargs):
+        """
+        Train classifier on the training set, optionally saving the output in the
+        file specified by `save_classifier`.
+        Additional arguments depend on the specific trainer used. For example,
+        a MaxentClassifier can use `max_iter` parameter to specify the number
+        of iterations, while a NaiveBayesClassifier cannot.
+
+        :param trainer: `train` method of a classifier.
+            E.g.: NaiveBayesClassifier.train
+        :param training_set: the training set to be passed as argument to the
+            classifier `train` method.
+        :param save_classifier: the filename of the file where the classifier
+            will be stored (optional).
+        :param kwargs: additional parameters that will be passed as arguments to
+            the classifier `train` function.
+        :return: A classifier instance trained on the training set.
+        :rtype: 
+        """
+        print("Training classifier")
+        self.classifier = trainer(training_set, **kwargs)
+        if save_classifier:
+            save_file(self.classifier, save_classifier)
+
+        return self.classifier
+
+    def evaluate(self, test_set, classifier=None, accuracy=True, f_measure=True,
+                 precision=True, recall=True, verbose=False):
+        """
+        Evaluate and print classifier performance on the test set.
+
+        :param test_set: A list of (tokens, label) tuples to use as gold set.
+        :param classifier: a classifier instance (previously trained).
+        :param accuracy: if `True`, evaluate classifier accuracy.
+        :param f_measure: if `True`, evaluate classifier f_measure.
+        :param precision: if `True`, evaluate classifier precision.
+        :param recall: if `True`, evaluate classifier recall.
+        :return: evaluation results.
+        :rtype: dict(str): float
+        """
+        if classifier is None:
+            classifier = self.classifier
+        print("Evaluating {0} results...".format(type(classifier).__name__))
+        metrics_results = {}
+        if accuracy == True:
+            accuracy_score = eval_accuracy(classifier, test_set)
+            metrics_results['Accuracy'] = accuracy_score
+
+        gold_results = defaultdict(set)
+        test_results = defaultdict(set)
+        labels = set()
+        for i, (feats, label) in enumerate(test_set):
+            labels.add(label)
+            gold_results[label].add(i)
+            observed = classifier.classify(feats)
+            test_results[observed].add(i)
+
+        for label in labels:
+            if precision == True:
+                precision_score = eval_precision(gold_results[label],
+                    test_results[label])
+                metrics_results['Precision [{0}]'.format(label)] = precision_score
+            if recall == True:
+                recall_score = eval_recall(gold_results[label],
+                    test_results[label])
+                metrics_results['Recall [{0}]'.format(label)] = recall_score
+            if f_measure == True:
+                f_measure_score = eval_f_measure(gold_results[label],
+                    test_results[label])
+                metrics_results['F-measure [{0}]'.format(label)] = f_measure_score
+
+        # Print evaluation results (in alphabetical order)
+        if verbose == True:
+            for result in sorted(metrics_results):
+                print('{0}: {1}'.format(result, metrics_results[result]))
+
+        return metrics_results
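+
+    # Illustrative usage of `evaluate` (a minimal sketch; `test_set` is assumed to
+    # have the same (featureset, label) format as the training set):
+    #
+    #     results = sentim_analyzer.evaluate(test_set, verbose=True)
+    #     # -> a dict such as {'Accuracy': ..., 'Precision [pos]': ..., ...}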
diff --git a/nlp_resource_data/nltk/sentiment/sentiment_analyzer.pyc b/nlp_resource_data/nltk/sentiment/sentiment_analyzer.pyc
new file mode 100755 (executable)
index 0000000..aeb5ee6
Binary files /dev/null and b/nlp_resource_data/nltk/sentiment/sentiment_analyzer.pyc differ
diff --git a/nlp_resource_data/nltk/sentiment/util.py b/nlp_resource_data/nltk/sentiment/util.py
new file mode 100755 (executable)
index 0000000..b8e3fbe
--- /dev/null
@@ -0,0 +1,762 @@
+# coding: utf-8
+#
+# Natural Language Toolkit: Sentiment Analyzer
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Utility methods for Sentiment Analysis.
+"""
+from __future__ import division
+
+import codecs
+import csv
+import json
+import pickle
+import random
+import re
+import sys
+import time
+from copy import deepcopy
+from itertools import tee
+
+import nltk
+from nltk.corpus import CategorizedPlaintextCorpusReader
+from nltk.data import load
+from nltk.tokenize.casual import EMOTICON_RE
+from nltk.twitter.common import outf_writer_compat, extract_fields
+
+#////////////////////////////////////////////////////////////
+#{ Regular expressions
+#////////////////////////////////////////////////////////////
+
+# Regular expression for negation by Christopher Potts
+NEGATION = r"""
+    (?:
+        ^(?:never|no|nothing|nowhere|noone|none|not|
+            havent|hasnt|hadnt|cant|couldnt|shouldnt|
+            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
+        )$
+    )
+    |
+    n't"""
+
+NEGATION_RE = re.compile(NEGATION, re.VERBOSE)
+
+CLAUSE_PUNCT = r'^[.:;!?]$'
+CLAUSE_PUNCT_RE = re.compile(CLAUSE_PUNCT)
+
+# Happy and sad emoticons
+
+HAPPY = set([
+    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
+    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
+    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
+    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
+    '<3'
+    ])
+
+SAD = set([
+    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
+    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
+    ':c', ':{', '>:\\', ';('
+    ])
+
+
+def timer(method):
+    """
+    A timer decorator to measure execution performance of methods.
+    """
+    def timed(*args, **kw):
+        start = time.time()
+        result = method(*args, **kw)
+        end = time.time()
+        tot_time = end - start
+        hours = tot_time // 3600
+        mins = tot_time // 60 % 60
+        # in Python 2.x round() will return a float, so we convert it to int
+        secs = int(round(tot_time % 60))
+        if hours == 0 and mins == 0 and secs < 10:
+            print('[TIMER] {0}(): {1:.3f} seconds'.format(method.__name__, tot_time))
+        else:
+            print('[TIMER] {0}(): {1}h {2}m {3}s'.format(method.__name__, hours, mins, secs))
+        return result
+    return timed
+
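+# Illustrative usage of the `timer` decorator (a minimal sketch):
+#
+#     @timer
+#     def slow_count(n):
+#         return sum(range(n))
+#
+#     slow_count(10**7)   # prints e.g. "[TIMER] slow_count(): 0.312 seconds"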
+
+def pairwise(iterable):
+    """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
+    a, b = tee(iterable)
+    next(b, None)
+    return zip(a, b)
+
+#////////////////////////////////////////////////////////////
+#{ Feature extractor functions
+#////////////////////////////////////////////////////////////
+"""
+Feature extractor functions are declared outside the SentimentAnalyzer class so
+that users can create their own feature extractors without modifying the class.
+"""
+
+def extract_unigram_feats(document, unigrams, handle_negation=False):
+    """
+    Populate a dictionary of unigram features, reflecting the presence/absence in
+    the document of each of the tokens in `unigrams`.
+
+    :param document: a list of words/tokens.
+    :param unigrams: a list of words/tokens whose presence/absence has to be
+        checked in `document`.
+    :param handle_negation: if `handle_negation == True` apply `mark_negation`
+        method to `document` before checking for unigram presence/absence.
+    :return: a dictionary of unigram features {unigram : boolean}.
+
+    >>> words = ['ice', 'police', 'riot']
+    >>> document = 'ice is melting due to global warming'.split()
+    >>> sorted(extract_unigram_feats(document, words).items())
+    [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
+    """
+    features = {}
+    if handle_negation:
+        document = mark_negation(document)
+    for word in unigrams:
+        features['contains({0})'.format(word)] = word in set(document)
+    return features
+
+def extract_bigram_feats(document, bigrams):
+    """
+    Populate a dictionary of bigram features, reflecting the presence/absence in
+    the document of each of the tokens in `bigrams`. This extractor function only
+    considers contiguous bigrams obtained by `nltk.bigrams`.
+
+    :param document: a list of words/tokens.
+    :param bigrams: a list of bigrams whose presence/absence has to be
+        checked in `document`.
+    :return: a dictionary of bigram features {bigram : boolean}.
+
+    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
+    >>> document = 'ice is melting due to global warming'.split()
+    >>> sorted(extract_bigram_feats(document, bigrams).items()) # doctest: +NORMALIZE_WHITESPACE
+    [('contains(global - warming)', True), ('contains(love - you)', False),
+    ('contains(police - prevented)', False)]
+    """
+    features = {}
+    for bigr in bigrams:
+        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(document)
+    return features
+
+#////////////////////////////////////////////////////////////
+#{ Helper Functions
+#////////////////////////////////////////////////////////////
+
+def mark_negation(document, double_neg_flip=False, shallow=False):
+    """
+    Append _NEG suffix to words that appear in the scope between a negation
+    and a punctuation mark.
+
+    :param document: a list of words/tokens, or a tuple (words, label).
+    :param shallow: if True, the method will modify the original document in place.
+    :param double_neg_flip: if True, double negation is considered affirmation
+        (we activate/deactivate negation scope every time we find a negation).
+    :return: if `shallow == True` the method will modify the original document
+        and return it. If `shallow == False` the method will return a modified
+        document, leaving the original unmodified.
+
+    >>> sent = "I didn't like this movie . It was bad .".split()
+    >>> mark_negation(sent)
+    ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.']
+    """
+    if not shallow:
+        document = deepcopy(document)
+    # check if the document is labeled. If so, do not consider the label.
+    labeled = document and isinstance(document[0], (tuple, list))
+    if labeled:
+        doc = document[0]
+    else:
+        doc = document
+    neg_scope = False
+    for i, word in enumerate(doc):
+        if NEGATION_RE.search(word):
+            if not neg_scope or (neg_scope and double_neg_flip):
+                neg_scope = not neg_scope
+                continue
+            else:
+                doc[i] += '_NEG'
+        elif neg_scope and CLAUSE_PUNCT_RE.search(word):
+            neg_scope = not neg_scope
+        elif neg_scope and not CLAUSE_PUNCT_RE.search(word):
+            doc[i] += '_NEG'
+
+    return document
+
+def output_markdown(filename, **kwargs):
+    """
+    Write the output of an analysis to a file.
+    """
+    with codecs.open(filename, 'at') as outfile:
+        text = '\n*** \n\n'
+        text += '{0} \n\n'.format(time.strftime("%d/%m/%Y, %H:%M"))
+        for k in sorted(kwargs):
+            if isinstance(kwargs[k], dict):
+                dictionary = kwargs[k]
+                text += '  - **{0}:**\n'.format(k)
+                for entry in sorted(dictionary):
+                    text += '    - {0}: {1} \n'.format(entry, dictionary[entry])
+            elif isinstance(kwargs[k], list):
+                text += '  - **{0}:**\n'.format(k)
+                for entry in kwargs[k]:
+                    text += '    - {0}\n'.format(entry)
+            else:
+                text += '  - **{0}:** {1} \n'.format(k, kwargs[k])
+        outfile.write(text)
+
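+# Illustrative usage of `output_markdown` (a minimal sketch; the filename and the
+# keyword names are arbitrary):
+#
+#     output_markdown('results.md', Dataset='movie_reviews', Classifier='NaiveBayes',
+#                     Results={'Accuracy': 0.85})
+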
+def save_file(content, filename):
+    """
+    Store `content` in `filename`. Can be used to store a SentimentAnalyzer.
+    """
+    print("Saving", filename)
+    with codecs.open(filename, 'wb') as storage_file:
+        # The protocol=2 parameter is for python2 compatibility
+        pickle.dump(content, storage_file, protocol=2)
+
+def split_train_test(all_instances, n=None):
+    """
+    Randomly split `n` instances of the dataset into train and test sets.
+
+    :param all_instances: a list of instances (e.g. documents) that will be split.
+    :param n: the number of instances to consider (in case we want to use only a
+        subset).
+    :return: two lists of instances. Train set is 8/10 of the total and test set
+        is 2/10 of the total.
+    """
+    random.seed(12345)
+    random.shuffle(all_instances)
+    if not n or n > len(all_instances):
+        n = len(all_instances)
+    train_set = all_instances[:int(.8*n)]
+    test_set = all_instances[int(.8*n):n]
+
+    return train_set, test_set
+
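+# Illustrative usage of `split_train_test` (a minimal sketch):
+#
+#     docs = [(['a', 'good', 'movie'], 'pos'), (['a', 'bad', 'movie'], 'neg')] * 50
+#     train_docs, test_docs = split_train_test(docs)
+#     len(train_docs), len(test_docs)   # -> (80, 20)
+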
+def _show_plot(x_values, y_values, x_labels=None, y_labels=None):
+    try:
+        import matplotlib.pyplot as plt
+    except ImportError:
+        raise ImportError('The plot function requires matplotlib to be installed. '
+                          'See http://matplotlib.org/')
+
+    plt.locator_params(axis='y', nbins=3)
+    axes = plt.axes()
+    axes.yaxis.grid()
+    plt.plot(x_values, y_values, 'ro', color='red')
+    plt.ylim(ymin=-1.2, ymax=1.2)
+    plt.tight_layout(pad=5)
+    if x_labels:
+        plt.xticks(x_values, x_labels, rotation='vertical')
+    if y_labels:
+        plt.yticks([-1, 0, 1], y_labels, rotation='horizontal')
+    # Pad margins so that markers are not clipped by the axes
+    plt.margins(0.2)
+    plt.show()
+
+#////////////////////////////////////////////////////////////
+#{ Parsing and conversion functions
+#////////////////////////////////////////////////////////////
+
+def json2csv_preprocess(json_file, outfile, fields, encoding='utf8', errors='replace',
+            gzip_compress=False, skip_retweets=True, skip_tongue_tweets=True,
+            skip_ambiguous_tweets=True, strip_off_emoticons=True, remove_duplicates=True,
+            limit=None):
+    """
+    Convert a json file to a csv file, preprocessing each row to obtain a suitable
+    dataset for tweet sentiment analysis.
+
+    :param json_file: the original json file containing tweets.
+    :param outfile: the output csv filename.
+    :param fields: a list of fields that will be extracted from the json file and
+        kept in the output csv file.
+    :param encoding: the encoding of the files.
+    :param errors: the error handling strategy for the output writer.
+    :param gzip_compress: if True, create a compressed GZIP file.
+
+    :param skip_retweets: if True, remove retweets.
+    :param skip_tongue_tweets: if True, remove tweets containing ":P" and ":-P"
+        emoticons.
+    :param skip_ambiguous_tweets: if True, remove tweets containing both happy
+        and sad emoticons.
+    :param strip_off_emoticons: if True, strip off emoticons from all tweets.
+    :param remove_duplicates: if True, remove tweets appearing more than once.
+    :param limit: an integer to set the number of tweets to convert. After the
+        limit is reached the conversion will stop. It can be useful to create
+        subsets of the original tweets json data.
+    """
+    with codecs.open(json_file, encoding=encoding) as fp:
+        (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
+        # write the list of fields as header
+        writer.writerow(fields)
+
+        if remove_duplicates == True:
+            tweets_cache = []
+        i = 0
+        for line in fp:
+            tweet = json.loads(line)
+            row = extract_fields(tweet, fields)
+            try:
+                text = row[fields.index('text')]
+                # Remove retweets
+                if skip_retweets == True:
+                    if re.search(r'\bRT\b', text):
+                        continue
+                # Remove tweets containing ":P" and ":-P" emoticons
+                if skip_tongue_tweets == True:
+                    if re.search(r'\:\-?P\b', text):
+                        continue
+                # Remove tweets containing both happy and sad emoticons
+                if skip_ambiguous_tweets == True:
+                    all_emoticons = EMOTICON_RE.findall(text)
+                    if all_emoticons:
+                        if (set(all_emoticons) & HAPPY) and (set(all_emoticons) & SAD):
+                            continue
+                # Strip off emoticons from all tweets
+                if strip_off_emoticons == True:
+                    row[fields.index('text')] = re.sub(r'(?!\n)\s+', ' ', EMOTICON_RE.sub('', text))
+                # Remove duplicate tweets
+                if remove_duplicates == True:
+                    if row[fields.index('text')] in tweets_cache:
+                        continue
+                    else:
+                        tweets_cache.append(row[fields.index('text')])
+            except ValueError:
+                pass
+            writer.writerow(row)
+            i += 1
+            if limit and i >= limit:
+                break
+        outf.close()
+
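+# Illustrative usage of `json2csv_preprocess` (a minimal sketch; the filenames are
+# hypothetical and the input is assumed to hold one JSON tweet per line):
+#
+#     json2csv_preprocess('raw_tweets.json', 'clean_tweets.csv', ['id', 'text'],
+#                         limit=1000)
+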
+def parse_tweets_set(filename, label, word_tokenizer=None, sent_tokenizer=None,
+                     skip_header=True):
+    """
+    Parse a csv file containing tweets and return a list of (text, label) tuples.
+
+    :param filename: the input csv filename.
+    :param label: the label to be appended to each tweet contained in the csv file.
+    :param word_tokenizer: the tokenizer instance that will be used to tokenize
+        each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()).
+        If no word_tokenizer is specified, tweets will not be tokenized.
+    :param sent_tokenizer: the tokenizer that will be used to split each tweet into
+        sentences.
+    :param skip_header: if True, skip the first line of the csv file (which usually
+        contains headers).
+
+    :return: a list of (text, label) tuples.
+    """
+    tweets = []
+    if not sent_tokenizer:
+        sent_tokenizer = load('tokenizers/punkt/english.pickle')
+
+    # If we use Python3.x we can proceed using the 'rt' flag
+    if sys.version_info[0] == 3:
+        with codecs.open(filename, 'rt') as csvfile:
+            reader = csv.reader(csvfile)
+            if skip_header == True:
+                next(reader, None) # skip the header
+            i = 0
+            for tweet_id, text in reader:
+                # text = text[1]
+                i += 1
+                sys.stdout.write('Loaded {0} tweets\r'.format(i))
+                # Apply sentence and word tokenizer to text
+                if word_tokenizer:
+                    tweet = [w for sent in sent_tokenizer.tokenize(text)
+                                       for w in word_tokenizer.tokenize(sent)]
+                else:
+                    tweet = text
+                tweets.append((tweet, label))
+    # If we use Python2.x we need to handle encoding problems
+    elif sys.version_info[0] < 3:
+        with codecs.open(filename) as csvfile:
+            reader = csv.reader(csvfile)
+            if skip_header == True:
+                next(reader, None) # skip the header
+            i = 0
+            for row in reader:
+                unicode_row = [x.decode('utf8') for x in row]
+                text = unicode_row[1]
+                i += 1
+                sys.stdout.write('Loaded {0} tweets\r'.format(i))
+                # Apply sentence and word tokenizer to text
+                if word_tokenizer:
+                    tweet = [w.encode('utf8') for sent in sent_tokenizer.tokenize(text)
+                                       for w in word_tokenizer.tokenize(sent)]
+                else:
+                    tweet = text
+                tweets.append((tweet, label))
+    print("Loaded {0} tweets".format(i))
+    return tweets
+
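+# Illustrative usage of `parse_tweets_set` (a minimal sketch; 'clean_tweets.csv' is
+# a hypothetical file with id/text rows, e.g. produced by `json2csv_preprocess`):
+#
+#     from nltk.tokenize import TweetTokenizer
+#     docs = parse_tweets_set('clean_tweets.csv', label='pos',
+#                             word_tokenizer=TweetTokenizer(preserve_case=False))
+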
+#////////////////////////////////////////////////////////////
+#{ Demos
+#////////////////////////////////////////////////////////////
+
+def demo_tweets(trainer, n_instances=None, output=None):
+    """
+    Train and test a classifier on 10000 tweets, tokenized using the
+    TweetTokenizer.
+    Features are composed of:
+        - 1000 most frequent unigrams
+        - 100 top bigrams (using BigramAssocMeasures.pmi)
+
+    :param trainer: `train` method of a classifier.
+    :param n_instances: the number of total tweets that have to be used for
+        training and testing. Tweets will be equally split between positive and
+        negative.
+    :param output: the output file where results have to be reported.
+    """
+    from nltk.tokenize import TweetTokenizer
+    from nltk.sentiment import SentimentAnalyzer
+    from nltk.corpus import twitter_samples, stopwords
+
+    # Different customizations for the TweetTokenizer
+    tokenizer = TweetTokenizer(preserve_case=False)
+    # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
+    # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
+
+    if n_instances is not None:
+        n_instances = int(n_instances/2)
+
+    fields = ['id', 'text']
+    positive_json = twitter_samples.abspath("positive_tweets.json")
+    positive_csv = 'positive_tweets.csv'
+    json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)
+
+    negative_json = twitter_samples.abspath("negative_tweets.json")
+    negative_csv = 'negative_tweets.csv'
+    json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)
+
+    neg_docs = parse_tweets_set(negative_csv, label='neg', word_tokenizer=tokenizer)
+    pos_docs = parse_tweets_set(positive_csv, label='pos', word_tokenizer=tokenizer)
+
+    # We separately split positive and negative instances to keep a balanced
+    # uniform class distribution in both train and test sets.
+    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
+    train_neg_docs, test_neg_docs = split_train_test(neg_docs)
+
+    training_tweets = train_pos_docs+train_neg_docs
+    testing_tweets = test_pos_docs+test_neg_docs
+
+    sentim_analyzer = SentimentAnalyzer()
+    # stopwords = stopwords.words('english')
+    # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords]
+    all_words = [word for word in sentim_analyzer.all_words(training_tweets)]
+
+    # Add simple unigram word features
+    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
+    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
+
+    # Add bigram collocation features
+    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats([tweet[0] for tweet in training_tweets],
+        top_n=100, min_freq=12)
+    sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_collocs_feats)
+
+    training_set = sentim_analyzer.apply_features(training_tweets)
+    test_set = sentim_analyzer.apply_features(testing_tweets)
+
+    classifier = sentim_analyzer.train(trainer, training_set)
+    # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
+    try:
+        classifier.show_most_informative_features()
+    except AttributeError:
+        print('Your classifier does not provide a show_most_informative_features() method.')
+    results = sentim_analyzer.evaluate(test_set)
+
+    if output:
+        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
+        output_markdown(output, Dataset='labeled_tweets', Classifier=type(classifier).__name__,
+                        Tokenizer=tokenizer.__class__.__name__, Feats=extr,
+                        Results=results, Instances=n_instances)
+
+def demo_movie_reviews(trainer, n_instances=None, output=None):
+    """
+    Train classifier on all instances of the Movie Reviews dataset.
+    The corpus has been preprocessed using the default sentence tokenizer and
+    WordPunctTokenizer.
+    Features are composed of:
+        - most frequent unigrams
+
+    :param trainer: `train` method of a classifier.
+    :param n_instances: the number of total reviews that have to be used for
+        training and testing. Reviews will be equally split between positive and
+        negative.
+    :param output: the output file where results have to be reported.
+    """
+    from nltk.corpus import movie_reviews
+    from nltk.sentiment import SentimentAnalyzer
+
+    if n_instances is not None:
+        n_instances = int(n_instances/2)
+
+    pos_docs = [(list(movie_reviews.words(pos_id)), 'pos') for pos_id in movie_reviews.fileids('pos')[:n_instances]]
+    neg_docs = [(list(movie_reviews.words(neg_id)), 'neg') for neg_id in movie_reviews.fileids('neg')[:n_instances]]
+    # We separately split positive and negative instances to keep a balanced
+    # uniform class distribution in both train and test sets.
+    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
+    train_neg_docs, test_neg_docs = split_train_test(neg_docs)
+
+    training_docs = train_pos_docs+train_neg_docs
+    testing_docs = test_pos_docs+test_neg_docs
+
+    sentim_analyzer = SentimentAnalyzer()
+    all_words = sentim_analyzer.all_words(training_docs)
+
+    # Add simple unigram word features
+    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
+    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
+    # Apply features to obtain a feature-value representation of our datasets
+    training_set = sentim_analyzer.apply_features(training_docs)
+    test_set = sentim_analyzer.apply_features(testing_docs)
+
+    classifier = sentim_analyzer.train(trainer, training_set)
+    try:
+        classifier.show_most_informative_features()
+    except AttributeError:
+        print('Your classifier does not provide a show_most_informative_features() method.')
+    results = sentim_analyzer.evaluate(test_set)
+
+    if output:
+        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
+        output_markdown(output, Dataset='Movie_reviews', Classifier=type(classifier).__name__,
+                        Tokenizer='WordPunctTokenizer', Feats=extr, Results=results,
+                        Instances=n_instances)
+
+def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
+    """
+    Train and test a classifier on instances of the Subjectivity Dataset by Pang
+    and Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
+    All tokens (words and punctuation marks) are separated by whitespace, so
+    we use the basic WhitespaceTokenizer to parse the data.
+
+    :param trainer: `train` method of a classifier.
+    :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file.
+    :param n_instances: the number of total sentences that have to be used for
+        training and testing. Sentences will be equally split between positive
+        and negative.
+    :param output: the output file where results have to be reported.
+    """
+    from nltk.sentiment import SentimentAnalyzer
+    from nltk.corpus import subjectivity
+
+    if n_instances is not None:
+        n_instances = int(n_instances/2)
+
+    subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
+    obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
+
+    # We separately split subjective and objective instances to keep a balanced
+    # uniform class distribution in both train and test sets.
+    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
+    train_obj_docs, test_obj_docs = split_train_test(obj_docs)
+
+    training_docs = train_subj_docs+train_obj_docs
+    testing_docs = test_subj_docs+test_obj_docs
+
+    sentim_analyzer = SentimentAnalyzer()
+    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
+
+    # Add simple unigram word features handling negation
+    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
+    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
+
+    # Apply features to obtain a feature-value representation of our datasets
+    training_set = sentim_analyzer.apply_features(training_docs)
+    test_set = sentim_analyzer.apply_features(testing_docs)
+
+    classifier = sentim_analyzer.train(trainer, training_set)
+    try:
+        classifier.show_most_informative_features()
+    except AttributeError:
+        print('Your classifier does not provide a show_most_informative_features() method.')
+    results = sentim_analyzer.evaluate(test_set)
+
+    if save_analyzer == True:
+        save_file(sentim_analyzer, 'sa_subjectivity.pickle')
+
+    if output:
+        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
+        output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__,
+                        Tokenizer='WhitespaceTokenizer', Feats=extr,
+                        Instances=n_instances, Results=results)
+
+    return sentim_analyzer
+
+def demo_sent_subjectivity(text):
+    """
+    Classify a single sentence as subjective or objective using a stored
+    SentimentAnalyzer.
+
+    :param text: a sentence whose subjectivity has to be classified.
+    """
+    from nltk.classify import NaiveBayesClassifier
+    from nltk.tokenize import regexp
+    word_tokenizer = regexp.WhitespaceTokenizer()
+    try:
+        sentim_analyzer = load('sa_subjectivity.pickle')
+    except LookupError:
+        print('Cannot find the sentiment analyzer you want to load.')
+        print('Training a new one using NaiveBayesClassifier.')
+        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)
+
+    # Tokenize and convert to lower case
+    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
+    print(sentim_analyzer.classify(tokenized_text))
+
+def demo_liu_hu_lexicon(sentence, plot=False):
+    """
+    Basic example of sentiment classification using Liu and Hu opinion lexicon.
+    This function simply counts the number of positive, negative and neutral words
+    in the sentence and classifies it depending on which polarity is more represented.
+    Words that do not appear in the lexicon are considered as neutral.
+
+    :param sentence: a sentence whose polarity has to be classified.
+    :param plot: if True, plot a visual representation of the sentence polarity.
+    """
+    from nltk.corpus import opinion_lexicon
+    from nltk.tokenize import treebank
+
+    tokenizer = treebank.TreebankWordTokenizer()
+    pos_words = 0
+    neg_words = 0
+    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]
+
+    x = list(range(len(tokenized_sent))) # x axis for the plot
+    y = []
+
+    for word in tokenized_sent:
+        if word in opinion_lexicon.positive():
+            pos_words += 1
+            y.append(1) # positive
+        elif word in opinion_lexicon.negative():
+            neg_words += 1
+            y.append(-1) # negative
+        else:
+            y.append(0) # neutral
+
+    if pos_words > neg_words:
+        print('Positive')
+    elif pos_words < neg_words:
+        print('Negative')
+    elif pos_words == neg_words:
+        print('Neutral')
+
+    if plot == True:
+        _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive'])
+
+def demo_vader_instance(text):
+    """
+    Output polarity scores for a text using the VADER approach.
+
+    :param text: a text whose polarity has to be evaluated.
+    """
+    from nltk.sentiment import SentimentIntensityAnalyzer
+    vader_analyzer = SentimentIntensityAnalyzer()
+    print(vader_analyzer.polarity_scores(text))
+
+def demo_vader_tweets(n_instances=None, output=None):
+    """
+    Classify 10000 positive and negative tweets using the VADER approach.
+
+    :param n_instances: the number of total tweets that have to be classified.
+    :param output: the output file where results have to be reported.
+    """
+    from collections import defaultdict
+    from nltk.corpus import twitter_samples
+    from nltk.sentiment import SentimentIntensityAnalyzer
+    from nltk.metrics import (accuracy as eval_accuracy, precision as eval_precision,
+        recall as eval_recall, f_measure as eval_f_measure)
+
+    if n_instances is not None:
+        n_instances = int(n_instances/2)
+
+    fields = ['id', 'text']
+    positive_json = twitter_samples.abspath("positive_tweets.json")
+    positive_csv = 'positive_tweets.csv'
+    json2csv_preprocess(positive_json, positive_csv, fields, strip_off_emoticons=False,
+                        limit=n_instances)
+
+    negative_json = twitter_samples.abspath("negative_tweets.json")
+    negative_csv = 'negative_tweets.csv'
+    json2csv_preprocess(negative_json, negative_csv, fields, strip_off_emoticons=False,
+                        limit=n_instances)
+
+    pos_docs = parse_tweets_set(positive_csv, label='pos')
+    neg_docs = parse_tweets_set(negative_csv, label='neg')
+
+    # We separately split positive and negative instances to keep a balanced
+    # uniform class distribution in both train and test sets.
+    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
+    train_neg_docs, test_neg_docs = split_train_test(neg_docs)
+
+    training_tweets = train_pos_docs+train_neg_docs
+    testing_tweets = test_pos_docs+test_neg_docs
+
+    vader_analyzer = SentimentIntensityAnalyzer()
+
+    gold_results = defaultdict(set)
+    test_results = defaultdict(set)
+    acc_gold_results = []
+    acc_test_results = []
+    labels = set()
+    num = 0
+    for i, (text, label) in enumerate(testing_tweets):
+        labels.add(label)
+        gold_results[label].add(i)
+        acc_gold_results.append(label)
+        score = vader_analyzer.polarity_scores(text)['compound']
+        if score > 0:
+            observed = 'pos'
+        else:
+            observed = 'neg'
+        num += 1
+        acc_test_results.append(observed)
+        test_results[observed].add(i)
+    metrics_results = {}
+    for label in labels:
+        accuracy_score = eval_accuracy(acc_gold_results,
+            acc_test_results)
+        metrics_results['Accuracy'] = accuracy_score
+        precision_score = eval_precision(gold_results[label],
+            test_results[label])
+        metrics_results['Precision [{0}]'.format(label)] = precision_score
+        recall_score = eval_recall(gold_results[label],
+            test_results[label])
+        metrics_results['Recall [{0}]'.format(label)] = recall_score
+        f_measure_score = eval_f_measure(gold_results[label],
+            test_results[label])
+        metrics_results['F-measure [{0}]'.format(label)] = f_measure_score
+
+    for result in sorted(metrics_results):
+        print('{0}: {1}'.format(result, metrics_results[result]))
+
+    if output:
+        output_markdown(output, Approach='Vader', Dataset='labeled_tweets',
+            Instances=n_instances, Results=metrics_results)
+
+if __name__ == '__main__':
+    from nltk.classify import NaiveBayesClassifier, MaxentClassifier
+    from nltk.classify.scikitlearn import SklearnClassifier
+    from sklearn.svm import LinearSVC
+
+    naive_bayes = NaiveBayesClassifier.train
+    svm = SklearnClassifier(LinearSVC()).train
+    maxent = MaxentClassifier.train
+
+    demo_tweets(naive_bayes)
+    # demo_movie_reviews(svm)
+    # demo_subjectivity(svm)
+    # demo_sent_subjectivity("she's an artist , but hasn't picked up a brush in a year . ")
+    # demo_liu_hu_lexicon("This movie was actually neither that funny, nor super witty.", plot=True)
+    # demo_vader_instance("This movie was actually neither that funny, nor super witty.")
+    # demo_vader_tweets()
diff --git a/nlp_resource_data/nltk/sentiment/util.pyc b/nlp_resource_data/nltk/sentiment/util.pyc
new file mode 100755 (executable)
index 0000000..7c5908b
Binary files /dev/null and b/nlp_resource_data/nltk/sentiment/util.pyc differ
diff --git a/nlp_resource_data/nltk/sentiment/vader.py b/nlp_resource_data/nltk/sentiment/vader.py
new file mode 100755 (executable)
index 0000000..2d232ba
--- /dev/null
@@ -0,0 +1,443 @@
+# coding: utf-8
+# Natural Language Toolkit: vader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: C.J. Hutto <Clayton.Hutto@gtri.gatech.edu>
+#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
+#         Pierpaolo Pantone <24alsecondo@gmail.com> (modifications)
+#         George Berry <geb97@cornell.edu> (modifications)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+# Modifications to the original VADER code have been made in order to
+# integrate it into NLTK. These have involved changes to
+# ensure Python 3 compatibility, and refactoring to achieve greater modularity.
+
+"""
+If you use the VADER sentiment analysis tools, please cite:
+
+Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
+Sentiment Analysis of Social Media Text. Eighth International Conference on
+Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
+"""
+
+import codecs
+import math
+import re
+import string
+from itertools import product
+import nltk.data
+from .util import pairwise
+
+##Constants##
+
+# (empirically derived mean sentiment intensity rating increase for booster words)
+B_INCR = 0.293
+B_DECR = -0.293
+
+# (empirically derived mean sentiment intensity rating increase for using
+# ALLCAPs to emphasize a word)
+C_INCR = 0.733
+
+N_SCALAR = -0.74
+
+# for removing punctuation
+REGEX_REMOVE_PUNCTUATION = re.compile('[{0}]'.format(re.escape(string.punctuation)))
+
+PUNC_LIST = [".", "!", "?", ",", ";", ":", "-", "'", "\"",
+             "!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?"]
+NEGATE = {"aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
+ "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
+ "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
+ "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
+ "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
+ "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
+ "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
+ "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"}
+
+# booster/dampener 'intensifiers' or 'degree adverbs'
+# http://en.wiktionary.org/wiki/Category:English_degree_adverbs
+
+BOOSTER_DICT = \
+{"absolutely": B_INCR, "amazingly": B_INCR, "awfully": B_INCR, "completely": B_INCR, "considerably": B_INCR,
+ "decidedly": B_INCR, "deeply": B_INCR, "effing": B_INCR, "enormously": B_INCR,
+ "entirely": B_INCR, "especially": B_INCR, "exceptionally": B_INCR, "extremely": B_INCR,
+ "fabulously": B_INCR, "flipping": B_INCR, "flippin": B_INCR,
+ "fricking": B_INCR, "frickin": B_INCR, "frigging": B_INCR, "friggin": B_INCR, "fully": B_INCR, "fucking": B_INCR,
+ "greatly": B_INCR, "hella": B_INCR, "highly": B_INCR, "hugely": B_INCR, "incredibly": B_INCR,
+ "intensely": B_INCR, "majorly": B_INCR, "more": B_INCR, "most": B_INCR, "particularly": B_INCR,
+ "purely": B_INCR, "quite": B_INCR, "really": B_INCR, "remarkably": B_INCR,
+ "so": B_INCR, "substantially": B_INCR,
+ "thoroughly": B_INCR, "totally": B_INCR, "tremendously": B_INCR,
+ "uber": B_INCR, "unbelievably": B_INCR, "unusually": B_INCR, "utterly": B_INCR,
+ "very": B_INCR,
+ "almost": B_DECR, "barely": B_DECR, "hardly": B_DECR, "just enough": B_DECR,
+ "kind of": B_DECR, "kinda": B_DECR, "kindof": B_DECR, "kind-of": B_DECR,
+ "less": B_DECR, "little": B_DECR, "marginally": B_DECR, "occasionally": B_DECR, "partly": B_DECR,
+ "scarcely": B_DECR, "slightly": B_DECR, "somewhat": B_DECR,
+ "sort of": B_DECR, "sorta": B_DECR, "sortof": B_DECR, "sort-of": B_DECR}
+
+# check for special case idioms using a sentiment-laden keyword known to SAGE
+SPECIAL_CASE_IDIOMS = {"the shit": 3, "the bomb": 3, "bad ass": 1.5, "yeah right": -2,
+                       "cut the mustard": 2, "kiss of death": -1.5, "hand to mouth": -2}
+
+
+##Static methods##
+
+def negated(input_words, include_nt=True):
+    """
+    Determine if input contains negation words
+    """
+    neg_words = NEGATE
+    if any(word.lower() in neg_words for word in input_words):
+        return True
+    if include_nt:
+        if any("n't" in word.lower() for word in input_words):
+            return True
+    for first, second in pairwise(input_words):
+        if second.lower() == "least" and first.lower() != 'at':
+            return True
+    return False
+
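+# Illustrative examples for `negated` (a minimal sketch):
+#
+#     negated("this is not good".split())   # -> True
+#     negated("this is good".split())       # -> False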
+
+def normalize(score, alpha=15):
+    """
+    Normalize the score to be between -1 and 1 using an alpha that
+    approximates the max expected value
+    """
+    norm_score = score/math.sqrt((score*score) + alpha)
+    return norm_score
+
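+# Illustrative example for `normalize` (a minimal sketch): with the default alpha=15,
+# a raw score of 4 maps to 4 / sqrt(4*4 + 15) = 4 / sqrt(31), roughly 0.72, and the
+# result approaches +/-1 as the raw score grows.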
+
+def allcap_differential(words):
+    """
+    Check whether just some words in the input are ALL CAPS
+
+    :param list words: The words to inspect
+    :returns: `True` if some but not all items in `words` are ALL CAPS
+    """
+    is_different = False
+    allcap_words = 0
+    for word in words:
+        if word.isupper():
+            allcap_words += 1
+    cap_differential = len(words) - allcap_words
+    if cap_differential > 0 and cap_differential < len(words):
+        is_different = True
+    return is_different
+
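+# Illustrative examples for `allcap_differential` (a minimal sketch):
+#
+#     allcap_differential(['VADER', 'is', 'SMART'])   # -> True (some, not all, caps)
+#     allcap_differential(['ALL', 'CAPS'])            # -> False (every word is caps)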
+
+def scalar_inc_dec(word, valence, is_cap_diff):
+    """
+    Check if the preceding words increase, decrease, or negate/nullify the
+    valence
+    """
+    scalar = 0.0
+    word_lower = word.lower()
+    if word_lower in BOOSTER_DICT:
+        scalar = BOOSTER_DICT[word_lower]
+        if valence < 0:
+            scalar *= -1
+        #check if booster/dampener word is in ALLCAPS (while others aren't)
+        if word.isupper() and is_cap_diff:
+            if valence > 0:
+                scalar += C_INCR
+            else: scalar -= C_INCR
+    return scalar
+
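+# Illustrative examples for `scalar_inc_dec` (a minimal sketch):
+#
+#     scalar_inc_dec('really', 1.9, False)   # -> 0.293  (booster raises the valence)
+#     scalar_inc_dec('barely', 1.9, False)   # -> -0.293 (dampener lowers the valence)
+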
+class SentiText(object):
+    """
+    Identify sentiment-relevant string-level properties of input text.
+    """
+    def __init__(self, text):
+        if not isinstance(text, str):
+            text = str(text.encode('utf-8'))
+        self.text = text
+        self.words_and_emoticons = self._words_and_emoticons()
+        # doesn't separate words from adjacent punctuation
+        # (keeps emoticons & contractions)
+        self.is_cap_diff = allcap_differential(self.words_and_emoticons)
+
+    def _words_plus_punc(self):
+        """
+        Returns mapping of form:
+        {
+            'cat,': 'cat',
+            ',cat': 'cat',
+        }
+        """
+        no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
+        # removes punctuation (but loses emoticons & contractions)
+        words_only = no_punc_text.split()
+        # remove singletons
+        words_only = set( w for w in words_only if len(w) > 1 )
+        # the product gives ('cat', ',') and (',', 'cat')
+        punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
+        punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}
+        words_punc_dict = punc_before
+        words_punc_dict.update(punc_after)
+        return words_punc_dict
+
+    def _words_and_emoticons(self):
+        """
+        Removes leading and trailing punctuation
+        Leaves contractions and most emoticons
+            Does not preserve punc-plus-letter emoticons (e.g. :D)
+        """
+        wes = self.text.split()
+        words_punc_dict = self._words_plus_punc()
+        wes = [we for we in wes if len(we) > 1]
+        for i, we in enumerate(wes):
+            if we in words_punc_dict:
+                wes[i] = words_punc_dict[we]
+        return wes
+
+class SentimentIntensityAnalyzer(object):
+    """
+    Give a sentiment intensity score to sentences.
+    """
+    def __init__(self, lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt"):
+        self.lexicon_file = nltk.data.load(lexicon_file)
+        self.lexicon = self.make_lex_dict()
+
+    def make_lex_dict(self):
+        """
+        Convert lexicon file to a dictionary
+        """
+        lex_dict = {}
+        for line in self.lexicon_file.split('\n'):
+            if not line.strip():
+                # skip blank lines (e.g. a trailing newline in the lexicon file)
+                continue
+            (word, measure) = line.strip().split('\t')[0:2]
+            lex_dict[word] = float(measure)
+        return lex_dict
+
+    def polarity_scores(self, text):
+        """
+        Return a float for sentiment strength based on the input text.
+        Positive values are positive valence, negative value are negative
+        valence.
+        """
+        sentitext = SentiText(text)
+        #text, words_and_emoticons, is_cap_diff = self.preprocess(text)
+
+        sentiments = []
+        words_and_emoticons = sentitext.words_and_emoticons
+        for item in words_and_emoticons:
+            valence = 0
+            i = words_and_emoticons.index(item)
+            if (i < len(words_and_emoticons) - 1 and item.lower() == "kind" and \
+                words_and_emoticons[i+1].lower() == "of") or \
+                item.lower() in BOOSTER_DICT:
+                sentiments.append(valence)
+                continue
+
+            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)
+
+        sentiments = self._but_check(words_and_emoticons, sentiments)
+
+        return self.score_valence(sentiments, text)
+
+    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
+        is_cap_diff = sentitext.is_cap_diff
+        words_and_emoticons = sentitext.words_and_emoticons
+        item_lowercase = item.lower()
+        if item_lowercase in self.lexicon:
+            #get the sentiment valence
+            valence = self.lexicon[item_lowercase]
+
+            #check if sentiment laden word is in ALL CAPS (while others aren't)
+            if item.isupper() and is_cap_diff:
+                if valence > 0:
+                    valence += C_INCR
+                else:
+                    valence -= C_INCR
+
+            for start_i in range(0,3):
+                if i > start_i and words_and_emoticons[i-(start_i+1)].lower() not in self.lexicon:
+                    # dampen the scalar modifier of preceding words and emoticons
+                    # (excluding the ones that immediately precede the item) based
+                    # on their distance from the current item.
+                    s = scalar_inc_dec(words_and_emoticons[i-(start_i+1)], valence, is_cap_diff)
+                    if start_i == 1 and s != 0:
+                        s = s*0.95
+                    if start_i == 2 and s != 0:
+                        s = s*0.9
+                    valence = valence+s
+                    valence = self._never_check(valence, words_and_emoticons, start_i, i)
+                    if start_i == 2:
+                        valence = self._idioms_check(valence, words_and_emoticons, i)
+
+                        # future work: consider other sentiment-laden idioms
+                        # other_idioms =
+                        # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
+                        #  "upper hand": 1, "break a leg": 2,
+                        #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
+                        #  "on the ball": 2,"under the weather": -2}
+
+            valence = self._least_check(valence, words_and_emoticons, i)
+
+        sentiments.append(valence)
+        return sentiments
+
+    def _least_check(self, valence, words_and_emoticons, i):
+        # check for negation case using "least"
+        if i > 1 and words_and_emoticons[i-1].lower() not in self.lexicon \
+           and words_and_emoticons[i-1].lower() == "least":
+            if words_and_emoticons[i-2].lower() != "at" and words_and_emoticons[i-2].lower() != "very":
+                valence = valence*N_SCALAR
+        elif i > 0 and words_and_emoticons[i-1].lower() not in self.lexicon \
+             and words_and_emoticons[i-1].lower() == "least":
+            valence = valence*N_SCALAR
+        return valence
+
+    def _but_check(self, words_and_emoticons, sentiments):
+        # check for modification in sentiment due to contrastive conjunction 'but'
+        if 'but' in words_and_emoticons or 'BUT' in words_and_emoticons:
+            try:
+                bi = words_and_emoticons.index('but')
+            except ValueError:
+                bi = words_and_emoticons.index('BUT')
+            for sentiment in sentiments:
+                si = sentiments.index(sentiment)
+                if si < bi:
+                    sentiments.pop(si)
+                    sentiments.insert(si, sentiment*0.5)
+                elif si > bi:
+                    sentiments.pop(si)
+                    sentiments.insert(si, sentiment*1.5)
+        return sentiments
+
+    def _idioms_check(self, valence, words_and_emoticons, i):
+        onezero = "{0} {1}".format(words_and_emoticons[i-1], words_and_emoticons[i])
+
+        twoonezero = "{0} {1} {2}".format(words_and_emoticons[i-2],
+                                       words_and_emoticons[i-1], words_and_emoticons[i])
+
+        twoone = "{0} {1}".format(words_and_emoticons[i-2], words_and_emoticons[i-1])
+
+        threetwoone = "{0} {1} {2}".format(words_and_emoticons[i-3],
+                                        words_and_emoticons[i-2], words_and_emoticons[i-1])
+
+        threetwo = "{0} {1}".format(words_and_emoticons[i-3], words_and_emoticons[i-2])
+
+        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]
+
+        for seq in sequences:
+            if seq in SPECIAL_CASE_IDIOMS:
+                valence = SPECIAL_CASE_IDIOMS[seq]
+                break
+
+        if len(words_and_emoticons)-1 > i:
+            zeroone = "{0} {1}".format(words_and_emoticons[i], words_and_emoticons[i+1])
+            if zeroone in SPECIAL_CASE_IDIOMS:
+                valence = SPECIAL_CASE_IDIOMS[zeroone]
+        if len(words_and_emoticons)-1 > i+1:
+            zeroonetwo = "{0} {1} {2}".format(words_and_emoticons[i], words_and_emoticons[i+1], words_and_emoticons[i+2])
+            if zeroonetwo in SPECIAL_CASE_IDIOMS:
+                valence = SPECIAL_CASE_IDIOMS[zeroonetwo]
+
+        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
+        if threetwo in BOOSTER_DICT or twoone in BOOSTER_DICT:
+            valence = valence+B_DECR
+        return valence
+
+    def _never_check(self, valence, words_and_emoticons, start_i, i):
+        if start_i == 0:
+            if negated([words_and_emoticons[i-1]]):
+                valence = valence*N_SCALAR
+        if start_i == 1:
+            if words_and_emoticons[i-2] == "never" and\
+               (words_and_emoticons[i-1] == "so" or
+                words_and_emoticons[i-1] == "this"):
+                valence = valence*1.5
+            elif negated([words_and_emoticons[i-(start_i+1)]]):
+                valence = valence*N_SCALAR
+        if start_i == 2:
+            if words_and_emoticons[i-3] == "never" and \
+               (words_and_emoticons[i-2] == "so" or words_and_emoticons[i-2] == "this") or \
+               (words_and_emoticons[i-1] == "so" or words_and_emoticons[i-1] == "this"):
+                valence = valence*1.25
+            elif negated([words_and_emoticons[i-(start_i+1)]]):
+                valence = valence*N_SCALAR
+        return valence
+
+    def _punctuation_emphasis(self, sum_s, text):
+        # add emphasis from exclamation points and question marks
+        ep_amplifier = self._amplify_ep(text)
+        qm_amplifier = self._amplify_qm(text)
+        punct_emph_amplifier = ep_amplifier+qm_amplifier
+        return punct_emph_amplifier
+
+    def _amplify_ep(self, text):
+        # check for added emphasis resulting from exclamation points (up to 4 of them)
+        ep_count = text.count("!")
+        if ep_count > 4:
+            ep_count = 4
+        # (empirically derived mean sentiment intensity rating increase for
+        # exclamation points)
+        ep_amplifier = ep_count*0.292
+        return ep_amplifier
+
+    def _amplify_qm(self, text):
+        # check for added emphasis resulting from question marks (2 or 3+)
+        qm_count = text.count("?")
+        qm_amplifier = 0
+        if qm_count > 1:
+            if qm_count <= 3:
+                # (empirically derived mean sentiment intensity rating increase for
+                # question marks)
+                qm_amplifier = qm_count*0.18
+            else:
+                qm_amplifier = 0.96
+        return qm_amplifier
+
+    def _sift_sentiment_scores(self, sentiments):
+        # want separate positive versus negative sentiment scores
+        pos_sum = 0.0
+        neg_sum = 0.0
+        neu_count = 0
+        for sentiment_score in sentiments:
+            if sentiment_score > 0:
+                pos_sum += (float(sentiment_score) +1) # compensates for neutral words that are counted as 1
+            if sentiment_score < 0:
+                neg_sum += (float(sentiment_score) -1) # when used with math.fabs(), compensates for neutrals
+            if sentiment_score == 0:
+                neu_count += 1
+        return pos_sum, neg_sum, neu_count
+
+    def score_valence(self, sentiments, text):
+        if sentiments:
+            sum_s = float(sum(sentiments))
+            # compute and add emphasis from punctuation in text
+            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
+            if sum_s > 0:
+                sum_s += punct_emph_amplifier
+            elif  sum_s < 0:
+                sum_s -= punct_emph_amplifier
+
+            compound = normalize(sum_s)
+            # discriminate between positive, negative and neutral sentiment scores
+            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)
+
+            if pos_sum > math.fabs(neg_sum):
+                pos_sum += (punct_emph_amplifier)
+            elif pos_sum < math.fabs(neg_sum):
+                neg_sum -= (punct_emph_amplifier)
+
+            total = pos_sum + math.fabs(neg_sum) + neu_count
+            pos = math.fabs(pos_sum / total)
+            neg = math.fabs(neg_sum / total)
+            neu = math.fabs(neu_count / total)
+
+        else:
+            compound = 0.0
+            pos = 0.0
+            neg = 0.0
+            neu = 0.0
+
+        sentiment_dict = \
+            {"neg" : round(neg, 3),
+             "neu" : round(neu, 3),
+             "pos" : round(pos, 3),
+             "compound" : round(compound, 4)}
+
+        return sentiment_dict
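+
+# Illustrative usage of SentimentIntensityAnalyzer (a minimal sketch; it requires the
+# vader_lexicon resource to be loadable through nltk.data):
+#
+#     sia = SentimentIntensityAnalyzer()
+#     sia.polarity_scores("VADER is smart, handsome, and funny!")
+#     # -> a dict with 'neg', 'neu', 'pos' and 'compound' scores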
diff --git a/nlp_resource_data/nltk/sentiment/vader.pyc b/nlp_resource_data/nltk/sentiment/vader.pyc
new file mode 100755 (executable)
index 0000000..04a58fb
Binary files /dev/null and b/nlp_resource_data/nltk/sentiment/vader.pyc differ
diff --git a/nlp_resource_data/nltk/stem/__init__.py b/nlp_resource_data/nltk/stem/__init__.py
new file mode 100755 (executable)
index 0000000..6886f7b
--- /dev/null
@@ -0,0 +1,31 @@
+# Natural Language Toolkit: Stemmers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
+#         Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+NLTK Stemmers
+
+Interfaces used to remove morphological affixes from words, leaving
+only the word stem.  Stemming algorithms aim to remove those affixes
+required for e.g. grammatical role, tense, or derivational morphology,
+leaving only the stem of the word.  This is a difficult problem due to
+irregular words (e.g. common verbs in English), complicated
+morphological rules, and part-of-speech and sense ambiguities
+(e.g. ``ceil-`` is not the stem of ``ceiling``).
+
+StemmerI defines a standard interface for stemmers.
+"""
+
+from nltk.stem.api import StemmerI
+from nltk.stem.regexp import RegexpStemmer
+from nltk.stem.lancaster import LancasterStemmer
+from nltk.stem.isri import ISRIStemmer
+from nltk.stem.porter import PorterStemmer
+from nltk.stem.snowball import SnowballStemmer
+from nltk.stem.wordnet import WordNetLemmatizer
+from nltk.stem.rslp import RSLPStemmer
diff --git a/nlp_resource_data/nltk/stem/__init__.pyc b/nlp_resource_data/nltk/stem/__init__.pyc
new file mode 100755 (executable)
index 0000000..02f7067
Binary files /dev/null and b/nlp_resource_data/nltk/stem/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/stem/api.py b/nlp_resource_data/nltk/stem/api.py
new file mode 100755 (executable)
index 0000000..92ab73d
--- /dev/null
@@ -0,0 +1,28 @@
+# Natural Language Toolkit: Stemmer Interface
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
+#         Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
+
+@add_metaclass(ABCMeta)
+class StemmerI(object):
+    """
+    A processing interface for removing morphological affixes from
+    words.  This process is known as stemming.
+
+    """
+    @abstractmethod
+    def stem(self, token):
+        """
+        Strip affixes from the token and return the stem.
+
+        :param token: The token that should be stemmed.
+        :type token: str
+        """
diff --git a/nlp_resource_data/nltk/stem/api.pyc b/nlp_resource_data/nltk/stem/api.pyc
new file mode 100755 (executable)
index 0000000..40f1466
Binary files /dev/null and b/nlp_resource_data/nltk/stem/api.pyc differ
diff --git a/nlp_resource_data/nltk/stem/arlstem.py b/nlp_resource_data/nltk/stem/arlstem.py
new file mode 100755 (executable)
index 0000000..81de360
--- /dev/null
@@ -0,0 +1,355 @@
+# -*- coding: utf-8 -*-
+#
+# Natural Language Toolkit: ARLSTem Stemmer
+#
+# Copyright (C) 2001-2017 NLTK Project
+#
+# Author: Kheireddine Abainia (x-programer) <k.abainia@gmail.com>
+# Algorithms: Kheireddine Abainia <k.abainia@gmail.com>
+#                         Siham Ouamour
+#                         Halim Sayoud
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+
+"""
+ARLSTem Arabic Stemmer
+The details about the implementation of this algorithm are described in:
+K. Abainia, S. Ouamour and H. Sayoud, A Novel Robust Arabic Light Stemmer,
+Journal of Experimental & Theoretical Artificial Intelligence (JETAI'17),
+Vol. 29, No. 3, 2017, pp. 557-573.
+The ARLSTem is a light Arabic stemmer that is based on removing the affixes
+from the word (i.e. prefixes, suffixes and infixes). It was evaluated and
+compared to several other stemmers using Paice's parameters (under-stemming
+index, over-stemming index and stemming weight), and the results showed that
+ARLSTem is promising and producing high performances. This stemmer is not
+based on any dictionary and can be used on-line effectively.
+"""
+from __future__ import unicode_literals
+import re
+
+from nltk.stem.api import StemmerI
+
+
+class ARLSTem(StemmerI):
+    '''
+    ARLSTem stemmer : a light Arabic Stemming algorithm without any dictionary.
+    Department of Telecommunication & Information Processing. USTHB University,
+    Algiers, Algeria.
+    ARLSTem.stem(token) returns the Arabic stem for the input token.
+    The ARLSTem Stemmer requires that all tokens be Unicode strings.
+    '''
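+
+    # Illustrative usage (a sketch, not part of the upstream class):
+    #
+    #     stemmer = ARLSTem()
+    #     stem = stemmer.stem(token)   # token: a Unicode-encoded Arabic word
+    #
+    # stem() first normalizes the token, then strips noun prefixes and the
+    # suffixes shared by nouns and verbs, reduces plural and feminine forms,
+    # and finally strips verb affixes if none of the earlier steps applied.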
+
+    def __init__(self):
+        # different Alif with hamza
+        self.re_hamzated_alif = re.compile(r'[\u0622\u0623\u0625]')
+        self.re_alifMaqsura = re.compile(r'[\u0649]')
+        self.re_diacritics = re.compile(r'[\u064B-\u065F]')
+
+        # Alif Laam, Laam Laam, Fa Laam, Fa Ba
+        self.pr2 = [
+            '\u0627\u0644', '\u0644\u0644',
+            '\u0641\u0644', '\u0641\u0628'
+            ]
+        # Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam
+        self.pr3 = [
+            '\u0628\u0627\u0644',
+            '\u0643\u0627\u0644',
+            '\u0648\u0627\u0644'
+            ]
+        # Fa Laam Laam, Waaw Laam Laam
+        self.pr32 = ['\u0641\u0644\u0644', '\u0648\u0644\u0644']
+        # Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam
+        self.pr4 = [
+            '\u0641\u0628\u0627\u0644',
+            '\u0648\u0628\u0627\u0644',
+            '\u0641\u0643\u0627\u0644'
+            ]
+
+        # Kaf Yaa, Kaf Miim
+        self.su2 = [
+            '\u0643\u064A',
+            '\u0643\u0645'
+            ]
+        # Ha Alif, Ha Miim
+        self.su22 = ['\u0647\u0627', '\u0647\u0645']
+        # Kaf Miim Alif, Kaf Noon Shadda
+        self.su3 = ['\u0643\u0645\u0627', '\u0643\u0646\u0651']
+        # Ha Miim Alif, Ha Noon Shadda
+        self.su32 = ['\u0647\u0645\u0627', '\u0647\u0646\u0651']
+
+        # Alif Noon, Ya Noon, Waaw Noon
+        self.pl_si2 = ['\u0627\u0646', '\u064A\u0646', '\u0648\u0646']
+        # Taa Alif Noon, Taa Ya Noon
+        self.pl_si3 = ['\u062A\u0627\u0646', '\u062A\u064A\u0646']
+
+        # Alif Noon, Waaw Noon
+        self.verb_su2 = ['\u0627\u0646', '\u0648\u0646']
+        # Siin Taa, Siin Yaa
+        self.verb_pr2 = ['\u0633\u062A', '\u0633\u064A']
+        # Siin Alif, Siin Noon
+        self.verb_pr22 = ['\u0633\u0627', '\u0633\u0646']
+
+        # Taa Miim Alif, Taa Noon Shadda
+        self.verb_suf3 = ['\u062A\u0645\u0627', '\u062A\u0646\u0651']
+        # Noon Alif, Taa Miim, Taa Alif, Waaw Alif
+        self.verb_suf2 = [
+            '\u0646\u0627', '\u062A\u0645',
+            '\u062A\u0627', '\u0648\u0627'
+            ]
+        # Taa, Alif, Noon
+        self.verb_suf1 = ['\u062A', '\u0627', '\u0646']
+
+    def stem(self, token):
+        """
+            call this function to get the word's stem based on ARLSTem.
+        """
+        try:
+            if token is None:
+                raise ValueError("The word could not be stemmed, because \
+                                 it is empty !")
+            # remove Arabic diacritics and replace some letters with others
+            token = self.norm(token)
+            # strip common prefixes of the nouns
+            pre = self.pref(token)
+            if pre is not None:
+                token = pre
+            # strip the suffixes which are common to nouns and verbs
+            token = self.suff(token)
+            # transform a plural noun to a singular noun
+            ps = self.plur2sing(token)
+            if ps is None:
+                # transform from the feminine form to the masculine form
+                fm = self.fem2masc(token)
+                if fm is not None:
+                    return fm
+                else:
+                    if pre is None:  # if the prefixes are not stripped
+                        # strip the verb prefixes and suffixes
+                        return self.verb(token)
+            else:
+                return ps
+            return token
+        except ValueError as e:
+            print(e)
+
+    def norm(self, token):
+        """
+            normalize the word by removing diacritics, replacing hamzated Alif
+            with bare Alif, replacing Alif Maqsura with Yaa, and removing Waaw
+            at the beginning.
+        """
+        # strip Arabic diacritics
+        token = self.re_diacritics.sub('', token)
+        # replace Hamzated Alif with Alif bare
+        token = self.re_hamzated_alif.sub('\u0627', token)
+        # replace alifMaqsura with Yaa
+        token = self.re_alifMaqsura.sub('\u064A', token)
+        # strip the Waaw from the word beginning if the remaining is 3 letters
+        # at least
+        if token.startswith('\u0648') and len(token) > 3:
+            token = token[1:]
+        return token
+
+    def pref(self, token):
+        """
+            remove prefixes from the word's beginning.
+        """
+        if len(token) > 5:
+            for p3 in self.pr3:
+                if token.startswith(p3):
+                    return token[3:]
+        if len(token) > 6:
+            for p4 in self.pr4:
+                if token.startswith(p4):
+                    return token[4:]
+        if len(token) > 5:
+            for p3 in self.pr32:
+                if token.startswith(p3):
+                    return token[3:]
+        if len(token) > 4:
+            for p2 in self.pr2:
+                if token.startswith(p2):
+                    return token[2:]
+
+    def suff(self, token):
+        """
+            remove suffixes from the word's end.
+        """
+        if token.endswith('\u0643') and len(token) > 3:
+            return token[:-1]
+        if len(token) > 4:
+            for s2 in self.su2:
+                if token.endswith(s2):
+                    return token[:-2]
+        if len(token) > 5:
+            for s3 in self.su3:
+                if token.endswith(s3):
+                    return token[:-3]
+        if token.endswith('\u0647') and len(token) > 3:
+            token = token[:-1]
+            return token
+        if len(token) > 4:
+            for s2 in self.su22:
+                if token.endswith(s2):
+                    return token[:-2]
+        if len(token) > 5:
+            for s3 in self.su32:
+                if token.endswith(s3):
+                    return token[:-3]
+        if token.endswith('\u0646\u0627') and len(token) > 4:
+            return token[:-2]
+        return token
+
+    def fem2masc(self, token):
+        """
+            transform the word from the feminine form to the masculine form.
+        """
+        if token.endswith('\u0629') and len(token) > 3:
+            return token[:-1]
+
+    def plur2sing(self, token):
+        """
+            transform the word from the plural form to the singular form.
+        """
+        if len(token) > 4:
+            for ps2 in self.pl_si2:
+                if token.endswith(ps2):
+                    return token[:-2]
+        if len(token) > 5:
+            for ps3 in self.pl_si3:
+                if token.endswith(ps3):
+                    return token[:-3]
+        if len(token) > 3 and token.endswith('\u0627\u062A'):
+            return token[:-2]
+        if (len(token) > 3 and token.startswith('\u0627')
+           and token[2] == '\u0627'):
+            return token[:2] + token[3:]
+        if (len(token) > 4 and token.startswith('\u0627')
+           and token[-2] == '\u0627'):
+            return token[1:-2] + token[-1]
+
+    def verb(self, token):
+        """
+            stem the verb prefixes and suffixes or both
+        """
+        vb = self.verb_t1(token)
+        if vb is not None:
+            return vb
+        vb = self.verb_t2(token)
+        if vb is not None:
+            return vb
+        vb = self.verb_t3(token)
+        if vb is not None:
+            return vb
+        vb = self.verb_t4(token)
+        if vb is not None:
+            return vb
+        return self.verb_t5(token)
+
+    def verb_t1(self, token):
+        """
+            stem the present prefixes and suffixes
+        """
+        if len(token) > 5 and token.startswith('\u062A'):  # Taa
+            for s2 in self.pl_si2:
+                if token.endswith(s2):
+                    return token[1:-2]
+        if len(token) > 5 and token.startswith('\u064A'):  # Yaa
+            for s2 in self.verb_su2:
+                if token.endswith(s2):
+                    return token[1:-2]
+        if len(token) > 4 and token.startswith('\u0627'):  # Alif
+            # Waaw Alif
+            if len(token) > 5 and token.endswith('\u0648\u0627'):
+                return token[1:-2]
+            # Yaa
+            if token.endswith('\u064A'):
+                return token[1:-1]
+            # Alif
+            if token.endswith('\u0627'):
+                return token[1:-1]
+            # Noon
+            if token.endswith('\u0646'):
+                return token[1:-1]
+        # ^Yaa, Noon$
+        if (len(token) > 4
+           and token.startswith('\u064A')
+           and token.endswith('\u0646')):
+            return token[1:-1]
+        # ^Taa, Noon$
+        if (len(token) > 4
+           and token.startswith('\u062A')
+           and token.endswith('\u0646')):
+            return token[1:-1]
+
+    def verb_t2(self, token):
+        """
+            stem the future prefixes and suffixes
+        """
+        if len(token) > 6:
+            for s2 in self.pl_si2:
+                # ^Siin Taa
+                if (token.startswith(self.verb_pr2[0])
+                   and token.endswith(s2)):
+                    return token[2:-2]
+            # ^Siin Yaa, Alif Noon$
+            if (token.startswith(self.verb_pr2[1])
+               and token.endswith(self.pl_si2[0])):
+                return token[2:-2]
+            # ^Siin Yaa, Waaw Noon$
+            if (token.startswith(self.verb_pr2[1])
+               and token.endswith(self.pl_si2[2])):
+                return token[2:-2]
+        # ^Siin Taa, Noon$
+        if (len(token) > 5
+           and token.startswith(self.verb_pr2[0])
+           and token.endswith('\u0646')):
+            return token[2:-1]
+        # ^Siin Yaa, Noon$
+        if (len(token) > 5
+           and token.startswith(self.verb_pr2[1])
+           and token.endswith('\u0646')):
+            return token[2:-1]
+
+    def verb_t3(self, token):
+        """
+            stem the present suffixes
+        """
+        if len(token) > 5:
+            for su3 in self.verb_suf3:
+                if token.endswith(su3):
+                    return token[:-3]
+        if len(token) > 4:
+            for su2 in self.verb_suf2:
+                if token.endswith(su2):
+                    return token[:-2]
+        if len(token) > 3:
+            for su1 in self.verb_suf1:
+                if token.endswith(su1):
+                    return token[:-1]
+
+    def verb_t4(self, token):
+        """
+            stem the present prefixes
+        """
+        if len(token) > 3:
+            for pr1 in self.verb_suf1:
+                if token.startswith(pr1):
+                    return token[1:]
+            if token.startswith('\u064A'):
+                return token[1:]
+
+    def verb_t5(self, token):
+        """
+            stem the future prefixes
+        """
+        if len(token) > 4:
+            for pr2 in self.verb_pr22:
+                if token.startswith(pr2):
+                    return token[2:]
+            for pr2 in self.verb_pr2:
+                if token.startswith(pr2):
+                    return token[2:]
+        return token
diff --git a/nlp_resource_data/nltk/stem/arlstem.pyc b/nlp_resource_data/nltk/stem/arlstem.pyc
new file mode 100755 (executable)
index 0000000..169b8fd
Binary files /dev/null and b/nlp_resource_data/nltk/stem/arlstem.pyc differ
diff --git a/nlp_resource_data/nltk/stem/isri.py b/nlp_resource_data/nltk/stem/isri.py
new file mode 100755 (executable)
index 0000000..44c187a
--- /dev/null
@@ -0,0 +1,345 @@
+# -*- coding: utf-8 -*-
+#
+# Natural Language Toolkit: The ISRI Arabic Stemmer
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Algorithm: Kazem Taghva, Rania Elkhoury, and Jeffrey Coombs (2005)
+# Author: Hosam Algasaier <hosam_hme@yahoo.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+ISRI Arabic Stemmer
+
+The algorithm for this stemmer is described in:
+
+Taghva, K., Elkhoury, R., and Coombs, J. 2005. Arabic Stemming without a root dictionary.
+Information Science Research Institute. University of Nevada, Las Vegas, USA.
+
+The Information Science Research Institute’s (ISRI) Arabic stemmer shares many features
+with the Khoja stemmer. However, the main difference is that the ISRI stemmer does not use a
+root dictionary. Also, if a root is not found, the ISRI stemmer returns the normalized form,
+rather than returning the original unmodified word.
+
+Additional adjustments were made to improve the algorithm:
+
+1- Adding 60 stop words.
+2- Adding the pattern (تفاعيل) to the ISRI pattern set.
+3- Step 2 of the original algorithm normalized all hamza. This step was discarded because it
+increases word ambiguity and changes the original root.
+
+"""
+from __future__ import unicode_literals
+import re
+
+from nltk.stem.api import StemmerI
+
+
+class ISRIStemmer(StemmerI):
+    '''
+    ISRI Arabic stemmer, based on the algorithm: Arabic Stemming without a root dictionary.
+    Information Science Research Institute. University of Nevada, Las Vegas, USA.
+
+    A few minor modifications have been made to the basic ISRI algorithm.
+    See the source code of this module for more information.
+
+    isri.stem(token) returns the Arabic root for the given token.
+
+    The ISRI Stemmer requires that all tokens have Unicode string types.
+    If you use Python IDLE on Arabic Windows, you have to decode the text first
+    using the Arabic '1256' encoding.
+    '''
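+
+    # Illustrative usage (a sketch, not part of the upstream class):
+    #
+    #     stemmer = ISRIStemmer()
+    #     root = stemmer.stem(token)   # token: a Unicode Arabic word
+    #
+    # stem() strips diacritics, common prefixes and suffixes, then applies the
+    # pattern-matching steps for words of length 4 to 7; stop words are
+    # returned unchanged.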
+
+    def __init__(self):
+        # length three prefixes
+        self.p3 = ['\u0643\u0627\u0644', '\u0628\u0627\u0644',
+                   '\u0648\u0644\u0644', '\u0648\u0627\u0644']
+
+        # length two prefixes
+        self.p2 = ['\u0627\u0644', '\u0644\u0644']
+
+        # length one prefixes
+        self.p1 = ['\u0644', '\u0628', '\u0641', '\u0633', '\u0648',
+                   '\u064a', '\u062a', '\u0646', '\u0627']
+
+        # length three suffixes
+        self.s3 = ['\u062a\u0645\u0644', '\u0647\u0645\u0644',
+                   '\u062a\u0627\u0646', '\u062a\u064a\u0646',
+                   '\u0643\u0645\u0644']
+
+        # length two suffixes
+        self.s2 = ['\u0648\u0646', '\u0627\u062a', '\u0627\u0646',
+                   '\u064a\u0646', '\u062a\u0646', '\u0643\u0645',
+                   '\u0647\u0646', '\u0646\u0627', '\u064a\u0627',
+                   '\u0647\u0627', '\u062a\u0645', '\u0643\u0646',
+                   '\u0646\u064a', '\u0648\u0627', '\u0645\u0627',
+                   '\u0647\u0645']
+
+        # length one suffixes
+        self.s1 = ['\u0629', '\u0647', '\u064a', '\u0643', '\u062a',
+                   '\u0627', '\u0646']
+
+        # groups of length four patterns
+        self.pr4 = {0: ['\u0645'], 1: ['\u0627'],
+                    2: ['\u0627', '\u0648', '\u064A'], 3: ['\u0629']}
+
+        # Groups of length five patterns and length three roots
+        self.pr53 = {0: ['\u0627', '\u062a'],
+                     1: ['\u0627', '\u064a', '\u0648'],
+                     2: ['\u0627', '\u062a', '\u0645'],
+                     3: ['\u0645', '\u064a', '\u062a'],
+                     4: ['\u0645', '\u062a'],
+                     5: ['\u0627', '\u0648'],
+                     6: ['\u0627', '\u0645']}
+
+        self.re_short_vowels = re.compile(r'[\u064B-\u0652]')
+        self.re_hamza = re.compile(r'[\u0621\u0624\u0626]')
+        self.re_initial_hamza = re.compile(r'^[\u0622\u0623\u0625]')
+
+        self.stop_words = ['\u064a\u0643\u0648\u0646',
+                           '\u0648\u0644\u064a\u0633',
+                           '\u0648\u0643\u0627\u0646',
+                           '\u0643\u0630\u0644\u0643',
+                           '\u0627\u0644\u062a\u064a',
+                           '\u0648\u0628\u064a\u0646',
+                           '\u0639\u0644\u064a\u0647\u0627',
+                           '\u0645\u0633\u0627\u0621',
+                           '\u0627\u0644\u0630\u064a',
+                           '\u0648\u0643\u0627\u0646\u062a',
+                           '\u0648\u0644\u0643\u0646',
+                           '\u0648\u0627\u0644\u062a\u064a',
+                           '\u062a\u0643\u0648\u0646',
+                           '\u0627\u0644\u064a\u0648\u0645',
+                           '\u0627\u0644\u0644\u0630\u064a\u0646',
+                           '\u0639\u0644\u064a\u0647',
+                           '\u0643\u0627\u0646\u062a',
+                           '\u0644\u0630\u0644\u0643',
+                           '\u0623\u0645\u0627\u0645',
+                           '\u0647\u0646\u0627\u0643',
+                           '\u0645\u0646\u0647\u0627',
+                           '\u0645\u0627\u0632\u0627\u0644',
+                           '\u0644\u0627\u0632\u0627\u0644',
+                           '\u0644\u0627\u064a\u0632\u0627\u0644',
+                           '\u0645\u0627\u064a\u0632\u0627\u0644',
+                           '\u0627\u0635\u0628\u062d',
+                           '\u0623\u0635\u0628\u062d',
+                           '\u0623\u0645\u0633\u0649',
+                           '\u0627\u0645\u0633\u0649',
+                           '\u0623\u0636\u062d\u0649',
+                           '\u0627\u0636\u062d\u0649',
+                           '\u0645\u0627\u0628\u0631\u062d',
+                           '\u0645\u0627\u0641\u062a\u0626',
+                           '\u0645\u0627\u0627\u0646\u0641\u0643',
+                           '\u0644\u0627\u0633\u064a\u0645\u0627',
+                           '\u0648\u0644\u0627\u064a\u0632\u0627\u0644',
+                           '\u0627\u0644\u062d\u0627\u0644\u064a',
+                           '\u0627\u0644\u064a\u0647\u0627',
+                           '\u0627\u0644\u0630\u064a\u0646',
+                           '\u0641\u0627\u0646\u0647',
+                           '\u0648\u0627\u0644\u0630\u064a',
+                           '\u0648\u0647\u0630\u0627',
+                           '\u0644\u0647\u0630\u0627',
+                           '\u0641\u0643\u0627\u0646',
+                           '\u0633\u062a\u0643\u0648\u0646',
+                           '\u0627\u0644\u064a\u0647',
+                           '\u064a\u0645\u0643\u0646',
+                           '\u0628\u0647\u0630\u0627',
+                           '\u0627\u0644\u0630\u0649']
+
+    def stem(self, token):
+        """
+        Stemming a word token using the ISRI stemmer.
+        """
+        token = self.norm(token, 1)   # remove diacritics which represent Arabic short vowels
+        if token in self.stop_words:
+            return token              # exclude stop words from being processed
+        token = self.pre32(token)     # remove length three and length two prefixes in this order
+        token = self.suf32(token)     # remove length three and length two suffixes in this order
+        token = self.waw(token)       # remove connective ‘و’ if it precedes a word beginning with ‘و’
+        token = self.norm(token, 2)   # normalize initial hamza to bare alif
+        # if 4 <= word length <= 7, then stem; otherwise, no stemming
+        if len(token) == 4:           # length 4 word
+            token = self.pro_w4(token)
+        elif len(token) == 5:         # length 5 word
+            token = self.pro_w53(token)
+            token = self.end_w5(token)
+        elif len(token) == 6:         # length 6 word
+            token = self.pro_w6(token)
+            token = self.end_w6(token)
+        elif len(token) == 7:         # length 7 word
+            token = self.suf1(token)
+            if len(token) == 7:
+                token = self.pre1(token)
+            if len(token) == 6:
+                token = self.pro_w6(token)
+                token = self.end_w6(token)
+        return token
+
+    def norm(self, word, num=3):
+        """
+        normalization:
+        num=1  normalize diacritics
+        num=2  normalize initial hamza
+        num=3  both 1&2
+        """
+        if num == 1:
+            word = self.re_short_vowels.sub('', word)
+        elif num == 2:
+            word = self.re_initial_hamza.sub('\u0627', word)
+        elif num == 3:
+            word = self.re_short_vowels.sub('', word)
+            word = self.re_initial_hamza.sub('\u0627', word)
+        return word
+
+    def pre32(self, word):
+        """remove length three and length two prefixes in this order"""
+        if len(word) >= 6:
+            for pre3 in self.p3:
+                if word.startswith(pre3):
+                    return word[3:]
+        if len(word) >= 5:
+            for pre2 in self.p2:
+                if word.startswith(pre2):
+                    return word[2:]
+        return word
+
+    def suf32(self, word):
+        """remove length three and length two suffixes in this order"""
+        if len(word) >= 6:
+            for suf3 in self.s3:
+                if word.endswith(suf3):
+                    return word[:-3]
+        if len(word) >= 5:
+            for suf2 in self.s2:
+                if word.endswith(suf2):
+                    return word[:-2]
+        return word
+
+    def waw(self, word):
+        """remove connective ‘و’ if it precedes a word beginning with ‘و’ """
+        if len(word) >= 4 and word[:2] == '\u0648\u0648':
+            word = word[1:]
+        return word
+
+    def pro_w4(self, word):
+        """process length four patterns and extract length three roots"""
+        if word[0] in self.pr4[0]:      # مفعل
+            word = word[1:]
+        elif word[1] in self.pr4[1]:    # فاعل
+            word = word[:1] + word[2:]
+        elif word[2] in self.pr4[2]:    # فعال - فعول - فعيل
+            word = word[:2] + word[3]
+        elif word[3] in self.pr4[3]:    # فعلة
+            word = word[:-1]
+        else:
+            word = self.suf1(word)      # normalize short suffix
+            if len(word) == 4:
+                word = self.pre1(word)  # normalize short prefix
+        return word
+
+    def pro_w53(self, word):
+        """process length five patterns and extract length three roots"""
+        if word[2] in self.pr53[0] and word[0] == '\u0627':    # افتعل - افاعل
+            word = word[1] + word[3:]
+        elif word[3] in self.pr53[1] and word[0] == '\u0645':  # مفعول - مفعال - مفعيل
+            word = word[1:3] + word[4]
+        elif word[0] in self.pr53[2] and word[4] == '\u0629':  # مفعلة - تفعلة - افعلة
+            word = word[1:4]
+        elif word[0] in self.pr53[3] and word[2] == '\u062a':  # مفتعل - يفتعل - تفتعل
+            word = word[1] + word[3:]
+        elif word[0] in self.pr53[4] and word[2] == '\u0627':  # مفاعل - تفاعل
+            word = word[1] + word[3:]
+        elif word[2] in self.pr53[5] and word[4] == '\u0629':  # فعولة - فعالة
+            word = word[:2] + word[3]
+        elif word[0] in self.pr53[6] and word[1] == '\u0646':  # انفعل - منفعل
+            word = word[2:]
+        elif word[3] == '\u0627' and word[0] == '\u0627':      # افعال
+            word = word[1:3] + word[4]
+        elif word[4] == '\u0646' and word[3] == '\u0627':      # فعلان
+            word = word[:3]
+        elif word[3] == '\u064a' and word[0] == '\u062a':      # تفعيل
+            word = word[1:3] + word[4]
+        elif word[3] == '\u0648' and word[1] == '\u0627':      # فاعول
+            word = word[0] + word[2] + word[4]
+        elif word[2] == '\u0627' and word[1] == '\u0648':      # فواعل
+            word = word[0] + word[3:]
+        elif word[3] == '\u0626' and word[2] == '\u0627':      # فعائل
+            word = word[:2] + word[4]
+        elif word[4] == '\u0629' and word[1] == '\u0627':      # فاعلة
+            word = word[0] + word[2:4]
+        elif word[4] == '\u064a' and word[2] == '\u0627':      # فعالي
+            word = word[:2] + word[3]
+        else:
+            word = self.suf1(word)      # normalize short suffix
+            if len(word) == 5:
+                word = self.pre1(word)  # normalize short prefix
+        return word
+
+    def pro_w54(self, word):
+        """process length five patterns and extract length four roots"""
+        if word[0] in self.pr53[2]:  # تفعلل - افعلل - مفعلل
+            word = word[1:]
+        elif word[4] == '\u0629':    # فعللة
+            word = word[:4]
+        elif word[2] == '\u0627':    # فعالل
+            word = word[:2] + word[3:]
+        return word
+
+    def end_w5(self, word):
+        """ending step (word of length five)"""
+        if len(word) == 4:
+            word = self.pro_w4(word)
+        elif len(word) == 5:
+            word = self.pro_w54(word)
+        return word
+
+    def pro_w6(self, word):
+        """process length six patterns and extract length three roots"""
+        if word.startswith('\u0627\u0633\u062a') or word.startswith('\u0645\u0633\u062a'):  # مستفعل - استفعل
+            word = word[3:]
+        elif word[0] == '\u0645' and word[3] == '\u0627' and word[5] == '\u0629':           # مفعالة
+            word = word[1:3] + word[4]
+        elif word[0] == '\u0627' and word[2] == '\u062a' and word[4] == '\u0627':           # افتعال
+            word = word[1] + word[3] + word[5]
+        elif word[0] == '\u0627' and word[3] == '\u0648' and word[2] == word[4]:            # افعوعل
+            word = word[1] + word[4:]
+        elif word[0] == '\u062a' and word[2] == '\u0627' and word[4] == '\u064a':           # تفاعيل   new pattern
+            word = word[1] + word[3] + word[5]
+        else:
+            word = self.suf1(word)      # normalize short suffix
+            if len(word) == 6:
+                word = self.pre1(word)  # normalize short prefix
+        return word
+
+    def pro_w64(self, word):
+        """process length six patterns and extract length four roots"""
+        if word[0] == '\u0627' and word[4] == '\u0627':  # افعلال
+            word = word[1:4] + word[5]
+        elif word.startswith('\u0645\u062a'):            # متفعلل
+            word = word[2:]
+        return word
+
+    def end_w6(self, word):
+        """ending step (word of length six)"""
+        if len(word) == 5:
+            word = self.pro_w53(word)
+            word = self.end_w5(word)
+        elif len(word) == 6:
+            word = self.pro_w64(word)
+        return word
+
+    def suf1(self, word):
+        """normalize short sufix"""
+        for sf1 in self.s1:
+            if word.endswith(sf1):
+                return word[:-1]
+        return word
+
+    def pre1(self, word):
+        """normalize short prefix"""
+        for sp1 in self.p1:
+            if word.startswith(sp1):
+                return word[1:]
+        return word
+
+
diff --git a/nlp_resource_data/nltk/stem/isri.pyc b/nlp_resource_data/nltk/stem/isri.pyc
new file mode 100755 (executable)
index 0000000..64a9bb7
Binary files /dev/null and b/nlp_resource_data/nltk/stem/isri.pyc differ
diff --git a/nlp_resource_data/nltk/stem/lancaster.py b/nlp_resource_data/nltk/stem/lancaster.py
new file mode 100755 (executable)
index 0000000..e7e3b47
--- /dev/null
@@ -0,0 +1,337 @@
+# Natural Language Toolkit: Stemmers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Tomcavage <stomcava@law.upenn.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
+Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
+"""
+from __future__ import unicode_literals
+import re
+
+from nltk.stem.api import StemmerI
+from nltk.compat import python_2_unicode_compatible
+
+@python_2_unicode_compatible
+class LancasterStemmer(StemmerI):
+    """
+    Lancaster Stemmer
+
+        >>> from nltk.stem.lancaster import LancasterStemmer
+        >>> st = LancasterStemmer()
+        >>> st.stem('maximum')     # Remove "-um" when word is intact
+        'maxim'
+        >>> st.stem('presumably')  # Don't remove "-um" when word is not intact
+        'presum'
+        >>> st.stem('multiply')    # No action taken if word ends with "-ply"
+        'multiply'
+        >>> st.stem('provision')   # Replace "-sion" with "-j" to trigger "j" set of rules
+        'provid'
+        >>> st.stem('owed')        # Word starting with vowel must contain at least 2 letters
+        'ow'
+        >>> st.stem('ear')         # ditto
+        'ear'
+        >>> st.stem('saying')      # Words starting with consonant must contain at least 3
+        'say'
+        >>> st.stem('crying')      #     letters and one of those letters must be a vowel
+        'cry'
+        >>> st.stem('string')      # ditto
+        'string'
+        >>> st.stem('meant')       # ditto
+        'meant'
+        >>> st.stem('cement')      # ditto
+        'cem'
+        >>> st_pre = LancasterStemmer(strip_prefix_flag=True)
+        >>> st_pre.stem('kilometer') # Test Prefix
+        'met'
+        >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))
+        >>> st_custom.stem("ness") # Change s to t
+        'nest'
+    """
+
+    # The rule list is static since it doesn't change between instances
+    default_rule_tuple = (
+        "ai*2.",     # -ia > -   if intact
+        "a*1.",      # -a > -    if intact
+        "bb1.",      # -bb > -b
+        "city3s.",   # -ytic > -ys
+        "ci2>",      # -ic > -
+        "cn1t>",     # -nc > -nt
+        "dd1.",      # -dd > -d
+        "dei3y>",    # -ied > -y
+        "deec2ss.",  # -ceed >", -cess
+        "dee1.",     # -eed > -ee
+        "de2>",      # -ed > -
+        "dooh4>",    # -hood > -
+        "e1>",       # -e > -
+        "feil1v.",   # -lief > -liev
+        "fi2>",      # -if > -
+        "gni3>",     # -ing > -
+        "gai3y.",    # -iag > -y
+        "ga2>",      # -ag > -
+        "gg1.",      # -gg > -g
+        "ht*2.",     # -th > -   if intact
+        "hsiug5ct.", # -guish > -ct
+        "hsi3>",     # -ish > -
+        "i*1.",      # -i > -    if intact
+        "i1y>",      # -i > -y
+        "ji1d.",     # -ij > -id   --  see nois4j> & vis3j>
+        "juf1s.",    # -fuj > -fus
+        "ju1d.",     # -uj > -ud
+        "jo1d.",     # -oj > -od
+        "jeh1r.",    # -hej > -her
+        "jrev1t.",   # -verj > -vert
+        "jsim2t.",   # -misj > -mit
+        "jn1d.",     # -nj > -nd
+        "j1s.",      # -j > -s
+        "lbaifi6.",  # -ifiabl > -
+        "lbai4y.",   # -iabl > -y
+        "lba3>",     # -abl > -
+        "lbi3.",     # -ibl > -
+        "lib2l>",    # -bil > -bl
+        "lc1.",      # -cl > c
+        "lufi4y.",   # -iful > -y
+        "luf3>",     # -ful > -
+        "lu2.",      # -ul > -
+        "lai3>",     # -ial > -
+        "lau3>",     # -ual > -
+        "la2>",      # -al > -
+        "ll1.",      # -ll > -l
+        "mui3.",     # -ium > -
+        "mu*2.",     # -um > -   if intact
+        "msi3>",     # -ism > -
+        "mm1.",      # -mm > -m
+        "nois4j>",   # -sion > -j
+        "noix4ct.",  # -xion > -ct
+        "noi3>",     # -ion > -
+        "nai3>",     # -ian > -
+        "na2>",      # -an > -
+        "nee0.",     # protect  -een
+        "ne2>",      # -en > -
+        "nn1.",      # -nn > -n
+        "pihs4>",    # -ship > -
+        "pp1.",      # -pp > -p
+        "re2>",      # -er > -
+        "rae0.",     # protect  -ear
+        "ra2.",      # -ar > -
+        "ro2>",      # -or > -
+        "ru2>",      # -ur > -
+        "rr1.",      # -rr > -r
+        "rt1>",      # -tr > -t
+        "rei3y>",    # -ier > -y
+        "sei3y>",    # -ies > -y
+        "sis2.",     # -sis > -s
+        "si2>",      # -is > -
+        "ssen4>",    # -ness > -
+        "ss0.",      # protect  -ss
+        "suo3>",     # -ous > -
+        "su*2.",     # -us > -   if intact
+        "s*1>",      # -s > -    if intact
+        "s0.",       # -s > -s
+        "tacilp4y.", # -plicat > -ply
+        "ta2>",      # -at > -
+        "tnem4>",    # -ment > -
+        "tne3>",     # -ent > -
+        "tna3>",     # -ant > -
+        "tpir2b.",   # -ript > -rib
+        "tpro2b.",   # -orpt > -orb
+        "tcud1.",    # -duct > -duc
+        "tpmus2.",   # -sumpt > -sum
+        "tpec2iv.",  # -cept > -ceiv
+        "tulo2v.",   # -olut > -olv
+        "tsis0.",    # protect  -sist
+        "tsi3>",     # -ist > -
+        "tt1.",      # -tt > -t
+        "uqi3.",     # -iqu > -
+        "ugo1.",     # -ogu > -og
+        "vis3j>",    # -siv > -j
+        "vie0.",     # protect  -eiv
+        "vi2>",      # -iv > -
+        "ylb1>",     # -bly > -bl
+        "yli3y>",    # -ily > -y
+        "ylp0.",     # protect  -ply
+        "yl2>",      # -ly > -
+        "ygo1.",     # -ogy > -og
+        "yhp1.",     # -phy > -ph
+        "ymo1.",     # -omy > -om
+        "ypo1.",     # -opy > -op
+        "yti3>",     # -ity > -
+        "yte3>",     # -ety > -
+        "ytl2.",     # -lty > -l
+        "yrtsi5.",   # -istry > -
+        "yra3>",     # -ary > -
+        "yro3>",     # -ory > -
+        "yfi3.",     # -ify > -
+        "ycn2t>",    # -ncy > -nt
+        "yca3>",     # -acy > -
+        "zi2>",      # -iz > -
+        "zy1s."      # -yz > -ys
+    )
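+    # Anatomy of a rule string, as parsed in __doStemming() below (an
+    # explanatory note added for this resource drop):
+    #
+    #     "ssen4>"  - ending "ssen" (the word ends in "ness", stored reversed),
+    #                 no '*' so the word need not be intact, remove 4 letters,
+    #                 append nothing, and '>' means keep applying rules.
+    #     "mu*2."   - '*' means the word must still be intact, ending "um",
+    #                 remove 2 letters, and '.' means stop after this rule.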
+
+    def __init__(self, rule_tuple=None, strip_prefix_flag=False):
+        """Create an instance of the Lancaster stemmer.
+        """
+        # Setup an empty rule dictionary - this will be filled in later
+        self.rule_dictionary = {}
+        # Check if a user wants to strip prefix
+        self._strip_prefix = strip_prefix_flag
+        # Check if a user wants to use his/her own rule tuples.
+        self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple
+
+    def parseRules(self, rule_tuple=None):
+        """Validate the set of rules used in this stemmer.
+
+        If this function is called as an individual method, without using stem
+        method, rule_tuple argument will be compiled into self.rule_dictionary.
+        If this function is called within stem, self._rule_tuple will be used.
+
+        """
+        # If there is no argument for the function, use class' own rule tuple.
+        rule_tuple = rule_tuple if rule_tuple else self._rule_tuple
+        valid_rule = re.compile("^[a-z]+\*?\d[a-z]*[>\.]?$")
+        # Empty any old rules from the rule set before adding new ones
+        self.rule_dictionary = {}
+
+        for rule in rule_tuple:
+            if not valid_rule.match(rule):
+                raise ValueError("The rule {0} is invalid".format(rule))
+            first_letter = rule[0:1]
+            if first_letter in self.rule_dictionary:
+                self.rule_dictionary[first_letter].append(rule)
+            else:
+                self.rule_dictionary[first_letter] = [rule]
+
+    def stem(self, word):
+        """Stem a word using the Lancaster stemmer.
+        """
+        # Lower-case the word, since all the rules are lower-cased
+        word = word.lower()
+        word = self.__stripPrefix(word) if self._strip_prefix else word
+
+        # Save a copy of the original word
+        intact_word = word
+
+        # If rule dictionary is empty, parse rule tuple.
+        if not self.rule_dictionary:
+            self.parseRules()
+
+        return self.__doStemming(word, intact_word)
+
+    def __doStemming(self, word, intact_word):
+        """Perform the actual word stemming
+        """
+
+        valid_rule = re.compile("^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")
+
+        proceed = True
+
+        while proceed:
+
+            # Find the position of the last letter of the word to be stemmed
+            last_letter_position = self.__getLastLetter(word)
+
+            # Only stem the word if it has a last letter and a rule matching that last letter
+            if last_letter_position < 0 or word[last_letter_position] not in self.rule_dictionary:
+                proceed = False
+
+            else:
+                rule_was_applied = False
+
+                # Go through each rule that matches the word's final letter
+                for rule in self.rule_dictionary[word[last_letter_position]]:
+                    rule_match = valid_rule.match(rule)
+                    if rule_match:
+                        (ending_string,
+                         intact_flag,
+                         remove_total,
+                         append_string,
+                         cont_flag) = rule_match.groups()
+
+                        # Convert the number of chars to remove when stemming
+                        # from a string to an integer
+                        remove_total = int(remove_total)
+
+                        # Proceed if word's ending matches rule's word ending
+                        if word.endswith(ending_string[::-1]):
+                            if intact_flag:
+                                if (word == intact_word and
+                                    self.__isAcceptable(word, remove_total)):
+                                    word = self.__applyRule(word,
+                                                            remove_total,
+                                                            append_string)
+                                    rule_was_applied = True
+                                    if cont_flag == '.':
+                                        proceed = False
+                                    break
+                            elif self.__isAcceptable(word, remove_total):
+                                word = self.__applyRule(word,
+                                                        remove_total,
+                                                        append_string)
+                                rule_was_applied = True
+                                if cont_flag == '.':
+                                    proceed = False
+                                break
+                # If no rules apply, the word doesn't need any more stemming
+                if not rule_was_applied:
+                    proceed = False
+        return word
+
+    def __getLastLetter(self, word):
+        """Get the zero-based index of the last alphabetic character in this string
+        """
+        last_letter = -1
+        for position in range(len(word)):
+            if word[position].isalpha():
+                last_letter = position
+            else:
+                break
+        return last_letter
+
+    def __isAcceptable(self, word, remove_total):
+        """Determine if the word is acceptable for stemming.
+        """
+        word_is_acceptable = False
+        # If the word starts with a vowel, it must be at least 2
+        # characters long to be stemmed
+        if word[0] in "aeiouy":
+            if (len(word) - remove_total >= 2):
+                word_is_acceptable = True
+        # If the word starts with a consonant, it must be at least 3
+        # characters long (including one vowel) to be stemmed
+        elif (len(word) - remove_total >= 3):
+            if word[1] in "aeiouy":
+                word_is_acceptable = True
+            elif word[2] in "aeiouy":
+                word_is_acceptable = True
+        return word_is_acceptable
+
+
+    def __applyRule(self, word, remove_total, append_string):
+        """Apply the stemming rule to the word
+        """
+        # Remove letters from the end of the word
+        new_word_length = len(word) - remove_total
+        word = word[0:new_word_length]
+
+        # And add new letters to the end of the truncated word
+        if append_string:
+            word += append_string
+        return word
+
+    def __stripPrefix(self, word):
+        """Remove prefix from a word.
+
+        This function was originally taken from Whoosh.
+
+        """
+        for prefix in ("kilo", "micro", "milli", "intra", "ultra", "mega",
+                       "nano", "pico", "pseudo"):
+            if word.startswith(prefix):
+                return word[len(prefix):]
+        return word
+
+    def __repr__(self):
+        return '<LancasterStemmer>'
diff --git a/nlp_resource_data/nltk/stem/lancaster.pyc b/nlp_resource_data/nltk/stem/lancaster.pyc
new file mode 100755 (executable)
index 0000000..cff52b8
Binary files /dev/null and b/nlp_resource_data/nltk/stem/lancaster.pyc differ
diff --git a/nlp_resource_data/nltk/stem/porter.py b/nlp_resource_data/nltk/stem/porter.py
new file mode 100755 (executable)
index 0000000..db68050
--- /dev/null
@@ -0,0 +1,710 @@
+"""
+Porter Stemmer
+
+This is the Porter stemming algorithm. It follows the algorithm
+presented in
+
+Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137.
+
+with some optional deviations that can be turned on or off with the
+`mode` argument to the constructor.
+
+Martin Porter, the algorithm's inventor, maintains a web page about the
+algorithm at
+
+    http://www.tartarus.org/~martin/PorterStemmer/
+
+which includes another Python implementation and other implementations
+in many languages.
+"""
+
+from __future__ import print_function, unicode_literals
+
+__docformat__ = 'plaintext'
+
+import re
+
+from nltk.stem.api import StemmerI
+from nltk.compat import python_2_unicode_compatible
+
+@python_2_unicode_compatible
+class PorterStemmer(StemmerI):
+    """
+    A word stemmer based on the Porter stemming algorithm.
+
+        Porter, M. "An algorithm for suffix stripping."
+        Program 14.3 (1980): 130-137.
+        
+    See http://www.tartarus.org/~martin/PorterStemmer/ for the homepage
+    of the algorithm.
+        
+    Martin Porter has endorsed several modifications to the Porter
+    algorithm since writing his original paper, and those extensions are
+    included in the implementations on his website. Additionally, others,
+    including NLTK contributors, have proposed further improvements to the
+    algorithm. There are thus three modes that can be selected by
+    passing the appropriate constant to the class constructor's `mode`
+    attribute:
+
+        PorterStemmer.ORIGINAL_ALGORITHM
+        - Implementation that is faithful to the original paper.
+        
+          Note that Martin Porter has deprecated this version of the
+          algorithm. Martin distributes implementations of the Porter
+          Stemmer in many languages, hosted at:
+          
+            http://www.tartarus.org/~martin/PorterStemmer/
+            
+          and all of these implementations include his extensions. He
+          strongly recommends against using the original, published
+          version of the algorithm; only use this mode if you clearly
+          understand why you are choosing to do so.
+        
+        PorterStemmer.MARTIN_EXTENSIONS
+        - Implementation that only uses the modifications to the
+          algorithm that are included in the implementations on Martin
+          Porter's website. He has declared the Porter algorithm frozen, so the
+          behaviour of those implementations should never change.
+          
+        PorterStemmer.NLTK_EXTENSIONS (default)
+        - Implementation that includes further improvements devised by
+          NLTK contributors or taken from other modified implementations
+          found on the web.
+          
+    For the best stemming, you should use the default NLTK_EXTENSIONS
+    version. However, if you need to get the same results as either the
+    original algorithm or one of Martin Porter's hosted versions for
+    compatibility with an existing implementation or dataset, you can use
+    one of the other modes instead.
+    """
+    
+    # Modes the Stemmer can be instantiated in
+    NLTK_EXTENSIONS = 'NLTK_EXTENSIONS'
+    MARTIN_EXTENSIONS = 'MARTIN_EXTENSIONS'
+    ORIGINAL_ALGORITHM = 'ORIGINAL_ALGORITHM'
+
+    def __init__(self, mode=NLTK_EXTENSIONS):
+        if mode not in (
+            self.NLTK_EXTENSIONS,
+            self.MARTIN_EXTENSIONS,
+            self.ORIGINAL_ALGORITHM
+        ):
+            raise ValueError(
+                "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
+                "PorterStemmer.MARTIN_EXTENSIONS, or "
+                "PorterStemmer.ORIGINAL_ALGORITHM"
+            )
+        
+        self.mode = mode
+        
+        if self.mode == self.NLTK_EXTENSIONS:
+            # This is a table of irregular forms. It is quite short,
+            # but still reflects the errors actually drawn to Martin
+            # Porter's attention over a 20 year period!
+            irregular_forms = {
+                "sky" :     ["sky", "skies"],
+                "die" :     ["dying"],
+                "lie" :     ["lying"],
+                "tie" :     ["tying"],
+                "news" :    ["news"],
+                "inning" :  ["innings", "inning"],
+                "outing" :  ["outings", "outing"],
+                "canning" : ["cannings", "canning"],
+                "howe" :    ["howe"],
+                "proceed" : ["proceed"],
+                "exceed"  : ["exceed"],
+                "succeed" : ["succeed"],
+            }
+
+            self.pool = {}
+            for key in irregular_forms:
+                for val in irregular_forms[key]:
+                    self.pool[val] = key
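+            # The mapping is inverted so lookups go from surface form to stem,
+            # e.g. self.pool['skies'] == 'sky' and self.pool['dying'] == 'die'.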
+
+        self.vowels = frozenset(['a', 'e', 'i', 'o', 'u'])
+
+    def _is_consonant(self, word, i):
+        """Returns True if word[i] is a consonant, False otherwise
+        
+        A consonant is defined in the paper as follows:
+        
+            A consonant in a word is a letter other than A, E, I, O or
+            U, and other than Y preceded by a consonant. (The fact that
+            the term `consonant' is defined to some extent in terms of
+            itself does not make it ambiguous.) So in TOY the consonants
+            are T and Y, and in SYZYGY they are S, Z and G. If a letter
+            is not a consonant it is a vowel.
+        """
+        if word[i] in self.vowels:
+            return False
+        if word[i] == 'y':
+            if i == 0:
+                return True
+            else:
+                return (not self._is_consonant(word, i - 1))
+        return True
+        
+    def _measure(self, stem):
+        """Returns the 'measure' of stem, per definition in the paper
+        
+        From the paper:
+        
+            A consonant will be denoted by c, a vowel by v. A list
+            ccc... of length greater than 0 will be denoted by C, and a
+            list vvv... of length greater than 0 will be denoted by V.
+            Any word, or part of a word, therefore has one of the four
+            forms:
+
+                CVCV ... C
+                CVCV ... V
+                VCVC ... C
+                VCVC ... V
+                
+            These may all be represented by the single form
+            
+                [C]VCVC ... [V]
+                
+            where the square brackets denote arbitrary presence of their
+            contents. Using (VC){m} to denote VC repeated m times, this
+            may again be written as
+
+                [C](VC){m}[V].
+
+            m will be called the *measure* of any word or word part when
+            represented in this form. The case m = 0 covers the null
+            word. Here are some examples:
+
+                m=0    TR,  EE,  TREE,  Y,  BY.
+                m=1    TROUBLE,  OATS,  TREES,  IVY.
+                m=2    TROUBLES,  PRIVATE,  OATEN,  ORRERY.
+        """
+        cv_sequence = ''
+        
+        # Construct a string of 'c's and 'v's representing whether each
+        # character in `stem` is a consonant or a vowel.
+        # e.g. 'falafel' becomes 'cvcvcvc',
+        #      'architecture' becomes 'vcccvcvccvcv'
+        for i in range(len(stem)):
+            if self._is_consonant(stem, i):
+                cv_sequence += 'c'
+            else:
+                cv_sequence += 'v'
+                
+        # Count the number of 'vc' occurrences, which is equivalent to
+        # the number of 'VC' occurrences in Porter's reduced form in the
+        # docstring above, which is in turn equivalent to `m`
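+        # e.g. 'falafel' -> 'cvcvcvc', in which 'vc' occurs 3 times, so m = 3.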
+        return cv_sequence.count('vc')
+        
+    def _has_positive_measure(self, stem):
+        return self._measure(stem) > 0
+
+    def _contains_vowel(self, stem):
+        """Returns True if stem contains a vowel, else False"""
+        for i in range(len(stem)):
+            if not self._is_consonant(stem, i):
+                return True
+        return False
+        
+    def _ends_double_consonant(self, word):
+        """Implements condition *d from the paper
+        
+        Returns True if word ends with a double consonant
+        """
+        return (
+            len(word) >= 2 and
+            word[-1] == word[-2] and
+            self._is_consonant(word, len(word)-1)
+        )
+
+    def _ends_cvc(self, word):
+        """Implements condition *o from the paper
+        
+        From the paper:
+        
+            *o  - the stem ends cvc, where the second c is not W, X or Y
+                  (e.g. -WIL, -HOP).
+        """
+        return (
+            len(word) >= 3 and
+            self._is_consonant(word, len(word) - 3) and
+            not self._is_consonant(word, len(word) - 2) and
+            self._is_consonant(word, len(word) - 1) and
+            word[-1] not in ('w', 'x', 'y')
+        ) or (
+            self.mode == self.NLTK_EXTENSIONS and
+            len(word) == 2 and
+            not self._is_consonant(word, 0) and
+            self._is_consonant(word, 1)
+        )
+        
+    def _replace_suffix(self, word, suffix, replacement):
+        """Replaces `suffix` of `word` with `replacement"""
+        assert word.endswith(suffix), "Given word doesn't end with given suffix"
+        if suffix == '':
+            return word + replacement
+        else:
+            return word[:-len(suffix)] + replacement
+                
+    def _apply_rule_list(self, word, rules):
+        """Applies the first applicable suffix-removal rule to the word
+        
+        Takes a word and a list of suffix-removal rules represented as
+        3-tuples, with the first element being the suffix to remove,
+        the second element being the string to replace it with, and the
+        final element being the condition for the rule to be applicable,
+        or None if the rule is unconditional.
+        """
+        for rule in rules:
+            suffix, replacement, condition = rule
+            if suffix == '*d' and self._ends_double_consonant(word):
+                stem = word[:-2]
+                if condition is None or condition(stem):
+                    return stem + replacement
+                else:
+                    # Don't try any further rules
+                    return word
+            if word.endswith(suffix):
+                stem = self._replace_suffix(word, suffix, '')
+                if condition is None or condition(stem):
+                    return stem + replacement
+                else:
+                    # Don't try any further rules
+                    return word
+                
+        return word
+        
+    def _step1a(self, word):
+        """Implements Step 1a from "An algorithm for suffix stripping"
+        
+        From the paper:
+            
+            SSES -> SS                         caresses  ->  caress
+            IES  -> I                          ponies    ->  poni
+                                               ties      ->  ti
+            SS   -> SS                         caress    ->  caress
+            S    ->                            cats      ->  cat
+        """
+        # this NLTK-only rule extends the original algorithm, so
+        # that 'flies'->'fli' but 'dies'->'die' etc
+        if self.mode == self.NLTK_EXTENSIONS:
+            if word.endswith('ies') and len(word) == 4:
+                return self._replace_suffix(word, 'ies', 'ie')
+            
+        return self._apply_rule_list(word, [
+            ('sses', 'ss', None), # SSES -> SS
+            ('ies', 'i', None),   # IES  -> I
+            ('ss', 'ss', None),   # SS   -> SS
+            ('s', '', None),      # S    ->
+        ])
+        
+    def _step1b(self, word):
+        """Implements Step 1b from "An algorithm for suffix stripping"
+        
+        From the paper:
+        
+            (m>0) EED -> EE                    feed      ->  feed
+                                               agreed    ->  agree
+            (*v*) ED  ->                       plastered ->  plaster
+                                               bled      ->  bled
+            (*v*) ING ->                       motoring  ->  motor
+                                               sing      ->  sing
+                                               
+        If the second or third of the rules in Step 1b is successful,
+        the following is done:
+
+            AT -> ATE                       conflat(ed)  ->  conflate
+            BL -> BLE                       troubl(ed)   ->  trouble
+            IZ -> IZE                       siz(ed)      ->  size
+            (*d and not (*L or *S or *Z))
+               -> single letter
+                                            hopp(ing)    ->  hop
+                                            tann(ed)     ->  tan
+                                            fall(ing)    ->  fall
+                                            hiss(ing)    ->  hiss
+                                            fizz(ed)     ->  fizz
+            (m=1 and *o) -> E               fail(ing)    ->  fail
+                                            fil(ing)     ->  file
+
+        The rule to map to a single letter causes the removal of one of
+        the double letter pair. The -E is put back on -AT, -BL and -IZ,
+        so that the suffixes -ATE, -BLE and -IZE can be recognised
+        later. This E may be removed in step 4.
+        """
+        # this NLTK-only block extends the original algorithm, so that
+        # 'spied'->'spi' but 'died'->'die' etc
+        if self.mode == self.NLTK_EXTENSIONS:
+            if word.endswith('ied'):
+                if len(word) == 4:
+                    return self._replace_suffix(word, 'ied', 'ie')
+                else:
+                    return self._replace_suffix(word, 'ied', 'i')
+        
+        # (m>0) EED -> EE
+        if word.endswith('eed'):
+            stem = self._replace_suffix(word, 'eed', '')
+            if self._measure(stem) > 0:
+                return stem + 'ee'
+            else:
+                return word
+            
+        rule_2_or_3_succeeded = False
+        
+        for suffix in ['ed', 'ing']:
+            if word.endswith(suffix):
+                intermediate_stem = self._replace_suffix(word, suffix, '')
+                if self._contains_vowel(intermediate_stem):
+                    rule_2_or_3_succeeded = True
+                    break
+                
+        if not rule_2_or_3_succeeded:
+            return word
+
+        return self._apply_rule_list(intermediate_stem, [
+            ('at', 'ate', None), # AT -> ATE
+            ('bl', 'ble', None), # BL -> BLE
+            ('iz', 'ize', None), # IZ -> IZE
+            # (*d and not (*L or *S or *Z))
+            # -> single letter
+            (
+                '*d',
+                intermediate_stem[-1],
+                lambda stem: intermediate_stem[-1] not in ('l', 's', 'z')
+            ),
+            # (m=1 and *o) -> E
+            (
+                '',
+                'e',
+                lambda stem: (self._measure(stem) == 1 and
+                              self._ends_cvc(stem))
+            ),
+        ])
+    
+    def _step1c(self, word):
+        """Implements Step 1c from "An algorithm for suffix stripping"
+        
+        From the paper:
+        
+        Step 1c
+
+            (*v*) Y -> I                    happy        ->  happi
+                                            sky          ->  sky
+        """
+        def nltk_condition(stem):
+            """
+            This has been modified from the original Porter algorithm so
+            that y->i is only done when y is preceded by a consonant,
+            but not if the stem is only a single consonant, i.e.
+
+               (*c and not c) Y -> I
+
+            So 'happy' -> 'happi', but
+               'enjoy' -> 'enjoy'  etc
+
+            This is a much better rule. Formerly 'enjoy'->'enjoi' and
+            'enjoyment'->'enjoy'. Step 1c is perhaps done too soon; but
+            with this modification that no longer really matters.
+
+            Also, the removal of the contains_vowel(z) condition means
+            that 'spy', 'fly', 'try' ... stem to 'spi', 'fli', 'tri' and
+            conflate with 'spied', 'tried', 'flies' ...
+            """
+            return len(stem) > 1 and self._is_consonant(stem, len(stem) - 1)
+        
+        def original_condition(stem):
+            return self._contains_vowel(stem)
+        
+        return self._apply_rule_list(word, [
+            (
+                'y',
+                'i',
+                nltk_condition if self.mode == self.NLTK_EXTENSIONS
+                               else original_condition
+            )
+        ])
+
+    def _step2(self, word):
+        """Implements Step 2 from "An algorithm for suffix stripping"
+        
+        From the paper:
+        
+        Step 2
+
+            (m>0) ATIONAL ->  ATE       relational     ->  relate
+            (m>0) TIONAL  ->  TION      conditional    ->  condition
+                                        rational       ->  rational
+            (m>0) ENCI    ->  ENCE      valenci        ->  valence
+            (m>0) ANCI    ->  ANCE      hesitanci      ->  hesitance
+            (m>0) IZER    ->  IZE       digitizer      ->  digitize
+            (m>0) ABLI    ->  ABLE      conformabli    ->  conformable
+            (m>0) ALLI    ->  AL        radicalli      ->  radical
+            (m>0) ENTLI   ->  ENT       differentli    ->  different
+            (m>0) ELI     ->  E         vileli         ->  vile
+            (m>0) OUSLI   ->  OUS       analogousli    ->  analogous
+            (m>0) IZATION ->  IZE       vietnamization ->  vietnamize
+            (m>0) ATION   ->  ATE       predication    ->  predicate
+            (m>0) ATOR    ->  ATE       operator       ->  operate
+            (m>0) ALISM   ->  AL        feudalism      ->  feudal
+            (m>0) IVENESS ->  IVE       decisiveness   ->  decisive
+            (m>0) FULNESS ->  FUL       hopefulness    ->  hopeful
+            (m>0) OUSNESS ->  OUS       callousness    ->  callous
+            (m>0) ALITI   ->  AL        formaliti      ->  formal
+            (m>0) IVITI   ->  IVE       sensitiviti    ->  sensitive
+            (m>0) BILITI  ->  BLE       sensibiliti    ->  sensible
+        """
+
+        if self.mode == self.NLTK_EXTENSIONS:
+            # Instead of applying the ALLI -> AL rule after '(a)bli' per
+            # the published algorithm, we apply it first and, if it
+            # succeeds, run the result through step2 again.
+            if (
+                word.endswith('alli') and
+                self._has_positive_measure(
+                    self._replace_suffix(word, 'alli', '')
+                )
+            ):
+                return self._step2(
+                    self._replace_suffix(word, 'alli', 'al')
+                )
+        
+        bli_rule = ('bli', 'ble', self._has_positive_measure)
+        abli_rule = ('abli', 'able', self._has_positive_measure)
+        
+        rules = [
+            ('ational', 'ate', self._has_positive_measure),
+            ('tional', 'tion', self._has_positive_measure),
+            ('enci', 'ence', self._has_positive_measure),
+            ('anci', 'ance', self._has_positive_measure),
+            ('izer', 'ize', self._has_positive_measure),
+            
+            abli_rule if self.mode == self.ORIGINAL_ALGORITHM else bli_rule,
+            
+            ('alli', 'al', self._has_positive_measure),
+            ('entli', 'ent', self._has_positive_measure),
+            ('eli', 'e', self._has_positive_measure),
+            ('ousli', 'ous', self._has_positive_measure),
+            ('ization', 'ize', self._has_positive_measure),
+            ('ation', 'ate', self._has_positive_measure),
+            ('ator', 'ate', self._has_positive_measure),
+            ('alism', 'al', self._has_positive_measure),
+            ('iveness', 'ive', self._has_positive_measure),
+            ('fulness', 'ful', self._has_positive_measure),
+            ('ousness', 'ous', self._has_positive_measure),
+            ('aliti', 'al', self._has_positive_measure),
+            ('iviti', 'ive', self._has_positive_measure),
+            ('biliti', 'ble', self._has_positive_measure),
+        ]
+        
+        if self.mode == self.NLTK_EXTENSIONS:
+            rules.append(
+                ('fulli', 'ful', self._has_positive_measure)
+            )
+            
+            # The 'l' of the 'logi' -> 'log' rule is put with the stem,
+            # so that short stems like 'geo' 'theo' etc work like
+            # 'archaeo' 'philo' etc.
+            rules.append((
+                "logi",
+                "log",
+                lambda stem: self._has_positive_measure(word[:-3])
+            ))
+
+        if self.mode == self.MARTIN_EXTENSIONS:
+            rules.append(
+                ("logi", "log", self._has_positive_measure)
+            )
+        
+        return self._apply_rule_list(word, rules)
+
+    def _step3(self, word):
+        """Implements Step 3 from "An algorithm for suffix stripping"
+        
+        From the paper:
+        
+        Step 3
+
+            (m>0) ICATE ->  IC              triplicate     ->  triplic
+            (m>0) ATIVE ->                  formative      ->  form
+            (m>0) ALIZE ->  AL              formalize      ->  formal
+            (m>0) ICITI ->  IC              electriciti    ->  electric
+            (m>0) ICAL  ->  IC              electrical     ->  electric
+            (m>0) FUL   ->                  hopeful        ->  hope
+            (m>0) NESS  ->                  goodness       ->  good
+        """
+        return self._apply_rule_list(word, [
+            ('icate', 'ic', self._has_positive_measure),
+            ('ative', '', self._has_positive_measure),
+            ('alize', 'al', self._has_positive_measure),
+            ('iciti', 'ic', self._has_positive_measure),
+            ('ical', 'ic', self._has_positive_measure),
+            ('ful', '', self._has_positive_measure),
+            ('ness', '', self._has_positive_measure),
+        ])
+
+    def _step4(self, word):
+        """Implements Step 4 from "An algorithm for suffix stripping"
+        
+        Step 4
+
+            (m>1) AL    ->                  revival        ->  reviv
+            (m>1) ANCE  ->                  allowance      ->  allow
+            (m>1) ENCE  ->                  inference      ->  infer
+            (m>1) ER    ->                  airliner       ->  airlin
+            (m>1) IC    ->                  gyroscopic     ->  gyroscop
+            (m>1) ABLE  ->                  adjustable     ->  adjust
+            (m>1) IBLE  ->                  defensible     ->  defens
+            (m>1) ANT   ->                  irritant       ->  irrit
+            (m>1) EMENT ->                  replacement    ->  replac
+            (m>1) MENT  ->                  adjustment     ->  adjust
+            (m>1) ENT   ->                  dependent      ->  depend
+            (m>1 and (*S or *T)) ION ->     adoption       ->  adopt
+            (m>1) OU    ->                  homologou      ->  homolog
+            (m>1) ISM   ->                  communism      ->  commun
+            (m>1) ATE   ->                  activate       ->  activ
+            (m>1) ITI   ->                  angulariti     ->  angular
+            (m>1) OUS   ->                  homologous     ->  homolog
+            (m>1) IVE   ->                  effective      ->  effect
+            (m>1) IZE   ->                  bowdlerize     ->  bowdler
+
+        The suffixes are now removed. All that remains is a little
+        tidying up.
+        """
+        measure_gt_1 = lambda stem: self._measure(stem) > 1
+        
+        return self._apply_rule_list(word, [
+            ('al', '', measure_gt_1),
+            ('ance', '', measure_gt_1),
+            ('ence', '', measure_gt_1),
+            ('er', '', measure_gt_1),
+            ('ic', '', measure_gt_1),
+            ('able', '', measure_gt_1),
+            ('ible', '', measure_gt_1),
+            ('ant', '', measure_gt_1),
+            ('ement', '', measure_gt_1),
+            ('ment', '', measure_gt_1),
+            ('ent', '', measure_gt_1),
+            
+            # (m>1 and (*S or *T)) ION -> 
+            (
+                'ion',
+                '',
+                lambda stem: self._measure(stem) > 1 and stem[-1] in ('s', 't')
+            ),
+            
+            ('ou', '', measure_gt_1),
+            ('ism', '', measure_gt_1),
+            ('ate', '', measure_gt_1),
+            ('iti', '', measure_gt_1),
+            ('ous', '', measure_gt_1),
+            ('ive', '', measure_gt_1),
+            ('ize', '', measure_gt_1),
+        ])
+        
+    def _step5a(self, word):
+        """Implements Step 5a from "An algorithm for suffix stripping"
+        
+        From the paper:
+        
+        Step 5a
+
+            (m>1) E     ->                  probate        ->  probat
+                                            rate           ->  rate
+            (m=1 and not *o) E ->           cease          ->  ceas
+        """
+        # Note that Martin's test vocabulary and reference
+        # implementations are inconsistent in how they handle the case
+        # where two rules both refer to a suffix that matches the word
+        # to be stemmed, but only the condition of the second one is
+        # true.
+        # Earlier in step 1b we had the rules:
+        #     (m>0) EED -> EE
+        #     (*v*) ED  ->
+        # but the examples in the paper included "feed"->"feed", even
+        # though (*v*) is true for "fe" and therefore the second rule
+        # alone would map "feed"->"fe".
+        # However, in THIS case, we need to handle the consecutive rules
+        # differently and try both conditions (obviously; the second
+        # rule here would be redundant otherwise). Martin's paper makes
+        # no explicit mention of the inconsistency; you have to infer it
+        # from the examples.
+        # For this reason, we can't use _apply_rule_list here.
+        if word.endswith('e'):
+            stem = self._replace_suffix(word, 'e', '')
+            if self._measure(stem) > 1:
+                return stem
+            if self._measure(stem) == 1 and not self._ends_cvc(stem):
+                return stem
+        return word
+
+    def _step5b(self, word):
+        """Implements Step 5a from "An algorithm for suffix stripping"
+        
+        From the paper:
+        
+        Step 5b
+
+            (m > 1 and *d and *L) -> single letter
+                                    controll       ->  control
+                                    roll           ->  roll
+        """
+        return self._apply_rule_list(word, [
+            ('ll', 'l', lambda stem: self._measure(word[:-1]) > 1)
+        ])
+
+    def stem(self, word):
+        stem = word.lower()
+        
+        if self.mode == self.NLTK_EXTENSIONS and word in self.pool:
+            return self.pool[word]
+
+        if self.mode != self.ORIGINAL_ALGORITHM and len(word) <= 2:
+            # With this line, strings of length 1 or 2 don't go through
+            # the stemming process, although no mention is made of this
+            # in the published algorithm.
+            return word
+
+        stem = self._step1a(stem)
+        stem = self._step1b(stem)
+        stem = self._step1c(stem)
+        stem = self._step2(stem)
+        stem = self._step3(stem)
+        stem = self._step4(stem)
+        stem = self._step5a(stem)
+        stem = self._step5b(stem)
+        
+        return stem
+
+    def __repr__(self):
+        return '<PorterStemmer>'
+
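+# A minimal usage sketch (not part of the original module; results follow the
+# step implementations above, with the default NLTK_EXTENSIONS mode assumed):
+#
+#     from nltk.stem.porter import PorterStemmer
+#     stemmer = PorterStemmer()
+#     stemmer.stem('caresses')    # -> 'caress'
+#     stemmer.stem('ponies')      # -> 'poni'
+#     stemmer.stem('running')     # -> 'run'
+#     stemmer.stem('relational')  # -> 'relat'
+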
+def demo():
+    """
+    A demonstration of the porter stemmer on a sample from
+    the Penn Treebank corpus.
+    """
+
+    from nltk.corpus import treebank
+    from nltk import stem
+
+    stemmer = stem.PorterStemmer()
+
+    orig = []
+    stemmed = []
+    for item in treebank.fileids()[:3]:
+        for (word, tag) in treebank.tagged_words(item):
+            orig.append(word)
+            stemmed.append(stemmer.stem(word))
+
+    # Convert the results to a string, and word-wrap them.
+    results = ' '.join(stemmed)
+    results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip()
+
+    # Convert the original to a string, and word wrap it.
+    original = ' '.join(orig)
+    original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip()
+
+    # Print the results.
+    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
+    print(original)
+    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
+    print(results)
+    print('*'*70)
diff --git a/nlp_resource_data/nltk/stem/porter.pyc b/nlp_resource_data/nltk/stem/porter.pyc
new file mode 100755 (executable)
index 0000000..ae3db42
Binary files /dev/null and b/nlp_resource_data/nltk/stem/porter.pyc differ
diff --git a/nlp_resource_data/nltk/stem/regexp.py b/nlp_resource_data/nltk/stem/regexp.py
new file mode 100755 (executable)
index 0000000..9053571
--- /dev/null
@@ -0,0 +1,61 @@
+# Natural Language Toolkit: Stemmers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
+#         Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
+import re
+
+from nltk.stem.api import StemmerI
+from nltk.compat import python_2_unicode_compatible
+
+@python_2_unicode_compatible
+class RegexpStemmer(StemmerI):
+    """
+    A stemmer that uses regular expressions to identify morphological
+    affixes.  Any substrings that match the regular expressions will
+    be removed.
+
+        >>> from nltk.stem import RegexpStemmer
+        >>> st = RegexpStemmer('ing$|s$|e$|able$', min=4)
+        >>> st.stem('cars')
+        'car'
+        >>> st.stem('mass')
+        'mas'
+        >>> st.stem('was')
+        'was'
+        >>> st.stem('bee')
+        'bee'
+        >>> st.stem('compute')
+        'comput'
+        >>> st.stem('advisable')
+        'advis'
+
+    :type regexp: str or regexp
+    :param regexp: The regular expression that should be used to
+        identify morphological affixes.
+    :type min: int
+    :param min: The minimum length of string to stem
+    """
+    def __init__(self, regexp, min=0):
+
+        if not hasattr(regexp, 'pattern'):
+            regexp = re.compile(regexp)
+        self._regexp = regexp
+        self._min = min
+
+    def stem(self, word):
+        if len(word) < self._min:
+            return word
+        else:
+            return self._regexp.sub('', word)
+
+    def __repr__(self):
+        return '<RegexpStemmer: {!r}>'.format(self._regexp.pattern)
+
+
+
+
diff --git a/nlp_resource_data/nltk/stem/regexp.pyc b/nlp_resource_data/nltk/stem/regexp.pyc
new file mode 100755 (executable)
index 0000000..ed20601
Binary files /dev/null and b/nlp_resource_data/nltk/stem/regexp.pyc differ
diff --git a/nlp_resource_data/nltk/stem/rslp.py b/nlp_resource_data/nltk/stem/rslp.py
new file mode 100755 (executable)
index 0000000..ebf190d
--- /dev/null
@@ -0,0 +1,142 @@
+# -*- coding: utf-8 -*-
+
+# Natural Language Toolkit: RSLP Stemmer
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Tiago Tresoldi <tresoldi@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+# This code is based on the algorithm presented in the paper "A Stemming
+# Algorithm for the Portuguese Language" by Viviane Moreira Orengo and
+# Christian Huyck, which unfortunately I had no access to. The code is a
+# Python version, with some minor modifications of mine, to the description
+# presented at http://www.webcitation.org/5NnvdIzOb and to the C source code
+# available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html.
+# Please note that this stemmer is intended for demonstration and educational
+# purposes only. Feel free to write me for any comments, including the
+# development of a different and/or better stemmer for Portuguese. I also
+# suggest using NLTK's mailing list for Portuguese for any discussion.
+
+# Este código é baseado no algoritmo apresentado no artigo "A Stemming
+# Algorithm for the Portuguese Language" de Viviane Moreira Orengo e
+# Christian Huyck, o qual infelizmente não tive a oportunidade de ler. O
+# código é uma conversão para Python, com algumas pequenas modificações
+# minhas, daquele apresentado em http://www.webcitation.org/5NnvdIzOb e do
+# código para linguagem C disponível em
+# http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. Por favor,
+# lembre-se de que este stemmer foi desenvolvido com finalidades unicamente
+# de demonstração e didáticas. Sinta-se livre para me escrever para qualquer
+# comentário, inclusive sobre o desenvolvimento de um stemmer diferente
+# e/ou melhor para o português. Também sugiro utilizar-se a lista de discussão
+# do NLTK para o português para qualquer debate.
+from __future__ import print_function, unicode_literals
+from nltk.data import load
+
+from nltk.stem.api import StemmerI
+
+class RSLPStemmer(StemmerI):
+    """
+    A stemmer for Portuguese.
+
+        >>> from nltk.stem import RSLPStemmer
+        >>> st = RSLPStemmer()
+        >>> # opening lines of Erico Verissimo's "Música ao Longe"
+        >>> text = '''
+        ... Clarissa risca com giz no quadro-negro a paisagem que os alunos
+        ... devem copiar . Uma casinha de porta e janela , em cima duma
+        ... coxilha .'''
+        >>> for token in text.split():
+        ...     print(st.stem(token))
+        clariss risc com giz no quadro-negr a pais que os alun dev copi .
+        uma cas de port e janel , em cim dum coxilh .
+    """
+
+    def __init__ (self):
+        self._model = []
+
+        self._model.append( self.read_rule("step0.pt") )
+        self._model.append( self.read_rule("step1.pt") )
+        self._model.append( self.read_rule("step2.pt") )
+        self._model.append( self.read_rule("step3.pt") )
+        self._model.append( self.read_rule("step4.pt") )
+        self._model.append( self.read_rule("step5.pt") )
+        self._model.append( self.read_rule("step6.pt") )
+
+    def read_rule (self, filename):
+        rules = load('nltk:stemmers/rslp/' + filename, format='raw').decode("utf8")
+        lines = rules.split("\n")
+
+        lines = [line for line in lines if line != ""]     # remove blank lines
+        lines = [line for line in lines if line[0] != "#"]  # remove comments
+
+        # NOTE: a simple but ugly hack to make this parser happy with double '\t's
+        lines = [line.replace("\t\t", "\t") for line in lines]
+
+        # parse rules
+        rules = []
+        for line in lines:
+            rule = []
+            tokens = line.split("\t")
+
+            # text to be searched for at the end of the string
+            rule.append( tokens[0][1:-1] ) # remove quotes
+
+            # minimum stem size to perform the replacement
+            rule.append( int(tokens[1]) )
+
+            # text to be replaced into
+            rule.append( tokens[2][1:-1] ) # remove quotes
+
+            # exceptions to this rule
+            rule.append( [token[1:-1] for token in tokens[3].split(",")] )
+
+            # append to the results
+            rules.append(rule)
+
+        return rules
+
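+    # Sketch of the rule format consumed by read_rule above (the concrete
+    # line below is hypothetical and only illustrates the four tab-separated
+    # fields the parser expects; see the stemmers/rslp/step*.pt files in the
+    # NLTK data package for the real rules):
+    #
+    #     "inho"<TAB>3<TAB>""<TAB>"caminho","carinho"
+    #
+    # parses to ['inho', 3, '', ['caminho', 'carinho']]: strip the suffix
+    # 'inho' when the remaining stem keeps at least 3 characters, replace it
+    # with nothing, unless the whole word is one of the listed exceptions.
+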
+    def stem(self, word):
+        word = word.lower()
+
+        # the word ends in 's'? apply rule for plural reduction
+        if word[-1] == "s":
+            word = self.apply_rule(word, 0)
+
+        # the word ends in 'a'? apply rule for feminine reduction
+        if word[-1] == "a":
+            word = self.apply_rule(word, 1)
+
+        # augmentative reduction
+        word = self.apply_rule(word, 3)
+
+        # adverb reduction
+        word = self.apply_rule(word, 2)
+
+        # noun reduction
+        prev_word = word
+        word = self.apply_rule(word, 4)
+        if word == prev_word:
+            # verb reduction
+            prev_word = word
+            word = self.apply_rule(word, 5)
+            if word == prev_word:
+                # vowel removal
+                word = self.apply_rule(word, 6)
+
+        return word
+
+    def apply_rule(self, word, rule_index):
+        rules = self._model[rule_index]
+        for rule in rules:
+            suffix_length = len(rule[0])
+            if word[-suffix_length:] == rule[0]:       # if suffix matches
+                if len(word) >= suffix_length + rule[1]: # if we have minimum size
+                    if word not in rule[3]:                # if not an exception
+                        word = word[:-suffix_length] + rule[2]
+                        break
+
+        return word
+
+
+
diff --git a/nlp_resource_data/nltk/stem/rslp.pyc b/nlp_resource_data/nltk/stem/rslp.pyc
new file mode 100755 (executable)
index 0000000..7807e0a
Binary files /dev/null and b/nlp_resource_data/nltk/stem/rslp.pyc differ
diff --git a/nlp_resource_data/nltk/stem/snowball.py b/nlp_resource_data/nltk/stem/snowball.py
new file mode 100755 (executable)
index 0000000..00b511c
--- /dev/null
@@ -0,0 +1,4236 @@
+# -*- coding: utf-8 -*-
+#
+# Natural Language Toolkit: Snowball Stemmer
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Peter Michael Stahl <pemistahl@gmail.com>
+#         Peter Ljunglof <peter.ljunglof@heatherleaf.se> (revisions)
+#         Lakhdar Benzahia <lakhdar.benzahia@gmail.com>  (co-writer)
+#         Assem Chelli <assem.ch@gmail.com>  (reviewer arabicstemmer)
+#         Abdelkrim Aries <ab_aries@esi.dz> (reviewer arabicstemmer)
+# Algorithms: Dr Martin Porter <martin@tartarus.org>
+#             Assem Chelli <assem.ch@gmail.com>  arabic stemming algorithm
+#             Benzahia Lakhdar <lakhdar.benzahia@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Snowball stemmers
+
+This module provides a port of the Snowball stemmers
+developed by Martin Porter.
+
+There is also a demo function: `snowball.demo()`.
+
+"""
+from __future__ import unicode_literals, print_function
+
+from six.moves import input
+import re
+
+from nltk import compat
+from nltk.corpus import stopwords
+from nltk.stem import porter
+from nltk.stem.util import suffix_replace, prefix_replace
+
+from nltk.stem.api import StemmerI
+
+
+class SnowballStemmer(StemmerI):
+
+    """
+    Snowball Stemmer
+
+    The following languages are supported:
+    Arabic, Danish, Dutch, English, Finnish, French, German,
+    Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian,
+    Spanish and Swedish.
+
+    The algorithm for English is documented here:
+
+        Porter, M. \"An algorithm for suffix stripping.\"
+        Program 14.3 (1980): 130-137.
+
+    The algorithms have been developed by Martin Porter.
+    These stemmers are called Snowball, because Porter created
+    a programming language with this name for creating
+    new stemming algorithms. There is more information available
+    at http://snowball.tartarus.org/
+
+    The stemmer is invoked as shown below:
+
+    >>> from nltk.stem import SnowballStemmer
+    >>> print(" ".join(SnowballStemmer.languages)) # See which languages are supported
+    arabic danish dutch english finnish french german hungarian
+    italian norwegian porter portuguese romanian russian
+    spanish swedish
+    >>> stemmer = SnowballStemmer("german") # Choose a language
+    >>> stemmer.stem("Autobahnen") # Stem a word
+    'autobahn'
+
+    Invoking the stemmers that way is useful if you do not know the
+    language to be stemmed at runtime. Alternatively, if you already know
+    the language, then you can invoke the language specific stemmer directly:
+
+    >>> from nltk.stem.snowball import GermanStemmer
+    >>> stemmer = GermanStemmer()
+    >>> stemmer.stem("Autobahnen")
+    'autobahn'
+
+    :param language: The language whose subclass is instantiated.
+    :type language: str or unicode
+    :param ignore_stopwords: If set to True, stopwords are
+                             not stemmed and returned unchanged.
+                             Set to False by default.
+    :type ignore_stopwords: bool
+    :raise ValueError: If there is no stemmer for the specified
+                           language, a ValueError is raised.
+    """
+
+    languages = ("arabic", "danish", "dutch", "english", "finnish", "french", "german",
+                 "hungarian", "italian", "norwegian", "porter", "portuguese",
+                 "romanian", "russian", "spanish", "swedish")
+
+    def __init__(self, language, ignore_stopwords=False):
+        if language not in self.languages:
+            raise ValueError("The language '{0}' is not supported.".format(language))
+        stemmerclass = globals()[language.capitalize() + "Stemmer"]
+        self.stemmer = stemmerclass(ignore_stopwords)
+        self.stem = self.stemmer.stem
+        self.stopwords = self.stemmer.stopwords
+    
+    def stem(self, token):
+        return self.stemmer.stem(token)
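+
+    # Dispatch sketch: __init__ resolves the concrete stemmer class by name,
+    # so SnowballStemmer("porter") delegates to PorterStemmer (defined below)
+    # and SnowballStemmer("danish") delegates to DanishStemmer, etc.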
+
+
+@compat.python_2_unicode_compatible
+class _LanguageSpecificStemmer(StemmerI):
+
+    """
+    This helper subclass offers the possibility
+    to invoke a specific stemmer directly.
+    This is useful if you already know the language to be stemmed at runtime.
+
+    Create an instance of the Snowball stemmer.
+
+    :param ignore_stopwords: If set to True, stopwords are
+                             not stemmed and returned unchanged.
+                             Set to False by default.
+    :type ignore_stopwords: bool
+    """
+
+    def __init__(self, ignore_stopwords=False):
+        # The language is the name of the class, minus the final "Stemmer".
+        language = type(self).__name__.lower()
+        if language.endswith("stemmer"):
+            language = language[:-7]
+
+        self.stopwords = set()
+        if ignore_stopwords:
+            try:
+                for word in stopwords.words(language):
+                    self.stopwords.add(word)
+            except IOError:
+                raise ValueError("{!r} has no list of stopwords. Please set"
+                                 " 'ignore_stopwords' to 'False'.".format(self))
+
+    def __repr__(self):
+        """
+        Print out the string representation of the respective class.
+
+        """
+        return "<{0}>".format(type(self).__name__)
+
+
+class PorterStemmer(_LanguageSpecificStemmer, porter.PorterStemmer):
+    """
+    A word stemmer based on the original Porter stemming algorithm.
+
+        Porter, M. \"An algorithm for suffix stripping.\"
+        Program 14.3 (1980): 130-137.
+
+    A few minor modifications have been made to Porter's basic
+    algorithm.  See the source code of the module
+    nltk.stem.porter for more information.
+
+    """
+    def __init__(self, ignore_stopwords=False):
+        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
+        porter.PorterStemmer.__init__(self)
+
+
+class _ScandinavianStemmer(_LanguageSpecificStemmer):
+
+    """
+    This subclass encapsulates a method for defining the string region R1.
+    It is used by the Danish, Norwegian, and Swedish stemmer.
+
+    """
+
+    def _r1_scandinavian(self, word, vowels):
+        """
+        Return the region R1 that is used by the Scandinavian stemmers.
+
+        R1 is the region after the first non-vowel following a vowel,
+        or is the null region at the end of the word if there is no
+        such non-vowel. But then R1 is adjusted so that the region
+        before it contains at least three letters.
+
+        :param word: The word whose region R1 is determined.
+        :type word: str or unicode
+        :param vowels: The vowels of the respective language that are
+                       used to determine the region R1.
+        :type vowels: unicode
+        :return: the region R1 for the respective word.
+        :rtype: unicode
+        :note: This helper method is invoked by the respective stem method of
+               the subclasses DanishStemmer, NorwegianStemmer, and
+               SwedishStemmer. It is not to be invoked directly!
+
+        """
+        r1 = ""
+        for i in range(1, len(word)):
+            if word[i] not in vowels and word[i-1] in vowels:
+                if len(word[:i+1]) < 3 and len(word[:i+1]) > 0:
+                    r1 = word[3:]
+                elif len(word[:i+1]) >= 3:
+                    r1 = word[i+1:]
+                else:
+                    return word
+                break
+
+        return r1
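+
+    # Worked example (illustrative sketch): with the Danish vowel set,
+    # _r1_scandinavian("fredelig", vowels) finds the first non-vowel after a
+    # vowel at 'd' and returns "elig"; for a short word such as "by" no such
+    # position exists, so the empty region "" is returned.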
+
+
+class _StandardStemmer(_LanguageSpecificStemmer):
+
+    """
+    This subclass encapsulates two methods for defining the standard versions
+    of the string regions R1, R2, and RV.
+
+    """
+
+    def _r1r2_standard(self, word, vowels):
+        """
+        Return the standard interpretations of the string regions R1 and R2.
+
+        R1 is the region after the first non-vowel following a vowel,
+        or is the null region at the end of the word if there is no
+        such non-vowel.
+
+        R2 is the region after the first non-vowel following a vowel
+        in R1, or is the null region at the end of the word if there
+        is no such non-vowel.
+
+        :param word: The word whose regions R1 and R2 are determined.
+        :type word: str or unicode
+        :param vowels: The vowels of the respective language that are
+                       used to determine the regions R1 and R2.
+        :type vowels: unicode
+        :return: (r1,r2), the regions R1 and R2 for the respective word.
+        :rtype: tuple
+        :note: This helper method is invoked by the respective stem method of
+               the subclasses DutchStemmer, FinnishStemmer,
+               FrenchStemmer, GermanStemmer, ItalianStemmer,
+               PortugueseStemmer, RomanianStemmer, and SpanishStemmer.
+               It is not to be invoked directly!
+        :note: A detailed description of how to define R1 and R2
+               can be found at http://snowball.tartarus.org/texts/r1r2.html
+
+        """
+        r1 = ""
+        r2 = ""
+        for i in range(1, len(word)):
+            if word[i] not in vowels and word[i-1] in vowels:
+                r1 = word[i+1:]
+                break
+
+        for i in range(1, len(r1)):
+            if r1[i] not in vowels and r1[i-1] in vowels:
+                r2 = r1[i+1:]
+                break
+
+        return (r1, r2)
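+
+    # Worked example (from the R1/R2 description referenced above): for the
+    # word "beautiful" with vowels "aeiouy", R1 is "iful" (the region after
+    # the first non-vowel that follows a vowel) and R2 is "ul" (the same
+    # construction applied inside R1).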
+
+
+
+    def _rv_standard(self, word, vowels):
+        """
+        Return the standard interpretation of the string region RV.
+
+        If the second letter is a consonant, RV is the region after the
+        next following vowel. If the first two letters are vowels, RV is
+        the region after the next following consonant. Otherwise, RV is
+        the region after the third letter.
+
+        :param word: The word whose region RV is determined.
+        :type word: str or unicode
+        :param vowels: The vowels of the respective language that are
+                       used to determine the region RV.
+        :type vowels: unicode
+        :return: the region RV for the respective word.
+        :rtype: unicode
+        :note: This helper method is invoked by the respective stem method of
+               the subclasses ItalianStemmer, PortugueseStemmer,
+               RomanianStemmer, and SpanishStemmer. It is not to be
+               invoked directly!
+
+        """
+        rv = ""
+        if len(word) >= 2:
+            if word[1] not in vowels:
+                for i in range(2, len(word)):
+                    if word[i] in vowels:
+                        rv = word[i+1:]
+                        break
+
+            elif word[0] in vowels and word[1] in vowels:
+                for i in range(2, len(word)):
+                    if word[i] not in vowels:
+                        rv = word[i+1:]
+                        break
+            else:
+                rv = word[3:]
+
+        return rv
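+
+    # Worked examples (illustrative sketch, following the RV definition used
+    # by the Romance-language Snowball stemmers, with vowels "aeiou"):
+    #   _rv_standard("macho", ...)  -> "ho"  (consonant-vowel start: region after the third letter)
+    #   _rv_standard("oliva", ...)  -> "va"  (second letter is a consonant: region after the next vowel)
+    #   _rv_standard("aureo", ...)  -> "eo"  (first two letters are vowels: region after the next consonant)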
+
+class ArabicStemmer(_LanguageSpecificStemmer):
+    """
+        https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm)
+        The Snowball Arabic light Stemmer
+        Algorithm : Assem Chelli
+                   Abdelkrim Aries
+                   Lakhdar Benzahia
+        Nltk Version Author : Lakhdar Benzahia
+    """
+    # Normalize_pre steps
+    __vocalization = re.compile(r'[\u064b-\u0652]') # ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ
+
+    __kasheeda = re.compile(r'[\u0640]') # ـ tatweel/kasheeda
+
+    __arabic_punctuation_marks = re.compile(r'[\u060C\u061B\u061F]') #  ؛ ، ؟
+
+    # Normalize_post
+    __last_hamzat = ('\u0623', '\u0625', '\u0622', '\u0624', '\u0626') # أ، إ، آ، ؤ، ئ
+
+    # normalize other hamza's
+    __initial_hamzat = re.compile(r'^[\u0622\u0623\u0625]') #  أ، إ، آ
+
+    __waw_hamza = re.compile(r'[\u0624]') # ؤ
+
+    __yeh_hamza = re.compile(r'[\u0626]') # ئ
+
+    __alefat = re.compile(r'[\u0623\u0622\u0625]') #  أ، إ، آ
+
+    # Checks
+    __checks1 = ('\u0643\u0627\u0644', '\u0628\u0627\u0644',  # بال، كال
+                 '\u0627\u0644', '\u0644\u0644' # لل، ال
+                 )
+
+    __checks2 = ('\u0629', # ة
+                 '\u0627\u062a'  #  female plural ات
+                 )
+
+    # Suffixes
+    __suffix_noun_step1a = ('\u064a', '\u0643', '\u0647', # ي، ك، ه
+                            '\u0646\u0627', '\u0643\u0645', '\u0647\u0627', '\u0647\u0646', '\u0647\u0645', # نا، كم، ها، هن، هم
+                            '\u0643\u0645\u0627', '\u0647\u0645\u0627' # كما، هما
+                            )
+
+    __suffix_noun_step1b = ('\u0646',) # ن
+
+    __suffix_noun_step2a = ('\u0627', '\u064a', '\u0648') # ا، ي، و
+
+    __suffix_noun_step2b = ('\u0627\u062a',) # ات
+
+    __suffix_noun_step2c1 = ('\u062a',) # ت
+
+    __suffix_noun_step2c2 = ('\u0629',) # ة
+
+    __suffix_noun_step3 = ('\u064a',) # ي
+
+    __suffix_verb_step1 = ('\u0647', '\u0643', # ه، ك
+                           '\u0646\u064a', '\u0646\u0627', '\u0647\u0627', '\u0647\u0645', # ني، نا، ها، هم
+                           '\u0647\u0646', '\u0643\u0645', '\u0643\u0646', # هن، كم، كن
+                           '\u0647\u0645\u0627', '\u0643\u0645\u0627', '\u0643\u0645\u0648' # هما، كما، كمو
+                          )
+
+    __suffix_verb_step2a = ( '\u062a', '\u0627', '\u0646' , '\u064a', # ت، ا، ن، ي
+                             '\u0646\u0627', '\u062a\u0627', '\u062a\u0646', # نا، تا، تن Past
+                             '\u0627\u0646', '\u0648\u0646', '\u064a\u0646', # ان، ون، ين Present
+                             '\u062a\u0645\u0627' # تما
+                           )
+
+    __suffix_verb_step2b = ('\u0648\u0627','\u062a\u0645') # وا، تم
+
+    __suffix_verb_step2c = ('\u0648', # و
+                            '\u062a\u0645\u0648' # تمو
+                           )
+
+    __suffix_all_alef_maqsura = ('\u0649',) # ى
+
+    # Prefixes
+    __prefix_step1 = ('\u0623', # أ
+                      '\u0623\u0623', '\u0623\u0622', '\u0623\u0624', '\u0623\u0627', '\u0623\u0625', # أأ، أآ، أؤ، أا، أإ
+                      )
+
+    __prefix_step2a = ('\u0641\u0627\u0644', '\u0648\u0627\u0644') # فال، وال
+
+    __prefix_step2b = ('\u0641', '\u0648') # ف، و
+
+    __prefix_step3a_noun = ('\u0627\u0644', '\u0644\u0644', # لل، ال
+                            '\u0643\u0627\u0644', '\u0628\u0627\u0644', # بال، كال
+                            )
+
+    __prefix_step3b_noun = ('\u0628', '\u0643', '\u0644', # ب، ك، ل
+                            '\u0628\u0628', '\u0643\u0643' # بب، كك
+                           )
+
+    __prefix_step3_verb = ('\u0633\u064a', '\u0633\u062a', '\u0633\u0646', '\u0633\u0623') # سي، ست، سن، سأ
+
+    __prefix_step4_verb = ('\u064a\u0633\u062a', '\u0646\u0633\u062a', '\u062a\u0633\u062a') # يست، نست، تست
+
+    # Suffixes added due to Conjugation Verbs
+    __conjugation_suffix_verb_1 = ('\u0647', '\u0643') # ه، ك
+
+    __conjugation_suffix_verb_2 = ('\u0646\u064a', '\u0646\u0627','\u0647\u0627', # ني، نا، ها
+                                   '\u0647\u0645', '\u0647\u0646', '\u0643\u0645', # هم، هن، كم
+                                   '\u0643\u0646' # كن
+                                   )
+    __conjugation_suffix_verb_3 = ('\u0647\u0645\u0627', '\u0643\u0645\u0627', '\u0643\u0645\u0648') # هما، كما، كمو
+
+    __conjugation_suffix_verb_4 = ('\u0627', '\u0646', '\u064a') # ا، ن، ي
+
+    __conjugation_suffix_verb_past = ('\u0646\u0627', '\u062a\u0627', '\u062a\u0646') # نا، تا، تن
+
+    __conjugation_suffix_verb_present = ('\u0627\u0646', '\u0648\u0646', '\u064a\u0646') # ان، ون، ين
+
+    # Suffixes added due to derivation Names
+    __conjugation_suffix_noun_1 = ('\u064a', '\u0643', '\u0647') # ي، ك، ه
+
+    __conjugation_suffix_noun_2 = ('\u0646\u0627', '\u0643\u0645', # نا، كم
+                                   '\u0647\u0627', '\u0647\u0646', '\u0647\u0645' # ها، هن، هم
+                                   )
+
+    __conjugation_suffix_noun_3 = ('\u0643\u0645\u0627', '\u0647\u0645\u0627') # كما، هما
+
+    # Prefixes added due to derivation Names
+    __prefixes1 = ('\u0648\u0627', '\u0641\u0627') # فا، وا
+
+    __articles_3len = ('\u0643\u0627\u0644', '\u0628\u0627\u0644')  # بال كال
+
+    __articles_2len = ('\u0627\u0644', '\u0644\u0644')  # ال لل
+
+    # Prepositions letters
+    __prepositions1 = ('\u0643', '\u0644') # ك، ل
+    __prepositions2 = ('\u0628\u0628', '\u0643\u0643') # بب، كك
+
+    is_verb = True
+    is_noun = True
+    is_defined = False
+
+    suffixes_verb_step1_success = False
+    suffix_verb_step2a_success = False
+    suffix_verb_step2b_success = False
+    suffix_noun_step2c2_success = False
+    suffix_noun_step1a_success = False
+    suffix_noun_step2a_success = False
+    suffix_noun_step2b_success = False
+    suffixe_noun_step1b_success = False
+    prefix_step2a_success = False
+    prefix_step3a_noun_success = False
+    prefix_step3b_noun_success = False
+
+    def __normalize_pre(self, token):
+        """
+        :param token: string
+        :return: normalized token type string
+        """
+        # strip diacritics
+        token = self.__vocalization.sub('', token)
+        #strip kasheeda
+        token = self.__kasheeda.sub('', token)
+        # strip punctuation marks
+        token = self.__arabic_punctuation_marks.sub('', token)
+        return token
+
+    def __normalize_post(self, token):
+        # normalize last hamza
+        for hamza in self.__last_hamzat:
+            if token.endswith(hamza):
+                token = suffix_replace(token, hamza, '\u0621')
+                break
+        # normalize other hamzat
+        token = self.__initial_hamzat.sub('\u0627', token)
+        token = self.__waw_hamza.sub('\u0648', token)
+        token = self.__yeh_hamza.sub('\u064a', token)
+        token = self.__alefat.sub('\u0627', token)
+        return  token
+
+    def __checks_1(self, token):
+        for prefix in self.__checks1 :
+            if token.startswith(prefix):
+                if prefix in self.__articles_3len and len(token) > 4 :
+                    self.is_noun = True
+                    self.is_verb = False
+                    self.is_defined = True
+                    break
+
+                if prefix in self.__articles_2len and len(token) > 3 :
+                    self.is_noun = True
+                    self.is_verb = False
+                    self.is_defined = True
+                    break
+
+    def __checks_2(self, token):
+        for suffix in self.__checks2:
+            if token.endswith(suffix):
+                if suffix == '\u0629' and len(token) > 2:
+                    self.is_noun = True
+                    self.is_verb = False
+                    break
+
+                if suffix == '\u0627\u062a' and len(token) > 3:
+                    self.is_noun = True
+                    self.is_verb = False
+                    break
+
+    def __Suffix_Verb_Step1(self, token):
+        for suffix in self.__suffix_verb_step1:
+            if token.endswith(suffix):
+                if suffix in self.__conjugation_suffix_verb_1 and len(token) >= 4:
+                    token = token[:-1]
+                    self.suffixes_verb_step1_success = True
+                    break
+
+                if suffix in self.__conjugation_suffix_verb_2 and len(token) >= 5:
+                    token = token[:-2]
+                    self.suffixes_verb_step1_success = True
+                    break
+
+                if suffix in self.__conjugation_suffix_verb_3 and len(token) >= 6:
+                    token = token[:-3]
+                    self.suffixes_verb_step1_success = True
+                    break
+        return token
+
+    def __Suffix_Verb_Step2a(self, token):
+        for suffix in self.__suffix_verb_step2a:
+            if token.endswith(suffix):
+                if suffix == '\u062a' and len(token) >= 4:
+                    token = token[:-1]
+                    self.suffix_verb_step2a_success = True
+                    break
+
+                if suffix in self.__conjugation_suffix_verb_4 and len(token) >= 4:
+                    token = token[:-1]
+                    self.suffix_verb_step2a_success = True
+                    break
+
+                if suffix in self.__conjugation_suffix_verb_past and len(token) >= 5:
+                    token = token[:-2]  # past
+                    self.suffix_verb_step2a_success = True
+                    break
+
+                if suffix in self.__conjugation_suffix_verb_present and len(token) > 5:
+                    token = token[:-2]  # present
+                    self.suffix_verb_step2a_success = True
+                    break
+
+                if suffix == '\u062a\u0645\u0627' and len(token) >= 6:
+                    token = token[:-3]
+                    self.suffix_verb_step2a_success = True
+                    break
+        return  token
+
+    def __Suffix_Verb_Step2c(self, token):
+        for suffix in self.__suffix_verb_step2c:
+            if token.endswith(suffix):
+                if suffix == '\u062a\u0645\u0648' and len(token) >= 6:
+                    token = token[:-3]
+                    break
+
+                if suffix == '\u0648' and len(token) >= 4:
+                    token = token[:-1]
+                    break
+        return token
+
+    def __Suffix_Verb_Step2b(self, token):
+        for suffix in self.__suffix_verb_step2b:
+            if token.endswith(suffix) and len(token) >= 5:
+                token = token[:-2]
+                self.suffix_verb_step2b_success = True
+                break
+        return  token
+
+    def __Suffix_Noun_Step2c2(self, token):
+        for suffix in self.__suffix_noun_step2c2:
+            if token.endswith(suffix) and len(token) >= 3:
+                token = token[:-1]
+                self.suffix_noun_step2c2_success = True
+                break
+        return token
+
+    def __Suffix_Noun_Step1a(self, token):
+        for suffix in self.__suffix_noun_step1a:
+            if token.endswith(suffix):
+                if suffix in self.__conjugation_suffix_noun_1 and len(token) >= 4:
+                    token = token[:-1]
+                    self.suffix_noun_step1a_success = True
+                    break
+
+                if suffix in self.__conjugation_suffix_noun_2 and len(token) >= 5:
+                    token = token[:-2]
+                    self.suffix_noun_step1a_success = True
+                    break
+
+                if suffix in self.__conjugation_suffix_noun_3 and len(token) >= 6:
+                    token = token[:-3]
+                    self.suffix_noun_step1a_success = True
+                    break
+        return token
+
+    def __Suffix_Noun_Step2a(self, token):
+        for suffix in self.__suffix_noun_step2a:
+            if token.endswith(suffix) and len(token) > 4:
+                token = token[:-1]
+                self.suffix_noun_step2a_success = True
+                break
+        return token
+
+    def __Suffix_Noun_Step2b(self, token):
+        for suffix in self.__suffix_noun_step2b:
+            if token.endswith(suffix) and len(token) >= 5:
+                token = token[:-2]
+                self.suffix_noun_step2b_success = True
+                break
+        return  token
+
+    def __Suffix_Noun_Step2c1(self, token):
+        for suffix in self.__suffix_noun_step2c1:
+            if token.endswith(suffix) and len(token) >= 4:
+                token = token[:-1]
+                break
+        return token
+
+    def __Suffix_Noun_Step1b(self, token):
+        for suffix in self.__suffix_noun_step1b:
+            if token.endswith(suffix) and len(token) > 5:
+                token = token[:-1]
+                self.suffixe_noun_step1b_success = True
+                break
+        return token
+
+    def __Suffix_Noun_Step3(self, token):
+        for suffix in self.__suffix_noun_step3:
+            if token.endswith(suffix) and len(token) >= 3:
+                token = token[:-1]  # ya' nisbiya
+                break
+        return token
+
+    def __Suffix_All_alef_maqsura(self, token):
+        for suffix in self.__suffix_all_alef_maqsura:
+            if token.endswith(suffix):
+                token = suffix_replace(token, suffix, '\u064a')
+        return  token
+
+    def __Prefix_Step1(self, token):
+        for prefix in self.__prefix_step1:
+            if token.startswith(prefix) and len(token) > 3:
+                if prefix == '\u0623\u0623':
+                    token = prefix_replace(token, prefix, '\u0623')
+                    break
+
+                elif prefix == '\u0623\u0622':
+                    token = prefix_replace(token, prefix, '\u0622')
+                    break
+
+                elif prefix == '\u0623\u0624':
+                    token = prefix_replace(token, prefix, '\u0624')
+                    break
+
+                elif prefix == '\u0623\u0627' :
+                    token = prefix_replace(token, prefix, '\u0627')
+                    break
+
+                elif prefix == '\u0623\u0625' :
+                    token = prefix_replace(token, prefix, '\u0625')
+                    break
+        return token
+
+    def __Prefix_Step2a(self, token):
+        for prefix in self.__prefix_step2a:
+            if token.startswith(prefix) and len(token) > 5:
+                token = token[len(prefix):]
+                self.prefix_step2a_success = True
+                break
+        return  token
+
+    def __Prefix_Step2b(self, token):
+        for prefix in self.__prefix_step2b:
+            if token.startswith(prefix) and len(token) > 3 :
+                if token[:2] not in self.__prefixes1:
+                    token = token[len(prefix):]
+                    break
+        return token
+
+    def __Prefix_Step3a_Noun(self, token):
+        for prefix in self.__prefix_step3a_noun:
+            if token.startswith(prefix):
+                if prefix in self.__articles_2len and len(token) > 4:
+                    token =  token[len(prefix):]
+                    self.prefix_step3a_noun_success = True
+                    break
+                if prefix in self.__articles_3len  and len(token) > 5:
+                    token = token[len(prefix):]
+                    break
+        return token
+
+    def __Prefix_Step3b_Noun(self, token):
+        for prefix in self.__prefix_step3b_noun:
+            if token.startswith(prefix):
+                if len(token) > 3:
+                    if prefix == '\u0628':
+                        token = token[len(prefix):]
+                        self.prefix_step3b_noun_success = True
+                        break
+
+                    if prefix in self.__prepositions2:
+                        token = prefix_replace(token, prefix, prefix[1])
+                        self.prefix_step3b_noun_success = True
+                        break
+
+                if prefix in self.__prepositions1 and len(token) > 4:
+                    token = token[len(prefix):]  # BUG: causes confusion
+                    self.prefix_step3b_noun_success = True
+                    break
+        return token
+
+    def __Prefix_Step3_Verb(self, token):
+        for prefix in self.__prefix_step3_verb:
+            if token.startswith(prefix) and len(token) > 4:
+                token = prefix_replace(token, prefix, prefix[1])
+                break
+        return token
+
+    def __Prefix_Step4_Verb(self, token):
+        for prefix in self.__prefix_step4_verb:
+            if token.startswith(prefix) and len(token) > 4:
+                token = prefix_replace(token, prefix, '\u0627\u0633\u062a')
+                self.is_verb = True
+                self.is_noun = False
+                break
+        return token
+
+    def stem(self, word):
+        """
+         Stem an Arabic word and return the stemmed form.
+        :param word: string
+        :return: string
+        """
+        # set initial values
+        self.is_verb = True
+        self.is_noun = True
+        self.is_defined = False
+
+        self.suffixes_verb_step1_success = False
+        self.suffix_verb_step2a_success = False
+        self.suffix_verb_step2b_success = False
+        self.suffix_noun_step2c2_success = False
+        self.suffix_noun_step1a_success = False
+        self.suffix_noun_step2a_success = False
+        self.suffix_noun_step2b_success = False
+        self.suffixe_noun_step1b_success = False
+        self.prefix_step2a_success = False
+        self.prefix_step3a_noun_success = False
+        self.prefix_step3b_noun_success = False
+
+        modified_word = word
+        # guess type and properties
+        # checks1
+        self.__checks_1(modified_word)
+        # checks2
+        self.__checks_2(modified_word)
+        modified_word = self.__normalize_pre(modified_word)
+        if self.is_verb:
+            modified_word = self.__Suffix_Verb_Step1(modified_word)
+            if  self.suffixes_verb_step1_success:
+                modified_word = self.__Suffix_Verb_Step2a(modified_word)
+                if not self.suffix_verb_step2a_success :
+                    modified_word = self.__Suffix_Verb_Step2c(modified_word)
+                #or next
+            else:
+                modified_word = self.__Suffix_Verb_Step2b(modified_word)
+                if not self.suffix_verb_step2b_success:
+                    modified_word = self.__Suffix_Verb_Step2a(modified_word)
+        if self.is_noun:
+            modified_word = self.__Suffix_Noun_Step2c2(modified_word)
+            if not self.suffix_noun_step2c2_success:
+                if not self.is_defined:
+                    modified_word = self.__Suffix_Noun_Step1a(modified_word)
+                    #if self.suffix_noun_step1a_success:
+                    modified_word = self.__Suffix_Noun_Step2a(modified_word)
+                    if not self.suffix_noun_step2a_success:
+                         modified_word = self.__Suffix_Noun_Step2b(modified_word)
+                    if not self.suffix_noun_step2b_success and not self.suffix_noun_step2a_success:
+                        modified_word = self.__Suffix_Noun_Step2c1(modified_word)
+                    # TODO: decide how to handle the "or next" alternative from the reference algorithm
+                else:
+                    modified_word =  self.__Suffix_Noun_Step1b(modified_word)
+                    if self.suffixe_noun_step1b_success:
+                        modified_word = self.__Suffix_Noun_Step2a(modified_word)
+                        if not self.suffix_noun_step2a_success:
+                            modified_word = self.__Suffix_Noun_Step2b(modified_word)
+                        if not self.suffix_noun_step2b_success and not self.suffix_noun_step2a_success:
+                            modified_word = self.__Suffix_Noun_Step2c1(modified_word)
+                    else:
+                        if not self.is_defined:
+                            modified_word = self.__Suffix_Noun_Step2a(modified_word)
+                        modified_word = self.__Suffix_Noun_Step2b(modified_word)
+            modified_word = self.__Suffix_Noun_Step3(modified_word)
+        if not self.is_noun and self.is_verb:
+            modified_word = self.__Suffix_All_alef_maqsura(modified_word)
+
+        # prefixes
+        modified_word = self.__Prefix_Step1(modified_word)
+        modified_word = self.__Prefix_Step2a(modified_word)
+        if not self.prefix_step2a_success:
+            modified_word = self.__Prefix_Step2b(modified_word)
+        modified_word = self.__Prefix_Step3a_Noun(modified_word)
+        if not self.prefix_step3a_noun_success and self.is_noun:
+            modified_word = self.__Prefix_Step3b_Noun(modified_word)
+        else:
+            if not self.prefix_step3b_noun_success and self.is_verb:
+                modified_word = self.__Prefix_Step3_Verb(modified_word)
+                modified_word = self.__Prefix_Step4_Verb(modified_word)
+
+        # post normalization stemming
+        modified_word = self.__normalize_post(modified_word)
+        stemmed_word = modified_word
+        return stemmed_word
+
+class DanishStemmer(_ScandinavianStemmer):
+
+    """
+    The Danish Snowball stemmer.
+
+    :cvar __vowels: The Danish vowels.
+    :type __vowels: unicode
+    :cvar __consonants: The Danish consonants.
+    :type __consonants: unicode
+    :cvar __double_consonants: The Danish double consonants.
+    :type __double_consonants: tuple
+    :cvar __s_ending: Letters that may directly appear before a word final 's'.
+    :type __s_ending: unicode
+    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
+    :type __step1_suffixes: tuple
+    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
+    :type __step2_suffixes: tuple
+    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
+    :type __step3_suffixes: tuple
+    :note: A detailed description of the Danish
+           stemming algorithm can be found under
+           http://snowball.tartarus.org/algorithms/danish/stemmer.html
+
+    """
+
+    # The language's vowels and other important characters are defined.
+    __vowels = "aeiouy\xE6\xE5\xF8"
+    __consonants = "bcdfghjklmnpqrstvwxz"
+    __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj",
+                           "kk", "ll", "mm", "nn", "pp", "qq", "rr",
+                           "ss", "tt", "vv", "ww", "xx", "zz")
+    __s_ending = "abcdfghjklmnoprtvyz\xE5"
+
+    # The different suffixes, divided into the algorithm's steps
+    # and organized by length, are listed in tuples.
+    __step1_suffixes = ("erendes", "erende", "hedens", "ethed",
+                        "erede", "heden", "heder", "endes",
+                        "ernes", "erens", "erets", "ered",
+                        "ende", "erne", "eren", "erer", "heds",
+                        "enes", "eres", "eret", "hed", "ene", "ere",
+                        "ens", "ers", "ets", "en", "er", "es", "et",
+                        "e", "s")
+    __step2_suffixes = ("gd", "dt", "gt", "kt")
+    __step3_suffixes = ("elig", "l\xF8st", "lig", "els", "ig")
+
+    def stem(self, word):
+        """
+        Stem a Danish word and return the stemmed form.
+
+        :param word: The word that is stemmed.
+        :type word: str or unicode
+        :return: The stemmed form.
+        :rtype: unicode
+
+        """
+        # Every word is put into lower case for normalization.
+        word = word.lower()
+
+        if word in self.stopwords:
+            return word
+
+        # After this, the required regions are generated
+        # by the respective helper method.
+        r1 = self._r1_scandinavian(word, self.__vowels)
+
+        # Then the actual stemming process starts.
+        # Every new step is explicitly indicated
+        # according to the descriptions on the Snowball website.
+
+        # STEP 1
+        for suffix in self.__step1_suffixes:
+            if r1.endswith(suffix):
+                if suffix == "s":
+                    if word[-2] in self.__s_ending:
+                        word = word[:-1]
+                        r1 = r1[:-1]
+                else:
+                    word = word[:-len(suffix)]
+                    r1 = r1[:-len(suffix)]
+                break
+
+        # STEP 2
+        for suffix in self.__step2_suffixes:
+            if r1.endswith(suffix):
+                word = word[:-1]
+                r1 = r1[:-1]
+                break
+
+        # STEP 3
+        if r1.endswith("igst"):
+            word = word[:-2]
+            r1 = r1[:-2]
+
+        for suffix in self.__step3_suffixes:
+            if r1.endswith(suffix):
+                if suffix == "l\xF8st":
+                    word = word[:-1]
+                    r1 = r1[:-1]
+                else:
+                    word = word[:-len(suffix)]
+                    r1 = r1[:-len(suffix)]
+
+                    if r1.endswith(self.__step2_suffixes):
+                        word = word[:-1]
+                        r1 = r1[:-1]
+                break
+
+        # STEP 4: Undouble
+        for double_cons in self.__double_consonants:
+            if word.endswith(double_cons) and len(word) > 3:
+                word = word[:-1]
+                break
+
+
+        return word
+
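+# A minimal usage sketch for the Danish stemmer above (illustrative only; it
+# assumes this module is importable as nltk.stem.snowball and that the default
+# constructor takes no arguments):
+#
+#     from nltk.stem.snowball import DanishStemmer
+#     print(DanishStemmer().stem("undervisningen"))  # prints the stemmed form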
+
+class DutchStemmer(_StandardStemmer):
+
+    """
+    The Dutch Snowball stemmer.
+
+    :cvar __vowels: The Dutch vowels.
+    :type __vowels: unicode
+    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
+    :type __step1_suffixes: tuple
+    :cvar __step3b_suffixes: Suffixes to be deleted in step 3b of the algorithm.
+    :type __step3b_suffixes: tuple
+    :note: A detailed description of the Dutch
+           stemming algorithm can be found at
+           http://snowball.tartarus.org/algorithms/dutch/stemmer.html
+
+    """
+
+    __vowels = "aeiouy\xE8"
+    __step1_suffixes = ("heden", "ene", "en", "se", "s")
+    __step3b_suffixes = ("baar", "lijk", "bar", "end", "ing", "ig")
+
+    def stem(self, word):
+        """
+        Stem a Dutch word and return the stemmed form.
+
+        :param word: The word that is stemmed.
+        :type word: str or unicode
+        :return: The stemmed form.
+        :rtype: unicode
+
+        """
+        word = word.lower()
+
+        if word in self.stopwords:
+            return word
+
+        step2_success = False
+
+        # Vowel accents are removed.
+        word = (word.replace("\xE4", "a").replace("\xE1", "a")
+                    .replace("\xEB", "e").replace("\xE9", "e")
+                    .replace("\xED", "i").replace("\xEF", "i")
+                    .replace("\xF6", "o").replace("\xF3", "o")
+                    .replace("\xFC", "u").replace("\xFA", "u"))
+
+        # An initial 'y', a 'y' after a vowel,
+        # and an 'i' between vowels are put into upper case.
+        # From now on these are treated as consonants.
+        if word.startswith("y"):
+            word = "".join(("Y", word[1:]))
+
+        for i in range(1, len(word)):
+            if word[i-1] in self.__vowels and word[i] == "y":
+                word = "".join((word[:i], "Y", word[i+1:]))
+
+        for i in range(1, len(word)-1):
+            if (word[i-1] in self.__vowels and word[i] == "i" and
+               word[i+1] in self.__vowels):
+                word = "".join((word[:i], "I", word[i+1:]))
+
+        r1, r2 = self._r1r2_standard(word, self.__vowels)
+
+        # R1 is adjusted so that the region before it
+        # contains at least 3 letters.
+        for i in range(1, len(word)):
+            if word[i] not in self.__vowels and word[i-1] in self.__vowels:
+                if len(word[:i+1]) < 3 and len(word[:i+1]) > 0:
+                    r1 = word[3:]
+                elif len(word[:i+1]) == 0:
+                    return word
+                break
+
+        # STEP 1
+        for suffix in self.__step1_suffixes:
+            if r1.endswith(suffix):
+                if suffix == "heden":
+                    word = suffix_replace(word, suffix, "heid")
+                    r1 = suffix_replace(r1, suffix, "heid")
+                    if r2.endswith("heden"):
+                        r2 = suffix_replace(r2, suffix, "heid")
+
+                elif (suffix in ("ene", "en") and
+                      not word.endswith("heden") and
+                      word[-len(suffix)-1] not in self.__vowels and
+                      word[-len(suffix)-3:-len(suffix)] != "gem"):
+                    word = word[:-len(suffix)]
+                    r1 = r1[:-len(suffix)]
+                    r2 = r2[:-len(suffix)]
+                    if word.endswith(("kk", "dd", "tt")):
+                        word = word[:-1]
+                        r1 = r1[:-1]
+                        r2 = r2[:-1]
+
+                elif (suffix in ("se", "s") and
+                      word[-len(suffix)-1] not in self.__vowels and
+                      word[-len(suffix)-1] != "j"):
+                    word = word[:-len(suffix)]
+                    r1 = r1[:-len(suffix)]
+                    r2 = r2[:-len(suffix)]
+                break
+
+        # STEP 2
+        if r1.endswith("e") and word[-2] not in self.__vowels:
+            step2_success = True
+            word = word[:-1]
+            r1 = r1[:-1]
+            r2 = r2[:-1]
+
+            if word.endswith(("kk", "dd", "tt")):
+                word = word[:-1]
+                r1 = r1[:-1]
+                r2 = r2[:-1]
+
+        # STEP 3a
+        if r2.endswith("heid") and word[-5] != "c":
+            word = word[:-4]
+            r1 = r1[:-4]
+            r2 = r2[:-4]
+
+            if (r1.endswith("en") and word[-3] not in self.__vowels and
+                word[-5:-2] != "gem"):
+                word = word[:-2]
+                r1 = r1[:-2]
+                r2 = r2[:-2]
+
+                if word.endswith(("kk", "dd", "tt")):
+                    word = word[:-1]
+                    r1 = r1[:-1]
+                    r2 = r2[:-1]
+
+        # STEP 3b: Derivational suffixes
+        for suffix in self.__step3b_suffixes:
+            if r2.endswith(suffix):
+                if suffix in ("end", "ing"):
+                    word = word[:-3]
+                    r2 = r2[:-3]
+
+                    if r2.endswith("ig") and word[-3] != "e":
+                        word = word[:-2]
+                    else:
+                        if word.endswith(("kk", "dd", "tt")):
+                            word = word[:-1]
+
+                elif suffix == "ig" and word[-3] != "e":
+                    word = word[:-2]
+
+                elif suffix == "lijk":
+                    word = word[:-4]
+                    r1 = r1[:-4]
+
+                    if r1.endswith("e") and word[-2] not in self.__vowels:
+                        word = word[:-1]
+                        if word.endswith(("kk", "dd", "tt")):
+                            word = word[:-1]
+
+                elif suffix == "baar":
+                    word = word[:-4]
+
+                elif suffix == "bar" and step2_success:
+                    word = word[:-3]
+                break
+
+        # STEP 4: Undouble vowel
+        if len(word) >= 4:
+            if word[-1] not in self.__vowels and word[-1] != "I":
+                if word[-3:-1] in ("aa", "ee", "oo", "uu"):
+                    if word[-4] not in self.__vowels:
+                        word = "".join((word[:-3], word[-3], word[-1]))
+
+        # All occurrences of 'I' and 'Y' are put back into lower case.
+        word = word.replace("I", "i").replace("Y", "y")
+
+
+        return word
+
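+# A minimal usage sketch for DutchStemmer (illustrative only; same assumptions
+# as for DanishStemmer above):
+#
+#     from nltk.stem.snowball import DutchStemmer
+#     print(DutchStemmer().stem("lichamelijkheden"))  # prints the stemmed form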
+
+
+class EnglishStemmer(_StandardStemmer):
+
+    """
+    The English Snowball stemmer.
+
+    :cvar __vowels: The English vowels.
+    :type __vowels: unicode
+    :cvar __double_consonants: The English double consonants.
+    :type __double_consonants: tuple
+    :cvar __li_ending: Letters that may directly appear before a word final 'li'.
+    :type __li_ending: unicode
+    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
+    :type __step0_suffixes: tuple
+    :cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the algorithm.
+    :type __step1a_suffixes: tuple
+    :cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the algorithm.
+    :type __step1b_suffixes: tuple
+    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
+    :type __step2_suffixes: tuple
+    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
+    :type __step3_suffixes: tuple
+    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
+    :type __step4_suffixes: tuple
+    :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
+    :type __step5_suffixes: tuple
+    :cvar __special_words: A dictionary containing words
+                           which have to be stemmed specially.
+    :type __special_words: dict
+    :note: A detailed description of the English
+           stemming algorithm can be found at
+           http://snowball.tartarus.org/algorithms/english/stemmer.html
+    """
+
+    __vowels = "aeiouy"
+    __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn",
+                           "pp", "rr", "tt")
+    __li_ending = "cdeghkmnrt"
+    __step0_suffixes = ("'s'", "'s", "'")
+    __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s")
+    __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed")
+    __step2_suffixes = ('ization', 'ational', 'fulness', 'ousness',
+                        'iveness', 'tional', 'biliti', 'lessli',
+                        'entli', 'ation', 'alism', 'aliti', 'ousli',
+                        'iviti', 'fulli', 'enci', 'anci', 'abli',
+                        'izer', 'ator', 'alli', 'bli', 'ogi', 'li')
+    __step3_suffixes = ('ational', 'tional', 'alize', 'icate', 'iciti',
+                        'ative', 'ical', 'ness', 'ful')
+    __step4_suffixes = ('ement', 'ance', 'ence', 'able', 'ible', 'ment',
+                        'ant', 'ent', 'ism', 'ate', 'iti', 'ous',
+                        'ive', 'ize', 'ion', 'al', 'er', 'ic')
+    __step5_suffixes = ("e", "l")
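+    # Exceptional forms: these keys are mapped directly to the values on the
+    # right and bypass the stemming steps below (see the notes on exceptional
+    # forms at the URL given in the class docstring).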
+    __special_words = {"skis" : "ski",
+                       "skies" : "sky",
+                       "dying" : "die",
+                       "lying" : "lie",
+                       "tying" : "tie",
+                       "idly" : "idl",
+                       "gently" : "gentl",
+                       "ugly" : "ugli",
+                       "early" : "earli",
+                       "only" : "onli",
+                       "singly" : "singl",
+                       "sky" : "sky",
+                       "news" : "news",
+                       "howe" : "howe",
+                       "atlas" : "atlas",
+                       "cosmos" : "cosmos",
+                       "bias" : "bias",
+                       "andes" : "andes",
+                       "inning" : "inning",
+                       "innings" : "inning",
+                       "outing" : "outing",
+                       "outings" : "outing",
+                       "canning" : "canning",
+                       "cannings" : "canning",
+                       "herring" : "herring",
+                       "herrings" : "herring",
+                       "earring" : "earring",
+                       "earrings" : "earring",
+                       "proceed" : "proceed",
+                       "proceeds" : "proceed",
+                       "proceeded" : "proceed",
+                       "proceeding" : "proceed",
+                       "exceed" : "exceed",
+                       "exceeds" : "exceed",
+                       "exceeded" : "exceed",
+                       "exceeding" : "exceed",
+                       "succeed" : "succeed",
+                       "succeeds" : "succeed",
+                       "succeeded" : "succeed",
+                       "succeeding" : "succeed"}
+
+    def stem(self, word):
+
+        """
+        Stem an English word and return the stemmed form.
+
+        :param word: The word that is stemmed.
+        :type word: str or unicode
+        :return: The stemmed form.
+        :rtype: unicode
+
+        """
+        word = word.lower()
+
+        if word in self.stopwords or len(word) <= 2:
+            return word
+
+        elif word in self.__special_words:
+            return self.__special_words[word]
+
+        # Map the different apostrophe characters to a single consistent one
+        word = (word.replace("\u2019", "\x27")
+                    .replace("\u2018", "\x27")
+                    .replace("\u201B", "\x27"))
+
+        if word.startswith("\x27"):
+            word = word[1:]
+
+        if word.startswith("y"):
+            word = "".join(("Y", word[1:]))
+
+        for i in range(1, len(word)):
+            if word[i-1] in self.__vowels and word[i] == "y":
+                word = "".join((word[:i], "Y", word[i+1:]))
+
+        step1a_vowel_found = False
+        step1b_vowel_found = False
+
+        r1 = ""
+        r2 = ""
+
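+        # Exceptional R1 handling (see the notes on exceptional forms for the
+        # English algorithm): words beginning with 'gener', 'commun' or 'arsen'
+        # take the rest of the word as R1; R2 is then computed inside that R1.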
+        if word.startswith(("gener", "commun", "arsen")):
+            if word.startswith(("gener", "arsen")):
+                r1 = word[5:]
+            else:
+                r1 = word[6:]
+
+            for i in range(1, len(r1)):
+                if r1[i] not in self.__vowels and r1[i-1] in self.__vowels:
+                    r2 = r1[i+1:]
+                    break
+        else:
+            r1, r2 = self._r1r2_standard(word, self.__vowels)
+
+
+        # STEP 0
+        for suffix in self.__step0_suffixes:
+            if word.endswith(suffix):
+                word = word[:-len(suffix)]
+                r1 = r1[:-len(suffix)]
+                r2 = r2[:-len(suffix)]
+                break
+
+        # STEP 1a
+        for suffix in self.__step1a_suffixes:
+            if word.endswith(suffix):
+
+                if suffix == "sses":
+                    word = word[:-2]
+                    r1 = r1[:-2]
+                    r2 = r2[:-2]
+
+                elif suffix in ("ied", "ies"):
+                    if len(word[:-len(suffix)]) > 1:
+                        word = word[:-2]
+                        r1 = r1[:-2]
+                        r2 = r2[:-2]
+                    else:
+                        word = word[:-1]
+                        r1 = r1[:-1]
+                        r2 = r2[:-1]
+
+                elif suffix == "s":
+                    for letter in word[:-2]:
+                        if letter in self.__vowels:
+                            step1a_vowel_found = True
+                            break
+
+                    if step1a_vowel_found:
+                        word = word[:-1]
+                        r1 = r1[:-1]
+                        r2 = r2[:-1]
+                break
+
+        # STEP 1b
+        for suffix in self.__step1b_suffixes:
+            if word.endswith(suffix):
+                if suffix in ("eed", "eedly"):
+
+                    if r1.endswith(suffix):
+                        word = suffix_replace(word, suffix, "ee")
+
+                        if len(r1) >= len(suffix):
+                            r1 = suffix_replace(r1, suffix, "ee")
+                        else:
+                            r1 = ""
+
+                        if len(r2) >= len(suffix):
+                            r2 = suffix_replace(r2, suffix, "ee")
+                        else:
+                            r2 = ""
+                else:
+                    for letter in word[:-len(suffix)]:
+                        if letter in self.__vowels:
+                            step1b_vowel_found = True
+                            break
+
+                    if step1b_vowel_found:
+                        word = word[:-len(suffix)]
+                        r1 = r1[:-len(suffix)]
+                        r2 = r2[:-len(suffix)]
+
+                        if word.endswith(("at", "bl", "iz")):
+                            word = "".join((word, "e"))
+                            r1 = "".join((r1, "e"))
+
+                            if len(word) > 5 or len(r1) >= 3:
+                                r2 = "".join((r2, "e"))
+
+                        elif word.endswith(self.__double_consonants):
+                            word = word[:-1]
+                            r1 = r1[:-1]
+                            r2 = r2[:-1]
+
+                        elif ((r1 == "" and len(word) >= 3 and
+                               word[-1] not in self.__vowels and
+                               word[-1] not in "wxY" and
+                               word[-2] in self.__vowels and
+                               word[-3] not in self.__vowels)
+                              or
+                              (r1 == "" and len(word) == 2 and
+                               word[0] in self.__vowels and
+                               word[1] not in self.__vowels)):
+
+                            word = "".join((word, "e"))
+
+                            if len(r1) > 0:
+                                r1 = "".join((r1, "e"))
+
+                            if len(r2) > 0:
+                                r2 = "".join((r2, "e"))
+                break
+
+        # STEP 1c
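+        # (a final 'y'/'Y' preceded by a non-vowel in a word longer than
+        # two letters is replaced by 'i')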
+        if len(word) > 2 and word[-1] in "yY" and word[-2] not in self.__vowels:
+            word = "".join((word[:-1], "i"))
+            if len(r1) >= 1:
+                r1 = "".join((r1[:-1], "i"))
+            else:
+                r1 = ""
+
+            if len(r2) >= 1:
+                r2 = "".join((r2[:-1], "i"))
+            else:
+                r2 = ""
+
+        # STEP 2
+        for suffix in self.__step2_suffixes:
+            if word.endswith(suffix):
+                if r1.endswith(suffix):
+                    if suffix == "tional":
+                        word = word[:-2]
+                        r1 = r1[:-2]
+                        r2 = r2[:-2]
+
+                    elif suffix in ("enci", "anci", "abli"):
+                        word = "".join((word[:-1], "e"))
+
+                        if len(r1) >= 1:
+                            r1 = "".join((r1[:-1], "e"))
+                        else:
+                            r1 = ""
+
+                        if len(r2) >= 1:
+                            r2 = "".join((r2[:-1], "e"))
+                        else:
+                            r2 = ""
+
+                    elif suffix == "entli":
+                        word = word[:-2]
+                        r1 = r1[:-2]
+                        r2 = r2[:-2]
+
+                    elif suffix in ("izer", "ization"):
+                        word = suffix_replace(word, suffix, "ize")
+
+                        if len(r1) >= len(suffix):
+                            r1 = suffix_replace(r1, suffix, "ize")
+                        else:
+                            r1 = ""
+
+                        if len(r2) >= len(suffix):
+                            r2 = suffix_replace(r2, suffix, "ize")
+                        else:
+                            r2 = ""
+
+                    elif suffix in ("ational", "ation", "ator"):
+                        word = suffix_replace(word, suffix, "ate")
+
+                        if len(r1) >= len(suffix):
+                            r1 = suffix_replace(r1, suffix, "ate")
+                        else:
+                            r1 = ""
+
+                        if len(r2) >= len(suffix):
+                            r2 = suffix_replace(r2, suffix, "ate")
+                        else:
+                            r2 = "e"
+
+                    elif suffix in ("alism", "aliti", "alli"):
+                        word = suffix_replace(word, suffix, "al")
+
+                        if len(r1) >= len(suffix):
+                            r1 = suffix_replace(r1, suffix, "al")
+                        else:
+                            r1 = ""
+
+                        if len(r2) >= len(suffix):
+                            r2 = suffix_replace(r2, suffix, "al")
+                        else:
+                            r2 = ""
+
+                    elif suffix == "fulness":
+                        word = word[:-4]
+                        r1 = r1[:-4]
+                        r2 = r2[:-4]
+
+                    elif suffix in ("ousli", "ousness"):
+                        word = suffix_replace(word, suffix, "ous")
+
+                        if len(r1) >= len(suffix):
+                            r1 = suffix_replace(r1, suffix, "ous")
+                        else:
+                            r1 = ""
+
+                        if len(r2) >= len(suffix):
+                            r2 = suffix_replace(r2, suffix, "ous")
+                        else:
+                            r2 = ""
+
+                    elif suffix in ("iveness", "iviti"):
+                        word = suffix_replace(word, suffix, "ive")
+
+                        if len(r1) >= len(suffix):
+                            r1 = suffix_replace(r1, suffix, "ive")
+                        else:
+                            r1 = ""
+
+                        if len(r2) >= len(suffix):
+                            r2 = suffix_replace(r2, suffix, "ive")
+                        else:
+                            r2 = "e"
+
+                    elif suffix in ("biliti", "bli"):
+                        word = suffix_replace(word, suffix, "ble")
+
+                        if len(r1) >= len(suffix):
+                            r1 = suffix_replace(r1, suffix, "ble")
+                        else:
+                            r1 = ""
+
+                        if len(r2) >= len(suffix):
+                            r2 = suffix_replace(r2, suffix, "ble")
+                        else:
+                            r2 = ""
+
+                    elif suffix == "ogi" and word[-4] == "l":
+                        word = word[:-1]
+                        r1 = r1[:-1]
+                        r2 = r2[:-1]
+
+                    elif suffix in ("fulli", "lessli"):
+                        word = word[:-2]
+                        r1 = r1[:-2]
+                        r2 = r2[:-2]
+
+                    elif suffix == "li" and word[-3] in self.__li_ending:
+                        word = word[:-2]
+                        r1 = r1[:-2]
+                        r2 = r2[:-2]
+                break
+
+        # STEP 3
+        for suffix in self.__step3_suffixes:
+            if word.endswith(suffix):
+                if r1.endswith(suffix):
+                    if suffix == "tional":
+                        word = word[:-2]
+                        r1 = r1[:-2]
+                        r2 = r2[:-2]
+
+                    elif suffix == "ational":
+                        word = suffix_replace(word, suffix, "ate")
+
+                        if len(r1) >= len(suffix):
+                            r1 = suffix_replace(r1, suffix, "ate")
+                        else:
+                            r1 = ""
+
+                        if len(r2) >= len(suffix):
+                            r2 = suffix_replace(r2, suffix, "ate")
+                        else:
+                            r2 = ""
+
+                    elif suffix == "alize":
+                        word = word[:-3]
+                        r1 = r1[:-3]
+                        r2 = r2[:-3]
+
+                    elif suffix in ("icate", "iciti", "ical"):
+                        word = suffix_replace(word, suffix, "ic")
+
+                        if len(r1) >= len(suffix):
+                            r1 = suffix_replace(r1, suffix, "ic")
+                        else:
+                            r1 = ""
+
+                        if len(r2) >= len(suffix):
+                            r2 = suffix_replace(r2, suffix, "ic")
+                        else:
+                            r2 = ""
+
+                    elif suffix in ("ful", "ness"):
+                        word = word[:-len(suffix)]
+                        r1 = r1[:-len(suffix)]
+                        r2 = r2[:-len(suffix)]
+
+                    elif suffix == "ative" and r2.endswith(suffix):
+                        word = word[:-5]
+                        r1 = r1[:-5]
+                        r2 = r2[:-5]
+                break
+
+        # STEP 4
+        for suffix in self.__step4_suffixes:
+            if word.endswith(suffix):
+                if r2.endswith(suffix):
+                    if suffix == "ion":
+                        if word[-4] in "st":
+                            word = word[:-3]
+                            r1 = r1[:-3]
+                            r2 = r2[:-3]
+                    else:
+                        word = word[:-len(suffix)]
+                        r1 = r1[:-len(suffix)]
+                        r2 = r2[:-len(suffix)]
+                break
+
+        # STEP 5
+        if r2.endswith("l") and word[-2] == "l":
+            word = word[:-1]
+        elif r2.endswith("e"):
+            word = word[:-1]
+        elif r1.endswith("e"):
+            if len(word) >= 4 and (word[-2] in self.__vowels or
+                                   word[-2] in "wxY" or
+                                   word[-3] not in self.__vowels or
+                                   word[-4] in self.__vowels):
+                word = word[:-1]
+
+
+        word = word.replace("Y", "y")
+
+
+        return word
+
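+# A minimal usage sketch for EnglishStemmer (illustrative only; same assumptions
+# as above). The expected results were traced by hand through the steps above:
+#
+#     from nltk.stem.snowball import EnglishStemmer
+#     stemmer = EnglishStemmer()
+#     stemmer.stem("running")    # -> 'run'     (step 1b + undoubling)
+#     stemmer.stem("caresses")   # -> 'caress'  (step 1a, 'sses' -> 'ss')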
+
+
+class FinnishStemmer(_StandardStemmer):
+
+    """
+    The Finnish Snowball stemmer.
+
+    :cvar __vowels: The Finnish vowels.
+    :type __vowels: unicode
+    :cvar __restricted_vowels: A subset of the Finnish vowels.
+    :type __restricted_vowels: unicode
+    :cvar __long_vowels: The Finnish vowels in their long forms.
+    :type __long_vowels: tuple
+    :cvar __consonants: The Finnish consonants.
+    :type __consonants: unicode
+    :cvar __double_consonants: The Finnish double consonants.
+    :type __double_consonants: tuple
+    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
+    :type __step1_suffixes: tuple
+    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
+    :type __step2_suffixes: tuple
+    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
+    :type __step3_suffixes: tuple
+    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
+    :type __step4_suffixes: tuple
+    :note: A detailed description of the Finnish
+           stemming algorithm can be found at
+           http://snowball.tartarus.org/algorithms/finnish/stemmer.html
+    """
+
+    __vowels = "aeiouy\xE4\xF6"
+    __restricted_vowels = "aeiou\xE4\xF6"
+    __long_vowels = ("aa", "ee", "ii", "oo", "uu", "\xE4\xE4",
+                     "\xF6\xF6")
+    __consonants = "bcdfghjklmnpqrstvwxz"
+    __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj",
+                           "kk", "ll", "mm", "nn", "pp", "qq", "rr",
+                           "ss", "tt", "vv", "ww", "xx", "zz")
+    __step1_suffixes = ('kaan', 'k\xE4\xE4n', 'sti', 'kin', 'han',
+                        'h\xE4n', 'ko', 'k\xF6', 'pa', 'p\xE4')
+    __step2_suffixes = ('nsa', 'ns\xE4', 'mme', 'nne', 'si', 'ni',
+                        'an', '\xE4n', 'en')
+    __step3_suffixes = ('siin', 'tten', 'seen', 'han', 'hen', 'hin',
+                        'hon', 'h\xE4n', 'h\xF6n', 'den', 'tta',
+                        'tt\xE4', 'ssa', 'ss\xE4', 'sta',
+                        'st\xE4', 'lla', 'll\xE4', 'lta',
+                        'lt\xE4', 'lle', 'ksi', 'ine', 'ta',
+                        't\xE4', 'na', 'n\xE4', 'a', '\xE4',
+                        'n')
+    __step4_suffixes = ('impi', 'impa', 'imp\xE4', 'immi', 'imma',
+                        'imm\xE4', 'mpi', 'mpa', 'mp\xE4', 'mmi',
+                        'mma', 'mm\xE4', 'eja', 'ej\xE4')
+
+    def stem(self, word):
+        """
+        Stem a Finnish word and return the stemmed form.
+
+        :param word: The word that is stemmed.
+        :type word: str or unicode
+        :return: The stemmed form.
+        :rtype: unicode
+
+        """
+        word = word.lower()
+
+        if word in self.stopwords:
+            return word
+
+        step3_success = False
+
+        r1, r2 = self._r1r2_standard(word, self.__vowels)
+
+        # STEP 1: Particles etc.
+        for suffix in self.__step1_suffixes:
+            if r1.endswith(suffix):
+                if suffix == "sti":
+                    if suffix in r2:
+                        word = word[:-3]
+                        r1 = r1[:-3]
+                        r2 = r2[:-3]
+                else:
+                    if word[-len(suffix)-1] in "ntaeiouy\xE4\xF6":
+                        word = word[:-len(suffix)]
+                        r1 = r1[:-len(suffix)]
+                        r2 = r2[:-len(suffix)]
+                break
+
+        # STEP 2: Possessives
+        for suffix in self.__step2_suffixes:
+            if r1.endswith(suffix):
+                if suffix == "si":
+                    if word[-3] != "k":
+                        word = word[:-2]
+                        r1 = r1[:-2]
+                        r2 = r2[:-2]
+
+                elif suffix == "ni":
+                    word = word[:-2]
+                    r1 = r1[:-2]
+                    r2 = r2[:-2]
+                    if word.endswith("kse"):
+                        word = suffix_replace(word, "kse", "ksi")
+
+                    if r1.endswith("kse"):
+                        r1 = suffix_replace(r1, "kse", "ksi")
+
+                    if r2.endswith("kse"):
+                        r2 = suffix_replace(r2, "kse", "ksi")
+
+                elif suffix == "an":
+                    if (word[-4:-2] in ("ta", "na") or
+                        word[-5:-2] in ("ssa", "sta", "lla", "lta")):
+                        word = word[:-2]
+                        r1 = r1[:-2]
+                        r2 = r2[:-2]
+
+                elif suffix == "\xE4n":
+                    if (word[-4:-2] in ("t\xE4", "n\xE4") or
+                        word[-5:-2] in ("ss\xE4", "st\xE4",
+                                        "ll\xE4", "lt\xE4")):
+                        word = word[:-2]
+                        r1 = r1[:-2]
+                        r2 = r2[:-2]
+
+                elif suffix == "en":
+                    if word[-5:-2] in ("lle", "ine"):
+                        word = word[:-2]
+                        r1 = r1[:-2]
+                        r2 = r2[:-2]
+                else:
+                    word = word[:-3]
+                    r1 = r1[:-3]
+                    r2 = r2[:-3]
+                break
+
+        # STEP 3: Cases
+        for suffix in self.__step3_suffixes:
+            if r1.endswith(suffix):
+                if suffix in ("han", "hen", "hin", "hon", "h\xE4n",
+                              "h\xF6n"):
+                    if ((suffix == "han" and word[-4] == "a") or
+                        (suffix == "hen" and word[-4] == "e") or
+                        (suffix == "hin" and word[-4] == "i") or
+                        (suffix == "hon" and word[-4] == "o") or
+                        (suffix == "h\xE4n" and word[-4] == "\xE4") or
+                        (suffix == "h\xF6n" and word[-4] == "\xF6")):
+                        word = word[:-3]
+                        r1 = r1[:-3]
+                        r2 = r2[:-3]
+                        step3_success = True
+
+                elif suffix in ("siin", "den", "tten"):
+                    if (word[-len(suffix)-1] == "i" and
+                        word[-len(suffix)-2] in self.__restricted_vowels):
+                        word = word[:-len(suffix)]
+                        r1 = r1[:-len(suffix)]
+                        r2 = r2[:-len(suffix)]
+                        step3_success = True
+                    else:
+                        continue
+
+                elif suffix == "seen":
+                    if word[-6:-4] in self.__long_vowels:
+                        word = word[:-4]
+                        r1 = r1[:-4]
+                        r2 = r2[:-4]
+                        step3_success = True
+                    else:
+                        continue
+
+                elif suffix in ("a", "\xE4"):
+                    if word[-2] in self.__vowels and word[-3] in self.__consonants:
+                        word = word[:-1]
+                        r1 = r1[:-1]
+                        r2 = r2[:-1]
+                        step3_success = True
+
+                elif suffix in ("tta", "tt\xE4"):
+                    if word[-4] == "e":
+                        word = word[:-3]
+                        r1 = r1[:-3]
+                        r2 = r2[:-3]
+                        step3_success = True
+
+                elif suffix == "n":
+                    word = word[:-1]
+                    r1 = r1[:-1]
+                    r2 = r2[:-1]
+                    step3_success = True
+
+                    if word[-2:] == "ie" or word[-2:] in self.__long_vowels:
+                        word = word[:-1]
+                        r1 = r1[:-1]
+                        r2 = r2[:-1]
+                else:
+                    word = word[:-len(suffix)]
+                    r1 = r1[:-len(suffix)]
+                    r2 = r2[:-len(suffix)]
+                    step3_success = True
+                break
+
+        # STEP 4: Other endings
+        for suffix in self.__step4_suffixes:
+            if r2.endswith(suffix):
+                if suffix in ("mpi", "mpa", "mp\xE4", "mmi", "mma",
+                              "mm\xE4"):
+                    if word[-5:-3] != "po":
+                        word = word[:-3]
+                        r1 = r1[:-3]
+                        r2 = r2[:-3]
+                else:
+                    word = word[:-len(suffix)]
+                    r1 = r1[:-len(suffix)]
+                    r2 = r2[:-len(suffix)]
+                break
+
+        # STEP 5: Plurals
+        if step3_success and len(r1) >= 1 and r1[-1] in "ij":
+            word = word[:-1]
+            r1 = r1[:-1]
+
+        elif (not step3_success and len(r1) >= 2 and
+              r1[-1] == "t" and r1[-2] in self.__vowels):
+            word = word[:-1]
+            r1 = r1[:-1]
+            r2 = r2[:-1]
+            if r2.endswith("imma"):
+                word = word[:-4]
+                r1 = r1[:-4]
+            elif r2.endswith("mma") and r2[-5:-3] != "po":
+                word = word[:-3]
+                r1 = r1[:-3]
+
+        # STEP 6: Tidying up
+        if r1[-2:] in self.__long_vowels:
+            word = word[:-1]
+            r1 = r1[:-1]
+
+        if (len(r1) >= 2 and r1[-2] in self.__consonants and
+            r1[-1] in "a\xE4ei"):
+            word = word[:-1]
+            r1 = r1[:-1]
+
+        if r1.endswith(("oj", "uj")):
+            word = word[:-1]
+            r1 = r1[:-1]
+
+        if r1.endswith("jo"):
+            word = word[:-1]
+            r1 = r1[:-1]
+
+        # If the word ends with a double consonant
+        # followed by zero or more vowels, the last consonant is removed.
+        for i in range(1, len(word)):
+            if word[-i] in self.__vowels:
+                continue
+            else:
+                if i == 1:
+                    if word[-i-1:] in self.__double_consonants:
+                        word = word[:-1]
+                else:
+                    if word[-i-1:-i+1] in self.__double_consonants:
+                        word = "".join((word[:-i], word[-i+1:]))
+                break
+
+
+        return word
+
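+# A minimal usage sketch for FinnishStemmer (illustrative only; same assumptions
+# as above):
+#
+#     from nltk.stem.snowball import FinnishStemmer
+#     print(FinnishStemmer().stem("taloissa"))  # prints the stemmed form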
+
+
+class FrenchStemmer(_StandardStemmer):
+
+    """
+    The French Snowball stemmer.
+
+    :cvar __vowels: The French vowels.
+    :type __vowels: unicode
+    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
+    :type __step1_suffixes: tuple
+    :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm.
+    :type __step2a_suffixes: tuple
+    :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm.
+    :type __step2b_suffixes: tuple
+    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
+    :type __step4_suffixes: tuple
+    :note: A detailed description of the French
+           stemming algorithm can be found at
+           http://snowball.tartarus.org/algorithms/french/stemmer.html
+    """
+
+    __vowels = "aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9"
+    __step1_suffixes = ('issements', 'issement', 'atrices', 'atrice',
+                        'ateurs', 'ations', 'logies', 'usions',
+                        'utions', 'ements', 'amment', 'emment',
+                        'ances', 'iqUes', 'ismes', 'ables', 'istes',
+                        'ateur', 'ation', 'logie', 'usion', 'ution',
+                        'ences', 'ement', 'euses', 'ments', 'ance',
+                        'iqUe', 'isme', 'able', 'iste', 'ence',
+                        'it\xE9s', 'ives', 'eaux', 'euse', 'ment',
+                        'eux', 'it\xE9', 'ive', 'ifs', 'aux', 'if')
+    __step2a_suffixes = ('issaIent', 'issantes', 'iraIent', 'issante',
+                         'issants', 'issions', 'irions', 'issais',
+                         'issait', 'issant', 'issent', 'issiez', 'issons',
+                         'irais', 'irait', 'irent', 'iriez', 'irons',
+                         'iront', 'isses', 'issez', '\xEEmes',
+                         '\xEEtes', 'irai', 'iras', 'irez', 'isse',
+                         'ies', 'ira', '\xEEt', 'ie', 'ir', 'is',
+                         'it', 'i')
+    __step2b_suffixes = ('eraIent', 'assions', 'erions', 'assent',
+                         'assiez', '\xE8rent', 'erais', 'erait',
+                         'eriez', 'erons', 'eront', 'aIent', 'antes',
+                         'asses', 'ions', 'erai', 'eras', 'erez',
+                         '\xE2mes', '\xE2tes', 'ante', 'ants',
+                         'asse', '\xE9es', 'era', 'iez', 'ais',
+                         'ait', 'ant', '\xE9e', '\xE9s', 'er',
+                         'ez', '\xE2t', 'ai', 'as', '\xE9', 'a')
+    __step4_suffixes = ('i\xE8re', 'I\xE8re', 'ion', 'ier', 'Ier',
+                        'e', '\xEB')
+
+    def stem(self, word):
+        """
+        Stem a French word and return the stemmed form.
+
+        :param word: The word that is stemmed.
+        :type word: str or unicode
+        :return: The stemmed form.
+        :rtype: unicode
+
+        """
+        word = word.lower()
+
+        if word in self.stopwords:
+            return word
+
+        step1_success = False
+        rv_ending_found = False
+        step2a_success = False
+        step2b_success = False
+
+        # Every occurrence of 'u' after 'q' is put into upper case.
+        for i in range(1, len(word)):
+            if word[i-1] == "q" and word[i] == "u":
+                word = "".join((word[:i], "U", word[i+1:]))
+
+        # Every occurrence of 'u' and 'i'
+        # between vowels is put into upper case.
+        # Every occurrence of 'y' preceded or
+        # followed by a vowel is also put into upper case.
+        for i in range(1, len(word)-1):
+            if word[i-1] in self.__vowels and word[i+1] in self.__vowels:
+                if word[i] == "u":
+                    word = "".join((word[:i], "U", word[i+1:]))
+
+                elif word[i] == "i":
+                    word = "".join((word[:i], "I", word[i+1:]))
+
+            if word[i-1] in self.__vowels or word[i+1] in self.__vowels:
+                if word[i] == "y":
+                    word = "".join((word[:i], "Y", word[i+1:]))
+
+        r1, r2 = self._r1r2_standard(word, self.__vowels)
+        rv = self.__rv_french(word, self.__vowels)
+
+        # STEP 1: Standard suffix removal
+        for suffix in self.__step1_suffixes:
+            if word.endswith(suffix):
+                if suffix == "eaux":
+                    word = word[:-1]
+                    step1_success = True
+
+                elif suffix in ("euse", "euses"):
+                    if suffix in r2:
+                        word = word[:-len(suffix)]
+                        step1_success = True
+
+                    elif suffix in r1:
+                        word = suffix_replace(word, suffix, "eux")
+                        step1_success = True
+
+                elif suffix in ("ement", "ements") and suffix in rv:
+                    word = word[:-len(suffix)]
+                    step1_success = True
+
+                    if word[-2:] == "iv" and "iv" in r2:
+                        word = word[:-2]
+
+                        if word[-2:] == "at" and "at" in r2:
+                            word = word[:-2]
+
+                    elif word[-3:] == "eus":
+                        if "eus" in r2:
+                            word = word[:-3]
+                        elif "eus" in r1:
+                            word = "".join((word[:-1], "x"))
+
+                    elif word[-3:] in ("abl", "iqU"):
+                        if "abl" in r2 or "iqU" in r2:
+                            word = word[:-3]
+
+                    elif word[-3:] in ("i\xE8r", "I\xE8r"):
+                        if "i\xE8r" in rv or "I\xE8r" in rv:
+                            word = "".join((word[:-3], "i"))
+
+                elif suffix == "amment" and suffix in rv:
+                    word = suffix_replace(word, "amment", "ant")
+                    rv = suffix_replace(rv, "amment", "ant")
+                    rv_ending_found = True
+
+                elif suffix == "emment" and suffix in rv:
+                    word = suffix_replace(word, "emment", "ent")
+                    rv_ending_found = True
+
+                elif (suffix in ("ment", "ments") and suffix in rv and
+                      not rv.startswith(suffix) and
+                      rv[rv.rindex(suffix)-1] in self.__vowels):
+                    word = word[:-len(suffix)]
+                    rv = rv[:-len(suffix)]
+                    rv_ending_found = True
+
+                elif suffix == "aux" and suffix in r1:
+                    word = "".join((word[:-2], "l"))
+                    step1_success = True
+
+                elif (suffix in ("issement", "issements") and suffix in r1
+                      and word[-len(suffix)-1] not in self.__vowels):
+                    word = word[:-len(suffix)]
+                    step1_success = True
+
+                elif suffix in ("ance", "iqUe", "isme", "able", "iste",
+                              "eux", "ances", "iqUes", "ismes",
+                              "ables", "istes") and suffix in r2:
+                    word = word[:-len(suffix)]
+                    step1_success = True
+
+                elif suffix in ("atrice", "ateur", "ation", "atrices",
+                                "ateurs", "ations") and suffix in r2:
+                    word = word[:-len(suffix)]
+                    step1_success = True
+
+                    if word[-2:] == "ic":
+                        if "ic" in r2:
+                            word = word[:-2]
+                        else:
+                            word = "".join((word[:-2], "iqU"))
+
+                elif suffix in ("logie", "logies") and suffix in r2:
+                    word = suffix_replace(word, suffix, "log")
+                    step1_success = True
+
+                elif (suffix in ("usion", "ution", "usions", "utions") and
+                      suffix in r2):
+                    word = suffix_replace(word, suffix, "u")
+                    step1_success = True
+
+                elif suffix in ("ence", "ences") and suffix in r2:
+                    word = suffix_replace(word, suffix, "ent")
+                    step1_success = True
+
+                elif suffix in ("it\xE9", "it\xE9s") and suffix in r2:
+                    word = word[:-len(suffix)]
+                    step1_success = True
+
+                    if word[-4:] == "abil":
+                        if "abil" in r2:
+                            word = word[:-4]
+                        else:
+                            word = "".join((word[:-2], "l"))
+
+                    elif word[-2:] == "ic":
+                        if "ic" in r2:
+                            word = word[:-2]
+                        else:
+                            word = "".join((word[:-2], "iqU"))
+
+                    elif word[-2:] == "iv":
+                        if "iv" in r2:
+                            word = word[:-2]
+
+                elif (suffix in ("if", "ive", "ifs", "ives") and
+                      suffix in r2):
+                    word = word[:-len(suffix)]
+                    step1_success = True
+
+                    if word[-2:] == "at" and "at" in r2:
+                        word = word[:-2]
+
+                        if word[-2:] == "ic":
+                            if "ic" in r2:
+                                word = word[:-2]
+                            else:
+                                word = "".join((word[:-2], "iqU"))
+                break
+
+        # STEP 2a: Verb suffixes beginning 'i'
+        if not step1_success or rv_ending_found:
+            for suffix in self.__step2a_suffixes:
+                if word.endswith(suffix):
+                    if (suffix in rv and len(rv) > len(suffix) and
+                        rv[rv.rindex(suffix)-1] not in self.__vowels):
+                        word = word[:-len(suffix)]
+                        step2a_success = True
+                    break
+
+            # STEP 2b: Other verb suffixes
+            if not step2a_success:
+                for suffix in self.__step2b_suffixes:
+                    if rv.endswith(suffix):
+                        if suffix == "ions" and "ions" in r2:
+                            word = word[:-4]
+                            step2b_success = True
+
+                        elif suffix in ('eraIent', 'erions', '\xE8rent',
+                                        'erais', 'erait', 'eriez',
+                                        'erons', 'eront', 'erai', 'eras',
+                                        'erez', '\xE9es', 'era', 'iez',
+                                        '\xE9e', '\xE9s', 'er', 'ez',
+                                        '\xE9'):
+                            word = word[:-len(suffix)]
+                            step2b_success = True
+
+                        elif suffix in ('assions', 'assent', 'assiez',
+                                        'aIent', 'antes', 'asses',
+                                        '\xE2mes', '\xE2tes', 'ante',
+                                        'ants', 'asse', 'ais', 'ait',
+                                        'ant', '\xE2t', 'ai', 'as',
+                                        'a'):
+                            word = word[:-len(suffix)]
+                            rv = rv[:-len(suffix)]
+                            step2b_success = True
+                            if rv.endswith("e"):
+                                word = word[:-1]
+                        break
+
+        # STEP 3
+        if step1_success or step2a_success or step2b_success:
+            if word[-1] == "Y":
+                word = "".join((word[:-1], "i"))
+            elif word[-1] == "\xE7":
+                word = "".join((word[:-1], "c"))
+
+        # STEP 4: Residual suffixes
+        else:
+            if (len(word) >= 2 and word[-1] == "s" and
+                word[-2] not in "aiou\xE8s"):
+                word = word[:-1]
+
+            for suffix in self.__step4_suffixes:
+                if word.endswith(suffix):
+                    if suffix in rv:
+                        if (suffix == "ion" and suffix in r2 and
+                            rv[-4] in "st"):
+                            word = word[:-3]
+
+                        elif suffix in ("ier", "i\xE8re", "Ier",
+                                        "I\xE8re"):
+                            word = suffix_replace(word, suffix, "i")
+
+                        elif suffix == "e":
+                            word = word[:-1]
+
+                        elif suffix == "\xEB" and word[-3:-1] == "gu":
+                            word = word[:-1]
+                        break
+
+        # STEP 5: Undouble
+        if word.endswith(("enn", "onn", "ett", "ell", "eill")):
+            word = word[:-1]
+
+        # STEP 6: Un-accent
+        for i in range(1, len(word)):
+            if word[-i] not in self.__vowels:
+                continue
+            else:
+                if i != 1 and word[-i] in ("\xE9", "\xE8"):
+                    word = "".join((word[:-i], "e", word[-i+1:]))
+                break
+
+        word = (word.replace("I", "i")
+                    .replace("U", "u")
+                    .replace("Y", "y"))
+
+
+        return word
+
+
+
+    def __rv_french(self, word, vowels):
+        """
+        Return the region RV that is used by the French stemmer.
+
+        If the word begins with two vowels, RV is the region after
+        the third letter. Otherwise, it is the region after the first
+        vowel not at the beginning of the word, or the end of the word
+        if these positions cannot be found. (Exceptionally, 'par',
+        'col' or 'tap' at the beginning of a word is also taken to
+        define RV as the region to their right.)
+
+        :param word: The French word whose region RV is determined.
+        :type word: str or unicode
+        :param vowels: The French vowels that are used to determine
+                       the region RV.
+        :type vowels: unicode
+        :return: the region RV for the respective French word.
+        :rtype: unicode
+        :note: This helper method is invoked by the stem method of
+               the subclass FrenchStemmer. It is not to be invoked directly!
+
+        """
+        rv = ""
+        if len(word) >= 2:
+            if (word.startswith(("par", "col", "tap")) or
+                (word[0] in vowels and word[1] in vowels)):
+                rv = word[3:]
+            else:
+                for i in range(1, len(word)):
+                    if word[i] in vowels:
+                        rv = word[i+1:]
+                        break
+
+        return rv
+
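+# A minimal usage sketch for FrenchStemmer (illustrative only; same assumptions
+# as above):
+#
+#     from nltk.stem.snowball import FrenchStemmer
+#     print(FrenchStemmer().stem("continuellement"))  # prints the stemmed form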
+
+
+class GermanStemmer(_StandardStemmer):
+
+    """
+    The German Snowball stemmer.
+
+    :cvar __vowels: The German vowels.
+    :type __vowels: unicode
+    :cvar __s_ending: Letters that may directly appear before a word final 's'.
+    :type __s_ending: unicode
+    :cvar __st_ending: Letters that may directly appear before a word final 'st'.
+    :type __st_ending: unicode
+    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
+    :type __step1_suffixes: tuple
+    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
+    :type __step2_suffixes: tuple
+    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
+    :type __step3_suffixes: tuple
+    :note: A detailed description of the German
+           stemming algorithm can be found at
+           http://snowball.tartarus.org/algorithms/german/stemmer.html
+
+    """
+
+    __vowels = "aeiouy\xE4\xF6\xFC"
+    __s_ending = "bdfghklmnrt"
+    __st_ending = "bdfghklmnt"
+
+    __step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s")
+    __step2_suffixes = ("est", "en", "er", "st")
+    __step3_suffixes = ("isch", "lich", "heit", "keit",
+                          "end", "ung", "ig", "ik")
+
+    def stem(self, word):
+        """
+        Stem a German word and return the stemmed form.
+
+        :param word: The word that is stemmed.
+        :type word: str or unicode
+        :return: The stemmed form.
+        :rtype: unicode
+
+        """
+        word = word.lower()
+
+        if word in self.stopwords:
+            return word
+
+        word = word.replace("\xDF", "ss")
+
+        # Every occurrence of 'u' and 'y'
+        # between vowels is put into upper case.
+        for i in range(1, len(word)-1):
+            if word[i-1] in self.__vowels and word[i+1] in self.__vowels:
+                if word[i] == "u":
+                    word = "".join((word[:i], "U", word[i+1:]))
+
+                elif word[i] == "y":
+                    word = "".join((word[:i], "Y", word[i+1:]))
+
+        r1, r2 = self._r1r2_standard(word, self.__vowels)
+
+        # R1 is adjusted so that the region before it
+        # contains at least 3 letters.
+        for i in range(1, len(word)):
+            if word[i] not in self.__vowels and word[i-1] in self.__vowels:
+                if len(word[:i+1]) < 3 and len(word[:i+1]) > 0:
+                    r1 = word[3:]
+                elif len(word[:i+1]) == 0:
+                    return word
+                break
+
+        # STEP 1
+        for suffix in self.__step1_suffixes:
+            if r1.endswith(suffix):
+                if (suffix in ("en", "es", "e") and
+                    word[-len(suffix)-4:-len(suffix)] == "niss"):
+                    word = word[:-len(suffix)-1]
+                    r1 = r1[:-len(suffix)-1]
+                    r2 = r2[:-len(suffix)-1]
+
+                elif suffix == "s":
+                    if word[-2] in self.__s_ending:
+                        word = word[:-1]
+                        r1 = r1[:-1]
+                        r2 = r2[:-1]
+                else:
+                    word = word[:-len(suffix)]
+                    r1 = r1[:-len(suffix)]
+                    r2 = r2[:-len(suffix)]
+                break
+
+        # STEP 2
+        for suffix in self.__step2_suffixes:
+            if r1.endswith(suffix):
+                if suffix == "st":
+                    if word[-3] in self.__st_ending and len(word[:-3]) >= 3:
+                        word = word[:-2]
+                        r1 = r1[:-2]
+                        r2 = r2[:-2]
+                else:
+                    word = word[:-len(suffix)]
+                    r1 = r1[:-len(suffix)]
+                    r2 = r2[:-len(suffix)]
+                break
+
+        # STEP 3: Derivational suffixes
+        for suffix in self.__step3_suffixes:
+            if r2.endswith(suffix):
+                if suffix in ("end", "ung"):
+                    if ("ig" in r2[-len(suffix)-2:-len(suffix)] and
+                        "e" not in r2[-len(suffix)-3:-len(suffix)-2]):
+                        word = word[:-len(suffix)-2]
+                    else:
+                        word = word[:-len(suffix)]
+
+                elif (suffix in ("ig", "ik", "isch") and
+                      "e" not in r2[-len(suffix)-1:-len(suffix)]):
+                    word = word[:-len(suffix)]
+
+                elif suffix in ("lich", "heit"):
+                    if ("er" in r1[-len(suffix)-2:-len(suffix)] or
+                        "en" in r1[-len(suffix)-2:-len(suffix)]):
+                        word = word[:-len(suffix)-2]
+                    else:
+                        word = word[:-len(suffix)]
+
+                elif suffix == "keit":
+                    if "lich" in r2[-len(suffix)-4:-len(suffix)]:
+                        word = word[:-len(suffix)-4]
+
+                    elif "ig" in r2[-len(suffix)-2:-len(suffix)]:
+                        word = word[:-len(suffix)-2]
+                    else:
+                        word = word[:-len(suffix)]
+                break
+
+        # Umlaut accents are removed and
+        # 'u' and 'y' are put back into lower case.
+        word = (word.replace("\xE4", "a").replace("\xF6", "o")
+                    .replace("\xFC", "u").replace("U", "u")
+                    .replace("Y", "y"))
+
+
+        return word
+
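+# A minimal usage sketch for GermanStemmer (illustrative only; same assumptions
+# as above):
+#
+#     from nltk.stem.snowball import GermanStemmer
+#     print(GermanStemmer().stem("aufeinanderfolgenden"))  # prints the stemmed form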
+
+
+class HungarianStemmer(_LanguageSpecificStemmer):
+
+    """
+    The Hungarian Snowball stemmer.
+
+    :cvar __vowels: The Hungarian vowels.
+    :type __vowels: unicode
+    :cvar __digraphs: The Hungarian digraphs.
+    :type __digraphs: tuple
+    :cvar __double_consonants: The Hungarian double consonants.
+    :type __double_consonants: tuple
+    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
+    :type __step1_suffixes: tuple
+    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
+    :type __step2_suffixes: tuple
+    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
+    :type __step3_suffixes: tuple
+    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
+    :type __step4_suffixes: tuple
+    :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
+    :type __step5_suffixes: tuple
+    :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm.
+    :type __step6_suffixes: tuple
+    :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm.
+    :type __step7_suffixes: tuple
+    :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm.
+    :type __step8_suffixes: tuple
+    :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm.
+    :type __step9_suffixes: tuple
+    :note: A detailed description of the Hungarian
+           stemming algorithm can be found at
+           http://snowball.tartarus.org/algorithms/hungarian/stemmer.html
+
+    """
+
+    __vowels = "aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB"
+    __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs")
+    __double_consonants = ("bb", "cc", "ccs", "dd", "ff", "gg",
+                             "ggy", "jj", "kk", "ll", "lly", "mm",
+                             "nn", "nny", "pp", "rr", "ss", "ssz",
+                             "tt", "tty", "vv", "zz", "zzs")
+
+    __step1_suffixes = ("al", "el")
+    __step2_suffixes = ('k\xE9ppen', 'onk\xE9nt', 'enk\xE9nt',
+                        'ank\xE9nt', 'k\xE9pp', 'k\xE9nt', 'ban',
+                        'ben', 'nak', 'nek', 'val', 'vel', 't\xF3l',
+                        't\xF5l', 'r\xF3l', 'r\xF5l', 'b\xF3l',
+                        'b\xF5l', 'hoz', 'hez', 'h\xF6z',
+                        'n\xE1l', 'n\xE9l', '\xE9rt', 'kor',
+                        'ba', 'be', 'ra', 're', 'ig', 'at', 'et',
+                        'ot', '\xF6t', 'ul', '\xFCl', 'v\xE1',
+                        'v\xE9', 'en', 'on', 'an', '\xF6n',
+                        'n', 't')
+    __step3_suffixes = ("\xE1nk\xE9nt", "\xE1n", "\xE9n")
+    __step4_suffixes = ('astul', 'est\xFCl', '\xE1stul',
+                        '\xE9st\xFCl', 'stul', 'st\xFCl')
+    __step5_suffixes = ("\xE1", "\xE9")
+    __step6_suffixes = ('ok\xE9', '\xF6k\xE9', 'ak\xE9',
+                        'ek\xE9', '\xE1k\xE9', '\xE1\xE9i',
+                        '\xE9k\xE9', '\xE9\xE9i', 'k\xE9',
+                        '\xE9i', '\xE9\xE9', '\xE9')
+    __step7_suffixes = ('\xE1juk', '\xE9j\xFCk', '\xFCnk',
+                        'unk', 'juk', 'j\xFCk', '\xE1nk',
+                        '\xE9nk', 'nk', 'uk', '\xFCk', 'em',
+                        'om', 'am', 'od', 'ed', 'ad', '\xF6d',
+                        'ja', 'je', '\xE1m', '\xE1d', '\xE9m',
+                        '\xE9d', 'm', 'd', 'a', 'e', 'o',
+                        '\xE1', '\xE9')
+    __step8_suffixes = ('jaitok', 'jeitek', 'jaink', 'jeink', 'aitok',
+                        'eitek', '\xE1itok', '\xE9itek', 'jaim',
+                        'jeim', 'jaid', 'jeid', 'eink', 'aink',
+                        'itek', 'jeik', 'jaik', '\xE1ink',
+                        '\xE9ink', 'aim', 'eim', 'aid', 'eid',
+                        'jai', 'jei', 'ink', 'aik', 'eik',
+                        '\xE1im', '\xE1id', '\xE1ik', '\xE9im',
+                        '\xE9id', '\xE9ik', 'im', 'id', 'ai',
+                        'ei', 'ik', '\xE1i', '\xE9i', 'i')
+    __step9_suffixes = ("\xE1k", "\xE9k", "\xF6k", "ok",
+                        "ek", "ak", "k")
+
+    def stem(self, word):
+        """
+        Stem a Hungarian word and return the stemmed form.
+
+        :param word: The word that is stemmed.
+        :type word: str or unicode
+        :return: The stemmed form.
+        :rtype: unicode
+
+        """
+        word = word.lower()
+
+        if word in self.stopwords:
+            return word
+
+        r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs)
+
+        # STEP 1: Remove instrumental case
+        if r1.endswith(self.__step1_suffixes):
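+            # If the ending is preceded by a double consonant, delete the
+            # ending and undouble the consonant (e.g. 'ssz' becomes 'sz').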
+            for double_cons in self.__double_consonants:
+                if word[-2-len(double_cons):-2] == double_cons:
+                    word = "".join((word[:-4], word[-3]))
+
+                    if r1[-2-len(double_cons):-2] == double_cons:
+                        r1 = "".join((r1[:-4], r1[-3]))
+                    break
+
+        # STEP 2: Remove frequent cases
+        for suffix in self.__step2_suffixes:
+            if word.endswith(suffix):
+                if r1.endswith(suffix):
+                    word = word[:-len(suffix)]
+                    r1 = r1[:-len(suffix)]
+
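+                    # A trailing accented vowel left behind ('\xE1' or
+                    # '\xE9') is normalised to plain 'a' or 'e'.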
+                    if r1.endswith("\xE1"):
+                        word = "".join((word[:-1], "a"))
+                        r1 = suffix_replace(r1, "\xE1", "a")
+
+                    elif r1.endswith("\xE9"):
+                        word = "".join((word[:-1], "e"))
+                        r1 = suffix_replace(r1, "\xE9", "e")
+                break
+
+        # STEP 3: Remove special cases
+        for suffix in self.__step3_suffixes:
+            if r1.endswith(suffix):
+                if suffix == "\xE9n":
+                    word = suffix_replace(word, suffix, "e")
+                    r1 = suffix_replace(r1, suffix, "e")
+                else:
+                    word = suffix_replace(word, suffix, "a")
+                    r1 = suffix_replace(r1, suffix, "a")
+                break
+
+        # STEP 4: Remove other cases
+        for suffix in self.__step4_suffixes:
+            if r1.endswith(suffix):
+                if suffix == "\xE1stul":
+                    word = suffix_replace(word, suffix, "a")
+                    r1 = suffix_replace(r1, suffix, "a")
+
+                elif suffix == "\xE9st\xFCl":
+                    word = suffix_replace(word, suffix, "e")
+                    r1 = suffix_replace(r1, suffix, "e")
+                else:
+                    word = word[:-len(suffix)]
+                    r1 = r1[:-len(suffix)]
+                break
+
+        # STEP 5: Remove factive case
+        for suffix in self.__step5_suffixes:
+            if r1.endswith(suffix):
+                for double_cons in self.__double_consonants:
+                    if word[-1-len(double_cons):-1] == double_cons:
+                        word = "".join((word[:-3], word[-2]))
+
+                        if r1[-1-len(double_cons):-1] == double_cons:
+                            r1 = "".join((r1[:-3], r1[-2]))
+                        break
+
+        # STEP 6: Remove owned
+        for suffix in self.__step6_suffixes:
+            if r1.endswith(suffix):
+                if suffix in ("\xE1k\xE9", "\xE1\xE9i"):
+                    word = suffix_replace(word, suffix, "a")
+                    r1 = suffix_replace(r1, suffix, "a")
+
+                elif suffix in ("\xE9k\xE9", "\xE9\xE9i",
+                                "\xE9\xE9"):
+                    word = suffix_replace(word, suffix, "e")
+                    r1 = suffix_replace(r1, suffix, "e")
+                else:
+                    word = word[:-len(suffix)]
+                    r1 = r1[:-len(suffix)]
+                break
+
+        # STEP 7: Remove singular owner suffixes
+        for suffix in self.__step7_suffixes:
+            if word.endswith(suffix):
+                if r1.endswith(suffix):
+                    if suffix in ("\xE1nk", "\xE1juk", "\xE1m",
+                                  "\xE1d", "\xE1"):
+                        word = suffix_replace(word, suffix, "a")
+                        r1 = suffix_replace(r1, suffix, "a")
+
+                    elif suffix in ("\xE9nk", "\xE9j\xFCk",
+                                    "\xE9m", "\xE9d", "\xE9"):
+                        word = suffix_replace(word, suffix, "e")
+                        r1 = suffix_replace(r1, suffix, "e")
+                    else:
+                        word = word[:-len(suffix)]
+                        r1 = r1[:-len(suffix)]
+                break
+
+        # STEP 8: Remove plural owner suffixes
+        for suffix in self.__step8_suffixes:
+            if word.endswith(suffix):
+                if r1.endswith(suffix):
+                    if suffix in ("\xE1im", "\xE1id", "\xE1i",
+                                  "\xE1ink", "\xE1itok", "\xE1ik"):
+                        word = suffix_replace(word, suffix, "a")
+                        r1 = suffix_replace(r1, suffix, "a")
+
+                    elif suffix in ("\xE9im", "\xE9id", "\xE9i",
+                                    "\xE9ink", "\xE9itek", "\xE9ik"):
+                        word = suffix_replace(word, suffix, "e")
+                        r1 = suffix_replace(r1, suffix, "e")
+                    else:
+                        word = word[:-len(suffix)]
+                        r1 = r1[:-len(suffix)]
+                break
+
+        # STEP 9: Remove plural suffixes
+        for suffix in self.__step9_suffixes:
+            if word.endswith(suffix):
+                if r1.endswith(suffix):
+                    if suffix == "\xE1k":
+                        word = suffix_replace(word, suffix, "a")
+                    elif suffix == "\xE9k":
+                        word = suffix_replace(word, suffix, "e")
+                    else:
+                        word = word[:-len(suffix)]
+                break
+
+
+        return word
+
+
+
+    def __r1_hungarian(self, word, vowels, digraphs):
+        """
+        Return the region R1 that is used by the Hungarian stemmer.
+
+        If the word begins with a vowel, R1 is defined as the region
+        after the first consonant or digraph (two letters standing for
+        one phoneme) in the word. If the word begins with a consonant,
+        it is defined as the region after the first vowel in the word.
+        If the word does not contain both a vowel and a consonant, R1
+        is the null region at the end of the word.
+
+        :param word: The Hungarian word whose region R1 is determined.
+        :type word: str or unicode
+        :param vowels: The Hungarian vowels that are used to determine
+                       the region R1.
+        :type vowels: unicode
+        :param digraphs: The digraphs that are used to determine the
+                         region R1.
+        :type digraphs: tuple
+        :return: the region R1 for the respective word.
+        :rtype: unicode
+        :note: This helper method is invoked by the stem method of the subclass
+               HungarianStemmer. It is not to be invoked directly!
+
+        """
+        r1 = ""
+        if word[0] in vowels:
+            for digraph in digraphs:
+                if digraph in word[1:]:
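+                    # R1 starts right after the first occurrence in the
+                    # word of the digraph's final letter.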
+                    r1 = word[word.index(digraph[-1])+1:]
+                    return r1
+
+            for i in range(1, len(word)):
+                if word[i] not in vowels:
+                    r1 = word[i+1:]
+                    break
+        else:
+            for i in range(1, len(word)):
+                if word[i] in vowels:
+                    r1 = word[i+1:]
+                    break
+
+        return r1
+
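+# A minimal usage sketch for the HungarianStemmer defined above (the import
+# path nltk.stem.snowball and the example word are assumptions):
+#
+#     from nltk.stem.snowball import HungarianStemmer
+#     HungarianStemmer().stem("almákkal")   # lower-cases the input, then applies steps 1-9
+#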
+
+
+class ItalianStemmer(_StandardStemmer):
+
+    """
+    The Italian Snowball stemmer.
+
+    :cvar __vowels: The Italian vowels.
+    :type __vowels: unicode
+    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
+    :type __step0_suffixes: tuple
+    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
+    :type __step1_suffixes: tuple
+    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
+    :type __step2_suffixes: tuple
+    :note: A detailed description of the Italian
+           stemming algorithm can be found under
+           http://snowball.tartarus.org/algorithms/italian/stemmer.html
+
+    """
+
+    __vowels = "aeiou\xE0\xE8\xEC\xF2\xF9"
+    __step0_suffixes = ('gliela', 'gliele', 'glieli', 'glielo',
+                        'gliene', 'sene', 'mela', 'mele', 'meli',
+                        'melo', 'mene', 'tela', 'tele', 'teli',
+                        'telo', 'tene', 'cela', 'cele', 'celi',
+                        'celo', 'cene', 'vela', 'vele', 'veli',
+                        'velo', 'vene', 'gli', 'ci', 'la', 'le',
+                        'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi')
+    __step1_suffixes = ('atrice', 'atrici', 'azione', 'azioni',
+                        'uzione', 'uzioni', 'usione', 'usioni',
+                        'amento', 'amenti', 'imento', 'imenti',
+                        'amente', 'abile', 'abili', 'ibile', 'ibili',
+                        'mente', 'atore', 'atori', 'logia', 'logie',
+                        'anza', 'anze', 'iche', 'ichi', 'ismo',
+                        'ismi', 'ista', 'iste', 'isti', 'ist\xE0',
+                        'ist\xE8', 'ist\xEC', 'ante', 'anti',
+                        'enza', 'enze', 'ico', 'ici', 'ica', 'ice',
+                        'oso', 'osi', 'osa', 'ose', 'it\xE0',
+                        'ivo', 'ivi', 'iva', 'ive')
+    __step2_suffixes = ('erebbero', 'irebbero', 'assero', 'assimo',
+                        'eranno', 'erebbe', 'eremmo', 'ereste',
+                        'eresti', 'essero', 'iranno', 'irebbe',
+                        'iremmo', 'ireste', 'iresti', 'iscano',
+                        'iscono', 'issero', 'arono', 'avamo', 'avano',
+                        'avate', 'eremo', 'erete', 'erono', 'evamo',
+                        'evano', 'evate', 'iremo', 'irete', 'irono',
+                        'ivamo', 'ivano', 'ivate', 'ammo', 'ando',
+                        'asse', 'assi', 'emmo', 'enda', 'ende',
+                        'endi', 'endo', 'erai', 'erei', 'Yamo',
+                        'iamo', 'immo', 'irai', 'irei', 'isca',
+                        'isce', 'isci', 'isco', 'ano', 'are', 'ata',
+                        'ate', 'ati', 'ato', 'ava', 'avi', 'avo',
+                        'er\xE0', 'ere', 'er\xF2', 'ete', 'eva',
+                        'evi', 'evo', 'ir\xE0', 'ire', 'ir\xF2',
+                        'ita', 'ite', 'iti', 'ito', 'iva', 'ivi',
+                        'ivo', 'ono', 'uta', 'ute', 'uti', 'uto',
+                        'ar', 'ir')
+
+    def stem(self, word):
+        """
+        Stem an Italian word and return the stemmed form.
+
+        :param word: The word that is stemmed.
+        :type word: str or unicode
+        :return: The stemmed form.
+        :rtype: unicode
+
+        """
+        word = word.lower()
+
+        if word in self.stopwords:
+            return word
+
+        step1_success = False
+
+        # All acute accents are replaced by grave accents.
+        word = (word.replace("\xE1", "\xE0")
+                    .replace("\xE9", "\xE8")
+                    .replace("\xED", "\xEC")
+                    .replace("\xF3", "\xF2")
+                    .replace("\xFA", "\xF9"))
+
+        # Every occurrence of 'u' after 'q'
+        # is put into upper case.
+        for i in range(1, len(word)):
+            if word[i-1] == "q" and word[i] == "u":
+                word = "".join((word[:i], "U", word[i+1:]))
+
+        # Every occurrence of 'u' and 'i'
+        # between vowels is put into upper case.
+        for i in range(1, len(word)-1):
+            if word[i-1] in self.__vowels and word[i+1] in self.__vowels:
+                if word[i] == "u":
+                    word = "".join((word[:i], "U", word[i+1:]))
+
+                elif word[i] == "i":
+                    word = "".join((word[:i], "I", word[i+1:]))
+
+        r1, r2 = self._r1r2_standard(word, self.__vowels)
+        rv = self._rv_standard(word, self.__vowels)
+
+        # STEP 0: Attached pronoun
+        for suffix in self.__step0_suffixes:
+            if rv.endswith(suffix):
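+                # A pronoun following a gerund ('ando'/'endo') is deleted;
+                # following an infinitive stem ('ar'/'er'/'ir') it is
+                # replaced by 'e'.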
+                if rv[-len(suffix)-4:-len(suffix)] in ("ando", "endo"):
+                    word = word[:-len(suffix)]
+                    r1 = r1[:-len(suffix)]
+                    r2 = r2[:-len(suffix)]
+                    rv = rv[:-len(suffix)]
+
+                elif (rv[-len(suffix)-2:-len(suffix)] in
+                      ("ar", "er", "ir")):
+                    word = suffix_replace(word, suffix, "e")
+                    r1 = suffix_replace(r1, suffix, "e")
+                    r2 = suffix_replace(r2, suffix, "e")
+                    rv = suffix_replace(rv, suffix, "e")
+                break
+
+        # STEP 1: Standard suffix removal
+        for suffix in self.__step1_suffixes:
+            if word.endswith(suffix):
+                if suffix == "amente" and r1.endswith(suffix):
+                    step1_success = True
+                    word = word[:-6]
+                    r2 = r2[:-6]
+                    rv = rv[:-6]
+
+                    if r2.endswith("iv"):
+                        word = word[:-2]
+                        r2 = r2[:-2]
+                        rv = rv[:-2]
+
+                        if r2.endswith("at"):
+                            word = word[:-2]
+                            rv = rv[:-2]
+
+                    elif r2.endswith(("os", "ic")):
+                        word = word[:-2]
+                        rv = rv[:-2]
+
+                    elif r2.endswith("abil"):
+                        word = word[:-4]
+                        rv = rv[:-4]
+
+                elif (suffix in ("amento", "amenti",
+                                 "imento", "imenti") and
+                      rv.endswith(suffix)):
+                    step1_success = True
+                    word = word[:-6]
+                    rv = rv[:-6]
+
+                elif r2.endswith(suffix):
+                    step1_success = True
+                    if suffix in ("azione", "azioni", "atore", "atori"):
+                        word = word[:-len(suffix)]
+                        r2 = r2[:-len(suffix)]
+                        rv = rv[:-len(suffix)]
+
+                        if r2.endswith("ic"):
+                            word = word[:-2]
+                            rv = rv[:-2]
+
+                    elif suffix in ("logia", "logie"):
+                        word = word[:-2]
+                        rv = rv[:-2]
+
+                    elif suffix in ("uzione", "uzioni",
+                                    "usione", "usioni"):
+                        word = word[:-5]
+                        rv = rv[:-5]
+
+                    elif suffix in ("enza", "enze"):
+                        word = suffix_replace(word, suffix, "te")
+                        rv = suffix_replace(rv, suffix, "te")
+
+                    elif suffix == "it\xE0":
+                        word = word[:-3]
+                        r2 = r2[:-3]
+                        rv = rv[:-3]
+
+                        if r2.endswith(("ic", "iv")):
+                            word = word[:-2]
+                            rv = rv[:-2]
+
+                        elif r2.endswith("abil"):
+                            word = word[:-4]
+                            rv = rv[:-4]
+
+                    elif suffix in ("ivo", "ivi", "iva", "ive"):
+                        word = word[:-3]
+                        r2 = r2[:-3]
+                        rv = rv[:-3]
+
+                        if r2.endswith("at"):
+                            word = word[:-2]
+                            r2 = r2[:-2]
+                            rv = rv[:-2]
+
+                            if r2.endswith("ic"):
+                                word = word[:-2]
+                                rv = rv[:-2]
+                    else:
+                        word = word[:-len(suffix)]
+                        rv = rv[:-len(suffix)]
+                break
+
+        # STEP 2: Verb suffixes
+        if not step1_success:
+            for suffix in self.__step2_suffixes:
+                if rv.endswith(suffix):
+                    word = word[:-len(suffix)]
+                    rv = rv[:-len(suffix)]
+                    break
+
+        # STEP 3a
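+        # Delete a final vowel in RV; if that exposes a final 'i' in RV,
+        # delete it as well.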
+        if rv.endswith(("a", "e", "i", "o", "\xE0", "\xE8",
+                        "\xEC", "\xF2")):
+            word = word[:-1]
+            rv = rv[:-1]
+
+            if rv.endswith("i"):
+                word = word[:-1]
+                rv = rv[:-1]
+
+        # STEP 3b
+        if rv.endswith(("ch", "gh")):
+            word = word[:-1]
+
+        word = word.replace("I", "i").replace("U", "u")
+
+
+        return word
+
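+# A minimal usage sketch for the ItalianStemmer defined above; the generic
+# SnowballStemmer("italian") wrapper from this module is assumed to delegate
+# to this class (the import path and the example word are assumptions):
+#
+#     from nltk.stem.snowball import SnowballStemmer
+#     SnowballStemmer("italian").stem("abbandonata")
+#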
+
+
+class NorwegianStemmer(_ScandinavianStemmer):
+
+    """
+    The Norwegian Snowball stemmer.
+
+    :cvar __vowels: The Norwegian vowels.
+    :type __vowels: unicode
+    :cvar __s_ending: Letters that may directly appear before a word final 's'.
+    :type __s_ending: unicode
+    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
+    :type __step1_suffixes: tuple
+    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
+    :type __step2_suffixes: tuple
+    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
+    :type __step3_suffixes: tuple
+    :note: A detailed description of the Norwegian
+           stemming algorithm can be found under
+           http://snowball.tartarus.org/algorithms/norwegian/stemmer.html
+
+    """
+
+    __vowels = "aeiouy\xE6\xE5\xF8"
+    __s_ending = "bcdfghjlmnoprtvyz"
+    __step1_suffixes = ("hetenes", "hetene", "hetens", "heter",
+                        "heten", "endes", "ande", "ende", "edes",
+                        "enes", "erte", "ede", "ane", "ene", "ens",
+                        "ers", "ets", "het", "ast", "ert", "en",
+                        "ar", "er", "as", "es", "et", "a", "e", "s")
+
+    __step2_suffixes = ("dt", "vt")
+
+    __step3_suffixes = ("hetslov", "eleg", "elig", "elov", "slov",
+                          "leg", "eig", "lig", "els", "lov", "ig")
+
+    def stem(self, word):
+        """
+        Stem a Norwegian word and return the stemmed form.
+
+        :param word: The word that is stemmed.
+        :type word: str or unicode
+        :return: The stemmed form.
+        :rtype: unicode
+
+        """
+        word = word.lower()
+
+        if word in self.stopwords:
+            return word
+
+        r1 = self._r1_scandinavian(word, self.__vowels)
+
+        # STEP 1
+        for suffix in self.__step1_suffixes:
+            if r1.endswith(suffix):
+                if suffix in ("erte", "ert"):
+                    word = suffix_replace(word, suffix, "er")
+                    r1 = suffix_replace(r1, suffix, "er")
+
+                elif suffix == "s":
+                    if (word[-2] in self.__s_ending or
+                        (word[-2] == "k" and word[-3] not in self.__vowels)):
+                        word = word[:-1]
+                        r1 = r1[:-1]
+                else:
+                    word = word[:-len(suffix)]
+                    r1 = r1[:-len(suffix)]
+                break
+
+        # STEP 2
+        for suffix in self.__step2_suffixes:
+            if r1.endswith(suffix):
+                word = word[:-1]
+                r1 = r1[:-1]
+                break
+
+        # STEP 3
+        for suffix in self.__step3_suffixes:
+            if r1.endswith(suffix):
+                word = word[:-len(suffix)]
+                break
+
+
+        return word
+
+
+
+class PortugueseStemmer(_StandardStemmer):
+
+    """
+    The Portuguese Snowball stemmer.
+
+    :cvar __vowels: The Portuguese vowels.
+    :type __vowels: unicode
+    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
+    :type __step1_suffixes: tuple
+    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
+    :type __step2_suffixes: tuple
+    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
+    :type __step4_suffixes: tuple
+    :note: A detailed description of the Portuguese
+           stemming algorithm can be found under
+           http://snowball.tartarus.org/algorithms/portuguese/stemmer.html
+
+    """
+
+    __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4"
+    __step1_suffixes = ('amentos', 'imentos', 'uço~es', 'amento',
+                        'imento', 'adoras', 'adores', 'a\xE7o~es',
+                        'logias', '\xEAncias', 'amente',
+                        'idades', 'an\xE7as', 'ismos', 'istas', 'adora',
+                        'a\xE7a~o', 'antes', '\xE2ncia',
+                        'logia', 'uça~o', '\xEAncia',
+                        'mente', 'idade', 'an\xE7a', 'ezas', 'icos', 'icas',
+                        'ismo', '\xE1vel', '\xEDvel', 'ista',
+                        'osos', 'osas', 'ador', 'ante', 'ivas',
+                        'ivos', 'iras', 'eza', 'ico', 'ica',
+                        'oso', 'osa', 'iva', 'ivo', 'ira')
+    __step2_suffixes = ('ar\xEDamos', 'er\xEDamos', 'ir\xEDamos',
+                        '\xE1ssemos', '\xEAssemos', '\xEDssemos',
+                        'ar\xEDeis', 'er\xEDeis', 'ir\xEDeis',
+                        '\xE1sseis', '\xE9sseis', '\xEDsseis',
+                        '\xE1ramos', '\xE9ramos', '\xEDramos',
+                        '\xE1vamos', 'aremos', 'eremos', 'iremos',
+                        'ariam', 'eriam', 'iriam', 'assem', 'essem',
+                        'issem', 'ara~o', 'era~o', 'ira~o', 'arias',
+                        'erias', 'irias', 'ardes', 'erdes', 'irdes',
+                        'asses', 'esses', 'isses', 'astes', 'estes',
+                        'istes', '\xE1reis', 'areis', '\xE9reis',
+                        'ereis', '\xEDreis', 'ireis', '\xE1veis',
+                        '\xEDamos', 'armos', 'ermos', 'irmos',
+                        'aria', 'eria', 'iria', 'asse', 'esse',
+                        'isse', 'aste', 'este', 'iste', 'arei',
+                        'erei', 'irei', 'aram', 'eram', 'iram',
+                        'avam', 'arem', 'erem', 'irem',
+                        'ando', 'endo', 'indo', 'adas', 'idas',
+                        'ar\xE1s', 'aras', 'er\xE1s', 'eras',
+                        'ir\xE1s', 'avas', 'ares', 'eres', 'ires',
+                        '\xEDeis', 'ados', 'idos', '\xE1mos',
+                        'amos', 'emos', 'imos', 'iras', 'ada', 'ida',
+                        'ar\xE1', 'ara', 'er\xE1', 'era',
+                        'ir\xE1', 'ava', 'iam', 'ado', 'ido',
+                        'ias', 'ais', 'eis', 'ira', 'ia', 'ei', 'am',
+                        'em', 'ar', 'er', 'ir', 'as',
+                        'es', 'is', 'eu', 'iu', 'ou')
+    __step4_suffixes = ("os", "a", "i", "o", "\xE1",
+                        "\xED", "\xF3")
+
+    def stem(self, word):
+        """
+        Stem a Portuguese word and return the stemmed form.
+
+        :param word: The word that is stemmed.
+        :type word: str or unicode
+        :return: The stemmed form.
+        :rtype: unicode
+
+        """
+        word = word.lower()
+
+        if word in self.stopwords:
+            return word
+
+        step1_success = False
+        step2_success = False
+
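+        # Nasalised vowels are rewritten as 'a~' and 'o~' so that the suffix
+        # tables below can match them; they are restored at the end of stem().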
+        word = (word.replace("\xE3", "a~")
+                    .replace("\xF5", "o~")
+                    .replace("q\xFC", "qu")
+                    .replace("g\xFC", "gu"))
+
+        r1, r2 = self._r1r2_standard(word, self.__vowels)
+        rv = self._rv_standard(word, self.__vowels)
+
+        # STEP 1: Standard suffix removal
+        for suffix in self.__step1_suffixes:
+            if word.endswith(suffix):
+                if suffix == "amente" and r1.endswith(suffix):
+                    step1_success = True
+
+                    word = word[:-6]
+                    r2 = r2[:-6]
+                    rv = rv[:-6]
+
+                    if r2.endswith("iv"):
+                        word = word[:-2]
+                        r2 = r2[:-2]
+                        rv = rv[:-2]
+
+                        if r2.endswith("at"):
+                            word = word[:-2]
+                            rv = rv[:-2]
+
+                    elif r2.endswith(("os", "ic", "ad")):
+                        word = word[:-2]
+                        rv = rv[:-2]
+
+                elif (suffix in ("ira", "iras") and rv.endswith(suffix) and
+                      word[-len(suffix)-1:-len(suffix)] == "e"):
+                    step1_success = True
+
+                    word = suffix_replace(word, suffix, "ir")
+                    rv = suffix_replace(rv, suffix, "ir")
+
+                elif r2.endswith(suffix):
+                    step1_success = True
+
+                    if suffix in ("logia", "logias"):
+                        word = suffix_replace(word, suffix, "log")
+                        rv = suffix_replace(rv, suffix, "log")
+
+                    elif suffix in ("uça~o", "uço~es"):
+                        word = suffix_replace(word, suffix, "u")
+                        rv = suffix_replace(rv, suffix, "u")
+
+                    elif suffix in ("\xEAncia", "\xEAncias"):
+                        word = suffix_replace(word, suffix, "ente")
+                        rv = suffix_replace(rv, suffix, "ente")
+
+                    elif suffix == "mente":
+                        word = word[:-5]
+                        r2 = r2[:-5]
+                        rv = rv[:-5]
+
+                        if r2.endswith(("ante", "avel", "ivel")):
+                            word = word[:-4]
+                            rv = rv[:-4]
+
+                    elif suffix in ("idade", "idades"):
+                        word = word[:-len(suffix)]
+                        r2 = r2[:-len(suffix)]
+                        rv = rv[:-len(suffix)]
+
+                        if r2.endswith(("ic", "iv")):
+                            word = word[:-2]
+                            rv = rv[:-2]
+
+                        elif r2.endswith("abil"):
+                            word = word[:-4]
+                            rv = rv[:-4]
+
+                    elif suffix in ("iva", "ivo", "ivas", "ivos"):
+                        word = word[:-len(suffix)]
+                        r2 = r2[:-len(suffix)]
+                        rv = rv[:-len(suffix)]
+
+                        if r2.endswith("at"):
+                            word = word[:-2]
+                            rv = rv[:-2]
+                    else:
+                        word = word[:-len(suffix)]
+                        rv = rv[:-len(suffix)]
+                break
+
+        # STEP 2: Verb suffixes
+        if not step1_success:
+            for suffix in self.__step2_suffixes:
+                if rv.endswith(suffix):
+                    step2_success = True
+
+                    word = word[:-len(suffix)]
+                    rv = rv[:-len(suffix)]
+                    break
+
+        # STEP 3
+        if step1_success or step2_success:
+            if rv.endswith("i") and word[-2] == "c":
+                word = word[:-1]
+                rv = rv[:-1]
+
+        # STEP 4: Residual suffix
+        if not step1_success and not step2_success:
+            for suffix in self.__step4_suffixes:
+                if rv.endswith(suffix):
+                    word = word[:-len(suffix)]
+                    rv = rv[:-len(suffix)]
+                    break
+
+        # STEP 5
+        if rv.endswith(("e", "\xE9", "\xEA")):
+            word = word[:-1]
+            rv = rv[:-1]
+
+            if ((word.endswith("gu") and rv.endswith("u")) or
+                (word.endswith("ci") and rv.endswith("i"))):
+                word = word[:-1]
+
+        elif word.endswith("\xE7"):
+            word = suffix_replace(word, "\xE7", "c")
+
+        word = word.replace("a~", "\xE3").replace("o~", "\xF5")
+
+
+        return word
+
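+# A minimal usage sketch for the PortugueseStemmer defined above (the import
+# path and the example word are assumptions). Nasalised vowels are handled via
+# the 'a~'/'o~' rewriting shown in stem() and restored before returning:
+#
+#     from nltk.stem.snowball import PortugueseStemmer
+#     PortugueseStemmer().stem("nacionalidades")
+#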
+
+
+class RomanianStemmer(_StandardStemmer):
+
+    """
+    The Romanian Snowball stemmer.
+
+    :cvar __vowels: The Romanian vowels.
+    :type __vowels: unicode
+    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
+    :type __step0_suffixes: tuple
+    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
+    :type __step1_suffixes: tuple
+    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
+    :type __step2_suffixes: tuple
+    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
+    :type __step3_suffixes: tuple
+    :note: A detailed description of the Romanian
+           stemming algorithm can be found under
+           http://snowball.tartarus.org/algorithms/romanian/stemmer.html
+
+    """
+
+    __vowels = "aeiou\u0103\xE2\xEE"
+    __step0_suffixes = ('iilor', 'ului', 'elor', 'iile', 'ilor',
+                        'atei', 'a\u0163ie', 'a\u0163ia', 'aua',
+                        'ele', 'iua', 'iei', 'ile', 'ul', 'ea',
+                        'ii')
+    __step1_suffixes = ('abilitate', 'abilitati', 'abilit\u0103\u0163i',
+                        'ibilitate', 'abilit\u0103i', 'ivitate',
+                        'ivitati', 'ivit\u0103\u0163i', 'icitate',
+                        'icitati', 'icit\u0103\u0163i', 'icatori',
+                        'ivit\u0103i', 'icit\u0103i', 'icator',
+                        'a\u0163iune', 'atoare', '\u0103toare',
+                        'i\u0163iune', 'itoare', 'iciva', 'icive',
+                        'icivi', 'iciv\u0103', 'icala', 'icale',
+                        'icali', 'ical\u0103', 'ativa', 'ative',
+                        'ativi', 'ativ\u0103', 'atori', '\u0103tori',
+                        'itiva', 'itive', 'itivi', 'itiv\u0103',
+                        'itori', 'iciv', 'ical', 'ativ', 'ator',
+                        '\u0103tor', 'itiv', 'itor')
+    __step2_suffixes = ('abila', 'abile', 'abili', 'abil\u0103',
+                        'ibila', 'ibile', 'ibili', 'ibil\u0103',
+                        'atori', 'itate', 'itati', 'it\u0103\u0163i',
+                        'abil', 'ibil', 'oasa', 'oas\u0103', 'oase',
+                        'anta', 'ante', 'anti', 'ant\u0103', 'ator',
+                        'it\u0103i', 'iune', 'iuni', 'isme', 'ista',
+                        'iste', 'isti', 'ist\u0103', 'i\u015Fti',
+                        'ata', 'at\u0103', 'ati', 'ate', 'uta',
+                        'ut\u0103', 'uti', 'ute', 'ita', 'it\u0103',
+                        'iti', 'ite', 'ica', 'ice', 'ici', 'ic\u0103',
+                        'osi', 'o\u015Fi', 'ant', 'iva', 'ive', 'ivi',
+                        'iv\u0103', 'ism', 'ist', 'at', 'ut', 'it',
+                        'ic', 'os', 'iv')
+    __step3_suffixes = ('seser\u0103\u0163i', 'aser\u0103\u0163i',
+                        'iser\u0103\u0163i', '\xE2ser\u0103\u0163i',
+                        'user\u0103\u0163i', 'seser\u0103m',
+                        'aser\u0103m', 'iser\u0103m', '\xE2ser\u0103m',
+                        'user\u0103m', 'ser\u0103\u0163i', 'sese\u015Fi',
+                        'seser\u0103', 'easc\u0103', 'ar\u0103\u0163i',
+                        'ur\u0103\u0163i', 'ir\u0103\u0163i',
+                        '\xE2r\u0103\u0163i', 'ase\u015Fi',
+                        'aser\u0103', 'ise\u015Fi', 'iser\u0103',
+                        '\xe2se\u015Fi', '\xE2ser\u0103',
+                        'use\u015Fi', 'user\u0103', 'ser\u0103m',
+                        'sesem', 'indu', '\xE2ndu', 'eaz\u0103',
+                        'e\u015Fti', 'e\u015Fte', '\u0103\u015Fti',
+                        '\u0103\u015Fte', 'ea\u0163i', 'ia\u0163i',
+                        'ar\u0103m', 'ur\u0103m', 'ir\u0103m',
+                        '\xE2r\u0103m', 'asem', 'isem',
+                        '\xE2sem', 'usem', 'se\u015Fi', 'ser\u0103',
+                        'sese', 'are', 'ere', 'ire', '\xE2re',
+                        'ind', '\xE2nd', 'eze', 'ezi', 'esc',
+                        '\u0103sc', 'eam', 'eai', 'eau', 'iam',
+                        'iai', 'iau', 'a\u015Fi', 'ar\u0103',
+                        'u\u015Fi', 'ur\u0103', 'i\u015Fi', 'ir\u0103',
+                        '\xE2\u015Fi', '\xe2r\u0103', 'ase',
+                        'ise', '\xE2se', 'use', 'a\u0163i',
+                        'e\u0163i', 'i\u0163i', '\xe2\u0163i', 'sei',
+                        'ez', 'am', 'ai', 'au', 'ea', 'ia', 'ui',
+                        '\xE2i', '\u0103m', 'em', 'im', '\xE2m',
+                        'se')
+
+    def stem(self, word):
+        """
+        Stem a Romanian word and return the stemmed form.
+
+        :param word: The word that is stemmed.
+        :type word: str or unicode
+        :return: The stemmed form.
+        :rtype: unicode
+
+        """
+        word = word.lower()
+
+        if word in self.stopwords:
+            return word
+
+        step1_success = False
+        step2_success = False
+
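+        # 'u' and 'i' between vowels are put into upper case so that they are
+        # treated as consonants; they are lowered again at the end of stem().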
+        for i in range(1, len(word)-1):
+            if word[i-1] in self.__vowels and word[i+1] in self.__vowels:
+                if word[i] == "u":
+                    word = "".join((word[:i], "U", word[i+1:]))
+
+                elif word[i] == "i":
+                    word = "".join((word[:i], "I", word[i+1:]))
+
+        r1, r2 = self._r1r2_standard(word, self.__vowels)
+        rv = self._rv_standard(word, self.__vowels)
+
+        # STEP 0: Removal of plurals and other simplifications
+        for suffix in self.__step0_suffixes:
+            if word.endswith(suffix):
+                if suffix in r1:
+                    if suffix in ("ul", "ului"):
+                        word = word[:-len(suffix)]
+
+                        if suffix in rv:
+                            rv = rv[:-len(suffix)]
+                        else:
+                            rv = ""
+
+                    elif (suffix == "aua" or suffix == "atei" or
+                          (suffix == "ile" and word[-5:-3] != "ab")):
+                        word = word[:-2]
+
+                    elif suffix in ("ea", "ele", "elor"):
+                        word = suffix_replace(word, suffix, "e")
+
+                        if suffix in rv:
+                            rv = suffix_replace(rv, suffix, "e")
+                        else:
+                            rv = ""
+
+                    elif suffix in ("ii", "iua", "iei",
+                                    "iile", "iilor", "ilor"):
+                        word = suffix_replace(word, suffix, "i")
+
+                        if suffix in rv:
+                            rv = suffix_replace(rv, suffix, "i")
+                        else:
+                            rv = ""
+
+                    elif suffix in ("a\u0163ie", "a\u0163ia"):
+                        word = word[:-1]
+                break
+
+        # STEP 1: Reduction of combining suffixes
+        while True:
+
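+            # Keep reducing combining suffixes until a full pass over the
+            # suffix table makes no replacement.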
+            replacement_done = False
+
+            for suffix in self.__step1_suffixes:
+                if word.endswith(suffix):
+                    if suffix in r1:
+                        step1_success = True
+                        replacement_done = True
+
+                        if suffix in ("abilitate", "abilitati",
+                                      "abilit\u0103i",
+                                      "abilit\u0103\u0163i"):
+                            word = suffix_replace(word, suffix, "abil")
+
+                        elif suffix == "ibilitate":
+                            word = word[:-5]
+
+                        elif suffix in ("ivitate", "ivitati",
+                                        "ivit\u0103i",
+                                        "ivit\u0103\u0163i"):
+                            word = suffix_replace(word, suffix, "iv")
+
+                        elif suffix in ("icitate", "icitati", "icit\u0103i",
+                                        "icit\u0103\u0163i", "icator",
+                                        "icatori", "iciv", "iciva",
+                                        "icive", "icivi", "iciv\u0103",
+                                        "ical", "icala", "icale", "icali",
+                                        "ical\u0103"):
+                            word = suffix_replace(word, suffix, "ic")
+
+                        elif suffix in ("ativ", "ativa", "ative", "ativi",
+                                        "ativ\u0103", "a\u0163iune",
+                                        "atoare", "ator", "atori",
+                                        "\u0103toare",
+                                        "\u0103tor", "\u0103tori"):
+                            word = suffix_replace(word, suffix, "at")
+
+                            if suffix in r2:
+                                r2 = suffix_replace(r2, suffix, "at")
+
+                        elif suffix in ("itiv", "itiva", "itive", "itivi",
+                                        "itiv\u0103", "i\u0163iune",
+                                        "itoare", "itor", "itori"):
+                            word = suffix_replace(word, suffix, "it")
+
+                            if suffix in r2:
+                                r2 = suffix_replace(r2, suffix, "it")
+                    else:
+                        step1_success = False
+                    break
+
+            if not replacement_done:
+                break
+
+        # STEP 2: Removal of standard suffixes
+        for suffix in self.__step2_suffixes:
+            if word.endswith(suffix):
+                if suffix in r2:
+                    step2_success = True
+
+                    if suffix in ("iune", "iuni"):
+                        if word[-5] == "\u0163":
+                            word = "".join((word[:-5], "t"))
+
+                    elif suffix in ("ism", "isme", "ist", "ista", "iste",
+                                    "isti", "ist\u0103", "i\u015Fti"):
+                        word = suffix_replace(word, suffix, "ist")
+
+                    else:
+                        word = word[:-len(suffix)]
+                break
+
+        # STEP 3: Removal of verb suffixes
+        if not step1_success and not step2_success:
+            for suffix in self.__step3_suffixes:
+                if word.endswith(suffix):
+                    if suffix in rv:
+                        if suffix in ('seser\u0103\u0163i', 'seser\u0103m',
+                                      'ser\u0103\u0163i', 'sese\u015Fi',
+                                      'seser\u0103', 'ser\u0103m', 'sesem',
+                                      'se\u015Fi', 'ser\u0103', 'sese',
+                                      'a\u0163i', 'e\u0163i', 'i\u0163i',
+                                      '\xE2\u0163i', 'sei', '\u0103m',
+                                      'em', 'im', '\xE2m', 'se'):
+                            word = word[:-len(suffix)]
+                            rv = rv[:-len(suffix)]
+                        else:
+                            if (not rv.startswith(suffix) and
+                                rv[rv.index(suffix)-1] not in
+                                "aeio\u0103\xE2\xEE"):
+                                word = word[:-len(suffix)]
+                        break
+
+        # STEP 4: Removal of final vowel
+        for suffix in ("ie", "a", "e", "i", "\u0103"):
+            if word.endswith(suffix):
+                if suffix in rv:
+                    word = word[:-len(suffix)]
+                break
+
+        word = word.replace("I", "i").replace("U", "u")
+
+
+        return word
+
+
+
+class RussianStemmer(_LanguageSpecificStemmer):
+
+    """
+    The Russian Snowball stemmer.
+
+    :cvar __perfective_gerund_suffixes: Suffixes to be deleted.
+    :type __perfective_gerund_suffixes: tuple
+    :cvar __adjectival_suffixes: Suffixes to be deleted.
+    :type __adjectival_suffixes: tuple
+    :cvar __reflexive_suffixes: Suffixes to be deleted.
+    :type __reflexive_suffixes: tuple
+    :cvar __verb_suffixes: Suffixes to be deleted.
+    :type __verb_suffixes: tuple
+    :cvar __noun_suffixes: Suffixes to be deleted.
+    :type __noun_suffixes: tuple
+    :cvar __superlative_suffixes: Suffixes to be deleted.
+    :type __superlative_suffixes: tuple
+    :cvar __derivational_suffixes: Suffixes to be deleted.
+    :type __derivational_suffixes: tuple
+    :note: A detailed description of the Russian
+           stemming algorithm can be found under
+           http://snowball.tartarus.org/algorithms/russian/stemmer.html
+
+    """
+
+    __perfective_gerund_suffixes = ("ivshis'", "yvshis'", "vshis'",
+                                      "ivshi", "yvshi", "vshi", "iv",
+                                      "yv", "v")
+    __adjectival_suffixes = ('ui^ushchi^ui^u', 'ui^ushchi^ai^a',
+                               'ui^ushchimi', 'ui^ushchymi', 'ui^ushchego',
+                               'ui^ushchogo', 'ui^ushchemu', 'ui^ushchomu',
+                               'ui^ushchikh', 'ui^ushchykh',
+                               'ui^ushchui^u', 'ui^ushchaia',
+                               'ui^ushchoi^u', 'ui^ushchei^u',
+                               'i^ushchi^ui^u', 'i^ushchi^ai^a',
+                               'ui^ushchee', 'ui^ushchie',
+                               'ui^ushchye', 'ui^ushchoe', 'ui^ushchei`',
+                               'ui^ushchii`', 'ui^ushchyi`',
+                               'ui^ushchoi`', 'ui^ushchem', 'ui^ushchim',
+                               'ui^ushchym', 'ui^ushchom', 'i^ushchimi',
+                               'i^ushchymi', 'i^ushchego', 'i^ushchogo',
+                               'i^ushchemu', 'i^ushchomu', 'i^ushchikh',
+                               'i^ushchykh', 'i^ushchui^u', 'i^ushchai^a',
+                               'i^ushchoi^u', 'i^ushchei^u', 'i^ushchee',
+                               'i^ushchie', 'i^ushchye', 'i^ushchoe',
+                               'i^ushchei`', 'i^ushchii`',
+                               'i^ushchyi`', 'i^ushchoi`', 'i^ushchem',
+                               'i^ushchim', 'i^ushchym', 'i^ushchom',
+                               'shchi^ui^u', 'shchi^ai^a', 'ivshi^ui^u',
+                               'ivshi^ai^a', 'yvshi^ui^u', 'yvshi^ai^a',
+                               'shchimi', 'shchymi', 'shchego', 'shchogo',
+                               'shchemu', 'shchomu', 'shchikh', 'shchykh',
+                               'shchui^u', 'shchai^a', 'shchoi^u',
+                               'shchei^u', 'ivshimi', 'ivshymi',
+                               'ivshego', 'ivshogo', 'ivshemu', 'ivshomu',
+                               'ivshikh', 'ivshykh', 'ivshui^u',
+                               'ivshai^a', 'ivshoi^u', 'ivshei^u',
+                               'yvshimi', 'yvshymi', 'yvshego', 'yvshogo',
+                               'yvshemu', 'yvshomu', 'yvshikh', 'yvshykh',
+                               'yvshui^u', 'yvshai^a', 'yvshoi^u',
+                               'yvshei^u', 'vshi^ui^u', 'vshi^ai^a',
+                               'shchee', 'shchie', 'shchye', 'shchoe',
+                               'shchei`', 'shchii`', 'shchyi`', 'shchoi`',
+                               'shchem', 'shchim', 'shchym', 'shchom',
+                               'ivshee', 'ivshie', 'ivshye', 'ivshoe',
+                               'ivshei`', 'ivshii`', 'ivshyi`',
+                               'ivshoi`', 'ivshem', 'ivshim', 'ivshym',
+                               'ivshom', 'yvshee', 'yvshie', 'yvshye',
+                               'yvshoe', 'yvshei`', 'yvshii`',
+                               'yvshyi`', 'yvshoi`', 'yvshem',
+                               'yvshim', 'yvshym', 'yvshom', 'vshimi',
+                               'vshymi', 'vshego', 'vshogo', 'vshemu',
+                               'vshomu', 'vshikh', 'vshykh', 'vshui^u',
+                               'vshai^a', 'vshoi^u', 'vshei^u',
+                               'emi^ui^u', 'emi^ai^a', 'nni^ui^u',
+                               'nni^ai^a', 'vshee',
+                               'vshie', 'vshye', 'vshoe', 'vshei`',
+                               'vshii`', 'vshyi`', 'vshoi`',
+                               'vshem', 'vshim', 'vshym', 'vshom',
+                               'emimi', 'emymi', 'emego', 'emogo',
+                               'ememu', 'emomu', 'emikh', 'emykh',
+                               'emui^u', 'emai^a', 'emoi^u', 'emei^u',
+                               'nnimi', 'nnymi', 'nnego', 'nnogo',
+                               'nnemu', 'nnomu', 'nnikh', 'nnykh',
+                               'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u',
+                               'emee', 'emie', 'emye', 'emoe',
+                               'emei`', 'emii`', 'emyi`',
+                               'emoi`', 'emem', 'emim', 'emym',
+                               'emom', 'nnee', 'nnie', 'nnye', 'nnoe',
+                               'nnei`', 'nnii`', 'nnyi`',
+                               'nnoi`', 'nnem', 'nnim', 'nnym',
+                               'nnom', 'i^ui^u', 'i^ai^a', 'imi', 'ymi',
+                               'ego', 'ogo', 'emu', 'omu', 'ikh',
+                               'ykh', 'ui^u', 'ai^a', 'oi^u', 'ei^u',
+                               'ee', 'ie', 'ye', 'oe', 'ei`',
+                               'ii`', 'yi`', 'oi`', 'em',
+                               'im', 'ym', 'om')
+    __reflexive_suffixes = ("si^a", "s'")
+    __verb_suffixes = ("esh'", 'ei`te', 'ui`te', 'ui^ut',
+                         "ish'", 'ete', 'i`te', 'i^ut', 'nno',
+                         'ila', 'yla', 'ena', 'ite', 'ili', 'yli',
+                         'ilo', 'ylo', 'eno', 'i^at', 'uet', 'eny',
+                         "it'", "yt'", 'ui^u', 'la', 'na', 'li',
+                         'em', 'lo', 'no', 'et', 'ny', "t'",
+                         'ei`', 'ui`', 'il', 'yl', 'im',
+                         'ym', 'en', 'it', 'yt', 'i^u', 'i`',
+                         'l', 'n')
+    __noun_suffixes = ('ii^ami', 'ii^akh', 'i^ami', 'ii^am', 'i^akh',
+                         'ami', 'iei`', 'i^am', 'iem', 'akh',
+                         'ii^u', "'i^u", 'ii^a', "'i^a", 'ev', 'ov',
+                         'ie', "'e", 'ei', 'ii', 'ei`',
+                         'oi`', 'ii`', 'em', 'am', 'om',
+                         'i^u', 'i^a', 'a', 'e', 'i', 'i`',
+                         'o', 'u', 'y', "'")
+    __superlative_suffixes = ("ei`she", "ei`sh")
+    __derivational_suffixes = ("ost'", "ost")
+
+    def stem(self, word):
+        """
+        Stem a Russian word and return the stemmed form.
+
+        :param word: The word that is stemmed.
+        :type word: str or unicode
+        :return: The stemmed form.
+        :rtype: unicode
+
+        """
+        if word in self.stopwords:
+            return word
+
+        chr_exceeded = False
+        for i in range(len(word)):
+            if ord(word[i]) > 255:
+                chr_exceeded = True
+                break
+
+        if chr_exceeded:
+            word = self.__cyrillic_to_roman(word)
+
+        step1_success = False
+        adjectival_removed = False
+        verb_removed = False
+        undouble_success = False
+        superlative_removed = False
+
+        rv, r2 = self.__regions_russian(word)
+
+        # Step 1
+        for suffix in self.__perfective_gerund_suffixes:
+            if rv.endswith(suffix):
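+                # The short gerund endings are removed only when preceded by
+                # 'a' or by 'i^a' (the transliteration used here for Cyrillic ya).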
+                if suffix in ("v", "vshi", "vshis'"):
+                    if (rv[-len(suffix)-3:-len(suffix)] == "i^a" or
+                        rv[-len(suffix)-1:-len(suffix)] == "a"):
+                        word = word[:-len(suffix)]
+                        r2 = r2[:-len(suffix)]
+                        rv = rv[:-len(suffix)]
+                        step1_success = True
+                        break
+                else:
+                    word = word[:-len(suffix)]
+                    r2 = r2[:-len(suffix)]
+                    rv = rv[:-len(suffix)]
+                    step1_success = True
+                    break
+
+        if not step1_success:
+            for suffix in self.__reflexive_suffixes:
+                if rv.endswith(suffix):
+                    word = word[:-len(suffix)]
+                    r2 = r2[:-len(suffix)]
+                    rv = rv[:-len(suffix)]
+                    break
+
+            for suffix in self.__adjectival_suffixes:
+                if rv.endswith(suffix):
+                    if suffix in ('i^ushchi^ui^u', 'i^ushchi^ai^a',
+                              'i^ushchui^u', 'i^ushchai^a', 'i^ushchoi^u',
+                              'i^ushchei^u', 'i^ushchimi', 'i^ushchymi',
+                              'i^ushchego', 'i^ushchogo', 'i^ushchemu',
+                              'i^ushchomu', 'i^ushchikh', 'i^ushchykh',
+                              'shchi^ui^u', 'shchi^ai^a', 'i^ushchee',
+                              'i^ushchie', 'i^ushchye', 'i^ushchoe',
+                              'i^ushchei`', 'i^ushchii`', 'i^ushchyi`',
+                              'i^ushchoi`', 'i^ushchem', 'i^ushchim',
+                              'i^ushchym', 'i^ushchom', 'vshi^ui^u',
+                              'vshi^ai^a', 'shchui^u', 'shchai^a',
+                              'shchoi^u', 'shchei^u', 'emi^ui^u',
+                              'emi^ai^a', 'nni^ui^u', 'nni^ai^a',
+                              'shchimi', 'shchymi', 'shchego', 'shchogo',
+                              'shchemu', 'shchomu', 'shchikh', 'shchykh',
+                              'vshui^u', 'vshai^a', 'vshoi^u', 'vshei^u',
+                              'shchee', 'shchie', 'shchye', 'shchoe',
+                              'shchei`', 'shchii`', 'shchyi`', 'shchoi`',
+                              'shchem', 'shchim', 'shchym', 'shchom',
+                              'vshimi', 'vshymi', 'vshego', 'vshogo',
+                              'vshemu', 'vshomu', 'vshikh', 'vshykh',
+                              'emui^u', 'emai^a', 'emoi^u', 'emei^u',
+                              'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u',
+                              'vshee', 'vshie', 'vshye', 'vshoe',
+                              'vshei`', 'vshii`', 'vshyi`', 'vshoi`',
+                              'vshem', 'vshim', 'vshym', 'vshom',
+                              'emimi', 'emymi', 'emego', 'emogo',
+                              'ememu', 'emomu', 'emikh', 'emykh',
+                              'nnimi', 'nnymi', 'nnego', 'nnogo',
+                              'nnemu', 'nnomu', 'nnikh', 'nnykh',
+                              'emee', 'emie', 'emye', 'emoe', 'emei`',
+                              'emii`', 'emyi`', 'emoi`', 'emem', 'emim',
+                              'emym', 'emom', 'nnee', 'nnie', 'nnye',
+                              'nnoe', 'nnei`', 'nnii`', 'nnyi`', 'nnoi`',
+                              'nnem', 'nnim', 'nnym', 'nnom'):
+                        if (rv[-len(suffix)-3:-len(suffix)] == "i^a" or
+                            rv[-len(suffix)-1:-len(suffix)] == "a"):
+                            word = word[:-len(suffix)]
+                            r2 = r2[:-len(suffix)]
+                            rv = rv[:-len(suffix)]
+                            adjectival_removed = True
+                            break
+                    else:
+                        word = word[:-len(suffix)]
+                        r2 = r2[:-len(suffix)]
+                        rv = rv[:-len(suffix)]
+                        adjectival_removed = True
+                        break
+
+            if not adjectival_removed:
+                for suffix in self.__verb_suffixes:
+                    if rv.endswith(suffix):
+                        if suffix in ("la", "na", "ete", "i`te", "li",
+                                      "i`", "l", "em", "n", "lo", "no",
+                                      "et", "i^ut", "ny", "t'", "esh'",
+                                      "nno"):
+                            if (rv[-len(suffix)-3:-len(suffix)] == "i^a" or
+                                rv[-len(suffix)-1:-len(suffix)] == "a"):
+                                word = word[:-len(suffix)]
+                                r2 = r2[:-len(suffix)]
+                                rv = rv[:-len(suffix)]
+                                verb_removed = True
+                                break
+                        else:
+                            word = word[:-len(suffix)]
+                            r2 = r2[:-len(suffix)]
+                            rv = rv[:-len(suffix)]
+                            verb_removed = True
+                            break
+
+            if not adjectival_removed and not verb_removed:
+                for suffix in self.__noun_suffixes:
+                    if rv.endswith(suffix):
+                        word = word[:-len(suffix)]
+                        r2 = r2[:-len(suffix)]
+                        rv = rv[:-len(suffix)]
+                        break
+
+        # Step 2
+        if rv.endswith("i"):
+            word = word[:-1]
+            r2 = r2[:-1]
+
+        # Step 3
+        for suffix in self.__derivational_suffixes:
+            if r2.endswith(suffix):
+                word = word[:-len(suffix)]
+                break
+
+        # Step 4
+        if word.endswith("nn"):
+            word = word[:-1]
+            undouble_success = True
+
+        if not undouble_success:
+            for suffix in self.__superlative_suffixes:
+                if word.endswith(suffix):
+                    word = word[:-len(suffix)]
+                    superlative_removed = True
+                    break
+            if word.endswith("nn"):
+                word = word[:-1]
+
+        if not undouble_success and not superlative_removed:
+            if word.endswith("'"):
+                word = word[:-1]
+
+        if chr_exceeded:
+            word = self.__roman_to_cyrillic(word)
+
+
+        return word
+
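+    # A minimal usage sketch for this class (the import path and the example
+    # word are assumptions). Cyrillic input is transliterated by
+    # __cyrillic_to_roman() before the steps above and converted back afterwards:
+    #
+    #     from nltk.stem.snowball import RussianStemmer
+    #     RussianStemmer().stem("книгами")
+    #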
+
+
+    def __regions_russian(self, word):
+        """
+        Return the regions RV and R2 which are used by the Russian stemmer.
+
+        In any word, RV is the region after the first vowel,
+        or the end of the word if it contains no vowel.
+
+        R1 is the region after the first non-vowel following a vowel,
+        or the end of the word if there is no such non-vowel.
+
+        R2 is the region after the first non-vowel following
+        a vowel in R1, or the end of the word if there is no such non-vowel.
+
+        :param word: The Russian word whose regions RV and R2 are determined.
+        :type word: str or unicode
+        :return: the regions RV and R2 for the respective Russian word.
+        :rtype: tuple
+        :note: This helper method is invoked by the stem method of the subclass
+               RussianStemmer. It is not to be invoked directly!
+
+        """
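+        # Illustrative example (in transliterated form): for "trava",
+        # RV = "va", R1 = "a" and R2 = "" (empty).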
+        r1 = ""
+        r2 = ""
+        rv = ""
+
+        vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y")
+        word = (word.replace("i^a", "A")
+                    .replace("i^u", "U")
+                    .replace("e`", "E"))
+
+        for i in range(1, len(word)):
+            if word[i] not in vowels and word[i-1] in vowels:
+                r1 = word[i+1:]
+                break
+
+        for i in range(1, len(r1)):
+            if r1[i] not in vowels and r1[i-1] in vowels:
+                r2 = r1[i+1:]
+                break
+
+        for i in range(len(word)):
+            if word[i] in vowels:
+                rv = word[i+1:]
+                break
+
+        r2 = (r2.replace("A", "i^a")
+                .replace("U", "i^u")
+                .replace("E", "e`"))
+        rv = (rv.replace("A", "i^a")
+              .replace("U", "i^u")
+              .replace("E", "e`"))
+
+
+        return (rv, r2)
+
+
+
+    def __cyrillic_to_roman(self, word):
+        """
+        Transliterate a Russian word into the Roman alphabet.
+
+        A Russian word whose letters consist of the Cyrillic
+        alphabet is transliterated into the Roman alphabet
+        in order to ease the forthcoming stemming process.
+
+        :param word: The word that is transliterated.
+        :type word: unicode
+        :return: the transliterated word.
+        :rtype: unicode
+        :note: This helper method is invoked by the stem method of the subclass
+               RussianStemmer. It is not to be invoked directly!
+
+        """
+        word = (word.replace("\u0410", "a").replace("\u0430", "a")
+                    .replace("\u0411", "b").replace("\u0431", "b")
+                    .replace("\u0412", "v").replace("\u0432", "v")
+                    .replace("\u0413", "g").replace("\u0433", "g")
+                    .replace("\u0414", "d").replace("\u0434", "d")
+                    .replace("\u0415", "e").replace("\u0435", "e")
+                    .replace("\u0401", "e").replace("\u0451", "e")
+                    .replace("\u0416", "zh").replace("\u0436", "zh")
+                    .replace("\u0417", "z").replace("\u0437", "z")
+                    .replace("\u0418", "i").replace("\u0438", "i")
+                    .replace("\u0419", "i`").replace("\u0439", "i`")
+                    .replace("\u041A", "k").replace("\u043A", "k")
+                    .replace("\u041B", "l").replace("\u043B", "l")
+                    .replace("\u041C", "m").replace("\u043C", "m")
+                    .replace("\u041D", "n").replace("\u043D", "n")
+                    .replace("\u041E", "o").replace("\u043E", "o")
+                    .replace("\u041F", "p").replace("\u043F", "p")
+                    .replace("\u0420", "r").replace("\u0440", "r")
+                    .replace("\u0421", "s").replace("\u0441", "s")
+                    .replace("\u0422", "t").replace("\u0442", "t")
+                    .replace("\u0423", "u").replace("\u0443", "u")
+                    .replace("\u0424", "f").replace("\u0444", "f")
+                    .replace("\u0425", "kh").replace("\u0445", "kh")
+                    .replace("\u0426", "t^s").replace("\u0446", "t^s")
+                    .replace("\u0427", "ch").replace("\u0447", "ch")
+                    .replace("\u0428", "sh").replace("\u0448", "sh")
+                    .replace("\u0429", "shch").replace("\u0449", "shch")
+                    .replace("\u042A", "''").replace("\u044A", "''")
+                    .replace("\u042B", "y").replace("\u044B", "y")
+                    .replace("\u042C", "'").replace("\u044C", "'")
+                    .replace("\u042D", "e`").replace("\u044D", "e`")
+                    .replace("\u042E", "i^u").replace("\u044E", "i^u")
+                    .replace("\u042F", "i^a").replace("\u044F", "i^a"))
+
+
+        return word
+
+
+
+    def __roman_to_cyrillic(self, word):
+        """
+        Transliterate a Russian word back into the Cyrillic alphabet.
+
+        A Russian word that was transliterated into the Roman alphabet
+        in order to ease the stemming process is transliterated back
+        into the Cyrillic alphabet, its original form.
+
+        :param word: The word that is transliterated.
+        :type word: str or unicode
+        :return: word, the transliterated word.
+        :rtype: unicode
+        :note: This helper method is invoked by the stem method of the subclass
+               RussianStemmer. It is not to be invoked directly!
+
+        """
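+        # Multi-letter transliterations (e.g. "shch", "i^u", "t^s", "kh") are
+        # converted before the single letters they contain, so that they are
+        # not consumed by the shorter replacement rules below.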
+        word = (word.replace("i^u", "\u044E").replace("i^a", "\u044F")
+                    .replace("shch", "\u0449").replace("kh", "\u0445")
+                    .replace("t^s", "\u0446").replace("ch", "\u0447")
+                    .replace("e`", "\u044D").replace("i`", "\u0439")
+                    .replace("sh", "\u0448").replace("k", "\u043A")
+                    .replace("e", "\u0435").replace("zh", "\u0436")
+                    .replace("a", "\u0430").replace("b", "\u0431")
+                    .replace("v", "\u0432").replace("g", "\u0433")
+                    .replace("d", "\u0434").replace("e", "\u0435")
+                    .replace("z", "\u0437").replace("i", "\u0438")
+                    .replace("l", "\u043B").replace("m", "\u043C")
+                    .replace("n", "\u043D").replace("o", "\u043E")
+                    .replace("p", "\u043F").replace("r", "\u0440")
+                    .replace("s", "\u0441").replace("t", "\u0442")
+                    .replace("u", "\u0443").replace("f", "\u0444")
+                    .replace("''", "\u044A").replace("y", "\u044B")
+                    .replace("'", "\u044C"))
+
+
+        return word
+
+
+class SpanishStemmer(_StandardStemmer):
+
+    """
+    The Spanish Snowball stemmer.
+
+    :cvar __vowels: The Spanish vowels.
+    :type __vowels: unicode
+    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
+    :type __step0_suffixes: tuple
+    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
+    :type __step1_suffixes: tuple
+    :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm.
+    :type __step2a_suffixes: tuple
+    :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm.
+    :type __step2b_suffixes: tuple
+    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
+    :type __step3_suffixes: tuple
+    :note: A detailed description of the Spanish
+           stemming algorithm can be found under
+           http://snowball.tartarus.org/algorithms/spanish/stemmer.html
+
+    """
+
+    __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xFC"
+    __step0_suffixes = ("selas", "selos", "sela", "selo", "las",
+                        "les", "los", "nos", "me", "se", "la", "le",
+                        "lo")
+    __step1_suffixes = ('amientos', 'imientos', 'amiento', 'imiento',
+                        'aciones', 'uciones', 'adoras', 'adores',
+                        'ancias', 'log\xEDas', 'encias', 'amente',
+                        'idades', 'anzas', 'ismos', 'ables', 'ibles',
+                        'istas', 'adora', 'aci\xF3n', 'antes',
+                        'ancia', 'log\xEDa', 'uci\xf3n', 'encia',
+                        'mente', 'anza', 'icos', 'icas', 'ismo',
+                        'able', 'ible', 'ista', 'osos', 'osas',
+                        'ador', 'ante', 'idad', 'ivas', 'ivos',
+                        'ico',
+                        'ica', 'oso', 'osa', 'iva', 'ivo')
+    __step2a_suffixes = ('yeron', 'yendo', 'yamos', 'yais', 'yan',
+                         'yen', 'yas', 'yes', 'ya', 'ye', 'yo',
+                         'y\xF3')
+    __step2b_suffixes = ('ar\xEDamos', 'er\xEDamos', 'ir\xEDamos',
+                         'i\xE9ramos', 'i\xE9semos', 'ar\xEDais',
+                         'aremos', 'er\xEDais', 'eremos',
+                         'ir\xEDais', 'iremos', 'ierais', 'ieseis',
+                         'asteis', 'isteis', '\xE1bamos',
+                         '\xE1ramos', '\xE1semos', 'ar\xEDan',
+                         'ar\xEDas', 'ar\xE9is', 'er\xEDan',
+                         'er\xEDas', 'er\xE9is', 'ir\xEDan',
+                         'ir\xEDas', 'ir\xE9is',
+                         'ieran', 'iesen', 'ieron', 'iendo', 'ieras',
+                         'ieses', 'abais', 'arais', 'aseis',
+                         '\xE9amos', 'ar\xE1n', 'ar\xE1s',
+                         'ar\xEDa', 'er\xE1n', 'er\xE1s',
+                         'er\xEDa', 'ir\xE1n', 'ir\xE1s',
+                         'ir\xEDa', 'iera', 'iese', 'aste', 'iste',
+                         'aban', 'aran', 'asen', 'aron', 'ando',
+                         'abas', 'adas', 'idas', 'aras', 'ases',
+                         '\xEDais', 'ados', 'idos', 'amos', 'imos',
+                         'emos', 'ar\xE1', 'ar\xE9', 'er\xE1',
+                         'er\xE9', 'ir\xE1', 'ir\xE9', 'aba',
+                         'ada', 'ida', 'ara', 'ase', '\xEDan',
+                         'ado', 'ido', '\xEDas', '\xE1is',
+                         '\xE9is', '\xEDa', 'ad', 'ed', 'id',
+                         'an', 'i\xF3', 'ar', 'er', 'ir', 'as',
+                         '\xEDs', 'en', 'es')
+    __step3_suffixes = ("os", "a", "e", "o", "\xE1",
+                        "\xE9", "\xED", "\xF3")
+
+    def stem(self, word):
+        """
+        Stem a Spanish word and return the stemmed form.
+
+        :param word: The word that is stemmed.
+        :type word: str or unicode
+        :return: The stemmed form.
+        :rtype: unicode
+
+        """
+        word = word.lower()
+
+        if word in self.stopwords:
+            return word
+
+        step1_success = False
+
+        r1, r2 = self._r1r2_standard(word, self.__vowels)
+        rv = self._rv_standard(word, self.__vowels)
+
+        # STEP 0: Attached pronoun
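+        # (a pronoun is deleted only when it follows a gerund or infinitive
+        #  ending such as "ando", "iendo", "ar", "er" or "ir", possibly
+        #  accented, or "yendo" preceded by "u"; accents are then removed)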
+        for suffix in self.__step0_suffixes:
+            if not (word.endswith(suffix) and rv.endswith(suffix)):
+                continue
+
+            if ((rv[:-len(suffix)].endswith(("ando", "\xE1ndo",
+                                             "ar", "\xE1r",
+                                             "er", "\xE9r",
+                                             "iendo", "i\xE9ndo",
+                                             "ir", "\xEDr"))) or
+                (rv[:-len(suffix)].endswith("yendo") and
+                    word[:-len(suffix)].endswith("uyendo"))):
+
+                word = self.__replace_accented(word[:-len(suffix)])
+                r1 = self.__replace_accented(r1[:-len(suffix)])
+                r2 = self.__replace_accented(r2[:-len(suffix)])
+                rv = self.__replace_accented(rv[:-len(suffix)])
+            break
+
+        # STEP 1: Standard suffix removal
+        for suffix in self.__step1_suffixes:
+            if not word.endswith(suffix):
+                continue
+
+            if suffix == "amente" and r1.endswith(suffix):
+                step1_success = True
+                word = word[:-6]
+                r2 = r2[:-6]
+                rv = rv[:-6]
+
+                if r2.endswith("iv"):
+                    word = word[:-2]
+                    r2 = r2[:-2]
+                    rv = rv[:-2]
+
+                    if r2.endswith("at"):
+                        word = word[:-2]
+                        rv = rv[:-2]
+
+                elif r2.endswith(("os", "ic", "ad")):
+                    word = word[:-2]
+                    rv = rv[:-2]
+
+            elif r2.endswith(suffix):
+                step1_success = True
+                if suffix in ("adora", "ador", "aci\xF3n", "adoras",
+                              "adores", "aciones", "ante", "antes",
+                              "ancia", "ancias"):
+                    word = word[:-len(suffix)]
+                    r2 = r2[:-len(suffix)]
+                    rv = rv[:-len(suffix)]
+
+                    if r2.endswith("ic"):
+                        word = word[:-2]
+                        rv = rv[:-2]
+
+                elif suffix in ("log\xEDa", "log\xEDas"):
+                    word = suffix_replace(word, suffix, "log")
+                    rv = suffix_replace(rv, suffix, "log")
+
+                elif suffix in ("uci\xF3n", "uciones"):
+                    word = suffix_replace(word, suffix, "u")
+                    rv = suffix_replace(rv, suffix, "u")
+
+                elif suffix in ("encia", "encias"):
+                    word = suffix_replace(word, suffix, "ente")
+                    rv = suffix_replace(rv, suffix, "ente")
+
+                elif suffix == "mente":
+                    word = word[:-len(suffix)]
+                    r2 = r2[:-len(suffix)]
+                    rv = rv[:-len(suffix)]
+
+                    if r2.endswith(("ante", "able", "ible")):
+                        word = word[:-4]
+                        rv = rv[:-4]
+
+                elif suffix in ("idad", "idades"):
+                    word = word[:-len(suffix)]
+                    r2 = r2[:-len(suffix)]
+                    rv = rv[:-len(suffix)]
+
+                    for pre_suff in ("abil", "ic", "iv"):
+                        if r2.endswith(pre_suff):
+                            word = word[:-len(pre_suff)]
+                            rv = rv[:-len(pre_suff)]
+
+                elif suffix in ("ivo", "iva", "ivos", "ivas"):
+                    word = word[:-len(suffix)]
+                    r2 = r2[:-len(suffix)]
+                    rv = rv[:-len(suffix)]
+                    if r2.endswith("at"):
+                        word = word[:-2]
+                        rv = rv[:-2]
+                else:
+                    word = word[:-len(suffix)]
+                    rv = rv[:-len(suffix)]
+            break
+
+        # STEP 2a: Verb suffixes beginning 'y'
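+        # (such a suffix is deleted only when it is preceded by "u")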
+        if not step1_success:
+            for suffix in self.__step2a_suffixes:
+                if (rv.endswith(suffix) and
+                        word[-len(suffix)-1:-len(suffix)] == "u"):
+                    word = word[:-len(suffix)]
+                    rv = rv[:-len(suffix)]
+                    break
+
+        # STEP 2b: Other verb suffixes
+            for suffix in self.__step2b_suffixes:
+                if rv.endswith(suffix):
+                    word = word[:-len(suffix)]
+                    rv = rv[:-len(suffix)]
+                    if suffix in ("en", "es", "\xE9is", "emos"):
+                        if word.endswith("gu"):
+                            word = word[:-1]
+
+                        if rv.endswith("gu"):
+                            rv = rv[:-1]
+                    break
+
+        # STEP 3: Residual suffix
+        for suffix in self.__step3_suffixes:
+            if rv.endswith(suffix):
+                word = word[:-len(suffix)]
+                if suffix in ("e", "\xE9"):
+                    rv = rv[:-len(suffix)]
+
+                    if word[-2:] == "gu" and rv.endswith("u"):
+                        word = word[:-1]
+                break
+
+        word = self.__replace_accented(word)
+
+        return word
+
+    def __replace_accented(self, word):
+        """
+        Replaces all accented letters in a word with their non-accented
+        counterparts.
+
+        :param word: A Spanish word, with or without accents
+        :type word: str or unicode
+        :return: a word with the accented letters (á, é, í, ó, ú) replaced with
+                 their non-accented counterparts (a, e, i, o, u)
+        :rtype: str or unicode
+        """
+        return (word.replace("\xE1", "a")
+                .replace("\xE9", "e")
+                .replace("\xED", "i")
+                .replace("\xF3", "o")
+                .replace("\xFA", "u"))
+
+
+class SwedishStemmer(_ScandinavianStemmer):
+
+    """
+    The Swedish Snowball stemmer.
+
+    :cvar __vowels: The Swedish vowels.
+    :type __vowels: unicode
+    :cvar __s_ending: Letters that may directly appear before a word final 's'.
+    :type __s_ending: unicode
+    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
+    :type __step1_suffixes: tuple
+    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
+    :type __step2_suffixes: tuple
+    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
+    :type __step3_suffixes: tuple
+    :note: A detailed description of the Swedish
+           stemming algorithm can be found under
+           http://snowball.tartarus.org/algorithms/swedish/stemmer.html
+
+    """
+
+    __vowels = "aeiouy\xE4\xE5\xF6"
+    __s_ending = "bcdfghjklmnoprtvy"
+    __step1_suffixes = ("heterna", "hetens", "heter", "heten",
+                        "anden", "arnas", "ernas", "ornas", "andes",
+                        "andet", "arens", "arna", "erna", "orna",
+                        "ande", "arne", "aste", "aren", "ades",
+                        "erns", "ade", "are", "ern", "ens", "het",
+                        "ast", "ad", "en", "ar", "er", "or", "as",
+                        "es", "at", "a", "e", "s")
+    __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt")
+    __step3_suffixes = ("fullt", "l\xF6st", "els", "lig", "ig")
+
+    def stem(self, word):
+        """
+        Stem a Swedish word and return the stemmed form.
+
+        :param word: The word that is stemmed.
+        :type word: str or unicode
+        :return: The stemmed form.
+        :rtype: unicode
+
+        """
+        word = word.lower()
+
+        if word in self.stopwords:
+            return word
+
+        r1 = self._r1_scandinavian(word, self.__vowels)
+
+        # STEP 1
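+        # (a bare final "s" is removed only when preceded by one of the
+        #  "valid s-ending" letters listed in __s_ending)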
+        for suffix in self.__step1_suffixes:
+            if r1.endswith(suffix):
+                if suffix == "s":
+                    if word[-2] in self.__s_ending:
+                        word = word[:-1]
+                        r1 = r1[:-1]
+                else:
+                    word = word[:-len(suffix)]
+                    r1 = r1[:-len(suffix)]
+                break
+
+        # STEP 2
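+        # (for these double-consonant endings only the last letter is
+        #  removed, e.g. "dd" -> "d", "tt" -> "t")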
+        for suffix in self.__step2_suffixes:
+            if r1.endswith(suffix):
+                word = word[:-1]
+                r1 = r1[:-1]
+                break
+
+        # STEP 3
+        for suffix in self.__step3_suffixes:
+            if r1.endswith(suffix):
+                if suffix in ("els", "lig", "ig"):
+                    word = word[:-len(suffix)]
+                elif suffix in ("fullt", "l\xF6st"):
+                    word = word[:-1]
+                break
+
+        return word
+
+
+def demo():
+    """
+    This function provides a demonstration of the Snowball stemmers.
+
+    Once this function is invoked and a language is specified, it stems
+    an excerpt of the Universal Declaration of Human Rights
+    (which is part of the NLTK corpus collection) and then prints
+    out both the original and the stemmed text.
+
+    """
+
+    import re
+    from nltk.corpus import udhr
+
+    udhr_corpus = {"arabic":     "Arabic_Alarabia-Arabic",
+                   "danish":     "Danish_Dansk-Latin1",
+                   "dutch":      "Dutch_Nederlands-Latin1",
+                   "english":    "English-Latin1",
+                   "finnish":    "Finnish_Suomi-Latin1",
+                   "french":     "French_Francais-Latin1",
+                   "german":     "German_Deutsch-Latin1",
+                   "hungarian":  "Hungarian_Magyar-UTF8",
+                   "italian":    "Italian_Italiano-Latin1",
+                   "norwegian":  "Norwegian-Latin1",
+                   "porter":     "English-Latin1",
+                   "portuguese": "Portuguese_Portugues-Latin1",
+                   "romanian":   "Romanian_Romana-Latin2",
+                   "russian":    "Russian-UTF8",
+                   "spanish":    "Spanish-Latin1",
+                   "swedish":    "Swedish_Svenska-Latin1",
+                   }
+
+    print("\n")
+    print("******************************")
+    print("Demo for the Snowball stemmers")
+    print("******************************")
+
+    while True:
+
+        language = input("Please enter the name of the language " +
+                             "to be demonstrated\n" +
+                             "/".join(SnowballStemmer.languages) +
+                             "\n" +
+                             "(enter 'exit' in order to leave): ")
+
+        if language == "exit":
+            break
+
+        if language not in SnowballStemmer.languages:
+            print(("\nOops, there is no stemmer for this language. " +
+                   "Please try again.\n"))
+            continue
+
+        stemmer = SnowballStemmer(language)
+        excerpt = udhr.words(udhr_corpus[language])[:300]
+
+        stemmed = " ".join(stemmer.stem(word) for word in excerpt)
+        stemmed = re.sub(r"(.{,70})\s", r'\1\n', stemmed+' ').rstrip()
+        excerpt = " ".join(excerpt)
+        excerpt = re.sub(r"(.{,70})\s", r'\1\n', excerpt+' ').rstrip()
+
+        print("\n")
+        print('-' * 70)
+        print('ORIGINAL'.center(70))
+        print(excerpt)
+        print("\n\n")
+        print('STEMMED RESULTS'.center(70))
+        print(stemmed)
+        print('-' * 70)
+        print("\n")
diff --git a/nlp_resource_data/nltk/stem/snowball.pyc b/nlp_resource_data/nltk/stem/snowball.pyc
new file mode 100755 (executable)
index 0000000..50882bd
Binary files /dev/null and b/nlp_resource_data/nltk/stem/snowball.pyc differ
diff --git a/nlp_resource_data/nltk/stem/util.py b/nlp_resource_data/nltk/stem/util.py
new file mode 100755 (executable)
index 0000000..2ba8547
--- /dev/null
@@ -0,0 +1,22 @@
+# Natural Language Toolkit: Stemmer Utilities
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Helder <he7d3r@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+def suffix_replace(original, old, new):
+    """
+    Replaces the old suffix of the original string with a new suffix.
+    """
+    return original[:-len(old)] + new
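+# e.g. suffix_replace("flies", "ies", "y") -> "fly";
+# prefix_replace("unhappy", "un", "") -> "happy"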
+
+def prefix_replace(original, old, new):
+    """
+    Replaces the old prefix of the original string with a new prefix.
+    :param original: string
+    :param old: string
+    :param new: string
+    :return: string
+    """
+    return new + original[len(old):]
\ No newline at end of file
diff --git a/nlp_resource_data/nltk/stem/util.pyc b/nlp_resource_data/nltk/stem/util.pyc
new file mode 100755 (executable)
index 0000000..62ec717
Binary files /dev/null and b/nlp_resource_data/nltk/stem/util.pyc differ
diff --git a/nlp_resource_data/nltk/stem/wordnet.py b/nlp_resource_data/nltk/stem/wordnet.py
new file mode 100755 (executable)
index 0000000..3a217ff
--- /dev/null
@@ -0,0 +1,51 @@
+# Natural Language Toolkit: WordNet stemmer interface
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
+
+from nltk.corpus.reader.wordnet import NOUN
+from nltk.corpus import wordnet
+from nltk.compat import python_2_unicode_compatible
+
+@python_2_unicode_compatible
+class WordNetLemmatizer(object):
+    """
+    WordNet Lemmatizer
+
+    Lemmatize using WordNet's built-in morphy function.
+    Returns the input word unchanged if it cannot be found in WordNet.
+
+        >>> from nltk.stem import WordNetLemmatizer
+        >>> wnl = WordNetLemmatizer()
+        >>> print(wnl.lemmatize('dogs'))
+        dog
+        >>> print(wnl.lemmatize('churches'))
+        church
+        >>> print(wnl.lemmatize('aardwolves'))
+        aardwolf
+        >>> print(wnl.lemmatize('abaci'))
+        abacus
+        >>> print(wnl.lemmatize('hardrock'))
+        hardrock
+    """
+
+    def __init__(self):
+        pass
+
+    def lemmatize(self, word, pos=NOUN):
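+        # wordnet._morphy returns the candidate lemmas for the given part of
+        # speech; the shortest candidate is returned, or the word itself if
+        # there are no candidates.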
+        lemmas = wordnet._morphy(word, pos)
+        return min(lemmas, key=len) if lemmas else word
+
+    def __repr__(self):
+        return '<WordNetLemmatizer>'
+
+
+# unload wordnet
+def teardown_module(module=None):
+    from nltk.corpus import wordnet
+    wordnet._unload()
+
diff --git a/nlp_resource_data/nltk/stem/wordnet.pyc b/nlp_resource_data/nltk/stem/wordnet.pyc
new file mode 100755 (executable)
index 0000000..e7bc2b8
Binary files /dev/null and b/nlp_resource_data/nltk/stem/wordnet.pyc differ
diff --git a/nlp_resource_data/nltk/tag/__init__.py b/nlp_resource_data/nltk/tag/__init__.py
new file mode 100755 (executable)
index 0000000..34c8798
--- /dev/null
@@ -0,0 +1,152 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Taggers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com> (minor additions)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+"""
+NLTK Taggers
+
+This package contains classes and interfaces for part-of-speech
+tagging, or simply "tagging".
+
+A "tag" is a case-sensitive string that specifies some property of a token,
+such as its part of speech.  Tagged tokens are encoded as tuples
+``(token, tag)``.  For example, the following tagged token combines
+the word ``'fly'`` with a noun part of speech tag (``'NN'``):
+
+    >>> tagged_tok = ('fly', 'NN')
+
+An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset:
+
+    >>> from nltk import pos_tag, word_tokenize
+    >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
+    [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
+    ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
+
+A Russian tagger is also available if you specify lang="rus". It uses 
+the Russian National Corpus tagset:
+
+    >>> pos_tag(word_tokenize("Илья оторопел и дважды перечитал бумажку."), lang='rus')    # doctest: +SKIP
+    [('Илья', 'S'), ('оторопел', 'V'), ('и', 'CONJ'), ('дважды', 'ADV'), ('перечитал', 'V'),
+    ('бумажку', 'S'), ('.', 'NONLEX')]
+
+This package defines several taggers, which take a list of tokens,
+assign a tag to each one, and return the resulting list of tagged tokens.
+Most of the taggers are built automatically based on a training corpus.
+For example, the unigram tagger tags each word *w* by checking what
+the most frequent tag for *w* was in a training corpus:
+
+    >>> from nltk.corpus import brown
+    >>> from nltk.tag import UnigramTagger
+    >>> tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
+    >>> sent = ['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment']
+    >>> for word, tag in tagger.tag(sent):
+    ...     print(word, '->', tag)
+    Mitchell -> NP
+    decried -> None
+    the -> AT
+    high -> JJ
+    rate -> NN
+    of -> IN
+    unemployment -> None
+
+Note that words that the tagger has not seen during training receive a tag
+of ``None``.
+
+We evaluate a tagger on data that was not seen during training:
+
+    >>> tagger.evaluate(brown.tagged_sents(categories='news')[500:600])
+    0.73...
+
+For more information, please consult chapter 5 of the NLTK Book.
+"""
+from __future__ import print_function
+
+from nltk.tag.api           import TaggerI
+from nltk.tag.util          import str2tuple, tuple2str, untag
+from nltk.tag.sequential    import (SequentialBackoffTagger, ContextTagger,
+                                    DefaultTagger, NgramTagger, UnigramTagger,
+                                    BigramTagger, TrigramTagger, AffixTagger,
+                                    RegexpTagger, ClassifierBasedTagger,
+                                    ClassifierBasedPOSTagger)
+from nltk.tag.brill         import BrillTagger
+from nltk.tag.brill_trainer import BrillTaggerTrainer
+from nltk.tag.tnt           import TnT
+from nltk.tag.hunpos        import HunposTagger
+from nltk.tag.stanford      import StanfordTagger, StanfordPOSTagger, StanfordNERTagger
+from nltk.tag.hmm           import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
+from nltk.tag.senna         import SennaTagger, SennaChunkTagger, SennaNERTagger
+from nltk.tag.mapping       import tagset_mapping, map_tag
+from nltk.tag.crf           import CRFTagger
+from nltk.tag.perceptron    import PerceptronTagger
+
+from nltk.data import load, find
+
+RUS_PICKLE = 'taggers/averaged_perceptron_tagger_ru/averaged_perceptron_tagger_ru.pickle'
+
+
+def _get_tagger(lang=None):
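+    # The default tagger is NLTK's averaged perceptron tagger; for lang='rus'
+    # the Russian model is loaded explicitly from RUS_PICKLE in nltk_data.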
+    if lang == 'rus':
+        tagger = PerceptronTagger(False)
+        ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
+        tagger.load(ap_russian_model_loc)
+    else:
+        tagger = PerceptronTagger()
+    return tagger
+
+
+def _pos_tag(tokens, tagset, tagger):
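+    # Tag the tokens, and optionally map the resulting Penn Treebank tags
+    # onto the requested tagset (e.g. 'universal') via map_tag('en-ptb', ...).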
+    tagged_tokens = tagger.tag(tokens)
+    if tagset:
+        tagged_tokens = [(token, map_tag('en-ptb', tagset, tag)) for (token, tag) in tagged_tokens]
+    return tagged_tokens
+
+
+def pos_tag(tokens, tagset=None, lang='eng'):
+    """
+    Use NLTK's currently recommended part of speech tagger to
+    tag the given list of tokens.
+
+        >>> from nltk.tag import pos_tag
+        >>> from nltk.tokenize import word_tokenize
+        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
+        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
+        ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
+        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')
+        [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
+        ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]
+
+    NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence.
+
+    :param tokens: Sequence of tokens to be tagged
+    :type tokens: list(str)
+    :param tagset: the tagset to be used, e.g. universal, wsj, brown
+    :type tagset: str
+    :param lang: the ISO 639 code of the language, e.g. 'eng' for English, 'rus' for Russian
+    :type lang: str
+    :return: The tagged tokens
+    :rtype: list(tuple(str, str))
+    """
+    tagger = _get_tagger(lang)
+    return _pos_tag(tokens, tagset, tagger)    
+
+
+def pos_tag_sents(sentences, tagset=None, lang='eng'):
+    """
+    Use NLTK's currently recommended part of speech tagger to tag the
+    given list of sentences, each consisting of a list of tokens.
+
+    :param sentences: List of sentences to be tagged
+    :type sentences: list(list(str))
+    :param tagset: the tagset to be used, e.g. universal, wsj, brown
+    :type tagset: str
+    :param lang: the ISO 639 code of the language, e.g. 'eng' for English, 'rus' for Russian
+    :type lang: str
+    :return: The list of tagged sentences
+    :rtype: list(list(tuple(str, str)))
+    """
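+    # A single tagger instance is created and reused for all sentences, which
+    # is what makes this more efficient than calling pos_tag() per sentence.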
+    tagger = _get_tagger(lang)
+    return [_pos_tag(sent, tagset, tagger) for sent in sentences]
diff --git a/nlp_resource_data/nltk/tag/__init__.pyc b/nlp_resource_data/nltk/tag/__init__.pyc
new file mode 100755 (executable)
index 0000000..4fada9a
Binary files /dev/null and b/nlp_resource_data/nltk/tag/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/tag/api.py b/nlp_resource_data/nltk/tag/api.py
new file mode 100755 (executable)
index 0000000..804c769
--- /dev/null
@@ -0,0 +1,86 @@
+# Natural Language Toolkit: Tagger Interface
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com> (minor additions)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Interface for tagging each token in a sentence with supplementary
+information, such as its part of speech.
+"""
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+from itertools import chain
+
+from nltk.internals import overridden
+from nltk.metrics import accuracy
+
+from nltk.tag.util import untag
+
+
+@add_metaclass(ABCMeta)
+class TaggerI(object):
+    """
+    A processing interface for assigning a tag to each token in a list.
+    Tags are case sensitive strings that identify some property of each
+    token, such as its part of speech or its sense.
+
+    Some taggers require specific types for their tokens.  This is
+    generally indicated by the use of a sub-interface to ``TaggerI``.
+    For example, featureset taggers, which are subclassed from
+    ``FeaturesetTaggerI``, require that each token be a ``featureset``.
+
+    Subclasses must define:
+      - either ``tag()`` or ``tag_sents()`` (or both)
+    """
+    @abstractmethod
+    def tag(self, tokens):
+        """
+        Determine the most appropriate tag sequence for the given
+        token sequence, and return a corresponding list of tagged
+        tokens.  A tagged token is encoded as a tuple ``(token, tag)``.
+
+        :rtype: list(tuple(str, str))
+        """
+        if overridden(self.tag_sents):
+            return self.tag_sents([tokens])[0]
+
+    def tag_sents(self, sentences):
+        """
+        Apply ``self.tag()`` to each element of *sentences*.  I.e.:
+
+            return [self.tag(sent) for sent in sentences]
+        """
+        return [self.tag(sent) for sent in sentences]
+
+    def evaluate(self, gold):
+        """
+        Score the accuracy of the tagger against the gold standard.
+        Strip the tags from the gold standard text, retag it using
+        the tagger, then compute the accuracy score.
+
+        :type gold: list(list(tuple(str, str)))
+        :param gold: The list of tagged sentences to score the tagger on.
+        :rtype: float
+        """
+
+        tagged_sents = self.tag_sents(untag(sent) for sent in gold)
+        gold_tokens = list(chain(*gold))
+        test_tokens = list(chain(*tagged_sents))
+        return accuracy(gold_tokens, test_tokens)
+
+    def _check_params(self, train, model):
+        if (train and model) or (not train and not model):
+            raise ValueError(
+                    'Must specify either training data or trained model.')
+
+
+class FeaturesetTaggerI(TaggerI):
+    """
+    A tagger that requires tokens to be ``featuresets``.  A featureset
+    is a dictionary that maps from feature names to feature
+    values.  See ``nltk.classify`` for more information about features
+    and featuresets.
+    """
diff --git a/nlp_resource_data/nltk/tag/api.pyc b/nlp_resource_data/nltk/tag/api.pyc
new file mode 100755 (executable)
index 0000000..4aab404
Binary files /dev/null and b/nlp_resource_data/nltk/tag/api.pyc differ
diff --git a/nlp_resource_data/nltk/tag/brill.py b/nlp_resource_data/nltk/tag/brill.py
new file mode 100755 (executable)
index 0000000..24e4df4
--- /dev/null
@@ -0,0 +1,424 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Transformation-based learning
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Marcus Uneson <marcus.uneson@gmail.com>
+#   based on previous (nltk2) version by
+#   Christopher Maloof, Edward Loper, Steven Bird
+# URL: <http://nltk.org/>
+# For license information, see  LICENSE.TXT
+
+from __future__ import print_function, division
+
+from collections import defaultdict, Counter
+
+from nltk.tag import TaggerI
+from nltk.tbl import Feature, Template
+from nltk import jsontags
+
+
+######################################################################
+# Brill Templates
+######################################################################
+
+@jsontags.register_tag
+class Word(Feature):
+    """
+    Feature which examines the text (word) of nearby tokens.
+    """
+
+    json_tag = 'nltk.tag.brill.Word'
+
+    @staticmethod
+    def extract_property(tokens, index):
+        """:return: The given token's text."""
+        return tokens[index][0]
+
+
+@jsontags.register_tag
+class Pos(Feature):
+    """
+    Feature which examines the tags of nearby tokens.
+    """
+
+    json_tag = 'nltk.tag.brill.Pos'
+
+    @staticmethod
+    def extract_property(tokens, index):
+        """:return: The given token's tag."""
+        return tokens[index][1]
+
+
+def nltkdemo18():
+    """
+    Return 18 templates, from the original nltk demo, in multi-feature syntax
+    """
+    return [
+        Template(Pos([-1])),
+        Template(Pos([1])),
+        Template(Pos([-2])),
+        Template(Pos([2])),
+        Template(Pos([-2, -1])),
+        Template(Pos([1, 2])),
+        Template(Pos([-3, -2, -1])),
+        Template(Pos([1, 2, 3])),
+        Template(Pos([-1]), Pos([1])),
+        Template(Word([-1])),
+        Template(Word([1])),
+        Template(Word([-2])),
+        Template(Word([2])),
+        Template(Word([-2, -1])),
+        Template(Word([1, 2])),
+        Template(Word([-3, -2, -1])),
+        Template(Word([1, 2, 3])),
+        Template(Word([-1]), Word([1])),
+    ]
+
+
+def nltkdemo18plus():
+    """
+    Return 18 templates, from the original nltk demo, and additionally a few
+    multi-feature ones (the motivation is easy comparison with nltkdemo18)
+    """
+    return nltkdemo18() + [
+        Template(Word([-1]), Pos([1])),
+        Template(Pos([-1]), Word([1])),
+        Template(Word([-1]), Word([0]), Pos([1])),
+        Template(Pos([-1]), Word([0]), Word([1])),
+        Template(Pos([-1]), Word([0]), Pos([1])),
+    ]
+
+
+def fntbl37():
+    """
+    Return 37 templates taken from the postagging task of the
+    fntbl distribution http://www.cs.jhu.edu/~rflorian/fntbl/
+    (37 is after excluding a handful which do not condition on Pos[0];
+    fntbl can do that but the current nltk implementation cannot.)
+    """
+    return [
+        Template(Word([0]), Word([1]), Word([2])),
+        Template(Word([-1]), Word([0]), Word([1])),
+        Template(Word([0]), Word([-1])),
+        Template(Word([0]), Word([1])),
+        Template(Word([0]), Word([2])),
+        Template(Word([0]), Word([-2])),
+        Template(Word([1, 2])),
+        Template(Word([-2, -1])),
+        Template(Word([1, 2, 3])),
+        Template(Word([-3, -2, -1])),
+        Template(Word([0]), Pos([2])),
+        Template(Word([0]), Pos([-2])),
+        Template(Word([0]), Pos([1])),
+        Template(Word([0]), Pos([-1])),
+        Template(Word([0])),
+        Template(Word([-2])),
+        Template(Word([2])),
+        Template(Word([1])),
+        Template(Word([-1])),
+        Template(Pos([-1]), Pos([1])),
+        Template(Pos([1]), Pos([2])),
+        Template(Pos([-1]), Pos([-2])),
+        Template(Pos([1])),
+        Template(Pos([-1])),
+        Template(Pos([-2])),
+        Template(Pos([2])),
+        Template(Pos([1, 2, 3])),
+        Template(Pos([1, 2])),
+        Template(Pos([-3, -2, -1])),
+        Template(Pos([-2, -1])),
+        Template(Pos([1]), Word([0]), Word([1])),
+        Template(Pos([1]), Word([0]), Word([-1])),
+        Template(Pos([-1]), Word([-1]), Word([0])),
+        Template(Pos([-1]), Word([0]), Word([1])),
+        Template(Pos([-2]), Pos([-1])),
+        Template(Pos([1]), Pos([2])),
+        Template(Pos([1]), Pos([2]), Word([1]))
+    ]
+
+
+def brill24():
+    """
+    Return 24 templates of the seminal TBL paper, Brill (1995)
+    """
+    return [
+        Template(Pos([-1])),
+        Template(Pos([1])),
+        Template(Pos([-2])),
+        Template(Pos([2])),
+        Template(Pos([-2, -1])),
+        Template(Pos([1, 2])),
+        Template(Pos([-3, -2, -1])),
+        Template(Pos([1, 2, 3])),
+        Template(Pos([-1]), Pos([1])),
+        Template(Pos([-2]), Pos([-1])),
+        Template(Pos([1]), Pos([2])),
+        Template(Word([-1])),
+        Template(Word([1])),
+        Template(Word([-2])),
+        Template(Word([2])),
+        Template(Word([-2, -1])),
+        Template(Word([1, 2])),
+        Template(Word([-1, 0])),
+        Template(Word([0, 1])),
+        Template(Word([0])),
+        Template(Word([-1]), Pos([-1])),
+        Template(Word([1]), Pos([1])),
+        Template(Word([0]), Word([-1]), Pos([-1])),
+        Template(Word([0]), Word([1]), Pos([1])),
+    ]
+
+
+def describe_template_sets():
+    """
+    Print the available template sets in this demo, with a short description.
+    """
+    import inspect
+    import sys
+
+    # a bit of magic to get all functions in this module
+    templatesets = inspect.getmembers(sys.modules[__name__], inspect.isfunction)
+    for (name, obj) in templatesets:
+        if name == "describe_template_sets":
+            continue
+        print(name, obj.__doc__, "\n")
+
+
+######################################################################
+# The Brill Tagger
+######################################################################
+
+@jsontags.register_tag
+class BrillTagger(TaggerI):
+    """
+    Brill's transformational rule-based tagger.  Brill taggers use an
+    initial tagger (such as ``tag.DefaultTagger``) to assign an initial
+    tag sequence to a text; and then apply an ordered list of
+    transformational rules to correct the tags of individual tokens.
+    These transformation rules are specified by the ``TagRule``
+    interface.
+
+    Brill taggers can be created directly, from an initial tagger and
+    a list of transformational rules; but more often, Brill taggers
+    are created by learning rules from a training corpus, using one
+    of the TaggerTrainers available.
+    """
+
+    json_tag = 'nltk.tag.BrillTagger'
+
+    def __init__(self, initial_tagger, rules, training_stats=None):
+        """
+        :param initial_tagger: The initial tagger
+        :type initial_tagger: TaggerI
+
+        :param rules: An ordered list of transformation rules that
+            should be used to correct the initial tagging.
+        :type rules: list(TagRule)
+
+        :param training_stats: A dictionary of statistics collected
+            during training, for possible later use
+        :type training_stats: dict
+
+        """
+        self._initial_tagger = initial_tagger
+        self._rules = tuple(rules)
+        self._training_stats = training_stats
+
+    def encode_json_obj(self):
+        return self._initial_tagger, self._rules, self._training_stats
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        _initial_tagger, _rules, _training_stats = obj
+        return cls(_initial_tagger, _rules, _training_stats)
+
+    def rules(self):
+        """
+        Return the ordered list of transformation rules that this tagger has learnt.
+
+        :return: the ordered list of transformation rules that correct the initial tagging
+        :rtype: list of Rules
+        """
+        return self._rules
+
+    def train_stats(self, statistic=None):
+        """
+        Return a named statistic collected during training, or a dictionary of all
+        available statistics if no name given
+
+        :param statistic: name of statistic
+        :type statistic: str
+        :return: some statistic collected during training of this tagger
+        :rtype: any (but usually a number)
+        """
+        if statistic is None:
+            return self._training_stats
+        else:
+            return self._training_stats.get(statistic)
+
+    def tag(self, tokens):
+        # Inherit documentation from TaggerI
+
+        # Run the initial tagger.
+        tagged_tokens = self._initial_tagger.tag(tokens)
+
+        # Create a dictionary that maps each tag to a list of the
+        # indices of tokens that have that tag.
+        tag_to_positions = defaultdict(set)
+        for i, (token, tag) in enumerate(tagged_tokens):
+            tag_to_positions[tag].add(i)
+
+        # Apply each rule, in order.  Only try to apply rules at
+        # positions that have the desired original tag.
+        for rule in self._rules:
+            # Find the positions where it might apply
+            positions = tag_to_positions.get(rule.original_tag, [])
+            # Apply the rule at those positions.
+            changed = rule.apply(tagged_tokens, positions)
+            # Update tag_to_positions with the positions of tags that
+            # were modified.
+            for i in changed:
+                tag_to_positions[rule.original_tag].remove(i)
+                tag_to_positions[rule.replacement_tag].add(i)
+
+        return tagged_tokens
+
+    def print_template_statistics(self, test_stats=None, printunused=True):
+        """
+        Print a list of all templates, ranked according to efficiency.
+
+        If test_stats is available, the templates are ranked according to their
+        relative contribution (summed for all rules created from a given template,
+        weighted by score) to the performance on the test set. If no test_stats is
+        given, statistics collected during training are used instead. There is also
+        an unweighted measure (just counting the rules). This is less informative,
+        though, as many low-score rules will appear towards the end of training.
+
+        :param test_stats: dictionary of statistics collected during testing
+        :type test_stats: dict of str -> any (but usually numbers)
+        :param printunused: if True, print a list of all unused templates
+        :type printunused: bool
+        :return: None
+        :rtype: None
+        """
+        tids = [r.templateid for r in self._rules]
+        train_stats = self.train_stats()
+
+        trainscores = train_stats['rulescores']
+        assert len(trainscores) == len(tids), "corrupt statistics: " \
+            "{0} train scores for {1} rules".format(len(trainscores), len(tids))
+        template_counts = Counter(tids)
+        weighted_traincounts = Counter()
+        for (tid, score) in zip(tids, trainscores):
+            weighted_traincounts[tid] += score
+        tottrainscores = sum(trainscores)
+
+        # det_tplsort() is for deterministic sorting;
+        # the otherwise convenient Counter.most_common() unfortunately
+        # does not break ties deterministically
+        # between python versions and will break cross-version tests
+        def det_tplsort(tpl_value):
+            return (tpl_value[1], repr(tpl_value[0]))
+
+        def print_train_stats():
+            print("TEMPLATE STATISTICS (TRAIN)  {0} templates, {1} rules)".format(
+                len(template_counts),
+                len(tids))
+            )
+            print("TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
+                  "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats))
+            head = "#ID | Score (train) |  #Rules     | Template"
+            print(head, "\n", "-" * len(head), sep="")
+            train_tplscores = sorted(weighted_traincounts.items(), key=det_tplsort, reverse=True)
+            for (tid, trainscore) in train_tplscores:
+                s = "{0} | {1:5d}   {2:5.3f} |{3:4d}   {4:.3f} | {5}".format(
+                    tid,
+                    trainscore,
+                    trainscore/tottrainscores,
+                    template_counts[tid],
+                    template_counts[tid]/len(tids),
+                    Template.ALLTEMPLATES[int(tid)],
+                )
+                print(s)
+
+        def print_testtrain_stats():
+            testscores = test_stats['rulescores']
+            print("TEMPLATE STATISTICS (TEST AND TRAIN) ({0} templates, {1} rules)".format(
+                len(template_counts),
+                len(tids)),
+            )
+            print("TEST  ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
+                  "final: {finalerrors:5d} {finalacc:.4f} ".format(**test_stats))
+            print("TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
+                  "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats))
+            weighted_testcounts = Counter()
+            for (tid, score) in zip(tids, testscores):
+                weighted_testcounts[tid] += score
+            tottestscores = sum(testscores)
+            head = "#ID | Score (test) | Score (train) |  #Rules     | Template"
+            print(head, "\n", "-" * len(head), sep="")
+            test_tplscores = sorted(weighted_testcounts.items(), key=det_tplsort, reverse=True)
+            for (tid, testscore) in test_tplscores:
+                s = "{0:s} |{1:5d}  {2:6.3f} |  {3:4d}   {4:.3f} |{5:4d}   {6:.3f} | {7:s}".format(
+                    tid,
+                    testscore,
+                    testscore/tottestscores,
+                    weighted_traincounts[tid],
+                    weighted_traincounts[tid]/tottrainscores,
+                    template_counts[tid],
+                    template_counts[tid]/len(tids),
+                    Template.ALLTEMPLATES[int(tid)],
+                )
+                print(s)
+
+        def print_unused_templates():
+            usedtpls = set([int(tid) for tid in tids])
+            unused = [(tid, tpl) for (tid, tpl) in enumerate(Template.ALLTEMPLATES) if tid not in usedtpls]
+            print("UNUSED TEMPLATES ({0})".format(len(unused)))
+
+            for (tid, tpl) in unused:
+                print("{0:03d} {1:s}".format(tid, str(tpl)))
+
+        if test_stats is None:
+            print_train_stats()
+        else:
+            print_testtrain_stats()
+        print()
+        if printunused:
+            print_unused_templates()
+        print()
+
+    def batch_tag_incremental(self, sequences, gold):
+        """
+        Tags by applying each rule to the entire corpus (rather than all rules to a
+        single sequence). The point is to collect statistics on the test set for
+        individual rules.
+
+        NOTE: This is inefficient (it does not build any index, so it will traverse
+        the entire corpus N times for N rules) -- usually you do not care about
+        statistics for individual rules and would use tag_sents() instead.
+
+        :param sequences: lists of token sequences (sentences, in some applications) to be tagged
+        :type sequences: list of list of strings
+        :param gold: the gold standard
+        :type gold: list of list of (str, str) tuples
+        :returns: tuple of (tagged_sequences, ordered list of rule scores (one for each rule))
+        """
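+        # counterrors: the number of tokens whose predicted tag differs from
+        # the gold tag, summed over all sentences.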
+        def counterrors(xs):
+            return sum(t[1] != g[1] for pair in zip(xs, gold) for (t, g) in zip(*pair))
+        testing_stats = {}
+        testing_stats['tokencount'] = sum(len(t) for t in sequences)
+        testing_stats['sequencecount'] = len(sequences)
+        tagged_tokenses = [self._initial_tagger.tag(tokens) for tokens in sequences]
+        testing_stats['initialerrors'] = counterrors(tagged_tokenses)
+        testing_stats['initialacc'] = 1 - testing_stats['initialerrors']/testing_stats['tokencount']
+        # Apply each rule to the entire corpus, in order
+        errors = [testing_stats['initialerrors']]
+        for rule in self._rules:
+            for tagged_tokens in tagged_tokenses:
+                rule.apply(tagged_tokens)
+            errors.append(counterrors(tagged_tokenses))
+        testing_stats['rulescores'] = [err0 - err1 for (err0, err1) in zip(errors, errors[1:])]
+        testing_stats['finalerrors'] = errors[-1]
+        testing_stats['finalacc'] = 1 - testing_stats['finalerrors']/testing_stats['tokencount']
+        return (tagged_tokenses, testing_stats)
diff --git a/nlp_resource_data/nltk/tag/brill.pyc b/nlp_resource_data/nltk/tag/brill.pyc
new file mode 100755 (executable)
index 0000000..8cd4eea
Binary files /dev/null and b/nlp_resource_data/nltk/tag/brill.pyc differ
diff --git a/nlp_resource_data/nltk/tag/brill_trainer.py b/nlp_resource_data/nltk/tag/brill_trainer.py
new file mode 100755 (executable)
index 0000000..fde697e
--- /dev/null
@@ -0,0 +1,608 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Transformation-based learning
+#
+# Copyright (C) 2001-2013 NLTK Project
+# Author: Marcus Uneson <marcus.uneson@gmail.com>
+#   based on previous (nltk2) version by
+#   Christopher Maloof, Edward Loper, Steven Bird
+# URL: <http://nltk.org/>
+# For license information, see  LICENSE.TXT
+
+from __future__ import print_function, division
+
+import bisect
+import textwrap
+from collections import defaultdict
+
+from nltk.tag import untag, BrillTagger
+
+######################################################################
+#  Brill Tagger Trainer
+######################################################################
+
+
+class BrillTaggerTrainer(object):
+    """
+    A trainer for tbl taggers.
+    """
+    def __init__(self, initial_tagger, templates, trace=0,
+                 deterministic=None, ruleformat="str"):
+        """
+        Construct a Brill tagger from a baseline tagger and a
+        set of templates
+
+        :param initial_tagger: the baseline tagger
+        :type initial_tagger: Tagger
+        :param templates: templates to be used in training
+        :type templates: list of Templates
+        :param trace: verbosity level
+        :type trace: int
+        :param deterministic: if True, adjudicate ties deterministically
+        :type deterministic: bool
+        :param ruleformat: format of reported Rules
+        :type ruleformat: str
+        :return: An untrained BrillTagger
+        :rtype: BrillTagger
+        """
+
+        if deterministic is None:
+            deterministic = (trace > 0)
+        self._initial_tagger = initial_tagger
+        self._templates = templates
+        self._trace = trace
+        self._deterministic = deterministic
+        self._ruleformat = ruleformat
+
+        self._tag_positions = None
+        """Mapping from tags to lists of positions that use that tag."""
+
+        self._rules_by_position = None
+        """Mapping from positions to the set of rules that are known
+           to occur at that position.  Position is (sentnum, wordnum).
+           Initially, this will only contain positions where each rule
+           applies in a helpful way; but when we examine a rule, we'll
+           extend this list to also include positions where each rule
+           applies in a harmful or neutral way."""
+
+        self._positions_by_rule = None
+        """Mapping from rule to position to effect, specifying the
+           effect that each rule has on the overall score, at each
+           position.  Position is (sentnum, wordnum); and effect is
+           -1, 0, or 1.  As with _rules_by_position, this mapping starts
+           out only containing rules with positive effects; but when
+           we examine a rule, we'll extend this mapping to include
+           the positions where the rule is harmful or neutral."""
+
+        self._rules_by_score = None
+        """Mapping from scores to the set of rules whose effect on the
+           overall score is upper bounded by that score.  Invariant:
+           rulesByScore[s] will contain r iff the sum of
+           _positions_by_rule[r] is s."""
+
+        self._rule_scores = None
+        """Mapping from rules to upper bounds on their effects on the
+           overall score.  This is the inverse mapping to _rules_by_score.
+           Invariant: ruleScores[r] = sum(_positions_by_rule[r])"""
+
+        self._first_unknown_position = None
+        """Mapping from rules to the first position where we're unsure
+           if the rule applies.  This records the next position we
+           need to check to see if the rule messed anything up."""
+
+    # Training
+
+    def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
+        """
+        Trains the Brill tagger on the corpus *train_sents*,
+        producing at most *max_rules* transformations, each of which
+        reduces the net number of errors in the corpus by at least
+        *min_score*, and each of which has accuracy not lower than
+        *min_acc*.
+
+        #imports
+        >>> from nltk.tbl.template import Template
+        >>> from nltk.tag.brill import Pos, Word
+        >>> from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer
+
+        #some data
+        >>> from nltk.corpus import treebank
+        >>> training_data = treebank.tagged_sents()[:100]
+        >>> baseline_data = treebank.tagged_sents()[100:200]
+        >>> gold_data = treebank.tagged_sents()[200:300]
+        >>> testing_data = [untag(s) for s in gold_data]
+
+        >>> backoff = RegexpTagger([
+        ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
+        ... (r'(The|the|A|a|An|an)$', 'AT'),   # articles
+        ... (r'.*able$', 'JJ'),                # adjectives
+        ... (r'.*ness$', 'NN'),                # nouns formed from adjectives
+        ... (r'.*ly$', 'RB'),                  # adverbs
+        ... (r'.*s$', 'NNS'),                  # plural nouns
+        ... (r'.*ing$', 'VBG'),                # gerunds
+        ... (r'.*ed$', 'VBD'),                 # past tense verbs
+        ... (r'.*', 'NN')                      # nouns (default)
+        ... ])
+
+        >>> baseline = backoff #see NOTE1
+
+        >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
+        0.2450142...
+
+        #templates
+        >>> Template._cleartemplates() #clear any templates created in earlier tests
+        >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]
+
+        #construct a BrillTaggerTrainer
+        >>> tt = BrillTaggerTrainer(baseline, templates, trace=3)
+
+        >>> tagger1 = tt.train(training_data, max_rules=10)
+        TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
+        Finding initial useful rules...
+            Found 845 useful rules.
+        <BLANKLINE>
+                   B      |
+           S   F   r   O  |        Score = Fixed - Broken
+           c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
+           o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
+           r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
+           e   d   n   r  |  e
+        ------------------+-------------------------------------------------------
+         132 132   0   0  | AT->DT if Pos:NN@[-1]
+          85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
+          69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
+          51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
+          47  63  16 161  | NN->IN if Pos:NNS@[-1]
+          33  33   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
+          26  26   0   0  | IN->. if Pos:NNS@[-1] & Word:.@[0]
+          24  24   0   0  | IN->, if Pos:NNS@[-1] & Word:,@[0]
+          22  27   5  24  | NN->-NONE- if Pos:VBD@[-1]
+          17  17   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]
+
+        >>> tagger1.rules()[1:3]
+        (Rule('001', 'NN', ',', [(Pos([-1]),'NN'), (Word([0]),',')]), Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]))
+
+        >>> train_stats = tagger1.train_stats()
+        >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
+        [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]
+
+        >>> tagger1.print_template_statistics(printunused=False)
+        TEMPLATE STATISTICS (TRAIN)  2 templates, 10 rules)
+        TRAIN (   2417 tokens) initial  1775 0.2656 final:  1269 0.4750
+        #ID | Score (train) |  #Rules     | Template
+        --------------------------------------------
+        001 |   305   0.603 |   7   0.700 | Template(Pos([-1]),Word([0]))
+        000 |   201   0.397 |   3   0.300 | Template(Pos([-1]))
+        <BLANKLINE>
+        <BLANKLINE>
+
+        >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
+        0.43996...
+
+        >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)
+
+        >>> tagged[33][12:] == [('foreign', 'IN'), ('debt', 'NN'), ('of', 'IN'), ('$', 'NN'), ('64', 'CD'),
+        ... ('billion', 'NN'), ('*U*', 'NN'), ('--', 'NN'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'NN'),
+        ... ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')]
+        True
+
+        >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
+        [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]
+
+        # a high-accuracy tagger
+        >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
+        TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
+        Finding initial useful rules...
+            Found 845 useful rules.
+        <BLANKLINE>
+                   B      |
+           S   F   r   O  |        Score = Fixed - Broken
+           c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
+           o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
+           r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
+           e   d   n   r  |  e
+        ------------------+-------------------------------------------------------
+         132 132   0   0  | AT->DT if Pos:NN@[-1]
+          85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
+          69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
+          51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
+          36  36   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
+          26  26   0   0  | NN->. if Pos:NNS@[-1] & Word:.@[0]
+          24  24   0   0  | NN->, if Pos:NNS@[-1] & Word:,@[0]
+          19  19   0   6  | NN->VB if Pos:TO@[-1]
+          18  18   0   0  | CD->-NONE- if Pos:NN@[-1] & Word:0@[0]
+          18  18   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]
+
+        >>> tagger2.evaluate(gold_data)  # doctest: +ELLIPSIS
+        0.44159544...
+        >>> tagger2.rules()[2:4]
+        (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))
+
+        # NOTE1: (!!FIXME) A far better baseline uses nltk.tag.UnigramTagger,
+        # with a RegexpTagger only as backoff. For instance,
+        # >>> baseline = UnigramTagger(baseline_data, backoff=backoff)
+        # However, as of Nov 2013, nltk.tag.UnigramTagger does not yield consistent results
+        # between python versions. The simplistic backoff above is a workaround to make doctests
+        # get consistent input.
+
+        :param train_sents: training data
+        :type train_sents: list(list(tuple))
+        :param max_rules: output at most max_rules rules
+        :type max_rules: int
+        :param min_score: stop training when no rules with a score of at least min_score can be found
+        :type min_score: int
+        :param min_acc: discard any rule with lower accuracy than min_acc
+        :type min_acc: float or None
+        :return: the learned tagger
+        :rtype: BrillTagger
+
+        """
+        # FIXME: several tests are a bit too dependent on tracing format
+        # FIXME: tests in trainer.fast and trainer.brillorig are exact duplicates
+
+        # Basic idea: Keep track of the rules that apply at each position.
+        # And keep track of the positions to which each rule applies.
+
+        # Create a new copy of the training corpus, and run the
+        # initial tagger on it.  We will progressively update this
+        # test corpus to look more like the training corpus.
+        test_sents = [list(self._initial_tagger.tag(untag(sent)))
+                      for sent in train_sents]
+
+        # Collect some statistics on the training process
+        trainstats = {}
+        trainstats['min_acc'] = min_acc
+        trainstats['min_score'] = min_score
+        trainstats['tokencount'] = sum(len(t) for t in test_sents)
+        trainstats['sequencecount'] = len(test_sents)
+        trainstats['templatecount'] = len(self._templates)
+        trainstats['rulescores'] = []
+        trainstats['initialerrors'] = sum(
+            tag[1] != truth[1]
+            for paired in zip(test_sents, train_sents)
+            for (tag, truth) in zip(*paired)
+        )
+        trainstats['initialacc'] = 1 - trainstats['initialerrors']/trainstats['tokencount']
+        if self._trace > 0:
+            print("TBL train (fast) (seqs: {sequencecount}; tokens: {tokencount}; "
+                  "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})".format(**trainstats))
+
+        # Initialize our mappings.  This will find any errors made
+        # by the initial tagger, and use those to generate repair
+        # rules, which are added to the rule mappings.
+        if self._trace:
+            print("Finding initial useful rules...")
+        self._init_mappings(test_sents, train_sents)
+        if self._trace:
+            print(("    Found %d useful rules." % len(self._rule_scores)))
+
+        # Let the user know what we're up to.
+        if self._trace > 2:
+            self._trace_header()
+        elif self._trace == 1:
+            print("Selecting rules...")
+
+        # Repeatedly select the best rule, and add it to `rules`.
+        rules = []
+        try:
+            while (len(rules) < max_rules):
+                # Find the best rule, and add it to our rule list.
+                rule = self._best_rule(train_sents, test_sents, min_score, min_acc)
+                if rule:
+                    rules.append(rule)
+                    score = self._rule_scores[rule]
+                    trainstats['rulescores'].append(score)
+                else:
+                    break  # No more good rules left!
+
+                # Report the rule that we found.
+                if self._trace > 1:
+                    self._trace_rule(rule)
+
+                # Apply the new rule at the relevant sites
+                self._apply_rule(rule, test_sents)
+
+                # Update _tag_positions[rule.original_tag] and
+                # _tag_positions[rule.replacement_tag] for the affected
+                # positions (i.e., self._positions_by_rule[rule]).
+                self._update_tag_positions(rule)
+
+                # Update rules that were affected by the change.
+                self._update_rules(rule, train_sents, test_sents)
+
+        # The user can cancel training manually:
+        except KeyboardInterrupt:
+            print("Training stopped manually -- %d rules found" % len(rules))
+
+        # Discard our tag position mapping & rule mappings.
+        self._clean()
+        trainstats['finalerrors'] = trainstats['initialerrors'] - sum(trainstats['rulescores'])
+        trainstats['finalacc'] = 1 - trainstats['finalerrors']/trainstats['tokencount']
+        # Create and return a tagger from the rules we found.
+        return BrillTagger(self._initial_tagger, rules, trainstats)
+
+    def _init_mappings(self, test_sents, train_sents):
+        """
+        Initialize the tag position mapping & the rule related
+        mappings.  For each error in test_sents, find new rules that
+        would correct them, and add them to the rule mappings.
+        """
+        self._tag_positions = defaultdict(list)
+        self._rules_by_position = defaultdict(set)
+        self._positions_by_rule = defaultdict(dict)
+        self._rules_by_score = defaultdict(set)
+        self._rule_scores = defaultdict(int)
+        self._first_unknown_position = defaultdict(int)
+        # Scan through the corpus, initializing the tag_positions
+        # mapping and all the rule-related mappings.
+        for sentnum, sent in enumerate(test_sents):
+            for wordnum, (word, tag) in enumerate(sent):
+
+                # Initialize tag_positions
+                self._tag_positions[tag].append((sentnum, wordnum))
+
+                # If it's an error token, update the rule-related mappings.
+                correct_tag = train_sents[sentnum][wordnum][1]
+                if tag != correct_tag:
+                    for rule in self._find_rules(sent, wordnum, correct_tag):
+                        self._update_rule_applies(rule, sentnum, wordnum,
+                                                  train_sents)
+
+    def _clean(self):
+        self._tag_positions = None
+        self._rules_by_position = None
+        self._positions_by_rule = None
+        self._rules_by_score = None
+        self._rule_scores = None
+        self._first_unknown_position = None
+
+    def _find_rules(self, sent, wordnum, new_tag):
+        """
+        Use the templates to find rules that apply at index *wordnum*
+        in the sentence *sent* and generate the tag *new_tag*.
+        """
+        for template in self._templates:
+            for rule in template.applicable_rules(sent, wordnum, new_tag):
+                yield rule
+
+    def _update_rule_applies(self, rule, sentnum, wordnum, train_sents):
+        """
+        Update the rule data tables to reflect the fact that
+        *rule* applies at the position *(sentnum, wordnum)*.
+        """
+        pos = sentnum, wordnum
+
+        # If the rule is already known to apply here, ignore.
+        # (This only happens if the position's tag hasn't changed.)
+        if pos in self._positions_by_rule[rule]:
+            return
+
+        # Update self._positions_by_rule.
+        correct_tag = train_sents[sentnum][wordnum][1]
+        if rule.replacement_tag == correct_tag:
+            self._positions_by_rule[rule][pos] = 1
+        elif rule.original_tag == correct_tag:
+            self._positions_by_rule[rule][pos] = -1
+        else:  # was wrong, remains wrong
+            self._positions_by_rule[rule][pos] = 0
+
+        # Update _rules_by_position
+        self._rules_by_position[pos].add(rule)
+
+        # Update _rule_scores.
+        old_score = self._rule_scores[rule]
+        self._rule_scores[rule] += self._positions_by_rule[rule][pos]
+
+        # Update _rules_by_score.
+        self._rules_by_score[old_score].discard(rule)
+        self._rules_by_score[self._rule_scores[rule]].add(rule)
+
+    def _update_rule_not_applies(self, rule, sentnum, wordnum):
+        """
+        Update the rule data tables to reflect the fact that *rule*
+        does not apply at the position *(sentnum, wordnum)*.
+        """
+        pos = sentnum, wordnum
+
+        # Update _rule_scores.
+        old_score = self._rule_scores[rule]
+        self._rule_scores[rule] -= self._positions_by_rule[rule][pos]
+
+        # Update _rules_by_score.
+        self._rules_by_score[old_score].discard(rule)
+        self._rules_by_score[self._rule_scores[rule]].add(rule)
+
+        # Update _positions_by_rule
+        del self._positions_by_rule[rule][pos]
+        self._rules_by_position[pos].remove(rule)
+
+        # Optional addition: if the rule now applies nowhere, delete
+        # all its dictionary entries.
+
+    def _best_rule(self, train_sents, test_sents, min_score, min_acc):
+        """
+        Find the next best rule.  This is done by repeatedly taking a
+        rule with the highest score and stepping through the corpus to
+        see where it applies.  When it makes an error (decreasing its
+        score) it's bumped down, and we try a new rule with the
+        highest score.  When we find a rule which has the highest
+        score *and* which has been tested against the entire corpus, we
+        can conclude that it's the next best rule.
+        """
+        for max_score in sorted(self._rules_by_score.keys(), reverse=True):
+            if len(self._rules_by_score) == 0:
+                return None
+            if max_score < min_score or max_score <= 0:
+                return None
+            best_rules = list(self._rules_by_score[max_score])
+            if self._deterministic:
+                best_rules.sort(key=repr)
+            for rule in best_rules:
+                positions = self._tag_positions[rule.original_tag]
+
+                unk = self._first_unknown_position.get(rule, (0, -1))
+                start = bisect.bisect_left(positions, unk)
+
+                for i in range(start, len(positions)):
+                    sentnum, wordnum = positions[i]
+                    if rule.applies(test_sents[sentnum], wordnum):
+                        self._update_rule_applies(rule, sentnum, wordnum,
+                                                  train_sents)
+                        if self._rule_scores[rule] < max_score:
+                            self._first_unknown_position[rule] = (sentnum,
+                                                                  wordnum+1)
+                            break  # The update demoted the rule.
+
+                if self._rule_scores[rule] == max_score:
+                    self._first_unknown_position[rule] = (len(train_sents) + 1, 0)
+                    # optimization: if no min_acc threshold given, don't bother computing accuracy
+                    if min_acc is None:
+                        return rule
+                    else:
+                        changes = self._positions_by_rule[rule].values()
+                        num_fixed = len([c for c in changes if c == 1])
+                        num_broken = len([c for c in changes if c == -1])
+                        # acc here is fixed/(fixed+broken); could also be
+                        # fixed/(fixed+broken+other) == num_fixed/len(changes)
+                        acc = num_fixed/(num_fixed+num_broken)
+                        if acc >= min_acc:
+                            return rule
+                        # else: rule too inaccurate, discard and try next
+
+            # We demoted (or skipped due to < min_acc, if that was given)
+            # all the rules with score==max_score.
+
+            assert min_acc is not None or not self._rules_by_score[max_score]
+            if not self._rules_by_score[max_score]:
+                del self._rules_by_score[max_score]
+
+    def _apply_rule(self, rule, test_sents):
+        """
+        Update *test_sents* by applying *rule* everywhere where its
+        conditions are met.
+        """
+        update_positions = set(self._positions_by_rule[rule])
+        new_tag = rule.replacement_tag
+
+        if self._trace > 3:
+            self._trace_apply(len(update_positions))
+
+        # Update test_sents.
+        for (sentnum, wordnum) in update_positions:
+            text = test_sents[sentnum][wordnum][0]
+            test_sents[sentnum][wordnum] = (text, new_tag)
+
+    def _update_tag_positions(self, rule):
+        """
+        Update _tag_positions to reflect the changes to tags that are
+        made by *rule*.
+        """
+        # Update the tag index.
+        for pos in self._positions_by_rule[rule]:
+            # Delete the old tag.
+            old_tag_positions = self._tag_positions[rule.original_tag]
+            old_index = bisect.bisect_left(old_tag_positions, pos)
+            del old_tag_positions[old_index]
+            # Insert the new tag.
+            new_tag_positions = self._tag_positions[rule.replacement_tag]
+            bisect.insort_left(new_tag_positions, pos)
+
+    def _update_rules(self, rule, train_sents, test_sents):
+        """
+        Check if we should add or remove any rules from consideration,
+        given the changes made by *rule*.
+        """
+        # Collect a list of all positions that might be affected.
+        neighbors = set()
+        for sentnum, wordnum in self._positions_by_rule[rule]:
+            for template in self._templates:
+                n = template.get_neighborhood(test_sents[sentnum], wordnum)
+                neighbors.update([(sentnum, i) for i in n])
+
+        # Update the rules at each position.
+        num_obsolete = num_new = num_unseen = 0
+        for sentnum, wordnum in neighbors:
+            test_sent = test_sents[sentnum]
+            correct_tag = train_sents[sentnum][wordnum][1]
+
+            # Check if the change causes any rule at this position to
+            # stop matching; if so, then update our rule mappings
+            # accordingly.
+            old_rules = set(self._rules_by_position[sentnum, wordnum])
+            for old_rule in old_rules:
+                if not old_rule.applies(test_sent, wordnum):
+                    num_obsolete += 1
+                    self._update_rule_not_applies(old_rule, sentnum, wordnum)
+
+            # Check if the change causes our templates to propose any
+            # new rules for this position.
+            for template in self._templates:
+                for new_rule in template.applicable_rules(test_sent, wordnum,
+                                                          correct_tag):
+                    if new_rule not in old_rules:
+                        num_new += 1
+                        if new_rule not in self._rule_scores:
+                            num_unseen += 1
+                        old_rules.add(new_rule)
+                        self._update_rule_applies(new_rule, sentnum,
+                                                  wordnum, train_sents)
+
+            # We may have caused other rules to match here, that are
+            # not proposed by our templates -- in particular, rules
+            # that are harmful or neutral.  We therefore need to
+            # update any rule whose first_unknown_position is past
+            # this rule.
+            for new_rule, pos in self._first_unknown_position.items():
+                if pos > (sentnum, wordnum):
+                    if new_rule not in old_rules:
+                        num_new += 1
+                        if new_rule.applies(test_sent, wordnum):
+                            self._update_rule_applies(new_rule, sentnum,
+                                                      wordnum, train_sents)
+
+        if self._trace > 3:
+            self._trace_update_rules(num_obsolete, num_new, num_unseen)
+
+    # Tracing
+
+    def _trace_header(self):
+        print("""
+           B      |
+   S   F   r   O  |        Score = Fixed - Broken
+   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
+   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
+   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
+   e   d   n   r  |  e
+------------------+-------------------------------------------------------
+        """.rstrip())
+
+    def _trace_rule(self, rule):
+        assert self._rule_scores[rule] == sum(self._positions_by_rule[rule].values())
+
+        changes = self._positions_by_rule[rule].values()
+        num_fixed = len([c for c in changes if c == 1])
+        num_broken = len([c for c in changes if c == -1])
+        num_other = len([c for c in changes if c == 0])
+        score = self._rule_scores[rule]
+
+        rulestr = rule.format(self._ruleformat)
+        if self._trace > 2:
+            print('%4d%4d%4d%4d  |' % (score, num_fixed, num_broken, num_other), end=' ')
+            print(textwrap.fill(rulestr, initial_indent=' '*20, width=79,
+                                subsequent_indent=' '*18+'|   ').strip())
+        else:
+            print(rulestr)
+
+    def _trace_apply(self, num_updates):
+        prefix = ' '*18+'|'
+        print(prefix)
+        print(prefix, 'Applying rule to %d positions.' % num_updates)
+
+    def _trace_update_rules(self, num_obsolete, num_new, num_unseen):
+        prefix = ' '*18+'|'
+        print(prefix, 'Updated rule tables:')
+        print(prefix, ('  - %d rule applications removed' % num_obsolete))
+        print(prefix, ('  - %d rule applications added (%d novel)' %
+                       (num_new, num_unseen)))
+        print(prefix)
+
+
diff --git a/nlp_resource_data/nltk/tag/brill_trainer.pyc b/nlp_resource_data/nltk/tag/brill_trainer.pyc
new file mode 100755 (executable)
index 0000000..910c147
Binary files /dev/null and b/nlp_resource_data/nltk/tag/brill_trainer.pyc differ
diff --git a/nlp_resource_data/nltk/tag/crf.py b/nlp_resource_data/nltk/tag/crf.py
new file mode 100755 (executable)
index 0000000..6a33aca
--- /dev/null
@@ -0,0 +1,204 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Interface to the CRFSuite Tagger
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Long Duong <longdt219@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A module for POS tagging using CRFSuite
+"""
+from __future__ import absolute_import
+from __future__ import unicode_literals
+import unicodedata
+import re 
+from nltk.tag.api import TaggerI
+
+try:
+    import pycrfsuite
+except ImportError:
+    pass
+
+class CRFTagger(TaggerI):
+    """
+    A POS tagger based on CRFSuite (https://pypi.python.org/pypi/python-crfsuite).
+    
+    >>> from nltk.tag import CRFTagger
+    >>> ct = CRFTagger()
+    >>> train_data = [[('University','Noun'), ('is','Verb'), ('a','Det'), ('good','Adj'), ('place','Noun')],
+    ... [('dog','Noun'),('eat','Verb'),('meat','Noun')]]
+    
+    >>> ct.train(train_data,'model.crf.tagger')
+    >>> ct.tag_sents([['dog','is','good'], ['Cat','eat','meat']])
+    [[('dog', 'Noun'), ('is', 'Verb'), ('good', 'Adj')], [('Cat', 'Noun'), ('eat', 'Verb'), ('meat', 'Noun')]]
+    
+    >>> gold_sentences = [[('dog','Noun'),('is','Verb'),('good','Adj')] , [('Cat','Noun'),('eat','Verb'), ('meat','Noun')]] 
+    >>> ct.evaluate(gold_sentences) 
+    1.0
+    
+    Setting learned model file  
+    >>> ct = CRFTagger() 
+    >>> ct.set_model_file('model.crf.tagger')
+    >>> ct.evaluate(gold_sentences)
+    1.0
+    
+    """
+    
+    
+    def __init__(self,  feature_func = None, verbose = False, training_opt = {}):
+        """
+        Initialize the CRFSuite tagger 
+        :param feature_func: The function that extracts features for each token of a sentence. This function should take
+            two parameters, tokens and index, and extract features for the token at position index in the tokens list.
+            See the built-in _get_features function for more detail.
+        :param verbose: output debugging messages during training.
+        :type verbose: boolean
+        :param training_opt: python-crfsuite training options
+        :type training_opt: dict
+
+        Set of possible training options (using the L-BFGS training algorithm); see the example below.
+         'feature.minfreq' : The minimum frequency of features.
+         'feature.possible_states' : Force to generate possible state features.
+         'feature.possible_transitions' : Force to generate possible transition features.
+         'c1' : Coefficient for L1 regularization.
+         'c2' : Coefficient for L2 regularization.
+         'max_iterations' : The maximum number of iterations for L-BFGS optimization.
+         'num_memories' : The number of limited memories for approximating the inverse hessian matrix.
+         'epsilon' : Epsilon for testing the convergence of the objective.
+         'period' : The duration of iterations to test the stopping criterion.
+         'delta' : The threshold for the stopping criterion; an L-BFGS iteration stops when the
+                    improvement of the log likelihood over the last ${period} iterations is no greater than this threshold.
+         'linesearch' : The line search algorithm used in L-BFGS updates:
+                           { 'MoreThuente': More and Thuente's method,
+                              'Backtracking': Backtracking method with regular Wolfe condition,
+                              'StrongBacktracking': Backtracking method with strong Wolfe condition
+                           } 
+         'max_linesearch' :  The maximum number of trials for the line search algorithm.
+         
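+        An illustrative sketch (not a doctest; it assumes python-crfsuite is
+        installed, and the option values are arbitrary)::
+
+            ct = CRFTagger(training_opt={'c1': 0.1, 'c2': 0.01, 'max_iterations': 100})
+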
+        """
+                   
+        self._model_file = ''
+        self._tagger = pycrfsuite.Tagger()
+        
+        if feature_func is None:
+            self._feature_func =  self._get_features
+        else:
+            self._feature_func =  feature_func
+        
+        self._verbose = verbose 
+        self._training_options = training_opt
+        self._pattern = re.compile(r'\d')
+        
+    def set_model_file(self, model_file):
+        self._model_file = model_file
+        self._tagger.open(self._model_file)
+            
+    def _get_features(self, tokens, idx):
+        """
+        Extract basic features about this word, including
+             - Current word
+             - Is it capitalized?
+             - Does it contain punctuation?
+             - Does it contain a number?
+             - Suffixes up to length 3
+        Note that we might also include features over the previous word, the next word, etc.
+
+        :return: a list which contains the features
+        :rtype: list(str)
+
+        """
+        token = tokens[idx]
+        
+        feature_list = []
+        
+        if not token:
+            return feature_list
+            
+        # Capitalization 
+        if token[0].isupper():
+            feature_list.append('CAPITALIZATION')
+        
+        # Number 
+        if re.search(self._pattern, token) is not None:
+            feature_list.append('HAS_NUM') 
+        
+        # Punctuation
+        punc_cat = set(["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"])
+        if all (unicodedata.category(x) in punc_cat for x in token):
+            feature_list.append('PUNCTUATION')
+        
+        # Suffix up to length 3
+        if len(token) > 1:
+            feature_list.append('SUF_' + token[-1:]) 
+        if len(token) > 2: 
+            feature_list.append('SUF_' + token[-2:])    
+        if len(token) > 3: 
+            feature_list.append('SUF_' + token[-3:])
+            
+        feature_list.append('WORD_' + token )
+        
+        return feature_list
+        
+    def tag_sents(self, sents):
+        '''
+        Tag a list of sentences. NB: before using this function, the user should specify the model_file either by
+                       - training a new model using the ``train`` function, or
+                       - using a pre-trained model set via the ``set_model_file`` function.
+        :param sents: list of sentences to be tagged.
+        :type sents: list(list(str))
+        :return: list of tagged sentences.
+        :rtype: list(list(tuple(str,str)))
+        '''
+        if self._model_file == '':
+            raise Exception(' No model file is found !! Please use train or set_model_file function')
+        
+        # We need the list of sentences instead of the list generator for matching the input and output
+        result = []  
+        for tokens in sents:
+            features = [self._feature_func(tokens,i) for i in range(len(tokens))]
+            labels = self._tagger.tag(features)
+                
+            if len(labels) != len(tokens):
+                raise Exception(' Predicted Length Not Matched, Expect Errors !')
+            
+            tagged_sent = list(zip(tokens,labels))
+            result.append(tagged_sent)
+            
+        return result 
+    
+    def train(self, train_data, model_file):
+        '''
+        Train the CRF tagger using CRFSuite.
+        :param train_data: the list of annotated sentences.
+        :type train_data: list(list(tuple(str,str)))
+        :param model_file: the model will be saved to this file.
+
+        '''
+        trainer = pycrfsuite.Trainer(verbose=self._verbose)
+        trainer.set_params(self._training_options)
+        
+        for sent in train_data:
+            tokens,labels = zip(*sent)
+            features = [self._feature_func(tokens,i) for i in range(len(tokens))]
+            trainer.append(features,labels)
+                        
+        # Now train the model, the output should be model_file
+        trainer.train(model_file)
+        # Save the model file
+        self.set_model_file(model_file) 
+
+    def tag(self, tokens):
+        '''
+        Tag a sentence using the Python CRFSuite tagger. NB: before using this function, the user should specify the model_file either by
+                       - training a new model using the ``train`` function, or
+                       - using a pre-trained model set via the ``set_model_file`` function.
+        :param tokens: list of tokens to be tagged.
+        :type tokens: list(str)
+        :return: list of tagged tokens.
+        :rtype: list(tuple(str,str))
+        '''
+        
+        return self.tag_sents([tokens])[0]
+
diff --git a/nlp_resource_data/nltk/tag/crf.pyc b/nlp_resource_data/nltk/tag/crf.pyc
new file mode 100755 (executable)
index 0000000..ccc71e7
Binary files /dev/null and b/nlp_resource_data/nltk/tag/crf.pyc differ
diff --git a/nlp_resource_data/nltk/tag/hmm.py b/nlp_resource_data/nltk/tag/hmm.py
new file mode 100755 (executable)
index 0000000..309f6fe
--- /dev/null
@@ -0,0 +1,1275 @@
+# Natural Language Toolkit: Hidden Markov Model
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Trevor Cohn <tacohn@csse.unimelb.edu.au>
+#         Philip Blunsom <pcbl@csse.unimelb.edu.au>
+#         Tiago Tresoldi <tiago@tresoldi.pro.br> (fixes)
+#         Steven Bird <stevenbird1@gmail.com> (fixes)
+#         Joseph Frazee <jfrazee@mail.utexas.edu> (fixes)
+#         Steven Xu <xxu@student.unimelb.edu.au> (fixes)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Hidden Markov Models (HMMs) are largely used to assign the correct label
+sequence to sequential data, or to assess the probability of a given label and
+data sequence. These models are finite state machines characterised by a
+number of states, transitions between these states, and output symbols emitted
+while in each state. The HMM is an extension of the Markov chain, in which
+each state corresponds deterministically to a given event; in the HMM the
+observation is a probabilistic function of the state. HMMs share the Markov
+chain's assumption that the probability of transition from one state to
+another depends only on the current state, i.e. the series of states that led
+to the current state is not used. They are also time invariant.
+
+The HMM is a directed graph, with probability weighted edges (representing the
+probability of a transition between the source and sink states) where each
+vertex emits an output symbol when entered. The symbol (or observation) is
+non-deterministically generated. For this reason, knowing that a sequence of
+output observations was generated by a given HMM does not mean that the
+corresponding sequence of states (and what the current state is) is known.
+This is the 'hidden' in the hidden Markov model.
+
+Formally, a HMM can be characterised by:
+
+- the output observation alphabet. This is the set of symbols which may be
+  observed as output of the system.
+- the set of states.
+- the transition probabilities *a_{ij} = P(s_t = j | s_{t-1} = i)*. These
+  represent the probability of transition to each state from a given state.
+- the output probability matrix *b_i(k) = P(X_t = o_k | s_t = i)*. These
+  represent the probability of observing each symbol in a given state.
+- the initial state distribution. This gives the probability of starting
+  in each state.
+
+To ground this discussion, take a common NLP application, part-of-speech (POS)
+tagging. An HMM is desirable for this task as the highest probability tag
+sequence can be calculated for a given sequence of word forms. This differs
+from other tagging techniques which often tag each word individually, seeking
+to optimise each individual tagging greedily without regard to the optimal
+combination of tags for a larger unit, such as a sentence. The HMM does this
+with the Viterbi algorithm, which efficiently computes the optimal path
+through the graph given the sequence of word forms.
+
+In POS tagging the states usually have a 1:1 correspondence with the tag
+alphabet - i.e. each state represents a single tag. The output observation
+alphabet is the set of word forms (the lexicon), and the remaining three
+parameters are derived by a training regime. With this information the
+probability of a given sentence can be easily derived, by simply summing the
+probability of each distinct path through the model. Similarly, the highest
+probability tagging sequence can be derived with the Viterbi algorithm,
+yielding a state sequence which can be mapped into a tag sequence.
+
+This discussion assumes that the HMM has been trained. This is probably the
+most difficult task with the model, and requires either MLE estimates of the
+parameters or unsupervised learning using the Baum-Welch algorithm, a variant
+of EM.
+
+For more information, please consult the source code for this module,
+which includes extensive demonstration code.
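+
+A minimal supervised-training sketch (illustrative only; it assumes the
+treebank sample corpus is installed, and is separate from the demo code
+mentioned above)::
+
+    from nltk.corpus import treebank
+    from nltk.tag.hmm import HiddenMarkovModelTagger
+
+    train = treebank.tagged_sents()[:300]
+    test = treebank.tagged_sents()[300:350]
+    tagger = HiddenMarkovModelTagger.train(train, test_sequence=test)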
+"""
+from __future__ import print_function, unicode_literals, division
+
+import re
+import itertools
+
+from six.moves import map, zip
+
+try:
+    import numpy as np
+except ImportError:
+    pass
+
+from nltk.probability import (FreqDist, ConditionalFreqDist,
+                              ConditionalProbDist, DictionaryProbDist,
+                              DictionaryConditionalProbDist,
+                              LidstoneProbDist, MutableProbDist,
+                              MLEProbDist, RandomProbDist)
+from nltk.metrics import accuracy
+from nltk.util import LazyMap, unique_list
+from nltk.compat import python_2_unicode_compatible
+from nltk.tag.api import TaggerI
+
+
+_TEXT = 0  # index of text in a tuple
+_TAG = 1   # index of tag in a tuple
+
+def _identity(labeled_symbols):
+    return labeled_symbols
+
+@python_2_unicode_compatible
+class HiddenMarkovModelTagger(TaggerI):
+    """
+    Hidden Markov model class, a generative model for labelling sequence data.
+    These models define the joint probability of a sequence of symbols and
+    their labels (state transitions) as the product of the starting state
+    probability, the probability of each state transition, and the probability
+    of each observation being generated from each state. This is described in
+    more detail in the module documentation.
+
+    This implementation is based on the HMM description in Chapter 8, Huang,
+    Acero and Hon, Spoken Language Processing and includes an extension for
+    training shallow HMM parsers or specialized HMMs, as in Molina et
+    al., 2002.  A specialized HMM modifies training data by applying a
+    specialization function to create a new training set that is more
+    appropriate for sequential tagging with an HMM.  A typical use case is
+    chunking.
+
+    :param symbols: the set of output symbols (alphabet)
+    :type symbols: seq of any
+    :param states: a set of states representing state space
+    :type states: seq of any
+    :param transitions: transition probabilities; Pr(s_i | s_j) is the
+        probability of transition to state i given the model is in
+        state j
+    :type transitions: ConditionalProbDistI
+    :param outputs: output probabilities; Pr(o_k | s_i) is the probability
+        of emitting symbol k when entering state i
+    :type outputs: ConditionalProbDistI
+    :param priors: initial state distribution; Pr(s_i) is the probability
+        of starting in state i
+    :type priors: ProbDistI
+    :param transform: an optional function for transforming training
+        instances, defaults to the identity function.
+    :type transform: callable
+    """
+    def __init__(self, symbols, states, transitions, outputs, priors,
+                 transform=_identity):
+        self._symbols = unique_list(symbols)
+        self._states = unique_list(states)
+        self._transitions = transitions
+        self._outputs = outputs
+        self._priors = priors
+        self._cache = None
+        self._transform = transform
+
+    @classmethod
+    def _train(cls, labeled_sequence, test_sequence=None,
+                    unlabeled_sequence=None, transform=_identity,
+                    estimator=None, **kwargs):
+
+        if estimator is None:
+            def estimator(fd, bins):
+                return LidstoneProbDist(fd, 0.1, bins)
+
+        labeled_sequence = LazyMap(transform, labeled_sequence)
+        symbols = unique_list(word for sent in labeled_sequence
+            for word, tag in sent)
+        tag_set = unique_list(tag for sent in labeled_sequence
+            for word, tag in sent)
+
+        trainer = HiddenMarkovModelTrainer(tag_set, symbols)
+        hmm = trainer.train_supervised(labeled_sequence, estimator=estimator)
+        hmm = cls(hmm._symbols, hmm._states, hmm._transitions, hmm._outputs,
+                  hmm._priors, transform=transform)
+
+        if test_sequence:
+            hmm.test(test_sequence, verbose=kwargs.get('verbose', False))
+
+        if unlabeled_sequence:
+            max_iterations = kwargs.get('max_iterations', 5)
+            hmm = trainer.train_unsupervised(unlabeled_sequence, model=hmm,
+                max_iterations=max_iterations)
+            if test_sequence:
+                hmm.test(test_sequence, verbose=kwargs.get('verbose', False))
+
+        return hmm
+
+    @classmethod
+    def train(cls, labeled_sequence, test_sequence=None,
+                   unlabeled_sequence=None, **kwargs):
+        """
+        Train a new HiddenMarkovModelTagger using the given labeled and
+        unlabeled training instances. Testing will be performed if test
+        instances are provided.
+
+        :return: a hidden markov model tagger
+        :rtype: HiddenMarkovModelTagger
+        :param labeled_sequence: a sequence of labeled training instances,
+            i.e. a list of sentences represented as tuples
+        :type labeled_sequence: list(list)
+        :param test_sequence: a sequence of labeled test instances
+        :type test_sequence: list(list)
+        :param unlabeled_sequence: a sequence of unlabeled training instances,
+            i.e. a list of sentences represented as words
+        :type unlabeled_sequence: list(list)
+        :param transform: an optional function for transforming training
+            instances, defaults to the identity function, see ``transform()``
+        :type transform: function
+        :param estimator: an optional function or class that maps a
+            condition's frequency distribution to its probability
+            distribution, defaults to a Lidstone distribution with gamma = 0.1
+        :type estimator: class or function
+        :param verbose: boolean flag indicating whether training should be
+            verbose or include printed output
+        :type verbose: bool
+        :param max_iterations: number of Baum-Welch iterations to perform
+        :type max_iterations: int
+        """
+        return cls._train(labeled_sequence, test_sequence,
+                          unlabeled_sequence, **kwargs)
+
+    def probability(self, sequence):
+        """
+        Returns the probability of the given symbol sequence. If the sequence
+        is labelled, then returns the joint probability of the symbol, state
+        sequence. Otherwise, uses the forward algorithm to find the
+        probability over all label sequences.
+
+        :return: the probability of the sequence
+        :rtype: float
+        :param sequence: the sequence of symbols which must contain the TEXT
+            property, and optionally the TAG property
+        :type sequence:  Token
+        """
+        return 2**(self.log_probability(self._transform(sequence)))
+
+    def log_probability(self, sequence):
+        """
+        Returns the log-probability of the given symbol sequence. If the
+        sequence is labelled, then returns the joint log-probability of the
+        symbol, state sequence. Otherwise, uses the forward algorithm to find
+        the log-probability over all label sequences.
+
+        :return: the log-probability of the sequence
+        :rtype: float
+        :param sequence: the sequence of symbols which must contain the TEXT
+            property, and optionally the TAG property
+        :type sequence:  Token
+        """
+        sequence = self._transform(sequence)
+
+        T = len(sequence)
+
+        if T > 0 and sequence[0][_TAG]:
+            last_state = sequence[0][_TAG]
+            p = self._priors.logprob(last_state) + \
+                self._output_logprob(last_state, sequence[0][_TEXT])
+            for t in range(1, T):
+                state = sequence[t][_TAG]
+                p += self._transitions[last_state].logprob(state) + \
+                     self._output_logprob(state, sequence[t][_TEXT])
+                last_state = state
+            return p
+        else:
+            alpha = self._forward_probability(sequence)
+            p = logsumexp2(alpha[T-1])
+            return p
+
+    def tag(self, unlabeled_sequence):
+        """
+        Tags the sequence with the highest probability state sequence. This
+        uses the best_path method to find the Viterbi path.
+
+        :return: a labelled sequence of symbols
+        :rtype: list
+        :param unlabeled_sequence: the sequence of unlabeled symbols
+        :type unlabeled_sequence: list
+        """
+        unlabeled_sequence = self._transform(unlabeled_sequence)
+        return self._tag(unlabeled_sequence)
+
+    def _tag(self, unlabeled_sequence):
+        path = self._best_path(unlabeled_sequence)
+        return list(zip(unlabeled_sequence, path))
+
+    def _output_logprob(self, state, symbol):
+        """
+        :return: the log probability of the symbol being observed in the given
+            state
+        :rtype: float
+        """
+        return self._outputs[state].logprob(symbol)
+
+    def _create_cache(self):
+        """
+        The cache is a tuple (P, O, X, S) where:
+
+          - S maps symbols to integers.  I.e., it is the inverse
+            mapping from self._symbols; for each symbol s in
+            self._symbols, the following is true::
+
+              self._symbols[S[s]] == s
+
+          - O is the log output probabilities::
+
+              O[i,k] = log( P(token[t]=sym[k]|tag[t]=state[i]) )
+
+          - X is the log transition probabilities::
+
+              X[i,j] = log( P(tag[t]=state[j]|tag[t-1]=state[i]) )
+
+          - P is the log prior probabilities::
+
+              P[i] = log( P(tag[0]=state[i]) )
+        """
+        if not self._cache:
+            N = len(self._states)
+            M = len(self._symbols)
+            P = np.zeros(N, np.float32)
+            X = np.zeros((N, N), np.float32)
+            O = np.zeros((N, M), np.float32)
+            for i in range(N):
+                si = self._states[i]
+                P[i] = self._priors.logprob(si)
+                for j in range(N):
+                    X[i, j] = self._transitions[si].logprob(self._states[j])
+                for k in range(M):
+                    O[i, k] = self._output_logprob(si, self._symbols[k])
+            S = {}
+            for k in range(M):
+                S[self._symbols[k]] = k
+            self._cache = (P, O, X, S)
+
+    def _update_cache(self, symbols):
+        # add new symbols to the symbol table and repopulate the output
+        # probabilities and symbol table mapping
+        if symbols:
+            self._create_cache()
+            P, O, X, S = self._cache
+            for symbol in symbols:
+                if symbol not in self._symbols:
+                    self._cache = None
+                    self._symbols.append(symbol)
+            # don't bother with the work if there aren't any new symbols
+            if not self._cache:
+                N = len(self._states)
+                M = len(self._symbols)
+                Q = O.shape[1]
+                # add new columns to the output probability table without
+                # destroying the old probabilities
+                O = np.hstack([O, np.zeros((N, M - Q), np.float32)])
+                for i in range(N):
+                    si = self._states[i]
+                    # only calculate probabilities for new symbols
+                    for k in range(Q, M):
+                        O[i, k] = self._output_logprob(si, self._symbols[k])
+                # only create symbol mappings for new symbols
+                for k in range(Q, M):
+                    S[self._symbols[k]] = k
+                self._cache = (P, O, X, S)
+
+    def reset_cache(self):
+        self._cache = None
+
+    def best_path(self, unlabeled_sequence):
+        """
+        Returns the state sequence of the optimal (most probable) path through
+        the HMM. Uses the Viterbi algorithm to calculate this part by dynamic
+        programming.
+
+        :return: the state sequence
+        :rtype: sequence of any
+        :param unlabeled_sequence: the sequence of unlabeled symbols
+        :type unlabeled_sequence: list
+        """
+        unlabeled_sequence = self._transform(unlabeled_sequence)
+        return self._best_path(unlabeled_sequence)
+
+    def _best_path(self, unlabeled_sequence):
+        T = len(unlabeled_sequence)
+        N = len(self._states)
+        self._create_cache()
+        self._update_cache(unlabeled_sequence)
+        P, O, X, S = self._cache
+
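+        # Viterbi lattice: V[t, j] holds the log probability of the best path
+        # ending in state j at time t; B[t, j] is the back-pointer to the state
+        # chosen at time t-1 on that path.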
+        V = np.zeros((T, N), np.float32)
+        B = -np.ones((T, N), np.int)
+
+        V[0] = P + O[:, S[unlabeled_sequence[0]]]
+        for t in range(1, T):
+            for j in range(N):
+                vs = V[t-1, :] + X[:, j]
+                best = np.argmax(vs)
+                V[t, j] = vs[best] + O[j, S[unlabeled_sequence[t]]]
+                B[t, j] = best
+
+        current = np.argmax(V[T-1,:])
+        sequence = [current]
+        for t in range(T-1, 0, -1):
+            last = B[t, current]
+            sequence.append(last)
+            current = last
+
+        sequence.reverse()
+        return list(map(self._states.__getitem__, sequence))
+
+    def best_path_simple(self, unlabeled_sequence):
+        """
+        Returns the state sequence of the optimal (most probable) path through
+        the HMM. Uses the Viterbi algorithm to calculate this part by dynamic
+        programming.  This uses a simple, direct method, and is included for
+        teaching purposes.
+
+        :return: the state sequence
+        :rtype: sequence of any
+        :param unlabeled_sequence: the sequence of unlabeled symbols
+        :type unlabeled_sequence: list
+        """
+        unlabeled_sequence = self._transform(unlabeled_sequence)
+        return self._best_path_simple(unlabeled_sequence)
+
+    def _best_path_simple(self, unlabeled_sequence):
+        T = len(unlabeled_sequence)
+        N = len(self._states)
+        V = np.zeros((T, N), np.float64)
+        B = {}
+
+        # find the starting log probabilities for each state
+        symbol = unlabeled_sequence[0]
+        for i, state in enumerate(self._states):
+            V[0, i] = self._priors.logprob(state) + \
+                      self._output_logprob(state, symbol)
+            B[0, state] = None
+
+        # find the maximum log probabilities for reaching each state at time t
+        for t in range(1, T):
+            symbol = unlabeled_sequence[t]
+            for j in range(N):
+                sj = self._states[j]
+                best = None
+                for i in range(N):
+                    si = self._states[i]
+                    va = V[t-1, i] + self._transitions[si].logprob(sj)
+                    if not best or va > best[0]:
+                        best = (va, si)
+                V[t, j] = best[0] + self._output_logprob(sj, symbol)
+                B[t, sj] = best[1]
+
+        # find the highest probability final state
+        best = None
+        for i in range(N):
+            val = V[T-1, i]
+            if not best or val > best[0]:
+                best = (val, self._states[i])
+
+        # traverse the back-pointers B to find the state sequence
+        current = best[1]
+        sequence = [current]
+        for t in range(T-1, 0, -1):
+            last = B[t, current]
+            sequence.append(last)
+            current = last
+
+        sequence.reverse()
+        return sequence
+
+    def random_sample(self, rng, length):
+        """
+        Randomly sample the HMM to generate a sentence of a given length. This
+        samples the prior distribution then the observation distribution and
+        transition distribution for each subsequent observation and state.
+        This will mostly generate unintelligible garbage, but can provide some
+        amusement.
+
+        :return:        the randomly created state/observation sequence,
+                        generated according to the HMM's probability
+                        distributions. The SUBTOKENS have TEXT and TAG
+                        properties containing the observation and state
+                        respectively.
+        :rtype:         list
+        :param rng:     random number generator
+        :type rng:      Random (or any object with a random() method)
+        :param length:  desired output length
+        :type length:   int
+        """
+
+        # sample the starting state and symbol prob dists
+        tokens = []
+        state = self._sample_probdist(self._priors, rng.random(), self._states)
+        symbol = self._sample_probdist(self._outputs[state],
+                                  rng.random(), self._symbols)
+        tokens.append((symbol, state))
+
+        for i in range(1, length):
+            # sample the state transition and symbol prob dists
+            state = self._sample_probdist(self._transitions[state],
+                                     rng.random(), self._states)
+            symbol = self._sample_probdist(self._outputs[state],
+                                      rng.random(), self._symbols)
+            tokens.append((symbol, state))
+
+        return tokens
+
+    def _sample_probdist(self, probdist, p, samples):
+        cum_p = 0
+        for sample in samples:
+            add_p = probdist.prob(sample)
+            if cum_p <= p <= cum_p + add_p:
+                return sample
+            cum_p += add_p
+        raise Exception('Invalid probability distribution - '
+                        'does not sum to one')
+
+    def entropy(self, unlabeled_sequence):
+        """
+        Returns the entropy over labellings of the given sequence. This is
+        given by::
+
+            H(O) = - sum_S Pr(S | O) log Pr(S | O)
+
+        where the summation ranges over all state sequences, S. Let
+        *Z = Pr(O) = sum_S Pr(S, O)* where the summation ranges over all state
+        sequences and O is the observation sequence. As such the entropy can
+        be re-expressed as::
+
+            H = - sum_S Pr(S | O) log [ Pr(S, O) / Z ]
+              = log Z - sum_S Pr(S | O) log Pr(S, O)
+              = log Z - sum_S Pr(S | O) [ log Pr(S_0) + sum_t log Pr(S_t | S_{t-1}) + sum_t log Pr(O_t | S_t) ]
+
+        The order of summation for the log terms can be flipped, allowing
+        dynamic programming to be used to calculate the entropy. Specifically,
+        we use the forward and backward probabilities (alpha, beta) giving::
+
+            H = log Z - sum_s0 alpha_0(s0) beta_0(s0) / Z * log Pr(s0)
+                - sum_t,si,sj alpha_t(si) Pr(sj | si) Pr(O_t+1 | sj) beta_t(sj) / Z * log Pr(sj | si)
+                - sum_t,st alpha_t(st) beta_t(st) / Z * log Pr(O_t | st)
+
+        This simply uses alpha and beta to find the probabilities of partial
+        sequences, constrained to include the given state(s) at some point in
+        time.
+        """
+        unlabeled_sequence = self._transform(unlabeled_sequence)
+
+        T = len(unlabeled_sequence)
+        N = len(self._states)
+
+        alpha = self._forward_probability(unlabeled_sequence)
+        beta = self._backward_probability(unlabeled_sequence)
+        normalisation = logsumexp2(alpha[T-1])
+
+        entropy = normalisation
+
+        # starting state, t = 0
+        for i, state in enumerate(self._states):
+            p = 2**(alpha[0, i] + beta[0, i] - normalisation)
+            entropy -= p * self._priors.logprob(state)
+            #print 'p(s_0 = %s) =' % state, p
+
+        # state transitions
+        for t0 in range(T - 1):
+            t1 = t0 + 1
+            for i0, s0 in enumerate(self._states):
+                for i1, s1 in enumerate(self._states):
+                    p = 2**(alpha[t0, i0] + self._transitions[s0].logprob(s1) +
+                            self._outputs[s1].logprob(
+                                unlabeled_sequence[t1][_TEXT]) +
+                            beta[t1, i1] - normalisation)
+                    entropy -= p * self._transitions[s0].logprob(s1)
+                    #print 'p(s_%d = %s, s_%d = %s) =' % (t0, s0, t1, s1), p
+
+        # symbol emissions
+        for t in range(T):
+            for i, state in enumerate(self._states):
+                p = 2**(alpha[t, i] + beta[t, i] - normalisation)
+                entropy -= p * self._outputs[state].logprob(
+                    unlabeled_sequence[t][_TEXT])
+                #print 'p(s_%d = %s) =' % (t, state), p
+
+        return entropy
+
+    def point_entropy(self, unlabeled_sequence):
+        """
+        Returns the pointwise entropy over the possible states at each
+        position in the chain, given the observation sequence.
+        """
+        unlabeled_sequence = self._transform(unlabeled_sequence)
+
+        T = len(unlabeled_sequence)
+        N = len(self._states)
+
+        alpha = self._forward_probability(unlabeled_sequence)
+        beta = self._backward_probability(unlabeled_sequence)
+        normalisation = logsumexp2(alpha[T-1])
+
+        entropies = np.zeros(T, np.float64)
+        probs = np.zeros(N, np.float64)
+        for t in range(T):
+            for s in range(N):
+                probs[s] = alpha[t, s] + beta[t, s] - normalisation
+
+            for s in range(N):
+                entropies[t] -= 2**(probs[s]) * probs[s]
+
+        return entropies
+
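+    # Illustrative sketch for point_entropy(), again using the toy market model:
+    #
+    #     >>> model, states, symbols = _market_hmm_example()
+    #     >>> sequence = [(obs, None) for obs in ['up', 'down', 'up']]
+    #     >>> entropies = model.point_entropy(sequence)
+    #     >>> len(entropies) == len(sequence)
+    #     True
+    #
+    # Each entry is the entropy of the state distribution at that position;
+    # _exhaustive_point_entropy() below recomputes the same quantity by brute force.
+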
+    def _exhaustive_entropy(self, unlabeled_sequence):
+        unlabeled_sequence = self._transform(unlabeled_sequence)
+
+        T = len(unlabeled_sequence)
+        N = len(self._states)
+
+        labellings = [[state] for state in self._states]
+        for t in range(T - 1):
+            current = labellings
+            labellings = []
+            for labelling in current:
+                for state in self._states:
+                    labellings.append(labelling + [state])
+
+        log_probs = []
+        for labelling in labellings:
+            labeled_sequence = unlabeled_sequence[:]
+            for t, label in enumerate(labelling):
+                labeled_sequence[t] = (labeled_sequence[t][_TEXT], label)
+            lp = self.log_probability(labeled_sequence)
+            log_probs.append(lp)
+        normalisation = _log_add(*log_probs)
+
+        #ps = zeros((T, N), float64)
+        #for labelling, lp in zip(labellings, log_probs):
+            #for t in range(T):
+                #ps[t, self._states.index(labelling[t])] += \
+                #    2**(lp - normalisation)
+
+        #for t in range(T):
+            #print 'prob[%d] =' % t, ps[t]
+
+        entropy = 0
+        for lp in log_probs:
+            lp -= normalisation
+            entropy -= 2**(lp) * lp
+
+        return entropy
+
+    def _exhaustive_point_entropy(self, unlabeled_sequence):
+        unlabeled_sequence = self._transform(unlabeled_sequence)
+
+        T = len(unlabeled_sequence)
+        N = len(self._states)
+
+        labellings = [[state] for state in self._states]
+        for t in range(T - 1):
+            current = labellings
+            labellings = []
+            for labelling in current:
+                for state in self._states:
+                    labellings.append(labelling + [state])
+
+        log_probs = []
+        for labelling in labellings:
+            labelled_sequence = unlabeled_sequence[:]
+            for t, label in enumerate(labelling):
+                labelled_sequence[t] = (labelled_sequence[t][_TEXT], label)
+            lp = self.log_probability(labelled_sequence)
+            log_probs.append(lp)
+
+        normalisation = _log_add(*log_probs)
+
+        probabilities = _ninf_array((T,N))
+
+        for labelling, lp in zip(labellings, log_probs):
+            lp -= normalisation
+            for t, label in enumerate(labelling):
+                index = self._states.index(label)
+                probabilities[t, index] = _log_add(probabilities[t, index], lp)
+
+        entropies = np.zeros(T, np.float64)
+        for t in range(T):
+            for s in range(N):
+                entropies[t] -= 2**(probabilities[t, s]) * probabilities[t, s]
+
+        return entropies
+
+    def _transitions_matrix(self):
+        """ Return a matrix of transition log probabilities. """
+        trans_iter = (self._transitions[sj].logprob(si)
+                      for sj in self._states
+                      for si in self._states)
+
+        transitions_logprob = np.fromiter(trans_iter, dtype=np.float64)
+        N = len(self._states)
+        return transitions_logprob.reshape((N, N)).T
+
+    def _outputs_vector(self, symbol):
+        """
+        Return a vector with log probabilities of emitting a symbol
+        when entering states.
+        """
+        out_iter = (self._output_logprob(sj, symbol) for sj in self._states)
+        return np.fromiter(out_iter, dtype=np.float64)
+
+    def _forward_probability(self, unlabeled_sequence):
+        """
+        Return the forward probability matrix, a T by N array of
+        log-probabilities, where T is the length of the sequence and N is the
+        number of states. Each entry (t, s) gives the probability of being in
+        state s at time t after observing the partial symbol sequence up to
+        and including t.
+
+        :param unlabeled_sequence: the sequence of unlabeled symbols
+        :type unlabeled_sequence: list
+        :return: the forward log probability matrix
+        :rtype: array
+        """
+        T = len(unlabeled_sequence)
+        N = len(self._states)
+        alpha = _ninf_array((T, N))
+
+        transitions_logprob = self._transitions_matrix()
+
+        # Initialization
+        symbol = unlabeled_sequence[0][_TEXT]
+        for i, state in enumerate(self._states):
+            alpha[0, i] = self._priors.logprob(state) + \
+                          self._output_logprob(state, symbol)
+
+        # Induction
+        for t in range(1, T):
+            symbol = unlabeled_sequence[t][_TEXT]
+            output_logprob = self._outputs_vector(symbol)
+
+            for i in range(N):
+                summand = alpha[t-1] + transitions_logprob[i]
+                alpha[t, i] = logsumexp2(summand) + output_logprob[i]
+
+        return alpha
+
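+    # Sketch of how the forward matrix relates to the sequence probability
+    # (illustrative, using the toy model from _market_hmm_example() below):
+    #
+    #     >>> model, states, symbols = _market_hmm_example()
+    #     >>> seq = model._transform([(obs, None) for obs in ['up', 'up']])
+    #     >>> alpha = model._forward_probability(seq)
+    #     >>> alpha.shape    # (T, N) array of log2 probabilities
+    #     (2, 3)
+    #
+    # logsumexp2(alpha[-1]) is the log probability of the whole observation
+    # sequence, i.e. the normalisation term Z used by entropy() above.
+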
+    def _backward_probability(self, unlabeled_sequence):
+        """
+        Return the backward probability matrix, a T by N array of
+        log-probabilities, where T is the length of the sequence and N is the
+        number of states. Each entry (t, s) gives the probability of being in
+        state s at time t after observing the partial symbol sequence from t
+        .. T.
+
+        :return: the backward log probability matrix
+        :rtype:  array
+        :param unlabeled_sequence: the sequence of unlabeled symbols
+        :type unlabeled_sequence: list
+        """
+        T = len(unlabeled_sequence)
+        N = len(self._states)
+        beta = _ninf_array((T, N))
+
+        transitions_logprob = self._transitions_matrix().T
+
+        # initialise the backward values;
+        # "1" is an arbitrarily chosen value from Rabiner tutorial
+        beta[T-1, :] = np.log2(1)
+
+        # inductively calculate remaining backward values
+        for t in range(T-2, -1, -1):
+            symbol = unlabeled_sequence[t+1][_TEXT]
+            outputs = self._outputs_vector(symbol)
+
+            for i in range(N):
+                summand = transitions_logprob[i] + beta[t+1] + outputs
+                beta[t, i] = logsumexp2(summand)
+
+        return beta
+
+    def test(self, test_sequence, verbose=False, **kwargs):
+        """
+        Tests the HiddenMarkovModelTagger instance.
+
+        :param test_sequence: a sequence of labeled test instances
+        :type test_sequence: list(list)
+        :param verbose: boolean flag indicating whether testing should print
+            verbose per-sentence output
+        :type verbose: bool
+        """
+
+        def words(sent):
+            return [word for (word, tag) in sent]
+
+        def tags(sent):
+            return [tag for (word, tag) in sent]
+
+        def flatten(seq):
+            return list(itertools.chain(*seq))
+
+        test_sequence = self._transform(test_sequence)
+        predicted_sequence = list(map(self._tag, map(words, test_sequence)))
+
+        if verbose:
+            for test_sent, predicted_sent in zip(test_sequence, predicted_sequence):
+                print('Test:',
+                    ' '.join('%s/%s' % (token, tag)
+                             for (token, tag) in test_sent))
+                print()
+                print('Untagged:',
+                    ' '.join("%s" % token for (token, tag) in test_sent))
+                print()
+                print('HMM-tagged:',
+                    ' '.join('%s/%s' % (token, tag)
+                              for (token, tag) in predicted_sent))
+                print()
+                print('Entropy:',
+                    self.entropy([(token, None) for
+                                  (token, tag) in predicted_sent]))
+                print()
+                print('-' * 60)
+
+        test_tags = flatten(map(tags, test_sequence))
+        predicted_tags = flatten(map(tags, predicted_sequence))
+
+        acc = accuracy(test_tags, predicted_tags)
+        count = sum(len(sent) for sent in test_sequence)
+        print('accuracy over %d tokens: %.2f' % (count, acc * 100))
+
+    def __repr__(self):
+        return ('<HiddenMarkovModelTagger %d states and %d output symbols>'
+                % (len(self._states), len(self._symbols)))
+
+
+class HiddenMarkovModelTrainer(object):
+    """
+    Algorithms for learning HMM parameters from training data. These include
+    both supervised learning (MLE) and unsupervised learning (Baum-Welch).
+
+    Creates an HMM trainer to induce an HMM with the given states and
+    output symbol alphabet. Both supervised and unsupervised training
+    methods may be used. If either the states or the symbols are not given,
+    they may be derived from supervised training.
+
+    :param states:  the set of state labels
+    :type states:   sequence of any
+    :param symbols: the set of observation symbols
+    :type symbols:  sequence of any
+    """
+    def __init__(self, states=None, symbols=None):
+        self._states = (states if states else [])
+        self._symbols = (symbols if symbols else [])
+
+    def train(self, labeled_sequences=None, unlabeled_sequences=None,
+              **kwargs):
+        """
+        Trains the HMM using both (or either of) supervised and unsupervised
+        techniques.
+
+        :return: the trained model
+        :rtype: HiddenMarkovModelTagger
+        :param labeled_sequences: the supervised training data, a set of
+            labelled sequences of observations
+        :type labeled_sequences: list
+        :param unlabeled_sequences: the unsupervised training data, a set of
+            sequences of observations
+        :type unlabeled_sequences: list
+        :param kwargs: additional arguments to pass to the training methods
+        """
+        assert labeled_sequences or unlabeled_sequences
+        model = None
+        if labeled_sequences:
+            model = self.train_supervised(labeled_sequences, **kwargs)
+        if unlabeled_sequences:
+            if model: kwargs['model'] = model
+            model = self.train_unsupervised(unlabeled_sequences, **kwargs)
+        return model
+
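+    # Minimal supervised-training sketch (the tiny corpus below is made up for
+    # illustration):
+    #
+    #     >>> trainer = HiddenMarkovModelTrainer()
+    #     >>> corpus = [[('the', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')],
+    #     ...           [('the', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')]]
+    #     >>> tagger = trainer.train(labeled_sequences=corpus)
+    #     >>> tagger.tag(['the', 'cat', 'barks'])
+    #     [('the', 'DT'), ('cat', 'NN'), ('barks', 'VBZ')]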
+
+    def _baum_welch_step(self, sequence, model, symbol_to_number):
+
+        N = len(model._states)
+        M = len(model._symbols)
+        T = len(sequence)
+
+        # compute forward and backward probabilities
+        alpha = model._forward_probability(sequence)
+        beta = model._backward_probability(sequence)
+
+        # find the log probability of the sequence
+        lpk = logsumexp2(alpha[T-1])
+
+        A_numer = _ninf_array((N, N))
+        B_numer = _ninf_array((N, M))
+        A_denom = _ninf_array(N)
+        B_denom = _ninf_array(N)
+
+        transitions_logprob = model._transitions_matrix().T
+
+        for t in range(T):
+            symbol = sequence[t][_TEXT]  # not found? FIXME
+            next_symbol = None
+            if t < T - 1:
+                next_symbol = sequence[t+1][_TEXT]  # not found? FIXME
+            xi = symbol_to_number[symbol]
+
+            next_outputs_logprob = model._outputs_vector(next_symbol)
+            alpha_plus_beta = alpha[t] + beta[t]
+
+            if t < T - 1:
+                numer_add = transitions_logprob + next_outputs_logprob + \
+                            beta[t+1] + alpha[t].reshape(N, 1)
+                A_numer = np.logaddexp2(A_numer, numer_add)
+                A_denom = np.logaddexp2(A_denom, alpha_plus_beta)
+            else:
+                B_denom = np.logaddexp2(A_denom, alpha_plus_beta)
+
+            B_numer[:,xi] = np.logaddexp2(B_numer[:,xi], alpha_plus_beta)
+
+        return lpk, A_numer, A_denom, B_numer, B_denom
+
+    def train_unsupervised(self, unlabeled_sequences, update_outputs=True,
+                           **kwargs):
+        """
+        Trains the HMM using the Baum-Welch algorithm to maximise the
+        probability of the data sequence. This is a variant of the EM
+        algorithm, and is unsupervised in that it doesn't need the state
+        sequences for the symbols. The code is based on 'A Tutorial on Hidden
+        Markov Models and Selected Applications in Speech Recognition',
+        Lawrence Rabiner, IEEE, 1989.
+
+        :return: the trained model
+        :rtype: HiddenMarkovModelTagger
+        :param unlabeled_sequences: the training data, a set of
+            sequences of observations
+        :type unlabeled_sequences: list
+
+        kwargs may include the following parameters:
+
+        :param model: a HiddenMarkovModelTagger instance used to begin
+            the Baum-Welch algorithm
+        :param max_iterations: the maximum number of EM iterations
+        :param convergence_logprob: the maximum change in log probability to
+            allow convergence
+        """
+
+        # create a uniform HMM, which will be iteratively refined, unless
+        # given an existing model
+        model = kwargs.get('model')
+        if not model:
+            priors = RandomProbDist(self._states)
+            transitions = DictionaryConditionalProbDist(
+                            dict((state, RandomProbDist(self._states))
+                                  for state in self._states))
+            outputs = DictionaryConditionalProbDist(
+                            dict((state, RandomProbDist(self._symbols))
+                                  for state in self._states))
+            model = HiddenMarkovModelTagger(self._symbols, self._states,
+                            transitions, outputs, priors)
+
+        self._states = model._states
+        self._symbols = model._symbols
+
+        N = len(self._states)
+        M = len(self._symbols)
+        symbol_numbers = dict((sym, i) for i, sym in enumerate(self._symbols))
+
+        # update model prob dists so that they can be modified
+        # model._priors = MutableProbDist(model._priors, self._states)
+
+        model._transitions = DictionaryConditionalProbDist(
+            dict((s, MutableProbDist(model._transitions[s], self._states))
+                 for s in self._states))
+
+        if update_outputs:
+            model._outputs = DictionaryConditionalProbDist(
+                dict((s, MutableProbDist(model._outputs[s], self._symbols))
+                     for s in self._states))
+
+        model.reset_cache()
+
+        # iterate until convergence
+        converged = False
+        last_logprob = None
+        iteration = 0
+        max_iterations = kwargs.get('max_iterations', 1000)
+        epsilon = kwargs.get('convergence_logprob', 1e-6)
+
+        while not converged and iteration < max_iterations:
+            A_numer = _ninf_array((N, N))
+            B_numer = _ninf_array((N, M))
+            A_denom = _ninf_array(N)
+            B_denom = _ninf_array(N)
+
+            logprob = 0
+            for sequence in unlabeled_sequences:
+                sequence = list(sequence)
+                if not sequence:
+                    continue
+
+                (lpk, seq_A_numer, seq_A_denom,
+                seq_B_numer, seq_B_denom) = self._baum_welch_step(sequence, model, symbol_numbers)
+
+                # add these sums to the global A and B values
+                for i in range(N):
+                    A_numer[i] = np.logaddexp2(A_numer[i], seq_A_numer[i]-lpk)
+                    B_numer[i] = np.logaddexp2(B_numer[i], seq_B_numer[i]-lpk)
+
+                A_denom = np.logaddexp2(A_denom, seq_A_denom-lpk)
+                B_denom = np.logaddexp2(B_denom, seq_B_denom-lpk)
+
+                logprob += lpk
+
+            # use the calculated values to update the transition and output
+            # probability values
+            for i in range(N):
+                logprob_Ai = A_numer[i] - A_denom[i]
+                logprob_Bi = B_numer[i] - B_denom[i]
+
+                # We should normalize all probabilities (see p.391 Huang et al)
+                # Let sum(P) be K.
+                # We can divide each Pi by K to make sum(P) == 1.
+                #   Pi' = Pi/K
+                #   log2(Pi') = log2(Pi) - log2(K)
+                logprob_Ai -= logsumexp2(logprob_Ai)
+                logprob_Bi -= logsumexp2(logprob_Bi)
+
+                # update output and transition probabilities
+                si = self._states[i]
+
+                for j in range(N):
+                    sj = self._states[j]
+                    model._transitions[si].update(sj, logprob_Ai[j])
+
+                if update_outputs:
+                    for k in range(M):
+                        ok = self._symbols[k]
+                        model._outputs[si].update(ok, logprob_Bi[k])
+
+                # Rabiner says the priors don't need to be updated. I don't
+                # believe him. FIXME
+
+            # test for convergence
+            if iteration > 0 and abs(logprob - last_logprob) < epsilon:
+                converged = True
+
+            print('iteration', iteration, 'logprob', logprob)
+            iteration += 1
+            last_logprob = logprob
+
+        return model
+
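+    # Illustrative Baum-Welch usage, mirroring demo_pos_bw() below: refine an
+    # existing supervised model on untagged data. Here 'hmm' is a previously
+    # trained HiddenMarkovModelTagger and 'unlabeled' is a list of sequences of
+    # (word, None) pairs:
+    #
+    #     >>> hmm = trainer.train_unsupervised(unlabeled, model=hmm,
+    #     ...                                  max_iterations=5)
+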
+    def train_supervised(self, labelled_sequences, estimator=None):
+        """
+        Supervised training maximising the joint probability of the symbol and
+        state sequences. This is done via collecting frequencies of
+        transitions between states, symbol observations while within each
+        state and which states start a sentence. These frequency distributions
+        are then normalised into probability estimates, which can be
+        smoothed if desired.
+
+        :return: the trained model
+        :rtype: HiddenMarkovModelTagger
+        :param labelled_sequences: the training data, a set of
+            labelled sequences of observations
+        :type labelled_sequences: list
+        :param estimator: a function taking
+            a FreqDist and a number of bins and returning a ProbDistI;
+            if not given, an MLE estimate is used
+        """
+
+        # default to the MLE estimate
+        if estimator is None:
+            estimator = lambda fdist, bins: MLEProbDist(fdist)
+
+        # count occurrences of starting states, transitions out of each state
+        # and output symbols observed in each state
+        known_symbols = set(self._symbols)
+        known_states = set(self._states)
+
+        starting = FreqDist()
+        transitions = ConditionalFreqDist()
+        outputs = ConditionalFreqDist()
+        for sequence in labelled_sequences:
+            lasts = None
+            for token in sequence:
+                state = token[_TAG]
+                symbol = token[_TEXT]
+                if lasts is None:
+                    starting[state] += 1
+                else:
+                    transitions[lasts][state] += 1
+                outputs[state][symbol] += 1
+                lasts = state
+
+                # update the state and symbol lists
+                if state not in known_states:
+                    self._states.append(state)
+                    known_states.add(state)
+
+                if symbol not in known_symbols:
+                    self._symbols.append(symbol)
+                    known_symbols.add(symbol)
+
+        # create probability distributions (with smoothing)
+        N = len(self._states)
+        pi = estimator(starting, N)
+        A = ConditionalProbDist(transitions, estimator, N)
+        B = ConditionalProbDist(outputs, estimator, len(self._symbols))
+
+        return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
+
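+    # Smoothing sketch (illustrative): pass an estimator to avoid zero
+    # probabilities for unseen events, as demo_pos() below does with Lidstone
+    # smoothing; 'tagged_sents' stands in for any list of tagged sentences:
+    #
+    #     >>> hmm = trainer.train_supervised(
+    #     ...     tagged_sents,
+    #     ...     estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))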
+
+def _ninf_array(shape):
+    res = np.empty(shape, np.float64)
+    res.fill(-np.inf)
+    return res
+
+
+def logsumexp2(arr):
+    max_ = arr.max()
+    return np.log2(np.sum(2**(arr - max_))) + max_
+
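+# Quick sanity check for logsumexp2 (illustrative): log2(2**-1 + 2**-1) = log2(1) = 0.
+#
+#     >>> float(logsumexp2(np.array([-1.0, -1.0])))
+#     0.0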
+
+def _log_add(*values):
+    """
+    Adds the logged values, returning the logarithm of the addition.
+    """
+    x = max(values)
+    if x > -np.inf:
+        sum_diffs = 0
+        for value in values:
+            sum_diffs += 2**(value - x)
+        return x + np.log2(sum_diffs)
+    else:
+        return x
+
+
+def _create_hmm_tagger(states, symbols, A, B, pi):
+    def pd(values, samples):
+        d = dict(zip(samples, values))
+        return DictionaryProbDist(d)
+
+    def cpd(array, conditions, samples):
+        d = {}
+        for values, condition in zip(array, conditions):
+            d[condition] = pd(values, samples)
+        return DictionaryConditionalProbDist(d)
+
+    A = cpd(A, states, states)
+    B = cpd(B, states, symbols)
+    pi = pd(pi, states)
+    return HiddenMarkovModelTagger(symbols=symbols, states=states,
+                                   transitions=A, outputs=B, priors=pi)
+
+
+def _market_hmm_example():
+    """
+    Return an example HMM (described at page 381, Huang et al)
+    """
+    states = ['bull', 'bear', 'static']
+    symbols = ['up', 'down', 'unchanged']
+    A = np.array([[0.6, 0.2, 0.2], [0.5, 0.3, 0.2], [0.4, 0.1, 0.5]], np.float64)
+    B = np.array([[0.7, 0.1, 0.2], [0.1, 0.6, 0.3], [0.3, 0.3, 0.4]], np.float64)
+    pi = np.array([0.5, 0.2, 0.3], np.float64)
+
+    model = _create_hmm_tagger(states, symbols, A, B, pi)
+    return model, states, symbols
+
+
+def demo():
+    # demonstrates HMM probability calculation
+
+    print()
+    print("HMM probability calculation demo")
+    print()
+
+    model, states, symbols = _market_hmm_example()
+
+    print('Testing', model)
+
+    for test in [['up', 'up'], ['up', 'down', 'up'],
+                 ['down'] * 5, ['unchanged'] * 5 + ['up']]:
+
+        sequence = [(t, None) for t in test]
+
+        print('Testing with observation sequence', test)
+        print('probability =', model.probability(sequence))
+        print('tagging =    ', model.tag([word for (word,tag) in sequence]))
+        print('p(tagged) =  ', model.probability(sequence))
+        print('H =          ', model.entropy(sequence))
+        print('H_exh =      ', model._exhaustive_entropy(sequence))
+        print('H(point) =   ', model.point_entropy(sequence))
+        print('H_exh(point)=', model._exhaustive_point_entropy(sequence))
+        print()
+
+def load_pos(num_sents):
+    from nltk.corpus import brown
+
+    sentences = brown.tagged_sents(categories='news')[:num_sents]
+
+    tag_re = re.compile(r'[*]|--|[^+*-]+')
+    tag_set = set()
+    symbols = set()
+
+    cleaned_sentences = []
+    for sentence in sentences:
+        for i in range(len(sentence)):
+            word, tag = sentence[i]
+            word = word.lower()  # normalize
+            symbols.add(word)    # log this word
+            # Clean up the tag.
+            tag = tag_re.match(tag).group()
+            tag_set.add(tag)
+            sentence[i] = (word, tag)  # store cleaned-up tagged token
+        cleaned_sentences += [sentence]
+
+    return cleaned_sentences, list(tag_set), list(symbols)
+
+def demo_pos():
+    # demonstrates POS tagging using supervised training
+
+    print()
+    print("HMM POS tagging demo")
+    print()
+
+    print('Training HMM...')
+    labelled_sequences, tag_set, symbols = load_pos(20000)
+    trainer = HiddenMarkovModelTrainer(tag_set, symbols)
+    hmm = trainer.train_supervised(labelled_sequences[10:],
+                    estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
+
+    print('Testing...')
+    hmm.test(labelled_sequences[:10], verbose=True)
+
+def _untag(sentences):
+    unlabeled = []
+    for sentence in sentences:
+        unlabeled.append([(token[_TEXT], None) for token in sentence])
+    return unlabeled
+
+def demo_pos_bw(test=10, supervised=20, unsupervised=10, verbose=True,
+                max_iterations=5):
+    # demonstrates the Baum-Welch algorithm in POS tagging
+
+    print()
+    print("Baum-Welch demo for POS tagging")
+    print()
+
+    print('Training HMM (supervised, %d sentences)...' % supervised)
+
+    sentences, tag_set, symbols = load_pos(test + supervised + unsupervised)
+
+    symbols = set()
+    for sentence in sentences:
+        for token in sentence:
+            symbols.add(token[_TEXT])
+
+    trainer = HiddenMarkovModelTrainer(tag_set, list(symbols))
+    hmm = trainer.train_supervised(sentences[test:test+supervised],
+                    estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
+
+    hmm.test(sentences[:test], verbose=verbose)
+
+    print('Training (unsupervised, %d sentences)...' % unsupervised)
+    # it's rather slow - so only use 10 samples by default
+    unlabeled = _untag(sentences[test+supervised:])
+    hmm = trainer.train_unsupervised(unlabeled, model=hmm,
+                                     max_iterations=max_iterations)
+    hmm.test(sentences[:test], verbose=verbose)
+
+def demo_bw():
+    # demo Baum Welch by generating some sequences and then performing
+    # unsupervised training on them
+
+    print()
+    print("Baum-Welch demo for market example")
+    print()
+
+    model, states, symbols = _market_hmm_example()
+
+    # generate some random sequences
+    training = []
+    import random
+    rng = random.Random()
+    rng.seed(0)
+    for i in range(10):
+        item = model.random_sample(rng, 5)
+        training.append([(i[0], None) for i in item])
+
+    # train on those examples, starting with the model that generated them
+    trainer = HiddenMarkovModelTrainer(states, symbols)
+    hmm = trainer.train_unsupervised(training, model=model,
+                                     max_iterations=1000)
diff --git a/nlp_resource_data/nltk/tag/hmm.pyc b/nlp_resource_data/nltk/tag/hmm.pyc
new file mode 100755 (executable)
index 0000000..44d684d
Binary files /dev/null and b/nlp_resource_data/nltk/tag/hmm.pyc differ
diff --git a/nlp_resource_data/nltk/tag/hunpos.py b/nlp_resource_data/nltk/tag/hunpos.py
new file mode 100755 (executable)
index 0000000..e81b180
--- /dev/null
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Interface to the HunPos POS-tagger
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
+#         Dávid Márk Nemeskey <nemeskeyd@gmail.com> (modifications)
+#         Attila Zséder <zseder@gmail.com> (modifications)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A module for interfacing with the HunPos open-source POS-tagger.
+"""
+
+import os
+from subprocess import Popen, PIPE
+
+from six import text_type
+
+from nltk.internals import find_binary, find_file
+from nltk.tag.api import TaggerI
+
+_hunpos_url = 'http://code.google.com/p/hunpos/'
+
+_hunpos_charset = 'ISO-8859-1'
+"""The default encoding used by hunpos: ISO-8859-1."""
+
+class HunposTagger(TaggerI):
+    """
+    A class for POS tagging with HunPos. The constructor takes:
+     - a model trained on training data
+     - (optionally) the path to the hunpos-tag binary
+     - (optionally) the encoding of the training data (default: ISO-8859-1)
+
+    Example:
+
+        >>> from nltk.tag import HunposTagger
+        >>> ht = HunposTagger('en_wsj.model')
+        >>> ht.tag('What is the airspeed of an unladen swallow ?'.split())
+        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')]
+        >>> ht.close()
+
+    This class communicates with the hunpos-tag binary via pipes. When the
+    tagger object is no longer needed, the close() method should be called to
+    free system resources. The class supports the context manager interface; if
+    used in a with statement, the close() method is invoked automatically:
+
+        >>> with HunposTagger('en_wsj.model') as ht:
+        ...     ht.tag('What is the airspeed of an unladen swallow ?'.split())
+        ...
+        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')]
+    """
+
+    def __init__(self, path_to_model, path_to_bin=None,
+                 encoding=_hunpos_charset, verbose=False):
+        """
+        Starts the hunpos-tag executable and establishes a connection with it.
+
+        :param path_to_model: The model file.
+        :param path_to_bin: The hunpos-tag binary.
+        :param encoding: The encoding used by the model. Unicode tokens
+            passed to the tag() and tag_sents() methods are converted to
+            this charset when they are sent to hunpos-tag.
+            The default is ISO-8859-1 (Latin-1).
+
+            This parameter is ignored for str tokens, which are sent as-is.
+            The caller must ensure that tokens are encoded in the right charset.
+        """
+        self._closed = True
+        hunpos_paths = ['.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
+                        '/Applications/bin', '~/bin', '~/Applications/bin']
+        hunpos_paths = list(map(os.path.expanduser, hunpos_paths))
+
+        self._hunpos_bin = find_binary(
+            'hunpos-tag', path_to_bin,
+            env_vars=('HUNPOS_TAGGER',),
+            searchpath=hunpos_paths,
+            url=_hunpos_url,
+            verbose=verbose
+        )
+
+        self._hunpos_model = find_file(
+            path_to_model, env_vars=('HUNPOS_TAGGER',), verbose=verbose)
+        self._encoding = encoding
+        self._hunpos = Popen([self._hunpos_bin, self._hunpos_model],
+                             shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE)
+        self._closed = False
+
+    def __del__(self):
+        self.close()
+
+    def close(self):
+        """Closes the pipe to the hunpos executable."""
+        if not self._closed:
+            self._hunpos.communicate()
+            self._closed = True
+
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+    def tag(self, tokens):
+        """Tags a single sentence: a list of words.
+        The tokens should not contain any newline characters.
+        """
+        for token in tokens:
+            assert "\n" not in token, "Tokens should not contain newlines"
+            if isinstance(token, text_type):
+                token = token.encode(self._encoding)
+            self._hunpos.stdin.write(token + b"\n")
+        # We write a final empty line to tell hunpos that the sentence is finished:
+        self._hunpos.stdin.write(b"\n")
+        self._hunpos.stdin.flush()
+
+        tagged_tokens = []
+        for token in tokens:
+            tagged = self._hunpos.stdout.readline().strip().split(b"\t")
+            tag = (tagged[1] if len(tagged) > 1 else None)
+            tagged_tokens.append((token, tag))
+        # We have to read (and dismiss) the final empty line:
+        self._hunpos.stdout.readline()
+
+        return tagged_tokens
+
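+    # Wire-format sketch (illustrative): for the sentence ['What', 'is'] the
+    # method writes b"What\n", b"is\n" and then a terminating empty line b"\n";
+    # hunpos is expected to answer one tab-separated line per token (token, tag)
+    # followed by an empty line, which the loop above reads back in order.
+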
+# skip doctests if Hunpos tagger is not installed
+def setup_module(module):
+    from nose import SkipTest
+    try:
+        HunposTagger('en_wsj.model')
+    except LookupError:
+        raise SkipTest("HunposTagger is not available")
diff --git a/nlp_resource_data/nltk/tag/hunpos.pyc b/nlp_resource_data/nltk/tag/hunpos.pyc
new file mode 100755 (executable)
index 0000000..79372b2
Binary files /dev/null and b/nlp_resource_data/nltk/tag/hunpos.pyc differ
diff --git a/nlp_resource_data/nltk/tag/mapping.py b/nlp_resource_data/nltk/tag/mapping.py
new file mode 100755 (executable)
index 0000000..fc37dbc
--- /dev/null
@@ -0,0 +1,101 @@
+# Natural Language Toolkit: Tagset Mapping
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Nathan Schneider <nathan@cmu.edu>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Interface for converting POS tags from various treebanks
+to the universal tagset of Petrov, Das, & McDonald.
+
+The tagset consists of the following 12 coarse tags:
+
+VERB - verbs (all tenses and modes)
+NOUN - nouns (common and proper)
+PRON - pronouns
+ADJ - adjectives
+ADV - adverbs
+ADP - adpositions (prepositions and postpositions)
+CONJ - conjunctions
+DET - determiners
+NUM - cardinal numbers
+PRT - particles or other function words
+X - other: foreign words, typos, abbreviations
+. - punctuation
+
+@see: http://arxiv.org/abs/1104.2086 and http://code.google.com/p/universal-pos-tags/
+
+"""
+
+from __future__ import print_function, unicode_literals, division
+from collections import defaultdict
+from os.path import join
+
+from nltk.data import load
+
+_UNIVERSAL_DATA = "taggers/universal_tagset"
+_UNIVERSAL_TAGS = ('VERB','NOUN','PRON','ADJ','ADV','ADP','CONJ','DET','NUM','PRT','X','.')
+
+# _MAPPINGS = defaultdict(lambda: defaultdict(dict))
+# the mapping between tagset T1 and T2 returns UNK if applied to an unrecognized tag
+_MAPPINGS = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 'UNK')))
+
+
+def _load_universal_map(fileid):
+    contents = load(join(_UNIVERSAL_DATA, fileid+'.map'), format="text")
+
+    # When mapping to the Universal Tagset,
+    # map unknown inputs to 'X' not 'UNK'
+    _MAPPINGS[fileid]['universal'].default_factory = lambda: 'X'
+
+    for line in contents.splitlines():
+        line = line.strip()
+        if line == '':
+            continue
+        fine, coarse = line.split('\t')
+
+        assert coarse in _UNIVERSAL_TAGS, 'Unexpected coarse tag: {}'.format(coarse)
+        assert fine not in _MAPPINGS[fileid]['universal'], 'Multiple entries for original tag: {}'.format(fine)
+
+        _MAPPINGS[fileid]['universal'][fine] = coarse
+
+
+def tagset_mapping(source, target):
+    """
+    Retrieve the mapping dictionary between tagsets.
+
+    >>> tagset_mapping('ru-rnc', 'universal') == {'!': '.', 'A': 'ADJ', 'C': 'CONJ', 'AD': 'ADV',\
+            'NN': 'NOUN', 'VG': 'VERB', 'COMP': 'CONJ', 'NC': 'NUM', 'VP': 'VERB', 'P': 'ADP',\
+            'IJ': 'X', 'V': 'VERB', 'Z': 'X', 'VI': 'VERB', 'YES_NO_SENT': 'X', 'PTCL': 'PRT'}
+    True
+    """
+
+    if source not in _MAPPINGS or target not in _MAPPINGS[source]:
+        if target == 'universal':
+            _load_universal_map(source)
+    return _MAPPINGS[source][target]
+
+def map_tag(source, target, source_tag):
+    """
+    Maps the tag from the source tagset to the target tagset.
+
+    >>> map_tag('en-ptb', 'universal', 'VBZ')
+    'VERB'
+    >>> map_tag('en-ptb', 'universal', 'VBP')
+    'VERB'
+    >>> map_tag('en-ptb', 'universal', '``')
+    '.'
+    """
+
+    # we need a systematic approach to naming
+    if target == 'universal':
+        if source == 'wsj':
+            source = 'en-ptb'
+        if source == 'brown':
+            source = 'en-brown'
+
+    return tagset_mapping(source, target)[source_tag]
+
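+# The 'wsj' and 'brown' aliases above are rewritten before lookup, so the two
+# calls below are equivalent (illustrative; requires the universal_tagset resource):
+#
+#     >>> map_tag('wsj', 'universal', 'NN')
+#     'NOUN'
+#     >>> map_tag('en-ptb', 'universal', 'NN')
+#     'NOUN'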
+
diff --git a/nlp_resource_data/nltk/tag/mapping.pyc b/nlp_resource_data/nltk/tag/mapping.pyc
new file mode 100755 (executable)
index 0000000..f582a55
Binary files /dev/null and b/nlp_resource_data/nltk/tag/mapping.pyc differ
diff --git a/nlp_resource_data/nltk/tag/perceptron.py b/nlp_resource_data/nltk/tag/perceptron.py
new file mode 100755 (executable)
index 0000000..4cedd8d
--- /dev/null
@@ -0,0 +1,329 @@
+# -*- coding: utf-8 -*-
+# This module is a port of the Textblob Averaged Perceptron Tagger
+# Author: Matthew Honnibal <honnibal+gh@gmail.com>, 
+#         Long Duong <longdt219@gmail.com> (NLTK port)
+# URL: <https://github.com/sloria/textblob-aptagger>
+#      <http://nltk.org/>
+# Copyright 2013 Matthew Honnibal
+# NLTK modifications Copyright 2015 The NLTK Project
+#
+# This module is provided under the terms of the MIT License.
+
+from __future__ import absolute_import
+from __future__ import print_function, division
+
+import random
+from collections import defaultdict
+import pickle
+import logging
+
+from nltk.tag.api import TaggerI
+from nltk.data import find, load
+from nltk.compat import python_2_unicode_compatible
+
+PICKLE = "averaged_perceptron_tagger.pickle"
+
+class AveragedPerceptron(object):
+
+    '''An averaged perceptron, as implemented by Matthew Honnibal.
+
+    See more implementation details here:
+        https://explosion.ai/blog/part-of-speech-pos-tagger-in-python
+    '''
+
+    def __init__(self):
+        # Each feature gets its own weight vector, so weights is a dict-of-dicts
+        self.weights = {}
+        self.classes = set()
+        # The accumulated values, for the averaging. These will be keyed by
+        # feature/clas tuples
+        self._totals = defaultdict(int)
+        # The last time the feature was changed, for the averaging. Also
+        # keyed by feature/clas tuples
+        # (tstamps is short for timestamps)
+        self._tstamps = defaultdict(int)
+        # Number of instances seen
+        self.i = 0
+
+    def predict(self, features):
+        '''Dot-product the features and current weights and return the best label.'''
+        scores = defaultdict(float)
+        for feat, value in features.items():
+            if feat not in self.weights or value == 0:
+                continue
+            weights = self.weights[feat]
+            for label, weight in weights.items():
+                scores[label] += value * weight
+        # Do a secondary alphabetic sort, for stability
+        return max(self.classes, key=lambda label: (scores[label], label))
+
+    def update(self, truth, guess, features):
+        '''Update the feature weights.'''
+        def upd_feat(c, f, w, v):
+            param = (f, c)
+            self._totals[param] += (self.i - self._tstamps[param]) * w
+            self._tstamps[param] = self.i
+            self.weights[f][c] = w + v
+
+        self.i += 1
+        if truth == guess:
+            return None
+        for f in features:
+            weights = self.weights.setdefault(f, {})
+            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
+            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
+
+    def average_weights(self):
+        '''Average weights from all iterations.'''
+        for feat, weights in self.weights.items():
+            new_feat_weights = {}
+            for clas, weight in weights.items():
+                param = (feat, clas)
+                total = self._totals[param]
+                total += (self.i - self._tstamps[param]) * weight
+                averaged = round(total / self.i, 3)
+                if averaged:
+                    new_feat_weights[clas] = averaged
+            self.weights[feat] = new_feat_weights
+
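+    # Worked example of the lazy averaging above (illustrative numbers): if a
+    # weight was last set to 0.5 at update 3 and the current update counter i is
+    # 10, the pending contribution added to _totals is (10 - 3) * 0.5 = 3.5,
+    # i.e. the weight is credited for every update during which it was left
+    # unchanged. The averaged weight is then total / i, rounded to 3 places.
+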
+    def save(self, path):
+        '''Save the pickled model weights.'''
+        with open(path, 'wb') as fout:
+            return pickle.dump(dict(self.weights), fout)
+
+    def load(self, path):
+        '''Load the pickled model weights.'''
+        self.weights = load(path)
+
+@python_2_unicode_compatible
+class PerceptronTagger(TaggerI):
+
+    '''
+    Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
+    See more implementation details here:
+        https://explosion.ai/blog/part-of-speech-pos-tagger-in-python
+    
+    >>> from nltk.tag.perceptron import PerceptronTagger
+
+    Train the model 
+    
+    >>> tagger = PerceptronTagger(load=False)
+    
+    >>> tagger.train([[('today','NN'),('is','VBZ'),('good','JJ'),('day','NN')],
+    ... [('yes','NNS'),('it','PRP'),('beautiful','JJ')]])
+    
+    >>> tagger.tag(['today','is','a','beautiful','day'])
+    [('today', 'NN'), ('is', 'PRP'), ('a', 'PRP'), ('beautiful', 'JJ'), ('day', 'NN')]
+    
+    Use the pretrained model (the default constructor) 
+    
+    >>> pretrain = PerceptronTagger()
+    
+    >>> pretrain.tag('The quick brown fox jumps over the lazy dog'.split())
+    [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]
+    
+    >>> pretrain.tag("The red cat".split())
+    [('The', 'DT'), ('red', 'JJ'), ('cat', 'NN')]
+    '''
+
+    START = ['-START-', '-START2-']
+    END = ['-END-', '-END2-']
+    
+    def __init__(self, load=True):
+        '''
+        :param load: Load the pickled model upon instantiation.
+        '''
+        self.model = AveragedPerceptron()
+        self.tagdict = {}
+        self.classes = set()
+        if load:
+            AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
+            self.load(AP_MODEL_LOC)
+
+    def tag(self, tokens):
+        '''
+        Tag a tokenized sentence.
+        :param tokens: list of words
+        :type tokens: list(str)
+        '''
+        prev, prev2 = self.START
+        output = []
+        
+        context = self.START + [self.normalize(w) for w in tokens] + self.END
+        for i, word in enumerate(tokens):
+            tag = self.tagdict.get(word)
+            if not tag:
+                features = self._get_features(i, word, context, prev, prev2)
+                tag = self.model.predict(features)
+            output.append((word, tag))
+            prev2 = prev
+            prev = tag
+
+        return output
+
+    def train(self, sentences, save_loc=None, nr_iter=5):
+        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
+        controls the number of Perceptron training iterations.
+
+        :param sentences: A list or iterator of sentences, where each sentence
+            is a list of (words, tags) tuples.
+        :param save_loc: If not ``None``, saves a pickled model in this location.
+        :param nr_iter: Number of training iterations.
+        '''
+        # We'd like to allow ``sentences`` to be either a list or an iterator,
+        # the latter being especially important for a large training dataset.
+        # Because ``self._make_tagdict(sentences)`` runs regardless, we make
+        # it populate ``self._sentences`` (a list) with all the sentences.
+        # This saves the overhead of just iterating through ``sentences`` to
+        # get the list by ``sentences = list(sentences)``.
+
+        self._sentences = list()  # to be populated by self._make_tagdict...
+        self._make_tagdict(sentences)
+        self.model.classes = self.classes
+        for iter_ in range(nr_iter):
+            c = 0
+            n = 0
+            for sentence in self._sentences:
+                words, tags = zip(*sentence)
+                
+                prev, prev2 = self.START
+                context = self.START + [self.normalize(w) for w in words] \
+                                                                    + self.END
+                for i, word in enumerate(words):
+                    guess = self.tagdict.get(word)
+                    if not guess:
+                        feats = self._get_features(i, word, context, prev, prev2)
+                        guess = self.model.predict(feats)
+                        self.model.update(tags[i], guess, feats)
+                    prev2 = prev
+                    prev = guess
+                    c += guess == tags[i]
+                    n += 1
+            random.shuffle(self._sentences)
+            logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)))
+
+        # We don't need the training sentences anymore, and we don't want to
+        # waste space on them when we pickle the trained tagger.
+        self._sentences = None
+
+        self.model.average_weights()
+        # Pickle as a binary file
+        if save_loc is not None:
+            with open(save_loc, 'wb') as fout:
+                # changed protocol from -1 to 2 to make pickling Python 2 compatible
+                pickle.dump((self.model.weights, self.tagdict, self.classes), fout, 2)
+        
+
+    def load(self, loc):
+        '''
+        :param loc: Load a pickled model at location.
+        :type loc: str 
+        '''
+
+        self.model.weights, self.tagdict, self.classes = load(loc)
+        self.model.classes = self.classes
+        
+
+    def normalize(self, word):
+        '''
+        Normalization used in pre-processing.
+        - All words are lower cased
+        - Hyphenated words (not starting with '-') are represented as !HYPHEN
+        - Groups of digits of length 4 are represented as !YEAR
+        - Other digits are represented as !DIGITS
+
+        :rtype: str
+        '''
+        if '-' in word and word[0] != '-':
+            return '!HYPHEN'
+        elif word.isdigit() and len(word) == 4:
+            return '!YEAR'
+        elif word[0].isdigit():
+            return '!DIGITS'
+        else:
+            return word.lower()
+
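+    # Examples of the normalization rules above (derived directly from the code):
+    #
+    #     >>> tagger = PerceptronTagger(load=False)
+    #     >>> [tagger.normalize(w) for w in ['Hello', '2018', '3rd', 'state-of-the-art']]
+    #     ['hello', '!YEAR', '!DIGITS', '!HYPHEN']
+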
+    def _get_features(self, i, word, context, prev, prev2):
+        '''Map tokens into a feature representation, implemented as a
+        {hashable: int} dict. If the features change, a new model must be
+        trained.
+        '''
+        def add(name, *args):
+            features[' '.join((name,) + tuple(args))] += 1
+
+        i += len(self.START)
+        features = defaultdict(int)
+        # It's useful to have a constant feature, which acts sort of like a prior
+        add('bias')
+        add('i suffix', word[-3:])
+        add('i pref1', word[0])
+        add('i-1 tag', prev)
+        add('i-2 tag', prev2)
+        add('i tag+i-2 tag', prev, prev2)
+        add('i word', context[i])
+        add('i-1 tag+i word', prev, context[i])
+        add('i-1 word', context[i-1])
+        add('i-1 suffix', context[i-1][-3:])
+        add('i-2 word', context[i-2])
+        add('i+1 word', context[i+1])
+        add('i+1 suffix', context[i+1][-3:])
+        add('i+2 word', context[i+2])
+        return features
+
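+    # Feature keys are plain strings built by add() above, with a count as the
+    # value. For example, the word 'running' preceded by a 'VBZ' tag contributes
+    # keys such as 'bias', 'i suffix ing', 'i pref1 r' and 'i-1 tag VBZ'
+    # (an illustrative subset of the full feature set).
+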
+    def _make_tagdict(self, sentences):
+        '''
+        Make a tag dictionary for single-tag words.
+        :param sentences: A list of list of (word, tag) tuples.
+        '''
+        counts = defaultdict(lambda: defaultdict(int))
+        for sentence in sentences:
+            self._sentences.append(sentence)
+            for word, tag in sentence:
+                counts[word][tag] += 1
+                self.classes.add(tag)
+        freq_thresh = 20
+        ambiguity_thresh = 0.97
+        for word, tag_freqs in counts.items():
+            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
+            n = sum(tag_freqs.values())
+            # Don't add rare words to the tag dictionary
+            # Only add quite unambiguous words
+            if n >= freq_thresh and (mode / n) >= ambiguity_thresh:
+                self.tagdict[word] = tag
+
+
+def _pc(n, d):
+    return (n / d) * 100
+
+def _load_data_conll_format(filename):
+    print ('Read from file: ', filename)
+    with open(filename,'rb') as fin:
+        sentences = []
+        sentence = []
+        for line in fin.readlines():
+            line = line.strip()
+            #print line
+            if len(line) ==0:
+                sentences.append(sentence)
+                sentence = []
+                continue
+            tokens = line.split('\t')
+            word = tokens[1]
+            tag = tokens[4]
+            sentence.append((word,tag)) 
+        return sentences
+
+def _get_pretrain_model():
+    # Train and test on English part of CoNLL data (WSJ part of Penn Treebank)
+    # Train: section 2-11 
+    # Test : section 23
+    tagger = PerceptronTagger()
+    training = _load_data_conll_format('english_ptb_train.conll')
+    testing = _load_data_conll_format('english_ptb_test.conll')
+    print ('Size of training and testing (sentence)', len(training), len(testing))
+    # Train and save the model 
+    tagger.train(training, PICKLE) 
+    print ('Accuracy : ',tagger.evaluate(testing))
+    
+if __name__ == '__main__':
+    #_get_pretrain_model()
+    pass
diff --git a/nlp_resource_data/nltk/tag/perceptron.pyc b/nlp_resource_data/nltk/tag/perceptron.pyc
new file mode 100755 (executable)
index 0000000..764210e
Binary files /dev/null and b/nlp_resource_data/nltk/tag/perceptron.pyc differ
diff --git a/nlp_resource_data/nltk/tag/senna.py b/nlp_resource_data/nltk/tag/senna.py
new file mode 100755 (executable)
index 0000000..c74ec94
--- /dev/null
@@ -0,0 +1,147 @@
+# encoding: utf-8
+# Natural Language Toolkit: Senna POS Tagger
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Senna POS tagger, NER Tagger, Chunk Tagger
+
+The input is:
+- path to the directory that contains the SENNA executables. If the path is incorrect,
+   SennaTagger will automatically search for the executable specified by the SENNA environment variable
+- (optionally) the encoding of the input data (default:utf-8)
+
+Note: Unit tests for this module can be found in test/unit/test_senna.py
+
+    >>> from nltk.tag import SennaTagger
+    >>> tagger = SennaTagger('/usr/share/senna-v3.0')
+    >>> tagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
+    [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'),
+    ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'NN'), ('?', '.')]
+
+    >>> from nltk.tag import SennaChunkTagger
+    >>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0')
+    >>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
+    [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'),
+    ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'),
+    ('?', 'O')]
+
+    >>> from nltk.tag import SennaNERTagger
+    >>> nertagger = SennaNERTagger('/usr/share/senna-v3.0')
+    >>> nertagger.tag('Shakespeare theatre was in London .'.split()) # doctest: +SKIP
+    [('Shakespeare', 'B-PER'), ('theatre', 'O'), ('was', 'O'), ('in', 'O'),
+    ('London', 'B-LOC'), ('.', 'O')]
+    >>> nertagger.tag('UN headquarters are in NY , USA .'.split()) # doctest: +SKIP
+    [('UN', 'B-ORG'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'),
+    ('NY', 'B-LOC'), (',', 'O'), ('USA', 'B-LOC'), ('.', 'O')]
+"""
+
+from nltk.compat import python_2_unicode_compatible
+from nltk.classify import Senna
+
+@python_2_unicode_compatible
+class SennaTagger(Senna):
+    def __init__(self, path, encoding='utf-8'):
+        super(SennaTagger, self).__init__(path, ['pos'], encoding)
+
+    def tag_sents(self, sentences):
+        """
+        Applies the tag method over a list of sentences. This method will return
+        for each sentence a list of tuples of (word, tag).
+        """
+        tagged_sents = super(SennaTagger, self).tag_sents(sentences)
+        for i in range(len(tagged_sents)):
+            for j in range(len(tagged_sents[i])):
+                annotations = tagged_sents[i][j]
+                tagged_sents[i][j] = (annotations['word'], annotations['pos'])
+        return tagged_sents
+
+@python_2_unicode_compatible
+class SennaChunkTagger(Senna):
+    def __init__(self, path, encoding='utf-8'):
+        super(SennaChunkTagger, self).__init__(path, ['chk'], encoding)
+
+    def tag_sents(self, sentences):
+        """
+        Applies the tag method over a list of sentences. This method will return
+        for each sentence a list of tuples of (word, tag).
+        """
+        tagged_sents = super(SennaChunkTagger, self).tag_sents(sentences)
+        for i in range(len(tagged_sents)):
+            for j in range(len(tagged_sents[i])):
+                annotations = tagged_sents[i][j]
+                tagged_sents[i][j] = (annotations['word'], annotations['chk'])
+        return tagged_sents
+
+    def bio_to_chunks(self, tagged_sent, chunk_type):
+        """
+        Extracts the chunks in a BIO chunk-tagged sentence.
+
+        >>> from nltk.tag import SennaChunkTagger
+        >>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0')
+        >>> sent = 'What is the airspeed of an unladen swallow ?'.split()
+        >>> tagged_sent = chktagger.tag(sent) # doctest: +SKIP
+        >>> tagged_sent # doctest: +SKIP
+        [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'),
+        ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'),
+        ('?', 'O')]
+        >>> list(chktagger.bio_to_chunks(tagged_sent, chunk_type='NP')) # doctest: +SKIP
+        [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')]
+
+        :param tagged_sent: A list of tuples of word and BIO chunk tag.
+        :type tagged_sent: list(tuple)
+        :param chunk_type: The chunk tag that users want to extract, e.g. 'NP' or 'VP'
+        :type chunk_type: str
+
+        :return: An iterable of tuples of chunks that users want to extract
+          and their corresponding indices.
+        :rtype: iter(tuple(str))
+        """
+        current_chunk = []
+        current_chunk_position = []
+        for idx, word_pos in enumerate(tagged_sent):
+            word, pos = word_pos
+            if '-'+chunk_type in pos: # Append the word to the current_chunk.
+                current_chunk.append((word))
+                current_chunk_position.append((idx))
+            else:
+                if current_chunk: # Flush the full chunk when out of an NP.
+                    _chunk_str = ' '.join(current_chunk)
+                    _chunk_pos_str = '-'.join(map(str, current_chunk_position))
+                    yield _chunk_str, _chunk_pos_str
+                    current_chunk = []
+                    current_chunk_position = []
+        if current_chunk: # Flush the last chunk.
+            yield ' '.join(current_chunk), '-'.join(map(str, current_chunk_position))
+
+
+@python_2_unicode_compatible
+class SennaNERTagger(Senna):
+    def __init__(self, path, encoding='utf-8'):
+        super(SennaNERTagger, self).__init__(path, ['ner'], encoding)
+
+    def tag_sents(self, sentences):
+        """
+        Applies the tag method over a list of sentences. This method will return
+        for each sentence a list of tuples of (word, tag).
+        """
+        tagged_sents = super(SennaNERTagger, self).tag_sents(sentences)
+        for i in range(len(tagged_sents)):
+            for j in range(len(tagged_sents[i])):
+                annotations = tagged_sents[i][j]
+                tagged_sents[i][j] = (annotations['word'], annotations['ner'])
+        return tagged_sents
+
+
+
+# skip doctests if Senna is not installed
+def setup_module(module):
+    from nose import SkipTest
+    try:
+        tagger = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
+    except OSError:
+        raise SkipTest("Senna executable not found")
+
diff --git a/nlp_resource_data/nltk/tag/senna.pyc b/nlp_resource_data/nltk/tag/senna.pyc
new file mode 100755 (executable)
index 0000000..57ac951
Binary files /dev/null and b/nlp_resource_data/nltk/tag/senna.pyc differ
diff --git a/nlp_resource_data/nltk/tag/sequential.py b/nlp_resource_data/nltk/tag/sequential.py
new file mode 100755 (executable)
index 0000000..3cdcd05
--- /dev/null
@@ -0,0 +1,740 @@
+# Natural Language Toolkit: Sequential Backoff Taggers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com> (minor additions)
+#         Tiago Tresoldi <tresoldi@users.sf.net> (original affix tagger)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Classes for tagging sentences sequentially, left to right.  The
+abstract base class SequentialBackoffTagger serves as the base
+class for all the taggers in this module.  Tagging of individual words
+is performed by the method ``choose_tag()``, which is defined by
+subclasses of SequentialBackoffTagger.  If a tagger is unable to
+determine a tag for the specified token, then its backoff tagger is
+consulted instead.  Any SequentialBackoffTagger may serve as a
+backoff tagger for any other SequentialBackoffTagger.
+"""
+from __future__ import print_function, unicode_literals
+from abc import abstractmethod
+
+import re
+
+from nltk.probability import ConditionalFreqDist
+from nltk.classify import NaiveBayesClassifier
+from nltk.compat import python_2_unicode_compatible
+
+from nltk.tag.api import TaggerI, FeaturesetTaggerI
+
+from nltk import jsontags
+
+
+######################################################################
+# Abstract Base Classes
+######################################################################
+class SequentialBackoffTagger(TaggerI):
+    """
+    An abstract base class for taggers that tags words sequentially,
+    left to right.  Tagging of individual words is performed by the
+    ``choose_tag()`` method, which should be defined by subclasses.  If
+    a tagger is unable to determine a tag for the specified token,
+    then its backoff tagger is consulted.
+
+    :ivar _taggers: A list of all the taggers that should be tried to
+        tag a token (i.e., self and its backoff taggers).
+    """
+    def __init__(self, backoff=None):
+        if backoff is None:
+            self._taggers = [self]
+        else:
+            self._taggers = [self] + backoff._taggers
+
+    @property
+    def backoff(self):
+        """The backoff tagger for this tagger."""
+        return self._taggers[1] if len(self._taggers) > 1 else None
+
+    def tag(self, tokens):
+        # docs inherited from TaggerI
+        tags = []
+        for i in range(len(tokens)):
+            tags.append(self.tag_one(tokens, i, tags))
+        return list(zip(tokens, tags))
+
+    def tag_one(self, tokens, index, history):
+        """
+        Determine an appropriate tag for the specified token, and
+        return that tag.  If this tagger is unable to determine a tag
+        for the specified token, then its backoff tagger is consulted.
+
+        :rtype: str
+        :type tokens: list
+        :param tokens: The list of words that are being tagged.
+        :type index: int
+        :param index: The index of the word whose tag should be
+            returned.
+        :type history: list(str)
+        :param history: A list of the tags for all words before *index*.
+        """
+        tag = None
+        for tagger in self._taggers:
+            tag = tagger.choose_tag(tokens, index, history)
+            if tag is not None:
+                break
+        return tag
+
+    @abstractmethod
+    def choose_tag(self, tokens, index, history):
+        """
+        Decide which tag should be used for the specified token, and
+        return that tag.  If this tagger is unable to determine a tag
+        for the specified token, return None -- do not consult
+        the backoff tagger.  This method should be overridden by
+        subclasses of SequentialBackoffTagger.
+
+        :rtype: str
+        :type tokens: list
+        :param tokens: The list of words that are being tagged.
+        :type index: int
+        :param index: The index of the word whose tag should be
+            returned.
+        :type history: list(str)
+        :param history: A list of the tags for all words before *index*.
+        """
+        """
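+
+# A minimal backoff-chain sketch (illustrative; DefaultTagger and UnigramTagger
+# are defined further down in this module):
+#
+#     >>> t0 = DefaultTagger('NN')
+#     >>> t1 = UnigramTagger(model={'the': 'AT'}, backoff=t0)
+#     >>> t1._taggers == [t1, t0]
+#     True
+#     >>> t1.tag('the cat'.split())   # 'cat' is unknown, so t0 supplies 'NN'
+#     [('the', 'AT'), ('cat', 'NN')]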
+
+@python_2_unicode_compatible
+class ContextTagger(SequentialBackoffTagger):
+    """
+    An abstract base class for sequential backoff taggers that choose
+    a tag for a token based on the value of its "context".  Different
+    subclasses are used to define different contexts.
+
+    A ContextTagger chooses the tag for a token by calculating the
+    token's context, and looking up the corresponding tag in a table.
+    This table can be constructed manually; or it can be automatically
+    constructed based on a training corpus, using the ``_train()``
+    factory method.
+
+    :ivar _context_to_tag: Dictionary mapping contexts to tags.
+    """
+    def __init__(self, context_to_tag, backoff=None):
+        """
+        :param context_to_tag: A dictionary mapping contexts to tags.
+        :param backoff: The backoff tagger that should be used for this tagger.
+        """
+        SequentialBackoffTagger.__init__(self, backoff)
+        self._context_to_tag = (context_to_tag if context_to_tag else {})
+
+    @abstractmethod
+    def context(self, tokens, index, history):
+        """
+        :return: the context that should be used to look up the tag
+            for the specified token; or None if the specified token
+            should not be handled by this tagger.
+        :rtype: (hashable)
+        """
+
+    def choose_tag(self, tokens, index, history):
+        context = self.context(tokens, index, history)
+        return self._context_to_tag.get(context)
+
+    def size(self):
+        """
+        :return: The number of entries in the table used by this
+            tagger to map from contexts to tags.
+        """
+        return len(self._context_to_tag)
+
+    def __repr__(self):
+        return '<%s: size=%d>' % (self.__class__.__name__, self.size())
+
+    def _train(self, tagged_corpus, cutoff=0, verbose=False):
+        """
+        Initialize this ContextTagger's ``_context_to_tag`` table
+        based on the given training data.  In particular, for each
+        context ``c`` in the training data, set
+        ``_context_to_tag[c]`` to the most frequent tag for that
+        context.  However, exclude any contexts that are already
+        tagged perfectly by the backoff tagger(s).
+
+        The old value of ``self._context_to_tag`` (if any) is discarded.
+
+        :param tagged_corpus: A tagged corpus.  Each item should be
+            a list of (word, tag) tuples.
+        :param cutoff: If the most likely tag for a context occurs
+            fewer than cutoff times, then exclude it from the
+            context-to-tag table for the new tagger.
+        """
+
+        token_count = hit_count = 0
+
+        # A context is considered 'useful' if it's not already tagged
+        # perfectly by the backoff tagger.
+        useful_contexts = set()
+
+        # Count how many times each tag occurs in each context.
+        fd = ConditionalFreqDist()
+        for sentence in tagged_corpus:
+            tokens, tags = zip(*sentence)
+            for index, (token, tag) in enumerate(sentence):
+                # Record the event.
+                token_count += 1
+                context = self.context(tokens, index, tags[:index])
+                if context is None:
+                    continue
+                fd[context][tag] += 1
+                # If the backoff got it wrong, this context is useful:
+                if (self.backoff is None or
+                        tag != self.backoff.tag_one(
+                        tokens, index, tags[:index])):
+                    useful_contexts.add(context)
+
+        # Build the context_to_tag table -- for each context, figure
+        # out what the most likely tag is.  Only include contexts that
+        # we've seen at least `cutoff` times.
+        for context in useful_contexts:
+            best_tag = fd[context].max()
+            hits = fd[context][best_tag]
+            if hits > cutoff:
+                self._context_to_tag[context] = best_tag
+                hit_count += hits
+
+        # Display some stats, if requested.
+        if verbose:
+            size = len(self._context_to_tag)
+            backoff = 100 - (hit_count * 100.0) / token_count
+            pruning = 100 - (size * 100.0) / len(fd.conditions())
+            print("[Trained Unigram tagger:", end=' ')
+            print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
+                size, backoff, pruning))
+
+
+######################################################################
+# Tagger Classes
+######################################################################
+@python_2_unicode_compatible
+@jsontags.register_tag
+class DefaultTagger(SequentialBackoffTagger):
+    """
+    A tagger that assigns the same tag to every token.
+
+        >>> from nltk.tag import DefaultTagger
+        >>> default_tagger = DefaultTagger('NN')
+        >>> list(default_tagger.tag('This is a test'.split()))
+        [('This', 'NN'), ('is', 'NN'), ('a', 'NN'), ('test', 'NN')]
+
+    This tagger is recommended as a backoff tagger, in cases where
+    a more powerful tagger is unable to assign a tag to the word
+    (e.g. because the word was not seen during training).
+
+    :param tag: The tag to assign to each token
+    :type tag: str
+    """
+
+    json_tag = 'nltk.tag.sequential.DefaultTagger'
+
+    def __init__(self, tag):
+        self._tag = tag
+        SequentialBackoffTagger.__init__(self, None)
+
+    def encode_json_obj(self):
+        return self._tag
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        tag = obj
+        return cls(tag)
+
+    def choose_tag(self, tokens, index, history):
+        return self._tag  # ignore token and history
+
+    def __repr__(self):
+        return '<DefaultTagger: tag=%s>' % self._tag
+
+
+@jsontags.register_tag
+class NgramTagger(ContextTagger):
+    """
+    A tagger that chooses a token's tag based on its word string and
+    on the tags of the preceding n-1 words.  In particular, a tuple
+    (tags[i-n+1:i], words[i]) is looked up in a table, and the
+    corresponding tag is returned.  N-gram taggers are typically
+    trained on a tagged corpus.
+
+    Train a new NgramTagger using the given training data or
+    the supplied model.  In particular, construct a new tagger
+    whose table maps from each context (tags[i-n+1:i], word[i])
+    to the most frequent tag for that context.  But exclude any
+    contexts that are already tagged perfectly by the backoff
+    tagger.
+
+    :param train: A tagged corpus consisting of a list of tagged
+        sentences, where each sentence is a list of (word, tag) tuples.
+    :param backoff: A backoff tagger, to be used by the new
+        tagger if it encounters an unknown context.
+    :param cutoff: If the most likely tag for a context occurs
+        fewer than *cutoff* times, then exclude it from the
+        context-to-tag table for the new tagger.
+    """
+    json_tag = 'nltk.tag.sequential.NgramTagger'
+
+    def __init__(self, n, train=None, model=None,
+                 backoff=None, cutoff=0, verbose=False):
+        self._n = n
+        self._check_params(train, model)
+
+        ContextTagger.__init__(self, model, backoff)
+
+        if train:
+            self._train(train, cutoff, verbose)
+
+    def encode_json_obj(self):
+        return self._n, self._context_to_tag, self.backoff
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        _n, _context_to_tag, backoff = obj
+        return cls(_n, model=_context_to_tag, backoff=backoff)
+
+    def context(self, tokens, index, history):
+        tag_context = tuple(history[max(0, index-self._n+1):index])
+        return tag_context, tokens[index]
+
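+# Context sketch (illustrative): for a trigram tagger (n=3) the context is the
+# previous two tags plus the current word:
+#
+#     >>> ngram = NgramTagger(3, model={(('VBD', 'AT'), 'man'): 'NN'})
+#     >>> ngram.context('the dog bit the man'.split(), 4, ['AT', 'NN', 'VBD', 'AT'])
+#     (('VBD', 'AT'), 'man')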
+
+@jsontags.register_tag
+class UnigramTagger(NgramTagger):
+    """
+    Unigram Tagger
+
+    The UnigramTagger finds the most likely tag for each word in a training
+    corpus, and then uses that information to assign tags to new tokens.
+
+        >>> from nltk.corpus import brown
+        >>> from nltk.tag import UnigramTagger
+        >>> test_sent = brown.sents(categories='news')[0]
+        >>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
+        >>> for tok, tag in unigram_tagger.tag(test_sent):
+        ...     print("(%s, %s), " % (tok, tag))
+        (The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL),
+        (Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT),
+        (investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ),
+        (primary, NN), (election, NN), (produced, VBD), (``, ``),
+        (no, AT), (evidence, NN), ('', ''), (that, CS), (any, DTI),
+        (irregularities, NNS), (took, VBD), (place, NN), (., .),
+
+    :param train: The corpus of training data, a list of tagged sentences
+    :type train: list(list(tuple(str, str)))
+    :param model: The tagger model
+    :type model: dict
+    :param backoff: Another tagger which this tagger will consult when it is
+        unable to tag a word
+    :type backoff: TaggerI
+    :param cutoff: The number of instances of training data the tagger must see
+        in order not to use the backoff tagger
+    :type cutoff: int
+    """
+
+    json_tag = 'nltk.tag.sequential.UnigramTagger'
+
+    def __init__(self, train=None, model=None,
+                 backoff=None, cutoff=0, verbose=False):
+        NgramTagger.__init__(self, 1, train, model,
+                             backoff, cutoff, verbose)
+
+    def encode_json_obj(self):
+        return self._context_to_tag, self.backoff
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        _context_to_tag, backoff = obj
+        return cls(model=_context_to_tag, backoff=backoff)
+
+    def context(self, tokens, index, history):
+        return tokens[index]
+
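+# Manual-model sketch (illustrative): a UnigramTagger's context is the word
+# itself, so a hand-built table acts as a dictionary lookup with backoff for
+# unseen words:
+#
+#     >>> t = UnigramTagger(model={'the': 'AT', 'dog': 'NN'}, backoff=DefaultTagger('NN'))
+#     >>> t.size()
+#     2
+#     >>> t.tag('the dog barked'.split())
+#     [('the', 'AT'), ('dog', 'NN'), ('barked', 'NN')]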
+
+@jsontags.register_tag
+class BigramTagger(NgramTagger):
+    """
+    A tagger that chooses a token's tag based on its word string and on
+    the preceding word's tag.  In particular, a tuple consisting
+    of the previous tag and the word is looked up in a table, and
+    the corresponding tag is returned.
+
+    :param train: The corpus of training data, a list of tagged sentences
+    :type train: list(list(tuple(str, str)))
+    :param model: The tagger model
+    :type model: dict
+    :param backoff: Another tagger which this tagger will consult when it is
+        unable to tag a word
+    :type backoff: TaggerI
+    :param cutoff: The number of instances of training data the tagger must see
+        in order not to use the backoff tagger
+    :type cutoff: int
+    """
+    json_tag = 'nltk.tag.sequential.BigramTagger'
+
+    def __init__(self, train=None, model=None,
+                 backoff=None, cutoff=0, verbose=False):
+        NgramTagger.__init__(self, 2, train, model,
+                             backoff, cutoff, verbose)
+
+    def encode_json_obj(self):
+        return self._context_to_tag, self.backoff
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        _context_to_tag, backoff = obj
+        return cls(model=_context_to_tag, backoff=backoff)
+
+
+@jsontags.register_tag
+class TrigramTagger(NgramTagger):
+    """
+    A tagger that chooses a token's tag based on its word string and on
+    the preceding two words' tags.  In particular, a tuple consisting
+    of the previous two tags and the word is looked up in a table, and
+    the corresponding tag is returned.
+
+    :param train: The corpus of training data, a list of tagged sentences
+    :type train: list(list(tuple(str, str)))
+    :param model: The tagger model
+    :type model: dict
+    :param backoff: Another tagger which this tagger will consult when it is
+        unable to tag a word
+    :type backoff: TaggerI
+    :param cutoff: The number of instances of training data the tagger must see
+        in order not to use the backoff tagger
+    :type cutoff: int
+    """
+    json_tag = 'nltk.tag.sequential.TrigramTagger'
+
+    def __init__(self, train=None, model=None,
+                 backoff=None, cutoff=0, verbose=False):
+        NgramTagger.__init__(self, 3, train, model,
+                             backoff, cutoff, verbose)
+
+    def encode_json_obj(self):
+        return self._context_to_tag, self.backoff
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        _context_to_tag, backoff = obj
+        return cls(model=_context_to_tag, backoff=backoff)
+
+
+@jsontags.register_tag
+class AffixTagger(ContextTagger):
+    """
+    A tagger that chooses a token's tag based on a leading or trailing
+    substring of its word string.  (It is important to note that these
+    substrings are not necessarily "true" morphological affixes).  In
+    particular, a fixed-length substring of the word is looked up in a
+    table, and the corresponding tag is returned.  Affix taggers are
+    typically constructed by training them on a tagged corpus.
+
+    Construct a new affix tagger.
+
+    :param affix_length: The length of the affixes that should be
+        considered during training and tagging.  Use negative
+        numbers for suffixes.
+    :param min_stem_length: Any words whose length is less than
+        min_stem_length+abs(affix_length) will be assigned a
+        tag of None by this tagger.
+    """
+
+    json_tag = 'nltk.tag.sequential.AffixTagger'
+
+    def __init__(self, train=None, model=None, affix_length=-3,
+                 min_stem_length=2, backoff=None, cutoff=0, verbose=False):
+
+        self._check_params(train, model)
+
+        ContextTagger.__init__(self, model, backoff)
+
+        self._affix_length = affix_length
+        self._min_word_length = min_stem_length + abs(affix_length)
+
+        if train:
+            self._train(train, cutoff, verbose)
+
+    def encode_json_obj(self):
+        return self._affix_length, self._min_word_length, self._context_to_tag, self.backoff
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        _affix_length, _min_word_length, _context_to_tag, backoff = obj
+        return cls(
+            affix_length=_affix_length,
+            min_stem_length=_min_word_length - abs(_affix_length),
+            model=_context_to_tag,
+            backoff=backoff
+        )
+
+    def context(self, tokens, index, history):
+        token = tokens[index]
+        if len(token) < self._min_word_length:
+            return None
+        elif self._affix_length > 0:
+            return token[:self._affix_length]
+        else:
+            return token[self._affix_length:]
+
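+# Affix-context sketch (illustrative): with the defaults affix_length=-3 and
+# min_stem_length=2, words shorter than five characters get context None and
+# fall through to the backoff tagger:
+#
+#     >>> at = AffixTagger(model={'ing': 'VBG'}, backoff=DefaultTagger('NN'))
+#     >>> at.context(['running'], 0, [])
+#     'ing'
+#     >>> at.tag('running to it'.split())
+#     [('running', 'VBG'), ('to', 'NN'), ('it', 'NN')]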
+
+@python_2_unicode_compatible
+@jsontags.register_tag
+class RegexpTagger(SequentialBackoffTagger):
+    """
+    Regular Expression Tagger
+
+    The RegexpTagger assigns tags to tokens by comparing their
+    word strings to a series of regular expressions.  The following tagger
+    uses word suffixes to make guesses about the correct Brown Corpus part
+    of speech tag:
+
+        >>> from nltk.corpus import brown
+        >>> from nltk.tag import RegexpTagger
+        >>> test_sent = brown.sents(categories='news')[0]
+        >>> regexp_tagger = RegexpTagger(
+        ...     [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
+        ...      (r'(The|the|A|a|An|an)$', 'AT'),   # articles
+        ...      (r'.*able$', 'JJ'),                # adjectives
+        ...      (r'.*ness$', 'NN'),                # nouns formed from adjectives
+        ...      (r'.*ly$', 'RB'),                  # adverbs
+        ...      (r'.*s$', 'NNS'),                  # plural nouns
+        ...      (r'.*ing$', 'VBG'),                # gerunds
+        ...      (r'.*ed$', 'VBD'),                 # past tense verbs
+        ...      (r'.*', 'NN')                      # nouns (default)
+        ... ])
+        >>> regexp_tagger
+        <Regexp Tagger: size=9>
+        >>> regexp_tagger.tag(test_sent)
+        [('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'),
+        ('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'),
+        ("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'),
+        ('produced', 'VBD'), ('``', 'NN'), ('no', 'NN'), ('evidence', 'NN'), ("''", 'NN'),
+        ('that', 'NN'), ('any', 'NN'), ('irregularities', 'NNS'), ('took', 'NN'),
+        ('place', 'NN'), ('.', 'NN')]
+
+    :type regexps: list(tuple(str, str))
+    :param regexps: A list of ``(regexp, tag)`` pairs, each of
+        which indicates that a word matching ``regexp`` should
+        be tagged with ``tag``.  The pairs will be evaluated in
+        order.  If none of the regexps match a word, then the
+        optional backoff tagger is consulted; if no backoff tagger
+        is given, the word is assigned the tag None.
+    """
+
+    json_tag = 'nltk.tag.sequential.RegexpTagger'
+
+    def __init__(self, regexps, backoff=None):
+        """
+        """
+        SequentialBackoffTagger.__init__(self, backoff)
+        self._regexs = [(re.compile(regexp), tag,) for regexp, tag in regexps]
+
+    def encode_json_obj(self):
+        return [(regexp.pattern, tag,) for regexp, tag in self._regexs], self.backoff
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        regexps, backoff = obj
+        self = cls(())
+        self._regexs = [(re.compile(regexp), tag,) for regexp, tag in regexps]
+        SequentialBackoffTagger.__init__(self, backoff)
+        return self
+
+    def choose_tag(self, tokens, index, history):
+        for regexp, tag in self._regexs:
+            if re.match(regexp, tokens[index]):
+                return tag
+        return None
+
+    def __repr__(self):
+        return '<Regexp Tagger: size=%d>' % len(self._regexs)
+
+
+@python_2_unicode_compatible
+class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
+    """
+    A sequential tagger that uses a classifier to choose the tag for
+    each token in a sentence.  The featureset input for the classifier
+    is generated by a feature detector function::
+
+        feature_detector(tokens, index, history) -> featureset
+
+    Where tokens is the list of unlabeled tokens in the sentence;
+    index is the index of the token for which feature detection
+    should be performed; and history is a list of the tags for all
+    tokens before index.
+
+    Construct a new classifier-based sequential tagger.
+
+    :param feature_detector: A function used to generate the
+        featureset input for the classifier::
+        feature_detector(tokens, index, history) -> featureset
+
+    :param train: A tagged corpus consisting of a list of tagged
+        sentences, where each sentence is a list of (word, tag) tuples.
+
+    :param backoff: A backoff tagger, to be used by the new tagger
+        if it encounters an unknown context.
+
+    :param classifier_builder: A function used to train a new
+        classifier based on the data in *train*.  It should take
+        one argument, a list of labeled featuresets (i.e.,
+        (featureset, label) tuples).
+
+    :param classifier: The classifier that should be used by the
+        tagger.  This is only useful if you want to manually
+        construct the classifier; normally, you would use *train*
+        instead.
+
+    :param backoff: A backoff tagger, used if this tagger is
+        unable to determine a tag for a given token.
+
+    :param cutoff_prob: If specified, then this tagger will fall
+        back on its backoff tagger if the probability of the most
+        likely tag is less than *cutoff_prob*.
+    """
+    def __init__(self, feature_detector=None, train=None,
+                 classifier_builder=NaiveBayesClassifier.train,
+                 classifier=None, backoff=None,
+                 cutoff_prob=None, verbose=False):
+        self._check_params(train, classifier)
+
+        SequentialBackoffTagger.__init__(self, backoff)
+
+        if (train and classifier) or (not train and not classifier):
+            raise ValueError('Must specify either training data or '
+                             'trained classifier.')
+
+        if feature_detector is not None:
+            self._feature_detector = feature_detector
+            # The feature detector function, used to generate a featureset
+            # for each token: feature_detector(tokens, index, history) -> featureset
+
+        self._cutoff_prob = cutoff_prob
+        """Cutoff probability for tagging -- if the probability of the
+           most likely tag is less than this, then use backoff."""
+
+        self._classifier = classifier
+        """The classifier used to choose a tag for each token."""
+
+        if train:
+            self._train(train, classifier_builder, verbose)
+
+    def choose_tag(self, tokens, index, history):
+        # Use our feature detector to get the featureset.
+        featureset = self.feature_detector(tokens, index, history)
+
+        # Use the classifier to pick a tag.  If a cutoff probability
+        # was specified, then check that the tag's probability is
+        # higher than that cutoff first; otherwise, return None.
+        if self._cutoff_prob is None:
+            return self._classifier.classify(featureset)
+
+        pdist = self._classifier.prob_classify(featureset)
+        tag = pdist.max()
+        return tag if pdist.prob(tag) >= self._cutoff_prob else None
+
+    def _train(self, tagged_corpus, classifier_builder, verbose):
+        """
+        Build a new classifier, based on the given training data
+        *tagged_corpus*.
+        """
+
+        classifier_corpus = []
+        if verbose:
+            print('Constructing training corpus for classifier.')
+
+        for sentence in tagged_corpus:
+            history = []
+            untagged_sentence, tags = zip(*sentence)
+            for index in range(len(sentence)):
+                featureset = self.feature_detector(untagged_sentence,
+                                                   index, history)
+                classifier_corpus.append((featureset, tags[index]))
+                history.append(tags[index])
+
+        if verbose:
+            print('Training classifier (%d instances)' % len(classifier_corpus))
+        self._classifier = classifier_builder(classifier_corpus)
+
+    def __repr__(self):
+        return '<ClassifierBasedTagger: %r>' % self._classifier
+
+    def feature_detector(self, tokens, index, history):
+        """
+        Return the feature detector that this tagger uses to generate
+        featuresets for its classifier.  The feature detector is a
+        function with the signature::
+
+          feature_detector(tokens, index, history) -> featureset
+
+        See ``classifier()``
+        """
+        return self._feature_detector(tokens, index, history)
+
+    def classifier(self):
+        """
+        Return the classifier that this tagger uses to choose a tag
+        for each word in a sentence.  The input for this classifier is
+        generated using this tagger's feature detector.
+        See ``feature_detector()``
+        """
+        return self._classifier
+
+
+class ClassifierBasedPOSTagger(ClassifierBasedTagger):
+    """
+    A classifier based part of speech tagger.
+    """
+    def feature_detector(self, tokens, index, history):
+        word = tokens[index]
+        if index == 0:
+            prevword = prevprevword = None
+            prevtag = prevprevtag = None
+        elif index == 1:
+            prevword = tokens[index-1].lower()
+            prevprevword = None
+            prevtag = history[index-1]
+            prevprevtag = None
+        else:
+            prevword = tokens[index-1].lower()
+            prevprevword = tokens[index-2].lower()
+            prevtag = history[index-1]
+            prevprevtag = history[index-2]
+
+        if re.match(r'[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word):
+            shape = 'number'
+        elif re.match(r'\W+$', word):
+            shape = 'punct'
+        elif re.match(r'[A-Z][a-z]+$', word):
+            shape = 'upcase'
+        elif re.match(r'[a-z]+$', word):
+            shape = 'downcase'
+        elif re.match(r'\w+$', word):
+            shape = 'mixedcase'
+        else:
+            shape = 'other'
+
+        features = {
+            'prevtag': prevtag,
+            'prevprevtag': prevprevtag,
+            'word': word,
+            'word.lower': word.lower(),
+            'suffix3': word.lower()[-3:],
+            'suffix2': word.lower()[-2:],
+            'suffix1': word.lower()[-1:],
+            'prevprevword': prevprevword,
+            'prevword': prevword,
+            'prevtag+word': '%s+%s' % (prevtag, word.lower()),
+            'prevprevtag+word': '%s+%s' % (prevprevtag, word.lower()),
+            'prevword+word': '%s+%s' % (prevword, word.lower()),
+            'shape': shape,
+            }
+        return features
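+
+# Training sketch (illustrative; requires the Brown corpus and takes a while,
+# so the calls are not executed as doctests and the tags shown are indicative):
+#
+#     >>> from nltk.corpus import brown
+#     >>> cpos = ClassifierBasedPOSTagger(
+#     ...     train=brown.tagged_sents(categories='news')[:100])  # doctest: +SKIP
+#     >>> cpos.tag('The cat sat on the mat .'.split())  # doctest: +SKIP
+#     [('The', 'AT'), ('cat', 'NN'), ...]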
diff --git a/nlp_resource_data/nltk/tag/sequential.pyc b/nlp_resource_data/nltk/tag/sequential.pyc
new file mode 100755 (executable)
index 0000000..d9a4799
Binary files /dev/null and b/nlp_resource_data/nltk/tag/sequential.pyc differ
diff --git a/nlp_resource_data/nltk/tag/stanford.py b/nlp_resource_data/nltk/tag/stanford.py
new file mode 100755 (executable)
index 0000000..26f36db
--- /dev/null
@@ -0,0 +1,290 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Nitin Madnani <nmadnani@ets.org>
+#         Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A module for interfacing with the Stanford taggers.
+
+Tagger models need to be downloaded from https://nlp.stanford.edu/software
+and the STANFORD_MODELS environment variable set (a colon-separated
+list of paths).
+
+For more details see the documentation for StanfordPOSTagger and StanfordNERTagger.
+"""
+
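+# Typical environment setup sketch (illustrative; the paths are placeholders):
+#
+#     >>> import os
+#     >>> os.environ['STANFORD_MODELS'] = '/path/to/postagger/models:/path/to/ner/classifiers'
+#     >>> os.environ['CLASSPATH'] = '/path/to/stanford-postagger.jar:/path/to/stanford-ner.jar'
+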
+from abc import abstractmethod
+import os
+import tempfile
+from subprocess import PIPE
+import warnings
+
+from six import text_type
+
+from nltk.internals import find_file, find_jar, config_java, java, _java_options
+from nltk.tag.api import TaggerI
+from nltk.parse.corenlp import CoreNLPParser
+
+_stanford_url = 'https://nlp.stanford.edu/software'
+
+
+class StanfordTagger(TaggerI):
+    """
+    An interface to Stanford taggers. Subclasses must define:
+
+    - ``_cmd`` property: A property that returns the command that will be
+      executed.
+    - ``_SEPARATOR``: Class constant that represents the character that
+      is used to separate the tokens from their tags.
+    - ``_JAR`` file: Class constant that represents the jar file name.
+    """
+
+    _SEPARATOR = ''
+    _JAR = ''
+
+    def __init__(self, model_filename, path_to_jar=None, encoding='utf8',
+                 verbose=False, java_options='-mx1000m'):
+        # Raise deprecation warning.
+        warnings.simplefilter('always', DeprecationWarning)
+        warnings.warn(str("\nThe StanfordTagger will "
+                          "be deprecated in version 3.2.5.\n"
+                          "Please use \033[91mnltk.tag.stanford.CoreNLPPOSTagger\033[0m "
+                          "or \033[91mnltk.tag.stanford.CoreNLPNERTagger\033[0m instead."),
+                      DeprecationWarning, stacklevel=2)
+        warnings.simplefilter('ignore', DeprecationWarning)
+        if not self._JAR:
+            warnings.warn('The StanfordTagger class is not meant to be '
+                          'instantiated directly. Did you mean '
+                          'StanfordPOSTagger or StanfordNERTagger?')
+        self._stanford_jar = find_jar(
+                self._JAR, path_to_jar,
+                searchpath=(), url=_stanford_url,
+                verbose=verbose)
+
+        self._stanford_model = find_file(model_filename,
+                                         env_vars=('STANFORD_MODELS',),
+                                         verbose=verbose)
+
+        self._encoding = encoding
+        self.java_options = java_options
+
+    @property
+    @abstractmethod
+    def _cmd(self):
+        """
+        A property that returns the command that will be executed.
+        """
+
+    def tag(self, tokens):
+        # Return a flat list of (word, tag) tuples rather than a list of lists.
+        return sum(self.tag_sents([tokens]), [])
+
+    def tag_sents(self, sentences):
+        encoding = self._encoding
+        default_options = ' '.join(_java_options)
+        config_java(options=self.java_options, verbose=False)
+
+        # Create a temporary input file
+        _input_fh, self._input_file_path = tempfile.mkstemp(text=True)
+
+        cmd = list(self._cmd)
+        cmd.extend(['-encoding', encoding])
+
+        # Write the actual sentences to the temporary input file
+        _input_fh = os.fdopen(_input_fh, 'wb')
+        _input = '\n'.join((' '.join(x) for x in sentences))
+        if isinstance(_input, text_type) and encoding:
+            _input = _input.encode(encoding)
+        _input_fh.write(_input)
+        _input_fh.close()
+
+        # Run the tagger and get the output
+        stanpos_output, _stderr = java(cmd, classpath=self._stanford_jar,
+                                       stdout=PIPE, stderr=PIPE)
+        stanpos_output = stanpos_output.decode(encoding)
+
+        # Delete the temporary file
+        os.unlink(self._input_file_path)
+
+        # Return java configurations to their default values
+        config_java(options=default_options, verbose=False)
+
+        return self.parse_output(stanpos_output, sentences)
+
+    def parse_output(self, text, sentences=None):
+        # Output the tagged sentences
+        tagged_sentences = []
+        for tagged_sentence in text.strip().split("\n"):
+            sentence = []
+            for tagged_word in tagged_sentence.strip().split():
+                word_tags = tagged_word.strip().split(self._SEPARATOR)
+                sentence.append((''.join(word_tags[:-1]), word_tags[-1]))
+            tagged_sentences.append(sentence)
+        return tagged_sentences
+
+
+class StanfordPOSTagger(StanfordTagger):
+    """
+    A class for POS tagging with the Stanford Tagger. The input is the paths to:
+     - a model trained on training data
+     - (optionally) the path to the stanford tagger jar file. If not specified here,
+       then this jar file must be specified in the CLASSPATH environment variable.
+     - (optionally) the encoding of the training data (default: UTF-8)
+
+    Example:
+
+        >>> from nltk.tag import StanfordPOSTagger
+        >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
+        >>> st.tag('What is the airspeed of an unladen swallow ?'.split())
+        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
+    """
+    _SEPARATOR = '_'
+    _JAR = 'stanford-postagger.jar'
+
+    def __init__(self, *args, **kwargs):
+        super(StanfordPOSTagger, self).__init__(*args, **kwargs)
+
+    @property
+    def _cmd(self):
+        return ['edu.stanford.nlp.tagger.maxent.MaxentTagger',
+                '-model', self._stanford_model, '-textFile',
+                self._input_file_path, '-tokenize', 'false',
+                '-outputFormatOptions', 'keepEmptySentences']
+
+
+class StanfordNERTagger(StanfordTagger):
+    """
+    A class for Named-Entity Tagging with the Stanford Tagger. The input is the paths to:
+
+    - a model trained on training data
+    - (optionally) the path to the stanford tagger jar file. If not specified here,
+      then this jar file must be specified in the CLASSPATH environment variable.
+    - (optionally) the encoding of the training data (default: UTF-8)
+
+    Example:
+
+        >>> from nltk.tag import StanfordNERTagger
+        >>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP
+        >>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
+        [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'),
+         ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'),
+         ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'LOCATION')]
+    """
+
+    _SEPARATOR = '/'
+    _JAR = 'stanford-ner.jar'
+    _FORMAT = 'slashTags'
+
+    def __init__(self, *args, **kwargs):
+        super(StanfordNERTagger, self).__init__(*args, **kwargs)
+
+    @property
+    def _cmd(self):
+        # Pass -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer and
+        # -tokenizerOptions tokenizeNLs=false so the Stanford tokenizer does not
+        # re-tokenize the already tokenized input.
+        return ['edu.stanford.nlp.ie.crf.CRFClassifier',
+                '-loadClassifier', self._stanford_model, '-textFile',
+                self._input_file_path, '-outputFormat', self._FORMAT,
+                '-tokenizerFactory',
+                'edu.stanford.nlp.process.WhitespaceTokenizer',
+                '-tokenizerOptions', '\"tokenizeNLs=false\"']
+
+    def parse_output(self, text, sentences):
+        if self._FORMAT == 'slashTags':
+            # Join all tagged tokens into one flat list
+            tagged_sentences = []
+            for tagged_sentence in text.strip().split("\n"):
+                for tagged_word in tagged_sentence.strip().split():
+                    word_tags = tagged_word.strip().split(self._SEPARATOR)
+                    tagged_sentences.append((''.join(word_tags[:-1]),
+                                             word_tags[-1]))
+
+            # Separate it according to the input
+            result = []
+            start = 0
+            for sent in sentences:
+                result.append(tagged_sentences[start:start + len(sent)])
+                start += len(sent)
+            return result
+
+        raise NotImplementedError
+
+class CoreNLPTagger(CoreNLPParser, TaggerI):
+    def __init__(self, tagtype, url='http://localhost:9000', encoding='utf8'):
+        """
+        An abstract interface to POS/NER taggers of CoreNLP that returns the
+        POS/NER tags from the Stanford CoreNLP API at nltk.parse.corenlp.
+        """
+        self.tagtype = tagtype
+        super(CoreNLPTagger, self).__init__(url, encoding)
+
+    def tag_sents(self, sentences):
+        # Converting list(list(str)) -> list(str)
+        sentences = (' '.join(words) for words in sentences)
+        return list(self.raw_tag_sents(sentences))
+
+
+    def tag(self, sentence):
+        return self.tag_sents([sentence])[0]
+
+    def raw_tag_sents(self, sentences):
+        """
+        This method uses `GenericCoreNLPParser.api_call` to
+        retrieve the JSON output and return the required annotations.
+        """
+        default_properties = {'ssplit.isOneSentence': 'true',
+                              'annotators': 'tokenize,ssplit,' }
+        # Supports only 'pos' or 'ner' tags.
+        assert self.tagtype in ['pos', 'ner']
+        default_properties['annotators'] += self.tagtype
+        for sentence in sentences:
+            tagged_data = self.api_call(sentence, properties=default_properties)
+            assert len(tagged_data['sentences']) == 1
+            # Taggers only need to return 1-best sentence.
+            yield [(token['word'], token[self.tagtype]) for token in tagged_data['sentences'][0]['tokens']]
+
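+# For example (illustrative), tagging a single sentence with tagtype='pos' sends
+# the properties {'ssplit.isOneSentence': 'true', 'annotators': 'tokenize,ssplit,pos'}
+# to the CoreNLP server and reads token['word'] / token['pos'] from each token in
+# the JSON reply.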
+
+class CoreNLPPOSTagger(CoreNLPTagger):
+    """
+    This is a subclass of the CoreNLPTagger that wraps around the
+    nltk.parse.CoreNLPParser for Part-of-Speech tagging.
+
+        >>> from nltk.tag.stanford import CoreNLPPOSTagger
+        >>> CoreNLPPOSTagger(url='http://localhost:9000').tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
+        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
+    """
+    def __init__(self, url='http://localhost:9000', encoding='utf8'):
+        super(CoreNLPPOSTagger, self).__init__('pos', url, encoding)
+
+
+class CoreNLPNERTagger(CoreNLPTagger):
+    """
+    This is a subclass of the CoreNLPTagger that wraps around the
+    nltk.parse.CoreNLPParser for Named-Entity tagging.
+
+        >>> from nltk.tag.stanford import CoreNLPNERTagger
+        >>> CoreNLPNERTagger(url='http://localhost:9000').tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
+        [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'O')]
+    """
+    def __init__(self, url='http://localhost:9000', encoding='utf8'):
+        super(CoreNLPNERTagger, self).__init__('ner', url, encoding)
+
+
+def setup_module(module):
+    from nose import SkipTest
+
+    try:
+        StanfordPOSTagger('english-bidirectional-distsim.tagger')
+    except LookupError:
+        raise SkipTest('Doctests from nltk.tag.stanford are skipped because one '
+                       'of the stanford jars cannot be found.')
+
+    try:
+        CoreNLPPOSTagger()
+        CoreNLPNERTagger()
+    except LookupError:
+        raise SkipTest('Doctests from nltk.tag.stanford.CoreNLPTagger '
+                       'are skipped because the Stanford CoreNLP server is not started.')
diff --git a/nlp_resource_data/nltk/tag/stanford.pyc b/nlp_resource_data/nltk/tag/stanford.pyc
new file mode 100755 (executable)
index 0000000..734982c
Binary files /dev/null and b/nlp_resource_data/nltk/tag/stanford.pyc differ
diff --git a/nlp_resource_data/nltk/tag/tnt.py b/nlp_resource_data/nltk/tag/tnt.py
new file mode 100755 (executable)
index 0000000..63db23a
--- /dev/null
@@ -0,0 +1,595 @@
+# Natural Language Toolkit: TnT Tagger
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Sam Huston <sjh900@gmail.com>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+'''
+Implementation of 'TnT - A Statistical Part of Speech Tagger'
+by Thorsten Brants
+
+http://acl.ldc.upenn.edu/A/A00/A00-1031.pdf
+'''
+from __future__ import print_function, division
+from math import log
+
+from operator import itemgetter
+
+from nltk.probability import FreqDist, ConditionalFreqDist
+from nltk.tag.api import TaggerI
+
+class TnT(TaggerI):
+    '''
+    TnT - Statistical POS tagger
+
+    IMPORTANT NOTES:
+
+    * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS
+
+      - It is possible to provide an untrained POS tagger to
+        create tags for unknown words, see __init__ function
+
+    * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT
+
+      - Due to the nature of this tagger, it works best when
+        trained over sentence delimited input.
+      - However it still produces good results if the training
+        data and testing data are separated on all punctuation eg: [,.?!]
+      - Input for training is expected to be a list of sentences
+        where each sentence is a list of (word, tag) tuples
+      - Input for tag function is a single sentence
+        Input for tagdata function is a list of sentences
+        Output is of a similar form
+
+    * Function provided to process text that is unsegmented
+
+      - Please see basic_sent_chop()
+
+
+    TnT uses a second order Markov model to produce tags for
+    a sequence of input, specifically:
+
+      argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T)
+
+    IE: the maximum projection of a set of probabilities
+
+    The set of possible tags for a given word is derived
+    from the training data. It is the set of all tags
+    that exact word has been assigned.
+
+    To speed this up and gain precision, we can use log addition
+    instead of multiplication, specifically:
+
+      argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] +
+             log(P(t_T+1|t_T))
+
+    The probability of a tag for a given word is the linear
+    interpolation of 3 markov models; a zero-order, first-order,
+    and a second order model.
+
+      P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) +
+                             l3*P(t_i| t_i-1, t_i-2)
+
+    A beam search is used to limit the memory usage of the algorithm.
+    The degree of the beam can be changed using N in the initialization.
+    N represents the maximum number of possible solutions to maintain
+    while tagging.
+
+    It is possible to differentiate the tags which are assigned to
+    capitalized words. However this does not result in a significant
+    gain in the accuracy of the results.
+    '''
+
+    def __init__(self, unk=None, Trained=False, N=1000, C=False):
+        '''
+        Construct a TnT statistical tagger. Tagger must be trained
+        before being used to tag input.
+
+        :param unk: instance of a POS tagger, conforms to TaggerI
+        :type  unk:(TaggerI)
+        :param Trained: Indication that the POS tagger is trained or not
+        :type  Trained: boolean
+        :param N: Beam search degree (see above)
+        :type  N:(int)
+        :param C: Capitalization flag
+        :type  C: boolean
+
+        Initializer, creates frequency distributions to be used
+        for tagging
+
+        _lx values represent the portion of the tri/bi/uni taggers
+        to be used to calculate the probability
+
+        N value is the number of possible solutions to maintain
+        while tagging. A good value for this is 1000
+
+        C is a boolean value which specifies to use or
+        not use the Capitalization of the word as additional
+        information for tagging.
+        NOTE: using capitalization may not increase the accuracy
+        of the tagger
+        '''
+
+        self._uni  = FreqDist()
+        self._bi   = ConditionalFreqDist()
+        self._tri  = ConditionalFreqDist()
+        self._wd   = ConditionalFreqDist()
+        self._eos  = ConditionalFreqDist()
+        self._l1   = 0.0
+        self._l2   = 0.0
+        self._l3   = 0.0
+        self._N    = N
+        self._C    = C
+        self._T    = Trained
+
+        self._unk = unk
+
+        # statistical tools (ignore or delete me)
+        self.unknown = 0
+        self.known = 0
+
+    def train(self, data):
+        '''
+        Uses a set of tagged data to train the tagger.
+        If an unknown word tagger is specified,
+        it is trained on the same data.
+
+        :param data: List of lists of (word, tag) tuples
+        :type data: list(list(tuple(str, str)))
+        '''
+
+        # Ensure that local C flag is initialized before use
+        C = False
+
+        if self._unk is not None and self._T == False:
+            self._unk.train(data)
+
+        for sent in data:
+            history = [('BOS',False), ('BOS',False)]
+            for w, t in sent:
+
+                # if capitalization is requested,
+                # and the word begins with a capital
+                # set local flag C to True
+                if self._C and w[0].isupper(): C=True
+
+                self._wd[w][t] += 1
+                self._uni[(t,C)] += 1
+                self._bi[history[1]][(t,C)] += 1
+                self._tri[tuple(history)][(t,C)] += 1
+
+                history.append((t,C))
+                history.pop(0)
+
+                # set local flag C to false for the next word
+                C = False
+
+            self._eos[t]['EOS'] += 1
+
+
+        # compute lambda values from the trained frequency distributions
+        self._compute_lambda()
+
+        #(debugging -- ignore or delete me)
+        #print "lambdas"
+        #print i, self._l1, i, self._l2, i, self._l3
+
+
+    def _compute_lambda(self):
+        '''
+        creates lambda values based upon training data
+
+        NOTE: no need to explicitly reference C,
+        it is contained within the tag variable :: tag == (tag,C)
+
+        for each tag trigram (t1, t2, t3)
+        depending on the maximum value of
+        - f(t1,t2,t3)-1 / f(t1,t2)-1
+        - f(t2,t3)-1 / f(t2)-1
+        - f(t3)-1 / N-1
+
+        increment l3,l2, or l1 by f(t1,t2,t3)
+
+        ISSUES -- Resolutions:
+        if 2 values are equal, increment both lambda values
+        by (f(t1,t2,t3) / 2)
+        '''
+
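+        # Worked micro-example (illustrative numbers): if f(t1,t2,t3)=3,
+        # f(t1,t2)=4, f(t2,t3)=5, f(t2)=10, f(t3)=8 and N=100, then
+        #   c3 = (3-1)/(4-1)   ~= 0.67
+        #   c2 = (5-1)/(10-1)  ~= 0.44
+        #   c1 = (8-1)/(100-1) ~= 0.07
+        # so c3 is the maximum and l3 is incremented by f(t1,t2,t3) = 3.
+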
+        # temporary lambda variables
+        tl1 = 0.0
+        tl2 = 0.0
+        tl3 = 0.0
+
+        # for each t1,t2 in system
+        for history in self._tri.conditions():
+            (h1, h2) = history
+
+            # for each t3 given t1,t2 in system
+            # (NOTE: tag actually represents (tag,C))
+            # However no effect within this function
+            for tag in self._tri[history].keys():
+
+                # if there has only been 1 occurrence of this tag in the data
+                # then ignore this trigram.
+                if self._uni[tag] == 1:
+                    continue
+
+                # safe_div provides a safe floating point division
+                # it returns -1 if the denominator is 0
+                c3 = self._safe_div((self._tri[history][tag]-1), (self._tri[history].N()-1))
+                c2 = self._safe_div((self._bi[h2][tag]-1), (self._bi[h2].N()-1))
+                c1 = self._safe_div((self._uni[tag]-1), (self._uni.N()-1))
+
+
+                # if c1 is the maximum value:
+                if (c1 > c3) and (c1 > c2):
+                    tl1 += self._tri[history][tag]
+
+                # if c2 is the maximum value
+                elif (c2 > c3) and (c2 > c1):
+                    tl2 += self._tri[history][tag]
+
+                # if c3 is the maximum value
+                elif (c3 > c2) and (c3 > c1):
+                    tl3 += self._tri[history][tag]
+
+                # if c3, and c2 are equal and larger than c1
+                elif (c3 == c2) and (c3 > c1):
+                    tl2 += self._tri[history][tag] / 2.0
+                    tl3 += self._tri[history][tag] / 2.0
+
+                # if c1, and c2 are equal and larger than c3
+                # this might be a dumb thing to do....(not sure yet)
+                elif (c2 == c1) and (c1 > c3):
+                    tl1 += self._tri[history][tag] / 2.0
+                    tl2 += self._tri[history][tag] / 2.0
+
+                # otherwise there might be a problem
+                # eg: all values = 0
+                else:
+                    #print "Problem", c1, c2 ,c3
+                    pass
+
+        # Lambda normalisation:
+        # ensures that l1+l2+l3 = 1
+        self._l1 = tl1 / (tl1+tl2+tl3)
+        self._l2 = tl2 / (tl1+tl2+tl3)
+        self._l3 = tl3 / (tl1+tl2+tl3)
+
+
+
+    def _safe_div(self, v1, v2):
+        '''
+        Safe floating point division function, does not allow division by 0
+        returns -1 if the denominator is 0
+        '''
+        if v2 == 0:
+            return -1
+        else:
+            return v1 / v2
+
+    def tagdata(self, data):
+        '''
+        Tags each sentence in a list of sentences
+
+        :param data:list of list of words
+        :type data: [[string,],]
+        :return: list of list of (word, tag) tuples
+
+        Invokes tag(sent) function for each sentence
+        compiles the results into a list of tagged sentences
+        each tagged sentence is a list of (word, tag) tuples
+        '''
+        res = []
+        for sent in data:
+            res1 = self.tag(sent)
+            res.append(res1)
+        return res
+
+
+    def tag(self, data):
+        '''
+        Tags a single sentence
+
+        :param data: list of words
+        :type data: [string,]
+
+        :return: [(word, tag),]
+
+        Calls recursive function '_tagword'
+        to produce a list of tags
+
+        Associates the sequence of returned tags
+        with the correct words in the input sequence
+
+        returns a list of (word, tag) tuples
+        '''
+
+        current_state = [(['BOS', 'BOS'], 0.0)]
+
+        sent = list(data)
+
+        tags = self._tagword(sent, current_state)
+
+        res = []
+        for i in range(len(sent)):
+            # unpack and discard the C flags
+            (t,C) = tags[i+2]
+            res.append((sent[i], t))
+
+        return res
+
+
+    def _tagword(self, sent, current_states):
+        '''
+        :param sent : List of words remaining in the sentence
+        :type sent  : [word,]
+        :param current_states : List of possible tag combinations for
+                                the sentence so far, and the log probability
+                                associated with each tag combination
+        :type current_states  : [([tag, ], logprob), ]
+
+        Tags the first word in the sentence and
+        recursively tags the remainder of the sentence
+
+        Uses formula specified above to calculate the probability
+        of a particular tag
+        '''
+
+        # if this word marks the end of the sentence,
+        # return the most probable tag
+        if sent == []:
+            (h, logp) = current_states[0]
+            return h
+
+        # otherwise there are more words to be tagged
+        word = sent[0]
+        sent = sent[1:]
+        new_states = []
+
+        # if the Capitalisation is requested,
+        # initialise the flag for this word
+        C = False
+        if self._C and word[0].isupper(): C=True
+
+        # if word is known
+        # compute the set of possible tags
+        # and their associated log probabilities
+        if word in self._wd:
+            self.known += 1
+
+            for (history, curr_sent_logprob) in current_states:
+                logprobs = []
+
+                for t in self._wd[word].keys():
+                    tC = (t,C)
+                    p_uni = self._uni.freq(tC)
+                    p_bi = self._bi[history[-1]].freq(tC)
+                    p_tri = self._tri[tuple(history[-2:])].freq(tC)
+                    p_wd = self._wd[word][t] / self._uni[tC]
+                    p = self._l1 *p_uni + self._l2 *p_bi + self._l3 *p_tri
+                    p2 = log(p, 2) + log(p_wd, 2)
+
+                    # compute the result of appending each tag to this history
+                    new_states.append((history + [tC],
+                                       curr_sent_logprob + p2))
+
+        # otherwise a new word, set of possible tags is unknown
+        else:
+            self.unknown += 1
+
+            # since most classifiers cannot return a set of possible tags
+            # together with the probability of each specific tag,
+            # treat any unknown word as tagged with certainty
+            p = 1
+
+            # if no unknown word tagger has been specified
+            # then use the tag 'Unk'
+            if self._unk is None:
+                tag = ('Unk',C)
+
+            # otherwise apply the unknown word tagger
+            else:
+                [(_w, t)] = list(self._unk.tag([word]))
+                tag = (t,C)
+
+            for (history, logprob) in current_states:
+                history.append(tag)
+
+            new_states = current_states
+
+        # now have computed a set of possible new_states
+
+        # sort states by log prob
+        # set is now ordered greatest to least log probability
+        new_states.sort(reverse=True, key=itemgetter(1))
+
+        # del everything after N (threshold)
+        # this is the beam search cut
+        if len(new_states) > self._N:
+            new_states = new_states[:self._N]
+
+        # compute the tags for the rest of the sentence
+        # return the best list of tags for the sentence
+        return self._tagword(sent, new_states)
+
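+# Usage sketch (illustrative; assumes the Brown corpus is installed, and the
+# exact tags depend on the training slice, so the doctest is skipped):
+#
+#     >>> from nltk.corpus import brown
+#     >>> tnt = TnT()
+#     >>> tnt.train(brown.tagged_sents(categories='news')[:500])  # doctest: +SKIP
+#     >>> tnt.tag('The jury said it .'.split())  # doctest: +SKIP
+#     [('The', 'AT'), ('jury', 'NN-TL'), ('said', 'VBD'), ...]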
+
+########################################
+# helper function -- basic sentence tokenizer
+########################################
+
+def basic_sent_chop(data, raw=True):
+    '''
+    Basic method for tokenizing input into sentences
+    for this tagger:
+
+    :param data: list of tokens (words or (word, tag) tuples)
+    :type data: str or tuple(str, str)
+    :param raw: boolean flag marking the input data
+                as a list of words or a list of tagged words
+    :type raw: bool
+    :return: list of sentences
+             sentences are a list of tokens
+             tokens are the same as the input
+
+    Function takes a list of tokens and separates the tokens into lists
+    where each list represents a sentence fragment
+    This function can separate both tagged and raw sequences into
+    basic sentences.
+
+    Sentence markers are the set of [,.!?]
+
+    This is a simple method which enhances the performance of the TnT
+    tagger. Better sentence tokenization will further enhance the results.
+    '''
+
+    new_data = []
+    curr_sent = []
+    sent_mark = [',','.','?','!']
+
+
+    if raw:
+        for word in data:
+            if word in sent_mark:
+                curr_sent.append(word)
+                new_data.append(curr_sent)
+                curr_sent = []
+            else:
+                curr_sent.append(word)
+
+    else:
+        for (word,tag) in data:
+            if word in sent_mark:
+                curr_sent.append((word,tag))
+                new_data.append(curr_sent)
+                curr_sent = []
+            else:
+                curr_sent.append((word,tag))
+    return new_data
+
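+# For example (illustrative), with raw token input:
+#
+#     >>> basic_sent_chop(['I', 'am', 'here', '.', 'So', 'are', 'you', '.'])
+#     [['I', 'am', 'here', '.'], ['So', 'are', 'you', '.']]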
+
+
+def demo():
+    from nltk.corpus import brown
+    sents = list(brown.tagged_sents())
+    test = list(brown.sents())
+
+    # create and train the tagger
+    tagger = TnT()
+    tagger.train(sents[200:1000])
+
+    # tag some data
+    tagged_data = tagger.tagdata(test[100:120])
+
+    # print results
+    for j in range(len(tagged_data)):
+        s = tagged_data[j]
+        t = sents[j+100]
+        for i in range(len(s)):
+            print(s[i],'--', t[i])
+        print()
+
+
+def demo2():
+    from nltk.corpus import treebank
+
+    d = list(treebank.tagged_sents())
+
+    t = TnT(N=1000, C=False)
+    s = TnT(N=1000, C=True)
+    t.train(d[(11)*100:])
+    s.train(d[(11)*100:])
+
+    for i in range(10):
+        tacc = t.evaluate(d[i*100:((i+1)*100)])
+        tp_un = t.unknown / (t.known + t.unknown)
+        tp_kn = t.known / (t.known + t.unknown)
+        t.unknown = 0
+        t.known = 0
+
+        print('Capitalization off:')
+        print('Accuracy:', tacc)
+        print('Percentage known:', tp_kn)
+        print('Percentage unknown:', tp_un)
+        print('Accuracy over known words:', (tacc / tp_kn))
+
+        sacc = s.evaluate(d[i*100:((i+1)*100)])
+        sp_un = s.unknown / (s.known + s.unknown)
+        sp_kn = s.known / (s.known + s.unknown)
+        s.unknown = 0
+        s.known = 0
+
+        print('Capitalization on:')
+        print('Accuracy:', sacc)
+        print('Percentage known:', sp_kn)
+        print('Percentage unknown:', sp_un)
+        print('Accuracy over known words:', (sacc / sp_kn))
+
+def demo3():
+    from nltk.corpus import treebank, brown
+
+    d = list(treebank.tagged_sents())
+    e = list(brown.tagged_sents())
+
+    d = d[:1000]
+    e = e[:1000]
+
+    d10 = int(len(d)*0.1)
+    e10 = int(len(e)*0.1)
+
+    tknacc = 0
+    sknacc = 0
+    tallacc = 0
+    sallacc = 0
+    tknown = 0
+    sknown = 0
+
+    for i in range(10):
+
+        t = TnT(N=1000, C=False)
+        s = TnT(N=1000, C=False)
+
+        dtest = d[(i*d10):((i+1)*d10)]
+        etest = e[(i*e10):((i+1)*e10)]
+
+        dtrain = d[:(i*d10)] + d[((i+1)*d10):]
+        etrain = e[:(i*e10)] + e[((i+1)*e10):]
+
+        t.train(dtrain)
+        s.train(etrain)
+
+        tacc = t.evaluate(dtest)
+        tp_un = t.unknown / (t.known + t.unknown)
+        tp_kn = t.known / (t.known + t.unknown)
+        tknown += tp_kn
+        t.unknown = 0
+        t.known = 0
+
+        sacc = s.evaluate(etest)
+        sp_un = s.unknown / (s.known + s.unknown)
+        sp_kn = s.known / (s.known + s.unknown)
+        sknown += sp_kn
+        s.unknown = 0
+        s.known = 0
+
+        tknacc += (tacc / tp_kn)
+        sknacc += (sacc / sp_kn)
+        tallacc += tacc
+        sallacc += sacc
+
+        #print i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc
+
+
+    print("brown: acc over words known:", 10 * tknacc)
+    print("     : overall accuracy:", 10 * tallacc)
+    print("     : words known:", 10 * tknown)
+    print("treebank: acc over words known:", 10 * sknacc)
+    print("        : overall accuracy:", 10 * sallacc)
+    print("        : words known:", 10 * sknown)
+
+
+
+
diff --git a/nlp_resource_data/nltk/tag/tnt.pyc b/nlp_resource_data/nltk/tag/tnt.pyc
new file mode 100755 (executable)
index 0000000..da8d8f7
Binary files /dev/null and b/nlp_resource_data/nltk/tag/tnt.pyc differ
diff --git a/nlp_resource_data/nltk/tag/util.py b/nlp_resource_data/nltk/tag/util.py
new file mode 100755 (executable)
index 0000000..5d72f01
--- /dev/null
@@ -0,0 +1,72 @@
+# Natural Language Toolkit: Tagger Utilities
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+def str2tuple(s, sep='/'):
+    """
+    Given the string representation of a tagged token, return the
+    corresponding tuple representation.  The rightmost occurrence of
+    *sep* in *s* will be used to divide *s* into a word string and
+    a tag string (the tag is uppercased).  If *sep* does not occur in *s*, return (s, None).
+
+        >>> from nltk.tag.util import str2tuple
+        >>> str2tuple('fly/NN')
+        ('fly', 'NN')
+
+    :type s: str
+    :param s: The string representation of a tagged token.
+    :type sep: str
+    :param sep: The separator string used to separate word strings
+        from tags.
+    """
+    loc = s.rfind(sep)
+    if loc >= 0:
+        return (s[:loc], s[loc+len(sep):].upper())
+    else:
+        return (s, None)
+
+def tuple2str(tagged_token, sep='/'):
+    """
+    Given the tuple representation of a tagged token, return the
+    corresponding string representation.  This representation is
+    formed by concatenating the token's word string, followed by the
+    separator, followed by the token's tag.  (If the tag is None,
+    then just return the bare word string.)
+
+        >>> from nltk.tag.util import tuple2str
+        >>> tagged_token = ('fly', 'NN')
+        >>> tuple2str(tagged_token)
+        'fly/NN'
+
+    :type tagged_token: tuple(str, str)
+    :param tagged_token: The tuple representation of a tagged token.
+    :type sep: str
+    :param sep: The separator string used to separate word strings
+        from tags.
+    """
+    word, tag = tagged_token
+    if tag is None:
+        return word
+    else:
+        assert sep not in tag, 'tag may not contain sep!'
+        return '%s%s%s' % (word, sep, tag)
+
+def untag(tagged_sentence):
+    """
+    Given a tagged sentence, return an untagged version of that
+    sentence.  I.e., return a list containing the first element
+    of each tuple in *tagged_sentence*.
+
+        >>> from nltk.tag.util import untag
+        >>> untag([('John', 'NNP'), ('saw', 'VBD'), ('Mary', 'NNP')])
+        ['John', 'saw', 'Mary']
+
+    """
+    return [w for (w, t) in tagged_sentence]
+
+
+
diff --git a/nlp_resource_data/nltk/tag/util.pyc b/nlp_resource_data/nltk/tag/util.pyc
new file mode 100755 (executable)
index 0000000..c72489a
Binary files /dev/null and b/nlp_resource_data/nltk/tag/util.pyc differ
diff --git a/nlp_resource_data/nltk/tbl/__init__.py b/nlp_resource_data/nltk/tbl/__init__.py
new file mode 100755 (executable)
index 0000000..a71ca8c
--- /dev/null
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Transformation-based learning
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Marcus Uneson <marcus.uneson@gmail.com>
+#   based on previous (nltk2) version by
+#   Christopher Maloof, Edward Loper, Steven Bird
+# URL: <http://nltk.org/>
+# For license information, see  LICENSE.TXT
+
+"""
+Transformation Based Learning
+
+A general purpose package for Transformation Based Learning,
+currently used by nltk.tag.BrillTagger.
+"""
+
+from nltk.tbl.template import Template
+#API: Template(...), Template.expand(...)
+
+from nltk.tbl.feature import Feature
+#API: Feature(...), Feature.expand(...)
+
+from nltk.tbl.rule import Rule
+#API: Rule.format(...), Rule.templateid
+
+from nltk.tbl.erroranalysis import error_list
+
diff --git a/nlp_resource_data/nltk/tbl/__init__.pyc b/nlp_resource_data/nltk/tbl/__init__.pyc
new file mode 100755 (executable)
index 0000000..c1f1ea2
Binary files /dev/null and b/nlp_resource_data/nltk/tbl/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/tbl/api.py b/nlp_resource_data/nltk/tbl/api.py
new file mode 100755 (executable)
index 0000000..8b13789
--- /dev/null
@@ -0,0 +1 @@
+
diff --git a/nlp_resource_data/nltk/tbl/api.pyc b/nlp_resource_data/nltk/tbl/api.pyc
new file mode 100755 (executable)
index 0000000..7d5ba00
Binary files /dev/null and b/nlp_resource_data/nltk/tbl/api.pyc differ
diff --git a/nlp_resource_data/nltk/tbl/demo.py b/nlp_resource_data/nltk/tbl/demo.py
new file mode 100755 (executable)
index 0000000..fa70972
--- /dev/null
@@ -0,0 +1,366 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Transformation-based learning
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Marcus Uneson <marcus.uneson@gmail.com>
+#   based on previous (nltk2) version by
+#   Christopher Maloof, Edward Loper, Steven Bird
+# URL: <http://nltk.org/>
+# For license information, see  LICENSE.TXT
+
+from __future__ import print_function, absolute_import, division
+import os
+import pickle
+
+import random
+import time
+
+from nltk.corpus import treebank
+
+from nltk.tbl import error_list, Template
+from nltk.tag.brill import Word, Pos
+from nltk.tag import BrillTaggerTrainer, RegexpTagger, UnigramTagger
+
+def demo():
+    """
+    Run a demo with defaults. See source comments for details,
+    or docstrings of any of the more specific demo_* functions.
+    """
+    postag()
+
+def demo_repr_rule_format():
+    """
+    Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
+    """
+    postag(ruleformat="repr")
+
+def demo_str_rule_format():
+    """
+    Exemplify str(Rule) (see also repr(Rule) and Rule.format("verbose"))
+    """
+    postag(ruleformat="str")
+
+def demo_verbose_rule_format():
+    """
+    Exemplify Rule.format("verbose")
+    """
+    postag(ruleformat="verbose")
+
+def demo_multiposition_feature():
+    """
+    Each feature of a template takes a list of positions
+    relative to the current word where the feature should be
+    looked for, conceptually joined by logical OR. For instance,
+    Pos([-1, 1]), given a value V, will hold whenever V is found
+    one step to the left and/or one step to the right.
+
+    For contiguous ranges, a 2-arg form giving inclusive end
+    points can also be used: Pos(-3, -1) is equivalent to the
+    Pos([-3, -2, -1]) argument used below.
+    """
+    postag(templates=[Template(Pos([-3,-2,-1]))])
+
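+# Side note (illustrative snippet, mirroring the docstring above): the 2-arg
+# form expands to the same positions as the explicit list, e.g.
+#     Pos(-3, -1) == Pos([-3, -2, -1])   # -> True
+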
+def demo_multifeature_template():
+    """
+    Templates can have more than a single feature.
+    """
+    postag(templates=[Template(Word([0]), Pos([-2,-1]))])
+
+def demo_template_statistics():
+    """
+    Show aggregate statistics per template. Little used templates are
+    candidates for deletion, much used templates may possibly be refined.
+
+    Deleting unused templates is mostly about saving time and/or space:
+    training is basically O(T) in the number of templates T
+    (also in terms of memory usage, which often will be the limiting factor).
+    """
+    postag(incremental_stats=True, template_stats=True)
+
+def demo_generated_templates():
+    """
+    Template.expand and Feature.expand are class methods facilitating
+    generating large amounts of templates. See their documentation for
+    details.
+
+    Note: training with 500 templates can easily fill all available
+    memory, even on relatively small corpora.
+    """
+    wordtpls = Word.expand([-1,0,1], [1,2], excludezero=False)
+    tagtpls = Pos.expand([-2,-1,0,1], [1,2], excludezero=True)
+    templates = list(Template.expand([wordtpls, tagtpls], combinations=(1,3)))
+    print("Generated {0} templates for transformation-based learning".format(len(templates)))
+    postag(templates=templates, incremental_stats=True, template_stats=True)
+
+def demo_learning_curve():
+    """
+    Plot a learning curve -- the contribution on tagging accuracy of
+    the individual rules.
+    Note: requires matplotlib
+    """
+    postag(incremental_stats=True, separate_baseline_data=True, learning_curve_output="learningcurve.png")
+
+def demo_error_analysis():
+    """
+    Writes a file with context for each erroneous word after tagging testing data
+    """
+    postag(error_output="errors.txt")
+
+def demo_serialize_tagger():
+    """
+    Serializes the learned tagger to a file in pickle format; reloads it
+    and validates the process.
+    """
+    postag(serialize_output="tagger.pcl")
+
+def demo_high_accuracy_rules():
+    """
+    Discard rules with low accuracy. This may hurt performance a bit,
+    but will often produce rules which are more interesting for a human to read.
+    """
+    postag(num_sents=3000, min_acc=0.96, min_score=10)
+
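+# Illustrative usage of the demos above (assumes the treebank corpus has been
+# downloaded, e.g. via nltk.download('treebank')); each demo_* function is a
+# thin wrapper that calls postag() below with different keyword arguments:
+#
+#     from nltk.tbl.demo import demo, demo_error_analysis
+#     demo()                    # defaults: brill24 templates, 1000 treebank sents
+#     demo_error_analysis()     # additionally writes errors.txt
+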
+def postag(
+    templates=None,
+    tagged_data=None,
+    num_sents=1000,
+    max_rules=300,
+    min_score=3,
+    min_acc=None,
+    train=0.8,
+    trace=3,
+    randomize=False,
+    ruleformat="str",
+    incremental_stats=False,
+    template_stats=False,
+    error_output=None,
+    serialize_output=None,
+    learning_curve_output=None,
+    learning_curve_take=300,
+    baseline_backoff_tagger=None,
+    separate_baseline_data=False,
+    cache_baseline_tagger=None):
+    """
+    Brill Tagger Demonstration
+    :param templates: the Templates to use in training (defaults to nltk.tag.brill.brill24())
+    :type templates: list of Template
+
+    :param tagged_data: the tagged corpus to train and test on (defaults to treebank.tagged_sents())
+    :type tagged_data: list of list of (str, str) tuples
+
+    :param num_sents: how many sentences of training and testing data to use
+    :type num_sents: C{int}
+
+    :param max_rules: maximum number of rule instances to create
+    :type max_rules: C{int}
+
+    :param min_score: the minimum score for a rule in order for it to be considered
+    :type min_score: C{int}
+
+    :param min_acc: the minimum accuracy for a rule in order for it to be considered (None: no accuracy threshold)
+    :type min_acc: C{float}
+
+    :param train: the fraction of the corpus to be used for training (1=all)
+    :type train: C{float}
+
+    :param trace: the level of diagnostic tracing output to produce (0-4)
+    :type trace: C{int}
+
+    :param randomize: whether the training data should be a random subset of the corpus
+    :type randomize: C{bool}
+
+    :param ruleformat: rule output format, one of "str", "repr", "verbose"
+    :type ruleformat: C{str}
+
+    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
+    :type incremental_stats: C{bool}
+
+    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
+    :type template_stats: C{bool}
+
+    :param error_output: the file where errors will be saved
+    :type error_output: C{string}
+
+    :param serialize_output: the file where the learned tbl tagger will be saved
+    :type serialize_output: C{string}
+
+    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
+    :type learning_curve_output: C{string}
+
+    :param learning_curve_take: how many rules plotted
+    :type learning_curve_take: C{int}
+
+    :param baseline_backoff_tagger: the backoff tagger used by the unigram baseline tagger (defaults to REGEXP_TAGGER)
+    :type baseline_backoff_tagger: tagger
+
+    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
+    :type separate_baseline_data: C{bool}
+
+    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
+                                  deterministic output from the baseline unigram tagger between python versions)
+    :type cache_baseline_tagger: C{string}
+
+
+    Note on separate_baseline_data: if False, the training data is reused both for the baseline
+    tagger and for the rule learner. This is fast and fine for a demo, but is likely to generalize
+    worse on unseen data. It also cannot be sensibly used for learning curves on training data
+    (the baseline will be artificially high).
+    """
+
+    # defaults
+    baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER
+    if templates is None:
+        from nltk.tag.brill import describe_template_sets, brill24
+        # some pre-built template sets taken from typical systems or publications are
+        # available. Print a list with describe_template_sets()
+        # for instance:
+        templates = brill24()
+    (training_data, baseline_data, gold_data, testing_data) = \
+       _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data)
+
+    # creating (or reloading from cache) a baseline tagger (unigram tagger)
+    # this is just a mechanism for getting deterministic output from the baseline between
+    # python versions
+    if cache_baseline_tagger:
+        if not os.path.exists(cache_baseline_tagger):
+            baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
+            with open(cache_baseline_tagger, 'wb') as print_rules:
+                pickle.dump(baseline_tagger, print_rules)
+            print("Trained baseline tagger, pickled it to {0}".format(cache_baseline_tagger))
+        with open(cache_baseline_tagger, "rb") as print_rules:
+            baseline_tagger = pickle.load(print_rules)
+            print("Reloaded pickled tagger from {0}".format(cache_baseline_tagger))
+    else:
+        baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
+        print("Trained baseline tagger")
+    if gold_data:
+        print("    Accuracy on test set: {0:0.4f}".format(baseline_tagger.evaluate(gold_data)))
+
+    # creating a Brill tagger
+    tbrill = time.time()
+    trainer = BrillTaggerTrainer(baseline_tagger, templates, trace, ruleformat=ruleformat)
+    print("Training tbl tagger...")
+    brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc)
+    print("Trained tbl tagger in {0:0.2f} seconds".format(time.time() - tbrill))
+    if gold_data:
+        print("    Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data))
+
+    # printing the learned rules, if learned silently
+    if trace == 1:
+        print("\nLearned rules: ")
+        for (ruleno, rule) in enumerate(brill_tagger.rules(),1):
+            print("{0:4d} {1:s}".format(ruleno, rule.format(ruleformat)))
+
+
+    # printing template statistics (optionally including comparison with the training data)
+    # note: if not separate_baseline_data, then baseline accuracy will be artificially high
+    if incremental_stats:
+        print("Incrementally tagging the test data, collecting individual rule statistics")
+        (taggedtest, teststats) = brill_tagger.batch_tag_incremental(testing_data, gold_data)
+        print("    Rule statistics collected")
+        if not separate_baseline_data:
+            print("WARNING: train_stats asked for separate_baseline_data=True; the baseline "
+                  "will be artificially high")
+        trainstats = brill_tagger.train_stats()
+        if template_stats:
+            brill_tagger.print_template_statistics(teststats)
+        if learning_curve_output:
+            _demo_plot(learning_curve_output, teststats, trainstats, take=learning_curve_take)
+            print("Wrote plot of learning curve to {0}".format(learning_curve_output))
+    else:
+        print("Tagging the test data")
+        taggedtest = brill_tagger.tag_sents(testing_data)
+        if template_stats:
+            brill_tagger.print_template_statistics()
+
+    # writing error analysis to file
+    if error_output is not None:
+        with open(error_output, 'w') as f:
+            f.write('Errors for Brill Tagger %r\n\n' % serialize_output)
+            f.write('\n'.join(error_list(gold_data, taggedtest)) + '\n')
+        print("Wrote tagger errors including context to {0}".format(error_output))
+
+    # serializing the tagger to a pickle file and reloading (just to see it works)
+    if serialize_output is not None:
+        taggedtest = brill_tagger.tag_sents(testing_data)
+        with open(serialize_output, 'wb') as print_rules:
+            pickle.dump(brill_tagger, print_rules)
+        print("Wrote pickled tagger to {0}".format(serialize_output))
+        with open(serialize_output, "r") as print_rules:
+            brill_tagger_reloaded = pickle.load(print_rules)
+        print("Reloaded pickled tagger from {0}".format(serialize_output))
+        taggedtest_reloaded = brill_tagger_reloaded.tag_sents(testing_data)
+        if taggedtest == taggedtest_reloaded:
+            print("Reloaded tagger tried on test set, results identical")
+        else:
+            print("PROBLEM: Reloaded tagger gave different results on test set")
+
+def _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data):
+    # train is the proportion of data used in training; the rest is reserved
+    # for testing.
+    if tagged_data is None:
+        print("Loading tagged data from treebank... ")
+        tagged_data = treebank.tagged_sents()
+    if num_sents is None or len(tagged_data) <= num_sents:
+        num_sents = len(tagged_data)
+    if randomize:
+        random.seed(len(tagged_data))
+        random.shuffle(tagged_data)
+    cutoff = int(num_sents * train)
+    training_data = tagged_data[:cutoff]
+    gold_data = tagged_data[cutoff:num_sents]
+    testing_data = [[t[0] for t in sent] for sent in gold_data]
+    if not separate_baseline_data:
+        baseline_data = training_data
+    else:
+        bl_cutoff = len(training_data) // 3
+        (baseline_data, training_data) = (training_data[:bl_cutoff], training_data[bl_cutoff:])
+    (trainseqs, traintokens) = corpus_size(training_data)
+    (testseqs, testtokens) = corpus_size(testing_data)
+    (bltrainseqs, bltraintokens) = corpus_size(baseline_data)
+    print("Read testing data ({0:d} sents/{1:d} wds)".format(testseqs, testtokens))
+    print("Read training data ({0:d} sents/{1:d} wds)".format(trainseqs, traintokens))
+    print("Read baseline data ({0:d} sents/{1:d} wds) {2:s}".format(
+        bltrainseqs, bltraintokens, "" if separate_baseline_data else "[reused the training set]"))
+    return (training_data, baseline_data, gold_data, testing_data)
+
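+# Worked example for _demo_prepare_data (illustrative numbers only): with
+# num_sents=1000 and train=0.8, cutoff = int(1000 * 0.8) = 800, so
+# training_data is sentences [0:800], gold_data is sentences [800:1000], and
+# testing_data is gold_data with the tags stripped off. With
+# separate_baseline_data=True, the first third of training_data (800 // 3 = 266
+# sentences) is split off as baseline_data for the unigram baseline tagger.
+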
+
+def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None):
+   testcurve = [teststats['initialerrors']]
+   for rulescore in teststats['rulescores']:
+       testcurve.append(testcurve[-1] - rulescore)
+   testcurve = [1 - x/teststats['tokencount'] for x in testcurve[:take]]
+
+   traincurve = [trainstats['initialerrors']]
+   for rulescore in trainstats['rulescores']:
+       traincurve.append(traincurve[-1] - rulescore)
+   traincurve = [1 - x/trainstats['tokencount'] for x in traincurve[:take]]
+
+   import matplotlib.pyplot as plt
+   r = list(range(len(testcurve)))
+   plt.plot(r, testcurve, r, traincurve)
+   plt.axis([None, None, None, 1.0])
+   plt.savefig(learning_curve_output)
+
+
+NN_CD_TAGGER = RegexpTagger(
+    [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
+     (r'.*', 'NN')])
+
+REGEXP_TAGGER = RegexpTagger(
+    [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
+     (r'(The|the|A|a|An|an)$', 'AT'),   # articles
+     (r'.*able$', 'JJ'),                # adjectives
+     (r'.*ness$', 'NN'),                # nouns formed from adjectives
+     (r'.*ly$', 'RB'),                  # adverbs
+     (r'.*s$', 'NNS'),                  # plural nouns
+     (r'.*ing$', 'VBG'),                # gerunds
+     (r'.*ed$', 'VBD'),                 # past tense verbs
+     (r'.*', 'NN')                      # nouns (default)
+])
+
+
+def corpus_size(seqs):
+    return (len(seqs), sum(len(x) for x in seqs))
+
+if __name__ == '__main__':
+    demo_learning_curve()
diff --git a/nlp_resource_data/nltk/tbl/demo.pyc b/nlp_resource_data/nltk/tbl/demo.pyc
new file mode 100755 (executable)
index 0000000..d941c53
Binary files /dev/null and b/nlp_resource_data/nltk/tbl/demo.pyc differ
diff --git a/nlp_resource_data/nltk/tbl/erroranalysis.py b/nlp_resource_data/nltk/tbl/erroranalysis.py
new file mode 100755 (executable)
index 0000000..aaceb01
--- /dev/null
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Transformation-based learning
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Marcus Uneson <marcus.uneson@gmail.com>
+#   based on previous (nltk2) version by
+#   Christopher Maloof, Edward Loper, Steven Bird
+# URL: <http://nltk.org/>
+# For license information, see  LICENSE.TXT
+
+from __future__ import print_function
+
+
+# returns a list of errors in string format
+
+def error_list(train_sents, test_sents):
+    """
+    Returns a list of human-readable strings indicating the errors in the
+    given tagging of the corpus.
+
+    :param train_sents: The correct tagging of the corpus
+    :type train_sents: list(list(tuple(str, str)))
+    :param test_sents: The tagged corpus
+    :type test_sents: list(list(tuple(str, str)))
+    """
+    hdr = (('%25s | %s | %s\n' + '-'*26+'+'+'-'*24+'+'+'-'*26) %
+           ('left context', 'word/test->gold'.center(22), 'right context'))
+    errors = [hdr]
+    for (train_sent, test_sent) in zip(train_sents, test_sents):
+        for wordnum, (word, train_pos) in enumerate(train_sent):
+            test_pos = test_sent[wordnum][1]
+            if train_pos != test_pos:
+                left = ' '.join('%s/%s' % w for w in train_sent[:wordnum])
+                right = ' '.join('%s/%s' % w for w in train_sent[wordnum+1:])
+                mid = '%s/%s->%s' % (word, test_pos, train_pos)
+                errors.append('%25s | %s | %s' %
+                              (left[-25:], mid.center(22), right[:25]))
+
+    return errors
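+
+# Illustrative sketch (hypothetical mini-corpus, not part of the module):
+#
+#     gold = [[('the', 'DT'), ('dog', 'NN')]]
+#     test = [[('the', 'DT'), ('dog', 'VB')]]
+#     for line in error_list(gold, test):
+#         print(line)
+#
+# prints the header row followed by one (whitespace-padded) error line of the
+# form "the/DT | dog/VB->NN | ": the mistagged word with its test and gold
+# tags, flanked by its left and right context.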
diff --git a/nlp_resource_data/nltk/tbl/erroranalysis.pyc b/nlp_resource_data/nltk/tbl/erroranalysis.pyc
new file mode 100755 (executable)
index 0000000..78d4509
Binary files /dev/null and b/nlp_resource_data/nltk/tbl/erroranalysis.pyc differ
diff --git a/nlp_resource_data/nltk/tbl/feature.py b/nlp_resource_data/nltk/tbl/feature.py
new file mode 100755 (executable)
index 0000000..eb3539b
--- /dev/null
@@ -0,0 +1,259 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Transformation-based learning
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Marcus Uneson <marcus.uneson@gmail.com>
+#   based on previous (nltk2) version by
+#   Christopher Maloof, Edward Loper, Steven Bird
+# URL: <http://nltk.org/>
+# For license information, see  LICENSE.TXT
+
+from __future__ import division, print_function, unicode_literals
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
+
+@add_metaclass(ABCMeta)
+class Feature(object):
+    """
+    An abstract base class for Features. A Feature is a combination of
+    a specific property-computing method and a list of relative positions
+    to apply that method to.
+
+    The property-computing method, M{extract_property(tokens, index)},
+    must be implemented by every subclass. It extracts or computes a specific
+    property for the token at the current index. Typical extract_property()
+    methods return features such as the token text or tag; but more involved
+    methods may consider the entire sequence M{tokens} and
+    for instance compute the length of the sentence the token belongs to.
+
+    In addition, the subclass may have a PROPERTY_NAME, which is how
+    it will be printed (in Rules and Templates, etc). If not given, defaults
+    to the classname.
+
+    """
+
+    json_tag = 'nltk.tbl.Feature'
+    PROPERTY_NAME = None
+
+    def __init__(self, positions, end=None):
+        """
+        Construct a Feature which may apply at C{positions}.
+
+        #For instance, importing some concrete subclasses (Feature is abstract)
+        >>> from nltk.tag.brill import Word, Pos
+
+        #Feature Word, applying at one of [-2, -1]
+        >>> Word([-2,-1])
+        Word([-2, -1])
+
+        #Positions need not be contiguous
+        >>> Word([-2,-1, 1])
+        Word([-2, -1, 1])
+
+        #Contiguous ranges can alternatively be specified giving the
+        #two endpoints (inclusive)
+        >>> Pos(-3, -1)
+        Pos([-3, -2, -1])
+
+        #In two-arg form, start <= end is enforced
+        >>> Pos(2, 1)
+        Traceback (most recent call last):
+          File "<stdin>", line 1, in <module>
+          File "nltk/tbl/template.py", line 306, in __init__
+            raise TypeError
+        ValueError: illegal interval specification: (start=2, end=1)
+
+        :type positions: list of int
+        :param positions: the positions at which this feature should apply
+        :raises ValueError: illegal position specifications
+
+        An alternative calling convention, for contiguous positions only,
+        is Feature(start, end):
+
+        :type start: int
+        :param start: start of range where this feature should apply
+        :type end: int
+        :param end: end of range (NOTE: inclusive!) where this feature should apply
+
+        """
+        self.positions = None  # to avoid warnings
+        if end is None:
+            self.positions = tuple(sorted(set([int(i) for i in positions])))
+        else:                # positions was actually not a list, but only the start index
+            try:
+                if positions > end:
+                    raise TypeError
+                self.positions = tuple(range(positions, end+1))
+            except TypeError:
+                # let any kind of erroneous spec raise ValueError
+                raise ValueError("illegal interval specification: (start={0}, end={1})".format(positions, end))
+
+        # set property name given in subclass, or otherwise name of subclass
+        self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__
+
+    def encode_json_obj(self):
+        return self.positions
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        positions = obj
+        return cls(positions)
+
+    def __repr__(self):
+        return "%s(%r)" % (
+            self.__class__.__name__, list(self.positions))
+
+    @classmethod
+    def expand(cls, starts, winlens, excludezero=False):
+        """
+        Return a list of features, one for each start point in starts
+        and for each window length in winlen. If excludezero is True,
+        no Features containing 0 in its positions will be generated
+        (many tbl trainers have a special representation for the
+        target feature at [0])
+
+        For instance, importing a concrete subclass (Feature is abstract)
+        >>> from nltk.tag.brill import Word
+
+        First argument gives the possible start positions, second the
+        possible window lengths
+        >>> Word.expand([-3,-2,-1], [1])
+        [Word([-3]), Word([-2]), Word([-1])]
+
+        >>> Word.expand([-2,-1], [1])
+        [Word([-2]), Word([-1])]
+
+        >>> Word.expand([-3,-2,-1], [1,2])
+        [Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])]
+
+        >>> Word.expand([-2,-1], [1])
+        [Word([-2]), Word([-1])]
+
+        a third optional argument excludes all Features whose positions contain zero
+        >>> Word.expand([-2,-1,0], [1,2], excludezero=False)
+        [Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])]
+
+        >>> Word.expand([-2,-1,0], [1,2], excludezero=True)
+        [Word([-2]), Word([-1]), Word([-2, -1])]
+
+        All window lengths must be positive
+        >>> Word.expand([-2,-1], [0])
+        Traceback (most recent call last):
+          File "<stdin>", line 1, in <module>
+          File "nltk/tag/tbl/template.py", line 371, in expand
+            :param starts: where to start looking for Feature
+        ValueError: non-positive window length in [0]
+
+        :param starts: where to start looking for Feature
+        :type starts: list of ints
+        :param winlens: window lengths in which to look for the Feature
+        :type winlens: list of ints
+        :param excludezero: do not output any Feature with 0 in any of its positions.
+        :type excludezero: bool
+        :returns: list of Features
+        :raises ValueError: for non-positive window lengths
+        """
+        if not all(x > 0 for x in winlens):
+            raise ValueError("non-positive window length in {0}".format(winlens))
+        xs = (starts[i:i+w] for w in winlens for i in range(len(starts)-w+1))
+        return [cls(x) for x in xs if not (excludezero and 0 in x)]
+
+    def issuperset(self, other):
+        """
+        Return True if this Feature always returns True when other does
+
+        More precisely, return True if this feature refers to the same property as other;
+        and this Feature looks at all positions that other does (and possibly
+        other positions in addition).
+
+        #For instance, importing a concrete subclass (Feature is abstract)
+        >>> from nltk.tag.brill import Word, Pos
+
+        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2]))
+        True
+
+        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0]))
+        False
+
+        #Feature subclasses must agree
+        >>> Word([-3,-2,-1]).issuperset(Pos([-3,-2]))
+        False
+
+        :param other: feature with which to compare
+        :type other: (subclass of) Feature
+        :return: True if this feature is superset, otherwise False
+        :rtype: bool
+
+
+        """
+        return self.__class__ is other.__class__ and set(self.positions) >= set(other.positions)
+
+    def intersects(self, other):
+        """
+        Return True if the positions of this Feature intersects with those of other
+
+        More precisely, return True if this feature refers to the same property as other;
+        and there is some overlap in the positions they look at.
+
+        #For instance, importing a concrete subclass (Feature is abstract)
+        >>> from nltk.tag.brill import Word, Pos
+
+        >>> Word([-3,-2,-1]).intersects(Word([-3,-2]))
+        True
+
+        >>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0]))
+        True
+
+        >>> Word([-3,-2,-1]).intersects(Word([0]))
+        False
+
+        #Feature subclasses must agree
+        >>> Word([-3,-2,-1]).intersects(Pos([-3,-2]))
+        False
+
+        :param other: feature with which to compare
+        :type other: (subclass of) Feature
+        :return: True if feature classes agree and there is some overlap in the positions they look at
+        :rtype: bool
+        """
+
+        return bool((self.__class__ is other.__class__ and set(self.positions) & set(other.positions)))
+
+    # Rich comparisons for Features. With @functools.total_ordering (Python 2.7+),
+    # it will be enough to define __lt__ and __eq__
+    def __eq__(self, other):
+        return (self.__class__ is other.__class__ and self.positions == other.positions)
+
+    def __lt__(self, other):
+        return (
+            self.__class__.__name__ < other.__class__.__name__ or
+            (self.__class__.__name__ == other.__class__.__name__ and
+             #    self.positions is a sorted tuple of ints
+             self.positions < other.positions)
+        )
+
+    def __ne__(self, other):
+        return not (self == other)
+
+    def __gt__(self, other):
+        return other < self
+
+    def __ge__(self, other):
+        return not self < other
+
+    def __le__(self, other):
+        return self < other or self == other
+
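+    # Ordering illustration (relies on the comparisons defined above):
+    #     Pos([-1]) < Word([-1])    # True: class names compare as 'Pos' < 'Word'
+    #     Word([0]) < Word([1])     # True: position tuples compare as (0,) < (1,)
+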
+    @staticmethod
+    @abstractmethod
+    def extract_property(tokens, index):
+        """
+        Any subclass of Feature must define static method extract_property(tokens, index)
+
+        :param tokens: the sequence of tokens
+        :type tokens: list of tokens
+        :param index: the current index
+        :type index: int
+        :return: feature value
+        :rtype: any (but usually scalar)
+        """
diff --git a/nlp_resource_data/nltk/tbl/feature.pyc b/nlp_resource_data/nltk/tbl/feature.pyc
new file mode 100755 (executable)
index 0000000..4d1c4d1
Binary files /dev/null and b/nlp_resource_data/nltk/tbl/feature.pyc differ
diff --git a/nlp_resource_data/nltk/tbl/rule.py b/nlp_resource_data/nltk/tbl/rule.py
new file mode 100755 (executable)
index 0000000..7c5c3f2
--- /dev/null
@@ -0,0 +1,321 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Transformation-based learning
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Marcus Uneson <marcus.uneson@gmail.com>
+#   based on previous (nltk2) version by
+#   Christopher Maloof, Edward Loper, Steven Bird
+# URL: <http://nltk.org/>
+# For license information, see  LICENSE.TXT
+
+from __future__ import print_function
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
+from nltk.compat import python_2_unicode_compatible, unicode_repr
+from nltk import jsontags
+
+
+######################################################################
+# Tag Rules
+######################################################################
+@add_metaclass(ABCMeta)
+class TagRule(object):
+    """
+    An interface for tag transformations on a tagged corpus, as
+    performed by tbl taggers.  Each transformation finds all tokens
+    in the corpus that are tagged with a specific original tag and
+    satisfy a specific condition, and replaces their tags with a
+    replacement tag.  For any given transformation, the original
+    tag, replacement tag, and condition are fixed.  Conditions may
+    depend on the token under consideration, as well as any other
+    tokens in the corpus.
+
+    Tag rules must be comparable and hashable.
+    """
+
+    def __init__(self, original_tag, replacement_tag):
+
+        self.original_tag = original_tag
+        """The tag which this TagRule may cause to be replaced."""
+
+        self.replacement_tag = replacement_tag
+        """The tag with which this TagRule may replace another tag."""
+
+    def apply(self, tokens, positions=None):
+        """
+        Apply this rule at every position in positions where it
+        applies to the given sentence.  I.e., for each position p
+        in *positions*, if *tokens[p]* is tagged with this rule's
+        original tag, and satisfies this rule's condition, then set
+        its tag to be this rule's replacement tag.
+
+        :param tokens: The tagged sentence
+        :type tokens: list(tuple(str, str))
+        :type positions: list(int)
+        :param positions: The positions where the transformation is to
+            be tried.  If not specified, try it at all positions.
+        :return: The indices of tokens whose tags were changed by this
+            rule.
+        :rtype: list(int)
+        """
+        if positions is None:
+            positions = list(range(len(tokens)))
+
+        # Determine the indices at which this rule applies.
+        change = [i for i in positions if self.applies(tokens, i)]
+
+        # Make the changes.  Note: this must be done in a separate
+        # step from finding applicable locations, since we don't want
+        # the rule to interact with itself.
+        for i in change:
+            tokens[i] = (tokens[i][0], self.replacement_tag)
+
+        return change
+
+    @abstractmethod
+    def applies(self, tokens, index):
+        """
+        :return: True if the rule would change the tag of
+            ``tokens[index]``, False otherwise
+        :rtype: bool
+        :param tokens: A tagged sentence
+        :type tokens: list(tuple(str, str))
+        :param index: The index to check
+        :type index: int
+        """
+
+    # Rules must be comparable and hashable for the algorithm to work
+    def __eq__(self, other):
+        raise TypeError("Rules must implement __eq__()")
+
+    def __ne__(self, other):
+        raise TypeError("Rules must implement __ne__()")
+
+    def __hash__(self):
+        raise TypeError("Rules must implement __hash__()")
+
+
+@python_2_unicode_compatible
+@jsontags.register_tag
+class Rule(TagRule):
+    """
+    A Rule checks the current corpus position for a certain set of conditions;
+    if they are all fulfilled, the Rule is triggered, meaning that it
+    will change tag A to tag B. For other tags than A, nothing happens.
+
+    The conditions are parameters to the Rule instance. Each condition is a feature-value pair,
+    with a set of positions to check for the value of the corresponding feature.
+    Conceptually, the positions are joined by logical OR, and the feature set by logical AND.
+
+    More formally, the Rule is then applicable to the M{n}th token iff:
+
+      - The M{n}th token is tagged with the Rule's original tag; and
+      - For each (Feature(positions), M{value}) tuple:
+        - The value of Feature of at least one token in {n+p for p in positions}
+          is M{value}.
+
+    """
+
+    json_tag = 'nltk.tbl.Rule'
+
+    def __init__(self, templateid, original_tag, replacement_tag, conditions):
+        """
+        Construct a new Rule that changes a token's tag from
+        C{original_tag} to C{replacement_tag} if all of the properties
+        specified in C{conditions} hold.
+
+        @type templateid: string
+        @param templateid: the template id (a zero-padded string, '001' etc,
+          so it will sort nicely)
+
+        @type conditions: C{iterable} of (C{Feature}, value) pairs
+        @param conditions: A list of (Feature(positions), value) pairs,
+            each of which specifies that the property (computed by
+            Feature.extract_property()) of at least one
+            token at M{n} + p, for p in positions, is C{value}.
+
+        """
+        TagRule.__init__(self, original_tag, replacement_tag)
+        self._conditions = conditions
+        self.templateid = templateid
+
+    def encode_json_obj(self):
+        return {
+            'templateid':   self.templateid,
+            'original':     self.original_tag,
+            'replacement':  self.replacement_tag,
+            'conditions':   self._conditions,
+        }
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        return cls(obj['templateid'], obj['original'], obj['replacement'], obj['conditions'])
+
+    def applies(self, tokens, index):
+        # Inherit docs from TagRule
+
+        # Does the given token have this Rule's "original tag"?
+        if tokens[index][1] != self.original_tag:
+            return False
+
+        # Check to make sure that every condition holds.
+        for (feature, val) in self._conditions:
+
+            # Look for *any* token that satisfies the condition.
+            for pos in feature.positions:
+                if not (0 <= index + pos < len(tokens)):
+                    continue
+                if feature.extract_property(tokens, index+pos) == val:
+                    break
+            else:
+                # No token satisfied the condition; return false.
+                return False
+
+        # Every condition checked out, so the Rule is applicable.
+        return True
+
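+    # Illustrative sketch (hypothetical tokens; Pos is a concrete Feature from
+    # nltk.tag.brill):
+    #
+    #     r = Rule('001', 'NN', 'VB', [(Pos([-1]), 'TO')])
+    #     tokens = [('to', 'TO'), ('run', 'NN')]
+    #     r.applies(tokens, 1)   # True: tokens[1] is 'NN' and tokens[0] is tagged 'TO'
+    #     r.apply(tokens)        # [1]; tokens[1] becomes ('run', 'VB')
+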
+    def __eq__(self, other):
+        return (self is other or
+                (other is not None and
+                 other.__class__ == self.__class__ and
+                 self.original_tag == other.original_tag and
+                 self.replacement_tag == other.replacement_tag and
+                 self._conditions == other._conditions))
+
+    def __ne__(self, other):
+        return not (self == other)
+
+    def __hash__(self):
+
+        # Cache our hash value (justified by profiling.)
+        try:
+            return self.__hash
+        except AttributeError:
+            self.__hash = hash(repr(self))
+            return self.__hash
+
+    def __repr__(self):
+        # Cache the repr (justified by profiling -- this is used as
+        # a sort key when deterministic=True.)
+        try:
+            return self.__repr
+        except AttributeError:
+            self.__repr = (
+                "{0}('{1}', {2}, {3}, [{4}])".format(
+                    self.__class__.__name__,
+                    self.templateid,
+                    unicode_repr(self.original_tag),
+                    unicode_repr(self.replacement_tag),
+
+                    # list(self._conditions) would be simpler but will not generate
+                    # the same Rule.__repr__ in python 2 and 3 and thus break some tests
+                    ', '.join("({0},{1})".format(f, unicode_repr(v)) for (f, v) in self._conditions)
+                )
+            )
+
+            return self.__repr
+
+    def __str__(self):
+        def _condition_to_logic(feature, value):
+            """
+            Return a compact, predicate-logic styled string representation
+            of the given condition.
+            """
+            return '{0}:{1}@[{2}]'.format(
+                feature.PROPERTY_NAME,
+                value,
+                ",".join(str(w) for w in feature.positions)
+            )
+
+        conditions = ' & '.join([_condition_to_logic(f, v) for (f, v) in self._conditions])
+        s = '{0}->{1} if {2}'.format(
+            self.original_tag,
+            self.replacement_tag,
+            conditions
+        )
+
+        return s
+
+    def format(self, fmt):
+        """
+        Return a string representation of this rule.
+
+        >>> from nltk.tbl.rule import Rule
+        >>> from nltk.tag.brill import Pos
+
+        >>> r = Rule("23", "VB", "NN", [(Pos([-2,-1]), 'DT')])
+
+        r.format("str") == str(r)
+        True
+        >>> r.format("str")
+        'VB->NN if Pos:DT@[-2,-1]'
+
+        r.format("repr") == repr(r)
+        True
+        >>> r.format("repr")
+        "Rule('23', 'VB', 'NN', [(Pos([-2, -1]),'DT')])"
+
+        >>> r.format("verbose")
+        'VB -> NN if the Pos of words i-2...i-1 is "DT"'
+
+        >>> r.format("not_found")
+        Traceback (most recent call last):
+          File "<stdin>", line 1, in <module>
+          File "nltk/tbl/rule.py", line 256, in format
+            raise ValueError("unknown rule format spec: {0}".format(fmt))
+        ValueError: unknown rule format spec: not_found
+        >>>
+
+        :param fmt: format specification
+        :type fmt: str
+        :return: string representation
+        :rtype: str
+        """
+        if fmt == "str":
+            return self.__str__()
+        elif fmt == "repr":
+            return self.__repr__()
+        elif fmt == "verbose":
+            return self._verbose_format()
+        else:
+            raise ValueError("unknown rule format spec: {0}".format(fmt))
+
+    def _verbose_format(self):
+        """
+        Return a wordy, human-readable string representation
+        of the given rule.
+
+        Not sure how useful this is.
+        """
+        def condition_to_str(feature, value):
+            return ('the %s of %s is "%s"' %
+                    (feature.PROPERTY_NAME, range_to_str(feature.positions), value))
+
+        def range_to_str(positions):
+            if len(positions) == 1:
+                p = positions[0]
+                if p == 0:
+                    return 'this word'
+                if p == -1:
+                    return 'the preceding word'
+                elif p == 1:
+                    return 'the following word'
+                elif p < 0:
+                    return 'word i-%d' % -p
+                elif p > 0:
+                    return 'word i+%d' % p
+            else:
+                # for complete compatibility with the wordy format of nltk2
+                mx = max(positions)
+                mn = min(positions)
+                if mx - mn == len(positions) - 1:
+                    return 'words i%+d...i%+d' % (mn, mx)
+                else:
+                    return 'words {%s}' % (",".join("i%+d" % d for d in positions),)
+
+        replacement = '%s -> %s' % (self.original_tag, self.replacement_tag)
+        conditions = (' if ' if self._conditions else "") + ', and '.join(
+            condition_to_str(f, v) for (f, v) in self._conditions
+        )
+        return replacement + conditions
diff --git a/nlp_resource_data/nltk/tbl/rule.pyc b/nlp_resource_data/nltk/tbl/rule.pyc
new file mode 100755 (executable)
index 0000000..86d294f
Binary files /dev/null and b/nlp_resource_data/nltk/tbl/rule.pyc differ
diff --git a/nlp_resource_data/nltk/tbl/template.py b/nlp_resource_data/nltk/tbl/template.py
new file mode 100755 (executable)
index 0000000..38db64b
--- /dev/null
@@ -0,0 +1,303 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Transformation-based learning
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Marcus Uneson <marcus.uneson@gmail.com>
+#   based on previous (nltk2) version by
+#   Christopher Maloof, Edward Loper, Steven Bird
+# URL: <http://nltk.org/>
+# For license information, see  LICENSE.TXT
+
+from __future__ import print_function
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+import itertools as it
+from nltk.tbl.feature import Feature
+from nltk.tbl.rule import Rule
+
+
+@add_metaclass(ABCMeta)
+class BrillTemplateI(object):
+    """
+    An interface for generating lists of transformational rules that
+    apply at given sentence positions.  ``BrillTemplateI`` is used by
+    ``Brill`` training algorithms to generate candidate rules.
+    """
+    @abstractmethod
+    def applicable_rules(self, tokens, i, correctTag):
+        """
+        Return a list of the transformational rules that would correct
+        the *i*th subtoken's tag in the given token.  In particular,
+        return a list of zero or more rules that would change
+        *tokens*[i][1] to *correctTag*, if applied to *token*[i].
+
+        If the *i*th token already has the correct tag (i.e., if
+        tagged_tokens[i][1] == correctTag), then
+        ``applicable_rules()`` should return the empty list.
+
+        :param tokens: The tagged tokens being tagged.
+        :type tokens: list(tuple)
+        :param i: The index of the token whose tag should be corrected.
+        :type i: int
+        :param correctTag: The correct tag for the *i*th token.
+        :type correctTag: any
+        :rtype: list(BrillRule)
+        """
+
+    @abstractmethod
+    def get_neighborhood(self, token, index):
+        """
+        Returns the set of indices *i* such that
+        ``applicable_rules(token, i, ...)`` depends on the value of
+        the *index*th token of *token*.
+
+        This method is used by the "fast" Brill tagger trainer.
+
+        :param token: The tokens being tagged.
+        :type token: list(tuple)
+        :param index: The index whose neighborhood should be returned.
+        :type index: int
+        :rtype: set
+        """
+
+
+class Template(BrillTemplateI):
+    """
+    A tbl Template that generates a list of L{Rule}s that apply at a given sentence
+    position.  In particular, each C{Template} is parameterized by a list of
+    independent features (a combination of a specific
+    property to extract and a list C{L} of relative positions at which to extract
+    it) and generates all Rules that:
+
+      - use the given features, each at its own independent position; and
+      - are applicable to the given token.
+    """
+    ALLTEMPLATES = []
+    # record a unique id of form "001", for each template created
+    # _ids = it.count(0)
+
+    def __init__(self, *features):
+
+        """
+        Construct a Template for generating Rules.
+
+        Takes a list of Features. A C{Feature} is a combination
+        of a specific property and its relative positions and should be
+        a subclass of L{nltk.tbl.feature.Feature}.
+
+        An alternative calling convention (kept for backwards compatibility,
+        but less expressive as it only permits one feature type) is
+        Template(Feature, (start1, end1), (start2, end2), ...)
+        In new code, that would be better written
+        Template(Feature(start1, end1), Feature(start2, end2), ...)
+
+        #For instance, importing some features
+        >>> from nltk.tbl.template import Template
+        >>> from nltk.tag.brill import Word, Pos
+
+        #create some features
+
+        >>> wfeat1, wfeat2, pfeat = (Word([-1]), Word([1,2]), Pos([-2,-1]))
+
+        #Create a single-feature template
+        >>> Template(wfeat1)
+        Template(Word([-1]))
+
+        #or a two-feature one
+        >>> Template(wfeat1, wfeat2)
+        Template(Word([-1]),Word([1, 2]))
+
+        #or a three-feature one with two different feature types
+        >>> Template(wfeat1, wfeat2, pfeat)
+        Template(Word([-1]),Word([1, 2]),Pos([-2, -1]))
+
+        #deprecated api: Feature subclass, followed by list of (start,end) pairs
+        #(permits only a single Feature)
+        >>> Template(Word, (-2,-1), (0,0))
+        Template(Word([-2, -1]),Word([0]))
+
+        #incorrect specification raises TypeError
+        >>> Template(Word, (-2,-1), Pos, (0,0))
+        Traceback (most recent call last):
+          File "<stdin>", line 1, in <module>
+          File "nltk/tag/tbl/template.py", line 143, in __init__
+            raise TypeError(
+        TypeError: expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ...
+
+        :type features: list of Features
+        :param features: the features to build this Template on
+        """
+        # determine the calling form: either
+        # Template(Feature, args1, [args2, ...)]
+        # Template(Feature1(args),  Feature2(args), ...)
+        if all(isinstance(f, Feature) for f in features):
+            self._features = features
+        elif issubclass(features[0], Feature) and all(isinstance(a, tuple) for a in features[1:]):
+            self._features = [features[0](*tp) for tp in features[1:]]
+        else:
+            raise TypeError(
+                "expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ...")
+        self.id = "{0:03d}".format(len(self.ALLTEMPLATES))
+        self.ALLTEMPLATES.append(self)
+
+    def __repr__(self):
+        return "%s(%s)" % (self.__class__.__name__, ",".join([str(f) for f in self._features]))
+
+    def applicable_rules(self, tokens, index, correct_tag):
+        if tokens[index][1] == correct_tag:
+            return []
+
+        # For each of this Template's features, find the conditions
+        # that are applicable for the given token.
+        # Then, generate one Rule for each combination of features
+        # (the crossproduct of the conditions).
+
+        applicable_conditions = self._applicable_conditions(tokens, index)
+        xs = list(it.product(*applicable_conditions))
+        return [Rule(self.id, tokens[index][1], correct_tag, tuple(x)) for x in xs]
+
+    def _applicable_conditions(self, tokens, index):
+        """
+        :returns: A set of all conditions for rules
+        that are applicable to C{tokens[index]}.
+        """
+        conditions = []
+
+        for feature in self._features:
+            conditions.append([])
+            for pos in feature.positions:
+                if not (0 <= index+pos < len(tokens)):
+                    continue
+                value = feature.extract_property(tokens, index+pos)
+                conditions[-1].append( (feature, value) )
+        return conditions
+
+    def get_neighborhood(self, tokens, index):
+        # inherit docs from BrillTemplateI
+
+        # applicable_rules(tokens, index, ...) depends on index.
+        neighborhood = set([index])  # (could be the set literal {index} in python 2.7+)
+
+        # applicable_rules(tokens, i, ...) depends on index if
+        # i+start < index <= i+end.
+
+        allpositions = [0] + [p for feat in self._features for p in feat.positions]
+        start, end = min(allpositions), max(allpositions)
+        s = max(0, index+(-end))
+        e = min(index+(-start)+1, len(tokens))
+        for i in range(s, e):
+            neighborhood.add(i)
+        return neighborhood
+
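+    # Worked example (illustrative): for a Template built on Pos([-2, -1]) and
+    # Word([0]), allpositions is [0, -2, -1, 0], so start=-2 and end=0; on a
+    # sufficiently long sentence, get_neighborhood(tokens, 5) returns {5, 6, 7},
+    # i.e. a tag change at position 5 can affect candidate rules generated at
+    # positions 5, 6 and 7.
+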
+    @classmethod
+    def expand(cls, featurelists, combinations=None, skipintersecting=True):
+
+        """
+        Factory method to mass generate Templates from a list L of lists of Features.
+
+        #With combinations=(k1, k2), the function will in all possible ways choose k1 ... k2
+        #of the sublists in L; it will output all Templates formed by the Cartesian product
+        #of this selection, with duplicates and other semantically equivalent
+        #forms removed. Default for combinations is (1, len(L)).
+
+        The feature lists may have been specified
+        manually, or generated from Feature.expand(). For instance,
+
+        >>> from nltk.tbl.template import Template
+        >>> from nltk.tag.brill import Word, Pos
+
+        #creating some features
+        >>> (wd_0, wd_01) = (Word([0]), Word([0,1]))
+
+        >>> (pos_m2, pos_m33) = (Pos([-2]), Pos([-3,-2,-1,0,1,2,3]))
+
+        >>> list(Template.expand([[wd_0], [pos_m2]]))
+        [Template(Word([0])), Template(Pos([-2])), Template(Pos([-2]),Word([0]))]
+
+        >>> list(Template.expand([[wd_0, wd_01], [pos_m2]]))
+        [Template(Word([0])), Template(Word([0, 1])), Template(Pos([-2])), Template(Pos([-2]),Word([0])), Template(Pos([-2]),Word([0, 1]))]
+
+        #note: with Feature.expand(), it is very easy to generate more templates
+        #than your system can handle -- for instance,
+        >>> wordtpls = Word.expand([-2,-1,0,1], [1,2], excludezero=False)
+        >>> len(wordtpls)
+        7
+
+        >>> postpls = Pos.expand([-3,-2,-1,0,1,2], [1,2,3], excludezero=True)
+        >>> len(postpls)
+        9
+
+        #and now the Cartesian product of all non-empty combinations of two wordtpls and
+        #two postpls, with semantic equivalents removed
+        >>> templates = list(Template.expand([wordtpls, wordtpls, postpls, postpls]))
+        >>> len(templates)
+        713
+
+
+        #For comparison, a small fully enumerable call such as
+        #    list(Template.expand([[wd_0, wd_01], [pos_m2, Pos([-1])]]))
+        #would return a list of eight templates:
+        #      Template(Word([0])),
+        #      Template(Word([0, 1])),
+        #      Template(Pos([-2])),
+        #      Template(Pos([-1])),
+        #      Template(Pos([-2]),Word([0])),
+        #      Template(Pos([-1]),Word([0])),
+        #      Template(Pos([-2]),Word([0, 1])),
+        #      Template(Pos([-1]),Word([0, 1]))
+
+
+        #Templates where one feature is a subset of another, such as
+        #Template(Word([0,1]), Word([1])), will not appear in the output.
+        #By default, this non-subset constraint is tightened to disjointness:
+        #Templates of type Template(Word([0,1]), Word([1,2])) will also be filtered out.
+        #With skipintersecting=False, such Templates are allowed.
+
+        WARNING: this method makes it very easy to fill all your memory when training
+        generated templates on any real-world corpus
+
+        :param featurelists: lists of Features, whose Cartesian product will return a set of Templates
+        :type featurelists: list of (list of Features)
+        :param combinations: given n featurelists: if combinations=k, all generated Templates will have
+                k features; if combinations=(k1,k2) they will have k1..k2 features; if None, defaults to 1..n
+        :type combinations: None, int, or (int, int)
+        :param skipintersecting: if True, do not output intersecting Templates (non-disjoint positions for some feature)
+        :type skipintersecting: bool
+        :returns: generator of Templates
+
+        """
+        def nonempty_powerset(xs): #xs is a list
+            # adapted from the powerset recipe in the itertools docs, restricted to non-empty subsets:
+            # nonempty_powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
+
+            # find the correct tuple given combinations, one of {None, k, (k1,k2)}
+            k = combinations #for brevity
+            combrange = ((1, len(xs)+1) if k is None else     # n over 1 .. n over n (all non-empty combinations)
+                         (k, k+1) if isinstance(k, int) else  # n over k (only combinations of size k)
+                         (k[0], k[1]+1))                      # n over k1, n over k1+1... n over k2
+            return it.chain.from_iterable(it.combinations(xs, r)
+                                          for r in range(*combrange))
+        seentemplates = set()
+        for picks in nonempty_powerset(featurelists):
+            for pick in it.product(*picks):
+                if any(i != j and x.issuperset(y)
+                       for (i, x) in enumerate(pick)
+                       for (j, y) in enumerate(pick)):
+                    continue
+                if skipintersecting and any(i != j and x.intersects(y)
+                                            for (i, x) in enumerate(pick)
+                                            for (j, y) in enumerate(pick)):
+                    continue
+                thistemplate = cls(*sorted(pick))
+                strpick = str(thistemplate)
+                #!!FIXME --this is hackish
+                if strpick in seentemplates: #already added
+                    cls._poptemplate()
+                    continue
+                seentemplates.add(strpick)
+                yield thistemplate
+
+    @classmethod
+    def _cleartemplates(cls):
+        cls.ALLTEMPLATES = []
+
+    @classmethod
+    def _poptemplate(cls):
+        return cls.ALLTEMPLATES.pop() if cls.ALLTEMPLATES else None
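+
+
+def _expand_example():
+    # A minimal usage sketch for Template.expand(), assuming nltk.tag.brill is
+    # importable; the feature choices below are illustrative only.
+    from nltk.tag.brill import Word, Pos
+    word_feats = Word.expand([-1, 0, 1], [1, 2], excludezero=False)
+    pos_feats = Pos.expand([-2, -1, 1, 2], [1, 2], excludezero=True)
+    # combinations=2 restricts the output to Templates with exactly two
+    # features; skipintersecting=True (the default) drops overlapping pairs.
+    return list(Template.expand([word_feats, pos_feats], combinations=2))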
diff --git a/nlp_resource_data/nltk/tbl/template.pyc b/nlp_resource_data/nltk/tbl/template.pyc
new file mode 100755 (executable)
index 0000000..2dff36c
Binary files /dev/null and b/nlp_resource_data/nltk/tbl/template.pyc differ
diff --git a/nlp_resource_data/nltk/text.py b/nlp_resource_data/nltk/text.py
new file mode 100755 (executable)
index 0000000..5563f3e
--- /dev/null
@@ -0,0 +1,616 @@
+# Natural Language Toolkit: Texts
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+This module brings together a variety of NLTK functionality for
+text analysis, and provides simple, interactive interfaces.
+Functionality includes: concordancing, collocation discovery,
+regular expression search over tokenized strings, and
+distributional similarity.
+"""
+from __future__ import print_function, division, unicode_literals
+
+from math import log
+from collections import defaultdict, Counter
+from functools import reduce
+from itertools import islice
+import re
+
+from six import text_type
+
+from nltk.probability import FreqDist, LidstoneProbDist
+from nltk.probability import ConditionalFreqDist as CFD
+from nltk.util import tokenwrap, LazyConcatenation
+from nltk.metrics import f_measure, BigramAssocMeasures
+from nltk.collocations import BigramCollocationFinder
+from nltk.compat import python_2_unicode_compatible
+
+
+class ContextIndex(object):
+    """
+    A bidirectional index between words and their 'contexts' in a text.
+    The context of a word is usually defined to be the words that occur
+    in a fixed window around the word; but other definitions may also
+    be used by providing a custom context function.
+    """
+    @staticmethod
+    def _default_context(tokens, i):
+        """One left token and one right token, normalized to lowercase"""
+        left = (tokens[i-1].lower() if i != 0 else '*START*')
+        right = (tokens[i+1].lower() if i != len(tokens) - 1 else '*END*')
+        return (left, right)
+
+    def __init__(self, tokens, context_func=None, filter=None, key=lambda x:x):
+        self._key = key
+        self._tokens = tokens
+        if context_func:
+            self._context_func = context_func
+        else:
+            self._context_func = self._default_context
+        if filter:
+            tokens = [t for t in tokens if filter(t)]
+        self._word_to_contexts = CFD((self._key(w), self._context_func(tokens, i))
+                                     for i, w in enumerate(tokens))
+        self._context_to_words = CFD((self._context_func(tokens, i), self._key(w))
+                                     for i, w in enumerate(tokens))
+
+    def tokens(self):
+        """
+        :rtype: list(str)
+        :return: The document that this context index was
+            created from.
+        """
+        return self._tokens
+
+    def word_similarity_dict(self, word):
+        """
+        Return a dictionary mapping from words to 'similarity scores',
+        indicating how often the given word and that word occur in the
+        same context.
+        """
+        word = self._key(word)
+        word_contexts = set(self._word_to_contexts[word])
+
+        scores = {}
+        for w, w_contexts in self._word_to_contexts.items():
+            scores[w] = f_measure(word_contexts, set(w_contexts))
+
+        return scores
+
+    def similar_words(self, word, n=20):
+        scores = defaultdict(int)
+        for c in self._word_to_contexts[self._key(word)]:
+            for w in self._context_to_words[c]:
+                if w != word:
+                    scores[w] += self._context_to_words[c][word] * self._context_to_words[c][w]
+        return sorted(scores, key=scores.get, reverse=True)[:n]
+
+    def common_contexts(self, words, fail_on_unknown=False):
+        """
+        Find contexts where the specified words can all appear; and
+        return a frequency distribution mapping each context to the
+        number of times that context was used.
+
+        :param words: The words used to seed the similarity search
+        :type words: list(str)
+        :param fail_on_unknown: If true, then raise a value error if
+            any of the given words do not occur at all in the index.
+        """
+        words = [self._key(w) for w in words]
+        contexts = [set(self._word_to_contexts[w]) for w in words]
+        empty = [words[i] for i in range(len(words)) if not contexts[i]]
+        common = reduce(set.intersection, contexts)
+        if empty and fail_on_unknown:
+            raise ValueError("The following word(s) were not found:",
+                             " ".join(empty))
+        elif not common:
+            # nothing in common -- just return an empty freqdist.
+            return FreqDist()
+        else:
+            fd = FreqDist(c for w in words
+                          for c in self._word_to_contexts[w]
+                          if c in common)
+            return fd
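+
+
+def _context_index_example():
+    # A minimal usage sketch for ContextIndex over an in-memory token list;
+    # the sentence below is illustrative only.
+    tokens = 'the cat sat on the mat . the dog sat on the rug .'.split()
+    index = ContextIndex(tokens, key=lambda s: s.lower())
+    similar = index.similar_words('mat')             # e.g. ['rug']
+    shared = index.common_contexts(['mat', 'rug'])   # FreqDist of shared contexts
+    return similar, shared
+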
+
+@python_2_unicode_compatible
+class ConcordanceIndex(object):
+    """
+    An index that can be used to look up the offset locations at which
+    a given word occurs in a document.
+    """
+    def __init__(self, tokens, key=lambda x:x):
+        """
+        Construct a new concordance index.
+
+        :param tokens: The document (list of tokens) that this
+            concordance index was created from.  This list can be used
+            to access the context of a given word occurrence.
+        :param key: A function that maps each token to a normalized
+            version that will be used as a key in the index.  E.g., if
+            you use ``key=lambda s:s.lower()``, then the index will be
+            case-insensitive.
+        """
+        self._tokens = tokens
+        """The document (list of tokens) that this concordance index
+           was created from."""
+
+        self._key = key
+        """Function mapping each token to an index key (or None)."""
+
+        self._offsets = defaultdict(list)
+        """Dictionary mapping words (or keys) to lists of offset
+           indices."""
+
+        # Initialize the index (self._offsets)
+        for index, word in enumerate(tokens):
+            word = self._key(word)
+            self._offsets[word].append(index)
+
+    def tokens(self):
+        """
+        :rtype: list(str)
+        :return: The document that this concordance index was
+            created from.
+        """
+        return self._tokens
+
+    def offsets(self, word):
+        """
+        :rtype: list(int)
+        :return: A list of the offset positions at which the given
+            word occurs.  If a key function was specified for the
+            index, then given word's key will be looked up.
+        """
+        word = self._key(word)
+        return self._offsets[word]
+
+    def __repr__(self):
+        return '<ConcordanceIndex for %d tokens (%d types)>' % (
+            len(self._tokens), len(self._offsets))
+
+    def print_concordance(self, word, width=75, lines=25):
+        """
+        Print a concordance for ``word`` with the specified context window.
+
+        :param word: The target word
+        :type word: str
+        :param width: The width of each line, in characters (default=75)
+        :type width: int
+        :param lines: The number of lines to display (default=25)
+        :type lines: int
+        """
+        half_width = (width - len(word) - 2) // 2
+        context = width // 4 # approx number of words of context
+
+        offsets = self.offsets(word)
+        if offsets:
+            lines = min(lines, len(offsets))
+            print("Displaying %s of %s matches:" % (lines, len(offsets)))
+            for i in offsets:
+                if lines <= 0:
+                    break
+                left = (' ' * half_width +
+                        ' '.join(self._tokens[i-context:i]))
+                right = ' '.join(self._tokens[i+1:i+context])
+                left = left[-half_width:]
+                right = right[:half_width]
+                print(left, self._tokens[i], right)
+                lines -= 1
+        else:
+            print("No matches")
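+
+
+def _concordance_index_example():
+    # A minimal usage sketch for ConcordanceIndex; the sentence below is
+    # illustrative only.
+    tokens = 'The quick brown fox jumps over the lazy dog'.split()
+    index = ConcordanceIndex(tokens, key=lambda s: s.lower())
+    positions = index.offsets('the')   # [0, 6] with the lowercasing key
+    index.print_concordance('the', width=40, lines=5)
+    return positions
+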
+
+class TokenSearcher(object):
+    """
+    A class that makes it easier to use regular expressions to search
+    over tokenized strings.  The tokenized string is converted to a
+    string where tokens are marked with angle brackets -- e.g.,
+    ``'<the><window><is><still><open>'``.  The regular expression
+    passed to the ``findall()`` method is modified to treat angle
+    brackets as non-capturing parentheses, in addition to matching the
+    token boundaries; and to have ``'.'`` not match the angle brackets.
+    """
+    def __init__(self, tokens):
+        self._raw = ''.join('<'+w+'>' for w in tokens)
+
+    def findall(self, regexp):
+        """
+        Find instances of the regular expression in the text.
+        The text is a list of tokens, and a regexp pattern to match
+        a single token must be surrounded by angle brackets.  E.g.
+
+        >>> from nltk.text import TokenSearcher
+        >>> print('hack'); from nltk.book import text1, text5, text9
+        hack...
+        >>> text5.findall("<.*><.*><bro>")
+        you rule bro; telling you bro; u twizted bro
+        >>> text1.findall("<a>(<.*>)<man>")
+        monied; nervous; dangerous; white; white; white; pious; queer; good;
+        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
+        pale; furious; better; certain; complete; dismasted; younger; brave;
+        brave; brave; brave
+        >>> text9.findall("<th.*>{3,}")
+        thread through those; the thought that; that the thing; the thing
+        that; that that thing; through these than through; them that the;
+        through the thick; them that they; thought that the
+
+        :param regexp: A regular expression
+        :type regexp: str
+        """
+        # preprocess the regular expression
+        regexp = re.sub(r'\s', '', regexp)
+        regexp = re.sub(r'<', '(?:<(?:', regexp)
+        regexp = re.sub(r'>', ')>)', regexp)
+        regexp = re.sub(r'(?<!\\)\.', '[^>]', regexp)
+
+        # perform the search
+        hits = re.findall(regexp, self._raw)
+
+        # Sanity check
+        for h in hits:
+            if not (h.startswith('<') and h.endswith('>')):
+                raise ValueError('Bad regexp for TokenSearcher.findall')
+
+        # postprocess the output
+        hits = [h[1:-1].split('><') for h in hits]
+        return hits
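+
+
+def _token_searcher_example():
+    # A minimal usage sketch for TokenSearcher; the angle-bracket pattern
+    # matches any single token followed by the literal token 'dog'.
+    tokens = ['the', 'big', 'dog', 'bit', 'the', 'small', 'dog']
+    searcher = TokenSearcher(tokens)
+    hits = searcher.findall('<.*><dog>')
+    # hits == [['big', 'dog'], ['small', 'dog']]
+    return hits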
+
+
+@python_2_unicode_compatible
+class Text(object):
+    """
+    A wrapper around a sequence of simple (string) tokens, which is
+    intended to support initial exploration of texts (via the
+    interactive console).  Its methods perform a variety of analyses
+    on the text's contexts (e.g., counting, concordancing, collocation
+    discovery), and display the results.  If you wish to write a
+    program which makes use of these analyses, then you should bypass
+    the ``Text`` class, and use the appropriate analysis function or
+    class directly instead.
+
+    A ``Text`` is typically initialized from a given document or
+    corpus.  E.g.:
+
+    >>> import nltk.corpus
+    >>> from nltk.text import Text
+    >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))
+
+    """
+    # This defeats lazy loading, but makes things faster.  This
+    # *shouldn't* be necessary because the corpus view *should* be
+    # doing intelligent caching, but without this it's running slow.
+    # Look into whether the caching is working correctly.
+    _COPY_TOKENS = True
+
+    def __init__(self, tokens, name=None):
+        """
+        Create a Text object.
+
+        :param tokens: The source text.
+        :type tokens: sequence of str
+        """
+        if self._COPY_TOKENS:
+            tokens = list(tokens)
+        self.tokens = tokens
+
+        if name:
+            self.name = name
+        elif ']' in tokens[:20]:
+            end = tokens[:20].index(']')
+            self.name = " ".join(text_type(tok) for tok in tokens[1:end])
+        else:
+            self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."
+
+    #////////////////////////////////////////////////////////////
+    # Support item & slice access
+    #////////////////////////////////////////////////////////////
+
+    def __getitem__(self, i):
+        return self.tokens[i]
+
+    def __len__(self):
+        return len(self.tokens)
+
+    #////////////////////////////////////////////////////////////
+    # Interactive console methods
+    #////////////////////////////////////////////////////////////
+
+    def concordance(self, word, width=79, lines=25):
+        """
+        Print a concordance for ``word`` with the specified context window.
+        Word matching is not case-sensitive.
+        :seealso: ``ConcordanceIndex``
+        """
+        if '_concordance_index' not in self.__dict__:
+            #print("Building index...")
+            self._concordance_index = ConcordanceIndex(self.tokens,
+                                                       key=lambda s:s.lower())
+
+        self._concordance_index.print_concordance(word, width, lines)
+
+    def collocations(self, num=20, window_size=2):
+        """
+        Print collocations derived from the text, ignoring stopwords.
+
+        :seealso: find_collocations
+        :param num: The maximum number of collocations to print.
+        :type num: int
+        :param window_size: The number of tokens spanned by a collocation (default=2)
+        :type window_size: int
+        """
+        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
+            self._num = num
+            self._window_size = window_size
+
+            #print("Building collocations list")
+            from nltk.corpus import stopwords
+            ignored_words = stopwords.words('english')
+            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
+            finder.apply_freq_filter(2)
+            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
+            bigram_measures = BigramAssocMeasures()
+            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
+        colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
+        print(tokenwrap(colloc_strings, separator="; "))
+
+    def count(self, word):
+        """
+        Count the number of times this word appears in the text.
+        """
+        return self.tokens.count(word)
+
+    def index(self, word):
+        """
+        Find the index of the first occurrence of the word in the text.
+        """
+        return self.tokens.index(word)
+
+    def readability(self, method):
+        # code from nltk_contrib.readability
+        raise NotImplementedError
+
+    def similar(self, word, num=20):
+        """
+        Distributional similarity: find other words which appear in the
+        same contexts as the specified word; list most similar words first.
+
+        :param word: The word used to seed the similarity search
+        :type word: str
+        :param num: The number of words to generate (default=20)
+        :type num: int
+        :seealso: ContextIndex.similar_words()
+        """
+        if '_word_context_index' not in self.__dict__:
+            #print('Building word-context index...')
+            self._word_context_index = ContextIndex(self.tokens,
+                                                    filter=lambda x:x.isalpha(),
+                                                    key=lambda s:s.lower())
+
+#        words = self._word_context_index.similar_words(word, num)
+
+        word = word.lower()
+        wci = self._word_context_index._word_to_contexts
+        if word in wci.conditions():
+            contexts = set(wci[word])
+            fd = Counter(w for w in wci.conditions() for c in wci[w]
+                          if c in contexts and not w == word)
+            words = [w for w, _ in fd.most_common(num)]
+            print(tokenwrap(words))
+        else:
+            print("No matches")
+
+
+    def common_contexts(self, words, num=20):
+        """
+        Find contexts where the specified words appear; list
+        most frequent common contexts first.
+
+        :param words: The words whose common contexts will be found
+        :type words: list(str)
+        :param num: The maximum number of common contexts to print (default=20)
+        :type num: int
+        :seealso: ContextIndex.common_contexts()
+        """
+        if '_word_context_index' not in self.__dict__:
+            #print('Building word-context index...')
+            self._word_context_index = ContextIndex(self.tokens,
+                                                    key=lambda s:s.lower())
+
+        try:
+            fd = self._word_context_index.common_contexts(words, True)
+            if not fd:
+                print("No common contexts were found")
+            else:
+                ranked_contexts = [w for w, _ in fd.most_common(num)]
+                print(tokenwrap(w1+"_"+w2 for w1,w2 in ranked_contexts))
+
+        except ValueError as e:
+            print(e)
+
+    def dispersion_plot(self, words):
+        """
+        Produce a plot showing the distribution of the words through the text.
+        Requires pylab to be installed.
+
+        :param words: The words to be plotted
+        :type words: list(str)
+        :seealso: nltk.draw.dispersion_plot()
+        """
+        from nltk.draw import dispersion_plot
+        dispersion_plot(self, words)
+
+    def generate(self, words):
+        """
+        Issues a reminder to users following the book online
+        """
+        import warnings
+        warnings.warn('The generate() method is no longer available.', DeprecationWarning)
+
+    def plot(self, *args):
+        """
+        See documentation for FreqDist.plot()
+        :seealso: nltk.probability.FreqDist.plot()
+        """
+        self.vocab().plot(*args)
+
+    def vocab(self):
+        """
+        :seealso: nltk.probability.FreqDist
+        """
+        if "_vocab" not in self.__dict__:
+            #print("Building vocabulary index...")
+            self._vocab = FreqDist(self)
+        return self._vocab
+
+    def findall(self, regexp):
+        """
+        Find instances of the regular expression in the text.
+        The text is a list of tokens, and a regexp pattern to match
+        a single token must be surrounded by angle brackets.  E.g.
+
+        >>> print('hack'); from nltk.book import text1, text5, text9
+        hack...
+        >>> text5.findall("<.*><.*><bro>")
+        you rule bro; telling you bro; u twizted bro
+        >>> text1.findall("<a>(<.*>)<man>")
+        monied; nervous; dangerous; white; white; white; pious; queer; good;
+        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
+        pale; furious; better; certain; complete; dismasted; younger; brave;
+        brave; brave; brave
+        >>> text9.findall("<th.*>{3,}")
+        thread through those; the thought that; that the thing; the thing
+        that; that that thing; through these than through; them that the;
+        through the thick; them that they; thought that the
+
+        :param regexp: A regular expression
+        :type regexp: str
+        """
+
+        if "_token_searcher" not in self.__dict__:
+            self._token_searcher = TokenSearcher(self)
+
+        hits = self._token_searcher.findall(regexp)
+        hits = [' '.join(h) for h in hits]
+        print(tokenwrap(hits, "; "))
+
+    #////////////////////////////////////////////////////////////
+    # Helper Methods
+    #////////////////////////////////////////////////////////////
+
+    _CONTEXT_RE = re.compile(r'\w+|[\.\!\?]')
+    def _context(self, tokens, i):
+        """
+        One left & one right token, both case-normalized.  Skip over
+        non-sentence-final punctuation.  Used by the ``ContextIndex``
+        that is created for ``similar()`` and ``common_contexts()``.
+        """
+        # Left context
+        j = i-1
+        while j>=0 and not self._CONTEXT_RE.match(tokens[j]):
+            j -= 1
+        left = (tokens[j] if j >= 0 else '*START*')
+
+        # Right context
+        j = i+1
+        while j<len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
+            j += 1
+        right = (tokens[j] if j != len(tokens) else '*END*')
+
+        return (left, right)
+
+    #////////////////////////////////////////////////////////////
+    # String Display
+    #////////////////////////////////////////////////////////////
+
+    def __str__(self):
+        return '<Text: %s>' % self.name
+
+    def __repr__(self):
+        return '<Text: %s>' % self.name
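+
+
+def _text_example():
+    # A minimal usage sketch for Text over a plain token list; only methods
+    # that need no extra corpus downloads are exercised here.
+    tokens = 'the cat sat on the mat . the dog sat on the rug .'.split()
+    text = Text(tokens, name='toy text')
+    text.concordance('sat', width=40, lines=5)
+    text.similar('mat')        # distributionally similar words, if any
+    vocab = text.vocab()       # FreqDist over the tokens
+    return vocab['the']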
+
+
+# Prototype only; this approach will be slow to load
+class TextCollection(Text):
+    """A collection of texts, which can be loaded with a list of texts, or
+    with a corpus consisting of one or more texts, and which supports
+    counting, concordancing, collocation discovery, etc.  Initialize a
+    TextCollection as follows:
+
+    >>> import nltk.corpus
+    >>> from nltk.text import TextCollection
+    >>> print('hack'); from nltk.book import text1, text2, text3
+    hack...
+    >>> gutenberg = TextCollection(nltk.corpus.gutenberg)
+    >>> mytexts = TextCollection([text1, text2, text3])
+
+    Iterating over a TextCollection produces all the tokens of all the
+    texts in order.
+    """
+    def __init__(self, source):
+        if hasattr(source, 'words'): # bridge to the text corpus reader
+            source = [source.words(f) for f in source.fileids()]
+
+        self._texts = source
+        Text.__init__(self, LazyConcatenation(source))
+        self._idf_cache = {}
+
+    def tf(self, term, text):
+        """ The frequency of the term in text. """
+        return text.count(term) / len(text)
+
+    def idf(self, term):
+        """ The natural log of the number of texts in the corpus divided by
+        the number of texts that the term appears in.
+        If a term does not appear in the corpus, 0.0 is returned. """
+        # idf values are cached for performance.
+        idf = self._idf_cache.get(term)
+        if idf is None:
+            matches = len([True for text in self._texts if term in text])
+            # FIXME Should this raise some kind of error instead?
+            idf = (log(len(self._texts) / matches) if matches else 0.0)
+            self._idf_cache[term] = idf
+        return idf
+
+    def tf_idf(self, term, text):
+        return self.tf(term, text) * self.idf(term)
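+
+
+def _tf_idf_example():
+    # A minimal tf-idf sketch over two toy Text objects; the "documents"
+    # below are illustrative only.
+    doc1 = Text('the cat sat on the mat'.split())
+    doc2 = Text('the dog ate the bone'.split())
+    corpus = TextCollection([doc1, doc2])
+    # 'cat' occurs once among the six tokens of doc1 and in one of the two
+    # texts, so tf = 1/6 and idf = log(2/1).
+    return corpus.tf_idf('cat', doc1)
+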
+
+def demo():
+    from nltk.corpus import brown
+    text = Text(brown.words(categories='news'))
+    print(text)
+    print()
+    print("Concordance:")
+    text.concordance('news')
+    print()
+    print("Distributionally similar words:")
+    text.similar('news')
+    print()
+    print("Collocations:")
+    text.collocations()
+    print()
+    #print("Automatically generated text:")
+    #text.generate()
+    #print()
+    print("Dispersion plot:")
+    text.dispersion_plot(['news', 'report', 'said', 'announced'])
+    print()
+    print("Vocabulary plot:")
+    text.plot(50)
+    print()
+    print("Indexing:")
+    print("text[3]:", text[3])
+    print("text[3:5]:", text[3:5])
+    print("text.vocab()['news']:", text.vocab()['news'])
+
+if __name__ == '__main__':
+    demo()
+
+__all__ = ["ContextIndex",
+           "ConcordanceIndex",
+           "TokenSearcher",
+           "Text",
+           "TextCollection"]
diff --git a/nlp_resource_data/nltk/text.pyc b/nlp_resource_data/nltk/text.pyc
new file mode 100755 (executable)
index 0000000..e0c4eb8
Binary files /dev/null and b/nlp_resource_data/nltk/text.pyc differ
diff --git a/nlp_resource_data/nltk/tgrep.py b/nlp_resource_data/nltk/tgrep.py
new file mode 100755 (executable)
index 0000000..81c7e2b
--- /dev/null
@@ -0,0 +1,938 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Natural Language Toolkit: TGrep search
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Will Roberts <wildwilhelm@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+'''
+============================================
+ TGrep search implementation for NLTK trees
+============================================
+
+This module supports TGrep2 syntax for matching parts of NLTK Trees.
+Note that many tgrep operators require the tree passed to be a
+``ParentedTree``.
+
+External links:
+
+- `Tgrep tutorial <http://www.stanford.edu/dept/linguistics/corpora/cas-tut-tgrep.html>`_
+- `Tgrep2 manual <http://tedlab.mit.edu/~dr/Tgrep2/tgrep2.pdf>`_
+- `Tgrep2 source <http://tedlab.mit.edu/~dr/Tgrep2/>`_
+
+Usage
+=====
+
+>>> from nltk.tree import ParentedTree
+>>> from nltk.tgrep import tgrep_nodes, tgrep_positions
+>>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))')
+>>> list(tgrep_nodes('NN', [tree]))
+[[ParentedTree('NN', ['dog']), ParentedTree('NN', ['cat'])]]
+>>> list(tgrep_positions('NN', [tree]))
+[[(0, 2), (2, 1)]]
+>>> list(tgrep_nodes('DT', [tree]))
+[[ParentedTree('DT', ['the']), ParentedTree('DT', ['a'])]]
+>>> list(tgrep_nodes('DT $ JJ', [tree]))
+[[ParentedTree('DT', ['the'])]]
+
+This implementation adds syntax to select nodes based on their NLTK
+tree position.  This syntax is ``N`` plus a Python tuple representing
+the tree position.  For instance, ``N()``, ``N(0,)``, ``N(0,0)`` are
+valid node selectors.  Example:
+
+>>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))')
+>>> tree[0,0]
+ParentedTree('DT', ['the'])
+>>> tree[0,0].treeposition()
+(0, 0)
+>>> list(tgrep_nodes('N(0,0)', [tree]))
+[[ParentedTree('DT', ['the'])]]
+
+Caveats:
+========
+
+- Link modifiers: "?" and "=" are not implemented.
+- Tgrep compatibility: Using "@" for "!", "{" for "<", "}" for ">" are
+  not implemented.
+- The "=" and "~" links are not implemented.
+
+Known Issues:
+=============
+
+- There are some issues with link relations involving leaf nodes
+  (which are represented as bare strings in NLTK trees).  For
+  instance, consider the tree::
+
+      (S (A x))
+
+  The search string ``* !>> S`` should select all nodes which are not
+  dominated in some way by an ``S`` node (i.e., all nodes which are
+  not descendants of an ``S``).  Clearly, in this tree, the only node
+  which fulfills this criterion is the top node (since it is not
+  dominated by anything).  However, the code here will find both the
+  top node and the leaf node ``x``.  This is because we cannot recover
+  the parent of the leaf, since it is stored as a bare string.
+
+  A possible workaround, when performing this kind of search, would be
+  to filter out all leaf nodes.
+
+Implementation notes
+====================
+
+This implementation is (somewhat awkwardly) based on lambda functions
+which are predicates on a node.  A predicate is a function which is
+either True or False; using a predicate function, we can identify sets
+of nodes with particular properties.  A predicate function, could, for
+instance, return True only if a particular node has a label matching a
+particular regular expression, and has a daughter node which has no
+sisters.  Because tgrep2 search strings can do things statefully (such
+as substituting in macros, and binding nodes with node labels), the
+actual predicate function is declared with three arguments::
+
+    pred = lambda n, m, l: return True # some logic here
+
+``n``
+    is a node in a tree; this argument must always be given
+
+``m``
+    contains a dictionary, mapping macro names onto predicate functions
+
+``l``
+    is a dictionary to map node labels onto nodes in the tree
+
+``m`` and ``l`` are declared to default to ``None``, and so need not be
+specified in a call to a predicate.  Predicates which call other
+predicates must always pass the value of these arguments on.  The
+top-level predicate (constructed by ``_tgrep_exprs_action``) binds the
+macro definitions to ``m`` and initialises ``l`` to an empty dictionary.
+'''
+
+from __future__ import absolute_import, print_function, unicode_literals
+
+import functools
+import re
+
+from six import binary_type, text_type
+
+try:
+    import pyparsing
+except ImportError:
+    print('Warning: nltk.tgrep will not work without the `pyparsing` '
+          'package installed.')
+
+import nltk.tree
+
+class TgrepException(Exception):
+    '''Tgrep exception type.'''
+    pass
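+
+def _predicate_convention_example():
+    # A minimal sketch of the (n, m, l) predicate convention described in the
+    # module docstring; the tree and the NP test are illustrative only.
+    tree = nltk.tree.ParentedTree.fromstring(
+        '(S (NP (DT the) (NN dog)) (VP barks))')
+    def is_np(n, m=None, l=None):
+        return isinstance(n, nltk.tree.Tree) and n.label() == 'NP'
+    return [node for node in tree.subtrees() if is_np(node, None, {})]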
+
+def ancestors(node):
+    '''
+    Returns the list of all nodes dominating the given tree node.
+    This method will not work with leaf nodes, since there is no way
+    to recover the parent.
+    '''
+    results = []
+    try:
+        current = node.parent()
+    except AttributeError:
+        # if node is a leaf, we cannot retrieve its parent
+        return results
+    while current:
+        results.append(current)
+        current = current.parent()
+    return results
+
+def unique_ancestors(node):
+    '''
+    Returns the list of all nodes dominating the given node, where
+    there is only a single path of descent.
+    '''
+    results = []
+    try:
+        current = node.parent()
+    except AttributeError:
+        # if node is a leaf, we cannot retrieve its parent
+        return results
+    while current and len(current) == 1:
+        results.append(current)
+        current = current.parent()
+    return results
+
+def _descendants(node):
+    '''
+    Returns the list of all nodes which are descended from the given
+    tree node in some way.
+    '''
+    try:
+        treepos = node.treepositions()
+    except AttributeError:
+        return []
+    return [node[x] for x in treepos[1:]]
+
+def _leftmost_descendants(node):
+    '''
+    Returns the set of all nodes descended in some way through
+    left branches from this node.
+    '''
+    try:
+        treepos = node.treepositions()
+    except AttributeError:
+        return []
+    return [node[x] for x in treepos[1:] if all(y == 0 for y in x)]
+
+def _rightmost_descendants(node):
+    '''
+    Returns the set of all nodes descended in some way through
+    right branches from this node.
+    '''
+    try:
+        rightmost_leaf = max(node.treepositions())
+    except AttributeError:
+        return []
+    return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)]
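+
+def _descendants_example():
+    # A minimal sketch of the descendant helpers; the tree below is
+    # illustrative only.
+    tree = nltk.tree.ParentedTree.fromstring(
+        '(S (NP (DT the) (NN dog)) (VP (V barks)))')
+    everything = _descendants(tree)             # every node below the root
+    leftmost = _leftmost_descendants(tree)      # NP, DT, 'the'
+    rightmost = _rightmost_descendants(tree)    # VP, V, 'barks'
+    return everything, leftmost, rightmost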
+
+def _istree(obj):
+    '''Predicate to check whether `obj` is a nltk.tree.Tree.'''
+    return isinstance(obj, nltk.tree.Tree)
+
+def _unique_descendants(node):
+    '''
+    Returns the list of all nodes descended from the given node, where
+    there is only a single path of descent.
+    '''
+    results = []
+    current = node
+    while current and _istree(current) and len(current) == 1:
+        current = current[0]
+        results.append(current)
+    return results
+
+def _before(node):
+    '''
+    Returns the set of all nodes that are before the given node.
+    '''
+    try:
+        pos = node.treeposition()
+        tree = node.root()
+    except AttributeError:
+        return []
+    return [tree[x] for x in tree.treepositions()
+            if x[:len(pos)] < pos[:len(x)]]
+
+def _immediately_before(node):
+    '''
+    Returns the set of all nodes that are immediately before the given
+    node.
+
+    Tree node A immediately precedes node B if the last terminal
+    symbol (word) produced by A immediately precedes the first
+    terminal symbol produced by B.
+    '''
+    try:
+        pos = node.treeposition()
+        tree = node.root()
+    except AttributeError:
+        return []
+    # go "upwards" from pos until there is a place we can go to the left
+    idx = len(pos) - 1
+    while 0 <= idx and pos[idx] == 0:
+        idx -= 1
+    if idx < 0:
+        return []
+    pos = list(pos[:idx + 1])
+    pos[-1] -= 1
+    before = tree[pos]
+    return [before] + _rightmost_descendants(before)
+
+def _after(node):
+    '''
+    Returns the set of all nodes that are after the given node.
+    '''
+    try:
+        pos = node.treeposition()
+        tree = node.root()
+    except AttributeError:
+        return []
+    return [tree[x] for x in tree.treepositions()
+            if x[:len(pos)] > pos[:len(x)]]
+
+def _immediately_after(node):
+    '''
+    Returns the set of all nodes that are immediately after the given
+    node.
+
+    Tree node A immediately follows node B if the first terminal
+    symbol (word) produced by A immediately follows the last
+    terminal symbol produced by B.
+    '''
+    try:
+        pos = node.treeposition()
+        tree = node.root()
+        current = node.parent()
+    except AttributeError:
+        return []
+    # go "upwards" from pos until there is a place we can go to the
+    # right
+    idx = len(pos) - 1
+    while 0 <= idx and pos[idx] == len(current) - 1:
+        idx -= 1
+        current = current.parent()
+    if idx < 0:
+        return []
+    pos = list(pos[:idx + 1])
+    pos[-1] += 1
+    after = tree[pos]
+    return [after] + _leftmost_descendants(after)
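+
+def _linear_precedence_example():
+    # A minimal sketch of the linear-precedence helpers; the tree below is
+    # illustrative only.
+    tree = nltk.tree.ParentedTree.fromstring(
+        '(S (NP (DT the) (NN dog)) (VP (V barks)))')
+    np, vp = tree[0], tree[1]
+    following = _immediately_after(np)    # VP and its left-most descendants
+    preceding = _immediately_before(vp)   # NP and its right-most descendants
+    return following, preceding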
+
+def _tgrep_node_literal_value(node):
+    '''
+    Gets the string value of a given parse tree node, for comparison
+    using the tgrep node literal predicates.
+    '''
+    return (node.label() if _istree(node) else text_type(node))
+
+def _tgrep_macro_use_action(_s, _l, tokens):
+    '''
+    Builds a lambda function which looks up the macro name used.
+    '''
+    assert len(tokens) == 1
+    assert tokens[0][0] == '@'
+    macro_name = tokens[0][1:]
+    def macro_use(n, m=None, l=None):
+        if m is None or macro_name not in m:
+            raise TgrepException('macro {0} not defined'.format(macro_name))
+        return m[macro_name](n, m, l)
+    return macro_use
+
+def _tgrep_node_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    depending on the name of its node.
+    '''
+    # print 'node tokens: ', tokens
+    if tokens[0] == "'":
+        # strip initial apostrophe (tgrep2 print command)
+        tokens = tokens[1:]
+    if len(tokens) > 1:
+        # disjunctive definition of a node name
+        assert list(set(tokens[1::2])) == ['|']
+        # recursively call self to interpret each node name definition
+        tokens = [_tgrep_node_action(None, None, [node])
+                  for node in tokens[::2]]
+        # capture tokens and return the disjunction
+        return (lambda t: lambda n, m=None, l=None: any(f(n, m, l) for f in t))(tokens)
+    else:
+        if hasattr(tokens[0], '__call__'):
+            # this is a previously interpreted parenthetical node
+            # definition (lambda function)
+            return tokens[0]
+        elif tokens[0] == '*' or tokens[0] == '__':
+            return lambda n, m=None, l=None: True
+        elif tokens[0].startswith('"'):
+            assert tokens[0].endswith('"')
+            node_lit = tokens[0][1:-1].replace('\\"', '"').replace('\\\\', '\\')
+            return (lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s)(node_lit)
+        elif tokens[0].startswith('/'):
+            assert tokens[0].endswith('/')
+            node_lit = tokens[0][1:-1]
+            return (lambda r: lambda n, m=None, l=None:
+                    r.search(_tgrep_node_literal_value(n)))(re.compile(node_lit))
+        elif tokens[0].startswith('i@'):
+            node_func = _tgrep_node_action(_s, _l, [tokens[0][2:].lower()])
+            return (lambda f: lambda n, m=None, l=None:
+                    f(_tgrep_node_literal_value(n).lower()))(node_func)
+        else:
+            return (lambda s: lambda n, m=None, l=None:
+                    _tgrep_node_literal_value(n) == s)(tokens[0])
+
+def _tgrep_parens_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    from a parenthetical notation.
+    '''
+    # print 'parenthetical tokens: ', tokens
+    assert len(tokens) == 3
+    assert tokens[0] == '('
+    assert tokens[2] == ')'
+    return tokens[1]
+
+def _tgrep_nltk_tree_pos_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    which returns true if the node is located at a specific tree
+    position.
+    '''
+    # recover the tuple from the parsed string
+    node_tree_position = tuple(int(x) for x in tokens if x.isdigit())
+    # capture the node's tree position
+    return (lambda i: lambda n, m=None, l=None: (hasattr(n, 'treeposition') and
+                                                 n.treeposition() == i))(node_tree_position)
+
+def _tgrep_relation_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    depending on its relation to other nodes in the tree.
+    '''
+    # print 'relation tokens: ', tokens
+    # process negation first if needed
+    negated = False
+    if tokens[0] == '!':
+        negated = True
+        tokens = tokens[1:]
+    if tokens[0] == '[':
+        # process square-bracketed relation expressions
+        assert len(tokens) == 3
+        assert tokens[2] == ']'
+        retval = tokens[1]
+    else:
+        # process operator-node relation expressions
+        assert len(tokens) == 2
+        operator, predicate = tokens
+        # A < B       A is the parent of (immediately dominates) B.
+        if operator == '<':
+            retval = lambda n, m=None, l=None: (_istree(n) and
+                                                any(predicate(x, m, l) for x in n))
+        # A > B       A is the child of B.
+        elif operator == '>':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                bool(n.parent()) and
+                                                predicate(n.parent(), m, l))
+        # A <, B      Synonymous with A <1 B.
+        elif operator == '<,' or operator == '<1':
+            retval = lambda n, m=None, l=None: (_istree(n) and
+                                                bool(list(n)) and
+                                                predicate(n[0], m, l))
+        # A >, B      Synonymous with A >1 B.
+        elif operator == '>,' or operator == '>1':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                bool(n.parent()) and
+                                                (n is n.parent()[0]) and
+                                                predicate(n.parent(), m, l))
+        # A <N B      B is the Nth child of A (the first child is <1).
+        elif operator[0] == '<' and operator[1:].isdigit():
+            idx = int(operator[1:])
+            # capture the index parameter
+            retval = (lambda i: lambda n, m=None, l=None: (_istree(n) and
+                                                           bool(list(n)) and
+                                                           0 <= i < len(n) and
+                                                           predicate(n[i], m, l)))(idx - 1)
+        # A >N B      A is the Nth child of B (the first child is >1).
+        elif operator[0] == '>' and operator[1:].isdigit():
+            idx = int(operator[1:])
+            # capture the index parameter
+            retval = (lambda i: lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                           bool(n.parent()) and
+                                                           0 <= i < len(n.parent()) and
+                                                           (n is n.parent()[i]) and
+                                                           predicate(n.parent(), m, l)))(idx - 1)
+        # A <' B      B is the last child of A (also synonymous with A <-1 B).
+        # A <- B      B is the last child of A (synonymous with A <-1 B).
+        elif operator == '<\'' or operator == '<-' or operator == '<-1':
+            retval = lambda n, m=None, l=None: (_istree(n) and bool(list(n))
+                                                and predicate(n[-1], m, l))
+        # A >' B      A is the last child of B (also synonymous with A >-1 B).
+        # A >- B      A is the last child of B (synonymous with A >-1 B).
+        elif operator == '>\'' or operator == '>-' or operator == '>-1':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                bool(n.parent()) and
+                                                (n is n.parent()[-1]) and
+                                                predicate(n.parent(), m, l))
+        # A <-N B        B is the N th-to-last child of A (the last child is <-1).
+        elif operator[:2] == '<-' and operator[2:].isdigit():
+            idx = -int(operator[2:])
+            # capture the index parameter
+            retval = (lambda i: lambda n, m=None, l=None: (_istree(n) and
+                                                           bool(list(n)) and
+                                                           0 <= (i + len(n)) < len(n) and
+                                                           predicate(n[i + len(n)], m, l)))(idx)
+        # A >-N B        A is the N th-to-last child of B (the last child is >-1).
+        elif operator[:2] == '>-' and operator[2:].isdigit():
+            idx = -int(operator[2:])
+            # capture the index parameter
+            retval = (lambda i: lambda n, m=None, l=None:
+                          (hasattr(n, 'parent') and
+                           bool(n.parent()) and
+                           0 <= (i + len(n.parent())) < len(n.parent()) and
+                           (n is n.parent()[i + len(n.parent())]) and
+                           predicate(n.parent(), m, l)))(idx)
+        # A <: B      B is the only child of A
+        elif operator == '<:':
+            retval = lambda n, m=None, l=None: (_istree(n) and
+                                                len(n) == 1 and
+                                                predicate(n[0], m, l))
+        # A >: B      A is the only child of B.
+        elif operator == '>:':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                bool(n.parent()) and
+                                                len(n.parent()) == 1 and
+                                                predicate(n.parent(), m, l))
+        # A << B      A dominates B (A is an ancestor of B).
+        elif operator == '<<':
+            retval = lambda n, m=None, l=None: (_istree(n) and
+                                                any(predicate(x, m, l) for x in _descendants(n)))
+        # A >> B      A is dominated by B (A is a descendant of B).
+        elif operator == '>>':
+            retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in ancestors(n))
+        # A <<, B     B is a left-most descendant of A.
+        elif operator == '<<,' or operator == '<<1':
+            retval = lambda n, m=None, l=None: (_istree(n) and
+                                                any(predicate(x, m, l)
+                                                    for x in _leftmost_descendants(n)))
+        # A >>, B     A is a left-most descendant of B.
+        elif operator == '>>,':
+            retval = lambda n, m=None, l=None: any((predicate(x, m, l) and
+                                                    n in _leftmost_descendants(x))
+                                                   for x in ancestors(n))
+        # A <<' B     B is a right-most descendant of A.
+        elif operator == '<<\'':
+            retval = lambda n, m=None, l=None: (_istree(n) and
+                                                any(predicate(x, m, l)
+                                                    for x in _rightmost_descendants(n)))
+        # A >>' B     A is a right-most descendant of B.
+        elif operator == '>>\'':
+            retval = lambda n, m=None, l=None: any((predicate(x, m, l) and
+                                                    n in _rightmost_descendants(x))
+                                                   for x in ancestors(n))
+        # A <<: B     There is a single path of descent from A and B is on it.
+        elif operator == '<<:':
+            retval = lambda n, m=None, l=None: (_istree(n) and
+                                                any(predicate(x, m, l)
+                                                    for x in _unique_descendants(n)))
+        # A >>: B     There is a single path of descent from B and A is on it.
+        elif operator == '>>:':
+            retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in unique_ancestors(n))
+        # A . B       A immediately precedes B.
+        elif operator == '.':
+            retval = lambda n, m=None, l=None: any(predicate(x, m, l)
+                                                   for x in _immediately_after(n))
+        # A , B       A immediately follows B.
+        elif operator == ',':
+            retval = lambda n, m=None, l=None: any(predicate(x, m, l)
+                                                   for x in _immediately_before(n))
+        # A .. B      A precedes B.
+        elif operator == '..':
+            retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in _after(n))
+        # A ,, B      A follows B.
+        elif operator == ',,':
+            retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in _before(n))
+        # A $ B       A is a sister of B (and A != B).
+        elif operator == '$' or operator == '%':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                bool(n.parent()) and
+                                                any(predicate(x, m, l)
+                                                    for x in n.parent() if x is not n))
+        # A $. B      A is a sister of and immediately precedes B.
+        elif operator == '$.' or operator == '%.':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'right_sibling') and
+                                                bool(n.right_sibling()) and
+                                                predicate(n.right_sibling(), m, l))
+        # A $, B      A is a sister of and immediately follows B.
+        elif operator == '$,' or operator == '%,':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'left_sibling') and
+                                                bool(n.left_sibling()) and
+                                                predicate(n.left_sibling(), m, l))
+        # A $.. B     A is a sister of and precedes B.
+        elif operator == '$..' or operator == '%..':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                hasattr(n, 'parent_index') and
+                                                bool(n.parent()) and
+                                                any(predicate(x, m, l) for x in
+                                                    n.parent()[n.parent_index() + 1:]))
+        # A $,, B     A is a sister of and follows B.
+        elif operator == '$,,' or operator == '%,,':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                hasattr(n, 'parent_index') and
+                                                bool(n.parent()) and
+                                                any(predicate(x, m, l) for x in
+                                                    n.parent()[:n.parent_index()]))
+        else:
+            raise TgrepException(
+                'cannot interpret tgrep operator "{0}"'.format(operator))
+    # now return the built function
+    if negated:
+        return (lambda r: (lambda n, m=None, l=None: not r(n, m, l)))(retval)
+    else:
+        return retval
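+
+def _relation_example():
+    # A minimal sketch of a relation search, using the tgrep_nodes entry point
+    # documented in the module docstring; the tree and the pattern below are
+    # illustrative only.
+    tree = nltk.tree.ParentedTree.fromstring(
+        '(S (NP (DT the) (NN dog)) (VP (V barks)))')
+    # 'NP < NN' matches NP nodes that immediately dominate an NN node.
+    return list(tgrep_nodes('NP < NN', [tree]))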
+
+def _tgrep_conjunction_action(_s, _l, tokens, join_char = '&'):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    from the conjunction of several other such lambda functions.
+
+    This is prototypically called for expressions like
+    (`tgrep_rel_conjunction`)::
+
+        < NP & < AP < VP
+
+    where tokens is a list of predicates representing the relations
+    (`< NP`, `< AP`, and `< VP`), possibly with the character `&`
+    included (as in the example here).
+
+    This is also called for expressions like (`tgrep_node_expr2`)::
+
+        NP < NN
+        S=s < /NP/=n : s < /VP/=v : n .. v
+
+    tokens[0] is a tgrep_expr predicate; tokens[1:] are an (optional)
+    list of segmented patterns (`tgrep_expr_labeled`, processed by
+    `_tgrep_segmented_pattern_action`).
+    '''
+    # filter out the ampersand
+    tokens = [x for x in tokens if x != join_char]
+    # print 'relation conjunction tokens: ', tokens
+    if len(tokens) == 1:
+        return tokens[0]
+    else:
+        return (lambda ts: lambda n, m=None, l=None: all(predicate(n, m, l)
+                                                         for predicate in ts))(tokens)
+
+def _tgrep_segmented_pattern_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a segmented pattern.
+
+    Called for expressions like (`tgrep_expr_labeled`)::
+
+        =s .. =v < =n
+
+    This is a segmented pattern, a tgrep2 expression which begins with
+    a node label.
+
+    The problem is that for segmented_pattern_action (': =v < =s'),
+    the first element (in this case, =v) is specifically selected by
+    virtue of matching a particular node in the tree; to retrieve
+    the node, we need the label, not a lambda function.  For node
+    labels inside a tgrep_node_expr, we need a lambda function which
+    returns true if the node visited is the same as =v.
+
+    We solve this by creating two copies of a node_label_use in the
+    grammar; the label use inside a tgrep_expr_labeled has a separate
+    parse action to the pred use inside a node_expr.  See
+    `_tgrep_node_label_use_action` and
+    `_tgrep_node_label_pred_use_action`.
+    '''
+    # tokens[0] is a string containing the node label
+    node_label = tokens[0]
+    # tokens[1:] is an (optional) list of predicates which must all
+    # hold of the bound node
+    reln_preds = tokens[1:]
+    def pattern_segment_pred(n, m=None, l=None):
+        '''This predicate function ignores its node argument.'''
+        # look up the bound node using its label
+        if l is None or node_label not in l:
+            raise TgrepException('node_label ={0} not bound in pattern'.format(
+                node_label))
+        node = l[node_label]
+        # match the relation predicates against the node
+        return all(pred(node, m, l) for pred in reln_preds)
+    return pattern_segment_pred
+
+def _tgrep_node_label_use_action(_s, _l, tokens):
+    '''
+    Returns the node label used to begin a tgrep_expr_labeled.  See
+    `_tgrep_segmented_pattern_action`.
+
+    Called for expressions like (`tgrep_node_label_use`)::
+
+        =s
+
+    when they appear as the first element of a `tgrep_expr_labeled`
+    expression (see `_tgrep_segmented_pattern_action`).
+
+    It returns the node label.
+    '''
+    assert len(tokens) == 1
+    assert tokens[0].startswith('=')
+    return tokens[0][1:]
+
+def _tgrep_node_label_pred_use_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    which describes the use of a previously bound node label.
+
+    Called for expressions like (`tgrep_node_label_use_pred`)::
+
+        =s
+
+    when they appear inside a tgrep_node_expr (for example, inside a
+    relation).  The predicate returns true if and only if its node
+    argument is identical to the node looked up in the node label
+    dictionary using the node's label.
+    '''
+    assert len(tokens) == 1
+    assert tokens[0].startswith('=')
+    node_label = tokens[0][1:]
+    def node_label_use_pred(n, m=None, l=None):
+        # look up the bound node using its label
+        if l is None or node_label not in l:
+            raise TgrepException('node_label ={0} not bound in pattern'.format(
+                node_label))
+        node = l[node_label]
+        # truth means the given node is this node
+        return n is node
+    return node_label_use_pred
+
+def _tgrep_bind_node_label_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    which can optionally bind a matching node into the tgrep2 string's
+    label_dict.
+
+    Called for expressions like (`tgrep_node_expr2`)::
+
+        /NP/
+        @NP=n
+    '''
+    # tokens[0] is a tgrep_node_expr
+    if len(tokens) == 1:
+        return tokens[0]
+    else:
+        # if present, tokens[1] is the character '=', and tokens[2] is
+        # a tgrep_node_label, a string value containing the node label
+        assert len(tokens) == 3
+        assert tokens[1] == '='
+        node_pred = tokens[0]
+        node_label = tokens[2]
+        def node_label_bind_pred(n, m=None, l=None):
+            if node_pred(n, m, l):
+                # bind `n` into the dictionary `l`
+                if l is None:
+                    raise TgrepException(
+                        'cannot bind node_label {0}: label_dict is None'.format(
+                            node_label))
+                l[node_label] = n
+                return True
+            else:
+                return False
+        return node_label_bind_pred
+
+def _tgrep_rel_disjunction_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    from the disjunction of several other such lambda functions.
+    '''
+    # filter out the pipe
+    tokens = [x for x in tokens if x != '|']
+    # print 'relation disjunction tokens: ', tokens
+    if len(tokens) == 1:
+        return tokens[0]
+    elif len(tokens) == 2:
+        return (lambda a, b: lambda n, m=None, l=None:
+                a(n, m, l) or b(n, m, l))(tokens[0], tokens[1])
+
+def _macro_defn_action(_s, _l, tokens):
+    '''
+    Builds a dictionary structure which defines the given macro.
+    '''
+    assert len(tokens) == 3
+    assert tokens[0] == '@'
+    return {tokens[1]: tokens[2]}
+
+def _tgrep_exprs_action(_s, _l, tokens):
+    '''
+    This is the top-level node in a tgrep2 search string; the
+    predicate function it returns binds together all the state of a
+    tgrep2 search string.
+
+    Builds a lambda function representing a predicate on a tree node
+    from the disjunction of several tgrep expressions.  Also handles
+    macro definitions and macro name binding, and node label
+    definitions and node label binding.
+    '''
+    if len(tokens) == 1:
+        return lambda n, m=None, l=None: tokens[0](n, None, {})
+    # filter out all the semicolons
+    tokens = [x for x in tokens if x != ';']
+    # collect all macro definitions
+    macro_dict = {}
+    macro_defs = [tok for tok in tokens if isinstance(tok, dict)]
+    for macro_def in macro_defs:
+        macro_dict.update(macro_def)
+    # collect all tgrep expressions
+    tgrep_exprs = [tok for tok in tokens if not isinstance(tok, dict)]
+    # create a new scope for the node label dictionary
+    def top_level_pred(n, m=macro_dict, l=None):
+        label_dict = {}
+        # bind macro definitions and OR together all tgrep_exprs
+        return any(predicate(n, m, label_dict) for predicate in tgrep_exprs)
+    return top_level_pred
+
+def _build_tgrep_parser(set_parse_actions = True):
+    '''
+    Builds a pyparsing-based parser object for tokenizing and
+    interpreting tgrep search strings.
+    '''
+    tgrep_op = (pyparsing.Optional('!') +
+                pyparsing.Regex('[$%,.<>][%,.<>0-9-\':]*'))
+    tgrep_qstring = pyparsing.QuotedString(quoteChar='"', escChar='\\',
+                                           unquoteResults=False)
+    tgrep_node_regex = pyparsing.QuotedString(quoteChar='/', escChar='\\',
+                                              unquoteResults=False)
+    tgrep_qstring_icase = pyparsing.Regex(
+        'i@\\"(?:[^"\\n\\r\\\\]|(?:\\\\.))*\\"')
+    tgrep_node_regex_icase = pyparsing.Regex(
+        'i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/')
+    tgrep_node_literal = pyparsing.Regex('[^][ \r\t\n;:.,&|<>()$!@%\'^=]+')
+    tgrep_expr = pyparsing.Forward()
+    tgrep_relations = pyparsing.Forward()
+    tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')'
+    tgrep_nltk_tree_pos = (
+        pyparsing.Literal('N(') +
+        pyparsing.Optional(pyparsing.Word(pyparsing.nums) + ',' +
+                           pyparsing.Optional(pyparsing.delimitedList(
+                    pyparsing.Word(pyparsing.nums), delim=',') +
+                                              pyparsing.Optional(','))) + ')')
+    tgrep_node_label = pyparsing.Regex('[A-Za-z0-9]+')
+    tgrep_node_label_use = pyparsing.Combine('=' + tgrep_node_label)
+    # see _tgrep_segmented_pattern_action
+    tgrep_node_label_use_pred = tgrep_node_label_use.copy()
+    macro_name = pyparsing.Regex('[^];:.,&|<>()[$!@%\'^=\r\t\n ]+')
+    macro_name.setWhitespaceChars('')
+    macro_use = pyparsing.Combine('@' + macro_name)
+    tgrep_node_expr = (tgrep_node_label_use_pred |
+                       macro_use |
+                       tgrep_nltk_tree_pos |
+                       tgrep_qstring_icase |
+                       tgrep_node_regex_icase |
+                       tgrep_qstring |
+                       tgrep_node_regex |
+                       '*' |
+                       tgrep_node_literal)
+    tgrep_node_expr2 = ((tgrep_node_expr +
+                         pyparsing.Literal('=').setWhitespaceChars('') +
+                         tgrep_node_label.copy().setWhitespaceChars('')) |
+                        tgrep_node_expr)
+    tgrep_node = (tgrep_parens |
+                  (pyparsing.Optional("'") +
+                   tgrep_node_expr2 +
+                   pyparsing.ZeroOrMore("|" + tgrep_node_expr)))
+    tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']'
+    tgrep_relation = tgrep_brackets | (tgrep_op + tgrep_node)
+    tgrep_rel_conjunction = pyparsing.Forward()
+    tgrep_rel_conjunction << (tgrep_relation +
+                              pyparsing.ZeroOrMore(pyparsing.Optional('&') +
+                                                   tgrep_rel_conjunction))
+    tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore(
+        "|" + tgrep_relations)
+    tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations)
+    tgrep_expr_labeled = tgrep_node_label_use + pyparsing.Optional(tgrep_relations)
+    tgrep_expr2 = tgrep_expr + pyparsing.ZeroOrMore(':' + tgrep_expr_labeled)
+    macro_defn = (pyparsing.Literal('@') +
+                  pyparsing.White().suppress() +
+                  macro_name +
+                  tgrep_expr2)
+    tgrep_exprs = (pyparsing.Optional(macro_defn + pyparsing.ZeroOrMore(';' + macro_defn) + ';') +
+                   tgrep_expr2 +
+                   pyparsing.ZeroOrMore(';' + (macro_defn | tgrep_expr2)) +
+                   pyparsing.ZeroOrMore(';').suppress())
+    if set_parse_actions:
+        tgrep_node_label_use.setParseAction(_tgrep_node_label_use_action)
+        tgrep_node_label_use_pred.setParseAction(_tgrep_node_label_pred_use_action)
+        macro_use.setParseAction(_tgrep_macro_use_action)
+        tgrep_node.setParseAction(_tgrep_node_action)
+        tgrep_node_expr2.setParseAction(_tgrep_bind_node_label_action)
+        tgrep_parens.setParseAction(_tgrep_parens_action)
+        tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action)
+        tgrep_relation.setParseAction(_tgrep_relation_action)
+        tgrep_rel_conjunction.setParseAction(_tgrep_conjunction_action)
+        tgrep_relations.setParseAction(_tgrep_rel_disjunction_action)
+        macro_defn.setParseAction(_macro_defn_action)
+        # the whole expression is also the conjunction of two
+        # predicates: the first node predicate, and the remaining
+        # relation predicates
+        tgrep_expr.setParseAction(_tgrep_conjunction_action)
+        tgrep_expr_labeled.setParseAction(_tgrep_segmented_pattern_action)
+        tgrep_expr2.setParseAction(functools.partial(_tgrep_conjunction_action,
+                                                     join_char = ':'))
+        tgrep_exprs.setParseAction(_tgrep_exprs_action)
+    return tgrep_exprs.ignore('#' + pyparsing.restOfLine)
+
+def tgrep_tokenize(tgrep_string):
+    '''
+    Tokenizes a TGrep search string into separate tokens.
+    '''
+    parser = _build_tgrep_parser(False)
+    if isinstance(tgrep_string, binary_type):
+        tgrep_string = tgrep_string.decode()
+    return list(parser.parseString(tgrep_string))
+
+def tgrep_compile(tgrep_string):
+    '''
+    Parses (and tokenizes, if necessary) a TGrep search string into a
+    lambda function.
+    '''
+    parser = _build_tgrep_parser(True)
+    if isinstance(tgrep_string, binary_type):
+        tgrep_string = tgrep_string.decode()
+    return list(parser.parseString(tgrep_string, parseAll=True))[0]
+
+def treepositions_no_leaves(tree):
+    '''
+    Returns all the tree positions in the given tree which are not
+    leaf nodes.
+    '''
+    treepositions = tree.treepositions()
+    # leaves are treeposition tuples that are not prefixes of any
+    # other treeposition
+    prefixes = set()
+    for pos in treepositions:
+        for length in range(len(pos)):
+            prefixes.add(pos[:length])
+    return [pos for pos in treepositions if pos in prefixes]
+
+def tgrep_positions(pattern, trees, search_leaves=True):
+    """
+    Return the tree positions in the trees which match the given pattern.
+
+    :param pattern: a tgrep search pattern
+    :type pattern: str or output of tgrep_compile()
+    :param trees: a sequence of NLTK trees (usually ParentedTrees)
+    :type trees: iter(ParentedTree) or iter(Tree)
+    :param search_leaves: whether to return matching leaf nodes
+    :type search_leaves: bool
+    :rtype: iter(tree positions)
+    """
+
+    if isinstance(pattern, (binary_type, text_type)):
+        pattern = tgrep_compile(pattern)
+
+    for tree in trees:
+        try:
+            if search_leaves:
+                positions = tree.treepositions()
+            else:
+                positions = treepositions_no_leaves(tree)
+            yield [position for position in positions
+                      if pattern(tree[position])]
+        except AttributeError:
+            yield []
+
+def tgrep_nodes(pattern, trees, search_leaves=True):
+    """
+    Return the tree nodes in the trees which match the given pattern.
+
+    :param pattern: a tgrep search pattern
+    :type pattern: str or output of tgrep_compile()
+    :param trees: a sequence of NLTK trees (usually ParentedTrees)
+    :type trees: iter(ParentedTree) or iter(Tree)
+    :param search_leaves: whether to return matching leaf nodes
+    :type search_leaves: bool
+    :rtype: iter(tree nodes)
+    """
+
+    if isinstance(pattern, (binary_type, text_type)):
+        pattern = tgrep_compile(pattern)
+
+    for tree in trees:
+        try:
+            if search_leaves:
+                positions = tree.treepositions()
+            else:
+                positions = treepositions_no_leaves(tree)
+            yield [tree[position] for position in positions
+                      if pattern(tree[position])]
+        except AttributeError:
+            yield []
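+
+# --- Illustrative usage sketch (editorial addition, not part of the upstream
+# NLTK sources): a minimal, hedged example of the public helpers defined
+# above.  It assumes nltk.tree.ParentedTree is available and that the tgrep2
+# pattern syntax behaves as documented above ('NP < NN' matches an NP node
+# that immediately dominates an NN node).
+if __name__ == '__main__':
+    from nltk.tree import ParentedTree
+
+    tree = ParentedTree.fromstring(
+        '(S (NP (DT the) (NN dog)) (VP (VBD barked)))')
+    # Tokenize a pattern without interpreting it.
+    print(tgrep_tokenize('NP < NN'))
+    # Compile once and reuse the resulting predicate on individual nodes.
+    predicate = tgrep_compile('NN')
+    print([pos for pos in tree.treepositions() if predicate(tree[pos])])
+    # Or search whole trees with the generator helpers; search_leaves=False
+    # restricts the search to positions from treepositions_no_leaves().
+    print(list(tgrep_positions('NP < NN', [tree])))
+    print(list(tgrep_nodes('NN', [tree], search_leaves=False)))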
diff --git a/nlp_resource_data/nltk/tgrep.pyc b/nlp_resource_data/nltk/tgrep.pyc
new file mode 100755 (executable)
index 0000000..aa8d5eb
Binary files /dev/null and b/nlp_resource_data/nltk/tgrep.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/__init__.py b/nlp_resource_data/nltk/tokenize/__init__.py
new file mode 100755 (executable)
index 0000000..6c16781
--- /dev/null
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Tokenizers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com> (minor additions)
+# Contributors: matthewmc, clouds56
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+r"""
+NLTK Tokenizer Package
+
+Tokenizers divide strings into lists of substrings.  For example,
+tokenizers can be used to find the words and punctuation in a string:
+
+    >>> from nltk.tokenize import word_tokenize
+    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
+    ... two of them.\n\nThanks.'''
+    >>> word_tokenize(s)
+    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
+    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
+
+This particular tokenizer requires the Punkt sentence tokenization
+models to be installed. NLTK also provides a simpler,
+regular-expression based tokenizer, which splits text on whitespace
+and punctuation:
+
+    >>> from nltk.tokenize import wordpunct_tokenize
+    >>> wordpunct_tokenize(s)
+    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
+    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
+
+We can also operate at the level of sentences, using the sentence
+tokenizer directly as follows:
+
+    >>> from nltk.tokenize import sent_tokenize, word_tokenize
+    >>> sent_tokenize(s)
+    ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
+    >>> [word_tokenize(t) for t in sent_tokenize(s)]
+    [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
+    ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]
+
+Caution: when tokenizing a Unicode string, make sure you are not
+using an encoded version of the string (it may be necessary to
+decode it first, e.g. with ``s.decode("utf8")``).
+
+NLTK tokenizers can produce token-spans, represented as tuples of integers
+having the same semantics as string slices, to support efficient comparison
+of tokenizers.  (These methods are implemented as generators.)
+
+    >>> from nltk.tokenize import WhitespaceTokenizer
+    >>> list(WhitespaceTokenizer().span_tokenize(s))
+    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
+    (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
+
+There are numerous ways to tokenize text.  If you need more control over
+tokenization, see the other methods provided in this package.
+
+For further information, please see Chapter 3 of the NLTK book.
+"""
+
+import re
+
+from nltk.data              import load
+from nltk.tokenize.casual   import (TweetTokenizer, casual_tokenize)
+from nltk.tokenize.mwe      import MWETokenizer
+from nltk.tokenize.punkt    import PunktSentenceTokenizer
+from nltk.tokenize.regexp   import (RegexpTokenizer, WhitespaceTokenizer,
+                                    BlanklineTokenizer, WordPunctTokenizer,
+                                    wordpunct_tokenize, regexp_tokenize,
+                                    blankline_tokenize)
+from nltk.tokenize.repp     import ReppTokenizer
+from nltk.tokenize.sexpr    import SExprTokenizer, sexpr_tokenize
+from nltk.tokenize.simple   import (SpaceTokenizer, TabTokenizer, LineTokenizer,
+                                    line_tokenize)
+from nltk.tokenize.texttiling import TextTilingTokenizer
+from nltk.tokenize.toktok   import ToktokTokenizer
+from nltk.tokenize.treebank import TreebankWordTokenizer
+from nltk.tokenize.util     import string_span_tokenize, regexp_span_tokenize
+from nltk.tokenize.stanford_segmenter import StanfordSegmenter
+
+# Standard sentence tokenizer.
+def sent_tokenize(text, language='english'):
+    """
+    Return a sentence-tokenized copy of *text*,
+    using NLTK's recommended sentence tokenizer
+    (currently :class:`.PunktSentenceTokenizer`
+    for the specified language).
+
+    :param text: text to split into sentences
+    :param language: the model name in the Punkt corpus
+    """
+    tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
+    return tokenizer.tokenize(text)
+
+# Standard word tokenizer.
+_treebank_word_tokenizer = TreebankWordTokenizer()
+
+# See discussion on https://github.com/nltk/nltk/pull/1437
+# Additions to TreebankWordTokenizer: split on
+# - chevron quotes u'\xab' and u'\xbb'
+# - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
+
+improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
+improved_close_quote_regex = re.compile(u'([»”’])', re.U)
+improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
+_treebank_word_tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
+_treebank_word_tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
+_treebank_word_tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))
+
+
+def word_tokenize(text, language='english', preserve_line=False):
+    """
+    Return a tokenized copy of *text*,
+    using NLTK's recommended word tokenizer
+    (currently an improved :class:`.TreebankWordTokenizer`
+    along with :class:`.PunktSentenceTokenizer`
+    for the specified language).
+
+    :param text: text to split into words
+    :type text: str
+    :param language: the model name in the Punkt corpus
+    :type language: str
+    :param preserve_line: An option to keep the sentence as it is and not sentence-tokenize it.
+    :type preserve_line: bool
+    """
+    sentences = [text] if preserve_line else sent_tokenize(text, language)
+    return [token for sent in sentences
+            for token in _treebank_word_tokenizer.tokenize(sent)]
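+
+
+# --- Illustrative usage sketch (editorial addition, not part of the upstream
+# NLTK sources).  A minimal, hedged demo of word_tokenize() and of the quote
+# handling added above; passing preserve_line=True skips sent_tokenize(), so
+# the Punkt pickle does not need to be installed for this snippet to run.
+# The token list in the comment is an assumption based on the regexes above,
+# not a recorded doctest.
+if __name__ == '__main__':
+    demo = u'She said «bonjour» and left.'
+    print(word_tokenize(demo, preserve_line=True))
+    # Presumably: ['She', 'said', '«', 'bonjour', '»', 'and', 'left', '.']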
diff --git a/nlp_resource_data/nltk/tokenize/__init__.pyc b/nlp_resource_data/nltk/tokenize/__init__.pyc
new file mode 100755 (executable)
index 0000000..5a37a5a
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/api.py b/nlp_resource_data/nltk/tokenize/api.py
new file mode 100755 (executable)
index 0000000..f38ce86
--- /dev/null
@@ -0,0 +1,77 @@
+# Natural Language Toolkit: Tokenizer Interface
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Tokenizer Interface
+"""
+
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
+from nltk.internals import overridden
+from nltk.tokenize.util import string_span_tokenize
+
+
+@add_metaclass(ABCMeta)
+class TokenizerI(object):
+    """
+    A processing interface for tokenizing a string.
+    Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
+    """
+    @abstractmethod
+    def tokenize(self, s):
+        """
+        Return a tokenized copy of *s*.
+
+        :rtype: list of str
+        """
+        if overridden(self.tokenize_sents):
+            return self.tokenize_sents([s])[0]
+
+    def span_tokenize(self, s):
+        """
+        Identify the tokens using integer offsets ``(start_i, end_i)``,
+        where ``s[start_i:end_i]`` is the corresponding token.
+
+        :rtype: iter(tuple(int, int))
+        """
+        raise NotImplementedError()
+
+    def tokenize_sents(self, strings):
+        """
+        Apply ``self.tokenize()`` to each element of ``strings``.  I.e.:
+
+            return [self.tokenize(s) for s in strings]
+
+        :rtype: list(list(str))
+        """
+        return [self.tokenize(s) for s in strings]
+
+    def span_tokenize_sents(self, strings):
+        """
+        Apply ``self.span_tokenize()`` to each element of ``strings``.  I.e.:
+
+            return [self.span_tokenize(s) for s in strings]
+
+        :rtype: iter(list(tuple(int, int)))
+        """
+        for s in strings:
+            yield list(self.span_tokenize(s))
+
+
+class StringTokenizer(TokenizerI):
+    """A tokenizer that divides a string into substrings by splitting
+    on the specified string (defined in subclasses).
+    """
+
+    def tokenize(self, s):
+        return s.split(self._string)
+
+    def span_tokenize(self, s):
+        for span in string_span_tokenize(s, self._string):
+            yield span
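+
+
+# --- Illustrative sketch (editorial addition, not part of the upstream NLTK
+# sources).  StringTokenizer is meant to be subclassed with a `_string`
+# attribute; the subclass below is hypothetical, only for demonstration, and
+# the commented outputs are assumptions rather than recorded doctests.
+if __name__ == '__main__':
+    class _DemoCommaTokenizer(StringTokenizer):
+        # The split string supplied by the subclass.
+        _string = ','
+
+    tokenizer = _DemoCommaTokenizer()
+    print(tokenizer.tokenize('a,b,c'))             # presumably ['a', 'b', 'c']
+    print(list(tokenizer.span_tokenize('a,b,c')))  # presumably [(0, 1), (2, 3), (4, 5)]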
diff --git a/nlp_resource_data/nltk/tokenize/api.pyc b/nlp_resource_data/nltk/tokenize/api.pyc
new file mode 100755 (executable)
index 0000000..5e5da6d
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/api.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/casual.py b/nlp_resource_data/nltk/tokenize/casual.py
new file mode 100755 (executable)
index 0000000..4a44233
--- /dev/null
@@ -0,0 +1,343 @@
+# coding: utf-8
+#
+# Natural Language Toolkit: Twitter Tokenizer
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Christopher Potts <cgpotts@stanford.edu>
+#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
+#         Pierpaolo Pantone <> (modifications)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+
+"""
+Twitter-aware tokenizer, designed to be flexible and easy to adapt to new
+domains and tasks. The basic logic is this:
+
+1. The tuple REGEXPS defines a list of regular expression
+   strings.
+
+2. The REGEXPS strings are put, in order, into a compiled
+   regular expression object called WORD_RE.
+
+3. The tokenization is done by WORD_RE.findall(s), where s is the
+   user-supplied string, inside the tokenize() method of the class
+   TweetTokenizer.
+
+4. When instantiating TweetTokenizer objects, there are three options:
+   preserve_case, reduce_len and strip_handles.  By default, preserve_case
+   is set to True; if it is set to False, the tokenizer will downcase
+   everything except for emoticons.
+
+"""
+
+
+
+######################################################################
+
+from __future__ import unicode_literals
+import re
+
+from six import int2byte, unichr
+from six.moves import html_entities
+
+######################################################################
+# The following strings are components in the regular expression
+# that is used for tokenizing. It's important that phone_number
+# appears first in the final regex (since it can contain whitespace).
+# It also could matter that tags come after emoticons, due to the
+# possibility of having text like
+#
+#     <:| and some text >:)
+#
+# Most importantly, the final element should always be last, since it
+# does a last ditch whitespace-based tokenization of whatever is left.
+
+# ToDo: Update with http://en.wikipedia.org/wiki/List_of_emoticons ?
+
+# This particular element is used in a couple ways, so we define it
+# with a name:
+EMOTICONS = r"""
+    (?:
+      [<>]?
+      [:;=8]                     # eyes
+      [\-o\*\']?                 # optional nose
+      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
+      |
+      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
+      [\-o\*\']?                 # optional nose
+      [:;=8]                     # eyes
+      [<>]?
+      |
+      <3                         # heart
+    )"""
+
+# URL pattern due to John Gruber, modified by Tom Winzig. See
+# https://gist.github.com/winzig/8894715
+
+URLS = r"""                    # Capture 1: entire matched URL
+  (?:
+  https?:                              # URL protocol and colon
+    (?:
+      /{1,3}                           # 1-3 slashes
+      |                                        #   or
+      [a-z0-9%]                                # Single letter or digit or '%'
+                                       # (Trying not to match e.g. "URI::Escape")
+    )
+    |                                  #   or
+                                       # looks like domain name followed by a slash:
+    [a-z0-9.\-]+[.]
+    (?:[a-z]{2,13})
+    /
+  )
+  (?:                                  # One or more:
+    [^\s()<>{}\[\]]+                   # Run of non-space, non-()<>{}[]
+    |                                  #   or
+    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
+    |
+    \([^\s]+?\)                                # balanced parens, non-recursive: (...)
+  )+
+  (?:                                  # End with:
+    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
+    |
+    \([^\s]+?\)                                # balanced parens, non-recursive: (...)
+    |                                  #   or
+    [^\s`!()\[\]{};:'".,<>?«»“”‘’]   # not a space or one of these punct chars
+  )
+  |                                    # OR, the following to match naked domains:
+  (?:
+       (?<!@)                          # not preceded by a @, avoid matching foo@_gmail.com_
+    [a-z0-9]+
+    (?:[.\-][a-z0-9]+)*
+    [.]
+    (?:[a-z]{2,13})
+    \b
+    /?
+    (?!@)                              # not succeeded by a @,
+                            # avoid matching "foo.na" in "foo.na@example.com"
+  )
+"""
+
+# The components of the tokenizer:
+REGEXPS = (
+    URLS,
+    # Phone numbers:
+    r"""
+    (?:
+      (?:            # (international)
+        \+?[01]
+        [\-\s.]*
+      )?
+      (?:            # (area code)
+        [\(]?
+        \d{3}
+        [\-\s.\)]*
+      )?
+      \d{3}          # exchange
+      [\-\s.]*
+      \d{4}          # base
+    )"""
+    ,
+    # ASCII Emoticons
+    EMOTICONS
+    ,
+    # HTML tags:
+    r"""<[^>\s]+>"""
+    ,
+    # ASCII Arrows
+    r"""[\-]+>|<[\-]+"""
+    ,
+    # Twitter username:
+    r"""(?:@[\w_]+)"""
+    ,
+    # Twitter hashtags:
+    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
+    ,
+    # email addresses
+    r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]"""
+    ,
+    # Remaining word types:
+    r"""
+    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
+    |
+    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
+    |
+    (?:[\w_]+)                     # Words without apostrophes or dashes.
+    |
+    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
+    |
+    (?:\S)                         # Everything else that isn't whitespace.
+    """
+    )
+
+######################################################################
+# This is the core tokenizing regex:
+
+WORD_RE = re.compile(r"""(%s)""" % "|".join(REGEXPS), re.VERBOSE | re.I
+                     | re.UNICODE)
+
+# WORD_RE performs poorly on these patterns:
+HANG_RE = re.compile(r'([^a-zA-Z0-9])\1{3,}')
+
+# The emoticon string gets its own regex so that we can preserve case for
+# them as needed:
+EMOTICON_RE = re.compile(EMOTICONS, re.VERBOSE | re.I | re.UNICODE)
+
+# These are for regularizing HTML entities to Unicode:
+ENT_RE = re.compile(r'&(#?(x?))([^&;\s]+);')
+
+
+######################################################################
+# Functions for converting html entities
+######################################################################
+
+def _str_to_unicode(text, encoding=None, errors='strict'):
+    if encoding is None:
+        encoding = 'utf-8'
+    if isinstance(text, bytes):
+        return text.decode(encoding, errors)
+    return text
+
+def _replace_html_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
+    """
+    Remove entities from text by converting them to their
+    corresponding unicode character.
+
+    :param text: a unicode string or a byte string encoded in the given
+    `encoding` (which defaults to 'utf-8').
+
+    :param list keep:  list of entity names which should not be replaced.\
+    This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
+    and named entities (such as ``&nbsp;`` or ``&gt;``).
+
+    :param bool remove_illegal: If `True`, entities that can't be converted are\
+    removed. Otherwise, entities that can't be converted are kept "as
+    is".
+
+    :returns: A unicode string with the entities removed.
+
+    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
+
+        >>> from nltk.tokenize.casual import _replace_html_entities
+        >>> _replace_html_entities(b'Price: &pound;100')
+        'Price: \\xa3100'
+        >>> print(_replace_html_entities(b'Price: &pound;100'))
+        Price: £100
+        >>>
+    """
+
+    def _convert_entity(match):
+        entity_body = match.group(3)
+        if match.group(1):
+            try:
+                if match.group(2):
+                    number = int(entity_body, 16)
+                else:
+                    number = int(entity_body, 10)
+                # Numeric character references in the 80-9F range are typically
+                # interpreted by browsers as representing the characters mapped
+                # to bytes 80-9F in the Windows-1252 encoding. For more info
+                # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
+                if 0x80 <= number <= 0x9f:
+                    return int2byte(number).decode('cp1252')
+            except ValueError:
+                number = None
+        else:
+            if entity_body in keep:
+                return match.group(0)
+            else:
+                number = html_entities.name2codepoint.get(entity_body)
+        if number is not None:
+            try:
+                return unichr(number)
+            except ValueError:
+                pass
+
+        return "" if remove_illegal else match.group(0)
+
+    return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))
+
+
+######################################################################
+
+class TweetTokenizer:
+    r"""
+    Tokenizer for tweets.
+
+        >>> from nltk.tokenize import TweetTokenizer
+        >>> tknzr = TweetTokenizer()
+        >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
+        >>> tknzr.tokenize(s0)
+        ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
+
+    Examples using the `strip_handles` and `reduce_len` parameters:
+
+        >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
+        >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
+        >>> tknzr.tokenize(s1)
+        [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
+    """
+
+    def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
+        self.preserve_case = preserve_case
+        self.reduce_len = reduce_len
+        self.strip_handles = strip_handles
+
+    def tokenize(self, text):
+        """
+        :param text: str
+        :rtype: list(str)
+        :return: a tokenized list of strings; concatenating this list returns\
+        the original string if `preserve_case=False`
+        """
+        # Fix HTML character entities:
+        text = _replace_html_entities(text)
+        # Remove username handles
+        if self.strip_handles:
+            text = remove_handles(text)
+        # Normalize word lengthening
+        if self.reduce_len:
+            text = reduce_lengthening(text)
+        # Shorten problematic sequences of characters
+        safe_text = HANG_RE.sub(r'\1\1\1', text)
+        # Tokenize:
+        words = WORD_RE.findall(safe_text)
+        # Possibly alter the case, but avoid changing emoticons like :D into :d:
+        if not self.preserve_case:
+            words = list(map((lambda x : x if EMOTICON_RE.search(x) else
+                              x.lower()), words))
+        return words
+
+######################################################################
+# Normalization Functions
+######################################################################
+
+def reduce_lengthening(text):
+    """
+    Replace repeated character sequences of length 3 or greater with sequences
+    of length 3.
+    """
+    pattern = re.compile(r"(.)\1{2,}")
+    return pattern.sub(r"\1\1\1", text)
+
+def remove_handles(text):
+    """
+    Remove Twitter username handles from text.
+    """
+    pattern = re.compile(r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)")
+    # Substitute handles with ' ' to ensure that text on either side of removed handles is tokenized correctly
+    return pattern.sub(' ', text)
+
+######################################################################
+# Tokenization Function
+######################################################################
+
+def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False):
+    """
+    Convenience function for wrapping the tokenizer.
+    """
+    return TweetTokenizer(preserve_case=preserve_case, reduce_len=reduce_len,
+                          strip_handles=strip_handles).tokenize(text)
+
+###############################################################################
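+
+# --- Illustrative usage sketch (editorial addition, not part of the upstream
+# NLTK sources).  A minimal, hedged demo of the convenience wrapper together
+# with the strip_handles and reduce_len options described above.  The token
+# list shown in the comment is an assumption based on the regexes above, not
+# a recorded doctest.
+if __name__ == '__main__':
+    print(casual_tokenize('@someuser Sooooo coooool!!! :-D',
+                          strip_handles=True, reduce_len=True))
+    # Presumably: ['Sooo', 'coool', '!', '!', '!', ':-D']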
diff --git a/nlp_resource_data/nltk/tokenize/casual.pyc b/nlp_resource_data/nltk/tokenize/casual.pyc
new file mode 100755 (executable)
index 0000000..30abc56
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/casual.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/moses.py b/nlp_resource_data/nltk/tokenize/moses.py
new file mode 100755 (executable)
index 0000000..0f7d31d
--- /dev/null
@@ -0,0 +1,634 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit:
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Pidong Wang, Josh Schroeder, Ondrej Bojar, based on code by Philipp Koehn
+# Contributors: Liling Tan, Martijn Pieters, Wiktor Stribizew
+#
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+from __future__ import print_function
+import re
+from six import text_type
+
+from nltk.tokenize.api import TokenizerI
+from nltk.tokenize.util import is_cjk
+from nltk.corpus import perluniprops, nonbreaking_prefixes
+
+
+class MosesTokenizer(TokenizerI):
+    """
+    This is a Python port of the Moses Tokenizer from
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+    >>> tokenizer = MosesTokenizer()
+    >>> text = u'This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
+    >>> expected_tokenized = u'This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
+    >>> tokenized_text = tokenizer.tokenize(text, return_str=True)
+    >>> tokenized_text == expected_tokenized
+    True
+    >>> tokenizer.tokenize(text) == [u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
+    True
+
+    The nonbreaking prefixes should tokenize the final fullstop.
+
+    >>> m = MosesTokenizer()
+    >>> m.tokenize('abc def.')
+    [u'abc', u'def', u'.']
+
+    The nonbreaking prefixes should deal with the situation where a numeric-only prefix is the last token.
+    In the example below, "pp" is the last element, and there is no digit after it.
+
+    >>> m = MosesTokenizer()
+    >>> m.tokenize('2016, pp.')
+    [u'2016', u',', u'pp', u'.']
+    
+    >>> sent = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
+    >>> m.tokenize(sent, escape=True)
+    ['This', 'ain', '&apos;t', 'funny', '.', 'It', '&apos;s', 'actually', 'hillarious', ',', 'yet', 'double', 'Ls', '.', '&#124;', '&#91;', '&#93;', '&lt;', '&gt;', '&#91;', '&#93;', '&amp;', 'You', '&apos;re', 'gonna', 'shake', 'it', 'off', '?', 'Don', '&apos;t', '?']
+    >>> m.tokenize(sent, escape=False)
+    ['This', 'ain', "'t", 'funny', '.', 'It', "'s", 'actually', 'hillarious', ',', 'yet', 'double', 'Ls', '.', '|', '[', ']', '<', '>', '[', ']', '&', 'You', "'re", 'gonna', 'shake', 'it', 'off', '?', 'Don', "'t", '?']
+    """
+
+    # Perl Unicode Properties character sets.
+    IsN = text_type(''.join(perluniprops.chars('IsN')))
+    IsAlnum = text_type(''.join(perluniprops.chars('IsAlnum')))
+    IsSc = text_type(''.join(perluniprops.chars('IsSc')))
+    IsSo = text_type(''.join(perluniprops.chars('IsSo')))
+    IsAlpha = text_type(''.join(perluniprops.chars('IsAlpha')))
+    IsLower = text_type(''.join(perluniprops.chars('IsLower')))
+
+    # Remove ASCII junk.
+    DEDUPLICATE_SPACE = r'\s+', r' '
+    ASCII_JUNK = r'[\000-\037]', r''
+
+    # Neurotic Perl leading space, multi-space and trailing space chomp.
+    # These regexes are kept for reference purposes and shouldn't be used!!
+    MID_STRIP = r" +", r" "     # Use DEDUPLICATE_SPACE instead.
+    LEFT_STRIP = r"^ ", r""     # Uses text.lstrip() instead.
+    RIGHT_STRIP = r" $", r""    # Uses text.rstrip() instead.
+
+    # Pad all "other" special characters not in IsAlnum.
+    PAD_NOT_ISALNUM = u'([^{}\s\.\'\`\,\-])'.format(IsAlnum), r' \1 '
+
+    # Splits all hyphens (regardless of circumstances), e.g.
+    # 'foo -- bar' -> 'foo @-@ @-@ bar' , 'foo-bar' -> 'foo @-@ bar'
+    AGGRESSIVE_HYPHEN_SPLIT = u'([{alphanum}])\-(?=[{alphanum}])'.format(alphanum=IsAlnum), r'\1 \@-\@ '
+
+    # Make multi-dots stay together.
+    REPLACE_DOT_WITH_LITERALSTRING_1 = r'\.([\.]+)', r' DOTMULTI\1'
+    REPLACE_DOT_WITH_LITERALSTRING_2 = r'DOTMULTI\.([^\.])', r'DOTDOTMULTI \1'
+    REPLACE_DOT_WITH_LITERALSTRING_3 = r'DOTMULTI\.', 'DOTDOTMULTI'
+
+    # Separate out "," except if within numbers (5,300)
+    # e.g.  A,B,C,D,E > A , B,C , D,E
+    # First application uses up B so rule can't see B,C
+    # two-step version here may create extra spaces but these are removed later
+    # will also space digit,letter or letter,digit forms (redundant with next section)
+    COMMA_SEPARATE_1 = u'([^{}])[,]'.format(IsN), r'\1 , '
+    COMMA_SEPARATE_2 = u'[,]([^{}])'.format(IsN), r' , \1'
+
+    # Attempt to get correct directional quotes.
+    DIRECTIONAL_QUOTE_1 = r'^``',               r'`` '
+    DIRECTIONAL_QUOTE_2 = r'^"',                r'`` '
+    DIRECTIONAL_QUOTE_3 = r'^`([^`])',          r'` \1'
+    DIRECTIONAL_QUOTE_4 = r"^'",                r'`  '
+    DIRECTIONAL_QUOTE_5 = r'([ ([{<])"',        r'\1 `` '
+    DIRECTIONAL_QUOTE_6 = r'([ ([{<])``',       r'\1 `` '
+    DIRECTIONAL_QUOTE_7 = r'([ ([{<])`([^`])',  r'\1 ` \2'
+    DIRECTIONAL_QUOTE_8 = r"([ ([{<])'",        r'\1 ` '
+
+    # Replace ... with _ELLIPSIS_
+    REPLACE_ELLIPSIS = r'\.\.\.',       r' _ELLIPSIS_ '
+    # Restore _ELLIPSIS_ with ...
+    RESTORE_ELLIPSIS = r'_ELLIPSIS_',   r'\.\.\.'
+
+    # Pad , with tailing space except if within numbers, e.g. 5,300
+    # These are used in nltk.tokenize.moses.penn_tokenize()
+    COMMA_1 = u'([^{numbers}])[,]([^{numbers}])'.format(numbers=IsN), r'\1 , \2'
+    COMMA_2 = u'([{numbers}])[,]([^{numbers}])'.format(numbers=IsN), r'\1 , \2'
+    COMMA_3 = u'([^{numbers}])[,]([{numbers}])'.format(numbers=IsN), r'\1 , \2'
+
+    # Pad unicode symbols with spaces.
+    SYMBOLS = u'([;:@#\$%&{}{}])'.format(IsSc, IsSo), r' \1 '
+
+    # Separate out intra-token slashes.  PTB tokenization doesn't do this, so
+    # the tokens should be merged prior to parsing with a PTB-trained parser.
+    # e.g. "and/or" -> "and @/@ or"
+    INTRATOKEN_SLASHES = u'([{alphanum}])\/([{alphanum}])'.format(alphanum=IsAlnum), r'\1 \@\/\@ \2'
+
+    # Splits final period at end of string.
+    FINAL_PERIOD = r"""([^.])([.])([\]\)}>"']*) ?$""", r'\1 \2\3'
+    # Pad all question marks and exclamation marks with spaces.
+    PAD_QUESTION_EXCLAMATION_MARK = r'([?!])', r' \1 '
+
+    # Handles parentheses, brackets and converts them to PTB symbols.
+    PAD_PARENTHESIS = r'([\]\[\(\){}<>])', r' \1 '
+    CONVERT_PARENTHESIS_1 = r'\(', '-LRB-'
+    CONVERT_PARENTHESIS_2 = r'\)', '-RRB-'
+    CONVERT_PARENTHESIS_3 = r'\[', '-LSB-'
+    CONVERT_PARENTHESIS_4 = r'\]', '-RSB-'
+    CONVERT_PARENTHESIS_5 = r'\{', '-LCB-'
+    CONVERT_PARENTHESIS_6 = r'\}', '-RCB-'
+
+    # Pads double dashes with spaces.
+    PAD_DOUBLE_DASHES = r'--', ' -- '
+
+    # Adds spaces to start and end of string to simplify further regexps.
+    PAD_START_OF_STR = r'^', ' '
+    PAD_END_OF_STR = r'$', ' '
+
+    # Converts double quotes to two single quotes and pad with spaces.
+    CONVERT_DOUBLE_TO_SINGLE_QUOTES = r'"', " '' "
+    # Handles single quote in possessives or close-single-quote.
+    HANDLES_SINGLE_QUOTES = r"([^'])' ", r"\1 ' "
+
+    # Pad apostrophe in possessive or close-single-quote.
+    APOSTROPHE = r"([^'])'", r"\1 ' "
+
+    # Prepend space on contraction apostrophe.
+    CONTRACTION_1 = r"'([sSmMdD]) ", r" '\1 "
+    CONTRACTION_2 = r"'ll ", r" 'll "
+    CONTRACTION_3 = r"'re ", r" 're "
+    CONTRACTION_4 = r"'ve ", r" 've "
+    CONTRACTION_5 = r"n't ", r" n't "
+    CONTRACTION_6 = r"'LL ", r" 'LL "
+    CONTRACTION_7 = r"'RE ", r" 'RE "
+    CONTRACTION_8 = r"'VE ", r" 'VE "
+    CONTRACTION_9 = r"N'T ", r" N'T "
+
+    # Informal Contractions.
+    CONTRACTION_10 = r" ([Cc])annot ",  r" \1an not "
+    CONTRACTION_11 = r" ([Dd])'ye ",    r" \1' ye "
+    CONTRACTION_12 = r" ([Gg])imme ",   r" \1im me "
+    CONTRACTION_13 = r" ([Gg])onna ",   r" \1on na "
+    CONTRACTION_14 = r" ([Gg])otta ",   r" \1ot ta "
+    CONTRACTION_15 = r" ([Ll])emme ",   r" \1em me "
+    CONTRACTION_16 = r" ([Mm])ore'n ",   r" \1ore 'n "
+    CONTRACTION_17 = r" '([Tt])is ",    r" '\1 is "
+    CONTRACTION_18 = r" '([Tt])was ",   r" '\1 was "
+    CONTRACTION_19 = r" ([Ww])anna ",   r" \1an na "
+
+    # Clean out extra spaces
+    CLEAN_EXTRA_SPACE_1 = r'  *', r' '
+    CLEAN_EXTRA_SPACE_2 = r'^ *', r''
+    CLEAN_EXTRA_SPACE_3 = r' *$', r''
+
+    # Neurotic Perl regexes to escape special characters.
+    # These XML escaping regexes are kept such that tokens generated from
+    # NLTK's implementation are consistent with Moses' tokenizer's output.
+    # Outside of the MosesTokenizer function, it's strongly encouraged to use
+    # nltk.tokenize.util.xml_escape() function instead.
+    ESCAPE_AMPERSAND = r'&', r'&amp;'
+    ESCAPE_PIPE = r'\|', r'&#124;'
+    ESCAPE_LEFT_ANGLE_BRACKET = r'<', r'&lt;'
+    ESCAPE_RIGHT_ANGLE_BRACKET = r'>', r'&gt;'
+    ESCAPE_SINGLE_QUOTE = r"\'", r"&apos;"
+    ESCAPE_DOUBLE_QUOTE = r'\"', r'&quot;'
+    ESCAPE_LEFT_SQUARE_BRACKET = r"\[", r"&#91;"
+    ESCAPE_RIGHT_SQUARE_BRACKET = r"]", r"&#93;"
+
+    EN_SPECIFIC_1 = u"([^{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
+    EN_SPECIFIC_2 = u"([^{alpha}{isn}])[']([{alpha}])".format(alpha=IsAlpha, isn=IsN), r"\1 ' \2"
+    EN_SPECIFIC_3 = u"([{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
+    EN_SPECIFIC_4 = u"([{alpha}])[']([{alpha}])".format(alpha=IsAlpha), r"\1 '\2"
+    EN_SPECIFIC_5 = u"([{isn}])[']([s])".format(isn=IsN), r"\1 '\2"
+
+    ENGLISH_SPECIFIC_APOSTROPHE = [EN_SPECIFIC_1, EN_SPECIFIC_2, EN_SPECIFIC_3,
+                                   EN_SPECIFIC_4, EN_SPECIFIC_5]
+
+    FR_IT_SPECIFIC_1 = u"([^{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
+    FR_IT_SPECIFIC_2 = u"([^{alpha}])[']([{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
+    FR_IT_SPECIFIC_3 = u"([{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
+    FR_IT_SPECIFIC_4 = u"([{alpha}])[']([{alpha}])".format(alpha=IsAlpha), r"\1' \2"
+
+    FR_IT_SPECIFIC_APOSTROPHE = [FR_IT_SPECIFIC_1, FR_IT_SPECIFIC_2,
+                                 FR_IT_SPECIFIC_3, FR_IT_SPECIFIC_4]
+
+    NON_SPECIFIC_APOSTROPHE = r"\'", r" \' "
+
+    MOSES_PENN_REGEXES_1 = [DEDUPLICATE_SPACE, ASCII_JUNK, DIRECTIONAL_QUOTE_1,
+                              DIRECTIONAL_QUOTE_2, DIRECTIONAL_QUOTE_3,
+                              DIRECTIONAL_QUOTE_4, DIRECTIONAL_QUOTE_5,
+                              DIRECTIONAL_QUOTE_6, DIRECTIONAL_QUOTE_7,
+                              DIRECTIONAL_QUOTE_8, REPLACE_ELLIPSIS, COMMA_1,
+                              COMMA_2, COMMA_3, SYMBOLS, INTRATOKEN_SLASHES,
+                              FINAL_PERIOD, PAD_QUESTION_EXCLAMATION_MARK,
+                              PAD_PARENTHESIS, CONVERT_PARENTHESIS_1,
+                              CONVERT_PARENTHESIS_2, CONVERT_PARENTHESIS_3,
+                              CONVERT_PARENTHESIS_4, CONVERT_PARENTHESIS_5,
+                              CONVERT_PARENTHESIS_6, PAD_DOUBLE_DASHES,
+                              PAD_START_OF_STR, PAD_END_OF_STR,
+                              CONVERT_DOUBLE_TO_SINGLE_QUOTES,
+                              HANDLES_SINGLE_QUOTES, APOSTROPHE, CONTRACTION_1,
+                              CONTRACTION_2, CONTRACTION_3, CONTRACTION_4,
+                              CONTRACTION_5, CONTRACTION_6, CONTRACTION_7,
+                              CONTRACTION_8, CONTRACTION_9, CONTRACTION_10,
+                              CONTRACTION_11, CONTRACTION_12, CONTRACTION_13,
+                              CONTRACTION_14, CONTRACTION_15, CONTRACTION_16,
+                              CONTRACTION_17, CONTRACTION_18, CONTRACTION_19]
+
+    MOSES_PENN_REGEXES_2 = [RESTORE_ELLIPSIS, CLEAN_EXTRA_SPACE_1,
+                        CLEAN_EXTRA_SPACE_2, CLEAN_EXTRA_SPACE_3,
+                        ESCAPE_AMPERSAND, ESCAPE_PIPE,
+                        ESCAPE_LEFT_ANGLE_BRACKET, ESCAPE_RIGHT_ANGLE_BRACKET,
+                        ESCAPE_SINGLE_QUOTE, ESCAPE_DOUBLE_QUOTE]
+
+    MOSES_ESCAPE_XML_REGEXES = [ESCAPE_AMPERSAND, ESCAPE_PIPE,
+                                ESCAPE_LEFT_ANGLE_BRACKET,
+                                ESCAPE_RIGHT_ANGLE_BRACKET,
+                                ESCAPE_SINGLE_QUOTE, ESCAPE_DOUBLE_QUOTE,
+                                ESCAPE_LEFT_SQUARE_BRACKET,
+                                ESCAPE_RIGHT_SQUARE_BRACKET]
+
+    def __init__(self, lang='en'):
+        # Initialize the object.
+        super(MosesTokenizer, self).__init__()
+        self.lang = lang
+        # Initialize the language specific nonbreaking prefixes.
+        self.NONBREAKING_PREFIXES = [_nbp.strip() for _nbp in nonbreaking_prefixes.words(lang)]
+        self.NUMERIC_ONLY_PREFIXES = [w.rpartition(' ')[0] for w in
+                                      self.NONBREAKING_PREFIXES if
+                                      self.has_numeric_only(w)]
+
+
+
+    def replace_multidots(self, text):
+        text = re.sub(r'\.([\.]+)', r' DOTMULTI\1', text)
+        while re.search(r'DOTMULTI\.', text):
+            text = re.sub(r'DOTMULTI\.([^\.])', r'DOTDOTMULTI \1', text)
+            text = re.sub(r'DOTMULTI\.', 'DOTDOTMULTI', text)
+        return text
+
+    def restore_multidots(self, text):
+        while re.search(r'DOTDOTMULTI', text):
+            text = re.sub(r'DOTDOTMULTI', r'DOTMULTI.', text)
+        return re.sub(r'DOTMULTI', r'.', text)
+
+    def islower(self, text):
+        return not set(text).difference(set(self.IsLower))
+
+    def isalpha(self, text):
+        return not set(text).difference(set(self.IsAlpha))
+
+    def has_numeric_only(self, text):
+        return bool(re.search(r'(.*)[\s]+(\#NUMERIC_ONLY\#)', text))
+
+    def handles_nonbreaking_prefixes(self, text):
+        # Splits the text into tokens to check for nonbreaking prefixes.
+        tokens = text.split()
+        num_tokens = len(tokens)
+        for i, token in enumerate(tokens):
+            # Checks if token ends with a fullstop.
+            token_ends_with_period = re.search(r'^(\S+)\.$', token)
+            if token_ends_with_period:
+                prefix = token_ends_with_period.group(1)
+                # Checks for 3 conditions if
+                # i.   the prefix contains a fullstop and
+                #      any char in the prefix is within the IsAlpha charset
+                # ii.  the prefix is in the list of nonbreaking prefixes and
+                #      does not contain #NUMERIC_ONLY#
+                # iii. the token is not the last token and that the
+                #      next token contains all lowercase.
+                if ( ('.' in prefix and self.isalpha(prefix)) or
+                     (prefix in self.NONBREAKING_PREFIXES and
+                      prefix not in self.NUMERIC_ONLY_PREFIXES) or
+                     (i != num_tokens-1 and self.islower(tokens[i+1])) ):
+                    pass # No change to the token.
+                # Checks if the prefix is in NUMERIC_ONLY_PREFIXES
+                # and ensures that the next word is a digit.
+                elif (prefix in self.NUMERIC_ONLY_PREFIXES and
+                      (i + 1) < num_tokens and
+                      re.search(r'^[0-9]+', tokens[i+1])):
+                    pass # No change to the token.
+                else: # Otherwise, adds a space after the tokens before a dot.
+                    tokens[i] = prefix + ' .'
+        return " ".join(tokens) # Stitch the tokens back.
+
+    def escape_xml(self, text):
+        for regexp, substitution in self.MOSES_ESCAPE_XML_REGEXES:
+            text = re.sub(regexp, substitution, text)
+        return text
+
+    def penn_tokenize(self, text, return_str=False):
+        """
+        This is a Python port of the Penn treebank tokenizer adapted by the Moses
+        machine translation community. It's a little different from the
+        version in nltk.tokenize.treebank.
+        """
+        # Converts input string into unicode.
+        text = text_type(text)
+        # Perform a chain of regex substitutions using MOSES_PENN_REGEXES_1
+        for regexp, substitution in self.MOSES_PENN_REGEXES_1:
+            text = re.sub(regexp, substitution, text)
+        # Handles nonbreaking prefixes.
+        text = self.handles_nonbreaking_prefixes(text)
+        # Restore ellipsis, clean extra spaces, escape XML symbols.
+        for regexp, substitution in self.MOSES_PENN_REGEXES_2:
+            text = re.sub(regexp, substitution, text)
+        return text if return_str else text.split()
+
+    def tokenize(self, text, agressive_dash_splits=False, return_str=False, escape=True):
+        """
+        Python port of the Moses tokenizer.
+
+        >>> mtokenizer = MosesTokenizer()
+        >>> text = u'Is 9.5 or 525,600 my favorite number?'
+        >>> print (mtokenizer.tokenize(text, return_str=True))
+        Is 9.5 or 525,600 my favorite number ?
+        >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
+        >>> print (mtokenizer.tokenize(text, return_str=True))
+        The https : / / github.com / jonsafari / tok-tok / blob / master / tok-tok.pl is a website with / and / or slashes and sort of weird : things
+        >>> text = u'This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
+        >>> expected = u'This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
+        >>> assert mtokenizer.tokenize(text, return_str=True) == expected
+
+        :param tokens: A single string, i.e. sentence text.
+        :type tokens: str
+        :param agressive_dash_splits: Option to trigger dash split rules.
+        :type agressive_dash_splits: bool
+        """
+        # Converts input string into unicode.
+        text = text_type(text)
+
+        # De-duplicate spaces and clean ASCII junk
+        for regexp, substitution in [self.DEDUPLICATE_SPACE, self.ASCII_JUNK]:
+            text = re.sub(regexp, substitution, text)
+        # Strips leading and trailing spaces.
+        text = text.strip()
+        # Separate special characters outside of IsAlnum character set.
+        regexp, substitution = self.PAD_NOT_ISALNUM
+        text = re.sub(regexp, substitution, text)
+        # Aggressively splits dashes
+        if agressive_dash_splits:
+            regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
+            text = re.sub(regexp, substitution, text)
+        # Replaces multidots with "DOTDOTMULTI" literal strings.
+        text = self.replace_multidots(text)
+        # Separate out "," except if within numbers e.g. 5,300
+        for regexp, substitution in [self.COMMA_SEPARATE_1, self.COMMA_SEPARATE_2]:
+            text = re.sub(regexp, substitution, text)
+
+        # (Language-specific) apostrophe tokenization.
+        if self.lang == 'en':
+            for regexp, substitution in self.ENGLISH_SPECIFIC_APOSTROPHE:
+                 text = re.sub(regexp, substitution, text)
+        elif self.lang in ['fr', 'it']:
+            for regexp, substitution in self.FR_IT_SPECIFIC_APOSTROPHE:
+                text = re.sub(regexp, substitution, text)
+        else:
+            regexp, substitution = self.NON_SPECIFIC_APOSTROPHE
+            text = re.sub(regexp, substitution, text)
+
+        # Handles nonbreaking prefixes.
+        text = self.handles_nonbreaking_prefixes(text)
+        # Cleans up extraneous spaces.
+        regexp, substitution = self.DEDUPLICATE_SPACE
+        text = re.sub(regexp,substitution, text).strip()
+        # Restore multidots.
+        text = self.restore_multidots(text)
+        if escape:
+            # Escape XML symbols.
+            text = self.escape_xml(text)
+
+        return text if return_str else text.split()
+
+
+class MosesDetokenizer(TokenizerI):
+    """
+    This is a Python port of the Moses Detokenizer from
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl
+
+    >>> tokenizer = MosesTokenizer()
+    >>> text = u'This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
+    >>> expected_tokenized = u'This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
+    >>> tokenized_text = tokenizer.tokenize(text, return_str=True)
+    >>> tokenized_text == expected_tokenized
+    True
+    >>> detokenizer = MosesDetokenizer()
+    >>> expected_detokenized = u'This, is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
+    >>> detokenized_text = detokenizer.detokenize(tokenized_text.split(), return_str=True)
+    >>> detokenized_text == expected_detokenized
+    True
+
+    >>> from nltk.tokenize.moses import MosesTokenizer, MosesDetokenizer
+    >>> t, d = MosesTokenizer(), MosesDetokenizer()
+    >>> sent = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
+    >>> expected_tokens = [u'This', u'ain', u'&apos;t', u'funny', u'.', u'It', u'&apos;s', u'actually', u'hillarious', u',', u'yet', u'double', u'Ls', u'.', u'&#124;', u'&#91;', u'&#93;', u'&lt;', u'&gt;', u'&#91;', u'&#93;', u'&amp;', u'You', u'&apos;re', u'gonna', u'shake', u'it', u'off', u'?', u'Don', u'&apos;t', u'?']
+    >>> expected_detokens = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [] & You're gonna shake it off? Don't?"
+    >>> tokens = t.tokenize(sent)
+    >>> tokens == expected_tokens
+    True
+    >>> detokens = d.detokenize(tokens)
+    >>> " ".join(detokens) == expected_detokens
+    True
+    
+    >>> d.detokenize(expected_tokens, unescape=True)
+    ['This', "ain't", 'funny.', "It's", 'actually', 'hillarious,', 'yet', 'double', 'Ls.', '|', '[]', '<', '>', '[]', '&', "You're", 'gonna', 'shake', 'it', 'off?', "Don't?"]
+    >>> d.detokenize(expected_tokens, unescape=False)
+    ['This', 'ain', '&apos;t', 'funny.', 'It', '&apos;s', 'actually', 'hillarious,', 'yet', 'double', 'Ls.', '&#124;', '&#91;', '&#93;', '&lt;', '&gt;', '&#91;', '&#93;', '&amp;', 'You', '&apos;re', 'gonna', 'shake', 'it', 'off?', 'Don', '&apos;t?']
+    """
+    # Currency Symbols.
+    IsAlnum = text_type(''.join(perluniprops.chars('IsAlnum')))
+    IsAlpha = text_type(''.join(perluniprops.chars('IsAlpha')))
+    IsSc = text_type(''.join(perluniprops.chars('IsSc')))
+
+    AGGRESSIVE_HYPHEN_SPLIT = r' \@\-\@ ', r'-'
+
+    # Merge multiple spaces.
+    ONE_SPACE = re.compile(r' {2,}'), ' '
+
+    # Unescape special characters.
+    UNESCAPE_FACTOR_SEPARATOR = r'&#124;', r'|'
+    UNESCAPE_LEFT_ANGLE_BRACKET = r'&lt;', r'<'
+    UNESCAPE_RIGHT_ANGLE_BRACKET = r'&gt;', r'>'
+    UNESCAPE_DOUBLE_QUOTE = r'&quot;', r'"'
+    UNESCAPE_SINGLE_QUOTE = r"&apos;", r"'"
+    UNESCAPE_SYNTAX_NONTERMINAL_LEFT = r'&#91;', r'['
+    UNESCAPE_SYNTAX_NONTERMINAL_RIGHT = r'&#93;', r']'
+    UNESCAPE_AMPERSAND = r'&amp;', r'&'
+    # The legacy regexes are used to support outputs from older Moses versions.
+    UNESCAPE_FACTOR_SEPARATOR_LEGACY = r'&bar;', r'|'
+    UNESCAPE_SYNTAX_NONTERMINAL_LEFT_LEGACY = r'&bra;', r'['
+    UNESCAPE_SYNTAX_NONTERMINAL_RIGHT_LEGACY = r'&ket;', r']'
+
+
+    MOSES_UNESCAPE_XML_REGEXES = [UNESCAPE_FACTOR_SEPARATOR_LEGACY,
+                        UNESCAPE_FACTOR_SEPARATOR, UNESCAPE_LEFT_ANGLE_BRACKET,
+                        UNESCAPE_RIGHT_ANGLE_BRACKET,
+                        UNESCAPE_SYNTAX_NONTERMINAL_LEFT_LEGACY,
+                        UNESCAPE_SYNTAX_NONTERMINAL_RIGHT_LEGACY,
+                        UNESCAPE_DOUBLE_QUOTE, UNESCAPE_SINGLE_QUOTE,
+                        UNESCAPE_SYNTAX_NONTERMINAL_LEFT,
+                        UNESCAPE_SYNTAX_NONTERMINAL_RIGHT, UNESCAPE_AMPERSAND]
+
+    FINNISH_MORPHSET_1 = [u'N', u'n', u'A', u'a', u'\xc4', u'\xe4', u'ssa',
+                         u'Ssa', u'ss\xe4', u'Ss\xe4', u'sta', u'st\xe4',
+                         u'Sta', u'St\xe4', u'hun', u'Hun', u'hyn', u'Hyn',
+                         u'han', u'Han', u'h\xe4n', u'H\xe4n', u'h\xf6n',
+                         u'H\xf6n', u'un', u'Un', u'yn', u'Yn', u'an', u'An',
+                         u'\xe4n', u'\xc4n', u'\xf6n', u'\xd6n', u'seen',
+                         u'Seen', u'lla', u'Lla', u'll\xe4', u'Ll\xe4', u'lta',
+                         u'Lta', u'lt\xe4', u'Lt\xe4', u'lle', u'Lle', u'ksi',
+                         u'Ksi', u'kse', u'Kse', u'tta', u'Tta', u'ine', u'Ine']
+
+    FINNISH_MORPHSET_2 = [u'ni', u'si', u'mme', u'nne', u'nsa']
+
+    FINNISH_MORPHSET_3 = [u'ko', u'k\xf6', u'han', u'h\xe4n', u'pa', u'p\xe4',
+                         u'kaan', u'k\xe4\xe4n', u'kin']
+
+    FINNISH_REGEX = u'^({})({})?({})$'.format(text_type('|'.join(FINNISH_MORPHSET_1)),
+                                               text_type('|'.join(FINNISH_MORPHSET_2)),
+                                               text_type('|'.join(FINNISH_MORPHSET_3)))
+
+
+    def __init__(self, lang='en'):
+        super(MosesDetokenizer, self).__init__()
+        self.lang = lang
+
+
+    def unescape_xml(self, text):
+        for regexp, substitution in self.MOSES_UNESCAPE_XML_REGEXES:
+            text = re.sub(regexp, substitution, text)
+        return text
+
+
+    def tokenize(self, tokens, return_str=False, unescape=True):
+        """
+        Python port of the Moses detokenizer.
+
+        :param tokens: A list of strings, i.e. tokenized text.
+        :type tokens: list(str)
+        :return: str
+        """
+        # Convert the list of tokens into a string and pad it with spaces.
+        text = u" {} ".format(" ".join(tokens))
+        # Converts input string into unicode.
+        text = text_type(text)
+        # Detokenize the aggressive hyphen split.
+        regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
+        text = re.sub(regexp, substitution, text)
+        if unescape:
+            # Unescape the XML symbols.
+            text = self.unescape_xml(text)
+        # Keep track of no. of quotation marks.
+        quote_counts = {u"'":0 , u'"':0, u"``":0, u"`":0, u"''":0}
+
+        # The *prepend_space* variable controls the "effects" of
+        # detokenization: as the function loops through the list of tokens,
+        # it updates *prepend_space* while sequentially checking the
+        # language-specific and language-independent conditions.
+        prepend_space = " "
+        detokenized_text = ""
+        tokens = text.split()
+        # Iterate through every token and apply language specific detokenization rule(s).
+        for i, token in enumerate(iter(tokens)):
+            # Check if the first char is CJK.
+            if is_cjk(token[0]):
+                # Perform left shift if this is a second consecutive CJK word.
+                if i > 0 and is_cjk(token[-1]):
+                    detokenized_text += token
+                # But do nothing special if this is a CJK word that doesn't follow a CJK word
+                else:
+                    detokenized_text += prepend_space + token
+                prepend_space = " "
+
+            # If it's a currency symbol.
+            elif token in self.IsSc:
+                # Perform right shift on currency and other random punctuation items
+                detokenized_text += prepend_space + token
+                prepend_space = ""
+
+            elif re.search(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$', token):
+                # In French, these punctuation marks are preceded by a non-breaking space.
+                if self.lang == 'fr' and re.search(r'^[\?\!\:\;\\\%]$', token):
+                    detokenized_text += " "
+                # Perform left shift on punctuation items.
+                detokenized_text += token
+                prepend_space = " "
+
+            elif (self.lang == 'en' and i > 0
+                  and re.search(u"^[\'][{}]".format(self.IsAlpha), token)):
+                  #and re.search(u'[{}]$'.format(self.IsAlnum), tokens[i-1])):
+                # For English, left-shift the contraction.
+                detokenized_text += token
+                prepend_space = " "
+
+            elif (self.lang == 'cs' and i > 1
+                  and re.search(r'^[0-9]+$', tokens[i-2]) # If the token before the previous one is a number.
+                  and re.search(r'^[.,]$', tokens[i-1]) # If the previous token is a dot or comma.
+                  and re.search(r'^[0-9]+$', token)): # If the current token is a number.
+                # In Czech, left-shift floats that are decimal numbers.
+                detokenized_text += token
+                prepend_space = " "
+
+            elif (self.lang in ['fr', 'it'] and i <= len(tokens)-2
+                  and re.search(u'[{}][\']$'.format(self.IsAlpha), token)
+                  and re.search(u'^[{}]$'.format(self.IsAlpha), tokens[i+1])): # If the next token is alpha.
+                # For French and Italian, right-shift the contraction.
+                detokenized_text += prepend_space + token
+                prepend_space = ""
+
+            elif (self.lang == 'cs' and i <= len(tokens)-3
+                  and re.search(u'[{}][\']$'.format(self.IsAlpha), token)
+                  and re.search(u'^[-–]$', tokens[i+1])
+                  and re.search(u'^li$|^mail.*', tokens[i+2], re.IGNORECASE)): # In Perl, ($words[$i+2] =~ /^li$|^mail.*/i)
+                # In Czech, right-shift "-li" and a few Czech dashed words (e.g. e-mail)
+                detokenized_text += prepend_space + token + tokens[i+1]
+                skip_next = True # The dash has already been appended; skip it.
+                prepend_space = ""
+
+            # Combine punctuation smartly.
+            elif re.search(r'''^[\'\"„“`]+$''', token):
+                normalized_quo = token
+                if re.search(r'^[„“”]+$', token):
+                    normalized_quo = '"'
+                quote_counts[normalized_quo] = quote_counts.get(normalized_quo, 0)
+
+                if self.lang == 'cs' and token == u"„":
+                    quote_counts[normalized_quo] = 0
+                if self.lang == 'cs' and token == u"“":
+                    quote_counts[normalized_quo] = 1
+
+
+                if quote_counts[normalized_quo] % 2 == 0:
+                    if (self.lang == 'en' and token == u"'" and i > 0
+                        and re.search(r'[s]$', tokens[i-1]) ):
+                        # Left shift on single quote for possessives ending
+                        # in "s", e.g. "The Jones' house"
+                        detokenized_text += token
+                        prepend_space = " "
+                    else:
+                        # Right shift.
+                        detokenized_text += prepend_space + token
+                        prepend_space = ""
+                        quote_counts[normalized_quo] += 1
+                else:
+                    # Left shift.
+                    detokenized_text += token
+                    prepend_space = " "
+                    quote_counts[normalized_quo] += 1
+
+            elif (self.lang == 'fi' and i > 0 and re.search(r':$', tokens[i-1])
+                  and re.search(self.FINNISH_REGEX, token)):
+                # Finnish : without intervening space if followed by case suffix
+                # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
+                detokenized_text += prepend_space + token
+                prepend_space = " "
+
+            else:
+                detokenized_text += prepend_space + token
+                prepend_space = " "
+
+        # Merge multiple spaces.
+        regexp, substitution = self.ONE_SPACE
+        detokenized_text = re.sub(regexp, substitution, detokenized_text)
+        # Removes leading and trailing spaces.
+        detokenized_text = detokenized_text.strip()
+
+        return detokenized_text if return_str else detokenized_text.split()
+
+    def detokenize(self, tokens, return_str=False, unescape=True):
+        """ Duck-typing the abstract *tokenize()*."""
+        return self.tokenize(tokens, return_str, unescape)
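A quick usage sketch for the detokenizer above (editorial note, not part of the patch; it assumes the package is on the path so the class is importable as nltk.tokenize.moses.MosesDetokenizer):

    from nltk.tokenize.moses import MosesDetokenizer

    detok = MosesDetokenizer(lang='en')
    tokens = ['Hello', ',', 'my', 'name', 'is', 'John', "'s", 'friend', '.']
    # The English contraction rule left-shifts "'s" and the punctuation rules
    # left-shift ',' and '.', so this should print: Hello, my name is John's friend.
    print(detok.detokenize(tokens, return_str=True))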
diff --git a/nlp_resource_data/nltk/tokenize/moses.pyc b/nlp_resource_data/nltk/tokenize/moses.pyc
new file mode 100755 (executable)
index 0000000..e002ded
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/moses.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/mwe.py b/nlp_resource_data/nltk/tokenize/mwe.py
new file mode 100755 (executable)
index 0000000..40b3705
--- /dev/null
@@ -0,0 +1,120 @@
+# Multi-Word Expression tokenizer
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Rob Malouf <rmalouf@mail.sdsu.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Multi-Word Expression Tokenizer
+
+A ``MWETokenizer`` takes a string that has already been divided into tokens
+(i.e. a list of strings) and retokenizes it, merging multi-word expressions
+into single tokens, using a lexicon of MWEs:
+
+
+    >>> from nltk.tokenize import MWETokenizer
+
+    >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
+    >>> tokenizer.add_mwe(('in', 'spite', 'of'))
+
+    >>> tokenizer.tokenize('Testing testing testing one two three'.split())
+    ['Testing', 'testing', 'testing', 'one', 'two', 'three']
+
+    >>> tokenizer.tokenize('This is a test in spite'.split())
+    ['This', 'is', 'a', 'test', 'in', 'spite']
+
+    >>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())
+    ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']
+
+"""
+from nltk.util import Trie
+
+from nltk.tokenize.api import TokenizerI
+
+
+class MWETokenizer(TokenizerI):
+    """A tokenizer that processes tokenized text and merges multi-word expressions
+    into single tokens.
+    """
+
+    def __init__(self, mwes=None, separator='_'):
+        """Initialize the multi-word tokenizer with a list of expressions and a
+        separator
+
+        :type mwes: list(list(str))
+        :param mwes: A sequence of multi-word expressions to be merged, where
+            each MWE is a sequence of strings.
+        :type separator: str
+        :param separator: String that should be inserted between words in a multi-word
+            expression token. (Default is '_')
+
+        """
+        if not mwes:
+            mwes = []
+        self._mwes = Trie(mwes)
+        self._separator = separator
+
+    def add_mwe(self, mwe):
+        """Add a multi-word expression to the lexicon (stored as a word trie)
+
+        We use ``util.Trie`` to represent the trie. Its form is a dict of dicts. 
+        The key True marks the end of a valid MWE.
+
+        :param mwe: The multi-word expression we're adding into the word trie
+        :type mwe: tuple(str) or list(str)
+
+        :Example:
+
+        >>> tokenizer = MWETokenizer()
+        >>> tokenizer.add_mwe(('a', 'b'))
+        >>> tokenizer.add_mwe(('a', 'b', 'c'))
+        >>> tokenizer.add_mwe(('a', 'x'))
+        >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
+        >>> tokenizer._mwes.as_dict() == expected
+        True
+
+        """
+        self._mwes.insert(mwe)
+
+    def tokenize(self, text):
+        """
+
+        :param text: A list containing tokenized text
+        :type text: list(str)
+        :return: A list of the tokenized text with multi-words merged together
+        :rtype: list(str)
+
+        :Example:
+
+        >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
+        >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
+        ['An', "hors+d'oeuvre", 'tonight,', 'sir?']
+        
+        """
+        i = 0
+        n = len(text)
+        result = []
+
+        while i < n:
+            if text[i] in self._mwes:
+                # possible MWE match
+                j = i
+                trie = self._mwes
+                while j < n and text[j] in trie:
+                    trie = trie[text[j]]
+                    j = j + 1
+                # The while-loop above has no break, so we always end up here.
+                if Trie.LEAF in trie:
+                    # success!
+                    result.append(self._separator.join(text[i:j]))
+                    i = j
+                else:
+                    # no match, so backtrack
+                    result.append(text[i])
+                    i += 1
+            else:
+                result.append(text[i])
+                i += 1
+
+        return result
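The trie walk above always tries to extend the current match as far as possible and backtracks by a single token when the scan stops short of a complete MWE; a short sketch of that behaviour (editorial note, not part of the patch):

    from nltk.tokenize import MWETokenizer

    tokenizer = MWETokenizer([('New', 'York', 'City')])
    # Only a prefix of the MWE is present, so the walk backtracks and the
    # tokens are left untouched: ['I', 'love', 'New', 'York']
    print(tokenizer.tokenize('I love New York'.split()))
    # The full MWE is present, so it is merged with the default separator:
    # ['Welcome', 'to', 'New_York_City', '!']
    print(tokenizer.tokenize('Welcome to New York City !'.split()))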
diff --git a/nlp_resource_data/nltk/tokenize/mwe.pyc b/nlp_resource_data/nltk/tokenize/mwe.pyc
new file mode 100755 (executable)
index 0000000..e9bd5f3
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/mwe.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/nist.py b/nlp_resource_data/nltk/tokenize/nist.py
new file mode 100755 (executable)
index 0000000..419732f
--- /dev/null
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Python port of the mteval-v14.pl tokenizer.
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Liling Tan (ported from ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-v14.pl)
+# Contributors: Ozan Caglayan, Wiktor Stribizew
+#
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+"""
+This is a NLTK port of the tokenizer used in the NIST BLEU evaluation script,
+https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
+which was also ported into Python in
+https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
+"""
+
+from __future__ import unicode_literals
+
+import io
+import re
+from six import text_type
+
+from nltk.corpus import perluniprops
+from nltk.tokenize.api import TokenizerI
+from nltk.tokenize.util import xml_unescape
+
+
+class NISTTokenizer(TokenizerI):
+    """
+    This NIST tokenizer is sentence-based, unlike the original
+    paragraph-based tokenization from mteval-v14.pl; the sentence-based
+    tokenization is consistent with the other tokenizers available in NLTK.
+
+    >>> from six import text_type
+    >>> from nltk.tokenize.nist import NISTTokenizer
+    >>> nist = NISTTokenizer()
+    >>> s = "Good muffins cost $3.88 in New York."
+    >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
+    >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
+    >>> nist.tokenize(s, lowercase=False) == expected_cased
+    True
+    >>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.
+    True
+
+    international_tokenize() is the preferred function when tokenizing
+    non-European text, e.g.
+
+    >>> from nltk.tokenize.nist import NISTTokenizer
+    >>> nist = NISTTokenizer()
+
+    # Input strings.
+    >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
+    >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
+    >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'
+
+    # Expected tokens.
+    >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'\u963f\u91cc\u5df4\u5df4\u96c6\u56e2\u63a7\u80a1', u'\u6709\u9650\u516c\u53f8', u')']
+    >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'\u02c8\xe6', u'm']
+    >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'\u697d\u5929\u682a\u5f0f\u4f1a\u793e', u'Rakuten', u'Kabushiki', u'-', u'gaisha']
+
+    >>> nist.international_tokenize(albb)[:10] == expected_albb
+    True
+    >>> nist.international_tokenize(amz)[:10] == expected_amz
+    True
+    >>> nist.international_tokenize(rkt)[:10] == expected_rkt
+    True
+    """
+    # Strip "skipped" tags
+    STRIP_SKIP = re.compile('<skipped>'), ''
+    #  Strip end-of-line hyphenation and join lines
+    STRIP_EOL_HYPHEN = re.compile(u'\u2028'), ' '
+    # Tokenize punctuation.
+    PUNCT = re.compile('([\{-\~\[-\` -\&\(-\+\:-\@\/])'), ' \\1 '
+    # Tokenize period and comma unless preceded by a digit.
+    PERIOD_COMMA_PRECEED = re.compile('([^0-9])([\.,])'), '\\1 \\2 '
+    # Tokenize period and comma unless followed by a digit.
+    PERIOD_COMMA_FOLLOW = re.compile('([\.,])([^0-9])'), ' \\1 \\2'
+    # Tokenize dash when preceded by a digit
+    DASH_PRECEED_DIGIT = re.compile('([0-9])(-)'), '\\1 \\2 '
+
+    LANG_DEPENDENT_REGEXES = [PUNCT, PERIOD_COMMA_PRECEED,
+                              PERIOD_COMMA_FOLLOW, DASH_PRECEED_DIGIT]
+
+    # Perluniprops characters used in NIST tokenizer.
+    pup_number = text_type(''.join(set(perluniprops.chars('Number')))) # i.e. \p{N}
+    pup_punct = text_type(''.join(set(perluniprops.chars('Punctuation')))) # i.e. \p{P}
+    pup_symbol = text_type(''.join(set(perluniprops.chars('Symbol')))) # i.e. \p{S}
+
+    # Python regexes need to escape some special symbols; see
+    # https://stackoverflow.com/q/45670950/610569
+    number_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_number)
+    punct_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_punct)
+    symbol_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_symbol)
+
+    # Note: In the original Perl implementation, \p{Z} and \p{Zl} were used to
+    #       (i) strip leading and trailing spaces and
+    #       (ii) de-duplicate spaces.
+    #       In Python, this would do: ' '.join(str.strip().split())
+    # Thus, the next two lines were commented out.
+    #Line_Separator = text_type(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl}
+    #Separator = text_type(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z}
+
+    # Pads runs of ASCII characters with spaces, separating them from non-ASCII text.
+    NONASCII = re.compile('([\x00-\x7f]+)'), r' \1 '
+    #  Tokenize any punctuation unless followed AND preceded by a digit.
+    PUNCT_1 = re.compile(u"([{n}])([{p}])".format(n=number_regex, p=punct_regex)), '\\1 \\2 '
+    PUNCT_2 = re.compile(u"([{p}])([{n}])".format(n=number_regex, p=punct_regex)), ' \\1 \\2'
+    # Tokenize symbols
+    SYMBOLS = re.compile(u"({s})".format(s=symbol_regex)), ' \\1 '
+
+    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]
+
+    def lang_independent_sub(self, text):
+        """Performs the language independent string substituitions. """
+        # It's a strange order of regexes.
+        # It'll be better to unescape after STRIP_EOL_HYPHEN
+        # but let's keep it close to the original NIST implementation.
+        regexp, substitution = self.STRIP_SKIP
+        text = regexp.sub(substitution, text)
+        text = xml_unescape(text)
+        regexp, substitution = self.STRIP_EOL_HYPHEN
+        text = regexp.sub(substitution, text)
+        return text
+
+    def tokenize(self, text, lowercase=False,
+                 western_lang=True, return_str=False):
+        text = text_type(text)
+        # Language independent regex.
+        text = self.lang_independent_sub(text)
+        # Language dependent regex.
+        if western_lang:
+            # Pad string with whitespace.
+            text = ' ' + text + ' '
+            if lowercase:
+                text = text.lower()
+            for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
+                text = regexp.sub(substitution, text)
+        # Remove contiguous whitespaces.
+        text = ' '.join(text.split())
+        # Finally, strips leading and trailing spaces
+        # and converts output string into unicode.
+        text = text_type(text.strip())
+        return text if return_str else text.split()
+
+    def international_tokenize(self, text, lowercase=False,
+                               split_non_ascii=True,
+                               return_str=False):
+        text = text_type(text)
+        # Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
+        # first before unescaping.
+        regexp, substitution = self.STRIP_SKIP
+        text = regexp.sub(substitution, text)
+        regexp, substitution = self.STRIP_EOL_HYPHEN
+        text = regexp.sub(substitution, text)
+        text = xml_unescape(text)
+
+        if lowercase:
+            text = text.lower()
+
+        for regexp, substitution in self.INTERNATIONAL_REGEXES:
+            text = regexp.sub(substitution, text)
+
+        # Make sure that there's only one space between words.
+        # Strip leading and trailing spaces.
+        text = ' '.join(text.strip().split())
+        return text if return_str else text.split()
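For reference, a small sketch of the string-returning mode of the two entry points above (editorial note, not part of the patch; the lowercased output mirrors the doctest at the top of the class):

    from nltk.tokenize.nist import NISTTokenizer

    nist = NISTTokenizer()
    s = 'Good muffins cost $3.88 in New York.'
    # return_str=True joins the tokens with single spaces, e.g.
    # 'good muffins cost $ 3.88 in new york .'
    print(nist.tokenize(s, lowercase=True, return_str=True))
    # The same flag works for the international path; token boundaries may
    # differ because the Unicode-based regexes above are applied instead.
    print(nist.international_tokenize(s, return_str=True))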
diff --git a/nlp_resource_data/nltk/tokenize/nist.pyc b/nlp_resource_data/nltk/tokenize/nist.pyc
new file mode 100755 (executable)
index 0000000..86f2f30
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/nist.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/punkt.py b/nlp_resource_data/nltk/tokenize/punkt.py
new file mode 100755 (executable)
index 0000000..afd73a1
--- /dev/null
@@ -0,0 +1,1611 @@
+# Natural Language Toolkit: Punkt sentence tokenizer
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Algorithm: Kiss & Strunk (2006)
+# Author: Willy <willy@csse.unimelb.edu.au> (original Python port)
+#         Steven Bird <stevenbird1@gmail.com> (additions)
+#         Edward Loper <edloper@gmail.com> (rewrite)
+#         Joel Nothman <jnothman@student.usyd.edu.au> (almost rewrite)
+#         Arthur Darcet <arthur@darcet.fr> (fixes)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+r"""
+Punkt Sentence Tokenizer
+
+This tokenizer divides a text into a list of sentences,
+by using an unsupervised algorithm to build a model for abbreviation
+words, collocations, and words that start sentences.  It must be
+trained on a large collection of plaintext in the target language
+before it can be used.
+
+The NLTK data package includes a pre-trained Punkt tokenizer for
+English.
+
+    >>> import nltk.data
+    >>> text = '''
+    ... Punkt knows that the periods in Mr. Smith and Johann S. Bach
+    ... do not mark sentence boundaries.  And sometimes sentences
+    ... can start with non-capitalized words.  i is a good variable
+    ... name.
+    ... '''
+    >>> sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
+    >>> print('\n-----\n'.join(sent_detector.tokenize(text.strip())))
+    Punkt knows that the periods in Mr. Smith and Johann S. Bach
+    do not mark sentence boundaries.
+    -----
+    And sometimes sentences
+    can start with non-capitalized words.
+    -----
+    i is a good variable
+    name.
+
+(Note that whitespace from the original text, including newlines, is
+retained in the output.)
+
+Punctuation following sentences is also included by default
+(from NLTK 3.0 onwards). It can be excluded with the realign_boundaries
+flag.
+
+    >>> text = '''
+    ... (How does it deal with this parenthesis?)  "It should be part of the
+    ... previous sentence." "(And the same with this one.)" ('And this one!')
+    ... "('(And (this)) '?)" [(and this. )]
+    ... '''
+    >>> print('\n-----\n'.join(
+    ...     sent_detector.tokenize(text.strip())))
+    (How does it deal with this parenthesis?)
+    -----
+    "It should be part of the
+    previous sentence."
+    -----
+    "(And the same with this one.)"
+    -----
+    ('And this one!')
+    -----
+    "('(And (this)) '?)"
+    -----
+    [(and this. )]
+    >>> print('\n-----\n'.join(
+    ...     sent_detector.tokenize(text.strip(), realign_boundaries=False)))
+    (How does it deal with this parenthesis?
+    -----
+    )  "It should be part of the
+    previous sentence.
+    -----
+    " "(And the same with this one.
+    -----
+    )" ('And this one!
+    -----
+    ')
+    "('(And (this)) '?
+    -----
+    )" [(and this.
+    -----
+    )]
+
+However, Punkt is designed to learn parameters (a list of abbreviations, etc.)
+unsupervised from a corpus similar to the target domain. The pre-packaged models
+may therefore be unsuitable: use ``PunktSentenceTokenizer(text)`` to learn
+parameters from the given text.
+
+:class:`.PunktTrainer` learns parameters such as a list of abbreviations
+(without supervision) from portions of text. Using a ``PunktTrainer`` directly
+allows for incremental training and modification of the hyper-parameters used
+to decide what is considered an abbreviation, etc.
+
+The algorithm for this tokenizer is described in::
+
+  Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence
+    Boundary Detection.  Computational Linguistics 32: 485-525.
+"""
+from __future__ import print_function, unicode_literals, division
+
+# TODO: Make orthographic heuristic less susceptible to overtraining
+# TODO: Frequent sentence starters optionally exclude always-capitalised words
+# FIXME: Problem with ending string with e.g. '!!!' -> '!! !'
+
+import re
+import math
+from collections import defaultdict
+
+from six import string_types
+
+from nltk.compat import unicode_repr, python_2_unicode_compatible
+from nltk.probability import FreqDist
+from nltk.tokenize.api import TokenizerI
+
+######################################################################
+#{ Orthographic Context Constants
+######################################################################
+# The following constants are used to describe the orthographic
+# contexts in which a word can occur.  BEG=beginning, MID=middle,
+# UNK=unknown, UC=uppercase, LC=lowercase, NC=no case.
+
+_ORTHO_BEG_UC    = 1 << 1
+"""Orthographic context: beginning of a sentence with upper case."""
+
+_ORTHO_MID_UC    = 1 << 2
+"""Orthographic context: middle of a sentence with upper case."""
+
+_ORTHO_UNK_UC    = 1 << 3
+"""Orthographic context: unknown position in a sentence with upper case."""
+
+_ORTHO_BEG_LC    = 1 << 4
+"""Orthographic context: beginning of a sentence with lower case."""
+
+_ORTHO_MID_LC    = 1 << 5
+"""Orthographic context: middle of a sentence with lower case."""
+
+_ORTHO_UNK_LC    = 1 << 6
+"""Orthographic context: unknown position in a sentence with lower case."""
+
+_ORTHO_UC = _ORTHO_BEG_UC + _ORTHO_MID_UC + _ORTHO_UNK_UC
+"""Orthographic context: occurs with upper case."""
+
+_ORTHO_LC = _ORTHO_BEG_LC + _ORTHO_MID_LC + _ORTHO_UNK_LC
+"""Orthographic context: occurs with lower case."""
+
+_ORTHO_MAP = {
+        ('initial',  'upper'): _ORTHO_BEG_UC,
+        ('internal', 'upper'): _ORTHO_MID_UC,
+        ('unknown',  'upper'): _ORTHO_UNK_UC,
+        ('initial',  'lower'): _ORTHO_BEG_LC,
+        ('internal', 'lower'): _ORTHO_MID_LC,
+        ('unknown',  'lower'): _ORTHO_UNK_LC,
+}
+"""A map from context position and first-letter case to the
+appropriate orthographic context flag."""
+
+#} (end orthographic context constants)
+######################################################################
+
+######################################################################
+#{ Decision reasons for debugging
+######################################################################
+
+REASON_DEFAULT_DECISION = 'default decision'
+REASON_KNOWN_COLLOCATION = 'known collocation (both words)'
+REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC = 'abbreviation + orthographic heuristic'
+REASON_ABBR_WITH_SENTENCE_STARTER = 'abbreviation + frequent sentence starter'
+REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC = 'initial + orthographic heuristic'
+REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC = 'number + orthographic heuristic'
+REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC = 'initial + special orthographic heuristic'
+
+#} (end decision reasons for debugging)
+######################################################################
+
+######################################################################
+#{ Language-dependent variables
+######################################################################
+
+class PunktLanguageVars(object):
+    """
+    Stores variables, mostly regular expressions, which may be
+    language-dependent for correct application of the algorithm.
+    An extension of this class may modify its properties to suit
+    a language other than English; an instance can then be passed
+    as an argument to PunktSentenceTokenizer and PunktTrainer
+    constructors.
+    """
+
+    __slots__ = ('_re_period_context', '_re_word_tokenizer')
+
+    def __getstate__(self):
+        # All modifications to the class are performed by inheritance.
+        # Non-default parameters to be pickled must be defined in the inherited
+        # class.
+        return 1
+
+    def __setstate__(self, state):
+        return 1
+
+    sent_end_chars = ('.', '?', '!')
+    """Characters which are candidates for sentence boundaries"""
+
+    @property
+    def _re_sent_end_chars(self):
+        return '[%s]' % re.escape(''.join(self.sent_end_chars))
+
+    internal_punctuation = ',:;' # might want to extend this..
+    """sentence internal punctuation, which indicates an abbreviation if
+    preceded by a period-final token."""
+
+    re_boundary_realignment = re.compile(r'["\')\]}]+?(?:\s+|(?=--)|$)',
+            re.MULTILINE)
+    """Used to realign punctuation that should be included in a sentence
+    although it follows the period (or ?, !)."""
+
+    _re_word_start    = r"[^\(\"\`{\[:;&\#\*@\)}\]\-,]"
+    """Excludes some characters from starting word tokens"""
+
+    _re_non_word_chars   = r"(?:[?!)\";}\]\*:@\'\({\[])"
+    """Characters that cannot appear within words"""
+
+    _re_multi_char_punct = r"(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)"
+    """Hyphen and ellipsis are multi-character punctuation"""
+
+    _word_tokenize_fmt = r'''(
+        %(MultiChar)s
+        |
+        (?=%(WordStart)s)\S+?  # Accept word characters until end is found
+        (?= # Sequences marking a word's end
+            \s|                                 # White-space
+            $|                                  # End-of-string
+            %(NonWord)s|%(MultiChar)s|          # Punctuation
+            ,(?=$|\s|%(NonWord)s|%(MultiChar)s) # Comma if at end of word
+        )
+        |
+        \S
+    )'''
+    """Format of a regular expression to split punctuation from words,
+    excluding period."""
+
+    def _word_tokenizer_re(self):
+        """Compiles and returns a regular expression for word tokenization"""
+        try:
+            return self._re_word_tokenizer
+        except AttributeError:
+            self._re_word_tokenizer = re.compile(
+                self._word_tokenize_fmt %
+                {
+                    'NonWord':   self._re_non_word_chars,
+                    'MultiChar': self._re_multi_char_punct,
+                    'WordStart': self._re_word_start,
+                },
+                re.UNICODE | re.VERBOSE
+            )
+            return self._re_word_tokenizer
+
+    def word_tokenize(self, s):
+        """Tokenize a string to split off punctuation other than periods"""
+        return self._word_tokenizer_re().findall(s)
+
+    _period_context_fmt = r"""
+        \S*                          # some word material
+        %(SentEndChars)s             # a potential sentence ending
+        (?=(?P<after_tok>
+            %(NonWord)s              # either other punctuation
+            |
+            \s+(?P<next_tok>\S+)     # or whitespace and some other token
+        ))"""
+    """Format of a regular expression to find contexts including possible
+    sentence boundaries. Matches token which the possible sentence boundary
+    ends, and matches the following token within a lookahead expression."""
+
+    def period_context_re(self):
+        """Compiles and returns a regular expression to find contexts
+        including possible sentence boundaries."""
+        try:
+            return self._re_period_context
+        except AttributeError:
+            self._re_period_context = re.compile(
+                self._period_context_fmt %
+                {
+                    'NonWord':      self._re_non_word_chars,
+                    'SentEndChars': self._re_sent_end_chars,
+                },
+                re.UNICODE | re.VERBOSE)
+            return self._re_period_context
+
+
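+# Editor's sketch (not part of the original module): as the class docstring
+# above notes, a language-specific variant is created by subclassing
+# PunktLanguageVars and passing an instance to the tokenizer/trainer
+# constructors, e.g. (hypothetical class name):
+#
+#     class ArabicPunctLangVars(PunktLanguageVars):
+#         sent_end_chars = ('.', '?', '!', '؟')  # also treat the Arabic question mark as sentence-final
+#
+#     tokenizer = PunktSentenceTokenizer(lang_vars=ArabicPunctLangVars())
+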
+_re_non_punct = re.compile(r'[^\W\d]', re.UNICODE)
+"""Matches token types that are not merely punctuation. (Types for
+numeric tokens are changed to ##number## and hence contain alpha.)"""
+
+#}
+######################################################################
+
+
+
+#////////////////////////////////////////////////////////////
+#{ Helper Functions
+#////////////////////////////////////////////////////////////
+
+def _pair_iter(it):
+    """
+    Yields pairs of tokens from the given iterator such that each input
+    token will appear as the first element in a yielded tuple. The last
+    pair will have None as its second element.
+    """
+    it = iter(it)
+    try:
+        prev = next(it)
+    except StopIteration:
+        # An empty iterator produces no pairs at all.
+        return
+    for el in it:
+        yield (prev, el)
+        prev = el
+    yield (prev, None)
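+
+# Editor's sketch (not part of the original module), showing the pairing
+# behaviour described in the docstring above:
+#
+#     list(_pair_iter(['a', 'b', 'c']))  ->  [('a', 'b'), ('b', 'c'), ('c', None)]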
+
+######################################################################
+#{ Punkt Parameters
+######################################################################
+
+class PunktParameters(object):
+    """Stores data used to perform sentence boundary detection with Punkt."""
+
+    def __init__(self):
+        self.abbrev_types = set()
+        """A set of word types for known abbreviations."""
+
+        self.collocations = set()
+        """A set of word type tuples for known common collocations
+        where the first word ends in a period.  E.g., ('S.', 'Bach')
+        is a common collocation in a text that discusses 'Johann
+        S. Bach'.  These count as negative evidence for sentence
+        boundaries."""
+
+        self.sent_starters = set()
+        """A set of word types for words that often appear at the
+        beginning of sentences."""
+
+        self.ortho_context = defaultdict(int)
+        """A dictionary mapping word types to the set of orthographic
+        contexts that word type appears in.  Contexts are represented
+        by adding orthographic context flags: ..."""
+
+    def clear_abbrevs(self):
+        self.abbrev_types = set()
+
+    def clear_collocations(self):
+        self.collocations = set()
+
+    def clear_sent_starters(self):
+        self.sent_starters = set()
+
+    def clear_ortho_context(self):
+        self.ortho_context = defaultdict(int)
+
+    def add_ortho_context(self, typ, flag):
+        self.ortho_context[typ] |= flag
+
+    def _debug_ortho_context(self, typ):
+        c = self.ortho_context[typ]
+        if c & _ORTHO_BEG_UC:
+            yield 'BEG-UC'
+        if c & _ORTHO_MID_UC:
+            yield 'MID-UC'
+        if c & _ORTHO_UNK_UC:
+            yield 'UNK-UC'
+        if c & _ORTHO_BEG_LC:
+            yield 'BEG-LC'
+        if c & _ORTHO_MID_LC:
+            yield 'MID-LC'
+        if c & _ORTHO_UNK_LC:
+            yield 'UNK-LC'
+
+######################################################################
+#{ PunktToken
+######################################################################
+
+@python_2_unicode_compatible
+class PunktToken(object):
+    """Stores a token of text with annotations produced during
+    sentence boundary detection."""
+
+    _properties = [
+        'parastart', 'linestart',
+        'sentbreak', 'abbr', 'ellipsis'
+    ]
+    __slots__ = ['tok', 'type', 'period_final'] + _properties
+
+    def __init__(self, tok, **params):
+        self.tok = tok
+        self.type = self._get_type(tok)
+        self.period_final = tok.endswith('.')
+
+        for p in self._properties:
+            setattr(self, p, None)
+        for k in params:
+            setattr(self, k, params[k])
+
+    #////////////////////////////////////////////////////////////
+    #{ Regular expressions for properties
+    #////////////////////////////////////////////////////////////
+    # Note: [A-Za-z] is approximated by [^\W\d] in the general case.
+    _RE_ELLIPSIS = re.compile(r'\.\.+$')
+    _RE_NUMERIC = re.compile(r'^-?[\.,]?\d[\d,\.-]*\.?$')
+    _RE_INITIAL = re.compile(r'[^\W\d]\.$', re.UNICODE)
+    _RE_ALPHA = re.compile(r'[^\W\d]+$', re.UNICODE)
+
+    #////////////////////////////////////////////////////////////
+    #{ Derived properties
+    #////////////////////////////////////////////////////////////
+
+    def _get_type(self, tok):
+        """Returns a case-normalized representation of the token."""
+        return self._RE_NUMERIC.sub('##number##', tok.lower())
+
+    @property
+    def type_no_period(self):
+        """
+        The type with its final period removed if it has one.
+        """
+        if len(self.type) > 1 and self.type[-1] == '.':
+            return self.type[:-1]
+        return self.type
+
+    @property
+    def type_no_sentperiod(self):
+        """
+        The type with its final period removed if it is marked as a
+        sentence break.
+        """
+        if self.sentbreak:
+            return self.type_no_period
+        return self.type
+
+    @property
+    def first_upper(self):
+        """True if the token's first character is uppercase."""
+        return self.tok[0].isupper()
+
+    @property
+    def first_lower(self):
+        """True if the token's first character is lowercase."""
+        return self.tok[0].islower()
+
+    @property
+    def first_case(self):
+        if self.first_lower:
+            return 'lower'
+        elif self.first_upper:
+            return 'upper'
+        return 'none'
+
+    @property
+    def is_ellipsis(self):
+        """True if the token text is that of an ellipsis."""
+        return self._RE_ELLIPSIS.match(self.tok)
+
+    @property
+    def is_number(self):
+        """True if the token text is that of a number."""
+        return self.type.startswith('##number##')
+
+    @property
+    def is_initial(self):
+        """True if the token text is that of an initial."""
+        return self._RE_INITIAL.match(self.tok)
+
+    @property
+    def is_alpha(self):
+        """True if the token text is all alphabetic."""
+        return self._RE_ALPHA.match(self.tok)
+
+    @property
+    def is_non_punct(self):
+        """True if the token is either a number or is alphabetic."""
+        return _re_non_punct.search(self.type)
+
+    #////////////////////////////////////////////////////////////
+    #{ String representation
+    #////////////////////////////////////////////////////////////
+
+    def __repr__(self):
+        """
+        A string representation of the token that can reproduce it
+        with eval(), which lists all the token's non-default
+        annotations.
+        """
+        typestr = (' type=%s,' % unicode_repr(self.type)
+                   if self.type != self.tok else '')
+
+        propvals = ', '.join(
+            '%s=%s' % (p, unicode_repr(getattr(self, p)))
+            for p in self._properties
+            if getattr(self, p)
+        )
+
+        return '%s(%s,%s %s)' % (self.__class__.__name__,
+            unicode_repr(self.tok), typestr, propvals)
+
+    def __str__(self):
+        """
+        A string representation akin to that used by Kiss and Strunk.
+        """
+        res = self.tok
+        if self.abbr:
+            res += '<A>'
+        if self.ellipsis:
+            res += '<E>'
+        if self.sentbreak:
+            res += '<S>'
+        return res
+
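+# Editor's sketch (not part of the original module): typical annotations on a
+# freshly constructed token, before any first-pass annotation has run:
+#
+#     tok = PunktToken('S.')
+#     tok.type              # 's.'   (case-normalized)
+#     tok.type_no_period    # 's'
+#     tok.period_final      # True
+#     bool(tok.is_initial)  # True   (a single letter followed by a period)
+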
+######################################################################
+#{ Punkt base class
+######################################################################
+
+class PunktBaseClass(object):
+    """
+    Includes common components of PunktTrainer and PunktSentenceTokenizer.
+    """
+
+    def __init__(self, lang_vars=PunktLanguageVars(), token_cls=PunktToken,
+            params=None):
+        if params is None:
+            params = PunktParameters()
+        self._params = params
+        """The collection of parameters that determines the behavior
+        of the punkt tokenizer."""
+        self._lang_vars = lang_vars
+        self._Token = token_cls
+
+    #////////////////////////////////////////////////////////////
+    #{ Word tokenization
+    #////////////////////////////////////////////////////////////
+
+    def _tokenize_words(self, plaintext):
+        """
+        Divide the given text into tokens, using the punkt word
+        segmentation regular expression, and generate the resulting tokens
+        as token objects (``PunktToken`` by default) whose ``parastart`` and
+        ``linestart`` flags record whether the token occurs at the start of a
+        paragraph or of a new line, respectively.
+        """
+        parastart = False
+        for line in plaintext.split('\n'):
+            if line.strip():
+                line_toks = iter(self._lang_vars.word_tokenize(line))
+
+                yield self._Token(next(line_toks),
+                        parastart=parastart, linestart=True)
+                parastart = False
+
+                for t in line_toks:
+                    yield self._Token(t)
+            else:
+                parastart = True
+
+
+    #////////////////////////////////////////////////////////////
+    #{ Annotation Procedures
+    #////////////////////////////////////////////////////////////
+
+    def _annotate_first_pass(self, tokens):
+        """
+        Perform the first pass of annotation, which makes decisions
+        based purely on the word type of each word:
+
+          - '?', '!', and '.' are marked as sentence breaks.
+          - sequences of two or more periods are marked as ellipsis.
+          - any word ending in '.' that's a known abbreviation is
+            marked as an abbreviation.
+          - any other word ending in '.' is marked as a sentence break.
+
+        The tokens are not collected into separate sets; instead each token is
+        annotated in place (via its ``sentbreak``, ``abbr`` and ``ellipsis``
+        attributes) and yielded back one at a time.
+        """
+        for aug_tok in tokens:
+            self._first_pass_annotation(aug_tok)
+            yield aug_tok
+
+    def _first_pass_annotation(self, aug_tok):
+        """
+        Performs type-based annotation on a single token.
+        """
+
+        tok = aug_tok.tok
+
+        if tok in self._lang_vars.sent_end_chars:
+            aug_tok.sentbreak = True
+        elif aug_tok.is_ellipsis:
+            aug_tok.ellipsis = True
+        elif aug_tok.period_final and not tok.endswith('..'):
+            if (tok[:-1].lower() in self._params.abbrev_types or
+                tok[:-1].lower().split('-')[-1] in self._params.abbrev_types):
+
+                aug_tok.abbr = True
+            else:
+                aug_tok.sentbreak = True
+
+        return
+
+######################################################################
+#{ Punkt Trainer
+######################################################################
+
+
+class PunktTrainer(PunktBaseClass):
+    """Learns parameters used in Punkt sentence boundary detection."""
+
+    def __init__(self, train_text=None, verbose=False,
+            lang_vars=PunktLanguageVars(), token_cls=PunktToken):
+
+        PunktBaseClass.__init__(self, lang_vars=lang_vars,
+                token_cls=token_cls)
+
+        self._type_fdist = FreqDist()
+        """A frequency distribution giving the frequency of each
+        case-normalized token type in the training data."""
+
+        self._num_period_toks = 0
+        """The number of words ending in period in the training data."""
+
+        self._collocation_fdist = FreqDist()
+        """A frequency distribution giving the frequency of all
+        bigrams in the training data where the first word ends in a
+        period.  Bigrams are encoded as tuples of word types.
+        Especially common collocations are extracted from this
+        frequency distribution, and stored in
+        ``_params.collocations``."""
+
+        self._sent_starter_fdist = FreqDist()
+        """A frequency distribution giving the frequency of all words
+        that occur in the training data at the beginning of a sentence
+        (after the first pass of annotation).  Especially common
+        sentence starters are extracted from this frequency
+        distribution, and stored in ``_params.sent_starters``.
+        """
+
+        self._sentbreak_count = 0
+        """The total number of sentence breaks identified in training, used for
+        calculating the frequent sentence starter heuristic."""
+
+        self._finalized = True
+        """A flag as to whether the training has been finalized by finding
+        collocations and sentence starters, or whether finalize_training()
+        still needs to be called."""
+
+        if train_text:
+            self.train(train_text, verbose, finalize=True)
+
+    def get_params(self):
+        """
+        Calculates and returns parameters for sentence boundary detection as
+        derived from training."""
+        if not self._finalized:
+            self.finalize_training()
+        return self._params
+
+    #////////////////////////////////////////////////////////////
+    #{ Customization Variables
+    #////////////////////////////////////////////////////////////
+
+    ABBREV = 0.3
+    """cut-off value whether a 'token' is an abbreviation"""
+
+    IGNORE_ABBREV_PENALTY = False
+    """allows the disabling of the abbreviation penalty heuristic, which
+    exponentially disadvantages words that are found at times without a
+    final period."""
+
+    ABBREV_BACKOFF = 5
+    """upper cut-off for Mikheev's(2002) abbreviation detection algorithm"""
+
+    COLLOCATION = 7.88
+    """minimal log-likelihood value that two tokens need to be considered
+    as a collocation"""
+
+    SENT_STARTER = 30
+    """minimal log-likelihood value that a token requires to be considered
+    as a frequent sentence starter"""
+
+    INCLUDE_ALL_COLLOCS = False
+    """this includes as potential collocations all word pairs where the first
+    word ends in a period. It may be useful in corpora where there is a lot
+    of variation that makes abbreviations like Mr difficult to identify."""
+
+    INCLUDE_ABBREV_COLLOCS = False
+    """this includes as potential collocations all word pairs where the first
+    word is an abbreviation. Such collocations override the orthographic
+    heuristic, but not the sentence starter heuristic. This is overridden by
+    INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials
+    and ordinals are considered."""
+    """"""
+
+    MIN_COLLOC_FREQ = 1
+    """this sets a minimum bound on the number of times a bigram needs to
+    appear before it can be considered a collocation, in addition to log
+    likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True."""
+
+    #////////////////////////////////////////////////////////////
+    #{ Training..
+    #////////////////////////////////////////////////////////////
+
+    def train(self, text, verbose=False, finalize=True):
+        """
+        Collects training data from a given text. If finalize is True, it
+        will determine all the parameters for sentence boundary detection. If
+        not, this will be delayed until get_params() or finalize_training() is
+        called. If verbose is True, abbreviations found will be listed.
+        """
+        # Break the text into tokens; record which token indices correspond to
+        # line starts and paragraph starts; and determine their types.
+        self._train_tokens(self._tokenize_words(text), verbose)
+        if finalize:
+            self.finalize_training(verbose)
+
+    def train_tokens(self, tokens, verbose=False, finalize=True):
+        """
+        Collects training data from a given list of tokens.
+        """
+        self._train_tokens((self._Token(t) for t in tokens), verbose)
+        if finalize:
+            self.finalize_training(verbose)
+
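+    # Editor's sketch (not part of the original module): incremental training
+    # with the public methods above, assuming text_a and text_b are plain-text
+    # strings in the target language.
+    #
+    #     trainer = PunktTrainer()
+    #     trainer.INCLUDE_ALL_COLLOCS = True     # optional: consider more collocations
+    #     trainer.train(text_a, finalize=False)
+    #     trainer.train(text_b, finalize=False)
+    #     params = trainer.get_params()          # runs finalize_training() if needed
+    #     # `params` can then be supplied to a PunktSentenceTokenizer (defined below).
+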
+    def _train_tokens(self, tokens, verbose):
+        self._finalized = False
+
+        # Ensure tokens are a list
+        tokens = list(tokens)
+
+        # Find the frequency of each case-normalized type.  (Don't
+        # strip off final periods.)  Also keep track of the number of
+        # tokens that end in periods.
+        for aug_tok in tokens:
+            self._type_fdist[aug_tok.type] += 1
+            if aug_tok.period_final:
+                self._num_period_toks += 1
+
+        # Look for new abbreviations, and for types that should no longer be
+        # classified as abbreviations.
+        unique_types = self._unique_types(tokens)
+        for abbr, score, is_add in self._reclassify_abbrev_types(unique_types):
+            if score >= self.ABBREV:
+                if is_add:
+                    self._params.abbrev_types.add(abbr)
+                    if verbose:
+                        print(('  Abbreviation: [%6.4f] %s' %
+                               (score, abbr)))
+            else:
+                if not is_add:
+                    self._params.abbrev_types.remove(abbr)
+                    if verbose:
+                        print(('  Removed abbreviation: [%6.4f] %s' %
+                               (score, abbr)))
+
+        # Make a preliminary pass through the document, marking likely
+        # sentence breaks, abbreviations, and ellipsis tokens.
+        tokens = list(self._annotate_first_pass(tokens))
+
+        # Check what contexts each word type can appear in, given the
+        # case of its first letter.
+        self._get_orthography_data(tokens)
+
+        # We need total number of sentence breaks to find sentence starters
+        self._sentbreak_count += self._get_sentbreak_count(tokens)
+
+        # The remaining heuristics relate to pairs of tokens where the first
+        # ends in a period.
+        for aug_tok1, aug_tok2 in _pair_iter(tokens):
+            if not aug_tok1.period_final or not aug_tok2:
+                continue
+
+            # Is the first token a rare abbreviation?
+            if self._is_rare_abbrev_type(aug_tok1, aug_tok2):
+                self._params.abbrev_types.add(aug_tok1.type_no_period)
+                if verbose:
+                    print(('  Rare Abbrev: %s' % aug_tok1.type))
+
+            # Does second token have a high likelihood of starting a sentence?
+            if self._is_potential_sent_starter(aug_tok2, aug_tok1):
+                self._sent_starter_fdist[aug_tok2.type] += 1
+
+            # Is this bigram a potential collocation?
+            if self._is_potential_collocation(aug_tok1, aug_tok2):
+                self._collocation_fdist[
+                    (aug_tok1.type_no_period, aug_tok2.type_no_sentperiod)] += 1
+
+    def _unique_types(self, tokens):
+        return set(aug_tok.type for aug_tok in tokens)
+
+    def finalize_training(self, verbose=False):
+        """
+        Uses data that has been gathered in training to determine likely
+        collocations and sentence starters.
+        """
+        self._params.clear_sent_starters()
+        for typ, ll in self._find_sent_starters():
+            self._params.sent_starters.add(typ)
+            if verbose:
+                print(('  Sent Starter: [%6.4f] %r' % (ll, typ)))
+
+        self._params.clear_collocations()
+        for (typ1, typ2), ll in self._find_collocations():
+            self._params.collocations.add( (typ1,typ2) )
+            if verbose:
+                print(('  Collocation: [%6.4f] %r+%r' %
+                       (ll, typ1, typ2)))
+
+        self._finalized = True
+
+    #////////////////////////////////////////////////////////////
+    #{ Overhead reduction
+    #////////////////////////////////////////////////////////////
+
+    def freq_threshold(self, ortho_thresh=2, type_thresh=2, colloc_thres=2,
+            sentstart_thresh=2):
+        """
+        Allows memory use to be reduced after much training by removing data
+        about rare tokens that are unlikely to have a statistical effect with
+        further training. Entries occurring at or above the given thresholds will be
+        retained.
+        """
+        if ortho_thresh > 1:
+            old_oc = self._params.ortho_context
+            self._params.clear_ortho_context()
+            for tok in self._type_fdist:
+                count = self._type_fdist[tok]
+                if count >= ortho_thresh:
+                    self._params.ortho_context[tok] = old_oc[tok]
+
+        self._type_fdist = self._freq_threshold(self._type_fdist, type_thresh)
+        self._collocation_fdist = self._freq_threshold(
+                self._collocation_fdist, colloc_thres)
+        self._sent_starter_fdist = self._freq_threshold(
+                self._sent_starter_fdist, sentstart_thresh)
+
+    def _freq_threshold(self, fdist, threshold):
+        """
+        Returns a FreqDist containing only the entries whose counts are at or
+        above the given threshold, as well as a mapping (None -> count_removed).
+        """
+        # We assume that there is more data below the threshold than above it
+        # and so create a new FreqDist rather than working in place.
+        res = FreqDist()
+        num_removed = 0
+        for tok in fdist:
+            count = fdist[tok]
+            if count < threshold:
+                num_removed += 1
+            else:
+                res[tok] += count
+        res[None] += num_removed
+        return res
+
+    #////////////////////////////////////////////////////////////
+    #{ Orthographic data
+    #////////////////////////////////////////////////////////////
+
+    def _get_orthography_data(self, tokens):
+        """
+        Collect information about whether each token type occurs
+        with different case patterns (i) overall, (ii) at
+        sentence-initial positions, and (iii) at sentence-internal
+        positions.
+        """
+        # 'initial' or 'internal' or 'unknown'
+        context = 'internal'
+        tokens = list(tokens)
+
+        for aug_tok in tokens:
+            # If we encounter a paragraph break, then it's a good sign
+            # that it's a sentence break.  But err on the side of
+            # caution (by not positing a sentence break) if we just
+            # saw an abbreviation.
+            if aug_tok.parastart and context != 'unknown':
+                context = 'initial'
+
+            # If we're at the beginning of a line, then we can't decide
+            # between 'internal' and 'initial'.
+            if aug_tok.linestart and context == 'internal':
+                context = 'unknown'
+
+            # Find the case-normalized type of the token.  If it's a
+            # sentence-final token, strip off the period.
+            typ = aug_tok.type_no_sentperiod
+
+            # Update the orthographic context table.
+            flag = _ORTHO_MAP.get((context, aug_tok.first_case), 0)
+            if flag:
+                self._params.add_ortho_context(typ, flag)
+
+            # Decide whether the next word is at a sentence boundary.
+            if aug_tok.sentbreak:
+                if not (aug_tok.is_number or aug_tok.is_initial):
+                    context = 'initial'
+                else:
+                    context = 'unknown'
+            elif aug_tok.ellipsis or aug_tok.abbr:
+                context = 'unknown'
+            else:
+                context = 'internal'
+
+    #////////////////////////////////////////////////////////////
+    #{ Abbreviations
+    #////////////////////////////////////////////////////////////
+
+    def _reclassify_abbrev_types(self, types):
+        """
+        (Re)classifies each given token if
+          - it is period-final and not a known abbreviation; or
+          - it is not period-final and is otherwise a known abbreviation
+        by checking whether its previous classification still holds according
+        to the heuristics of section 3.
+        Yields triples (abbr, score, is_add) where abbr is the type in question,
+        score is its log-likelihood with penalties applied, and is_add specifies
+        whether the present type is a candidate for inclusion or exclusion as an
+        abbreviation, such that:
+          - (is_add and score >= 0.3)    suggests a new abbreviation; and
+          - (not is_add and score < 0.3) suggests excluding an abbreviation.
+        """
+        # (While one could recalculate abbreviations from all .-final tokens at
+        # every iteration, in cases requiring efficiency, the number of tokens
+        # in the present training document will be much less.)
+
+        for typ in types:
+            # Check some basic conditions, to rule out words that are
+            # clearly not abbrev_types.
+            if not _re_non_punct.search(typ) or typ == '##number##':
+                continue
+
+            if typ.endswith('.'):
+                if typ in self._params.abbrev_types:
+                    continue
+                typ = typ[:-1]
+                is_add = True
+            else:
+                if typ not in self._params.abbrev_types:
+                    continue
+                is_add = False
+
+            # Count how many periods & nonperiods are in the
+            # candidate.
+            num_periods = typ.count('.') + 1
+            num_nonperiods = len(typ) - num_periods + 1
+
+            # Let <a> be the candidate without the period, and <b>
+            # be the period.  Find a log likelihood ratio that
+            # indicates whether <ab> occurs as a single unit (high
+            # value of ll), or as two independent units <a> and
+            # <b> (low value of ll).
+            count_with_period = self._type_fdist[typ + '.']
+            count_without_period = self._type_fdist[typ]
+            ll = self._dunning_log_likelihood(
+                count_with_period + count_without_period,
+                self._num_period_toks, count_with_period,
+                self._type_fdist.N())
+
+            # Apply three scaling factors to 'tweak' the basic log
+            # likelihood ratio:
+            #   F_length: long word -> less likely to be an abbrev
+            #   F_periods: more periods -> more likely to be an abbrev
+            #   F_penalty: penalize occurrences w/o a period
+            f_length = math.exp(-num_nonperiods)
+            f_periods = num_periods
+            f_penalty = (int(self.IGNORE_ABBREV_PENALTY)
+                    or math.pow(num_nonperiods, -count_without_period))
+            score = ll * f_length * f_periods * f_penalty
+
+            yield typ, score, is_add
+
+    def find_abbrev_types(self):
+        """
+        Recalculates abbreviations from the type frequencies alone, even if no
+        abbreviations have been determined previously.
+        Note that this does not include abbreviations that would otherwise only
+        be found as "rare" abbreviations.
+        """
+        self._params.clear_abbrevs()
+        tokens = (typ for typ in self._type_fdist if typ and typ.endswith('.'))
+        for abbr, score, is_add in self._reclassify_abbrev_types(tokens):
+            if score >= self.ABBREV:
+                self._params.abbrev_types.add(abbr)
+
+    # This function combines the work done by the original code's
+    # functions `count_orthography_context`, `get_orthography_count`,
+    # and `get_rare_abbreviations`.
+    def _is_rare_abbrev_type(self, cur_tok, next_tok):
+        """
+        A word type is counted as a rare abbreviation if...
+          - it's not already marked as an abbreviation
+          - it occurs fewer than ABBREV_BACKOFF times
+          - either it is followed by a sentence-internal punctuation
+            mark, *or* it is followed by a lower-case word that
+            sometimes appears with upper case, but never occurs with
+            lower case at the beginning of sentences.
+        """
+        if cur_tok.abbr or not cur_tok.sentbreak:
+            return False
+
+        # Find the case-normalized type of the token.  If it's
+        # a sentence-final token, strip off the period.
+        typ = cur_tok.type_no_sentperiod
+
+        # Proceed only if the type hasn't been categorized as an
+        # abbreviation already, and is sufficiently rare...
+        count = self._type_fdist[typ] + self._type_fdist[typ[:-1]]
+        if (typ in self._params.abbrev_types or count >= self.ABBREV_BACKOFF):
+            return False
+
+        # Record this token as an abbreviation if the next
+        # token is a sentence-internal punctuation mark.
+        # [XX] :1 or check the whole thing??
+        if next_tok.tok[:1] in self._lang_vars.internal_punctuation:
+            return True
+
+        # Record this type as an abbreviation if the next
+        # token...  (i) starts with a lower case letter,
+        # (ii) sometimes occurs with an uppercase letter,
+        # and (iii) never occurs with an uppercase letter
+        # sentence-internally.
+        # [xx] should the check for (ii) be modified??
+        elif next_tok.first_lower:
+            typ2 = next_tok.type_no_sentperiod
+            typ2ortho_context = self._params.ortho_context[typ2]
+            if ( (typ2ortho_context & _ORTHO_BEG_UC) and
+                 not (typ2ortho_context & _ORTHO_MID_UC) ):
+                return True
+
+    #////////////////////////////////////////////////////////////
+    #{ Log Likelihoods
+    #////////////////////////////////////////////////////////////
+
+    # helper for _reclassify_abbrev_types:
+    @staticmethod
+    def _dunning_log_likelihood(count_a, count_b, count_ab, N):
+        """
+        A function that calculates the modified Dunning log-likelihood
+        ratio scores for abbreviation candidates.  The details of how
+        this works are available in the paper.
+        """
+        p1 = count_b / N
+        p2 = 0.99
+
+        null_hypo = (count_ab * math.log(p1) +
+                     (count_a - count_ab) * math.log(1.0 - p1))
+        alt_hypo  = (count_ab * math.log(p2) +
+                     (count_a - count_ab) * math.log(1.0 - p2))
+
+        likelihood = null_hypo - alt_hypo
+
+        return (-2.0 * likelihood)
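+        # Worked illustration with assumed counts (not from any training text):
+        # count_a = 40, count_b = 50, count_ab = 30, N = 1000 gives p1 = 0.05,
+        # null_hypo = 30*log(0.05) + 10*log(0.95) and
+        # alt_hypo  = 30*log(0.99) + 10*log(0.01);
+        # -2*(null_hypo - alt_hypo) is then large (~88), favouring <ab> as a unit.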
+
+    @staticmethod
+    def _col_log_likelihood(count_a, count_b, count_ab, N):
+        """
+        A function that computes the log-likelihood estimate described in
+        algorithms 6 and 7 of the original paper.
+
+        Unlike ``_dunning_log_likelihood``, this should produce the original
+        (unmodified) Dunning log-likelihood values.
+        """
+        p = count_b / N
+        p1 = count_ab / count_a
+        try:
+            p2 = (count_b - count_ab) / (N - count_a)
+        except ZeroDivisionError as e:
+            p2 = 1
+
+        try:
+            summand1 = (count_ab * math.log(p) +
+                        (count_a - count_ab) * math.log(1.0 - p))
+        except ValueError as e:
+            summand1 = 0
+
+        try:
+            summand2 = ((count_b - count_ab) * math.log(p) +
+                        (N - count_a - count_b + count_ab) * math.log(1.0 - p))
+        except ValueError as e:
+            summand2 = 0
+
+        if count_a == count_ab or p1 <= 0 or p1 >= 1:
+            summand3 = 0
+        else:
+            summand3 = (count_ab * math.log(p1) +
+                        (count_a - count_ab) * math.log(1.0 - p1))
+
+        if count_b == count_ab or p2 <= 0 or p2 >= 1:
+            summand4 = 0
+        else:
+            summand4 = ((count_b - count_ab) * math.log(p2) +
+                        (N - count_a - count_b + count_ab) * math.log(1.0 - p2))
+
+        likelihood = summand1 + summand2 - summand3 - summand4
+
+        return (-2.0 * likelihood)
+
+    #////////////////////////////////////////////////////////////
+    #{ Collocation Finder
+    #////////////////////////////////////////////////////////////
+
+    def _is_potential_collocation(self, aug_tok1, aug_tok2):
+        """
+        Returns True if the pair of tokens may form a collocation given
+        log-likelihood statistics.
+        """
+        return ((self.INCLUDE_ALL_COLLOCS or
+                (self.INCLUDE_ABBREV_COLLOCS and aug_tok1.abbr) or
+                (aug_tok1.sentbreak and
+                    (aug_tok1.is_number or aug_tok1.is_initial)))
+                and aug_tok1.is_non_punct
+                and aug_tok2.is_non_punct)
+
+    def _find_collocations(self):
+        """
+        Generates likely collocations and their log-likelihood.
+        """
+        for types in self._collocation_fdist:
+            try:
+                typ1, typ2 = types
+            except TypeError:
+                # types may be None after calling freq_threshold()
+                continue
+            if typ2 in self._params.sent_starters:
+                continue
+
+            col_count = self._collocation_fdist[types]
+            typ1_count = self._type_fdist[typ1]+self._type_fdist[typ1+'.']
+            typ2_count = self._type_fdist[typ2]+self._type_fdist[typ2+'.']
+            if (typ1_count > 1 and typ2_count > 1
+                    and self.MIN_COLLOC_FREQ <
+                        col_count <= min(typ1_count, typ2_count)):
+
+                ll = self._col_log_likelihood(typ1_count, typ2_count,
+                                              col_count, self._type_fdist.N())
+                # Filter out the not-so-collocative
+                if (ll >= self.COLLOCATION and
+                    (self._type_fdist.N()/typ1_count >
+                     typ2_count/col_count)):
+                    yield (typ1, typ2), ll
+
+    #////////////////////////////////////////////////////////////
+    #{ Sentence-Starter Finder
+    #////////////////////////////////////////////////////////////
+
+    def _is_potential_sent_starter(self, cur_tok, prev_tok):
+        """
+        Returns True given a token and the token that precedes it if it
+        seems clear that the token is beginning a sentence.
+        """
+        # If a token (i) is preceded by a sentence break that is
+        # not a potential ordinal number or initial, and (ii) is
+        # alphabetic, then it is a sentence-starter.
+        return ( prev_tok.sentbreak and
+             not (prev_tok.is_number or prev_tok.is_initial) and
+             cur_tok.is_alpha )
+
+    def _find_sent_starters(self):
+        """
+        Uses collocation heuristics for each candidate token to
+        determine if it frequently starts sentences.
+        """
+        for typ in self._sent_starter_fdist:
+            if not typ:
+                continue
+
+            typ_at_break_count = self._sent_starter_fdist[typ]
+            typ_count = self._type_fdist[typ]+self._type_fdist[typ+'.']
+            if typ_count < typ_at_break_count:
+                # needed after freq_threshold
+                continue
+
+            ll = self._col_log_likelihood(self._sentbreak_count, typ_count,
+                                          typ_at_break_count,
+                                          self._type_fdist.N())
+
+            if (ll >= self.SENT_STARTER and
+                self._type_fdist.N()/self._sentbreak_count >
+                typ_count/typ_at_break_count):
+
+                yield typ, ll
+
+    def _get_sentbreak_count(self, tokens):
+        """
+        Returns the number of sentence breaks marked in a given set of
+        augmented tokens.
+        """
+        return sum(1 for aug_tok in tokens if aug_tok.sentbreak)
+
+
+######################################################################
+#{ Punkt Sentence Tokenizer
+######################################################################
+
+
+class PunktSentenceTokenizer(PunktBaseClass,TokenizerI):
+    """
+    A sentence tokenizer which uses an unsupervised algorithm to build
+    a model for abbreviation words, collocations, and words that start
+    sentences; and then uses that model to find sentence boundaries.
+    This approach has been shown to work well for many European
+    languages.
+    """
+    def __init__(self, train_text=None, verbose=False,
+            lang_vars=PunktLanguageVars(), token_cls=PunktToken):
+        """
+        train_text can either be the sole training text for this sentence
+        boundary detector, or can be a PunktParameters object.
+        """
+        PunktBaseClass.__init__(self, lang_vars=lang_vars,
+                token_cls=token_cls)
+
+        if train_text:
+            self._params = self.train(train_text, verbose)
+
+    def train(self, train_text, verbose=False):
+        """
+        Derives parameters from a given training text, or uses the parameters
+        given. Repeated calls to this method destroy previous parameters. For
+        incremental training, instantiate a separate PunktTrainer instance.
+        """
+        if not isinstance(train_text, string_types):
+            return train_text
+        return PunktTrainer(train_text, lang_vars=self._lang_vars,
+                token_cls=self._Token).get_params()
+
+    #////////////////////////////////////////////////////////////
+    #{ Tokenization
+    #////////////////////////////////////////////////////////////
+
+    def tokenize(self, text, realign_boundaries=True):
+        """
+        Given a text, returns a list of the sentences in that text.
+        """
+        return list(self.sentences_from_text(text, realign_boundaries))
+
+    def debug_decisions(self, text):
+        """
+        Classifies candidate periods as sentence breaks, yielding a dict for
+        each that may be used to understand why the decision was made.
+
+        See format_debug_decision() to help make this output readable.
+        """
+
+        for match in self._lang_vars.period_context_re().finditer(text):
+            decision_text = match.group() + match.group('after_tok')
+            tokens = self._tokenize_words(decision_text)
+            tokens = list(self._annotate_first_pass(tokens))
+            while not tokens[0].period_final:
+                tokens.pop(0)
+            yield dict(period_index=match.end() - 1,
+                text=decision_text,
+                type1=tokens[0].type,
+                type2=tokens[1].type,
+                type1_in_abbrs=bool(tokens[0].abbr),
+                type1_is_initial=bool(tokens[0].is_initial),
+                type2_is_sent_starter=tokens[1].type_no_sentperiod in self._params.sent_starters,
+                type2_ortho_heuristic=self._ortho_heuristic(tokens[1]),
+                type2_ortho_contexts=set(self._params._debug_ortho_context(tokens[1].type_no_sentperiod)),
+                collocation=(tokens[0].type_no_sentperiod, tokens[1].type_no_sentperiod) in self._params.collocations,
+
+                reason=self._second_pass_annotation(tokens[0], tokens[1]) or REASON_DEFAULT_DECISION,
+                break_decision=tokens[0].sentbreak,
+            )
+
+    def span_tokenize(self, text, realign_boundaries=True):
+        """
+        Given a text, returns a list of the (start, end) spans of sentences
+        in the text.
+        """
+        slices = self._slices_from_text(text)
+        if realign_boundaries:
+            slices = self._realign_boundaries(text, slices)
+        return [(sl.start, sl.stop) for sl in slices]
+
+    def sentences_from_text(self, text, realign_boundaries=True):
+        """
+        Given a text, generates the sentences in that text by only
+        testing candidate sentence breaks. If realign_boundaries is
+        True, includes in the sentence closing punctuation that
+        follows the period.
+        """
+        return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
+
+    def _slices_from_text(self, text):
+        last_break = 0
+        for match in self._lang_vars.period_context_re().finditer(text):
+            context = match.group() + match.group('after_tok')
+            if self.text_contains_sentbreak(context):
+                yield slice(last_break, match.end())
+                if match.group('next_tok'):
+                    # next sentence starts after whitespace
+                    last_break = match.start('next_tok')
+                else:
+                    # next sentence starts at following punctuation
+                    last_break = match.end()
+        # The last sentence should not contain trailing whitespace.
+        yield slice(last_break, len(text.rstrip()))
+
+    def _realign_boundaries(self, text, slices):
+        """
+        Attempts to realign punctuation that falls after the period but
+        should otherwise be included in the same sentence.
+
+        For example: "(Sent1.) Sent2." will otherwise be split as::
+
+            ["(Sent1.", ") Sent1."].
+
+        This method will produce::
+
+            ["(Sent1.)", "Sent2."].
+        """
+        realign = 0
+        for sl1, sl2 in _pair_iter(slices):
+            sl1 = slice(sl1.start + realign, sl1.stop)
+            if not sl2:
+                if text[sl1]:
+                    yield sl1
+                continue
+
+            m = self._lang_vars.re_boundary_realignment.match(text[sl2])
+            if m:
+                yield slice(sl1.start, sl2.start + len(m.group(0).rstrip()))
+                realign = m.end()
+            else:
+                realign = 0
+                if text[sl1]:
+                    yield sl1
+
+    def text_contains_sentbreak(self, text):
+        """
+        Returns True if the given text includes a sentence break.
+        """
+        found = False # used to ignore last token
+        for t in self._annotate_tokens(self._tokenize_words(text)):
+            if found:
+                return True
+            if t.sentbreak:
+                found = True
+        return False
+
+    def sentences_from_text_legacy(self, text):
+        """
+        Given a text, generates the sentences in that text. Annotates all
+        tokens, rather than just those with possible sentence breaks. Should
+        produce the same results as ``sentences_from_text``.
+        """
+        tokens = self._annotate_tokens(self._tokenize_words(text))
+        return self._build_sentence_list(text, tokens)
+
+    def sentences_from_tokens(self, tokens):
+        """
+        Given a sequence of tokens, generates lists of tokens, each list
+        corresponding to a sentence.
+        """
+        tokens = iter(self._annotate_tokens(self._Token(t) for t in tokens))
+        sentence = []
+        for aug_tok in tokens:
+            sentence.append(aug_tok.tok)
+            if aug_tok.sentbreak:
+                yield sentence
+                sentence = []
+        if sentence:
+            yield sentence
+
+    def _annotate_tokens(self, tokens):
+        """
+        Given a set of tokens augmented with markers for line-start and
+        paragraph-start, returns an iterator through those tokens with full
+        annotation including predicted sentence breaks.
+        """
+        # Make a preliminary pass through the document, marking likely
+        # sentence breaks, abbreviations, and ellipsis tokens.
+        tokens = self._annotate_first_pass(tokens)
+
+        # Make a second pass through the document, using token context
+        # information to change our preliminary decisions about where
+        # sentence breaks, abbreviations, and ellipsis occurs.
+        tokens = self._annotate_second_pass(tokens)
+
+        ## [XX] TESTING
+        #tokens = list(tokens)
+        #self.dump(tokens)
+
+        return tokens
+
+    def _build_sentence_list(self, text, tokens):
+        """
+        Given the original text and the list of augmented word tokens,
+        construct and return a tokenized list of sentence strings.
+        """
+        # Most of the work here is making sure that we put the right
+        # pieces of whitespace back in all the right places.
+
+        # Our position in the source text, used to keep track of which
+        # whitespace to add:
+        pos = 0
+
+        # A regular expression that finds pieces of whitespace:
+        WS_REGEXP = re.compile(r'\s*')
+
+        sentence = ''
+        for aug_tok in tokens:
+            tok = aug_tok.tok
+
+            # Find the whitespace before this token, and update pos.
+            ws = WS_REGEXP.match(text, pos).group()
+            pos += len(ws)
+
+            # Some of the rules used by the punkt word tokenizer
+            # strip whitespace out of the text, resulting in tokens
+            # that contain whitespace in the source text.  If our
+            # token doesn't match, see if adding whitespace helps.
+            # If so, then use the version with whitespace.
+            if text[pos:pos+len(tok)] != tok:
+                pat = r'\s*'.join(re.escape(c) for c in tok)
+                m = re.compile(pat).match(text, pos)
+                if m: tok = m.group()
+
+            # Move our position pointer to the end of the token.
+            assert text[pos:pos+len(tok)] == tok
+            pos += len(tok)
+
+            # Add this token.  If it's not at the beginning of the
+            # sentence, then include any whitespace that separated it
+            # from the previous token.
+            if sentence:
+                sentence += ws
+            sentence += tok
+
+            # If we're at a sentence break, then start a new sentence.
+            if aug_tok.sentbreak:
+                yield sentence
+                sentence = ''
+
+        # If the last sentence is empty, discard it.
+        if sentence:
+            yield sentence
+
+    # [XX] TESTING
+    def dump(self, tokens):
+        print('writing to /tmp/punkt.new...')
+        with open('/tmp/punkt.new', 'w') as outfile:
+            for aug_tok in tokens:
+                if aug_tok.parastart:
+                    outfile.write('\n\n')
+                elif aug_tok.linestart:
+                    outfile.write('\n')
+                else:
+                    outfile.write(' ')
+
+                outfile.write(str(aug_tok))
+
+    #////////////////////////////////////////////////////////////
+    #{ Customization Variables
+    #////////////////////////////////////////////////////////////
+
+    PUNCTUATION = tuple(';:,.!?')
+
+    #////////////////////////////////////////////////////////////
+    #{ Annotation Procedures
+    #////////////////////////////////////////////////////////////
+
+    def _annotate_second_pass(self, tokens):
+        """
+        Performs a token-based classification (section 4) over the given
+        tokens, making use of the orthographic heuristic (4.1.1), collocation
+        heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).
+        """
+        for t1, t2 in _pair_iter(tokens):
+            self._second_pass_annotation(t1, t2)
+            yield t1
+
+    def _second_pass_annotation(self, aug_tok1, aug_tok2):
+        """
+        Performs token-based classification over a pair of contiguous tokens
+        updating the first.
+        """
+        # Is it the last token? We can't do anything then.
+        if not aug_tok2:
+            return
+
+        tok = aug_tok1.tok
+        if not aug_tok1.period_final:
+            # We only care about words ending in periods.
+            return
+
+        typ = aug_tok1.type_no_period
+        next_tok = aug_tok2.tok
+        next_typ = aug_tok2.type_no_sentperiod
+        tok_is_initial = aug_tok1.is_initial
+
+        # [4.1.2. Collocation Heuristic] If there's a
+        # collocation between the word before and after the
+        # period, then label tok as an abbreviation and NOT
+        # a sentence break. Note that collocations with
+        # frequent sentence starters as their second word are
+        # excluded in training.
+        if (typ, next_typ) in self._params.collocations:
+            aug_tok1.sentbreak = False
+            aug_tok1.abbr = True
+            return REASON_KNOWN_COLLOCATION
+
+        # [4.2. Token-Based Reclassification of Abbreviations] If
+        # the token is an abbreviation or an ellipsis, then decide
+        # whether we should *also* classify it as a sentbreak.
+        if ( (aug_tok1.abbr or aug_tok1.ellipsis) and
+             (not tok_is_initial) ):
+            # [4.1.1. Orthographic Heuristic] Check if there's
+            # orthographic evidence about whether the next word
+            # starts a sentence or not.
+            is_sent_starter = self._ortho_heuristic(aug_tok2)
+            if is_sent_starter == True:
+                aug_tok1.sentbreak = True
+                return REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC
+
+            # [4.1.3. Frequent Sentence Starter Heuristic] If the
+            # next word is capitalized, and is a member of the
+            # frequent-sentence-starters list, then label tok as a
+            # sentence break.
+            if ( aug_tok2.first_upper and
+                 next_typ in self._params.sent_starters):
+                aug_tok1.sentbreak = True
+                return REASON_ABBR_WITH_SENTENCE_STARTER
+
+        # [4.3. Token-Based Detection of Initials and Ordinals]
+        # Check if any initials or ordinals tokens that are marked
+        # as sentbreaks should be reclassified as abbreviations.
+        if tok_is_initial or typ == '##number##':
+
+            # [4.1.1. Orthographic Heuristic] Check if there's
+            # orthographic evidence about whether the next word
+            # starts a sentence or not.
+            is_sent_starter = self._ortho_heuristic(aug_tok2)
+
+            if is_sent_starter == False:
+                aug_tok1.sentbreak = False
+                aug_tok1.abbr = True
+                if tok_is_initial:
+                    return REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC
+                else:
+                    return REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC
+
+            # Special heuristic for initials: if the orthographic
+            # heuristic is unknown, and the next word is always
+            # capitalized, then mark as abbrev (e.g. J. Bach).
+            if ( is_sent_starter == 'unknown' and tok_is_initial and
+                 aug_tok2.first_upper and
+                 not (self._params.ortho_context[next_typ] & _ORTHO_LC) ):
+                aug_tok1.sentbreak = False
+                aug_tok1.abbr = True
+                return REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC
+
+        return
+
+    def _ortho_heuristic(self, aug_tok):
+        """
+        Decide whether the given token is the first token in a sentence.
+        """
+        # Sentences don't start with punctuation marks:
+        if aug_tok.tok in self.PUNCTUATION:
+            return False
+
+        ortho_context = self._params.ortho_context[aug_tok.type_no_sentperiod]
+
+        # If the word is capitalized, occurs at least once with a
+        # lower case first letter, and never occurs with an upper case
+        # first letter sentence-internally, then it's a sentence starter.
+        if ( aug_tok.first_upper and
+             (ortho_context & _ORTHO_LC) and
+             not (ortho_context & _ORTHO_MID_UC) ):
+            return True
+
+        # If the word is lower case, and either (a) we've seen it used
+        # with upper case, or (b) we've never seen it used
+        # sentence-initially with lower case, then it's not a sentence
+        # starter.
+        if ( aug_tok.first_lower and
+             ((ortho_context & _ORTHO_UC) or
+              not (ortho_context & _ORTHO_BEG_LC)) ):
+            return False
+
+        # Otherwise, we're not sure.
+        return 'unknown'
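+        # Illustration (hypothetical flags): a token like 'The' that has been
+        # seen lower-cased in training has _ORTHO_LC set and, if it was never
+        # capitalized sentence-internally, _ORTHO_MID_UC unset, so when it
+        # appears capitalized here the heuristic returns True.  A capitalized
+        # token also seen capitalized mid-sentence (e.g. a proper noun) skips
+        # both branches and ends up as 'unknown'.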
+
+
+DEBUG_DECISION_FMT = '''Text: %(text)r (at offset %(period_index)d)
+Sentence break? %(break_decision)s (%(reason)s)
+Collocation? %(collocation)s
+%(type1)r:
+    known abbreviation: %(type1_in_abbrs)s
+    is initial: %(type1_is_initial)s
+%(type2)r:
+    known sentence starter: %(type2_is_sent_starter)s
+    orthographic heuristic suggests is a sentence starter? %(type2_ortho_heuristic)s
+    orthographic contexts in training: %(type2_ortho_contexts)s
+'''
+def format_debug_decision(d):
+    return DEBUG_DECISION_FMT % d
+
+def demo(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
+    """Builds a punkt model and applies it to the same text"""
+    cleanup = lambda s: re.compile(r'(?:\r|^\s+)', re.MULTILINE).sub('', s).replace('\n', ' ')
+    trainer = train_cls()
+    trainer.INCLUDE_ALL_COLLOCS = True
+    trainer.train(text)
+    sbd = tok_cls(trainer.get_params())
+    for l in sbd.sentences_from_text(text):
+        print(cleanup(l))
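+
+# A minimal usage sketch for the tokenizer above.  The corpus file name is
+# hypothetical, and the exact splits depend on what the model learns from the
+# training text; the output shown assumes 'Mr.' was learned as an abbreviation.
+#
+#     >>> from nltk.tokenize.punkt import PunktSentenceTokenizer, format_debug_decision
+#     >>> train_text = open('corpus.txt').read()
+#     >>> tokenizer = PunktSentenceTokenizer(train_text)
+#     >>> tokenizer.tokenize("Mr. Smith left. He was late.")
+#     ['Mr. Smith left.', 'He was late.']
+#     >>> for d in tokenizer.debug_decisions("Mr. Smith left. He was late."):
+#     ...     print(format_debug_decision(d))   # explains each period decision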
diff --git a/nlp_resource_data/nltk/tokenize/punkt.pyc b/nlp_resource_data/nltk/tokenize/punkt.pyc
new file mode 100755 (executable)
index 0000000..0ced09b
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/punkt.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/regexp.py b/nlp_resource_data/nltk/tokenize/regexp.py
new file mode 100755 (executable)
index 0000000..cb0b61d
--- /dev/null
@@ -0,0 +1,202 @@
+# Natural Language Toolkit: Tokenizers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+#         Trevor Cohn <tacohn@csse.unimelb.edu.au>
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+r"""
+Regular-Expression Tokenizers
+
+A ``RegexpTokenizer`` splits a string into substrings using a regular expression.
+For example, the following tokenizer forms tokens out of alphabetic sequences,
+money expressions, and any other non-whitespace sequences:
+
+    >>> from nltk.tokenize import RegexpTokenizer
+    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
+    >>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
+    >>> tokenizer.tokenize(s)
+    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
+    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
+
+A ``RegexpTokenizer`` can use its regexp to match delimiters instead:
+
+    >>> tokenizer = RegexpTokenizer('\s+', gaps=True)
+    >>> tokenizer.tokenize(s)
+    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
+    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
+
+Note that empty tokens are not returned when the delimiter appears at
+the start or end of the string.
+
+The material between the tokens is discarded.  For example,
+the following tokenizer selects just the capitalized words:
+
+    >>> capword_tokenizer = RegexpTokenizer('[A-Z]\w+')
+    >>> capword_tokenizer.tokenize(s)
+    ['Good', 'New', 'York', 'Please', 'Thanks']
+
+This module contains several subclasses of ``RegexpTokenizer``
+that use pre-defined regular expressions.
+
+    >>> from nltk.tokenize import BlanklineTokenizer
+    >>> # Uses '\s*\n\s*\n\s*':
+    >>> BlanklineTokenizer().tokenize(s)
+    ['Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.',
+    'Thanks.']
+
+All of the regular expression tokenizers are also available as functions:
+
+    >>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
+    >>> regexp_tokenize(s, pattern='\w+|\$[\d\.]+|\S+')
+    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
+    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
+    >>> wordpunct_tokenize(s)
+    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
+     '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
+    >>> blankline_tokenize(s)
+    ['Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.', 'Thanks.']
+
+Caution: The function ``regexp_tokenize()`` takes the text as its
+first argument, and the regular expression pattern as its second
+argument.  This differs from the conventions used by Python's
+``re`` functions, where the pattern is always the first argument.
+(This is for consistency with the other NLTK tokenizers.)
+"""
+from __future__ import unicode_literals
+
+import re
+
+from nltk.tokenize.api import TokenizerI
+from nltk.tokenize.util import regexp_span_tokenize
+from nltk.compat import python_2_unicode_compatible
+
+@python_2_unicode_compatible
+class RegexpTokenizer(TokenizerI):
+    """
+    A tokenizer that splits a string using a regular expression, which
+    matches either the tokens or the separators between tokens.
+
+        >>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
+
+    :type pattern: str
+    :param pattern: The pattern used to build this tokenizer.
+        (This pattern must not contain capturing parentheses;
+        use non-capturing parentheses, e.g. (?:...), instead.)
+    :type gaps: bool
+    :param gaps: True if this tokenizer's pattern should be used
+        to find separators between tokens; False if this
+        tokenizer's pattern should be used to find the tokens
+        themselves.
+    :type discard_empty: bool
+    :param discard_empty: True if any empty tokens `''`
+        generated by the tokenizer should be discarded.  Empty
+        tokens can only be generated if `_gaps == True`.
+    :type flags: int
+    :param flags: The regexp flags used to compile this
+        tokenizer's pattern.  By default, the following flags are
+        used: `re.UNICODE | re.MULTILINE | re.DOTALL`.
+
+    """
+    def __init__(self, pattern, gaps=False, discard_empty=True,
+                 flags=re.UNICODE | re.MULTILINE | re.DOTALL):
+        # If they gave us a regexp object, extract the pattern.
+        pattern = getattr(pattern, 'pattern', pattern)
+
+        self._pattern = pattern
+        self._gaps = gaps
+        self._discard_empty = discard_empty
+        self._flags = flags
+        self._regexp = None
+        
+    def _check_regexp(self):
+        if self._regexp is None:
+            self._regexp = re.compile(self._pattern, self._flags)
+        
+    def tokenize(self, text):
+        self._check_regexp()
+        # If our regexp matches gaps, use re.split:
+        if self._gaps:
+            if self._discard_empty:
+                return [tok for tok in self._regexp.split(text) if tok]
+            else:
+                return self._regexp.split(text)
+
+        # If our regexp matches tokens, use re.findall:
+        else:
+            return self._regexp.findall(text)
+
+    def span_tokenize(self, text):
+        self._check_regexp()
+
+        if self._gaps:
+            for left, right in regexp_span_tokenize(text, self._regexp):
+                if not (self._discard_empty and left == right):
+                    yield left, right
+        else:
+            for m in re.finditer(self._regexp, text):
+                yield m.span()
+
+    def __repr__(self):
+        return ('%s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)' %
+                (self.__class__.__name__, self._pattern, self._gaps,
+                 self._discard_empty, self._flags))
+
+class WhitespaceTokenizer(RegexpTokenizer):
+    r"""
+    Tokenize a string on whitespace (space, tab, newline).
+    In general, users should use the string ``split()`` method instead.
+
+        >>> from nltk.tokenize import WhitespaceTokenizer
+        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
+        >>> WhitespaceTokenizer().tokenize(s)
+        ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
+        'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
+    """
+
+    def __init__(self):
+        RegexpTokenizer.__init__(self, r'\s+', gaps=True)
+
+class BlanklineTokenizer(RegexpTokenizer):
+    """
+    Tokenize a string, treating any sequence of blank lines as a delimiter.
+    Blank lines are defined as lines containing no characters, except for
+    space or tab characters.
+    """
+    def __init__(self):
+        RegexpTokenizer.__init__(self, r'\s*\n\s*\n\s*', gaps=True)
+
+class WordPunctTokenizer(RegexpTokenizer):
+    """
+    Tokenize a text into a sequence of alphabetic and
+    non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``.
+
+        >>> from nltk.tokenize import WordPunctTokenizer
+        >>> s = "Good muffins cost $3.88\\nin New York.  Please buy me\\ntwo of them.\\n\\nThanks."
+        >>> WordPunctTokenizer().tokenize(s)
+        ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
+        '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
+    """
+    def __init__(self):
+        RegexpTokenizer.__init__(self, r'\w+|[^\w\s]+')
+
+######################################################################
+#{ Tokenization Functions
+######################################################################
+
+def regexp_tokenize(text, pattern, gaps=False, discard_empty=True,
+                    flags=re.UNICODE | re.MULTILINE | re.DOTALL):
+    """
+    Return a tokenized copy of *text*.  See :class:`.RegexpTokenizer`
+    for descriptions of the arguments.
+    """
+    tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags)
+    return tokenizer.tokenize(text)
+
+blankline_tokenize = BlanklineTokenizer().tokenize
+wordpunct_tokenize = WordPunctTokenizer().tokenize
+
+
+
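+# The span_tokenize() method is not shown in the doctests above; it yields
+# (start, end) character offsets instead of the substrings themselves.  A small
+# sketch (offsets as expected for this sample string):
+#
+#     >>> from nltk.tokenize import WhitespaceTokenizer
+#     >>> list(WhitespaceTokenizer().span_tokenize("Good muffins cost $3.88"))
+#     [(0, 4), (5, 12), (13, 17), (18, 23)]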
diff --git a/nlp_resource_data/nltk/tokenize/regexp.pyc b/nlp_resource_data/nltk/tokenize/regexp.pyc
new file mode 100755 (executable)
index 0000000..35643f5
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/regexp.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/repp.py b/nlp_resource_data/nltk/tokenize/repp.py
new file mode 100755 (executable)
index 0000000..aa2aa6c
--- /dev/null
@@ -0,0 +1,151 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Interface to the Repp Tokenizer
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Authors: Rebecca Dridan and Stephan Oepen
+# Contributors: Liling Tan
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import unicode_literals, print_function
+from six import text_type
+
+import os
+import re
+import sys
+import subprocess
+import tempfile
+
+
+from nltk.data import ZipFilePathPointer
+from nltk.internals import find_dir
+
+from nltk.tokenize.api import TokenizerI
+
+class ReppTokenizer(TokenizerI):
+    """
+    A class for word tokenization using the REPP parser described in
+    Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to a 
+    Long Solved Problem - A Survey, Contrastive  Experiment, Recommendations, 
+    and Toolkit. In ACL. http://anthology.aclweb.org/P/P12/P12-2.pdf#page=406
+
+    >>> sents = ['Tokenization is widely regarded as a solved problem due to the high accuracy that rulebased tokenizers achieve.' ,
+    ... 'But rule-based tokenizers are hard to maintain and their rules language specific.' ,
+    ... 'We evaluated our method on three languages and obtained error rates of 0.27% (English), 0.35% (Dutch) and 0.76% (Italian) for our best models.'
+    ... ]
+    >>> tokenizer = ReppTokenizer('/home/alvas/repp/') # doctest: +SKIP
+    >>> for sent in sents:                             # doctest: +SKIP
+    ...     tokenizer.tokenize(sent)                   # doctest: +SKIP
+    ... 
+    (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
+    (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
+    (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
+
+    >>> for sent in tokenizer.tokenize_sents(sents): # doctest: +SKIP
+    ...     print sent                               # doctest: +SKIP
+    ... 
+    (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
+    (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
+    (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
+    >>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True): # doctest: +SKIP
+    ...     print sent                                                          # doctest: +SKIP
+    ... 
+    [(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)]
+    [(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)]
+    [(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)]
+    """
+    def __init__(self, repp_dir, encoding='utf8'):
+        self.repp_dir = self.find_repptokenizer(repp_dir)
+        # Set a directory to store the temporary files. 
+        self.working_dir = tempfile.gettempdir()
+        # Set an encoding for the input strings.
+        self.encoding = encoding
+        
+    def tokenize(self, sentence):
+        """
+        Use Repp to tokenize a single sentence.  
+        
+        :param sentence: A single sentence string.
+        :type sentence: str
+        :return: A tuple of tokens. 
+        :rtype: tuple(str)
+        """
+        return next(self.tokenize_sents([sentence]))
+    
+    def tokenize_sents(self, sentences, keep_token_positions=False):
+        """
+        Tokenize multiple sentences using Repp.
+                
+        :param sentences: A list of sentence strings.
+        :type sentences: list(str)
+        :return: A list of tuples of tokens
+        :rtype: iter(tuple(str))
+        """
+        with tempfile.NamedTemporaryFile(prefix='repp_input.', 
+            dir=self.working_dir, mode='w', delete=False) as input_file:
+            # Write sentences to temporary input file.
+            for sent in sentences:
+                input_file.write(text_type(sent) + '\n')
+            input_file.close()
+            # Generate command to run REPP. 
+            cmd = self.generate_repp_command(input_file.name)
+            # Decode the stdout and strip the trailing newline.
+            repp_output = self._execute(cmd).decode(self.encoding).strip()
+            for tokenized_sent in self.parse_repp_outputs(repp_output):
+                if not keep_token_positions:
+                    # Removes token position information.
+                    tokenized_sent, starts, ends = zip(*tokenized_sent)
+                yield tokenized_sent      
+        
+    def generate_repp_command(self, inputfilename):
+        """
+        This method generates the REPP command to be run at the terminal.
+        
+        :param inputfilename: path to the input file
+        :type inputfilename: str
+        """
+        cmd = [self.repp_dir + '/src/repp']
+        cmd+= ['-c', self.repp_dir + '/erg/repp.set']
+        cmd+= ['--format', 'triple']
+        cmd+= [inputfilename]
+        return cmd  
+
+    @staticmethod
+    def _execute(cmd):
+        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stdout, stderr = p.communicate()
+        return stdout
+    
+    @staticmethod    
+    def parse_repp_outputs(repp_output):
+        """
+        This method parses the tri-tuple format that REPP outputs when the
+        "--format triple" option is used, and returns a generator over the
+        tokenized sentences.
+
+        :param repp_output: the raw output produced by the REPP tokenizer
+        :type repp_output: str
+        :return: an iterable of tokenized sentences, each a list of
+            (token, start_offset, end_offset) triples
+        :rtype: iter(list(tuple))
+        """
+        line_regex = re.compile('^\((\d+), (\d+), (.+)\)$', re.MULTILINE)
+        for section in repp_output.split('\n\n'):
+            words_with_positions = [(token, int(start), int(end))
+                                    for start, end, token in 
+                                    line_regex.findall(section)]
+            words = tuple(t[2] for t in words_with_positions)
+            yield words_with_positions
+    
+    def find_repptokenizer(self, repp_dirname):
+        """
+        A method to find the REPP tokenizer binary and its *repp.set* config file.
+        """
+        if os.path.exists(repp_dirname): # If a full path is given.
+            _repp_dir = repp_dirname
+        else: # Try to find path to REPP directory in environment variables.
+            _repp_dir = find_dir(repp_dirname, env_vars=('REPP_TOKENIZER',))
+        # Checks for the REPP binary and erg/repp.set config file.
+        assert os.path.exists(_repp_dir+'/src/repp')
+        assert os.path.exists(_repp_dir+'/erg/repp.set')
+        return _repp_dir
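+
+# Because parse_repp_outputs() is a static method, the triple format can be
+# exercised without the REPP binary; the string below is a hand-written
+# stand-in for output produced with '--format triple':
+#
+#     >>> from nltk.tokenize.repp import ReppTokenizer
+#     >>> fake = '(0, 4, Good)\n(5, 12, muffins)\n\n(0, 6, Thanks)'
+#     >>> list(ReppTokenizer.parse_repp_outputs(fake))
+#     [[('Good', 0, 4), ('muffins', 5, 12)], [('Thanks', 0, 6)]]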
diff --git a/nlp_resource_data/nltk/tokenize/repp.pyc b/nlp_resource_data/nltk/tokenize/repp.pyc
new file mode 100755 (executable)
index 0000000..b2e140a
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/repp.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/sexpr.py b/nlp_resource_data/nltk/tokenize/sexpr.py
new file mode 100755 (executable)
index 0000000..d1bdb4e
--- /dev/null
@@ -0,0 +1,142 @@
+# Natural Language Toolkit: Tokenizers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Yoav Goldberg <yoavg@cs.bgu.ac.il>
+#         Steven Bird <stevenbird1@gmail.com> (minor edits)
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+"""
+S-Expression Tokenizer
+
+``SExprTokenizer`` is used to find parenthesized expressions in a
+string.  In particular, it divides a string into a sequence of
+substrings that are either parenthesized expressions (including any
+nested parenthesized expressions), or other whitespace-separated
+tokens.
+
+    >>> from nltk.tokenize import SExprTokenizer
+    >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
+    ['(a b (c d))', 'e', 'f', '(g)']
+
+By default, `SExprTokenizer` will raise a ``ValueError`` exception if
+used to tokenize an expression with non-matching parentheses:
+
+    >>> SExprTokenizer().tokenize('c) d) e (f (g')
+    Traceback (most recent call last):
+      ...
+    ValueError: Un-matched close paren at char 1
+
+The ``strict`` argument can be set to False to allow for
+non-matching parentheses.  Any unmatched close parentheses will be
+listed as their own s-expression; and the last partial sexpr with
+unmatched open parentheses will be listed as its own sexpr:
+
+    >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
+    ['c', ')', 'd', ')', 'e', '(f (g']
+
+The characters used for open and close parentheses may be customized
+using the ``parens`` argument to the `SExprTokenizer` constructor:
+
+    >>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}')
+    ['{a b {c d}}', 'e', 'f', '{g}']
+
+The s-expression tokenizer is also available as a function:
+
+    >>> from nltk.tokenize import sexpr_tokenize
+    >>> sexpr_tokenize('(a b (c d)) e f (g)')
+    ['(a b (c d))', 'e', 'f', '(g)']
+
+"""
+
+import re
+
+from nltk.tokenize.api import TokenizerI
+
+class SExprTokenizer(TokenizerI):
+    """
+    A tokenizer that divides strings into s-expressions.
+    An s-expression can be either:
+
+      - a parenthesized expression, including any nested parenthesized
+        expressions, or
+      - a sequence of non-whitespace non-parenthesis characters.
+
+    For example, the string ``(a (b c)) d e (f)`` consists of four
+    s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``.
+
+    By default, the characters ``(`` and ``)`` are treated as open and
+    close parentheses, but alternative strings may be specified.
+
+    :param parens: A two-element sequence specifying the open and close parentheses
+        that should be used to find sexprs.  This will typically be either a
+        two-character string, or a list of two strings.
+    :type parens: str or list
+    :param strict: If true, then raise an exception when tokenizing an ill-formed sexpr.
+    """
+
+    def __init__(self, parens='()', strict=True):
+        if len(parens) != 2:
+            raise ValueError('parens must contain exactly two strings')
+        self._strict = strict
+        self._open_paren = parens[0]
+        self._close_paren = parens[1]
+        self._paren_regexp = re.compile('%s|%s' % (re.escape(parens[0]),
+                                                   re.escape(parens[1])))
+
+    def tokenize(self, text):
+        """
+        Return a list of s-expressions extracted from *text*.
+        For example:
+
+            >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
+            ['(a b (c d))', 'e', 'f', '(g)']
+
+        All parentheses are assumed to mark s-expressions.
+        (No special processing is done to exclude parentheses that occur
+        inside strings, or following backslash characters.)
+
+        If the given expression contains non-matching parentheses,
+        then the behavior of the tokenizer depends on the ``strict``
+        parameter to the constructor.  If ``strict`` is ``True``, then
+        raise a ``ValueError``.  If ``strict`` is ``False``, then any
+        unmatched close parentheses will be listed as their own
+        s-expression; and the last partial s-expression with unmatched open
+        parentheses will be listed as its own s-expression:
+
+            >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
+            ['c', ')', 'd', ')', 'e', '(f (g']
+
+        :param text: the string to be tokenized
+        :type text: str or iter(str)
+        :rtype: iter(str)
+        """
+        result = []
+        pos = 0
+        depth = 0
+        for m in self._paren_regexp.finditer(text):
+            paren = m.group()
+            if depth == 0:
+                result += text[pos:m.start()].split()
+                pos = m.start()
+            if paren == self._open_paren:
+                depth += 1
+            if paren == self._close_paren:
+                if self._strict and depth == 0:
+                    raise ValueError('Un-matched close paren at char %d'
+                                     % m.start())
+                depth = max(0, depth-1)
+                if depth == 0:
+                    result.append(text[pos:m.end()])
+                    pos = m.end()
+        if self._strict and depth > 0:
+            raise ValueError('Un-matched open paren at char %d' % pos)
+        if pos < len(text):
+            result.append(text[pos:])
+        return result
+
+sexpr_tokenize = SExprTokenizer().tokenize
+
+
+
+
diff --git a/nlp_resource_data/nltk/tokenize/sexpr.pyc b/nlp_resource_data/nltk/tokenize/sexpr.pyc
new file mode 100755 (executable)
index 0000000..1594542
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/sexpr.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/simple.py b/nlp_resource_data/nltk/tokenize/simple.py
new file mode 100755 (executable)
index 0000000..2b7ffe4
--- /dev/null
@@ -0,0 +1,136 @@
+# Natural Language Toolkit: Simple Tokenizers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+r"""
+Simple Tokenizers
+
+These tokenizers divide strings into substrings using the string
+``split()`` method.
+When tokenizing using a particular delimiter string, use
+the string ``split()`` method directly, as this is more efficient.
+
+The simple tokenizers are *not* available as separate functions;
+instead, you should just use the string ``split()`` method directly:
+
+    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
+    >>> s.split()
+    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
+    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
+    >>> s.split(' ')
+    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
+    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
+    >>> s.split('\n')
+    ['Good muffins cost $3.88', 'in New York.  Please buy me',
+    'two of them.', '', 'Thanks.']
+
+The simple tokenizers are mainly useful because they follow the
+standard ``TokenizerI`` interface, and so can be used with any code
+that expects a tokenizer.  For example, these tokenizers can be used
+to specify the tokenization conventions when building a `CorpusReader`.
+
+"""
+from __future__ import unicode_literals
+from nltk.tokenize.api import TokenizerI, StringTokenizer
+from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize
+
+class SpaceTokenizer(StringTokenizer):
+    r"""Tokenize a string using the space character as a delimiter,
+    which is the same as ``s.split(' ')``.
+
+        >>> from nltk.tokenize import SpaceTokenizer
+        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
+        >>> SpaceTokenizer().tokenize(s)
+        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
+        'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
+    """
+
+    _string = ' '
+
+class TabTokenizer(StringTokenizer):
+    r"""Tokenize a string use the tab character as a delimiter,
+    the same as ``s.split('\t')``.
+
+        >>> from nltk.tokenize import TabTokenizer
+        >>> TabTokenizer().tokenize('a\tb c\n\t d')
+        ['a', 'b c\n', ' d']
+    """
+
+    _string = '\t'
+
+class CharTokenizer(StringTokenizer):
+    """Tokenize a string into individual characters.  If this functionality
+    is ever required directly, use ``for char in string``.
+    """
+
+    def tokenize(self, s):
+        return list(s)
+
+    def span_tokenize(self, s):
+        for i, j in enumerate(range(1, len(s) + 1)):
+            yield i, j
+
+class LineTokenizer(TokenizerI):
+    r"""Tokenize a string into its lines, optionally discarding blank lines.
+    This is similar to ``s.split('\n')``.
+
+        >>> from nltk.tokenize import LineTokenizer
+        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
+        >>> LineTokenizer(blanklines='keep').tokenize(s)
+        ['Good muffins cost $3.88', 'in New York.  Please buy me',
+        'two of them.', '', 'Thanks.']
+        >>> # same as [l for l in s.split('\n') if l.strip()]:
+        >>> LineTokenizer(blanklines='discard').tokenize(s)
+        ['Good muffins cost $3.88', 'in New York.  Please buy me',
+        'two of them.', 'Thanks.']
+
+    :param blanklines: Indicates how blank lines should be handled.  Valid values are:
+
+        - ``discard``: strip blank lines out of the token list before returning it.
+           A line is considered blank if it contains only whitespace characters.
+        - ``keep``: leave all blank lines in the token list.
+        - ``discard-eof``: if the string ends with a newline, then do not generate
+           a corresponding token ``''`` after that newline.
+    """
+
+    def __init__(self, blanklines='discard'):
+        valid_blanklines = ('discard', 'keep', 'discard-eof')
+        if blanklines not in valid_blanklines:
+            raise ValueError('Blank lines must be one of: %s' %
+                             ' '.join(valid_blanklines))
+
+        self._blanklines = blanklines
+
+    def tokenize(self, s):
+        lines = s.splitlines()
+        # If requested, strip off blank lines.
+        if self._blanklines == 'discard':
+            lines = [l for l in lines if l.rstrip()]
+        elif self._blanklines == 'discard-eof':
+            if lines and not lines[-1].strip():
+                lines.pop()
+        return lines
+
+    # discard-eof not implemented
+    def span_tokenize(self, s):
+        if self._blanklines == 'keep':
+            for span in string_span_tokenize(s, r'\n'):
+                yield span
+        else:
+            for span in regexp_span_tokenize(s, r'\n(\s+\n)*'):
+                yield span
+
+######################################################################
+#{ Tokenization Functions
+######################################################################
+# XXX: it is stated in the module docs that there are no function versions
+
+def line_tokenize(text, blanklines='discard'):
+    return LineTokenizer(blanklines).tokenize(text)
+
+
+
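+# A short sketch of the blankline handling (the sample string contains one
+# internal blank line):
+#
+#     >>> from nltk.tokenize.simple import LineTokenizer, line_tokenize
+#     >>> s = "one\n\ntwo\n"
+#     >>> line_tokenize(s)                        # blanklines='discard'
+#     ['one', 'two']
+#     >>> LineTokenizer(blanklines='keep').tokenize(s)
+#     ['one', '', 'two']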
diff --git a/nlp_resource_data/nltk/tokenize/simple.pyc b/nlp_resource_data/nltk/tokenize/simple.pyc
new file mode 100755 (executable)
index 0000000..29678ec
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/simple.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/stanford.py b/nlp_resource_data/nltk/tokenize/stanford.py
new file mode 100755 (executable)
index 0000000..9ac8352
--- /dev/null
@@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Interface to the Stanford Tokenizer
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Xu <xxu@student.unimelb.edu.au>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import unicode_literals, print_function
+
+import tempfile
+import os
+import json
+from subprocess import PIPE
+import warnings
+
+from six import text_type
+
+from nltk.internals import find_jar, config_java, java, _java_options
+from nltk.tokenize.api import TokenizerI
+from nltk.parse.corenlp import CoreNLPParser
+
+_stanford_url = 'https://nlp.stanford.edu/software/tokenizer.shtml'
+
+class StanfordTokenizer(TokenizerI):
+    r"""
+    Interface to the Stanford Tokenizer
+
+    >>> from nltk.tokenize.stanford import StanfordTokenizer
+    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
+    >>> StanfordTokenizer().tokenize(s)
+    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
+    >>> s = "The colour of the wall is blue."
+    >>> StanfordTokenizer(options={"americanize": True}).tokenize(s)
+    ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
+    """
+
+    _JAR = 'stanford-postagger.jar'
+
+    def __init__(self, path_to_jar=None, encoding='utf8', options=None, verbose=False, java_options='-mx1000m'):
+        # Raise deprecation warning.
+        warnings.simplefilter('always', DeprecationWarning)
+        warnings.warn(str("\nThe StanfordTokenizer will "
+                          "be deprecated in version 3.2.5.\n"
+                          "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.'"),
+                      DeprecationWarning, stacklevel=2)
+        warnings.simplefilter('ignore', DeprecationWarning)
+        self._stanford_jar = find_jar(
+            self._JAR, path_to_jar,
+            env_vars=('STANFORD_POSTAGGER',),
+            searchpath=(), url=_stanford_url,
+            verbose=verbose
+        )
+
+        self._encoding = encoding
+        self.java_options = java_options
+
+        options = {} if options is None else options
+        self._options_cmd = ','.join('{0}={1}'.format(key, val) for key, val in options.items())
+
+    @staticmethod
+    def _parse_tokenized_output(s):
+        return s.splitlines()
+
+    def tokenize(self, s):
+        """
+        Use the Stanford tokenizer's PTBTokenizer to tokenize multiple sentences.
+        """
+        cmd = [
+            'edu.stanford.nlp.process.PTBTokenizer',
+        ]
+        return self._parse_tokenized_output(self._execute(cmd, s))
+
+    def _execute(self, cmd, input_, verbose=False):
+        encoding = self._encoding
+        cmd.extend(['-charset', encoding])
+        _options_cmd = self._options_cmd
+        if _options_cmd:
+            cmd.extend(['-options', self._options_cmd])
+
+        default_options = ' '.join(_java_options)
+
+        # Configure java.
+        config_java(options=self.java_options, verbose=verbose)
+
+        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
+        with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
+            # Write the actual sentences to the temporary input file
+            if isinstance(input_, text_type) and encoding:
+                input_ = input_.encode(encoding)
+            input_file.write(input_)
+            input_file.flush()
+
+            cmd.append(input_file.name)
+
+            # Run the tagger and get the output.
+            stdout, stderr = java(cmd, classpath=self._stanford_jar,
+                                  stdout=PIPE, stderr=PIPE)
+            stdout = stdout.decode(encoding)
+
+        os.unlink(input_file.name)
+
+        # Return java configurations to their default values.
+        config_java(options=default_options, verbose=False)
+
+        return stdout
+
+
+class CoreNLPTokenizer(CoreNLPParser):
+    def __init__(self, url='http://localhost:9000', encoding='utf8'):
+        r"""
+        This is a duck-type of CoreNLPParser that has the tokenizing
+        functionality similar to the original Stanford POS tagger.
+
+            >>> from nltk.tokenize.stanford import CoreNLPTokenizer
+            >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
+            >>> CoreNLPTokenizer(url='http://localhost:9000').tokenize(s) # doctest: +SKIP
+            [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.', u'Please', u'buy', u'me', u'two', u'of', u'them', u'.', u'Thanks', u'.']
+        """
+        super(CoreNLPTokenizer, self).__init__(url, encoding)
+
+    def tokenize(self, text, properties=None):
+        """
+        Tokenize a string of text. Consistent with the StanfordTokenizer, this
+        function returns a list of strings. The original CoreNLPParser.tokenize()
+        returns a generator of strings.
+        """
+        return list(super(CoreNLPTokenizer, self).tokenize(text, properties))
+
+
+def setup_module(module):
+    from nose import SkipTest
+
+    try:
+        StanfordTokenizer()
+    except LookupError:
+        raise SkipTest('doctests from nltk.tokenize.stanford are skipped because the Stanford POS tagger jar was not found')
+
+    try:
+        CoreNLPTokenizer()
+    except LookupError:
+        raise SkipTest('doctests from nltk.tokenize.stanford.CoreNLPTokenizer are skipped because the '
+                       'Stanford CoreNLP server is not running')
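For reference, a minimal sketch of driving the (now deprecated) StanfordTokenizer defined above. It assumes a local Java runtime and a copy of stanford-postagger.jar; the path below is hypothetical and should point at your own jar (or be discovered via the STANFORD_POSTAGGER environment variable):

    from nltk.tokenize.stanford import StanfordTokenizer

    # Hypothetical jar location -- adjust to where stanford-postagger.jar lives.
    tok = StanfordTokenizer(path_to_jar='/opt/stanford-postagger/stanford-postagger.jar')
    print(tok.tokenize("Good muffins cost $3.88 in New York."))
    # Expected tokens (cf. the class doctest): ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']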
diff --git a/nlp_resource_data/nltk/tokenize/stanford.pyc b/nlp_resource_data/nltk/tokenize/stanford.pyc
new file mode 100755 (executable)
index 0000000..42e119a
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/stanford.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/stanford_segmenter.py b/nlp_resource_data/nltk/tokenize/stanford_segmenter.py
new file mode 100755 (executable)
index 0000000..077cbef
--- /dev/null
@@ -0,0 +1,246 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Interface to the Stanford Segmenter
+# for Chinese and Arabic
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: 52nlp <52nlpcn@gmail.com>
+#         Casper Lehmann-Strøm <casperlehmann@gmail.com>
+#         Alex Constantin <alex@keyworder.ch>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import unicode_literals, print_function
+
+import tempfile
+import os
+import json
+from subprocess import PIPE
+import warnings
+
+from nltk import compat
+from nltk.internals import find_jar, find_file, find_dir, \
+                           config_java, java, _java_options
+from nltk.tokenize.api import TokenizerI
+
+from six import text_type
+
+_stanford_url = 'https://nlp.stanford.edu/software'
+
+
+class StanfordSegmenter(TokenizerI):
+    """Interface to the Stanford Segmenter
+
+    If the stanford-segmenter version is older than 2016-10-31, then path_to_slf4j
+    should be provided, for example::
+
+        seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')
+
+    >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
+    >>> seg = StanfordSegmenter()
+    >>> seg.default_config('zh')
+    >>> sent = u'这是斯坦福中文分词器测试'
+    >>> print(seg.segment(sent))
+    \u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5
+    <BLANKLINE>
+    >>> seg.default_config('ar')
+    >>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
+    >>> print(seg.segment(sent.split()))
+    \u0647\u0630\u0627 \u0647\u0648 \u062a\u0635\u0646\u064a\u0641 \u0633\u062a\u0627\u0646\u0641\u0648\u0631\u062f \u0627\u0644\u0639\u0631\u0628\u064a \u0644 \u0627\u0644\u0643\u0644\u0645\u0627\u062a
+    <BLANKLINE>
+    """
+
+    _JAR = 'stanford-segmenter.jar'
+
+    def __init__(self,
+                 path_to_jar=None,
+                 path_to_slf4j=None,
+                 java_class=None,
+                 path_to_model=None,
+                 path_to_dict=None,
+                 path_to_sihan_corpora_dict=None,
+                 sihan_post_processing='false',
+                 keep_whitespaces='false',
+                 encoding='UTF-8', options=None,
+                 verbose=False, java_options='-mx2g'):
+        # Raise deprecation warning.
+        warnings.simplefilter('always', DeprecationWarning)
+        warnings.warn(str("\nThe StanfordSegmenter will "
+                          "be deprecated in version 3.2.5.\n"
+                          "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead."),
+                      DeprecationWarning, stacklevel=2)
+        warnings.simplefilter('ignore', DeprecationWarning)
+
+        stanford_segmenter = find_jar(
+                self._JAR, path_to_jar,
+                env_vars=('STANFORD_SEGMENTER',),
+                searchpath=(), url=_stanford_url,
+                verbose=verbose)
+        if path_to_slf4j is not None:
+            slf4j = find_jar(
+                'slf4j-api.jar', path_to_slf4j,
+                env_vars=('SLF4J', 'STANFORD_SEGMENTER',),
+                searchpath=(), url=_stanford_url,
+                verbose=verbose)
+        else:
+            slf4j = None
+
+        # This is passed to java as the -cp option; the old version of the segmenter needs slf4j,
+        # while stanford-segmenter-2016-10-31 and newer do not.
+        self._stanford_jar = os.pathsep.join(
+            _ for _ in [stanford_segmenter, slf4j] if _ is not None
+        )
+
+        self._java_class = java_class
+        self._model = path_to_model
+        self._sihan_corpora_dict = path_to_sihan_corpora_dict
+        self._sihan_post_processing = sihan_post_processing
+        self._keep_whitespaces = keep_whitespaces
+        self._dict = path_to_dict
+
+        self._encoding = encoding
+        self.java_options = java_options
+        options = {} if options is None else options
+        self._options_cmd = ','.join('{0}={1}'.format(key, json.dumps(val)) for key, val in options.items())
+
+    def default_config(self, lang):
+        """
+        Attempt to initialize the Stanford Word Segmenter for the specified language,
+        using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables.
+        """
+
+        search_path = ()
+        if os.environ.get('STANFORD_SEGMENTER'):
+            search_path = {os.path.join(os.environ.get('STANFORD_SEGMENTER'), 'data')}
+
+        # init for Chinese-specific files
+        self._dict = None
+        self._sihan_corpora_dict = None
+        self._sihan_post_processing = 'false'
+
+        if lang == 'ar':
+            self._java_class = 'edu.stanford.nlp.international.arabic.process.ArabicSegmenter'
+            model = 'arabic-segmenter-atb+bn+arztrain.ser.gz'
+
+        elif lang == 'zh':
+            self._java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
+            model = 'pku.gz'
+            self._sihan_post_processing = 'true'
+
+            path_to_dict = 'dict-chris6.ser.gz'
+            try:
+                self._dict = find_file(path_to_dict, searchpath=search_path,
+                                       url=_stanford_url, verbose=False,
+                                       env_vars=('STANFORD_MODELS',))
+            except LookupError:
+                raise LookupError("Could not find '%s' (tried using env. "
+                    "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % path_to_dict)
+
+            sihan_dir = './data/'
+            try:
+                path_to_sihan_dir = find_dir(sihan_dir,
+                                             url=_stanford_url, verbose=False,
+                                             env_vars=('STANFORD_SEGMENTER',))
+                self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
+            except LookupError:
+                raise LookupError("Could not find '%s' (tried using the "
+                    "STANFORD_SEGMENTER environment variable)" % sihan_dir)
+        else:
+            raise LookupError("Unsupported language '%'" % lang)
+
+        try:
+            self._model = find_file(model, searchpath=search_path,
+                                    url=_stanford_url, verbose=False,
+                                    env_vars=('STANFORD_MODELS', 'STANFORD_SEGMENTER',))
+        except LookupError:
+            raise LookupError("Could not find '%s' (tried using env. "
+                "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model)
+
+    def tokenize(self, s):
+        # Delegate to TokenizerI.tokenize(); spelled out for Python 2 compatibility.
+        return super(StanfordSegmenter, self).tokenize(s)
+
+    def segment_file(self, input_file_path):
+        """
+        """
+        cmd = [
+            self._java_class,
+            '-loadClassifier', self._model,
+            '-keepAllWhitespaces', self._keep_whitespaces,
+            '-textFile', input_file_path
+        ]
+        if self._sihan_corpora_dict is not None:
+            cmd.extend(['-serDictionary', self._dict,
+                        '-sighanCorporaDict', self._sihan_corpora_dict,
+                        '-sighanPostProcessing', self._sihan_post_processing])
+
+        stdout = self._execute(cmd)
+
+        return stdout
+
+    def segment(self, tokens):
+        return self.segment_sents([tokens])
+
+    def segment_sents(self, sentences):
+        """
+        """
+        encoding = self._encoding
+        # Create a temporary input file
+        _input_fh, self._input_file_path = tempfile.mkstemp(text=True)
+
+        # Write the actual sentences to the temporary input file
+        _input_fh = os.fdopen(_input_fh, 'wb')
+        _input = '\n'.join((' '.join(x) for x in sentences))
+        if isinstance(_input, text_type) and encoding:
+            _input = _input.encode(encoding)
+        _input_fh.write(_input)
+        _input_fh.close()
+
+        cmd = [
+            self._java_class,
+            '-loadClassifier', self._model,
+            '-keepAllWhitespaces', self._keep_whitespaces,
+            '-textFile', self._input_file_path
+        ]
+        if self._sihan_corpora_dict is not None:
+            cmd.extend(['-serDictionary', self._dict,
+                        '-sighanCorporaDict', self._sihan_corpora_dict,
+                        '-sighanPostProcessing', self._sihan_post_processing])
+
+        stdout = self._execute(cmd)
+
+        # Delete the temporary file
+        os.unlink(self._input_file_path)
+
+        return stdout
+
+    def _execute(self, cmd, verbose=False):
+        encoding = self._encoding
+        cmd.extend(['-inputEncoding', encoding])
+        _options_cmd = self._options_cmd
+        if _options_cmd:
+            cmd.extend(['-options', self._options_cmd])
+
+        default_options = ' '.join(_java_options)
+
+        # Configure java.
+        config_java(options=self.java_options, verbose=verbose)
+
+        stdout, _stderr = java(cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE)
+        stdout = stdout.decode(encoding)
+
+        # Return java configurations to their default values.
+        config_java(options=default_options, verbose=False)
+
+        return stdout
+
+
+def setup_module(module):
+    from nose import SkipTest
+
+    try:
+        seg = StanfordSegmenter()
+        seg.default_config('ar')
+        seg.default_config('zh')
+    except LookupError as e:
+        raise SkipTest('Tests for nltk.tokenize.stanford_segmenter skipped: %s' % str(e))
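For reference, a minimal sketch of the StanfordSegmenter above, mirroring its doctest. It assumes Java is installed and that the STANFORD_SEGMENTER / STANFORD_MODELS environment variables point at an unpacked stanford-segmenter distribution so that default_config() can locate the model files:

    # -*- coding: utf-8 -*-
    from nltk.tokenize.stanford_segmenter import StanfordSegmenter

    seg = StanfordSegmenter()
    seg.default_config('zh')   # loads the pku.gz CRF model and dict-chris6.ser.gz
    print(seg.segment(u'这是斯坦福中文分词器测试'))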
diff --git a/nlp_resource_data/nltk/tokenize/stanford_segmenter.pyc b/nlp_resource_data/nltk/tokenize/stanford_segmenter.pyc
new file mode 100755 (executable)
index 0000000..c67cfdb
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/stanford_segmenter.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/texttiling.py b/nlp_resource_data/nltk/tokenize/texttiling.py
new file mode 100755 (executable)
index 0000000..cf2e46e
--- /dev/null
@@ -0,0 +1,458 @@
+# Natural Language Toolkit: TextTiling
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: George Boutsioukis
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+import re
+import math
+
+try:
+    import numpy
+except ImportError:
+    pass
+
+from nltk.tokenize.api import TokenizerI
+
+BLOCK_COMPARISON, VOCABULARY_INTRODUCTION = 0, 1
+LC, HC = 0, 1
+DEFAULT_SMOOTHING = [0]
+
+
+class TextTilingTokenizer(TokenizerI):
+    """Tokenize a document into topical sections using the TextTiling algorithm.
+    This algorithm detects subtopic shifts based on the analysis of lexical
+    co-occurrence patterns.
+
+    The process starts by tokenizing the text into pseudosentences of
+    a fixed size w. Then, depending on the method used, similarity
+    scores are assigned at sentence gaps. The algorithm proceeds by
+    detecting the peak differences between these scores and marking
+    them as boundaries. The boundaries are normalized to the closest
+    paragraph break and the segmented text is returned.
+
+    :param w: Pseudosentence size
+    :type w: int
+    :param k: Size (in sentences) of the block used in the block comparison method
+    :type k: int
+    :param similarity_method: The method used for determining similarity scores:
+       `BLOCK_COMPARISON` (default) or `VOCABULARY_INTRODUCTION`.
+    :type similarity_method: constant
+    :param stopwords: A list of stopwords that are filtered out (defaults to NLTK's stopwords corpus)
+    :type stopwords: list(str)
+    :param smoothing_method: The method used for smoothing the score plot:
+      `DEFAULT_SMOOTHING` (default)
+    :type smoothing_method: constant
+    :param smoothing_width: The width of the window used by the smoothing method
+    :type smoothing_width: int
+    :param smoothing_rounds: The number of smoothing passes
+    :type smoothing_rounds: int
+    :param cutoff_policy: The policy used to determine the number of boundaries:
+      `HC` (default) or `LC`
+    :type cutoff_policy: constant
+
+    >>> from nltk.corpus import brown
+    >>> tt = TextTilingTokenizer(demo_mode=True)
+    >>> text = brown.raw()[:4000]
+    >>> s, ss, d, b = tt.tokenize(text)
+    >>> b
+    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]
+    """
+
+    def __init__(self,
+                 w=20,
+                 k=10,
+                 similarity_method=BLOCK_COMPARISON,
+                 stopwords=None,
+                 smoothing_method=DEFAULT_SMOOTHING,
+                 smoothing_width=2,
+                 smoothing_rounds=1,
+                 cutoff_policy=HC,
+                 demo_mode=False):
+
+
+        if stopwords is None:
+            from nltk.corpus import stopwords
+            stopwords = stopwords.words('english')
+        self.__dict__.update(locals())
+        del self.__dict__['self']
+
+    def tokenize(self, text):
+        """Return a tokenized copy of *text*, where each "token" represents
+        a separate topic."""
+
+        lowercase_text = text.lower()
+        paragraph_breaks = self._mark_paragraph_breaks(text)
+        text_length = len(lowercase_text)
+
+        # Tokenization step starts here
+
+        # Remove punctuation
+        nopunct_text = ''.join(c for c in lowercase_text
+                               if re.match("[a-z\-\' \n\t]", c))
+        nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text)
+
+        tokseqs = self._divide_to_tokensequences(nopunct_text)
+
+        # The morphological stemming step mentioned in the TextTile
+        # paper is not implemented.  A comment in the original C
+        # implementation states that it offers no benefit to the
+        # process. It might be interesting to test the existing
+        # stemmers though.
+        #words = _stem_words(words)
+
+        # Filter stopwords
+        for ts in tokseqs:
+            ts.wrdindex_list = [wi for wi in ts.wrdindex_list
+                                if wi[0] not in self.stopwords]
+
+        token_table = self._create_token_table(tokseqs, nopunct_par_breaks)
+        # End of the Tokenization step
+
+        # Lexical score determination
+        if self.similarity_method == BLOCK_COMPARISON:
+            gap_scores = self._block_comparison(tokseqs, token_table)
+        elif self.similarity_method == VOCABULARY_INTRODUCTION:
+            raise NotImplementedError("Vocabulary introduction not implemented")
+
+        if self.smoothing_method == DEFAULT_SMOOTHING:
+            smooth_scores = self._smooth_scores(gap_scores)
+        # End of Lexical score Determination
+
+        # Boundary identification
+        depth_scores = self._depth_scores(smooth_scores)
+        segment_boundaries = self._identify_boundaries(depth_scores)
+
+        normalized_boundaries = self._normalize_boundaries(text,
+                                                           segment_boundaries,
+                                                           paragraph_breaks)
+        # End of Boundary Identification
+        segmented_text = []
+        prevb = 0
+
+        for b in normalized_boundaries:
+            if b == 0:
+                continue
+            segmented_text.append(text[prevb:b])
+            prevb = b
+
+        if prevb < text_length: # append any text that may be remaining
+            segmented_text.append(text[prevb:])
+
+        if not segmented_text:
+            segmented_text = [text]
+
+        if self.demo_mode:
+            return gap_scores, smooth_scores, depth_scores, segment_boundaries
+        return segmented_text
+
+    def _block_comparison(self, tokseqs, token_table):
+        "Implements the block comparison method"
+        def blk_frq(tok, block):
+            ts_occs = filter(lambda o: o[0] in block,
+                             token_table[tok].ts_occurences)
+            freq = sum([tsocc[1] for tsocc in ts_occs])
+            return freq
+
+        gap_scores = []
+        numgaps = len(tokseqs)-1
+
+        for curr_gap in range(numgaps):
+            score_dividend, score_divisor_b1, score_divisor_b2 = 0.0, 0.0, 0.0
+            score = 0.0
+            #adjust window size for boundary conditions
+            if curr_gap < self.k-1:
+                window_size = curr_gap + 1
+            elif curr_gap > numgaps-self.k:
+                window_size = numgaps - curr_gap
+            else:
+                window_size = self.k
+
+            b1 = [ts.index
+                  for ts in tokseqs[curr_gap-window_size+1 : curr_gap+1]]
+            b2 = [ts.index
+                  for ts in tokseqs[curr_gap+1 : curr_gap+window_size+1]]
+
+            for t in token_table:
+                score_dividend += blk_frq(t, b1)*blk_frq(t, b2)
+                score_divisor_b1 += blk_frq(t, b1)**2
+                score_divisor_b2 += blk_frq(t, b2)**2
+            try:
+                score = score_dividend/math.sqrt(score_divisor_b1*
+                                                 score_divisor_b2)
+            except ZeroDivisionError:
+                pass # score += 0.0
+
+            gap_scores.append(score)
+
+        return gap_scores
+
+    def _smooth_scores(self, gap_scores):
+        "Wraps the smooth function from the SciPy Cookbook"
+        return list(smooth(numpy.array(gap_scores[:]),
+                           window_len = self.smoothing_width+1))
+
+    def _mark_paragraph_breaks(self, text):
+        """Identifies indented text or line breaks as the beginning of
+        paragraphs"""
+        MIN_PARAGRAPH = 100
+        pattern = re.compile("[ \t\r\f\v]*\n[ \t\r\f\v]*\n[ \t\r\f\v]*")
+        matches = pattern.finditer(text)
+
+        last_break = 0
+        pbreaks = [0]
+        for pb in matches:
+            if pb.start()-last_break < MIN_PARAGRAPH:
+                continue
+            else:
+                pbreaks.append(pb.start())
+                last_break = pb.start()
+
+        return pbreaks
+
+    def _divide_to_tokensequences(self, text):
+        "Divides the text into pseudosentences of fixed size"
+        w = self.w
+        wrdindex_list = []
+        matches = re.finditer("\w+", text)
+        for match in matches:
+            wrdindex_list.append((match.group(), match.start()))
+        return [TokenSequence(i // w, wrdindex_list[i:i+w])
+                for i in range(0, len(wrdindex_list), w)]
+
+    def _create_token_table(self, token_sequences, par_breaks):
+        "Creates a table of TokenTableFields"
+        token_table = {}
+        current_par = 0
+        current_tok_seq = 0
+        pb_iter = par_breaks.__iter__()
+        current_par_break = next(pb_iter)
+        if current_par_break == 0:
+            try:
+                current_par_break = next(pb_iter) #skip break at 0
+            except StopIteration:
+                raise ValueError(
+                    "No paragraph breaks were found(text too short perhaps?)"
+                    )
+        for ts in token_sequences:
+            for word, index in ts.wrdindex_list:
+                try:
+                    while index > current_par_break:
+                        current_par_break = next(pb_iter)
+                        current_par += 1
+                except StopIteration:
+                    #hit bottom
+                    pass
+
+                if word in token_table:
+                    token_table[word].total_count += 1
+
+                    if token_table[word].last_par != current_par:
+                        token_table[word].last_par = current_par
+                        token_table[word].par_count += 1
+
+                    if token_table[word].last_tok_seq != current_tok_seq:
+                        token_table[word].last_tok_seq = current_tok_seq
+                        token_table[word]\
+                                .ts_occurences.append([current_tok_seq,1])
+                    else:
+                        token_table[word].ts_occurences[-1][1] += 1
+                else: #new word
+                    token_table[word] = TokenTableField(first_pos=index,
+                                                        ts_occurences= \
+                                                          [[current_tok_seq,1]],
+                                                        total_count=1,
+                                                        par_count=1,
+                                                        last_par=current_par,
+                                                        last_tok_seq= \
+                                                          current_tok_seq)
+
+            current_tok_seq += 1
+
+        return token_table
+
+    def _identify_boundaries(self, depth_scores):
+        """Identifies boundaries at the peaks of similarity score
+        differences"""
+
+        boundaries = [0 for x in depth_scores]
+
+        avg = sum(depth_scores)/len(depth_scores)
+        stdev = numpy.std(depth_scores)
+
+        #SB: what is the purpose of this conditional?
+        if self.cutoff_policy == LC:
+            cutoff = avg-stdev/2.0
+        else:
+            cutoff = avg-stdev/2.0
+
+        depth_tuples = sorted(zip(depth_scores, range(len(depth_scores))))
+        depth_tuples.reverse()
+        hp = list(filter(lambda x:x[0]>cutoff, depth_tuples))
+
+        for dt in hp:
+            boundaries[dt[1]] = 1
+            for dt2 in hp: #undo if there is a boundary close already
+                if dt[1] != dt2[1] and abs(dt2[1]-dt[1]) < 4 \
+                       and boundaries[dt2[1]] == 1:
+                    boundaries[dt[1]] = 0
+        return boundaries
+
+    def _depth_scores(self, scores):
+        """Calculates the depth of each gap, i.e. the average difference
+        between the left and right peaks and the gap's score"""
+
+        depth_scores = [0 for x in scores]
+        # Clip boundaries: this follows the rule of thumb (my thumb)
+        # that a section shouldn't be smaller than 2 pseudosentences
+        # for small texts and around 5 for larger ones.
+
+        clip = min(max(len(scores) // 10, 2), 5)
+        index = clip
+
+        for gapscore in scores[clip:-clip]:
+            lpeak = gapscore
+            for score in scores[index::-1]:
+                if score >= lpeak:
+                    lpeak = score
+                else:
+                    break
+            rpeak = gapscore
+            for score in scores[index:]:
+                if score >= rpeak:
+                    rpeak = score
+                else:
+                    break
+            depth_scores[index] = lpeak + rpeak - 2 * gapscore
+            index += 1
+
+        return depth_scores
+
+    def _normalize_boundaries(self, text, boundaries, paragraph_breaks):
+        """Normalize the boundaries identified to the original text's
+        paragraph breaks"""
+
+        norm_boundaries = []
+        char_count, word_count, gaps_seen = 0, 0, 0
+        seen_word = False
+
+        for char in text:
+            char_count += 1
+            if char in " \t\n" and seen_word:
+                seen_word = False
+                word_count += 1
+            if char not in " \t\n" and not seen_word:
+                seen_word=True
+            if gaps_seen < len(boundaries) and word_count > \
+                                               (max(gaps_seen*self.w, self.w)):
+                if boundaries[gaps_seen] == 1:
+                    #find closest paragraph break
+                    best_fit = len(text)
+                    for br in paragraph_breaks:
+                        if best_fit > abs(br-char_count):
+                            best_fit = abs(br-char_count)
+                            bestbr = br
+                        else:
+                            break
+                    if bestbr not in norm_boundaries: #avoid duplicates
+                        norm_boundaries.append(bestbr)
+                gaps_seen += 1
+
+        return norm_boundaries
+
+
+class TokenTableField(object):
+    """A field in the token table holding parameters for each token,
+    used later in the process"""
+    def __init__(self,
+                 first_pos,
+                 ts_occurences,
+                 total_count=1,
+                 par_count=1,
+                 last_par=0,
+                 last_tok_seq=None):
+        self.__dict__.update(locals())
+        del self.__dict__['self']
+
+class TokenSequence(object):
+    "A token list with its original length and its index"
+    def __init__(self,
+                 index,
+                 wrdindex_list,
+                 original_length=None):
+        original_length=original_length or len(wrdindex_list)
+        self.__dict__.update(locals())
+        del self.__dict__['self']
+
+
+#Pasted from the SciPy cookbook: http://www.scipy.org/Cookbook/SignalSmooth
+def smooth(x,window_len=11,window='flat'):
+    """smooth the data using a window with requested size.
+
+    This method is based on the convolution of a scaled window with the signal.
+    The signal is prepared by introducing reflected copies of the signal
+    (with the window size) in both ends so that transient parts are minimized
+    in the beginning and end part of the output signal.
+
+    :param x: the input signal
+    :param window_len: the dimension of the smoothing window; should be an odd integer
+    :param window: the type of window from 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'
+        flat window will produce a moving average smoothing.
+
+    :return: the smoothed signal
+
+    example::
+
+        t=linspace(-2,2,0.1)
+        x=sin(t)+randn(len(t))*0.1
+        y=smooth(x)
+
+    :see also: numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve,
+        scipy.signal.lfilter
+
+    TODO: the window parameter could be the window itself if an array instead of a string
+    """
+
+    if x.ndim != 1:
+        raise ValueError("smooth only accepts 1 dimension arrays.")
+
+    if x.size < window_len:
+        raise ValueError("Input vector needs to be bigger than window size.")
+
+    if window_len < 3:
+        return x
+
+    if window not in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
+        raise ValueError("Window must be one of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'")
+
+    s=numpy.r_[2*x[0]-x[window_len:1:-1],x,2*x[-1]-x[-1:-window_len:-1]]
+
+    #print(len(s))
+    if window == 'flat': #moving average
+        w = numpy.ones(window_len,'d')
+    else:
+        w = getattr(numpy, window)(window_len)  # e.g. numpy.hanning(window_len)
+
+    y = numpy.convolve(w/w.sum(), s, mode='same')
+
+    return y[window_len-1:-window_len+1]
+
+
+def demo(text=None):
+    from nltk.corpus import brown
+    from matplotlib import pylab
+    tt = TextTilingTokenizer(demo_mode=True)
+    if text is None: text = brown.raw()[:10000]
+    s, ss, d, b = tt.tokenize(text)
+    pylab.xlabel("Sentence Gap index")
+    pylab.ylabel("Gap Scores")
+    pylab.plot(range(len(s)), s, label="Gap Scores")
+    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
+    pylab.plot(range(len(d)), d, label="Depth scores")
+    pylab.stem(range(len(b)), b)
+    pylab.legend()
+    pylab.show()
+
+
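For reference, a minimal sketch of TextTilingTokenizer in its default (non-demo) mode, which returns the topical sections as strings. It assumes numpy and the NLTK brown and stopwords corpora are available to this tree:

    from nltk.corpus import brown
    from nltk.tokenize.texttiling import TextTilingTokenizer

    tt = TextTilingTokenizer()
    sections = tt.tokenize(brown.raw()[:4000])
    print(len(sections))        # number of topical sections detected
    print(sections[0][:200])    # beginning of the first section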
diff --git a/nlp_resource_data/nltk/tokenize/texttiling.pyc b/nlp_resource_data/nltk/tokenize/texttiling.pyc
new file mode 100755 (executable)
index 0000000..025f18a
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/texttiling.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/toktok.py b/nlp_resource_data/nltk/tokenize/toktok.py
new file mode 100755 (executable)
index 0000000..3c46373
--- /dev/null
@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Python port of the tok-tok.pl tokenizer.
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Jon Dehdari
+# Contributors: Liling Tan, Selcuk Ayguney, ikegami, Martijn Pieters
+#
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+"""
+The tok-tok tokenizer is a simple, general tokenizer that expects one sentence
+per line of input; thus only the final period is tokenized.
+
+Tok-tok has been tested on, and gives reasonably good results for English, 
+Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others. 
+The input should be in UTF-8 encoding.
+
+Reference:
+Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language 
+Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University. 
+"""
+
+import re
+from six import text_type
+
+from nltk.tokenize.api import TokenizerI
+
+class ToktokTokenizer(TokenizerI):
+    """
+    This is a Python port of the tok-tok.pl from
+    https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl
+    
+    >>> toktok = ToktokTokenizer()
+    >>> text = u'Is 9.5 or 525,600 my favorite number?'
+    >>> print (toktok.tokenize(text, return_str=True))
+    Is 9.5 or 525,600 my favorite number ?
+    >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
+    >>> print (toktok.tokenize(text, return_str=True))
+    The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
+    >>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
+    >>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
+    >>> assert toktok.tokenize(text, return_str=True) == expected
+    >>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
+    True
+    """
+    # Replace non-breaking spaces with normal spaces.
+    NON_BREAKING = re.compile(u"\u00A0"), " "
+    
+    # Pad some funky punctuation.
+    FUNKY_PUNCT_1 = re.compile(u'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 "
+    # Pad more funky punctuation.
+    FUNKY_PUNCT_2 = re.compile(u'([({\[“‘„‚«‹「『])'), r" \1 "
+    # Pad En dash and em dash
+    EN_EM_DASHES = re.compile(u'([–—])'), r" \1 "
+    
+    # Replace problematic character with numeric character reference.
+    AMPERCENT = re.compile('& '), '&amp; '
+    TAB = re.compile('\t'), ' &#9; '
+    PIPE = re.compile('\|'), ' &#124; '
+    
+    # Pad numbers with commas to keep them from further tokenization. 
+    COMMA_IN_NUM = re.compile(r'(?<!,)([,،])(?![,\d])'), r' \1 '
+    
+    # Just pad problematic (often neurotic) hyphen/single quote, etc.
+    PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r' \1 '
+    # Group ` ` stupid quotes ' ' into a single token.
+    STUPID_QUOTES_1 = re.compile(r" ` ` "), r" `` "
+    STUPID_QUOTES_2 = re.compile(r" ' ' "), r" '' "
+    
+    # Don't tokenize period unless it ends the line and that it isn't 
+    # preceded by another period, e.g.  
+    # "something ..." -> "something ..." 
+    # "something." -> "something ." 
+    FINAL_PERIOD_1 = re.compile(r"(?<!\.)\.$"), r" ."
+    # Don't tokenize period unless it ends the line eg. 
+    # " ... stuff." ->  "... stuff ."
+    FINAL_PERIOD_2 = re.compile(r"""(?<!\.)\.\s*(["'’»›”]) *$"""), r" . \1"
+
+    # Treat runs of commas as a single token (fake German/Czech low quotes, e.g. „).
+    MULTI_COMMAS = re.compile(r'(,{2,})'), r' \1 '
+    # Treat continuous dashes as fake en-dash, etc.
+    MULTI_DASHES = re.compile(r'(-{2,})'), r' \1 '
+    # Treat multiple periods as a thing (eg. ellipsis)
+    MULTI_DOTS = re.compile(r'(\.{2,})'), r' \1 '
+
+    # This is the \p{Open_Punctuation} from Perl's perluniprops
+    # see http://perldoc.perl.org/perluniprops.html
+    OPEN_PUNCT = text_type(u'([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d'
+                            u'\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772'
+                            u'\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983'
+                            u'\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993'
+                            u'\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26'
+                            u'\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016'
+                            u'\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39'
+                            u'\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b'
+                            u'\ufe5d\uff08\uff3b\uff5b\uff5f\uff62')
+    # This is the \p{Close_Punctuation} from Perl's perluniprops
+    CLOSE_PUNCT = text_type(u')]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a'
+                            u'\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6'
+                            u'\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988'
+                            u'\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998'
+                            u'\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009'
+                            u'\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b'
+                            u'\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c'
+                            u'\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e'
+                            u'\uff09\uff3d\uff5d\uff60\uff63')
+    # This is the \p{Currency_Symbol} from Perl's perluniprops
+    CURRENCY_SYM = text_type(u'$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb'
+                             u'\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3'
+                             u'\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab'
+                             u'\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3'
+                             u'\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838'
+                             u'\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6')
+    
+    # Pad spaces after opening punctuations.
+    OPEN_PUNCT_RE = re.compile(u'([{}])'.format(OPEN_PUNCT)), r'\1 '
+    # Pad spaces before closing punctuations.
+    CLOSE_PUNCT_RE = re.compile(u'([{}])'.format(CLOSE_PUNCT)), r'\1 '
+    # Pad spaces after currency symbols.
+    CURRENCY_SYM_RE = re.compile(u'([{}])'.format(CURRENCY_SYM)), r'\1 '
+    
+    # Use for tokenizing URL-unfriendly characters: [:/?#]
+    URL_FOE_1 = re.compile(r':(?!//)'), r' : ' # in perl s{:(?!//)}{ : }g;
+    URL_FOE_2 = re.compile(r'\?(?!\S)'), r' ? ' # in perl s{\?(?!\S)}{ ? }g;
+    # in perl: m{://} or m{\S+\.\S+/\S+} or s{/}{ / }g;
+    URL_FOE_3 = re.compile(r'(:\/\/)[\S+\.\S+\/\S+][\/]'), ' / '
+    URL_FOE_4 = re.compile(r' /'), r' / ' # s{ /}{ / }g;
+    
+    # Left/right strip, i.e. remove leading/trailing spaces.
+    # These strip regexes should NOT be used;
+    # use str.lstrip(), str.rstrip() or str.strip() instead.
+    # (They are kept only for reference to the original toktok.pl code.)
+    LSTRIP = re.compile(r'^ +'), ''
+    RSTRIP = re.compile(r'\s+$'),'\n' 
+    # Merge multiple spaces.
+    ONE_SPACE = re.compile(r' {2,}'), ' '
+    
+    TOKTOK_REGEXES = [NON_BREAKING, FUNKY_PUNCT_1, 
+                      URL_FOE_1, URL_FOE_2, URL_FOE_3, URL_FOE_4,
+                      AMPERCENT, TAB, PIPE,
+                      OPEN_PUNCT_RE, CLOSE_PUNCT_RE, 
+                      MULTI_COMMAS, COMMA_IN_NUM, FINAL_PERIOD_2,
+                      PROB_SINGLE_QUOTES, STUPID_QUOTES_1, STUPID_QUOTES_2,
+                      CURRENCY_SYM_RE, EN_EM_DASHES, MULTI_DASHES, MULTI_DOTS,
+                      FINAL_PERIOD_1, FINAL_PERIOD_2, ONE_SPACE]
+    
+    def tokenize(self, text, return_str=False):
+        text = text_type(text) # Converts input string into unicode.
+        for regexp, substitution in self.TOKTOK_REGEXES:
+            text = regexp.sub(substitution, text)
+        # Finally, strip leading and trailing spaces
+        # and make sure the output string is unicode.
+        text = text_type(text.strip())
+        return text if return_str else text.split()
\ No newline at end of file
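For reference, a minimal sketch of ToktokTokenizer, following its doctests (assuming this tree is importable as the nltk package):

    from nltk.tokenize.toktok import ToktokTokenizer

    toktok = ToktokTokenizer()
    print(toktok.tokenize(u'Is 9.5 or 525,600 my favorite number?'))
    # ['Is', '9.5', 'or', '525,600', 'my', 'favorite', 'number', '?']
    print(toktok.tokenize(u'Is 9.5 or 525,600 my favorite number?', return_str=True))
    # Is 9.5 or 525,600 my favorite number ?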
diff --git a/nlp_resource_data/nltk/tokenize/toktok.pyc b/nlp_resource_data/nltk/tokenize/toktok.pyc
new file mode 100755 (executable)
index 0000000..1ff7f90
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/toktok.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/treebank.py b/nlp_resource_data/nltk/tokenize/treebank.py
new file mode 100755 (executable)
index 0000000..f3ae637
--- /dev/null
@@ -0,0 +1,343 @@
+# Natural Language Toolkit: Tokenizers
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
+#
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+r"""
+
+Penn Treebank Tokenizer
+
+The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
+This implementation is a port of the tokenizer sed script written by Robert McIntyre
+and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
+"""
+
+import re
+from nltk.tokenize.api import TokenizerI
+from nltk.tokenize.util import align_tokens
+
+
+class MacIntyreContractions:
+    """
+    List of contractions adapted from Robert MacIntyre's tokenizer.
+    """
+    CONTRACTIONS2 = [r"(?i)\b(can)(?#X)(not)\b",
+                     r"(?i)\b(d)(?#X)('ye)\b",
+                     r"(?i)\b(gim)(?#X)(me)\b",
+                     r"(?i)\b(gon)(?#X)(na)\b",
+                     r"(?i)\b(got)(?#X)(ta)\b",
+                     r"(?i)\b(lem)(?#X)(me)\b",
+                     r"(?i)\b(mor)(?#X)('n)\b",
+                     r"(?i)\b(wan)(?#X)(na)\s"]
+    CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
+    CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b",
+                     r"(?i)\b(wha)(t)(cha)\b"]
+
+
+class TreebankWordTokenizer(TokenizerI):
+    """
+    The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
+    This is the method that is invoked by ``word_tokenize()``.  It assumes that the
+    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
+
+    This tokenizer performs the following steps:
+
+    - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
+    - treat most punctuation characters as separate tokens
+    - split off commas and single quotes, when followed by whitespace
+    - separate periods that appear at the end of line
+
+        >>> from nltk.tokenize import TreebankWordTokenizer
+        >>> s = '''Good muffins cost $3.88\\nin New York.  Please buy me\\ntwo of them.\\nThanks.'''
+        >>> TreebankWordTokenizer().tokenize(s)
+        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
+        >>> s = "They'll save and invest more."
+        >>> TreebankWordTokenizer().tokenize(s)
+        ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
+        >>> s = "hi, my name can't hello,"
+        >>> TreebankWordTokenizer().tokenize(s)
+        ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
+    """
+
+    #starting quotes
+    STARTING_QUOTES = [
+        (re.compile(r'^\"'), r'``'),
+        (re.compile(r'(``)'), r' \1 '),
+        (re.compile(r'([ (\[{<])"'), r'\1 `` '),
+    ]
+
+    #punctuation
+    PUNCTUATION = [
+        (re.compile(r'([:,])([^\d])'), r' \1 \2'),
+        (re.compile(r'([:,])$'), r' \1 '),
+        (re.compile(r'\.\.\.'), r' ... '),
+        (re.compile(r'[;@#$%&]'), r' \g<0> '),
+        (re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), r'\1 \2\3 '), # Handles the final period.
+        (re.compile(r'[?!]'), r' \g<0> '),
+
+        (re.compile(r"([^'])' "), r"\1 ' "),
+    ]
+
+    # Pads parentheses
+    PARENS_BRACKETS = (re.compile(r'[\]\[\(\)\{\}\<\>]'), r' \g<0> ')
+
+    # Optionally: convert parentheses and brackets to PTB symbols.
+    CONVERT_PARENTHESES = [
+        (re.compile(r'\('), '-LRB-'), (re.compile(r'\)'), '-RRB-'),
+        (re.compile(r'\['), '-LSB-'), (re.compile(r'\]'), '-RSB-'),
+        (re.compile(r'\{'), '-LCB-'), (re.compile(r'\}'), '-RCB-')
+    ]
+
+    DOUBLE_DASHES = (re.compile(r'--'), r' -- ')
+
+    #ending quotes
+    ENDING_QUOTES = [
+        (re.compile(r'"'), " '' "),
+        (re.compile(r'(\S)(\'\')'), r'\1 \2 '),
+        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
+        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
+    ]
+
+    # List of contractions adapted from Robert MacIntyre's tokenizer.
+    _contractions = MacIntyreContractions()
+    CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
+    CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
+
+    def tokenize(self, text, convert_parentheses=False, return_str=False):
+        for regexp, substitution in self.STARTING_QUOTES:
+            text = regexp.sub(substitution, text)
+
+        for regexp, substitution in self.PUNCTUATION:
+            text = regexp.sub(substitution, text)
+
+        # Handles parentheses.
+        regexp, substitution = self.PARENS_BRACKETS
+        text = regexp.sub(substitution, text)
+        # Optionally convert parentheses
+        if convert_parentheses:
+            for regexp, substitution in self.CONVERT_PARENTHESES:
+                text = regexp.sub(substitution, text)
+
+        # Handles double dash.
+        regexp, substitution = self.DOUBLE_DASHES
+        text = regexp.sub(substitution, text)
+
+        #add extra space to make things easier
+        text = " " + text + " "
+
+        for regexp, substitution in self.ENDING_QUOTES:
+            text = regexp.sub(substitution, text)
+
+        for regexp in self.CONTRACTIONS2:
+            text = regexp.sub(r' \1 \2 ', text)
+        for regexp in self.CONTRACTIONS3:
+            text = regexp.sub(r' \1 \2 ', text)
+
+        # We are not using CONTRACTIONS4 since
+        # they are also commented out in the SED scripts
+        # for regexp in self._contractions.CONTRACTIONS4:
+        #     text = regexp.sub(r' \1 \2 \3 ', text)
+
+        return text if return_str else text.split()
+
+    def span_tokenize(self, text):
+        """
+        Uses the post-hoc nltk.tokenize.util.align_tokens to return the offset spans.
+
+            >>> from nltk.tokenize import TreebankWordTokenizer
+            >>> s = '''Good muffins cost $3.88\\nin New (York).  Please (buy) me\\ntwo of them.\\n(Thanks).'''
+            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
+            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
+            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
+            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
+            >>> TreebankWordTokenizer().span_tokenize(s) == expected
+            True
+            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
+            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
+            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
+            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
+            True
+
+        """
+        raw_tokens = self.tokenize(text)
+
+        # Convert converted quotes back to original double quotes
+        # Do this only if original text contains double quote(s)
+        if '"' in text:
+            # Find double quotes and converted quotes
+            matched = [m.group() for m in re.finditer(r'[(``)(\'\')(")]+', text)]
+            
+            # Replace converted quotes back to double quotes
+            tokens = [matched.pop(0) if tok in ['"', "``", "''"] else tok for tok in raw_tokens]
+        else:
+            tokens = raw_tokens
+
+        return align_tokens(tokens, text)
+
+
+class TreebankWordDetokenizer(TokenizerI):
+    """
+    The Treebank detokenizer uses the reverse regex operations corresponding to
+    the Treebank tokenizer's regexes.
+
+    Note:
+    - There are additional assumptions made when undoing the padding of [;@#$%&]
+      punctuation symbols that are not presupposed by the TreebankTokenizer.
+    - There are additional regexes added when reversing the parentheses tokenization,
+       - the r'([\]\)\}\>])\s([:;,.])' removes the additional right padding added
+         to the closing parentheses preceding [:;,.].
+    - It's not possible to restore the original whitespace, because there is no
+      explicit record of where '\n', '\t' or '\s' were removed by the
+      text.split() operation.
+
+        >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
+        >>> s = '''Good muffins cost $3.88\\nin New York.  Please buy me\\ntwo of them.\\nThanks.'''
+        >>> d = TreebankWordDetokenizer()
+        >>> t = TreebankWordTokenizer()
+        >>> toks = t.tokenize(s)
+        >>> d.detokenize(toks)
+        'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'
+
+    The MXPOST parentheses substitution can be undone using the `convert_parentheses`
+    parameter:
+
+    >>> s = '''Good muffins cost $3.88\\nin New (York).  Please (buy) me\\ntwo of them.\\n(Thanks).'''
+    >>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
+    ... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy',
+    ... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.']
+    >>> expected_tokens == t.tokenize(s, convert_parentheses=True)
+    True
+    >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
+    >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
+    True
+
+    During tokenization it's safe to add more spaces, but during detokenization
+    simply undoing the padding isn't enough.
+
+    - During tokenization, left and right padding is added around [!?]; when
+      detokenizing, only a left shift of the [!?] is needed.
+      Thus (re.compile(r'\s([?!])'), r'\g<1>')
+
+    - During tokenization [:,] are left and right padded, but when detokenizing
+      only a left shift is necessary, and we keep the right pad after a comma/colon
+      if the following character is a non-digit.
+      Thus (re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')
+
+    >>> from nltk.tokenize.treebank import TreebankWordDetokenizer
+    >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
+    >>> twd = TreebankWordDetokenizer()
+    >>> twd.detokenize(toks)
+    "hello, i can't feel my feet! Help!!"
+
+    >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
+    ... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
+    >>> twd.detokenize(toks)
+    "hello, i can't feel; my feet! Help!! He said: Help, help?!"
+    """
+    _contractions = MacIntyreContractions()
+    CONTRACTIONS2 = [re.compile(pattern.replace('(?#X)', '\s'))
+                    for pattern in _contractions.CONTRACTIONS2]
+    CONTRACTIONS3 = [re.compile(pattern.replace('(?#X)', '\s'))
+                    for pattern in _contractions.CONTRACTIONS3]
+
+    #ending quotes
+    ENDING_QUOTES = [
+        (re.compile(r"([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1\2 "),
+        (re.compile(r"([^' ])\s('[sS]|'[mM]|'[dD]|') "), r"\1\2 "),
+        (re.compile(r'(\S)(\'\')'), r'\1\2 '),
+        (re.compile(r" '' "), '"')
+        ]
+
+    # Handles double dashes
+    DOUBLE_DASHES = (re.compile(r' -- '), r'--')
+
+    # Optionally: convert parentheses and brackets back from their PTB symbols.
+    CONVERT_PARENTHESES = [
+        (re.compile('-LRB-'), '('), (re.compile('-RRB-'), ')'),
+        (re.compile('-LSB-'), '['), (re.compile('-RSB-'), ']'),
+        (re.compile('-LCB-'), '{'), (re.compile('-RCB-'), '}')
+    ]
+
+    # Undo padding on parentheses.
+    PARENS_BRACKETS = [(re.compile(r'\s([\[\(\{\<])\s'), r' \g<1>'),
+                       (re.compile(r'\s([\]\)\}\>])\s'), r'\g<1> '),
+                       (re.compile(r'([\]\)\}\>])\s([:;,.])'), r'\1\2')]
+
+    #punctuation
+    PUNCTUATION = [
+        (re.compile(r"([^'])\s'\s"), r"\1' "),
+        (re.compile(r'\s([?!])'), r'\g<1>'), # Strip left pad for [?!]
+        #(re.compile(r'\s([?!])\s'), r'\g<1>'),
+        (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r'\1\2\3'),
+        # When tokenizing, [;@#$%&] are padded with whitespace regardless of
+        # whether there are spaces before or after them.
+        # But during detokenization, we need to distinguish between left/right
+        # pad, so we split this up.
+        (re.compile(r'\s([#$])\s'), r' \g<1>'), # Left pad.
+        (re.compile(r'\s([;%])\s'), r'\g<1> '), # Right pad.
+        (re.compile(r'\s([&])\s'), r' \g<1> '), # Unknown pad.
+        (re.compile(r'\s\.\.\.\s'), r'...'),
+        (re.compile(r'\s([:,])\s$'), r'\1'),
+        (re.compile(r'\s([:,])\s([^\d])'), r'\1 \2') # Keep right pad after comma/colon before non-digits.
+        #(re.compile(r'\s([:,])\s([^\d])'), r'\1\2')
+        ]
+
+    #starting quotes
+    STARTING_QUOTES = [
+        (re.compile(r'([ (\[{<])\s``'), r'\1"'),
+        (re.compile(r'\s(``)\s'), r'\1'),
+        (re.compile(r'^``'), r'\"'),
+    ]
+
+    def tokenize(self, tokens, convert_parentheses=False):
+        """
+        Treebank detokenizer: undoes the regexes applied by the Treebank tokenizer.
+
+        :param tokens: A list of strings, i.e. tokenized text.
+        :type tokens: list(str)
+        :return: str
+        """
+        text = ' '.join(tokens)
+        # Reverse the contractions regexes.
+        # Note: CONTRACTIONS4 are not used in tokenization.
+        for regexp in self.CONTRACTIONS3:
+            text = regexp.sub(r'\1\2', text)
+        for regexp in self.CONTRACTIONS2:
+            text = regexp.sub(r'\1\2', text)
+
+        # Reverse the regexes applied for ending quotes.
+        for regexp, substitution in self.ENDING_QUOTES:
+            text = regexp.sub(substitution, text)
+
+        # Undo the space padding.
+        text = text.strip()
+
+        # Reverse the padding on double dashes.
+        regexp, substitution = self.DOUBLE_DASHES
+        text = regexp.sub(substitution, text)
+
+        if convert_parentheses:
+            for regexp, substitution in self.CONVERT_PARENTHESES:
+                text = regexp.sub(substitution, text)
+
+        # Reverse the padding regexes applied for parenthesis/brackets.
+        for regexp, substitution in self.PARENS_BRACKETS:
+            text = regexp.sub(substitution, text)
+
+        # Reverse the regexes applied for punctuations.
+        for regexp, substitution in self.PUNCTUATION:
+            text = regexp.sub(substitution, text)
+
+        # Reverse the regexes applied for starting quotes.
+        for regexp, substitution in self.STARTING_QUOTES:
+            text = regexp.sub(substitution, text)
+
+        return text.strip()
+
+    def detokenize(self, tokens, convert_parentheses=False):
+        """ Duck-typing the abstract *tokenize()*."""
+        return self.tokenize(tokens, convert_parentheses)
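For reference, a minimal round-trip sketch using the TreebankWordTokenizer and TreebankWordDetokenizer defined above (assuming this tree is importable as the nltk package):

    from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer

    s = "They'll save and invest more."
    toks = TreebankWordTokenizer().tokenize(s)
    print(toks)   # ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
    print(TreebankWordDetokenizer().detokenize(toks))   # restores roughly the original sentence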
diff --git a/nlp_resource_data/nltk/tokenize/treebank.pyc b/nlp_resource_data/nltk/tokenize/treebank.pyc
new file mode 100755 (executable)
index 0000000..8ae93b3
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/treebank.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/util.py b/nlp_resource_data/nltk/tokenize/util.py
new file mode 100755 (executable)
index 0000000..f19894b
--- /dev/null
@@ -0,0 +1,259 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Tokenizer Utilities
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+from re import finditer
+from xml.sax.saxutils import escape, unescape
+
+def string_span_tokenize(s, sep):
+    r"""
+    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
+    tuples, by splitting the string at each occurrence of *sep*.
+
+        >>> from nltk.tokenize.util import string_span_tokenize
+        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
+        ... two of them.\n\nThanks.'''
+        >>> list(string_span_tokenize(s, " "))
+        [(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37),
+        (38, 44), (45, 48), (49, 55), (56, 58), (59, 73)]
+
+    :param s: the string to be tokenized
+    :type s: str
+    :param sep: the token separator
+    :type sep: str
+    :rtype: iter(tuple(int, int))
+    """
+    if len(sep) == 0:
+        raise ValueError("Token delimiter must not be empty")
+    left = 0
+    while True:
+        try:
+            right = s.index(sep, left)
+            if right != 0:
+                yield left, right
+        except ValueError:
+            if left != len(s):
+                yield left, len(s)
+            break
+
+        left = right + len(sep)
+
+def regexp_span_tokenize(s, regexp):
+    r"""
+    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
+    tuples, by splitting the string at each successive match of *regexp*.
+
+        >>> from nltk.tokenize.util import regexp_span_tokenize
+        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
+        ... two of them.\n\nThanks.'''
+        >>> list(regexp_span_tokenize(s, r'\s'))
+        [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36),
+        (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
+
+    :param s: the string to be tokenized
+    :type s: str
+    :param regexp: regular expression that matches token separators (must not be empty)
+    :type regexp: str
+    :rtype: iter(tuple(int, int))
+    """
+    left = 0
+    for m in finditer(regexp, s):
+        right, next = m.span()
+        if right != left:
+            yield left, right
+        left = next
+    yield left, len(s)
+
+def spans_to_relative(spans):
+    r"""
+    Return a sequence of relative spans, given a sequence of spans.
+
+        >>> from nltk.tokenize import WhitespaceTokenizer
+        >>> from nltk.tokenize.util import spans_to_relative
+        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
+        ... two of them.\n\nThanks.'''
+        >>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s)))
+        [(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6),
+        (1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)]
+
+    :param spans: a sequence of (start, end) offsets of the tokens
+    :type spans: iter(tuple(int, int))
+    :rtype: iter(tuple(int, int))
+    """
+    prev = 0
+    for left, right in spans:
+        yield left - prev, right - left
+        prev = right
+
+
+class CJKChars(object):
+    """
+    An object that enumerates the code points of the CJK characters as listed on
+    http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane
+
+    This is a Python port of the CJK code point enumerations of Moses tokenizer:
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309
+    """
+    # Hangul Jamo (1100–11FF)
+    Hangul_Jamo = (4352, 4607) # (ord(u"\u1100"), ord(u"\u11ff"))
+
+    # CJK Radicals Supplement (2E80–2EFF)
+    # Kangxi Radicals (2F00–2FDF)
+    # Ideographic Description Characters (2FF0–2FFF)
+    # CJK Symbols and Punctuation (3000–303F)
+    # Hiragana (3040–309F)
+    # Katakana (30A0–30FF)
+    # Bopomofo (3100–312F)
+    # Hangul Compatibility Jamo (3130–318F)
+    # Kanbun (3190–319F)
+    # Bopomofo Extended (31A0–31BF)
+    # CJK Strokes (31C0–31EF)
+    # Katakana Phonetic Extensions (31F0–31FF)
+    # Enclosed CJK Letters and Months (3200–32FF)
+    # CJK Compatibility (3300–33FF)
+    # CJK Unified Ideographs Extension A (3400–4DBF)
+    # Yijing Hexagram Symbols (4DC0–4DFF)
+    # CJK Unified Ideographs (4E00–9FFF)
+    # Yi Syllables (A000–A48F)
+    # Yi Radicals (A490–A4CF)
+    CJK_Radicals = (11904, 42191) # (ord(u"\u2e80"), ord(u"\ua4cf"))
+
+    # Phags-pa (A840–A87F)
+    Phags_Pa = (43072, 43135) # (ord(u"\ua840"), ord(u"\ua87f"))
+
+    # Hangul Syllables (AC00–D7AF)
+    Hangul_Syllables = (44032, 55215) # (ord(u"\uAC00"), ord(u"\uD7AF"))
+
+    # CJK Compatibility Ideographs (F900–FAFF)
+    CJK_Compatibility_Ideographs = (63744, 64255) # (ord(u"\uF900"), ord(u"\uFAFF"))
+
+    # CJK Compatibility Forms (FE30–FE4F)
+    CJK_Compatibility_Forms = (65072, 65103) # (ord(u"\uFE30"), ord(u"\uFE4F"))
+
+    # Range U+FF65–U+FFDC encodes halfwidth forms of Katakana and Hangul characters
+    Katakana_Hangul_Halfwidth = (65381, 65500) # (ord(u"\uFF65"), ord(u"\uFFDC"))
+
+    # Supplementary Ideographic Plane 20000–2FFFF
+    Supplementary_Ideographic_Plane = (131072, 196607) # (ord(u"\U00020000"), ord(u"\U0002FFFF"))
+
+    ranges = [Hangul_Jamo, CJK_Radicals, Phags_Pa, Hangul_Syllables,
+              CJK_Compatibility_Ideographs, CJK_Compatibility_Forms,
+              Katakana_Hangul_Halfwidth, Supplementary_Ideographic_Plane]
+
+
+
+def is_cjk(character):
+    """
+    Python port of Moses' code to check whether a character is a CJK character.
+
+    >>> CJKChars().ranges
+    [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607)]
+    >>> is_cjk(u'\u33fe')
+    True
+    >>> is_cjk(u'\uFE5F')
+    False
+
+    :param character: The character that needs to be checked.
+    :type character: char
+    :return: bool
+    """
+    return any([start <= ord(character) <= end for start, end in
+                [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215),
+                 (63744, 64255), (65072, 65103), (65381, 65500),
+                 (131072, 196607)]
+                ])
+
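+# Illustrative sketch, not part of upstream NLTK: the hard-coded ranges inside
+# is_cjk() mirror CJKChars.ranges, so an equivalent (less duplicated) check can
+# be written against that class attribute. The helper name is hypothetical.
+def _is_cjk_from_ranges(character):
+    """Equivalent to is_cjk(), reusing CJKChars.ranges instead of literals."""
+    return any(start <= ord(character) <= end
+               for start, end in CJKChars.ranges)
+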
+
+def xml_escape(text):
+    """
+    This function transforms the input text into an "escaped" version suitable
+    for well-formed XML formatting.
+
+    Note that the default xml.sax.saxutils.escape() function doesn't escape
+    some characters that Moses does, so we have to add them to the
+    entities dictionary manually.
+
+        >>> input_str = ''')| & < > ' " ] ['''
+        >>> expected_output =  ''')| &amp; &lt; &gt; ' " ] ['''
+        >>> escape(input_str) == expected_output
+        True
+        >>> xml_escape(input_str)
+        ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
+
+    :param text: The text that needs to be escaped.
+    :type text: str
+    :rtype: str
+    """
+    return escape(text, entities={ r"'": r"&apos;", r'"': r"&quot;",
+                                   r"|": r"&#124;",
+                                   r"[": r"&#91;",  r"]": r"&#93;", })
+
+
+def xml_unescape(text):
+    """
+    This function transforms an "escaped" string suitable for well-formed
+    XML formatting back into a human-readable string.
+
+    Note that the default xml.sax.saxutils.unescape() function doesn't unescape
+    some characters that Moses does, so we have to add them to the
+    entities dictionary manually.
+
+        >>> from xml.sax.saxutils import unescape
+        >>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
+        >>> expected = ''')| & < > \' " ] ['''
+        >>> xml_unescape(s) == expected
+        True
+
+    :param text: The text that needs to be unescaped.
+    :type text: str
+    :rtype: str
+    """
+    return unescape(text, entities={ r"&apos;":r"'", r"&quot;":r'"',
+                                     r"&#124;":r"|",
+                                     r"&#91;":r"[",  r"&#93;":r"]", })
+
+
+def align_tokens(tokens, sentence):
+    """
+    This function attempts to find the offsets of the tokens in *sentence*, as a
+    sequence of ``(start, end)`` tuples, given the tokens and the source string.
+
+        >>> from nltk.tokenize import TreebankWordTokenizer
+        >>> from nltk.tokenize.util import align_tokens
+        >>> s = str("The plane, bound for St Petersburg, crashed in Egypt's "
+        ... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh "
+        ... "on Saturday.")
+        >>> tokens = TreebankWordTokenizer().tokenize(s)
+        >>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23),
+        ... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54),
+        ... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89),
+        ... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122),
+        ... (123, 131), (131, 132)]
+        >>> output = list(align_tokens(tokens, s))
+        >>> len(tokens) == len(expected) == len(output)  # Check that length of tokens and tuples are the same.
+        True
+        >>> expected == list(align_tokens(tokens, s))  # Check that the output is as expected.
+        True
+        >>> tokens == [s[start:end] for start, end in output]  # Check that the slices of the string corresponds to the tokens.
+        True
+
+    :param tokens: The list of strings that are the result of tokenization
+    :type tokens: list(str)
+    :param sentence: The original string
+    :type sentence: str
+    :rtype: list(tuple(int,int))
+    """
+    point = 0
+    offsets = []
+    for token in tokens:
+        try:
+            start = sentence.index(token, point)
+        except ValueError:
+            raise ValueError('substring "{}" not found in "{}"'.format(token, sentence))
+        point = start + len(token)
+        offsets.append((start, point))
+    return offsets
diff --git a/nlp_resource_data/nltk/tokenize/util.pyc b/nlp_resource_data/nltk/tokenize/util.pyc
new file mode 100755 (executable)
index 0000000..7d17b93
Binary files /dev/null and b/nlp_resource_data/nltk/tokenize/util.pyc differ
diff --git a/nlp_resource_data/nltk/toolbox.py b/nlp_resource_data/nltk/toolbox.py
new file mode 100755 (executable)
index 0000000..a02ecbd
--- /dev/null
@@ -0,0 +1,499 @@
+# coding: utf-8
+# Natural Language Toolkit: Toolbox Reader
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Greg Aumann <greg_aumann@sil.org>
+# URL: <http://nltk.org>
+# For license information, see LICENSE.TXT
+
+"""
+Module for reading, writing and manipulating
+Toolbox databases and settings files.
+"""
+from __future__ import print_function
+
+import os, re, codecs
+from xml.etree.ElementTree import ElementTree, TreeBuilder, Element, SubElement
+
+from six import u
+
+from nltk.compat import StringIO, PY3
+from nltk.data import PathPointer, ZipFilePathPointer, find
+
+
+class StandardFormat(object):
+    """
+    Class for reading and processing standard format marker files and strings.
+    """
+    def __init__(self, filename=None, encoding=None):
+        self._encoding = encoding
+        if filename is not None:
+            self.open(filename)
+
+    def open(self, sfm_file):
+        """
+        Open a standard format marker file for sequential reading.
+
+        :param sfm_file: name of the standard format marker input file
+        :type sfm_file: str
+        """
+        if isinstance(sfm_file, PathPointer):
+            # [xx] We don't use 'rU' mode here -- do we need to?
+            #      (PathPointer.open doesn't take a mode option)
+            self._file = sfm_file.open(self._encoding)
+        else:
+            self._file = codecs.open(sfm_file, 'rU', self._encoding)
+
+    def open_string(self, s):
+        """
+        Open a standard format marker string for sequential reading.
+
+        :param s: string to parse as a standard format marker input file
+        :type s: str
+        """
+        self._file = StringIO(s)
+
+    def raw_fields(self):
+        """
+        Return an iterator that returns the next field in a (marker, value)
+        tuple. Linebreaks and trailing white space are preserved except
+        for the final newline in each field.
+
+        :rtype: iter(tuple(str, str))
+        """
+        join_string = '\n'
+        line_regexp = r'^%s(?:\\(\S+)\s*)?(.*)$'
+        # discard a BOM in the first line
+        first_line_pat = re.compile(line_regexp % '(?:\xef\xbb\xbf)?')
+        line_pat = re.compile(line_regexp % '')
+        # need to get first line outside the loop for correct handling
+        # of the first marker if it spans multiple lines
+        file_iter = iter(self._file)
+        line = next(file_iter)
+        mobj = re.match(first_line_pat, line)
+        mkr, line_value = mobj.groups()
+        value_lines = [line_value,]
+        self.line_num = 0
+        for line in file_iter:
+            self.line_num += 1
+            mobj = re.match(line_pat, line)
+            line_mkr, line_value = mobj.groups()
+            if line_mkr:
+                yield (mkr, join_string.join(value_lines))
+                mkr = line_mkr
+                value_lines = [line_value,]
+            else:
+                value_lines.append(line_value)
+        self.line_num += 1
+        yield (mkr, join_string.join(value_lines))
+
+    def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None):
+        """
+        Return an iterator that returns the next field in a ``(marker, value)``
+        tuple, where ``marker`` and ``value`` are unicode strings if an ``encoding``
+        was specified in the ``fields()`` method. Otherwise they are non-unicode strings.
+
+        :param strip: strip trailing whitespace from the last line of each field
+        :type strip: bool
+        :param unwrap: Convert newlines in a field to spaces.
+        :type unwrap: bool
+        :param encoding: Name of an encoding to use. If it is specified then
+            the ``fields()`` method returns unicode strings rather than non
+            unicode strings.
+        :type encoding: str or None
+        :param errors: Error handling scheme for codec. Same as the ``decode()``
+            builtin string method.
+        :type errors: str
+        :param unicode_fields: Set of marker names whose values are UTF-8 encoded.
+            Ignored if encoding is None. If the whole file is UTF-8 encoded set
+            ``encoding='utf8'`` and leave ``unicode_fields`` with its default
+            value of None.
+        :type unicode_fields: sequence
+        :rtype: iter(tuple(str, str))
+        """
+        if encoding is None and unicode_fields is not None:
+            raise ValueError('unicode_fields is set but not encoding.')
+        unwrap_pat = re.compile(r'\n+')
+        for mkr, val in self.raw_fields():
+            if encoding and not PY3: # kludge - already decoded in PY3?
+                if unicode_fields is not None and mkr in unicode_fields:
+                    val = val.decode('utf8', errors)
+                else:
+                    val = val.decode(encoding, errors)
+                mkr = mkr.decode(encoding, errors)
+            if unwrap:
+                val = unwrap_pat.sub(' ', val)
+            if strip:
+                val = val.rstrip()
+            yield (mkr, val)
+
+    def close(self):
+        """Close a previously opened standard format marker file or string."""
+        self._file.close()
+        try:
+            del self.line_num
+        except AttributeError:
+            pass
+
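+# Illustrative sketch, not part of upstream NLTK: read marker/value pairs from
+# an in-memory standard format string with open_string() and fields(). The
+# helper name and the sample markers (\lx, \ge) are hypothetical.
+def _fields_sketch():
+    sf = StandardFormat()
+    sf.open_string('\\lx kaa\n\\ge gag\n')
+    pairs = list(sf.fields())   # [('lx', 'kaa'), ('ge', 'gag')]
+    sf.close()
+    return pairs
+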
+class ToolboxData(StandardFormat):
+    def parse(self, grammar=None,  **kwargs):
+        if grammar:
+            return self._chunk_parse(grammar=grammar,  **kwargs)
+        else:
+            return self._record_parse(**kwargs)
+
+    def _record_parse(self, key=None, **kwargs):
+        """
+        Returns an element tree structure corresponding to a toolbox data file with
+        all markers at the same level.
+
+        Thus the following Toolbox database::
+            \_sh v3.0  400  Rotokas Dictionary
+            \_DateStampHasFourDigitYear
+
+            \lx kaa
+            \ps V.A
+            \ge gag
+            \gp nek i pas
+
+            \lx kaa
+            \ps V.B
+            \ge strangle
+            \gp pasim nek
+
+        after parsing will end up with the same structure (ignoring the extra
+        whitespace) as the following XML fragment after being parsed by
+        ElementTree::
+            <toolbox_data>
+                <header>
+                    <_sh>v3.0  400  Rotokas Dictionary</_sh>
+                    <_DateStampHasFourDigitYear/>
+                </header>
+
+                <record>
+                    <lx>kaa</lx>
+                    <ps>V.A</ps>
+                    <ge>gag</ge>
+                    <gp>nek i pas</gp>
+                </record>
+
+                <record>
+                    <lx>kaa</lx>
+                    <ps>V.B</ps>
+                    <ge>strangle</ge>
+                    <gp>pasim nek</gp>
+                </record>
+            </toolbox_data>
+
+        :param key: Name of key marker at the start of each record. If set to
+            None (the default value) the first marker that doesn't begin with
+            an underscore is assumed to be the key.
+        :type key: str
+        :param kwargs: Keyword arguments passed to ``StandardFormat.fields()``
+        :type kwargs: dict
+        :rtype: ElementTree._ElementInterface
+        :return: contents of toolbox data divided into header and records
+        """
+        builder = TreeBuilder()
+        builder.start('toolbox_data', {})
+        builder.start('header', {})
+        in_records = False
+        for mkr, value in self.fields(**kwargs):
+            if key is None and not in_records and mkr[0] != '_':
+                key = mkr
+            if mkr == key:
+                if in_records:
+                    builder.end('record')
+                else:
+                    builder.end('header')
+                    in_records = True
+                builder.start('record', {})
+            builder.start(mkr, {})
+            builder.data(value)
+            builder.end(mkr)
+        if in_records:
+            builder.end('record')
+        else:
+            builder.end('header')
+        builder.end('toolbox_data')
+        return builder.close()
+
+    def _tree2etree(self, parent):
+        from nltk.tree import Tree
+
+        root = Element(parent.label())
+        for child in parent:
+            if isinstance(child, Tree):
+                root.append(self._tree2etree(child))
+            else:
+                text, tag = child
+                e = SubElement(root, tag)
+                e.text = text
+        return root
+
+    def _chunk_parse(self, grammar=None, root_label='record', trace=0, **kwargs):
+        """
+        Returns an element tree structure corresponding to a toolbox data file
+        parsed according to the chunk grammar.
+
+        :type grammar: str
+        :param grammar: Contains the chunking rules used to parse the
+            database.  See ``chunk.RegExp`` for documentation.
+        :type root_label: str
+        :param root_label: The node value that should be used for the
+            top node of the chunk structure.
+        :type trace: int
+        :param trace: The level of tracing that should be used when
+            parsing a text.  ``0`` will generate no tracing output;
+            ``1`` will generate normal tracing output; and ``2`` or
+            higher will generate verbose tracing output.
+        :type kwargs: dict
+        :param kwargs: Keyword arguments passed to ``toolbox.StandardFormat.fields()``
+        :rtype: ElementTree._ElementInterface
+        """
+        from nltk import chunk
+        from nltk.tree import Tree
+
+        cp = chunk.RegexpParser(grammar, root_label=root_label, trace=trace)
+        db = self.parse(**kwargs)
+        tb_etree = Element('toolbox_data')
+        header = db.find('header')
+        tb_etree.append(header)
+        for record in db.findall('record'):
+            parsed = cp.parse([(elem.text, elem.tag) for elem in record])
+            tb_etree.append(self._tree2etree(parsed))
+        return tb_etree
+
+_is_value = re.compile(r"\S")
+
+def to_sfm_string(tree, encoding=None, errors='strict', unicode_fields=None):
+    """
+    Return a string with a standard format representation of the toolbox
+    data in tree (tree can be a toolbox database or a single record).
+
+    :param tree: flat representation of toolbox data (whole database or single record)
+    :type tree: ElementTree._ElementInterface
+    :param encoding: Name of an encoding to use.
+    :type encoding: str
+    :param errors: Error handling scheme for codec. Same as the ``encode()``
+        builtin string method.
+    :type errors: str
+    :param unicode_fields:
+    :type unicode_fields: dict(str) or set(str)
+    :rtype: str
+    """
+    if tree.tag == 'record':
+        root = Element('toolbox_data')
+        root.append(tree)
+        tree = root
+
+    if tree.tag != 'toolbox_data':
+        raise ValueError("not a toolbox_data element structure")
+    if encoding is None and unicode_fields is not None:
+        raise ValueError("if encoding is not specified then neither should unicode_fields")
+    l = []
+    for rec in tree:
+        l.append('\n')
+        for field in rec:
+            mkr = field.tag
+            value = field.text
+            if encoding is not None:
+                if unicode_fields is not None and mkr in unicode_fields:
+                    cur_encoding = 'utf8'
+                else:
+                    cur_encoding = encoding
+                if re.search(_is_value, value):
+                    l.append((u("\\%s %s\n") % (mkr, value)).encode(cur_encoding, errors))
+                else:
+                    l.append((u("\\%s%s\n") % (mkr, value)).encode(cur_encoding, errors))
+            else:
+                if re.search(_is_value, value):
+                    l.append("\\%s %s\n" % (mkr, value))
+                else:
+                    l.append("\\%s%s\n" % (mkr, value))
+    return ''.join(l[1:])
+
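+# Illustrative sketch, not part of upstream NLTK: build a single record element
+# and serialize it back to standard format markers with to_sfm_string(). The
+# helper name and the sample fields are hypothetical.
+def _to_sfm_string_sketch():
+    record = Element('record')
+    SubElement(record, 'lx').text = 'kaa'
+    SubElement(record, 'ps').text = 'V.A'
+    return to_sfm_string(record)   # '\\lx kaa\n\\ps V.A\n'
+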
+class ToolboxSettings(StandardFormat):
+    """This class is the base class for settings files."""
+
+    def __init__(self):
+        super(ToolboxSettings, self).__init__()
+
+    def parse(self, encoding=None, errors='strict', **kwargs):
+        """
+        Return the contents of toolbox settings file with a nested structure.
+
+        :param encoding: encoding used by settings file
+        :type encoding: str
+        :param errors: Error handling scheme for codec. Same as ``decode()`` builtin method.
+        :type errors: str
+        :param kwargs: Keyword arguments passed to ``StandardFormat.fields()``
+        :type kwargs: dict
+        :rtype: ElementTree._ElementInterface
+        """
+        builder = TreeBuilder()
+        for mkr, value in self.fields(encoding=encoding, errors=errors, **kwargs):
+            # Check whether the first char of the field marker
+            # indicates a block start (+) or end (-)
+            block=mkr[0]
+            if block in ("+", "-"):
+                mkr=mkr[1:]
+            else:
+                block=None
+            # Build tree on the basis of block char
+            if block == "+":
+                builder.start(mkr, {})
+                builder.data(value)
+            elif block == '-':
+                builder.end(mkr)
+            else:
+                builder.start(mkr, {})
+                builder.data(value)
+                builder.end(mkr)
+        return builder.close()
+
+def to_settings_string(tree, encoding=None, errors='strict', unicode_fields=None):
+    # write XML to file
+    l = list()
+    _to_settings_string(tree.getroot(), l, encoding=encoding, errors=errors, unicode_fields=unicode_fields)
+    return ''.join(l)
+
+def _to_settings_string(node, l, **kwargs):
+    # write XML to file
+    tag = node.tag
+    text = node.text
+    if len(node) == 0:
+        if text:
+            l.append('\\%s %s\n' % (tag, text))
+        else:
+            l.append('\\%s\n' % tag)
+    else:
+        if text:
+            l.append('\\+%s %s\n' % (tag, text))
+        else:
+            l.append('\\+%s\n' % tag)
+        for n in node:
+            _to_settings_string(n, l, **kwargs)
+        l.append('\\-%s\n' % tag)
+    return
+
+def remove_blanks(elem):
+    """
+    Remove all elements and subelements with no text and no child elements.
+
+    :param elem: toolbox data in an elementtree structure
+    :type elem: ElementTree._ElementInterface
+    """
+    out = list()
+    for child in elem:
+        remove_blanks(child)
+        if child.text or len(child) > 0:
+            out.append(child)
+    elem[:] = out
+
+def add_default_fields(elem, default_fields):
+    """
+    Add blank elements and subelements specified in default_fields.
+
+    :param elem: toolbox data in an elementtree structure
+    :type elem: ElementTree._ElementInterface
+    :param default_fields: fields to add to each type of element and subelement
+    :type default_fields: dict(tuple)
+    """
+    for field in default_fields.get(elem.tag,  []):
+        if elem.find(field) is None:
+            SubElement(elem, field)
+    for child in elem:
+        add_default_fields(child, default_fields)
+
+def sort_fields(elem, field_orders):
+    """
+    Sort the elements and subelements in order specified in field_orders.
+
+    :param elem: toolbox data in an elementtree structure
+    :type elem: ElementTree._ElementInterface
+    :param field_orders: order of fields for each type of element and subelement
+    :type field_orders: dict(tuple)
+    """
+    order_dicts = dict()
+    for field, order in field_orders.items():
+        order_dicts[field] = order_key = dict()
+        for i, subfield in enumerate(order):
+            order_key[subfield] = i
+    _sort_fields(elem, order_dicts)
+
+def _sort_fields(elem, orders_dicts):
+    """sort the children of elem"""
+    try:
+        order = orders_dicts[elem.tag]
+    except KeyError:
+        pass
+    else:
+        tmp = sorted([((order.get(child.tag, 1e9), i), child) for i, child in enumerate(elem)])
+        elem[:] = [child for key, child in tmp]
+    for child in elem:
+        if len(child):
+            _sort_fields(child, orders_dicts)
+
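+# Illustrative sketch, not part of upstream NLTK: ensure each record carries an
+# expected set of markers and put them into a canonical order. The helper name
+# and the marker tuple ('lx', 'ps', 'ge') are hypothetical sample data.
+def _normalize_records_sketch(toolbox_tree):
+    wanted = {'record': ('lx', 'ps', 'ge')}
+    add_default_fields(toolbox_tree, wanted)   # add any missing markers as blanks
+    sort_fields(toolbox_tree, wanted)          # reorder markers within each record
+    return toolbox_tree
+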
+def add_blank_lines(tree, blanks_before, blanks_between):
+    """
+    Add blank lines before all elements and subelements specified in blanks_before,
+    and between consecutive elements specified in blanks_between.
+
+    :param tree: toolbox data in an elementtree structure
+    :type tree: ElementTree._ElementInterface
+    :param blanks_before: elements and subelements to add blank lines before
+    :type blanks_before: dict(tuple)
+    :param blanks_between: elements and subelements to add blank lines between
+    :type blanks_between: dict(tuple)
+    """
+    try:
+        before = blanks_before[tree.tag]
+        between = blanks_between[tree.tag]
+    except KeyError:
+        for elem in tree:
+            if len(elem):
+                add_blank_lines(elem, blanks_before, blanks_between)
+    else:
+        last_elem = None
+        for elem in tree:
+            tag = elem.tag
+            if last_elem is not None and last_elem.tag != tag:
+                if tag in before and last_elem is not None:
+                    e = last_elem.getiterator()[-1]
+                    e.text = (e.text or "") + "\n"
+            else:
+                if tag in between:
+                    e = last_elem.getiterator()[-1]
+                    e.text = (e.text or "") + "\n"
+            if len(elem):
+                add_blank_lines(elem, blanks_before, blanks_between)
+            last_elem = elem
+
+def demo():
+    from itertools import islice
+
+#    zip_path = find('corpora/toolbox.zip')
+#    lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
+    file_path = find('corpora/toolbox/rotokas.dic')
+    lexicon = ToolboxData(file_path).parse()
+    print('first field in fourth record:')
+    print(lexicon[3][0].tag)
+    print(lexicon[3][0].text)
+
+    print('\nfields in sequential order:')
+    for field in islice(lexicon.find('record'), 10):
+        print(field.tag, field.text)
+
+    print('\nlx fields:')
+    for field in islice(lexicon.findall('record/lx'), 10):
+        print(field.text)
+
+    settings = ToolboxSettings()
+    file_path = find('corpora/toolbox/MDF/MDF_AltH.typ')
+    settings.open(file_path)
+#    settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
+    tree = settings.parse(unwrap=False, encoding='cp1252')
+    print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
+    settings_tree = ElementTree(tree)
+    print(to_settings_string(settings_tree).encode('utf8'))
+
+if __name__ == '__main__':
+    demo()
diff --git a/nlp_resource_data/nltk/toolbox.pyc b/nlp_resource_data/nltk/toolbox.pyc
new file mode 100755 (executable)
index 0000000..73931c0
Binary files /dev/null and b/nlp_resource_data/nltk/toolbox.pyc differ
diff --git a/nlp_resource_data/nltk/translate/__init__.py b/nlp_resource_data/nltk/translate/__init__.py
new file mode 100755 (executable)
index 0000000..9e243e4
--- /dev/null
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Machine Translation
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>, Tah Wei Hoon <hoon.tw@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Experimental features for machine translation.
+These interfaces are prone to change.
+"""
+
+from nltk.translate.api import AlignedSent, Alignment, PhraseTable
+from nltk.translate.ibm_model import IBMModel
+from nltk.translate.ibm1 import IBMModel1
+from nltk.translate.ibm2 import IBMModel2
+from nltk.translate.ibm3 import IBMModel3
+from nltk.translate.ibm4 import IBMModel4
+from nltk.translate.ibm5 import IBMModel5
+from nltk.translate.bleu_score import sentence_bleu as bleu
+from nltk.translate.ribes_score import sentence_ribes as ribes
+from nltk.translate.metrics import alignment_error_rate
+from nltk.translate.stack_decoder import StackDecoder
diff --git a/nlp_resource_data/nltk/translate/__init__.pyc b/nlp_resource_data/nltk/translate/__init__.pyc
new file mode 100755 (executable)
index 0000000..9f905c4
Binary files /dev/null and b/nlp_resource_data/nltk/translate/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/translate/api.py b/nlp_resource_data/nltk/translate/api.py
new file mode 100755 (executable)
index 0000000..c05db53
--- /dev/null
@@ -0,0 +1,321 @@
+# Natural Language Toolkit: API for alignment and translation objects
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Will Zhang <wilzzha@gmail.com>
+#         Guan Gui <ggui@student.unimelb.edu.au>
+#         Steven Bird <stevenbird1@gmail.com>
+#         Tah Wei Hoon <hoon.tw@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import print_function, unicode_literals
+import subprocess
+from collections import namedtuple
+
+from nltk.compat import python_2_unicode_compatible
+
+@python_2_unicode_compatible
+class AlignedSent(object):
+    """
+    Return an aligned sentence object, which encapsulates two sentences
+    along with an ``Alignment`` between them.
+
+        >>> from nltk.translate import AlignedSent, Alignment
+        >>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'],
+        ...     ['the', 'house', 'is', 'small'], Alignment.fromstring('0-2 1-3 2-1 3-0'))
+        >>> algnsent.words
+        ['klein', 'ist', 'das', 'Haus']
+        >>> algnsent.mots
+        ['the', 'house', 'is', 'small']
+        >>> algnsent.alignment
+        Alignment([(0, 2), (1, 3), (2, 1), (3, 0)])
+        >>> from nltk.corpus import comtrans
+        >>> print(comtrans.aligned_sents()[54])
+        <AlignedSent: 'Weshalb also sollten...' -> 'So why should EU arm...'>
+        >>> print(comtrans.aligned_sents()[54].alignment)
+        0-0 0-1 1-0 2-2 3-4 3-5 4-7 5-8 6-3 7-9 8-9 9-10 9-11 10-12 11-6 12-6 13-13
+
+    :param words: source language words
+    :type words: list(str)
+    :param mots: target language words
+    :type mots: list(str)
+    :param alignment: the word-level alignments between the source
+        and target language
+    :type alignment: Alignment
+    """
+
+    def __init__(self, words, mots, alignment=None):
+        self._words = words
+        self._mots = mots
+        if alignment is None:
+            self.alignment = Alignment([])
+        else:
+            assert type(alignment) is Alignment
+            self.alignment = alignment
+
+    @property
+    def words(self):
+        return self._words
+
+    @property
+    def mots(self):
+        return self._mots
+
+    def _get_alignment(self):
+        return self._alignment
+
+    def _set_alignment(self, alignment):
+        _check_alignment(len(self.words), len(self.mots), alignment)
+        self._alignment = alignment
+    alignment = property(_get_alignment, _set_alignment)
+
+    def __repr__(self):
+        """
+        Return a string representation for this ``AlignedSent``.
+
+        :rtype: str
+        """
+        words = "[%s]" % (", ".join("'%s'" % w for w in self._words))
+        mots = "[%s]" % (", ".join("'%s'" % w for w in self._mots))
+
+        return "AlignedSent(%s, %s, %r)" % (words, mots, self._alignment)
+
+    def _to_dot(self):
+        """
+        Dot representation of the aligned sentence
+        """
+        s = 'graph align {\n'
+        s += 'node[shape=plaintext]\n'
+
+        # Declare node
+        for w in self._words:
+            s += '"%s_source" [label="%s"] \n' % (w, w)
+
+        for w in self._mots:
+            s += '"%s_target" [label="%s"] \n' % (w, w)
+
+        # Alignment
+        for u,v in self._alignment:
+            s += '"%s_source" -- "%s_target" \n' % (self._words[u] , self._mots[v] )
+
+        # Connect the source words
+        for i in range(len(self._words)-1) :
+            s += '"%s_source" -- "%s_source" [style=invis]\n' % (self._words[i] , self._words[i+1])
+
+        # Connect the target words
+        for i in range(len(self._mots)-1) :
+            s += '"%s_target" -- "%s_target" [style=invis]\n' % (self._mots[i] , self._mots[i+1])
+
+        # Put it in the same rank
+        s  += '{rank = same; %s}\n' % (' '.join('"%s_source"' % w for w in self._words))
+        s  += '{rank = same; %s}\n' % (' '.join('"%s_target"' % w for w in self._mots))
+
+        s += '}'
+
+        return s
+
+    def _repr_svg_(self):
+        """
+        IPython magic: show the SVG representation of this ``AlignedSent``.
+        """
+        dot_string = self._to_dot().encode('utf8')
+        output_format = 'svg'
+        try:
+            process = subprocess.Popen(['dot', '-T%s' % output_format], stdin=subprocess.PIPE,
+                                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        except OSError:
+            raise Exception('Cannot find the dot binary from Graphviz package')
+        out, err = process.communicate(dot_string)
+
+        return out.decode('utf8')
+
+
+    def __str__(self):
+        """
+        Return a human-readable string representation for this ``AlignedSent``.
+
+        :rtype: str
+        """
+        source = " ".join(self._words)[:20] + "..."
+        target = " ".join(self._mots)[:20] + "..."
+        return "<AlignedSent: '%s' -> '%s'>" % (source, target)
+
+    def invert(self):
+        """
+        Return the aligned sentence pair, reversing the directionality.
+
+        :rtype: AlignedSent
+        """
+        return AlignedSent(self._mots, self._words,
+                               self._alignment.invert())
+
+@python_2_unicode_compatible
+class Alignment(frozenset):
+    """
+    A storage class for representing alignment between two sequences, s1, s2.
+    In general, an alignment is a set of tuples of the form (i, j, ...)
+    representing an alignment between the i-th element of s1 and the
+    j-th element of s2.  Tuples are extensible (they might contain
+    additional data, such as a boolean to indicate sure vs possible alignments).
+
+        >>> from nltk.translate import Alignment
+        >>> a = Alignment([(0, 0), (0, 1), (1, 2), (2, 2)])
+        >>> a.invert()
+        Alignment([(0, 0), (1, 0), (2, 1), (2, 2)])
+        >>> print(a.invert())
+        0-0 1-0 2-1 2-2
+        >>> a[0]
+        [(0, 1), (0, 0)]
+        >>> a.invert()[2]
+        [(2, 1), (2, 2)]
+        >>> b = Alignment([(0, 0), (0, 1)])
+        >>> b.issubset(a)
+        True
+        >>> c = Alignment.fromstring('0-0 0-1')
+        >>> b == c
+        True
+    """
+
+    def __new__(cls, pairs):
+        self = frozenset.__new__(cls, pairs)
+        self._len = (max(p[0] for p in self) if self != frozenset([]) else 0)
+        self._index = None
+        return self
+
+    @classmethod
+    def fromstring(cls, s):
+        """
+        Read a giza-formatted string and return an Alignment object.
+
+            >>> Alignment.fromstring('0-0 2-1 9-2 21-3 10-4 7-5')
+            Alignment([(0, 0), (2, 1), (7, 5), (9, 2), (10, 4), (21, 3)])
+
+        :type s: str
+        :param s: the positional alignments in giza format
+        :rtype: Alignment
+        :return: An Alignment object corresponding to the string representation ``s``.
+        """
+
+        return Alignment([_giza2pair(a) for a in s.split()])
+
+    def __getitem__(self, key):
+        """
+        Look up the alignments that map from a given index or slice.
+        """
+        if not self._index:
+            self._build_index()
+        return self._index.__getitem__(key)
+
+    def invert(self):
+        """
+        Return an Alignment object, being the inverted mapping.
+        """
+        return Alignment(((p[1], p[0]) + p[2:]) for p in self)
+
+    def range(self, positions=None):
+        """
+        Work out the range of the mapping from the given positions.
+        If no positions are specified, compute the range of the entire mapping.
+        """
+        image = set()
+        if not self._index:
+            self._build_index()
+        if not positions:
+            positions = list(range(len(self._index)))
+        for p in positions:
+            image.update(f for _,f in self._index[p])
+        return sorted(image)
+
+    def __repr__(self):
+        """
+        Produce a Giza-formatted string representing the alignment.
+        """
+        return "Alignment(%r)" % sorted(self)
+
+    def __str__(self):
+        """
+        Produce a Giza-formatted string representing the alignment.
+        """
+        return " ".join("%d-%d" % p[:2] for p in sorted(self))
+
+    def _build_index(self):
+        """
+        Build a list self._index such that self._index[i] is a list
+        of the alignments originating from word i.
+        """
+        self._index = [[] for _ in range(self._len + 1)]
+        for p in self:
+            self._index[p[0]].append(p)
+
+
+def _giza2pair(pair_string):
+    i, j = pair_string.split("-")
+    return int(i), int(j)
+
+def _naacl2pair(pair_string):
+    i, j, p = pair_string.split("-")
+    return int(i), int(j)
+
+def _check_alignment(num_words, num_mots, alignment):
+    """
+    Check whether the alignments are legal.
+
+    :param num_words: the number of source language words
+    :type num_words: int
+    :param num_mots: the number of target language words
+    :type num_mots: int
+    :param alignment: alignment to be checked
+    :type alignment: Alignment
+    :raise IndexError: if alignment falls outside the sentence
+    """
+
+    assert type(alignment) is Alignment
+
+    if not all(0 <= pair[0] < num_words for pair in alignment):
+        raise IndexError("Alignment is outside boundary of words")
+    if not all(pair[1] is None or 0 <= pair[1] < num_mots for pair in alignment):
+        raise IndexError("Alignment is outside boundary of mots")
+
+
+PhraseTableEntry = namedtuple('PhraseTableEntry', ['trg_phrase', 'log_prob'])
+class PhraseTable(object):
+    """
+    In-memory store of translations for a given phrase, and the log
+    probabilities of those translations.
+    """
+    def __init__(self):
+        self.src_phrases = dict()
+
+    def translations_for(self, src_phrase):
+        """
+        Get the translations for a source language phrase
+
+        :param src_phrase: Source language phrase of interest
+        :type src_phrase: tuple(str)
+
+        :return: A list of target language phrases that are translations
+            of ``src_phrase``, ordered in decreasing order of
+            likelihood. Each list element is a tuple of the target
+            phrase and its log probability.
+        :rtype: list(PhraseTableEntry)
+        """
+        return self.src_phrases[src_phrase]
+
+    def add(self, src_phrase, trg_phrase, log_prob):
+        """
+        :type src_phrase: tuple(str)
+        :type trg_phrase: tuple(str)
+
+        :param log_prob: Log probability that given ``src_phrase``,
+            ``trg_phrase`` is its translation
+        :type log_prob: float
+        """
+        entry = PhraseTableEntry(trg_phrase=trg_phrase, log_prob=log_prob)
+        if src_phrase not in self.src_phrases:
+            self.src_phrases[src_phrase] = []
+        self.src_phrases[src_phrase].append(entry)
+        self.src_phrases[src_phrase].sort(key=lambda e: e.log_prob,
+                                          reverse=True)
+
+    def __contains__(self, src_phrase):
+        return src_phrase in self.src_phrases
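+
+# Illustrative sketch, not part of upstream NLTK: populate a PhraseTable and
+# query it. The helper name, phrases and log probabilities are made-up sample
+# data; translations_for() returns entries sorted by decreasing log probability.
+def _phrase_table_sketch():
+    table = PhraseTable()
+    table.add(('das', 'Haus'), ('the', 'house'), log_prob=-0.4)
+    table.add(('das', 'Haus'), ('the', 'home'), log_prob=-1.2)
+    best = table.translations_for(('das', 'Haus'))[0]
+    return best.trg_phrase, best.log_prob   # (('the', 'house'), -0.4)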
diff --git a/nlp_resource_data/nltk/translate/api.pyc b/nlp_resource_data/nltk/translate/api.pyc
new file mode 100755 (executable)
index 0000000..af1d614
Binary files /dev/null and b/nlp_resource_data/nltk/translate/api.pyc differ
diff --git a/nlp_resource_data/nltk/translate/bleu_score.py b/nlp_resource_data/nltk/translate/bleu_score.py
new file mode 100755 (executable)
index 0000000..e30d112
--- /dev/null
@@ -0,0 +1,600 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: BLEU Score
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
+# Contributors: Dmitrijs Milajevs, Liling Tan
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""BLEU score implementation."""
+from __future__ import division
+
+import math
+import sys
+import fractions
+import warnings
+from collections import Counter
+
+from nltk.util import ngrams
+
+try:
+    fractions.Fraction(0, 1000, _normalize=False)
+    from fractions import Fraction
+except TypeError:
+    from nltk.compat import Fraction
+
+
+def sentence_bleu(references, hypothesis, weights=(0.25, 0.25, 0.25, 0.25),
+                  smoothing_function=None, auto_reweigh=False,
+                  emulate_multibleu=False):
+    """
+    Calculate BLEU score (Bilingual Evaluation Understudy) from
+    Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
+    "BLEU: a method for automatic evaluation of machine translation."
+    In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf
+
+    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+    ...               'ensures', 'that', 'the', 'military', 'always',
+    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']
+
+    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
+    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
+    ...               'that', 'party', 'direct']
+
+    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
+    ...               'heed', 'Party', 'commands']
+
+    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+    ...               'guarantees', 'the', 'military', 'forces', 'always',
+    ...               'being', 'under', 'the', 'command', 'of', 'the',
+    ...               'Party']
+
+    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
+    ...               'of', 'the', 'party']
+
+    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
+    0.5045...
+
+    >>> sentence_bleu([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS
+    0.3969...
+
+    The default BLEU calculates a score for up to 4-grams using uniform
+    weights. To evaluate your translations with higher/lower order ngrams,
+    use customized weights. E.g. when accounting for up to 5-grams with
+    uniform weights:
+
+    >>> weights = (0.1666, 0.1666, 0.1666, 0.1666, 0.1666)
+    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
+    0.4583...
+
+    :param references: reference sentences
+    :type references: list(list(str))
+    :param hypothesis: a hypothesis sentence
+    :type hypothesis: list(str)
+    :param weights: weights for unigrams, bigrams, trigrams and so on
+    :type weights: list(float)
+    :param smoothing_function:
+    :type smoothing_function: SmoothingFunction
+    :param auto_reweigh:
+    :type auto_reweigh: bool
+    :param emulate_multibleu: bool
+    :return: The sentence-level BLEU score.
+    :rtype: float
+    """
+    return corpus_bleu([references], [hypothesis],
+                        weights, smoothing_function, auto_reweigh,
+                        emulate_multibleu)
+
+
+def corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25),
+                smoothing_function=None, auto_reweigh=False,
+                emulate_multibleu=False):
+    """
+    Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
+    the hypotheses and their respective references.
+
+    Instead of averaging the sentence-level BLEU scores (i.e. macro-average
+    precision), the original BLEU metric (Papineni et al. 2002) accounts for
+    the micro-average precision (i.e. summing the numerators and denominators
+    for each hypothesis-reference(s) pair before the division).
+
+    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+    ...         'ensures', 'that', 'the', 'military', 'always',
+    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
+    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
+    ...          'heed', 'Party', 'commands']
+    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+    ...          'guarantees', 'the', 'military', 'forces', 'always',
+    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
+    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
+    ...          'of', 'the', 'party']
+
+    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
+    ...         'interested', 'in', 'world', 'history']
+    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
+    ...          'because', 'he', 'read', 'the', 'book']
+
+    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
+    >>> hypotheses = [hyp1, hyp2]
+    >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
+    0.5920...
+
+    The example below shows that corpus_bleu() is different from averaging
+    sentence_bleu() over the hypotheses:
+
+    >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
+    >>> score2 = sentence_bleu([ref2a], hyp2)
+    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
+    0.6223...
+
+    :param references: a corpus of lists of reference sentences, w.r.t. hypotheses
+    :type references: list(list(list(str)))
+    :param hypotheses: a list of hypothesis sentences
+    :type hypotheses: list(list(str))
+    :param weights: weights for unigrams, bigrams, trigrams and so on
+    :type weights: list(float)
+    :param smoothing_function:
+    :type smoothing_function: SmoothingFunction
+    :param auto_reweigh:
+    :type auto_reweigh: bool
+    :param emulate_multibleu: bool
+    :return: The corpus-level BLEU score.
+    :rtype: float
+    """
+    # Before proceeding to compute BLEU, perform sanity checks.
+
+    p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
+    p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
+    hyp_lengths, ref_lengths = 0, 0
+
+    assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their reference(s) should be the same"
+
+    # Iterate through each hypothesis and their corresponding references.
+    for references, hypothesis in zip(list_of_references, hypotheses):
+        # For each order of ngram, calculate the numerator and
+        # denominator for the corpus-level modified precision.
+        for i, _ in enumerate(weights, start=1):
+            p_i = modified_precision(references, hypothesis, i)
+            p_numerators[i] += p_i.numerator
+            p_denominators[i] += p_i.denominator
+
+        # Calculate the hypothesis length and the closest reference length.
+        # Adds them to the corpus-level hypothesis and reference counts.
+        hyp_len =  len(hypothesis)
+        hyp_lengths += hyp_len
+        ref_lengths += closest_ref_length(references, hyp_len)
+
+    # Calculate corpus-level brevity penalty.
+    bp = brevity_penalty(ref_lengths, hyp_lengths)
+
+    # Uniformly re-weighting based on maximum hypothesis lengths if largest
+    # order of n-grams < 4 and weights is set at default.
+    if auto_reweigh:
+        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
+            weights = ( 1 / hyp_lengths ,) * hyp_lengths
+
+    # Collects the various precision values for the different ngram orders.
+    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
+           for i, _ in enumerate(weights, start=1)]
+
+    # Returns 0 if there's no matching n-grams
+    # We only need to check for p_numerators[1] == 0, since if there's
+    # no unigrams, there won't be any higher order ngrams.
+    if p_numerators[1] == 0:
+        return 0
+
+    # If there's no smoothing function, use method0 from the SmoothingFunction class.
+    if not smoothing_function:
+        smoothing_function = SmoothingFunction().method0
+    # Smooth the modified precision.
+    # Note: smoothing_function() may convert values into floats;
+    #       it tries to retain the Fraction object as much as the
+    #       smoothing method allows.
+    p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis,
+                             hyp_len=hyp_len, emulate_multibleu=emulate_multibleu)
+    s = (w * math.log(p_i) for i, (w, p_i) in enumerate(zip(weights, p_n)))
+    s =  bp * math.exp(math.fsum(s))
+    return round(s, 4) if emulate_multibleu else s
+
+
+def modified_precision(references, hypothesis, n):
+    """
+    Calculate modified ngram precision.
+
+    The normal precision method may overrate poor translations: e.g., a
+    hypothesis in which a single reference word is repeated several times can
+    obtain a very high precision.
+
+    This function only returns the Fraction object that contains the numerator
+    and denominator necessary to calculate the corpus-level precision.
+    To calculate the modified precision for a single pair of hypothesis and
+    references, cast the Fraction object into a float.
+
+    The famous "the the the ... " example shows that you can inflate BLEU
+    precision by duplicating high-frequency words.
+
+        >>> reference1 = 'the cat is on the mat'.split()
+        >>> reference2 = 'there is a cat on the mat'.split()
+        >>> hypothesis1 = 'the the the the the the the'.split()
+        >>> references = [reference1, reference2]
+        >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
+        0.2857...
+
+    In the modified n-gram precision, a reference word will be considered
+    exhausted after a matching hypothesis word is identified, e.g.
+
+        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+        ...               'ensures', 'that', 'the', 'military', 'will',
+        ...               'forever', 'heed', 'Party', 'commands']
+        >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+        ...               'guarantees', 'the', 'military', 'forces', 'always',
+        ...               'being', 'under', 'the', 'command', 'of', 'the',
+        ...               'Party']
+        >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+        ...               'army', 'always', 'to', 'heed', 'the', 'directions',
+        ...               'of', 'the', 'party']
+        >>> hypothesis = 'of the'.split()
+        >>> references = [reference1, reference2, reference3]
+        >>> float(modified_precision(references, hypothesis, n=1))
+        1.0
+        >>> float(modified_precision(references, hypothesis, n=2))
+        1.0
+
+    An example of a normal machine translation hypothesis:
+
+        >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+        ...               'ensures', 'that', 'the', 'military', 'always',
+        ...               'obeys', 'the', 'commands', 'of', 'the', 'party']
+
+        >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
+        ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
+        ...               'that', 'party', 'direct']
+
+        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+        ...               'ensures', 'that', 'the', 'military', 'will',
+        ...               'forever', 'heed', 'Party', 'commands']
+
+        >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+        ...               'guarantees', 'the', 'military', 'forces', 'always',
+        ...               'being', 'under', 'the', 'command', 'of', 'the',
+        ...               'Party']
+
+        >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+        ...               'army', 'always', 'to', 'heed', 'the', 'directions',
+        ...               'of', 'the', 'party']
+        >>> references = [reference1, reference2, reference3]
+        >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
+        0.9444...
+        >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
+        0.5714...
+        >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
+        0.5882352941176471
+        >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
+        0.07692...
+
+
+    :param references: A list of reference translations.
+    :type references: list(list(str))
+    :param hypothesis: A hypothesis translation.
+    :type hypothesis: list(str)
+    :param n: The ngram order.
+    :type n: int
+    :return: BLEU's modified precision for the nth order ngram.
+    :rtype: Fraction
+    """
+    # Extracts all ngrams in hypothesis
+    # Set an empty Counter if hypothesis is empty.
+    counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
+    # Extract a union of references' counts.
+    ## max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
+    max_counts = {}
+    for reference in references:
+        reference_counts = Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
+        for ngram in counts:
+            max_counts[ngram] = max(max_counts.get(ngram, 0),
+                                    reference_counts[ngram])
+
+    # Assigns the intersection between hypothesis and references' counts.
+    clipped_counts = {ngram: min(count, max_counts[ngram])
+                      for ngram, count in counts.items()}
+
+    numerator = sum(clipped_counts.values())
+    # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
+    # Usually this happens when the ngram order is > len(reference).
+    denominator = max(1, sum(counts.values()))
+
+    return Fraction(numerator, denominator, _normalize=False)
+
+
+def closest_ref_length(references, hyp_len):
+    """
+    This function finds the reference whose length is closest to the length of
+    the hypothesis. This closest reference length is referred to as the *r*
+    variable in the brevity penalty formula of Papineni et al. (2002).
+
+    :param references: A list of reference translations.
+    :type references: list(list(str))
+    :param hyp_len: The length of the hypothesis.
+    :type hyp_len: int
+    :return: The length of the reference that's closest to the hypothesis.
+    :rtype: int
+    """
+    ref_lens = (len(reference) for reference in references)
+    closest_ref_len = min(ref_lens, key=lambda ref_len:
+                          (abs(ref_len - hyp_len), ref_len))
+    return closest_ref_len
+
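+# Illustrative sketch, not part of upstream NLTK: with reference lengths 13 and
+# 11 and a hypothesis of length 12, both references are equally close, so the
+# shorter one is chosen. The helper name is hypothetical.
+def _closest_ref_length_sketch():
+    references = [['a'] * 13, ['a'] * 11]
+    return closest_ref_length(references, hyp_len=12)   # -> 11
+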
+
+def brevity_penalty(closest_ref_len, hyp_len):
+    """
+    Calculate brevity penalty.
+
+    Since modified n-gram precision does not sufficiently penalize hypotheses
+    that are shorter than their references, a brevity penalty is used to adjust
+    the overall BLEU score according to length.
+
+    An example from the paper: there are three references with lengths 12, 15
+    and 17, and a concise hypothesis of length 12. The brevity penalty is 1.
+
+        >>> reference1 = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
+        >>> reference2 = list('aaaaaaaaaaaaaaa')   # i.e. ['a'] * 15
+        >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17
+        >>> hypothesis = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
+        >>> references = [reference1, reference2, reference3]
+        >>> hyp_len = len(hypothesis)
+        >>> closest_ref_len =  closest_ref_length(references, hyp_len)
+        >>> brevity_penalty(closest_ref_len, hyp_len)
+        1.0
+
+    If a hypothesis translation is shorter than the references, a penalty is
+    applied.
+
+        >>> references = [['a'] * 28, ['a'] * 28]
+        >>> hypothesis = ['a'] * 12
+        >>> hyp_len = len(hypothesis)
+        >>> closest_ref_len =  closest_ref_length(references, hyp_len)
+        >>> brevity_penalty(closest_ref_len, hyp_len)
+        0.2635971381157267
+
+    The length of the closest reference is used to compute the penalty. If the
+    length of a hypothesis is 12, and the reference lengths are 13 and 2, the
+    penalty is applied because the hypothesis length (12) is less than the
+    closest reference length (13).
+
+        >>> references = [['a'] * 13, ['a'] * 2]
+        >>> hypothesis = ['a'] * 12
+        >>> hyp_len = len(hypothesis)
+        >>> closest_ref_len =  closest_ref_length(references, hyp_len)
+        >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
+        0.9200...
+
+    The brevity penalty doesn't depend on reference order. More importantly,
+    when two reference sentences are at the same distance, the shortest
+    reference sentence length is used.
+
+        >>> references = [['a'] * 13, ['a'] * 11]
+        >>> hypothesis = ['a'] * 12
+        >>> hyp_len = len(hypothesis)
+        >>> closest_ref_len =  closest_ref_length(references, hyp_len)
+        >>> bp1 = brevity_penalty(closest_ref_len, hyp_len)
+        >>> hyp_len = len(hypothesis)
+        >>> closest_ref_len =  closest_ref_length(reversed(references), hyp_len)
+        >>> bp2 = brevity_penalty(closest_ref_len, hyp_len)
+        >>> bp1 == bp2 == 1
+        True
+
+    A test example from mteval-v13a.pl (starting from the line 705):
+
+        >>> references = [['a'] * 11, ['a'] * 8]
+        >>> hypothesis = ['a'] * 7
+        >>> hyp_len = len(hypothesis)
+        >>> closest_ref_len =  closest_ref_length(references, hyp_len)
+        >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
+        0.8668...
+
+        >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
+        >>> hypothesis = ['a'] * 7
+        >>> hyp_len = len(hypothesis)
+        >>> closest_ref_len =  closest_ref_length(references, hyp_len)
+        >>> brevity_penalty(closest_ref_len, hyp_len)
+        1.0
+
+    :param hyp_len: The length of the hypothesis for a single sentence OR the
+        sum of all the hypotheses' lengths for a corpus.
+    :type hyp_len: int
+    :param closest_ref_len: The length of the closest reference for a single
+        hypothesis OR the sum of the closest reference lengths for all hypotheses.
+    :type closest_ref_len: int
+    :return: BLEU's brevity penalty.
+    :rtype: float
+    """
+    if hyp_len > closest_ref_len:
+        return 1
+    # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0
+    elif hyp_len == 0:
+        return 0
+    else:
+        return math.exp(1 - closest_ref_len / hyp_len)
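+# Worked example of the formula above, mirroring the 28-vs-12 doctest:
+# closest_ref_len = 28 and hyp_len = 12, so BP = exp(1 - 28/12) = exp(-4/3),
+# which is roughly 0.2636.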
+
+
+class SmoothingFunction:
+    """
+    This is an implementation of the smoothing techniques
+    for segment-level BLEU scores that was presented in
+    Boxing Chen and Colin Cherry (2014) A Systematic Comparison of
+    Smoothing Techniques for Sentence-Level BLEU. In WMT14.
+    http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
+    """
+    def __init__(self, epsilon=0.1, alpha=5, k=5):
+        """
+        This will initialize the parameters required for the various smoothing
+        techniques, the default values are set to the numbers used in the
+        experiments from Chen and Cherry (2014).
+
+        >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
+        ...                 'that', 'the', 'military', 'always', 'obeys', 'the',
+        ...                 'commands', 'of', 'the', 'party']
+        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
+        ...               'that', 'the', 'military', 'will', 'forever', 'heed',
+        ...               'Party', 'commands']
+
+        >>> chencherry = SmoothingFunction()
+        >>> print (sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
+        0.4118...
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
+        0.4118...
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
+        0.4118...
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
+        0.4489...
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
+        0.4118...
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
+        0.4118...
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
+        0.4905...
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
+        0.4135...
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
+        0.4905...
+
+        :param epsilon: the epsilon value used in method 1
+        :type epsilon: float
+        :param alpha: the alpha value used in method 6
+        :type alpha: int
+        :param k: the k value used in method 4
+        :type k: int
+        """
+        self.epsilon = epsilon
+        self.alpha = alpha
+        self.k = k
+
+    def method0(self, p_n, *args, **kwargs):
+        """ No smoothing. """
+        p_n_new = []
+        _emulate_multibleu = kwargs['emulate_multibleu']
+        for i, p_i in enumerate(p_n):
+            if p_i.numerator != 0:
+                p_n_new.append(p_i)
+            elif _emulate_multibleu and i < 5:
+                return [sys.float_info.min]
+            else:
+                _msg = str("\nCorpus/Sentence contains 0 counts of {}-gram overlaps.\n"
+                           "BLEU scores might be undesirable; "
+                           "use SmoothingFunction().").format(i+1)
+                warnings.warn(_msg)
+                # If this order of n-gram returns 0 counts, the higher order
+                # n-gram would also return 0, thus breaking the loop here.
+                break
+        return p_n_new
+
+    def method1(self, p_n, *args, **kwargs):
+        """
+        Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
+        """
+        return [(p_i.numerator + self.epsilon)/ p_i.denominator
+                if p_i.numerator == 0 else p_i for p_i in p_n]
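+    # For example, with the default epsilon of 0.1, a zero precision stored as
+    # an unnormalized Fraction(0, 9) becomes (0 + 0.1) / 9, i.e. about 0.0111,
+    # while non-zero precisions are passed through unchanged.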
+
+    def method2(self, p_n, *args, **kwargs):
+        """
+        Smoothing method 2: Add 1 to both numerator and denominator from
+        Chin-Yew Lin and Franz Josef Och (2004) Automatic evaluation of
+        machine translation quality using longest common subsequence and
+        skip-bigram statistics. In ACL04.
+        """
+        return [Fraction(p_i.numerator + 1, p_i.denominator + 1, _normalize=False) for p_i in p_n]
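+    # For example, an unnormalized Fraction(3, 9) becomes Fraction(4, 10) and
+    # Fraction(0, 8) becomes Fraction(1, 9); note that, as written above, the
+    # add-one smoothing is applied to every order of n-gram precision, not only
+    # to the zero-valued ones.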
+
+    def method3(self, p_n, *args, **kwargs):
+        """
+        Smoothing method 3: NIST geometric sequence smoothing
+        The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
+        precision score whose matching n-gram count is null.
+        k is 1 for the first 'n' value for which the n-gram match count is null.
+        For example, if the text contains:
+         - one 2-gram match
+         - and (consequently) two 1-gram matches
+        the n-gram count for each individual precision score would be:
+         - n=1  =>  prec_count = 2     (two unigrams)
+         - n=2  =>  prec_count = 1     (one bigram)
+         - n=3  =>  prec_count = 1/2   (no trigram,  taking 'smoothed' value of 1 / ( 2^k ), with k=1)
+         - n=4  =>  prec_count = 1/4   (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
+        """
+        incvnt = 1 # From the mteval-v13a.pl, it's referred to as k.
+        for i, p_i in enumerate(p_n):
+            if p_i.numerator == 0:
+                p_n[i] = 1 / (2**incvnt * p_i.denominator)
+                incvnt+=1
+        return p_n
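+    # For example, if the trigram and 4-gram precisions are both 0/4, the first
+    # zero becomes 1 / (2**1 * 4) = 0.125 and the second 1 / (2**2 * 4) = 0.0625,
+    # since incvnt grows by one for each zero precision encountered.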
+
+    def method4(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
+        """
+        Smoothing method 4:
+        Shorter translations may have inflated precision values due to having
+        smaller denominators; therefore, we give them proportionally
+        smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
+        suggest dividing by 1/ln(len(T)), where T is the length of the translation.
+        """
+        for i, p_i in enumerate(p_n):
+            if p_i.numerator == 0 and hyp_len != 0:
+                incvnt = i+1 * self.k / math.log(hyp_len) # Note that this K is different from the K from NIST.
+                p_n[i] = 1 / incvnt
+        return p_n
+
+
+    def method5(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
+        """
+        Smoothing method 5:
+        The matched counts for similar values of n should be similar. To
+        calculate the n-gram matched count, it averages the n−1, n and n+1 gram
+        matched counts.
+        """
+        m = {}
+        # Requires a precision value for an additional ngram order.
+        p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
+        m[-1] = p_n[0] + 1
+        for i, p_i in enumerate(p_n):
+            p_n[i] = (m[i-1] + p_i + p_n_plus1[i+1]) / 3
+            m[i] = p_n[i]
+        return p_n
+
+    def method6(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
+        """
+        Smoothing method 6:
+        Interpolates the maximum likelihood estimate of the precision *p_n* with
+        a prior estimate *pi0*. The prior is estimated by assuming that the ratio
+        between pn and pn−1 will be the same as that between pn−1 and pn−2; from
+        Gao and He (2013) Training MRF-Based Phrase Translation Models using
+        Gradient Ascent. In NAACL.
+        """
+        # This smoothing only works when p_1 and p_2 are non-zero.
+        # Raise an error with an appropriate message when the input is too short
+        # to use this smoothing technique.
+        assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
+        for i, p_i in enumerate(p_n):
+            if i in [0,1]: # Skips the first 2 orders of ngrams.
+                continue
+            else:
+                pi0 = 0 if p_n[i-2] == 0 else p_n[i-1]**2 / p_n[i-2]
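+                # e.g. if p_n[i-2] is 0.8 and p_n[i-1] is 0.4, then
+                # pi0 = 0.4**2 / 0.8 = 0.2.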
+                # No. of ngrams in translation that matches the reference.
+                m = p_i.numerator
+                # No. of ngrams in translation.
+                l = sum(1 for _ in ngrams(hypothesis, i+1))
+                # Calculates the interpolated precision.
+                p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
+        return p_n
+
+    def method7(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
+        """
+        Smoothing method 7:
+        Interpolates methods 4 and 5: the length-based smoothing of method 4 is
+        applied first, followed by the n-gram averaging of method 5.
+        """
+        p_n = self.method4(p_n, references, hypothesis, hyp_len)
+        p_n = self.method5(p_n, references, hypothesis, hyp_len)
+        return p_n
diff --git a/nlp_resource_data/nltk/translate/bleu_score.pyc b/nlp_resource_data/nltk/translate/bleu_score.pyc
new file mode 100755 (executable)
index 0000000..9bc1c65
Binary files /dev/null and b/nlp_resource_data/nltk/translate/bleu_score.pyc differ
diff --git a/nlp_resource_data/nltk/translate/chrf_score.py b/nlp_resource_data/nltk/translate/chrf_score.py
new file mode 100755 (executable)
index 0000000..1748633
--- /dev/null
@@ -0,0 +1,137 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: ChrF score
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors: Maja Popovic
+# Contributors: Liling Tan
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+""" ChrF score implementation """
+from __future__ import division
+from collections import Counter
+
+from nltk.util import ngrams, everygrams
+
+def sentence_chrf(reference, hypothesis, min_len=1, max_len=6, beta=3.0):
+    """
+    Calculates the sentence level CHRF (Character n-gram F-score) described in
+     - Maja Popovic. 2015. CHRF: Character n-gram F-score for Automatic MT Evaluation.
+       In Proceedings of the 10th Workshop on Machine Translation.
+       http://www.statmt.org/wmt15/pdf/WMT49.pdf
+     - Maja Popovic. 2016. CHRF Deconstructed: β Parameters and n-gram Weights.
+       In Proceedings of the 1st Conference on Machine Translation.
+       http://www.statmt.org/wmt16/pdf/W16-2341.pdf
+
+    Unlike multi-reference BLEU, CHRF only supports a single reference.
+
+    An example from the original BLEU paper
+    http://www.aclweb.org/anthology/P02-1040.pdf
+
+        >>> ref1 = str('It is a guide to action that ensures that the military '
+        ...            'will forever heed Party commands').split()
+        >>> hyp1 = str('It is a guide to action which ensures that the military '
+        ...            'always obeys the commands of the party').split()
+        >>> hyp2 = str('It is to insure the troops forever hearing the activity '
+        ...            'guidebook that party direct').split()
+        >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
+        0.6768...
+        >>> sentence_chrf(ref1, hyp2) # doctest: +ELLIPSIS
+        0.4201...
+
+    The infamous "the the the ... " example
+
+        >>> ref = 'the cat is on the mat'.split()
+        >>> hyp = 'the the the the the the the'.split()
+        >>> sentence_chrf(ref, hyp)  # doctest: +ELLIPSIS
+        0.2530...
+
+    An example to show that this function allows users to use strings instead of
+    tokens, i.e. list(str) as inputs.
+
+        >>> ref1 = str('It is a guide to action that ensures that the military '
+        ...            'will forever heed Party commands')
+        >>> hyp1 = str('It is a guide to action which ensures that the military '
+        ...            'always obeys the commands of the party')
+        >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
+        0.6768...
+        >>> type(ref1) == type(hyp1) == str
+        True
+        >>> sentence_chrf(ref1.split(), hyp1.split()) # doctest: +ELLIPSIS
+        0.6768...
+
+    To skip the unigrams and only use 2- to 3-grams:
+
+        >>> sentence_chrf(ref1, hyp1, min_len=2, max_len=3) # doctest: +ELLIPSIS
+        0.7018...
+
+    :param reference: reference sentence
+    :type reference: list(str) / str
+    :param hypothesis: a hypothesis sentence
+    :type hypothesis: list(str) / str
+    :param min_len: The minimum order of n-gram this function should extract.
+    :type min_len: int
+    :param max_len: The maximum order of n-gram this function should extract.
+    :type max_len: int
+    :param beta: the parameter to assign more importance to recall over precision
+    :type beta: float
+    :return: the sentence level CHRF score.
+    :rtype: float
+    """
+    return corpus_chrf([reference], [hypothesis], min_len, max_len, beta=beta)
+
+
+def corpus_chrf(list_of_references, hypotheses, min_len=1, max_len=6, beta=3.0):
+    """
+    Calculates the corpus level CHRF (Character n-gram F-score). It is the
+    micro-averaged value of the sentence/segment level CHRF score.
+
+    CHRF only supports a single reference.
+
+        >>> ref1 = str('It is a guide to action that ensures that the military '
+        ...            'will forever heed Party commands').split()
+        >>> ref2 = str('It is the guiding principle which guarantees the military '
+        ...            'forces always being under the command of the Party').split()
+        >>>
+        >>> hyp1 = str('It is a guide to action which ensures that the military '
+        ...            'always obeys the commands of the party').split()
+        >>> hyp2 = str('It is to insure the troops forever hearing the activity '
+        ...            'guidebook that party direct')
+        >>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS
+        0.4915...
+
+    :param list_of_references: a list of reference sentences, one per hypothesis
+    :type list_of_references: list(list(str)) / list(str)
+    :param hypotheses: a list of hypothesis sentences
+    :type hypotheses: list(list(str)) / list(str)
+    :param min_len: The minimum order of n-gram this function should extract.
+    :type min_len: int
+    :param max_len: The maximum order of n-gram this function should extract.
+    :type max_len: int
+    :param beta: the parameter to assign more importance to recall over precision
+    :type beta: float
+    :return: the corpus level CHRF score.
+    :rtype: float
+    """
+
+    assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their references should be the same"
+
+    # Iterate through each hypothesis and their corresponding references.
+    for reference, hypothesis in zip(list_of_references, hypotheses):
+        # Cheating condition to allow users to input strings instead of tokens.
+        if not isinstance(reference, str) and not isinstance(hypothesis, str):
+            reference, hypothesis = ' '.join(reference), ' '.join(hypothesis)
+        # For each order of ngram, calculate the no. of ngram matches and
+        # keep track of no. of ngram in references.
+        ref_ngrams = Counter(everygrams(reference, min_len, max_len))
+        hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
+        overlap_ngrams = ref_ngrams & hyp_ngrams
+        tp = sum(overlap_ngrams.values()) # True positives.
+        tpfp = sum(hyp_ngrams.values()) # True positives + False positives.
+        tffn = sum(ref_ngrams.values()) # True positives + False negatives.
+
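+    # F-beta over the character n-gram counts: with the default beta=3 the
+    # factor below is 9, so recall is weighted much more heavily than precision
+    # (e.g. P=1.0, R=0.1 gives roughly 0.11, while P=0.1, R=1.0 gives roughly 0.53).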
+    precision = tp / tpfp
+    recall = tp / tffn
+    factor = beta**2
+    score = (1+ factor ) * (precision * recall) / ( factor * precision + recall)
+    return score
diff --git a/nlp_resource_data/nltk/translate/chrf_score.pyc b/nlp_resource_data/nltk/translate/chrf_score.pyc
new file mode 100755 (executable)
index 0000000..13fa6a1
Binary files /dev/null and b/nlp_resource_data/nltk/translate/chrf_score.pyc differ
diff --git a/nlp_resource_data/nltk/translate/gale_church.py b/nlp_resource_data/nltk/translate/gale_church.py
new file mode 100755 (executable)
index 0000000..a543b4c
--- /dev/null
@@ -0,0 +1,248 @@
+# -*- coding: utf-8 -*-
+
+# Natural Language Toolkit: Gale-Church Aligner
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Torsten Marek <marek@ifi.uzh.ch>
+# Contributor: Cassidy Laidlaw, Liling Tan
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+
+A port of the Gale-Church Aligner.
+
+Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora.
+http://aclweb.org/anthology/J93-1004.pdf
+
+"""
+
+from __future__ import division
+import math
+
+try:
+    from scipy.stats import norm
+    norm_logsf = norm.logsf
+except ImportError:
+    def erfcc(x):
+        """Complementary error function."""
+        z = abs(x)
+        t = 1 / (1 + 0.5 * z)
+        r = t * math.exp(-z * z -
+                         1.26551223 + t *
+                         (1.00002368 + t *
+                          (.37409196 + t *
+                           (.09678418 + t *
+                            (-.18628806 + t *
+                             (.27886807 + t *
+                              (-1.13520398 + t *
+                               (1.48851587 + t *
+                                (-.82215223 + t * .17087277)))))))))
+        if x >= 0.:
+            return r
+        else:
+            return 2. - r
+
+
+    def norm_cdf(x):
+        """Return the area under the normal distribution from M{-∞..x}."""
+        return 1 - 0.5 * erfcc(x / math.sqrt(2))
+
+
+    def norm_logsf(x):
+        try:
+            return math.log(1 - norm_cdf(x))
+        except ValueError:
+            return float('-inf')
+
+
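+# The fallback above approximates scipy.stats.norm.logsf; e.g. norm_logsf(1.96)
+# is roughly log(1 - 0.975) = log(0.025), i.e. about -3.69.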
+LOG2 = math.log(2)
+
+
+class LanguageIndependent(object):
+    # These are the language-independent probabilities and parameters
+    # given in Gale & Church
+
+    # for the computation, l_1 is always the language with less characters
+    PRIORS = {
+        (1, 0): 0.0099,
+        (0, 1): 0.0099,
+        (1, 1): 0.89,
+        (2, 1): 0.089,
+        (1, 2): 0.089,
+        (2, 2): 0.011,
+    }
+
+    AVERAGE_CHARACTERS = 1
+    VARIANCE_CHARACTERS = 6.8
+
+
+def trace(backlinks, source_sents_lens, target_sents_lens):
+    """
+    Traverses the alignment cost from the backlinks and retrieves the
+    appropriate sentence pairs.
+    
+    :param backlinks: A dictionary where the keys are alignment points and the
+        values are the alignment types used to reach them (referencing LanguageIndependent.PRIORS)
+    :type backlinks: dict
+    :param source_sents_lens: A list of source sentences' lengths
+    :type source_sents_lens: list(int)
+    :param target_sents_lens: A list of target sentences' lengths
+    :type target_sents_lens: list(int)
+    """
+    links = []
+    position = (len(source_sents_lens), len(target_sents_lens))
+    while position != (0, 0) and all(p >=0 for p in position):
+        try:
+            s, t = backlinks[position]
+        except TypeError:
+            position = (position[0]-1 , position[1]-1)
+            continue
+        for i in range(s):
+            for j in range(t):
+                links.append((position[0] - i - 1, position[1] - j - 1))
+        position = (position[0] - s, position[1] - t)
+
+    return links[::-1]
+
+
+def align_log_prob(i, j, source_sents, target_sents, alignment, params):
+    """Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]}
+    being aligned with a specific C{alignment}.
+
+    @param i: The offset of the source sentence.
+    @param j: The offset of the target sentence.
+    @param source_sents: The list of source sentence lengths.
+    @param target_sents: The list of target sentence lengths.
+    @param alignment: The alignment type, a tuple of two integers.
+    @param params: The sentence alignment parameters.
+
+    @returns: The log probability of a specific alignment between the two sentences, given the parameters.
+    """
+    l_s = sum(source_sents[i - offset - 1] for offset in range(alignment[0]))
+    l_t = sum(target_sents[j - offset - 1] for offset in range(alignment[1]))
+    try:
+        # actually, the paper says l_s * params.VARIANCE_CHARACTERS, this is based on the C
+        # reference implementation. With l_s in the denominator, insertions are impossible.
+        m = (l_s + l_t / params.AVERAGE_CHARACTERS) / 2
+        delta = (l_s * params.AVERAGE_CHARACTERS - l_t) / math.sqrt(m * params.VARIANCE_CHARACTERS)
+    except ZeroDivisionError:
+        return float('-inf')
+
+    return - (LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment]))
+
+
+def align_blocks(source_sents_lens, target_sents_lens, params = LanguageIndependent):
+    """Return the sentence alignment of two text blocks (usually paragraphs).
+
+        >>> align_blocks([5,5,5], [7,7,7])
+        [(0, 0), (1, 1), (2, 2)]
+        >>> align_blocks([10,5,5], [12,20])
+        [(0, 0), (1, 1), (2, 1)]
+        >>> align_blocks([12,20], [10,5,5])
+        [(0, 0), (1, 1), (1, 2)]
+        >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])
+        [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]
+
+    @param source_sents_lens: The list of source sentence lengths.
+    @param target_sents_lens: The list of target sentence lengths.
+    @param params: the sentence alignment parameters.
+    @return: The sentence alignments, a list of index pairs.
+    """
+
+    alignment_types = list(params.PRIORS.keys())
+
+    # there are always three rows in the history (with the last of them being filled)
+    D = [[]]
+
+    backlinks = {}
+
+    for i in range(len(source_sents_lens) + 1): 
+        for j in range(len(target_sents_lens) + 1):
+            min_dist = float('inf')
+            min_align = None
+            for a in alignment_types:
+                prev_i = - 1 - a[0]
+                prev_j = j - a[1]
+                if prev_i < -len(D) or prev_j < 0:
+                    continue
+                p = D[prev_i][prev_j] + align_log_prob(i, j, source_sents_lens, 
+                                                       target_sents_lens, a, params)
+                if p < min_dist:
+                    min_dist = p
+                    min_align = a
+
+            if min_dist == float('inf'):
+                min_dist = 0
+
+            backlinks[(i, j)] = min_align
+            D[-1].append(min_dist)
+
+        if len(D) > 2:
+            D.pop(0)
+        D.append([])
+    
+    return trace(backlinks, source_sents_lens, target_sents_lens)
+
+
+def align_texts(source_blocks, target_blocks, params = LanguageIndependent):
+    """Creates the sentence alignment of two texts.
+
+    Texts can consist of several blocks. Block boundaries cannot be crossed by sentence 
+    alignment links. 
+
+    Each block consists of a list that contains the lengths (in characters) of the sentences
+    in this block.
+    
+    @param source_blocks: The list of blocks in the source text.
+    @param target_blocks: The list of blocks in the target text.
+    @param params: the sentence alignment parameters.
+
+    @returns: A list of sentence alignment lists
+    """
+    if len(source_blocks) != len(target_blocks):
+        raise ValueError("Source and target texts do not have the same number of blocks.")
+    
+    return [align_blocks(source_block, target_block, params) 
+            for source_block, target_block in zip(source_blocks, target_blocks)]
+
+
+# File I/O functions; may belong in a corpus reader
+
+def split_at(it, split_value):
+    """Splits an iterator C{it} at values of C{split_value}. 
+
+    Each instance of C{split_value} is swallowed. The iterator produces
+    subiterators which need to be consumed fully before the next subiterator
+    can be used.
+    """
+    def _chunk_iterator(first):
+        v = first
+        while v != split_value:
+            yield v
+            v = next(it)
+    
+    while True:
+        yield _chunk_iterator(next(it))
+        
+
+def parse_token_stream(stream, soft_delimiter, hard_delimiter):
+    """Parses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens) 
+    and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function.
+    """
+    return [
+        [sum(len(token) for token in sentence_it) 
+         for sentence_it in split_at(block_it, soft_delimiter)]
+        for block_it in split_at(stream, hard_delimiter)]
+
+
+
+
+#    Code for test files in nltk_contrib/align/data/*.tok
+#    import sys
+#    from contextlib import nested
+#    with nested(open(sys.argv[1], "r"), open(sys.argv[2], "r")) as (s, t):
+#        source = parse_token_stream((l.strip() for l in s), ".EOS", ".EOP")
+#        target = parse_token_stream((l.strip() for l in t), ".EOS", ".EOP")
+#        print align_texts(source, target)
+
+
diff --git a/nlp_resource_data/nltk/translate/gale_church.pyc b/nlp_resource_data/nltk/translate/gale_church.pyc
new file mode 100755 (executable)
index 0000000..c90be27
Binary files /dev/null and b/nlp_resource_data/nltk/translate/gale_church.pyc differ
diff --git a/nlp_resource_data/nltk/translate/gdfa.py b/nlp_resource_data/nltk/translate/gdfa.py
new file mode 100755 (executable)
index 0000000..e0e7f04
--- /dev/null
@@ -0,0 +1,131 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: GDFA word alignment symmetrization
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors: Liling Tan
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+import codecs
+from collections import defaultdict
+
+def grow_diag_final_and(srclen, trglen, e2f, f2e):
+    """
+    This function symmetrizes the source-to-target and target-to-source
+    word alignment outputs using the grow-diag-final-and (GDFA) algorithm (Koehn, 2005).
+    
+    Step 1: Find the intersection of the bidirectional alignment.
+    
+    Step 2: Search for additional neighbor alignment points to be added, given
+            these criteria: (i) neighbor alignment points are not in the
+            intersection and (ii) neighbor alignments are in the union.
+            
+    Step 3: Add all other alignment points that are not in the intersection and
+            not in the neighboring alignments that met the criteria, but are in
+            the original forward/backward alignment outputs.
+    
+        >>> forw = ('0-0 2-1 9-2 21-3 10-4 7-5 11-6 9-7 12-8 1-9 3-10 '
+        ...         '4-11 17-12 17-13 25-14 13-15 24-16 11-17 28-18')
+        >>> back = ('0-0 1-9 2-9 3-10 4-11 5-12 6-6 7-5 8-6 9-7 10-4 '
+        ...         '11-6 12-8 13-12 15-12 17-13 18-13 19-12 20-13 '
+        ...         '21-3 22-12 23-14 24-17 25-15 26-17 27-18 28-18')
+        >>> srctext = ("この よう な ハロー 白色 わい 星 の L 関数 "
+        ...            "は L と 共 に 不連続 に 増加 する こと が "
+        ...            "期待 さ れる こと を 示し た 。")
+        >>> trgtext = ("Therefore , we expect that the luminosity function "
+        ...            "of such halo white dwarfs increases discontinuously "
+        ...            "with the luminosity .")
+        >>> srclen = len(srctext.split())
+        >>> trglen = len(trgtext.split())
+        >>>
+        >>> gdfa = grow_diag_final_and(srclen, trglen, forw, back)
+        >>> gdfa == set([(28, 18), (6, 6), (24, 17), (2, 1), (15, 12), (13, 12),
+        ...         (2, 9), (3, 10), (26, 17), (25, 15), (8, 6), (9, 7), (20,
+        ...         13), (18, 13), (0, 0), (10, 4), (13, 15), (23, 14), (7, 5),
+        ...         (25, 14), (1, 9), (17, 13), (4, 11), (11, 17), (9, 2), (22,
+        ...         12), (27, 18), (24, 16), (21, 3), (19, 12), (17, 12), (5,
+        ...         12), (11, 6), (12, 8)])
+        True
+    
+    References:
+    Koehn, P., A. Axelrod, A. Birch, C. Callison-Burch, M. Osborne, and D. Talbot.
+    2005. Edinburgh System Description for the 2005 IWSLT Speech 
+    Translation Evaluation. In MT Eval Workshop.
+
+    :type srclen: int
+    :param srclen: the number of tokens in the source language
+    :type trglen: int
+    :param trglen: the number of tokens in the target language
+    :type e2f: str
+    :param e2f: the forward word alignment outputs from source-to-target
+                language (in pharaoh output format)
+    :type f2e: str
+    :param f2e: the backward word alignment outputs from target-to-source
+                language (in pharaoh output format)
+    :rtype: set(tuple(int))
+    :return: the symmetrized alignment points from the GDFA algorithm
+    """
+
+    # Converts pharaoh text format into list of tuples.
+    e2f = [tuple(map(int,a.split('-'))) for a in e2f.split()]
+    f2e = [tuple(map(int,a.split('-'))) for a in f2e.split()]
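+    # e.g. the pharaoh string '0-0 2-1 9-2' becomes [(0, 0), (2, 1), (9, 2)].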
+    
+    neighbors = [(-1,0),(0,-1),(1,0),(0,1),(-1,-1),(-1,1),(1,-1),(1,1)]
+    alignment = set(e2f).intersection(set(f2e)) # Find the intersection.
+    union = set(e2f).union(set(f2e))
+
+    # *aligned* is used to check if neighbors are aligned in grow_diag()
+    aligned = defaultdict(set)
+    for i,j in alignment:
+        aligned['e'].add(i)
+        aligned['f'].add(j)
+    
+    def grow_diag():
+        """
+        Search for the neighbor points and add them to the intersected
+        alignment points if the criteria are met.
+        """
+        prev_len = len(alignment) - 1
+        # iterate until no new points added
+        while prev_len < len(alignment):
+            # for english word e = 0 ... en
+            for e in range(srclen):
+                # for foreign word f = 0 ... fn
+                for f in range(trglen): 
+                    # if ( e aligned with f)
+                    if (e,f) in alignment:
+                        # for each neighboring point (e-new, f-new)
+                        for neighbor in neighbors:
+                            neighbor = tuple(i+j for i,j in zip((e,f),neighbor))
+                            e_new, f_new = neighbor
+                            # if ( ( e-new not aligned and f-new not aligned) 
+                            # and (e-new, f-new in union(e2f, f2e) )
+                            if (e_new not in aligned and f_new not in aligned)\
+                            and neighbor in union:
+                                alignment.add(neighbor)
+                                aligned['e'].add(e_new); aligned['f'].add(f_new)
+                                prev_len+=1
+                                                                    
+    def final_and(a):
+        """
+        Adds remaining points that are not in the intersection, not in the 
+        neighboring alignments but in the original *e2f* and *f2e* alignments
+        """
+        # for english word e = 0 ... en
+        for e_new in range(srclen):
+            # for foreign word f = 0 ... fn
+            for f_new in range(trglen):
+                # if ( ( e-new not aligned and f-new not aligned) 
+                # and (e-new, f-new in union(e2f, f2e) )
+                if (e_new not in aligned
+                    and f_new not in aligned
+                    and (e_new, f_new) in a):
+
+                    alignment.add((e_new, f_new))
+                    aligned['e'].add(e_new); aligned['f'].add(f_new)
+
+    grow_diag()
+    final_and(e2f)
+    final_and(f2e)
+    return alignment
+
diff --git a/nlp_resource_data/nltk/translate/gdfa.pyc b/nlp_resource_data/nltk/translate/gdfa.pyc
new file mode 100755 (executable)
index 0000000..4015011
Binary files /dev/null and b/nlp_resource_data/nltk/translate/gdfa.pyc differ
diff --git a/nlp_resource_data/nltk/translate/gleu_score.py b/nlp_resource_data/nltk/translate/gleu_score.py
new file mode 100755 (executable)
index 0000000..e73be4e
--- /dev/null
@@ -0,0 +1,193 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: GLEU Score
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors:
+# Contributors: Mike Schuster, Michael Wayne Goodman, Liling Tan
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+""" GLEU score implementation. """
+from __future__ import division
+from collections import Counter
+
+from nltk.util import ngrams, everygrams
+
+
+def sentence_gleu(references, hypothesis, min_len=1, max_len=4):
+    """
+    Calculates the sentence level GLEU (Google-BLEU) score described in
+
+        Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi,
+        Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey,
+        Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser,
+        Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens,
+        George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith,
+        Jason Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes,
+        Jeffrey Dean. (2016) Google’s Neural Machine Translation System:
+        Bridging the Gap between Human and Machine Translation.
+        eprint arXiv:1609.08144. https://arxiv.org/pdf/1609.08144v2.pdf
+        Retrieved on 27 Oct 2016.
+
+    From Wu et al. (2016):
+        "The BLEU score has some undesirable properties when used for single
+         sentences, as it was designed to be a corpus measure. We therefore
+         use a slightly different score for our RL experiments which we call
+         the 'GLEU score'. For the GLEU score, we record all sub-sequences of
+         1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then
+         compute a recall, which is the ratio of the number of matching n-grams
+         to the number of total n-grams in the target (ground truth) sequence,
+         and a precision, which is the ratio of the number of matching n-grams
+         to the number of total n-grams in the generated output sequence. Then
+         GLEU score is simply the minimum of recall and precision. This GLEU
+         score's range is always between 0 (no matches) and 1 (all match) and
+         it is symmetrical when switching output and target. According to
+         our experiments, GLEU score correlates quite well with the BLEU
+         metric on a corpus level but does not have its drawbacks for our per
+         sentence reward objective."
+
+    Note: The initial implementation only allowed a single reference, but now
+          a list of references is required (which is consistent with
+          bleu_score.sentence_bleu()).
+
+    The infamous "the the the ... " example
+
+        >>> ref = 'the cat is on the mat'.split()
+        >>> hyp = 'the the the the the the the'.split()
+        >>> sentence_gleu([ref], hyp)  # doctest: +ELLIPSIS
+        0.0909...
+
+    An example to evaluate normal machine translation outputs
+
+        >>> ref1 = str('It is a guide to action that ensures that the military '
+        ...            'will forever heed Party commands').split()
+        >>> hyp1 = str('It is a guide to action which ensures that the military '
+        ...            'always obeys the commands of the party').split()
+        >>> hyp2 = str('It is to insure the troops forever hearing the activity '
+        ...            'guidebook that party direct').split()
+        >>> sentence_gleu([ref1], hyp1) # doctest: +ELLIPSIS
+        0.4393...
+        >>> sentence_gleu([ref1], hyp2) # doctest: +ELLIPSIS
+        0.1206...
+
+    :param references: a list of reference sentences
+    :type references: list(list(str))
+    :param hypothesis: a hypothesis sentence
+    :type hypothesis: list(str)
+    :param min_len: The minimum order of n-gram this function should extract.
+    :type min_len: int
+    :param max_len: The maximum order of n-gram this function should extract.
+    :type max_len: int
+    :return: the sentence level GLEU score.
+    :rtype: float
+    """
+    return corpus_gleu(
+        [references],
+        [hypothesis],
+        min_len=min_len,
+        max_len=max_len
+    )
+
+def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4):
+    """
+    Calculate a single corpus-level GLEU score (aka. system-level GLEU) for all
+    the hypotheses and their respective references.
+
+    Instead of averaging the sentence level GLEU scores (i.e. macro-average
+    precision), Wu et al. (2016) sum up the matching tokens and the max of
+    hypothesis and reference tokens for each sentence, then compute using the
+    aggregate values.
+
+    From Mike Schuster (via email):
+        "For the corpus, we just add up the two statistics n_match and
+         n_all = max(n_all_output, n_all_target) for all sentences, then
+         calculate gleu_score = n_match / n_all, so it is not just a mean of
+         the sentence gleu scores (in our case, longer sentences count more,
+         which I think makes sense as they are more difficult to translate)."
+
+    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+    ...         'ensures', 'that', 'the', 'military', 'always',
+    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
+    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
+    ...          'heed', 'Party', 'commands']
+    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+    ...          'guarantees', 'the', 'military', 'forces', 'always',
+    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
+    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
+    ...          'of', 'the', 'party']
+
+    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
+    ...         'interested', 'in', 'world', 'history']
+    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
+    ...          'because', 'he', 'read', 'the', 'book']
+
+    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
+    >>> hypotheses = [hyp1, hyp2]
+    >>> corpus_gleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
+    0.5673...
+
+    The example below shows that corpus_gleu() is different from averaging
+    sentence_gleu() for hypotheses
+
+    >>> score1 = sentence_gleu([ref1a], hyp1)
+    >>> score2 = sentence_gleu([ref2a], hyp2)
+    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
+    0.6144...
+
+    :param list_of_references: a list of reference sentences, w.r.t. hypotheses
+    :type list_of_references: list(list(list(str)))
+    :param hypotheses: a list of hypothesis sentences
+    :type hypotheses: list(list(str))
+    :param min_len: The minimum order of n-gram this function should extract.
+    :type min_len: int
+    :param max_len: The maximum order of n-gram this function should extract.
+    :type max_len: int
+    :return: The corpus-level GLEU score.
+    :rtype: float
+    """
+    # sanity check
+    assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their reference(s) should be the same"
+
+    # sum matches and max-token-lengths over all sentences
+    corpus_n_match = 0
+    corpus_n_all = 0
+
+    for references, hypothesis in zip(list_of_references, hypotheses):
+        hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
+        tpfp = sum(hyp_ngrams.values())  # True positives + False positives.
+        
+        hyp_counts = []
+        for reference in references:
+            ref_ngrams = Counter(everygrams(reference, min_len, max_len))
+            tpfn = sum(ref_ngrams.values())  # True positives + False negatives.
+
+            overlap_ngrams = ref_ngrams & hyp_ngrams
+            tp = sum(overlap_ngrams.values())  # True positives.
+
+            # While GLEU is defined as the minimum of precision and
+            # recall, we can reduce the number of division operations by one by
+            # instead finding the maximum of the denominators for the precision
+            # and recall formulae, since the numerators are the same:
+            #     precision = tp / tpfp
+            #     recall = tp / tpfn
+            #     gleu_score = min(precision, recall) == tp / max(tpfp, tpfn)
+            n_all = max(tpfp, tpfn)
+
+            if n_all > 0:
+                hyp_counts.append((tp, n_all))
+
+        # use the reference yielding the highest score
+        if hyp_counts:
+            n_match, n_all = max(hyp_counts, key=lambda hc: hc[0]/hc[1])
+            corpus_n_match += n_match
+            corpus_n_all += n_all
+
+    # corner case: empty corpus or empty references---don't divide by zero!
+    if corpus_n_all == 0:
+        gleu_score = 0.0
+    else:
+        gleu_score = corpus_n_match / corpus_n_all
+
+    return gleu_score
diff --git a/nlp_resource_data/nltk/translate/gleu_score.pyc b/nlp_resource_data/nltk/translate/gleu_score.pyc
new file mode 100755 (executable)
index 0000000..3e4710f
Binary files /dev/null and b/nlp_resource_data/nltk/translate/gleu_score.pyc differ
diff --git a/nlp_resource_data/nltk/translate/ibm1.py b/nlp_resource_data/nltk/translate/ibm1.py
new file mode 100755 (executable)
index 0000000..35e0420
--- /dev/null
@@ -0,0 +1,247 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: IBM Model 1
+#
+# Copyright (C) 2001-2013 NLTK Project
+# Author: Chin Yee Lee <c.lee32@student.unimelb.edu.au>
+#         Hengfeng Li <hengfeng12345@gmail.com>
+#         Ruxin Hou <r.hou@student.unimelb.edu.au>
+#         Calvin Tanujaya Lim <c.tanujayalim@gmail.com>
+# Based on earlier version by:
+#         Will Zhang <wilzzha@gmail.com>
+#         Guan Gui <ggui@student.unimelb.edu.au>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Lexical translation model that ignores word order.
+
+In IBM Model 1, word order is ignored for simplicity. Thus, the
+following three alignments are equally likely. As long as the word
+alignments are equivalent, it doesn't matter where the word
+occurs in the source or target sentence.
+
+Source: je mange du jambon
+Target: i eat some ham
+Alignment: (1,1) (2,2) (3,3) (4,4)
+
+Source: je mange du jambon
+Target: some ham eat i
+Alignment: (1,4) (2,3) (3,1) (4,2)
+
+Source: du jambon je mange
+Target: eat i some ham
+Alignment: (1,3) (2,4) (3,2) (4,1)
+
+The EM algorithm used in Model 1 is:
+E step - In the training data, count how many times a source language
+         word is translated into a target language word, weighted by
+         the prior probability of the translation.
+
+M step - Estimate the new probability of translation based on the
+         counts from the Expectation step.
+
+
+Notations:
+i: Position in the source sentence
+    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
+j: Position in the target sentence
+    Valid values are 1, 2, ..., length of target sentence
+s: A word in the source language
+t: A word in the target language
+
+
+References:
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
+
+Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
+Robert L. Mercer. 1993. The Mathematics of Statistical Machine
+Translation: Parameter Estimation. Computational Linguistics, 19 (2),
+263-311.
+"""
+
+from __future__ import division
+from collections import defaultdict
+from nltk.translate import AlignedSent
+from nltk.translate import Alignment
+from nltk.translate import IBMModel
+from nltk.translate.ibm_model import Counts
+import warnings
+
+
+class IBMModel1(IBMModel):
+    """
+    Lexical translation model that ignores word order
+
+    >>> bitext = []
+    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
+    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
+    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
+    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
+    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
+    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
+
+    >>> ibm1 = IBMModel1(bitext, 5)
+
+    >>> print(ibm1.translation_table['buch']['book'])
+    0.889...
+    >>> print(ibm1.translation_table['das']['book'])
+    0.061...
+    >>> print(ibm1.translation_table['buch'][None])
+    0.113...
+    >>> print(ibm1.translation_table['ja'][None])
+    0.072...
+
+    >>> test_sentence = bitext[2]
+    >>> test_sentence.words
+    ['das', 'buch', 'ist', 'ja', 'klein']
+    >>> test_sentence.mots
+    ['the', 'book', 'is', 'small']
+    >>> test_sentence.alignment
+    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
+
+    """
+
+    def __init__(self, sentence_aligned_corpus, iterations,
+                 probability_tables=None):
+        """
+        Train on ``sentence_aligned_corpus`` and create a lexical
+        translation model.
+
+        Translation direction is from ``AlignedSent.mots`` to
+        ``AlignedSent.words``.
+
+        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
+        :type sentence_aligned_corpus: list(AlignedSent)
+
+        :param iterations: Number of iterations to run training algorithm
+        :type iterations: int
+
+        :param probability_tables: Optional. Use this to pass in custom
+            probability values. If not specified, probabilities will be
+            set to a uniform distribution, or some other sensible value.
+            If specified, the following entry must be present:
+            ``translation_table``.
+            See ``IBMModel`` for the type and purpose of this table.
+        :type probability_tables: dict[str]: object
+        """
+        super(IBMModel1, self).__init__(sentence_aligned_corpus)
+
+        if probability_tables is None:
+            self.set_uniform_probabilities(sentence_aligned_corpus)
+        else:
+            # Set user-defined probabilities
+            self.translation_table = probability_tables['translation_table']
+
+        for n in range(0, iterations):
+            self.train(sentence_aligned_corpus)
+
+        self.__align_all(sentence_aligned_corpus)
+
+    def set_uniform_probabilities(self, sentence_aligned_corpus):
+        initial_prob = 1 / len(self.trg_vocab)
+        if initial_prob < IBMModel.MIN_PROB:
+            warnings.warn("Target language vocabulary is too large (" +
+                          str(len(self.trg_vocab)) + " words). "
+                          "Results may be less accurate.")
+
+        for t in self.trg_vocab:
+            self.translation_table[t] = defaultdict(lambda: initial_prob)
+
+    def train(self, parallel_corpus):
+        counts = Counts()
+        for aligned_sentence in parallel_corpus:
+            trg_sentence = aligned_sentence.words
+            src_sentence = [None] + aligned_sentence.mots
+
+            # E step (a): Compute normalization factors to weigh counts
+            total_count = self.prob_all_alignments(src_sentence, trg_sentence)
+
+            # E step (b): Collect counts
+            for t in trg_sentence:
+                for s in src_sentence:
+                    count = self.prob_alignment_point(s, t)
+                    normalized_count = count / total_count[t]
+                    counts.t_given_s[t][s] += normalized_count
+                    counts.any_t_given_s[s] += normalized_count
+
+        # M step: Update probabilities with maximum likelihood estimate
+        self.maximize_lexical_translation_probabilities(counts)
+
+    def prob_all_alignments(self, src_sentence, trg_sentence):
+        """
+        Computes the probability of all possible word alignments,
+        expressed as a marginal distribution over target words t
+
+        Each entry in the return value represents the contribution to
+        the total alignment probability by the target word t.
+
+        To obtain probability(alignment | src_sentence, trg_sentence),
+        simply sum the entries in the return value.
+
+        :return: Probability of t for all s in ``src_sentence``
+        :rtype: dict(str): float
+        """
+        alignment_prob_for_t = defaultdict(lambda: 0.0)
+        for t in trg_sentence:
+            for s in src_sentence:
+                alignment_prob_for_t[t] += self.prob_alignment_point(s, t)
+        return alignment_prob_for_t
+
+    def prob_alignment_point(self, s, t):
+        """
+        Probability that word ``t`` in the target sentence is aligned to
+        word ``s`` in the source sentence
+        """
+        return self.translation_table[t][s]
+
+    def prob_t_a_given_s(self, alignment_info):
+        """
+        Probability of target sentence and an alignment given the
+        source sentence
+        """
+        prob = 1.0
+
+        for j, i in enumerate(alignment_info.alignment):
+            if j == 0:
+                continue  # skip the dummy zeroth element
+            trg_word = alignment_info.trg_sentence[j]
+            src_word = alignment_info.src_sentence[i]
+            prob *= self.translation_table[trg_word][src_word]
+
+        return max(prob, IBMModel.MIN_PROB)
+
+    def __align_all(self, parallel_corpus):
+        for sentence_pair in parallel_corpus:
+            self.__align(sentence_pair)
+
+    def __align(self, sentence_pair):
+        """
+        Determines the best word alignment for one sentence pair from
+        the corpus that the model was trained on.
+
+        The best alignment will be set in ``sentence_pair`` when the
+        method returns. In contrast with the internal implementation of
+        IBM models, the word indices in the ``Alignment`` are zero-
+        indexed, not one-indexed.
+
+        :param sentence_pair: A sentence in the source language and its
+            counterpart sentence in the target language
+        :type sentence_pair: AlignedSent
+        """
+        best_alignment = []
+
+        for j, trg_word in enumerate(sentence_pair.words):
+            # Initialize trg_word to align with the NULL token
+            best_prob = max(self.translation_table[trg_word][None],
+                            IBMModel.MIN_PROB)
+            best_alignment_point = None
+            for i, src_word in enumerate(sentence_pair.mots):
+                align_prob = self.translation_table[trg_word][src_word]
+                if align_prob >= best_prob:  # prefer newer word in case of tie
+                    best_prob = align_prob
+                    best_alignment_point = i
+
+            best_alignment.append((j, best_alignment_point))
+
+        sentence_pair.alignment = Alignment(best_alignment)
diff --git a/nlp_resource_data/nltk/translate/ibm1.pyc b/nlp_resource_data/nltk/translate/ibm1.pyc
new file mode 100755 (executable)
index 0000000..a72258f
Binary files /dev/null and b/nlp_resource_data/nltk/translate/ibm1.pyc differ
diff --git a/nlp_resource_data/nltk/translate/ibm2.py b/nlp_resource_data/nltk/translate/ibm2.py
new file mode 100755 (executable)
index 0000000..f2c17a7
--- /dev/null
@@ -0,0 +1,308 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: IBM Model 2
+#
+# Copyright (C) 2001-2013 NLTK Project
+# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Lexical translation model that considers word order.
+
+IBM Model 2 improves on Model 1 by accounting for word order.
+An alignment probability is introduced, a(i | j,l,m), which predicts
+a source word position, given its aligned target word's position.
+
+The EM algorithm used in Model 2 is:
+E step - In the training data, collect counts, weighted by prior
+         probabilities.
+         (a) count how many times a source language word is translated
+             into a target language word
+         (b) count how many times a particular position in the source
+             sentence is aligned to a particular position in the target
+             sentence
+
+M step - Estimate new probabilities based on the counts from the E step
+
+
+Notations:
+i: Position in the source sentence
+    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
+j: Position in the target sentence
+    Valid values are 1, 2, ..., length of target sentence
+l: Number of words in the source sentence, excluding NULL
+m: Number of words in the target sentence
+s: A word in the source language
+t: A word in the target language
+
+
+References:
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
+
+Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
+Robert L. Mercer. 1993. The Mathematics of Statistical Machine
+Translation: Parameter Estimation. Computational Linguistics, 19 (2),
+263-311.
+"""
+
+from __future__ import division
+from collections import defaultdict
+from nltk.translate import AlignedSent
+from nltk.translate import Alignment
+from nltk.translate import IBMModel
+from nltk.translate import IBMModel1
+from nltk.translate.ibm_model import Counts
+import warnings
+
+
+class IBMModel2(IBMModel):
+    """
+    Lexical translation model that considers word order
+
+    >>> bitext = []
+    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
+    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
+    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
+    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
+    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
+    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
+
+    >>> ibm2 = IBMModel2(bitext, 5)
+
+    >>> print(round(ibm2.translation_table['buch']['book'], 3))
+    1.0
+    >>> print(round(ibm2.translation_table['das']['book'], 3))
+    0.0
+    >>> print(round(ibm2.translation_table['buch'][None], 3))
+    0.0
+    >>> print(round(ibm2.translation_table['ja'][None], 3))
+    0.0
+
+    >>> print(ibm2.alignment_table[1][1][2][2])
+    0.938...
+    >>> print(round(ibm2.alignment_table[1][2][2][2], 3))
+    0.0
+    >>> print(round(ibm2.alignment_table[2][2][4][5], 3))
+    1.0
+
+    >>> test_sentence = bitext[2]
+    >>> test_sentence.words
+    ['das', 'buch', 'ist', 'ja', 'klein']
+    >>> test_sentence.mots
+    ['the', 'book', 'is', 'small']
+    >>> test_sentence.alignment
+    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
+
+    """
+
+    def __init__(self, sentence_aligned_corpus, iterations,
+                 probability_tables=None):
+        """
+        Train on ``sentence_aligned_corpus`` and create a lexical
+        translation model and an alignment model.
+
+        Translation direction is from ``AlignedSent.mots`` to
+        ``AlignedSent.words``.
+
+        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
+        :type sentence_aligned_corpus: list(AlignedSent)
+
+        :param iterations: Number of iterations to run training algorithm
+        :type iterations: int
+
+        :param probability_tables: Optional. Use this to pass in custom
+            probability values. If not specified, probabilities will be
+            set to a uniform distribution, or some other sensible value.
+            If specified, all the following entries must be present:
+            ``translation_table``, ``alignment_table``.
+            See ``IBMModel`` for the type and purpose of these tables.
+        :type probability_tables: dict[str]: object
+        """
+        super(IBMModel2, self).__init__(sentence_aligned_corpus)
+
+        if probability_tables is None:
+            # Get translation probabilities from IBM Model 1
+            # Run more iterations of training for Model 1, since it is
+            # faster than Model 2
+            ibm1 = IBMModel1(sentence_aligned_corpus, 2 * iterations)
+            self.translation_table = ibm1.translation_table
+            self.set_uniform_probabilities(sentence_aligned_corpus)
+        else:
+            # Set user-defined probabilities
+            self.translation_table = probability_tables['translation_table']
+            self.alignment_table = probability_tables['alignment_table']
+
+        for n in range(0, iterations):
+            self.train(sentence_aligned_corpus)
+
+        self.__align_all(sentence_aligned_corpus)
+
+    def set_uniform_probabilities(self, sentence_aligned_corpus):
+        # a(i | j,l,m) = 1 / (l+1) for all i, j, l, m
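+        #   e.g. for a source sentence of l = 4 words, every alignment entry
+        #   a(i | j, 4, m) is initialised to 1/5, for i = 0 (NULL), 1, ..., 4.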
+        l_m_combinations = set()
+        for aligned_sentence in sentence_aligned_corpus:
+            l = len(aligned_sentence.mots)
+            m = len(aligned_sentence.words)
+            if (l, m) not in l_m_combinations:
+                l_m_combinations.add((l, m))
+                initial_prob = 1 / (l + 1)
+                if initial_prob < IBMModel.MIN_PROB:
+                    warnings.warn("A source sentence is too long (" + str(l) +
+                                  " words). Results may be less accurate.")
+
+                for i in range(0, l + 1):
+                    for j in range(1, m + 1):
+                        self.alignment_table[i][j][l][m] = initial_prob
+
+    def train(self, parallel_corpus):
+        counts = Model2Counts()
+        for aligned_sentence in parallel_corpus:
+            src_sentence = [None] + aligned_sentence.mots
+            trg_sentence = ['UNUSED'] + aligned_sentence.words  # 1-indexed
+            l = len(aligned_sentence.mots)
+            m = len(aligned_sentence.words)
+
+            # E step (a): Compute normalization factors to weigh counts
+            total_count = self.prob_all_alignments(src_sentence, trg_sentence)
+
+            # E step (b): Collect counts
+            for j in range(1, m + 1):
+                t = trg_sentence[j]
+                for i in range(0, l + 1):
+                    s = src_sentence[i]
+                    count = self.prob_alignment_point(
+                        i, j, src_sentence, trg_sentence)
+                    normalized_count = count / total_count[t]
+
+                    counts.update_lexical_translation(normalized_count, s, t)
+                    counts.update_alignment(normalized_count, i, j, l, m)
+
+        # M step: Update probabilities with maximum likelihood estimates
+        self.maximize_lexical_translation_probabilities(counts)
+        self.maximize_alignment_probabilities(counts)
+
+    def maximize_alignment_probabilities(self, counts):
+        MIN_PROB = IBMModel.MIN_PROB
+        for i, j_s in counts.alignment.items():
+            for j, src_sentence_lengths in j_s.items():
+                for l, trg_sentence_lengths in src_sentence_lengths.items():
+                    for m in trg_sentence_lengths:
+                        estimate = (counts.alignment[i][j][l][m] /
+                                    counts.alignment_for_any_i[j][l][m])
+                        self.alignment_table[i][j][l][m] = max(estimate,
+                                                               MIN_PROB)
+
+    def prob_all_alignments(self, src_sentence, trg_sentence):
+        """
+        Computes the probability of all possible word alignments,
+        expressed as a marginal distribution over target words t
+
+        Each entry in the return value represents the contribution to
+        the total alignment probability by the target word t.
+
+        The entries serve as normalization factors in the E step: the
+        probability of a single alignment point (i, j) is divided by the
+        entry for its target word to obtain a normalized count.
+
+        :return: Probability of t for all s in ``src_sentence``
+        :rtype: dict(str): float
+        """
+        alignment_prob_for_t = defaultdict(lambda: 0.0)
+        for j in range(1, len(trg_sentence)):
+            t = trg_sentence[j]
+            for i in range(0, len(src_sentence)):
+                alignment_prob_for_t[t] += self.prob_alignment_point(
+                    i, j, src_sentence, trg_sentence)
+        return alignment_prob_for_t
+
+    def prob_alignment_point(self, i, j, src_sentence, trg_sentence):
+        """
+        Probability that position j in ``trg_sentence`` is aligned to
+        position i in the ``src_sentence``
+        """
+        l = len(src_sentence) - 1
+        m = len(trg_sentence) - 1
+        s = src_sentence[i]
+        t = trg_sentence[j]
+        return self.translation_table[t][s] * self.alignment_table[i][j][l][m]
+
+    def prob_t_a_given_s(self, alignment_info):
+        """
+        Probability of target sentence and an alignment given the
+        source sentence
+        """
+        prob = 1.0
+        l = len(alignment_info.src_sentence) - 1
+        m = len(alignment_info.trg_sentence) - 1
+
+        for j, i in enumerate(alignment_info.alignment):
+            if j == 0:
+                continue  # skip the dummy zeroth element
+            trg_word = alignment_info.trg_sentence[j]
+            src_word = alignment_info.src_sentence[i]
+            prob *= (self.translation_table[trg_word][src_word] *
+                     self.alignment_table[i][j][l][m])
+
+        return max(prob, IBMModel.MIN_PROB)
+
+    def __align_all(self, parallel_corpus):
+        for sentence_pair in parallel_corpus:
+            self.__align(sentence_pair)
+
+    def __align(self, sentence_pair):
+        """
+        Determines the best word alignment for one sentence pair from
+        the corpus that the model was trained on.
+
+        The best alignment will be set in ``sentence_pair`` when the
+        method returns. In contrast with the internal implementation of
+        IBM models, the word indices in the ``Alignment`` are zero-
+        indexed, not one-indexed.
+
+        :param sentence_pair: A sentence in the source language and its
+            counterpart sentence in the target language
+        :type sentence_pair: AlignedSent
+        """
+        best_alignment = []
+
+        l = len(sentence_pair.mots)
+        m = len(sentence_pair.words)
+
+        for j, trg_word in enumerate(sentence_pair.words):
+            # Initialize trg_word to align with the NULL token
+            best_prob = (self.translation_table[trg_word][None] *
+                         self.alignment_table[0][j + 1][l][m])
+            best_prob = max(best_prob, IBMModel.MIN_PROB)
+            best_alignment_point = None
+            for i, src_word in enumerate(sentence_pair.mots):
+                align_prob = (self.translation_table[trg_word][src_word] *
+                              self.alignment_table[i + 1][j + 1][l][m])
+                if align_prob >= best_prob:
+                    best_prob = align_prob
+                    best_alignment_point = i
+
+            best_alignment.append((j, best_alignment_point))
+
+        sentence_pair.alignment = Alignment(best_alignment)
+
+
+class Model2Counts(Counts):
+    """
+    Data object to store counts of various parameters during training.
+    Includes counts for alignment.
+    """
+    def __init__(self):
+        super(Model2Counts, self).__init__()
+        self.alignment = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+                lambda: 0.0))))
+        self.alignment_for_any_i = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+
+    def update_lexical_translation(self, count, s, t):
+        self.t_given_s[t][s] += count
+        self.any_t_given_s[s] += count
+
+    def update_alignment(self, count, i, j, l, m):
+        self.alignment[i][j][l][m] += count
+        self.alignment_for_any_i[j][l][m] += count
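+
+
+# Purely illustrative usage sketch (the demo_* names below are ours, not part
+# of the NLTK API; the class docstring above remains the reference): train
+# Model 2 on a tiny bitext and inspect the learned tables.  Runs only when
+# this module is executed directly.
+if __name__ == '__main__':
+    demo_bitext = [
+        AlignedSent(['das', 'haus'], ['the', 'house']),
+        AlignedSent(['das', 'buch'], ['the', 'book']),
+        AlignedSent(['ein', 'buch'], ['a', 'book']),
+    ]
+    demo_model = IBMModel2(demo_bitext, 5)
+    # translation_table[target_word][source_word] holds t(t | s);
+    # training also sets a zero-indexed Alignment on each AlignedSent.
+    print(demo_model.translation_table['buch']['book'])
+    print(demo_bitext[1].alignment)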
diff --git a/nlp_resource_data/nltk/translate/ibm2.pyc b/nlp_resource_data/nltk/translate/ibm2.pyc
new file mode 100755 (executable)
index 0000000..df9809f
Binary files /dev/null and b/nlp_resource_data/nltk/translate/ibm2.pyc differ
diff --git a/nlp_resource_data/nltk/translate/ibm3.py b/nlp_resource_data/nltk/translate/ibm3.py
new file mode 100755 (executable)
index 0000000..8af6059
--- /dev/null
@@ -0,0 +1,337 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: IBM Model 3
+#
+# Copyright (C) 2001-2013 NLTK Project
+# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Translation model that considers how a word can be aligned to
+multiple words in another language.
+
+IBM Model 3 improves on Model 2 by directly modeling the phenomenon
+where a word in one language may be translated into zero or more words
+in another. This is expressed by the fertility probability,
+n(phi | source word).
+
+If a source word translates into more than one word, it is possible to
+generate sentences that have the same alignment in multiple ways. This
+is modeled by a distortion step. The distortion probability, d(j|i,l,m),
+predicts a target word position, given its aligned source word's
+position. The distortion probability replaces the alignment probability
+of Model 2.
+
+The fertility probability is not applicable for NULL. Target words that
+align to NULL are assumed to be distributed uniformly in the target
+sentence. The existence of these words is modeled by p1, the probability
+that a target word produced by a real source word requires another
+target word that is produced by NULL.
+
+The EM algorithm used in Model 3 is:
+E step - In the training data, collect counts, weighted by prior
+         probabilities.
+         (a) count how many times a source language word is translated
+             into a target language word
+         (b) count how many times a particular position in the target
+             sentence is aligned to a particular position in the source
+             sentence
+         (c) count how many times a source word is aligned to phi number
+             of target words
+         (d) count how many times NULL is aligned to a target word
+
+M step - Estimate new probabilities based on the counts from the E step
+
+Because there are too many possible alignments, only the most probable
+ones are considered. First, the best alignment is determined using prior
+probabilities. Then, a hill climbing approach is used to find other good
+candidates.
+
+
+Notations:
+i: Position in the source sentence
+    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
+j: Position in the target sentence
+    Valid values are 1, 2, ..., length of target sentence
+l: Number of words in the source sentence, excluding NULL
+m: Number of words in the target sentence
+s: A word in the source language
+t: A word in the target language
+phi: Fertility, the number of target words produced by a source word
+p1: Probability that a target word produced by a source word is
+    accompanied by another target word that is aligned to NULL
+p0: 1 - p1
+
+
+References:
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
+
+Peter F. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
+Robert L. Mercer. 1993. The Mathematics of Statistical Machine
+Translation: Parameter Estimation. Computational Linguistics, 19 (2),
+263-311.
+"""
+
+from __future__ import division
+from collections import defaultdict
+from math import factorial
+from nltk.translate import AlignedSent
+from nltk.translate import Alignment
+from nltk.translate import IBMModel
+from nltk.translate import IBMModel2
+from nltk.translate.ibm_model import Counts
+import warnings
+
+
+class IBMModel3(IBMModel):
+    """
+    Translation model that considers how a word can be aligned to
+    multiple words in another language
+
+    >>> bitext = []
+    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
+    >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
+    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
+    >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
+    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
+    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
+    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
+    >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
+    >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
+
+    >>> ibm3 = IBMModel3(bitext, 5)
+
+    >>> print(round(ibm3.translation_table['buch']['book'], 3))
+    1.0
+    >>> print(round(ibm3.translation_table['das']['book'], 3))
+    0.0
+    >>> print(round(ibm3.translation_table['ja'][None], 3))
+    1.0
+
+    >>> print(round(ibm3.distortion_table[1][1][2][2], 3))
+    1.0
+    >>> print(round(ibm3.distortion_table[1][2][2][2], 3))
+    0.0
+    >>> print(round(ibm3.distortion_table[2][2][4][5], 3))
+    0.75
+
+    >>> print(round(ibm3.fertility_table[2]['summarize'], 3))
+    1.0
+    >>> print(round(ibm3.fertility_table[1]['book'], 3))
+    1.0
+
+    >>> print(ibm3.p1)
+    0.054...
+
+    >>> test_sentence = bitext[2]
+    >>> test_sentence.words
+    ['das', 'buch', 'ist', 'ja', 'klein']
+    >>> test_sentence.mots
+    ['the', 'book', 'is', 'small']
+    >>> test_sentence.alignment
+    Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])
+
+    """
+
+    def __init__(self, sentence_aligned_corpus, iterations,
+                 probability_tables=None):
+        """
+        Train on ``sentence_aligned_corpus`` and create a lexical
+        translation model, a distortion model, a fertility model, and a
+        model for generating NULL-aligned words.
+
+        Translation direction is from ``AlignedSent.mots`` to
+        ``AlignedSent.words``.
+
+        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
+        :type sentence_aligned_corpus: list(AlignedSent)
+
+        :param iterations: Number of iterations to run training algorithm
+        :type iterations: int
+
+        :param probability_tables: Optional. Use this to pass in custom
+            probability values. If not specified, probabilities will be
+            set to a uniform distribution, or some other sensible value.
+            If specified, all the following entries must be present:
+            ``translation_table``, ``alignment_table``,
+            ``fertility_table``, ``p1``, ``distortion_table``.
+            See ``IBMModel`` for the type and purpose of these tables.
+        :type probability_tables: dict[str]: object
+        """
+        super(IBMModel3, self).__init__(sentence_aligned_corpus)
+        self.reset_probabilities()
+
+        if probability_tables is None:
+            # Get translation and alignment probabilities from IBM Model 2
+            ibm2 = IBMModel2(sentence_aligned_corpus, iterations)
+            self.translation_table = ibm2.translation_table
+            self.alignment_table = ibm2.alignment_table
+            self.set_uniform_probabilities(sentence_aligned_corpus)
+        else:
+            # Set user-defined probabilities
+            self.translation_table = probability_tables['translation_table']
+            self.alignment_table = probability_tables['alignment_table']
+            self.fertility_table = probability_tables['fertility_table']
+            self.p1 = probability_tables['p1']
+            self.distortion_table = probability_tables['distortion_table']
+
+        for n in range(0, iterations):
+            self.train(sentence_aligned_corpus)
+
+    def reset_probabilities(self):
+        super(IBMModel3, self).reset_probabilities()
+        self.distortion_table = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+                lambda: self.MIN_PROB))))
+        """
+        dict[int][int][int][int]: float. Probability(j | i,l,m).
+        Values accessed as ``distortion_table[j][i][l][m]``.
+        """
+
+    def set_uniform_probabilities(self, sentence_aligned_corpus):
+        # d(j | i,l,m) = 1 / m for all i, j, l, m
+        l_m_combinations = set()
+        for aligned_sentence in sentence_aligned_corpus:
+            l = len(aligned_sentence.mots)
+            m = len(aligned_sentence.words)
+            if (l, m) not in l_m_combinations:
+                l_m_combinations.add((l, m))
+                initial_prob = 1 / m
+                if initial_prob < IBMModel.MIN_PROB:
+                    warnings.warn("A target sentence is too long (" + str(m) +
+                                  " words). Results may be less accurate.")
+                for j in range(1, m + 1):
+                    for i in range(0, l + 1):
+                        self.distortion_table[j][i][l][m] = initial_prob
+
+        # simple initialization, taken from GIZA++
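+        # The fixed values below give fertilities 0-3 a combined mass of
+        # 0.2 + 0.65 + 0.1 + 0.04 = 0.99; the remaining 0.01 is spread
+        # evenly over fertilities 4-9 (0.01 / 6 each).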
+        self.fertility_table[0] = defaultdict(lambda: 0.2)
+        self.fertility_table[1] = defaultdict(lambda: 0.65)
+        self.fertility_table[2] = defaultdict(lambda: 0.1)
+        self.fertility_table[3] = defaultdict(lambda: 0.04)
+        MAX_FERTILITY = 10
+        initial_fert_prob = 0.01 / (MAX_FERTILITY - 4)
+        for phi in range(4, MAX_FERTILITY):
+            self.fertility_table[phi] = defaultdict(lambda: initial_fert_prob)
+
+        self.p1 = 0.5
+
+    def train(self, parallel_corpus):
+        counts = Model3Counts()
+        for aligned_sentence in parallel_corpus:
+            l = len(aligned_sentence.mots)
+            m = len(aligned_sentence.words)
+
+            # Sample the alignment space
+            sampled_alignments, best_alignment = self.sample(aligned_sentence)
+            # Record the most probable alignment
+            aligned_sentence.alignment = Alignment(
+                best_alignment.zero_indexed_alignment())
+
+            # E step (a): Compute normalization factors to weigh counts
+            total_count = self.prob_of_alignments(sampled_alignments)
+
+            # E step (b): Collect counts
+            for alignment_info in sampled_alignments:
+                count = self.prob_t_a_given_s(alignment_info)
+                normalized_count = count / total_count
+
+                for j in range(1, m + 1):
+                    counts.update_lexical_translation(
+                        normalized_count, alignment_info, j)
+                    counts.update_distortion(
+                        normalized_count, alignment_info, j, l, m)
+
+                counts.update_null_generation(normalized_count, alignment_info)
+                counts.update_fertility(normalized_count, alignment_info)
+
+        # M step: Update probabilities with maximum likelihood estimates
+        # If any probability is less than MIN_PROB, clamp it to MIN_PROB
+        existing_alignment_table = self.alignment_table
+        self.reset_probabilities()
+        self.alignment_table = existing_alignment_table  # don't retrain
+
+        self.maximize_lexical_translation_probabilities(counts)
+        self.maximize_distortion_probabilities(counts)
+        self.maximize_fertility_probabilities(counts)
+        self.maximize_null_generation_probabilities(counts)
+
+    def maximize_distortion_probabilities(self, counts):
+        MIN_PROB = IBMModel.MIN_PROB
+        for j, i_s in counts.distortion.items():
+            for i, src_sentence_lengths in i_s.items():
+                for l, trg_sentence_lengths in src_sentence_lengths.items():
+                    for m in trg_sentence_lengths:
+                        estimate = (counts.distortion[j][i][l][m] /
+                                    counts.distortion_for_any_j[i][l][m])
+                        self.distortion_table[j][i][l][m] = max(estimate,
+                                                                MIN_PROB)
+
+    def prob_t_a_given_s(self, alignment_info):
+        """
+        Probability of target sentence and an alignment given the
+        source sentence
+        """
+        src_sentence = alignment_info.src_sentence
+        trg_sentence = alignment_info.trg_sentence
+        l = len(src_sentence) - 1  # exclude NULL
+        m = len(trg_sentence) - 1
+        p1 = self.p1
+        p0 = 1 - p1
+
+        probability = 1.0
+        MIN_PROB = IBMModel.MIN_PROB
+
+        # Combine NULL insertion probability
+        null_fertility = alignment_info.fertility_of_i(0)
+        probability *= (pow(p1, null_fertility) *
+                        pow(p0, m - 2 * null_fertility))
+        if probability < MIN_PROB:
+            return MIN_PROB
+
+        # Compute combination (m - null_fertility) choose null_fertility
+        for i in range(1, null_fertility + 1):
+            probability *= (m - null_fertility - i + 1) / i
+            if probability < MIN_PROB:
+                return MIN_PROB
+
+        # Combine fertility probabilities
+        for i in range(1, l + 1):
+            fertility = alignment_info.fertility_of_i(i)
+            probability *= (factorial(fertility) *
+                self.fertility_table[fertility][src_sentence[i]])
+            if probability < MIN_PROB:
+                return MIN_PROB
+
+        # Combine lexical and distortion probabilities
+        for j in range(1, m + 1):
+            t = trg_sentence[j]
+            i = alignment_info.alignment[j]
+            s = src_sentence[i]
+
+            probability *= (self.translation_table[t][s] *
+                self.distortion_table[j][i][l][m])
+            if probability < MIN_PROB:
+                return MIN_PROB
+
+        return probability
+
+
+class Model3Counts(Counts):
+    """
+    Data object to store counts of various parameters during training.
+    Includes counts for distortion.
+    """
+    def __init__(self):
+        super(Model3Counts, self).__init__()
+        self.distortion = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+                lambda: 0.0))))
+        self.distortion_for_any_j = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+
+    def update_distortion(self, count, alignment_info, j, l, m):
+        i = alignment_info.alignment[j]
+        self.distortion[j][i][l][m] += count
+        self.distortion_for_any_j[i][l][m] += count
diff --git a/nlp_resource_data/nltk/translate/ibm3.pyc b/nlp_resource_data/nltk/translate/ibm3.pyc
new file mode 100755 (executable)
index 0000000..073a93a
Binary files /dev/null and b/nlp_resource_data/nltk/translate/ibm3.pyc differ
diff --git a/nlp_resource_data/nltk/translate/ibm4.py b/nlp_resource_data/nltk/translate/ibm4.py
new file mode 100755 (executable)
index 0000000..bb8d913
--- /dev/null
@@ -0,0 +1,467 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: IBM Model 4
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Tah Wei Hoon <hoon.tw@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Translation model that reorders output words based on their type and
+distance from other related words in the output sentence.
+
+IBM Model 4 improves the distortion model of Model 3, motivated by the
+observation that certain words tend to be re-ordered in a predictable
+way relative to one another. For example, <adjective><noun> in English
+usually has its order flipped as <noun><adjective> in French.
+
+Model 4 requires words in the source and target vocabularies to be
+categorized into classes. This can be linguistically driven, like parts
+of speech (adjectives, nouns, prepositions, etc.). Word classes can also
+be obtained by statistical methods. The original IBM Model 4 uses an
+information theoretic approach to group words into 50 classes for each
+vocabulary.
+
+Terminology:
+Cept:
+    A source word with non-zero fertility i.e. aligned to one or more
+    target words.
+Tablet:
+    The set of target word(s) aligned to a cept.
+Head of cept:
+    The first word of the tablet of that cept.
+Center of cept:
+    The average position of the words in that cept's tablet. If the
+    value is not an integer, the ceiling is taken.
+    For example, for a tablet with words in positions 2, 5, 6 in the
+    target sentence, the center of the corresponding cept is
+    ceil((2 + 5 + 6) / 3) = 5
+Displacement:
+    For a head word, defined as (position of head word - position of
+    previous cept's center). Can be positive or negative.
+    For a non-head word, defined as (position of non-head word -
+    position of previous word in the same tablet). Always positive,
+    because successive words in a tablet are assumed to appear to the
+    right of the previous word.
+
+In contrast to Model 3 which reorders words in a tablet independently of
+other words, Model 4 distinguishes between three cases.
+(1) Words generated by NULL are distributed uniformly.
+(2) For a head word t, its position is modeled by the probability
+    d_head(displacement | word_class_s(s),word_class_t(t)),
+    where s is the previous cept, and word_class_s and word_class_t map
+    s and t to a source and a target language word class, respectively.
+(3) For a non-head word t, its position is modeled by the probability
+    d_non_head(displacement | word_class_t(t))
+
+The EM algorithm used in Model 4 is:
+E step - In the training data, collect counts, weighted by prior
+         probabilities.
+         (a) count how many times a source language word is translated
+             into a target language word
+         (b) for a particular word class, count how many times a head
+             word is located at a particular displacement from the
+             previous cept's center
+         (c) for a particular word class, count how many times a
+             non-head word is located at a particular displacement from
+             the previous target word
+         (d) count how many times a source word is aligned to phi number
+             of target words
+         (e) count how many times NULL is aligned to a target word
+
+M step - Estimate new probabilities based on the counts from the E step
+
+Like Model 3, there are too many possible alignments to consider. Thus,
+a hill climbing approach is used to sample good candidates.
+
+
+Notations:
+i: Position in the source sentence
+    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
+j: Position in the target sentence
+    Valid values are 1, 2, ..., length of target sentence
+l: Number of words in the source sentence, excluding NULL
+m: Number of words in the target sentence
+s: A word in the source language
+t: A word in the target language
+phi: Fertility, the number of target words produced by a source word
+p1: Probability that a target word produced by a source word is
+    accompanied by another target word that is aligned to NULL
+p0: 1 - p1
+dj: Displacement, Δj
+
+
+References:
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
+
+Peter F. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
+Robert L. Mercer. 1993. The Mathematics of Statistical Machine
+Translation: Parameter Estimation. Computational Linguistics, 19 (2),
+263-311.
+"""
+
+from __future__ import division
+from collections import defaultdict
+from math import factorial
+from nltk.translate import AlignedSent
+from nltk.translate import Alignment
+from nltk.translate import IBMModel
+from nltk.translate import IBMModel3
+from nltk.translate.ibm_model import Counts
+from nltk.translate.ibm_model import longest_target_sentence_length
+import warnings
+
+
+class IBMModel4(IBMModel):
+    """
+    Translation model that reorders output words based on their type and
+    their distance from other related words in the output sentence
+
+    >>> bitext = []
+    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
+    >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
+    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
+    >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
+    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
+    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
+    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
+    >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
+    >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
+    >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 }
+    >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 }
+
+    >>> ibm4 = IBMModel4(bitext, 5, src_classes, trg_classes)
+
+    >>> print(round(ibm4.translation_table['buch']['book'], 3))
+    1.0
+    >>> print(round(ibm4.translation_table['das']['book'], 3))
+    0.0
+    >>> print(round(ibm4.translation_table['ja'][None], 3))
+    1.0
+
+    >>> print(round(ibm4.head_distortion_table[1][0][1], 3))
+    1.0
+    >>> print(round(ibm4.head_distortion_table[2][0][1], 3))
+    0.0
+    >>> print(round(ibm4.non_head_distortion_table[3][6], 3))
+    0.5
+
+    >>> print(round(ibm4.fertility_table[2]['summarize'], 3))
+    1.0
+    >>> print(round(ibm4.fertility_table[1]['book'], 3))
+    1.0
+
+    >>> print(ibm4.p1)
+    0.033...
+
+    >>> test_sentence = bitext[2]
+    >>> test_sentence.words
+    ['das', 'buch', 'ist', 'ja', 'klein']
+    >>> test_sentence.mots
+    ['the', 'book', 'is', 'small']
+    >>> test_sentence.alignment
+    Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])
+
+    """
+
+    def __init__(self, sentence_aligned_corpus, iterations,
+                 source_word_classes, target_word_classes,
+                 probability_tables=None):
+        """
+        Train on ``sentence_aligned_corpus`` and create a lexical
+        translation model, distortion models, a fertility model, and a
+        model for generating NULL-aligned words.
+
+        Translation direction is from ``AlignedSent.mots`` to
+        ``AlignedSent.words``.
+
+        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
+        :type sentence_aligned_corpus: list(AlignedSent)
+
+        :param iterations: Number of iterations to run training algorithm
+        :type iterations: int
+
+        :param source_word_classes: Lookup table that maps a source word
+            to its word class, the latter represented by an integer id
+        :type source_word_classes: dict[str]: int
+
+        :param target_word_classes: Lookup table that maps a target word
+            to its word class, the latter represented by an integer id
+        :type target_word_classes: dict[str]: int
+
+        :param probability_tables: Optional. Use this to pass in custom
+            probability values. If not specified, probabilities will be
+            set to a uniform distribution, or some other sensible value.
+            If specified, all the following entries must be present:
+            ``translation_table``, ``alignment_table``,
+            ``fertility_table``, ``p1``, ``head_distortion_table``,
+            ``non_head_distortion_table``. See ``IBMModel`` and
+            ``IBMModel4`` for the type and purpose of these tables.
+        :type probability_tables: dict[str]: object
+        """
+        super(IBMModel4, self).__init__(sentence_aligned_corpus)
+        self.reset_probabilities()
+        self.src_classes = source_word_classes
+        self.trg_classes = target_word_classes
+
+        if probability_tables is None:
+            # Get probabilities from IBM model 3
+            ibm3 = IBMModel3(sentence_aligned_corpus, iterations)
+            self.translation_table = ibm3.translation_table
+            self.alignment_table = ibm3.alignment_table
+            self.fertility_table = ibm3.fertility_table
+            self.p1 = ibm3.p1
+            self.set_uniform_probabilities(sentence_aligned_corpus)
+        else:
+            # Set user-defined probabilities
+            self.translation_table = probability_tables['translation_table']
+            self.alignment_table = probability_tables['alignment_table']
+            self.fertility_table = probability_tables['fertility_table']
+            self.p1 = probability_tables['p1']
+            self.head_distortion_table = probability_tables[
+                'head_distortion_table']
+            self.non_head_distortion_table = probability_tables[
+                'non_head_distortion_table']
+
+        for n in range(0, iterations):
+            self.train(sentence_aligned_corpus)
+
+    def reset_probabilities(self):
+        super(IBMModel4, self).reset_probabilities()
+        self.head_distortion_table = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)))
+        """
+        dict[int][int][int]: float. Probability(displacement of head
+        word | word class of previous cept,target word class).
+        Values accessed as ``head_distortion_table[dj][src_class][trg_class]``.
+        """
+
+        self.non_head_distortion_table = defaultdict(
+            lambda: defaultdict(lambda: self.MIN_PROB))
+        """
+        dict[int][int]: float. Probability(displacement of non-head
+        word | target word class).
+        Values accessed as ``non_head_distortion_table[dj][trg_class]``.
+        """
+
+    def set_uniform_probabilities(self, sentence_aligned_corpus):
+        """
+        Set distortion probabilities uniformly to
+        1 / cardinality of displacement values
+        """
+        max_m = longest_target_sentence_length(sentence_aligned_corpus)
+
+        # The maximum displacement is m-1, when a word is in the last
+        # position m of the target sentence and the previously placed
+        # word is in the first position.
+        # Conversely, the minimum displacement is -(m-1).
+        # Thus, the displacement range is (m-1) - (-(m-1)). Note that
+        # displacement cannot be zero and is not included in the range.
+        if max_m <= 1:
+            initial_prob = IBMModel.MIN_PROB
+        else:
+            initial_prob = 1 / (2 * (max_m - 1))
+        if initial_prob < IBMModel.MIN_PROB:
+            warnings.warn("A target sentence is too long (" + str(max_m) +
+                          " words). Results may be less accurate.")
+
+        for dj in range(1, max_m):
+            self.head_distortion_table[dj] = defaultdict(
+                lambda: defaultdict(lambda: initial_prob))
+            self.head_distortion_table[-dj] = defaultdict(
+                lambda: defaultdict(lambda: initial_prob))
+            self.non_head_distortion_table[dj] = defaultdict(
+                lambda: initial_prob)
+            self.non_head_distortion_table[-dj] = defaultdict(
+                lambda: initial_prob)
+
+    def train(self, parallel_corpus):
+        counts = Model4Counts()
+        for aligned_sentence in parallel_corpus:
+            m = len(aligned_sentence.words)
+
+            # Sample the alignment space
+            sampled_alignments, best_alignment = self.sample(aligned_sentence)
+            # Record the most probable alignment
+            aligned_sentence.alignment = Alignment(
+                best_alignment.zero_indexed_alignment())
+
+            # E step (a): Compute normalization factors to weigh counts
+            total_count = self.prob_of_alignments(sampled_alignments)
+
+            # E step (b): Collect counts
+            for alignment_info in sampled_alignments:
+                count = self.prob_t_a_given_s(alignment_info)
+                normalized_count = count / total_count
+
+                for j in range(1, m + 1):
+                    counts.update_lexical_translation(
+                        normalized_count, alignment_info, j)
+                    counts.update_distortion(
+                        normalized_count, alignment_info, j,
+                        self.src_classes, self.trg_classes)
+
+                counts.update_null_generation(normalized_count, alignment_info)
+                counts.update_fertility(normalized_count, alignment_info)
+
+        # M step: Update probabilities with maximum likelihood estimates
+        # If any probability is less than MIN_PROB, clamp it to MIN_PROB
+        existing_alignment_table = self.alignment_table
+        self.reset_probabilities()
+        self.alignment_table = existing_alignment_table  # don't retrain
+
+        self.maximize_lexical_translation_probabilities(counts)
+        self.maximize_distortion_probabilities(counts)
+        self.maximize_fertility_probabilities(counts)
+        self.maximize_null_generation_probabilities(counts)
+
+    def maximize_distortion_probabilities(self, counts):
+        head_d_table = self.head_distortion_table
+        for dj, src_classes in counts.head_distortion.items():
+            for s_cls, trg_classes in src_classes.items():
+                for t_cls in trg_classes:
+                    estimate = (counts.head_distortion[dj][s_cls][t_cls] /
+                                counts.head_distortion_for_any_dj[s_cls][t_cls])
+                    head_d_table[dj][s_cls][t_cls] = max(estimate,
+                                                         IBMModel.MIN_PROB)
+
+        non_head_d_table = self.non_head_distortion_table
+        for dj, trg_classes in counts.non_head_distortion.items():
+            for t_cls in trg_classes:
+                estimate = (counts.non_head_distortion[dj][t_cls] /
+                            counts.non_head_distortion_for_any_dj[t_cls])
+                non_head_d_table[dj][t_cls] = max(estimate, IBMModel.MIN_PROB)
+
+    def prob_t_a_given_s(self, alignment_info):
+        """
+        Probability of target sentence and an alignment given the
+        source sentence
+        """
+        return IBMModel4.model4_prob_t_a_given_s(alignment_info, self)
+
+    @staticmethod  # exposed for Model 5 to use
+    def model4_prob_t_a_given_s(alignment_info, ibm_model):
+        probability = 1.0
+        MIN_PROB = IBMModel.MIN_PROB
+
+        def null_generation_term():
+            # Binomial distribution: B(m - null_fertility, p1)
+            value = 1.0
+            p1 = ibm_model.p1
+            p0 = 1 - p1
+            null_fertility = alignment_info.fertility_of_i(0)
+            m = len(alignment_info.trg_sentence) - 1
+            value *= (pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility))
+            if value < MIN_PROB:
+                return MIN_PROB
+
+            # Combination: (m - null_fertility) choose null_fertility
+            for i in range(1, null_fertility + 1):
+                value *= (m - null_fertility - i + 1) / i
+            return value
+
+        def fertility_term():
+            value = 1.0
+            src_sentence = alignment_info.src_sentence
+            for i in range(1, len(src_sentence)):
+                fertility = alignment_info.fertility_of_i(i)
+                value *= (factorial(fertility) *
+                          ibm_model.fertility_table[fertility][src_sentence[i]])
+                if value < MIN_PROB:
+                    return MIN_PROB
+            return value
+
+        def lexical_translation_term(j):
+            t = alignment_info.trg_sentence[j]
+            i = alignment_info.alignment[j]
+            s = alignment_info.src_sentence[i]
+            return ibm_model.translation_table[t][s]
+
+        def distortion_term(j):
+            t = alignment_info.trg_sentence[j]
+            i = alignment_info.alignment[j]
+            if i == 0:
+                # case 1: t is aligned to NULL
+                return 1.0
+            if alignment_info.is_head_word(j):
+                # case 2: t is the first word of a tablet
+                previous_cept = alignment_info.previous_cept(j)
+                src_class = None
+                if previous_cept is not None:
+                    previous_s = alignment_info.src_sentence[previous_cept]
+                    src_class = ibm_model.src_classes[previous_s]
+                trg_class = ibm_model.trg_classes[t]
+                dj = j - alignment_info.center_of_cept(previous_cept)
+                return ibm_model.head_distortion_table[dj][src_class][trg_class]
+
+            # case 3: t is a subsequent word of a tablet
+            previous_position = alignment_info.previous_in_tablet(j)
+            trg_class = ibm_model.trg_classes[t]
+            dj = j - previous_position
+            return ibm_model.non_head_distortion_table[dj][trg_class]
+        # end nested functions
+
+        # Abort computation whenever probability falls below MIN_PROB at
+        # any point, since MIN_PROB can be considered as zero
+        probability *= null_generation_term()
+        if probability < MIN_PROB:
+            return MIN_PROB
+
+        probability *= fertility_term()
+        if probability < MIN_PROB:
+            return MIN_PROB
+
+        for j in range(1, len(alignment_info.trg_sentence)):
+            probability *= lexical_translation_term(j)
+            if probability < MIN_PROB:
+                return MIN_PROB
+
+            probability *= distortion_term(j)
+            if probability < MIN_PROB:
+                return MIN_PROB
+
+        return probability
+
+
+class Model4Counts(Counts):
+    """
+    Data object to store counts of various parameters during training.
+    Includes counts for distortion.
+    """
+    def __init__(self):
+        super(Model4Counts, self).__init__()
+        self.head_distortion = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+        self.head_distortion_for_any_dj = defaultdict(
+            lambda: defaultdict(lambda: 0.0))
+        self.non_head_distortion = defaultdict(
+            lambda: defaultdict(lambda: 0.0))
+        self.non_head_distortion_for_any_dj = defaultdict(lambda: 0.0)
+
+    def update_distortion(self, count, alignment_info, j,
+                          src_classes, trg_classes):
+        i = alignment_info.alignment[j]
+        t = alignment_info.trg_sentence[j]
+        if i == 0:
+            # case 1: t is aligned to NULL
+            pass
+        elif alignment_info.is_head_word(j):
+            # case 2: t is the first word of a tablet
+            previous_cept = alignment_info.previous_cept(j)
+            if previous_cept is not None:
+                previous_src_word = alignment_info.src_sentence[previous_cept]
+                src_class = src_classes[previous_src_word]
+            else:
+                src_class = None
+            trg_class = trg_classes[t]
+            dj = j - alignment_info.center_of_cept(previous_cept)
+            self.head_distortion[dj][src_class][trg_class] += count
+            self.head_distortion_for_any_dj[src_class][trg_class] += count
+        else:
+            # case 3: t is a subsequent word of a tablet
+            previous_j = alignment_info.previous_in_tablet(j)
+            trg_class = trg_classes[t]
+            dj = j - previous_j
+            self.non_head_distortion[dj][trg_class] += count
+            self.non_head_distortion_for_any_dj[trg_class] += count
diff --git a/nlp_resource_data/nltk/translate/ibm4.pyc b/nlp_resource_data/nltk/translate/ibm4.pyc
new file mode 100755 (executable)
index 0000000..144ecf1
Binary files /dev/null and b/nlp_resource_data/nltk/translate/ibm4.pyc differ
diff --git a/nlp_resource_data/nltk/translate/ibm5.py b/nlp_resource_data/nltk/translate/ibm5.py
new file mode 100755 (executable)
index 0000000..df34afc
--- /dev/null
@@ -0,0 +1,639 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: IBM Model 5
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Tah Wei Hoon <hoon.tw@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Translation model that keeps track of vacant positions in the target
+sentence to decide where to place translated words.
+
+Translation can be viewed as a process where each word in the source
+sentence is stepped through sequentially, generating translated words
+for each source word. The target sentence can be viewed as being made
+up of ``m`` empty slots initially, which gradually fill up as generated
+words are placed in them.
+
+Models 3 and 4 use distortion probabilities to decide how to place
+translated words. For simplicity, these models ignore the history of
+which slots have already been occupied with translated words.
+Consider the placement of the last translated word: there is only one
+empty slot left in the target sentence, so the distortion probability
+should be 1.0 for that position and 0.0 everywhere else. However, the
+distortion probabilities for Models 3 and 4 are set up such that all
+positions are under consideration.
+
+IBM Model 5 fixes this deficiency by accounting for occupied slots
+during translation. It introduces the vacancy function v(j), the number
+of vacancies up to, and including, position j in the target sentence.
+
+Terminology:
+Maximum vacancy:
+    The number of valid slots that a word can be placed in.
+    This is not necessarily the same as the number of vacant slots.
+    For example, if a tablet contains more than one word, the head word
+    cannot be placed at the last vacant slot because there will be no
+    space for the other words in the tablet. The number of valid slots
+    has to take into account the length of the tablet.
+    Non-head words cannot be placed before the head word, so vacancies
+    to the left of the head word are ignored.
+Vacancy difference:
+    For a head word: (v(j) - v(center of previous cept))
+    Can be positive or negative.
+    For a non-head word: (v(j) - v(position of previously placed word))
+    Always positive, because successive words in a tablet are assumed to
+    appear to the right of the previous word.
+
+Positioning of target words falls under three cases:
+(1) Words generated by NULL are distributed uniformly
+(2) For a head word t, its position is modeled by the probability
+    v_head(dv | max_v,word_class_t(t))
+(3) For a non-head word t, its position is modeled by the probability
+    v_non_head(dv | max_v,word_class_t(t))
+dv and max_v are defined differently for head and non-head words.
+
+The EM algorithm used in Model 5 is:
+E step - In the training data, collect counts, weighted by prior
+         probabilities.
+         (a) count how many times a source language word is translated
+             into a target language word
+         (b) for a particular word class and maximum vacancy, count how
+             many times a head word and the previous cept's center have
+             a particular difference in number of vacancies
+         (c) for a particular word class and maximum vacancy, count how
+             many times a non-head word and the previous target word
+             have a particular difference in number of vacancies
+         (d) count how many times a source word is aligned to phi number
+             of target words
+         (e) count how many times NULL is aligned to a target word
+
+M step - Estimate new probabilities based on the counts from the E step
+
+Like Model 4, there are too many possible alignments to consider. Thus,
+a hill climbing approach is used to sample good candidates. In addition,
+pruning is used to weed out unlikely alignments based on Model 4 scores.
+
+
+Notations:
+i: Position in the source sentence
+    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
+j: Position in the target sentence
+    Valid values are 1, 2, ..., length of target sentence
+l: Number of words in the source sentence, excluding NULL
+m: Number of words in the target sentence
+s: A word in the source language
+t: A word in the target language
+phi: Fertility, the number of target words produced by a source word
+p1: Probability that a target word produced by a source word is
+    accompanied by another target word that is aligned to NULL
+p0: 1 - p1
+max_v: Maximum vacancy
+dv: Vacancy difference, Δv
+
+The definition of v_head here differs from GIZA++, section 4.7 of
+[Brown et al., 1993], and [Koehn, 2010]. In the latter cases, v_head is
+v_head(v(j) | v(center of previous cept),max_v,word_class(t)).
+
+Here, we follow appendix B of [Brown et al., 1993] and combine v(j) with
+v(center of previous cept) to obtain dv:
+v_head(v(j) - v(center of previous cept) | max_v,word_class(t)).
+
+
+References:
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
+
+Peter F. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
+Robert L. Mercer. 1993. The Mathematics of Statistical Machine
+Translation: Parameter Estimation. Computational Linguistics, 19 (2),
+263-311.
+"""
+
+from __future__ import division
+from collections import defaultdict
+from math import factorial
+from nltk.translate import AlignedSent
+from nltk.translate import Alignment
+from nltk.translate import IBMModel
+from nltk.translate import IBMModel4
+from nltk.translate.ibm_model import Counts
+from nltk.translate.ibm_model import longest_target_sentence_length
+import warnings
+
+
+class IBMModel5(IBMModel):
+    """
+    Translation model that keeps track of vacant positions in the target
+    sentence to decide where to place translated words
+
+    >>> bitext = []
+    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
+    >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
+    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
+    >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
+    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
+    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
+    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
+    >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
+    >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
+    >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 }
+    >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 }
+
+    >>> ibm5 = IBMModel5(bitext, 5, src_classes, trg_classes)
+
+    >>> print(round(ibm5.head_vacancy_table[1][1][1], 3))
+    1.0
+    >>> print(round(ibm5.head_vacancy_table[2][1][1], 3))
+    0.0
+    >>> print(round(ibm5.non_head_vacancy_table[3][3][6], 3))
+    1.0
+
+    >>> print(round(ibm5.fertility_table[2]['summarize'], 3))
+    1.0
+    >>> print(round(ibm5.fertility_table[1]['book'], 3))
+    1.0
+
+    >>> print(ibm5.p1)
+    0.033...
+
+    >>> test_sentence = bitext[2]
+    >>> test_sentence.words
+    ['das', 'buch', 'ist', 'ja', 'klein']
+    >>> test_sentence.mots
+    ['the', 'book', 'is', 'small']
+    >>> test_sentence.alignment
+    Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])
+
+    """
+    MIN_SCORE_FACTOR = 0.2
+    """
+    Alignments with scores below this factor are pruned during sampling
+    """
+
+    def __init__(self, sentence_aligned_corpus, iterations,
+                 source_word_classes, target_word_classes,
+                 probability_tables=None):
+        """
+        Train on ``sentence_aligned_corpus`` and create a lexical
+        translation model, vacancy models, a fertility model, and a
+        model for generating NULL-aligned words.
+
+        Translation direction is from ``AlignedSent.mots`` to
+        ``AlignedSent.words``.
+
+        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
+        :type sentence_aligned_corpus: list(AlignedSent)
+
+        :param iterations: Number of iterations to run training algorithm
+        :type iterations: int
+
+        :param source_word_classes: Lookup table that maps a source word
+            to its word class, the latter represented by an integer id
+        :type source_word_classes: dict[str]: int
+
+        :param target_word_classes: Lookup table that maps a target word
+            to its word class, the latter represented by an integer id
+        :type target_word_classes: dict[str]: int
+
+        :param probability_tables: Optional. Use this to pass in custom
+            probability values. If not specified, probabilities will be
+            set to a uniform distribution, or some other sensible value.
+            If specified, all the following entries must be present:
+            ``translation_table``, ``alignment_table``,
+            ``fertility_table``, ``p1``, ``head_distortion_table``,
+            ``non_head_distortion_table``, ``head_vacancy_table``,
+            ``non_head_vacancy_table``. See ``IBMModel``, ``IBMModel4``,
+            and ``IBMModel5`` for the type and purpose of these tables.
+        :type probability_tables: dict[str]: object
+        """
+        super(IBMModel5, self).__init__(sentence_aligned_corpus)
+        self.reset_probabilities()
+        self.src_classes = source_word_classes
+        self.trg_classes = target_word_classes
+
+        if probability_tables is None:
+            # Get probabilities from IBM model 4
+            ibm4 = IBMModel4(sentence_aligned_corpus, iterations,
+                             source_word_classes, target_word_classes)
+            self.translation_table = ibm4.translation_table
+            self.alignment_table = ibm4.alignment_table
+            self.fertility_table = ibm4.fertility_table
+            self.p1 = ibm4.p1
+            self.head_distortion_table = ibm4.head_distortion_table
+            self.non_head_distortion_table = ibm4.non_head_distortion_table
+            self.set_uniform_probabilities(sentence_aligned_corpus)
+        else:
+            # Set user-defined probabilities
+            self.translation_table = probability_tables['translation_table']
+            self.alignment_table = probability_tables['alignment_table']
+            self.fertility_table = probability_tables['fertility_table']
+            self.p1 = probability_tables['p1']
+            self.head_distortion_table = probability_tables[
+                'head_distortion_table']
+            self.non_head_distortion_table = probability_tables[
+                'non_head_distortion_table']
+            self.head_vacancy_table = probability_tables[
+                'head_vacancy_table']
+            self.non_head_vacancy_table = probability_tables[
+                'non_head_vacancy_table']
+
+        for n in range(0, iterations):
+            self.train(sentence_aligned_corpus)
+
+    def reset_probabilities(self):
+        super(IBMModel5, self).reset_probabilities()
+        self.head_vacancy_table = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)))
+        """
+        dict[int][int][int]: float. Probability(vacancy difference |
+        number of remaining valid positions,target word class).
+        Values accessed as ``head_vacancy_table[dv][v_max][trg_class]``.
+        """
+
+        self.non_head_vacancy_table = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)))
+        """
+        dict[int][int][int]: float. Probability(vacancy difference |
+        number of remaining valid positions,target word class).
+        Values accessed as ``non_head_vacancy_table[dv][v_max][trg_class]``.
+        """
+
+    def set_uniform_probabilities(self, sentence_aligned_corpus):
+        """
+        Set vacancy probabilities uniformly to
+        1 / cardinality of vacancy difference values
+        """
+        max_m = longest_target_sentence_length(sentence_aligned_corpus)
+
+        # The maximum vacancy difference occurs when a word is placed in
+        # the last available position m of the target sentence and the
+        # previous word position has no vacancies.
+        # The minimum is 1-max_v, when a word is placed in the first
+        # available position and the previous word is placed beyond the
+        # last available position.
+        # Thus, the number of possible vacancy difference values is
+        # (max_v) - (1-max_v) + 1 = 2 * max_v.
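+        # For example, for max_v = 3, dv ranges over 1 - 3 = -2 up to 3,
+        # i.e. 2 * 3 = 6 possible values, each initialized to 1 / 6.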
+        if max_m > 0 and (1 / (2 * max_m)) < IBMModel.MIN_PROB:
+            warnings.warn("A target sentence is too long (" + str(max_m) +
+                          " words). Results may be less accurate.")
+
+        for max_v in range(1, max_m + 1):
+            for dv in range(1, max_m + 1):
+                initial_prob = 1 / (2 * max_v)
+                self.head_vacancy_table[dv][max_v] = defaultdict(
+                    lambda: initial_prob)
+                self.head_vacancy_table[-(dv-1)][max_v] = defaultdict(
+                    lambda: initial_prob)
+                self.non_head_vacancy_table[dv][max_v] = defaultdict(
+                    lambda: initial_prob)
+                self.non_head_vacancy_table[-(dv-1)][max_v] = defaultdict(
+                    lambda: initial_prob)
+
+    def train(self, parallel_corpus):
+        counts = Model5Counts()
+        for aligned_sentence in parallel_corpus:
+            l = len(aligned_sentence.mots)
+            m = len(aligned_sentence.words)
+
+            # Sample the alignment space
+            sampled_alignments, best_alignment = self.sample(aligned_sentence)
+            # Record the most probable alignment
+            aligned_sentence.alignment = Alignment(
+                best_alignment.zero_indexed_alignment())
+
+            # E step (a): Compute normalization factors to weigh counts
+            total_count = self.prob_of_alignments(sampled_alignments)
+
+            # E step (b): Collect counts
+            for alignment_info in sampled_alignments:
+                count = self.prob_t_a_given_s(alignment_info)
+                normalized_count = count / total_count
+
+                for j in range(1, m + 1):
+                    counts.update_lexical_translation(
+                        normalized_count, alignment_info, j)
+
+                slots = Slots(m)
+                for i in range(1, l + 1):
+                    counts.update_vacancy(
+                        normalized_count, alignment_info, i,
+                        self.trg_classes, slots)
+
+                counts.update_null_generation(normalized_count, alignment_info)
+                counts.update_fertility(normalized_count, alignment_info)
+
+        # M step: Update probabilities with maximum likelihood estimates
+        # If any probability is less than MIN_PROB, clamp it to MIN_PROB
+        existing_alignment_table = self.alignment_table
+        self.reset_probabilities()
+        self.alignment_table = existing_alignment_table  # don't retrain
+
+        self.maximize_lexical_translation_probabilities(counts)
+        self.maximize_vacancy_probabilities(counts)
+        self.maximize_fertility_probabilities(counts)
+        self.maximize_null_generation_probabilities(counts)
+
+    def sample(self, sentence_pair):
+        """
+        Sample the most probable alignments from the entire alignment
+        space according to Model 4
+
+        Note that Model 4 scoring is used instead of Model 5 because the
+        latter is too expensive to compute.
+
+        First, determine the best alignment according to IBM Model 2.
+        With this initial alignment, use hill climbing to determine the
+        best alignment according to IBM Model 4. Add this
+        alignment and its neighbors to the sample set. Repeat this
+        process with other initial alignments obtained by pegging an
+        alignment point. Finally, prune alignments that have
+        substantially lower Model 4 scores than the best alignment.
+
+        :param sentence_pair: Source and target language sentence pair
+            to generate a sample of alignments from
+        :type sentence_pair: AlignedSent
+
+        :return: A set of best alignments represented by their ``AlignmentInfo``
+            and the best alignment of the set for convenience
+        :rtype: set(AlignmentInfo), AlignmentInfo
+        """
+        sampled_alignments, best_alignment = super(
+            IBMModel5, self).sample(sentence_pair)
+        return self.prune(sampled_alignments), best_alignment
+
+    def prune(self, alignment_infos):
+        """
+        Removes alignments from ``alignment_infos`` that have
+        substantially lower Model 4 scores than the best alignment
+
+        :return: Pruned alignments
+        :rtype: set(AlignmentInfo)
+        """
+        alignments = []
+        best_score = 0
+
+        for alignment_info in alignment_infos:
+            score = IBMModel4.model4_prob_t_a_given_s(alignment_info, self)
+            best_score = max(score, best_score)
+            alignments.append((alignment_info, score))
+
+        threshold = IBMModel5.MIN_SCORE_FACTOR * best_score
+        alignments = [a[0] for a in alignments if a[1] > threshold]
+        return set(alignments)
+
+    def hillclimb(self, alignment_info, j_pegged=None):
+        """
+        Starting from the alignment in ``alignment_info``, look at
+        neighboring alignments iteratively for the best one, according
+        to Model 4
+
+        Note that Model 4 scoring is used instead of Model 5 because the
+        latter is too expensive to compute.
+
+        There is no guarantee that the best alignment in the alignment
+        space will be found, because the algorithm might be stuck in a
+        local maximum.
+
+        :param j_pegged: If specified, the search will be constrained to
+            alignments where ``j_pegged`` remains unchanged
+        :type j_pegged: int
+
+        :return: The best alignment found from hill climbing
+        :rtype: AlignmentInfo
+        """
+        alignment = alignment_info  # alias with shorter name
+        max_probability = IBMModel4.model4_prob_t_a_given_s(alignment, self)
+
+        while True:
+            old_alignment = alignment
+            for neighbor_alignment in self.neighboring(alignment, j_pegged):
+                neighbor_probability = IBMModel4.model4_prob_t_a_given_s(
+                    neighbor_alignment, self)
+
+                if neighbor_probability > max_probability:
+                    alignment = neighbor_alignment
+                    max_probability = neighbor_probability
+
+            if alignment == old_alignment:
+                # Until there are no better alignments
+                break
+
+        alignment.score = max_probability
+        return alignment
+
+    def prob_t_a_given_s(self, alignment_info):
+        """
+        Probability of target sentence and an alignment given the
+        source sentence
+        """
+        probability = 1.0
+        MIN_PROB = IBMModel.MIN_PROB
+        slots = Slots(len(alignment_info.trg_sentence) - 1)
+
+        def null_generation_term():
+            # Binomial distribution: B(m - null_fertility, p1)
+            value = 1.0
+            p1 = self.p1
+            p0 = 1 - p1
+            null_fertility = alignment_info.fertility_of_i(0)
+            m = len(alignment_info.trg_sentence) - 1
+            value *= (pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility))
+            if value < MIN_PROB:
+                return MIN_PROB
+
+            # Combination: (m - null_fertility) choose null_fertility
+            for i in range(1, null_fertility + 1):
+                value *= (m - null_fertility - i + 1) / i
+            return value
+
+        def fertility_term():
+            value = 1.0
+            src_sentence = alignment_info.src_sentence
+            for i in range(1, len(src_sentence)):
+                fertility = alignment_info.fertility_of_i(i)
+                value *= (factorial(fertility) *
+                          self.fertility_table[fertility][src_sentence[i]])
+                if value < MIN_PROB:
+                    return MIN_PROB
+            return value
+
+        def lexical_translation_term(j):
+            t = alignment_info.trg_sentence[j]
+            i = alignment_info.alignment[j]
+            s = alignment_info.src_sentence[i]
+            return self.translation_table[t][s]
+
+        def vacancy_term(i):
+            value = 1.0
+            tablet = alignment_info.cepts[i]
+            tablet_length = len(tablet)
+            total_vacancies = slots.vacancies_at(len(slots))
+
+            # case 1: NULL-aligned words
+            if tablet_length == 0:
+                return value
+
+            # case 2: head word
+            j = tablet[0]
+            previous_cept = alignment_info.previous_cept(j)
+            previous_center = alignment_info.center_of_cept(previous_cept)
+            dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center)
+            max_v = total_vacancies - tablet_length + 1
+            trg_class = self.trg_classes[alignment_info.trg_sentence[j]]
+            value *= self.head_vacancy_table[dv][max_v][trg_class]
+            slots.occupy(j)  # mark position as occupied
+            total_vacancies -= 1
+            if value < MIN_PROB:
+                return MIN_PROB
+
+            # case 3: non-head words
+            for k in range(1, tablet_length):
+                previous_position = tablet[k - 1]
+                previous_vacancies = slots.vacancies_at(previous_position)
+                j = tablet[k]
+                dv = slots.vacancies_at(j) - previous_vacancies
+                max_v = (total_vacancies - tablet_length + k + 1 -
+                         previous_vacancies)
+                trg_class = self.trg_classes[alignment_info.trg_sentence[j]]
+                value *= self.non_head_vacancy_table[dv][max_v][trg_class]
+                slots.occupy(j)  # mark position as occupied
+                total_vacancies -= 1
+                if value < MIN_PROB:
+                    return MIN_PROB
+
+            return value
+        # end nested functions
+
+        # Abort computation whenever probability falls below MIN_PROB at
+        # any point, since MIN_PROB can be considered as zero
+        probability *= null_generation_term()
+        if probability < MIN_PROB:
+            return MIN_PROB
+
+        probability *= fertility_term()
+        if probability < MIN_PROB:
+            return MIN_PROB
+
+        for j in range(1, len(alignment_info.trg_sentence)):
+            probability *= lexical_translation_term(j)
+            if probability < MIN_PROB:
+                return MIN_PROB
+
+        for i in range(1, len(alignment_info.src_sentence)):
+            probability *= vacancy_term(i)
+            if probability < MIN_PROB:
+                return MIN_PROB
+
+        return probability
+
+    def maximize_vacancy_probabilities(self, counts):
+        MIN_PROB = IBMModel.MIN_PROB
+        head_vacancy_table = self.head_vacancy_table
+        for dv, max_vs in counts.head_vacancy.items():
+            for max_v, trg_classes in max_vs.items():
+                for t_cls in trg_classes:
+                    estimate = (counts.head_vacancy[dv][max_v][t_cls] /
+                                counts.head_vacancy_for_any_dv[max_v][t_cls])
+                    head_vacancy_table[dv][max_v][t_cls] = max(estimate,
+                                                               MIN_PROB)
+
+        non_head_vacancy_table = self.non_head_vacancy_table
+        for dv, max_vs in counts.non_head_vacancy.items():
+            for max_v, trg_classes in max_vs.items():
+                for t_cls in trg_classes:
+                    estimate = (
+                        counts.non_head_vacancy[dv][max_v][t_cls] /
+                        counts.non_head_vacancy_for_any_dv[max_v][t_cls])
+                    non_head_vacancy_table[dv][max_v][t_cls] = max(estimate,
+                                                                   MIN_PROB)
+
+
+class Model5Counts(Counts):
+    """
+    Data object to store counts of various parameters during training.
+    Includes counts for vacancies.
+    """
+    def __init__(self):
+        super(Model5Counts, self).__init__()
+        self.head_vacancy = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+        self.head_vacancy_for_any_dv = defaultdict(
+            lambda: defaultdict(lambda: 0.0))
+        self.non_head_vacancy = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+        self.non_head_vacancy_for_any_dv = defaultdict(
+            lambda: defaultdict(lambda: 0.0))
+
+    def update_vacancy(self, count, alignment_info, i, trg_classes, slots):
+        """
+        :param count: Value to add to the vacancy counts
+        :param alignment_info: Alignment under consideration
+        :param i: Source word position under consideration
+        :param trg_classes: Target word classes
+        :param slots: Vacancy states of the slots in the target sentence.
+            Output parameter that will be modified as new words are placed
+            in the target sentence.
+        """
+        tablet = alignment_info.cepts[i]
+        tablet_length = len(tablet)
+        total_vacancies = slots.vacancies_at(len(slots))
+
+        # case 1: NULL aligned words
+        if tablet_length == 0:
+            return  # ignore zero fertility words
+
+        # case 2: head word
+        j = tablet[0]
+        previous_cept = alignment_info.previous_cept(j)
+        previous_center = alignment_info.center_of_cept(previous_cept)
+        dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center)
+        max_v = total_vacancies - tablet_length + 1
+        trg_class = trg_classes[alignment_info.trg_sentence[j]]
+        self.head_vacancy[dv][max_v][trg_class] += count
+        self.head_vacancy_for_any_dv[max_v][trg_class] += count
+        slots.occupy(j)  # mark position as occupied
+        total_vacancies -= 1
+
+        # case 3: non-head words
+        for k in range(1, tablet_length):
+            previous_position = tablet[k - 1]
+            previous_vacancies = slots.vacancies_at(previous_position)
+            j = tablet[k]
+            dv = slots.vacancies_at(j) - previous_vacancies
+            max_v = (total_vacancies - tablet_length + k + 1 -
+                     previous_vacancies)
+            trg_class = trg_classes[alignment_info.trg_sentence[j]]
+            self.non_head_vacancy[dv][max_v][trg_class] += count
+            self.non_head_vacancy_for_any_dv[max_v][trg_class] += count
+            slots.occupy(j)  # mark position as occupied
+            total_vacancies -= 1
+
+
+class Slots(object):
+    """
+    Represents positions in a target sentence. Used to keep track of
+    which slot (position) is occupied.
+    """
+    def __init__(self, target_sentence_length):
+        self._slots = [False] * (target_sentence_length + 1)  # 1-indexed
+
+    def occupy(self, position):
+        """
+        Mark the slot at ``position`` as occupied
+        """
+        self._slots[position] = True
+
+    def vacancies_at(self, position):
+        """
+        :return: Number of vacant slots up to, and including, ``position``
+        """
+        vacancies = 0
+        for k in range(1, position + 1):
+            if not self._slots[k]:
+                vacancies += 1
+        return vacancies
+
+    def __len__(self):
+        return len(self._slots) - 1  # exclude dummy zeroth element
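+
+
+# A minimal, illustrative usage sketch (the helper name and the toy data below
+# are hypothetical, not part of the NLTK API): word classes are plain dicts
+# mapping each word to an integer class id, as the lookups above
+# (e.g. ``self.trg_classes[word]``) expect.
+def _ibm5_usage_sketch():
+    from nltk.translate import AlignedSent
+    bitext = [
+        AlignedSent(['klein', 'ist', 'das', 'haus'],
+                    ['the', 'house', 'is', 'small']),
+        AlignedSent(['das', 'haus', 'ist', 'gross'],
+                    ['the', 'house', 'is', 'big']),
+        AlignedSent(['das', 'buch', 'ist', 'klein'],
+                    ['the', 'book', 'is', 'small']),
+    ]
+    src_classes = {'the': 0, 'house': 1, 'book': 1, 'is': 2, 'small': 3, 'big': 3}
+    trg_classes = {'das': 0, 'haus': 1, 'buch': 1, 'ist': 2, 'klein': 3, 'gross': 3}
+    ibm5 = IBMModel5(bitext, 5, src_classes, trg_classes)
+    # The trained lexical translation probabilities can then be inspected:
+    return ibm5.translation_table['haus']['house']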
diff --git a/nlp_resource_data/nltk/translate/ibm5.pyc b/nlp_resource_data/nltk/translate/ibm5.pyc
new file mode 100755 (executable)
index 0000000..b597bf4
Binary files /dev/null and b/nlp_resource_data/nltk/translate/ibm5.pyc differ
diff --git a/nlp_resource_data/nltk/translate/ibm_model.py b/nlp_resource_data/nltk/translate/ibm_model.py
new file mode 100755 (executable)
index 0000000..4dfe4e6
--- /dev/null
@@ -0,0 +1,536 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: IBM Model Core
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Tah Wei Hoon <hoon.tw@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Common methods and classes for all IBM models. See ``IBMModel1``,
+``IBMModel2``, ``IBMModel3``, ``IBMModel4``, and ``IBMModel5``
+for specific implementations.
+
+The IBM models are a series of generative models that learn lexical
+translation probabilities, p(target language word|source language word),
+given a sentence-aligned parallel corpus.
+
+The models increase in sophistication from model 1 to 5. Typically, the
+output of lower models is used to seed the higher models. All models
+use the Expectation-Maximization (EM) algorithm to learn various
+probability tables.
+
+Words in a sentence are one-indexed. The first word of a sentence has
+position 1, not 0. Index 0 is reserved in the source sentence for the
+NULL token. The concept of position does not apply to NULL, but it is
+indexed at 0 by convention.
+
+Each target word is aligned to exactly one source word or the NULL
+token.
+
+References:
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
+
+Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
+Robert L. Mercer. 1993. The Mathematics of Statistical Machine
+Translation: Parameter Estimation. Computational Linguistics, 19 (2),
+263-311.
+"""
+from __future__ import division
+from bisect import insort_left
+from collections import defaultdict
+from copy import deepcopy
+from math import ceil
+
+
+def longest_target_sentence_length(sentence_aligned_corpus):
+    """
+    :param sentence_aligned_corpus: Parallel corpus under consideration
+    :type sentence_aligned_corpus: list(AlignedSent)
+    :return: Number of words in the longest target language sentence
+        of ``sentence_aligned_corpus``
+    """
+    max_m = 0
+    for aligned_sentence in sentence_aligned_corpus:
+        m = len(aligned_sentence.words)
+        max_m = max(m, max_m)
+    return max_m
+
+
+class IBMModel(object):
+    """
+    Abstract base class for all IBM models
+    """
+    # Avoid division by zero and precision errors by imposing a minimum
+    # value for probabilities. Note that this approach is theoretically
+    # incorrect, since it may create probabilities that sum to more
+    # than 1. In practice, the contribution of probabilities with MIN_PROB
+    # is tiny enough that the value of MIN_PROB can be treated as zero.
+    MIN_PROB = 1.0e-12  # GIZA++ is more liberal and uses 1.0e-7
+
+    def __init__(self, sentence_aligned_corpus):
+        self.init_vocab(sentence_aligned_corpus)
+        self.reset_probabilities()
+
+    def reset_probabilities(self):
+        self.translation_table = defaultdict(
+            lambda: defaultdict(lambda: IBMModel.MIN_PROB))
+        """
+        dict[str][str]: float. Probability(target word | source word).
+        Values accessed as ``translation_table[target_word][source_word]``.
+        """
+
+        self.alignment_table = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+                lambda: IBMModel.MIN_PROB))))
+        """
+        dict[int][int][int][int]: float. Probability(i | j,l,m).
+        Values accessed as ``alignment_table[i][j][l][m]``.
+        Used in model 2 and hill climbing in models 3 and above
+        """
+
+        self.fertility_table = defaultdict(
+            lambda: defaultdict(lambda: self.MIN_PROB))
+        """
+        dict[int][str]: float. Probability(fertility | source word).
+        Values accessed as ``fertility_table[fertility][source_word]``.
+        Used in model 3 and higher.
+        """
+
+        self.p1 = 0.5
+        """
+        Probability that a generated word requires another target word
+        that is aligned to NULL.
+        Used in model 3 and higher.
+        """
+
+    def set_uniform_probabilities(self, sentence_aligned_corpus):
+        """
+        Initialize probability tables to a uniform distribution
+
+        Derived classes should implement this accordingly.
+        """
+        pass
+
+    def init_vocab(self, sentence_aligned_corpus):
+        src_vocab = set()
+        trg_vocab = set()
+        for aligned_sentence in sentence_aligned_corpus:
+            trg_vocab.update(aligned_sentence.words)
+            src_vocab.update(aligned_sentence.mots)
+        # Add the NULL token
+        src_vocab.add(None)
+
+        self.src_vocab = src_vocab
+        """
+        set(str): All source language words used in training
+        """
+
+        self.trg_vocab = trg_vocab
+        """
+        set(str): All target language words used in training
+        """
+
+    def sample(self, sentence_pair):
+        """
+        Sample the most probable alignments from the entire alignment
+        space
+
+        First, determine the best alignment according to IBM Model 2.
+        With this initial alignment, use hill climbing to determine the
+        best alignment according to a higher IBM Model. Add this
+        alignment and its neighbors to the sample set. Repeat this
+        process with other initial alignments obtained by pegging an
+        alignment point.
+
+        Hill climbing may get stuck in a local maximum, hence the pegging
+        and trying out of different initial alignments.
+
+        :param sentence_pair: Source and target language sentence pair
+            to generate a sample of alignments from
+        :type sentence_pair: AlignedSent
+
+        :return: A set of best alignments represented by their ``AlignmentInfo``
+            and the best alignment of the set for convenience
+        :rtype: set(AlignmentInfo), AlignmentInfo
+        """
+        sampled_alignments = set()
+        l = len(sentence_pair.mots)
+        m = len(sentence_pair.words)
+
+        # Start from the best model 2 alignment
+        initial_alignment = self.best_model2_alignment(sentence_pair)
+        potential_alignment = self.hillclimb(initial_alignment)
+        sampled_alignments.update(self.neighboring(potential_alignment))
+        best_alignment = potential_alignment
+
+        # Start from other model 2 alignments,
+        # with the constraint that j is aligned (pegged) to i
+        for j in range(1, m + 1):
+            for i in range(0, l + 1):
+                initial_alignment = self.best_model2_alignment(
+                    sentence_pair, j, i)
+                potential_alignment = self.hillclimb(initial_alignment, j)
+                neighbors = self.neighboring(potential_alignment, j)
+                sampled_alignments.update(neighbors)
+                if potential_alignment.score > best_alignment.score:
+                    best_alignment = potential_alignment
+
+        return sampled_alignments, best_alignment
+
+    def best_model2_alignment(self, sentence_pair, j_pegged=None, i_pegged=0):
+        """
+        Finds the best alignment according to IBM Model 2
+
+        Used as a starting point for hill climbing in Models 3 and
+        above, because it is easier to compute than the best alignments
+        in higher models
+
+        :param sentence_pair: Source and target language sentence pair
+            to be word-aligned
+        :type sentence_pair: AlignedSent
+
+        :param j_pegged: If specified, the alignment point of j_pegged
+            will be fixed to i_pegged
+        :type j_pegged: int
+
+        :param i_pegged: Alignment point to j_pegged
+        :type i_pegged: int
+        """
+        src_sentence = [None] + sentence_pair.mots
+        trg_sentence = ['UNUSED'] + sentence_pair.words  # 1-indexed
+
+        l = len(src_sentence) - 1  # exclude NULL
+        m = len(trg_sentence) - 1
+
+        alignment = [0] * (m + 1)  # init all alignments to NULL
+        cepts = [[] for i in range((l + 1))]  # init all cepts to empty list
+
+        for j in range(1, m + 1):
+            if j == j_pegged:
+                # use the pegged alignment instead of searching for best one
+                best_i = i_pegged
+            else:
+                best_i = 0
+                max_alignment_prob = IBMModel.MIN_PROB
+                t = trg_sentence[j]
+
+                for i in range(0, l + 1):
+                    s = src_sentence[i]
+                    alignment_prob = (self.translation_table[t][s] *
+                                      self.alignment_table[i][j][l][m])
+
+                    if alignment_prob >= max_alignment_prob:
+                        max_alignment_prob = alignment_prob
+                        best_i = i
+
+            alignment[j] = best_i
+            cepts[best_i].append(j)
+
+        return AlignmentInfo(tuple(alignment), tuple(src_sentence),
+                             tuple(trg_sentence), cepts)
+
+    def hillclimb(self, alignment_info, j_pegged=None):
+        """
+        Starting from the alignment in ``alignment_info``, look at
+        neighboring alignments iteratively for the best one
+
+        There is no guarantee that the best alignment in the alignment
+        space will be found, because the algorithm might be stuck in a
+        local maximum.
+
+        :param j_pegged: If specified, the search will be constrained to
+            alignments where ``j_pegged`` remains unchanged
+        :type j_pegged: int
+
+        :return: The best alignment found from hill climbing
+        :rtype: AlignmentInfo
+        """
+        alignment = alignment_info  # alias with shorter name
+        max_probability = self.prob_t_a_given_s(alignment)
+
+        while True:
+            old_alignment = alignment
+            for neighbor_alignment in self.neighboring(alignment, j_pegged):
+                neighbor_probability = self.prob_t_a_given_s(neighbor_alignment)
+
+                if neighbor_probability > max_probability:
+                    alignment = neighbor_alignment
+                    max_probability = neighbor_probability
+
+            if alignment == old_alignment:
+                # Until there are no better alignments
+                break
+
+        alignment.score = max_probability
+        return alignment
+
+    def neighboring(self, alignment_info, j_pegged=None):
+        """
+        Determine the neighbors of ``alignment_info``, obtained by
+        moving or swapping one alignment point
+
+        :param j_pegged: If specified, neighbors that have a different
+            alignment point from j_pegged will not be considered
+        :type j_pegged: int
+
+        :return: A set of neighboring alignments represented by their
+            ``AlignmentInfo``
+        :rtype: set(AlignmentInfo)
+        """
+        neighbors = set()
+
+        l = len(alignment_info.src_sentence) - 1  # exclude NULL
+        m = len(alignment_info.trg_sentence) - 1
+        original_alignment = alignment_info.alignment
+        original_cepts = alignment_info.cepts
+
+        for j in range(1, m + 1):
+            if j != j_pegged:
+                # Add alignments that differ by one alignment point
+                for i in range(0, l + 1):
+                    new_alignment = list(original_alignment)
+                    new_cepts = deepcopy(original_cepts)
+                    old_i = original_alignment[j]
+
+                    # update alignment
+                    new_alignment[j] = i
+
+                    # update cepts
+                    insort_left(new_cepts[i], j)
+                    new_cepts[old_i].remove(j)
+
+                    new_alignment_info = AlignmentInfo(
+                        tuple(new_alignment), alignment_info.src_sentence,
+                        alignment_info.trg_sentence, new_cepts)
+                    neighbors.add(new_alignment_info)
+
+        for j in range(1, m + 1):
+            if j != j_pegged:
+                # Add alignments that have two alignment points swapped
+                for other_j in range(1, m + 1):
+                    if other_j != j_pegged and other_j != j:
+                        new_alignment = list(original_alignment)
+                        new_cepts = deepcopy(original_cepts)
+                        other_i = original_alignment[other_j]
+                        i = original_alignment[j]
+
+                        # update alignments
+                        new_alignment[j] = other_i
+                        new_alignment[other_j] = i
+
+                        # update cepts
+                        new_cepts[other_i].remove(other_j)
+                        insort_left(new_cepts[other_i], j)
+                        new_cepts[i].remove(j)
+                        insort_left(new_cepts[i], other_j)
+
+                        new_alignment_info = AlignmentInfo(
+                            tuple(new_alignment), alignment_info.src_sentence,
+                            alignment_info.trg_sentence, new_cepts)
+                        neighbors.add(new_alignment_info)
+
+        return neighbors
+
+    def maximize_lexical_translation_probabilities(self, counts):
+        for t, src_words in counts.t_given_s.items():
+            for s in src_words:
+                estimate = counts.t_given_s[t][s] / counts.any_t_given_s[s]
+                self.translation_table[t][s] = max(estimate, IBMModel.MIN_PROB)
+
+    def maximize_fertility_probabilities(self, counts):
+        for phi, src_words in counts.fertility.items():
+            for s in src_words:
+                estimate = (counts.fertility[phi][s] /
+                            counts.fertility_for_any_phi[s])
+                self.fertility_table[phi][s] = max(estimate, IBMModel.MIN_PROB)
+
+    def maximize_null_generation_probabilities(self, counts):
+        p1_estimate = counts.p1 / (counts.p1 + counts.p0)
+        p1_estimate = max(p1_estimate, IBMModel.MIN_PROB)
+        # Clip p1 if it is too large, because p0 = 1 - p1 should not be
+        # smaller than MIN_PROB
+        self.p1 = min(p1_estimate, 1 - IBMModel.MIN_PROB)
+
+    def prob_of_alignments(self, alignments):
+        probability = 0
+        for alignment_info in alignments:
+            probability += self.prob_t_a_given_s(alignment_info)
+        return probability
+
+    def prob_t_a_given_s(self, alignment_info):
+        """
+        Probability of target sentence and an alignment given the
+        source sentence
+
+        All required information is assumed to be in ``alignment_info``
+        and self.
+
+        Derived classes should override this method
+        """
+        return 0.0
+
+
+class AlignmentInfo(object):
+    """
+    Helper data object for training IBM Models 3 and up
+
+    Read-only. For a source sentence and its counterpart in the target
+    language, this class holds information about the sentence pair's
+    alignment, cepts, and fertility.
+
+    Warning: Alignments are one-indexed here, in contrast to
+    nltk.translate.Alignment and AlignedSent, which are zero-indexed.
+    This class is not meant to be used outside of IBM models.
+    """
+
+    def __init__(self, alignment, src_sentence, trg_sentence, cepts):
+        if not isinstance(alignment, tuple):
+            raise TypeError("The alignment must be a tuple because it is used "
+                            "to uniquely identify AlignmentInfo objects.")
+
+        self.alignment = alignment
+        """
+        tuple(int): Alignment function. ``alignment[j]`` is the position
+        in the source sentence that is aligned to the position j in the
+        target sentence.
+        """
+
+        self.src_sentence = src_sentence
+        """
+        tuple(str): Source sentence referred to by this object.
+        Should include NULL token (None) in index 0.
+        """
+
+        self.trg_sentence = trg_sentence
+        """
+        tuple(str): Target sentence referred to by this object.
+        Should have a dummy element in index 0 so that the first word
+        starts from index 1.
+        """
+
+        self.cepts = cepts
+        """
+        list(list(int)): The positions of the target words, in
+        ascending order, aligned to a source word position. For example,
+        cepts[4] = (2, 3, 7) means that words in positions 2, 3 and 7
+        of the target sentence are aligned to the word in position 4 of
+        the source sentence
+        """
+
+        self.score = None
+        """
+        float: Optional. Probability of alignment, as defined by the
+        IBM model that assesses this alignment
+        """
+
+    def fertility_of_i(self, i):
+        """
+        Fertility of word in position ``i`` of the source sentence
+        """
+        return len(self.cepts[i])
+
+    def is_head_word(self, j):
+        """
+        :return: Whether the word in position ``j`` of the target
+            sentence is a head word
+        """
+        i = self.alignment[j]
+        return self.cepts[i][0] == j
+
+    def center_of_cept(self, i):
+        """
+        :return: The ceiling of the average of the positions of the words
+            in the tablet of cept ``i``, or 0 if ``i`` is None
+        """
+        if i is None:
+            return 0
+
+        average_position = sum(self.cepts[i]) / len(self.cepts[i])
+        return int(ceil(average_position))
+
+    def previous_cept(self, j):
+        """
+        :return: The previous cept of ``j``, or None if ``j`` belongs to
+            the first cept
+        """
+        i = self.alignment[j]
+        if i == 0:
+            raise ValueError("Words aligned to NULL cannot have a previous "
+                             "cept because NULL has no position")
+        previous_cept = i - 1
+        while previous_cept > 0 and self.fertility_of_i(previous_cept) == 0:
+            previous_cept -= 1
+
+        if previous_cept <= 0:
+            previous_cept = None
+        return previous_cept
+
+    def previous_in_tablet(self, j):
+        """
+        :return: The position of the previous word that is in the same
+            tablet as ``j``, or None if ``j`` is the first word of the
+            tablet
+        """
+        i = self.alignment[j]
+        tablet_position = self.cepts[i].index(j)
+        if tablet_position == 0:
+            return None
+        return self.cepts[i][tablet_position - 1]
+
+    def zero_indexed_alignment(self):
+        """
+        :return: Zero-indexed alignment, suitable for use in external
+            ``nltk.translate`` modules like ``nltk.translate.Alignment``
+        :rtype: list(tuple)
+        """
+        zero_indexed_alignment = []
+        for j in range(1, len(self.trg_sentence)):
+            i = self.alignment[j] - 1
+            if i < 0:
+                i = None  # alignment to NULL token
+            zero_indexed_alignment.append((j - 1, i))
+        return zero_indexed_alignment
+
+    def __eq__(self, other):
+        return self.alignment == other.alignment
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __hash__(self):
+        return hash(self.alignment)
+
+
+class Counts(object):
+    """
+    Data object to store counts of various parameters during training
+    """
+    def __init__(self):
+        self.t_given_s = defaultdict(lambda: defaultdict(lambda: 0.0))
+        self.any_t_given_s = defaultdict(lambda: 0.0)
+        self.p0 = 0.0
+        self.p1 = 0.0
+        self.fertility = defaultdict(lambda: defaultdict(lambda: 0.0))
+        self.fertility_for_any_phi = defaultdict(lambda: 0.0)
+
+    def update_lexical_translation(self, count, alignment_info, j):
+        i = alignment_info.alignment[j]
+        t = alignment_info.trg_sentence[j]
+        s = alignment_info.src_sentence[i]
+        self.t_given_s[t][s] += count
+        self.any_t_given_s[s] += count
+
+    def update_null_generation(self, count, alignment_info):
+        m = len(alignment_info.trg_sentence) - 1
+        fertility_of_null = alignment_info.fertility_of_i(0)
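+        # Of the (m - fertility_of_null) words generated from real source
+        # words, fertility_of_null of them are assumed to each spawn one
+        # NULL-aligned word (counted towards p1); the remaining
+        # (m - 2 * fertility_of_null) do not (counted towards p0).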
+        self.p1 += fertility_of_null * count
+        self.p0 += (m - 2 * fertility_of_null) * count
+
+    def update_fertility(self, count, alignment_info):
+        for i in range(0, len(alignment_info.src_sentence)):
+            s = alignment_info.src_sentence[i]
+            phi = alignment_info.fertility_of_i(i)
+            self.fertility[phi][s] += count
+            self.fertility_for_any_phi[s] += count
diff --git a/nlp_resource_data/nltk/translate/ibm_model.pyc b/nlp_resource_data/nltk/translate/ibm_model.pyc
new file mode 100755 (executable)
index 0000000..249238b
Binary files /dev/null and b/nlp_resource_data/nltk/translate/ibm_model.pyc differ
diff --git a/nlp_resource_data/nltk/translate/metrics.py b/nlp_resource_data/nltk/translate/metrics.py
new file mode 100755 (executable)
index 0000000..e9fef3e
--- /dev/null
@@ -0,0 +1,40 @@
+# Natural Language Toolkit: Translation metrics
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Will Zhang <wilzzha@gmail.com>
+#         Guan Gui <ggui@student.unimelb.edu.au>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import division
+
+def alignment_error_rate(reference, hypothesis, possible=None):
+    """
+    Return the Alignment Error Rate (AER) of an alignment
+    with respect to a "gold standard" reference alignment.
+    Return an error rate between 0.0 (perfect alignment) and 1.0 (no
+    alignment).
+
+        >>> from nltk.translate import Alignment
+        >>> ref = Alignment([(0, 0), (1, 1), (2, 2)])
+        >>> test = Alignment([(0, 0), (1, 2), (2, 1)])
+        >>> alignment_error_rate(ref, test) # doctest: +ELLIPSIS
+        0.6666666666666667
+
+    :type reference: Alignment
+    :param reference: A gold standard alignment (sure alignments)
+    :type hypothesis: Alignment
+    :param hypothesis: A hypothesis alignment (aka. candidate alignments)
+    :type possible: Alignment or None
+    :param possible: A gold standard reference of possible alignments
+        (defaults to *reference* if None)
+    :rtype: float or None
+    """
+
+    if possible is None:
+        possible = reference
+    else:
+        assert(reference.issubset(possible)) # sanity check
+
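+    # AER = 1 - (|A ∩ S| + |A ∩ P|) / (|A| + |S|),
+    # where A = hypothesis alignments, S = sure (reference) alignments,
+    # and P = possible alignments.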
+    return (1.0 - (len(hypothesis & reference) + len(hypothesis & possible)) /
+            float(len(hypothesis) + len(reference)))
diff --git a/nlp_resource_data/nltk/translate/metrics.pyc b/nlp_resource_data/nltk/translate/metrics.pyc
new file mode 100755 (executable)
index 0000000..efff81f
Binary files /dev/null and b/nlp_resource_data/nltk/translate/metrics.pyc differ
diff --git a/nlp_resource_data/nltk/translate/nist_score.py b/nlp_resource_data/nltk/translate/nist_score.py
new file mode 100755 (executable)
index 0000000..1bedf65
--- /dev/null
@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: NIST Score
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors:
+# Contributors:
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""NIST score implementation."""
+from __future__ import division
+
+import math
+import fractions
+from collections import Counter
+
+from nltk.util import ngrams
+from nltk.translate.bleu_score import modified_precision, closest_ref_length
+
+try:
+    fractions.Fraction(0, 1000, _normalize=False)
+    from fractions import Fraction
+except TypeError:
+    from nltk.compat import Fraction
+
+
+def sentence_nist(references, hypothesis, n=5):
+    """
+    Calculate NIST score from
+    George Doddington. 2002. "Automatic evaluation of machine translation quality
+    using n-gram co-occurrence statistics." Proceedings of HLT.
+    Morgan Kaufmann Publishers Inc. http://dl.acm.org/citation.cfm?id=1289189.1289273
+
+    DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU
+    score. The official script used by NIST to compute BLEU and NIST score is
+    mteval-14.pl. The main differences are:
+
+     - BLEU uses geometric mean of the ngram overlaps, NIST uses arithmetic mean.
+     - NIST has a different brevity penalty
+     - NIST score from mteval-14.pl has a self-contained tokenizer
+
+    Note: The mteval-14.pl includes a smoothing function for BLEU score that is NOT
+          used in the NIST score computation.
+
+    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+    ...               'ensures', 'that', 'the', 'military', 'always',
+    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']
+
+    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
+    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
+    ...               'that', 'party', 'direct']
+
+    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
+    ...               'heed', 'Party', 'commands']
+
+    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+    ...               'guarantees', 'the', 'military', 'forces', 'always',
+    ...               'being', 'under', 'the', 'command', 'of', 'the',
+    ...               'Party']
+
+    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
+    ...               'of', 'the', 'party']
+
+    >>> sentence_nist([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
+    0.0854...
+
+    >>> sentence_nist([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS
+    0.1485...
+
+    :param references: reference sentences
+    :type references: list(list(str))
+    :param hypothesis: a hypothesis sentence
+    :type hypothesis: list(str)
+    :param n: highest n-gram order
+    :type n: int
+    """
+    return corpus_nist([references], [hypothesis], n)
+
+def corpus_nist(list_of_references, hypotheses, n=5):
+    """
+    Calculate a single corpus-level NIST score (aka. system-level NIST) for all
+    the hypotheses and their respective references.
+
+    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
+    :type references: list(list(list(str)))
+    :param hypotheses: a list of hypothesis sentences
+    :type hypotheses: list(list(str))
+    :param n: highest n-gram order
+    :type n: int
+    """
+    # Before proceeding to compute NIST, perform sanity checks.
+    assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their reference(s) should be the same"
+
+    p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
+    p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
+    sysoutput_lengths = Counter() # Key = ngram order, and value = no. of ngram in hyp.
+    hyp_lengths, ref_lengths = 0, 0
+
+    # Iterate through each hypothesis and their corresponding references.
+    for references, hypothesis in zip(list_of_references, hypotheses):
+        # For each order of ngram, calculate the numerator and
+        # denominator for the corpus-level modified precision.
+        for i, _ in enumerate(range(1,n+1)):
+            p_i = modified_precision(references, hypothesis, i)
+            p_numerators[i] += p_i.numerator
+            p_denominators[i] += p_i.denominator
+            # Adds the no. of ngrams in the hypothesis.
+            sysoutput_lengths[i] += len(hypothesis) - (i - 1)
+
+        # Calculate the hypothesis length and the closest reference length.
+        # Adds them to the corpus-level hypothesis and reference counts.
+        hyp_len =  len(hypothesis)
+        hyp_lengths += hyp_len
+        ref_lengths += closest_ref_length(references, hyp_len)
+
+    # Calculate corpus-level brevity penalty.
+    bp = nist_length_penalty(ref_lengths, hyp_lengths)
+
+    # Collects the various precision values for the different ngram orders.
+    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
+           for i, _ in enumerate(range(1,n+1))]
+
+    # Eqn 2 in Doddington (2002):
+    # Info(w_1 ... w_n) = log_2 [ (# of occurrences of w_1 ... w_n-1) / (# of occurrences of w_1 ... w_n) ]
+    info = [0 if p_n[i].numerator == 0 or p_n[i+1].numerator == 0 # Handles math domain and zero division errors.
+            else math.log(p_n[i].numerator / p_n[i+1].numerator)
+            for i in range(len(p_n)-1)]
+    return sum(info_i/sysoutput_lengths[i] for i, info_i in enumerate(info)) * bp
+
+
+def nist_length_penalty(closest_ref_len, hyp_len):
+    """
+    Calculates the NIST length penalty, from Eq. 3 in Doddington (2002)
+
+        penalty = exp( beta * log( min( len(hyp)/len(ref), 1.0 ))**2 )
+
+    where,
+
+        `beta` is chosen to make the brevity penalty factor = 0.5 when the
+        no. of words in the system output (hyp) is 2/3 of the average
+        no. of words in the reference translation (ref)
+
+    The NIST penalty differs from BLEU's in that it minimizes the impact
+    of small variations in the length of a translation on the score.
+    See Fig. 4 in Doddington (2002).
+    """
+    ratio = closest_ref_len / hyp_len
+    if 0 < ratio < 1:
+        ratio_x, score_x = 1.5, 0.5
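+        # As described in the docstring, beta is chosen so that the penalty
+        # equals score_x (0.5) at a length ratio of 1/ratio_x (i.e. 2/3):
+        # beta = ln(0.5) / ln(1.5)**2 ~= -4.22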
+        beta = math.log(score_x) / math.log(ratio_x)**2
+        return math.exp(beta * math.log(ratio)**2)
+    else: # ratio <= 0 or ratio >= 1
+        return max(min(ratio, 1.0), 0.0)
diff --git a/nlp_resource_data/nltk/translate/nist_score.pyc b/nlp_resource_data/nltk/translate/nist_score.pyc
new file mode 100755 (executable)
index 0000000..7b5afe4
Binary files /dev/null and b/nlp_resource_data/nltk/translate/nist_score.pyc differ
diff --git a/nlp_resource_data/nltk/translate/phrase_based.py b/nlp_resource_data/nltk/translate/phrase_based.py
new file mode 100755 (executable)
index 0000000..5bbc094
--- /dev/null
@@ -0,0 +1,175 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Phrase Extraction Algorithm
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors: Liling Tan, Fredrik Hedman, Petra Barancikova
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+def extract(f_start, f_end, e_start, e_end, 
+            alignment, f_aligned,
+            srctext, trgtext, srclen, trglen, max_phrase_length):
+    """
+    This function checks for alignment point consistency and extracts
+    phrases using the chunk of consistent phrases.
+
+    A phrase pair (e, f) is consistent with an alignment A if and only if:
+
+    (i) No English words in the phrase pair are aligned to words outside it.
+
+            ∀ e_i ∈ e: (e_i, f_j) ∈ A ⇒ f_j ∈ f
+
+    (ii) No Foreign words in the phrase pair are aligned to words outside it.
+
+            ∀ f_j ∈ f: (e_i, f_j) ∈ A ⇒ e_i ∈ e
+
+    (iii) The phrase pair contains at least one alignment point.
+
+            ∃ e_i ∈ e, f_j ∈ f  s.t.  (e_i, f_j) ∈ A
+    
+    :type f_start: int
+    :param f_start: Starting index of the possible foreign language phrases
+    :type f_end: int
+    :param f_end: Ending index of the possible foreign language phrases
+    :type e_start: int
+    :param e_start: Starting index of the possible source language phrases
+    :type e_end: int
+    :param e_end: Ending index of the possible source language phrases
+    :type srctext: list
+    :param srctext: The source language tokens, a list of string.
+    :type trgtext: list
+    :param trgtext: The target language tokens, a list of string.
+    :type srclen: int
+    :param srclen: The number of tokens in the source language tokens.
+    :type trglen: int
+    :param trglen: The number of tokens in the target language tokens.
+    """
+
+    if f_end < 0:  # 0-based indexing.
+        return {}
+    # Check if alignment points are consistent.
+    for e,f in alignment:
+        if ((f_start <= f <= f_end) and (e < e_start or e > e_end)):
+            return {}
+
+    # Add phrase pairs (incl. additional unaligned f)
+    phrases = set()
+    fs = f_start
+    while True:
+        fe = min(f_end, f_start + max_phrase_length - 1)
+        while True:
+            # add phrase pair ([e_start, e_end], [fs, fe]) to set E
+            # Need to +1 in range  to include the end-point.
+            src_phrase = " ".join(srctext[e_start:e_end+1])
+            trg_phrase = " ".join(trgtext[fs:fe+1])
+            # Include more data for later ordering.
+            phrases.add(((e_start, e_end+1), (f_start, f_end+1), 
+                         src_phrase, trg_phrase))
+            fe += 1
+            if fe in f_aligned or fe == trglen:
+                break
+        fs -=1 
+        if fs in f_aligned or fs < 0:
+            break
+    return phrases
+
+def phrase_extraction(srctext, trgtext, alignment, max_phrase_length=0):
+    """
+    Phrase extraction algorithm extracts all consistent phrase pairs from 
+    a word-aligned sentence pair.
+
+    The idea is to loop over all possible source language (e) phrases and find 
+    the minimal foreign phrase (f) that matches each of them. Matching is done 
+    by identifying all alignment points for the source phrase and finding the 
+    shortest foreign phrase that includes all the foreign counterparts for the 
+    source words.
+
+    In short, a phrase alignment has to 
+    (a) contain all alignment points for all covered words
+    (b) contain at least one alignment point
+            
+    >>> srctext = "michael assumes that he will stay in the house"
+    >>> trgtext = "michael geht davon aus , dass er im haus bleibt"
+    >>> alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9), 
+    ... (5,9), (6,7), (7,7), (8,8)]
+    >>> phrases = phrase_extraction(srctext, trgtext, alignment)
+    >>> for i in sorted(phrases):
+    ...    print(i)
+    ...
+    ((0, 1), (0, 1), 'michael', 'michael')
+    ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus')
+    ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus ,')
+    ((0, 3), (0, 6), 'michael assumes that', 'michael geht davon aus , dass')
+    ((0, 4), (0, 7), 'michael assumes that he', 'michael geht davon aus , dass er')
+    ((0, 9), (0, 10), 'michael assumes that he will stay in the house', 'michael geht davon aus , dass er im haus bleibt')
+    ((1, 2), (1, 4), 'assumes', 'geht davon aus')
+    ((1, 2), (1, 4), 'assumes', 'geht davon aus ,')
+    ((1, 3), (1, 6), 'assumes that', 'geht davon aus , dass')
+    ((1, 4), (1, 7), 'assumes that he', 'geht davon aus , dass er')
+    ((1, 9), (1, 10), 'assumes that he will stay in the house', 'geht davon aus , dass er im haus bleibt')
+    ((2, 3), (5, 6), 'that', ', dass')
+    ((2, 3), (5, 6), 'that', 'dass')
+    ((2, 4), (5, 7), 'that he', ', dass er')
+    ((2, 4), (5, 7), 'that he', 'dass er')
+    ((2, 9), (5, 10), 'that he will stay in the house', ', dass er im haus bleibt')
+    ((2, 9), (5, 10), 'that he will stay in the house', 'dass er im haus bleibt')
+    ((3, 4), (6, 7), 'he', 'er')
+    ((3, 9), (6, 10), 'he will stay in the house', 'er im haus bleibt')
+    ((4, 6), (9, 10), 'will stay', 'bleibt')
+    ((4, 9), (7, 10), 'will stay in the house', 'im haus bleibt')
+    ((6, 8), (7, 8), 'in the', 'im')
+    ((6, 9), (7, 9), 'in the house', 'im haus')
+    ((8, 9), (8, 9), 'house', 'haus')
+    
+    :type srctext: str
+    :param srctext: The sentence string from the source language.
+    :type trgtext: str
+    :param trgtext: The sentence string from the target language.
+    :type alignment: list(tuple)
+    :param alignment: The word alignment outputs as list of tuples, where
+        the first elements of tuples are the source words' indices and
+        second elements are the target words' indices. This is also the output
+        format of nltk.translate.ibm1
+    :rtype: set(tuple)
+    :return: A set of tuples, where each element is a phrase pair and each
+        phrase pair is a tuple made up of (i) its source location, (ii) its
+        target location, (iii) the source phrase and (iv) the target phrase.
+        The set represents all the possible phrase pairs extracted from the
+        word alignments.
+    :type max_phrase_length: int
+    :param max_phrase_length: maximal phrase length, if 0 or not specified
+        it is set to the length of the longer sentence (srctext or trgtext).
+    """
+
+    srctext = srctext.split()   # e
+    trgtext = trgtext.split()   # f
+    srclen = len(srctext)       # len(e)
+    trglen = len(trgtext)       # len(f)
+    # Keeps an index of which source/target words that are aligned.
+    f_aligned = [j for _,j in alignment]
+    max_phrase_length = max_phrase_length or max(srclen,trglen)
+
+    # set of phrase pairs BP
+    bp = set()
+
+    for e_start in range(srclen):
+        max_idx = min(srclen, e_start + max_phrase_length)
+        for e_end in range(e_start, max_idx):
+            # // find the minimally matching foreign phrase
+            # (f start , f end ) = ( length(f), 0 )
+            # f_start ∈ [0, len(f) - 1]; f_end ∈ [0, len(f) - 1]
+            f_start, f_end = trglen-1 , -1  #  0-based indexing
+            for e,f in alignment:
+                if e_start <= e <= e_end:
+                    f_start = min(f, f_start)
+                    f_end = max(f, f_end)
+            # add extract (f start , f end , e start , e end ) to set BP
+            phrases = extract(f_start, f_end, e_start, e_end, 
+                              alignment, f_aligned,
+                              srctext, trgtext, srclen, trglen,
+                              max_phrase_length)
+            if phrases:
+                bp.update(phrases)
+    return bp
+
diff --git a/nlp_resource_data/nltk/translate/phrase_based.pyc b/nlp_resource_data/nltk/translate/phrase_based.pyc
new file mode 100755 (executable)
index 0000000..c2e198f
Binary files /dev/null and b/nlp_resource_data/nltk/translate/phrase_based.pyc differ
diff --git a/nlp_resource_data/nltk/translate/ribes_score.py b/nlp_resource_data/nltk/translate/ribes_score.py
new file mode 100755 (executable)
index 0000000..553e68f
--- /dev/null
@@ -0,0 +1,325 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: RIBES Score
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Contributors: Katsuhito Sudoh, Liling Tan, Kasramvd, J.F.Sebastian
+#               Mark Byers, ekhumoro, P. Ortiz
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+""" RIBES score implementation """
+from __future__ import division
+from itertools import islice
+import math
+
+from nltk.util import ngrams, choose
+
+
+def sentence_ribes(references, hypothesis, alpha=0.25, beta=0.10):
+    """
+    The RIBES (Rank-based Intuitive Bilingual Evaluation Score) from 
+    Hideki Isozaki, Tsutomu Hirao, Kevin Duh, Katsuhito Sudoh and 
+    Hajime Tsukada. 2010. "Automatic Evaluation of Translation Quality for 
+    Distant Language Pairs". In Proceedings of EMNLP. 
+    http://www.aclweb.org/anthology/D/D10/D10-1092.pdf 
+    
+    The generic RIBES scores used in shared tasks, e.g. the Workshop for
+    Asian Translation (WAT), use the following RIBES calculation:
+
+        RIBES = kendall_tau * (p1**alpha) * (bp**beta)
+    
+    Please note that this re-implementation differs from the official
+    RIBES implementation and, though it emulates the results as described
+    in the original paper, there are further optimizations implemented
+    in the official RIBES script.
+
+    Users are encouraged to use the official RIBES script instead of this
+    implementation when evaluating their machine translation systems. Refer
+    to http://www.kecl.ntt.co.jp/icl/lirg/ribes/ for the official script.
+    
+    :param references: a list of reference sentences
+    :type references: list(list(str))
+    :param hypothesis: a hypothesis sentence
+    :type hypothesis: list(str)
+    :param alpha: hyperparameter used as a prior for the unigram precision.
+    :type alpha: float
+    :param beta: hyperparameter used as a prior for the brevity penalty.
+    :type beta: float
+    :return: The best ribes score from one of the references.
+    :rtype: float
+    """
+    best_ribes = -1.0
+    # Calculates RIBES for each reference and returns the best score.
+    for reference in references:
+        # Collects the *worder* from the ranked correlation alignments.
+        worder = word_rank_alignment(reference, hypothesis)
+        nkt = kendall_tau(worder)
+            
+        # Calculates the brevity penalty
+        bp = min(1.0, math.exp(1.0 - len(reference)/len(hypothesis)))
+        
+        # Calculates the unigram precision, *p1*
+        p1 = len(worder) / len(hypothesis)
+        
+        _ribes = nkt * (p1 ** alpha) *  (bp ** beta)
+        
+        if _ribes > best_ribes: # Keeps the best score.
+            best_ribes = _ribes
+        
+    return best_ribes
+
+
+def corpus_ribes(list_of_references, hypotheses, alpha=0.25, beta=0.10):
+    """
+    This function "calculates RIBES for a system output (hypothesis) with 
+    multiple references, and returns "best" score among multi-references and 
+    individual scores. The scores are corpus-wise, i.e., averaged by the number 
+    of sentences." (c.f. RIBES version 1.03.1 code).
+    
+    Unlike BLEU's micro-average precision, RIBES calculates a macro-average
+    precision by averaging the best RIBES score for each hypothesis and its
+    corresponding references.
+
+    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+    ...         'ensures', 'that', 'the', 'military', 'always',
+    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
+    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
+    ...          'heed', 'Party', 'commands']
+    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+    ...          'guarantees', 'the', 'military', 'forces', 'always',
+    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
+    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
+    ...          'of', 'the', 'party']
+    
+    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', 
+    ...         'interested', 'in', 'world', 'history']
+    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', 
+    ...          'because', 'he', 'read', 'the', 'book']
+    
+    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
+    >>> hypotheses = [hyp1, hyp2]
+    >>> round(corpus_ribes(list_of_references, hypotheses),4)
+    0.3597
+    
+    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
+    :type list_of_references: list(list(list(str)))
+    :param hypotheses: a list of hypothesis sentences
+    :type hypotheses: list(list(str))
+    :param alpha: hyperparameter used as a prior for the unigram precision.
+    :type alpha: float
+    :param beta: hyperparameter used as a prior for the brevity penalty.
+    :type beta: float
+    :return: The average of the best RIBES scores over the whole corpus.
+    :rtype: float
+    """
+    corpus_best_ribes = 0.0
+    # Iterate through each hypothesis and its corresponding references.
+    for references, hypothesis in zip(list_of_references, hypotheses):
+        corpus_best_ribes += sentence_ribes(references, hypothesis, alpha, beta)
+    return corpus_best_ribes / len(hypotheses)
+    
+        
+def position_of_ngram(ngram, sentence):
+    """
+    This function returns the position of the first instance of the ngram 
+    appearing in a sentence.
+    
+    Note that one could also work with strings as follows, but the code is a
+    little convoluted with the type casting back and forth:
+        
+        char_pos = ' '.join(sent)[:' '.join(sent).index(' '.join(ngram))]
+        word_pos = char_pos.count(' ')
+        
+    Another way to conceive this is:
+    
+        return next(i for i, ng in enumerate(ngrams(sentence, len(ngram))) 
+                    if ng == ngram)
+                    
+    :param ngram: The ngram that needs to be searched
+    :type ngram: tuple
+    :param sentence: The list of tokens to search from.
+    :type sentence: list(str)
+    """
+    # Iterates through the ngrams in sentence.
+    for i,sublist in enumerate(ngrams(sentence, len(ngram))):
+        # Returns the index of the word when ngram matches.
+        if ngram == sublist:
+            return i
+
+
+def word_rank_alignment(reference, hypothesis, character_based=False):
+    """    
+    This is the word rank alignment algorithm described in the paper to produce
+    the *worder* list, i.e. a list of word indices of the hypothesis word orders 
+    w.r.t. the list of reference words.
+    
+    Below is (H0, R0) example from the Isozaki et al. 2010 paper, 
+    note the examples are indexed from 1 but the results here are indexed from 0:
+    
+        >>> ref = str('he was interested in world history because he '
+        ... 'read the book').split()
+        >>> hyp = str('he read the book because he was interested in world '
+        ... 'history').split()
+        >>> word_rank_alignment(ref, hyp)
+        [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
+        
+    The (H1, R1) example from the paper, note the 0th index:
+    
+        >>> ref = 'John hit Bob yesterday'.split()
+        >>> hyp = 'Bob hit John yesterday'.split()
+        >>> word_rank_alignment(ref, hyp)
+        [2, 1, 0, 3]
+
+    Here is the (H2, R2) example from the paper, note the 0th index here too:
+    
+        >>> ref = 'the boy read the book'.split()
+        >>> hyp = 'the book was read by the boy'.split()
+        >>> word_rank_alignment(ref, hyp)
+        [3, 4, 2, 0, 1]
+        
+    :param reference: a reference sentence
+    :type reference: list(str)
+    :param hypothesis: a hypothesis sentence
+    :type hypothesis: list(str)
+    """
+    worder = []
+    hyp_len = len(hypothesis)
+    # Stores a list of possible ngrams from the reference sentence.
+    # This is used for matching context window later in the algorithm.
+    ref_ngrams = []
+    hyp_ngrams = []
+    for n in range(1, len(reference)+1):
+        for ng in ngrams(reference, n):
+            ref_ngrams.append(ng)
+        for ng in ngrams(hypothesis, n):
+            hyp_ngrams.append(ng)
+    for i, h_word in enumerate(hypothesis):
+        # If word is not in the reference, continue.
+        if h_word not in reference:
+            continue
+        # If we can determine one-to-one word correspondence for unigrams that 
+        # only appear once in both the reference and hypothesis.
+        elif hypothesis.count(h_word) == reference.count(h_word) == 1:
+            worder.append(reference.index(h_word))
+        else:
+            max_window_size = max(i, hyp_len-i+1)
+            for window in range(1, max_window_size):
+                if i+window < hyp_len: # If searching the right context is possible.
+                    # Retrieve the right context window.
+                    right_context_ngram = tuple(islice(hypothesis, i, i+window+1))
+                    num_times_in_ref = ref_ngrams.count(right_context_ngram)
+                    num_times_in_hyp = hyp_ngrams.count(right_context_ngram) 
+                    # If ngram appears only once in both ref and hyp.
+                    if num_times_in_ref == num_times_in_hyp == 1:
+                        # Find the position of ngram that matched the reference.
+                        pos = position_of_ngram(right_context_ngram, reference)
+                        worder.append(pos)  # Add the positions of the ngram.
+                        break
+                if window <= i: # If searching the left context is possible.
+                    # Retrieve the left context window.
+                    left_context_ngram = tuple(islice(hypothesis, i-window, i+1))
+                    num_times_in_ref = ref_ngrams.count(left_context_ngram)
+                    num_times_in_hyp = hyp_ngrams.count(left_context_ngram)
+                    if num_times_in_ref == num_times_in_hyp == 1:
+                        # Find the position of ngram that matched the reference.
+                        pos = position_of_ngram(left_context_ngram, reference)
+                        # Add the positions of the ngram.
+                        worder.append(pos+ len(left_context_ngram) -1)  
+                        break
+    return worder
+
+    
+def find_increasing_sequences(worder):
+    """
+    Given the *worder* list, this function groups monotonic +1 sequences. 
+    
+        >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
+        >>> list(find_increasing_sequences(worder))
+        [(7, 8, 9, 10), (0, 1, 2, 3, 4, 5)]
+    
+    :param worder: The worder list output from word_rank_alignment
+    :type worder: list(int)
+    """
+    items = iter(worder)
+    a, b = None, next(items, None)
+    result = [b]
+    while b is not None:
+        a, b = b, next(items, None)
+        if b is not None and a + 1 == b:
+            result.append(b)
+        else:
+            if len(result) > 1:
+                yield tuple(result)
+            result = [b]
+
+
+def kendall_tau(worder, normalize=True):
+    """
+    Calculates the Kendall's Tau correlation coefficient given the *worder*
+    list of word alignments from word_rank_alignment(), using the formula:
+    
+        tau = 2 * num_increasing_pairs / num_possible_pairs - 1
+    
+    Note that the increasing pairs can be discontinuous in the *worder*
+    list and each increasing sequence can be tabulated as choose(len(seq), 2)
+    no. of increasing pairs, e.g.
+    
+        >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
+        >>> number_possible_pairs = choose(len(worder), 2)
+        >>> round(kendall_tau(worder, normalize=False),3)
+        -0.236
+        >>> round(kendall_tau(worder),3)
+        0.382
+    
+    :param worder: The worder list output from word_rank_alignment
+    :type worder: list(int)
+    :param normalize: Flag to indicate normalization
+    :type normalize: boolean
+    :return: The Kendall's Tau correlation coefficient.
+    :rtype: float
+    """
+    worder_len = len(worder)
+    # Extract the groups of increasing/monotonic sequences.
+    increasing_sequences = find_increasing_sequences(worder)
+    # Calculate no. of increasing_pairs in *worder* list.
+    num_increasing_pairs = sum(choose(len(seq),2) for seq in increasing_sequences) 
+    # Calculate no. of possible pairs.
+    num_possible_pairs = choose(worder_len, 2)
+    # Kendall's Tau computation.
+    tau = 2 * num_increasing_pairs / num_possible_pairs -1
+    if normalize: # If normalized, the tau output falls between 0.0 and 1.0
+        return (tau + 1) / 2
+    else: # Otherwise, the tau output falls between -1.0 and +1.0
+        return tau
+
+
+def spearman_rho(worder, normalize=True):
+    """
+    Calculates the Spearman's Rho correlation coefficient given the *worder* 
+    list of word alignment from word_rank_alignment(), using the formula:
+    
+        rho = 1 - sum(d**2) / choose(len(worder)+1, 3)  
+        
+    where d is the difference between the *worder* list of indices and the
+    original word indices from the reference sentence.
+    
+    Using the (H0,R0) and (H5, R5) example from the paper
+    
+        >>> worder =  [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
+        >>> round(spearman_rho(worder, normalize=False), 3)
+        -0.591
+        >>> round(spearman_rho(worder), 3)
+        0.205
+    
+    :param worder: The worder list output from word_rank_alignment
+    :type worder: list(int)
+    """
+    worder_len = len(worder)
+    sum_d_square = sum((wi - i)**2 for wi, i in zip(worder, range(worder_len)))
+    rho = 1 - sum_d_square / choose(worder_len+1, 3)
+    
+    if normalize: # If normalized, the rho output falls between 0.0 and 1.0
+        return (rho + 1) / 2
+    else: # Otherwise, the rho output falls between -1.0 and +1.0
+        return rho
diff --git a/nlp_resource_data/nltk/translate/ribes_score.pyc b/nlp_resource_data/nltk/translate/ribes_score.pyc
new file mode 100755 (executable)
index 0000000..8761542
Binary files /dev/null and b/nlp_resource_data/nltk/translate/ribes_score.pyc differ
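As a quick sanity check of the module added above, the (H0, R0) sentence pair and the corpus-level example from its docstrings can be exercised directly. The expected values in the comments are exactly the ones already printed in those docstrings; the final corpus_ribes call is shown without an asserted value.

# Sketch reusing the worked examples from the ribes_score.py docstrings above.
from nltk.translate.ribes_score import (word_rank_alignment, kendall_tau,
                                        spearman_rho, corpus_ribes)

ref = 'he was interested in world history because he read the book'.split()
hyp = 'he read the book because he was interested in world history'.split()

# Rank alignment of the hypothesis words w.r.t. the reference (H0, R0).
worder = word_rank_alignment(ref, hyp)
print(worder)                           # [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
print(round(kendall_tau(worder), 3))    # 0.382 (normalized to [0.0, 1.0])
print(round(spearman_rho(worder), 3))   # 0.205 (normalized to [0.0, 1.0])

# Corpus-level RIBES for a single hypothesis with a single reference.
print(round(corpus_ribes([[ref]], [hyp]), 4))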
diff --git a/nlp_resource_data/nltk/translate/stack_decoder.py b/nlp_resource_data/nltk/translate/stack_decoder.py
new file mode 100755 (executable)
index 0000000..e9442d7
--- /dev/null
@@ -0,0 +1,499 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Stack decoder
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Tah Wei Hoon <hoon.tw@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A decoder that uses stacks to implement phrase-based translation.
+
+In phrase-based translation, the source sentence is segmented into
+phrases of one or more words, and translations for those phrases are
+used to build the target sentence.
+
+Hypothesis data structures are used to keep track of the source words
+translated so far and the partial output. A hypothesis can be expanded
+by selecting an untranslated phrase, looking up its translation in a
+phrase table, and appending that translation to the partial output.
+Translation is complete when a hypothesis covers all source words.
+
+The search space is huge because the source sentence can be segmented
+in different ways, the source phrases can be selected in any order,
+and there could be multiple translations for the same source phrase in
+the phrase table. To make decoding tractable, stacks are used to limit
+the number of candidate hypotheses by doing histogram and/or threshold
+pruning.
+
+Hypotheses with the same number of words translated are placed in the
+same stack. In histogram pruning, each stack has a size limit, and
+the hypothesis with the lowest score is removed when the stack is full.
+In threshold pruning, hypotheses that score below a certain threshold
+of the best hypothesis in that stack are removed.
+
+Hypothesis scoring can include various factors such as phrase
+translation probability, language model probability, length of
+translation, cost of remaining words to be translated, and so on.
+
+
+References:
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
+"""
+
+import warnings
+from collections import defaultdict
+from math import log
+
+
+class StackDecoder(object):
+    """
+    Phrase-based stack decoder for machine translation
+
+    >>> from nltk.translate import PhraseTable
+    >>> phrase_table = PhraseTable()
+    >>> phrase_table.add(('niemand',), ('nobody',), log(0.8))
+    >>> phrase_table.add(('niemand',), ('no', 'one'), log(0.2))
+    >>> phrase_table.add(('erwartet',), ('expects',), log(0.8))
+    >>> phrase_table.add(('erwartet',), ('expecting',), log(0.2))
+    >>> phrase_table.add(('niemand', 'erwartet'), ('one', 'does', 'not', 'expect'), log(0.1))
+    >>> phrase_table.add(('die', 'spanische', 'inquisition'), ('the', 'spanish', 'inquisition'), log(0.8))
+    >>> phrase_table.add(('!',), ('!',), log(0.8))
+
+    >>> #  nltk.model should be used here once it is implemented
+    >>> from collections import defaultdict
+    >>> language_prob = defaultdict(lambda: -999.0)
+    >>> language_prob[('nobody',)] = log(0.5)
+    >>> language_prob[('expects',)] = log(0.4)
+    >>> language_prob[('the', 'spanish', 'inquisition')] = log(0.2)
+    >>> language_prob[('!',)] = log(0.1)
+    >>> language_model = type('',(object,),{'probability_change': lambda self, context, phrase: language_prob[phrase], 'probability': lambda self, phrase: language_prob[phrase]})()
+
+    >>> stack_decoder = StackDecoder(phrase_table, language_model)
+
+    >>> stack_decoder.translate(['niemand', 'erwartet', 'die', 'spanische', 'inquisition', '!'])
+    ['nobody', 'expects', 'the', 'spanish', 'inquisition', '!']
+
+    """
+    def __init__(self, phrase_table, language_model):
+        """
+        :param phrase_table: Table of translations for source language
+            phrases and the log probabilities for those translations.
+        :type phrase_table: PhraseTable
+
+        :param language_model: Target language model. Must define a
+            ``probability_change`` method that calculates the change in
+            log probability of a sentence, if a given string is appended
+            to it.
+            This interface is experimental and will likely be replaced
+            with nltk.model once it is implemented.
+        :type language_model: object
+        """
+        self.phrase_table = phrase_table
+        self.language_model = language_model
+
+        self.word_penalty = 0.0
+        """
+        float: Influences the translation length exponentially.
+            If positive, shorter translations are preferred.
+            If negative, longer translations are preferred.
+            If zero, no penalty is applied.
+        """
+
+        self.beam_threshold = 0.0
+        """
+        float: Hypotheses that score below this factor of the best
+            hypothesis in a stack are dropped from consideration.
+            Value between 0.0 and 1.0.
+        """
+
+        self.stack_size = 100
+        """
+        int: Maximum number of hypotheses to consider in a stack.
+            Higher values increase the likelihood of a good translation,
+            but increase processing time.
+        """
+
+        self.__distortion_factor = 0.5
+        self.__compute_log_distortion()
+
+    @property
+    def distortion_factor(self):
+        """
+        float: Amount of reordering of source phrases.
+            Lower values favour monotone translation, suitable when
+            word order is similar for both source and target languages.
+            Value between 0.0 and 1.0. Default 0.5.
+        """
+        return self.__distortion_factor
+
+    @distortion_factor.setter
+    def distortion_factor(self, d):
+        self.__distortion_factor = d
+        self.__compute_log_distortion()
+
+    def __compute_log_distortion(self):
+        # cache log(distortion_factor) so we don't have to recompute it
+        # when scoring hypotheses
+        if self.__distortion_factor == 0.0:
+            self.__log_distortion_factor = log(1e-9)  # 1e-9 is almost zero
+        else:
+            self.__log_distortion_factor = log(self.__distortion_factor)
+
+    def translate(self, src_sentence):
+        """
+        :param src_sentence: Sentence to be translated
+        :type src_sentence: list(str)
+
+        :return: Translated sentence
+        :rtype: list(str)
+        """
+        sentence = tuple(src_sentence)  # prevent accidental modification
+        sentence_length = len(sentence)
+        stacks = [_Stack(self.stack_size, self.beam_threshold)
+                  for _ in range(0, sentence_length + 1)]
+        empty_hypothesis = _Hypothesis()
+        stacks[0].push(empty_hypothesis)
+
+        all_phrases = self.find_all_src_phrases(sentence)
+        future_score_table = self.compute_future_scores(sentence)
+        for stack in stacks:
+            for hypothesis in stack:
+                possible_expansions = StackDecoder.valid_phrases(all_phrases,
+                                                                 hypothesis)
+                for src_phrase_span in possible_expansions:
+                    src_phrase = sentence[src_phrase_span[0]:src_phrase_span[1]]
+                    for translation_option in (self.phrase_table.
+                                               translations_for(src_phrase)):
+                        raw_score = self.expansion_score(
+                            hypothesis, translation_option, src_phrase_span)
+                        new_hypothesis = _Hypothesis(
+                            raw_score=raw_score,
+                            src_phrase_span=src_phrase_span,
+                            trg_phrase=translation_option.trg_phrase,
+                            previous=hypothesis
+                        )
+                        new_hypothesis.future_score = self.future_score(
+                            new_hypothesis, future_score_table, sentence_length)
+                        total_words = new_hypothesis.total_translated_words()
+                        stacks[total_words].push(new_hypothesis)
+
+        if not stacks[sentence_length]:
+            warnings.warn('Unable to translate all words. '
+                          'The source sentence contains words not in '
+                          'the phrase table')
+            # Instead of returning empty output, perhaps a partial
+            # translation could be returned
+            return []
+
+        best_hypothesis = stacks[sentence_length].best()
+        return best_hypothesis.translation_so_far()
+
+    def find_all_src_phrases(self, src_sentence):
+        """
+        Finds all subsequences in src_sentence that have a phrase
+        translation in the translation table
+
+        :type src_sentence: tuple(str)
+
+        :return: Subsequences that have a phrase translation,
+            represented as a table of lists of end positions.
+            For example, if result[2] is [5, 6, 9], then there are
+            three phrases starting from position 2 in ``src_sentence``,
+            ending at positions 5, 6, and 9 exclusive. The list of
+            ending positions are in ascending order.
+        :rtype: list(list(int))
+        """
+        sentence_length = len(src_sentence)
+        phrase_indices = [[] for _ in src_sentence]
+        for start in range(0, sentence_length):
+            for end in range(start + 1, sentence_length + 1):
+                potential_phrase = src_sentence[start:end]
+                if potential_phrase in self.phrase_table:
+                    phrase_indices[start].append(end)
+        return phrase_indices
+
+    def compute_future_scores(self, src_sentence):
+        """
+        Determines the approximate scores for translating every
+        subsequence in ``src_sentence``
+
+        Future scores can be used as a look-ahead to determine the
+        difficulty of translating the remaining parts of ``src_sentence``.
+
+        :type src_sentence: tuple(str)
+
+        :return: Scores of subsequences referenced by their start and
+        end positions. For example, result[2][5] is the score of the
+        subsequence covering positions 2, 3, and 4.
+        :rtype: dict(int: (dict(int): float))
+        """
+        scores = defaultdict(lambda: defaultdict(lambda: float('-inf')))
+        for seq_length in range(1, len(src_sentence) + 1):
+            for start in range(0, len(src_sentence) - seq_length + 1):
+                end = start + seq_length
+                phrase = src_sentence[start:end]
+                if phrase in self.phrase_table:
+                    score = self.phrase_table.translations_for(
+                        phrase)[0].log_prob  # pick best (first) translation
+                    # Warning: API of language_model is subject to change
+                    score += self.language_model.probability(phrase)
+                    scores[start][end] = score
+
+                # check if a better score can be obtained by combining
+                # two child subsequences
+                for mid in range(start + 1, end):
+                    combined_score = (scores[start][mid] +
+                                      scores[mid][end])
+                    if combined_score > scores[start][end]:
+                        scores[start][end] = combined_score
+        return scores
+
+    def future_score(self, hypothesis, future_score_table, sentence_length):
+        """
+        Determines the approximate score for translating the
+        untranslated words in ``hypothesis``
+        """
+        score = 0.0
+        for span in hypothesis.untranslated_spans(sentence_length):
+            score += future_score_table[span[0]][span[1]]
+        return score
+
+    def expansion_score(self, hypothesis, translation_option, src_phrase_span):
+        """
+        Calculate the score of expanding ``hypothesis`` with
+        ``translation_option``
+
+        :param hypothesis: Hypothesis being expanded
+        :type hypothesis: _Hypothesis
+
+        :param translation_option: Information about the proposed expansion
+        :type translation_option: PhraseTableEntry
+
+        :param src_phrase_span: Word position span of the source phrase
+        :type src_phrase_span: tuple(int, int)
+        """
+        score = hypothesis.raw_score
+        score += translation_option.log_prob
+        # The API of language_model is subject to change; it could accept
+        # a string, a list of words, and/or some other type
+        score += self.language_model.probability_change(
+            hypothesis, translation_option.trg_phrase)
+        score += self.distortion_score(hypothesis, src_phrase_span)
+        score -= self.word_penalty * len(translation_option.trg_phrase)
+        return score
+
+    def distortion_score(self, hypothesis, next_src_phrase_span):
+        if not hypothesis.src_phrase_span:
+            return 0.0
+        next_src_phrase_start = next_src_phrase_span[0]
+        prev_src_phrase_end = hypothesis.src_phrase_span[1]
+        distortion_distance = next_src_phrase_start - prev_src_phrase_end
+        return abs(distortion_distance) * self.__log_distortion_factor
+
+    @staticmethod
+    def valid_phrases(all_phrases_from, hypothesis):
+        """
+        Extract phrases from ``all_phrases_from`` that contain words
+        that have not been translated by ``hypothesis``
+
+        :param all_phrases_from: Phrases represented by their spans, in
+            the same format as the return value of
+            ``find_all_src_phrases``
+        :type all_phrases_from: list(list(int))
+
+        :type hypothesis: _Hypothesis
+
+        :return: A list of phrases, represented by their spans, that
+            cover untranslated positions.
+        :rtype: list(tuple(int, int))
+        """
+        untranslated_spans = hypothesis.untranslated_spans(
+            len(all_phrases_from))
+        valid_phrases = []
+        for available_span in untranslated_spans:
+            start = available_span[0]
+            available_end = available_span[1]
+            while start < available_end:
+                for phrase_end in all_phrases_from[start]:
+                    if phrase_end > available_end:
+                        # Subsequent elements in all_phrases_from[start]
+                        # will also be > available_end, since the
+                        # elements are in ascending order
+                        break
+                    valid_phrases.append((start, phrase_end))
+                start += 1
+        return valid_phrases
+
+
+class _Hypothesis(object):
+    """
+    Partial solution to a translation.
+
+    Records the word positions of the phrase being translated, its
+    translation, raw score, and the cost of the untranslated parts of
+    the sentence. When the next phrase is selected to build upon the
+    partial solution, a new _Hypothesis object is created, with a back
+    pointer to the previous hypothesis.
+
+    To find out which words have been translated so far, look at the
+    ``src_phrase_span`` in the hypothesis chain. Similarly, the
+    translation output can be found by traversing up the chain.
+    """
+    def __init__(self, raw_score=0.0, src_phrase_span=(), trg_phrase=(),
+                 previous=None, future_score=0.0):
+        """
+        :param raw_score: Likelihood of hypothesis so far.
+            Higher is better. Does not account for untranslated words.
+        :type raw_score: float
+
+        :param src_phrase_span: Span of word positions covered by the
+            source phrase in this hypothesis expansion. For example,
+            (2, 5) means that the phrase covers positions 2, 3 and 4
+            of the source sentence, i.e. ``sentence[2:5]``.
+        :type src_phrase_span: tuple(int)
+
+        :param trg_phrase: Translation of the source phrase in this
+            hypothesis expansion
+        :type trg_phrase: tuple(str)
+
+        :param previous: Previous hypothesis before expansion to this one
+        :type previous: _Hypothesis
+
+        :param future_score: Approximate score for translating the
+            remaining words not covered by this hypothesis. Higher means
+            that the remaining words are easier to translate.
+        :type future_score: float
+        """
+        self.raw_score = raw_score
+        self.src_phrase_span = src_phrase_span
+        self.trg_phrase = trg_phrase
+        self.previous = previous
+        self.future_score = future_score
+
+    def score(self):
+        """
+        Overall score of hypothesis after accounting for local and
+        global features
+        """
+        return self.raw_score + self.future_score
+
+    def untranslated_spans(self, sentence_length):
+        """
+        Starting from each untranslated word, find the longest
+        continuous span of untranslated positions
+
+        :param sentence_length: Length of source sentence being
+            translated by the hypothesis
+        :type sentence_length: int
+
+        :rtype: list(tuple(int, int))
+        """
+        translated_positions = self.translated_positions()
+        translated_positions.sort()
+        translated_positions.append(sentence_length)  # add sentinel position
+
+        untranslated_spans = []
+        start = 0
+        # each untranslated span must end in one of the translated_positions
+        for end in translated_positions:
+            if start < end:
+                untranslated_spans.append((start, end))
+            start = end + 1
+
+        return untranslated_spans
+
+    def translated_positions(self):
+        """
+        List of positions in the source sentence of words already
+        translated. The list is not sorted.
+
+        :rtype: list(int)
+        """
+        translated_positions = []
+        current_hypothesis = self
+        while current_hypothesis.previous is not None:
+            translated_span = current_hypothesis.src_phrase_span
+            translated_positions.extend(range(translated_span[0],
+                                              translated_span[1]))
+            current_hypothesis = current_hypothesis.previous
+        return translated_positions
+
+    def total_translated_words(self):
+        return len(self.translated_positions())
+
+    def translation_so_far(self):
+        translation = []
+        self.__build_translation(self, translation)
+        return translation
+
+    def __build_translation(self, hypothesis, output):
+        if hypothesis.previous is None:
+            return
+        self.__build_translation(hypothesis.previous, output)
+        output.extend(hypothesis.trg_phrase)
+
+
+class _Stack(object):
+    """
+    Collection of _Hypothesis objects
+    """
+    def __init__(self, max_size=100, beam_threshold=0.0):
+        """
+        :param beam_threshold: Hypotheses that score less than this
+            factor of the best hypothesis are discarded from the stack.
+            Value must be between 0.0 and 1.0.
+        :type beam_threshold: float
+        """
+        self.max_size = max_size
+        self.items = []
+
+        if beam_threshold == 0.0:
+            self.__log_beam_threshold = float('-inf')
+        else:
+            self.__log_beam_threshold = log(beam_threshold)
+
+    def push(self, hypothesis):
+        """
+        Add ``hypothesis`` to the stack.
+        Removes lowest scoring hypothesis if the stack is full.
+        After insertion, hypotheses that score less than
+        ``beam_threshold`` times the score of the best hypothesis
+        are removed.
+        """
+        self.items.append(hypothesis)
+        self.items.sort(key=lambda h: h.score(), reverse=True)
+        while len(self.items) > self.max_size:
+            self.items.pop()
+        self.threshold_prune()
+
+    def threshold_prune(self):
+        if not self.items:
+            return
+        #  log(score * beam_threshold) = log(score) + log(beam_threshold)
+        threshold = self.items[0].score() + self.__log_beam_threshold
+        for hypothesis in reversed(self.items):
+            if hypothesis.score() < threshold:
+                self.items.pop()
+            else:
+                break
+
+    def best(self):
+        """
+        :return: Hypothesis with the highest score in the stack
+        :rtype: _Hypothesis
+        """
+        if self.items:
+            return self.items[0]
+        return None
+
+    def __iter__(self):
+        return iter(self.items)
+
+    def __contains__(self, hypothesis):
+        return hypothesis in self.items
+
+    def __bool__(self):
+        return len(self.items) != 0
+    __nonzero__ = __bool__
diff --git a/nlp_resource_data/nltk/translate/stack_decoder.pyc b/nlp_resource_data/nltk/translate/stack_decoder.pyc
new file mode 100755 (executable)
index 0000000..9bacfdd
Binary files /dev/null and b/nlp_resource_data/nltk/translate/stack_decoder.pyc differ
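The StackDecoder doctest above can also be run as a stand-alone script. The sketch below repeats it, replacing the one-line type() trick with a small class implementing the experimental language-model interface (probability and probability_change); the toy phrase table and probabilities are the ones from the doctest, not a real model.

# Stand-alone version of the StackDecoder doctest above.
from collections import defaultdict
from math import log

from nltk.translate import PhraseTable
from nltk.translate.stack_decoder import StackDecoder

phrase_table = PhraseTable()
phrase_table.add(('niemand',), ('nobody',), log(0.8))
phrase_table.add(('niemand',), ('no', 'one'), log(0.2))
phrase_table.add(('erwartet',), ('expects',), log(0.8))
phrase_table.add(('erwartet',), ('expecting',), log(0.2))
phrase_table.add(('niemand', 'erwartet'), ('one', 'does', 'not', 'expect'), log(0.1))
phrase_table.add(('die', 'spanische', 'inquisition'),
                 ('the', 'spanish', 'inquisition'), log(0.8))
phrase_table.add(('!',), ('!',), log(0.8))

# Toy phrase log-probabilities standing in for nltk.model.
language_prob = defaultdict(lambda: -999.0)
language_prob[('nobody',)] = log(0.5)
language_prob[('expects',)] = log(0.4)
language_prob[('the', 'spanish', 'inquisition')] = log(0.2)
language_prob[('!',)] = log(0.1)


class ToyLanguageModel(object):
    """Minimal stand-in for the experimental language model interface."""
    def probability(self, phrase):
        return language_prob[phrase]

    def probability_change(self, context, phrase):
        return language_prob[phrase]


decoder = StackDecoder(phrase_table, ToyLanguageModel())
print(decoder.translate(['niemand', 'erwartet', 'die', 'spanische',
                         'inquisition', '!']))
# Expected (per the doctest): ['nobody', 'expects', 'the', 'spanish', 'inquisition', '!']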
diff --git a/nlp_resource_data/nltk/tree.py b/nlp_resource_data/nltk/tree.py
new file mode 100755 (executable)
index 0000000..193a003
--- /dev/null
@@ -0,0 +1,1605 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Text Trees
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+#         Peter Ljunglöf <peter.ljunglof@gu.se>
+#         Nathan Bodenstab <bodenstab@cslu.ogi.edu> (tree transforms)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Class for representing hierarchical language structures, such as
+syntax trees and morphological trees.
+"""
+from __future__ import print_function, unicode_literals
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
+# TODO: add LabelledTree (can be used for dependency trees)
+
+import re
+
+from six import string_types
+
+from nltk.grammar import Production, Nonterminal
+from nltk.probability import ProbabilisticMixIn
+from nltk.util import slice_bounds
+from nltk.compat import python_2_unicode_compatible, unicode_repr
+from nltk.internals import raise_unorderable_types
+
+######################################################################
+## Trees
+######################################################################
+
+@python_2_unicode_compatible
+class Tree(list):
+    """
+    A Tree represents a hierarchical grouping of leaves and subtrees.
+    For example, each constituent in a syntax tree is represented by a single Tree.
+
+    A tree's children are encoded as a list of leaves and subtrees,
+    where a leaf is a basic (non-tree) value; and a subtree is a
+    nested Tree.
+
+        >>> from nltk.tree import Tree
+        >>> print(Tree(1, [2, Tree(3, [4]), 5]))
+        (1 2 (3 4) 5)
+        >>> vp = Tree('VP', [Tree('V', ['saw']),
+        ...                  Tree('NP', ['him'])])
+        >>> s = Tree('S', [Tree('NP', ['I']), vp])
+        >>> print(s)
+        (S (NP I) (VP (V saw) (NP him)))
+        >>> print(s[1])
+        (VP (V saw) (NP him))
+        >>> print(s[1,1])
+        (NP him)
+        >>> t = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
+        >>> s == t
+        True
+        >>> t[1][1].set_label('X')
+        >>> t[1][1].label()
+        'X'
+        >>> print(t)
+        (S (NP I) (VP (V saw) (X him)))
+        >>> t[0], t[1,1] = t[1,1], t[0]
+        >>> print(t)
+        (S (X him) (VP (V saw) (NP I)))
+
+    The length of a tree is the number of children it has.
+
+        >>> len(t)
+        2
+
+    The set_label() and label() methods allow individual constituents
+    to be labeled.  For example, syntax trees use this label to specify
+    phrase tags, such as "NP" and "VP".
+
+    Several Tree methods use "tree positions" to specify
+    children or descendants of a tree.  Tree positions are defined as
+    follows:
+
+      - The tree position *i* specifies a Tree's *i*\ th child.
+      - The tree position ``()`` specifies the Tree itself.
+      - If *p* is the tree position of descendant *d*, then
+        *p+i* specifies the *i*\ th child of *d*.
+
+    I.e., every tree position is either a single index *i*,
+    specifying ``tree[i]``; or a sequence *i1, i2, ..., iN*,
+    specifying ``tree[i1][i2]...[iN]``.
+
+    Construct a new tree.  This constructor can be called in one
+    of two ways:
+
+    - ``Tree(label, children)`` constructs a new tree with the
+        specified label and list of children.
+
+    - ``Tree.fromstring(s)`` constructs a new tree by parsing the string ``s``.
+    """
+    def __init__(self, node, children=None):
+        if children is None:
+            raise TypeError("%s: Expected a node value and child list "
+                                % type(self).__name__)
+        elif isinstance(children, string_types):
+            raise TypeError("%s() argument 2 should be a list, not a "
+                            "string" % type(self).__name__)
+        else:
+            list.__init__(self, children)
+            self._label = node
+
+    #////////////////////////////////////////////////////////////
+    # Comparison operators
+    #////////////////////////////////////////////////////////////
+
+    def __eq__(self, other):
+        return (self.__class__ is other.__class__ and
+                (self._label, list(self)) == (other._label, list(other)))
+
+    def __lt__(self, other):
+        if not isinstance(other, Tree):
+            # raise_unorderable_types("<", self, other)
+            # Sometimes children can be pure strings,
+            # so we need to be able to compare with non-trees:
+            return self.__class__.__name__ < other.__class__.__name__
+        elif self.__class__ is other.__class__:
+            return (self._label, list(self)) < (other._label, list(other))
+        else:
+            return self.__class__.__name__ < other.__class__.__name__
+
+    # @total_ordering doesn't work here, since the class inherits from a builtin class
+    __ne__ = lambda self, other: not self == other
+    __gt__ = lambda self, other: not (self < other or self == other)
+    __le__ = lambda self, other: self < other or self == other
+    __ge__ = lambda self, other: not self < other
+
+    #////////////////////////////////////////////////////////////
+    # Disabled list operations
+    #////////////////////////////////////////////////////////////
+
+    def __mul__(self, v):
+        raise TypeError('Tree does not support multiplication')
+    def __rmul__(self, v):
+        raise TypeError('Tree does not support multiplication')
+    def __add__(self, v):
+        raise TypeError('Tree does not support addition')
+    def __radd__(self, v):
+        raise TypeError('Tree does not support addition')
+
+    #////////////////////////////////////////////////////////////
+    # Indexing (with support for tree positions)
+    #////////////////////////////////////////////////////////////
+
+    def __getitem__(self, index):
+        if isinstance(index, (int, slice)):
+            return list.__getitem__(self, index)
+        elif isinstance(index, (list, tuple)):
+            if len(index) == 0:
+                return self
+            elif len(index) == 1:
+                return self[index[0]]
+            else:
+                return self[index[0]][index[1:]]
+        else:
+            raise TypeError("%s indices must be integers, not %s" %
+                            (type(self).__name__, type(index).__name__))
+
+    def __setitem__(self, index, value):
+        if isinstance(index, (int, slice)):
+            return list.__setitem__(self, index, value)
+        elif isinstance(index, (list, tuple)):
+            if len(index) == 0:
+                raise IndexError('The tree position () may not be '
+                                 'assigned to.')
+            elif len(index) == 1:
+                self[index[0]] = value
+            else:
+                self[index[0]][index[1:]] = value
+        else:
+            raise TypeError("%s indices must be integers, not %s" %
+                            (type(self).__name__, type(index).__name__))
+
+    def __delitem__(self, index):
+        if isinstance(index, (int, slice)):
+            return list.__delitem__(self, index)
+        elif isinstance(index, (list, tuple)):
+            if len(index) == 0:
+                raise IndexError('The tree position () may not be deleted.')
+            elif len(index) == 1:
+                del self[index[0]]
+            else:
+                del self[index[0]][index[1:]]
+        else:
+            raise TypeError("%s indices must be integers, not %s" %
+                            (type(self).__name__, type(index).__name__))
+
+    #////////////////////////////////////////////////////////////
+    # Basic tree operations
+    #////////////////////////////////////////////////////////////
+
+    def _get_node(self):
+        """Outdated method to access the node value; use the label() method instead."""
+        raise NotImplementedError("Use label() to access a node label.")
+    def _set_node(self, value):
+        """Outdated method to set the node value; use the set_label() method instead."""
+        raise NotImplementedError("Use set_label() method to set a node label.")
+    node = property(_get_node, _set_node)
+
+    def label(self):
+        """
+        Return the node label of the tree.
+
+            >>> t = Tree.fromstring('(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))')
+            >>> t.label()
+            'S'
+
+        :return: the node label (typically a string)
+        :rtype: any
+        """
+        return self._label
+
+    def set_label(self, label):
+        """
+        Set the node label of the tree.
+
+            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
+            >>> t.set_label("T")
+            >>> print(t)
+            (T (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))
+
+        :param label: the node label (typically a string)
+        :type label: any
+        """
+        self._label = label
+
+    def leaves(self):
+        """
+        Return the leaves of the tree.
+
+            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
+            >>> t.leaves()
+            ['the', 'dog', 'chased', 'the', 'cat']
+
+        :return: a list containing this tree's leaves.
+            The order reflects the order of the
+            leaves in the tree's hierarchical structure.
+        :rtype: list
+        """
+        leaves = []
+        for child in self:
+            if isinstance(child, Tree):
+                leaves.extend(child.leaves())
+            else:
+                leaves.append(child)
+        return leaves
+
+    def flatten(self):
+        """
+        Return a flat version of the tree, with all non-root non-terminals removed.
+
+            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
+            >>> print(t.flatten())
+            (S the dog chased the cat)
+
+        :return: a tree consisting of this tree's root connected directly to
+            its leaves, omitting all intervening non-terminal nodes.
+        :rtype: Tree
+        """
+        return Tree(self.label(), self.leaves())
+
+    def height(self):
+        """
+        Return the height of the tree.
+
+            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
+            >>> t.height()
+            5
+            >>> print(t[0,0])
+            (D the)
+            >>> t[0,0].height()
+            2
+
+        :return: The height of this tree.  The height of a tree
+            containing no children is 1; the height of a tree
+            containing only leaves is 2; and the height of any other
+            tree is one plus the maximum of its children's
+            heights.
+        :rtype: int
+        """
+        max_child_height = 0
+        for child in self:
+            if isinstance(child, Tree):
+                max_child_height = max(max_child_height, child.height())
+            else:
+                max_child_height = max(max_child_height, 1)
+        return 1 + max_child_height
+
+    def treepositions(self, order='preorder'):
+        """
+            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
+            >>> t.treepositions() # doctest: +ELLIPSIS
+            [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), ...]
+            >>> for pos in t.treepositions('leaves'):
+            ...     t[pos] = t[pos][::-1].upper()
+            >>> print(t)
+            (S (NP (D EHT) (N GOD)) (VP (V DESAHC) (NP (D EHT) (N TAC))))
+
+        :param order: One of: ``preorder``, ``postorder``, ``bothorder``,
+            ``leaves``.
+        """
+        positions = []
+        if order in ('preorder', 'bothorder'): positions.append( () )
+        for i, child in enumerate(self):
+            if isinstance(child, Tree):
+                childpos = child.treepositions(order)
+                positions.extend((i,)+p for p in childpos)
+            else:
+                positions.append( (i,) )
+        if order in ('postorder', 'bothorder'): positions.append( () )
+        return positions
+
+    def subtrees(self, filter=None):
+        """
+        Generate all the subtrees of this tree, optionally restricted
+        to trees matching the filter function.
+
+            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
+            >>> for s in t.subtrees(lambda t: t.height() == 2):
+            ...     print(s)
+            (D the)
+            (N dog)
+            (V chased)
+            (D the)
+            (N cat)
+
+        :type filter: function
+        :param filter: the function to filter all local trees
+        """
+        if not filter or filter(self):
+            yield self
+        for child in self:
+            if isinstance(child, Tree):
+                for subtree in child.subtrees(filter):
+                    yield subtree
+
+    def productions(self):
+        """
+        Generate the productions that correspond to the non-terminal nodes of the tree.
+        For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the
+        form P -> C1 C2 ... Cn.
+
+            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
+            >>> t.productions()
+            [S -> NP VP, NP -> D N, D -> 'the', N -> 'dog', VP -> V NP, V -> 'chased',
+            NP -> D N, D -> 'the', N -> 'cat']
+
+        :rtype: list(Production)
+        """
+
+        if not isinstance(self._label, string_types):
+            raise TypeError('Productions can only be generated from trees having node labels that are strings')
+
+        prods = [Production(Nonterminal(self._label), _child_names(self))]
+        for child in self:
+            if isinstance(child, Tree):
+                prods += child.productions()
+        return prods
+
+    def pos(self):
+        """
+        Return a sequence of pos-tagged words extracted from the tree.
+
+            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
+            >>> t.pos()
+            [('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')]
+
+        :return: a list of tuples containing leaves and pre-terminals (part-of-speech tags).
+            The order reflects the order of the leaves in the tree's hierarchical structure.
+        :rtype: list(tuple)
+        """
+        pos = []
+        for child in self:
+            if isinstance(child, Tree):
+                pos.extend(child.pos())
+            else:
+                pos.append((child, self._label))
+        return pos
+
+    def leaf_treeposition(self, index):
+        """
+        :return: The tree position of the ``index``-th leaf in this
+            tree.  I.e., if ``tp=self.leaf_treeposition(i)``, then
+            ``self[tp]==self.leaves()[i]``.
+
+        :raise IndexError: If this tree contains fewer than ``index+1``
+            leaves, or if ``index<0``.
+        """
+        if index < 0: raise IndexError('index must be non-negative')
+
+        stack = [(self, ())]
+        while stack:
+            value, treepos = stack.pop()
+            if not isinstance(value, Tree):
+                if index == 0: return treepos
+                else: index -= 1
+            else:
+                for i in range(len(value)-1, -1, -1):
+                    stack.append( (value[i], treepos+(i,)) )
+
+        raise IndexError('index must be less than or equal to len(self)')
+
+    def treeposition_spanning_leaves(self, start, end):
+        """
+        :return: The tree position of the lowest descendant of this
+            tree that dominates ``self.leaves()[start:end]``.
+        :raise ValueError: if ``end <= start``
+        """
+        if end <= start:
+            raise ValueError('end must be greater than start')
+        # Find the tree positions of the start & end leaves, and
+        # take the longest common subsequence.
+        start_treepos = self.leaf_treeposition(start)
+        end_treepos = self.leaf_treeposition(end-1)
+        # Find the first index where they mismatch:
+        for i in range(len(start_treepos)):
+            if i == len(end_treepos) or start_treepos[i] != end_treepos[i]:
+                return start_treepos[:i]
+        return start_treepos
+
+    #////////////////////////////////////////////////////////////
+    # Transforms
+    #////////////////////////////////////////////////////////////
+
+    def chomsky_normal_form(self, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^"):
+        """
+        This method can modify a tree in three ways:
+
+          1. Convert a tree into its Chomsky Normal Form (CNF)
+             equivalent -- Every subtree has either two non-terminals
+             or one terminal as its children.  This process requires
+             the creation of more "artificial" non-terminal nodes.
+          2. Markov (vertical) smoothing of children in new artificial
+             nodes
+          3. Horizontal (parent) annotation of nodes
+
+        :param factor: Right or left factoring method (default = "right")
+        :type  factor: str = [left|right]
+        :param horzMarkov: Markov order for sibling smoothing in artificial nodes (None (default) = include all siblings)
+        :type  horzMarkov: int | None
+        :param vertMarkov: Markov order for parent smoothing (0 (default) = no vertical annotation)
+        :type  vertMarkov: int | None
+        :param childChar: A string used in construction of the artificial nodes, separating the head of the
+                          original subtree from the child nodes that have yet to be expanded (default = "|")
+        :type  childChar: str
+        :param parentChar: A string used to separate the node representation from its vertical annotation
+        :type  parentChar: str
+        """
+        from nltk.treetransforms import chomsky_normal_form
+        chomsky_normal_form(self, factor, horzMarkov, vertMarkov, childChar, parentChar)
+
+    def un_chomsky_normal_form(self, expandUnary = True, childChar = "|", parentChar = "^", unaryChar = "+"):
+        """
+        This method modifies the tree in three ways:
+
+          1. Transforms a tree in Chomsky Normal Form back to its
+             original structure (branching greater than two)
+          2. Removes any parent annotation (if it exists)
+          3. (optional) expands unary subtrees (if previously
+             collapsed with collapseUnary(...) )
+
+        :param expandUnary: Flag to expand unary or not (default = True)
+        :type  expandUnary: bool
+        :param childChar: A string separating the head node from its children in an artificial node (default = "|")
+        :type  childChar: str
+        :param parentChar: A string separating the node label from its parent annotation (default = "^")
+        :type  parentChar: str
+        :param unaryChar: A string joining two non-terminals in a unary production (default = "+")
+        :type  unaryChar: str
+        """
+        from nltk.treetransforms import un_chomsky_normal_form
+        un_chomsky_normal_form(self, expandUnary, childChar, parentChar, unaryChar)
+
+    def collapse_unary(self, collapsePOS = False, collapseRoot = False, joinChar = "+"):
+        """
+        Collapse subtrees with a single child (i.e. unary productions)
+        into a new non-terminal (Tree node) joined by 'joinChar'.
+        This is useful when working with algorithms that do not allow
+        unary productions, and completely removing the unary productions
+        would require loss of useful information.  The Tree is modified
+        directly (since it is passed by reference) and no value is returned.
+
+        :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (i.e.
+                            Part-of-Speech tags) since they are always unary productions
+        :type  collapsePOS: bool
+        :param collapseRoot: 'False' (default) will not modify the root production
+                             if it is unary.  For the Penn WSJ treebank corpus, this corresponds
+                             to the TOP -> productions.
+        :type collapseRoot: bool
+        :param joinChar: A string used to connect collapsed node values (default = "+")
+        :type  joinChar: str
+        """
+        from nltk.treetransforms import collapse_unary
+        collapse_unary(self, collapsePOS, collapseRoot, joinChar)
+
+    #////////////////////////////////////////////////////////////
+    # Convert, copy
+    #////////////////////////////////////////////////////////////
+
+    @classmethod
+    def convert(cls, tree):
+        """
+        Convert a tree between different subtypes of Tree.  ``cls`` determines
+        which class will be used to encode the new tree.
+
+        :type tree: Tree
+        :param tree: The tree that should be converted.
+        :return: The new Tree.
+        """
+        if isinstance(tree, Tree):
+            children = [cls.convert(child) for child in tree]
+            return cls(tree._label, children)
+        else:
+            return tree
+
+    def copy(self, deep=False):
+        if not deep: return type(self)(self._label, self)
+        else: return type(self).convert(self)
+
+    def _frozen_class(self): return ImmutableTree
+    def freeze(self, leaf_freezer=None):
+        frozen_class = self._frozen_class()
+        if leaf_freezer is None:
+            newcopy = frozen_class.convert(self)
+        else:
+            newcopy = self.copy(deep=True)
+            for pos in newcopy.treepositions('leaves'):
+                newcopy[pos] = leaf_freezer(newcopy[pos])
+            newcopy = frozen_class.convert(newcopy)
+        hash(newcopy) # Make sure the leaves are hashable.
+        return newcopy
+
+    #////////////////////////////////////////////////////////////
+    # Parsing
+    #////////////////////////////////////////////////////////////
+
+    @classmethod
+    def fromstring(cls, s, brackets='()', read_node=None, read_leaf=None,
+              node_pattern=None, leaf_pattern=None,
+              remove_empty_top_bracketing=False):
+        """
+        Read a bracketed tree string and return the resulting tree.
+        Trees are represented as nested brackettings, such as::
+
+          (S (NP (NNP John)) (VP (V runs)))
+
+        :type s: str
+        :param s: The string to read
+
+        :type brackets: str (length=2)
+        :param brackets: The bracket characters used to mark the
+            beginning and end of trees and subtrees.
+
+        :type read_node: function
+        :type read_leaf: function
+        :param read_node, read_leaf: If specified, these functions
+            are applied to the substrings of ``s`` corresponding to
+            nodes and leaves (respectively) to obtain the values for
+            those nodes and leaves.  They should have the following
+            signature:
+
+               read_node(str) -> value
+
+            For example, these functions could be used to process nodes
+            and leaves whose values should be some type other than
+            string (such as ``FeatStruct``).
+            Note that by default, node strings and leaf strings are
+            delimited by whitespace and brackets; to override this
+            default, use the ``node_pattern`` and ``leaf_pattern``
+            arguments.
+
+        :type node_pattern: str
+        :type leaf_pattern: str
+        :param node_pattern, leaf_pattern: Regular expression patterns
+            used to find node and leaf substrings in ``s``.  By
+            default, both node patterns are defined to match any
+            sequence of non-whitespace non-bracket characters.
+
+        :type remove_empty_top_bracketing: bool
+        :param remove_empty_top_bracketing: If the resulting tree has
+            an empty node label, and is length one, then return its
+            single child instead.  This is useful for treebank trees,
+            which sometimes contain an extra level of bracketing.
+
+        :return: A tree corresponding to the string representation ``s``.
+            If this class method is called using a subclass of Tree,
+            then it will return a tree of that type.
+        :rtype: Tree
+        """
+        if not isinstance(brackets, string_types) or len(brackets) != 2:
+            raise TypeError('brackets must be a length-2 string')
+        if re.search('\s', brackets):
+            raise TypeError('whitespace brackets not allowed')
+        # Construct a regexp that will tokenize the string.
+        open_b, close_b = brackets
+        open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b))
+        if node_pattern is None:
+            node_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
+        if leaf_pattern is None:
+            leaf_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
+        token_re = re.compile('%s\s*(%s)?|%s|(%s)' % (
+            open_pattern, node_pattern, close_pattern, leaf_pattern))
+        # Walk through each token, updating a stack of trees.
+        stack = [(None, [])] # list of (node, children) tuples
+        for match in token_re.finditer(s):
+            token = match.group()
+            # Beginning of a tree/subtree
+            if token[0] == open_b:
+                if len(stack) == 1 and len(stack[0][1]) > 0:
+                    cls._parse_error(s, match, 'end-of-string')
+                label = token[1:].lstrip()
+                if read_node is not None: label = read_node(label)
+                stack.append((label, []))
+            # End of a tree/subtree
+            elif token == close_b:
+                if len(stack) == 1:
+                    if len(stack[0][1]) == 0:
+                        cls._parse_error(s, match, open_b)
+                    else:
+                        cls._parse_error(s, match, 'end-of-string')
+                label, children = stack.pop()
+                stack[-1][1].append(cls(label, children))
+            # Leaf node
+            else:
+                if len(stack) == 1:
+                    cls._parse_error(s, match, open_b)
+                if read_leaf is not None: token = read_leaf(token)
+                stack[-1][1].append(token)
+
+        # check that we got exactly one complete tree.
+        if len(stack) > 1:
+            cls._parse_error(s, 'end-of-string', close_b)
+        elif len(stack[0][1]) == 0:
+            cls._parse_error(s, 'end-of-string', open_b)
+        else:
+            assert stack[0][0] is None
+            assert len(stack[0][1]) == 1
+        tree = stack[0][1][0]
+
+        # If the tree has an extra level with node='', then get rid of
+        # it.  E.g.: "((S (NP ...) (VP ...)))"
+        if remove_empty_top_bracketing and tree._label == '' and len(tree) == 1:
+            tree = tree[0]
+        # return the tree.
+        return tree
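+    # Illustrative sketch (editor comment, not upstream NLTK code):
+    #
+    #   t = Tree.fromstring('(S (NP (NNP John)) (VP (V runs)))')
+    #   t.label()                          # -> 'S'
+    #   t.leaves()                         # -> ['John', 'runs']
+    #
+    #   # Custom brackets and a leaf transform:
+    #   t2 = Tree.fromstring('[S [NP mary] [VP walks]]', brackets='[]',
+    #                        read_leaf=lambda w: w.upper())
+    #   t2.leaves()                        # -> ['MARY', 'WALKS']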
+
+    @classmethod
+    def _parse_error(cls, s, match, expecting):
+        """
+        Display a friendly error message when parsing a tree string fails.
+        :param s: The string we're parsing.
+        :param match: regexp match of the problem token.
+        :param expecting: what we expected to see instead.
+        """
+        # Construct a basic error message
+        if match == 'end-of-string':
+            pos, token = len(s), 'end-of-string'
+        else:
+            pos, token = match.start(), match.group()
+        msg = '%s.read(): expected %r but got %r\n%sat index %d.' % (
+            cls.__name__, expecting, token, ' '*12, pos)
+        # Add a display showing the error token itself:
+        s = s.replace('\n', ' ').replace('\t', ' ')
+        offset = pos
+        if len(s) > pos+10:
+            s = s[:pos+10]+'...'
+        if pos > 10:
+            s = '...'+s[pos-10:]
+            offset = 13
+        msg += '\n%s"%s"\n%s^' % (' '*16, s, ' '*(17+offset))
+        raise ValueError(msg)
+
+    #////////////////////////////////////////////////////////////
+    # Visualization & String Representation
+    #////////////////////////////////////////////////////////////
+
+    def draw(self):
+        """
+        Open a new window containing a graphical diagram of this tree.
+        """
+        from nltk.draw.tree import draw_trees
+        draw_trees(self)
+
+    def pretty_print(self, sentence=None, highlight=(), stream=None, **kwargs):
+        """
+        Pretty-print this tree as ASCII or Unicode art.
+        For explanation of the arguments, see the documentation for
+        `nltk.treeprettyprinter.TreePrettyPrinter`.
+        """
+        from nltk.treeprettyprinter import TreePrettyPrinter
+        print(TreePrettyPrinter(self, sentence, highlight).text(**kwargs),
+              file=stream)
+
+    def __repr__(self):
+        childstr = ", ".join(unicode_repr(c) for c in self)
+        return '%s(%s, [%s])' % (type(self).__name__, unicode_repr(self._label), childstr)
+
+    def _repr_png_(self):
+        """
+        Draws and outputs in PNG for ipython.
+        PNG is used instead of PDF, since it can be displayed in the qt console and
+        has wider browser support.
+        """
+        import os
+        import base64
+        import subprocess
+        import tempfile
+        from nltk.draw.tree import tree_to_treesegment
+        from nltk.draw.util import CanvasFrame
+        from nltk.internals import find_binary
+        _canvas_frame = CanvasFrame()
+        widget = tree_to_treesegment(_canvas_frame.canvas(), self)
+        _canvas_frame.add_widget(widget)
+        x, y, w, h = widget.bbox()
+        # print_to_file uses scrollregion to set the width and height of the pdf.
+        _canvas_frame.canvas()['scrollregion'] = (0, 0, w, h)
+        with tempfile.NamedTemporaryFile() as file:
+            in_path = '{0:}.ps'.format(file.name)
+            out_path = '{0:}.png'.format(file.name)
+            _canvas_frame.print_to_file(in_path)
+            _canvas_frame.destroy_widget(widget)
+            subprocess.call([find_binary('gs', binary_names=['gswin32c.exe', 'gswin64c.exe'], env_vars=['PATH'], verbose=False)] +
+                            '-q -dEPSCrop -sDEVICE=png16m -r90 -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dSAFER -dBATCH -dNOPAUSE -sOutputFile={0:} {1:}'
+                            .format(out_path, in_path).split())
+            with open(out_path, 'rb') as sr:
+                res = sr.read()
+            os.remove(in_path)
+            os.remove(out_path)
+            return base64.b64encode(res).decode()
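+    # Editor note (comment only): IPython/Jupyter invokes _repr_png_()
+    # automatically, so evaluating a Tree in a notebook cell renders the
+    # diagram, provided Ghostscript ('gs') is installed and on the PATH.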
+
+    def __str__(self):
+        return self.pformat()
+
+    def pprint(self, **kwargs):
+        """
+        Print a string representation of this Tree to 'stream'
+        """
+
+        if "stream" in kwargs:
+            stream = kwargs["stream"]
+            del kwargs["stream"]
+        else:
+            stream = None
+        print(self.pformat(**kwargs), file=stream)
+
+    def pformat(self, margin=70, indent=0, nodesep='', parens='()', quotes=False):
+        """
+        :return: A pretty-printed string representation of this tree.
+        :rtype: str
+        :param margin: The right margin at which to do line-wrapping.
+        :type margin: int
+        :param indent: The indentation level at which printing
+            begins.  This number is used to decide how far to indent
+            subsequent lines.
+        :type indent: int
+        :param nodesep: A string that is used to separate the node
+            from the children (the default is the empty string).  E.g.,
+            the value ``':'`` gives trees like
+            ``(S: (NP: I) (VP: (V: saw) (NP: it)))``.
+        """
+
+        # Try writing it on one line.
+        s = self._pformat_flat(nodesep, parens, quotes)
+        if len(s) + indent < margin:
+            return s
+
+        # If it doesn't fit on one line, then write it on multi-lines.
+        if isinstance(self._label, string_types):
+            s = '%s%s%s' % (parens[0], self._label, nodesep)
+        else:
+            s = '%s%s%s' % (parens[0], unicode_repr(self._label), nodesep)
+        for child in self:
+            if isinstance(child, Tree):
+                s += '\n'+' '*(indent+2)+child.pformat(margin, indent+2,
+                                                  nodesep, parens, quotes)
+            elif isinstance(child, tuple):
+                s += '\n'+' '*(indent+2)+ "/".join(child)
+            elif isinstance(child, string_types) and not quotes:
+                s += '\n'+' '*(indent+2)+ '%s' % child
+            else:
+                s += '\n'+' '*(indent+2)+ unicode_repr(child)
+        return s+parens[1]
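+    # Illustrative sketch (editor comment, not upstream NLTK code):
+    #
+    #   t = Tree.fromstring('(S (NP I) (VP (V saw) (NP it)))')
+    #   t.pformat()                          # fits margin: '(S (NP I) (VP (V saw) (NP it)))'
+    #   t.pformat(margin=10)                 # wrapped, two extra spaces per level
+    #   t.pformat(nodesep=':', parens='[]')  # '[S: [NP: I] [VP: [V: saw] [NP: it]]]'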
+
+    def pformat_latex_qtree(self):
+        r"""
+        Returns a representation of the tree compatible with the
+        LaTeX qtree package. This consists of the string ``\Tree``
+        followed by the tree represented in bracketed notation.
+
+        For example, the following result was generated from a parse tree of
+        the sentence ``The announcement astounded us``::
+
+          \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ]
+              [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ]
+
+        See http://www.ling.upenn.edu/advice/latex.html for the LaTeX
+        style file for the qtree package.
+
+        :return: A latex qtree representation of this tree.
+        :rtype: str
+        """
+        reserved_chars = re.compile('([#\$%&~_\{\}])')
+
+        pformat = self.pformat(indent=6, nodesep='', parens=('[.', ' ]'))
+        return r'\Tree ' + re.sub(reserved_chars, r'\\\1', pformat)
+
+    def _pformat_flat(self, nodesep, parens, quotes):
+        childstrs = []
+        for child in self:
+            if isinstance(child, Tree):
+                childstrs.append(child._pformat_flat(nodesep, parens, quotes))
+            elif isinstance(child, tuple):
+                childstrs.append("/".join(child))
+            elif isinstance(child, string_types) and not quotes:
+                childstrs.append('%s' % child)
+            else:
+                childstrs.append(unicode_repr(child))
+        if isinstance(self._label, string_types):
+            return '%s%s%s %s%s' % (parens[0], self._label, nodesep,
+                                    " ".join(childstrs), parens[1])
+        else:
+            return '%s%s%s %s%s' % (parens[0], unicode_repr(self._label), nodesep,
+                                    " ".join(childstrs), parens[1])
+
+
+class ImmutableTree(Tree):
+    def __init__(self, node, children=None):
+        super(ImmutableTree, self).__init__(node, children)
+        # Precompute our hash value.  This ensures that we're really
+        # immutable.  It also means we only have to calculate it once.
+        try:
+            self._hash = hash((self._label, tuple(self)))
+        except (TypeError, ValueError):
+            raise ValueError("%s: node value and children "
+                             "must be immutable" % type(self).__name__)
+
+    def __setitem__(self, index, value):
+        raise ValueError('%s may not be modified' % type(self).__name__)
+    def __setslice__(self, i, j, value):
+        raise ValueError('%s may not be modified' % type(self).__name__)
+    def __delitem__(self, index):
+        raise ValueError('%s may not be modified' % type(self).__name__)
+    def __delslice__(self, i, j):
+        raise ValueError('%s may not be modified' % type(self).__name__)
+    def __iadd__(self, other):
+        raise ValueError('%s may not be modified' % type(self).__name__)
+    def __imul__(self, other):
+        raise ValueError('%s may not be modified' % type(self).__name__)
+    def append(self, v):
+        raise ValueError('%s may not be modified' % type(self).__name__)
+    def extend(self, v):
+        raise ValueError('%s may not be modified' % type(self).__name__)
+    def pop(self, v=None):
+        raise ValueError('%s may not be modified' % type(self).__name__)
+    def remove(self, v):
+        raise ValueError('%s may not be modified' % type(self).__name__)
+    def reverse(self):
+        raise ValueError('%s may not be modified' % type(self).__name__)
+    def sort(self):
+        raise ValueError('%s may not be modified' % type(self).__name__)
+    def __hash__(self):
+        return self._hash
+
+    def set_label(self, value):
+        """
+        Set the node label.  This will only succeed the first time the
+        node label is set, which should occur in ImmutableTree.__init__().
+        """
+        if hasattr(self, '_label'):
+            raise ValueError('%s may not be modified' % type(self).__name__)
+        self._label = value
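+    # Illustrative sketch (editor comment, not upstream NLTK code):
+    #
+    #   it = ImmutableTree.fromstring('(NP (DT the) (NN dog))')
+    #   hash(it)                           # works: the hash is precomputed in __init__
+    #   it.append('!')                     # raises ValueError: may not be modified
+    #   it.set_label('X')                  # raises ValueError once _label is set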
+
+
+######################################################################
+## Parented trees
+######################################################################
+@add_metaclass(ABCMeta)
+class AbstractParentedTree(Tree):
+    """
+    An abstract base class for a ``Tree`` that automatically maintains
+    pointers to parent nodes.  These parent pointers are updated
+    whenever any change is made to a tree's structure.  Two subclasses
+    are currently defined:
+
+      - ``ParentedTree`` is used for tree structures where each subtree
+        has at most one parent.  This class should be used in cases
+        where there is no "sharing" of subtrees.
+
+      - ``MultiParentedTree`` is used for tree structures where a
+        subtree may have zero or more parents.  This class should be
+        used in cases where subtrees may be shared.
+
+    Subclassing
+    ===========
+    The ``AbstractParentedTree`` class redefines all operations that
+    modify a tree's structure to call two methods, which are used by
+    subclasses to update parent information:
+
+      - ``_setparent()`` is called whenever a new child is added.
+      - ``_delparent()`` is called whenever a child is removed.
+    """
+
+    def __init__(self, node, children=None):
+        super(AbstractParentedTree, self).__init__(node, children)
+        # If children is None, the tree is read from node, and
+        # all parents will be set during parsing.
+        if children is not None:
+            # Otherwise we have to set the parent of the children.
+            # Iterate over self, and *not* children, because children
+            # might be an iterator.
+            for i, child in enumerate(self):
+                if isinstance(child, Tree):
+                    self._setparent(child, i, dry_run=True)
+            for i, child in enumerate(self):
+                if isinstance(child, Tree):
+                    self._setparent(child, i)
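+    # Illustrative sketch (editor comment, not upstream NLTK code): a concrete
+    # subclass only needs to implement the two hooks declared below, e.g. to
+    # log structural edits:
+    #
+    #   class LoggingTree(AbstractParentedTree):
+    #       def _setparent(self, child, index, dry_run=False):
+    #           if not dry_run:
+    #               print('attach %s at %d' % (child.label(), index))
+    #       def _delparent(self, child, index):
+    #           print('detach %s from %d' % (child.label(), index))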
+
+    #////////////////////////////////////////////////////////////
+    # Parent management
+    #////////////////////////////////////////////////////////////
+    @abstractmethod
+    def _setparent(self, child, index, dry_run=False):
+        """
+        Update the parent pointer of ``child`` to point to ``self``.  This
+        method is only called if the type of ``child`` is ``Tree``;
+        i.e., it is not called when adding a leaf to a tree.  This method
+        is always called before the child is actually added to the
+        child list of ``self``.
+
+        :type child: Tree
+        :type index: int
+        :param index: The index of ``child`` in ``self``.
+        :raise TypeError: If ``child`` is a tree with an inappropriate
+            type.  Typically, if ``child`` is a tree, then its type needs
+            to match the type of ``self``.  This prevents mixing of
+            different tree types (single-parented, multi-parented, and
+            non-parented).
+        :param dry_run: If true, then don't actually set the child's
+            parent pointer; just check for any error conditions, and
+            raise an exception if one is found.
+        """
+
+    @abstractmethod
+    def _delparent(self, child, index):
+        """
+        Update the parent pointer of ``child`` to not point to self.  This
+        method is only called if the type of ``child`` is ``Tree``; i.e., it
+        is not called when removing a leaf from a tree.  This method
+        is always called before the child is actually removed from the
+        child list of ``self``.
+
+        :type child: Tree
+        :type index: int
+        :param index: The index of ``child`` in ``self``.
+        """
+
+    #////////////////////////////////////////////////////////////
+    # Methods that add/remove children
+    #////////////////////////////////////////////////////////////
+    # Every method that adds or removes a child must make
+    # appropriate calls to _setparent() and _delparent().
+
+    def __delitem__(self, index):
+        # del ptree[start:stop]
+        if isinstance(index, slice):
+            start, stop, step = slice_bounds(self, index, allow_step=True)
+            # Clear all the children pointers.
+            for i in range(start, stop, step):
+                if isinstance(self[i], Tree):
+                    self._delparent(self[i], i)
+            # Delete the children from our child list.
+            super(AbstractParentedTree, self).__delitem__(index)
+
+        # del ptree[i]
+        elif isinstance(index, int):
+            if index < 0: index += len(self)
+            if index < 0: raise IndexError('index out of range')
+            # Clear the child's parent pointer.
+            if isinstance(self[index], Tree):
+                self._delparent(self[index], index)
+            # Remove the child from our child list.
+            super(AbstractParentedTree, self).__delitem__(index)
+
+        elif isinstance(index, (list, tuple)):
+            # del ptree[()]
+            if len(index) == 0:
+                raise IndexError('The tree position () may not be deleted.')
+            # del ptree[(i,)]
+            elif len(index) == 1:
+                del self[index[0]]
+            # del ptree[i1, i2, i3]
+            else:
+                del self[index[0]][index[1:]]
+
+        else:
+            raise TypeError("%s indices must be integers, not %s" %
+                            (type(self).__name__, type(index).__name__))
+
+    def __setitem__(self, index, value):
+        # ptree[start:stop] = value
+        if isinstance(index, slice):
+            start, stop, step = slice_bounds(self, index, allow_step=True)
+            # make a copy of value, in case it's an iterator
+            if not isinstance(value, (list, tuple)):
+                value = list(value)
+            # Check for any error conditions, so we can avoid ending
+            # up in an inconsistent state if an error does occur.
+            for i, child in enumerate(value):
+                if isinstance(child, Tree):
+                    self._setparent(child, start + i*step, dry_run=True)
+            # clear the child pointers of all parents we're removing
+            for i in range(start, stop, step):
+                if isinstance(self[i], Tree):
+                    self._delparent(self[i], i)
+            # set the child pointers of the new children.  We do this
+            # after clearing *all* child pointers, in case we're e.g.
+            # reversing the elements in a tree.
+            for i, child in enumerate(value):
+                if isinstance(child, Tree):
+                    self._setparent(child, start + i*step)
+            # finally, update the content of the child list itself.
+            super(AbstractParentedTree, self).__setitem__(index, value)
+
+        # ptree[i] = value
+        elif isinstance(index, int):
+            if index < 0: index += len(self)
+            if index < 0: raise IndexError('index out of range')
+            # if the value is not changing, do nothing.
+            if value is self[index]:
+                return
+            # Set the new child's parent pointer.
+            if isinstance(value, Tree):
+                self._setparent(value, index)
+            # Remove the old child's parent pointer
+            if isinstance(self[index], Tree):
+                self._delparent(self[index], index)
+            # Update our child list.
+            super(AbstractParentedTree, self).__setitem__(index, value)
+
+        elif isinstance(index, (list, tuple)):
+            # ptree[()] = value
+            if len(index) == 0:
+                raise IndexError('The tree position () may not be assigned to.')
+            # ptree[(i,)] = value
+            elif len(index) == 1:
+                self[index[0]] = value
+            # ptree[i1, i2, i3] = value
+            else:
+                self[index[0]][index[1:]] = value
+
+        else:
+            raise TypeError("%s indices must be integers, not %s" %
+                            (type(self).__name__, type(index).__name__))
+
+    def append(self, child):
+        if isinstance(child, Tree):
+            self._setparent(child, len(self))
+        super(AbstractParentedTree, self).append(child)
+
+    def extend(self, children):
+        for child in children:
+            if isinstance(child, Tree):
+                self._setparent(child, len(self))
+            super(AbstractParentedTree, self).append(child)
+
+    def insert(self, index, child):
+        # Handle negative indexes.  Note that if index < -len(self),
+        # we do *not* raise an IndexError, unlike __getitem__.  This
+        # is done for consistency with list.__getitem__ and list.index.
+        if index < 0: index += len(self)
+        if index < 0: index = 0
+        # Set the child's parent, and update our child list.
+        if isinstance(child, Tree):
+            self._setparent(child, index)
+        super(AbstractParentedTree, self).insert(index, child)
+
+    def pop(self, index=-1):
+        if index < 0: index += len(self)
+        if index < 0: raise IndexError('index out of range')
+        if isinstance(self[index], Tree):
+            self._delparent(self[index], index)
+        return super(AbstractParentedTree, self).pop(index)
+
+    # n.b.: like `list`, this is done by equality, not identity!
+    # To remove a specific child, use del ptree[i].
+    def remove(self, child):
+        index = self.index(child)
+        if isinstance(self[index], Tree):
+            self._delparent(self[index], index)
+        super(AbstractParentedTree, self).remove(child)
+
+    # We need to implement __getslice__ and friends, even though
+    # they're deprecated, because otherwise list.__getslice__ will get
+    # called (since we're subclassing from list).  Just delegate to
+    # __getitem__ etc., but use max(0, start) and max(0, stop), because
+    # negative indices are already handled *before*
+    # __getslice__ is called; and we don't want to double-count them.
+    if hasattr(list, '__getslice__'):
+        def __getslice__(self, start, stop):
+            return self.__getitem__(slice(max(0, start), max(0, stop)))
+        def __delslice__(self, start, stop):
+            return self.__delitem__(slice(max(0, start), max(0, stop)))
+        def __setslice__(self, start, stop, value):
+            return self.__setitem__(slice(max(0, start), max(0, stop)), value)
+
+class ParentedTree(AbstractParentedTree):
+    """
+    A ``Tree`` that automatically maintains parent pointers for
+    single-parented trees.  The following are methods for querying
+    the structure of a parented tree: ``parent``, ``parent_index``,
+    ``left_sibling``, ``right_sibling``, ``root``, ``treeposition``.
+
+    Each ``ParentedTree`` may have at most one parent.  In
+    particular, subtrees may not be shared.  Any attempt to reuse a
+    single ``ParentedTree`` as a child of more than one parent (or
+    as multiple children of the same parent) will cause a
+    ``ValueError`` exception to be raised.
+
+    ``ParentedTrees`` should never be used in the same tree as ``Trees``
+    or ``MultiParentedTrees``.  Mixing tree implementations may result
+    in incorrect parent pointers and in ``TypeError`` exceptions.
+    """
+    def __init__(self, node, children=None):
+        self._parent = None
+        """The parent of this Tree, or None if it has no parent."""
+        super(ParentedTree, self).__init__(node, children)
+        if children is None:
+            # If children is None, the tree is read from node.
+            # After parsing, the parent of the immediate children
+            # will point to an intermediate tree, not self.
+            # We fix this by brute force:
+            for i, child in enumerate(self):
+                if isinstance(child, Tree):
+                    child._parent = None
+                    self._setparent(child, i)
+
+    def _frozen_class(self): return ImmutableParentedTree
+
+    #/////////////////////////////////////////////////////////////////
+    # Methods
+    #/////////////////////////////////////////////////////////////////
+
+    def parent(self):
+        """The parent of this tree, or None if it has no parent."""
+        return self._parent
+
+    def parent_index(self):
+        """
+        The index of this tree in its parent.  I.e.,
+        ``ptree.parent()[ptree.parent_index()] is ptree``.  Note that
+        ``ptree.parent_index()`` is not necessarily equal to
+        ``ptree.parent().index(ptree)``, since the ``index()`` method
+        returns the first child that is equal to its argument.
+        """
+        if self._parent is None: return None
+        for i, child in enumerate(self._parent):
+            if child is self: return i
+        assert False, 'expected to find self in self._parent!'
+
+    def left_sibling(self):
+        """The left sibling of this tree, or None if it has none."""
+        parent_index = self.parent_index()
+        if self._parent and parent_index > 0:
+            return self._parent[parent_index-1]
+        return None # no left sibling
+
+    def right_sibling(self):
+        """The right sibling of this tree, or None if it has none."""
+        parent_index = self.parent_index()
+        if self._parent and parent_index < (len(self._parent)-1):
+            return self._parent[parent_index+1]
+        return None # no right sibling
+
+    def root(self):
+        """
+        The root of this tree.  I.e., the unique ancestor of this tree
+        whose parent is None.  If ``ptree.parent()`` is None, then
+        ``ptree`` is its own root.
+        """
+        root = self
+        while root.parent() is not None:
+            root = root.parent()
+        return root
+
+    def treeposition(self):
+        """
+        The tree position of this tree, relative to the root of the
+        tree.  I.e., ``ptree.root[ptree.treeposition] is ptree``.
+        """
+        if self.parent() is None:
+            return ()
+        else:
+            return self.parent().treeposition() + (self.parent_index(),)
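+    # Illustrative sketch (editor comment, not upstream NLTK code):
+    #
+    #   pt = ParentedTree.fromstring('(S (NP (DT the) (NN dog)) (VP barks))')
+    #   np = pt[0]
+    #   np.parent() is pt                  # -> True
+    #   np.parent_index()                  # -> 0
+    #   np.right_sibling()                 # -> the VP subtree
+    #   np[1].treeposition()               # -> (0, 1)
+    #   np[1].root() is pt                 # -> True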
+
+
+    #/////////////////////////////////////////////////////////////////
+    # Parent Management
+    #/////////////////////////////////////////////////////////////////
+
+    def _delparent(self, child, index):
+        # Sanity checks
+        assert isinstance(child, ParentedTree)
+        assert self[index] is child
+        assert child._parent is self
+
+        # Delete child's parent pointer.
+        child._parent = None
+
+    def _setparent(self, child, index, dry_run=False):
+        # If the child's type is incorrect, then complain.
+        if not isinstance(child, ParentedTree):
+            raise TypeError('Can not insert a non-ParentedTree '+
+                            'into a ParentedTree')
+
+        # If child already has a parent, then complain.
+        if child._parent is not None:
+            raise ValueError('Can not insert a subtree that already '
+                             'has a parent.')
+
+        # Set child's parent pointer & index.
+        if not dry_run:
+            child._parent = self
+
+
+class MultiParentedTree(AbstractParentedTree):
+    """
+    A ``Tree`` that automatically maintains parent pointers for
+    multi-parented trees.  The following are methods for querying the
+    structure of a multi-parented tree: ``parents()``, ``parent_indices()``,
+    ``left_siblings()``, ``right_siblings()``, ``roots()``, ``treepositions()``.
+
+    Each ``MultiParentedTree`` may have zero or more parents.  In
+    particular, subtrees may be shared.  If a single
+    ``MultiParentedTree`` is used as multiple children of the same
+    parent, then that parent will appear multiple times in its
+    ``parents()`` method.
+
+    ``MultiParentedTrees`` should never be used in the same tree as
+    ``Trees`` or ``ParentedTrees``.  Mixing tree implementations may
+    result in incorrect parent pointers and in ``TypeError`` exceptions.
+    """
+    def __init__(self, node, children=None):
+        self._parents = []
+        """A list of this tree's parents.  This list should not
+           contain duplicates, even if a parent contains this tree
+           multiple times."""
+        super(MultiParentedTree, self).__init__(node, children)
+        if children is None:
+            # If children is None, the tree is read from node.
+            # After parsing, the parent(s) of the immediate children
+            # will point to an intermediate tree, not self.
+            # We fix this by brute force:
+            for i, child in enumerate(self):
+                if isinstance(child, Tree):
+                    child._parents = []
+                    self._setparent(child, i)
+
+    def _frozen_class(self): return ImmutableMultiParentedTree
+
+    #/////////////////////////////////////////////////////////////////
+    # Methods
+    #/////////////////////////////////////////////////////////////////
+
+    def parents(self):
+        """
+        A list of the parents of this tree.  If this tree has no parents,
+        then ``parents()`` returns the empty list.  To check if a tree is used
+        as multiple children of the same parent, use the
+        ``parent_indices()`` method.
+
+        :type: list(MultiParentedTree)
+        """
+        return list(self._parents)
+
+    def left_siblings(self):
+        """
+        A list of all left siblings of this tree, in any of its parent
+        trees.  A tree may be its own left sibling if it is used as
+        multiple contiguous children of the same parent.  A tree may
+        appear multiple times in this list if it is the left sibling
+        of this tree with respect to multiple parents.
+
+        :type: list(MultiParentedTree)
+        """
+        return [parent[index-1]
+                for (parent, index) in self._get_parent_indices()
+                if index > 0]
+
+    def right_siblings(self):
+        """
+        A list of all right siblings of this tree, in any of its parent
+        trees.  A tree may be its own right sibling if it is used as
+        multiple contiguous children of the same parent.  A tree may
+        appear multiple times in this list if it is the right sibling
+        of this tree with respect to multiple parents.
+
+        :type: list(MultiParentedTree)
+        """
+        return [parent[index+1]
+                for (parent, index) in self._get_parent_indices()
+                if index < (len(parent)-1)]
+
+    def _get_parent_indices(self):
+        return [(parent, index)
+                for parent in self._parents
+                for index, child in enumerate(parent)
+                if child is self]
+
+    def roots(self):
+        """
+        The set of all roots of this tree.  This set is formed by
+        tracing all possible parent paths until trees with no parents
+        are found.
+
+        :type: list(MultiParentedTree)
+        """
+        return list(self._get_roots_helper({}).values())
+
+    def _get_roots_helper(self, result):
+        if self._parents:
+            for parent in self._parents:
+                parent._get_roots_helper(result)
+        else:
+            result[id(self)] = self
+        return result
+
+    def parent_indices(self, parent):
+        """
+        Return a list of the indices where this tree occurs as a child
+        of ``parent``.  If this child does not occur as a child of
+        ``parent``, then the empty list is returned.  The following is
+        always true::
+
+          for parent_index in ptree.parent_indices(parent):
+              parent[parent_index] is ptree
+        """
+        if parent not in self._parents: return []
+        else: return [index for (index, child) in enumerate(parent)
+                      if child is self]
+
+    def treepositions(self, root):
+        """
+        Return a list of all tree positions that can be used to reach
+        this multi-parented tree starting from ``root``.  I.e., the
+        following is always true::
+
+          for treepos in ptree.treepositions(root):
+              root[treepos] is ptree
+        """
+        if self is root:
+            return [()]
+        else:
+            return [treepos+(index,)
+                    for parent in self._parents
+                    for treepos in parent.treepositions(root)
+                    for (index, child) in enumerate(parent) if child is self]
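+    # Illustrative sketch (editor comment, not upstream NLTK code): a shared
+    # subtree may sit under several parents at once:
+    #
+    #   np = MultiParentedTree.fromstring('(NP (NN pizza))')
+    #   vp1 = MultiParentedTree('VP', [MultiParentedTree('V', ['eat']), np])
+    #   vp2 = MultiParentedTree('VP', [MultiParentedTree('V', ['like']), np])
+    #   len(np.parents())                  # -> 2
+    #   np.parent_indices(vp1)             # -> [1]
+    #   np.treepositions(vp2)              # -> [(1,)]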
+
+
+    #/////////////////////////////////////////////////////////////////
+    # Parent Management
+    #/////////////////////////////////////////////////////////////////
+
+    def _delparent(self, child, index):
+        # Sanity checks
+        assert isinstance(child, MultiParentedTree)
+        assert self[index] is child
+        assert len([p for p in child._parents if p is self]) == 1
+
+        # If the only copy of child in self is at index, then delete
+        # self from child's parent list.
+        for i, c in enumerate(self):
+            if c is child and i != index: break
+        else:
+            child._parents.remove(self)
+
+    def _setparent(self, child, index, dry_run=False):
+        # If the child's type is incorrect, then complain.
+        if not isinstance(child, MultiParentedTree):
+            raise TypeError('Can not insert a non-MultiParentedTree '+
+                            'into a MultiParentedTree')
+
+        # Add self as a parent pointer if it's not already listed.
+        if not dry_run:
+            for parent in child._parents:
+                if parent is self: break
+            else:
+                child._parents.append(self)
+
+class ImmutableParentedTree(ImmutableTree, ParentedTree):
+    pass
+
+class ImmutableMultiParentedTree(ImmutableTree, MultiParentedTree):
+    pass
+
+
+######################################################################
+## Probabilistic trees
+######################################################################
+
+@python_2_unicode_compatible
+class ProbabilisticTree(Tree, ProbabilisticMixIn):
+    def __init__(self, node, children=None, **prob_kwargs):
+        Tree.__init__(self, node, children)
+        ProbabilisticMixIn.__init__(self, **prob_kwargs)
+
+    # We have to patch up these methods to make them work right:
+    def _frozen_class(self): return ImmutableProbabilisticTree
+    def __repr__(self):
+        return '%s (p=%r)' % (Tree.unicode_repr(self), self.prob())
+    def __str__(self):
+        return '%s (p=%.6g)' % (self.pformat(margin=60), self.prob())
+    def copy(self, deep=False):
+        if not deep: return type(self)(self._label, self, prob=self.prob())
+        else: return type(self).convert(self)
+    @classmethod
+    def convert(cls, val):
+        if isinstance(val, Tree):
+            children = [cls.convert(child) for child in val]
+            if isinstance(val, ProbabilisticMixIn):
+                return cls(val._label, children, prob=val.prob())
+            else:
+                return cls(val._label, children, prob=1.0)
+        else:
+            return val
+
+    def __eq__(self, other):
+        return (self.__class__ is other.__class__ and
+                (self._label, list(self), self.prob()) ==
+                (other._label, list(other), other.prob()))
+
+    def __lt__(self, other):
+        if not isinstance(other, Tree):
+            raise_unorderable_types("<", self, other)
+        if self.__class__ is other.__class__:
+            return ((self._label, list(self), self.prob()) <
+                    (other._label, list(other), other.prob()))
+        else:
+            return self.__class__.__name__ < other.__class__.__name__
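+    # Illustrative sketch (editor comment, not upstream NLTK code):
+    #
+    #   pt = ProbabilisticTree('NP', ['the', 'dog'], prob=0.25)
+    #   pt.prob()                          # -> 0.25
+    #   print(pt)                          # -> (NP the dog) (p=0.25)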
+
+
+@python_2_unicode_compatible
+class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn):
+    def __init__(self, node, children=None, **prob_kwargs):
+        ImmutableTree.__init__(self, node, children)
+        ProbabilisticMixIn.__init__(self, **prob_kwargs)
+        self._hash = hash((self._label, tuple(self), self.prob()))
+
+    # We have to patch up these methods to make them work right:
+    def _frozen_class(self): return ImmutableProbabilisticTree
+    def __repr__(self):
+        return '%s [%s]' % (Tree.unicode_repr(self), self.prob())
+    def __str__(self):
+        return '%s [%s]' % (self.pformat(margin=60), self.prob())
+    def copy(self, deep=False):
+        if not deep: return type(self)(self._label, self, prob=self.prob())
+        else: return type(self).convert(self)
+    @classmethod
+    def convert(cls, val):
+        if isinstance(val, Tree):
+            children = [cls.convert(child) for child in val]
+            if isinstance(val, ProbabilisticMixIn):
+                return cls(val._label, children, prob=val.prob())
+            else:
+                return cls(val._label, children, prob=1.0)
+        else:
+            return val
+
+
+def _child_names(tree):
+    names = []
+    for child in tree:
+        if isinstance(child, Tree):
+            names.append(Nonterminal(child._label))
+        else:
+            names.append(child)
+    return names
+
+######################################################################
+## Parsing
+######################################################################
+
+def bracket_parse(s):
+    """
+    Use Tree.fromstring(s, remove_empty_top_bracketing=True) instead.
+    """
+    raise NameError("Use Tree.fromstring(s, remove_empty_top_bracketing=True) instead.")
+
+def sinica_parse(s):
+    """
+    Parse a Sinica Treebank string and return a tree.  Trees are represented as nested bracketings,
+    as shown in the following example (X represents a Chinese character):
+    S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY)
+
+    :return: A tree corresponding to the string representation.
+    :rtype: Tree
+    :param s: The string to be converted
+    :type s: str
+    """
+    tokens = re.split(r'([()| ])', s)
+    for i in range(len(tokens)):
+        if tokens[i] == '(':
+            tokens[i-1], tokens[i] = tokens[i], tokens[i-1]     # pull nonterminal inside parens
+        elif ':' in tokens[i]:
+            fields = tokens[i].split(':')
+            if len(fields) == 2:                                # non-terminal
+                tokens[i] = fields[1]
+            else:
+                tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")"
+        elif tokens[i] == '|':
+            tokens[i] = ''
+
+    treebank_string = " ".join(tokens)
+    return Tree.fromstring(treebank_string, remove_empty_top_bracketing=True)
+
+#    s = re.sub(r'^#[^\s]*\s', '', s)  # remove leading identifier
+#    s = re.sub(r'\w+:', '', s)       # remove role tags
+
+#    return s
+
+######################################################################
+## Demonstration
+######################################################################
+
+def demo():
+    """
+    A demonstration showing how Trees and ProbabilisticTrees can be
+    used.  This demonstration creates several Trees from bracketed
+    strings and shows the results of calling several of their methods.
+    """
+
+    from nltk import Tree, ProbabilisticTree
+
+    # Demonstrate tree parsing.
+    s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))'
+    t = Tree.fromstring(s)
+    print("Convert bracketed string into tree:")
+    print(t)
+    print(t.__repr__())
+
+    print("Display tree properties:")
+    print(t.label())         # tree's constituent type
+    print(t[0])             # tree's first child
+    print(t[1])             # tree's second child
+    print(t.height())
+    print(t.leaves())
+    print(t[1])
+    print(t[1,1])
+    print(t[1,1,0])
+
+    # Demonstrate tree modification.
+    the_cat = t[0]
+    the_cat.insert(1, Tree.fromstring('(JJ big)'))
+    print("Tree modification:")
+    print(t)
+    t[1,1,1] = Tree.fromstring('(NN cake)')
+    print(t)
+    print()
+
+    # Tree transforms
+    print("Collapse unary:")
+    t.collapse_unary()
+    print(t)
+    print("Chomsky normal form:")
+    t.chomsky_normal_form()
+    print(t)
+    print()
+
+    # Demonstrate probabilistic trees.
+    pt = ProbabilisticTree('x', ['y', 'z'], prob=0.5)
+    print("Probabilistic Tree:")
+    print(pt)
+    print()
+
+    # Demonstrate parsing of treebank output format.
+    t = Tree.fromstring(t.pformat())
+    print("Convert tree to bracketed string and back again:")
+    print(t)
+    print()
+
+    # Demonstrate LaTeX output
+    print("LaTeX output:")
+    print(t.pformat_latex_qtree())
+    print()
+
+    # Demonstrate Productions
+    print("Production output:")
+    print(t.productions())
+    print()
+
+    # Demonstrate tree nodes containing objects other than strings
+    t.set_label(('test', 3))
+    print(t)
+
+__all__ = ['ImmutableProbabilisticTree', 'ImmutableTree', 'ProbabilisticMixIn',
+           'ProbabilisticTree', 'Tree', 'bracket_parse',
+           'sinica_parse', 'ParentedTree', 'MultiParentedTree',
+           'ImmutableParentedTree', 'ImmutableMultiParentedTree']
diff --git a/nlp_resource_data/nltk/tree.pyc b/nlp_resource_data/nltk/tree.pyc
new file mode 100755 (executable)
index 0000000..e4dae61
Binary files /dev/null and b/nlp_resource_data/nltk/tree.pyc differ
diff --git a/nlp_resource_data/nltk/treeprettyprinter.py b/nlp_resource_data/nltk/treeprettyprinter.py
new file mode 100755 (executable)
index 0000000..9e82d5b
--- /dev/null
@@ -0,0 +1,564 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: ASCII visualization of NLTK trees
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Andreas van Cranenburgh <A.W.vanCranenburgh@uva.nl>
+#         Peter Ljunglöf <peter.ljunglof@gu.se>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Pretty-printing of discontinuous trees.
+Adapted from the disco-dop project, by Andreas van Cranenburgh.
+https://github.com/andreasvc/disco-dop
+
+Interesting reference (not used for this code):
+T. Eschbach et al., Orthogonal Hypergraph Drawing, Journal of
+Graph Algorithms and Applications, 10(2) 141--157 (2006).
+http://jgaa.info/accepted/2006/EschbachGuentherBecker2006.10.2.pdf
+"""
+
+from __future__ import division, print_function, unicode_literals
+
+from nltk.util import slice_bounds, OrderedDict
+from nltk.compat import python_2_unicode_compatible, unicode_repr
+from nltk.internals import raise_unorderable_types
+from nltk.tree import Tree
+
+import re
+import sys
+import codecs
+from cgi import escape
+from collections import defaultdict
+from operator import itemgetter
+from itertools import chain, islice
+
+
+ANSICOLOR = {
+    'black': 30,
+    'red': 31,
+    'green': 32,
+    'yellow': 33,
+    'blue': 34,
+    'magenta': 35,
+    'cyan': 36,
+    'white': 37,
+}
+
+
+@python_2_unicode_compatible
+class TreePrettyPrinter(object):
+    """
+    Pretty-print a tree in text format, either as ASCII or Unicode.
+    The tree can be a normal tree, or discontinuous.
+
+    ``TreePrettyPrinter(tree, sentence=None, highlight=())``
+    creates an object from which different visualizations can be created.
+
+    :param tree: a Tree object.
+    :param sentence: a list of words (strings). If `sentence` is given,
+        `tree` must contain integers as leaves, which are taken as indices
+        in `sentence`. Using this you can display a discontinuous tree.
+    :param highlight: Optionally, a sequence of Tree objects in `tree` which
+        should be highlighted. Has the effect of only applying colors to nodes
+        in this sequence (nodes should be given as Tree objects, terminals as
+        indices).
+
+    >>> from nltk.tree import Tree
+    >>> tree = Tree.fromstring('(S (NP Mary) (VP walks))')
+    >>> print(TreePrettyPrinter(tree).text())
+    ... # doctest: +NORMALIZE_WHITESPACE
+          S
+      ____|____
+     NP        VP
+     |         |
+    Mary     walks
+    """
+
+    def __init__(self, tree, sentence=None, highlight=()):
+        if sentence is None:
+            leaves = tree.leaves()
+            if (leaves and not any(len(a) == 0 for a in tree.subtrees())
+                    and all(isinstance(a, int) for a in leaves)):
+                sentence = [str(a) for a in leaves]
+            else:
+                # this deals with empty nodes (frontier non-terminals)
+                # and multiple/mixed terminals under non-terminals.
+                tree = tree.copy(True)
+                sentence = []
+                for a in tree.subtrees():
+                    if len(a) == 0:
+                        a.append(len(sentence))
+                        sentence.append(None)
+                    elif any(not isinstance(b, Tree) for b in a):
+                        for n, b in enumerate(a):
+                            if not isinstance(b, Tree):
+                                a[n] = len(sentence)
+                                sentence.append('%s' % b)
+        self.nodes, self.coords, self.edges, self.highlight = self.nodecoords(
+                tree, sentence, highlight)
+
+    def __str__(self):
+        return self.text()
+
+    def __repr__(self):
+        return '<TreePrettyPrinter with %d nodes>' % len(self.nodes)
+
+
+    @staticmethod
+    def nodecoords(tree, sentence, highlight):
+        """
+        Produce coordinates of nodes on a grid.
+
+        Objective:
+
+        - Produce coordinates for a non-overlapping placement of nodes and
+            horizontal lines.
+        - Order edges so that crossing edges cross a minimal number of previous
+            horizontal lines (never vertical lines).
+
+        Approach:
+
+        - bottom up level order traversal (start at terminals)
+        - at each level, identify nodes which cannot be on the same row
+        - identify nodes which cannot be in the same column
+        - place nodes into a grid at (row, column)
+        - order child-parent edges with crossing edges last
+
+        Coordinates are (row, column); the origin (0, 0) is at the top left;
+        the root node is on row 0. Coordinates do not consider the size of a
+        node (which depends on font, &c), so the width of a column of the grid
+        should be automatically determined by the element with the greatest
+        width in that column. Alternatively, the integer coordinates could be
+        converted to coordinates in which the distances between adjacent nodes
+        are non-uniform.
+
+        Produces tuple (nodes, coords, edges, highlighted) where:
+
+        - nodes[id]: Tree object for the node with this integer id
+        - coords[id]: (n, m) coordinate where to draw node with id in the grid
+        - edges[id]: parent id of node with this id (ordered dictionary)
+        - highlighted: set of ids that should be highlighted
+        """
+        def findcell(m, matrix, startoflevel, children):
+            """
+            Find vacant row, column index for node ``m``.
+            Iterate over current rows for this level (try lowest first)
+            and look for cell between first and last child of this node,
+            add new row to level if no free row available.
+            """
+            candidates = [a for _, a in children[m]]
+            minidx, maxidx = min(candidates), max(candidates)
+            leaves = tree[m].leaves()
+            center = scale * sum(leaves) // len(leaves)  # center of gravity
+            if minidx < maxidx and not minidx < center < maxidx:
+                center = sum(candidates) // len(candidates)
+            if max(candidates) - min(candidates) > 2 * scale:
+                center -= center % scale  # round to unscaled coordinate
+                if minidx < maxidx and not minidx < center < maxidx:
+                    center += scale
+            if ids[m] == 0:
+                startoflevel = len(matrix)
+            for rowidx in range(startoflevel, len(matrix) + 1):
+                if rowidx == len(matrix):  # need to add a new row
+                    matrix.append([vertline if a not in (corner, None)
+                            else None for a in matrix[-1]])
+                row = matrix[rowidx]
+                i = j = center
+                if len(children[m]) == 1:  # place unaries directly above child
+                    return rowidx, next(iter(children[m]))[1]
+                elif all(a is None or a == vertline for a
+                        in row[min(candidates):max(candidates) + 1]):
+                    # find free column
+                    for n in range(scale):
+                        i = j = center + n
+                        while j > minidx or i < maxidx:
+                            if i < maxidx and (matrix[rowidx][i] is None
+                                    or i in candidates):
+                                return rowidx, i
+                            elif j > minidx and (matrix[rowidx][j] is None
+                                    or j in candidates):
+                                return rowidx, j
+                            i += scale
+                            j -= scale
+            raise ValueError('could not find a free cell for:\n%s\n%s\n'
+                    'min=%d; max=%d' % (tree[m], dumpmatrix(), minidx, maxidx))
+
+        def dumpmatrix():
+            """Dump matrix contents for debugging purposes."""
+            return '\n'.join(
+                '%2d: %s' % (n, ' '.join(('%2r' % i)[:2] for i in row))
+                for n, row in enumerate(matrix))
+
+        leaves = tree.leaves()
+        if not all(isinstance(n, int) for n in leaves):
+            raise ValueError('All leaves must be integer indices.')
+        if len(leaves) != len(set(leaves)):
+            raise ValueError('Indices must occur at most once.')
+        if not all(0 <= n < len(sentence) for n in leaves):
+            raise ValueError('All leaves must be in the interval 0..n '
+                    'with n=len(sentence)\ntokens: %d indices: '
+                    '%r\nsentence: %s' % (len(sentence), tree.leaves(), sentence))
+        vertline, corner = -1, -2  # constants
+        tree = tree.copy(True)
+        for a in tree.subtrees():
+            a.sort(key=lambda n: min(n.leaves()) if isinstance(n, Tree) else n)
+        scale = 2
+        crossed = set()
+        # internal nodes and lexical nodes (no frontiers)
+        positions = tree.treepositions()
+        maxdepth = max(map(len, positions)) + 1
+        childcols = defaultdict(set)
+        matrix = [[None] * (len(sentence) * scale)]
+        nodes = {}
+        ids = dict((a, n) for n, a in enumerate(positions))
+        highlighted_nodes = set(n for a, n in ids.items()
+                                if not highlight or tree[a] in highlight)
+        levels = dict((n, []) for n in range(maxdepth - 1))
+        terminals = []
+        for a in positions:
+            node = tree[a]
+            if isinstance(node, Tree):
+                levels[maxdepth - node.height()].append(a)
+            else:
+                terminals.append(a)
+
+        for n in levels:
+            levels[n].sort(key=lambda n: max(tree[n].leaves())
+                    - min(tree[n].leaves()))
+        terminals.sort()
+        positions = set(positions)
+
+        for m in terminals:
+            i = int(tree[m]) * scale
+            assert matrix[0][i] is None, (matrix[0][i], m, i)
+            matrix[0][i] = ids[m]
+            nodes[ids[m]] = sentence[tree[m]]
+            if nodes[ids[m]] is None:
+                nodes[ids[m]] = '...'
+                highlighted_nodes.discard(ids[m])
+            positions.remove(m)
+            childcols[m[:-1]].add((0, i))
+
+        # add other nodes centered on their children,
+        # if the center is already taken, back off
+        # to the left and right alternately, until an empty cell is found.
+        for n in sorted(levels, reverse=True):
+            nodesatdepth = levels[n]
+            startoflevel = len(matrix)
+            matrix.append([vertline if a not in (corner, None) else None
+                    for a in matrix[-1]])
+            for m in nodesatdepth:  # [::-1]:
+                if n < maxdepth - 1 and childcols[m]:
+                    _, pivot = min(childcols[m], key=itemgetter(1))
+                    if (set(a[:-1] for row in matrix[:-1] for a in row[:pivot]
+                            if isinstance(a, tuple)) &
+                        set(a[:-1] for row in matrix[:-1] for a in row[pivot:]
+                            if isinstance(a, tuple))):
+                        crossed.add(m)
+
+                rowidx, i = findcell(m, matrix, startoflevel, childcols)
+                positions.remove(m)
+
+                # block positions where children of this node branch out
+                for _, x in childcols[m]:
+                    matrix[rowidx][x] = corner
+                # assert m == () or matrix[rowidx][i] in (None, corner), (
+                #         matrix[rowidx][i], m, str(tree), ' '.join(sentence))
+                # node itself
+                matrix[rowidx][i] = ids[m]
+                nodes[ids[m]] = tree[m]
+                # add column to the set of children for its parent
+                if m != ():
+                    childcols[m[:-1]].add((rowidx, i))
+        assert len(positions) == 0
+
+        # remove unused columns, right to left
+        for m in range(scale * len(sentence) - 1, -1, -1):
+            if not any(isinstance(row[m], (Tree, int))
+                    for row in matrix):
+                for row in matrix:
+                    del row[m]
+
+        # remove unused rows, reverse
+        matrix = [row for row in reversed(matrix)
+                if not all(a is None or a == vertline for a in row)]
+
+        # collect coordinates of nodes
+        coords = {}
+        for n, _ in enumerate(matrix):
+            for m, i in enumerate(matrix[n]):
+                if isinstance(i, int) and i >= 0:
+                    coords[i] = n, m
+
+        # move crossed edges last
+        positions = sorted([a for level in levels.values()
+                for a in level], key=lambda a: a[:-1] in crossed)
+
+        # collect edges from node to node
+        edges = OrderedDict()
+        for i in reversed(positions):
+            for j, _ in enumerate(tree[i]):
+                edges[ids[i + (j, )]] = ids[i]
+
+        return nodes, coords, edges, highlighted_nodes
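+    # Illustrative sketch (editor comment, not upstream NLTK code): the grid
+    # computed by nodecoords() is exposed on the instance:
+    #
+    #   from nltk.tree import Tree
+    #   tpp = TreePrettyPrinter(Tree.fromstring('(S (NP Mary) (VP walks))'))
+    #   tpp.nodes                          # id -> Tree node or terminal string
+    #   tpp.coords                         # id -> (row, column) in the grid
+    #   tpp.edges                          # child id -> parent id
+    #   print(tpp.text(unicodelines=True)) # box-drawing characters instead of ASCII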
+
+
+    def text(self, nodedist=1, unicodelines=False, html=False, ansi=False,
+             nodecolor='blue', leafcolor='red', funccolor='green',
+             abbreviate=None, maxwidth=16):
+        """
+        :return: ASCII art for a discontinuous tree.
+
+        :param unicodelines: whether to use Unicode line drawing characters
+            instead of plain (7-bit) ASCII.
+        :param html: whether to wrap output in html code (default plain text).
+        :param ansi: whether to produce colors with ANSI escape sequences
+            (only effective when html==False).
+        :param leafcolor, nodecolor: specify colors of leaves and phrasal
+            nodes; effective when either html or ansi is True.
+        :param abbreviate: if True, abbreviate labels longer than 5 characters.
+            If an integer, abbreviate labels longer than that many characters.
+        :param maxwidth: maximum number of characters before a label starts to
+            wrap; pass None to disable.
+        """
+        if abbreviate == True:
+            abbreviate = 5
+        if unicodelines:
+            horzline = '\u2500'
+            leftcorner = '\u250c'
+            rightcorner = '\u2510'
+            vertline = ' \u2502 '
+            tee = horzline + '\u252C' + horzline
+            bottom = horzline + '\u2534' + horzline
+            cross = horzline + '\u253c' + horzline
+            ellipsis = '\u2026'
+        else:
+            horzline = '_'
+            leftcorner = rightcorner = ' '
+            vertline = ' | '
+            tee = 3 * horzline
+            cross = bottom = '_|_'
+            ellipsis = '.'
+
+        def crosscell(cur, x=vertline):
+            """Overwrite center of this cell with a vertical branch."""
+            splitl = len(cur) - len(cur) // 2 - len(x) // 2 - 1
+            lst = list(cur)
+            lst[splitl:splitl + len(x)] = list(x)
+            return ''.join(lst)
+
+        result = []
+        matrix = defaultdict(dict)
+        maxnodewith = defaultdict(lambda: 3)
+        maxnodeheight = defaultdict(lambda: 1)
+        maxcol = 0
+        minchildcol = {}
+        maxchildcol = {}
+        childcols = defaultdict(set)
+        labels = {}
+        wrapre = re.compile('(.{%d,%d}\\b\\W*|.{%d})' % (
+                maxwidth - 4, maxwidth, maxwidth))
+        # collect labels and coordinates
+        for a in self.nodes:
+            row, column = self.coords[a]
+            matrix[row][column] = a
+            maxcol = max(maxcol, column)
+            label = (self.nodes[a].label() if isinstance(self.nodes[a], Tree)
+                     else self.nodes[a])
+            if abbreviate and len(label) > abbreviate:
+                label = label[:abbreviate] + ellipsis
+            if maxwidth and len(label) > maxwidth:
+                label = wrapre.sub(r'\1\n', label).strip()
+            label = label.split('\n')
+            maxnodeheight[row] = max(maxnodeheight[row], len(label))
+            maxnodewith[column] = max(maxnodewith[column], max(map(len, label)))
+            labels[a] = label
+            if a not in self.edges:
+                continue  # e.g., root
+            parent = self.edges[a]
+            childcols[parent].add((row, column))
+            minchildcol[parent] = min(minchildcol.get(parent, column), column)
+            maxchildcol[parent] = max(maxchildcol.get(parent, column), column)
+        # bottom up level order traversal
+        for row in sorted(matrix, reverse=True):
+            noderows = [[''.center(maxnodewith[col]) for col in range(maxcol + 1)]
+                    for _ in range(maxnodeheight[row])]
+            branchrow = [''.center(maxnodewith[col]) for col in range(maxcol + 1)]
+            for col in matrix[row]:
+                n = matrix[row][col]
+                node = self.nodes[n]
+                text = labels[n]
+                if isinstance(node, Tree):
+                    # draw horizontal branch towards children for this node
+                    if n in minchildcol and minchildcol[n] < maxchildcol[n]:
+                        i, j = minchildcol[n], maxchildcol[n]
+                        a, b = (maxnodewith[i] + 1) // 2 - 1, maxnodewith[j] // 2
+                        branchrow[i] = ((' ' * a) + leftcorner).ljust(
+                                maxnodewith[i], horzline)
+                        branchrow[j] = (rightcorner + (' ' * b)).rjust(
+                                maxnodewith[j], horzline)
+                        for i in range(minchildcol[n] + 1, maxchildcol[n]):
+                            if i == col and any(
+                                    a == i for _, a in childcols[n]):
+                                line = cross
+                            elif i == col:
+                                line = bottom
+                            elif any(a == i for _, a in childcols[n]):
+                                line = tee
+                            else:
+                                line = horzline
+                            branchrow[i] = line.center(maxnodewith[i], horzline)
+                    else:  # if n and n in minchildcol:
+                        branchrow[col] = crosscell(branchrow[col])
+                text = [a.center(maxnodewith[col]) for a in text]
+                color = nodecolor if isinstance(node, Tree) else leafcolor
+                if isinstance(node, Tree) and node.label().startswith('-'):
+                    color = funccolor
+                if html:
+                    text = [escape(a) for a in text]
+                    if n in self.highlight:
+                        text = ['<font color=%s>%s</font>' % (
+                                color, a) for a in text]
+                elif ansi and n in self.highlight:
+                    text = ['\x1b[%d;1m%s\x1b[0m' % (
+                            ANSICOLOR[color], a) for a in text]
+                for x in range(maxnodeheight[row]):
+                    # draw vertical lines in partially filled multiline node
+                    # labels, but only if it's not a frontier node.
+                    noderows[x][col] = (text[x] if x < len(text)
+                            else (vertline if childcols[n] else ' ').center(
+                                maxnodewith[col], ' '))
+            # for each column, if there is a node below us which has a parent
+            # above us, draw a vertical branch in that column.
+            if row != max(matrix):
+                for n, (childrow, col) in self.coords.items():
+                    if (n > 0 and
+                            self.coords[self.edges[n]][0] < row < childrow):
+                        branchrow[col] = crosscell(branchrow[col])
+                        if col not in matrix[row]:
+                            for noderow in noderows:
+                                noderow[col] = crosscell(noderow[col])
+                branchrow = [a + ((a[-1] if a[-1] != ' ' else b[0]) * nodedist)
+                        for a, b in zip(branchrow, branchrow[1:] + [' '])]
+                result.append(''.join(branchrow))
+            result.extend((' ' * nodedist).join(noderow)
+                    for noderow in reversed(noderows))
+        return '\n'.join(reversed(result)) + '\n'
+
+
+    def svg(self, nodecolor='blue', leafcolor='red', funccolor='green'):
+        """
+        :return: SVG representation of a tree.
+        """
+        fontsize = 12
+        hscale = 40
+        vscale = 25
+        hstart = vstart = 20
+        width = max(col for _, col in self.coords.values())
+        height = max(row for row, _ in self.coords.values())
+        result = ['<svg version="1.1" xmlns="http://www.w3.org/2000/svg" '
+                  'width="%dem" height="%dem" viewBox="%d %d %d %d">' % (
+                      width * 3,
+                      height * 2.5,
+                      -hstart, -vstart,
+                      width * hscale + 3 * hstart,
+                      height * vscale + 3 * vstart)
+                      ]
+
+        children = defaultdict(set)
+        for n in self.nodes:
+            if n:
+                children[self.edges[n]].add(n)
+
+        # horizontal branches from nodes to children
+        for node in self.nodes:
+            if not children[node]:
+                continue
+            y, x = self.coords[node]
+            x *= hscale
+            y *= vscale
+            x += hstart
+            y += vstart + fontsize // 2
+            childx = [self.coords[c][1] for c in children[node]]
+            xmin = hstart + hscale * min(childx)
+            xmax = hstart + hscale * max(childx)
+            result.append(
+                '\t<polyline style="stroke:black; stroke-width:1; fill:none;" '
+                'points="%g,%g %g,%g" />' % (xmin, y, xmax, y))
+            result.append(
+                '\t<polyline style="stroke:black; stroke-width:1; fill:none;" '
+                'points="%g,%g %g,%g" />' % (x, y, x, y - fontsize // 3))
+
+        # vertical branches from children to parents
+        for child, parent in self.edges.items():
+            y, _ = self.coords[parent]
+            y *= vscale
+            y += vstart + fontsize // 2
+            childy, childx = self.coords[child]
+            childx *= hscale
+            childy *= vscale
+            childx += hstart
+            childy += vstart - fontsize
+            result += [
+                '\t<polyline style="stroke:white; stroke-width:10; fill:none;"'
+                ' points="%g,%g %g,%g" />' % (childx, childy, childx, y + 5),
+                '\t<polyline style="stroke:black; stroke-width:1; fill:none;"'
+                ' points="%g,%g %g,%g" />' % (childx, childy, childx, y),
+                ]
+
+        # write nodes with coordinates
+        for n, (row, column) in self.coords.items():
+            node = self.nodes[n]
+            x = column * hscale + hstart
+            y = row * vscale + vstart
+            if n in self.highlight:
+                color = nodecolor if isinstance(node, Tree) else leafcolor
+                if isinstance(node, Tree) and node.label().startswith('-'):
+                    color = funccolor
+            else:
+                color = 'black'
+            result += ['\t<text style="text-anchor: middle; fill: %s; '
+                       'font-size: %dpx;" x="%g" y="%g">%s</text>' % (
+                           color, fontsize, x, y,
+                           escape(node.label() if isinstance(node, Tree)
+                                  else node))]
+
+        result += ['</svg>']
+        return '\n'.join(result)
+
+
+def test():
+    """Do some tree drawing tests."""
+    def print_tree(n, tree, sentence=None, ansi=True, **xargs):
+        print()
+        print('{0}: "{1}"'.format(n, ' '.join(sentence or tree.leaves())))
+        print(tree)
+        print()
+        drawtree = TreePrettyPrinter(tree, sentence)
+        try:
+            print(drawtree.text(unicodelines=ansi, ansi=ansi, **xargs))
+        except (UnicodeDecodeError, UnicodeEncodeError):
+            print(drawtree.text(unicodelines=False, ansi=False, **xargs))
+
+    from nltk.corpus import treebank
+    for n in [0, 1440, 1591, 2771, 2170]:
+        tree = treebank.parsed_sents()[n]
+        print_tree(n, tree, nodedist=2, maxwidth=8)
+    print()
+    print('ASCII version:')
+    print(TreePrettyPrinter(tree).text(nodedist=2))
+
+    tree = Tree.fromstring(
+        '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) '
+        '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) '
+        '(vg 10) (inf (verb 11)))))) (punct 12))', read_leaf=int)
+    sentence = ('Ze had met haar moeder kunnen gaan winkelen ,'
+                ' zwemmen of terrassen .'.split())
+    print_tree('Discontinuous tree', tree, sentence, nodedist=2)
+
+
+__all__ = ['TreePrettyPrinter']
+
+if __name__ == '__main__':
+    test()
diff --git a/nlp_resource_data/nltk/treeprettyprinter.pyc b/nlp_resource_data/nltk/treeprettyprinter.pyc
new file mode 100755 (executable)
index 0000000..ffd6d3f
Binary files /dev/null and b/nlp_resource_data/nltk/treeprettyprinter.pyc differ
diff --git a/nlp_resource_data/nltk/treetransforms.py b/nlp_resource_data/nltk/treetransforms.py
new file mode 100755 (executable)
index 0000000..cf21528
--- /dev/null
@@ -0,0 +1,309 @@
+# Natural Language Toolkit: Tree Transformations
+#
+# Copyright (C) 2005-2007 Oregon Graduate Institute
+# Author: Nathan Bodenstab <bodenstab@cslu.ogi.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A collection of methods for tree (grammar) transformations used
+in parsing natural language.
+
+Although many of these methods are technically grammar transformations
+(i.e. Chomsky Normal Form), when working with treebanks it is much more
+natural to visualize these modifications in a tree structure.  Hence,
+we will do all transformations directly on the tree itself.
+Transforming the tree directly also allows us to do parent annotation.
+A grammar can then be simply induced from the modified tree.
+
+The following is a short tutorial on the available transformations.
+
+ 1. Chomsky Normal Form (binarization)
+
+    It is well known that any grammar has a Chomsky Normal Form (CNF)
+    equivalent grammar where CNF is defined by every production having
+    either two non-terminals or one terminal on its right hand side.
+    When we have hierarchically structured data (i.e. a treebank), it is
+    natural to view this in terms of productions where the root of every
+    subtree is the head (left hand side) of the production and all of
+    its children are the right hand side constituents.  In order to
+    convert a tree into CNF, we simply need to ensure that every subtree
+    has either two subtrees as children (binarization), or one leaf node
+    (a terminal).  In order to binarize a subtree with more than two
+    children, we must introduce artificial nodes.
+
+    There are two popular methods to convert a tree into CNF: left
+    factoring and right factoring.  The following example demonstrates
+    the difference between them.  Example::
+
+     Original       Right-Factored     Left-Factored
+
+          A              A                      A
+        / | \          /   \                  /   \
+       B  C  D   ==>  B    A|<C-D>   OR   A|<B-C>  D
+                            /  \          /  \
+                           C    D        B    C
+
+ 2. Parent Annotation
+
+    In addition to binarizing the tree, there are two standard
+    modifications to node labels we can do in the same traversal: parent
+    annotation and Markov order-N smoothing (or sibling smoothing).
+
+    The purpose of parent annotation is to refine the probabilities of
+    productions by adding a small amount of context.  With this simple
+    addition, a CYK (inside-outside, dynamic programming chart parse)
+    can improve from 74% to 79% accuracy.  A natural generalization from
+    parent annotation is to grandparent annotation and beyond.  The
+    tradeoff becomes accuracy gain vs. computational complexity.  We
+    must also keep in mind data sparsity issues.  Example::
+
+     Original       Parent Annotation
+
+          A                A^<?>
+        / | \             /   \
+       B  C  D   ==>  B^<A>    A|<C-D>^<?>     where ? is the
+                                 /  \          parent of A
+                             C^<A>   D^<A>
+
+
+ 3. Markov order-N smoothing
+
+    Markov smoothing combats data sparsity issues as well as decreasing
+    computational requirements by limiting the number of children
+    included in artificial nodes.  In practice, most people use an order
+    2 grammar.  Example::
+
+      Original       No Smoothing       Markov order 1   Markov order 2   etc.
+
+       __A__            A                      A                A
+      / /|\ \         /   \                  /   \            /   \
+     B C D E F  ==>  B    A|<C-D-E-F>  ==>  B   A|<C>  ==>   B  A|<C-D>
+                            /   \               /   \            /   \
+                           C    ...            C    ...         C    ...
+
+
+
+    Annotation decisions can be thought about in the vertical direction
+    (parent, grandparent, etc) and the horizontal direction (number of
+    siblings to keep).  Parameters to the following functions specify
+    these values.  For more information see:
+
+    Dan Klein and Chris Manning (2003) "Accurate Unlexicalized
+    Parsing", ACL-03.  http://www.aclweb.org/anthology/P03-1054
+
+ 4. Unary Collapsing
+
+    Collapse unary productions (i.e. subtrees with a single child) into a
+    new non-terminal (Tree node).  This is useful when working with
+    algorithms that do not allow unary productions, yet you do not wish
+    to lose the parent information.  Example::
+
+       A
+       |
+       B   ==>   A+B
+      / \        / \
+     C   D      C   D
+
+"""
+from __future__ import print_function
+
+from nltk.tree import Tree
+
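+# Editorial sketch (not part of the upstream module) of the round trip
+# described in the docstring above: binarize a small toy tree with Markov
+# order 2 and one level of parent annotation, then restore it.  The toy
+# sentence is invented purely for illustration.
+def _cnf_roundtrip_sketch():
+    """Binarize a toy tree, undo the transform, and report whether it survived."""
+    t = Tree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP (VBD barked)))')
+    original = t.copy(deep=True)
+    chomsky_normal_form(t, factor='right', horzMarkov=2, vertMarkov=1)
+    un_chomsky_normal_form(t)
+    return t == original  # expected to be True for this toy tree
+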
+def chomsky_normal_form(tree, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^"):
+    # assume all subtrees have homogeneous children
+    # assume all terminals have no siblings
+
+    # A semi-hack to have elegant looking code below.  As a result,
+    # any subtree with a branching factor greater than 999 will be incorrectly truncated.
+    if horzMarkov is None: horzMarkov = 999
+
+    # Traverse the tree depth-first keeping a list of ancestor nodes to the root.
+    # I chose not to use the tree.treepositions() method since it requires
+    # two traversals of the tree (one to get the positions, one to iterate
+    # over them) and node access time is proportional to the height of the node.
+    # This method is 7x faster, which helps when parsing 40,000 sentences.
+
+    nodeList = [(tree, [tree.label()])]
+    while nodeList != []:
+        node, parent = nodeList.pop()
+        if isinstance(node,Tree):
+
+            # parent annotation
+            parentString = ""
+            originalNode = node.label()
+            if vertMarkov != 0 and node != tree and isinstance(node[0],Tree):
+                parentString = "%s<%s>" % (parentChar, "-".join(parent))
+                node.set_label(node.label() + parentString)
+                parent = [originalNode] + parent[:vertMarkov - 1]
+
+            # add children to the agenda before we mess with them
+            for child in node:
+                nodeList.append((child, parent))
+
+            # chomsky normal form factorization
+            if len(node) > 2:
+                childNodes = [child.label() for child in node]
+                nodeCopy = node.copy()
+                node[0:] = [] # delete the children
+
+                curNode = node
+                numChildren = len(nodeCopy)
+                for i in range(1,numChildren - 1):
+                    if factor == "right":
+                        newHead = "%s%s<%s>%s" % (originalNode, childChar, "-".join(childNodes[i:min([i+horzMarkov,numChildren])]),parentString) # create new head
+                        newNode = Tree(newHead, [])
+                        curNode[0:] = [nodeCopy.pop(0), newNode]
+                    else:
+                        newHead = "%s%s<%s>%s" % (originalNode, childChar, "-".join(childNodes[max([numChildren-i-horzMarkov,0]):-i]),parentString)
+                        newNode = Tree(newHead, [])
+                        curNode[0:] = [newNode, nodeCopy.pop()]
+
+                    curNode = newNode
+
+                curNode[0:] = [child for child in nodeCopy]
+
+
+def un_chomsky_normal_form(tree, expandUnary = True, childChar = "|", parentChar = "^", unaryChar = "+"):
+    # Traverse the tree-depth first keeping a pointer to the parent for modification purposes.
+    nodeList = [(tree,[])]
+    while nodeList != []:
+        node,parent = nodeList.pop()
+        if isinstance(node,Tree):
+            # if the node contains the 'childChar' character it means that
+            # it is an artificial node and can be removed, although we still need
+            # to move its children to its parent
+            childIndex = node.label().find(childChar)
+            if childIndex != -1:
+                nodeIndex = parent.index(node)
+                parent.remove(parent[nodeIndex])
+                # Generated node was on the left if the nodeIndex is 0 which
+                # means the grammar was left factored.  We must insert the children
+                # at the beginning of the parent's children
+                if nodeIndex == 0:
+                    parent.insert(0,node[0])
+                    parent.insert(1,node[1])
+                else:
+                    parent.extend([node[0],node[1]])
+
+                # parent is now the current node so the children of parent will be added to the agenda
+                node = parent
+            else:
+                parentIndex = node.label().find(parentChar)
+                if parentIndex != -1:
+                    # strip the node name of the parent annotation
+                    node.set_label(node.label()[:parentIndex])
+
+                # expand collapsed unary productions
+                if expandUnary == True:
+                    unaryIndex = node.label().find(unaryChar)
+                    if unaryIndex != -1:
+                        newNode = Tree(node.label()[unaryIndex + 1:], [i for i in node])
+                        node.set_label(node.label()[:unaryIndex])
+                        node[0:] = [newNode]
+
+            for child in node:
+                nodeList.append((child,node))
+
+
+def collapse_unary(tree, collapsePOS = False, collapseRoot = False, joinChar = "+"):
+    """
+    Collapse subtrees with a single child (i.e. unary productions)
+    into a new non-terminal (Tree node) joined by 'joinChar'.
+    This is useful when working with algorithms that do not allow
+    unary productions, and completely removing the unary productions
+    would require loss of useful information.  The Tree is modified
+    directly (since it is passed by reference) and no value is returned.
+
+    :param tree: The Tree to be collapsed
+    :type  tree: Tree
+    :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (i.e.
+                        Part-of-Speech tags) since they are always unary productions
+    :type  collapsePOS: bool
+    :param collapseRoot: 'False' (default) will not modify the root production
+                         if it is unary.  For the Penn WSJ treebank corpus, this corresponds
+                         to the TOP -> productions.
+    :type collapseRoot: bool
+    :param joinChar: A string used to connect collapsed node values (default = "+")
+    :type  joinChar: str
+    """
+
+    if collapseRoot == False and isinstance(tree, Tree) and len(tree) == 1:
+        nodeList = [tree[0]]
+    else:
+        nodeList = [tree]
+
+    # depth-first traversal of tree
+    while nodeList != []:
+        node = nodeList.pop()
+        if isinstance(node,Tree):
+            if len(node) == 1 and isinstance(node[0], Tree) and (collapsePOS == True or isinstance(node[0,0], Tree)):
+                node.set_label(node.label() + joinChar + node[0].label())
+                node[0:] = [child for child in node[0]]
+                # since we assigned the child's children to the current node,
+                # evaluate the current node again
+                nodeList.append(node)
+            else:
+                for child in node:
+                    nodeList.append(child)
+
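+# Editorial sketch (not part of the upstream module): collapse_unary applied
+# to a toy tree matching the example in the module docstring above.
+def _collapse_unary_sketch():
+    """Collapse the unary A -> B production of a toy tree, in place."""
+    t = Tree.fromstring('(A (B (C c) (D d)))')
+    collapse_unary(t, collapseRoot=True)
+    return t  # expected to print as (A+B (C c) (D d))
+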
+#################################################################
+# Demonstration
+#################################################################
+
+def demo():
+    """
+    A demonstration showing how each tree transform can be used.
+    """
+
+    from nltk.draw.tree import draw_trees
+    from nltk import tree, treetransforms
+    from copy import deepcopy
+
+    # original tree from WSJ bracketed text
+    sentence = """(TOP
+  (S
+    (S
+      (VP
+        (VBN Turned)
+        (ADVP (RB loose))
+        (PP
+          (IN in)
+          (NP
+            (NP (NNP Shane) (NNP Longman) (POS 's))
+            (NN trading)
+            (NN room)))))
+    (, ,)
+    (NP (DT the) (NN yuppie) (NNS dealers))
+    (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))
+    (. .)))"""
+    t = tree.Tree.fromstring(sentence, remove_empty_top_bracketing=True)
+
+    # collapse subtrees with only one child
+    collapsedTree = deepcopy(t)
+    treetransforms.collapse_unary(collapsedTree)
+
+    # convert the tree to CNF
+    cnfTree = deepcopy(collapsedTree)
+    treetransforms.chomsky_normal_form(cnfTree)
+
+    # convert the tree to CNF with parent annotation (one level) and horizontal smoothing of order two
+    parentTree = deepcopy(collapsedTree)
+    treetransforms.chomsky_normal_form(parentTree, horzMarkov=2, vertMarkov=1)
+
+    # convert the tree back to its original form (used to make CYK results comparable)
+    original = deepcopy(parentTree)
+    treetransforms.un_chomsky_normal_form(original)
+
+    # convert tree back to bracketed text
+    sentence2 = original.pprint()
+    print(sentence)
+    print(sentence2)
+    print("Sentences the same? ", sentence == sentence2)
+
+    draw_trees(t, collapsedTree, cnfTree, parentTree, original)
+
+if __name__ == '__main__':
+    demo()
+
+__all__ = ["chomsky_normal_form", "un_chomsky_normal_form", "collapse_unary"]
diff --git a/nlp_resource_data/nltk/treetransforms.pyc b/nlp_resource_data/nltk/treetransforms.pyc
new file mode 100755 (executable)
index 0000000..cf931cd
Binary files /dev/null and b/nlp_resource_data/nltk/treetransforms.pyc differ
diff --git a/nlp_resource_data/nltk/twitter/__init__.py b/nlp_resource_data/nltk/twitter/__init__.py
new file mode 100755 (executable)
index 0000000..655d7a9
--- /dev/null
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Twitter
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+NLTK Twitter Package
+
+This package contains classes for retrieving Tweet documents using the
+Twitter API.
+
+"""
+try:
+    import twython
+except ImportError:
+    import warnings
+    warnings.warn("The twython library has not been installed. "
+                  "Some functionality from the twitter package will not be available.")
+else:
+    from nltk.twitter.util import Authenticate, credsfromfile
+    from nltk.twitter.twitterclient import Streamer, Query, Twitter,\
+         TweetViewer, TweetWriter
+
+
+from nltk.twitter.common import json2csv
diff --git a/nlp_resource_data/nltk/twitter/__init__.pyc b/nlp_resource_data/nltk/twitter/__init__.pyc
new file mode 100755 (executable)
index 0000000..7627ae6
Binary files /dev/null and b/nlp_resource_data/nltk/twitter/__init__.pyc differ
diff --git a/nlp_resource_data/nltk/twitter/api.py b/nlp_resource_data/nltk/twitter/api.py
new file mode 100755 (executable)
index 0000000..05c71f8
--- /dev/null
@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Twitter API
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+#         Lorenzo Rubio <lrnzcig@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+This module provides an interface for TweetHandlers, and support for timezone
+handling.
+"""
+
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+from datetime import tzinfo, timedelta, datetime
+from nltk.compat import UTC
+import time as _time
+
+
+class LocalTimezoneOffsetWithUTC(tzinfo):
+    """
+    This is not intended to be a general purpose class for dealing with the
+    local timezone. In particular:
+
+    * it assumes that the date passed has been created using
+      `datetime(..., tzinfo=Local)`, where `Local` is an instance of
+      the object `LocalTimezoneOffsetWithUTC`;
+    * for such an object, it returns the offset from UTC, used for date comparisons.
+
+    Reference: https://docs.python.org/3/library/datetime.html
+    """
+    STDOFFSET = timedelta(seconds=-_time.timezone)
+
+    if _time.daylight:
+        DSTOFFSET = timedelta(seconds=-_time.altzone)
+    else:
+        DSTOFFSET = STDOFFSET
+
+    def utcoffset(self, dt):
+        """
+        Access the relevant time offset.
+        """
+        return self.DSTOFFSET
+
+
+LOCAL = LocalTimezoneOffsetWithUTC()
+
+
+@add_metaclass(ABCMeta)
+class BasicTweetHandler(object):
+    """
+    Minimal implementation of `TweetHandler`.
+
+    Counts the number of Tweets and decides when the client should stop
+    fetching them.
+    """
+    def __init__(self, limit=20):
+        self.limit = limit
+        self.counter = 0
+
+        """
+        A flag to indicate to the client whether to stop fetching data given
+        some condition (e.g., reaching a date limit).
+        """
+        self.do_stop = False
+
+        """
+        Stores the id of the last fetched Tweet to handle pagination.
+        """
+        self.max_id = None
+
+    def do_continue(self):
+        """
+        Returns `False` if the client should stop fetching Tweets.
+        """
+        return self.counter < self.limit and not self.do_stop
+
+class TweetHandlerI(BasicTweetHandler):
+    """
+    Interface class whose subclasses should implement a handle method that
+    Twitter clients can delegate to.
+    """
+    def __init__(self, limit=20, upper_date_limit=None, lower_date_limit=None):
+        """
+        :param int limit: The number of data items to process in the current\
+        round of processing.
+
+        :param tuple upper_date_limit: The latest date at which to collect\
+        new data. This should be entered as a tuple which can serve as the\
+        argument to `datetime.datetime`.\
+        E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:40 pm on April 1 2015.
+
+        :param tuple lower_date_limit: The earliest date at which to collect\
+        new data. See `upper_date_limit` for formatting.
+        """
+        BasicTweetHandler.__init__(self, limit)
+
+        self.upper_date_limit = None
+        self.lower_date_limit = None
+        if upper_date_limit:
+            self.upper_date_limit = datetime(*upper_date_limit, tzinfo=LOCAL)
+        if lower_date_limit:
+            self.lower_date_limit = datetime(*lower_date_limit, tzinfo=LOCAL)
+
+        self.startingup = True
+
+    @abstractmethod
+    def handle(self, data):
+        """
+        Deal appropriately with data returned by the Twitter API
+        """
+
+    @abstractmethod
+    def on_finish(self):
+        """
+        Actions when the tweet limit has been reached
+        """
+
+    def check_date_limit(self, data, verbose=False):
+        """
+        Validate date limits.
+        """
+        if self.upper_date_limit or self.lower_date_limit:
+            date_fmt = '%a %b %d %H:%M:%S +0000 %Y'
+            tweet_date = \
+                datetime.strptime(data['created_at'],
+                                  date_fmt).replace(tzinfo=UTC)
+            if (self.upper_date_limit and tweet_date > self.upper_date_limit) or \
+               (self.lower_date_limit and tweet_date < self.lower_date_limit):
+                if self.upper_date_limit:
+                    message = "earlier"
+                    date_limit = self.upper_date_limit
+                else:
+                    message = "later"
+                    date_limit = self.lower_date_limit
+                if verbose:
+                    print("Date limit {0} is {1} than date of current tweet {2}".\
+                      format(date_limit, message, tweet_date))
+                self.do_stop = True
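+
+
+# Editorial sketch (not part of the upstream module): a minimal concrete
+# handler showing how TweetHandlerI is meant to be subclassed -- it simply
+# prints the text of each tweet passed to it.
+class _PrintingTweetHandler(TweetHandlerI):
+    """Toy handler: print tweet text until the limit is reached."""
+    def handle(self, data):
+        print(data.get('text', ''))
+
+    def on_finish(self):
+        print('Reached the limit of {0} tweets.'.format(self.limit))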
diff --git a/nlp_resource_data/nltk/twitter/api.pyc b/nlp_resource_data/nltk/twitter/api.pyc
new file mode 100755 (executable)
index 0000000..8805e46
Binary files /dev/null and b/nlp_resource_data/nltk/twitter/api.pyc differ
diff --git a/nlp_resource_data/nltk/twitter/common.py b/nlp_resource_data/nltk/twitter/common.py
new file mode 100755 (executable)
index 0000000..811f56f
--- /dev/null
@@ -0,0 +1,258 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Twitter client
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+#         Lorenzo Rubio <lrnzcig@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Utility functions for the :module:`twitterclient` module which do not require
+the `twython` library to have been installed.
+"""
+from __future__ import print_function
+
+import csv
+import gzip
+import json
+
+import nltk.compat as compat
+
+
+HIER_SEPARATOR = "."
+
+def extract_fields(tweet, fields):
+    """
+    Extract field values from a full tweet and return them as a list
+
+    :param json tweet: The tweet in JSON format
+    :param list fields: The fields to be extracted from the tweet
+    :rtype: list(str)
+    """
+    out = []
+    for field in fields:
+        try:
+            _add_field_to_out(tweet, field, out)
+        except TypeError:
+            raise RuntimeError('Fatal error when extracting fields. Cannot find field ', field)
+    return out
+
+def _add_field_to_out(json, field, out):
+    if _is_composed_key(field):
+        key, value = _get_key_value_composed(field)
+        _add_field_to_out(json[key], value, out)
+    else:
+        out += [json[field]]
+
+def _is_composed_key(field):
+    if HIER_SEPARATOR in field:
+        return True
+    return False
+
+def _get_key_value_composed(field):
+    out = field.split(HIER_SEPARATOR)
+    # there could be up to 3 levels
+    key = out[0]
+    value = HIER_SEPARATOR.join(out[1:])
+    return key, value
+
+def _get_entity_recursive(json, entity):
+    if not json:
+        return None
+    elif isinstance(json, dict):
+        for key, value in json.items():
+            if key == entity:
+                return value
+            # 'entities' and 'extended_entities' are wrappers in Twitter json
+            # structure that contain other Twitter objects. See:
+            # https://dev.twitter.com/overview/api/entities-in-twitter-objects
+
+            if key == 'entities' or key == 'extended_entities':
+                candidate = _get_entity_recursive(value, entity)
+                if candidate is not None:
+                    return candidate
+        return None
+    elif isinstance(json, list):
+        for item in json:
+            candidate = _get_entity_recursive(item, entity)
+            if candidate is not None:
+                return candidate
+        return None
+    else:
+        return None
+
+def json2csv(fp, outfile, fields, encoding='utf8', errors='replace',
+             gzip_compress=False):
+    """
+    Extract selected fields from a file of line-separated JSON tweets and
+    write to a file in CSV format.
+
+    This utility function allows a file of full tweets to be easily converted
+    to a CSV file for easier processing. For example, just TweetIDs or
+    just the text content of the Tweets can be extracted.
+
+    Additionally, the function allows combinations of fields of other Twitter
+    objects (mainly the users, see below).
+
+    For Twitter entities (e.g. hashtags of a Tweet), and for geolocation, see
+    `json2csv_entities`
+
+    :param fp: A file-like object containing full tweets, one JSON object per line
+
+    :param str outfile: The name of the text file where results should be\
+    written
+
+    :param list fields: The list of fields to be extracted. Useful examples\
+    are 'id_str' for the tweetID and 'text' for the text of the tweet. See\
+    <https://dev.twitter.com/overview/api/tweets> for a full list of fields.\
+    e.g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']\
+    Additionally, it allows IDs from other Twitter objects, e.g.,\
+    ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count']
+
+    :param errors: Behaviour for encoding errors, see\
+    https://docs.python.org/3/library/codecs.html#codec-base-classes
+
+    :param gzip_compress: if `True`, output files are compressed with gzip
+    """
+    (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
+    # write the list of fields as header
+    writer.writerow(fields)
+    # process the file
+    for line in fp:
+        tweet = json.loads(line)
+        row = extract_fields(tweet, fields)
+        writer.writerow(row)
+    outf.close()
+
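+# Editorial sketch (invented file name, fabricated tweet): pull three fields,
+# including the hierarchical 'user.id', out of one line of JSON.
+def _json2csv_sketch():
+    """Write 'tweets_sketch.csv' with columns id_str, text and user.id."""
+    import io
+    fake_tweet = '{"id_str": "123", "text": "hello", "user": {"id": 42}}'
+    json2csv(io.StringIO(fake_tweet + '\n'), 'tweets_sketch.csv',
+             ['id_str', 'text', 'user.id'])
+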
+
+def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
+    """
+    Identify appropriate CSV writer given the Python version
+    """
+    if compat.PY3:
+        if gzip_compress:
+            outf = gzip.open(outfile, 'wt', encoding=encoding, errors=errors)
+        else:
+            outf = open(outfile, 'w', encoding=encoding, errors=errors)
+        writer = csv.writer(outf)
+    else:
+        if gzip_compress:
+            outf = gzip.open(outfile, 'wb')
+        else:
+            outf = open(outfile, 'wb')
+        writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
+    return (writer, outf)
+
+
+def json2csv_entities(tweets_file, outfile, main_fields, entity_type, entity_fields,
+                      encoding='utf8', errors='replace', gzip_compress=False):
+    """
+    Extract selected fields from a file of line-separated JSON tweets and
+    write to a file in CSV format.
+
+    This utility function allows a file of full Tweets to be easily converted
+    to a CSV file for easier processing of Twitter entities. For example, the
+    hashtags or media elements of a tweet can be extracted.
+
+    It returns one line per entity of a Tweet, e.g. if a tweet has two hashtags
+    there will be two lines in the output file, one per hashtag.
+
+    :param tweets_file: the file-like object containing full Tweets
+
+    :param str outfile: The path of the text file where results should be\
+    written
+
+    :param list main_fields: The list of fields to be extracted from the main\
+    object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\
+    <https://dev.twitter.com/overview/api/tweets> for a full list of fields.
+    e.g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']
+    If `entity_type` is expressed with hierarchy, then it is the list of\
+    fields of the object that corresponds to the key of the entity_type,\
+    (e.g., for entity_type='user.urls', the fields in the main_fields list\
+    belong to the user object; for entity_type='place.bounding_box', the\
+    fields in the main_fields list belong to the place object of the tweet).
+
+    :param list entity_type: The name of the entity: 'hashtags', 'media',\
+    'urls' and 'user_mentions' for the tweet object. For a user object,\
+    this needs to be expressed with a hierarchy: `'user.urls'`. For the\
+    bounding box of the Tweet location, use `'place.bounding_box'`.
+
+    :param list entity_fields: The list of fields to be extracted from the\
+    entity. E.g. `['text']` (of the Tweet)
+
+    :param errors: Behaviour for encoding errors, see\
+    https://docs.python.org/3/library/codecs.html#codec-base-classes
+
+    :param gzip_compress: if `True`, output files are compressed with gzip
+    """
+
+    (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
+    header = get_header_field_list(main_fields, entity_type, entity_fields)
+    writer.writerow(header)
+    for line in tweets_file:
+        tweet = json.loads(line)
+        if _is_composed_key(entity_type):
+            key, value = _get_key_value_composed(entity_type)
+            object_json = _get_entity_recursive(tweet, key)
+            if not object_json:
+                # this can happen in the case of "place"
+                continue
+            object_fields = extract_fields(object_json, main_fields)
+            items = _get_entity_recursive(object_json, value)
+            _write_to_file(object_fields, items, entity_fields, writer)
+        else:
+            tweet_fields = extract_fields(tweet, main_fields)
+            items = _get_entity_recursive(tweet, entity_type)
+            _write_to_file(tweet_fields, items, entity_fields, writer)
+    outf.close()
+
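+# Editorial sketch (invented file name, fabricated tweet): extract one output
+# line per hashtag, keeping the tweet id alongside each hashtag's text.
+def _json2csv_entities_sketch():
+    """Write 'hashtags_sketch.csv' with columns id_str and hashtags.text."""
+    import io
+    fake_tweet = ('{"id_str": "123", "entities": {"hashtags": '
+                  '[{"text": "nlp"}, {"text": "python"}]}}')
+    json2csv_entities(io.StringIO(fake_tweet + '\n'), 'hashtags_sketch.csv',
+                      ['id_str'], 'hashtags', ['text'])
+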
+def get_header_field_list(main_fields, entity_type, entity_fields):
+    if _is_composed_key(entity_type):
+        key, value = _get_key_value_composed(entity_type)
+        main_entity = key
+        sub_entity = value
+    else:
+        main_entity = None
+        sub_entity = entity_type
+
+    if main_entity:
+        output1 = [HIER_SEPARATOR.join([main_entity, x]) for x in main_fields]
+    else:
+        output1 = main_fields
+    output2 = [HIER_SEPARATOR.join([sub_entity, x]) for x in entity_fields]
+    return output1 + output2
+
+def _write_to_file(object_fields, items, entity_fields, writer):
+    if not items:
+        # it could be that the entity is just not present for the tweet
+        # e.g. the 'hashtags' entity is always present, even as [],
+        # whereas 'media' may not be present at all
+        return
+    if isinstance(items, dict):
+        # this happens e.g. for "place" of a tweet
+        row = object_fields
+        # there might be composed keys in the list of required fields
+        entity_field_values = [x for x in entity_fields if not _is_composed_key(x)]
+        entity_field_composed = [x for x in entity_fields if _is_composed_key(x)]
+        for field in entity_field_values:
+            value = items[field]
+            if isinstance(value, list):
+                row += value
+            else:
+                row += [value]
+        # now check required dictionaries
+        for d in entity_field_composed:
+            kd, vd = _get_key_value_composed(d)
+            json_dict = items[kd]
+            if not isinstance(json_dict, dict):
+                raise RuntimeError("""Key {0} does not contain a dictionary
+                in the json file""".format(kd))
+            row += [json_dict[vd]]
+        writer.writerow(row)
+        return
+    # in general it is a list
+    for item in items:
+        row = object_fields + extract_fields(item, entity_fields)
+        writer.writerow(row)
+
diff --git a/nlp_resource_data/nltk/twitter/common.pyc b/nlp_resource_data/nltk/twitter/common.pyc
new file mode 100755 (executable)
index 0000000..7b371e2
Binary files /dev/null and b/nlp_resource_data/nltk/twitter/common.pyc differ
diff --git a/nlp_resource_data/nltk/twitter/twitter_demo.py b/nlp_resource_data/nltk/twitter/twitter_demo.py
new file mode 100755 (executable)
index 0000000..3338587
--- /dev/null
@@ -0,0 +1,288 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Twitter client
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+#         Lorenzo Rubio <lrnzcig@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Examples to demo the :py:mod:`twitterclient` code.
+
+These demo functions should all run, with the following caveats:
+
+* You must have obtained API keys from Twitter, and installed them according to
+  the instructions in the `twitter HOWTO <http://www.nltk.org/howto/twitter.html>`_.
+
+* If you are on a slow network, some of the calls to the Twitter API may
+  timeout.
+
+* If you are being rate limited while searching, you will receive a 420
+  error response.
+
+* Your terminal window / console must be able to display UTF-8 encoded characters.
+
+For documentation about the Twitter APIs, see `The Streaming APIs Overview
+<https://dev.twitter.com/streaming/overview>`_ and `The REST APIs Overview
+<https://dev.twitter.com/rest/public>`_.
+
+For error codes see Twitter's
+`Error Codes and Responses <https://dev.twitter.com/overview/api/response-codes>`
+"""
+from __future__ import print_function
+
+import datetime
+from functools import wraps
+import json
+
+from nltk.compat import StringIO
+
+from nltk.twitter import Query, Streamer, Twitter, TweetViewer, TweetWriter,\
+     credsfromfile
+
+
+SPACER = '###################################'
+
+def verbose(func):
+    """Decorator for demo functions"""
+    @wraps(func)
+    def with_formatting(*args, **kwargs):
+        print()
+        print(SPACER)
+        print("Using %s" % (func.__name__))
+        print(SPACER)
+        return func(*args, **kwargs)
+    return with_formatting
+
+def yesterday():
+    """
+    Get yesterday's datetime as a 5-tuple.
+    """
+    date =  datetime.datetime.now()
+    date -= datetime.timedelta(days=1)
+    date_tuple = date.timetuple()[:6]
+    return date_tuple
+
+def setup():
+    """
+    Initialize global variables for the demos.
+    """
+    global USERIDS, FIELDS
+
+    USERIDS = ['759251', '612473', '15108702', '6017542', '2673523800']
+    # UserIDs corresponding to\
+    #           @CNN,    @BBCNews, @ReutersLive, @BreakingNews, @AJELive
+    FIELDS = ['id_str']
+
+
+@verbose
+def twitterclass_demo():
+    """
+    Use the simplified :class:`Twitter` class to write some tweets to a file.
+    """
+    tw = Twitter()
+    print("Track from the public stream\n")
+    tw.tweets(keywords='love, hate', limit=10) #public stream
+    print(SPACER)
+    print("Search past Tweets\n")
+    tw = Twitter()
+    tw.tweets(keywords='love, hate', stream=False, limit=10) # search past tweets
+    print(SPACER)
+    print("Follow two accounts in the public stream" +
+          " -- be prepared to wait a few minutes\n")
+    tw = Twitter()
+    tw.tweets(follow=['759251', '6017542'], stream=True, limit=5) #public stream
+
+
+@verbose
+def sampletoscreen_demo(limit=20):
+    """
+    Sample from the Streaming API and send output to terminal.
+    """
+    oauth = credsfromfile()
+    client = Streamer(**oauth)
+    client.register(TweetViewer(limit=limit))
+    client.sample()
+
+
+@verbose
+def tracktoscreen_demo(track="taylor swift", limit=10):
+    """
+    Track keywords from the public Streaming API and send output to terminal.
+    """
+    oauth = credsfromfile()
+    client = Streamer(**oauth)
+    client.register(TweetViewer(limit=limit))
+    client.filter(track=track)
+
+
+@verbose
+def search_demo(keywords='nltk'):
+    """
+    Use the REST API to search for past tweets containing a given keyword.
+    """
+    oauth = credsfromfile()
+    client = Query(**oauth)
+    for tweet in client.search_tweets(keywords=keywords, limit=10):
+        print(tweet['text'])
+
+
+@verbose
+def tweets_by_user_demo(user='NLTK_org', count=200):
+    """
+    Use the REST API to search for past tweets by a given user.
+    """
+    oauth = credsfromfile()
+    client = Query(**oauth)
+    client.register(TweetWriter())
+    client.user_tweets(user, count)
+
+
+@verbose
+def lookup_by_userid_demo():
+    """
+    Use the REST API to convert a userID to a screen name.
+    """
+    oauth = credsfromfile()
+    client = Query(**oauth)
+    user_info = client.user_info_from_id(USERIDS)
+    for info in user_info:
+        name = info['screen_name']
+        followers = info['followers_count']
+        following = info['friends_count']
+        print("{0}, followers: {1}, following: {2}".format(name, followers, following))
+
+
+@verbose
+def followtoscreen_demo(limit=10):
+    """
+    Using the Streaming API, select just the tweets from a specified list of
+    userIDs.
+
+    This will only give results in a reasonable time if the users in
+    question produce a high volume of tweets, and even then some delay is likely.
+    """
+    oauth = credsfromfile()
+    client = Streamer(**oauth)
+    client.register(TweetViewer(limit=limit))
+    client.statuses.filter(follow=USERIDS)
+
+
+@verbose
+def streamtofile_demo(limit=20):
+    """
+    Write 20 tweets sampled from the public Streaming API to a file.
+    """
+    oauth = credsfromfile()
+    client = Streamer(**oauth)
+    client.register(TweetWriter(limit=limit, repeat=False))
+    client.statuses.sample()
+
+
+@verbose
+def limit_by_time_demo(keywords="nltk"):
+    """
+    Query the REST API for Tweets about NLTK since yesterday and send
+    the output to terminal.
+
+    This example makes the assumption that there are sufficient Tweets since
+    yesterday for the date to be an effective cut-off.
+    """
+    date = yesterday()
+    dt_date = datetime.datetime(*date)
+    oauth = credsfromfile()
+    client = Query(**oauth)
+    client.register(TweetViewer(limit=100, lower_date_limit=date))
+
+    print("Cutoff date: {}\n".format(dt_date))
+
+    for tweet in client.search_tweets(keywords=keywords):
+        print("{} ".format(tweet['created_at']), end='')
+        client.handler.handle(tweet)
+
+
+@verbose
+def corpusreader_demo():
+    """
+    Use :module:`TwitterCorpusReader` to read a file of tweets, and print out
+
+    * some full tweets in JSON format;
+    * some raw strings from the tweets (i.e., the value of the `text` field); and
+    * the result of tokenising the raw strings.
+
+    """
+    from nltk.corpus import twitter_samples as tweets
+
+    print()
+    print("Complete tweet documents")
+    print(SPACER)
+    for tweet in tweets.docs("tweets.20150430-223406.json")[:1]:
+        print(json.dumps(tweet, indent=1, sort_keys=True))
+
+    print()
+    print("Raw tweet strings:")
+    print(SPACER)
+    for text in tweets.strings("tweets.20150430-223406.json")[:15]:
+        print(text)
+
+    print()
+    print("Tokenized tweet strings:")
+    print(SPACER)
+    for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]:
+        print(toks)
+
+
+@verbose
+def expand_tweetids_demo():
+    """
+    Given a file object containing a list of Tweet IDs, fetch the
+    corresponding full Tweets, if available.
+
+    """
+    ids_f =\
+        StringIO("""\
+        588665495492124672
+        588665495487909888
+        588665495508766721
+        588665495513006080
+        588665495517200384
+        588665495487811584
+        588665495525588992
+        588665495487844352
+        588665495492014081
+        588665495512948737""")
+    oauth = credsfromfile()
+    client = Query(**oauth)
+    hydrated = client.expand_tweetids(ids_f)
+
+    for tweet in hydrated:
+            id_str = tweet['id_str']
+            print('id: {}'.format(id_str))
+            text = tweet['text']
+            if text.startswith('@null'):
+                text = "[Tweet not available]"
+            print(text + '\n')
+
+
+
+ALL = [twitterclass_demo, sampletoscreen_demo, tracktoscreen_demo,
+       search_demo, tweets_by_user_demo, lookup_by_userid_demo, followtoscreen_demo,
+       streamtofile_demo, limit_by_time_demo, corpusreader_demo, expand_tweetids_demo]
+
+"""
+Select demo functions to run. E.g. replace the following line with "DEMOS =
+ALL[8:]" to execute only the final three demos.
+"""
+DEMOS = ALL[:]
+
+if __name__ == "__main__":
+    setup()
+
+    for demo in DEMOS:
+        demo()
+
+    print("\n" + SPACER)
+    print("All demos completed")
+    print(SPACER)
+
diff --git a/nlp_resource_data/nltk/twitter/twitter_demo.pyc b/nlp_resource_data/nltk/twitter/twitter_demo.pyc
new file mode 100755 (executable)
index 0000000..c79b286
Binary files /dev/null and b/nlp_resource_data/nltk/twitter/twitter_demo.pyc differ
diff --git a/nlp_resource_data/nltk/twitter/twitterclient.py b/nlp_resource_data/nltk/twitter/twitterclient.py
new file mode 100755 (executable)
index 0000000..bd6197f
--- /dev/null
@@ -0,0 +1,530 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Twitter client
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+#         Lorenzo Rubio <lrnzcig@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+
+"""
+NLTK Twitter client
+
+This module offers methods for collecting and processing Tweets. Most of the
+functionality depends on access to the Twitter APIs, and this is handled via
+the third party Twython library.
+
+If one of the methods below returns an integer, it is probably a `Twitter
+error code <https://dev.twitter.com/overview/api/response-codes>`_. For
+example, the response of '420' means that you have reached the limit of the
+requests you can currently make to the Twitter API. Currently, `rate limits
+for the search API <https://dev.twitter.com/rest/public/rate-limiting>`_ are
+divided into 15 minute windows.
+"""
+
+import datetime
+import itertools
+import json
+import os
+import requests
+import time
+import gzip
+
+
+from twython import Twython, TwythonStreamer
+from twython.exceptions import TwythonRateLimitError, TwythonError
+
+from nltk.twitter.util import credsfromfile, guess_path
+from nltk.twitter.api import TweetHandlerI, BasicTweetHandler
+
+
+
+class Streamer(TwythonStreamer):
+    """
+    Retrieve data from the Twitter Streaming API.
+
+    The streaming API requires
+    `OAuth 1.0 <http://en.wikipedia.org/wiki/OAuth>`_ authentication.
+    """
+    def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret):
+
+        self.handler = None
+        self.do_continue = True
+        TwythonStreamer.__init__(self, app_key, app_secret, oauth_token,
+                                 oauth_token_secret)
+
+    def register(self, handler):
+        """
+        Register a method for handling Tweets.
+
+        :param TweetHandlerI handler: method for viewing
+        """
+        self.handler = handler
+
+    def on_success(self, data):
+        """
+        :param data: response from Twitter API
+        """
+        if self.do_continue:
+            if self.handler is not None:
+                if 'text' in data:
+                    self.handler.counter += 1
+                    self.handler.handle(data)
+                    self.do_continue = self.handler.do_continue()
+            else:
+                raise ValueError("No data handler has been registered.")
+        else:
+            self.disconnect()
+            self.handler.on_finish()
+
+
+    def on_error(self, status_code, data):
+        """
+        :param status_code: The status code returned by the Twitter API
+        :param data: The response from Twitter API
+
+        """
+        print(status_code)
+
+    def sample(self):
+        """
+        Wrapper for 'statuses / sample' API call
+        """
+        while self.do_continue:
+
+            # Stream in an endless loop until limit is reached. See twython
+            # issue 288: https://github.com/ryanmcgrath/twython/issues/288
+            # colditzjb commented on 9 Dec 2014
+
+            try:
+                self.statuses.sample()
+            except requests.exceptions.ChunkedEncodingError as e:
+                if e is not None:
+                    print("Error (stream will continue): {0}".format(e))
+                continue
+
+    def filter(self, track='', follow='', lang='en'):
+        """
+        Wrapper for 'statuses / filter' API call
+        """
+        while self.do_continue:
+            #Stream in an endless loop until limit is reached
+
+            try:
+                if track == '' and follow == '':
+                    msg = "Please supply a value for 'track', 'follow'"
+                    raise ValueError(msg)
+                self.statuses.filter(track=track, follow=follow, lang=lang)
+            except requests.exceptions.ChunkedEncodingError as e:
+                if e is not None:
+                    print("Error (stream will continue): {0}".format(e))
+                continue
+
+
+class Query(Twython):
+    """
+    Retrieve data from the Twitter REST API.
+    """
+    def __init__(self, app_key, app_secret, oauth_token,
+                 oauth_token_secret):
+        self.handler = None
+        self.do_continue = True
+        Twython.__init__(self, app_key, app_secret, oauth_token, oauth_token_secret)
+
+    def register(self, handler):
+        """
+        Register a method for handling Tweets.
+
+        :param TweetHandlerI handler: method for viewing or writing Tweets to a file.
+        """
+        self.handler = handler
+
+    def expand_tweetids(self, ids_f, verbose=True):
+        """
+        Given a file object containing a list of Tweet IDs, fetch the
+        corresponding full Tweets from the Twitter API.
+
+        The API call `statuses/lookup` will fail to retrieve a Tweet if the
+        user has deleted it.
+
+        This call to the Twitter API is rate-limited. See
+        <https://dev.twitter.com/rest/reference/get/statuses/lookup> for details.
+
+        :param ids_f: input file object consisting of Tweet IDs, one to a line
+        :return: iterable of Tweet objects in JSON format
+        """
+        ids = [line.strip() for line in ids_f if line]
+
+        if verbose:
+            print("Counted {0} Tweet IDs in {1}.".format(len(ids), ids_f))
+
+        # The Twitter endpoint takes lists of up to 100 ids, so we chunk the
+        # ids.
+        id_chunks = [ids[i:i+100] for i in range(0, len(ids), 100)]
+
+        chunked_tweets = (self.lookup_status(id=chunk) for chunk in
+                          id_chunks)
+
+        return itertools.chain.from_iterable(chunked_tweets)
+
+
+
+    def _search_tweets(self, keywords, limit=100, lang='en'):
+        """
+        Assumes that a handler has been registered. Fetches Tweets from
+        search_tweets generator output and passes them to the handler
+
+        :param str keywords: A list of query terms to search for, written as\
+        a comma-separated string.
+        :param int limit: Number of Tweets to process
+        :param str lang: language
+        """
+        while True:
+            tweets = self.search_tweets(keywords=keywords, limit=limit, lang=lang,
+                                        max_id=self.handler.max_id)
+            for tweet in tweets:
+                self.handler.handle(tweet)
+            if not (self.handler.do_continue() and self.handler.repeat):
+                break
+        self.handler.on_finish()
+
+    def search_tweets(self, keywords, limit=100, lang='en', max_id=None,
+                      retries_after_twython_exception=0):
+        """
+        Call the REST API ``'search/tweets'`` endpoint with some plausible
+        defaults. See `the Twitter search documentation
+        <https://dev.twitter.com/rest/public/search>`_ for more information
+        about admissible search parameters.
+
+        :param str keywords: A list of query terms to search for, written as\
+        a comma-separated string
+        :param int limit: Number of Tweets to process
+        :param str lang: language
+        :param int max_id: id of the last tweet fetched
+        :param int retries_after_twython_exception: number of retries when\
+        searching Tweets before raising an exception
+        :rtype: python generator
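+
+        Example (illustrative only; assumes valid credentials loaded with
+        `credsfromfile()` and requires network access)::
+
+            oauth = credsfromfile()
+            client = Query(**oauth)
+            for tweet in client.search_tweets(keywords='nltk', limit=10):
+                print(tweet['text'])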
+        """
+        if not self.handler:
+            # if no handler is provided, `BasicTweetHandler` provides minimum
+            # functionality for limiting the number of Tweets retrieved
+            self.handler = BasicTweetHandler(limit=limit)
+
+        count_from_query = 0
+        if max_id:
+            self.handler.max_id = max_id
+        else:
+            results = self.search(q=keywords, count=min(100, limit), lang=lang,
+                                  result_type='recent')
+            count = len(results['statuses'])
+            if count == 0:
+                print("No Tweets available through REST API for those keywords")
+                return
+            count_from_query = count
+            self.handler.max_id = results['statuses'][count - 1]['id'] - 1
+
+            for result in results['statuses']:
+                yield result
+                self.handler.counter += 1
+                if not self.handler.do_continue():
+                    return
+
+
+        # Pagination loop: keep fetching Tweets until the desired count is
+        # reached while dealing with Twitter rate limits.
+        retries = 0
+        while count_from_query < limit:
+            try:
+                mcount = min(100, limit-count_from_query)
+                results = self.search(q=keywords, count=mcount, lang=lang,
+                                      max_id=self.handler.max_id, result_type='recent')
+            except TwythonRateLimitError as e:
+                print("Waiting for 15 minutes -{0}".format(e))
+                time.sleep(15*60) # wait 15 minutes
+                continue
+            except TwythonError as e:
+                print("Fatal error in Twython request - {0}".format(e))
+                if retries_after_twython_exception == retries:
+                    raise e
+                retries += 1
+                # Retry the request rather than falling through with stale results
+                continue
+
+            count = len(results['statuses'])
+            if count == 0:
+                print("No more Tweets available through rest api")
+                return
+            count_from_query += count
+            # the max_id is also present in the Tweet metadata
+            # results['search_metadata']['next_results'], but as part of a
+            # query and difficult to fetch. This is doing the equivalent
+            # (last tweet id minus one)
+            self.handler.max_id = results['statuses'][count - 1]['id'] - 1
+
+            for result in results['statuses']:
+                yield result
+                self.handler.counter += 1
+                if not self.handler.do_continue():
+                    return
+
+    def user_info_from_id(self, userids):
+        """
+        Convert a list of userIDs into a variety of information about the users.
+
+        See <https://dev.twitter.com/rest/reference/get/users/show>.
+
+        :param list userids: A list of integer strings corresponding to Twitter userIDs
+        :rtype: list(json)
+        """
+        return [self.show_user(user_id=userid) for userid in userids]
+
+    def user_tweets(self, screen_name, limit, include_rts='false'):
+        """
+        Return a collection of the most recent Tweets posted by the user
+
+        :param str screen_name: The user's screen name; the initial '@' symbol\
+        should be omitted
+        :param int limit: The number of Tweets to recover; 200 is the maximum allowed
+        :param str include_rts: Whether to include statuses which have been\
+        retweeted by the user; possible values are 'true' and 'false'
+        """
+        data = self.get_user_timeline(screen_name=screen_name, count=limit,
+                                      include_rts=include_rts)
+        for item in data:
+            self.handler.handle(item)
+
+
+class Twitter(object):
+    """
+    Wrapper class with restricted functionality and fewer options.
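+
+    Example (illustrative only; assumes a valid credentials file is reachable
+    via the TWITTER environment variable and that network access is available)::
+
+        tw = Twitter()
+        tw.tweets(keywords='nltk', limit=10)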
+    """
+    def __init__(self):
+        self._oauth = credsfromfile()
+        self.streamer = Streamer(**self._oauth)
+        self.query = Query(**self._oauth)
+
+
+    def tweets(self, keywords='', follow='', to_screen=True, stream=True,
+               limit=100, date_limit=None, lang='en', repeat=False,
+               gzip_compress=False):
+        """
+        Process some Tweets in a simple manner.
+
+        :param str keywords: Keywords to use for searching or filtering
+        :param list follow: UserIDs to use for filtering Tweets from the public stream
+        :param bool to_screen: If `True`, display the Tweet texts on the screen,\
+        otherwise write them to a file
+
+        :param bool stream: If `True`, use the live public stream,\
+        otherwise search past public Tweets
+
+        :param int limit: The number of data items to process in the current\
+        round of processing.
+
+        :param tuple date_limit: The date at which to stop collecting\
+        new data. This should be entered as a tuple which can serve as the\
+        argument to `datetime.datetime`.\
+        E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:40 pm on April 1 2015.
+        Note that, in the case of streaming, this is the maximum date, i.e.\
+        a date in the future; if not, it is the minimum date, i.e. a date\
+        in the past
+
+        :param str lang: language
+
+        :param bool repeat: A flag to determine whether multiple files should\
+        be written. If `True`, the length of each file will be set by the\
+        value of `limit`. Use only if `to_screen` is `False`. See also
+        :py:func:`handle`.
+
+        :param gzip_compress: if `True`, output files are compressed with gzip.
+        """
+        if stream:
+            upper_date_limit = date_limit
+            lower_date_limit = None
+        else:
+            upper_date_limit = None
+            lower_date_limit = date_limit
+
+        if to_screen:
+            handler = TweetViewer(limit=limit,
+                                  upper_date_limit=upper_date_limit,
+                                  lower_date_limit=lower_date_limit)
+        else:
+            handler = TweetWriter(limit=limit,
+                                  upper_date_limit=upper_date_limit,
+                                  lower_date_limit=lower_date_limit, repeat=repeat,
+                                  gzip_compress=gzip_compress)
+
+        if stream:
+            self.streamer.register(handler)
+            if keywords == '' and follow == '':
+                self.streamer.sample()
+            else:
+                self.streamer.filter(track=keywords, follow=follow, lang=lang)
+        else:
+            self.query.register(handler)
+            if keywords == '':
+                raise ValueError("Please supply at least one keyword to search for.")
+            else:
+                self.query._search_tweets(keywords, limit=limit, lang=lang)
+
+
+class TweetViewer(TweetHandlerI):
+    """
+    Handle data by sending it to the terminal.
+    """
+
+    def handle(self, data):
+        """
+        Direct the text of a Tweet to `sys.stdout`.
+
+        :param data: Tweet object returned by the Twitter API
+        """
+        text = data['text']
+        print(text)
+
+        self.check_date_limit(data)
+        if self.do_stop:
+            return
+
+    def on_finish(self):
+        print('Viewed {0} Tweets'.format(self.counter))
+
+
+class TweetWriter(TweetHandlerI):
+    """
+    Handle data by writing it to a file.
+    """
+    def __init__(self, limit=2000, upper_date_limit=None, lower_date_limit=None,
+                 fprefix='tweets', subdir='twitter-files', repeat=False,
+                 gzip_compress=False):
+        """
+        The difference between the upper and lower date limits depends on
+        whether Tweets are coming in an ascending date order (i.e. when
+        streaming) or descending date order (i.e. when searching past Tweets).
+
+        :param int limit: number of data items to process in the current\
+        round of processing.
+
+        :param tuple upper_date_limit: The date at which to stop collecting new\
+        data. This should be entered as a tuple which can serve as the\
+        argument to `datetime.datetime`. E.g. `upper_date_limit=(2015, 4, 1, 12,\
+        40)` for 12:40 pm on April 1 2015.
+
+        :param tuple lower_date_limit: The date at which to stop collecting new\
+        data. See `upper_date_limit` for formatting.
+
+        :param str fprefix: The prefix to use in creating file names for Tweet\
+        collections.
+
+        :param str subdir: The name of the directory where Tweet collection\
+        files should be stored.
+
+        :param bool repeat: flag to determine whether multiple files should be\
+        written. If `True`, the length of each file will be set by the value\
+        of `limit`. See also :py:func:`handle`.
+
+        :param gzip_compress: if `True`, output files are compressed with gzip.
+        """
+        self.fprefix = fprefix
+        self.subdir = guess_path(subdir)
+        self.gzip_compress = gzip_compress
+        self.fname = self.timestamped_file()
+        self.repeat = repeat
+        self.output = None
+        TweetHandlerI.__init__(self, limit, upper_date_limit, lower_date_limit)
+
+
+    def timestamped_file(self):
+        """
+        :return: timestamped file name
+        :rtype: str
+        """
+        subdir = self.subdir
+        fprefix = self.fprefix
+        if subdir:
+            if not os.path.exists(subdir):
+                os.mkdir(subdir)
+
+        fname = os.path.join(subdir, fprefix)
+        fmt = '%Y%m%d-%H%M%S'
+        timestamp = datetime.datetime.now().strftime(fmt)
+        if self.gzip_compress:
+            suffix = '.gz'
+        else:
+            suffix = ''
+        outfile = '{0}.{1}.json{2}'.format(fname, timestamp, suffix)
+        return outfile
+
+
+    def handle(self, data):
+        """
+        Write Twitter data as line-delimited JSON into one or more files.
+
+        :param data: Tweet object returned by the Twitter API
+        """
+        if self.startingup:
+            if self.gzip_compress:
+                self.output = gzip.open(self.fname, 'w')
+            else:
+                self.output = open(self.fname, 'w')
+            print('Writing to {0}'.format(self.fname))
+
+        json_data = json.dumps(data)
+        if self.gzip_compress:
+            self.output.write((json_data + "\n").encode('utf-8'))
+        else:
+            self.output.write(json_data + "\n")
+
+        self.check_date_limit(data)
+        if self.do_stop:
+            return
+
+        self.startingup = False
+
+    def on_finish(self):
+        print('Written {0} Tweets'.format(self.counter))
+        if self.output:
+            self.output.close()
+
+    def do_continue(self):
+        if not self.repeat:
+            return TweetHandlerI.do_continue(self)
+
+        if self.do_stop:
+            # stop for a functional cause (e.g. date limit)
+            return False
+
+        if self.counter == self.limit:
+            # repeat is True, thus close output file and
+            # create a new one
+            self._restart_file()
+        return True
+
+
+    def _restart_file(self):
+        self.on_finish()
+        self.fname = self.timestamped_file()
+        self.startingup = True
+        self.counter = 0
diff --git a/nlp_resource_data/nltk/twitter/twitterclient.pyc b/nlp_resource_data/nltk/twitter/twitterclient.pyc
new file mode 100755 (executable)
index 0000000..1ea2733
Binary files /dev/null and b/nlp_resource_data/nltk/twitter/twitterclient.pyc differ
diff --git a/nlp_resource_data/nltk/twitter/util.py b/nlp_resource_data/nltk/twitter/util.py
new file mode 100755 (executable)
index 0000000..16b1507
--- /dev/null
@@ -0,0 +1,146 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Twitter client
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+#         Lorenzo Rubio <lrnzcig@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Authentication utilities to accompany :module:`twitterclient`.
+"""
+
+from __future__ import print_function
+
+import os
+import pprint
+from twython import Twython
+
+def credsfromfile(creds_file=None, subdir=None, verbose=False):
+    """
+    Convenience function for loading Twitter OAuth credentials from a file.
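+
+    Example (illustrative only; assumes the TWITTER environment variable
+    points at a directory containing a 'credentials.txt' file)::
+
+        oauth = credsfromfile()
+        client = Twython(**oauth)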
+    """
+    return Authenticate().load_creds(creds_file=creds_file, subdir=subdir, verbose=verbose)
+
+
+class Authenticate(object):
+    """
+    Methods for authenticating with Twitter.
+    """
+    def __init__(self):
+        self.creds_file = 'credentials.txt'
+        self.creds_fullpath = None
+
+        self.oauth = {}
+        try:
+            self.twitter_dir = os.environ['TWITTER']
+            self.creds_subdir = self.twitter_dir
+        except KeyError:
+            self.twitter_dir = None
+            self.creds_subdir = None
+
+
+    def load_creds(self, creds_file=None, subdir=None, verbose=False):
+        """
+        Read OAuth credentials from a text file.
+
+        ::
+           File format for OAuth 1
+           =======================
+           app_key=YOUR_APP_KEY
+           app_secret=YOUR_APP_SECRET
+           oauth_token=OAUTH_TOKEN
+           oauth_token_secret=OAUTH_TOKEN_SECRET
+
+
+        ::
+           File format for OAuth 2
+           =======================
+
+           app_key=YOUR_APP_KEY
+           app_secret=YOUR_APP_SECRET
+           access_token=ACCESS_TOKEN
+
+        :param str creds_file: File containing credentials. ``None`` (default) reads\
+        data from `TWITTER/'credentials.txt'`
+        :param str subdir: Directory containing the credentials file. ``None``\
+        (default) uses the directory given by the ``TWITTER`` environment variable.
+        :param bool verbose: if `True`, print the path of the credentials file as it is read
+        """
+        if creds_file is not None:
+            self.creds_file = creds_file
+
+        if subdir is None:
+            if self.creds_subdir is None:
+                msg = "Supply a value to the 'subdir' parameter or" +\
+                      " set the TWITTER environment variable."
+                raise ValueError(msg)
+        else:
+            self.creds_subdir = subdir
+
+        self.creds_fullpath =\
+            os.path.normpath(os.path.join(self.creds_subdir, self.creds_file))
+
+        if not os.path.isfile(self.creds_fullpath):
+            raise OSError('Cannot find file {}'.format(self.creds_fullpath))
+
+        with open(self.creds_fullpath) as infile:
+            if verbose:
+                print('Reading credentials file {}'.format(self.creds_fullpath))
+
+            for line in infile:
+                if '=' in line:
+                    name, value = line.split('=', 1)
+                    self.oauth[name.strip()] = value.strip()
+
+        self._validate_creds_file(verbose=verbose)
+
+        return self.oauth
+
+    def _validate_creds_file(self, verbose=False):
+        """Check validity of a credentials file."""
+        oauth1 = False
+        oauth1_keys = ['app_key', 'app_secret', 'oauth_token', 'oauth_token_secret']
+        oauth2 = False
+        oauth2_keys = ['app_key', 'app_secret', 'access_token']
+        if all(k in self.oauth for k in oauth1_keys):
+            oauth1 = True
+        elif all(k in self.oauth for k in oauth2_keys):
+            oauth2 = True
+
+        if not (oauth1 or oauth2):
+            msg = 'Missing or incorrect entries in {}\n'.format(self.creds_file)
+            msg += pprint.pformat(self.oauth)
+            raise ValueError(msg)
+        elif verbose:
+            print('Credentials file "{}" looks good'.format(self.creds_file))
+
+
+def add_access_token(creds_file=None):
+    """
+    For OAuth 2, retrieve an access token for an app and append it to a
+    credentials file.
+    """
+    if creds_file is None:
+        path = os.path.dirname(__file__)
+        creds_file = os.path.join(path, 'credentials2.txt')
+    oauth2 = credsfromfile(creds_file=creds_file)
+    app_key = oauth2['app_key']
+    app_secret = oauth2['app_secret']
+
+    twitter = Twython(app_key, app_secret, oauth_version=2)
+    access_token = twitter.obtain_access_token()
+    tok = 'access_token={}\n'.format(access_token)
+    with open(creds_file, 'a') as outfile:
+        print(tok, file=outfile)
+
+
+def guess_path(pth):
+    """
+    If the path is not absolute, guess that it is a subdirectory of the
+    user's home directory.
+
+    :param str pth: The pathname of the directory where files of tweets should be written
+    """
+    if os.path.isabs(pth):
+        return pth
+    else:
+        return os.path.expanduser(os.path.join("~", pth))
diff --git a/nlp_resource_data/nltk/twitter/util.pyc b/nlp_resource_data/nltk/twitter/util.pyc
new file mode 100755 (executable)
index 0000000..bda337c
Binary files /dev/null and b/nlp_resource_data/nltk/twitter/util.pyc differ
diff --git a/nlp_resource_data/nltk/util.py b/nlp_resource_data/nltk/util.py
new file mode 100755 (executable)
index 0000000..2dcb782
--- /dev/null
@@ -0,0 +1,751 @@
+# Natural Language Toolkit: Utility functions
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function
+
+import sys
+import inspect
+import locale
+import re
+import types
+import textwrap
+import pydoc
+import bisect
+import os
+
+from itertools import islice, chain, combinations
+from pprint import pprint
+from collections import defaultdict, deque
+from sys import version_info
+
+from six import class_types, string_types, text_type
+from six.moves.urllib.request import (build_opener, install_opener, getproxies,
+                                      ProxyHandler, ProxyBasicAuthHandler,
+                                      ProxyDigestAuthHandler,
+                                      HTTPPasswordMgrWithDefaultRealm)
+
+from nltk.internals import slice_bounds, raise_unorderable_types
+from nltk.collections import *
+from nltk.compat import python_2_unicode_compatible
+
+
+
+######################################################################
+# Short usage message
+######################################################################
+
+def usage(obj, selfname='self'):
+    str(obj) # In case it's lazy, this will load it.
+
+    if not isinstance(obj, class_types):
+        obj = obj.__class__
+
+    print('%s supports the following operations:' % obj.__name__)
+    for (name, method) in sorted(pydoc.allmethods(obj).items()):
+        if name.startswith('_'): continue
+        if getattr(method, '__deprecated__', False): continue
+
+        if sys.version_info[0] >= 3:
+            getargspec = inspect.getfullargspec
+        else:
+            getargspec = inspect.getargspec
+        args, varargs, varkw, defaults = getargspec(method)[:4]
+        if (args and args[0]=='self' and
+            (defaults is None or len(args)>len(defaults))):
+            args = args[1:]
+            name = '%s.%s' % (selfname, name)
+        argspec = inspect.formatargspec(
+            args, varargs, varkw, defaults)
+        print(textwrap.fill('%s%s' % (name, argspec),
+                            initial_indent='  - ',
+                            subsequent_indent=' '*(len(name)+5)))
+
+##########################################################################
+# IDLE
+##########################################################################
+
+def in_idle():
+    """
+    Return True if this function is run within idle.  Tkinter
+    programs that are run in idle should never call ``Tk.mainloop``; so
+    this function should be used to gate all calls to ``Tk.mainloop``.
+
+    :warning: This function works by checking ``sys.stdin``.  If the
+        user has modified ``sys.stdin``, then it may return incorrect
+        results.
+    :rtype: bool
+    """
+    import sys
+    return sys.stdin.__class__.__name__ in ('PyShell', 'RPCProxy')
+
+##########################################################################
+# PRETTY PRINTING
+##########################################################################
+
+def pr(data, start=0, end=None):
+    """
+    Pretty print a sequence of data items
+
+    :param data: the data stream to print
+    :type data: sequence or iter
+    :param start: the start position
+    :type start: int
+    :param end: the end position
+    :type end: int
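+
+    A minimal doctest sketch:
+
+        >>> pr(['a', 'b', 'c', 'd'], start=1, end=3)
+        ['b', 'c']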
+    """
+    pprint(list(islice(data, start, end)))
+
+def print_string(s, width=70):
+    """
+    Pretty print a string, breaking lines on whitespace
+
+    :param s: the string to print, consisting of words and spaces
+    :type s: str
+    :param width: the display width
+    :type width: int
+    """
+    print('\n'.join(textwrap.wrap(s, width=width)))
+
+def tokenwrap(tokens, separator=" ", width=70):
+    """
+    Pretty print a list of text tokens, breaking lines on whitespace
+
+    :param tokens: the tokens to print
+    :type tokens: list
+    :param separator: the string to use to separate tokens
+    :type separator: str
+    :param width: the display width (default=70)
+    :type width: int
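+
+    A minimal doctest sketch:
+
+        >>> tokenwrap(['a', 'b', 'c'], separator='-')
+        'a-b-c'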
+    """
+    return '\n'.join(textwrap.wrap(separator.join(tokens), width=width))
+
+
+##########################################################################
+# Python version
+##########################################################################
+
+def py25():
+    return version_info[0] == 2 and version_info[1] == 5
+def py26():
+    return version_info[0] == 2 and version_info[1] == 6
+def py27():
+    return version_info[0] == 2 and version_info[1] == 7
+
+
+##########################################################################
+# Indexing
+##########################################################################
+
+class Index(defaultdict):
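+    """
+    A ``defaultdict(list)`` built from an iterable of (key, value) pairs,
+    grouping together all values that share a key.  A minimal doctest sketch:
+
+        >>> idx = Index([('vowel', 'a'), ('vowel', 'e'), ('consonant', 'b')])
+        >>> idx['vowel']
+        ['a', 'e']
+    """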
+
+    def __init__(self, pairs):
+        defaultdict.__init__(self, list)
+        for key, value in pairs:
+            self[key].append(value)
+
+
+######################################################################
+## Regexp display (thanks to David Mertz)
+######################################################################
+
+def re_show(regexp, string, left="{", right="}"):
+    """
+    Print a version of ``string`` with markers surrounding the matched
+    substrings.  Searches ``string`` for substrings matching ``regexp`` and
+    wraps the matches with braces.  This is convenient for learning about
+    regular expressions.
+
+    :param regexp: The regular expression.
+    :type regexp: str
+    :param string: The string being matched.
+    :type string: str
+    :param left: The left delimiter (printed before the matched substring)
+    :type left: str
+    :param right: The right delimiter (printed after the matched substring)
+    :type right: str
+    :rtype: None
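+
+    A minimal doctest sketch:
+
+        >>> re_show('[a-z]+', 'a1b2c3')
+        {a}1{b}2{c}3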
+    """
+    print(re.compile(regexp, re.M).sub(left + r"\g<0>" + right, string.rstrip()))
+
+
+##########################################################################
+# READ FROM FILE OR STRING
+##########################################################################
+
+# recipe from David Mertz
+def filestring(f):
+    if hasattr(f, 'read'):
+        return f.read()
+    elif isinstance(f, string_types):
+        with open(f, 'r') as infile:
+            return infile.read()
+    else:
+        raise ValueError("Must be called with a filename or file-like object")
+
+##########################################################################
+# Breadth-First Search
+##########################################################################
+
+def breadth_first(tree, children=iter, maxdepth=-1):
+    """Traverse the nodes of a tree in breadth-first order.
+    (No need to check for cycles.)
+    The first argument should be the tree root;
+    children should be a function taking as argument a tree node
+    and returning an iterator of the node's children.
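+
+    A minimal doctest sketch using nested lists (with the default ``children``
+    function, leaf integers raise TypeError and so end that branch):
+
+        >>> list(breadth_first([[1, 2], [3]]))
+        [[[1, 2], [3]], [1, 2], [3], 1, 2, 3]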
+    """
+    queue = deque([(tree, 0)])
+
+    while queue:
+        node, depth = queue.popleft()
+        yield node
+
+        if depth != maxdepth:
+            try:
+                queue.extend((c, depth + 1) for c in children(node))
+            except TypeError:
+                pass
+
+##########################################################################
+# Guess Character Encoding
+##########################################################################
+
+# adapted from io.py in the docutils extension module (http://docutils.sourceforge.net)
+# http://www.pyzine.com/Issue008/Section_Articles/article_Encodings.html
+
+def guess_encoding(data):
+    """
+    Given a byte string, attempt to decode it.
+    Tries the standard 'UTF8' and 'latin-1' encodings,
+    plus several gathered from locale information.
+
+    The calling program *must* first call::
+
+        locale.setlocale(locale.LC_ALL, '')
+
+    If successful it returns ``(decoded_unicode, successful_encoding)``.
+    If unsuccessful it raises a ``UnicodeError``.
+    """
+    successful_encoding = None
+    # we make 'utf-8' the first encoding
+    encodings = ['utf-8']
+    #
+    # next we add anything we can learn from the locale
+    try:
+        encodings.append(locale.nl_langinfo(locale.CODESET))
+    except AttributeError:
+        pass
+    try:
+        encodings.append(locale.getlocale()[1])
+    except (AttributeError, IndexError):
+        pass
+    try:
+        encodings.append(locale.getdefaultlocale()[1])
+    except (AttributeError, IndexError):
+        pass
+    #
+    # we try 'latin-1' last
+    encodings.append('latin-1')
+    for enc in encodings:
+        # some of the locale calls
+        # may have returned None
+        if not enc:
+            continue
+        try:
+            decoded = text_type(data, enc)
+            successful_encoding = enc
+
+        except (UnicodeError, LookupError):
+            pass
+        else:
+            break
+    if not successful_encoding:
+        raise UnicodeError(
+            'Unable to decode input data.  Tried the following encodings: %s.'
+            % ', '.join([repr(enc) for enc in encodings if enc]))
+    else:
+        return (decoded, successful_encoding)
+
+
+##########################################################################
+# Remove repeated elements from a list deterministically
+##########################################################################
+
+def unique_list(xs):
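+    """
+    Remove repeated elements from ``xs`` while preserving the order of first
+    occurrence.  A minimal doctest sketch:
+
+        >>> unique_list([1, 2, 1, 3, 2])
+        [1, 2, 3]
+    """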
+    seen = set()
+    # seen.add(x) always returns None, so `not seen.add(x)` is always True;
+    # it is used here only for its side effect of recording x in `seen`.
+    return [x for x in xs if x not in seen and not seen.add(x)]
+
+##########################################################################
+# Invert a dictionary
+##########################################################################
+
+def invert_dict(d):
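+    """
+    Invert a dictionary, mapping each value of ``d`` to a list of the keys
+    under which it appears (non-iterable values are mapped back to their key
+    directly, not wrapped in a list).  A minimal doctest sketch:
+
+        >>> sorted(invert_dict({'a': [1, 2]}).items())
+        [(1, ['a']), (2, ['a'])]
+    """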
+    inverted_dict = defaultdict(list)
+    for key in d:
+        if hasattr(d[key], '__iter__'):
+            for term in d[key]:
+                inverted_dict[term].append(key)
+        else:
+            inverted_dict[d[key]] = key
+    return inverted_dict
+
+
+##########################################################################
+# Utilities for directed graphs: transitive closure, and inversion
+# The graph is represented as a dictionary of sets
+##########################################################################
+
+def transitive_closure(graph, reflexive=False):
+    """
+    Calculate the transitive closure of a directed graph,
+    optionally the reflexive transitive closure.
+
+    The algorithm is a slight modification of the "Marking Algorithm" of
+    Ioannidis & Ramakrishnan (1998) "Efficient Transitive Closure Algorithms".
+
+    :param graph: the initial graph, represented as a dictionary of sets
+    :type graph: dict(set)
+    :param reflexive: if set, also make the closure reflexive
+    :type reflexive: bool
+    :rtype: dict(set)
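+
+    A minimal doctest sketch (sorting to keep the output deterministic):
+
+        >>> tc = transitive_closure({1: {2}, 2: {3}, 3: set()})
+        >>> sorted(tc[1])
+        [2, 3]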
+    """
+    if reflexive:
+        base_set = lambda k: set([k])
+    else:
+        base_set = lambda k: set()
+    # The graph U_i in the article:
+    agenda_graph = dict((k, graph[k].copy()) for k in graph)
+    # The graph M_i in the article:
+    closure_graph = dict((k, base_set(k)) for k in graph)
+    for i in graph:
+        agenda = agenda_graph[i]
+        closure = closure_graph[i]
+        while agenda:
+            j = agenda.pop()
+            closure.add(j)
+            closure |= closure_graph.setdefault(j, base_set(j))
+            agenda |= agenda_graph.get(j, base_set(j))
+            agenda -= closure
+    return closure_graph
+
+
+def invert_graph(graph):
+    """
+    Inverts a directed graph.
+
+    :param graph: the graph, represented as a dictionary of sets
+    :type graph: dict(set)
+    :return: the inverted graph
+    :rtype: dict(set)
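+
+    A minimal doctest sketch (sorting to keep the output deterministic):
+
+        >>> inv = invert_graph({1: {2, 3}, 2: {3}})
+        >>> sorted(inv[3])
+        [1, 2]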
+    """
+    inverted = {}
+    for key in graph:
+        for value in graph[key]:
+            inverted.setdefault(value, set()).add(key)
+    return inverted
+
+
+
+##########################################################################
+# HTML Cleaning
+##########################################################################
+
+def clean_html(html):
+    raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")
+
+def clean_url(url):
+    raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")
+
+##########################################################################
+# FLATTEN LISTS
+##########################################################################
+
+def flatten(*args):
+    """
+    Flatten a list.
+
+        >>> from nltk.util import flatten
+        >>> flatten(1, 2, ['b', 'a' , ['c', 'd']], 3)
+        [1, 2, 'b', 'a', 'c', 'd', 3]
+
+    :param args: items and lists to be combined into a single list
+    :rtype: list
+    """
+
+    x = []
+    for l in args:
+        if not isinstance(l, (list, tuple)): l = [l]
+        for item in l:
+            if isinstance(item, (list, tuple)):
+                x.extend(flatten(item))
+            else:
+                x.append(item)
+    return x
+
+##########################################################################
+# Ngram iteration
+##########################################################################
+
+def pad_sequence(sequence, n, pad_left=False, pad_right=False,
+                 left_pad_symbol=None, right_pad_symbol=None):
+    """
+    Returns a padded sequence of items before ngram extraction.
+
+        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
+        ['<s>', 1, 2, 3, 4, 5, '</s>']
+        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
+        ['<s>', 1, 2, 3, 4, 5]
+        >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
+        [1, 2, 3, 4, 5, '</s>']
+
+    :param sequence: the source data to be padded
+    :type sequence: sequence or iter
+    :param n: the degree of the ngrams
+    :type n: int
+    :param pad_left: whether the ngrams should be left-padded
+    :type pad_left: bool
+    :param pad_right: whether the ngrams should be right-padded
+    :type pad_right: bool
+    :param left_pad_symbol: the symbol to use for left padding (default is None)
+    :type left_pad_symbol: any
+    :param right_pad_symbol: the symbol to use for right padding (default is None)
+    :type right_pad_symbol: any
+    :rtype: sequence or iter
+    """
+    sequence = iter(sequence)
+    if pad_left:
+        sequence = chain((left_pad_symbol,) * (n-1), sequence)
+    if pad_right:
+        sequence = chain(sequence, (right_pad_symbol,) * (n-1))
+    return sequence
+
+# add a flag to pad the sequence so we get peripheral ngrams?
+
+def ngrams(sequence, n, pad_left=False, pad_right=False,
+           left_pad_symbol=None, right_pad_symbol=None):
+    """
+    Return the ngrams generated from a sequence of items, as an iterator.
+    For example:
+
+        >>> from nltk.util import ngrams
+        >>> list(ngrams([1,2,3,4,5], 3))
+        [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
+
+    Wrap with list for a list version of this function.  Set pad_left
+    or pad_right to true in order to get additional ngrams:
+
+        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
+        [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
+        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
+        [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
+        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
+        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
+        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
+        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
+
+
+    :param sequence: the source data to be converted into ngrams
+    :type sequence: sequence or iter
+    :param n: the degree of the ngrams
+    :type n: int
+    :param pad_left: whether the ngrams should be left-padded
+    :type pad_left: bool
+    :param pad_right: whether the ngrams should be right-padded
+    :type pad_right: bool
+    :param left_pad_symbol: the symbol to use for left padding (default is None)
+    :type left_pad_symbol: any
+    :param right_pad_symbol: the symbol to use for right padding (default is None)
+    :type right_pad_symbol: any
+    :rtype: sequence or iter
+    """
+    sequence = pad_sequence(sequence, n, pad_left, pad_right,
+                            left_pad_symbol, right_pad_symbol)
+
+    history = []
+    while n > 1:
+        history.append(next(sequence))
+        n -= 1
+    for item in sequence:
+        history.append(item)
+        yield tuple(history)
+        del history[0]
+
+def bigrams(sequence, **kwargs):
+    """
+    Return the bigrams generated from a sequence of items, as an iterator.
+    For example:
+
+        >>> from nltk.util import bigrams
+        >>> list(bigrams([1,2,3,4,5]))
+        [(1, 2), (2, 3), (3, 4), (4, 5)]
+
+    Wrap with ``list`` for a list version of this function.
+
+    :param sequence: the source data to be converted into bigrams
+    :type sequence: sequence or iter
+    :rtype: iter(tuple)
+    """
+
+    for item in ngrams(sequence, 2, **kwargs):
+        yield item
+
+def trigrams(sequence, **kwargs):
+    """
+    Return the trigrams generated from a sequence of items, as an iterator.
+    For example:
+
+        >>> from nltk.util import trigrams
+        >>> list(trigrams([1,2,3,4,5]))
+        [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
+
+    Wrap with ``list`` for a list version of this function.
+
+    :param sequence: the source data to be converted into trigrams
+    :type sequence: sequence or iter
+    :rtype: iter(tuple)
+    """
+
+    for item in ngrams(sequence, 3, **kwargs):
+        yield item
+
+def everygrams(sequence, min_len=1, max_len=-1, **kwargs):
+    """
+    Returns all possible ngrams generated from a sequence of items, as an iterator.
+
+        >>> sent = 'a b c'.split()
+        >>> list(everygrams(sent))
+        [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c'), ('a', 'b', 'c')]
+        >>> list(everygrams(sent, max_len=2))
+        [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c')]
+
+    :param sequence: the source data to be converted into ngrams
+    :type sequence: sequence or iter
+    :param min_len: minimum length (i.e. order/degree) of the ngrams
+    :type  min_len: int
+    :param max_len: maximum length of the ngrams (set to length of sequence by default)
+    :type  max_len: int
+    :rtype: iter(tuple)
+    """
+
+    if max_len == -1:
+        max_len = len(sequence)
+    for n in range(min_len, max_len+1):
+        for ng in ngrams(sequence, n, **kwargs):
+            yield ng
+
+def skipgrams(sequence, n, k, **kwargs):
+    """
+    Returns all possible skipgrams generated from a sequence of items, as an iterator.
+    Skipgrams are ngrams that allow tokens to be skipped.
+    Refer to http://homepages.inf.ed.ac.uk/ballison/pdf/lrec_skipgrams.pdf
+
+        >>> sent = "Insurgents killed in ongoing fighting".split()
+        >>> list(skipgrams(sent, 2, 2))
+        [('Insurgents', 'killed'), ('Insurgents', 'in'), ('Insurgents', 'ongoing'), ('killed', 'in'), ('killed', 'ongoing'), ('killed', 'fighting'), ('in', 'ongoing'), ('in', 'fighting'), ('ongoing', 'fighting')]
+        >>> list(skipgrams(sent, 3, 2))
+        [('Insurgents', 'killed', 'in'), ('Insurgents', 'killed', 'ongoing'), ('Insurgents', 'killed', 'fighting'), ('Insurgents', 'in', 'ongoing'), ('Insurgents', 'in', 'fighting'), ('Insurgents', 'ongoing', 'fighting'), ('killed', 'in', 'ongoing'), ('killed', 'in', 'fighting'), ('killed', 'ongoing', 'fighting'), ('in', 'ongoing', 'fighting')]
+
+    :param sequence: the source data to be converted into skipgrams
+    :type sequence: sequence or iter
+    :param n: the degree of the ngrams
+    :type n: int
+    :param k: the skip distance
+    :type  k: int
+    :rtype: iter(tuple)
+    """
+
+    # Pads the sequence as desired by **kwargs.
+    if 'pad_left' in kwargs or 'pad_right' in kwargs:
+        sequence = pad_sequence(sequence, n, **kwargs)
+
+    # Note when iterating through the ngrams, the pad_right here is not
+    # the **kwargs padding, it's for the algorithm to detect the SENTINEL
+    # object on the right pad to stop inner loop.
+    SENTINEL = object()
+    for ngram in ngrams(sequence, n + k, pad_right=True, right_pad_symbol=SENTINEL):
+        head = ngram[:1]
+        tail = ngram[1:]
+        for skip_tail in combinations(tail, n - 1):
+            if skip_tail[-1] is SENTINEL:
+                continue
+            yield head + skip_tail
+
+######################################################################
+# Binary Search in a File
+######################################################################
+
+# inherited from pywordnet, by Oliver Steele
+def binary_search_file(file, key, cache={}, cacheDepth=-1):
+    """
+    Return the line from the file with first word key.
+    Searches through a sorted file using the binary search algorithm.
+
+    :type file: file
+    :param file: the file to be searched through.
+    :type key: str
+    :param key: the identifier we are searching for.
+    """
+
+    key = key + ' '
+    keylen = len(key)
+    start = 0
+    currentDepth = 0
+
+    if hasattr(file, 'name'):
+        end = os.stat(file.name).st_size - 1
+    else:
+        file.seek(0, 2)
+        end = file.tell() - 1
+        file.seek(0)
+
+    while start < end:
+        lastState = start, end
+        middle = (start + end) // 2
+
+        if cache.get(middle):
+            offset, line = cache[middle]
+
+        else:
+            line = ""
+            while True:
+                file.seek(max(0, middle - 1))
+                if middle > 0:
+                    file.readline()
+                offset = file.tell()
+                line = file.readline()
+                if line != "": break
+                # at EOF; try to find start of the last line
+                middle = (start + middle)//2
+                if middle == end -1:
+                    return None
+            if currentDepth < cacheDepth:
+                cache[middle] = (offset, line)
+
+        if offset > end:
+            assert end != middle - 1, "infinite loop"
+            end = middle - 1
+        elif line[:keylen] == key:
+            return line
+        elif line > key:
+            assert end != middle - 1, "infinite loop"
+            end = middle - 1
+        elif line < key:
+            start = offset + len(line) - 1
+
+        currentDepth += 1
+        thisState = start, end
+
+        if lastState == thisState:
+            # Detects the condition where we're searching past the end
+            # of the file, which is otherwise difficult to detect
+            return None
+
+    return None
+
+######################################################################
+# Proxy configuration
+######################################################################
+
+def set_proxy(proxy, user=None, password=''):
+    """
+    Set the HTTP proxy for Python to download through.
+
+    If ``proxy`` is None then tries to set proxy from environment or system
+    settings.
+
+    :param proxy: The HTTP proxy server to use. For example:
+        'http://proxy.example.com:3128/'
+    :param user: The username to authenticate with. Use None to disable
+        authentication.
+    :param password: The password to authenticate with.
+    """
+    from nltk import compat
+
+    if proxy is None:
+        # Try and find the system proxy settings
+        try:
+            proxy = getproxies()['http']
+        except KeyError:
+            raise ValueError('Could not detect default proxy settings')
+
+    # Set up the proxy handler
+    proxy_handler = ProxyHandler({'https': proxy, 'http': proxy})
+    opener = build_opener(proxy_handler)
+
+    if user is not None:
+        # Set up basic proxy authentication if provided
+        password_manager = HTTPPasswordMgrWithDefaultRealm()
+        password_manager.add_password(realm=None, uri=proxy, user=user,
+                passwd=password)
+        opener.add_handler(ProxyBasicAuthHandler(password_manager))
+        opener.add_handler(ProxyDigestAuthHandler(password_manager))
+
+    # Override the existing URL opener
+    install_opener(opener)
+
+
+######################################################################
+# ElementTree pretty printing from http://www.effbot.org/zone/element-lib.htm
+######################################################################
+
+
+def elementtree_indent(elem, level=0):
+    """
+    Recursive function to indent an ElementTree._ElementInterface
+    used for pretty printing. Run indent on elem and then output
+    in the normal way.
+
+    :param elem: element to be indented. will be modified.
+    :type elem: ElementTree._ElementInterface
+    :param level: level of indentation for this element
+    :type level: nonnegative integer
+    :rtype:   None
+    :return:  ``None``; ``elem`` is modified in place to reflect its structure
+    """
+
+    i = "\n" + level*"  "
+    if len(elem):
+        if not elem.text or not elem.text.strip():
+            elem.text = i + "  "
+        for elem in elem:
+            elementtree_indent(elem, level+1)
+        if not elem.tail or not elem.tail.strip():
+            elem.tail = i
+    else:
+        if level and (not elem.tail or not elem.tail.strip()):
+            elem.tail = i
+
+######################################################################
+# Mathematical approximations
+######################################################################
+
+def choose(n, k):
+    """
+    This function is a fast way to calculate binomial coefficients, commonly
+    known as nCk, i.e. the number of combinations of n things taken k at a time.
+    (https://en.wikipedia.org/wiki/Binomial_coefficient).
+
+    This is equivalent to *scipy.special.comb()* with exact (long) integer
+    computation, but this implementation is faster; see
+    https://github.com/nltk/nltk/issues/1181
+
+        >>> choose(4, 2)
+        6
+        >>> choose(6, 2)
+        15
+
+    :param n: The number of things.
+    :type n: int
+    :param k: The number of things taken at a time.
+    :type k: int
+    """
+    if 0 <= k <= n:
+        ntok, ktok = 1, 1
+        for t in range(1, min(k, n - k) + 1):
+            ntok *= n
+            ktok *= t
+            n -= 1
+        return ntok // ktok
+    else:
+        return 0
diff --git a/nlp_resource_data/nltk/util.pyc b/nlp_resource_data/nltk/util.pyc
new file mode 100755 (executable)
index 0000000..897630b
Binary files /dev/null and b/nlp_resource_data/nltk/util.pyc differ
diff --git a/nlp_resource_data/nltk/wsd.py b/nlp_resource_data/nltk/wsd.py
new file mode 100755 (executable)
index 0000000..f77b0cb
--- /dev/null
@@ -0,0 +1,53 @@
+# Natural Language Toolkit: Word Sense Disambiguation Algorithms
+#
+# Authors: Liling Tan <alvations@gmail.com>,
+#          Dmitrijs Milajevs <dimazest@gmail.com>
+#
+# Copyright (C) 2001-2017 NLTK Project
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from nltk.corpus import wordnet
+
+
+def lesk(context_sentence, ambiguous_word, pos=None, synsets=None):
+    """Return a synset for an ambiguous word in a context.
+
+    :param iter context_sentence: The context sentence where the ambiguous word
+    occurs, passed as an iterable of words.
+    :param str ambiguous_word: The ambiguous word that requires WSD.
+    :param str pos: A specified Part-of-Speech (POS).
+    :param iter synsets: Possible synsets of the ambiguous word.
+    :return: ``lesk_sense`` The Synset() object with the highest signature overlaps.
+
+    This function is an implementation of the original Lesk algorithm (1986) [1].
+
+    Usage example::
+
+        >>> lesk(['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.'], 'bank', 'n')
+        Synset('savings_bank.n.02')
+
+    [1] Lesk, Michael. "Automatic sense disambiguation using machine
+    readable dictionaries: how to tell a pine cone from an ice cream
+    cone." Proceedings of the 5th Annual International Conference on
+    Systems Documentation. ACM, 1986.
+    http://dl.acm.org/citation.cfm?id=318728
+    """
+
+    context = set(context_sentence)
+    if synsets is None:
+        synsets = wordnet.synsets(ambiguous_word)
+
+    if pos:
+        synsets = [ss for ss in synsets if str(ss.pos()) == pos]
+
+    if not synsets:
+        return None
+
+    _, sense = max(
+        (len(context.intersection(ss.definition().split())), ss) for ss in synsets
+    )
+
+    return sense
+
+
diff --git a/nlp_resource_data/nltk/wsd.pyc b/nlp_resource_data/nltk/wsd.pyc
new file mode 100755 (executable)
index 0000000..d3281e2
Binary files /dev/null and b/nlp_resource_data/nltk/wsd.pyc differ
index bf08685..4b73d92 100755 (executable)
@@ -59,4 +59,5 @@ rm -rf %{buildroot}
 %{_app_bin_dir}/*
 %{TZ_SYS_RO_PACKAGES}/org.tizen.nlp.service.xml
 %{_libdir}/python2.7/site-packages/langdetect/*
+%{_libdir}/python2.7/site-packages/nltk/*
 %license LICENSE