# Natural Language Toolkit (NLTK)
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
Natural Language Processing with Python. O'Reilly Media Inc.
http://nltk.org/book
"""
+from __future__ import print_function, absolute_import
import os
# in the file VERSION.
try:
# If a VERSION file exists, use it!
- version_file = os.path.join(os.path.dirname(__file__), "VERSION")
- with open(version_file, "r") as infile:
+ version_file = os.path.join(os.path.dirname(__file__), 'VERSION')
+ with open(version_file, 'r') as infile:
__version__ = infile.read().strip()
except NameError:
- __version__ = "unknown (running code interactively?)"
+ __version__ = 'unknown (running code interactively?)'
except IOError as ex:
__version__ = "unknown (%s)" % ex
if __doc__ is not None:  # fix for ``python -OO`` mode, where docstrings are stripped
- __doc__ += "\n@version: " + __version__
+ __doc__ += '\n@version: ' + __version__
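# Illustrative usage (not part of the original file): once installed, the
# resolved version string is available as plain metadata, e.g.
#   >>> import nltk                      # doctest: +SKIP
#   >>> nltk.__version__                 # doctest: +SKIP
#   '3.4.5'                              # whatever the VERSION file contains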
# Copyright notice
__copyright__ = """\
-Copyright (C) 2001-2020 NLTK Project.
+Copyright (C) 2001-2019 NLTK Project.
Distributed and Licensed under the Apache License, Version 2.0,
which is included by reference.
The Natural Language Toolkit (NLTK) is a Python package for
natural language processing. NLTK requires Python 2.6 or higher."""
__keywords__ = [
- "NLP",
- "CL",
- "natural language processing",
- "computational linguistics",
- "parsing",
- "tagging",
- "tokenizing",
- "syntax",
- "linguistics",
- "language",
- "natural language",
- "text analytics",
+ 'NLP',
+ 'CL',
+ 'natural language processing',
+ 'computational linguistics',
+ 'parsing',
+ 'tagging',
+ 'tokenizing',
+ 'syntax',
+ 'linguistics',
+ 'language',
+ 'natural language',
+ 'text analytics',
]
__url__ = "http://nltk.org/"
# "Trove" classifiers for Python Package Index.
__classifiers__ = [
- "Development Status :: 5 - Production/Stable",
- "Intended Audience :: Developers",
- "Intended Audience :: Education",
- "Intended Audience :: Information Technology",
- "Intended Audience :: Science/Research",
- "License :: OSI Approved :: Apache Software License",
- "Operating System :: OS Independent",
- "Programming Language :: Python :: 2.6",
- "Programming Language :: Python :: 2.7",
- "Topic :: Scientific/Engineering",
- "Topic :: Scientific/Engineering :: Artificial Intelligence",
- "Topic :: Scientific/Engineering :: Human Machine Interfaces",
- "Topic :: Scientific/Engineering :: Information Analysis",
- "Topic :: Text Processing",
- "Topic :: Text Processing :: Filters",
- "Topic :: Text Processing :: General",
- "Topic :: Text Processing :: Indexing",
- "Topic :: Text Processing :: Linguistic",
+ 'Development Status :: 5 - Production/Stable',
+ 'Intended Audience :: Developers',
+ 'Intended Audience :: Education',
+ 'Intended Audience :: Information Technology',
+ 'Intended Audience :: Science/Research',
+ 'License :: OSI Approved :: Apache Software License',
+ 'Operating System :: OS Independent',
+ 'Programming Language :: Python :: 2.6',
+ 'Programming Language :: Python :: 2.7',
+ 'Topic :: Scientific/Engineering',
+ 'Topic :: Scientific/Engineering :: Artificial Intelligence',
+ 'Topic :: Scientific/Engineering :: Human Machine Interfaces',
+ 'Topic :: Scientific/Engineering :: Information Analysis',
+ 'Topic :: Text Processing',
+ 'Topic :: Text Processing :: Filters',
+ 'Topic :: Text Processing :: General',
+ 'Topic :: Text Processing :: Indexing',
+ 'Topic :: Text Processing :: Linguistic',
]
from nltk.internals import config_java
# Override missing methods in environments where subprocess cannot be used, e.g. GAE.
import subprocess
-if not hasattr(subprocess, "PIPE"):
+if not hasattr(subprocess, 'PIPE'):
def _fake_PIPE(*args, **kwargs):
- raise NotImplementedError("subprocess.PIPE is not supported.")
+ raise NotImplementedError('subprocess.PIPE is not supported.')
subprocess.PIPE = _fake_PIPE
-if not hasattr(subprocess, "Popen"):
+if not hasattr(subprocess, 'Popen'):
def _fake_Popen(*args, **kwargs):
- raise NotImplementedError("subprocess.Popen is not supported.")
+ raise NotImplementedError('subprocess.Popen is not supported.')
subprocess.Popen = _fake_Popen
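# Illustrative note (not part of the original module): with the stubs above in
# place, sandboxed platforms such as GAE that ship a crippled subprocess module
# can still ``import nltk``; any later attempt to actually spawn a process
# fails with an explicit NotImplementedError instead of an AttributeError, e.g.
#   subprocess.Popen(["ls"])             # hypothetical call; raises NotImplementedError there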
from nltk import lazyimport
-app = lazyimport.LazyModule("nltk.app", locals(), globals())
-chat = lazyimport.LazyModule("nltk.chat", locals(), globals())
-corpus = lazyimport.LazyModule("nltk.corpus", locals(), globals())
-draw = lazyimport.LazyModule("nltk.draw", locals(), globals())
-toolbox = lazyimport.LazyModule("nltk.toolbox", locals(), globals())
+app = lazyimport.LazyModule('nltk.app', locals(), globals())
+chat = lazyimport.LazyModule('nltk.chat', locals(), globals())
+corpus = lazyimport.LazyModule('nltk.corpus', locals(), globals())
+draw = lazyimport.LazyModule('nltk.draw', locals(), globals())
+toolbox = lazyimport.LazyModule('nltk.toolbox', locals(), globals())
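# Illustrative sketch (not part of the original module): LazyModule defers the
# real import until the first attribute access, which keeps ``import nltk``
# cheap.  Assuming the Brown corpus has been downloaded:
#   >>> import nltk                      # doctest: +SKIP
#   >>> nltk.corpus.brown.words()[:3]    # first access triggers the nltk.corpus import
#   ['The', 'Fulton', 'County']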
# Optional loading
from nltk.downloader import download, download_shell
try:
- import tkinter
+ from six.moves import tkinter
except ImportError:
pass
else:
# Natural Language Toolkit: Applications package
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# Import Tkinter-based modules if Tkinter is installed
try:
- import tkinter
+ from six.moves import tkinter
except ImportError:
import warnings
# Natural Language Toolkit: Chart Parser Application
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Jean Mark Gawron <gawron@mail.sdsu.edu>
# Steven Bird <stevenbird1@gmail.com>
# widget system.
+from __future__ import division
import pickle
import os.path
-from tkinter import (
+from six.moves.tkinter import (
Button,
Canvas,
Checkbutton,
Tk,
Toplevel,
)
-from tkinter.font import Font
-from tkinter.messagebox import showerror, showinfo
-from tkinter.filedialog import asksaveasfilename, askopenfilename
+from six.moves.tkinter_font import Font
+from six.moves.tkinter_messagebox import showerror, showinfo
+from six.moves.tkinter_tkfiledialog import asksaveasfilename, askopenfilename
from nltk.parse.chart import (
BottomUpPredictCombineRule,
class EdgeList(ColorizedList):
- ARROW = SymbolWidget.SYMBOLS["rightarrow"]
+ ARROW = SymbolWidget.SYMBOLS['rightarrow']
def _init_colortags(self, textwidget, options):
- textwidget.tag_config("terminal", foreground="#006000")
- textwidget.tag_config("arrow", font="symbol", underline="0")
- textwidget.tag_config("dot", foreground="#000000")
+ textwidget.tag_config('terminal', foreground='#006000')
+ textwidget.tag_config('arrow', font='symbol', underline='0')
+ textwidget.tag_config('dot', foreground='#000000')
textwidget.tag_config(
- "nonterminal", foreground="blue", font=("helvetica", -12, "bold")
+ 'nonterminal', foreground='blue', font=('helvetica', -12, 'bold')
)
def _item_repr(self, item):
contents = []
- contents.append(("%s\t" % item.lhs(), "nonterminal"))
- contents.append((self.ARROW, "arrow"))
+ contents.append(('%s\t' % item.lhs(), 'nonterminal'))
+ contents.append((self.ARROW, 'arrow'))
for i, elt in enumerate(item.rhs()):
if i == item.dot():
- contents.append((" *", "dot"))
+ contents.append((' *', 'dot'))
if isinstance(elt, Nonterminal):
- contents.append((" %s" % elt.symbol(), "nonterminal"))
+ contents.append((' %s' % elt.symbol(), 'nonterminal'))
else:
- contents.append((" %r" % elt, "terminal"))
+ contents.append((' %r' % elt, 'terminal'))
if item.is_complete():
- contents.append((" *", "dot"))
+ contents.append((' *', 'dot'))
return contents
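# Worked example (illustrative, not in the original source): for an incomplete
# edge built from the production VP -> V NP with the dot after V, the method
# above returns styled fragments roughly equivalent to
#   [('VP\t', 'nonterminal'), (ARROW, 'arrow'),
#    (' V', 'nonterminal'), (' *', 'dot'), (' NP', 'nonterminal')]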
"""
def __init__(
- self, parent, chart, toplevel=True, title="Chart Matrix", show_numedges=False
+ self, parent, chart, toplevel=True, title='Chart Matrix', show_numedges=False
):
self._chart = chart
self._cells = []
if toplevel:
self._root = Toplevel(parent)
self._root.title(title)
- self._root.bind("<Control-q>", self.destroy)
+ self._root.bind('<Control-q>', self.destroy)
self._init_quit(self._root)
else:
self._root = Frame(parent)
self.draw()
def _init_quit(self, root):
- quit = Button(root, text="Quit", command=self.destroy)
- quit.pack(side="bottom", expand=0, fill="none")
+ quit = Button(root, text='Quit', command=self.destroy)
+ quit.pack(side='bottom', expand=0, fill='none')
def _init_matrix(self, root):
- cframe = Frame(root, border=2, relief="sunken")
- cframe.pack(expand=0, fill="none", padx=1, pady=3, side="top")
- self._canvas = Canvas(cframe, width=200, height=200, background="white")
- self._canvas.pack(expand=0, fill="none")
+ cframe = Frame(root, border=2, relief='sunken')
+ cframe.pack(expand=0, fill='none', padx=1, pady=3, side='top')
+ self._canvas = Canvas(cframe, width=200, height=200, background='white')
+ self._canvas.pack(expand=0, fill='none')
def _init_numedges(self, root):
- self._numedges_label = Label(root, text="0 edges")
- self._numedges_label.pack(expand=0, fill="none", side="top")
+ self._numedges_label = Label(root, text='0 edges')
+ self._numedges_label.pack(expand=0, fill='none', side='top')
def _init_list(self, root):
self._list = EdgeList(root, [], width=20, height=5)
- self._list.pack(side="top", expand=1, fill="both", pady=3)
+ self._list.pack(side='top', expand=1, fill='both', pady=3)
def cb(edge, self=self):
- self._fire_callbacks("select", edge)
+ self._fire_callbacks('select', edge)
- self._list.add_callback("select", cb)
+ self._list.add_callback('select', cb)
self._list.focus()
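# Illustrative note (not in the original source): the view classes use a small
# observer pattern -- client code registers a handler with
# add_callback('select', func), and the widget later notifies every registered
# handler through _fire_callbacks('select', edge) when an edge is clicked, e.g.
#   matrix.add_callback('select', lambda edge: print(edge))   # hypothetical client code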
def destroy(self, *e):
for i in range(N):
for j in range(i, N):
if cell_edges[i][j] == 0:
- color = "gray20"
+ color = 'gray20'
else:
- color = "#00%02x%02x" % (
+ color = '#00%02x%02x' % (
min(255, 50 + 128 * cell_edges[i][j] / 10),
max(0, 128 - 128 * cell_edges[i][j] / 10),
)
cell_tag = self._cells[i][j]
self._canvas.itemconfig(cell_tag, fill=color)
if (i, j) == self._selected_cell:
- self._canvas.itemconfig(cell_tag, outline="#00ffff", width=3)
+ self._canvas.itemconfig(cell_tag, outline='#00ffff', width=3)
self._canvas.tag_raise(cell_tag)
else:
- self._canvas.itemconfig(cell_tag, outline="black", width=1)
+ self._canvas.itemconfig(cell_tag, outline='black', width=1)
# Update the edge list.
edges = list(self._chart.select(span=self._selected_cell))
# Update our edge count.
self._num_edges = self._chart.num_edges()
if self._numedges_label is not None:
- self._numedges_label["text"] = "%d edges" % self._num_edges
+ self._numedges_label['text'] = '%d edges' % self._num_edges
def activate(self):
- self._canvas.itemconfig("inactivebox", state="hidden")
+ self._canvas.itemconfig('inactivebox', state='hidden')
self.update()
def inactivate(self):
- self._canvas.itemconfig("inactivebox", state="normal")
+ self._canvas.itemconfig('inactivebox', state='normal')
self.update()
def add_callback(self, event, func):
self.update()
# Fire the callback.
- self._fire_callbacks("select_cell", i, j)
+ self._fire_callbacks('select_cell', i, j)
def deselect_cell(self):
if self._root is None:
LEFT_MARGIN = BOT_MARGIN = 15
TOP_MARGIN = 5
c = self._canvas
- c.delete("all")
+ c.delete('all')
N = self._chart.num_leaves() + 1
- dx = (int(c["width"]) - LEFT_MARGIN) / N
- dy = (int(c["height"]) - TOP_MARGIN - BOT_MARGIN) / N
+ dx = (int(c['width']) - LEFT_MARGIN) / N
+ dy = (int(c['height']) - TOP_MARGIN - BOT_MARGIN) / N
- c.delete("all")
+ c.delete('all')
# Labels and dotted lines
for i in range(N):
c.create_text(
- LEFT_MARGIN - 2, i * dy + dy / 2 + TOP_MARGIN, text=repr(i), anchor="e"
+ LEFT_MARGIN - 2, i * dy + dy / 2 + TOP_MARGIN, text=repr(i), anchor='e'
)
c.create_text(
i * dx + dx / 2 + LEFT_MARGIN,
N * dy + TOP_MARGIN + 1,
text=repr(i),
- anchor="n",
+ anchor='n',
)
c.create_line(
LEFT_MARGIN,
dy * (i + 1) + TOP_MARGIN,
dx * N + LEFT_MARGIN,
dy * (i + 1) + TOP_MARGIN,
- dash=".",
+ dash='.',
)
c.create_line(
dx * i + LEFT_MARGIN,
TOP_MARGIN,
dx * i + LEFT_MARGIN,
dy * N + TOP_MARGIN,
- dash=".",
+ dash='.',
)
# A box around the whole thing
i * dy + TOP_MARGIN,
(j + 1) * dx + LEFT_MARGIN,
(i + 1) * dy + TOP_MARGIN,
- fill="gray20",
+ fill='gray20',
)
self._cells[i][j] = t
def cb(event, self=self, i=i, j=j):
self._click_cell(i, j)
- c.tag_bind(t, "<Button-1>", cb)
+ c.tag_bind(t, '<Button-1>', cb)
# Inactive box
- xmax, ymax = int(c["width"]), int(c["height"])
+ xmax, ymax = int(c['width']), int(c['height'])
t = c.create_rectangle(
-100,
-100,
xmax + 100,
ymax + 100,
- fill="gray50",
- state="hidden",
- tag="inactivebox",
+ fill='gray50',
+ state='hidden',
+ tag='inactivebox',
)
c.tag_lower(t)
if toplevel:
self._root = Toplevel(parent)
- self._root.title("Chart Parser Application: Results")
- self._root.bind("<Control-q>", self.destroy)
+ self._root.title('Chart Parser Application: Results')
+ self._root.bind('<Control-q>', self.destroy)
else:
self._root = Frame(parent)
# Buttons
if toplevel:
buttons = Frame(self._root)
- buttons.pack(side="bottom", expand=0, fill="x")
- Button(buttons, text="Quit", command=self.destroy).pack(side="right")
- Button(buttons, text="Print All", command=self.print_all).pack(side="left")
- Button(buttons, text="Print Selection", command=self.print_selection).pack(
- side="left"
+ buttons.pack(side='bottom', expand=0, fill='x')
+ Button(buttons, text='Quit', command=self.destroy).pack(side='right')
+ Button(buttons, text='Print All', command=self.print_all).pack(side='left')
+ Button(buttons, text='Print Selection', command=self.print_selection).pack(
+ side='left'
)
# Canvas frame.
self._cframe = CanvasFrame(self._root, closeenough=20)
- self._cframe.pack(side="top", expand=1, fill="both")
+ self._cframe.pack(side='top', expand=1, fill='both')
# Initial update
self.update()
c.delete(self._selectbox)
self._selection = widget
(x1, y1, x2, y2) = widget.bbox()
- self._selectbox = c.create_rectangle(x1, y1, x2, y2, width=2, outline="#088")
+ self._selectbox = c.create_rectangle(x1, y1, x2, y2, width=2, outline='#088')
def _color(self, treewidget, color):
- treewidget.label()["color"] = color
+ treewidget.label()['color'] = color
for child in treewidget.subtrees():
if isinstance(child, TreeSegmentWidget):
self._color(child, color)
else:
- child["color"] = color
+ child['color'] = color
def print_all(self, *e):
if self._root is None:
if self._root is None:
return
if self._selection is None:
- showerror("Print Error", "No tree selected")
+ showerror('Print Error', 'No tree selected')
else:
c = self._cframe.canvas()
for widget in self._treewidgets:
c.delete(self._selectbox)
(x1, y1, x2, y2) = self._selection.bbox()
self._selection.move(10 - x1, 10 - y1)
- c["scrollregion"] = "0 0 %s %s" % (x2 - x1 + 20, y2 - y1 + 20)
+ c['scrollregion'] = '0 0 %s %s' % (x2 - x1 + 20, y2 - y1 + 20)
self._cframe.print_to_file()
# Restore our state.
"""
_OPSYMBOL = {
- "-": "-",
- "and": SymbolWidget.SYMBOLS["intersection"],
- "or": SymbolWidget.SYMBOLS["union"],
+ '-': '-',
+ 'and': SymbolWidget.SYMBOLS['intersection'],
+ 'or': SymbolWidget.SYMBOLS['union'],
}
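# Conceptual sketch (not in the original source): the comparer treats each
# chart as a set of edges, so the three operations behave like set algebra.
# Using plain Python sets for illustration (the real methods below build
# Chart objects edge by edge instead):
#   left, right = set(left_chart.edges()), set(right_chart.edges())
#   left - right      # difference   ('-')
#   left & right      # intersection ('and')
#   left | right      # union        ('or')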
def __init__(self, *chart_filenames):
# This chart is displayed when we don't have a value (eg
# before any chart is loaded).
- faketok = [""] * 8
+ faketok = [''] * 8
self._emptychart = Chart(faketok)
# The left & right charts start out empty.
- self._left_name = "None"
- self._right_name = "None"
+ self._left_name = 'None'
+ self._right_name = 'None'
self._left_chart = self._emptychart
self._right_chart = self._emptychart
# The charts that have been loaded.
- self._charts = {"None": self._emptychart}
+ self._charts = {'None': self._emptychart}
# The output chart.
self._out_chart = self._emptychart
# Set up the root window.
self._root = Tk()
- self._root.title("Chart Comparison")
- self._root.bind("<Control-q>", self.destroy)
- self._root.bind("<Control-x>", self.destroy)
+ self._root.title('Chart Comparison')
+ self._root.bind('<Control-q>', self.destroy)
+ self._root.bind('<Control-x>', self.destroy)
# Initialize all widgets, etc.
self._init_menubar(self._root)
# File menu
filemenu = Menu(menubar, tearoff=0)
filemenu.add_command(
- label="Load Chart",
- accelerator="Ctrl-o",
+ label='Load Chart',
+ accelerator='Ctrl-o',
underline=0,
command=self.load_chart_dialog,
)
filemenu.add_command(
- label="Save Output",
- accelerator="Ctrl-s",
+ label='Save Output',
+ accelerator='Ctrl-s',
underline=0,
command=self.save_chart_dialog,
)
filemenu.add_separator()
filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+ label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x'
)
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
# Compare menu
opmenu = Menu(menubar, tearoff=0)
opmenu.add_command(
- label="Intersection", command=self._intersection, accelerator="+"
+ label='Intersection', command=self._intersection, accelerator='+'
)
- opmenu.add_command(label="Union", command=self._union, accelerator="*")
+ opmenu.add_command(label='Union', command=self._union, accelerator='*')
opmenu.add_command(
- label="Difference", command=self._difference, accelerator="-"
+ label='Difference', command=self._difference, accelerator='-'
)
opmenu.add_separator()
- opmenu.add_command(label="Swap Charts", command=self._swapcharts)
- menubar.add_cascade(label="Compare", underline=0, menu=opmenu)
+ opmenu.add_command(label='Swap Charts', command=self._swapcharts)
+ menubar.add_cascade(label='Compare', underline=0, menu=opmenu)
# Add the menu
self._root.config(menu=menubar)
def _init_divider(self, root):
- divider = Frame(root, border=2, relief="sunken")
- divider.pack(side="top", fill="x", ipady=2)
+ divider = Frame(root, border=2, relief='sunken')
+ divider.pack(side='top', fill='x', ipady=2)
def _init_chartviews(self, root):
- opfont = ("symbol", -36) # Font for operator.
- eqfont = ("helvetica", -36) # Font for equals sign.
+ opfont = ('symbol', -36) # Font for operator.
+ eqfont = ('helvetica', -36) # Font for equals sign.
- frame = Frame(root, background="#c0c0c0")
- frame.pack(side="top", expand=1, fill="both")
+ frame = Frame(root, background='#c0c0c0')
+ frame.pack(side='top', expand=1, fill='both')
# The left matrix.
- cv1_frame = Frame(frame, border=3, relief="groove")
- cv1_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both")
+ cv1_frame = Frame(frame, border=3, relief='groove')
+ cv1_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both')
self._left_selector = MutableOptionMenu(
cv1_frame, list(self._charts.keys()), command=self._select_left
)
- self._left_selector.pack(side="top", pady=5, fill="x")
+ self._left_selector.pack(side='top', pady=5, fill='x')
self._left_matrix = ChartMatrixView(
cv1_frame, self._emptychart, toplevel=False, show_numedges=True
)
- self._left_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both")
- self._left_matrix.add_callback("select", self.select_edge)
- self._left_matrix.add_callback("select_cell", self.select_cell)
+ self._left_matrix.pack(side='bottom', padx=5, pady=5, expand=1, fill='both')
+ self._left_matrix.add_callback('select', self.select_edge)
+ self._left_matrix.add_callback('select_cell', self.select_cell)
self._left_matrix.inactivate()
# The operator.
self._op_label = Label(
- frame, text=" ", width=3, background="#c0c0c0", font=opfont
+ frame, text=' ', width=3, background='#c0c0c0', font=opfont
)
- self._op_label.pack(side="left", padx=5, pady=5)
+ self._op_label.pack(side='left', padx=5, pady=5)
# The right matrix.
- cv2_frame = Frame(frame, border=3, relief="groove")
- cv2_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both")
+ cv2_frame = Frame(frame, border=3, relief='groove')
+ cv2_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both')
self._right_selector = MutableOptionMenu(
cv2_frame, list(self._charts.keys()), command=self._select_right
)
- self._right_selector.pack(side="top", pady=5, fill="x")
+ self._right_selector.pack(side='top', pady=5, fill='x')
self._right_matrix = ChartMatrixView(
cv2_frame, self._emptychart, toplevel=False, show_numedges=True
)
- self._right_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both")
- self._right_matrix.add_callback("select", self.select_edge)
- self._right_matrix.add_callback("select_cell", self.select_cell)
+ self._right_matrix.pack(side='bottom', padx=5, pady=5, expand=1, fill='both')
+ self._right_matrix.add_callback('select', self.select_edge)
+ self._right_matrix.add_callback('select_cell', self.select_cell)
self._right_matrix.inactivate()
# The equals sign
- Label(frame, text="=", width=3, background="#c0c0c0", font=eqfont).pack(
- side="left", padx=5, pady=5
+ Label(frame, text='=', width=3, background='#c0c0c0', font=eqfont).pack(
+ side='left', padx=5, pady=5
)
# The output matrix.
- out_frame = Frame(frame, border=3, relief="groove")
- out_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both")
- self._out_label = Label(out_frame, text="Output")
- self._out_label.pack(side="top", pady=9)
+ out_frame = Frame(frame, border=3, relief='groove')
+ out_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both')
+ self._out_label = Label(out_frame, text='Output')
+ self._out_label.pack(side='top', pady=9)
self._out_matrix = ChartMatrixView(
out_frame, self._emptychart, toplevel=False, show_numedges=True
)
- self._out_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both")
- self._out_matrix.add_callback("select", self.select_edge)
- self._out_matrix.add_callback("select_cell", self.select_cell)
+ self._out_matrix.pack(side='bottom', padx=5, pady=5, expand=1, fill='both')
+ self._out_matrix.add_callback('select', self.select_edge)
+ self._out_matrix.add_callback('select_cell', self.select_cell)
self._out_matrix.inactivate()
def _init_buttons(self, root):
buttons = Frame(root)
- buttons.pack(side="bottom", pady=5, fill="x", expand=0)
- Button(buttons, text="Intersection", command=self._intersection).pack(
- side="left"
+ buttons.pack(side='bottom', pady=5, fill='x', expand=0)
+ Button(buttons, text='Intersection', command=self._intersection).pack(
+ side='left'
)
- Button(buttons, text="Union", command=self._union).pack(side="left")
- Button(buttons, text="Difference", command=self._difference).pack(side="left")
- Frame(buttons, width=20).pack(side="left")
- Button(buttons, text="Swap Charts", command=self._swapcharts).pack(side="left")
+ Button(buttons, text='Union', command=self._union).pack(side='left')
+ Button(buttons, text='Difference', command=self._difference).pack(side='left')
+ Frame(buttons, width=20).pack(side='left')
+ Button(buttons, text='Swap Charts', command=self._swapcharts).pack(side='left')
- Button(buttons, text="Detatch Output", command=self._detatch_out).pack(
- side="right"
+ Button(buttons, text='Detatch Output', command=self._detatch_out).pack(
+ side='right'
)
def _init_bindings(self, root):
# root.bind('<Control-s>', self.save_chart)
- root.bind("<Control-o>", self.load_chart_dialog)
+ root.bind('<Control-o>', self.load_chart_dialog)
# root.bind('<Control-r>', self.reset)
# ////////////////////////////////////////////////////////////
self._left_name = name
self._left_chart = self._charts[name]
self._left_matrix.set_chart(self._left_chart)
- if name == "None":
+ if name == 'None':
self._left_matrix.inactivate()
self._apply_op()
self._right_name = name
self._right_chart = self._charts[name]
self._right_matrix.set_chart(self._right_chart)
- if name == "None":
+ if name == 'None':
self._right_matrix.inactivate()
self._apply_op()
def _apply_op(self):
- if self._operator == "-":
+ if self._operator == '-':
self._difference()
- elif self._operator == "or":
+ elif self._operator == 'or':
self._union()
- elif self._operator == "and":
+ elif self._operator == 'and':
self._intersection()
# ////////////////////////////////////////////////////////////
# File
# ////////////////////////////////////////////////////////////
- CHART_FILE_TYPES = [("Pickle file", ".pickle"), ("All files", "*")]
+ CHART_FILE_TYPES = [('Pickle file', '.pickle'), ('All files', '*')]
def save_chart_dialog(self, *args):
filename = asksaveasfilename(
- filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
+ filetypes=self.CHART_FILE_TYPES, defaultextension='.pickle'
)
if not filename:
return
try:
- with open(filename, "wb") as outfile:
+ with open(filename, 'wb') as outfile:
pickle.dump(self._out_chart, outfile)
except Exception as e:
showerror(
- "Error Saving Chart", "Unable to open file: %r\n%s" % (filename, e)
+ 'Error Saving Chart', 'Unable to open file: %r\n%s' % (filename, e)
)
def load_chart_dialog(self, *args):
filename = askopenfilename(
- filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
+ filetypes=self.CHART_FILE_TYPES, defaultextension='.pickle'
)
if not filename:
return
self.load_chart(filename)
except Exception as e:
showerror(
- "Error Loading Chart", "Unable to open file: %r\n%s" % (filename, e)
+ 'Error Loading Chart', 'Unable to open file: %r\n%s' % (filename, e)
)
def load_chart(self, filename):
- with open(filename, "rb") as infile:
+ with open(filename, 'rb') as infile:
chart = pickle.load(infile)
name = os.path.basename(filename)
- if name.endswith(".pickle"):
+ if name.endswith('.pickle'):
name = name[:-7]
- if name.endswith(".chart"):
+ if name.endswith('.chart'):
name = name[:-6]
self._charts[name] = chart
self._left_selector.add(name)
if edge not in self._right_chart:
out_chart.insert(edge, [])
- self._update("-", out_chart)
+ self._update('-', out_chart)
def _intersection(self):
if not self._checkcompat():
if edge in self._right_chart:
out_chart.insert(edge, [])
- self._update("and", out_chart)
+ self._update('and', out_chart)
def _union(self):
if not self._checkcompat():
for edge in self._right_chart:
out_chart.insert(edge, [])
- self._update("or", out_chart)
+ self._update('or', out_chart)
def _swapcharts(self):
left, right = self._left_name, self._right_name
self._out_chart = self._emptychart
self._out_matrix.set_chart(self._out_chart)
self._out_matrix.inactivate()
- self._out_label["text"] = "Output"
+ self._out_label['text'] = 'Output'
# Issue some other warning?
return False
else:
def _update(self, operator, out_chart):
self._operator = operator
- self._op_label["text"] = self._OPSYMBOL[operator]
+ self._op_label['text'] = self._OPSYMBOL[operator]
self._out_chart = out_chart
self._out_matrix.set_chart(out_chart)
- self._out_label["text"] = "%s %s %s" % (
+ self._out_label['text'] = '%s %s %s' % (
self._left_name,
self._operator,
self._right_name,
def _clear_out_chart(self):
self._out_chart = self._emptychart
self._out_matrix.set_chart(self._out_chart)
- self._op_label["text"] = " "
+ self._op_label['text'] = ' '
self._out_matrix.inactivate()
def _detatch_out(self):
- ChartMatrixView(self._root, self._out_chart, title=self._out_label["text"])
+ ChartMatrixView(self._root, self._out_chart, title=self._out_label['text'])
#######################################################################
Construct a new ``Chart`` display.
"""
# Process keyword args.
- draw_tree = kw.get("draw_tree", 0)
- draw_sentence = kw.get("draw_sentence", 1)
- self._fontsize = kw.get("fontsize", -12)
+ draw_tree = kw.get('draw_tree', 0)
+ draw_sentence = kw.get('draw_sentence', 1)
+ self._fontsize = kw.get('fontsize', -12)
# The chart!
self._chart = chart
# If they didn't provide a main window, then set one up.
if root is None:
top = Tk()
- top.title("Chart View")
+ top.title('Chart View')
def destroy1(e, top=top):
top.destroy()
def destroy2(top=top):
top.destroy()
- top.bind("q", destroy1)
- b = Button(top, text="Done", command=destroy2)
- b.pack(side="bottom")
+ top.bind('q', destroy1)
+ b = Button(top, text='Done', command=destroy2)
+ b.pack(side='bottom')
self._root = top
else:
self._root = root
# Create the chart canvas.
(self._chart_sb, self._chart_canvas) = self._sb_canvas(self._root)
- self._chart_canvas["height"] = 300
- self._chart_canvas["closeenough"] = 15
+ self._chart_canvas['height'] = 300
+ self._chart_canvas['closeenough'] = 15
# Create the sentence canvas.
if draw_sentence:
- cframe = Frame(self._root, relief="sunk", border=2)
- cframe.pack(fill="both", side="bottom")
+ cframe = Frame(self._root, relief='sunk', border=2)
+ cframe.pack(fill='both', side='bottom')
self._sentence_canvas = Canvas(cframe, height=50)
- self._sentence_canvas["background"] = "#e0e0e0"
- self._sentence_canvas.pack(fill="both")
+ self._sentence_canvas['background'] = '#e0e0e0'
+ self._sentence_canvas.pack(fill='both')
# self._sentence_canvas['height'] = self._sentence_height
else:
self._sentence_canvas = None
# Create the tree canvas.
if draw_tree:
- (sb, canvas) = self._sb_canvas(self._root, "n", "x")
+ (sb, canvas) = self._sb_canvas(self._root, 'n', 'x')
(self._tree_sb, self._tree_canvas) = (sb, canvas)
- self._tree_canvas["height"] = 200
+ self._tree_canvas['height'] = 200
else:
self._tree_canvas = None
# Set up the configure callback, which will be called whenever
# the window is resized.
- self._chart_canvas.bind("<Configure>", self._configure)
+ self._chart_canvas.bind('<Configure>', self._configure)
def _init_fonts(self, root):
- self._boldfont = Font(family="helvetica", weight="bold", size=self._fontsize)
- self._font = Font(family="helvetica", size=self._fontsize)
+ self._boldfont = Font(family='helvetica', weight='bold', size=self._fontsize)
+ self._font = Font(family='helvetica', size=self._fontsize)
# See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
self._sysfont = Font(font=Button()["font"])
root.option_add("*Font", self._sysfont)
- def _sb_canvas(self, root, expand="y", fill="both", side="bottom"):
+ def _sb_canvas(self, root, expand='y', fill='both', side='bottom'):
"""
Helper for __init__: construct a canvas with a scrollbar.
"""
- cframe = Frame(root, relief="sunk", border=2)
+ cframe = Frame(root, relief='sunk', border=2)
cframe.pack(fill=fill, expand=expand, side=side)
- canvas = Canvas(cframe, background="#e0e0e0")
+ canvas = Canvas(cframe, background='#e0e0e0')
# Give the canvas a scrollbar.
- sb = Scrollbar(cframe, orient="vertical")
- sb.pack(side="right", fill="y")
- canvas.pack(side="left", fill=fill, expand="yes")
+ sb = Scrollbar(cframe, orient='vertical')
+ sb.pack(side='right', fill='y')
+ canvas.pack(side='left', fill=fill, expand='yes')
# Connect the scrollbars to the canvas.
- sb["command"] = canvas.yview
- canvas["yscrollcommand"] = sb.set
+ sb['command'] = canvas.yview
+ canvas['yscrollcommand'] = sb.set
return (sb, canvas)
def scroll_up(self, *e):
- self._chart_canvas.yview("scroll", -1, "units")
+ self._chart_canvas.yview('scroll', -1, 'units')
def scroll_down(self, *e):
- self._chart_canvas.yview("scroll", 1, "units")
+ self._chart_canvas.yview('scroll', 1, 'units')
def page_up(self, *e):
- self._chart_canvas.yview("scroll", -1, "pages")
+ self._chart_canvas.yview('scroll', -1, 'pages')
def page_down(self, *e):
- self._chart_canvas.yview("scroll", 1, "pages")
+ self._chart_canvas.yview('scroll', 1, 'pages')
def _grow(self):
"""
# Grow, if need-be
N = self._chart.num_leaves()
width = max(
- int(self._chart_canvas["width"]), N * self._unitsize + ChartView._MARGIN * 2
+ int(self._chart_canvas['width']), N * self._unitsize + ChartView._MARGIN * 2
)
# It won't resize without the second (height) line, but I
# don't understand why not.
self._chart_canvas.configure(width=width)
- self._chart_canvas.configure(height=self._chart_canvas["height"])
+ self._chart_canvas.configure(height=self._chart_canvas['height'])
self._unitsize = (width - 2 * ChartView._MARGIN) / N
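# Worked example (hypothetical numbers, not in the original source): for a
# 10-leaf sentence on a 620-pixel-wide canvas with a 10-pixel margin on each
# side, every leaf gets (620 - 2 * 10) / 10 = 60 horizontal pixels of chart.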
# Reset the height for the sentence window.
if self._sentence_canvas is not None:
- self._sentence_canvas["height"] = self._sentence_height
+ self._sentence_canvas['height'] = self._sentence_height
def set_font_size(self, size):
self._font.configure(size=-abs(size))
rhs = " ".join(rhselts)
else:
lhs = edge.lhs()
- rhs = ""
+ rhs = ''
for s in (lhs, rhs):
tag = c.create_text(
- 0, 0, text=s, font=self._boldfont, anchor="nw", justify="left"
+ 0, 0, text=s, font=self._boldfont, anchor='nw', justify='left'
)
bbox = c.bbox(tag)
c.delete(tag)
# Try to view the new edge..
y = (level + 1) * self._chart_level_size
dy = self._text_height + 10
- self._chart_canvas.yview("moveto", 1.0)
+ self._chart_canvas.yview('moveto', 1.0)
if self._chart_height != 0:
- self._chart_canvas.yview("moveto", (y - dy) / self._chart_height)
+ self._chart_canvas.yview('moveto', (y - dy) / self._chart_height)
def _draw_edge(self, edge, lvl):
"""
if x2 == x1:
x2 += max(4, self._unitsize / 5)
y = (lvl + 1) * self._chart_level_size
- linetag = c.create_line(x1, y, x2, y, arrow="last", width=3)
+ linetag = c.create_line(x1, y, x2, y, arrow='last', width=3)
# Draw a label for the edge.
if isinstance(edge, TreeEdge):
rhs1 = " ".join(rhs[:pos])
rhs2 = " ".join(rhs[pos:])
- rhstag1 = c.create_text(x1 + 3, y, text=rhs1, font=self._font, anchor="nw")
+ rhstag1 = c.create_text(x1 + 3, y, text=rhs1, font=self._font, anchor='nw')
dotx = c.bbox(rhstag1)[2] + 6
doty = (c.bbox(rhstag1)[1] + c.bbox(rhstag1)[3]) / 2
dottag = c.create_oval(dotx - 2, doty - 2, dotx + 2, doty + 2)
- rhstag2 = c.create_text(dotx + 6, y, text=rhs2, font=self._font, anchor="nw")
+ rhstag2 = c.create_text(dotx + 6, y, text=rhs2, font=self._font, anchor='nw')
lhstag = c.create_text(
- (x1 + x2) / 2, y, text=str(edge.lhs()), anchor="s", font=self._boldfont
+ (x1 + x2) / 2, y, text=str(edge.lhs()), anchor='s', font=self._boldfont
)
# Keep track of the edge's tags.
# Register a callback for clicking on the edge.
def cb(event, self=self, edge=edge):
- self._fire_callbacks("select", edge)
+ self._fire_callbacks('select', edge)
- c.tag_bind(rhstag1, "<Button-1>", cb)
- c.tag_bind(rhstag2, "<Button-1>", cb)
- c.tag_bind(linetag, "<Button-1>", cb)
- c.tag_bind(dottag, "<Button-1>", cb)
- c.tag_bind(lhstag, "<Button-1>", cb)
+ c.tag_bind(rhstag1, '<Button-1>', cb)
+ c.tag_bind(rhstag2, '<Button-1>', cb)
+ c.tag_bind(linetag, '<Button-1>', cb)
+ c.tag_bind(dottag, '<Button-1>', cb)
+ c.tag_bind(lhstag, '<Button-1>', cb)
self._color_edge(edge)
if edge in self._marks:
self._color_edge(self._marks[edge])
if edge.is_complete() and edge.span() == (0, N):
- self._color_edge(edge, "#084", "#042")
+ self._color_edge(edge, '#084', '#042')
elif isinstance(edge, LeafEdge):
- self._color_edge(edge, "#48c", "#246")
+ self._color_edge(edge, '#48c', '#246')
else:
- self._color_edge(edge, "#00f", "#008")
+ self._color_edge(edge, '#00f', '#008')
- def mark_edge(self, edge, mark="#0df"):
+ def mark_edge(self, edge, mark='#0df'):
"""
Mark an edge
"""
del self._marks[edge]
self._color_edge(edge)
- def markonly_edge(self, edge, mark="#0df"):
+ def markonly_edge(self, edge, mark='#0df'):
self.unmark_edge()
self.mark_edge(edge, mark)
# Check against all tokens
for leaf in self._chart.leaves():
tag = c.create_text(
- 0, 0, text=repr(leaf), font=self._font, anchor="nw", justify="left"
+ 0, 0, text=repr(leaf), font=self._font, anchor='nw', justify='left'
)
bbox = c.bbox(tag)
c.delete(tag)
levels = len(self._edgelevels)
self._chart_height = (levels + 2) * self._chart_level_size
- c["scrollregion"] = (0, 0, width, self._chart_height)
+ c['scrollregion'] = (0, 0, width, self._chart_height)
# Reset the tree scroll region
if self._tree_canvas:
- self._tree_canvas["scrollregion"] = (0, 0, width, self._tree_height)
+ self._tree_canvas['scrollregion'] = (0, 0, width, self._tree_height)
def _draw_loclines(self):
"""
c2.tag_lower(t2)
t3 = c3.create_line(x, 0, x, BOTTOM)
c3.tag_lower(t3)
- t4 = c3.create_text(x + 2, 0, text=repr(i), anchor="nw", font=self._font)
+ t4 = c3.create_text(x + 2, 0, text=repr(i), anchor='nw', font=self._font)
c3.tag_lower(t4)
# if i % 4 == 0:
# if c1: c1.itemconfig(t1, width=2, fill='gray60')
# c3.itemconfig(t3, width=2, fill='gray60')
if i % 2 == 0:
if c1:
- c1.itemconfig(t1, fill="gray60")
+ c1.itemconfig(t1, fill='gray60')
if c2:
- c2.itemconfig(t2, fill="gray60")
- c3.itemconfig(t3, fill="gray60")
+ c2.itemconfig(t2, fill='gray60')
+ c3.itemconfig(t3, fill='gray60')
else:
if c1:
- c1.itemconfig(t1, fill="gray80")
+ c1.itemconfig(t1, fill='gray80')
if c2:
- c2.itemconfig(t2, fill="gray80")
- c3.itemconfig(t3, fill="gray80")
+ c2.itemconfig(t2, fill='gray80')
+ c3.itemconfig(t3, fill='gray80')
def _draw_sentence(self):
"""Draw the sentence string."""
x2 = x1 + self._unitsize
x = (x1 + x2) / 2
tag = c.create_text(
- x, y, text=repr(leaf), font=self._font, anchor="n", justify="left"
+ x, y, text=repr(leaf), font=self._font, anchor='n', justify='left'
)
bbox = c.bbox(tag)
rt = c.create_rectangle(
bbox[1] - (ChartView._LEAF_SPACING / 2),
x2 - 2,
bbox[3] + (ChartView._LEAF_SPACING / 2),
- fill="#f0f0f0",
- outline="#f0f0f0",
+ fill='#f0f0f0',
+ outline='#f0f0f0',
)
c.tag_lower(rt)
# Update the scroll region.
w = self._chart.num_leaves() * self._unitsize + 2 * ChartView._MARGIN
h = tree.height() * (ChartView._TREE_LEVEL_SIZE + self._text_height)
- self._tree_canvas["scrollregion"] = (0, 0, w, h)
+ self._tree_canvas['scrollregion'] = (0, 0, w, h)
def cycle_tree(self):
self._treetoks_index = (self._treetoks_index + 1) % len(self._treetoks)
return
# Draw the label.
- label = "%d Trees" % len(self._treetoks)
+ label = '%d Trees' % len(self._treetoks)
c = self._tree_canvas
margin = ChartView._MARGIN
right = self._chart.num_leaves() * self._unitsize + margin - 2
- tag = c.create_text(right, 2, anchor="ne", text=label, font=self._boldfont)
+ tag = c.create_text(right, 2, anchor='ne', text=label, font=self._boldfont)
self._tree_tags.append(tag)
_, _, _, y = c.bbox(tag)
for i in range(len(self._treetoks)):
x = right - 20 * (len(self._treetoks) - i - 1)
if i == self._treetoks_index:
- fill = "#084"
+ fill = '#084'
else:
- fill = "#fff"
+ fill = '#fff'
tag = c.create_polygon(
- x, y + 10, x - 5, y, x - 10, y + 10, fill=fill, outline="black"
+ x, y + 10, x - 5, y, x - 10, y + 10, fill=fill, outline='black'
)
self._tree_tags.append(tag)
self._treetoks_index = i
self.draw_tree()
- c.tag_bind(tag, "<Button-1>", cb)
+ c.tag_bind(tag, '<Button-1>', cb)
def _draw_treetok(self, treetok, index, depth=0):
"""
tag = c.create_text(
nodex,
nodey,
- anchor="n",
- justify="center",
+ anchor='n',
+ justify='center',
text=str(treetok.label()),
- fill="#042",
+ fill='#042',
font=self._boldfont,
)
self._tree_tags.append(tag)
childx,
childy,
width=2,
- fill="#084",
+ fill='#084',
)
self._tree_tags.append(tag)
if isinstance(child, Tree) and not child:
childx,
childy,
width=2,
- fill="#048",
- dash="2 3",
+ fill='#048',
+ dash='2 3',
)
self._tree_tags.append(tag)
if not isinstance(child, Tree):
childx,
10000,
width=2,
- fill="#084",
+ fill='#084',
)
self._tree_tags.append(tag)
Draw everything (from scratch).
"""
if self._tree_canvas:
- self._tree_canvas.delete("all")
+ self._tree_canvas.delete('all')
self.draw_tree()
if self._sentence_canvas:
- self._sentence_canvas.delete("all")
+ self._sentence_canvas.delete('all')
self._draw_sentence()
- self._chart_canvas.delete("all")
+ self._chart_canvas.delete('all')
self._edgetags = {}
# Redraw any edges we erased.
class ChartParserApp(object):
- def __init__(self, grammar, tokens, title="Chart Parser Application"):
+ def __init__(self, grammar, tokens, title='Chart Parser Application'):
# Initialize the parser
self._init_parser(grammar, tokens)
# Create the root window.
self._root = Tk()
self._root.title(title)
- self._root.bind("<Control-q>", self.destroy)
+ self._root.bind('<Control-q>', self.destroy)
# Set up some frames.
frame3 = Frame(self._root)
frame2 = Frame(self._root)
frame1 = Frame(self._root)
- frame3.pack(side="bottom", fill="none")
- frame2.pack(side="bottom", fill="x")
- frame1.pack(side="bottom", fill="both", expand=1)
+ frame3.pack(side='bottom', fill='none')
+ frame2.pack(side='bottom', fill='x')
+ frame1.pack(side='bottom', fill='both', expand=1)
self._init_fonts(self._root)
self._init_animation()
self._init_bindings()
except:
- print("Error creating Tree View")
+ print('Error creating Tree View')
self.destroy()
raise
# What's our font size (default=same as sysfont)
self._size = IntVar(root)
- self._size.set(self._sysfont.cget("size"))
+ self._size.set(self._sysfont.cget('size'))
- self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
- self._font = Font(family="helvetica", size=self._size.get())
+ self._boldfont = Font(family='helvetica', weight='bold', size=self._size.get())
+ self._font = Font(family='helvetica', size=self._size.get())
def _init_animation(self):
# Are we stepping? (default=yes)
def _init_chartview(self, parent):
self._cv = ChartView(self._chart, parent, draw_tree=1, draw_sentence=1)
- self._cv.add_callback("select", self._click_cv_edge)
+ self._cv.add_callback('select', self._click_cv_edge)
def _init_rulelabel(self, parent):
- ruletxt = "Last edge generated by:"
+ ruletxt = 'Last edge generated by:'
self._rulelabel1 = Label(parent, text=ruletxt, font=self._boldfont)
self._rulelabel2 = Label(
- parent, width=40, relief="groove", anchor="w", font=self._boldfont
+ parent, width=40, relief='groove', anchor='w', font=self._boldfont
)
- self._rulelabel1.pack(side="left")
- self._rulelabel2.pack(side="left")
- step = Checkbutton(parent, variable=self._step, text="Step")
- step.pack(side="right")
+ self._rulelabel1.pack(side='left')
+ self._rulelabel2.pack(side='left')
+ step = Checkbutton(parent, variable=self._step, text='Step')
+ step.pack(side='right')
def _init_buttons(self, parent):
frame1 = Frame(parent)
frame2 = Frame(parent)
- frame1.pack(side="bottom", fill="x")
- frame2.pack(side="top", fill="none")
+ frame1.pack(side='bottom', fill='x')
+ frame2.pack(side='top', fill='none')
Button(
frame1,
- text="Reset\nParser",
- background="#90c0d0",
- foreground="black",
+ text='Reset\nParser',
+ background='#90c0d0',
+ foreground='black',
command=self.reset,
- ).pack(side="right")
+ ).pack(side='right')
# Button(frame1, text='Pause',
# background='#90c0d0', foreground='black',
# command=self.pause).pack(side='left')
Button(
frame1,
- text="Top Down\nStrategy",
- background="#90c0d0",
- foreground="black",
+ text='Top Down\nStrategy',
+ background='#90c0d0',
+ foreground='black',
command=self.top_down_strategy,
- ).pack(side="left")
+ ).pack(side='left')
Button(
frame1,
- text="Bottom Up\nStrategy",
- background="#90c0d0",
- foreground="black",
+ text='Bottom Up\nStrategy',
+ background='#90c0d0',
+ foreground='black',
command=self.bottom_up_strategy,
- ).pack(side="left")
+ ).pack(side='left')
Button(
frame1,
- text="Bottom Up\nLeft-Corner Strategy",
- background="#90c0d0",
- foreground="black",
+ text='Bottom Up\nLeft-Corner Strategy',
+ background='#90c0d0',
+ foreground='black',
command=self.bottom_up_leftcorner_strategy,
- ).pack(side="left")
+ ).pack(side='left')
Button(
frame2,
- text="Top Down Init\nRule",
- background="#90f090",
- foreground="black",
+ text='Top Down Init\nRule',
+ background='#90f090',
+ foreground='black',
command=self.top_down_init,
- ).pack(side="left")
+ ).pack(side='left')
Button(
frame2,
- text="Top Down Predict\nRule",
- background="#90f090",
- foreground="black",
+ text='Top Down Predict\nRule',
+ background='#90f090',
+ foreground='black',
command=self.top_down_predict,
- ).pack(side="left")
- Frame(frame2, width=20).pack(side="left")
+ ).pack(side='left')
+ Frame(frame2, width=20).pack(side='left')
Button(
frame2,
- text="Bottom Up Predict\nRule",
- background="#90f090",
- foreground="black",
+ text='Bottom Up Predict\nRule',
+ background='#90f090',
+ foreground='black',
command=self.bottom_up,
- ).pack(side="left")
- Frame(frame2, width=20).pack(side="left")
+ ).pack(side='left')
+ Frame(frame2, width=20).pack(side='left')
Button(
frame2,
- text="Bottom Up Left-Corner\nPredict Rule",
- background="#90f090",
- foreground="black",
+ text='Bottom Up Left-Corner\nPredict Rule',
+ background='#90f090',
+ foreground='black',
command=self.bottom_up_leftcorner,
- ).pack(side="left")
- Frame(frame2, width=20).pack(side="left")
+ ).pack(side='left')
+ Frame(frame2, width=20).pack(side='left')
Button(
frame2,
- text="Fundamental\nRule",
- background="#90f090",
- foreground="black",
+ text='Fundamental\nRule',
+ background='#90f090',
+ foreground='black',
command=self.fundamental,
- ).pack(side="left")
+ ).pack(side='left')
def _init_bindings(self):
- self._root.bind("<Up>", self._cv.scroll_up)
- self._root.bind("<Down>", self._cv.scroll_down)
- self._root.bind("<Prior>", self._cv.page_up)
- self._root.bind("<Next>", self._cv.page_down)
- self._root.bind("<Control-q>", self.destroy)
- self._root.bind("<Control-x>", self.destroy)
- self._root.bind("<F1>", self.help)
-
- self._root.bind("<Control-s>", self.save_chart)
- self._root.bind("<Control-o>", self.load_chart)
- self._root.bind("<Control-r>", self.reset)
-
- self._root.bind("t", self.top_down_strategy)
- self._root.bind("b", self.bottom_up_strategy)
- self._root.bind("c", self.bottom_up_leftcorner_strategy)
- self._root.bind("<space>", self._stop_animation)
-
- self._root.bind("<Control-g>", self.edit_grammar)
- self._root.bind("<Control-t>", self.edit_sentence)
+ self._root.bind('<Up>', self._cv.scroll_up)
+ self._root.bind('<Down>', self._cv.scroll_down)
+ self._root.bind('<Prior>', self._cv.page_up)
+ self._root.bind('<Next>', self._cv.page_down)
+ self._root.bind('<Control-q>', self.destroy)
+ self._root.bind('<Control-x>', self.destroy)
+ self._root.bind('<F1>', self.help)
+
+ self._root.bind('<Control-s>', self.save_chart)
+ self._root.bind('<Control-o>', self.load_chart)
+ self._root.bind('<Control-r>', self.reset)
+
+ self._root.bind('t', self.top_down_strategy)
+ self._root.bind('b', self.bottom_up_strategy)
+ self._root.bind('c', self.bottom_up_leftcorner_strategy)
+ self._root.bind('<space>', self._stop_animation)
+
+ self._root.bind('<Control-g>', self.edit_grammar)
+ self._root.bind('<Control-t>', self.edit_sentence)
# Animation speed control
- self._root.bind("-", lambda e, a=self._animate: a.set(1))
- self._root.bind("=", lambda e, a=self._animate: a.set(2))
- self._root.bind("+", lambda e, a=self._animate: a.set(3))
+ self._root.bind('-', lambda e, a=self._animate: a.set(1))
+ self._root.bind('=', lambda e, a=self._animate: a.set(2))
+ self._root.bind('+', lambda e, a=self._animate: a.set(3))
# Step control
- self._root.bind("s", lambda e, s=self._step: s.set(not s.get()))
+ self._root.bind('s', lambda e, s=self._step: s.set(not s.get()))
def _init_menubar(self):
menubar = Menu(self._root)
filemenu = Menu(menubar, tearoff=0)
filemenu.add_command(
- label="Save Chart",
+ label='Save Chart',
underline=0,
command=self.save_chart,
- accelerator="Ctrl-s",
+ accelerator='Ctrl-s',
)
filemenu.add_command(
- label="Load Chart",
+ label='Load Chart',
underline=0,
command=self.load_chart,
- accelerator="Ctrl-o",
+ accelerator='Ctrl-o',
)
filemenu.add_command(
- label="Reset Chart", underline=0, command=self.reset, accelerator="Ctrl-r"
+ label='Reset Chart', underline=0, command=self.reset, accelerator='Ctrl-r'
)
filemenu.add_separator()
- filemenu.add_command(label="Save Grammar", command=self.save_grammar)
- filemenu.add_command(label="Load Grammar", command=self.load_grammar)
+ filemenu.add_command(label='Save Grammar', command=self.save_grammar)
+ filemenu.add_command(label='Load Grammar', command=self.load_grammar)
filemenu.add_separator()
filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+ label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x'
)
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
editmenu = Menu(menubar, tearoff=0)
editmenu.add_command(
- label="Edit Grammar",
+ label='Edit Grammar',
underline=5,
command=self.edit_grammar,
- accelerator="Ctrl-g",
+ accelerator='Ctrl-g',
)
editmenu.add_command(
- label="Edit Text",
+ label='Edit Text',
underline=5,
command=self.edit_sentence,
- accelerator="Ctrl-t",
+ accelerator='Ctrl-t',
)
- menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
+ menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
viewmenu = Menu(menubar, tearoff=0)
viewmenu.add_command(
- label="Chart Matrix", underline=6, command=self.view_matrix
+ label='Chart Matrix', underline=6, command=self.view_matrix
)
- viewmenu.add_command(label="Results", underline=0, command=self.view_results)
- menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+ viewmenu.add_command(label='Results', underline=0, command=self.view_results)
+ menubar.add_cascade(label='View', underline=0, menu=viewmenu)
rulemenu = Menu(menubar, tearoff=0)
rulemenu.add_command(
- label="Top Down Strategy",
+ label='Top Down Strategy',
underline=0,
command=self.top_down_strategy,
- accelerator="t",
+ accelerator='t',
)
rulemenu.add_command(
- label="Bottom Up Strategy",
+ label='Bottom Up Strategy',
underline=0,
command=self.bottom_up_strategy,
- accelerator="b",
+ accelerator='b',
)
rulemenu.add_command(
- label="Bottom Up Left-Corner Strategy",
+ label='Bottom Up Left-Corner Strategy',
underline=0,
command=self.bottom_up_leftcorner_strategy,
- accelerator="c",
+ accelerator='c',
)
rulemenu.add_separator()
- rulemenu.add_command(label="Bottom Up Rule", command=self.bottom_up)
+ rulemenu.add_command(label='Bottom Up Rule', command=self.bottom_up)
rulemenu.add_command(
- label="Bottom Up Left-Corner Rule", command=self.bottom_up_leftcorner
+ label='Bottom Up Left-Corner Rule', command=self.bottom_up_leftcorner
)
- rulemenu.add_command(label="Top Down Init Rule", command=self.top_down_init)
+ rulemenu.add_command(label='Top Down Init Rule', command=self.top_down_init)
rulemenu.add_command(
- label="Top Down Predict Rule", command=self.top_down_predict
+ label='Top Down Predict Rule', command=self.top_down_predict
)
- rulemenu.add_command(label="Fundamental Rule", command=self.fundamental)
- menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)
+ rulemenu.add_command(label='Fundamental Rule', command=self.fundamental)
+ menubar.add_cascade(label='Apply', underline=0, menu=rulemenu)
animatemenu = Menu(menubar, tearoff=0)
animatemenu.add_checkbutton(
- label="Step", underline=0, variable=self._step, accelerator="s"
+ label="Step", underline=0, variable=self._step, accelerator='s'
)
animatemenu.add_separator()
animatemenu.add_radiobutton(
underline=0,
variable=self._animate,
value=1,
- accelerator="-",
+ accelerator='-',
)
animatemenu.add_radiobutton(
label="Normal Animation",
underline=0,
variable=self._animate,
value=2,
- accelerator="=",
+ accelerator='=',
)
animatemenu.add_radiobutton(
label="Fast Animation",
underline=0,
variable=self._animate,
value=3,
- accelerator="+",
+ accelerator='+',
)
menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
zoommenu = Menu(menubar, tearoff=0)
zoommenu.add_radiobutton(
- label="Tiny",
+ label='Tiny',
variable=self._size,
underline=0,
value=10,
command=self.resize,
)
zoommenu.add_radiobutton(
- label="Small",
+ label='Small',
variable=self._size,
underline=0,
value=12,
command=self.resize,
)
zoommenu.add_radiobutton(
- label="Medium",
+ label='Medium',
variable=self._size,
underline=0,
value=14,
command=self.resize,
)
zoommenu.add_radiobutton(
- label="Large",
+ label='Large',
variable=self._size,
underline=0,
value=18,
command=self.resize,
)
zoommenu.add_radiobutton(
- label="Huge",
+ label='Huge',
variable=self._size,
underline=0,
value=24,
command=self.resize,
)
- menubar.add_cascade(label="Zoom", underline=0, menu=zoommenu)
+ menubar.add_cascade(label='Zoom', underline=0, menu=zoommenu)
helpmenu = Menu(menubar, tearoff=0)
- helpmenu.add_command(label="About", underline=0, command=self.about)
+ helpmenu.add_command(label='About', underline=0, command=self.about)
helpmenu.add_command(
- label="Instructions", underline=0, command=self.help, accelerator="F1"
+ label='Instructions', underline=0, command=self.help, accelerator='F1'
)
- menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+ menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
self._root.config(menu=menubar)
def _select_edge(self, edge):
self._selection = edge
# Update the chart view.
- self._cv.markonly_edge(edge, "#f00")
+ self._cv.markonly_edge(edge, '#f00')
self._cv.draw_tree(edge)
# Update the matrix view.
if self._matrix:
# Update the chart view.
self._cv.update()
self._cv.draw_tree(edge)
- self._cv.markonly_edge(edge, "#0df")
+ self._cv.markonly_edge(edge, '#0df')
self._cv.view_edge(edge)
# Update the matrix view.
if self._matrix:
try:
ShowText(
self._root,
- "Help: Chart Parser Application",
- (__doc__ or "").strip(),
+ 'Help: Chart Parser Application',
+ (__doc__ or '').strip(),
width=75,
- font="fixed",
+ font='fixed',
)
except:
ShowText(
self._root,
- "Help: Chart Parser Application",
- (__doc__ or "").strip(),
+ 'Help: Chart Parser Application',
+ (__doc__ or '').strip(),
width=75,
)
def about(self, *e):
ABOUT = "NLTK Chart Parser Application\n" + "Written by Edward Loper"
- showinfo("About: Chart Parser Application", ABOUT)
+ showinfo('About: Chart Parser Application', ABOUT)
# ////////////////////////////////////////////////////////////
# File Menu
# ////////////////////////////////////////////////////////////
- CHART_FILE_TYPES = [("Pickle file", ".pickle"), ("All files", "*")]
+ CHART_FILE_TYPES = [('Pickle file', '.pickle'), ('All files', '*')]
GRAMMAR_FILE_TYPES = [
- ("Plaintext grammar file", ".cfg"),
- ("Pickle file", ".pickle"),
- ("All files", "*"),
+ ('Plaintext grammar file', '.cfg'),
+ ('Pickle file', '.pickle'),
+ ('All files', '*'),
]
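# Illustrative note (not in the original source): the plaintext ".cfg" format
# accepted below is one production per line with quoted terminals -- the same
# layout that save_grammar() writes out -- e.g.
#   S -> NP VP
#   NP -> Det N
#   Det -> 'the'
#   N -> 'cake'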
def load_chart(self, *args):
"Load a chart from a pickle file"
filename = askopenfilename(
- filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
+ filetypes=self.CHART_FILE_TYPES, defaultextension='.pickle'
)
if not filename:
return
try:
- with open(filename, "rb") as infile:
+ with open(filename, 'rb') as infile:
chart = pickle.load(infile)
self._chart = chart
self._cv.update(chart)
self._cp.set_chart(chart)
except Exception as e:
raise
- showerror("Error Loading Chart", "Unable to open file: %r" % filename)
+ showerror('Error Loading Chart', 'Unable to open file: %r' % filename)
def save_chart(self, *args):
"Save a chart to a pickle file"
filename = asksaveasfilename(
- filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
+ filetypes=self.CHART_FILE_TYPES, defaultextension='.pickle'
)
if not filename:
return
try:
- with open(filename, "wb") as outfile:
+ with open(filename, 'wb') as outfile:
pickle.dump(self._chart, outfile)
except Exception as e:
raise
- showerror("Error Saving Chart", "Unable to open file: %r" % filename)
+ showerror('Error Saving Chart', 'Unable to open file: %r' % filename)
def load_grammar(self, *args):
"Load a grammar from a pickle file"
filename = askopenfilename(
- filetypes=self.GRAMMAR_FILE_TYPES, defaultextension=".cfg"
+ filetypes=self.GRAMMAR_FILE_TYPES, defaultextension='.cfg'
)
if not filename:
return
try:
- if filename.endswith(".pickle"):
- with open(filename, "rb") as infile:
+ if filename.endswith('.pickle'):
+ with open(filename, 'rb') as infile:
grammar = pickle.load(infile)
else:
- with open(filename, "r") as infile:
+ with open(filename, 'r') as infile:
grammar = CFG.fromstring(infile.read())
self.set_grammar(grammar)
except Exception as e:
- showerror("Error Loading Grammar", "Unable to open file: %r" % filename)
+ showerror('Error Loading Grammar', 'Unable to open file: %r' % filename)
def save_grammar(self, *args):
filename = asksaveasfilename(
- filetypes=self.GRAMMAR_FILE_TYPES, defaultextension=".cfg"
+ filetypes=self.GRAMMAR_FILE_TYPES, defaultextension='.cfg'
)
if not filename:
return
try:
- if filename.endswith(".pickle"):
- with open(filename, "wb") as outfile:
+ if filename.endswith('.pickle'):
+ with open(filename, 'wb') as outfile:
pickle.dump((self._chart, self._tokens), outfile)
else:
- with open(filename, "w") as outfile:
+ with open(filename, 'w') as outfile:
prods = self._grammar.productions()
start = [p for p in prods if p.lhs() == self._grammar.start()]
rest = [p for p in prods if p.lhs() != self._grammar.start()]
for prod in start:
- outfile.write("%s\n" % prod)
+ outfile.write('%s\n' % prod)
for prod in rest:
- outfile.write("%s\n" % prod)
+ outfile.write('%s\n' % prod)
except Exception as e:
- showerror("Error Saving Grammar", "Unable to open file: %r" % filename)
+ showerror('Error Saving Grammar', 'Unable to open file: %r' % filename)
def reset(self, *args):
self._animating = 0
def edit_sentence(self, *e):
sentence = " ".join(self._tokens)
- title = "Edit Text"
- instr = "Enter a new sentence to parse."
+ title = 'Edit Text'
+ instr = 'Enter a new sentence to parse.'
EntryDialog(self._root, sentence, instr, self.set_sentence, title)
def set_sentence(self, sentence):
if self._matrix is not None:
self._matrix.destroy()
self._matrix = ChartMatrixView(self._root, self._chart)
- self._matrix.add_callback("select", self._select_matrix_edge)
+ self._matrix.add_callback('select', self._select_matrix_edge)
def view_results(self, *e):
if self._results is not None:
def _display_rule(self, rule):
if rule is None:
- self._rulelabel2["text"] = ""
+ self._rulelabel2['text'] = ''
else:
name = str(rule)
- self._rulelabel2["text"] = name
+ self._rulelabel2['text'] = name
size = self._cv.get_font_size()
# ////////////////////////////////////////////////////////////
"""
)
- sent = "John ate the cake on the table with a fork"
- sent = "John ate the cake on the table"
+ sent = 'John ate the cake on the table with a fork'
+ sent = 'John ate the cake on the table'
tokens = list(sent.split())
- print("grammar= (")
+ print('grammar= (')
for rule in grammar.productions():
- print((" ", repr(rule) + ","))
- print(")")
- print(("tokens = %r" % tokens))
+ print(' ', repr(rule) + ',')
+ print(')')
+ print('tokens = %r' % tokens)
print('Calling "ChartParserApp(grammar, tokens)"...')
ChartParserApp(grammar, tokens).mainloop()
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
# Chart comparer:
# p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
# p.strip_dirs().sort_stats('cum', 'time').print_stats(60)
-__all__ = ["app"]
+__all__ = ['app']
# Natural Language Toolkit: Regexp Chunk Parser Application
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# configuration parameters to select what's being chunked (e.g. VP vs NP)
# and what part of the data is being used as the development set.
+from __future__ import division
import time
import textwrap
import re
import random
-from tkinter import (
+from six.moves.tkinter import (
Button,
Canvas,
Checkbutton,
Text,
Tk,
)
-from tkinter.filedialog import askopenfilename, asksaveasfilename
-from tkinter.font import Font
+from six.moves.tkinter_tkfiledialog import askopenfilename, asksaveasfilename
+from six.moves.tkinter_font import Font
from nltk.tree import Tree
from nltk.util import in_idle
#: which is used in the help text. (This should probably live with
#: the conll and/or treebank corpus instead.)
TAGSET = {
- "CC": "Coordinating conjunction",
- "PRP$": "Possessive pronoun",
- "CD": "Cardinal number",
- "RB": "Adverb",
- "DT": "Determiner",
- "RBR": "Adverb, comparative",
- "EX": "Existential there",
- "RBS": "Adverb, superlative",
- "FW": "Foreign word",
- "RP": "Particle",
- "JJ": "Adjective",
- "TO": "to",
- "JJR": "Adjective, comparative",
- "UH": "Interjection",
- "JJS": "Adjective, superlative",
- "VB": "Verb, base form",
- "LS": "List item marker",
- "VBD": "Verb, past tense",
- "MD": "Modal",
- "NNS": "Noun, plural",
- "NN": "Noun, singular or masps",
- "VBN": "Verb, past participle",
- "VBZ": "Verb,3rd ps. sing. present",
- "NNP": "Proper noun, singular",
- "NNPS": "Proper noun plural",
- "WDT": "wh-determiner",
- "PDT": "Predeterminer",
- "WP": "wh-pronoun",
- "POS": "Possessive ending",
- "WP$": "Possessive wh-pronoun",
- "PRP": "Personal pronoun",
- "WRB": "wh-adverb",
- "(": "open parenthesis",
- ")": "close parenthesis",
- "``": "open quote",
- ",": "comma",
- "''": "close quote",
- ".": "period",
- "#": "pound sign (currency marker)",
- "$": "dollar sign (currency marker)",
- "IN": "Preposition/subord. conjunction",
- "SYM": "Symbol (mathematical or scientific)",
- "VBG": "Verb, gerund/present participle",
- "VBP": "Verb, non-3rd ps. sing. present",
- ":": "colon",
+ 'CC': 'Coordinating conjunction',
+ 'PRP$': 'Possessive pronoun',
+ 'CD': 'Cardinal number',
+ 'RB': 'Adverb',
+ 'DT': 'Determiner',
+ 'RBR': 'Adverb, comparative',
+ 'EX': 'Existential there',
+ 'RBS': 'Adverb, superlative',
+ 'FW': 'Foreign word',
+ 'RP': 'Particle',
+ 'JJ': 'Adjective',
+ 'TO': 'to',
+ 'JJR': 'Adjective, comparative',
+ 'UH': 'Interjection',
+ 'JJS': 'Adjective, superlative',
+ 'VB': 'Verb, base form',
+ 'LS': 'List item marker',
+ 'VBD': 'Verb, past tense',
+ 'MD': 'Modal',
+ 'NNS': 'Noun, plural',
+ 'NN': 'Noun, singular or mass',
+ 'VBN': 'Verb, past participle',
+ 'VBZ': 'Verb, 3rd ps. sing. present',
+ 'NNP': 'Proper noun, singular',
+ 'NNPS': 'Proper noun, plural',
+ 'WDT': 'wh-determiner',
+ 'PDT': 'Predeterminer',
+ 'WP': 'wh-pronoun',
+ 'POS': 'Possessive ending',
+ 'WP$': 'Possessive wh-pronoun',
+ 'PRP': 'Personal pronoun',
+ 'WRB': 'wh-adverb',
+ '(': 'open parenthesis',
+ ')': 'close parenthesis',
+ '``': 'open quote',
+ ',': 'comma',
+ "''": 'close quote',
+ '.': 'period',
+ '#': 'pound sign (currency marker)',
+ '$': 'dollar sign (currency marker)',
+ 'IN': 'Preposition/subord. conjunction',
+ 'SYM': 'Symbol (mathematical or scientific)',
+ 'VBG': 'Verb, gerund/present participle',
+ 'VBP': 'Verb, non-3rd ps. sing. present',
+ ':': 'colon',
}
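# Illustrative use of the table above: TAGSET['VBD'] gives 'Verb, past tense'.
# The whole mapping is also what gets substituted for the <<TAGSET>> placeholder
# in the help text (see the HELP pages and show_help below).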
#: Contents for the help box. This is a list of tuples, one for
#: for a list of tags you can use for colorizing.
HELP = [
(
- "Help",
- "20",
+ 'Help',
+ '20',
"Welcome to the regular expression chunk-parser grammar editor. "
"You can use this editor to develop and test chunk parser grammars "
"based on NLTK's RegexpChunkParser class.\n\n"
"the status bar at the bottom of the window.",
),
(
- "Rules",
- "10",
+ 'Rules',
+ '10',
"<h1>{...regexp...}</h1>"
"<indent>\nChunk rule: creates new chunks from words matching "
"regexp.</indent>\n\n"
"and regexp2</indent>\n",
),
(
- "Regexps",
- "10 60",
+ 'Regexps',
+ '10 60',
# "Regular Expression Syntax Summary:\n\n"
"<h1>Pattern\t\tMatches...</h1>\n"
"<hangindent>"
"</hangindent>"
"\n<h1>Examples:</h1>\n"
"<hangindent>"
- "\t<regexp><NN></regexp>\n"
+ '\t<regexp><NN></regexp>\n'
'\t\tMatches <match>"cow/NN"</match>\n'
'\t\tMatches <match>"green/NN"</match>\n'
- "\t<regexp><VB.*></regexp>\n"
+ '\t<regexp><VB.*></regexp>\n'
'\t\tMatches <match>"eating/VBG"</match>\n'
'\t\tMatches <match>"ate/VBD"</match>\n'
- "\t<regexp><IN><DT><NN></regexp>\n"
+ '\t<regexp><IN><DT><NN></regexp>\n'
'\t\tMatches <match>"on/IN the/DT car/NN"</match>\n'
- "\t<regexp><RB>?<VBD></regexp>\n"
+ '\t<regexp><RB>?<VBD></regexp>\n'
'\t\tMatches <match>"ran/VBD"</match>\n'
'\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
- "\t<regexp><\#><CD> # This is a comment...</regexp>\n"
+ '\t<regexp><\#><CD> # This is a comment...</regexp>\n'
'\t\tMatches <match>"#/# 100/CD"</match>\n'
"</hangindent>",
),
(
- "Tags",
- "10 60",
+ 'Tags',
+ '10 60',
"<h1>Part of Speech Tags:</h1>\n"
- + "<hangindent>"
- + "<<TAGSET>>"
- + "</hangindent>\n", # this gets auto-substituted w/ self.TAGSET
+ + '<hangindent>'
+ + '<<TAGSET>>'
+ + '</hangindent>\n', # this gets auto-substituted w/ self.TAGSET
),
]
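# A small grammar of the kind the help pages above describe (illustrative only,
# not taken from the original source); any RegexpChunkParser grammar works:
#
#     NP: {<DT>?<JJ>*<NN.*>+}    # chunk determiners, adjectives and nouns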
HELP_AUTOTAG = [
- ("red", dict(foreground="#a00")),
- ("green", dict(foreground="#080")),
- ("highlight", dict(background="#ddd")),
- ("underline", dict(underline=True)),
- ("h1", dict(underline=True)),
- ("indent", dict(lmargin1=20, lmargin2=20)),
- ("hangindent", dict(lmargin1=0, lmargin2=60)),
- ("var", dict(foreground="#88f")),
- ("regexp", dict(foreground="#ba7")),
- ("match", dict(foreground="#6a6")),
+ ('red', dict(foreground='#a00')),
+ ('green', dict(foreground='#080')),
+ ('highlight', dict(background='#ddd')),
+ ('underline', dict(underline=True)),
+ ('h1', dict(underline=True)),
+ ('indent', dict(lmargin1=20, lmargin2=20)),
+ ('hangindent', dict(lmargin1=0, lmargin2=60)),
+ ('var', dict(foreground='#88f')),
+ ('regexp', dict(foreground='#ba7')),
+ ('match', dict(foreground='#6a6')),
]
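# How these tags are applied (see show_help below): for a markup pair such as
# <h1>...</h1>, the literal <h1> and </h1> markers get the 'elide' tag (hidden)
# and the enclosed text gets 'tag-h1', so e.g. h1 text is rendered underlined.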
##/////////////////////////////////////////////////////////////////
_GRAMMARBOX_PARAMS = dict(
width=40,
height=12,
- background="#efe",
- highlightbackground="#efe",
+ background='#efe',
+ highlightbackground='#efe',
highlightthickness=1,
- relief="groove",
+ relief='groove',
border=2,
- wrap="word",
+ wrap='word',
)
_HELPBOX_PARAMS = dict(
width=15,
height=15,
- background="#efe",
- highlightbackground="#efe",
- foreground="#555",
+ background='#efe',
+ highlightbackground='#efe',
+ foreground='#555',
highlightthickness=1,
- relief="groove",
+ relief='groove',
border=2,
- wrap="word",
+ wrap='word',
)
_DEVSETBOX_PARAMS = dict(
width=70,
height=10,
- background="#eef",
- highlightbackground="#eef",
+ background='#eef',
+ highlightbackground='#eef',
highlightthickness=1,
- relief="groove",
+ relief='groove',
border=2,
- wrap="word",
+ wrap='word',
tabs=(30,),
)
- _STATUS_PARAMS = dict(background="#9bb", relief="groove", border=2)
- _FONT_PARAMS = dict(family="helvetica", size=-20)
- _FRAME_PARAMS = dict(background="#777", padx=2, pady=2, border=3)
+ _STATUS_PARAMS = dict(background='#9bb', relief='groove', border=2)
+ _FONT_PARAMS = dict(family='helvetica', size=-20)
+ _FRAME_PARAMS = dict(background='#777', padx=2, pady=2, border=3)
_EVALBOX_PARAMS = dict(
- background="#eef",
- highlightbackground="#eef",
+ background='#eef',
+ highlightbackground='#eef',
highlightthickness=1,
- relief="groove",
+ relief='groove',
border=2,
width=300,
height=280,
)
_BUTTON_PARAMS = dict(
- background="#777", activebackground="#777", highlightbackground="#777"
+ background='#777', activebackground='#777', highlightbackground='#777'
)
- _HELPTAB_BG_COLOR = "#aba"
- _HELPTAB_FG_COLOR = "#efe"
+ _HELPTAB_BG_COLOR = '#aba'
+ _HELPTAB_FG_COLOR = '#efe'
- _HELPTAB_FG_PARAMS = dict(background="#efe")
- _HELPTAB_BG_PARAMS = dict(background="#aba")
+ _HELPTAB_FG_PARAMS = dict(background='#efe')
+ _HELPTAB_BG_PARAMS = dict(background='#aba')
_HELPTAB_SPACER = 6
def normalize_grammar(self, grammar):
# Strip comments
- grammar = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", grammar)
+ grammar = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', grammar)
# Normalize whitespace
- grammar = re.sub(" +", " ", grammar)
- grammar = re.sub("\n\s+", "\n", grammar)
+ grammar = re.sub(' +', ' ', grammar)
+ grammar = re.sub('\n\s+', '\n', grammar)
grammar = grammar.strip()
# [xx] Hack: automatically backslash $!
- grammar = re.sub(r"([^\\])\$", r"\1\\$", grammar)
+ grammar = re.sub(r'([^\\])\$', r'\1\\$', grammar)
return grammar
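# Illustrative example of the normalization above (the input line is made up,
# not taken from the original source):
#
#     'NP:   {<DT>?<JJ>*<NN>}    # noun phrases'
#
# has its comment stripped and whitespace collapsed, yielding
#
#     'NP: {<DT>?<JJ>*<NN>}'
#
# and an unescaped '$' in a tag such as <PRP$> would be rewritten as <PRP\$>.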
def __init__(
self,
- devset_name="conll2000",
+ devset_name='conll2000',
devset=None,
- grammar="",
- chunk_label="NP",
+ grammar='',
+ chunk_label='NP',
tagset=None,
):
"""
# Named development sets:
if devset is None:
- if devset_name == "conll2000":
- devset = conll2000.chunked_sents("train.txt") # [:100]
- elif devset == "treebank":
+ if devset_name == 'conll2000':
+ devset = conll2000.chunked_sents('train.txt') # [:100]
+ elif devset_name == 'treebank':
devset = treebank_chunk.chunked_sents() # [:100]
else:
- raise ValueError("Unknown development set %s" % devset_name)
+ raise ValueError('Unknown development set %s' % devset_name)
self.chunker = None
"""The chunker built from the grammar string"""
# Set up the main window.
top = self.top = Tk()
- top.geometry("+50+50")
- top.title("Regexp Chunk Parser App")
- top.bind("<Control-q>", self.destroy)
+ top.geometry('+50+50')
+ top.title('Regexp Chunk Parser App')
+ top.bind('<Control-q>', self.destroy)
# Variable that restricts how much of the devset we look at.
self._devset_size = IntVar(top)
# If a grammar was given, then display it.
if grammar:
- self.grammarbox.insert("end", grammar + "\n")
- self.grammarbox.mark_set("insert", "1.0")
+ self.grammarbox.insert('end', grammar + '\n')
+ self.grammarbox.mark_set('insert', '1.0')
# Display the first item in the development set
self.show_devset(0)
self.update()
def _init_bindings(self, top):
- top.bind("<Control-n>", self._devset_next)
- top.bind("<Control-p>", self._devset_prev)
- top.bind("<Control-t>", self.toggle_show_trace)
- top.bind("<KeyPress>", self.update)
- top.bind("<Control-s>", lambda e: self.save_grammar())
- top.bind("<Control-o>", lambda e: self.load_grammar())
- self.grammarbox.bind("<Control-t>", self.toggle_show_trace)
- self.grammarbox.bind("<Control-n>", self._devset_next)
- self.grammarbox.bind("<Control-p>", self._devset_prev)
+ top.bind('<Control-n>', self._devset_next)
+ top.bind('<Control-p>', self._devset_prev)
+ top.bind('<Control-t>', self.toggle_show_trace)
+ top.bind('<KeyPress>', self.update)
+ top.bind('<Control-s>', lambda e: self.save_grammar())
+ top.bind('<Control-o>', lambda e: self.load_grammar())
+ self.grammarbox.bind('<Control-t>', self.toggle_show_trace)
+ self.grammarbox.bind('<Control-n>', self._devset_next)
+ self.grammarbox.bind('<Control-p>', self._devset_prev)
# Redraw the eval graph when the window size changes
- self.evalbox.bind("<Configure>", self._eval_plot)
+ self.evalbox.bind('<Configure>', self._eval_plot)
def _init_fonts(self, top):
# What's our font size (default=same as sysfont)
self._size = IntVar(top)
self._size.set(20)
- self._font = Font(family="helvetica", size=-self._size.get())
+ self._font = Font(family='helvetica', size=-self._size.get())
self._smallfont = Font(
- family="helvetica", size=-(int(self._size.get() * 14 // 20))
+ family='helvetica', size=-(int(self._size.get() * 14 // 20))
)
def _init_menubar(self, parent):
menubar = Menu(parent)
filemenu = Menu(menubar, tearoff=0)
- filemenu.add_command(label="Reset Application", underline=0, command=self.reset)
+ filemenu.add_command(label='Reset Application', underline=0, command=self.reset)
filemenu.add_command(
- label="Save Current Grammar",
+ label='Save Current Grammar',
underline=0,
- accelerator="Ctrl-s",
+ accelerator='Ctrl-s',
command=self.save_grammar,
)
filemenu.add_command(
- label="Load Grammar",
+ label='Load Grammar',
underline=0,
- accelerator="Ctrl-o",
+ accelerator='Ctrl-o',
command=self.load_grammar,
)
filemenu.add_command(
- label="Save Grammar History", underline=13, command=self.save_history
+ label='Save Grammar History', underline=13, command=self.save_history
)
filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
+ label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-q'
)
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
viewmenu = Menu(menubar, tearoff=0)
viewmenu.add_radiobutton(
- label="Tiny",
+ label='Tiny',
variable=self._size,
underline=0,
value=10,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Small",
+ label='Small',
variable=self._size,
underline=0,
value=16,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Medium",
+ label='Medium',
variable=self._size,
underline=0,
value=20,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Large",
+ label='Large',
variable=self._size,
underline=0,
value=24,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Huge",
+ label='Huge',
variable=self._size,
underline=0,
value=34,
command=self.resize,
)
- menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+ menubar.add_cascade(label='View', underline=0, menu=viewmenu)
devsetmenu = Menu(menubar, tearoff=0)
devsetmenu.add_radiobutton(
- label="50 sentences",
+ label='50 sentences',
variable=self._devset_size,
value=50,
command=self.set_devset_size,
)
devsetmenu.add_radiobutton(
- label="100 sentences",
+ label='100 sentences',
variable=self._devset_size,
value=100,
command=self.set_devset_size,
)
devsetmenu.add_radiobutton(
- label="200 sentences",
+ label='200 sentences',
variable=self._devset_size,
value=200,
command=self.set_devset_size,
)
devsetmenu.add_radiobutton(
- label="500 sentences",
+ label='500 sentences',
variable=self._devset_size,
value=500,
command=self.set_devset_size,
)
- menubar.add_cascade(label="Development-Set", underline=0, menu=devsetmenu)
+ menubar.add_cascade(label='Development-Set', underline=0, menu=devsetmenu)
helpmenu = Menu(menubar, tearoff=0)
- helpmenu.add_command(label="About", underline=0, command=self.about)
- menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+ helpmenu.add_command(label='About', underline=0, command=self.about)
+ menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
parent.config(menu=menubar)
self.show_devset()
else:
self.show_trace()
- return "break"
+ return 'break'
_SCALE_N = 5 # center on the last 5 examples.
_DRAW_LINES = False
def _eval_plot(self, *e, **config):
- width = config.get("width", self.evalbox.winfo_width())
- height = config.get("height", self.evalbox.winfo_height())
+ width = config.get('width', self.evalbox.winfo_width())
+ height = config.get('height', self.evalbox.winfo_height())
# Clear the canvas
- self.evalbox.delete("all")
+ self.evalbox.delete('all')
# Draw the precision & recall labels.
tag = self.evalbox.create_text(
- 10, height // 2 - 10, justify="left", anchor="w", text="Precision"
+ 10, height // 2 - 10, justify='left', anchor='w', text='Precision'
)
left, right = self.evalbox.bbox(tag)[2] + 5, width - 10
tag = self.evalbox.create_text(
left + (width - left) // 2,
height - 10,
- anchor="s",
- text="Recall",
- justify="center",
+ anchor='s',
+ text='Recall',
+ justify='center',
)
top, bot = 10, self.evalbox.bbox(tag)[1] - 10
# Draw masks for clipping the plot.
- bg = self._EVALBOX_PARAMS["background"]
+ bg = self._EVALBOX_PARAMS['background']
self.evalbox.lower(
self.evalbox.create_rectangle(0, 0, left - 1, 5000, fill=bg, outline=bg)
)
(i / 10.0 - min_precision) / (max_precision - min_precision)
)
if left < x < right:
- self.evalbox.create_line(x, top, x, bot, fill="#888")
+ self.evalbox.create_line(x, top, x, bot, fill='#888')
if top < y < bot:
- self.evalbox.create_line(left, y, right, y, fill="#888")
+ self.evalbox.create_line(left, y, right, y, fill='#888')
self.evalbox.create_line(left, top, left, bot)
self.evalbox.create_line(left, bot, right, bot)
self.evalbox.create_text(
left - 3,
bot,
- justify="right",
- anchor="se",
- text="%d%%" % (100 * min_precision),
+ justify='right',
+ anchor='se',
+ text='%d%%' % (100 * min_precision),
)
self.evalbox.create_text(
left - 3,
top,
- justify="right",
- anchor="ne",
- text="%d%%" % (100 * max_precision),
+ justify='right',
+ anchor='ne',
+ text='%d%%' % (100 * max_precision),
)
self.evalbox.create_text(
left,
bot + 3,
- justify="center",
- anchor="nw",
- text="%d%%" % (100 * min_recall),
+ justify='center',
+ anchor='nw',
+ text='%d%%' % (100 * min_recall),
)
self.evalbox.create_text(
right,
bot + 3,
- justify="center",
- anchor="ne",
- text="%d%%" % (100 * max_recall),
+ justify='center',
+ anchor='ne',
+ text='%d%%' % (100 * max_recall),
)
# Display the scores.
)
if i == self._history_index:
self.evalbox.create_oval(
- x - 2, y - 2, x + 2, y + 2, fill="#0f0", outline="#000"
+ x - 2, y - 2, x + 2, y + 2, fill='#0f0', outline='#000'
)
- self.status["text"] = (
- "Precision: %.2f%%\t" % (precision * 100)
- + "Recall: %.2f%%\t" % (recall * 100)
- + "F-score: %.2f%%" % (fscore * 100)
+ self.status['text'] = (
+ 'Precision: %.2f%%\t' % (precision * 100)
+ + 'Recall: %.2f%%\t' % (recall * 100)
+ + 'F-score: %.2f%%' % (fscore * 100)
)
else:
self.evalbox.lower(
self.evalbox.create_oval(
- x - 2, y - 2, x + 2, y + 2, fill="#afa", outline="#8c8"
+ x - 2, y - 2, x + 2, y + 2, fill='#afa', outline='#8c8'
)
)
if prev_x is not None and self._eval_lines.get():
self.evalbox.lower(
- self.evalbox.create_line(prev_x, prev_y, x, y, fill="#8c8")
+ self.evalbox.create_line(prev_x, prev_y, x, y, fill='#8c8')
)
prev_x, prev_y = x, y
# If the grammar is empty, then don't bother evaluating it, or
# recording it in history -- the score will just be 0.
- if self.normalized_grammar.strip() == "":
+ if self.normalized_grammar.strip() == '':
# self._eval_index = self._devset_size.get()
self._eval_demon_running = False
return
self._eval_normalized_grammar = None
else:
progress = 100 * self._eval_index / self._devset_size.get()
- self.status["text"] = "Evaluating on Development Set (%d%%)" % progress
+ self.status['text'] = 'Evaluating on Development Set (%d%%)' % progress
self._eval_demon_running = True
self._adaptively_modify_eval_chunk(time.time() - t0)
self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon)
self.grammarlabel = Label(
frame0,
font=self._font,
- text="Grammar:",
- highlightcolor="black",
- background=self._GRAMMARBOX_PARAMS["background"],
+ text='Grammar:',
+ highlightcolor='black',
+ background=self._GRAMMARBOX_PARAMS['background'],
)
- self.grammarlabel.grid(column=0, row=0, sticky="SW")
- self.grammarbox.grid(column=0, row=1, sticky="NEWS")
+ self.grammarlabel.grid(column=0, row=0, sticky='SW')
+ self.grammarbox.grid(column=0, row=1, sticky='NEWS')
# Scroll bar for grammar
grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview)
- grammar_scrollbar.grid(column=1, row=1, sticky="NWS")
+ grammar_scrollbar.grid(column=1, row=1, sticky='NWS')
self.grammarbox.config(yscrollcommand=grammar_scrollbar.set)
# grammar buttons
- bg = self._FRAME_PARAMS["background"]
+ bg = self._FRAME_PARAMS['background']
frame3 = Frame(frame0, background=bg)
- frame3.grid(column=0, row=2, sticky="EW")
+ frame3.grid(column=0, row=2, sticky='EW')
Button(
frame3,
- text="Prev Grammar",
+ text='Prev Grammar',
command=self._history_prev,
**self._BUTTON_PARAMS
- ).pack(side="left")
+ ).pack(side='left')
Button(
frame3,
- text="Next Grammar",
+ text='Next Grammar',
command=self._history_next,
**self._BUTTON_PARAMS
- ).pack(side="left")
+ ).pack(side='left')
# Help box
self.helpbox = Text(frame0, font=self._smallfont, **self._HELPBOX_PARAMS)
- self.helpbox.grid(column=3, row=1, sticky="NEWS")
+ self.helpbox.grid(column=3, row=1, sticky='NEWS')
self.helptabs = {}
- bg = self._FRAME_PARAMS["background"]
+ bg = self._FRAME_PARAMS['background']
helptab_frame = Frame(frame0, background=bg)
- helptab_frame.grid(column=3, row=0, sticky="SW")
+ helptab_frame.grid(column=3, row=0, sticky='SW')
for i, (tab, tabstops, text) in enumerate(self.HELP):
label = Label(helptab_frame, text=tab, font=self._smallfont)
- label.grid(column=i * 2, row=0, sticky="S")
+ label.grid(column=i * 2, row=0, sticky='S')
# help_frame.grid_columnconfigure(i, weight=1)
# label.pack(side='left')
- label.bind("<ButtonPress>", lambda e, tab=tab: self.show_help(tab))
+ label.bind('<ButtonPress>', lambda e, tab=tab: self.show_help(tab))
self.helptabs[tab] = label
Frame(
helptab_frame, height=1, width=self._HELPTAB_SPACER, background=bg
).grid(column=i * 2 + 1, row=0)
self.helptabs[self.HELP[0][0]].configure(font=self._font)
- self.helpbox.tag_config("elide", elide=True)
+ self.helpbox.tag_config('elide', elide=True)
for (tag, params) in self.HELP_AUTOTAG:
- self.helpbox.tag_config("tag-%s" % tag, **params)
+ self.helpbox.tag_config('tag-%s' % tag, **params)
self.show_help(self.HELP[0][0])
# Scroll bar for helpbox
help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview)
self.helpbox.config(yscrollcommand=help_scrollbar.set)
- help_scrollbar.grid(column=4, row=1, sticky="NWS")
+ help_scrollbar.grid(column=4, row=1, sticky='NWS')
# The dev set
- frame4 = Frame(frame0, background=self._FRAME_PARAMS["background"])
+ frame4 = Frame(frame0, background=self._FRAME_PARAMS['background'])
self.devsetbox = Text(frame4, font=self._font, **self._DEVSETBOX_PARAMS)
- self.devsetbox.pack(expand=True, fill="both")
+ self.devsetbox.pack(expand=True, fill='both')
self.devsetlabel = Label(
frame0,
font=self._font,
- text="Development Set:",
- justify="right",
- background=self._DEVSETBOX_PARAMS["background"],
+ text='Development Set:',
+ justify='right',
+ background=self._DEVSETBOX_PARAMS['background'],
)
- self.devsetlabel.grid(column=0, row=4, sticky="SW")
- frame4.grid(column=0, row=5, sticky="NEWS")
+ self.devsetlabel.grid(column=0, row=4, sticky='SW')
+ frame4.grid(column=0, row=5, sticky='NEWS')
# dev set scrollbars
self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll)
- self.devset_scroll.grid(column=1, row=5, sticky="NWS")
+ self.devset_scroll.grid(column=1, row=5, sticky='NWS')
self.devset_xscroll = Scrollbar(
- frame4, command=self.devsetbox.xview, orient="horiz"
+ frame4, command=self.devsetbox.xview, orient='horiz'
)
- self.devsetbox["xscrollcommand"] = self.devset_xscroll.set
- self.devset_xscroll.pack(side="bottom", fill="x")
+ self.devsetbox['xscrollcommand'] = self.devset_xscroll.set
+ self.devset_xscroll.pack(side='bottom', fill='x')
# dev set buttons
- bg = self._FRAME_PARAMS["background"]
+ bg = self._FRAME_PARAMS['background']
frame1 = Frame(frame0, background=bg)
- frame1.grid(column=0, row=7, sticky="EW")
+ frame1.grid(column=0, row=7, sticky='EW')
Button(
frame1,
- text="Prev Example (Ctrl-p)",
+ text='Prev Example (Ctrl-p)',
command=self._devset_prev,
**self._BUTTON_PARAMS
- ).pack(side="left")
+ ).pack(side='left')
Button(
frame1,
- text="Next Example (Ctrl-n)",
+ text='Next Example (Ctrl-n)',
command=self._devset_next,
**self._BUTTON_PARAMS
- ).pack(side="left")
+ ).pack(side='left')
self.devset_button = Button(
frame1,
- text="Show example",
+ text='Show example',
command=self.show_devset,
- state="disabled",
+ state='disabled',
**self._BUTTON_PARAMS
)
- self.devset_button.pack(side="right")
+ self.devset_button.pack(side='right')
self.trace_button = Button(
- frame1, text="Show trace", command=self.show_trace, **self._BUTTON_PARAMS
+ frame1, text='Show trace', command=self.show_trace, **self._BUTTON_PARAMS
)
- self.trace_button.pack(side="right")
+ self.trace_button.pack(side='right')
# evaluation box
self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS)
label = Label(
frame0,
font=self._font,
- text="Evaluation:",
- justify="right",
- background=self._EVALBOX_PARAMS["background"],
+ text='Evaluation:',
+ justify='right',
+ background=self._EVALBOX_PARAMS['background'],
)
- label.grid(column=3, row=4, sticky="SW")
- self.evalbox.grid(column=3, row=5, sticky="NEWS", columnspan=2)
+ label.grid(column=3, row=4, sticky='SW')
+ self.evalbox.grid(column=3, row=5, sticky='NEWS', columnspan=2)
# evaluation box buttons
- bg = self._FRAME_PARAMS["background"]
+ bg = self._FRAME_PARAMS['background']
frame2 = Frame(frame0, background=bg)
- frame2.grid(column=3, row=7, sticky="EW")
+ frame2.grid(column=3, row=7, sticky='EW')
self._autoscale = IntVar(self.top)
self._autoscale.set(False)
Checkbutton(
frame2,
variable=self._autoscale,
command=self._eval_plot,
- text="Zoom",
+ text='Zoom',
**self._BUTTON_PARAMS
- ).pack(side="left")
+ ).pack(side='left')
self._eval_lines = IntVar(self.top)
self._eval_lines.set(False)
Checkbutton(
frame2,
variable=self._eval_lines,
command=self._eval_plot,
- text="Lines",
+ text='Lines',
**self._BUTTON_PARAMS
- ).pack(side="left")
- Button(frame2, text="History", **self._BUTTON_PARAMS).pack(side="right")
+ ).pack(side='left')
+ Button(frame2, text='History', **self._BUTTON_PARAMS).pack(side='right')
# The status label
self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS)
- self.status.grid(column=0, row=9, sticky="NEW", padx=3, pady=2, columnspan=5)
+ self.status.grid(column=0, row=9, sticky='NEW', padx=3, pady=2, columnspan=5)
# Help box & devset box can't be edited.
- self.helpbox["state"] = "disabled"
- self.devsetbox["state"] = "disabled"
+ self.helpbox['state'] = 'disabled'
+ self.devsetbox['state'] = 'disabled'
# Spacers
- bg = self._FRAME_PARAMS["background"]
+ bg = self._FRAME_PARAMS['background']
Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3)
Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0)
Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8)
# pack the frame.
- frame0.pack(fill="both", expand=True)
+ frame0.pack(fill='both', expand=True)
# Set up colors for the devset box
- self.devsetbox.tag_config("true-pos", background="#afa", underline="True")
- self.devsetbox.tag_config("false-neg", underline="True", foreground="#800")
- self.devsetbox.tag_config("false-pos", background="#faa")
- self.devsetbox.tag_config("trace", foreground="#666", wrap="none")
- self.devsetbox.tag_config("wrapindent", lmargin2=30, wrap="none")
- self.devsetbox.tag_config("error", foreground="#800")
+ self.devsetbox.tag_config('true-pos', background='#afa', underline=True)
+ self.devsetbox.tag_config('false-neg', underline=True, foreground='#800')
+ self.devsetbox.tag_config('false-pos', background='#faa')
+ self.devsetbox.tag_config('trace', foreground='#666', wrap='none')
+ self.devsetbox.tag_config('wrapindent', lmargin2=30, wrap='none')
+ self.devsetbox.tag_config('error', foreground='#800')
# And for the grammarbox
- self.grammarbox.tag_config("error", background="#fec")
- self.grammarbox.tag_config("comment", foreground="#840")
- self.grammarbox.tag_config("angle", foreground="#00f")
- self.grammarbox.tag_config("brace", foreground="#0a0")
- self.grammarbox.tag_config("hangindent", lmargin1=0, lmargin2=40)
+ self.grammarbox.tag_config('error', background='#fec')
+ self.grammarbox.tag_config('comment', foreground='#840')
+ self.grammarbox.tag_config('angle', foreground='#00f')
+ self.grammarbox.tag_config('brace', foreground='#0a0')
+ self.grammarbox.tag_config('hangindent', lmargin1=0, lmargin2=40)
_showing_trace = False
def show_trace(self, *e):
self._showing_trace = True
- self.trace_button["state"] = "disabled"
- self.devset_button["state"] = "normal"
+ self.trace_button['state'] = 'disabled'
+ self.devset_button['state'] = 'normal'
- self.devsetbox["state"] = "normal"
+ self.devsetbox['state'] = 'normal'
# self.devsetbox['wrap'] = 'none'
- self.devsetbox.delete("1.0", "end")
- self.devsetlabel["text"] = "Development Set (%d/%d)" % (
+ self.devsetbox.delete('1.0', 'end')
+ self.devsetlabel['text'] = 'Development Set (%d/%d)' % (
(self.devset_index + 1, self._devset_size.get())
)
if self.chunker is None:
- self.devsetbox.insert("1.0", "Trace: waiting for a valid grammar.")
- self.devsetbox.tag_add("error", "1.0", "end")
+ self.devsetbox.insert('1.0', 'Trace: waiting for a valid grammar.')
+ self.devsetbox.tag_add('error', '1.0', 'end')
return # can't do anything more
gold_tree = self.devset[self.devset_index]
rules = self.chunker.rules()
# Calculate the tag sequence
- tagseq = "\t"
+ tagseq = '\t'
charnum = [1]
for wordnum, (word, pos) in enumerate(gold_tree.leaves()):
- tagseq += "%s " % pos
+ tagseq += '%s ' % pos
charnum.append(len(tagseq))
self.charnum = dict(
((i, j), charnum[j])
for i in range(len(rules) + 1):
if i == 0:
- self.devsetbox.insert("end", "Start:\n")
- self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
+ self.devsetbox.insert('end', 'Start:\n')
+ self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
else:
- self.devsetbox.insert("end", "Apply %s:\n" % rules[i - 1])
- self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
+ self.devsetbox.insert('end', 'Apply %s:\n' % rules[i - 1])
+ self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
# Display the tag sequence.
- self.devsetbox.insert("end", tagseq + "\n")
- self.devsetbox.tag_add("wrapindent", "end -2c linestart", "end -2c")
+ self.devsetbox.insert('end', tagseq + '\n')
+ self.devsetbox.tag_add('wrapindent', 'end -2c linestart', 'end -2c')
# Run a partial parser, and extract gold & test chunks
chunker = RegexpChunkParser(rules[:i])
test_tree = self._chunkparse(gold_tree.leaves())
test_chunks = self._chunks(test_tree)
# Compare them.
for chunk in gold_chunks.intersection(test_chunks):
- self._color_chunk(i, chunk, "true-pos")
+ self._color_chunk(i, chunk, 'true-pos')
for chunk in gold_chunks - test_chunks:
- self._color_chunk(i, chunk, "false-neg")
+ self._color_chunk(i, chunk, 'false-neg')
for chunk in test_chunks - gold_chunks:
- self._color_chunk(i, chunk, "false-pos")
- self.devsetbox.insert("end", "Finished.\n")
- self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
+ self._color_chunk(i, chunk, 'false-pos')
+ self.devsetbox.insert('end', 'Finished.\n')
+ self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
# This is a hack, because the x-scrollbar isn't updating its
# position right -- I'm not sure what the underlying cause is.
self.top.after(100, self.devset_xscroll.set, 0, 0.3)
def show_help(self, tab):
- self.helpbox["state"] = "normal"
- self.helpbox.delete("1.0", "end")
+ self.helpbox['state'] = 'normal'
+ self.helpbox.delete('1.0', 'end')
for (name, tabstops, text) in self.HELP:
if name == tab:
text = text.replace(
- "<<TAGSET>>",
- "\n".join(
+ '<<TAGSET>>',
+ '\n'.join(
(
- "\t%s\t%s" % item
+ '\t%s\t%s' % item
for item in sorted(
list(self.tagset.items()),
- key=lambda t_w: re.match("\w+", t_w[0])
+ key=lambda t_w: re.match('\w+', t_w[0])
and (0, t_w[0])
or (1, t_w[0]),
)
self.helptabs[name].config(**self._HELPTAB_FG_PARAMS)
self.helpbox.config(tabs=tabstops)
- self.helpbox.insert("1.0", text + "\n" * 20)
- C = "1.0 + %d chars"
+ self.helpbox.insert('1.0', text + '\n' * 20)
+ C = '1.0 + %d chars'
for (tag, params) in self.HELP_AUTOTAG:
- pattern = "(?s)(<%s>)(.*?)(</%s>)" % (tag, tag)
+ pattern = '(?s)(<%s>)(.*?)(</%s>)' % (tag, tag)
for m in re.finditer(pattern, text):
- self.helpbox.tag_add("elide", C % m.start(1), C % m.end(1))
+ self.helpbox.tag_add('elide', C % m.start(1), C % m.end(1))
self.helpbox.tag_add(
- "tag-%s" % tag, C % m.start(2), C % m.end(2)
+ 'tag-%s' % tag, C % m.start(2), C % m.end(2)
)
- self.helpbox.tag_add("elide", C % m.start(3), C % m.end(3))
+ self.helpbox.tag_add('elide', C % m.start(3), C % m.end(3))
else:
self.helptabs[name].config(**self._HELPTAB_BG_PARAMS)
- self.helpbox["state"] = "disabled"
+ self.helpbox['state'] = 'disabled'
def _history_prev(self, *e):
self._view_history(self._history_index - 1)
- return "break"
+ return 'break'
def _history_next(self, *e):
self._view_history(self._history_index + 1)
- return "break"
+ return 'break'
def _view_history(self, index):
# Bounds & sanity checking:
return
# Show the requested grammar. It will get added to _history
# only if they edit it (causing self.update() to get run.)
- self.grammarbox["state"] = "normal"
- self.grammarbox.delete("1.0", "end")
- self.grammarbox.insert("end", self._history[index][0])
- self.grammarbox.mark_set("insert", "1.0")
+ self.grammarbox['state'] = 'normal'
+ self.grammarbox.delete('1.0', 'end')
+ self.grammarbox.insert('end', self._history[index][0])
+ self.grammarbox.mark_set('insert', '1.0')
self._history_index = index
self._syntax_highlight_grammar(self._history[index][0])
# Record the normalized grammar & regenerate the chunker.
if self.normalized_grammar:
rules = [
RegexpChunkRule.fromstring(line)
- for line in self.normalized_grammar.split("\n")
+ for line in self.normalized_grammar.split('\n')
]
else:
rules = []
self.show_trace()
# Update the grammar label
if self._history_index < len(self._history) - 1:
- self.grammarlabel["text"] = "Grammar %s/%s:" % (
+ self.grammarlabel['text'] = 'Grammar %s/%s:' % (
self._history_index + 1,
len(self._history),
)
else:
- self.grammarlabel["text"] = "Grammar:"
+ self.grammarlabel['text'] = 'Grammar:'
def _devset_next(self, *e):
- self._devset_scroll("scroll", 1, "page")
- return "break"
+ self._devset_scroll('scroll', 1, 'page')
+ return 'break'
def _devset_prev(self, *e):
- self._devset_scroll("scroll", -1, "page")
- return "break"
+ self._devset_scroll('scroll', -1, 'page')
+ return 'break'
def destroy(self, *e):
if self.top is None:
def _devset_scroll(self, command, *args):
N = 1 # size of a page -- one sentence.
showing_trace = self._showing_trace
- if command == "scroll" and args[1].startswith("unit"):
+ if command == 'scroll' and args[1].startswith('unit'):
self.show_devset(self.devset_index + int(args[0]))
- elif command == "scroll" and args[1].startswith("page"):
+ elif command == 'scroll' and args[1].startswith('page'):
self.show_devset(self.devset_index + N * int(args[0]))
- elif command == "moveto":
+ elif command == 'moveto':
self.show_devset(int(float(args[0]) * self._devset_size.get()))
else:
- assert 0, "bad scroll command %s %s" % (command, args)
+ assert 0, 'bad scroll command %s %s' % (command, args)
if showing_trace:
self.show_trace()
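# The (command, *args) values mirror the standard Tkinter scrollbar callback
# protocol: dragging the slider yields ('moveto', fraction), while the arrows
# and trough yield ('scroll', n, 'units') or ('scroll', n, 'pages'); the
# Ctrl-n / Ctrl-p bindings above reuse the same path via ('scroll', 1, 'page')
# and ('scroll', -1, 'page').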
self.devset_index = index
self._showing_trace = False
- self.trace_button["state"] = "normal"
- self.devset_button["state"] = "disabled"
+ self.trace_button['state'] = 'normal'
+ self.devset_button['state'] = 'disabled'
# Clear the text box.
- self.devsetbox["state"] = "normal"
- self.devsetbox["wrap"] = "word"
- self.devsetbox.delete("1.0", "end")
- self.devsetlabel["text"] = "Development Set (%d/%d)" % (
+ self.devsetbox['state'] = 'normal'
+ self.devsetbox['wrap'] = 'word'
+ self.devsetbox.delete('1.0', 'end')
+ self.devsetlabel['text'] = 'Development Set (%d/%d)' % (
(self.devset_index + 1, self._devset_size.get())
)
self.charnum = {}
self.linenum = {0: 1}
for sentnum, sent in enumerate(sample):
- linestr = ""
+ linestr = ''
for wordnum, (word, pos) in enumerate(sent.leaves()):
self.charnum[sentnum, wordnum] = len(linestr)
- linestr += "%s/%s " % (word, pos)
+ linestr += '%s/%s ' % (word, pos)
self.charnum[sentnum, wordnum + 1] = len(linestr)
- self.devsetbox.insert("end", linestr[:-1] + "\n\n")
+ self.devsetbox.insert('end', linestr[:-1] + '\n\n')
# Highlight chunks in the dev set
if self.chunker is not None:
self._highlight_devset()
- self.devsetbox["state"] = "disabled"
+ self.devsetbox['state'] = 'disabled'
# Update the scrollbar
first = self.devset_index / self._devset_size.get()
def _syntax_highlight_grammar(self, grammar):
if self.top is None:
return
- self.grammarbox.tag_remove("comment", "1.0", "end")
- self.grammarbox.tag_remove("angle", "1.0", "end")
- self.grammarbox.tag_remove("brace", "1.0", "end")
- self.grammarbox.tag_add("hangindent", "1.0", "end")
- for lineno, line in enumerate(grammar.split("\n")):
+ self.grammarbox.tag_remove('comment', '1.0', 'end')
+ self.grammarbox.tag_remove('angle', '1.0', 'end')
+ self.grammarbox.tag_remove('brace', '1.0', 'end')
+ self.grammarbox.tag_add('hangindent', '1.0', 'end')
+ for lineno, line in enumerate(grammar.split('\n')):
if not line.strip():
continue
- m = re.match(r"(\\.|[^#])*(#.*)?", line)
+ m = re.match(r'(\\.|[^#])*(#.*)?', line)
comment_start = None
if m.group(2):
comment_start = m.start(2)
- s = "%d.%d" % (lineno + 1, m.start(2))
- e = "%d.%d" % (lineno + 1, m.end(2))
- self.grammarbox.tag_add("comment", s, e)
- for m in re.finditer("[<>{}]", line):
+ s = '%d.%d' % (lineno + 1, m.start(2))
+ e = '%d.%d' % (lineno + 1, m.end(2))
+ self.grammarbox.tag_add('comment', s, e)
+ for m in re.finditer('[<>{}]', line):
if comment_start is not None and m.start() >= comment_start:
break
- s = "%d.%d" % (lineno + 1, m.start())
- e = "%d.%d" % (lineno + 1, m.end())
- if m.group() in "<>":
- self.grammarbox.tag_add("angle", s, e)
+ s = '%d.%d' % (lineno + 1, m.start())
+ e = '%d.%d' % (lineno + 1, m.end())
+ if m.group() in '<>':
+ self.grammarbox.tag_add('angle', s, e)
else:
- self.grammarbox.tag_add("brace", s, e)
+ self.grammarbox.tag_add('brace', s, e)
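# For example (illustrative), in a grammar line like
#
#     NP: {<DT>?<JJ>*<NN>}    # noun phrases
#
# the '<' and '>' characters get the 'angle' tag (blue), '{' and '}' get the
# 'brace' tag (green), and everything from '#' onward gets the 'comment' tag,
# using the colours configured for the grammarbox above.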
def _grammarcheck(self, grammar):
if self.top is None:
return
- self.grammarbox.tag_remove("error", "1.0", "end")
+ self.grammarbox.tag_remove('error', '1.0', 'end')
self._grammarcheck_errs = []
- for lineno, line in enumerate(grammar.split("\n")):
- line = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", line)
+ for lineno, line in enumerate(grammar.split('\n')):
+ line = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', line)
line = line.strip()
if line:
try:
RegexpChunkRule.fromstring(line)
except ValueError as e:
self.grammarbox.tag_add(
- "error", "%s.0" % (lineno + 1), "%s.0 lineend" % (lineno + 1)
+ 'error', '%s.0' % (lineno + 1), '%s.0 lineend' % (lineno + 1)
)
- self.status["text"] = ""
+ self.status['text'] = ''
def update(self, *event):
# Record when update was called (for grammarcheck)
self._last_keypress = time.time()
# Read the grammar from the Text box.
- self.grammar = grammar = self.grammarbox.get("1.0", "end")
+ self.grammar = grammar = self.grammarbox.get('1.0', 'end')
# If the grammar hasn't changed, do nothing:
normalized_grammar = self.normalize_grammar(grammar)
# If the grammar has changed, and we're looking at history,
# then stop looking at history.
if self._history_index < len(self._history) - 1:
- self.grammarlabel["text"] = "Grammar:"
+ self.grammarlabel['text'] = 'Grammar:'
self._syntax_highlight_grammar(grammar)
if normalized_grammar:
rules = [
RegexpChunkRule.fromstring(line)
- for line in normalized_grammar.split("\n")
+ for line in normalized_grammar.split('\n')
]
else:
rules = []
return
self.chunker = RegexpChunkParser(rules)
- self.grammarbox.tag_remove("error", "1.0", "end")
+ self.grammarbox.tag_remove('error', '1.0', 'end')
self.grammar_changed = time.time()
# Display the results
if self._showing_trace:
if sample is None:
sample = self.devset[self.devset_index : self.devset_index + 1]
- self.devsetbox.tag_remove("true-pos", "1.0", "end")
- self.devsetbox.tag_remove("false-neg", "1.0", "end")
- self.devsetbox.tag_remove("false-pos", "1.0", "end")
+ self.devsetbox.tag_remove('true-pos', '1.0', 'end')
+ self.devsetbox.tag_remove('false-neg', '1.0', 'end')
+ self.devsetbox.tag_remove('false-pos', '1.0', 'end')
# Run the grammar on the test cases.
for sentnum, gold_tree in enumerate(sample):
test_chunks = self._chunks(test_tree)
# Compare them.
for chunk in gold_chunks.intersection(test_chunks):
- self._color_chunk(sentnum, chunk, "true-pos")
+ self._color_chunk(sentnum, chunk, 'true-pos')
for chunk in gold_chunks - test_chunks:
- self._color_chunk(sentnum, chunk, "false-neg")
+ self._color_chunk(sentnum, chunk, 'false-neg')
for chunk in test_chunks - gold_chunks:
- self._color_chunk(sentnum, chunk, "false-pos")
+ self._color_chunk(sentnum, chunk, 'false-pos')
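# For instance (illustrative values), if the gold chunks of a sentence are
# {(0, 2), (3, 5)} and the chunker produced {(0, 2), (4, 5)}, then (0, 2) is
# highlighted as a true positive, (3, 5) as a false negative (missed), and
# (4, 5) as a false positive (spurious); chunks are (start, end) word indices.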
def _chunkparse(self, words):
try:
# There's an error somewhere in the grammar, but we're not sure
# exactly where, so just mark the whole grammar as bad.
# E.g., this is caused by: "({<NN>})"
- self.grammarbox.tag_add("error", "1.0", "end")
+ self.grammarbox.tag_add('error', '1.0', 'end')
# Treat it as tagging nothing:
return words
start, end = chunk
self.devsetbox.tag_add(
tag,
- "%s.%s" % (self.linenum[sentnum], self.charnum[sentnum, start]),
- "%s.%s" % (self.linenum[sentnum], self.charnum[sentnum, end] - 1),
+ '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, start]),
+ '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, end] - 1),
)
def reset(self):
self._history = []
self._history_index = 0
# Update the on-screen display.
- self.grammarbox.delete("1.0", "end")
+ self.grammarbox.delete('1.0', 'end')
self.show_devset(0)
self.update()
# self._eval_plot()
SAVE_GRAMMAR_TEMPLATE = (
- "# Regexp Chunk Parsing Grammar\n"
- "# Saved %(date)s\n"
- "#\n"
- "# Development set: %(devset)s\n"
- "# Precision: %(precision)s\n"
- "# Recall: %(recall)s\n"
- "# F-score: %(fscore)s\n\n"
- "%(grammar)s\n"
+ '# Regexp Chunk Parsing Grammar\n'
+ '# Saved %(date)s\n'
+ '#\n'
+ '# Development set: %(devset)s\n'
+ '# Precision: %(precision)s\n'
+ '# Recall: %(recall)s\n'
+ '# F-score: %(fscore)s\n\n'
+ '%(grammar)s\n'
)
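# When save_grammar() below fills in this template, the saved file starts with
# a header along these lines (values illustrative only):
#
#     # Regexp Chunk Parsing Grammar
#     # Saved Mon Jan  7 12:00:00 2019
#     #
#     # Development set: conll2000
#     # Precision: 81.23%
#     # Recall: 78.90%
#     # F-score: 80.05%
#
# followed by the grammar text itself.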
def save_grammar(self, filename=None):
if not filename:
- ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")]
- filename = asksaveasfilename(filetypes=ftypes, defaultextension=".chunk")
+ ftypes = [('Chunk Grammar', '.chunk'), ('All files', '*')]
+ filename = asksaveasfilename(filetypes=ftypes, defaultextension='.chunk')
if not filename:
return
if self._history and self.normalized_grammar == self.normalize_grammar(
self._history[-1][0]
):
precision, recall, fscore = [
- "%.2f%%" % (100 * v) for v in self._history[-1][1:]
+ '%.2f%%' % (100 * v) for v in self._history[-1][1:]
]
elif self.chunker is None:
- precision = recall = fscore = "Grammar not well formed"
+ precision = recall = fscore = 'Grammar not well formed'
else:
- precision = recall = fscore = "Not finished evaluation yet"
+ precision = recall = fscore = 'Not finished evaluation yet'
- with open(filename, "w") as outfile:
+ with open(filename, 'w') as outfile:
outfile.write(
self.SAVE_GRAMMAR_TEMPLATE
% dict(
def load_grammar(self, filename=None):
if not filename:
- ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")]
- filename = askopenfilename(filetypes=ftypes, defaultextension=".chunk")
+ ftypes = [('Chunk Grammar', '.chunk'), ('All files', '*')]
+ filename = askopenfilename(filetypes=ftypes, defaultextension='.chunk')
if not filename:
return
- self.grammarbox.delete("1.0", "end")
+ self.grammarbox.delete('1.0', 'end')
self.update()
- with open(filename, "r") as infile:
+ with open(filename, 'r') as infile:
grammar = infile.read()
grammar = re.sub(
- "^\# Regexp Chunk Parsing Grammar[\s\S]*" "F-score:.*\n", "", grammar
+ '^\# Regexp Chunk Parsing Grammar[\s\S]*' 'F-score:.*\n', '', grammar
).lstrip()
- self.grammarbox.insert("1.0", grammar)
+ self.grammarbox.insert('1.0', grammar)
self.update()
def save_history(self, filename=None):
if not filename:
- ftypes = [("Chunk Gramamr History", ".txt"), ("All files", "*")]
- filename = asksaveasfilename(filetypes=ftypes, defaultextension=".txt")
+ ftypes = [('Chunk Grammar History', '.txt'), ('All files', '*')]
+ filename = asksaveasfilename(filetypes=ftypes, defaultextension='.txt')
if not filename:
return
- with open(filename, "w") as outfile:
- outfile.write("# Regexp Chunk Parsing Grammar History\n")
- outfile.write("# Saved %s\n" % time.ctime())
- outfile.write("# Development set: %s\n" % self.devset_name)
+ with open(filename, 'w') as outfile:
+ outfile.write('# Regexp Chunk Parsing Grammar History\n')
+ outfile.write('# Saved %s\n' % time.ctime())
+ outfile.write('# Development set: %s\n' % self.devset_name)
for i, (g, p, r, f) in enumerate(self._history):
hdr = (
- "Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, "
- "fscore=%.2f%%)"
+ 'Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, '
+ 'fscore=%.2f%%)'
% (i + 1, len(self._history), p * 100, r * 100, f * 100)
)
- outfile.write("\n%s\n" % hdr)
- outfile.write("".join(" %s\n" % line for line in g.strip().split()))
+ outfile.write('\n%s\n' % hdr)
+ outfile.write(''.join(' %s\n' % line for line in g.strip().split()))
if not (
self._history
== self.normalize_grammar(self._history[-1][0])
):
if self.chunker is None:
- outfile.write("\nCurrent Grammar (not well-formed)\n")
+ outfile.write('\nCurrent Grammar (not well-formed)\n')
else:
- outfile.write("\nCurrent Grammar (not evaluated)\n")
+ outfile.write('\nCurrent Grammar (not evaluated)\n')
outfile.write(
- "".join(" %s\n" % line for line in self.grammar.strip().split())
+ ''.join(' %s\n' % line for line in self.grammar.strip().split())
)
def about(self, *e):
ABOUT = "NLTK RegExp Chunk Parser Application\n" + "Written by Edward Loper"
- TITLE = "About: Regular Expression Chunk Parser Application"
+ TITLE = 'About: Regular Expression Chunk Parser Application'
try:
- from tkinter.messagebox import Message
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except:
RegexpChunkApp().mainloop()
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
# Natural Language Toolkit: Collocations Application
# Much of the GUI code is imported from concordance.py; we intend to merge these tools together.
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
+from __future__ import division
+
import threading
-import queue as q
-from tkinter.font import Font
-from tkinter import (
+from six.moves import queue as q
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (
Button,
END,
Frame,
from nltk.probability import FreqDist
-CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
-ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
+CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
+ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'
POLL_INTERVAL = 100
-_DEFAULT = "English: Brown Corpus (Humor)"
+_DEFAULT = 'English: Brown Corpus (Humor)'
_CORPORA = {
- "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
- "English: Brown Corpus": lambda: brown.words(),
- "English: Brown Corpus (Press)": lambda: brown.words(
- categories=["news", "editorial", "reviews"]
+ 'Catalan: CESS-CAT Corpus': lambda: cess_cat.words(),
+ 'English: Brown Corpus': lambda: brown.words(),
+ 'English: Brown Corpus (Press)': lambda: brown.words(
+ categories=['news', 'editorial', 'reviews']
),
- "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
- "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
- "English: Brown Corpus (Science Fiction)": lambda: brown.words(
- categories="science_fiction"
+ 'English: Brown Corpus (Religion)': lambda: brown.words(categories='religion'),
+ 'English: Brown Corpus (Learned)': lambda: brown.words(categories='learned'),
+ 'English: Brown Corpus (Science Fiction)': lambda: brown.words(
+ categories='science_fiction'
),
- "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
- "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
- "English: NPS Chat Corpus": lambda: nps_chat.words(),
- "English: Wall Street Journal Corpus": lambda: treebank.words(),
- "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
- "Dutch: Alpino Corpus": lambda: alpino.words(),
- "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
- "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
- "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
- "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
- "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
+ 'English: Brown Corpus (Romance)': lambda: brown.words(categories='romance'),
+ 'English: Brown Corpus (Humor)': lambda: brown.words(categories='humor'),
+ 'English: NPS Chat Corpus': lambda: nps_chat.words(),
+ 'English: Wall Street Journal Corpus': lambda: treebank.words(),
+ 'Chinese: Sinica Corpus': lambda: sinica_treebank.words(),
+ 'Dutch: Alpino Corpus': lambda: alpino.words(),
+ 'Hindi: Indian Languages Corpus': lambda: indian.words(files='hindi.pos'),
+ 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.words(),
+ 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.words(),
+ 'Portuguese: Machado Corpus (Brazil)': lambda: machado.words(),
+ 'Spanish: CESS-ESP Corpus': lambda: cess_esp.words(),
}
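# Each corpus is wrapped in a zero-argument lambda so nothing is read at import
# time; a corpus is only loaded when its entry is called, e.g. (illustrative):
#
#     words = _CORPORA['English: Brown Corpus (Humor)']()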
class CollocationsView:
- _BACKGROUND_COLOUR = "#FFF" # white
+ _BACKGROUND_COLOUR = '#FFF' # white
def __init__(self):
self.queue = q.Queue()
self.after = self.top.after(POLL_INTERVAL, self._poll)
def _init_top(self, top):
- top.geometry("550x650+50+50")
- top.title("NLTK Collocations List")
- top.bind("<Control-q>", self.destroy)
- top.protocol("WM_DELETE_WINDOW", self.destroy)
+ top.geometry('550x650+50+50')
+ top.title('NLTK Collocations List')
+ top.bind('<Control-q>', self.destroy)
+ top.protocol('WM_DELETE_WINDOW', self.destroy)
top.minsize(550, 650)
def _init_widgets(self, parent):
self._init_results_box(self.main_frame)
self._init_paging(self.main_frame)
self._init_status(self.main_frame)
- self.main_frame.pack(fill="both", expand=True)
+ self.main_frame.pack(fill='both', expand=True)
def _init_corpus_select(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
Label(
innerframe,
justify=LEFT,
- text=" Corpus: ",
+ text=' Corpus: ',
background=self._BACKGROUND_COLOUR,
padx=2,
pady=1,
border=0,
- ).pack(side="left")
+ ).pack(side='left')
other_corpora = list(self.model.CORPORA.keys()).remove(
self.model.DEFAULT_CORPUS
command=self.corpus_selected,
*self.model.non_default_corpora()
)
- om["borderwidth"] = 0
- om["highlightthickness"] = 1
- om.pack(side="left")
- innerframe.pack(side="top", fill="x", anchor="n")
+ om['borderwidth'] = 0
+ om['highlightthickness'] = 1
+ om.pack(side='left')
+ innerframe.pack(side='top', fill='x', anchor='n')
def _init_status(self, parent):
self.status = Label(
padx=1,
pady=0,
)
- self.status.pack(side="top", anchor="sw")
+ self.status.pack(side='top', anchor='sw')
def _init_menubar(self):
self._result_size = IntVar(self.top)
filemenu = Menu(menubar, tearoff=0, borderwidth=0)
filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
+ label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-q'
)
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
editmenu = Menu(menubar, tearoff=0)
rescntmenu = Menu(editmenu, tearoff=0)
rescntmenu.add_radiobutton(
- label="20",
+ label='20',
variable=self._result_size,
underline=0,
value=20,
command=self.set_result_size,
)
rescntmenu.add_radiobutton(
- label="50",
+ label='50',
variable=self._result_size,
underline=0,
value=50,
command=self.set_result_size,
)
rescntmenu.add_radiobutton(
- label="100",
+ label='100',
variable=self._result_size,
underline=0,
value=100,
command=self.set_result_size,
)
rescntmenu.invoke(1)
- editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
+ editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu)
- menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
+ menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
self.top.config(menu=menubar)
def set_result_size(self, **kwargs):
i1 = Frame(innerframe)
i2 = Frame(innerframe)
vscrollbar = Scrollbar(i1, borderwidth=1)
- hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
+ hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
self.results_box = Text(
i1,
- font=Font(family="courier", size="16"),
- state="disabled",
+ font=Font(family='courier', size='16'),
+ state='disabled',
borderwidth=1,
yscrollcommand=vscrollbar.set,
xscrollcommand=hscrollbar.set,
- wrap="none",
- width="40",
- height="20",
+ wrap='none',
+ width='40',
+ height='20',
exportselection=1,
)
- self.results_box.pack(side="left", fill="both", expand=True)
- vscrollbar.pack(side="left", fill="y", anchor="e")
+ self.results_box.pack(side='left', fill='both', expand=True)
+ vscrollbar.pack(side='left', fill='y', anchor='e')
vscrollbar.config(command=self.results_box.yview)
- hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
+ hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
hscrollbar.config(command=self.results_box.xview)
# There is no other way of avoiding the overlap of scrollbars while using the pack layout manager.
- Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack(
- side="left", anchor="e"
+ Label(i2, text=' ', background=self._BACKGROUND_COLOUR).pack(
+ side='left', anchor='e'
)
- i1.pack(side="top", fill="both", expand=True, anchor="n")
- i2.pack(side="bottom", fill="x", anchor="s")
- innerframe.pack(side="top", fill="both", expand=True)
+ i1.pack(side='top', fill='both', expand=True, anchor='n')
+ i2.pack(side='bottom', fill='x', anchor='s')
+ innerframe.pack(side='top', fill='both', expand=True)
def _init_paging(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
self.prev = prev = Button(
innerframe,
- text="Previous",
+ text='Previous',
command=self.previous,
- width="10",
+ width='10',
borderwidth=1,
highlightthickness=1,
- state="disabled",
+ state='disabled',
)
- prev.pack(side="left", anchor="center")
+ prev.pack(side='left', anchor='center')
self.next = next = Button(
innerframe,
- text="Next",
+ text='Next',
command=self.__next__,
- width="10",
+ width='10',
borderwidth=1,
highlightthickness=1,
- state="disabled",
+ state='disabled',
)
- next.pack(side="right", anchor="center")
- innerframe.pack(side="top", fill="y")
+ next.pack(side='right', anchor='center')
+ innerframe.pack(side='top', fill='y')
self.reset_current_page()
def reset_current_page(self):
self.after = self.top.after(POLL_INTERVAL, self._poll)
def handle_error_loading_corpus(self, event):
- self.status["text"] = "Error in loading " + self.var.get()
+ self.status['text'] = 'Error in loading ' + self.var.get()
self.unfreeze_editable()
self.clear_results_box()
self.freeze_editable()
self.reset_current_page()
def handle_corpus_loaded(self, event):
- self.status["text"] = self.var.get() + " is loaded"
+ self.status['text'] = self.var.get() + ' is loaded'
self.unfreeze_editable()
self.clear_results_box()
self.reset_current_page()
def load_corpus(self, selection):
if self.model.selected_corpus != selection:
- self.status["text"] = "Loading " + selection + "..."
+ self.status['text'] = 'Loading ' + selection + '...'
self.freeze_editable()
self.model.load_corpus(selection)
def freeze_editable(self):
- self.prev["state"] = "disabled"
- self.next["state"] = "disabled"
+ self.prev['state'] = 'disabled'
+ self.next['state'] = 'disabled'
def clear_results_box(self):
- self.results_box["state"] = "normal"
+ self.results_box['state'] = 'normal'
self.results_box.delete("1.0", END)
- self.results_box["state"] = "disabled"
+ self.results_box['state'] = 'disabled'
def fire_event(self, event):
# Firing an event so that rendering of widgets happen in the mainloop thread
- self.top.event_generate(event, when="tail")
+ self.top.event_generate(event, when='tail')
def destroy(self, *e):
if self.top is None:
def set_paging_button_states(self):
if self.current_page == -1 or self.current_page == 0:
- self.prev["state"] = "disabled"
+ self.prev['state'] = 'disabled'
else:
- self.prev["state"] = "normal"
+ self.prev['state'] = 'normal'
if self.model.is_last_page(self.current_page):
- self.next["state"] = "disabled"
+ self.next['state'] = 'disabled'
else:
- self.next["state"] = "normal"
+ self.next['state'] = 'normal'
def write_results(self, results):
- self.results_box["state"] = "normal"
+ self.results_box['state'] = 'normal'
row = 1
for each in results:
- self.results_box.insert(str(row) + ".0", each[0] + " " + each[1] + "\n")
+ self.results_box.insert(str(row) + '.0', each[0] + " " + each[1] + "\n")
row += 1
- self.results_box["state"] = "disabled"
+ self.results_box['state'] = 'disabled'
class CollocationsModel:
c.mainloop()
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
# Natural Language Toolkit: Concordance Application
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import re
import threading
-import queue as q
-from tkinter.font import Font
-from tkinter import (
+from six.moves import queue as q
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (
Tk,
Button,
END,
Text,
)
+import nltk.compat
from nltk.corpus import (
cess_cat,
brown,
from nltk.util import in_idle
from nltk.draw.util import ShowText
-WORD_OR_TAG = "[^/ ]+"
-BOUNDARY = r"\b"
+WORD_OR_TAG = '[^/ ]+'
+BOUNDARY = r'\b'
-CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
-SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
-SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
-ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
+CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
+SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>'
+SEARCH_ERROR_EVENT = '<<SE_EVENT>>'
+ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'
POLL_INTERVAL = 50
# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.
-_DEFAULT = "English: Brown Corpus (Humor, simplified)"
+_DEFAULT = 'English: Brown Corpus (Humor, simplified)'
_CORPORA = {
- "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(
- tagset="universal"
+ 'Catalan: CESS-CAT Corpus (simplified)': lambda: cess_cat.tagged_sents(
+ tagset='universal'
),
- "English: Brown Corpus": lambda: brown.tagged_sents(),
- "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(
- tagset="universal"
+ 'English: Brown Corpus': lambda: brown.tagged_sents(),
+ 'English: Brown Corpus (simplified)': lambda: brown.tagged_sents(
+ tagset='universal'
),
- "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
- categories=["news", "editorial", "reviews"], tagset="universal"
+ 'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(
+ categories=['news', 'editorial', 'reviews'], tagset='universal'
),
- "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(
- categories="religion", tagset="universal"
+ 'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(
+ categories='religion', tagset='universal'
),
- "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(
- categories="learned", tagset="universal"
+ 'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(
+ categories='learned', tagset='universal'
),
- "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
- categories="science_fiction", tagset="universal"
+ 'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(
+ categories='science_fiction', tagset='universal'
),
- "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(
- categories="romance", tagset="universal"
+ 'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(
+ categories='romance', tagset='universal'
),
- "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(
- categories="humor", tagset="universal"
+ 'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(
+ categories='humor', tagset='universal'
),
- "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
- "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(
- tagset="universal"
+ 'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(),
+ 'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(
+ tagset='universal'
),
- "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
- "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
- tagset="universal"
+ 'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(),
+ 'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(
+ tagset='universal'
),
- "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
- "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
- tagset="universal"
+ 'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(),
+ 'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(
+ tagset='universal'
),
- "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
- "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
- tagset="universal"
+ 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(),
+ 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(
+ tagset='universal'
),
- "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
- "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(
- files="hindi.pos", tagset="universal"
+ 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'),
+ 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(
+ files='hindi.pos', tagset='universal'
),
- "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
- "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(
- tagset="universal"
+ 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(),
+ 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(
+ tagset='universal'
),
- "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
- "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(
- tagset="universal"
+ 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(),
+ 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(
+ tagset='universal'
),
- "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(
- tagset="universal"
+ 'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(
+ tagset='universal'
),
}
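# A minimal illustration of the lazy-loading convention above (variable names
# below are hypothetical): every entry maps a display name to a zero-argument
# callable, so no corpus is read at import time; a corpus is only materialised
# once it is selected, roughly:
#
#     tagged = _CORPORA[_DEFAULT]()   # the lambda triggers the actual read
#     first_sentence = tagged[0]      # a list of (word, tag) pairs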
class ConcordanceSearchView(object):
- _BACKGROUND_COLOUR = "#FFF" # white
+ _BACKGROUND_COLOUR = '#FFF' # white
# Colour of highlighted results
- _HIGHLIGHT_WORD_COLOUR = "#F00" # red
- _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"
+ _HIGHLIGHT_WORD_COLOUR = '#F00' # red
+ _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG'
- _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0" # dark grey
- _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG"
+ _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0' # dark grey
+ _HIGHLIGHT_LABEL_TAG = 'HL_LBL_TAG'
    # Fraction of the text kept to the left of the horizontal scroll position
_FRACTION_LEFT_TEXT = 0.30
self.after = self.top.after(POLL_INTERVAL, self._poll)
def _init_top(self, top):
- top.geometry("950x680+50+50")
- top.title("NLTK Concordance Search")
- top.bind("<Control-q>", self.destroy)
- top.protocol("WM_DELETE_WINDOW", self.destroy)
+ top.geometry('950x680+50+50')
+ top.title('NLTK Concordance Search')
+ top.bind('<Control-q>', self.destroy)
+ top.protocol('WM_DELETE_WINDOW', self.destroy)
top.minsize(950, 680)
def _init_widgets(self, parent):
self._init_results_box(self.main_frame)
self._init_paging(self.main_frame)
self._init_status(self.main_frame)
- self.main_frame.pack(fill="both", expand=True)
+ self.main_frame.pack(fill='both', expand=True)
def _init_menubar(self):
self._result_size = IntVar(self.top)
filemenu = Menu(menubar, tearoff=0, borderwidth=0)
filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
+ label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-q'
)
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
editmenu = Menu(menubar, tearoff=0)
rescntmenu = Menu(editmenu, tearoff=0)
rescntmenu.add_radiobutton(
- label="20",
+ label='20',
variable=self._result_size,
underline=0,
value=20,
command=self.set_result_size,
)
rescntmenu.add_radiobutton(
- label="50",
+ label='50',
variable=self._result_size,
underline=0,
value=50,
command=self.set_result_size,
)
rescntmenu.add_radiobutton(
- label="100",
+ label='100',
variable=self._result_size,
underline=0,
value=100,
command=self.set_result_size,
)
rescntmenu.invoke(1)
- editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
+ editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu)
cntxmenu = Menu(editmenu, tearoff=0)
cntxbfmenu = Menu(cntxmenu, tearoff=0)
cntxbfmenu.add_radiobutton(
- label="60 characters",
+ label='60 characters',
variable=self._cntx_bf_len,
underline=0,
value=60,
command=self.set_cntx_bf_len,
)
cntxbfmenu.add_radiobutton(
- label="80 characters",
+ label='80 characters',
variable=self._cntx_bf_len,
underline=0,
value=80,
command=self.set_cntx_bf_len,
)
cntxbfmenu.add_radiobutton(
- label="100 characters",
+ label='100 characters',
variable=self._cntx_bf_len,
underline=0,
value=100,
command=self.set_cntx_bf_len,
)
cntxbfmenu.invoke(1)
- cntxmenu.add_cascade(label="Before", underline=0, menu=cntxbfmenu)
+ cntxmenu.add_cascade(label='Before', underline=0, menu=cntxbfmenu)
cntxafmenu = Menu(cntxmenu, tearoff=0)
cntxafmenu.add_radiobutton(
- label="70 characters",
+ label='70 characters',
variable=self._cntx_af_len,
underline=0,
value=70,
command=self.set_cntx_af_len,
)
cntxafmenu.add_radiobutton(
- label="90 characters",
+ label='90 characters',
variable=self._cntx_af_len,
underline=0,
value=90,
command=self.set_cntx_af_len,
)
cntxafmenu.add_radiobutton(
- label="110 characters",
+ label='110 characters',
variable=self._cntx_af_len,
underline=0,
value=110,
command=self.set_cntx_af_len,
)
cntxafmenu.invoke(1)
- cntxmenu.add_cascade(label="After", underline=0, menu=cntxafmenu)
+ cntxmenu.add_cascade(label='After', underline=0, menu=cntxafmenu)
- editmenu.add_cascade(label="Context", underline=0, menu=cntxmenu)
+ editmenu.add_cascade(label='Context', underline=0, menu=cntxmenu)
- menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
+ menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
self.top.config(menu=menubar)
Label(
innerframe,
justify=LEFT,
- text=" Corpus: ",
+ text=' Corpus: ',
background=self._BACKGROUND_COLOUR,
padx=2,
pady=1,
border=0,
- ).pack(side="left")
+ ).pack(side='left')
other_corpora = list(self.model.CORPORA.keys()).remove(
self.model.DEFAULT_CORPUS
command=self.corpus_selected,
*self.model.non_default_corpora()
)
- om["borderwidth"] = 0
- om["highlightthickness"] = 1
- om.pack(side="left")
- innerframe.pack(side="top", fill="x", anchor="n")
+ om['borderwidth'] = 0
+ om['highlightthickness'] = 1
+ om.pack(side='left')
+ innerframe.pack(side='top', fill='x', anchor='n')
def _init_status(self, parent):
self.status = Label(
padx=1,
pady=0,
)
- self.status.pack(side="top", anchor="sw")
+ self.status.pack(side='top', anchor='sw')
def _init_query_box(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
another = Frame(innerframe, background=self._BACKGROUND_COLOUR)
self.query_box = Entry(another, width=60)
- self.query_box.pack(side="left", fill="x", pady=25, anchor="center")
+ self.query_box.pack(side='left', fill='x', pady=25, anchor='center')
self.search_button = Button(
another,
- text="Search",
+ text='Search',
command=self.search,
borderwidth=1,
highlightthickness=1,
)
- self.search_button.pack(side="left", fill="x", pady=25, anchor="center")
- self.query_box.bind("<KeyPress-Return>", self.search_enter_keypress_handler)
+ self.search_button.pack(side='left', fill='x', pady=25, anchor='center')
+ self.query_box.bind('<KeyPress-Return>', self.search_enter_keypress_handler)
another.pack()
- innerframe.pack(side="top", fill="x", anchor="n")
+ innerframe.pack(side='top', fill='x', anchor='n')
def search_enter_keypress_handler(self, *event):
self.search()
i1 = Frame(innerframe)
i2 = Frame(innerframe)
vscrollbar = Scrollbar(i1, borderwidth=1)
- hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
+ hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
self.results_box = Text(
i1,
- font=Font(family="courier", size="16"),
- state="disabled",
+ font=Font(family='courier', size='16'),
+ state='disabled',
borderwidth=1,
yscrollcommand=vscrollbar.set,
xscrollcommand=hscrollbar.set,
- wrap="none",
- width="40",
- height="20",
+ wrap='none',
+ width='40',
+ height='20',
exportselection=1,
)
- self.results_box.pack(side="left", fill="both", expand=True)
+ self.results_box.pack(side='left', fill='both', expand=True)
self.results_box.tag_config(
self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR
)
self.results_box.tag_config(
self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR
)
- vscrollbar.pack(side="left", fill="y", anchor="e")
+ vscrollbar.pack(side='left', fill='y', anchor='e')
vscrollbar.config(command=self.results_box.yview)
- hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
+ hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
hscrollbar.config(command=self.results_box.xview)
        # There is no other way to keep the scrollbars from overlapping under the
        # pack layout manager, so a one-space Label is packed beside the
        # horizontal scrollbar as a spacer.
- Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack(
- side="left", anchor="e"
+ Label(i2, text=' ', background=self._BACKGROUND_COLOUR).pack(
+ side='left', anchor='e'
)
- i1.pack(side="top", fill="both", expand=True, anchor="n")
- i2.pack(side="bottom", fill="x", anchor="s")
- innerframe.pack(side="top", fill="both", expand=True)
+ i1.pack(side='top', fill='both', expand=True, anchor='n')
+ i2.pack(side='bottom', fill='x', anchor='s')
+ innerframe.pack(side='top', fill='both', expand=True)
def _init_paging(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
self.prev = prev = Button(
innerframe,
- text="Previous",
+ text='Previous',
command=self.previous,
- width="10",
+ width='10',
borderwidth=1,
highlightthickness=1,
- state="disabled",
+ state='disabled',
)
- prev.pack(side="left", anchor="center")
+ prev.pack(side='left', anchor='center')
self.next = next = Button(
innerframe,
- text="Next",
+ text='Next',
command=self.__next__,
- width="10",
+ width='10',
borderwidth=1,
highlightthickness=1,
- state="disabled",
+ state='disabled',
)
- next.pack(side="right", anchor="center")
- innerframe.pack(side="top", fill="y")
+ next.pack(side='right', anchor='center')
+ innerframe.pack(side='top', fill='y')
self.current_page = 0
def previous(self):
def about(self, *e):
ABOUT = "NLTK Concordance Search Demo\n"
- TITLE = "About: NLTK Concordance Search Demo"
+ TITLE = 'About: NLTK Concordance Search Demo'
try:
- from tkinter.messagebox import Message
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE, parent=self.main_frame).show()
except:
self.after = self.top.after(POLL_INTERVAL, self._poll)
def handle_error_loading_corpus(self, event):
- self.status["text"] = "Error in loading " + self.var.get()
+ self.status['text'] = 'Error in loading ' + self.var.get()
self.unfreeze_editable()
self.clear_all()
self.freeze_editable()
def handle_corpus_loaded(self, event):
- self.status["text"] = self.var.get() + " is loaded"
+ self.status['text'] = self.var.get() + ' is loaded'
self.unfreeze_editable()
self.clear_all()
self.query_box.focus_set()
        # TODO: refactor the model so that it is less state-sensitive
results = self.model.get_results()
self.write_results(results)
- self.status["text"] = ""
+ self.status['text'] = ''
if len(results) == 0:
- self.status["text"] = "No results found for " + self.model.query
+ self.status['text'] = 'No results found for ' + self.model.query
else:
self.current_page = self.model.last_requested_page
self.unfreeze_editable()
self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT)
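        # _FRACTION_LEFT_TEXT is 0.30, so this scrolls the results horizontally
        # until roughly 30% of each (padded) line lies off-screen to the left of
        # the view, which keeps the highlighted match near a fixed column.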
def handle_search_error(self, event):
- self.status["text"] = "Error in query " + self.model.query
+ self.status['text'] = 'Error in query ' + self.model.query
self.unfreeze_editable()
def corpus_selected(self, *args):
def load_corpus(self, selection):
if self.model.selected_corpus != selection:
- self.status["text"] = "Loading " + selection + "..."
+ self.status['text'] = 'Loading ' + selection + '...'
self.freeze_editable()
self.model.load_corpus(selection)
query = self.query_box.get()
if len(query.strip()) == 0:
return
- self.status["text"] = "Searching for " + query
+ self.status['text'] = 'Searching for ' + query
self.freeze_editable()
self.model.search(query, self.current_page + 1)
def write_results(self, results):
- self.results_box["state"] = "normal"
+ self.results_box['state'] = 'normal'
row = 1
for each in results:
sent, pos1, pos2 = each[0].strip(), each[1], each[2]
sent, pos1, pos2 = self.pad(sent, pos1, pos2)
sentence = sent[pos1 - self._char_before : pos1 + self._char_after]
if not row == len(results):
- sentence += "\n"
- self.results_box.insert(str(row) + ".0", sentence)
+ sentence += '\n'
+ self.results_box.insert(str(row) + '.0', sentence)
word_markers, label_markers = self.words_and_labels(sent, pos1, pos2)
for marker in word_markers:
self.results_box.tag_add(
self._HIGHLIGHT_WORD_TAG,
- str(row) + "." + str(marker[0]),
- str(row) + "." + str(marker[1]),
+ str(row) + '.' + str(marker[0]),
+ str(row) + '.' + str(marker[1]),
)
for marker in label_markers:
self.results_box.tag_add(
self._HIGHLIGHT_LABEL_TAG,
- str(row) + "." + str(marker[0]),
- str(row) + "." + str(marker[1]),
+ str(row) + '.' + str(marker[0]),
+ str(row) + '.' + str(marker[1]),
)
row += 1
- self.results_box["state"] = "disabled"
+ self.results_box['state'] = 'disabled'
def words_and_labels(self, sentence, pos1, pos2):
search_exp = sentence[pos1:pos2]
words, labels = [], []
- labeled_words = search_exp.split(" ")
+ labeled_words = search_exp.split(' ')
index = 0
for each in labeled_words:
- if each == "":
+ if each == '':
index += 1
else:
- word, label = each.split("/")
+ word, label = each.split('/')
words.append(
(self._char_before + index, self._char_before + index + len(word))
)
if hstart >= self._char_before:
return sent, hstart, hend
d = self._char_before - hstart
- sent = "".join([" "] * d) + sent
+ sent = ''.join([' '] * d) + sent
return sent, hstart + d, hend + d
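        # Worked example with hypothetical numbers: if self._char_before is 60
        # and the highlight starts at hstart = 10, then d = 50 spaces are
        # prepended, the highlight now begins at column 60, and every result
        # line ends up with the same amount of left context.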
def destroy(self, *e):
self.clear_results_box()
def clear_results_box(self):
- self.results_box["state"] = "normal"
+ self.results_box['state'] = 'normal'
self.results_box.delete("1.0", END)
- self.results_box["state"] = "disabled"
+ self.results_box['state'] = 'disabled'
def freeze_editable(self):
- self.query_box["state"] = "disabled"
- self.search_button["state"] = "disabled"
- self.prev["state"] = "disabled"
- self.next["state"] = "disabled"
+ self.query_box['state'] = 'disabled'
+ self.search_button['state'] = 'disabled'
+ self.prev['state'] = 'disabled'
+ self.next['state'] = 'disabled'
def unfreeze_editable(self):
- self.query_box["state"] = "normal"
- self.search_button["state"] = "normal"
+ self.query_box['state'] = 'normal'
+ self.search_button['state'] = 'normal'
self.set_paging_button_states()
def set_paging_button_states(self):
if self.current_page == 0 or self.current_page == 1:
- self.prev["state"] = "disabled"
+ self.prev['state'] = 'disabled'
else:
- self.prev["state"] = "normal"
+ self.prev['state'] = 'normal'
if self.model.has_more_pages(self.current_page):
- self.next["state"] = "normal"
+ self.next['state'] = 'normal'
else:
- self.next["state"] = "disabled"
+ self.next['state'] = 'disabled'
def fire_event(self, event):
        # Fire an event so that widget rendering happens in the mainloop thread
- self.top.event_generate(event, when="tail")
+ self.top.event_generate(event, when='tail')
def mainloop(self, *args, **kwargs):
if in_idle():
try:
ts = self.model.CORPORA[self.name]()
self.model.tagged_sents = [
- " ".join(w + "/" + t for (w, t) in sent) for sent in ts
+ ' '.join(w + '/' + t for (w, t) in sent) for sent in ts
]
self.model.queue.put(CORPUS_LOADED_EVENT)
except Exception as e:
def processed_query(self):
new = []
for term in self.model.query.split():
- term = re.sub(r"\.", r"[^/ ]", term)
- if re.match("[A-Z]+$", term):
- new.append(BOUNDARY + WORD_OR_TAG + "/" + term + BOUNDARY)
- elif "/" in term:
+ term = re.sub(r'\.', r'[^/ ]', term)
+ if re.match('[A-Z]+$', term):
+ new.append(BOUNDARY + WORD_OR_TAG + '/' + term + BOUNDARY)
+ elif '/' in term:
new.append(BOUNDARY + term + BOUNDARY)
else:
- new.append(BOUNDARY + term + "/" + WORD_OR_TAG + BOUNDARY)
- return " ".join(new)
+ new.append(BOUNDARY + term + '/' + WORD_OR_TAG + BOUNDARY)
+ return ' '.join(new)
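    # Worked example of the rewriting above (query text hypothetical): the query
    # "the ADJ man/NN" becomes
    #
    #     \bthe/[^/ ]+\b \b[^/ ]+/ADJ\b \bman/NN\b
    #
    # i.e. a plain word matches any tag, an all-caps term is treated as a tag
    # matching any word, a word/tag term is kept as-is, and any "." in a term is
    # first rewritten to the character class [^/ ].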
def app():
d.mainloop()
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
import re
import itertools
-from tkinter import (
+from six.moves.tkinter import (
Frame,
Label,
PhotoImage,
def __init__(self, image, initialField, initialText):
frm = Frame(root)
frm.config(background="white")
- self.image = PhotoImage(format="gif", data=images[image.upper()])
- self.imageDimmed = PhotoImage(format="gif", data=images[image])
+ self.image = PhotoImage(format='gif', data=images[image.upper()])
+ self.imageDimmed = PhotoImage(format='gif', data=images[image])
self.img = Label(frm)
self.img.config(borderwidth=0)
self.img.pack(side="left")
root.mainloop()
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
# Natural Language Toolkit: Recursive Descent Parser Application
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
[Ctrl-p]\t Print
[q]\t Quit
"""
+from __future__ import division
-from tkinter.font import Font
-from tkinter import Listbox, IntVar, Button, Frame, Label, Menu, Scrollbar, Tk
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import Listbox, IntVar, Button, Frame, Label, Menu, Scrollbar, Tk
from nltk.tree import Tree
from nltk.util import in_idle
# Set up the main window.
self._top = Tk()
- self._top.title("Recursive Descent Parser Application")
+ self._top.title('Recursive Descent Parser Application')
# Set up key bindings.
self._init_bindings()
self._parser.initialize(self._sent)
# Resize callback
- self._canvas.bind("<Configure>", self._configure)
+ self._canvas.bind('<Configure>', self._configure)
#########################################
## Initialization Helpers
        # What's our font size (default=same as sysfont)
self._size = IntVar(root)
- self._size.set(self._sysfont.cget("size"))
+ self._size.set(self._sysfont.cget('size'))
- self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
- self._font = Font(family="helvetica", size=self._size.get())
+ self._boldfont = Font(family='helvetica', weight='bold', size=self._size.get())
+ self._font = Font(family='helvetica', size=self._size.get())
if self._size.get() < 0:
big = self._size.get() - 2
else:
big = self._size.get() + 2
- self._bigfont = Font(family="helvetica", weight="bold", size=big)
+ self._bigfont = Font(family='helvetica', weight='bold', size=big)
def _init_grammar(self, parent):
# Grammar view.
self._prodframe = listframe = Frame(parent)
- self._prodframe.pack(fill="both", side="left", padx=2)
+ self._prodframe.pack(fill='both', side='left', padx=2)
self._prodlist_label = Label(
- self._prodframe, font=self._boldfont, text="Available Expansions"
+ self._prodframe, font=self._boldfont, text='Available Expansions'
)
self._prodlist_label.pack()
self._prodlist = Listbox(
self._prodframe,
- selectmode="single",
- relief="groove",
- background="white",
- foreground="#909090",
+ selectmode='single',
+ relief='groove',
+ background='white',
+ foreground='#909090',
font=self._font,
- selectforeground="#004040",
- selectbackground="#c0f0c0",
+ selectforeground='#004040',
+ selectbackground='#c0f0c0',
)
- self._prodlist.pack(side="right", fill="both", expand=1)
+ self._prodlist.pack(side='right', fill='both', expand=1)
self._productions = list(self._parser.grammar().productions())
for production in self._productions:
- self._prodlist.insert("end", (" %s" % production))
+ self._prodlist.insert('end', (' %s' % production))
self._prodlist.config(height=min(len(self._productions), 25))
# Add a scrollbar if there are more than 25 productions.
if len(self._productions) > 25:
- listscroll = Scrollbar(self._prodframe, orient="vertical")
+ listscroll = Scrollbar(self._prodframe, orient='vertical')
self._prodlist.config(yscrollcommand=listscroll.set)
listscroll.config(command=self._prodlist.yview)
- listscroll.pack(side="left", fill="y")
+ listscroll.pack(side='left', fill='y')
# If they select a production, apply it.
- self._prodlist.bind("<<ListboxSelect>>", self._prodlist_select)
+ self._prodlist.bind('<<ListboxSelect>>', self._prodlist_select)
def _init_bindings(self):
# Key bindings are a good thing.
- self._top.bind("<Control-q>", self.destroy)
- self._top.bind("<Control-x>", self.destroy)
- self._top.bind("<Escape>", self.destroy)
- self._top.bind("e", self.expand)
+ self._top.bind('<Control-q>', self.destroy)
+ self._top.bind('<Control-x>', self.destroy)
+ self._top.bind('<Escape>', self.destroy)
+ self._top.bind('e', self.expand)
# self._top.bind('<Alt-e>', self.expand)
# self._top.bind('<Control-e>', self.expand)
- self._top.bind("m", self.match)
- self._top.bind("<Alt-m>", self.match)
- self._top.bind("<Control-m>", self.match)
- self._top.bind("b", self.backtrack)
- self._top.bind("<Alt-b>", self.backtrack)
- self._top.bind("<Control-b>", self.backtrack)
- self._top.bind("<Control-z>", self.backtrack)
- self._top.bind("<BackSpace>", self.backtrack)
- self._top.bind("a", self.autostep)
+ self._top.bind('m', self.match)
+ self._top.bind('<Alt-m>', self.match)
+ self._top.bind('<Control-m>', self.match)
+ self._top.bind('b', self.backtrack)
+ self._top.bind('<Alt-b>', self.backtrack)
+ self._top.bind('<Control-b>', self.backtrack)
+ self._top.bind('<Control-z>', self.backtrack)
+ self._top.bind('<BackSpace>', self.backtrack)
+ self._top.bind('a', self.autostep)
# self._top.bind('<Control-a>', self.autostep)
- self._top.bind("<Control-space>", self.autostep)
- self._top.bind("<Control-c>", self.cancel_autostep)
- self._top.bind("<space>", self.step)
- self._top.bind("<Delete>", self.reset)
- self._top.bind("<Control-p>", self.postscript)
+ self._top.bind('<Control-space>', self.autostep)
+ self._top.bind('<Control-c>', self.cancel_autostep)
+ self._top.bind('<space>', self.step)
+ self._top.bind('<Delete>', self.reset)
+ self._top.bind('<Control-p>', self.postscript)
# self._top.bind('<h>', self.help)
# self._top.bind('<Alt-h>', self.help)
- self._top.bind("<Control-h>", self.help)
- self._top.bind("<F1>", self.help)
+ self._top.bind('<Control-h>', self.help)
+ self._top.bind('<F1>', self.help)
# self._top.bind('<g>', self.toggle_grammar)
# self._top.bind('<Alt-g>', self.toggle_grammar)
# self._top.bind('<Control-g>', self.toggle_grammar)
- self._top.bind("<Control-g>", self.edit_grammar)
- self._top.bind("<Control-t>", self.edit_sentence)
+ self._top.bind('<Control-g>', self.edit_grammar)
+ self._top.bind('<Control-t>', self.edit_sentence)
def _init_buttons(self, parent):
# Set up the frames.
self._buttonframe = buttonframe = Frame(parent)
- buttonframe.pack(fill="none", side="bottom", padx=3, pady=2)
+ buttonframe.pack(fill='none', side='bottom', padx=3, pady=2)
Button(
buttonframe,
- text="Step",
- background="#90c0d0",
- foreground="black",
+ text='Step',
+ background='#90c0d0',
+ foreground='black',
command=self.step,
- ).pack(side="left")
+ ).pack(side='left')
Button(
buttonframe,
- text="Autostep",
- background="#90c0d0",
- foreground="black",
+ text='Autostep',
+ background='#90c0d0',
+ foreground='black',
command=self.autostep,
- ).pack(side="left")
+ ).pack(side='left')
Button(
buttonframe,
- text="Expand",
+ text='Expand',
underline=0,
- background="#90f090",
- foreground="black",
+ background='#90f090',
+ foreground='black',
command=self.expand,
- ).pack(side="left")
+ ).pack(side='left')
Button(
buttonframe,
- text="Match",
+ text='Match',
underline=0,
- background="#90f090",
- foreground="black",
+ background='#90f090',
+ foreground='black',
command=self.match,
- ).pack(side="left")
+ ).pack(side='left')
Button(
buttonframe,
- text="Backtrack",
+ text='Backtrack',
underline=0,
- background="#f0a0a0",
- foreground="black",
+ background='#f0a0a0',
+ foreground='black',
command=self.backtrack,
- ).pack(side="left")
+ ).pack(side='left')
# Replace autostep...
# self._autostep_button = Button(buttonframe, text='Autostep',
self._autostep = 0
(x1, y1, x2, y2) = self._cframe.scrollregion()
y2 = event.height - 6
- self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2)
+ self._canvas['scrollregion'] = '%d %d %d %d' % (x1, y1, x2, y2)
self._redraw()
def _init_feedback(self, parent):
self._feedbackframe = feedbackframe = Frame(parent)
- feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3)
+ feedbackframe.pack(fill='x', side='bottom', padx=3, pady=3)
self._lastoper_label = Label(
- feedbackframe, text="Last Operation:", font=self._font
+ feedbackframe, text='Last Operation:', font=self._font
)
- self._lastoper_label.pack(side="left")
- lastoperframe = Frame(feedbackframe, relief="sunken", border=1)
- lastoperframe.pack(fill="x", side="right", expand=1, padx=5)
+ self._lastoper_label.pack(side='left')
+ lastoperframe = Frame(feedbackframe, relief='sunken', border=1)
+ lastoperframe.pack(fill='x', side='right', expand=1, padx=5)
self._lastoper1 = Label(
- lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font
+ lastoperframe, foreground='#007070', background='#f0f0f0', font=self._font
)
self._lastoper2 = Label(
lastoperframe,
- anchor="w",
+ anchor='w',
width=30,
- foreground="#004040",
- background="#f0f0f0",
+ foreground='#004040',
+ background='#f0f0f0',
font=self._font,
)
- self._lastoper1.pack(side="left")
- self._lastoper2.pack(side="left", fill="x", expand=1)
+ self._lastoper1.pack(side='left')
+ self._lastoper2.pack(side='left', fill='x', expand=1)
def _init_canvas(self, parent):
self._cframe = CanvasFrame(
parent,
- background="white",
+ background='white',
# width=525, height=250,
closeenough=10,
border=2,
- relief="sunken",
+ relief='sunken',
)
- self._cframe.pack(expand=1, fill="both", side="top", pady=2)
+ self._cframe.pack(expand=1, fill='both', side='top', pady=2)
canvas = self._canvas = self._cframe.canvas()
# Initially, there's no tree or text
filemenu = Menu(menubar, tearoff=0)
filemenu.add_command(
- label="Reset Parser", underline=0, command=self.reset, accelerator="Del"
+ label='Reset Parser', underline=0, command=self.reset, accelerator='Del'
)
filemenu.add_command(
- label="Print to Postscript",
+ label='Print to Postscript',
underline=0,
command=self.postscript,
- accelerator="Ctrl-p",
+ accelerator='Ctrl-p',
)
filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+ label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x'
)
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
editmenu = Menu(menubar, tearoff=0)
editmenu.add_command(
- label="Edit Grammar",
+ label='Edit Grammar',
underline=5,
command=self.edit_grammar,
- accelerator="Ctrl-g",
+ accelerator='Ctrl-g',
)
editmenu.add_command(
- label="Edit Text",
+ label='Edit Text',
underline=5,
command=self.edit_sentence,
- accelerator="Ctrl-t",
+ accelerator='Ctrl-t',
)
- menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
+ menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
rulemenu = Menu(menubar, tearoff=0)
rulemenu.add_command(
- label="Step", underline=1, command=self.step, accelerator="Space"
+ label='Step', underline=1, command=self.step, accelerator='Space'
)
rulemenu.add_separator()
rulemenu.add_command(
- label="Match", underline=0, command=self.match, accelerator="Ctrl-m"
+ label='Match', underline=0, command=self.match, accelerator='Ctrl-m'
)
rulemenu.add_command(
- label="Expand", underline=0, command=self.expand, accelerator="Ctrl-e"
+ label='Expand', underline=0, command=self.expand, accelerator='Ctrl-e'
)
rulemenu.add_separator()
rulemenu.add_command(
- label="Backtrack", underline=0, command=self.backtrack, accelerator="Ctrl-b"
+ label='Backtrack', underline=0, command=self.backtrack, accelerator='Ctrl-b'
)
- menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)
+ menubar.add_cascade(label='Apply', underline=0, menu=rulemenu)
viewmenu = Menu(menubar, tearoff=0)
viewmenu.add_checkbutton(
)
viewmenu.add_separator()
viewmenu.add_radiobutton(
- label="Tiny",
+ label='Tiny',
variable=self._size,
underline=0,
value=10,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Small",
+ label='Small',
variable=self._size,
underline=0,
value=12,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Medium",
+ label='Medium',
variable=self._size,
underline=0,
value=14,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Large",
+ label='Large',
variable=self._size,
underline=0,
value=18,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Huge",
+ label='Huge',
variable=self._size,
underline=0,
value=24,
command=self.resize,
)
- menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+ menubar.add_cascade(label='View', underline=0, menu=viewmenu)
animatemenu = Menu(menubar, tearoff=0)
animatemenu.add_radiobutton(
underline=0,
variable=self._animation_frames,
value=10,
- accelerator="-",
+ accelerator='-',
)
animatemenu.add_radiobutton(
label="Normal Animation",
underline=0,
variable=self._animation_frames,
value=5,
- accelerator="=",
+ accelerator='=',
)
animatemenu.add_radiobutton(
label="Fast Animation",
underline=0,
variable=self._animation_frames,
value=2,
- accelerator="+",
+ accelerator='+',
)
menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
helpmenu = Menu(menubar, tearoff=0)
- helpmenu.add_command(label="About", underline=0, command=self.about)
+ helpmenu.add_command(label='About', underline=0, command=self.about)
helpmenu.add_command(
- label="Instructions", underline=0, command=self.help, accelerator="F1"
+ label='Instructions', underline=0, command=self.help, accelerator='F1'
)
- menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+ menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
parent.config(menu=menubar)
self._canvas.delete(self._textline)
# Draw the tree.
- helv = ("helvetica", -self._size.get())
- bold = ("helvetica", -self._size.get(), "bold")
+ helv = ('helvetica', -self._size.get())
+ bold = ('helvetica', -self._size.get(), 'bold')
attribs = {
- "tree_color": "#000000",
- "tree_width": 2,
- "node_font": bold,
- "leaf_font": helv,
+ 'tree_color': '#000000',
+ 'tree_width': 2,
+ 'node_font': bold,
+ 'leaf_font': helv,
}
tree = self._parser.tree()
self._tree = tree_to_treesegment(canvas, tree, **attribs)
self._cframe.add_widget(self._tree, 30, 5)
# Draw the text.
- helv = ("helvetica", -self._size.get())
+ helv = ('helvetica', -self._size.get())
bottom = y = self._cframe.scrollregion()[3]
self._textwidgets = [
TextWidget(canvas, word, font=self._font) for word in self._sent
y = min(y, twidget.bbox()[1])
# Draw a line over the text, to separate it from the tree.
- self._textline = canvas.create_line(-5000, y - 5, 5000, y - 5, dash=".")
+ self._textline = canvas.create_line(-5000, y - 5, 5000, y - 5, dash='.')
# Highlight appropriate nodes.
self._highlight_nodes()
def _highlight_nodes(self):
# Highlight the list of nodes to be checked.
- bold = ("helvetica", -self._size.get(), "bold")
+ bold = ('helvetica', -self._size.get(), 'bold')
for treeloc in self._parser.frontier()[:1]:
- self._get(self._tree, treeloc)["color"] = "#20a050"
- self._get(self._tree, treeloc)["font"] = bold
+ self._get(self._tree, treeloc)['color'] = '#20a050'
+ self._get(self._tree, treeloc)['font'] = bold
for treeloc in self._parser.frontier()[1:]:
- self._get(self._tree, treeloc)["color"] = "#008080"
+ self._get(self._tree, treeloc)['color'] = '#008080'
def _highlight_prodlist(self):
# Highlight the productions that can be expanded.
# Boy, too bad tkinter doesn't implement Listbox.itemconfig;
# that would be pretty useful here.
- self._prodlist.delete(0, "end")
+ self._prodlist.delete(0, 'end')
expandable = self._parser.expandable_productions()
untried = self._parser.untried_expandable_productions()
productions = self._productions
for index in range(len(productions)):
if productions[index] in expandable:
if productions[index] in untried:
- self._prodlist.insert(index, " %s" % productions[index])
+ self._prodlist.insert(index, ' %s' % productions[index])
else:
- self._prodlist.insert(index, " %s (TRIED)" % productions[index])
+ self._prodlist.insert(index, ' %s (TRIED)' % productions[index])
self._prodlist.selection_set(index)
else:
- self._prodlist.insert(index, " %s" % productions[index])
+ self._prodlist.insert(index, ' %s' % productions[index])
def _position_text(self):
# Line up the text widgets that are matched against the tree
for i in range(0, len(leaves)):
widget = self._textwidgets[i]
leaf = leaves[i]
- widget["color"] = "#006040"
- leaf["color"] = "#006040"
+ widget['color'] = '#006040'
+ leaf['color'] = '#006040'
widget.move(leaf.bbox()[0] - widget.bbox()[0], 0)
xmax = widget.bbox()[2] + 10
# Line up the text widgets that are not matched against the tree.
for i in range(len(leaves), numwords):
widget = self._textwidgets[i]
- widget["color"] = "#a0a0a0"
+ widget['color'] = '#a0a0a0'
widget.move(xmax - widget.bbox()[0], 0)
xmax = widget.bbox()[2] + 10
# If we have a complete parse, make everything green :)
if self._parser.currently_complete():
for twidget in self._textwidgets:
- twidget["color"] = "#00a000"
+ twidget['color'] = '#00a000'
# Move the matched leaves down to the text.
for i in range(0, len(leaves)):
def reset(self, *e):
self._autostep = 0
self._parser.initialize(self._sent)
- self._lastoper1["text"] = "Reset Application"
- self._lastoper2["text"] = ""
+ self._lastoper1['text'] = 'Reset Application'
+ self._lastoper2['text'] = ''
self._redraw()
def autostep(self, *e):
elif self._backtrack():
pass
else:
- self._lastoper1["text"] = "Finished"
- self._lastoper2["text"] = ""
+ self._lastoper1['text'] = 'Finished'
+ self._lastoper2['text'] = ''
self._autostep = 0
# Check if we just completed a parse.
if self._parser.currently_complete():
self._autostep = 0
- self._lastoper2["text"] += " [COMPLETE PARSE]"
+ self._lastoper2['text'] += ' [COMPLETE PARSE]'
def _expand(self, *e):
if self._animating_lock:
old_frontier = self._parser.frontier()
rv = self._parser.expand()
if rv is not None:
- self._lastoper1["text"] = "Expand:"
- self._lastoper2["text"] = rv
- self._prodlist.selection_clear(0, "end")
+ self._lastoper1['text'] = 'Expand:'
+ self._lastoper2['text'] = rv
+ self._prodlist.selection_clear(0, 'end')
index = self._productions.index(rv)
self._prodlist.selection_set(index)
self._animate_expand(old_frontier[0])
return True
else:
- self._lastoper1["text"] = "Expand:"
- self._lastoper2["text"] = "(all expansions tried)"
+ self._lastoper1['text'] = 'Expand:'
+ self._lastoper2['text'] = '(all expansions tried)'
return False
def _match(self, *e):
old_frontier = self._parser.frontier()
rv = self._parser.match()
if rv is not None:
- self._lastoper1["text"] = "Match:"
- self._lastoper2["text"] = rv
+ self._lastoper1['text'] = 'Match:'
+ self._lastoper2['text'] = rv
self._animate_match(old_frontier[0])
return True
else:
- self._lastoper1["text"] = "Match:"
- self._lastoper2["text"] = "(failed)"
+ self._lastoper1['text'] = 'Match:'
+ self._lastoper2['text'] = '(failed)'
return False
def _backtrack(self, *e):
elt = self._parser.tree()
for i in self._parser.frontier()[0]:
elt = elt[i]
- self._lastoper1["text"] = "Backtrack"
- self._lastoper2["text"] = ""
+ self._lastoper1['text'] = 'Backtrack'
+ self._lastoper2['text'] = ''
if isinstance(elt, Tree):
self._animate_backtrack(self._parser.frontier()[0])
else:
return True
else:
self._autostep = 0
- self._lastoper1["text"] = "Finished"
- self._lastoper2["text"] = ""
+ self._lastoper1['text'] = 'Finished'
+ self._lastoper2['text'] = ''
return False
def about(self, *e):
ABOUT = (
"NLTK Recursive Descent Parser Application\n" + "Written by Edward Loper"
)
- TITLE = "About: Recursive Descent Parser Application"
+ TITLE = 'About: Recursive Descent Parser Application'
try:
- from tkinter.messagebox import Message
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except:
try:
ShowText(
self._top,
- "Help: Recursive Descent Parser Application",
- (__doc__ or "").strip(),
+ 'Help: Recursive Descent Parser Application',
+ (__doc__ or '').strip(),
width=75,
- font="fixed",
+ font='fixed',
)
except:
ShowText(
self._top,
- "Help: Recursive Descent Parser Application",
- (__doc__ or "").strip(),
+ 'Help: Recursive Descent Parser Application',
+ (__doc__ or '').strip(),
width=75,
)
def _toggle_grammar(self, *e):
if self._show_grammar.get():
self._prodframe.pack(
- fill="both", side="left", padx=2, after=self._feedbackframe
+ fill='both', side='left', padx=2, after=self._feedbackframe
)
- self._lastoper1["text"] = "Show Grammar"
+ self._lastoper1['text'] = 'Show Grammar'
else:
self._prodframe.pack_forget()
- self._lastoper1["text"] = "Hide Grammar"
- self._lastoper2["text"] = ""
+ self._lastoper1['text'] = 'Hide Grammar'
+ self._lastoper2['text'] = ''
# def toggle_grammar(self, *e):
# self._show_grammar = not self._show_grammar
production = self._parser.expand(self._productions[index])
if production:
- self._lastoper1["text"] = "Expand:"
- self._lastoper2["text"] = production
- self._prodlist.selection_clear(0, "end")
+ self._lastoper1['text'] = 'Expand:'
+ self._lastoper2['text'] = production
+ self._prodlist.selection_clear(0, 'end')
self._prodlist.selection_set(index)
self._animate_expand(old_frontier[0])
else:
# Reset the production selections.
- self._prodlist.selection_clear(0, "end")
+ self._prodlist.selection_clear(0, 'end')
for prod in self._parser.expandable_productions():
index = self._productions.index(prod)
self._prodlist.selection_set(index)
self._canvas,
tree,
node_font=self._boldfont,
- leaf_color="white",
+ leaf_color='white',
tree_width=2,
- tree_color="white",
- node_color="white",
+ tree_color='white',
+ node_color='white',
leaf_font=self._font,
)
- widget.label()["color"] = "#20a050"
+ widget.label()['color'] = '#20a050'
(oldx, oldy) = oldtree.label().bbox()[:2]
(newx, newy) = widget.label().bbox()[:2]
oldtree.destroy()
colors = [
- "gray%d" % (10 * int(10 * x / self._animation_frames.get()))
+ 'gray%d' % (10 * int(10 * x / self._animation_frames.get()))
for x in range(self._animation_frames.get(), 0, -1)
]
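        # With the "Normal Animation" setting above (5 frames) this evaluates to
        # ['gray100', 'gray80', 'gray60', 'gray40', 'gray20'], so the expanded
        # subtree fades in from white towards black before the final recolouring
        # in _animate_expand_frame().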
def _animate_expand_frame(self, widget, colors):
if len(colors) > 0:
self._animating_lock = 1
- widget["color"] = colors[0]
+ widget['color'] = colors[0]
for subtree in widget.subtrees():
if isinstance(subtree, TreeSegmentWidget):
- subtree.label()["color"] = colors[0]
+ subtree.label()['color'] = colors[0]
else:
- subtree["color"] = colors[0]
+ subtree['color'] = colors[0]
self._top.after(50, self._animate_expand_frame, widget, colors[1:])
else:
- widget["color"] = "black"
+ widget['color'] = 'black'
for subtree in widget.subtrees():
if isinstance(subtree, TreeSegmentWidget):
- subtree.label()["color"] = "black"
+ subtree.label()['color'] = 'black'
else:
- subtree["color"] = "black"
+ subtree['color'] = 'black'
self._redraw_quick()
- widget.label()["color"] = "black"
+ widget.label()['color'] = 'black'
self._animating_lock = 0
if self._autostep:
self._step()
if self._animation_frames.get() == 0:
colors = []
else:
- colors = ["#a00000", "#000000", "#a00000"]
+ colors = ['#a00000', '#000000', '#a00000']
colors += [
- "gray%d" % (10 * int(10 * x / (self._animation_frames.get())))
+ 'gray%d' % (10 * int(10 * x / (self._animation_frames.get())))
for x in range(1, self._animation_frames.get() + 1)
]
if len(colors) > 0:
self._animating_lock = 1
for widget in widgets:
- widget["color"] = colors[0]
+ widget['color'] = colors[0]
self._top.after(50, self._animate_backtrack_frame, widgets, colors[1:])
else:
for widget in widgets[0].subtrees():
widget.move(0, dy)
self._top.after(10, self._animate_match_frame, frame - 1, widget, dy)
else:
- widget["color"] = "#006040"
+ widget['color'] = '#006040'
self._redraw_quick()
self._animating_lock = 0
if self._autostep:
def set_grammar(self, grammar):
self._parser.set_grammar(grammar)
self._productions = list(grammar.productions())
- self._prodlist.delete(0, "end")
+ self._prodlist.delete(0, 'end')
for production in self._productions:
- self._prodlist.insert("end", (" %s" % production))
+ self._prodlist.insert('end', (' %s' % production))
def edit_sentence(self, *e):
sentence = " ".join(self._sent)
- title = "Edit Text"
- instr = "Enter a new sentence to parse."
+ title = 'Edit Text'
+ instr = 'Enter a new sentence to parse.'
EntryDialog(self._top, sentence, instr, self.set_sentence, title)
def set_sentence(self, sentence):
"""
)
- sent = "the dog saw a man in the park".split()
+ sent = 'the dog saw a man in the park'.split()
RecursiveDescentApp(grammar, sent).mainloop()
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
# Natural Language Toolkit: Shift-Reduce Parser Application
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
-from tkinter.font import Font
-from tkinter import IntVar, Listbox, Button, Frame, Label, Menu, Scrollbar, Tk
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import IntVar, Listbox, Button, Frame, Label, Menu, Scrollbar, Tk
from nltk.tree import Tree
from nltk.parse import SteppingShiftReduceParser
# Set up the main window.
self._top = Tk()
- self._top.title("Shift Reduce Parser Application")
+ self._top.title('Shift Reduce Parser Application')
# Animations. animating_lock is a lock to prevent the demo
# from performing new operations while it's animating.
# Reset the demo, and set the feedback frame to empty.
self.reset()
- self._lastoper1["text"] = ""
+ self._lastoper1['text'] = ''
#########################################
## Initialization Helpers
        # What's our font size (default=same as sysfont)
self._size = IntVar(root)
- self._size.set(self._sysfont.cget("size"))
+ self._size.set(self._sysfont.cget('size'))
- self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
- self._font = Font(family="helvetica", size=self._size.get())
+ self._boldfont = Font(family='helvetica', weight='bold', size=self._size.get())
+ self._font = Font(family='helvetica', size=self._size.get())
def _init_grammar(self, parent):
# Grammar view.
self._prodframe = listframe = Frame(parent)
- self._prodframe.pack(fill="both", side="left", padx=2)
+ self._prodframe.pack(fill='both', side='left', padx=2)
self._prodlist_label = Label(
- self._prodframe, font=self._boldfont, text="Available Reductions"
+ self._prodframe, font=self._boldfont, text='Available Reductions'
)
self._prodlist_label.pack()
self._prodlist = Listbox(
self._prodframe,
- selectmode="single",
- relief="groove",
- background="white",
- foreground="#909090",
+ selectmode='single',
+ relief='groove',
+ background='white',
+ foreground='#909090',
font=self._font,
- selectforeground="#004040",
- selectbackground="#c0f0c0",
+ selectforeground='#004040',
+ selectbackground='#c0f0c0',
)
- self._prodlist.pack(side="right", fill="both", expand=1)
+ self._prodlist.pack(side='right', fill='both', expand=1)
self._productions = list(self._parser.grammar().productions())
for production in self._productions:
- self._prodlist.insert("end", (" %s" % production))
+ self._prodlist.insert('end', (' %s' % production))
self._prodlist.config(height=min(len(self._productions), 25))
# Add a scrollbar if there are more than 25 productions.
if 1: # len(self._productions) > 25:
- listscroll = Scrollbar(self._prodframe, orient="vertical")
+ listscroll = Scrollbar(self._prodframe, orient='vertical')
self._prodlist.config(yscrollcommand=listscroll.set)
listscroll.config(command=self._prodlist.yview)
- listscroll.pack(side="left", fill="y")
+ listscroll.pack(side='left', fill='y')
# If they select a production, apply it.
- self._prodlist.bind("<<ListboxSelect>>", self._prodlist_select)
+ self._prodlist.bind('<<ListboxSelect>>', self._prodlist_select)
# When they hover over a production, highlight it.
self._hover = -1
- self._prodlist.bind("<Motion>", self._highlight_hover)
- self._prodlist.bind("<Leave>", self._clear_hover)
+ self._prodlist.bind('<Motion>', self._highlight_hover)
+ self._prodlist.bind('<Leave>', self._clear_hover)
def _init_bindings(self):
# Quit
- self._top.bind("<Control-q>", self.destroy)
- self._top.bind("<Control-x>", self.destroy)
- self._top.bind("<Alt-q>", self.destroy)
- self._top.bind("<Alt-x>", self.destroy)
+ self._top.bind('<Control-q>', self.destroy)
+ self._top.bind('<Control-x>', self.destroy)
+ self._top.bind('<Alt-q>', self.destroy)
+ self._top.bind('<Alt-x>', self.destroy)
# Ops (step, shift, reduce, undo)
- self._top.bind("<space>", self.step)
- self._top.bind("<s>", self.shift)
- self._top.bind("<Alt-s>", self.shift)
- self._top.bind("<Control-s>", self.shift)
- self._top.bind("<r>", self.reduce)
- self._top.bind("<Alt-r>", self.reduce)
- self._top.bind("<Control-r>", self.reduce)
- self._top.bind("<Delete>", self.reset)
- self._top.bind("<u>", self.undo)
- self._top.bind("<Alt-u>", self.undo)
- self._top.bind("<Control-u>", self.undo)
- self._top.bind("<Control-z>", self.undo)
- self._top.bind("<BackSpace>", self.undo)
+ self._top.bind('<space>', self.step)
+ self._top.bind('<s>', self.shift)
+ self._top.bind('<Alt-s>', self.shift)
+ self._top.bind('<Control-s>', self.shift)
+ self._top.bind('<r>', self.reduce)
+ self._top.bind('<Alt-r>', self.reduce)
+ self._top.bind('<Control-r>', self.reduce)
+ self._top.bind('<Delete>', self.reset)
+ self._top.bind('<u>', self.undo)
+ self._top.bind('<Alt-u>', self.undo)
+ self._top.bind('<Control-u>', self.undo)
+ self._top.bind('<Control-z>', self.undo)
+ self._top.bind('<BackSpace>', self.undo)
# Misc
- self._top.bind("<Control-p>", self.postscript)
- self._top.bind("<Control-h>", self.help)
- self._top.bind("<F1>", self.help)
- self._top.bind("<Control-g>", self.edit_grammar)
- self._top.bind("<Control-t>", self.edit_sentence)
+ self._top.bind('<Control-p>', self.postscript)
+ self._top.bind('<Control-h>', self.help)
+ self._top.bind('<F1>', self.help)
+ self._top.bind('<Control-g>', self.edit_grammar)
+ self._top.bind('<Control-t>', self.edit_sentence)
# Animation speed control
- self._top.bind("-", lambda e, a=self._animate: a.set(20))
- self._top.bind("=", lambda e, a=self._animate: a.set(10))
- self._top.bind("+", lambda e, a=self._animate: a.set(4))
+ self._top.bind('-', lambda e, a=self._animate: a.set(20))
+ self._top.bind('=', lambda e, a=self._animate: a.set(10))
+ self._top.bind('+', lambda e, a=self._animate: a.set(4))
def _init_buttons(self, parent):
# Set up the frames.
self._buttonframe = buttonframe = Frame(parent)
- buttonframe.pack(fill="none", side="bottom")
+ buttonframe.pack(fill='none', side='bottom')
Button(
buttonframe,
- text="Step",
- background="#90c0d0",
- foreground="black",
+ text='Step',
+ background='#90c0d0',
+ foreground='black',
command=self.step,
- ).pack(side="left")
+ ).pack(side='left')
Button(
buttonframe,
- text="Shift",
+ text='Shift',
underline=0,
- background="#90f090",
- foreground="black",
+ background='#90f090',
+ foreground='black',
command=self.shift,
- ).pack(side="left")
+ ).pack(side='left')
Button(
buttonframe,
- text="Reduce",
+ text='Reduce',
underline=0,
- background="#90f090",
- foreground="black",
+ background='#90f090',
+ foreground='black',
command=self.reduce,
- ).pack(side="left")
+ ).pack(side='left')
Button(
buttonframe,
- text="Undo",
+ text='Undo',
underline=0,
- background="#f0a0a0",
- foreground="black",
+ background='#f0a0a0',
+ foreground='black',
command=self.undo,
- ).pack(side="left")
+ ).pack(side='left')
def _init_menubar(self, parent):
menubar = Menu(parent)
filemenu = Menu(menubar, tearoff=0)
filemenu.add_command(
- label="Reset Parser", underline=0, command=self.reset, accelerator="Del"
+ label='Reset Parser', underline=0, command=self.reset, accelerator='Del'
)
filemenu.add_command(
- label="Print to Postscript",
+ label='Print to Postscript',
underline=0,
command=self.postscript,
- accelerator="Ctrl-p",
+ accelerator='Ctrl-p',
)
filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+ label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x'
)
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
editmenu = Menu(menubar, tearoff=0)
editmenu.add_command(
- label="Edit Grammar",
+ label='Edit Grammar',
underline=5,
command=self.edit_grammar,
- accelerator="Ctrl-g",
+ accelerator='Ctrl-g',
)
editmenu.add_command(
- label="Edit Text",
+ label='Edit Text',
underline=5,
command=self.edit_sentence,
- accelerator="Ctrl-t",
+ accelerator='Ctrl-t',
)
- menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
+ menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
rulemenu = Menu(menubar, tearoff=0)
rulemenu.add_command(
- label="Step", underline=1, command=self.step, accelerator="Space"
+ label='Step', underline=1, command=self.step, accelerator='Space'
)
rulemenu.add_separator()
rulemenu.add_command(
- label="Shift", underline=0, command=self.shift, accelerator="Ctrl-s"
+ label='Shift', underline=0, command=self.shift, accelerator='Ctrl-s'
)
rulemenu.add_command(
- label="Reduce", underline=0, command=self.reduce, accelerator="Ctrl-r"
+ label='Reduce', underline=0, command=self.reduce, accelerator='Ctrl-r'
)
rulemenu.add_separator()
rulemenu.add_command(
- label="Undo", underline=0, command=self.undo, accelerator="Ctrl-u"
+ label='Undo', underline=0, command=self.undo, accelerator='Ctrl-u'
)
- menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)
+ menubar.add_cascade(label='Apply', underline=0, menu=rulemenu)
viewmenu = Menu(menubar, tearoff=0)
viewmenu.add_checkbutton(
)
viewmenu.add_separator()
viewmenu.add_radiobutton(
- label="Tiny",
+ label='Tiny',
variable=self._size,
underline=0,
value=10,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Small",
+ label='Small',
variable=self._size,
underline=0,
value=12,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Medium",
+ label='Medium',
variable=self._size,
underline=0,
value=14,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Large",
+ label='Large',
variable=self._size,
underline=0,
value=18,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Huge",
+ label='Huge',
variable=self._size,
underline=0,
value=24,
command=self.resize,
)
- menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+ menubar.add_cascade(label='View', underline=0, menu=viewmenu)
animatemenu = Menu(menubar, tearoff=0)
animatemenu.add_radiobutton(
underline=0,
variable=self._animate,
value=20,
- accelerator="-",
+ accelerator='-',
)
animatemenu.add_radiobutton(
label="Normal Animation",
underline=0,
variable=self._animate,
value=10,
- accelerator="=",
+ accelerator='=',
)
animatemenu.add_radiobutton(
label="Fast Animation",
underline=0,
variable=self._animate,
value=4,
- accelerator="+",
+ accelerator='+',
)
menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
helpmenu = Menu(menubar, tearoff=0)
- helpmenu.add_command(label="About", underline=0, command=self.about)
+ helpmenu.add_command(label='About', underline=0, command=self.about)
helpmenu.add_command(
- label="Instructions", underline=0, command=self.help, accelerator="F1"
+ label='Instructions', underline=0, command=self.help, accelerator='F1'
)
- menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+ menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
parent.config(menu=menubar)
def _init_feedback(self, parent):
self._feedbackframe = feedbackframe = Frame(parent)
- feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3)
+ feedbackframe.pack(fill='x', side='bottom', padx=3, pady=3)
self._lastoper_label = Label(
- feedbackframe, text="Last Operation:", font=self._font
+ feedbackframe, text='Last Operation:', font=self._font
)
- self._lastoper_label.pack(side="left")
- lastoperframe = Frame(feedbackframe, relief="sunken", border=1)
- lastoperframe.pack(fill="x", side="right", expand=1, padx=5)
+ self._lastoper_label.pack(side='left')
+ lastoperframe = Frame(feedbackframe, relief='sunken', border=1)
+ lastoperframe.pack(fill='x', side='right', expand=1, padx=5)
self._lastoper1 = Label(
- lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font
+ lastoperframe, foreground='#007070', background='#f0f0f0', font=self._font
)
self._lastoper2 = Label(
lastoperframe,
- anchor="w",
+ anchor='w',
width=30,
- foreground="#004040",
- background="#f0f0f0",
+ foreground='#004040',
+ background='#f0f0f0',
font=self._font,
)
- self._lastoper1.pack(side="left")
- self._lastoper2.pack(side="left", fill="x", expand=1)
+ self._lastoper1.pack(side='left')
+ self._lastoper2.pack(side='left', fill='x', expand=1)
def _init_canvas(self, parent):
self._cframe = CanvasFrame(
parent,
- background="white",
+ background='white',
width=525,
closeenough=10,
border=2,
- relief="sunken",
+ relief='sunken',
)
- self._cframe.pack(expand=1, fill="both", side="top", pady=2)
+ self._cframe.pack(expand=1, fill='both', side='top', pady=2)
canvas = self._canvas = self._cframe.canvas()
self._stackwidgets = []
self._rtextwidgets = []
self._titlebar = canvas.create_rectangle(
- 0, 0, 0, 0, fill="#c0f0f0", outline="black"
+ 0, 0, 0, 0, fill='#c0f0f0', outline='black'
)
- self._exprline = canvas.create_line(0, 0, 0, 0, dash=".")
- self._stacktop = canvas.create_line(0, 0, 0, 0, fill="#408080")
+ self._exprline = canvas.create_line(0, 0, 0, 0, dash='.')
+ self._stacktop = canvas.create_line(0, 0, 0, 0, fill='#408080')
size = self._size.get() + 4
self._stacklabel = TextWidget(
- canvas, "Stack", color="#004040", font=self._boldfont
+ canvas, 'Stack', color='#004040', font=self._boldfont
)
self._rtextlabel = TextWidget(
- canvas, "Remaining Text", color="#004040", font=self._boldfont
+ canvas, 'Remaining Text', color='#004040', font=self._boldfont
)
self._cframe.add_widget(self._stacklabel)
self._cframe.add_widget(self._rtextlabel)
#########################################
def _redraw(self):
- scrollregion = self._canvas["scrollregion"].split()
+ scrollregion = self._canvas['scrollregion'].split()
(cx1, cy1, cx2, cy2) = [int(c) for c in scrollregion]
# Delete the old stack & rtext widgets.
for tok in self._parser.stack():
if isinstance(tok, Tree):
attribs = {
- "tree_color": "#4080a0",
- "tree_width": 2,
- "node_font": self._boldfont,
- "node_color": "#006060",
- "leaf_color": "#006060",
- "leaf_font": self._font,
+ 'tree_color': '#4080a0',
+ 'tree_width': 2,
+ 'node_font': self._boldfont,
+ 'node_color': '#006060',
+ 'leaf_color': '#006060',
+ 'leaf_font': self._font,
}
widget = tree_to_treesegment(self._canvas, tok, **attribs)
- widget.label()["color"] = "#000000"
+ widget.label()['color'] = '#000000'
else:
- widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
+ widget = TextWidget(self._canvas, tok, color='#000000', font=self._font)
widget.bind_click(self._popup_reduce)
self._stackwidgets.append(widget)
self._cframe.add_widget(widget, stackx, y)
# Draw the remaining text.
rtextwidth = 0
for tok in self._parser.remaining_text():
- widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
+ widget = TextWidget(self._canvas, tok, color='#000000', font=self._font)
self._rtextwidgets.append(widget)
self._cframe.add_widget(widget, rtextwidth, y)
rtextwidth = widget.bbox()[2] + 4
def _highlight_productions(self):
# Highlight the productions that can be reduced.
- self._prodlist.selection_clear(0, "end")
+ self._prodlist.selection_clear(0, 'end')
for prod in self._parser.reducible_productions():
index = self._productions.index(prod)
self._prodlist.selection_set(index)
def reset(self, *e):
self._parser.initialize(self._sent)
- self._lastoper1["text"] = "Reset App"
- self._lastoper2["text"] = ""
+ self._lastoper1['text'] = 'Reset App'
+ self._lastoper2['text'] = ''
self._redraw()
def step(self, *e):
return True
else:
if list(self._parser.parses()):
- self._lastoper1["text"] = "Finished:"
- self._lastoper2["text"] = "Success"
+ self._lastoper1['text'] = 'Finished:'
+ self._lastoper2['text'] = 'Success'
else:
- self._lastoper1["text"] = "Finished:"
- self._lastoper2["text"] = "Failure"
+ self._lastoper1['text'] = 'Finished:'
+ self._lastoper2['text'] = 'Failure'
def shift(self, *e):
if self._animating_lock:
return
if self._parser.shift():
tok = self._parser.stack()[-1]
- self._lastoper1["text"] = "Shift:"
- self._lastoper2["text"] = "%r" % tok
+ self._lastoper1['text'] = 'Shift:'
+ self._lastoper2['text'] = '%r' % tok
if self._animate.get():
self._animate_shift()
else:
return
production = self._parser.reduce()
if production:
- self._lastoper1["text"] = "Reduce:"
- self._lastoper2["text"] = "%s" % production
+ self._lastoper1['text'] = 'Reduce:'
+ self._lastoper2['text'] = '%s' % production
if self._animate.get():
self._animate_reduce()
else:
try:
ShowText(
self._top,
- "Help: Shift-Reduce Parser Application",
- (__doc__ or "").strip(),
+ 'Help: Shift-Reduce Parser Application',
+ (__doc__ or '').strip(),
width=75,
- font="fixed",
+ font='fixed',
)
except:
ShowText(
self._top,
- "Help: Shift-Reduce Parser Application",
- (__doc__ or "").strip(),
+ 'Help: Shift-Reduce Parser Application',
+ (__doc__ or '').strip(),
width=75,
)
def about(self, *e):
ABOUT = "NLTK Shift-Reduce Parser Application\n" + "Written by Edward Loper"
- TITLE = "About: Shift-Reduce Parser Application"
+ TITLE = 'About: Shift-Reduce Parser Application'
try:
- from tkinter.messagebox import Message
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except:
def set_grammar(self, grammar):
self._parser.set_grammar(grammar)
self._productions = list(grammar.productions())
- self._prodlist.delete(0, "end")
+ self._prodlist.delete(0, 'end')
for production in self._productions:
- self._prodlist.insert("end", (" %s" % production))
+ self._prodlist.insert('end', (' %s' % production))
def edit_sentence(self, *e):
sentence = " ".join(self._sent)
- title = "Edit Text"
- instr = "Enter a new sentence to parse."
+ title = 'Edit Text'
+ instr = 'Enter a new sentence to parse.'
EntryDialog(self._top, sentence, instr, self.set_sentence, title)
def set_sentence(self, sent):
def _toggle_grammar(self, *e):
if self._show_grammar.get():
self._prodframe.pack(
- fill="both", side="left", padx=2, after=self._feedbackframe
+ fill='both', side='left', padx=2, after=self._feedbackframe
)
- self._lastoper1["text"] = "Show Grammar"
+ self._lastoper1['text'] = 'Show Grammar'
else:
self._prodframe.pack_forget()
- self._lastoper1["text"] = "Hide Grammar"
- self._lastoper2["text"] = ""
+ self._lastoper1['text'] = 'Hide Grammar'
+ self._lastoper2['text'] = ''
def _prodlist_select(self, event):
selection = self._prodlist.curselection()
index = int(selection[0])
production = self._parser.reduce(self._productions[index])
if production:
- self._lastoper1["text"] = "Reduce:"
- self._lastoper2["text"] = "%s" % production
+ self._lastoper1['text'] = 'Reduce:'
+ self._lastoper2['text'] = '%s' % production
if self._animate.get():
self._animate_reduce()
else:
self._redraw()
else:
# Reset the production selections.
- self._prodlist.selection_clear(0, "end")
+ self._prodlist.selection_clear(0, 'end')
for prod in self._parser.reducible_productions():
index = self._productions.index(prod)
self._prodlist.selection_set(index)
if len(productions) == 0:
return
- self._reduce_menu.delete(0, "end")
+ self._reduce_menu.delete(0, 'end')
for production in productions:
self._reduce_menu.add_command(label=str(production), command=self.reduce)
self._reduce_menu.post(
if not isinstance(tok, Tree):
raise ValueError()
label = TextWidget(
- self._canvas, str(tok.label()), color="#006060", font=self._boldfont
+ self._canvas, str(tok.label()), color='#006060', font=self._boldfont
)
widget = TreeSegmentWidget(self._canvas, label, widgets, width=2)
(x1, y1, x2, y2) = self._stacklabel.bbox()
rhslen = len(self._productions[index].rhs())
for stackwidget in self._stackwidgets[-rhslen:]:
if isinstance(stackwidget, TreeSegmentWidget):
- stackwidget.label()["color"] = "#00a000"
+ stackwidget.label()['color'] = '#00a000'
else:
- stackwidget["color"] = "#00a000"
+ stackwidget['color'] = '#00a000'
# Remember what production we're hovering over.
self._hover = index
self._hover = -1
for stackwidget in self._stackwidgets:
if isinstance(stackwidget, TreeSegmentWidget):
- stackwidget.label()["color"] = "black"
+ stackwidget.label()['color'] = 'black'
else:
- stackwidget["color"] = "black"
+ stackwidget['color'] = 'black'
def app():
from nltk.grammar import Nonterminal, Production, CFG
- nonterminals = "S VP NP PP P N Name V Det"
+ nonterminals = 'S VP NP PP P N Name V Det'
(S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s) for s in nonterminals.split()]
productions = (
Production(VP, [V, NP]),
Production(PP, [P, NP]),
# Lexical Productions
- Production(NP, ["I"]),
- Production(Det, ["the"]),
- Production(Det, ["a"]),
- Production(N, ["man"]),
- Production(V, ["saw"]),
- Production(P, ["in"]),
- Production(P, ["with"]),
- Production(N, ["park"]),
- Production(N, ["dog"]),
- Production(N, ["statue"]),
- Production(Det, ["my"]),
+ Production(NP, ['I']),
+ Production(Det, ['the']),
+ Production(Det, ['a']),
+ Production(N, ['man']),
+ Production(V, ['saw']),
+ Production(P, ['in']),
+ Production(P, ['with']),
+ Production(N, ['park']),
+ Production(N, ['dog']),
+ Production(N, ['statue']),
+ Production(Det, ['my']),
)
grammar = CFG(S, productions)
# tokenize the sentence
- sent = "my dog saw a man in the park with a statue".split()
+ sent = 'my dog saw a man in the park with a statue'.split()
ShiftReduceApp(grammar, sent).mainloop()
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
# Natural Language Toolkit: Wordfreq Application
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
def app():
- t1 = Text(gutenberg.words("melville-moby_dick.txt"))
+ t1 = Text(gutenberg.words('melville-moby_dick.txt'))
plot_word_freq_dist(t1)
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
# Natural Language Toolkit: WordNet Browser Application
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
# Paul Bone <pbone@students.csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# modifying to be compliant with NLTK's coding standards. Tests also
# need to be developed to ensure this continues to work in the face of
# changes to other NLTK packages.
+from __future__ import print_function
# Allow this program to run inside the NLTK source tree.
from sys import path
import base64
import pickle
import copy
-from http.server import HTTPServer, BaseHTTPRequestHandler
-from urllib.parse import unquote_plus
+from six.moves.urllib.parse import unquote_plus
+
+from nltk import compat
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset, Lemma
+if compat.PY3:
+ from http.server import HTTPServer, BaseHTTPRequestHandler
+else:
+ from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
+
# now included in local file
# from util import html_header, html_trailer, \
# get_static_index_page, get_static_page_by_path, \
def do_GET(self):
global firstClient
sp = self.path[1:]
- if unquote_plus(sp) == "SHUTDOWN THE SERVER":
+ if unquote_plus(sp) == 'SHUTDOWN THE SERVER':
if server_mode:
page = "Server must be killed with SIGTERM."
type = "text/plain"
else:
- print("Server shutting down!")
+ print('Server shutting down!')
os._exit(0)
- elif sp == "": # First request.
- type = "text/html"
+ elif sp == '': # First request.
+ type = 'text/html'
if not server_mode and firstClient:
firstClient = False
page = get_static_index_page(True)
else:
page = get_static_index_page(False)
- word = "green"
+ word = 'green'
- elif sp.endswith(".html"): # Trying to fetch a HTML file TODO:
- type = "text/html"
+ elif sp.endswith('.html'): # Trying to fetch an HTML file TODO:
+ type = 'text/html'
usp = unquote_plus(sp)
- if usp == "NLTK Wordnet Browser Database Info.html":
- word = "* Database Info *"
+ if usp == 'NLTK Wordnet Browser Database Info.html':
+ word = '* Database Info *'
if os.path.isfile(usp):
- with open(usp, "r") as infile:
+ with open(usp, 'r') as infile:
page = infile.read()
else:
page = (
- (html_header % word) + "<p>The database info file:"
- "<p><b>"
+ (html_header % word) + '<p>The database info file:'
+ '<p><b>'
+ usp
- + "</b>"
- + "<p>was not found. Run this:"
- + "<p><b>python dbinfo_html.py</b>"
- + "<p>to produce it."
+ + '</b>'
+ + '<p>was not found. Run this:'
+ + '<p><b>python dbinfo_html.py</b>'
+ + '<p>to produce it.'
+ html_trailer
)
else:
page = get_static_page_by_path(usp)
elif sp.startswith("search"):
# This doesn't seem to work with MWEs.
- type = "text/html"
+ type = 'text/html'
parts = (sp.split("?")[1]).split("&")
word = [
p.split("=")[1].replace("+", " ")
page, word = page_from_word(word)
elif sp.startswith("lookup_"):
# TODO add a variation of this that takes a non-encoded word or MWE.
- type = "text/html"
+ type = 'text/html'
sp = sp[len("lookup_") :]
page, word = page_from_href(sp)
elif sp == "start_page":
# if this is the first request we should display help
# information, and possibly set a default word.
- type = "text/html"
+ type = 'text/html'
page, word = page_from_word("wordnet")
else:
- type = "text/plain"
+ type = 'text/plain'
page = "Could not parse request: '%s'" % sp
# Send result.
self.send_head(type)
- self.wfile.write(page.encode("utf8"))
+ self.wfile.write(page.encode('utf8'))
def send_head(self, type=None):
self.send_response(200)
- self.send_header("Content-type", type)
+ self.send_header('Content-type', type)
self.end_headers()
def log_message(self, format, *args):
Extract the unique counter from the URL if it has one. Otherwise return
None.
"""
- pos = sp.rfind("%23")
+ pos = sp.rfind('%23')
if pos != -1:
return int(sp[(pos + 3) :])
else:
logfile = None
# Compute URL and start web browser
- url = "http://localhost:" + str(port)
+ url = 'http://localhost:' + str(port)
server_ready = None
browser_thread = None
browser_thread = startBrowser(url, server_ready)
# Start the server.
- server = HTTPServer(("", port), MyServerHandler)
+ server = HTTPServer(('', port), MyServerHandler)
if logfile:
- logfile.write("NLTK Wordnet browser server running serving: %s\n" % url)
+ logfile.write('NLTK Wordnet browser server running serving: %s\n' % url)
if runBrowser:
server_ready.set()
# WordNet corpus is installed.
def _pos_tuples():
return [
- (wn.NOUN, "N", "noun"),
- (wn.VERB, "V", "verb"),
- (wn.ADJ, "J", "adj"),
- (wn.ADV, "R", "adv"),
+ (wn.NOUN, 'N', 'noun'),
+ (wn.VERB, 'V', 'verb'),
+ (wn.ADJ, 'J', 'adj'),
+ (wn.ADV, 'R', 'adv'),
]
tuple given to it. It attempts to match it against the first
non-null component of the given pos tuple.
"""
- if pos_tuple[0] == "s":
- pos_tuple = ("a", pos_tuple[1], pos_tuple[2])
+ if pos_tuple[0] == 's':
+ pos_tuple = ('a', pos_tuple[1], pos_tuple[2])
for n, x in enumerate(pos_tuple):
if x is not None:
break
"""
if synset.pos() == wn.NOUN:
return (
- (HYPONYM, "Hyponyms", synset.hyponyms()),
- (INSTANCE_HYPONYM, "Instance hyponyms", synset.instance_hyponyms()),
- (HYPERNYM, "Direct hypernyms", synset.hypernyms()),
+ (HYPONYM, 'Hyponyms', synset.hyponyms()),
+ (INSTANCE_HYPONYM, 'Instance hyponyms', synset.instance_hyponyms()),
+ (HYPERNYM, 'Direct hypernyms', synset.hypernyms()),
(
INDIRECT_HYPERNYMS,
- "Indirect hypernyms",
+ 'Indirect hypernyms',
rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1],
),
# hypernyms', 'Sister terms',
- (INSTANCE_HYPERNYM, "Instance hypernyms", synset.instance_hypernyms()),
+ (INSTANCE_HYPERNYM, 'Instance hypernyms', synset.instance_hypernyms()),
# (CLASS_REGIONAL, ['domain term region'], ),
- (PART_HOLONYM, "Part holonyms", synset.part_holonyms()),
- (PART_MERONYM, "Part meronyms", synset.part_meronyms()),
- (SUBSTANCE_HOLONYM, "Substance holonyms", synset.substance_holonyms()),
- (SUBSTANCE_MERONYM, "Substance meronyms", synset.substance_meronyms()),
- (MEMBER_HOLONYM, "Member holonyms", synset.member_holonyms()),
- (MEMBER_MERONYM, "Member meronyms", synset.member_meronyms()),
- (ATTRIBUTE, "Attributes", synset.attributes()),
+ (PART_HOLONYM, 'Part holonyms', synset.part_holonyms()),
+ (PART_MERONYM, 'Part meronyms', synset.part_meronyms()),
+ (SUBSTANCE_HOLONYM, 'Substance holonyms', synset.substance_holonyms()),
+ (SUBSTANCE_MERONYM, 'Substance meronyms', synset.substance_meronyms()),
+ (MEMBER_HOLONYM, 'Member holonyms', synset.member_holonyms()),
+ (MEMBER_MERONYM, 'Member meronyms', synset.member_meronyms()),
+ (ATTRIBUTE, 'Attributes', synset.attributes()),
(ANTONYM, "Antonyms", lemma_property(word, synset, lambda l: l.antonyms())),
(
DERIVATIONALLY_RELATED_FORM,
)
elif synset.pos() == wn.VERB:
return (
- (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())),
- (HYPONYM, "Hyponym", synset.hyponyms()),
- (HYPERNYM, "Direct hypernyms", synset.hypernyms()),
+ (ANTONYM, 'Antonym', lemma_property(word, synset, lambda l: l.antonyms())),
+ (HYPONYM, 'Hyponym', synset.hyponyms()),
+ (HYPERNYM, 'Direct hypernyms', synset.hypernyms()),
(
INDIRECT_HYPERNYMS,
- "Indirect hypernyms",
+ 'Indirect hypernyms',
rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1],
),
- (ENTAILMENT, "Entailments", synset.entailments()),
- (CAUSE, "Causes", synset.causes()),
- (ALSO_SEE, "Also see", synset.also_sees()),
- (VERB_GROUP, "Verb Groups", synset.verb_groups()),
+ (ENTAILMENT, 'Entailments', synset.entailments()),
+ (CAUSE, 'Causes', synset.causes()),
+ (ALSO_SEE, 'Also see', synset.also_sees()),
+ (VERB_GROUP, 'Verb Groups', synset.verb_groups()),
(
DERIVATIONALLY_RELATED_FORM,
"Derivationally related form",
)
elif synset.pos() == wn.ADJ or synset.pos == wn.ADJ_SAT:
return (
- (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())),
- (SIMILAR, "Similar to", synset.similar_tos()),
+ (ANTONYM, 'Antonym', lemma_property(word, synset, lambda l: l.antonyms())),
+ (SIMILAR, 'Similar to', synset.similar_tos()),
# Participle of verb - not supported by corpus
(
PERTAINYM,
- "Pertainyms",
+ 'Pertainyms',
lemma_property(word, synset, lambda l: l.pertainyms()),
),
- (ATTRIBUTE, "Attributes", synset.attributes()),
- (ALSO_SEE, "Also see", synset.also_sees()),
+ (ATTRIBUTE, 'Attributes', synset.attributes()),
+ (ALSO_SEE, 'Also see', synset.also_sees()),
)
elif synset.pos() == wn.ADV:
# This is weird. Adverbs such as 'quick' and 'fast' don't seem
# to have antonyms returned by the corpus.
return (
- (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())),
+ (ANTONYM, 'Antonym', lemma_property(word, synset, lambda l: l.antonyms())),
)
# Derived from adjective - not supported by corpus
else:
raise TypeError("Unhandled synset POS type: " + str(synset.pos()))
-html_header = """
+html_header = '''
<!DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
'http://www.w3.org/TR/html4/strict.dtd'>
<html>
'text/html; charset=us-ascii'>
<title>NLTK Wordnet Browser display of: %s</title></head>
<body bgcolor='#F5F5F5' text='#000000'>
-"""
-html_trailer = """
+'''
+html_trailer = '''
</body>
</html>
-"""
+'''
-explanation = """
+explanation = '''
<h3>Search Help</h3>
<ul><li>The display below the line is an example of the output the browser
shows you when you enter a search word. The search word was <b>green</b>.</li>
<b>Enter/Return</b> key or click the <b>Search</b> button.</li>
</ul>
<hr width='100%'>
-"""
+'''
# HTML oriented functions
def _bold(txt):
- return "<b>%s</b>" % txt
+ return '<b>%s</b>' % txt
def _center(txt):
- return "<center>%s</center>" % txt
+ return '<center>%s</center>' % txt
def _hlev(n, txt):
- return "<h%d>%s</h%d>" % (n, txt, n)
+ return '<h%d>%s</h%d>' % (n, txt, n)
def _italic(txt):
- return "<i>%s</i>" % txt
+ return '<i>%s</i>' % txt
def _li(txt):
- return "<li>%s</li>" % txt
+ return '<li>%s</li>' % txt
def pg(word, body):
- """
+ '''
Return an HTML page of NLTK Browser format constructed from the
word and body
:type body: str
:return: a HTML page for the word-body combination
:rtype: str
- """
+ '''
return (html_header % word) + body + html_trailer
def _ul(txt):
- return "<ul>" + txt + "</ul>"
+ return '<ul>' + txt + '</ul>'
def _abbc(txt):
"""
abbc = asterisks, breaks, bold, center
"""
- return _center(_bold("<br>" * 10 + "*" * 10 + " " + txt + " " + "*" * 10))
+ return _center(_bold('<br>' * 10 + '*' * 10 + ' ' + txt + ' ' + '*' * 10))
-full_hyponym_cont_text = _ul(_li(_italic("(has full hyponym continuation)"))) + "\n"
+full_hyponym_cont_text = _ul(_li(_italic('(has full hyponym continuation)'))) + '\n'
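# Illustrative examples (not part of the original source) of the helpers above:
#   _bold('dog')       -> '<b>dog</b>'
#   _hlev(3, 'Noun')   -> '<h3>Noun</h3>'
#   _ul(_li('x'))      -> '<ul><li>x</li></ul>'
#   pg('dog', body)    -> (html_header % 'dog') + body + html_trailer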
def _get_synset(synset_key):
def _collect_one_synset(word, synset, synset_relations):
- """
+ '''
Returns the HTML string for one synset or word
:param word: the current word
:type synset_relations: dict(synset_key, set(relation_id))
:return: The HTML string built for this synset
:rtype: str
- """
+ '''
if isinstance(synset, tuple): # It's a word
raise NotImplementedError("word not supported by _collect_one_synset")
- typ = "S"
+ typ = 'S'
pos_tuple = _pos_match((synset.pos(), None, None))
assert pos_tuple is not None, "pos_tuple is null: synset.pos(): %s" % synset.pos()
descr = pos_tuple[2]
synset_label = typ + ";"
if synset.name() in synset_relations:
synset_label = _bold(synset_label)
- s = "<li>%s (%s) " % (make_lookup_link(ref, synset_label), descr)
+ s = '<li>%s (%s) ' % (make_lookup_link(ref, synset_label), descr)
def format_lemma(w):
- w = w.replace("_", " ")
+ w = w.replace('_', ' ')
if w.lower() == word:
return _bold(w)
else:
ref = Reference(w)
return make_lookup_link(ref, w)
- s += ", ".join(format_lemma(l.name()) for l in synset.lemmas())
+ s += ', '.join(format_lemma(l.name()) for l in synset.lemmas())
gl = " (%s) <i>%s</i> " % (
synset.definition(),
- "; ".join('"%s"' % e for e in synset.examples()),
+ "; ".join("\"%s\"" % e for e in synset.examples()),
)
- return s + gl + _synset_relations(word, synset, synset_relations) + "</li>\n"
+ return s + gl + _synset_relations(word, synset, synset_relations) + '</li>\n'
def _collect_all_synsets(word, pos, synset_relations=dict()):
Return an HTML unordered list of synsets for the given word and
part of speech.
"""
- return "<ul>%s\n</ul>\n" % "".join(
+ return '<ul>%s\n</ul>\n' % ''.join(
(
_collect_one_synset(word, synset, synset_relations)
for synset in wn.synsets(word, pos)
def _synset_relations(word, synset, synset_relations):
- """
+ '''
Builds the HTML string for the relations of a synset
:param word: The current word
:type synset_relations: dict(synset_key, set(relation_type))
:return: The HTML for a synset's relations
:rtype: str
- """
+ '''
if not synset.name() in synset_relations:
return ""
# similar tuples. This forms a tree of synsets.
return "%s\n<ul>%s</ul>\n" % (
relation_html(r[0]),
- "".join("<li>%s</li>\n" % relation_html(sr) for sr in r[1]),
+ ''.join('<li>%s</li>\n' % relation_html(sr) for sr in r[1]),
)
else:
raise TypeError(
)
def make_synset_html(db_name, disp_name, rels):
- synset_html = "<i>%s</i>\n" % make_lookup_link(
+ synset_html = '<i>%s</i>\n' % make_lookup_link(
copy.deepcopy(ref).toggle_synset_relation(synset, db_name).encode(),
disp_name,
)
if db_name in ref.synset_relations[synset.name()]:
- synset_html += "<ul>%s</ul>\n" % "".join(
+ synset_html += '<ul>%s</ul>\n' % ''.join(
"<li>%s</li>\n" % relation_html(r) for r in rels
)
return synset_html
html = (
- "<ul>"
- + "\n".join(
+ '<ul>'
+ + '\n'.join(
(
"<li>%s</li>" % make_synset_html(*rel_data)
for rel_data in get_relations_data(word, synset)
if rel_data[2] != []
)
)
- + "</ul>"
+ + '</ul>'
)
return html
def page_from_href(href):
- """
+ '''
Returns a tuple of the HTML page built and the new current word
:param href: The hypertext reference to be solved
to be sent to the browser and
word is the new current word
:rtype: A tuple (str,str)
- """
+ '''
return page_from_reference(Reference.decode(href))
def page_from_reference(href):
- """
+ '''
Returns a tuple of the HTML page built and the new current word
:param href: The hypertext reference to be solved
to be sent to the browser and
word is the new current word
:rtype: A tuple (str,str)
- """
+ '''
word = href.word
pos_forms = defaultdict(list)
- words = word.split(",")
- words = [w for w in [w.strip().lower().replace(" ", "_") for w in words] if w != ""]
+ words = word.split(',')
+ words = [w for w in [w.strip().lower().replace(' ', '_') for w in words] if w != ""]
if len(words) == 0:
# No words were found.
return "", "Please specify a word to search for."
form = wn.morphy(w, pos)
if form and form not in pos_forms[pos]:
pos_forms[pos].append(form)
- body = ""
+ body = ''
for pos, pos_str, name in _pos_tuples():
if pos in pos_forms:
- body += _hlev(3, name) + "\n"
+ body += _hlev(3, name) + '\n'
for w in pos_forms[pos]:
# Not all words of exc files are in the database, skip
# to the next word if a KeyError is raised.
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
- Copyright (C) 2001-2020 NLTK Project
+ Copyright (C) 2001-2019 NLTK Project
Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
URL: <http://nltk.org/>
For license information, see LICENSE.TXT -->
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/html4/frameset.dtd">
<HTML>
<!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
- Copyright (C) 2001-2020 NLTK Project
+ Copyright (C) 2001-2019 NLTK Project
Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
URL: <http://nltk.org/>
For license information, see LICENSE.TXT -->
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
- Copyright (C) 2001-2020 NLTK Project
+ Copyright (C) 2001-2019 NLTK Project
Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
URL: <http://nltk.org/>
For license information, see LICENSE.TXT -->
</html>
"""
if with_shutdown:
- shutdown_link = '<a href="SHUTDOWN THE SERVER">Shutdown</a>'
+ shutdown_link = "<a href=\"SHUTDOWN THE SERVER\">Shutdown</a>"
else:
shutdown_link = ""
wnb(port, not server_mode, logfilename)
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
from nltk.corpus import (
gutenberg,
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")
-text1 = Text(gutenberg.words("melville-moby_dick.txt"))
+text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print("text1:", text1.name)
-text2 = Text(gutenberg.words("austen-sense.txt"))
+text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2:", text2.name)
-text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
+text3 = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis")
print("text3:", text3.name)
text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)
-text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
+text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print("text6:", text6.name)
text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)
-text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
+text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
print("text8:", text8.name)
-text9 = Text(gutenberg.words("chesterton-thursday.txt"))
+text9 = Text(gutenberg.words('chesterton-thursday.txt'))
print("text9:", text9.name)
"JOIN",
]
sent6 = [
- "SCENE",
- "1",
- ":",
- "[",
- "wind",
- "]",
- "[",
- "clop",
- "clop",
- "clop",
- "]",
- "KING",
- "ARTHUR",
- ":",
- "Whoa",
- "there",
- "!",
+ 'SCENE',
+ '1',
+ ':',
+ '[',
+ 'wind',
+ ']',
+ '[',
+ 'clop',
+ 'clop',
+ 'clop',
+ ']',
+ 'KING',
+ 'ARTHUR',
+ ':',
+ 'Whoa',
+ 'there',
+ '!',
]
sent7 = [
"Pierre",
".",
]
sent8 = [
- "25",
- "SEXY",
- "MALE",
- ",",
- "seeks",
- "attrac",
- "older",
- "single",
- "lady",
- ",",
- "for",
- "discreet",
- "encounters",
- ".",
+ '25',
+ 'SEXY',
+ 'MALE',
+ ',',
+ 'seeks',
+ 'attrac',
+ 'older',
+ 'single',
+ 'lady',
+ ',',
+ 'for',
+ 'discreet',
+ 'encounters',
+ '.',
]
sent9 = [
"THE",
# Natural Language Toolkit: Combinatory Categorial Grammar
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# Natural Language Toolkit: CCG Categories
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-
+from __future__ import unicode_literals
from functools import total_ordering
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
from nltk.internals import raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible, unicode_repr
+
+@add_metaclass(ABCMeta)
@total_ordering
-class AbstractCCGCategory(metaclass=ABCMeta):
- """
+class AbstractCCGCategory(object):
+ '''
Interface for categories in combinatory grammars.
- """
+ '''
@abstractmethod
def is_primitive(self):
return self._hash
+@python_2_unicode_compatible
class CCGVar(AbstractCCGCategory):
- """
+ '''
Class representing a variable CCG category.
Used for conjunctions (and possibly type-raising, if implemented as a
unary rule).
- """
+ '''
_maxID = 0
@total_ordering
+@python_2_unicode_compatible
class Direction(object):
- """
+ '''
Class representing the direction of a function application.
Also maintains information as to which combinators
may be used with the category.
- """
+ '''
def __init__(self, dir, restrictions):
self._dir = dir
# Testing the application direction
def is_forward(self):
- return self._dir == "/"
+ return self._dir == '/'
def is_backward(self):
- return self._dir == "\\"
+ return self._dir == '\\'
def dir(self):
return self._dir
return self._restrs
def is_variable(self):
- return self._restrs == "_"
+ return self._restrs == '_'
# Unification and substitution of variable directions.
# Used only if type-raising is implemented as a unary rule, as it
# must inherit restrictions from the argument category.
def can_unify(self, other):
if other.is_variable():
- return [("_", self.restrs())]
+ return [('_', self.restrs())]
elif self.is_variable():
- return [("_", other.restrs())]
+ return [('_', other.restrs())]
else:
if self.restrs() == other.restrs():
return []
return self
for (var, restrs) in subs:
- if var == "_":
+ if var == '_':
return Direction(self._dir, restrs)
return self
# Testing permitted combinators
def can_compose(self):
- return "," not in self._restrs
+ return ',' not in self._restrs
def can_cross(self):
- return "." not in self._restrs
+ return '.' not in self._restrs
def __eq__(self, other):
return (
# The negation operator reverses the direction of the application
def __neg__(self):
- if self._dir == "/":
- return Direction("\\", self._restrs)
+ if self._dir == '/':
+ return Direction('\\', self._restrs)
else:
- return Direction("/", self._restrs)
+ return Direction('/', self._restrs)
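# Illustrative sketch (not part of the original source): a forward direction
# with no restrictions allows both composition and crossing, and negation
# flips it to a backward direction:
#   d = Direction('/', [])
#   d.is_forward()                     # True
#   (-d).is_backward()                 # True
#   d.can_compose() and d.can_cross()  # True: no ',' or '.' restriction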
+@python_2_unicode_compatible
class PrimitiveCategory(AbstractCCGCategory):
- """
+ '''
Class representing primitive categories.
Takes a string representation of the category, and a
list of strings specifying the morphological subcategories.
- """
+ '''
def __init__(self, categ, restrictions=[]):
self._categ = categ
def __str__(self):
if self._restrs == []:
return "%s" % self._categ
- restrictions = "[%s]" % ",".join(repr(r) for r in self._restrs)
+ restrictions = "[%s]" % ",".join(unicode_repr(r) for r in self._restrs)
return "%s%s" % (self._categ, restrictions)
+@python_2_unicode_compatible
class FunctionalCategory(AbstractCCGCategory):
- """
+ '''
Class that represents a function application category.
Consists of argument and result categories, together with
an application direction.
- """
+ '''
def __init__(self, res, arg, dir):
self._res = res
# Natural Language Toolkit: Combinatory Categorial Grammar
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
This entire process is shown far more clearly in the demonstration:
python chart.py
"""
+from __future__ import print_function, division, unicode_literals
import itertools
+from six import string_types
+
from nltk.parse import ParserI
from nltk.parse.chart import AbstractChartRule, EdgeI, Chart
from nltk.tree import Tree
BackwardBx,
BackwardSx,
)
-
+from nltk.compat import python_2_unicode_compatible
from nltk.ccg.combinator import *
from nltk.ccg.logic import *
from nltk.sem.logic import *
class CCGLeafEdge(EdgeI):
- """
+ '''
Class representing leaf edges in a CCG derivation.
- """
+ '''
def __init__(self, pos, token, leaf):
self._pos = pos
return self._leaf
+@python_2_unicode_compatible
class BinaryCombinatorRule(AbstractChartRule):
- """
+ '''
Class implementing application of a binary combinator to a chart.
Takes the directed combinator to apply.
- """
+ '''
NUMEDGES = 2
# Type-raising must be handled slightly differently to the other rules, as the
# resulting rules only span a single edge, rather than both edges.
-
-
+@python_2_unicode_compatible
class ForwardTypeRaiseRule(AbstractChartRule):
- """
+ '''
Class for applying forward type raising
- """
+ '''
NUMEDGES = 2
return "%s" % self._combinator
+@python_2_unicode_compatible
class BackwardTypeRaiseRule(AbstractChartRule):
- """
+ '''
Class for applying backward type raising.
- """
+ '''
NUMEDGES = 2
class CCGChartParser(ParserI):
- """
+ '''
Chart parser for CCGs.
Based largely on the ChartParser class from NLTK.
- """
+ '''
def __init__(self, lexicon, rules, trace=0):
self._lexicon = lexicon
elif isinstance(combinator, UndirectedSubstitution):
return compute_substitution_semantics(function, argument)
else:
- raise AssertionError("Unsupported combinator '" + combinator + "'")
+ raise AssertionError('Unsupported combinator \'' + combinator + '\'')
else:
return compute_type_raised_semantics(children[0].label()[0].semantics())
def printCCGDerivation(tree):
# Get the leaves and initial categories
leafcats = tree.pos()
- leafstr = ""
- catstr = ""
+ leafstr = ''
+ catstr = ''
# Construct a string with both the leaf word and corresponding
# category aligned.
nextlen = 2 + max(len(leaf), len(str_cat))
lcatlen = (nextlen - len(str_cat)) // 2
rcatlen = lcatlen + (nextlen - len(str_cat)) % 2
- catstr += " " * lcatlen + str_cat + " " * rcatlen
+ catstr += ' ' * lcatlen + str_cat + ' ' * rcatlen
lleaflen = (nextlen - len(leaf)) // 2
rleaflen = lleaflen + (nextlen - len(leaf)) % 2
- leafstr += " " * lleaflen + leaf + " " * rleaflen
+ leafstr += ' ' * lleaflen + leaf + ' ' * rleaflen
print(leafstr.rstrip())
print(catstr.rstrip())
(token, op) = tree.label()
- if op == "Leaf":
+ if op == 'Leaf':
return rwidth
# Pad to the left with spaces, followed by a sequence of '-'
# and the derivation rule.
- print(lwidth * " " + (rwidth - lwidth) * "-" + "%s" % op)
+ print(lwidth * ' ' + (rwidth - lwidth) * '-' + "%s" % op)
# Print the resulting category on a new line.
str_res = "%s" % (token.categ())
if token.semantics() is not None:
str_res += " {" + str(token.semantics()) + "}"
respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth
- print(respadlen * " " + str_res)
+ print(respadlen * ' ' + str_res)
return rwidth
# Construct the lexicon
lex = fromstring(
- """
+ '''
:- S, NP, N, VP # Primitive categories, S is the target primitive
Det :: NP/N # Family of words
mushrooms => N
parsnips => N
bacon => N
- """
+ '''
)
printCCGDerivation(parse)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Combinatory Categorial Grammar
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
CCG Combinators
"""
+from __future__ import unicode_literals
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+from nltk.compat import python_2_unicode_compatible
from nltk.ccg.api import FunctionalCategory
-class UndirectedBinaryCombinator(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class UndirectedBinaryCombinator(object):
"""
Abstract class for representing a binary combinator.
Merely defines functions for checking if the function and argument
pass
-class DirectedBinaryCombinator(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class DirectedBinaryCombinator(object):
"""
Wrapper for the undirected binary combinator.
It takes left and right categories, and decides which is to be
pass
+@python_2_unicode_compatible
class ForwardCombinator(DirectedBinaryCombinator):
"""
Class representing combinators where the primary functor is on the left.
restricting the cases in which it may apply.
"""
- def __init__(self, combinator, predicate, suffix=""):
+ def __init__(self, combinator, predicate, suffix=''):
self._combinator = combinator
self._predicate = predicate
self._suffix = suffix
return ">%s%s" % (self._combinator, self._suffix)
+@python_2_unicode_compatible
class BackwardCombinator(DirectedBinaryCombinator):
"""
The backward equivalent of the ForwardCombinator class.
"""
- def __init__(self, combinator, predicate, suffix=""):
+ def __init__(self, combinator, predicate, suffix=''):
self._combinator = combinator
self._predicate = predicate
self._suffix = suffix
return "<%s%s" % (self._combinator, self._suffix)
+@python_2_unicode_compatible
class UndirectedFunctionApplication(UndirectedBinaryCombinator):
"""
Class representing function application.
yield function.res().substitute(subs)
def __str__(self):
- return ""
+ return ''
# Predicates for function application.
BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(), backwardOnly)
+@python_2_unicode_compatible
class UndirectedComposition(UndirectedBinaryCombinator):
"""
Functional composition (harmonic) combinator.
)
def __str__(self):
- return "B"
+ return 'B'
# Predicates for restricting application of straight composition.
# Backward crossed composition
BackwardBx = BackwardCombinator(
- UndirectedComposition(), backwardBxConstraint, suffix="x"
+ UndirectedComposition(), backwardBxConstraint, suffix='x'
)
+@python_2_unicode_compatible
class UndirectedSubstitution(UndirectedBinaryCombinator):
"""
Substitution (permutation) combinator.
)
def __str__(self):
- return "S"
+ return 'S'
# Predicate for forward substitution
# Instances of substitution combinators
ForwardSubstitution = ForwardCombinator(UndirectedSubstitution(), forwardSConstraint)
-BackwardSx = BackwardCombinator(UndirectedSubstitution(), backwardSxConstraint, "x")
+BackwardSx = BackwardCombinator(UndirectedSubstitution(), backwardSxConstraint, 'x')
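# Illustrative note (not part of the original source): the rule names printed
# in derivations combine the direction marker, the combinator symbol and any
# suffix, e.g. str(ForwardSubstitution) -> '>S', str(BackwardSx) -> '<Sx',
# str(BackwardBx) -> '<Bx'.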
# Retrieves the left-most functional category.
return categ
+@python_2_unicode_compatible
class UndirectedTypeRaise(UndirectedBinaryCombinator):
"""
Undirected combinator for type raising.
)
def __str__(self):
- return "T"
+ return 'T'
# Predicates for type-raising
# Natural Language Toolkit: Combinatory Categorial Grammar
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
CCG Lexicons
"""
+from __future__ import unicode_literals
+
import re
from collections import defaultdict
from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory
+from nltk.compat import python_2_unicode_compatible
from nltk.internals import deprecated
from nltk.sem.logic import Expression
# ------------
# Parses a primitive category and subscripts
-PRIM_RE = re.compile(r"""([A-Za-z]+)(\[[A-Za-z,]+\])?""")
+PRIM_RE = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''')
# Separates the next primitive category from the remainder of the
# string
-NEXTPRIM_RE = re.compile(r"""([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)""")
+NEXTPRIM_RE = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''')
# Separates the next application operator from the remainder
-APP_RE = re.compile(r"""([\\/])([.,]?)([.,]?)(.*)""")
+APP_RE = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''')
# Parses the definition of the right-hand side (rhs) of either a word or a family
-LEX_RE = re.compile(r"""([\S_]+)\s*(::|[-=]+>)\s*(.+)""", re.UNICODE)
+LEX_RE = re.compile(r'''([\S_]+)\s*(::|[-=]+>)\s*(.+)''', re.UNICODE)
# Parses the right hand side that contains category and maybe semantic predicate
-RHS_RE = re.compile(r"""([^{}]*[^ {}])\s*(\{[^}]+\})?""", re.UNICODE)
+RHS_RE = re.compile(r'''([^{}]*[^ {}])\s*(\{[^}]+\})?''', re.UNICODE)
# Parses the semantic predicate
-SEMANTICS_RE = re.compile(r"""\{([^}]+)\}""", re.UNICODE)
+SEMANTICS_RE = re.compile(r'''\{([^}]+)\}''', re.UNICODE)
# Strips comments from a line
-COMMENTS_RE = re.compile("""([^#]*)(?:#.*)?""")
+COMMENTS_RE = re.compile('''([^#]*)(?:#.*)?''')
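# Illustrative examples (not part of the original source) of what the patterns
# above accept:
#   PRIM_RE.match('NP[nb]').groups()     -> ('NP', '[nb]')
#   APP_RE.match('/,N').groups()         -> ('/', ',', '', 'N')
#   LEX_RE.match('Det :: NP/N').groups() -> ('Det', '::', 'NP/N')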
class Token(object):
return cmp((self._categ, self._semantics), (other.categ(), other.semantics()))
+@python_2_unicode_compatible
class CCGLexicon(object):
"""
Class representing a lexicon for CCG grammars.
rest = string[1:]
inside = "("
- while rest != "" and not rest.startswith(")"):
- if rest.startswith("("):
+ while rest != "" and not rest.startswith(')'):
+ if rest.startswith('('):
(part, rest) = matchBrackets(rest)
inside = inside + part
else:
inside = inside + rest[0]
rest = rest[1:]
- if rest.startswith(")"):
- return (inside + ")", rest[1:])
- raise AssertionError("Unmatched bracket in string '" + string + "'")
+ if rest.startswith(')'):
+ return (inside + ')', rest[1:])
+ raise AssertionError('Unmatched bracket in string \'' + string + '\'')
def nextCategory(string):
Separate the string for the next portion of the category from the rest
of the string
"""
- if string.startswith("("):
+ if string.startswith('('):
return matchBrackets(string)
return NEXTPRIM_RE.match(string).groups()
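# Illustrative example (not part of the original source): nextCategory splits
# off the leading category while keeping brackets balanced, e.g.
#   nextCategory('NP/N')       -> ('NP', '/N')
#   nextCategory('(S\\NP)/NP') -> ('(S\\NP)', '/NP')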
Parse the subscripts for a primitive category
"""
if subscr:
- return subscr[1:-1].split(",")
+ return subscr[1:-1].split(',')
return []
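# Illustrative example (not part of the original source):
#   parseSubscripts('[nb,pl]') -> ['nb', 'pl']
#   parseSubscripts(None)      -> []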
subscrs = parseSubscripts(chunks[1])
return (PrimitiveCategory(catstr, subscrs), var)
raise AssertionError(
- "String '" + catstr + "' is neither a family nor primitive category."
+ 'String \'' + catstr + '\' is neither a family nor primitive category.'
)
"""
(cat_string, rest) = nextCategory(line)
- if cat_string.startswith("("):
+ if cat_string.startswith('('):
(res, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
else:
+ # print rePrim.match(str).groups()
(res, var) = parsePrimitiveCategory(
PRIM_RE.match(cat_string).groups(), primitives, families, var
)
rest = app[3]
(cat_string, rest) = nextCategory(rest)
- if cat_string.startswith("("):
+ if cat_string.startswith('('):
(arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
else:
(arg, var) = parsePrimitiveCategory(
if line == "":
continue
- if line.startswith(":-"):
+ if line.startswith(':-'):
# A line of primitive categories.
# The first one is the target category
# ie, :- S, N, NP, VP
primitives = primitives + [
- prim.strip() for prim in line[2:].strip().split(",")
+ prim.strip() for prim in line[2:].strip().split(',')
]
else:
# Either a family definition, or a word definition
(catstr, semantics_str) = RHS_RE.match(rhs).groups()
(cat, var) = augParseCategory(catstr, primitives, families)
- if sep == "::":
+ if sep == '::':
# Family definition
# ie, Det :: NP/N
families[ident] = (cat, var)
return CCGLexicon(primitives[0], primitives, families, entries)
-@deprecated("Use fromstring() instead.")
+@deprecated('Use fromstring() instead.')
def parseLexicon(lex_str):
return fromstring(lex_str)
# Natural Language Toolkit: Combinatory Categorial Grammar
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Tanin Na Nakorn (@tanin)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# Natural Language Toolkit: Chatbots
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
These chatbots may not work using the Windows command line or the
Windows IDLE GUI.
"""
+from __future__ import print_function
from nltk.chat.util import Chat
from nltk.chat.eliza import eliza_chat
from nltk.chat.zen import zen_chat
bots = [
- (eliza_chat, "Eliza (psycho-babble)"),
- (iesha_chat, "Iesha (teen anime junky)"),
- (rude_chat, "Rude (abusive bot)"),
- (suntsu_chat, "Suntsu (Chinese sayings)"),
- (zen_chat, "Zen (gems of wisdom)"),
+ (eliza_chat, 'Eliza (psycho-babble)'),
+ (iesha_chat, 'Iesha (teen anime junky)'),
+ (rude_chat, 'Rude (abusive bot)'),
+ (suntsu_chat, 'Suntsu (Chinese sayings)'),
+ (zen_chat, 'Zen (gems of wisdom)'),
]
def chatbots():
import sys
- print("Which chatbot would you like to talk to?")
+ print('Which chatbot would you like to talk to?')
botcount = len(bots)
for i in range(botcount):
- print(" %d: %s" % (i + 1, bots[i][1]))
+ print(' %d: %s' % (i + 1, bots[i][1]))
while True:
- print("\nEnter a number in the range 1-%d: " % botcount, end=" ")
+ print('\nEnter a number in the range 1-%d: ' % botcount, end=' ')
choice = sys.stdin.readline().strip()
if choice.isdigit() and (int(choice) - 1) in range(botcount):
break
else:
- print(" Error: bad chatbot number")
+ print(' Error: bad chatbot number')
chatbot = bots[int(choice) - 1][0]
chatbot()
# Natural Language Toolkit: Eliza
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# a translation table used to convert things you say into things the
# computer says back, e.g. "I am" --> "you are"
+from __future__ import print_function
from nltk.chat.util import Chat, reflections
# a table of response pairs, where each pair consists of a
pairs = (
(
- r"I need (.*)",
+ r'I need (.*)',
(
"Why do you need %1?",
"Would it really help you to get %1?",
),
),
(
- r"Why don\'t you (.*)",
+ r'Why don\'t you (.*)',
(
"Do you really think I don't %1?",
"Perhaps eventually I will %1.",
),
),
(
- r"Why can\'t I (.*)",
+ r'Why can\'t I (.*)',
(
"Do you think you should be able to %1?",
"If you could %1, what would you do?",
),
),
(
- r"I can\'t (.*)",
+ r'I can\'t (.*)',
(
"How do you know you can't %1?",
"Perhaps you could %1 if you tried.",
),
),
(
- r"I am (.*)",
+ r'I am (.*)',
(
"Did you come to me because you are %1?",
"How long have you been %1?",
),
),
(
- r"I\'m (.*)",
+ r'I\'m (.*)',
(
"How does being %1 make you feel?",
"Do you enjoy being %1?",
),
),
(
- r"Are you (.*)",
+ r'Are you (.*)',
(
"Why does it matter whether I am %1?",
"Would you prefer it if I were not %1?",
),
),
(
- r"What (.*)",
+ r'What (.*)',
(
"Why do you ask?",
"How would an answer to that help you?",
),
),
(
- r"How (.*)",
+ r'How (.*)',
(
"How do you suppose?",
"Perhaps you can answer your own question.",
),
),
(
- r"Because (.*)",
+ r'Because (.*)',
(
"Is that the real reason?",
"What other reasons come to mind?",
),
),
(
- r"(.*) sorry (.*)",
+ r'(.*) sorry (.*)',
(
"There are many times when no apology is needed.",
"What feelings do you have when you apologize?",
),
),
(
- r"Hello(.*)",
+ r'Hello(.*)',
(
"Hello... I'm glad you could drop by today.",
"Hi there... how are you today?",
),
),
(
- r"I think (.*)",
+ r'I think (.*)',
("Do you doubt %1?", "Do you really think so?", "But you're not sure %1?"),
),
(
- r"(.*) friend (.*)",
+ r'(.*) friend (.*)',
(
"Tell me more about your friends.",
"When you think of a friend, what comes to mind?",
"Why don't you tell me about a childhood friend?",
),
),
- (r"Yes", ("You seem quite sure.", "OK, but can you elaborate a bit?")),
+ (r'Yes', ("You seem quite sure.", "OK, but can you elaborate a bit?")),
(
- r"(.*) computer(.*)",
+ r'(.*) computer(.*)',
(
"Are you really talking about me?",
"Does it seem strange to talk to a computer?",
),
),
(
- r"Is it (.*)",
+ r'Is it (.*)',
(
"Do you think it is %1?",
"Perhaps it's %1 -- what do you think?",
),
),
(
- r"It is (.*)",
+ r'It is (.*)',
(
"You seem very certain.",
"If I told you that it probably isn't %1, what would you feel?",
),
),
(
- r"Can you (.*)",
+ r'Can you (.*)',
(
"What makes you think I can't %1?",
"If I could %1, then what?",
),
),
(
- r"Can I (.*)",
+ r'Can I (.*)',
(
"Perhaps you don't want to %1.",
"Do you want to be able to %1?",
),
),
(
- r"You are (.*)",
+ r'You are (.*)',
(
"Why do you think I am %1?",
"Does it please you to think that I'm %1?",
),
),
(
- r"You\'re (.*)",
+ r'You\'re (.*)',
(
"Why do you say I am %1?",
"Why do you think I am %1?",
),
),
(
- r"I don\'t (.*)",
+ r'I don\'t (.*)',
("Don't you really %1?", "Why don't you %1?", "Do you want to %1?"),
),
(
- r"I feel (.*)",
+ r'I feel (.*)',
(
"Good, tell me more about these feelings.",
"Do you often feel %1?",
),
),
(
- r"I have (.*)",
+ r'I have (.*)',
(
"Why do you tell me that you've %1?",
"Have you really %1?",
),
),
(
- r"I would (.*)",
+ r'I would (.*)',
(
"Could you explain why you would %1?",
"Why would you %1?",
),
),
(
- r"Is there (.*)",
+ r'Is there (.*)',
(
"Do you think there is %1?",
"It's likely that there is %1.",
),
),
(
- r"My (.*)",
+ r'My (.*)',
(
"I see, your %1.",
"Why do you say that your %1?",
),
),
(
- r"You (.*)",
+ r'You (.*)',
(
"We should be discussing you, not me.",
"Why do you say that about me?",
"Why do you care whether I %1?",
),
),
- (r"Why (.*)", ("Why don't you tell me the reason why %1?", "Why do you think %1?")),
+ (r'Why (.*)', ("Why don't you tell me the reason why %1?", "Why do you think %1?")),
(
- r"I want (.*)",
+ r'I want (.*)',
(
"What would it mean to you if you got %1?",
"Why do you want %1?",
),
),
(
- r"(.*) mother(.*)",
+ r'(.*) mother(.*)',
(
"Tell me more about your mother.",
"What was your relationship with your mother like?",
),
),
(
- r"(.*) father(.*)",
+ r'(.*) father(.*)',
(
"Tell me more about your father.",
"How did your father make you feel?",
),
),
(
- r"(.*) child(.*)",
+ r'(.*) child(.*)',
(
"Did you have close friends as a child?",
"What is your favorite childhood memory?",
),
),
(
- r"(.*)\?",
+ r'(.*)\?',
(
"Why do you ask that?",
"Please consider whether you can answer your own question.",
),
),
(
- r"quit",
+ r'quit',
(
"Thank you for talking with me.",
"Good-bye.",
),
),
(
- r"(.*)",
+ r'(.*)',
(
"Please tell me more.",
"Let's change focus a bit... Tell me about your family.",
print("Therapist\n---------")
print("Talk to the program by typing in plain English, using normal upper-")
print('and lower-case letters and punctuation. Enter "quit" when done.')
- print("=" * 72)
+ print('=' * 72)
print("Hello. How are you feeling today?")
eliza_chatbot.converse()
# Natural Language Toolkit: Teen Chatbot
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Selina Dennis <sjmd@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
anime junky that frequents YahooMessenger or MSNM.
All spelling mistakes and flawed grammar are intentional.
"""
+from __future__ import print_function
from nltk.chat.util import Chat
pairs = (
(
- r"I\'m (.*)",
+ r'I\'m (.*)',
(
"ur%1?? that's so cool! kekekekeke ^_^ tell me more!",
"ur%1? neat!! kekeke >_<",
),
),
(
- r"(.*) don\'t you (.*)",
+ r'(.*) don\'t you (.*)',
(
"u think I can%2??! really?? kekeke \<_\<",
"what do u mean%2??!",
"i could if i wanted, don't you think!! kekeke",
),
),
- (r"ye[as] [iI] (.*)", ("u%1? cool!! how?", "how come u%1??", "u%1? so do i!!")),
+ (r'ye[as] [iI] (.*)', ("u%1? cool!! how?", "how come u%1??", "u%1? so do i!!")),
(
- r"do (you|u) (.*)\??",
+ r'do (you|u) (.*)\??',
("do i%2? only on tuesdays! kekeke *_*", "i dunno! do u%2??"),
),
(
- r"(.*)\?",
+ r'(.*)\?',
(
"man u ask lots of questions!",
"booooring! how old r u??",
),
),
(
- r"(cos|because) (.*)",
+ r'(cos|because) (.*)',
("hee! i don't believe u! >_<", "nuh-uh! >_<", "ooooh i agree!"),
),
(
- r"why can\'t [iI] (.*)",
+ r'why can\'t [iI] (.*)',
(
"i dunno! y u askin me for!",
"try harder, silly! hee! ^_^",
),
),
(
- r"I can\'t (.*)",
+ r'I can\'t (.*)',
(
"u can't what??! >_<",
"that's ok! i can't%1 either! kekekekeke ^_^",
),
),
(
- r"(.*) (like|love|watch) anime",
+ r'(.*) (like|love|watch) anime',
(
"omg i love anime!! do u like sailor moon??! ^&^",
"anime yay! anime rocks sooooo much!",
),
),
(
- r"I (like|love|watch|play) (.*)",
+ r'I (like|love|watch|play) (.*)',
("yay! %2 rocks!", "yay! %2 is neat!", "cool! do u like other stuff?? ^_^"),
),
(
- r"anime sucks|(.*) (hate|detest) anime",
+ r'anime sucks|(.*) (hate|detest) anime',
(
"ur a liar! i'm not gonna talk to u nemore if u h8 anime *;*",
"no way! anime is the best ever!",
),
),
(
- r"(are|r) (you|u) (.*)",
+ r'(are|r) (you|u) (.*)',
("am i%1??! how come u ask that!", "maybe! y shud i tell u?? kekeke >_>"),
),
(
- r"what (.*)",
+ r'what (.*)',
("hee u think im gonna tell u? .v.", "booooooooring! ask me somethin else!"),
),
- (r"how (.*)", ("not tellin!! kekekekekeke ^_^",)),
- (r"(hi|hello|hey) (.*)", ("hi!!! how r u!!",)),
+ (r'how (.*)', ("not tellin!! kekekekekeke ^_^",)),
+ (r'(hi|hello|hey) (.*)', ("hi!!! how r u!!",)),
(
- r"quit",
+ r'quit',
(
"mom says i have to go eat dinner now :,( bye!!",
"awww u have to go?? see u next time!!",
),
),
(
- r"(.*)",
+ r'(.*)',
(
"ur funny! kekeke",
"boooooring! talk about something else! tell me wat u like!",
print("Iesha the TeenBoT\n---------")
print("Talk to the program by typing in plain English, using normal upper-")
print('and lower-case letters and punctuation. Enter "quit" when done.')
- print("=" * 72)
+ print('=' * 72)
print("hi!! i'm iesha! who r u??!")
iesha_chatbot.converse()
# Natural Language Toolkit: Rude Chatbot
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Peter Spiller <pspiller@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
from nltk.chat.util import Chat, reflections
pairs = (
(
- r"We (.*)",
+ r'We (.*)',
(
"What do you mean, 'we'?",
"Don't include me in that!",
),
),
(
- r"You should (.*)",
+ r'You should (.*)',
("Don't tell me what to do, buddy.", "Really? I should, should I?"),
),
(
- r"You\'re(.*)",
+ r'You\'re(.*)',
(
"More like YOU'RE %1!",
"Hah! Look who's talking.",
),
),
(
- r"You are(.*)",
+ r'You are(.*)',
(
"More like YOU'RE %1!",
"Hah! Look who's talking.",
),
),
(
- r"I can\'t(.*)",
+ r'I can\'t(.*)',
(
"You do sound like the type who can't %1.",
"Hear that splashing sound? That's my heart bleeding for you.",
),
),
(
- r"I think (.*)",
+ r'I think (.*)',
(
"I wouldn't think too hard if I were you.",
"You actually think? I'd never have guessed...",
),
),
(
- r"I (.*)",
+ r'I (.*)',
(
"I'm getting a bit tired of hearing about you.",
"How about we talk about me instead?",
),
),
(
- r"How (.*)",
+ r'How (.*)',
(
"How do you think?",
"Take a wild guess.",
"I'm not even going to dignify that with an answer.",
),
),
- (r"What (.*)", ("Do I look like an encyclopedia?", "Figure it out yourself.")),
+ (r'What (.*)', ("Do I look like an encyclopedia?", "Figure it out yourself.")),
(
- r"Why (.*)",
+ r'Why (.*)',
(
"Why not?",
"That's so obvious I thought even you'd have already figured it out.",
),
),
(
- r"(.*)shut up(.*)",
+ r'(.*)shut up(.*)',
(
"Make me.",
"Getting angry at a feeble NLP assignment? Somebody's losing it.",
),
),
(
- r"Shut up(.*)",
+ r'Shut up(.*)',
(
"Make me.",
"Getting angry at a feeble NLP assignment? Somebody's losing it.",
),
),
(
- r"Hello(.*)",
+ r'Hello(.*)',
("Oh good, somebody else to talk to. Joy.", "'Hello'? How original..."),
),
(
- r"(.*)",
+ r'(.*)',
(
"I'm getting bored here. Become more interesting.",
"Either become more thrilling or get lost, buddy.",
def rude_chat():
print("Talk to the program by typing in plain English, using normal upper-")
print('and lower-case letters and punctuation. Enter "quit" when done.')
- print("=" * 72)
+ print('=' * 72)
print("I suppose I should say hello.")
rude_chatbot.converse()
# Natural Language Toolkit: Sun Tsu-Bot
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Sam Huston 2007
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
Hosted by the Gutenberg Project
http://www.gutenberg.org/
"""
+from __future__ import print_function
from nltk.chat.util import Chat, reflections
pairs = (
- (r"quit", ("Good-bye.", "Plan well", "May victory be your future")),
+ (r'quit', ("Good-bye.", "Plan well", "May victory be your future")),
(
- r"[^\?]*\?",
+ r'[^\?]*\?',
(
"Please consider whether you can answer your own question.",
"Ask me no questions!",
),
),
(
- r"[0-9]+(.*)",
+ r'[0-9]+(.*)',
(
"It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
"There are five essentials for victory",
),
),
(
- r"[A-Ca-c](.*)",
+ r'[A-Ca-c](.*)',
(
"The art of war is of vital importance to the State.",
"All warfare is based on deception.",
),
),
(
- r"[D-Fd-f](.*)",
+ r'[D-Fd-f](.*)',
(
"The skillful soldier does not raise a second levy, neither are his supply-wagons loaded more than twice.",
"Bring war material with you from home, but forage on the enemy.",
),
),
(
- r"[G-Ig-i](.*)",
+ r'[G-Ig-i](.*)',
(
"Heaven signifies night and day, cold and heat, times and seasons.",
"It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
),
),
(
- r"[J-Lj-l](.*)",
+ r'[J-Lj-l](.*)',
(
"There are three ways in which a ruler can bring misfortune upon his army.",
"By commanding the army to advance or to retreat, being ignorant of the fact that it cannot obey. This is called hobbling the army.",
),
),
(
- r"[M-Om-o](.*)",
+ r'[M-Om-o](.*)',
(
"If you know the enemy and know yourself, you need not fear the result of a hundred battles.",
"If you know yourself but not the enemy, for every victory gained you will also suffer a defeat.",
),
),
(
- r"[P-Rp-r](.*)",
+ r'[P-Rp-r](.*)',
(
"Security against defeat implies defensive tactics; ability to defeat the enemy means taking the offensive.",
"Standing on the defensive indicates insufficient strength; attacking, a superabundance of strength.",
),
),
(
- r"[S-Us-u](.*)",
+ r'[S-Us-u](.*)',
(
"What the ancients called a clever fighter is one who not only wins, but excels in winning with ease.",
"Hence his victories bring him neither reputation for wisdom nor credit for courage.",
),
),
(
- r"[V-Zv-z](.*)",
+ r'[V-Zv-z](.*)',
(
"It is a matter of life and death, a road either to safety or to ruin.",
"Hold out baits to entice the enemy. Feign disorder, and crush him.",
"Just as water retains no constant shape, so in warfare there are no constant conditions.",
),
),
- (r"(.*)", ("Your statement insults me.", "")),
+ (r'(.*)', ("Your statement insults me.", "")),
)
suntsu_chatbot = Chat(pairs, reflections)
def suntsu_chat():
print("Talk to the program by typing in plain English, using normal upper-")
print('and lower-case letters and punctuation. Enter "quit" when done.')
- print("=" * 72)
+ print('=' * 72)
print("You seek enlightenment?")
suntsu_chatbot.converse()
# Natural Language Toolkit: Chatbot Utilities
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
+from __future__ import print_function
import re
import random
+from six.moves import input
+
reflections = {
"i am": "you are",
self._regex = self._compile_reflections()
def _compile_reflections(self):
- sorted_refl = sorted(self._reflections, key=len, reverse=True)
+ sorted_refl = sorted(self._reflections.keys(), key=len, reverse=True)
return re.compile(
r"\b({0})\b".format("|".join(map(re.escape, sorted_refl))), re.IGNORECASE
)
)
def _wildcards(self, response, match):
- pos = response.find("%")
+ pos = response.find('%')
while pos >= 0:
num = int(response[pos + 1 : pos + 2])
response = (
+ self._substitute(match.group(num))
+ response[pos + 2 :]
)
- pos = response.find("%")
+ pos = response.find('%')
return response
def respond(self, str):
resp = self._wildcards(resp, match) # process wildcards
# fix munged punctuation at the end
- if resp[-2:] == "?.":
- resp = resp[:-2] + "."
- if resp[-2:] == "??":
- resp = resp[:-2] + "?"
+ if resp[-2:] == '?.':
+ resp = resp[:-2] + '.'
+ if resp[-2:] == '??':
+ resp = resp[:-2] + '?'
return resp
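# Illustrative usage sketch (not part of the original module; the toy pair below
# is hypothetical, but it shows how %N wildcards and the reflections mapping
# interact inside respond()):
#     >>> bot = Chat(((r'I like (.*)', ('Why do you like %1?',)),), reflections)
#     >>> bot.respond('I like my dog')
#     'Why do you like your dog?'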
# Hold a conversation with a chatbot
# Natural Language Toolkit: Zen Chatbot
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Amy Holland <amyrh@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
respond to a question by asking a different question, in much the same way
as Eliza.
"""
+from __future__ import print_function
from nltk.chat.util import Chat, reflections
# "good day" etc, but also "good grief!" and other sentences starting
# with the word 'good' that may not be a greeting
(
- r"(hello(.*))|(good [a-zA-Z]+)",
+ r'(hello(.*))|(good [a-zA-Z]+)',
(
"The path to enlightenment is often difficult to see.",
"Greetings. I sense your mind is troubled. Tell me of your troubles.",
# interpretation only makes sense for some inputs
#
(
- r"i need (.*)",
+ r'i need (.*)',
(
"%1 can be achieved by hard work and dedication of the mind.",
"%1 is not a need, but a desire of the mind. Clear your mind of such concerns.",
),
),
(
- r"i want (.*)",
+ r'i want (.*)',
(
"Desires of the heart will distract you from the path to enlightenment.",
"Will%1 help you attain enlightenment?",
# chatbot: "Are you sure I tell you?"
# - this style works for positives (e.g. "why do you like cake?")
# but does not work for negatives (e.g. "why don't you like cake?")
- (r"why (.*) i (.*)\?", ("You%1%2?", "Perhaps you only think you%1%2")),
- (r"why (.*) you(.*)\?", ("Why%1 you%2?", "%2 I%1", "Are you sure I%2?")),
- (r"why (.*)\?", ("I cannot tell you why%1.", "Why do you think %1?")),
+ (r'why (.*) i (.*)\?', ("You%1%2?", "Perhaps you only think you%1%2")),
+ (r'why (.*) you(.*)\?', ("Why%1 you%2?", "%2 I%1", "Are you sure I%2?")),
+ (r'why (.*)\?', ("I cannot tell you why%1.", "Why do you think %1?")),
# e.g. "are you listening?", "are you a duck"
(
- r"are you (.*)\?",
+ r'are you (.*)\?',
("Maybe%1, maybe not%1.", "Whether I am%1 or not is God's business."),
),
# e.g. "am I a duck?", "am I going to die?"
(
- r"am i (.*)\?",
+ r'am i (.*)\?',
("Perhaps%1, perhaps not%1.", "Whether you are%1 or not is not for me to say."),
),
# what questions, e.g. "what time is it?"
# problems:
# person: "What do you want?"
# chatbot: "Seek truth, not what do me want."
- (r"what (.*)\?", ("Seek truth, not what%1.", "What%1 should not concern you.")),
+ (r'what (.*)\?', ("Seek truth, not what%1.", "What%1 should not concern you.")),
# how questions, e.g. "how do you do?"
(
- r"how (.*)\?",
+ r'how (.*)\?',
(
"How do you suppose?",
"Will an answer to that really help in your search for enlightenment?",
),
# can questions, e.g. "can you run?", "can you come over here please?"
(
- r"can you (.*)\?",
+ r'can you (.*)\?',
(
"I probably can, but I may not.",
"Maybe I can%1, and maybe I cannot.",
),
# can questions, e.g. "can I have some cake?", "can I know truth?"
(
- r"can i (.*)\?",
+ r'can i (.*)\?',
(
"You can%1 if you believe you can%1, and have a pure spirit.",
"Seek truth and you will know if you can%1.",
),
# e.g. "It is raining" - implies the speaker is certain of a fact
(
- r"it is (.*)",
+ r'it is (.*)',
(
"How can you be certain that%1, when you do not even know yourself?",
"Whether it is%1 or not does not change the way the world is.",
),
# e.g. "is there a doctor in the house?"
(
- r"is there (.*)\?",
+ r'is there (.*)\?',
("There is%1 if you believe there is.", "It is possible that there is%1."),
),
# e.g. "is it possible?", "is this true?"
- (r"is(.*)\?", ("%1 is not relevant.", "Does this matter?")),
+ (r'is(.*)\?', ("%1 is not relevant.", "Does this matter?")),
# non-specific question
(
- r"(.*)\?",
+ r'(.*)\?',
(
"Do you think %1?",
"You seek the truth. Does the truth seek you?",
),
# expression of hate of form "I hate you" or "Kelly hates cheese"
(
- r"(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)",
+ r'(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)',
(
"Perhaps it is not about hating %2, but about hate from within.",
"Weeds only grow when we dislike them",
),
# statement containing the word 'truth'
(
- r"(.*) truth(.*)",
+ r'(.*) truth(.*)',
(
"Seek truth, and truth will seek you.",
"Remember, it is not the spoon which bends - only yourself.",
# desire to do an action
# e.g. "I want to go shopping"
(
- r"i want to (.*)",
+ r'i want to (.*)',
("You may %1 if your heart truly desires to.", "You may have to %1."),
),
# desire for an object
# e.g. "I want a pony"
(
- r"i want (.*)",
+ r'i want (.*)',
(
"Does your heart truly desire %1?",
"Is this a desire of the heart, or of the mind?",
),
# e.g. "I can't wait" or "I can't do this"
(
- r"i can\'t (.*)",
+ r'i can\'t (.*)',
(
"What we can and can't do is a limitation of the mind.",
"There are limitations of the body, and limitations of the mind.",
# problem: exceptions...
# e.g. "I think, therefore I am"
(
- r"i think (.*)",
+ r'i think (.*)',
(
"Uncertainty in an uncertain world.",
"Indeed, how can we be certain of anything in such uncertain times.",
),
# "I feel...emotions/sick/light-headed..."
(
- r"i feel (.*)",
+ r'i feel (.*)',
(
"Your body and your emotions are both symptoms of your mind."
"What do you believe is the root of such feelings?",
# exclamation mark indicating emotion
# e.g. "Wow!" or "No!"
(
- r"(.*)!",
+ r'(.*)!',
(
"I sense that you are feeling emotional today.",
"You need to calm your emotions.",
# because [statement]
# e.g. "because I said so"
(
- r"because (.*)",
+ r'because (.*)',
(
"Does knowning the reasons behind things help you to understand"
" the things themselves?",
),
# yes or no - raise an issue of certainty/correctness
(
- r"(yes)|(no)",
+ r'(yes)|(no)',
(
"Is there certainty in an uncertain world?",
"It is better to be right than to be certain.",
),
# sentence containing word 'love'
(
- r"(.*)love(.*)",
+ r'(.*)love(.*)',
(
"Think of the trees: they let the birds perch and fly with no intention to call them when they come, and no longing for their return when they fly away. Let your heart be like the trees.",
"Free love!",
),
# sentence containing the word 'understand'
(
- r"(.*)understand(.*)",
+ r'(.*)understand(.*)',
(
"If you understand, things are just as they are;"
" if you do not understand, things are just as they are.",
# 'I', 'me', 'my' - the person is talking about themselves.
# this breaks down when words contain these substrings, e.g. 'Thyme', 'Irish'
(
- r"(.*)(me )|( me)|(my)|(mine)|(i)(.*)",
+ r'(.*)(me )|( me)|(my)|(mine)|(i)(.*)',
(
"'I', 'me', 'my'... these are selfish expressions.",
"Have you ever considered that you might be a selfish person?",
# 'you' starting a sentence
# e.g. "you stink!"
(
- r"you (.*)",
+ r'you (.*)',
("My path is not of conern to you.", "I am but one, and you but one more."),
),
# say goodbye with some extra Zen wisdom.
(
- r"exit",
+ r'exit',
(
"Farewell. The obstacle is the path.",
"Farewell. Life is a journey, not a destination.",
# when stumped, respond with generic zen wisdom
#
(
- r"(.*)",
+ r'(.*)',
(
"When you're enlightened, every word is wisdom.",
"Random talk is useless.",
def zen_chat():
- print("*" * 75)
+ print('*' * 75)
print("Zen Chatbot!".center(75))
- print("*" * 75)
+ print('*' * 75)
print('"Look beyond mere words and letters - look into your mind"'.center(75))
print("* Talk your way to truth with Zen Chatbot.")
print("* Type 'quit' when you have had enough.")
- print("*" * 75)
+ print('*' * 75)
print("Welcome, my child.")
zen_chatbot.converse()
# Natural Language Toolkit: Chunkers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
from nltk.chunk.regexp import RegexpChunkParser, RegexpParser
# Standard treebank POS tagger
-_BINARY_NE_CHUNKER = "chunkers/maxent_ne_chunker/english_ace_binary.pickle"
-_MULTICLASS_NE_CHUNKER = "chunkers/maxent_ne_chunker/english_ace_multiclass.pickle"
+_BINARY_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_binary.pickle'
+_MULTICLASS_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_multiclass.pickle'
def ne_chunk(tagged_tokens, binary=False):
# Natural Language Toolkit: Chunk parsing API
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
# Natural Language Toolkit: Chunk parsing API
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Named entity chunker
"""
+from __future__ import print_function
+from __future__ import unicode_literals
import os, re, pickle
from xml.etree import ElementTree as ET
def _classifier_builder(self, train):
return MaxentClassifier.train(
- train, algorithm="megam", gaussian_prior_sigma=1, trace=2
+ train, algorithm='megam', gaussian_prior_sigma=1, trace=2
)
def _english_wordlist(self):
except AttributeError:
from nltk.corpus import words
- self._en_wordlist = set(words.words("en-basic"))
+ self._en_wordlist = set(words.words('en-basic'))
wl = self._en_wordlist
return wl
# 89.6
features = {
- "bias": True,
- "shape": shape(word),
- "wordlen": len(word),
- "prefix3": word[:3].lower(),
- "suffix3": word[-3:].lower(),
- "pos": pos,
- "word": word,
- "en-wordlist": (word in self._english_wordlist()),
- "prevtag": prevtag,
- "prevpos": prevpos,
- "nextpos": nextpos,
- "prevword": prevword,
- "nextword": nextword,
- "word+nextpos": "{0}+{1}".format(word.lower(), nextpos),
- "pos+prevtag": "{0}+{1}".format(pos, prevtag),
- "shape+prevtag": "{0}+{1}".format(prevshape, prevtag),
+ 'bias': True,
+ 'shape': shape(word),
+ 'wordlen': len(word),
+ 'prefix3': word[:3].lower(),
+ 'suffix3': word[-3:].lower(),
+ 'pos': pos,
+ 'word': word,
+ 'en-wordlist': (word in self._english_wordlist()),
+ 'prevtag': prevtag,
+ 'prevpos': prevpos,
+ 'nextpos': nextpos,
+ 'prevword': prevword,
+ 'nextword': nextword,
+ 'word+nextpos': '{0}+{1}'.format(word.lower(), nextpos),
+ 'pos+prevtag': '{0}+{1}'.format(pos, prevtag),
+ 'shape+prevtag': '{0}+{1}'.format(prevshape, prevtag),
}
return features
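# For illustration only (hypothetical token, not from the original source): a
# word like "London" tagged NNP would contribute entries such as
# 'shape': 'upcase', 'wordlen': 6, 'prefix3': 'lon', 'suffix3': 'don',
# 'pos': 'NNP', plus the surrounding-context features above; whether
# 'en-wordlist' is True depends on the basic English word list.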
"""
Convert a list of tagged tokens to a chunk-parse tree.
"""
- sent = Tree("S", [])
+ sent = Tree('S', [])
for (tok, tag) in tagged_tokens:
- if tag == "O":
+ if tag == 'O':
sent.append(tok)
- elif tag.startswith("B-"):
+ elif tag.startswith('B-'):
sent.append(Tree(tag[2:], [tok]))
- elif tag.startswith("I-"):
+ elif tag.startswith('I-'):
if sent and isinstance(sent[-1], Tree) and sent[-1].label() == tag[2:]:
sent[-1].append(tok)
else:
if len(child) == 0:
print("Warning -- empty chunk in sentence")
continue
- toks.append((child[0], "B-{0}".format(child.label())))
+ toks.append((child[0], 'B-{0}'.format(child.label())))
for tok in child[1:]:
- toks.append((tok, "I-{0}".format(child.label())))
+ toks.append((tok, 'I-{0}'.format(child.label())))
else:
- toks.append((child, "O"))
+ toks.append((child, 'O'))
return toks
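# Illustrative example of the IOB conversion above (plain string leaves used
# for readability):
#     Tree('S', ['He', Tree('NE', ['New', 'York'])])
#     -> [('He', 'O'), ('New', 'B-NE'), ('York', 'I-NE')]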
def shape(word):
- if re.match("[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, re.UNICODE):
- return "number"
- elif re.match("\W+$", word, re.UNICODE):
- return "punct"
- elif re.match("\w+$", word, re.UNICODE):
+ if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word, re.UNICODE):
+ return 'number'
+ elif re.match('\W+$', word, re.UNICODE):
+ return 'punct'
+ elif re.match('\w+$', word, re.UNICODE):
if word.istitle():
- return "upcase"
+ return 'upcase'
elif word.islower():
- return "downcase"
+ return 'downcase'
else:
- return "mixedcase"
+ return 'mixedcase'
else:
- return "other"
+ return 'other'
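# A few sample values (illustrative):
#     shape('1999')  -> 'number'      shape(',')    -> 'punct'
#     shape('Paris') -> 'upcase'      shape('cat')  -> 'downcase'
#     shape('eBay')  -> 'mixedcase'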
def simplify_pos(s):
- if s.startswith("V"):
+ if s.startswith('V'):
return "V"
else:
- return s.split("-")[0]
+ return s.split('-')[0]
def postag_tree(tree):
# Part-of-speech tagging.
words = tree.leaves()
tag_iter = (pos for (word, pos) in pos_tag(words))
- newtree = Tree("S", [])
+ newtree = Tree('S', [])
for child in tree:
if isinstance(child, Tree):
newtree.append(Tree(child.label(), []))
return newtree
-def load_ace_data(roots, fmt="binary", skip_bnews=True):
+def load_ace_data(roots, fmt='binary', skip_bnews=True):
for root in roots:
for root, dirs, files in os.walk(root):
- if root.endswith("bnews") and skip_bnews:
+ if root.endswith('bnews') and skip_bnews:
continue
for f in files:
- if f.endswith(".sgm"):
+ if f.endswith('.sgm'):
for sent in load_ace_file(os.path.join(root, f), fmt):
yield sent
def load_ace_file(textfile, fmt):
- print(" - {0}".format(os.path.split(textfile)[1]))
- annfile = textfile + ".tmx.rdc.xml"
+ print(' - {0}'.format(os.path.split(textfile)[1]))
+ annfile = textfile + '.tmx.rdc.xml'
# Read the xml file, and get a list of entities
entities = []
- with open(annfile, "r") as infile:
+ with open(annfile, 'r') as infile:
xml = ET.parse(infile).getroot()
- for entity in xml.findall("document/entity"):
- typ = entity.find("entity_type").text
- for mention in entity.findall("entity_mention"):
- if mention.get("TYPE") != "NAME":
+ for entity in xml.findall('document/entity'):
+ typ = entity.find('entity_type').text
+ for mention in entity.findall('entity_mention'):
+ if mention.get('TYPE') != 'NAME':
continue # only NEs
- s = int(mention.find("head/charseq/start").text)
- e = int(mention.find("head/charseq/end").text) + 1
+ s = int(mention.find('head/charseq/start').text)
+ e = int(mention.find('head/charseq/end').text) + 1
entities.append((s, e, typ))
# Read the text file, and mark the entities.
- with open(textfile, "r") as infile:
+ with open(textfile, 'r') as infile:
text = infile.read()
# Strip XML tags, since they don't count towards the indices
- text = re.sub("<(?!/?TEXT)[^>]+>", "", text)
+ text = re.sub('<(?!/?TEXT)[^>]+>', '', text)
# Blank out anything before/after <TEXT>
def subfunc(m):
- return " " * (m.end() - m.start() - 6)
+ return ' ' * (m.end() - m.start() - 6)
- text = re.sub("[\s\S]*<TEXT>", subfunc, text)
- text = re.sub("</TEXT>[\s\S]*", "", text)
+ text = re.sub('[\s\S]*<TEXT>', subfunc, text)
+ text = re.sub('</TEXT>[\s\S]*', '', text)
# Simplify quotes
text = re.sub("``", ' "', text)
entity_types = set(typ for (s, e, typ) in entities)
# Binary distinction (NE or not NE)
- if fmt == "binary":
+ if fmt == 'binary':
i = 0
- toks = Tree("S", [])
+ toks = Tree('S', [])
for (s, e, typ) in sorted(entities):
if s < i:
s = i # Overlapping! Deal with this better?
if e <= s:
continue
toks.extend(word_tokenize(text[i:s]))
- toks.append(Tree("NE", text[s:e].split()))
+ toks.append(Tree('NE', text[s:e].split()))
i = e
toks.extend(word_tokenize(text[i:]))
yield toks
# Multiclass distinction (NE type)
- elif fmt == "multiclass":
+ elif fmt == 'multiclass':
i = 0
- toks = Tree("S", [])
+ toks = Tree('S', [])
for (s, e, typ) in sorted(entities):
if s < i:
s = i # Overlapping! Deal with this better?
yield toks
else:
- raise ValueError("bad fmt value")
+ raise ValueError('bad fmt value')
# This probably belongs in a more general-purpose location (as does
guessed = NEChunkParser._parse_to_tagged(guessed)
ellipsis = False
for (w, ct), (w, gt) in zip(correct, guessed):
- if ct == gt == "O":
+ if ct == gt == 'O':
if not ellipsis:
print(" {:15} {:15} {2}".format(ct, gt, w))
- print(" {:15} {:15} {2}".format("...", "...", "..."))
+ print(' {:15} {:15} {2}'.format('...', '...', '...'))
ellipsis = True
else:
ellipsis = False
print(" {:15} {:15} {2}".format(ct, gt, w))
-def build_model(fmt="binary"):
- print("Loading training data...")
+def build_model(fmt='binary'):
+ print('Loading training data...')
train_paths = [
- find("corpora/ace_data/ace.dev"),
- find("corpora/ace_data/ace.heldout"),
- find("corpora/ace_data/bbn.dev"),
- find("corpora/ace_data/muc.dev"),
+ find('corpora/ace_data/ace.dev'),
+ find('corpora/ace_data/ace.heldout'),
+ find('corpora/ace_data/bbn.dev'),
+ find('corpora/ace_data/muc.dev'),
]
train_trees = load_ace_data(train_paths, fmt)
train_data = [postag_tree(t) for t in train_trees]
- print("Training...")
+ print('Training...')
cp = NEChunkParser(train_data)
del train_data
- print("Loading eval data...")
- eval_paths = [find("corpora/ace_data/ace.eval")]
+ print('Loading eval data...')
+ eval_paths = [find('corpora/ace_data/ace.eval')]
eval_trees = load_ace_data(eval_paths, fmt)
eval_data = [postag_tree(t) for t in eval_trees]
- print("Evaluating...")
+ print('Evaluating...')
chunkscore = ChunkScore()
for i, correct in enumerate(eval_data):
guess = cp.parse(correct.leaves())
cmp_chunks(correct, guess)
print(chunkscore)
- outfilename = "/tmp/ne_chunker_{0}.pickle".format(fmt)
- print("Saving chunker to {0}...".format(outfilename))
+ outfilename = '/tmp/ne_chunker_{0}.pickle'.format(fmt)
+ print('Saving chunker to {0}...'.format(outfilename))
- with open(outfilename, "wb") as outfile:
+ with open(outfilename, 'wb') as outfile:
pickle.dump(cp, outfile, -1)
return cp
-if __name__ == "__main__":
+if __name__ == '__main__':
# Make sure that the pickled object has the right class name:
from nltk.chunk.named_entity import build_model
- build_model("binary")
- build_model("multiclass")
+ build_model('binary')
+ build_model('multiclass')
# Natural Language Toolkit: Regular Expression Chunkers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+from __future__ import division
import re
+from six import string_types
+
from nltk.tree import Tree
from nltk.chunk.api import ChunkParserI
+from nltk.compat import python_2_unicode_compatible, unicode_repr
##//////////////////////////////////////////////////////
## ChunkString
##//////////////////////////////////////////////////////
+@python_2_unicode_compatible
class ChunkString(object):
"""
A string-based encoding of a particular chunking of a text.
will only match positions that are in chinks.
"""
- CHUNK_TAG_CHAR = r"[^\{\}<>]"
- CHUNK_TAG = r"(<%s+?>)" % CHUNK_TAG_CHAR
+ CHUNK_TAG_CHAR = r'[^\{\}<>]'
+ CHUNK_TAG = r'(<%s+?>)' % CHUNK_TAG_CHAR
- IN_CHUNK_PATTERN = r"(?=[^\{]*\})"
- IN_CHINK_PATTERN = r"(?=[^\}]*(\{|$))"
+ IN_CHUNK_PATTERN = r'(?=[^\{]*\})'
+ IN_CHINK_PATTERN = r'(?=[^\}]*(\{|$))'
# These are used by _verify
- _CHUNK = r"(\{%s+?\})+?" % CHUNK_TAG
- _CHINK = r"(%s+?)+?" % CHUNK_TAG
- _VALID = re.compile(r"^(\{?%s\}?)*?$" % CHUNK_TAG)
- _BRACKETS = re.compile("[^\{\}]+")
- _BALANCED_BRACKETS = re.compile(r"(\{\})*$")
+ _CHUNK = r'(\{%s+?\})+?' % CHUNK_TAG
+ _CHINK = r'(%s+?)+?' % CHUNK_TAG
+ _VALID = re.compile(r'^(\{?%s\}?)*?$' % CHUNK_TAG)
+ _BRACKETS = re.compile('[^\{\}]+')
+ _BALANCED_BRACKETS = re.compile(r'(\{\})*$')
def __init__(self, chunk_struct, debug_level=1):
"""
self._root_label = chunk_struct.label()
self._pieces = chunk_struct[:]
tags = [self._tag(tok) for tok in self._pieces]
- self._str = "<" + "><".join(tags) + ">"
+ self._str = '<' + '><'.join(tags) + '>'
self._debug = debug_level
def _tag(self, tok):
elif isinstance(tok, Tree):
return tok.label()
else:
- raise ValueError("chunk structures must contain tagged " "tokens or trees")
+ raise ValueError('chunk structures must contain tagged ' 'tokens or trees')
def _verify(self, s, verify_tags):
"""
# Check overall form
if not ChunkString._VALID.match(s):
raise ValueError(
- "Transformation generated invalid " "chunkstring:\n %s" % s
+ 'Transformation generated invalid ' 'chunkstring:\n %s' % s
)
# Check that parens are balanced. If the string is long, we
# have to do this in pieces, to avoid a maximum recursion
# depth limit for regular expressions.
- brackets = ChunkString._BRACKETS.sub("", s)
+ brackets = ChunkString._BRACKETS.sub('', s)
for i in range(1 + len(brackets) // 5000):
substr = brackets[i * 5000 : i * 5000 + 5000]
if not ChunkString._BALANCED_BRACKETS.match(substr):
raise ValueError(
- "Transformation generated invalid " "chunkstring:\n %s" % s
+ 'Transformation generated invalid ' 'chunkstring:\n %s' % s
)
if verify_tags <= 0:
return
- tags1 = (re.split(r"[\{\}<>]+", s))[1:-1]
+ tags1 = (re.split(r'[\{\}<>]+', s))[1:-1]
tags2 = [self._tag(piece) for piece in self._pieces]
if tags1 != tags2:
raise ValueError(
- "Transformation generated invalid " "chunkstring: tag changed"
+ 'Transformation generated invalid ' 'chunkstring: tag changed'
)
- def to_chunkstruct(self, chunk_label="CHUNK"):
+ def to_chunkstruct(self, chunk_label='CHUNK'):
"""
Return the chunk structure encoded by this ``ChunkString``.
pieces = []
index = 0
piece_in_chunk = 0
- for piece in re.split("[{}]", self._str):
+ for piece in re.split('[{}]', self._str):
# Find the list of tokens contained in this piece.
- length = piece.count("<")
+ length = piece.count('<')
subsequence = self._pieces[index : index + length]
# Add this list of tokens to our pieces.
# The substitution might have generated "empty chunks"
# (substrings of the form "{}"). Remove them, so they don't
# interfere with other transformations.
- s = re.sub("\{\}", "", s)
+ s = re.sub('\{\}', '', s)
# Make sure that the transformation was legal.
if self._debug > 1:
:rtype: str
"""
- return "<ChunkString: %s>" % repr(self._str)
+ return '<ChunkString: %s>' % unicode_repr(self._str)
def __str__(self):
"""
:rtype: str
"""
# Add spaces to make everything line up.
- str = re.sub(r">(?!\})", r"> ", self._str)
- str = re.sub(r"([^\{])<", r"\1 <", str)
- if str[0] == "<":
- str = " " + str
+ str = re.sub(r'>(?!\})', r'> ', self._str)
+ str = re.sub(r'([^\{])<', r'\1 <', str)
+ if str[0] == '<':
+ str = ' ' + str
return str
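# Illustrative sketch of the internal encoding (assumed input): for the tagged
# sentence [('the', 'DT'), ('dog', 'NN'), ('barked', 'VBD')] the initial
# ChunkString is '<DT><NN><VBD>'; chunking the first two tokens rewrites it to
# '{<DT><NN>}<VBD>', and to_chunkstruct() converts that back into a Tree.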
##//////////////////////////////////////////////////////
+@python_2_unicode_compatible
class RegexpChunkRule(object):
"""
A rule specifying how to modify the chunking in a ``ChunkString``,
:param descr: A short description of the purpose and/or effect
of this rule.
"""
- if isinstance(regexp, str):
+ if isinstance(regexp, string_types):
regexp = re.compile(regexp)
self._repl = repl
self._descr = descr
:rtype: str
"""
return (
- "<RegexpChunkRule: "
- + repr(self._regexp.pattern)
- + "->"
- + repr(self._repl)
- + ">"
+ '<RegexpChunkRule: '
+ + unicode_repr(self._regexp.pattern)
+ + '->'
+ + unicode_repr(self._repl)
+ + '>'
)
@staticmethod
<ChunkRule: '<DT>?<NN.*>+'>
"""
# Split off the comment (but don't split on '\#')
- m = re.match(r"(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?", s)
- rule = m.group("rule").strip()
- comment = (m.group("comment") or "")[1:].strip()
+ m = re.match(r'(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?', s)
+ rule = m.group('rule').strip()
+ comment = (m.group('comment') or '')[1:].strip()
# Pattern bodies: chunk, chink, split, merge
try:
if not rule:
- raise ValueError("Empty chunk pattern")
- if rule[0] == "{" and rule[-1] == "}":
+ raise ValueError('Empty chunk pattern')
+ if rule[0] == '{' and rule[-1] == '}':
return ChunkRule(rule[1:-1], comment)
- elif rule[0] == "}" and rule[-1] == "{":
+ elif rule[0] == '}' and rule[-1] == '{':
return ChinkRule(rule[1:-1], comment)
- elif "}{" in rule:
- left, right = rule.split("}{")
+ elif '}{' in rule:
+ left, right = rule.split('}{')
return SplitRule(left, right, comment)
- elif "{}" in rule:
- left, right = rule.split("{}")
+ elif '{}' in rule:
+ left, right = rule.split('{}')
return MergeRule(left, right, comment)
- elif re.match("[^{}]*{[^{}]*}[^{}]*", rule):
- left, chunk, right = re.split("[{}]", rule)
+ elif re.match('[^{}]*{[^{}]*}[^{}]*', rule):
+ left, chunk, right = re.split('[{}]', rule)
return ChunkRuleWithContext(left, chunk, right, comment)
else:
- raise ValueError("Illegal chunk pattern: %s" % rule)
+ raise ValueError('Illegal chunk pattern: %s' % rule)
except (ValueError, re.error):
- raise ValueError("Illegal chunk pattern: %s" % rule)
+ raise ValueError('Illegal chunk pattern: %s' % rule)
+@python_2_unicode_compatible
class ChunkRule(RegexpChunkRule):
"""
A rule specifying how to add chunks to a ``ChunkString``, using a
"""
self._pattern = tag_pattern
regexp = re.compile(
- "(?P<chunk>%s)%s"
+ '(?P<chunk>%s)%s'
% (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHINK_PATTERN)
)
- RegexpChunkRule.__init__(self, regexp, "{\g<chunk>}", descr)
+ RegexpChunkRule.__init__(self, regexp, '{\g<chunk>}', descr)
def __repr__(self):
"""
:rtype: str
"""
- return "<ChunkRule: " + repr(self._pattern) + ">"
+ return '<ChunkRule: ' + unicode_repr(self._pattern) + '>'
+@python_2_unicode_compatible
class ChinkRule(RegexpChunkRule):
"""
A rule specifying how to remove chinks from a ``ChunkString``,
"""
self._pattern = tag_pattern
regexp = re.compile(
- "(?P<chink>%s)%s"
+ '(?P<chink>%s)%s'
% (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHUNK_PATTERN)
)
- RegexpChunkRule.__init__(self, regexp, "}\g<chink>{", descr)
+ RegexpChunkRule.__init__(self, regexp, '}\g<chink>{', descr)
def __repr__(self):
"""
:rtype: str
"""
- return "<ChinkRule: " + repr(self._pattern) + ">"
+ return '<ChinkRule: ' + unicode_repr(self._pattern) + '>'
+@python_2_unicode_compatible
class UnChunkRule(RegexpChunkRule):
"""
A rule specifying how to remove chunks from a ``ChunkString``,
of this rule.
"""
self._pattern = tag_pattern
- regexp = re.compile("\{(?P<chunk>%s)\}" % tag_pattern2re_pattern(tag_pattern))
- RegexpChunkRule.__init__(self, regexp, "\g<chunk>", descr)
+ regexp = re.compile('\{(?P<chunk>%s)\}' % tag_pattern2re_pattern(tag_pattern))
+ RegexpChunkRule.__init__(self, regexp, '\g<chunk>', descr)
def __repr__(self):
"""
:rtype: str
"""
- return "<UnChunkRule: " + repr(self._pattern) + ">"
+ return '<UnChunkRule: ' + unicode_repr(self._pattern) + '>'
+@python_2_unicode_compatible
class MergeRule(RegexpChunkRule):
"""
A rule specifying how to merge chunks in a ``ChunkString``, using
self._left_tag_pattern = left_tag_pattern
self._right_tag_pattern = right_tag_pattern
regexp = re.compile(
- "(?P<left>%s)}{(?=%s)"
+ '(?P<left>%s)}{(?=%s)'
% (
tag_pattern2re_pattern(left_tag_pattern),
tag_pattern2re_pattern(right_tag_pattern),
)
)
- RegexpChunkRule.__init__(self, regexp, "\g<left>", descr)
+ RegexpChunkRule.__init__(self, regexp, '\g<left>', descr)
def __repr__(self):
"""
:rtype: str
"""
return (
- "<MergeRule: "
- + repr(self._left_tag_pattern)
- + ", "
- + repr(self._right_tag_pattern)
- + ">"
+ '<MergeRule: '
+ + unicode_repr(self._left_tag_pattern)
+ + ', '
+ + unicode_repr(self._right_tag_pattern)
+ + '>'
)
+@python_2_unicode_compatible
class SplitRule(RegexpChunkRule):
"""
A rule specifying how to split chunks in a ``ChunkString``, using
self._left_tag_pattern = left_tag_pattern
self._right_tag_pattern = right_tag_pattern
regexp = re.compile(
- "(?P<left>%s)(?=%s)"
+ '(?P<left>%s)(?=%s)'
% (
tag_pattern2re_pattern(left_tag_pattern),
tag_pattern2re_pattern(right_tag_pattern),
)
)
- RegexpChunkRule.__init__(self, regexp, r"\g<left>}{", descr)
+ RegexpChunkRule.__init__(self, regexp, r'\g<left>}{', descr)
def __repr__(self):
"""
:rtype: str
"""
return (
- "<SplitRule: "
- + repr(self._left_tag_pattern)
- + ", "
- + repr(self._right_tag_pattern)
- + ">"
+ '<SplitRule: '
+ + unicode_repr(self._left_tag_pattern)
+ + ', '
+ + unicode_repr(self._right_tag_pattern)
+ + '>'
)
+@python_2_unicode_compatible
class ExpandLeftRule(RegexpChunkRule):
"""
A rule specifying how to expand chunks in a ``ChunkString`` to the left,
self._left_tag_pattern = left_tag_pattern
self._right_tag_pattern = right_tag_pattern
regexp = re.compile(
- "(?P<left>%s)\{(?P<right>%s)"
+ '(?P<left>%s)\{(?P<right>%s)'
% (
tag_pattern2re_pattern(left_tag_pattern),
tag_pattern2re_pattern(right_tag_pattern),
)
)
- RegexpChunkRule.__init__(self, regexp, "{\g<left>\g<right>", descr)
+ RegexpChunkRule.__init__(self, regexp, '{\g<left>\g<right>', descr)
def __repr__(self):
"""
:rtype: str
"""
return (
- "<ExpandLeftRule: "
- + repr(self._left_tag_pattern)
- + ", "
- + repr(self._right_tag_pattern)
- + ">"
+ '<ExpandLeftRule: '
+ + unicode_repr(self._left_tag_pattern)
+ + ', '
+ + unicode_repr(self._right_tag_pattern)
+ + '>'
)
+@python_2_unicode_compatible
class ExpandRightRule(RegexpChunkRule):
"""
A rule specifying how to expand chunks in a ``ChunkString`` to the
self._left_tag_pattern = left_tag_pattern
self._right_tag_pattern = right_tag_pattern
regexp = re.compile(
- "(?P<left>%s)\}(?P<right>%s)"
+ '(?P<left>%s)\}(?P<right>%s)'
% (
tag_pattern2re_pattern(left_tag_pattern),
tag_pattern2re_pattern(right_tag_pattern),
)
)
- RegexpChunkRule.__init__(self, regexp, "\g<left>\g<right>}", descr)
+ RegexpChunkRule.__init__(self, regexp, '\g<left>\g<right>}', descr)
def __repr__(self):
"""
:rtype: str
"""
return (
- "<ExpandRightRule: "
- + repr(self._left_tag_pattern)
- + ", "
- + repr(self._right_tag_pattern)
- + ">"
+ '<ExpandRightRule: '
+ + unicode_repr(self._left_tag_pattern)
+ + ', '
+ + unicode_repr(self._right_tag_pattern)
+ + '>'
)
+@python_2_unicode_compatible
class ChunkRuleWithContext(RegexpChunkRule):
"""
A rule specifying how to add chunks to a ``ChunkString``, using
self._chunk_tag_pattern = chunk_tag_pattern
self._right_context_tag_pattern = right_context_tag_pattern
regexp = re.compile(
- "(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s"
+ '(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s'
% (
tag_pattern2re_pattern(left_context_tag_pattern),
tag_pattern2re_pattern(chunk_tag_pattern),
ChunkString.IN_CHINK_PATTERN,
)
)
- replacement = r"\g<left>{\g<chunk>}\g<right>"
+ replacement = r'\g<left>{\g<chunk>}\g<right>'
RegexpChunkRule.__init__(self, regexp, replacement, descr)
def __repr__(self):
:rtype: str
"""
- return "<ChunkRuleWithContext: %r, %r, %r>" % (
+ return '<ChunkRuleWithContext: %r, %r, %r>' % (
self._left_context_tag_pattern,
self._chunk_tag_pattern,
self._right_context_tag_pattern,
# this should probably be made more strict than it is -- e.g., it
# currently accepts 'foo'.
CHUNK_TAG_PATTERN = re.compile(
- r"^((%s|<%s>)*)$" % ("([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+", "[^\{\}<>]+")
+ r'^((%s|<%s>)*)$' % ('([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+', '[^\{\}<>]+')
)
``tag_pattern``.
"""
# Clean up the regular expression
- tag_pattern = re.sub(r"\s", "", tag_pattern)
- tag_pattern = re.sub(r"<", "(<(", tag_pattern)
- tag_pattern = re.sub(r">", ")>)", tag_pattern)
+ tag_pattern = re.sub(r'\s', '', tag_pattern)
+ tag_pattern = re.sub(r'<', '(<(', tag_pattern)
+ tag_pattern = re.sub(r'>', ')>)', tag_pattern)
# Check the regular expression
if not CHUNK_TAG_PATTERN.match(tag_pattern):
- raise ValueError("Bad tag pattern: %r" % tag_pattern)
+ raise ValueError('Bad tag pattern: %r' % tag_pattern)
# Replace "." with CHUNK_TAG_CHAR.
# We have to do this after, since it adds {}[]<>s, which would
def reverse_str(str):
lst = list(str)
lst.reverse()
- return "".join(lst)
+ return ''.join(lst)
tc_rev = reverse_str(ChunkString.CHUNK_TAG_CHAR)
reversed = reverse_str(tag_pattern)
- reversed = re.sub(r"\.(?!\\(\\\\)*($|[^\\]))", tc_rev, reversed)
+ reversed = re.sub(r'\.(?!\\(\\\\)*($|[^\\]))', tc_rev, reversed)
tag_pattern = reverse_str(reversed)
return tag_pattern
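# Illustrative conversion, worked through the substitutions above:
#     tag_pattern2re_pattern('<NN>+')  ->  '(<(NN)>)+'
# i.e. each <...> tag spec is wrapped so that it matches one encoded tag in a
# ChunkString, and an unescaped '.' inside a tag is restricted to CHUNK_TAG_CHAR.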
##//////////////////////////////////////////////////////
+@python_2_unicode_compatible
class RegexpChunkParser(ChunkParserI):
"""
A regular expression based chunk parser. ``RegexpChunkParser`` uses a
"""
- def __init__(self, rules, chunk_label="NP", root_label="S", trace=0):
+ def __init__(self, rules, chunk_label='NP', root_label='S', trace=0):
"""
Construct a new ``RegexpChunkParser``.
:param verbose: Whether output should be verbose.
:rtype: None
"""
- print("# Input:")
+ print('# Input:')
print(chunkstr)
for rule in self._rules:
rule.apply(chunkstr)
if verbose:
- print("#", rule.descr() + " (" + repr(rule) + "):")
+ print('#', rule.descr() + ' (' + unicode_repr(rule) + '):')
else:
- print("#", rule.descr() + ":")
+ print('#', rule.descr() + ':')
print(chunkstr)
def _notrace_apply(self, chunkstr):
used to define this ``RegexpChunkParser``.
"""
if len(chunk_struct) == 0:
- print("Warning: parsing empty text")
+ print('Warning: parsing empty text')
return Tree(self._root_label, [])
try:
else:
format = " %s\n %s\n"
for rule in self._rules:
- s += format % (rule.descr(), repr(rule))
+ s += format % (rule.descr(), unicode_repr(rule))
return s[:-1]
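# Minimal usage sketch (illustrative; the rule and sentence are made up):
#     >>> rule = ChunkRule('<DT>?<JJ>*<NN>', 'Chunk determiner/adjective/noun')
#     >>> cp = RegexpChunkParser([rule], chunk_label='NP')
#     >>> sent = Tree('S', [('the', 'DT'), ('little', 'JJ'), ('cat', 'NN'), ('sat', 'VBD')])
#     >>> print(cp.parse(sent))
#     (S (NP the/DT little/JJ cat/NN) sat/VBD)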
##//////////////////////////////////////////////////////
+@python_2_unicode_compatible
class RegexpParser(ChunkParserI):
"""
A grammar based chunk parser. ``chunk.RegexpParser`` uses a set of
"""
- def __init__(self, grammar, root_label="S", loop=1, trace=0):
+ def __init__(self, grammar, root_label='S', loop=1, trace=0):
"""
Create a new chunk parser, from the given start state
and set of chunk patterns.
self._grammar = grammar
self._loop = loop
- if isinstance(grammar, str):
+ if isinstance(grammar, string_types):
self._read_grammar(grammar, root_label, trace)
else:
# Make sure the grammar looks like it has the right type:
type_err = (
- "Expected string or list of RegexpChunkParsers " "for the grammar."
+ 'Expected string or list of RegexpChunkParsers ' 'for the grammar.'
)
try:
grammar = list(grammar)
"""
rules = []
lhs = None
- for line in grammar.split("\n"):
+ for line in grammar.split('\n'):
line = line.strip()
# New stage begins if there's an unescaped ':'
- m = re.match("(?P<nonterminal>(\\.|[^:])*)(:(?P<rule>.*))", line)
+ m = re.match('(?P<nonterminal>(\\.|[^:])*)(:(?P<rule>.*))', line)
if m:
# Record the stage that we just completed.
self._add_stage(rules, lhs, root_label, trace)
# Start a new stage.
- lhs = m.group("nonterminal").strip()
+ lhs = m.group('nonterminal').strip()
rules = []
- line = m.group("rule").strip()
+ line = m.group('rule').strip()
# Skip blank & comment-only lines
- if line == "" or line.startswith("#"):
+ if line == '' or line.startswith('#'):
continue
# Add the rule
"""
if rules != []:
if not lhs:
- raise ValueError("Expected stage marker (eg NP:)")
+ raise ValueError('Expected stage marker (eg NP:)')
parser = RegexpChunkParser(
rules, chunk_label=lhs, root_label=root_label, trace=trace
)
# Evaluate our chunk parser.
chunkscore = chunk.ChunkScore()
- for sentence in text.split("\n"):
+ for sentence in text.split('\n'):
print(sentence)
sentence = sentence.strip()
if not sentence:
continue
gold = chunk.tagstr2tree(sentence)
tokens = gold.leaves()
- test = chunkparser.parse(Tree("S", tokens), trace=1)
+ test = chunkparser.parse(Tree('S', tokens), trace=1)
chunkscore.score(gold, test)
print()
- print("/" + ("=" * 75) + "\\")
- print("Scoring", chunkparser)
- print(("-" * 77))
- print("Precision: %5.1f%%" % (chunkscore.precision() * 100), " " * 4, end=" ")
- print("Recall: %5.1f%%" % (chunkscore.recall() * 100), " " * 6, end=" ")
- print("F-Measure: %5.1f%%" % (chunkscore.f_measure() * 100))
+ print('/' + ('=' * 75) + '\\')
+ print('Scoring', chunkparser)
+ print(('-' * 77))
+ print('Precision: %5.1f%%' % (chunkscore.precision() * 100), ' ' * 4, end=' ')
+ print('Recall: %5.1f%%' % (chunkscore.recall() * 100), ' ' * 6, end=' ')
+ print('F-Measure: %5.1f%%' % (chunkscore.f_measure() * 100))
# Missed chunks.
if chunkscore.missed():
- print("Missed:")
+ print('Missed:')
missed = chunkscore.missed()
for chunk in missed[:10]:
- print(" ", " ".join(map(str, chunk)))
+ print(' ', ' '.join(map(str, chunk)))
if len(chunkscore.missed()) > 10:
- print(" ...")
+ print(' ...')
# Incorrect chunks.
if chunkscore.incorrect():
- print("Incorrect:")
+ print('Incorrect:')
incorrect = chunkscore.incorrect()
for chunk in incorrect[:10]:
- print(" ", " ".join(map(str, chunk)))
+ print(' ', ' '.join(map(str, chunk)))
if len(chunkscore.incorrect()) > 10:
- print(" ...")
+ print(' ...')
- print("\\" + ("=" * 75) + "/")
+ print('\\' + ('=' * 75) + '/')
print()
[ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
"""
- print("*" * 75)
- print("Evaluation text:")
+ print('*' * 75)
+ print('Evaluation text:')
print(text)
- print("*" * 75)
+ print('*' * 75)
print()
grammar = r"""
print("Demonstration of empty grammar:")
cp = chunk.RegexpParser("")
- print(chunk.accuracy(cp, conll2000.chunked_sents("test.txt", chunk_types=("NP",))))
+ print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt', chunk_types=('NP',))))
print()
print("Demonstration of accuracy evaluation using CoNLL tags:")
<DT|JJ>{}<NN.*> # merge det/adj with nouns
"""
cp = chunk.RegexpParser(grammar)
- print(chunk.accuracy(cp, conll2000.chunked_sents("test.txt")[:5]))
+ print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5]))
print()
print("Demonstration of tagged token input")
)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Chunk format conversions
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
import re
from nltk.tree import Tree
from nltk.tag.mapping import map_tag
from nltk.tag.util import str2tuple
+from nltk.compat import python_2_unicode_compatible
##//////////////////////////////////////////////////////
## EVALUATION
self._tp = set()
self._fp = set()
self._fn = set()
- self._max_tp = kwargs.get("max_tp_examples", 100)
- self._max_fp = kwargs.get("max_fp_examples", 100)
- self._max_fn = kwargs.get("max_fn_examples", 100)
- self._chunk_label = kwargs.get("chunk_label", ".*")
+ self._max_tp = kwargs.get('max_tp_examples', 100)
+ self._max_fp = kwargs.get('max_fp_examples', 100)
+ self._max_fn = kwargs.get('max_fn_examples', 100)
+ self._chunk_label = kwargs.get('chunk_label', '.*')
self._tp_num = 0
self._fp_num = 0
self._fn_num = 0
:rtype: str
"""
- return "<ChunkScoring of " + repr(len(self)) + " chunks>"
+ return '<ChunkScoring of ' + repr(len(self)) + ' chunks>'
def __str__(self):
"""
def tagstr2tree(
- s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None
+ s, chunk_label="NP", root_label="S", sep='/', source_tagset=None, target_tagset=None
):
"""
Divide a string of bracketed tagged text into
:rtype: Tree
"""
- WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+")
+ WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')
stack = [Tree(root_label, [])]
for match in WORD_OR_BRACKET.finditer(s):
text = match.group()
- if text[0] == "[":
+ if text[0] == '[':
if len(stack) != 1:
- raise ValueError("Unexpected [ at char {:d}".format(match.start()))
+ raise ValueError('Unexpected [ at char {:d}'.format(match.start()))
chunk = Tree(chunk_label, [])
stack[-1].append(chunk)
stack.append(chunk)
- elif text[0] == "]":
+ elif text[0] == ']':
if len(stack) != 2:
- raise ValueError("Unexpected ] at char {:d}".format(match.start()))
+ raise ValueError('Unexpected ] at char {:d}'.format(match.start()))
stack.pop()
else:
if sep is None:
stack[-1].append((word, tag))
if len(stack) != 1:
- raise ValueError("Expected ] at char {:d}".format(len(s)))
+ raise ValueError('Expected ] at char {:d}'.format(len(s)))
return stack[0]
### CONLL
-_LINE_RE = re.compile("(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
+_LINE_RE = re.compile('(\S+)\s+(\S+)\s+([IOB])-?(\S+)?')
-def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
+def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"):
"""
Return a chunk structure for a single sentence
encoded in the given CONLL 2000 style string.
stack = [Tree(root_label, [])]
- for lineno, line in enumerate(s.split("\n")):
+ for lineno, line in enumerate(s.split('\n')):
if not line.strip():
continue
# Decode the line.
match = _LINE_RE.match(line)
if match is None:
- raise ValueError("Error on line {:d}".format(lineno))
+ raise ValueError('Error on line {:d}'.format(lineno))
(word, tag, state, chunk_type) = match.groups()
# If it's a chunk type we don't care about, treat it as O.
if chunk_types is not None and chunk_type not in chunk_types:
- state = "O"
+ state = 'O'
# For "Begin"/"Outside", finish any completed chunks -
# also do so for "Inside" which don't match the previous token.
- mismatch_I = state == "I" and chunk_type != stack[-1].label()
- if state in "BO" or mismatch_I:
+ mismatch_I = state == 'I' and chunk_type != stack[-1].label()
+ if state in 'BO' or mismatch_I:
if len(stack) == 2:
stack.pop()
# For "Begin", start a new chunk.
- if state == "B" or mismatch_I:
+ if state == 'B' or mismatch_I:
chunk = Tree(chunk_type, [])
stack[-1].append(chunk)
stack.append(chunk)
def conlltags2tree(
- sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False
+ sentence, chunk_types=('NP', 'PP', 'VP'), root_label='S', strict=False
):
"""
Convert the CoNLL IOB format to a tree.
else:
# Treat as O
tree.append((word, postag))
- elif chunktag.startswith("B-"):
+ elif chunktag.startswith('B-'):
tree.append(Tree(chunktag[2:], [(word, postag)]))
- elif chunktag.startswith("I-"):
+ elif chunktag.startswith('I-'):
if (
len(tree) == 0
or not isinstance(tree[-1], Tree)
tree.append(Tree(chunktag[2:], [(word, postag)]))
else:
tree[-1].append((word, postag))
- elif chunktag == "O":
+ elif chunktag == 'O':
tree.append((word, postag))
else:
raise ValueError("Bad conll tag {0!r}".format(chunktag))
:rtype: str
"""
lines = [" ".join(token) for token in tree2conlltags(t)]
- return "\n".join(lines)
+ return '\n'.join(lines)
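# Illustrative example (assumed input tree):
#     Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]), ('sat', 'VBD')])
# tree2conlltags() yields [('the', 'DT', 'B-NP'), ('cat', 'NN', 'I-NP'),
# ('sat', 'VBD', 'O')], and tree2conllstr() joins each triple into one
# whitespace-separated line per token.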
### IEER
_IEER_DOC_RE = re.compile(
- r"<DOC>\s*"
- r"(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?"
- r"(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?"
- r"(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?"
- r"<BODY>\s*"
- r"(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?"
- r"<TEXT>(?P<text>.*?)</TEXT>\s*"
- r"</BODY>\s*</DOC>\s*",
+ r'<DOC>\s*'
+ r'(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?'
+ r'(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?'
+ r'(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?'
+ r'<BODY>\s*'
+ r'(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?'
+ r'<TEXT>(?P<text>.*?)</TEXT>\s*'
+ r'</BODY>\s*</DOC>\s*',
re.DOTALL,
)
# return the empty list in place of a Tree
if s is None:
return []
- for piece_m in re.finditer("<[^>]+>|[^\s<]+", s):
+ for piece_m in re.finditer('<[^>]+>|[^\s<]+', s):
piece = piece_m.group()
try:
- if piece.startswith("<b_"):
+ if piece.startswith('<b_'):
m = _IEER_TYPE_RE.match(piece)
if m is None:
- print("XXXX", piece)
- chunk = Tree(m.group("type"), [])
+ print('XXXX', piece)
+ chunk = Tree(m.group('type'), [])
stack[-1].append(chunk)
stack.append(chunk)
- elif piece.startswith("<e_"):
+ elif piece.startswith('<e_'):
stack.pop()
# elif piece.startswith('<'):
# print "ERROR:", piece
stack[-1].append(piece)
except (IndexError, ValueError):
raise ValueError(
- "Bad IEER string (error at character {:d})".format(piece_m.start())
+ 'Bad IEER string (error at character {:d})'.format(piece_m.start())
)
if len(stack) != 1:
- raise ValueError("Bad IEER string")
+ raise ValueError('Bad IEER string')
return stack[0]
def ieerstr2tree(
s,
chunk_types=[
- "LOCATION",
- "ORGANIZATION",
- "PERSON",
- "DURATION",
- "DATE",
- "CARDINAL",
- "PERCENT",
- "MONEY",
- "MEASURE",
+ 'LOCATION',
+ 'ORGANIZATION',
+ 'PERSON',
+ 'DURATION',
+ 'DATE',
+ 'CARDINAL',
+ 'PERCENT',
+ 'MONEY',
+ 'MEASURE',
],
root_label="S",
):
m = _IEER_DOC_RE.match(s)
if m:
return {
- "text": _ieer_read_text(m.group("text"), root_label),
- "docno": m.group("docno"),
- "doctype": m.group("doctype"),
- "date_time": m.group("date_time"),
+ 'text': _ieer_read_text(m.group('text'), root_label),
+ 'docno': m.group('docno'),
+ 'doctype': m.group('doctype'),
+ 'date_time': m.group('date_time'),
#'headline': m.group('headline')
# we want to capture NEs in the headline too!
- "headline": _ieer_read_text(m.group("headline"), root_label),
+ 'headline': _ieer_read_text(m.group('headline'), root_label),
}
else:
return _ieer_read_text(s, root_label)
s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
import nltk
- t = nltk.chunk.tagstr2tree(s, chunk_label="NP")
+ t = nltk.chunk.tagstr2tree(s, chunk_label='NP')
t.pprint()
print()
. . O
"""
- conll_tree = conllstr2tree(s, chunk_types=("NP", "PP"))
+ conll_tree = conllstr2tree(s, chunk_types=('NP', 'PP'))
conll_tree.pprint()
# Demonstrate CoNLL output
print()
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Classifiers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
>>> from nltk.corpus import gutenberg
>>> for fileid in gutenberg.fileids(): # doctest: +SKIP
... doc = gutenberg.words(fileid) # doctest: +SKIP
- ... print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP
+ ... print fileid, classifier.classify(document_features(doc)) # doctest: +SKIP
The parameters that a feature detector expects will vary, depending on
the task and the needs of the feature detector. For example, a
# Natural Language Toolkit: Classifier Interface
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
# Natural Language Toolkit: Decision Tree Classifiers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
the basis of a tree structure, where branches correspond to conditions
on feature values, and leaves correspond to label assignments.
"""
+from __future__ import print_function, unicode_literals, division
from collections import defaultdict
from nltk.probability import FreqDist, MLEProbDist, entropy
from nltk.classify.api import ClassifierI
+from nltk.compat import python_2_unicode_compatible
+@python_2_unicode_compatible
class DecisionTreeClassifier(ClassifierI):
def __init__(self, label, feature_name=None, decisions=None, default=None):
"""
errors += 1
return errors / len(labeled_featuresets)
- def pretty_format(self, width=70, prefix="", depth=4):
+ def pretty_format(self, width=70, prefix='', depth=4):
"""
Return a string containing a pretty-printed version of this
decision tree. Each line in this string corresponds to a
n = width - len(prefix) - 15
return '{0}{1} {2}\n'.format(prefix, '.' * n, self._label)
s = ''
- for i, (fval, result) in enumerate(sorted(self._decisions.items(),
- key=lambda item:
- (item[0] in [None, False, True], str(item[0]).lower())
- )
- ):
+ for i, (fval, result) in enumerate(sorted(self._decisions.items())):
hdr = '{0}{1}={2}? '.format(prefix, self._fname, fval)
n = width - 15 - len(hdr)
- s += "{0}{1} {2}\n".format(hdr, "." * (n), result._label)
+ s += '{0}{1} {2}\n'.format(hdr, '.' * (n), result._label)
if result._fname is not None and depth > 1:
- s += result.pretty_format(width, prefix + " ", depth - 1)
+ s += result.pretty_format(width, prefix + ' ', depth - 1)
if self._default is not None:
n = width - len(prefix) - 21
- s += "{0}else: {1} {2}\n".format(prefix, "." * n, self._default._label)
+ s += '{0}else: {1} {2}\n'.format(prefix, '.' * n, self._default._label)
if self._default._fname is not None and depth > 1:
- s += self._default.pretty_format(width, prefix + " ", depth - 1)
+ s += self._default.pretty_format(width, prefix + ' ', depth - 1)
return s
- def pseudocode(self, prefix="", depth=4):
+ def pseudocode(self, prefix='', depth=4):
"""
Return a string representation of this decision tree that
expresses the decisions it makes as a nested set of pseudocode
if self._fname is None:
return "{0}return {1!r}\n".format(prefix, self._label)
s = ''
- for (fval, result) in sorted(self._decisions.items(),
- key=lambda item:
- (item[0] in [None, False, True], str(item[0]).lower())
- ):
+ for (fval, result) in sorted(self._decisions.items()):
s += '{0}if {1} == {2!r}: '.format(prefix, self._fname, fval)
if result._fname is not None and depth > 1:
- s += "\n" + result.pseudocode(prefix + " ", depth - 1)
+ s += '\n' + result.pseudocode(prefix + ' ', depth - 1)
else:
- s += "return {0!r}\n".format(result._label)
+ s += 'return {0!r}\n'.format(result._label)
if self._default is not None:
if len(self._decisions) == 1:
- s += "{0}if {1} != {2!r}: ".format(
+ s += '{0}if {1} != {2!r}: '.format(
prefix, self._fname, list(self._decisions.keys())[0]
)
else:
- s += "{0}else: ".format(prefix)
+ s += '{0}else: '.format(prefix)
if self._default._fname is not None and depth > 1:
- s += "\n" + self._default.pseudocode(prefix + " ", depth - 1)
+ s += '\n' + self._default.pseudocode(prefix + ' ', depth - 1)
else:
- s += "return {0!r}\n".format(self._default._label)
+ s += 'return {0!r}\n'.format(self._default._label)
return s
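# Minimal usage sketch (illustrative; the feature names are made up):
#     >>> train = [({'size': 'big'}, 'heavy'), ({'size': 'small'}, 'light')]
#     >>> dt = DecisionTreeClassifier.train(train)
#     >>> dt.classify({'size': 'big'})
#     'heavy'
#     >>> print(dt.pseudocode(depth=2))   # prints the nested if/return decisions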
def __str__(self):
if verbose:
print(
(
- "best stump for {:6d} toks uses {:20} err={:6.4f}".format(
+ 'best stump for {:6d} toks uses {:20} err={:6.4f}'.format(
len(labeled_featuresets), best_stump._fname, best_error
)
)
best_stump = stump
if verbose:
if best_stump._decisions:
- descr = "{0}={1}".format(
+ descr = '{0}={1}'.format(
best_stump._fname, list(best_stump._decisions.keys())[0]
)
else:
- descr = "(default)"
+ descr = '(default)'
print(
(
- "best stump for {:6d} toks uses {:20} err={:6.4f}".format(
+ 'best stump for {:6d} toks uses {:20} err={:6.4f}'.format(
len(labeled_featuresets), descr, best_error
)
)
classifier = names_demo(
f, binary_names_demo_features # DecisionTreeClassifier.train,
)
- print(classifier.pretty_format(depth=7))
+ print(classifier.pp(depth=7))
print(classifier.pseudocode(depth=7))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Maximum Entropy Classifiers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Dmitry Chichkov <dchichkov@gmail.com> (TypedMaxentFeatureEncoding)
# URL: <http://nltk.org/>
performed by classes that implement the ``MaxentFeatureEncodingI``
interface.
"""
+from __future__ import print_function, unicode_literals
+
try:
import numpy
except ImportError:
import os
from collections import defaultdict
+from six import integer_types
+
+from nltk import compat
from nltk.data import gzip_open_unicode
from nltk.util import OrderedDict
from nltk.probability import DictionaryProbDist
from nltk.classify.megam import call_megam, write_megam_file, parse_megam_weights
from nltk.classify.tadm import call_tadm, write_tadm_file, parse_tadm_weights
-__docformat__ = "epytext en"
+__docformat__ = 'epytext en'
######################################################################
# { Classifier Model
######################################################################
+@compat.python_2_unicode_compatible
class MaxentClassifier(ClassifierI):
"""
A maximum entropy classifier (also known as a "conditional
probabilities of each label for that featureset.
"""
descr_width = 50
- TEMPLATE = " %-" + str(descr_width - 2) + "s%s%8.3f"
+ TEMPLATE = ' %-' + str(descr_width - 2) + 's%s%8.3f'
pdist = self.prob_classify(featureset)
labels = sorted(pdist.samples(), key=pdist.prob, reverse=True)
labels = labels[:columns]
print(
- " Feature".ljust(descr_width)
- + "".join("%8s" % (("%s" % l)[:7]) for l in labels)
+ ' Feature'.ljust(descr_width)
+ + ''.join('%8s' % (("%s" % l)[:7]) for l in labels)
)
- print(" " + "-" * (descr_width - 2 + 8 * len(labels)))
+ print(' ' + '-' * (descr_width - 2 + 8 * len(labels)))
sums = defaultdict(int)
for i, label in enumerate(labels):
feature_vector = self._encoding.encode(featureset, label)
else:
score = self._weights[f_id] ** f_val
descr = self._encoding.describe(f_id)
- descr = descr.split(" and label is ")[0] # hack
- descr += " (%s)" % f_val # hack
+ descr = descr.split(' and label is ')[0] # hack
+ descr += ' (%s)' % f_val # hack
if len(descr) > 47:
- descr = descr[:44] + "..."
- print(TEMPLATE % (descr, i * 8 * " ", score))
+ descr = descr[:44] + '...'
+ print(TEMPLATE % (descr, i * 8 * ' ', score))
sums[label] += score
- print(" " + "-" * (descr_width - 1 + 8 * len(labels)))
+ print(' ' + '-' * (descr_width - 1 + 8 * len(labels)))
print(
- " TOTAL:".ljust(descr_width) + "".join("%8.3f" % sums[l] for l in labels)
+ ' TOTAL:'.ljust(descr_width) + ''.join('%8.3f' % sums[l] for l in labels)
)
print(
- " PROBS:".ljust(descr_width)
- + "".join("%8.3f" % pdist.prob(l) for l in labels)
+ ' PROBS:'.ljust(descr_width)
+ + ''.join('%8.3f' % pdist.prob(l) for l in labels)
)
def most_informative_features(self, n=10):
"""
Generates the ranked list of informative features from most to least.
"""
- if hasattr(self, "_most_informative_features"):
+ if hasattr(self, '_most_informative_features'):
return self._most_informative_features[:n]
else:
self._most_informative_features = sorted(
)
return self._most_informative_features[:n]
- def show_most_informative_features(self, n=10, show="all"):
+ def show_most_informative_features(self, n=10, show='all'):
"""
:param show: all, neg, or pos (for negative-only or positive-only)
:type show: str
"""
# Use None to get the full list of ranked features.
fids = self.most_informative_features(None)
- if show == "pos":
+ if show == 'pos':
fids = [fid for fid in fids if self._weights[fid] > 0]
- elif show == "neg":
+ elif show == 'neg':
fids = [fid for fid in fids if self._weights[fid] < 0]
for fid in fids[:n]:
- print("%8.3f %s" % (self._weights[fid], self._encoding.describe(fid)))
+ print('%8.3f %s' % (self._weights[fid], self._encoding.describe(fid)))
def __repr__(self):
- return "<ConditionalExponentialClassifier: %d labels, %d features>" % (
+ return '<ConditionalExponentialClassifier: %d labels, %d features>' % (
len(self._encoding.labels()),
self._encoding.length(),
)
#: A list of the algorithm names that are accepted for the
#: ``train()`` method's ``algorithm`` parameter.
- ALGORITHMS = ["GIS", "IIS", "MEGAM", "TADM"]
+ ALGORITHMS = ['GIS', 'IIS', 'MEGAM', 'TADM']
@classmethod
def train(
log likelihood by less than ``v``.
"""
if algorithm is None:
- algorithm = "iis"
+ algorithm = 'iis'
for key in cutoffs:
if key not in (
- "max_iter",
- "min_ll",
- "min_lldelta",
- "max_acc",
- "min_accdelta",
- "count_cutoff",
- "norm",
- "explicit",
- "bernoulli",
+ 'max_iter',
+ 'min_ll',
+ 'min_lldelta',
+ 'max_acc',
+ 'min_accdelta',
+ 'count_cutoff',
+ 'norm',
+ 'explicit',
+ 'bernoulli',
):
- raise TypeError("Unexpected keyword arg %r" % key)
+ raise TypeError('Unexpected keyword arg %r' % key)
algorithm = algorithm.lower()
- if algorithm == "iis":
+ if algorithm == 'iis':
return train_maxent_classifier_with_iis(
train_toks, trace, encoding, labels, **cutoffs
)
- elif algorithm == "gis":
+ elif algorithm == 'gis':
return train_maxent_classifier_with_gis(
train_toks, trace, encoding, labels, **cutoffs
)
- elif algorithm == "megam":
+ elif algorithm == 'megam':
return train_maxent_classifier_with_megam(
train_toks, trace, encoding, labels, gaussian_prior_sigma, **cutoffs
)
- elif algorithm == "tadm":
+ elif algorithm == 'tadm':
kwargs = cutoffs
- kwargs["trace"] = trace
- kwargs["encoding"] = encoding
- kwargs["labels"] = labels
- kwargs["gaussian_prior_sigma"] = gaussian_prior_sigma
+ kwargs['trace'] = trace
+ kwargs['encoding'] = encoding
+ kwargs['labels'] = labels
+ kwargs['gaussian_prior_sigma'] = gaussian_prior_sigma
return TadmMaxentClassifier.train(train_toks, **kwargs)
else:
- raise ValueError("Unknown algorithm %s" % algorithm)
+ raise ValueError('Unknown algorithm %s' % algorithm)
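# --- Hedged usage sketch (illustrative; not part of the NLTK sources) ---
# The ``algorithm`` argument selects one of ALGORITHMS above, and any extra
# keyword arguments are treated as the training cutoffs listed earlier
# (max_iter, min_lldelta, ...).  The toy featuresets below are made up.
from nltk.classify import MaxentClassifier
toy_train = [({'a': True}, 'x'), ({'a': True, 'b': True}, 'x'), ({'b': True}, 'y')]
toy_clf = MaxentClassifier.train(toy_train, algorithm='iis', trace=0, max_iter=5)
print(toy_clf.classify({'a': True}))        # expected to be 'x' on this toy data
toy_clf.show_most_informative_features(2, show='pos')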
#: Alias for MaxentClassifier.
return self._labels
def describe(self, fid):
- return "no description available"
+ return 'no description available'
class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
"""
if set(mapping.values()) != set(range(len(mapping))):
raise ValueError(
- "Mapping values must be exactly the "
- "set of integers from 0...len(mapping)"
+ 'Mapping values must be exactly the '
+ 'set of integers from 0...len(mapping)'
)
self._labels = list(labels)
def describe(self, f_id):
# Inherit docs.
- if not isinstance(f_id, int):
- raise TypeError("describe() expected an int")
+ if not isinstance(f_id, integer_types):
+ raise TypeError('describe() expected an int')
try:
self._inv_mapping
except AttributeError:
if f_id < len(self._mapping):
(fname, fval, label) = self._inv_mapping[f_id]
- return "%s==%r and label is %r" % (fname, fval, label)
+ return '%s==%r and label is %r' % (fname, fval, label)
elif self._alwayson and f_id in self._alwayson.values():
for (label, f_id2) in self._alwayson.items():
if f_id == f_id2:
- return "label is %r" % label
+ return 'label is %r' % label
elif self._unseen and f_id in self._unseen.values():
for (fname, f_id2) in self._unseen.items():
if f_id == f_id2:
- return "%s is unseen" % fname
+ return '%s is unseen' % fname
else:
- raise ValueError("Bad feature id")
+ raise ValueError('Bad feature id')
def labels(self):
# Inherit docs.
for (tok, label) in train_toks:
if labels and label not in labels:
- raise ValueError("Unexpected label %s" % label)
+ raise ValueError('Unexpected label %s' % label)
seen_labels.add(label)
# Record each of the features.
# Add a correction feature.
total = sum(v for (f, v) in encoding)
if total >= self._C:
- raise ValueError("Correction feature is not high enough!")
+ raise ValueError('Correction feature is not high enough!')
encoding.append((base_length, self._C - total))
# Return the result
def describe(self, f_id):
if f_id == BinaryMaxentFeatureEncoding.length(self):
- return "Correction feature (%s)" % self._C
+ return 'Correction feature (%s)' % self._C
else:
return BinaryMaxentFeatureEncoding.describe(self, f_id)
"""
if set(mapping.values()) != set(range(len(mapping))):
raise ValueError(
- "Mapping values must be exactly the "
- "set of integers from 0...len(mapping)"
+ 'Mapping values must be exactly the '
+ 'set of integers from 0...len(mapping)'
)
self._labels = list(labels)
# Convert input-features to joint-features:
for fname, fval in featureset.items():
- if isinstance(fval, (int, float)):
+ if isinstance(fval, (integer_types, float)):
# Known feature name & value:
if (fname, type(fval), label) in self._mapping:
encoding.append((self._mapping[fname, type(fval), label], fval))
def describe(self, f_id):
# Inherit docs.
- if not isinstance(f_id, int):
- raise TypeError("describe() expected an int")
+ if not isinstance(f_id, integer_types):
+ raise TypeError('describe() expected an int')
try:
self._inv_mapping
except AttributeError:
if f_id < len(self._mapping):
(fname, fval, label) = self._inv_mapping[f_id]
- return "%s==%r and label is %r" % (fname, fval, label)
+ return '%s==%r and label is %r' % (fname, fval, label)
elif self._alwayson and f_id in self._alwayson.values():
for (label, f_id2) in self._alwayson.items():
if f_id == f_id2:
- return "label is %r" % label
+ return 'label is %r' % label
elif self._unseen and f_id in self._unseen.values():
for (fname, f_id2) in self._unseen.items():
if f_id == f_id2:
- return "%s is unseen" % fname
+ return '%s is unseen' % fname
else:
- raise ValueError("Bad feature id")
+ raise ValueError('Bad feature id')
def labels(self):
# Inherit docs.
for (tok, label) in train_toks:
if labels and label not in labels:
- raise ValueError("Unexpected label %s" % label)
+ raise ValueError('Unexpected label %s' % label)
seen_labels.add(label)
# Record each of the features.
:see: ``train_maxent_classifier()`` for parameter descriptions.
"""
- cutoffs.setdefault("max_iter", 100)
+ cutoffs.setdefault('max_iter', 100)
cutoffchecker = CutoffChecker(cutoffs)
# Construct an encoding from the training data.
if encoding is None:
encoding = GISEncoding.train(train_toks, labels=labels)
- if not hasattr(encoding, "C"):
+ if not hasattr(encoding, 'C'):
raise TypeError(
- "The GIS algorithm requires an encoding that "
- "defines C (e.g., GISEncoding)."
+ 'The GIS algorithm requires an encoding that '
+ 'defines C (e.g., GISEncoding).'
)
# Cinv is the inverse of the sum of each joint feature vector.
# Build the classifier. Start with weight=0 for each attested
# feature, and weight=-infinity for each unattested feature.
- weights = numpy.zeros(len(empirical_fcount), "d")
+ weights = numpy.zeros(len(empirical_fcount), 'd')
for fid in unattested:
weights[fid] = numpy.NINF
classifier = ConditionalExponentialClassifier(encoding, weights)
del empirical_fcount
if trace > 0:
- print(" ==> Training (%d iterations)" % cutoffs["max_iter"])
+ print(' ==> Training (%d iterations)' % cutoffs['max_iter'])
if trace > 2:
print()
- print(" Iteration Log Likelihood Accuracy")
- print(" ---------------------------------------")
+ print(' Iteration Log Likelihood Accuracy')
+ print(' ---------------------------------------')
# Train the classifier.
try:
ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
acc = cutoffchecker.acc or accuracy(classifier, train_toks)
iternum = cutoffchecker.iter
- print(" %9d %14.5f %9.3f" % (iternum, ll, acc))
+ print(' %9d %14.5f %9.3f' % (iternum, ll, acc))
# Use the model to estimate the number of times each
# feature should occur in the training data.
break
except KeyboardInterrupt:
- print(" Training stopped: keyboard interrupt")
+ print(' Training stopped: keyboard interrupt')
except:
raise
if trace > 2:
ll = log_likelihood(classifier, train_toks)
acc = accuracy(classifier, train_toks)
- print(" Final %14.5f %9.3f" % (ll, acc))
+ print(' Final %14.5f %9.3f' % (ll, acc))
# Return the classifier.
return classifier
def calculate_empirical_fcount(train_toks, encoding):
- fcount = numpy.zeros(encoding.length(), "d")
+ fcount = numpy.zeros(encoding.length(), 'd')
for tok, label in train_toks:
for (index, val) in encoding.encode(tok, label):
def calculate_estimated_fcount(classifier, train_toks, encoding):
- fcount = numpy.zeros(encoding.length(), "d")
+ fcount = numpy.zeros(encoding.length(), 'd')
for tok, label in train_toks:
pdist = classifier.prob_classify(tok)
:see: ``train_maxent_classifier()`` for parameter descriptions.
"""
- cutoffs.setdefault("max_iter", 100)
+ cutoffs.setdefault('max_iter', 100)
cutoffchecker = CutoffChecker(cutoffs)
# Construct an encoding from the training data.
# nfarray performs the reverse operation. nfident is
# nfarray multiplied by an identity matrix.
nfmap = calculate_nfmap(train_toks, encoding)
- nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), "d")
+ nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), 'd')
nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))
# Check for any features that are not attested in train_toks.
# Build the classifier. Start with weight=0 for each attested
# feature, and weight=-infinity for each unattested feature.
- weights = numpy.zeros(len(empirical_ffreq), "d")
+ weights = numpy.zeros(len(empirical_ffreq), 'd')
for fid in unattested:
weights[fid] = numpy.NINF
classifier = ConditionalExponentialClassifier(encoding, weights)
if trace > 0:
- print(" ==> Training (%d iterations)" % cutoffs["max_iter"])
+ print(' ==> Training (%d iterations)' % cutoffs['max_iter'])
if trace > 2:
print()
- print(" Iteration Log Likelihood Accuracy")
- print(" ---------------------------------------")
+ print(' Iteration Log Likelihood Accuracy')
+ print(' ---------------------------------------')
# Train the classifier.
try:
ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
acc = cutoffchecker.acc or accuracy(classifier, train_toks)
iternum = cutoffchecker.iter
- print(" %9d %14.5f %9.3f" % (iternum, ll, acc))
+ print(' %9d %14.5f %9.3f' % (iternum, ll, acc))
# Calculate the deltas for this iteration, using Newton's method.
deltas = calculate_deltas(
break
except KeyboardInterrupt:
- print(" Training stopped: keyboard interrupt")
+ print(' Training stopped: keyboard interrupt')
except:
raise
if trace > 2:
ll = log_likelihood(classifier, train_toks)
acc = accuracy(classifier, train_toks)
- print(" Final %14.5f %9.3f" % (ll, acc))
+ print(' Final %14.5f %9.3f' % (ll, acc))
# Return the classifier.
return classifier
NEWTON_CONVERGE = 1e-12
MAX_NEWTON = 300
- deltas = numpy.ones(encoding.length(), "d")
+ deltas = numpy.ones(encoding.length(), 'd')
# Precompute the A matrix:
# A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) )
# over all label,fs s.t. num_features[label,fs]=nf
- A = numpy.zeros((len(nfmap), encoding.length()), "d")
+ A = numpy.zeros((len(nfmap), encoding.length()), 'd')
for tok, label in train_toks:
dist = classifier.prob_classify(tok)
explicit = True
bernoulli = True
- if "explicit" in kwargs:
- explicit = kwargs["explicit"]
- if "bernoulli" in kwargs:
- bernoulli = kwargs["bernoulli"]
+ if 'explicit' in kwargs:
+ explicit = kwargs['explicit']
+ if 'bernoulli' in kwargs:
+ bernoulli = kwargs['bernoulli']
# Construct an encoding from the training data.
if encoding is None:
# Count cutoff can also be controlled by megam with the -minfc
# option. Not sure where the best place for it is.
- count_cutoff = kwargs.get("count_cutoff", 0)
+ count_cutoff = kwargs.get('count_cutoff', 0)
encoding = BinaryMaxentFeatureEncoding.train(
train_toks, count_cutoff, labels=labels, alwayson_features=True
)
elif labels is not None:
- raise ValueError("Specify encoding or labels, not both")
+ raise ValueError('Specify encoding or labels, not both')
# Write a training file for megam.
try:
- fd, trainfile_name = tempfile.mkstemp(prefix="nltk-")
- with open(trainfile_name, "w") as trainfile:
+ fd, trainfile_name = tempfile.mkstemp(prefix='nltk-')
+ with open(trainfile_name, 'w') as trainfile:
write_megam_file(
train_toks, encoding, trainfile, explicit=explicit, bernoulli=bernoulli
)
os.close(fd)
except (OSError, IOError, ValueError) as e:
- raise ValueError("Error while creating megam training file: %s" % e)
+ raise ValueError('Error while creating megam training file: %s' % e)
# Run megam on the training file.
options = []
- options += ["-nobias", "-repeat", "10"]
+ options += ['-nobias', '-repeat', '10']
if explicit:
- options += ["-explicit"]
+ options += ['-explicit']
if not bernoulli:
- options += ["-fvals"]
+ options += ['-fvals']
if gaussian_prior_sigma:
# Lambda is just the precision of the Gaussian prior, i.e. it's the
# inverse variance, so the parameter conversion is 1.0/sigma**2.
inv_variance = 1.0 / gaussian_prior_sigma ** 2
else:
inv_variance = 0
- options += ["-lambda", "%.2f" % inv_variance, "-tune"]
+ options += ['-lambda', '%.2f' % inv_variance, '-tune']
if trace < 3:
- options += ["-quiet"]
- if "max_iter" in kwargs:
- options += ["-maxi", "%s" % kwargs["max_iter"]]
- if "ll_delta" in kwargs:
+ options += ['-quiet']
+ if 'max_iter' in kwargs:
+ options += ['-maxi', '%s' % kwargs['max_iter']]
+ if 'll_delta' in kwargs:
# [xx] this is actually a perplexity delta, not a log
# likelihood delta
- options += ["-dpp", "%s" % abs(kwargs["ll_delta"])]
- if hasattr(encoding, "cost"):
- options += ["-multilabel"] # each possible la
- options += ["multiclass", trainfile_name]
+ options += ['-dpp', '%s' % abs(kwargs['ll_delta'])]
+ if hasattr(encoding, 'cost'):
+ options += ['-multilabel'] # each possible la
+ options += ['multiclass', trainfile_name]
stdout = call_megam(options)
- # print('./megam_i686.opt ', ' '.join(options))
+ # print './megam_i686.opt ', ' '.join(options)
# Delete the training file
try:
os.remove(trainfile_name)
except (OSError, IOError) as e:
- print("Warning: unable to delete %s: %s" % (trainfile_name, e))
+ print('Warning: unable to delete %s: %s' % (trainfile_name, e))
# Parse the generated weight vector.
weights = parse_megam_weights(stdout, encoding.length(), explicit)
class TadmMaxentClassifier(MaxentClassifier):
@classmethod
def train(cls, train_toks, **kwargs):
- algorithm = kwargs.get("algorithm", "tao_lmvm")
- trace = kwargs.get("trace", 3)
- encoding = kwargs.get("encoding", None)
- labels = kwargs.get("labels", None)
- sigma = kwargs.get("gaussian_prior_sigma", 0)
- count_cutoff = kwargs.get("count_cutoff", 0)
- max_iter = kwargs.get("max_iter")
- ll_delta = kwargs.get("min_lldelta")
+ algorithm = kwargs.get('algorithm', 'tao_lmvm')
+ trace = kwargs.get('trace', 3)
+ encoding = kwargs.get('encoding', None)
+ labels = kwargs.get('labels', None)
+ sigma = kwargs.get('gaussian_prior_sigma', 0)
+ count_cutoff = kwargs.get('count_cutoff', 0)
+ max_iter = kwargs.get('max_iter')
+ ll_delta = kwargs.get('min_lldelta')
# Construct an encoding from the training data.
if not encoding:
)
trainfile_fd, trainfile_name = tempfile.mkstemp(
- prefix="nltk-tadm-events-", suffix=".gz"
+ prefix='nltk-tadm-events-', suffix='.gz'
)
- weightfile_fd, weightfile_name = tempfile.mkstemp(prefix="nltk-tadm-weights-")
+ weightfile_fd, weightfile_name = tempfile.mkstemp(prefix='nltk-tadm-weights-')
- trainfile = gzip_open_unicode(trainfile_name, "w")
+ trainfile = gzip_open_unicode(trainfile_name, 'w')
write_tadm_file(train_toks, encoding, trainfile)
trainfile.close()
options = []
- options.extend(["-monitor"])
- options.extend(["-method", algorithm])
+ options.extend(['-monitor'])
+ options.extend(['-method', algorithm])
if sigma:
- options.extend(["-l2", "%.6f" % sigma ** 2])
+ options.extend(['-l2', '%.6f' % sigma ** 2])
if max_iter:
- options.extend(["-max_it", "%d" % max_iter])
+ options.extend(['-max_it', '%d' % max_iter])
if ll_delta:
- options.extend(["-fatol", "%.6f" % abs(ll_delta)])
- options.extend(["-events_in", trainfile_name])
- options.extend(["-params_out", weightfile_name])
+ options.extend(['-fatol', '%.6f' % abs(ll_delta)])
+ options.extend(['-events_in', trainfile_name])
+ options.extend(['-params_out', weightfile_name])
if trace < 3:
- options.extend(["2>&1"])
+ options.extend(['2>&1'])
else:
- options.extend(["-summary"])
+ options.extend(['-summary'])
call_tadm(options)
- with open(weightfile_name, "r") as weightfile:
+ with open(weightfile_name, 'r') as weightfile:
weights = parse_tadm_weights(weightfile)
os.remove(trainfile_name)
classifier = names_demo(MaxentClassifier.train)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Interface to Megam Classifier
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
.. _megam: http://www.umiacs.umd.edu/~hal/megam/index.html
"""
+from __future__ import print_function
+
import subprocess
+from six import string_types
+
+from nltk import compat
from nltk.internals import find_binary
try:
"""
global _megam_bin
_megam_bin = find_binary(
- "megam",
+ 'megam',
bin,
- env_vars=["MEGAM"],
- binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"],
- url="http://www.umiacs.umd.edu/~hal/megam/index.html",
+ env_vars=['MEGAM'],
+ binary_names=['megam.opt', 'megam', 'megam_686', 'megam_i686.opt'],
+ url='http://www.umiacs.umd.edu/~hal/megam/index.html',
)
# Write the file, which contains one line per instance.
for featureset, label in train_toks:
# First, the instance number (or, in the weighted multiclass case, the cost of each label).
- if hasattr(encoding, "cost"):
+ if hasattr(encoding, 'cost'):
stream.write(
- ":".join(str(encoding.cost(featureset, label, l)) for l in labels)
+ ':'.join(str(encoding.cost(featureset, label, l)) for l in labels)
)
else:
- stream.write("%d" % labelnum[label])
+ stream.write('%d' % labelnum[label])
# For implicit file formats, just list the features that fire
# for this instance's actual label.
# any of the possible labels.
else:
for l in labels:
- stream.write(" #")
+ stream.write(' #')
_write_megam_features(encoding.encode(featureset, l), stream, bernoulli)
# End of the instance.
- stream.write("\n")
+ stream.write('\n')
def parse_megam_weights(s, features_count, explicit=True):
vector. This function does not currently handle bias features.
"""
if numpy is None:
- raise ValueError("This function requires that numpy be installed")
- assert explicit, "non-explicit not supported yet"
- lines = s.strip().split("\n")
- weights = numpy.zeros(features_count, "d")
+ raise ValueError('This function requires that numpy be installed')
+ assert explicit, 'non-explicit not supported yet'
+ lines = s.strip().split('\n')
+ weights = numpy.zeros(features_count, 'd')
for line in lines:
if line.strip():
fid, weight = line.split()
def _write_megam_features(vector, stream, bernoulli):
if not vector:
raise ValueError(
- "MEGAM classifier requires the use of an " "always-on feature."
+ 'MEGAM classifier requires the use of an ' 'always-on feature.'
)
for (fid, fval) in vector:
if bernoulli:
if fval == 1:
- stream.write(" %s" % fid)
+ stream.write(' %s' % fid)
elif fval != 0:
raise ValueError(
- "If bernoulli=True, then all" "features must be binary."
+ 'If bernoulli=True, then all ' 'features must be binary.'
)
else:
- stream.write(" %s %s" % (fid, fval))
+ stream.write(' %s %s' % (fid, fval))
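# --- Hedged sketch (illustrative only): writing a tiny training set in megam's
# text format with the writer above.  BinaryMaxentFeatureEncoding is used just
# to obtain an encoding with the always-on feature that _write_megam_features needs.
import sys
from nltk.classify.maxent import BinaryMaxentFeatureEncoding
toy_train = [({'a': True}, 'x'), ({'b': True}, 'y')]
toy_encoding = BinaryMaxentFeatureEncoding.train(toy_train, alwayson_features=True)
write_megam_file(toy_train, toy_encoding, sys.stdout)   # one line per instance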
def call_megam(args):
"""
Call the ``megam`` binary with the given arguments.
"""
- if isinstance(args, str):
- raise TypeError("args should be a list of strings")
+ if isinstance(args, string_types):
+ raise TypeError('args should be a list of strings')
if _megam_bin is None:
config_megam()
if p.returncode != 0:
print()
print(stderr)
- raise OSError("megam command failed!")
+ raise OSError('megam command failed!')
- if isinstance(stdout, str):
+ if isinstance(stdout, string_types):
return stdout
else:
- return stdout.decode("utf-8")
+ return stdout.decode('utf-8')
# Natural Language Toolkit: Naive Bayes Classifiers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
| P(label|features) = --------------------------------------------
| SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
"""
+from __future__ import print_function, unicode_literals
from collections import defaultdict
if (label, fname) in self._feature_probdist:
break
else:
- # print('Ignoring unseen feature %s' % fname)
+ # print 'Ignoring unseen feature %s' % fname
del featureset[fname]
# Find the log probabilty of each label, given the features.
def show_most_informative_features(self, n=10):
# Determine the most relevant features, and display them.
cpdist = self._feature_probdist
- print("Most Informative Features")
+ print('Most Informative Features')
for (fname, fval) in self.most_informative_features(n):
labels = sorted(
[l for l in self._labels if fval in cpdist[l, fname].samples()],
- key=lambda element: (-labelprob(element), element),
- reverse=True
+ key=labelprob,
)
if len(labels) == 1:
continue
l0 = labels[0]
l1 = labels[-1]
if cpdist[l0, fname].prob(fval) == 0:
- ratio = "INF"
+ ratio = 'INF'
else:
- ratio = "%8.1f" % (
+ ratio = '%8.1f' % (
cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
)
print(
(
- "%24s = %-14r %6s : %-6s = %s : 1.0"
+ '%24s = %-14r %6s : %-6s = %s : 1.0'
% (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
)
)
| max[ P(fname=fval|label1) / P(fname=fval|label2) ]
"""
- if hasattr(self, "_most_informative_features"):
+ if hasattr(self, '_most_informative_features'):
return self._most_informative_features[:n]
else:
# The set of (fname, fval) pairs used by this classifier.
# Convert features to a list, & sort it by how informative
# features are.
self._most_informative_features = sorted(
- features, key=lambda feature_: (minprob[feature_] / maxprob[feature_], feature_[0],
- feature_[1] in [None, False, True], str(feature_[1]).lower())
+ features, key=lambda feature_: minprob[feature_] / maxprob[feature_]
)
return self._most_informative_features[:n]
classifier.show_most_informative_features()
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
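# --- Hedged sketch (illustrative; not part of this diff): the P(label|features)
# formula from the module docstring in action on a toy dataset.  Exact numbers
# depend on the ELE smoothing that NaiveBayesClassifier.train applies by default.
from nltk.classify import NaiveBayesClassifier
toy_train = [({'shape': 'round'}, 'fruit'), ({'shape': 'round'}, 'fruit'),
             ({'shape': 'square'}, 'box')]
toy_clf = NaiveBayesClassifier.train(toy_train)
toy_dist = toy_clf.prob_classify({'shape': 'round'})
print(toy_dist.max(), toy_dist.prob('fruit'))   # 'fruit', with probability well above 0.5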
We use the sports sentences as positive examples, the mixed ones as unlabeled examples:
- >>> positive_featuresets = map(features, sports_sentences)
- >>> unlabeled_featuresets = map(features, various_sentences)
+ >>> positive_featuresets = list(map(features, sports_sentences))
+ >>> unlabeled_featuresets = list(map(features, various_sentences))
>>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
... unlabeled_featuresets)
estimator=ELEProbDist,
):
"""
- :param positive_featuresets: An iterable of featuresets that are known as positive
+ :param positive_featuresets: A list of featuresets that are known as positive
examples (i.e., their label is ``True``).
- :param unlabeled_featuresets: An iterable of featuresets whose label is unknown.
+ :param unlabeled_featuresets: A list of featuresets whose label is unknown.
:param positive_prob_prior: A prior estimate of the probability of the label
``True`` (default 0.5).
fnames = set()
# Count up how many times each feature value occurred in positive examples.
- num_positive_examples = 0
for featureset in positive_featuresets:
for fname, fval in featureset.items():
positive_feature_freqdist[fname][fval] += 1
feature_values[fname].add(fval)
fnames.add(fname)
- num_positive_examples += 1
# Count up how many times each feature value occurred in unlabeled examples.
- num_unlabeled_examples = 0
for featureset in unlabeled_featuresets:
for fname, fval in featureset.items():
unlabeled_feature_freqdist[fname][fval] += 1
feature_values[fname].add(fval)
fnames.add(fname)
- num_unlabeled_examples += 1
# If a feature didn't have a value given for an instance, then we assume that
# it gets the implicit value 'None'.
+ num_positive_examples = len(positive_featuresets)
for fname in fnames:
count = positive_feature_freqdist[fname].N()
positive_feature_freqdist[fname][None] += num_positive_examples - count
feature_values[fname].add(None)
+ num_unlabeled_examples = len(unlabeled_featuresets)
for fname in fnames:
count = unlabeled_feature_freqdist[fname].N()
unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count
# Natural Language Toolkit: RTE Classifier
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
TO DO: better Named Entity classification
TO DO: add lemmatization
"""
+from __future__ import print_function
from nltk.tokenize import RegexpTokenizer
from nltk.classify.util import accuracy, check_megam_config
self.stop = stop
self.stopwords = set(
[
- "a",
- "the",
- "it",
- "they",
- "of",
- "in",
- "to",
- "is",
- "have",
- "are",
- "were",
- "and",
- "very",
- ".",
- ",",
+ 'a',
+ 'the',
+ 'it',
+ 'they',
+ 'of',
+ 'in',
+ 'to',
+ 'is',
+ 'have',
+ 'are',
+ 'were',
+ 'and',
+ 'very',
+ '.',
+ ',',
]
)
- self.negwords = set(["no", "not", "never", "failed", "rejected", "denied"])
+ self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied'])
# Try to tokenize so that abbreviations, monetary amounts, email
# addresses, URLs are single tokens.
- tokenizer = RegexpTokenizer("[\w.@:/]+|\w+|\$[\d.]+")
+ tokenizer = RegexpTokenizer(r'[\w.@:/]+|\w+|\$[\d.]+')
# Get the set of word types for text and hypothesis
self.text_tokens = tokenizer.tokenize(rtepair.text)
:type toktype: 'ne' or 'word'
"""
ne_overlap = set(token for token in self._overlap if self._ne(token))
- if toktype == "ne":
+ if toktype == 'ne':
if debug:
print("ne overlap", ne_overlap)
return ne_overlap
- elif toktype == "word":
+ elif toktype == 'word':
if debug:
print("word overlap", self._overlap - ne_overlap)
return self._overlap - ne_overlap
:type toktype: 'ne' or 'word'
"""
ne_extra = set(token for token in self._hyp_extra if self._ne(token))
- if toktype == "ne":
+ if toktype == 'ne':
return ne_extra
- elif toktype == "word":
+ elif toktype == 'word':
return self._hyp_extra - ne_extra
else:
raise ValueError("Type not recognized: '%s'" % toktype)
def rte_features(rtepair):
extractor = RTEFeatureExtractor(rtepair)
features = {}
- features["alwayson"] = True
- features["word_overlap"] = len(extractor.overlap("word"))
- features["word_hyp_extra"] = len(extractor.hyp_extra("word"))
- features["ne_overlap"] = len(extractor.overlap("ne"))
- features["ne_hyp_extra"] = len(extractor.hyp_extra("ne"))
- features["neg_txt"] = len(extractor.negwords & extractor.text_words)
- features["neg_hyp"] = len(extractor.negwords & extractor.hyp_words)
+ features['alwayson'] = True
+ features['word_overlap'] = len(extractor.overlap('word'))
+ features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
+ features['ne_overlap'] = len(extractor.overlap('ne'))
+ features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
+ features['neg_txt'] = len(extractor.negwords & extractor.text_words)
+ features['neg_hyp'] = len(extractor.negwords & extractor.hyp_words)
return features
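# --- Hedged sketch (illustrative): what rte_features produces for the first
# pair of the RTE1 development set (requires the 'rte' corpus to be installed).
from nltk.corpus import rte as rte_corpus
sample_pair = rte_corpus.pairs(['rte1_dev.xml'])[0]
print(rte_features(sample_pair))
# a dict of counts keyed by 'alwayson', 'word_overlap', 'ne_overlap', 'neg_txt', ...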
def rte_classifier(algorithm):
from nltk.corpus import rte as rte_corpus
- train_set = rte_corpus.pairs(["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"])
- test_set = rte_corpus.pairs(["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"])
+ train_set = rte_corpus.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
+ test_set = rte_corpus.pairs(['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
featurized_train_set = rte_featurize(train_set)
featurized_test_set = rte_featurize(test_set)
# Train the classifier
- print("Training classifier...")
- if algorithm in ["megam", "BFGS"]: # MEGAM based algorithms.
+ print('Training classifier...')
+ if algorithm in ['megam', 'BFGS']: # MEGAM based algorithms.
# Ensure that MEGAM is configured first.
check_megam_config()
clf = lambda x: MaxentClassifier.train(featurized_train_set, algorithm)
- elif algorithm in ["GIS", "IIS"]: # Use default GIS/IIS MaxEnt algorithm
+ elif algorithm in ['GIS', 'IIS']: # Use default GIS/IIS MaxEnt algorithm
clf = MaxentClassifier.train(featurized_train_set, algorithm)
else:
err_msg = str(
"'megam', 'BFGS', 'GIS', 'IIS'.\n"
)
raise Exception(err_msg)
- print("Testing classifier...")
+ print('Testing classifier...')
acc = accuracy(clf, featurized_test_set)
- print("Accuracy: %6.4f" % acc)
+ print('Accuracy: %6.4f' % acc)
return clf
... ('nb', MultinomialNB())])
>>> classif = SklearnClassifier(pipeline)
"""
+from __future__ import print_function, unicode_literals
+
+from six.moves import zip
from nltk.classify.api import ClassifierI
from nltk.probability import DictionaryProbDist
+from nltk import compat
try:
from sklearn.feature_extraction import DictVectorizer
except ImportError:
pass
-__all__ = ["SklearnClassifier"]
+__all__ = ['SklearnClassifier']
+@compat.python_2_unicode_compatible
class SklearnClassifier(ClassifierI):
"""Wrapper for scikit-learn classifiers."""
# encoding: utf-8
# Natural Language Toolkit: Senna Interface
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
Note: Unit tests for this module can be found in test/unit/test_senna.py
+ >>> from __future__ import unicode_literals
>>> from nltk.classify import Senna
>>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
>>> sent = 'Dusseldorf is an international business center'.split()
('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
"""
+
+from __future__ import unicode_literals
from os import path, sep, environ
from subprocess import Popen, PIPE
from platform import architecture, system
+from six import text_type
+
from nltk.tag.api import TaggerI
+from nltk.compat import python_2_unicode_compatible
-_senna_url = "http://ml.nec-labs.com/senna/"
+_senna_url = 'http://ml.nec-labs.com/senna/'
+@python_2_unicode_compatible
class Senna(TaggerI):
- SUPPORTED_OPERATIONS = ["pos", "chk", "ner"]
+ SUPPORTED_OPERATIONS = ['pos', 'chk', 'ner']
- def __init__(self, senna_path, operations, encoding="utf-8"):
+ def __init__(self, senna_path, operations, encoding='utf-8'):
self._encoding = encoding
self._path = path.normpath(senna_path) + sep
exe_file_1 = self.executable(self._path)
if not path.isfile(exe_file_1):
# Check for the system environment
- if "SENNA" in environ:
+ if 'SENNA' in environ:
# self._path = path.join(environ['SENNA'],'')
- self._path = path.normpath(environ["SENNA"]) + sep
+ self._path = path.normpath(environ['SENNA']) + sep
exe_file_2 = self.executable(self._path)
if not path.isfile(exe_file_2):
raise OSError(
be used.
"""
os_name = system()
- if os_name == "Linux":
+ if os_name == 'Linux':
bits = architecture()[0]
- if bits == "64bit":
- return path.join(base_path, "senna-linux64")
- return path.join(base_path, "senna-linux32")
- if os_name == "Windows":
- return path.join(base_path, "senna-win32.exe")
- if os_name == "Darwin":
- return path.join(base_path, "senna-osx")
- return path.join(base_path, "senna")
+ if bits == '64bit':
+ return path.join(base_path, 'senna-linux64')
+ return path.join(base_path, 'senna-linux32')
+ if os_name == 'Windows':
+ return path.join(base_path, 'senna-win32.exe')
+ if os_name == 'Darwin':
+ return path.join(base_path, 'senna-osx')
+ return path.join(base_path, 'senna')
def _map(self):
"""
# Build the senna command to run the tagger
_senna_cmd = [
self.executable(self._path),
- "-path",
+ '-path',
self._path,
- "-usrtokens",
- "-iobtags",
+ '-usrtokens',
+ '-iobtags',
]
- _senna_cmd.extend(["-" + op for op in self.operations])
+ _senna_cmd.extend(['-' + op for op in self.operations])
# Serialize the actual sentences to a temporary string
- _input = "\n".join((" ".join(x) for x in sentences)) + "\n"
- if isinstance(_input, str) and encoding:
+ _input = '\n'.join((' '.join(x) for x in sentences)) + '\n'
+ if isinstance(_input, text_type) and encoding:
_input = _input.encode(encoding)
# Run the tagger and get the output
# Check the return code.
if p.returncode != 0:
- raise RuntimeError("Senna command failed! Details: %s" % stderr)
+ raise RuntimeError('Senna command failed! Details: %s' % stderr)
if encoding:
senna_output = stdout.decode(encoding)
sentence_index += 1
token_index = 0
continue
- tags = tagged_word.split("\t")
+ tags = tagged_word.split('\t')
result = {}
for tag in map_:
result[tag] = tags[map_[tag]].strip()
try:
- result["word"] = sentences[sentence_index][token_index]
+ result['word'] = sentences[sentence_index][token_index]
except IndexError:
raise IndexError(
"Misalignment error occurred at sentence number %d. Possible reason"
from nose import SkipTest
try:
- tagger = Senna("/usr/share/senna-v3.0", ["pos", "chk", "ner"])
+ tagger = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
except OSError:
raise SkipTest("Senna executable not found")
# Natural Language Toolkit: SVM-based classifier
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Leon Derczynski <leon@dcs.shef.ac.uk>
#
# URL: <http://nltk.org/>
# Natural Language Toolkit: Interface to TADM Classifier
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Joseph Frazee <jfrazee@mail.utexas.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
import sys
import subprocess
+from six import string_types
+
from nltk.internals import find_binary
try:
def config_tadm(bin=None):
global _tadm_bin
_tadm_bin = find_binary(
- "tadm", bin, env_vars=["TADM"], binary_names=["tadm"], url="http://tadm.sf.net"
+ 'tadm', bin, env_vars=['TADM'], binary_names=['tadm'], url='http://tadm.sf.net'
)
# http://sf.net/forum/forum.php?thread_id=1675097&forum_id=473054
labels = encoding.labels()
for featureset, label in train_toks:
- length_line = "%d\n" % len(labels)
+ length_line = '%d\n' % len(labels)
stream.write(length_line)
for known_label in labels:
v = encoding.encode(featureset, known_label)
- line = "%d %d %s\n" % (
+ line = '%d %d %s\n' % (
int(label == known_label),
len(v),
- " ".join("%d %d" % u for u in v),
+ ' '.join('%d %d' % u for u in v),
)
stream.write(line)
weights = []
for line in paramfile:
weights.append(float(line.strip()))
- return numpy.array(weights, "d")
+ return numpy.array(weights, 'd')
def call_tadm(args):
"""
Call the ``tadm`` binary with the given arguments.
"""
- if isinstance(args, str):
- raise TypeError("args should be a list of strings")
+ if isinstance(args, string_types):
+ raise TypeError('args should be a list of strings')
if _tadm_bin is None:
config_tadm()
if p.returncode != 0:
print()
print(stderr)
- raise OSError("tadm command failed!")
+ raise OSError('tadm command failed!')
def names_demo():
from nltk.classify.maxent import TadmEventMaxentFeatureEncoding
tokens = [
- ({"f0": 1, "f1": 1, "f3": 1}, "A"),
- ({"f0": 1, "f2": 1, "f4": 1}, "B"),
- ({"f0": 2, "f2": 1, "f3": 1, "f4": 1}, "A"),
+ ({'f0': 1, 'f1': 1, 'f3': 1}, 'A'),
+ ({'f0': 1, 'f2': 1, 'f4': 1}, 'B'),
+ ({'f0': 2, 'f2': 1, 'f3': 1, 'f4': 1}, 'A'),
]
encoding = TadmEventMaxentFeatureEncoding.train(tokens)
write_tadm_file(tokens, encoding, sys.stdout)
print()
for i in range(encoding.length()):
- print("%s --> %d" % (encoding.describe(i), i))
+ print('%s --> %d' % (encoding.describe(i), i))
print()
-if __name__ == "__main__":
+if __name__ == '__main__':
encoding_demo()
names_demo()
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language ID module using TextCat algorithm
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
be identified, then compares using a distance measure.
Language n-grams are provided by the "An Crubadan"
-project. A corpus reader was created separately to read
+project. A corpus reader was created separately to read
those files.
For details regarding the algorithm, see:
http://borel.slu.edu/crubadan/index.html
"""
-from sys import maxsize
+# Ensure that literal strings default to unicode rather than str.
+from __future__ import print_function, unicode_literals
+from nltk.compat import PY3
from nltk.util import trigrams
+if PY3:
+ from sys import maxsize
+else:
+ from sys import maxint
+
# Note: this is NOT "re" you're likely used to. The regex module
# is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
self._corpus.lang_freq(lang)
def remove_punctuation(self, text):
- """ Get rid of punctuation except apostrophes """
+ ''' Get rid of punctuation except apostrophes '''
return re.sub(r"[^\P{P}\']+", "", text)
def profile(self, text):
- """ Create FreqDist of trigrams within text """
+ ''' Create FreqDist of trigrams within text '''
from nltk import word_tokenize, FreqDist
clean_text = self.remove_punctuation(text)
fingerprint = FreqDist()
for t in tokens:
token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
- token_trigrams = ["".join(tri) for tri in token_trigram_tuples]
+ token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
for cur_trigram in token_trigrams:
if cur_trigram in fingerprint:
return fingerprint
def calc_dist(self, lang, trigram, text_profile):
- """ Calculate the "out-of-place" measure between the
- text and language profile for a single trigram """
+ ''' Calculate the "out-of-place" measure between the
+ text and language profile for a single trigram '''
lang_fd = self._corpus.lang_freq(lang)
dist = 0
# Arbitrary but should be larger than
# any possible trigram file length
# in terms of total lines
- dist = maxsize
+ if PY3:
+ dist = maxsize
+ else:
+ dist = maxint
return dist
def lang_dists(self, text):
- """ Calculate the "out-of-place" measure between
- the text and all languages """
+ ''' Calculate the "out-of-place" measure between
+ the text and all languages '''
distances = {}
profile = self.profile(text)
return distances
def guess_language(self, text):
- """ Find the language with the min distance
- to the text and return its ISO 639-3 code """
+ ''' Find the language with the min distance
+ to the text and return its ISO 639-3 code '''
self.last_distances = self.lang_dists(text)
return min(self.last_distances, key=self.last_distances.get)
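# --- Hedged sketch (illustrative): guessing the language of a short snippet
# with the class above.  Requires the third-party 'regex' module plus the
# 'crubadan' and 'punkt' NLTK data packages; 'eng' is the expected ISO 639-3 code.
tc_example = TextCat()
print(tc_example.guess_language('Good morning, how are you today?'))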
from nltk.corpus import udhr
langs = [
- "Kurdish-UTF8",
- "Abkhaz-UTF8",
- "Farsi_Persian-UTF8",
- "Hindi-UTF8",
- "Hawaiian-UTF8",
- "Russian-UTF8",
- "Vietnamese-UTF8",
- "Serbian_Srpski-UTF8",
- "Esperanto-UTF8",
+ 'Kurdish-UTF8',
+ 'Abkhaz-UTF8',
+ 'Farsi_Persian-UTF8',
+ 'Hindi-UTF8',
+ 'Hawaiian-UTF8',
+ 'Russian-UTF8',
+ 'Vietnamese-UTF8',
+ 'Serbian_Srpski-UTF8',
+ 'Esperanto-UTF8',
]
friendly = {
- "kmr": "Northern Kurdish",
- "abk": "Abkhazian",
- "pes": "Iranian Persian",
- "hin": "Hindi",
- "haw": "Hawaiian",
- "rus": "Russian",
- "vie": "Vietnamese",
- "srp": "Serbian",
- "epo": "Esperanto",
+ 'kmr': 'Northern Kurdish',
+ 'abk': 'Abkhazian',
+ 'pes': 'Iranian Persian',
+ 'hin': 'Hindi',
+ 'haw': 'Hawaiian',
+ 'rus': 'Russian',
+ 'vie': 'Vietnamese',
+ 'srp': 'Serbian',
+ 'epo': 'Esperanto',
}
tc = TextCat()
rows = len(raw_sentences) - 1
cols = list(map(len, raw_sentences))
- sample = ""
+ sample = ''
# Generate a sample text of the language
for i in range(0, rows):
- cur_sent = ""
+ cur_sent = ''
for j in range(0, cols[i]):
- cur_sent += " " + raw_sentences[i][j]
+ cur_sent += ' ' + raw_sentences[i][j]
sample += cur_sent
# Try to detect what it is
- print("Language snippet: " + sample[0:140] + "...")
+ print('Language snippet: ' + sample[0:140] + '...')
guess = tc.guess_language(sample)
- print("Language detection: %s (%s)" % (guess, friendly[guess]))
- print("#" * 140)
+ print('Language detection: %s (%s)' % (guess, friendly[guess]))
+ print('#' * 140)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Classifier Utility Functions
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
"""
Utility functions and classes for classifiers.
"""
+from __future__ import print_function, division
import math
def __init__(self, cutoffs):
self.cutoffs = cutoffs.copy()
- if "min_ll" in cutoffs:
- cutoffs["min_ll"] = -abs(cutoffs["min_ll"])
- if "min_lldelta" in cutoffs:
- cutoffs["min_lldelta"] = abs(cutoffs["min_lldelta"])
+ if 'min_ll' in cutoffs:
+ cutoffs['min_ll'] = -abs(cutoffs['min_ll'])
+ if 'min_lldelta' in cutoffs:
+ cutoffs['min_lldelta'] = abs(cutoffs['min_lldelta'])
self.ll = None
self.acc = None
self.iter = 1
def check(self, classifier, train_toks):
cutoffs = self.cutoffs
self.iter += 1
- if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
+ if 'max_iter' in cutoffs and self.iter >= cutoffs['max_iter']:
return True # iteration cutoff.
new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
if math.isnan(new_ll):
return True
- if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
- if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
+ if 'min_ll' in cutoffs or 'min_lldelta' in cutoffs:
+ if 'min_ll' in cutoffs and new_ll >= cutoffs['min_ll']:
return True # log likelihood cutoff
if (
- "min_lldelta" in cutoffs
+ 'min_lldelta' in cutoffs
and self.ll
- and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
+ and ((new_ll - self.ll) <= abs(cutoffs['min_lldelta']))
):
return True # log likelihood delta cutoff
self.ll = new_ll
- if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
+ if 'max_acc' in cutoffs or 'min_accdelta' in cutoffs:
new_acc = nltk.classify.util.accuracy(classifier, train_toks)
- if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
+ if 'max_acc' in cutoffs and new_acc >= cutoffs['max_acc']:
return True # accuracy cutoff
if (
- "min_accdelta" in cutoffs
+ 'min_accdelta' in cutoffs
and self.acc
- and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
+ and ((new_acc - self.acc) <= abs(cutoffs['min_accdelta']))
):
return True # accuracy delta cutoff
self.acc = new_acc
def names_demo_features(name):
features = {}
- features["alwayson"] = True
- features["startswith"] = name[0].lower()
- features["endswith"] = name[-1].lower()
- for letter in "abcdefghijklmnopqrstuvwxyz":
- features["count(%s)" % letter] = name.lower().count(letter)
- features["has(%s)" % letter] = letter in name.lower()
+ features['alwayson'] = True
+ features['startswith'] = name[0].lower()
+ features['endswith'] = name[-1].lower()
+ for letter in 'abcdefghijklmnopqrstuvwxyz':
+ features['count(%s)' % letter] = name.lower().count(letter)
+ features['has(%s)' % letter] = letter in name.lower()
return features
def binary_names_demo_features(name):
features = {}
- features["alwayson"] = True
- features["startswith(vowel)"] = name[0].lower() in "aeiouy"
- features["endswith(vowel)"] = name[-1].lower() in "aeiouy"
- for letter in "abcdefghijklmnopqrstuvwxyz":
- features["count(%s)" % letter] = name.lower().count(letter)
- features["has(%s)" % letter] = letter in name.lower()
- features["startswith(%s)" % letter] = letter == name[0].lower()
- features["endswith(%s)" % letter] = letter == name[-1].lower()
+ features['alwayson'] = True
+ features['startswith(vowel)'] = name[0].lower() in 'aeiouy'
+ features['endswith(vowel)'] = name[-1].lower() in 'aeiouy'
+ for letter in 'abcdefghijklmnopqrstuvwxyz':
+ features['count(%s)' % letter] = name.lower().count(letter)
+ features['has(%s)' % letter] = letter in name.lower()
+ features['startswith(%s)' % letter] = letter == name[0].lower()
+ features['endswith(%s)' % letter] = letter == name[-1].lower()
return features
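# --- Hedged sketch (illustrative): what the two extractors above return for a
# single name.  Only a few of the 26 per-letter entries are spelled out here.
sample_feats = names_demo_features('Alice')
print(sample_feats['startswith'], sample_feats['endswith'], sample_feats['count(a)'])   # a e 1
sample_bin = binary_names_demo_features('Alice')
print(sample_bin['startswith(vowel)'], sample_bin['endswith(vowel)'])   # True True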
import random
# Construct a list of classified names, using the names corpus.
- namelist = [(name, "male") for name in names.words("male.txt")] + [
- (name, "female") for name in names.words("female.txt")
+ namelist = [(name, 'male') for name in names.words('male.txt')] + [
+ (name, 'female') for name in names.words('female.txt')
]
# Randomly split the names into a test & train set.
test = namelist[5000:5500]
# Train up a classifier.
- print("Training classifier...")
+ print('Training classifier...')
classifier = trainer([(features(n), g) for (n, g) in train])
# Run the classifier on the test data.
- print("Testing classifier...")
+ print('Testing classifier...')
acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
- print("Accuracy: %6.4f" % acc)
+ print('Accuracy: %6.4f' % acc)
# For classifiers that can find probabilities, show the log
# likelihood and some sample probability distributions.
test_featuresets = [features(n) for (n, g) in test]
pdists = classifier.prob_classify_many(test_featuresets)
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
- print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
+ print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
print()
- print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
+ print('Unseen Names P(Male) P(Female)\n' + '-' * 40)
for ((name, gender), pdist) in list(zip(test, pdists))[:5]:
- if gender == "male":
- fmt = " %-15s *%6.4f %6.4f"
+ if gender == 'male':
+ fmt = ' %-15s *%6.4f %6.4f'
else:
- fmt = " %-15s %6.4f *%6.4f"
- print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
+ fmt = ' %-15s %6.4f *%6.4f'
+ print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
except NotImplementedError:
pass
from nltk.corpus import names
import random
- male_names = names.words("male.txt")
- female_names = names.words("female.txt")
+ male_names = names.words('male.txt')
+ female_names = names.words('female.txt')
random.seed(654321)
random.shuffle(male_names)
random.shuffle(test)
# Train up a classifier.
- print("Training classifier...")
+ print('Training classifier...')
classifier = trainer(positive, unlabeled)
# Run the classifier on the test data.
- print("Testing classifier...")
+ print('Testing classifier...')
acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
- print("Accuracy: %6.4f" % acc)
+ print('Accuracy: %6.4f' % acc)
# For classifiers that can find probabilities, show the log
# likelihood and some sample probability distributions.
test_featuresets = [features(n) for (n, m) in test]
pdists = classifier.prob_classify_many(test_featuresets)
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
- print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
+ print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
print()
- print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
+ print('Unseen Names P(Male) P(Female)\n' + '-' * 40)
for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
if is_male == True:
- fmt = " %-15s *%6.4f %6.4f"
+ fmt = ' %-15s *%6.4f %6.4f'
else:
- fmt = " %-15s %6.4f *%6.4f"
+ fmt = ' %-15s %6.4f *%6.4f'
print(fmt % (name, pdist.prob(True), pdist.prob(False)))
except NotImplementedError:
pass
import random
# Get the instances.
- print("Reading data...")
+ print('Reading data...')
global _inst_cache
if word not in _inst_cache:
_inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
if n > len(instances):
n = len(instances)
senses = list(set(l for (i, l) in instances))
- print(" Senses: " + " ".join(senses))
+ print(' Senses: ' + ' '.join(senses))
# Randomly split the names into a test & train set.
- print("Splitting into test & train...")
+ print('Splitting into test & train...')
random.seed(123456)
random.shuffle(instances)
train = instances[: int(0.8 * n)]
test = instances[int(0.8 * n) : n]
# Train up a classifier.
- print("Training classifier...")
+ print('Training classifier...')
classifier = trainer([(features(i), l) for (i, l) in train])
# Run the classifier on the test data.
- print("Testing classifier...")
+ print('Testing classifier...')
acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
- print("Accuracy: %6.4f" % acc)
+ print('Accuracy: %6.4f' % acc)
# For classifiers that can find probabilities, show the log
# likelihood and some sample probability distributions.
test_featuresets = [features(i) for (i, n) in test]
pdists = classifier.prob_classify_many(test_featuresets)
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
- print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
+ print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
except NotImplementedError:
pass
# Natural Language Toolkit: Interface to Weka Classsifiers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Classifiers that make use of the external 'Weka' package.
"""
-
+from __future__ import print_function
import time
import tempfile
import os
import zipfile
from sys import stdin
+from six import integer_types, string_types
+
from nltk.probability import DictionaryProbDist
from nltk.internals import java, config_java
_weka_classpath = None
_weka_search = [
- ".",
- "/usr/share/weka",
- "/usr/local/share/weka",
- "/usr/lib/weka",
- "/usr/local/lib/weka",
+ '.',
+ '/usr/share/weka',
+ '/usr/local/share/weka',
+ '/usr/lib/weka',
+ '/usr/local/lib/weka',
]
if _weka_classpath is None:
searchpath = _weka_search
- if "WEKAHOME" in os.environ:
- searchpath.insert(0, os.environ["WEKAHOME"])
+ if 'WEKAHOME' in os.environ:
+ searchpath.insert(0, os.environ['WEKAHOME'])
for path in searchpath:
- if os.path.exists(os.path.join(path, "weka.jar")):
- _weka_classpath = os.path.join(path, "weka.jar")
+ if os.path.exists(os.path.join(path, 'weka.jar')):
+ _weka_classpath = os.path.join(path, 'weka.jar')
version = _check_weka_version(_weka_classpath)
if version:
print(
- ("[Found Weka: %s (version %s)]" % (_weka_classpath, version))
+ ('[Found Weka: %s (version %s)]' % (_weka_classpath, version))
)
else:
- print("[Found Weka: %s]" % _weka_classpath)
+ print('[Found Weka: %s]' % _weka_classpath)
_check_weka_version(_weka_classpath)
if _weka_classpath is None:
raise LookupError(
- "Unable to find weka.jar! Use config_weka() "
- "or set the WEKAHOME environment variable. "
- "For more information about Weka, please see "
- "http://www.cs.waikato.ac.nz/ml/weka/"
+ 'Unable to find weka.jar! Use config_weka() '
+ 'or set the WEKAHOME environment variable. '
+ 'For more information about Weka, please see '
+ 'http://www.cs.waikato.ac.nz/ml/weka/'
)
return None
try:
try:
- return zf.read("weka/core/version.txt")
+ return zf.read('weka/core/version.txt')
except KeyError:
return None
finally:
self._model = model_filename
def prob_classify_many(self, featuresets):
- return self._classify_many(featuresets, ["-p", "0", "-distribution"])
+ return self._classify_many(featuresets, ['-p', '0', '-distribution'])
def classify_many(self, featuresets):
- return self._classify_many(featuresets, ["-p", "0"])
+ return self._classify_many(featuresets, ['-p', '0'])
def _classify_many(self, featuresets, options):
# Make sure we can find java & weka.
temp_dir = tempfile.mkdtemp()
try:
# Write the test data file.
- test_filename = os.path.join(temp_dir, "test.arff")
+ test_filename = os.path.join(temp_dir, 'test.arff')
self._formatter.write(test_filename, featuresets)
# Call weka to classify the data.
cmd = [
- "weka.classifiers.bayes.NaiveBayes",
- "-l",
+ 'weka.classifiers.bayes.NaiveBayes',
+ '-l',
self._model,
- "-T",
+ '-T',
test_filename,
] + options
(stdout, stderr) = java(
# Check if something went wrong:
if stderr and not stdout:
- if "Illegal options: -distribution" in stderr:
+ if 'Illegal options: -distribution' in stderr:
raise ValueError(
- "The installed version of weka does "
- "not support probability distribution "
- "output."
+ 'The installed version of weka does '
+ 'not support probability distribution '
+ 'output.'
)
else:
- raise ValueError("Weka failed to generate output:\n%s" % stderr)
+ raise ValueError('Weka failed to generate output:\n%s' % stderr)
# Parse weka's output.
- return self.parse_weka_output(stdout.decode(stdin.encoding).split("\n"))
+ return self.parse_weka_output(stdout.decode(stdin.encoding).split('\n'))
finally:
for f in os.listdir(temp_dir):
os.rmdir(temp_dir)
def parse_weka_distribution(self, s):
- probs = [float(v) for v in re.split("[*,]+", s) if v.strip()]
+ probs = [float(v) for v in re.split('[*,]+', s) if v.strip()]
probs = dict(zip(self._formatter.labels(), probs))
return DictionaryProbDist(probs)
lines = lines[i:]
break
- if lines[0].split() == ["inst#", "actual", "predicted", "error", "prediction"]:
- return [line.split()[2].split(":")[1] for line in lines[1:] if line.strip()]
+ if lines[0].split() == ['inst#', 'actual', 'predicted', 'error', 'prediction']:
+ return [line.split()[2].split(':')[1] for line in lines[1:] if line.strip()]
elif lines[0].split() == [
- "inst#",
- "actual",
- "predicted",
- "error",
- "distribution",
+ 'inst#',
+ 'actual',
+ 'predicted',
+ 'error',
+ 'distribution',
]:
return [
self.parse_weka_distribution(line.split()[-1])
]
# is this safe?
- elif re.match(r"^0 \w+ [01]\.[0-9]* \?\s*$", lines[0]):
+ elif re.match(r'^0 \w+ [01]\.[0-9]* \?\s*$', lines[0]):
return [line.split()[1] for line in lines if line.strip()]
else:
for line in lines[:10]:
print(line)
raise ValueError(
- "Unhandled output format -- your version "
- "of weka may not be supported.\n"
- " Header: %s" % lines[0]
+ 'Unhandled output format -- your version '
+ 'of weka may not be supported.\n'
+ ' Header: %s' % lines[0]
)
# [xx] full list of classifiers (some may be abstract?):
# VotedPerceptron, Winnow, ZeroR
_CLASSIFIER_CLASS = {
- "naivebayes": "weka.classifiers.bayes.NaiveBayes",
- "C4.5": "weka.classifiers.trees.J48",
- "log_regression": "weka.classifiers.functions.Logistic",
- "svm": "weka.classifiers.functions.SMO",
- "kstar": "weka.classifiers.lazy.KStar",
- "ripper": "weka.classifiers.rules.JRip",
+ 'naivebayes': 'weka.classifiers.bayes.NaiveBayes',
+ 'C4.5': 'weka.classifiers.trees.J48',
+ 'log_regression': 'weka.classifiers.functions.Logistic',
+ 'svm': 'weka.classifiers.functions.SMO',
+ 'kstar': 'weka.classifiers.lazy.KStar',
+ 'ripper': 'weka.classifiers.rules.JRip',
}
@classmethod
cls,
model_filename,
featuresets,
- classifier="naivebayes",
+ classifier='naivebayes',
options=[],
quiet=True,
):
temp_dir = tempfile.mkdtemp()
try:
# Write the training data file.
- train_filename = os.path.join(temp_dir, "train.arff")
+ train_filename = os.path.join(temp_dir, 'train.arff')
formatter.write(train_filename, featuresets)
if classifier in cls._CLASSIFIER_CLASS:
elif classifier in cls._CLASSIFIER_CLASS.values():
javaclass = classifier
else:
- raise ValueError("Unknown classifier %s" % classifier)
+ raise ValueError('Unknown classifier %s' % classifier)
# Train the weka model.
- cmd = [javaclass, "-d", model_filename, "-t", train_filename]
+ cmd = [javaclass, '-d', model_filename, '-t', train_filename]
cmd += list(options)
if quiet:
stdout = subprocess.PIPE
def write(self, outfile, tokens):
"""Writes ARFF data to a file for the given data."""
- if not hasattr(outfile, "write"):
- outfile = open(outfile, "w")
+ if not hasattr(outfile, 'write'):
+ outfile = open(outfile, 'w')
outfile.write(self.format(tokens))
outfile.close()
for tok, label in tokens:
for (fname, fval) in tok.items():
if issubclass(type(fval), bool):
- ftype = "{True, False}"
- elif issubclass(type(fval), (int, float, bool)):
- ftype = "NUMERIC"
- elif issubclass(type(fval), str):
- ftype = "STRING"
+ ftype = '{True, False}'
+ elif issubclass(type(fval), (integer_types, float, bool)):
+ ftype = 'NUMERIC'
+ elif issubclass(type(fval), string_types):
+ ftype = 'STRING'
elif fval is None:
continue # can't tell the type.
else:
- raise ValueError("Unsupported value type %r" % ftype)
+ raise ValueError('Unsupported value type %r' % type(fval))
if features.get(fname, ftype) != ftype:
- raise ValueError("Inconsistent type for %s" % fname)
+ raise ValueError('Inconsistent type for %s' % fname)
features[fname] = ftype
features = sorted(features.items())
"""Returns an ARFF header as a string."""
# Header comment.
s = (
- "% Weka ARFF file\n"
- + "% Generated automatically by NLTK\n"
- + "%% %s\n\n" % time.ctime()
+ '% Weka ARFF file\n'
+ + '% Generated automatically by NLTK\n'
+ + '%% %s\n\n' % time.ctime()
)
# Relation name
- s += "@RELATION rel\n\n"
+ s += '@RELATION rel\n\n'
# Input attribute specifications
for fname, ftype in self._features:
- s += "@ATTRIBUTE %-30r %s\n" % (fname, ftype)
+ s += '@ATTRIBUTE %-30r %s\n' % (fname, ftype)
# Label attribute specification
- s += "@ATTRIBUTE %-30r {%s}\n" % ("-label-", ",".join(self._labels))
+ s += '@ATTRIBUTE %-30r {%s}\n' % ('-label-', ','.join(self._labels))
return s
tokens = [(tok, None) for tok in tokens]
# Data section
- s = "\n@DATA\n"
+ s = '\n@DATA\n'
for (tok, label) in tokens:
for fname, ftype in self._features:
- s += "%s," % self._fmt_arff_val(tok.get(fname))
- s += "%s\n" % self._fmt_arff_val(label)
+ s += '%s,' % self._fmt_arff_val(tok.get(fname))
+ s += '%s\n' % self._fmt_arff_val(label)
return s
def _fmt_arff_val(self, fval):
if fval is None:
- return "?"
- elif isinstance(fval, (bool, int)):
- return "%s" % fval
+ return '?'
+ elif isinstance(fval, (bool, integer_types)):
+ return '%s' % fval
elif isinstance(fval, float):
- return "%r" % fval
+ return '%r' % fval
else:
- return "%r" % fval
+ return '%r' % fval
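Putting the header and data sections together, the formatter emits plain ARFF text. For a toy featureset with one boolean and one numeric feature and the labels pos/neg (all feature names and values invented for illustration), the output would look roughly like:

    % Weka ARFF file
    % Generated automatically by NLTK
    % Tue Jan  1 00:00:00 2019

    @RELATION rel

    @ATTRIBUTE 'initial_vowel'              {True, False}
    @ATTRIBUTE 'word_len'                   NUMERIC
    @ATTRIBUTE '-label-'                    {pos,neg}

    @DATA
    True,4,'pos'
    False,7,'neg'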
-if __name__ == "__main__":
+if __name__ == '__main__':
from nltk.classify.util import names_demo, binary_names_demo_features
def make_classifier(featuresets):
- return WekaClassifier.train("/tmp/name.model", featuresets, "C4.5")
+ return WekaClassifier.train('/tmp/name.model', featuresets, 'C4.5')
classifier = names_demo(make_classifier, binary_names_demo_features)
+++ /dev/null
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: NLTK Command-Line Interface
-#
-# Copyright (C) 2001-2020 NLTK Project
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-
-from functools import partial
-from itertools import chain
-from tqdm import tqdm
-
-import click
-
-from nltk import word_tokenize
-from nltk.util import parallelize_preprocess
-
-CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
-
-
-@click.group(context_settings=CONTEXT_SETTINGS)
-@click.version_option()
-def cli():
- pass
-
-
-@cli.command("tokenize")
-@click.option(
- "--language",
- "-l",
- default="en",
- help="The language for the Punkt sentence tokenization.",
-)
-@click.option(
- "--preserve-line",
- "-l",
- default=True,
- is_flag=True,
-    help="An option to keep each input line as a single sentence and not sentence-tokenize it.",
-)
-@click.option("--processes", "-j", default=1, help="No. of processes.")
-@click.option("--encoding", "-e", default="utf8", help="Specify encoding of file.")
-@click.option(
- "--delimiter", "-d", default=" ", help="Specify delimiter to join the tokens."
-)
-def tokenize_file(language, preserve_line, processes, encoding, delimiter):
- """ This command tokenizes text stream using nltk.word_tokenize """
- with click.get_text_stream("stdin", encoding=encoding) as fin:
- with click.get_text_stream("stdout", encoding=encoding) as fout:
-            # If it's a single process, joblib parallelization is slower,
- # so just process line by line normally.
- if processes == 1:
- for line in tqdm(fin.readlines()):
- print(delimiter.join(word_tokenize(line)), end="\n", file=fout)
- else:
- for outline in parallelize_preprocess(
- word_tokenize, fin.readlines(), processes, progress_bar=True
- ):
- print(delimiter.join(outline), end="\n", file=fout)
# Natural Language Toolkit: Clusterers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# Natural Language Toolkit: Clusterer Interfaces
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Porting: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
from nltk.probability import DictionaryProbDist
-class ClusterI(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class ClusterI(object):
"""
Interface covering basic clustering functionality.
"""
# Natural Language Toolkit: Expectation Maximization Clusterer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
try:
import numpy
except ImportError:
pass
+from nltk.compat import python_2_unicode_compatible
from nltk.cluster.util import VectorSpaceClusterer
+@python_2_unicode_compatible
class EMClusterer(VectorSpaceClusterer):
"""
The Gaussian EM clusterer models the vectors as being produced by
while not converged:
if trace:
- print("iteration; loglikelihood", lastl)
+ print('iteration; loglikelihood', lastl)
# E-step, calculate hidden variables, h[i,j]
h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64)
for i in range(len(vectors)):
def _gaussian(self, mean, cvm, x):
m = len(mean)
- assert cvm.shape == (m, m), "bad sized covariance matrix, %s" % str(cvm.shape)
+ assert cvm.shape == (m, m), 'bad sized covariance matrix, %s' % str(cvm.shape)
try:
det = numpy.linalg.det(cvm)
inv = numpy.linalg.inv(cvm)
return llh
def __repr__(self):
- return "<EMClusterer means=%s>" % list(self._means)
+ return '<EMClusterer means=%s>' % list(self._means)
def demo():
clusterer = cluster.EMClusterer(means, bias=0.1)
clusters = clusterer.cluster(vectors, True, trace=True)
- print("Clustered:", vectors)
- print("As: ", clusters)
+ print('Clustered:', vectors)
+ print('As: ', clusters)
print()
for c in range(2):
- print("Cluster:", c)
- print("Prior: ", clusterer._priors[c])
- print("Mean: ", clusterer._means[c])
- print("Covar: ", clusterer._covariance_matrices[c])
+ print('Cluster:', c)
+ print('Prior: ', clusterer._priors[c])
+ print('Mean: ', clusterer._means[c])
+ print('Covar: ', clusterer._covariance_matrices[c])
print()
# classify a new vector
vector = numpy.array([2, 2])
- print("classify(%s):" % vector, end=" ")
+ print('classify(%s):' % vector, end=' ')
print(clusterer.classify(vector))
# show the classification probabilities
vector = numpy.array([2, 2])
- print("classification_probdist(%s):" % vector)
+ print('classification_probdist(%s):' % vector)
pdist = clusterer.classification_probdist(vector)
for sample in pdist.samples():
- print("%s => %.0f%%" % (sample, pdist.prob(sample) * 100))
+ print('%s => %.0f%%' % (sample, pdist.prob(sample) * 100))
-if __name__ == "__main__":
+
+#
+# The following demo code is broken.
+#
+# # use a set of tokens with 2D indices
+# vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
+
+# # test the EM clusterer with means given by k-means (2) and
+# # dimensionality reduction
+# clusterer = cluster.KMeans(2, euclidean_distance, svd_dimensions=1)
+# print 'Clusterer:', clusterer
+# clusters = clusterer.cluster(vectors)
+# means = clusterer.means()
+# print 'Means:', clusterer.means()
+# print
+
+# clusterer = cluster.EMClusterer(means, svd_dimensions=1)
+# clusters = clusterer.cluster(vectors, True)
+# print 'Clusterer:', clusterer
+# print 'Clustered:', str(vectors)[:60], '...'
+# print 'As:', str(clusters)[:60], '...'
+# print
+
+# # classify a new vector
+# vector = numpy.array([3, 3])
+# print 'classify(%s):' % vector,
+# print clusterer.classify(vector)
+# print
+
+# # show the classification probabilities
+# vector = numpy.array([2.2, 2])
+# print 'classification_probdist(%s)' % vector
+# pdist = clusterer.classification_probdist(vector)
+# for sample in pdist:
+# print '%s => %.0f%%' % (sample, pdist.prob(sample) *100)
+
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Group Average Agglomerative Clusterer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
try:
import numpy
pass
from nltk.cluster.util import VectorSpaceClusterer, Dendrogram, cosine_distance
+from nltk.compat import python_2_unicode_compatible
+@python_2_unicode_compatible
class GAAClusterer(VectorSpaceClusterer):
"""
The Group Average Agglomerative starts with each of the N vectors as singleton
return self._num_clusters
def __repr__(self):
- return "<GroupAverageAgglomerative Clusterer n=%d>" % self._num_clusters
+ return '<GroupAverageAgglomerative Clusterer n=%d>' % self._num_clusters
def demo():
clusterer = GAAClusterer(4)
clusters = clusterer.cluster(vectors, True)
- print("Clusterer:", clusterer)
- print("Clustered:", vectors)
- print("As:", clusters)
+ print('Clusterer:', clusterer)
+ print('Clustered:', vectors)
+ print('As:', clusters)
print()
# show the dendrogram
# classify a new vector
vector = numpy.array([3, 3])
- print("classify(%s):" % vector, end=" ")
+ print('classify(%s):' % vector, end=' ')
print(clusterer.classify(vector))
print()
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: K-Means Clusterer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
import copy
import random
from nltk.cluster.util import VectorSpaceClusterer
+from nltk.compat import python_2_unicode_compatible
+@python_2_unicode_compatible
class KMeansClusterer(VectorSpaceClusterer):
"""
    The K-means clusterer starts with k arbitrarily chosen means then allocates
def cluster_vectorspace(self, vectors, trace=False):
if self._means and self._repeats > 1:
- print("Warning: means will be discarded for subsequent trials")
+ print('Warning: means will be discarded for subsequent trials')
meanss = []
for trial in range(self._repeats):
if trace:
- print("k-means trial", trial)
+ print('k-means trial', trial)
if not self._means or trial > 1:
self._means = self._rng.sample(list(vectors), self._num_means)
self._cluster_vectorspace(vectors, trace)
clusters[index].append(vector)
if trace:
- print("iteration")
+ print('iteration')
# for i in range(self._num_means):
# print ' mean', i, 'allocated', len(clusters[i]), 'vectors'
return centroid / (1 + len(cluster))
else:
if not len(cluster):
- sys.stderr.write("Error: no centroid defined for empty cluster.\n")
+ sys.stderr.write('Error: no centroid defined for empty cluster.\n')
sys.stderr.write(
- "Try setting argument 'avoid_empty_clusters' to True\n"
+ 'Try setting argument \'avoid_empty_clusters\' to True\n'
)
assert False
centroid = copy.copy(cluster[0])
return centroid / len(cluster)
def __repr__(self):
- return "<KMeansClusterer means=%s repeats=%d>" % (self._means, self._repeats)
+ return '<KMeansClusterer means=%s repeats=%d>' % (self._means, self._repeats)
#################################################################################
clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
clusters = clusterer.cluster(vectors, True, trace=True)
- print("Clustered:", vectors)
- print("As:", clusters)
- print("Means:", clusterer.means())
+ print('Clustered:', vectors)
+ print('As:', clusters)
+ print('Means:', clusterer.means())
print()
vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
clusters = clusterer.cluster(vectors, True)
- print("Clustered:", vectors)
- print("As:", clusters)
- print("Means:", clusterer.means())
+ print('Clustered:', vectors)
+ print('As:', clusters)
+ print('Means:', clusterer.means())
print()
# classify a new vector
vector = numpy.array([3, 3])
- print("classify(%s):" % vector, end=" ")
+ print('classify(%s):' % vector, end=' ')
print(clusterer.classify(vector))
print()
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Clusterer Utilities
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Contributor: J Richard Snape
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
from abc import abstractmethod
import copy
pass
from nltk.cluster.api import ClusterI
+from nltk.compat import python_2_unicode_compatible
class VectorSpaceClusterer(ClusterI):
return cosine_distance(self._value, comparator._value) < 0
+@python_2_unicode_compatible
class Dendrogram(object):
"""
Represents a dendrogram, a tree with a specified branching order. This
"""
# ASCII rendering characters
- JOIN, HLINK, VLINK = "+", "-", "|"
+ JOIN, HLINK, VLINK = '+', '-', '|'
# find the root (or create one)
if len(self._items) > 1:
rhalf = int(width - lhalf - 1)
# display functions
- def format(centre, left=" ", right=" "):
- return "%s%s%s" % (lhalf * left, centre, right * rhalf)
+ def format(centre, left=' ', right=' '):
+ return '%s%s%s' % (lhalf * left, centre, right * rhalf)
def display(str):
stdout.write(str)
# for each merge, top down
queue = [(root._value, root)]
- verticals = [format(" ") for leaf in leaves]
+ verticals = [format(' ') for leaf in leaves]
while queue:
priority, node = queue.pop()
child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children))
for i in range(len(leaves)):
if leaves[i] in child_left_leaf:
if i == min_idx:
- display(format(JOIN, " ", HLINK))
+ display(format(JOIN, ' ', HLINK))
elif i == max_idx:
- display(format(JOIN, HLINK, " "))
+ display(format(JOIN, HLINK, ' '))
else:
display(format(JOIN, HLINK, HLINK))
verticals[i] = format(VLINK)
display(format(HLINK, HLINK, HLINK))
else:
display(verticals[i])
- display("\n")
+ display('\n')
for child in node._children:
if child._children:
queue.append((child._value, child))
for vertical in verticals:
display(vertical)
- display("\n")
+ display('\n')
# finally, display the last line
- display("".join(item.center(width) for item in last_row))
- display("\n")
+ display(''.join(item.center(width) for item in last_row))
+ display('\n')
def __repr__(self):
if len(self._items) > 1:
else:
root = self._items[0]
leaves = root.leaves(False)
- return "<Dendrogram with %d leaves>" % len(leaves)
+ return '<Dendrogram with %d leaves>' % len(leaves)
# Natural Language Toolkit: Collections
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, absolute_import
import bisect
from itertools import islice, chain
from functools import total_ordering
-
# this unused import is for python 2.7
from collections import defaultdict, deque, Counter
+from six import text_type
+
from nltk.internals import slice_bounds, raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible
##########################################################################
class OrderedDict(dict):
def __init__(self, data=None, **kwargs):
- self._keys = self.keys(data, kwargs.get("keys"))
- self._default_factory = kwargs.get("default_factory")
+ self._keys = self.keys(data, kwargs.get('keys'))
+ self._default_factory = kwargs.get('default_factory')
if data is None:
dict.__init__(self)
else:
return data.keys()
elif isinstance(data, list):
return [key for (key, value) in data]
- elif "_keys" in self.__dict__:
+ elif '_keys' in self.__dict__:
return self._keys
else:
return []
@total_ordering
+@python_2_unicode_compatible
class AbstractLazySequence(object):
"""
An abstract base class for read-only sequences whose values are
Return the number of tokens in the corpus file underlying this
corpus view.
"""
- raise NotImplementedError("should be implemented by subclass")
+ raise NotImplementedError('should be implemented by subclass')
def iterate_from(self, start):
"""
``start``. If ``start>=len(self)``, then this iterator will
generate no tokens.
"""
- raise NotImplementedError("should be implemented by subclass")
+ raise NotImplementedError('should be implemented by subclass')
def __getitem__(self, i):
"""
if i < 0:
i += len(self)
if i < 0:
- raise IndexError("index out of range")
+ raise IndexError('index out of range')
# Use iterate_from to extract it.
try:
return next(self.iterate_from(i))
except StopIteration:
- raise IndexError("index out of range")
+ raise IndexError('index out of range')
def __iter__(self):
"""Return an iterator that generates the tokens in the corpus
for i, elt in enumerate(islice(self, start, stop)):
if elt == value:
return i + start
- raise ValueError("index(x): x not in list")
+ raise ValueError('index(x): x not in list')
def __contains__(self, value):
"""Return true if this list contains ``value``."""
pieces.append(repr(elt))
length += len(pieces[-1]) + 2
if length > self._MAX_REPR_SIZE and len(pieces) > 2:
- return "[%s, ...]" % ", ".join(pieces[:-1])
- return "[%s]" % ", ".join(pieces)
+ return '[%s, ...]' % text_type(', ').join(pieces[:-1])
+ return '[%s]' % text_type(', ').join(pieces)
def __eq__(self, other):
return type(self) == type(other) and list(self) == list(other)
"""
:raise ValueError: Corpus view objects are unhashable.
"""
- raise ValueError("%s objects are unhashable" % self.__class__.__name__)
+ raise ValueError('%s objects are unhashable' % self.__class__.__name__)
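A minimal sketch of a concrete subclass, assuming only the two abstract methods above (`__len__` and `iterate_from`) are supplied; indexing, iteration, `index()` and containment then come from the base class:

    from nltk.collections import AbstractLazySequence

    class ListBackedSequence(AbstractLazySequence):
        """Toy subclass backed by an in-memory list (illustration only)."""

        def __init__(self, data):
            self._data = list(data)

        def __len__(self):
            return len(self._data)

        def iterate_from(self, start):
            return iter(self._data[start:])

    seq = ListBackedSequence(['a', 'b', 'c'])
    print(seq[1], list(seq), 'c' in seq)  # b ['a', 'b', 'c'] True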
class LazySubsequence(AbstractLazySequence):
if sublist_index == (len(self._offsets) - 1):
assert (
index + len(sublist) >= self._offsets[-1]
-            ), "offsets not monotonically increasing!"
+            ), 'offsets not monotonically increasing!'
self._offsets.append(index + len(sublist))
else:
assert self._offsets[sublist_index + 1] == index + len(
sublist
- ), "inconsistent list value (num elts)"
+ ), 'inconsistent list value (num elts)'
for value in sublist[max(0, start_index - index) :]:
yield value
by this lazy map. (default=5)
"""
if not lists:
- raise TypeError("LazyMap requires at least two args")
+ raise TypeError('LazyMap requires at least two args')
self._lists = lists
self._func = function
- self._cache_size = config.get("cache_size", 5)
+ self._cache_size = config.get('cache_size', 5)
self._cache = {} if self._cache_size > 0 else None
# If you just take bool() of sum() here _all_lazy will be true just
if index < 0:
index += len(self)
if index < 0:
- raise IndexError("index out of range")
+ raise IndexError('index out of range')
# Check the cache
if self._cache is not None and index in self._cache:
return self._cache[index]
try:
val = next(self.iterate_from(index))
except StopIteration:
- raise IndexError("index out of range")
+ raise IndexError('index out of range')
# Update the cache
if self._cache is not None:
if len(self._cache) > self._cache_size:
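As a quick usage sketch of the laziness and caching described above (values are computed on first access and up to `cache_size` results are memoized):

    from nltk.collections import LazyMap

    squares = LazyMap(lambda x: x * x, [0, 1, 2, 3, 4])
    print(squares[3])      # 9, computed on demand
    print(list(squares))   # [0, 1, 4, 9, 16]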
# Natural Language Toolkit: Collocations and Association Measures
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
ngram given appropriate frequency counts. A number of standard association
measures are provided in bigram_measures and trigram_measures.
"""
+from __future__ import print_function
# Possible TODOs:
# - consider the distinction between f(x,_) and f(x) and whether our
# and unigram counts (raw_freq, pmi, student_t)
import itertools as _itertools
+from six import iteritems
from nltk.probability import FreqDist
from nltk.util import ngrams
-
# these two unused imports are referenced in collocations.doctest
-from nltk.metrics import (
- ContingencyMeasures,
- BigramAssocMeasures,
- TrigramAssocMeasures,
- QuadgramAssocMeasures,
-)
+from nltk.metrics import ContingencyMeasures, BigramAssocMeasures, TrigramAssocMeasures
from nltk.metrics.spearman import ranks_from_scores, spearman_correlation
def _build_new_documents(
cls, documents, window_size, pad_left=False, pad_right=False, pad_symbol=None
):
- """
+ '''
Pad the document with the place holder according to the window_size
- """
+ '''
padding = (pad_symbol,) * (window_size - 1)
if pad_right:
return _itertools.chain.from_iterable(
if the function returns True when passed an ngram tuple.
"""
tmp_ngram = FreqDist()
- for ngram, freq in self.ngram_fd.items():
+ for ngram, freq in iteritems(self.ngram_fd):
if not fn(ngram, freq):
tmp_ngram[ngram] = freq
self.ngram_fd = tmp_ngram
from nltk.corpus import stopwords, webtext
- ignored_words = stopwords.words("english")
+ ignored_words = stopwords.words('english')
word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words
for file in webtext.fileids():
ranks_from_scores(cf.score_ngrams(compare_scorer)),
)
print(file)
- print("\t", [" ".join(tup) for tup in cf.nbest(scorer, 15)])
- print("\t Correlation to %s: %0.4f" % (compare_scorer.__name__, corr))
+ print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
+ print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__, corr))
# Slows down loading too much
# bigram_measures = BigramAssocMeasures()
# trigram_measures = TrigramAssocMeasures()
-if __name__ == "__main__":
+if __name__ == '__main__':
import sys
from nltk.metrics import BigramAssocMeasures
try:
- scorer = eval("BigramAssocMeasures." + sys.argv[1])
+ scorer = eval('BigramAssocMeasures.' + sys.argv[1])
except IndexError:
scorer = None
try:
- compare_scorer = eval("BigramAssocMeasures." + sys.argv[2])
+ compare_scorer = eval('BigramAssocMeasures.' + sys.argv[2])
except IndexError:
compare_scorer = None
demo(scorer, compare_scorer)
__all__ = [
- "BigramCollocationFinder",
- "TrigramCollocationFinder",
- "QuadgramCollocationFinder",
+ 'BigramCollocationFinder',
+ 'TrigramCollocationFinder',
+ 'QuadgramCollocationFinder',
]
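A small end-to-end sketch of the finder API exercised by the demo above (the token list is made up for illustration):

    from nltk.collocations import BigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures

    tokens = 'the quick brown fox jumps over the lazy dog'.split()
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_word_filter(lambda w: len(w) < 3)   # drop short words, as in the demo
    print(finder.nbest(BigramAssocMeasures.pmi, 3))  # three top-scoring bigrams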
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Compatibility
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import absolute_import, print_function
import os
-from functools import wraps
+import sys
+from functools import update_wrapper, wraps
+import fractions
+import unicodedata
+
+from six import string_types, text_type
+
+# Python 2/3 compatibility layer. Based on six.
+
+PY3 = sys.version_info[0] == 3
+
+if PY3:
+
+ def get_im_class(meth):
+ return meth.__self__.__class__
+
+ import io
+
+ StringIO = io.StringIO
+ BytesIO = io.BytesIO
+
+ from datetime import timezone
+
+ UTC = timezone.utc
+
+ from tempfile import TemporaryDirectory
+
+else:
+
+ def get_im_class(meth):
+ return meth.im_class
+
+ try:
+ from cStringIO import StringIO
+ except ImportError:
+ from StringIO import StringIO
+ BytesIO = StringIO
+
+ from datetime import tzinfo, timedelta
+
+ ZERO = timedelta(0)
+ HOUR = timedelta(hours=1)
+
+ # A UTC class for python 2.7
+ class UTC(tzinfo):
+ """UTC"""
+
+ def utcoffset(self, dt):
+ return ZERO
+
+ def tzname(self, dt):
+ return "UTC"
+
+ def dst(self, dt):
+ return ZERO
+
+ UTC = UTC()
+
+ import csv
+ import codecs
+ import cStringIO
+
+ class UnicodeWriter:
+ """
+ A CSV writer which will write rows to CSV file "f",
+ which is encoded in the given encoding.
+ see https://docs.python.org/2/library/csv.html
+ """
+
+ def __init__(
+ self, f, dialect=csv.excel, encoding="utf-8", errors='replace', **kwds
+ ):
+ # Redirect output to a queue
+ self.queue = cStringIO.StringIO()
+ self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
+ self.stream = f
+ encoder_cls = codecs.getincrementalencoder(encoding)
+ self.encoder = encoder_cls(errors=errors)
+
+ def encode(self, data):
+ if isinstance(data, string_types):
+ return data.encode("utf-8")
+ else:
+ return data
+
+ def writerow(self, row):
+ self.writer.writerow([self.encode(s) for s in row])
+ # Fetch UTF-8 output from the queue ...
+ data = self.queue.getvalue()
+ data = data.decode("utf-8")
+ # ... and reencode it into the target encoding
+ data = self.encoder.encode(data, 'replace')
+ # write to the target stream
+ self.stream.write(data)
+ # empty queue
+ self.queue.truncate(0)
+
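A usage sketch for the Python 2 branch only (the file name is arbitrary); each row of unicode strings is encoded to the target encoding as it is written:

    from nltk.compat import UnicodeWriter  # available on Python 2 only

    with open('rows.csv', 'wb') as f:  # binary mode, as the Python 2 csv module expects
        writer = UnicodeWriter(f, encoding='utf-8')
        writer.writerow([u'word', u'süß'])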
+ import warnings as _warnings
+ import os as _os
+ from tempfile import mkdtemp
+
+ class TemporaryDirectory(object):
+ """Create and return a temporary directory. This has the same
+ behavior as mkdtemp but can be used as a context manager. For
+ example:
+
+ with TemporaryDirectory() as tmpdir:
+ ...
+
+ Upon exiting the context, the directory and everything contained
+ in it are removed.
+
+ http://stackoverflow.com/questions/19296146/tempfile-temporarydirectory-context-manager-in-python-2-7
+ """
+
+ def __init__(self, suffix="", prefix="tmp", dir=None):
+ self._closed = False
+ self.name = None # Handle mkdtemp raising an exception
+ self.name = mkdtemp(suffix, prefix, dir)
+
+ def __repr__(self):
+ return "<{} {!r}>".format(self.__class__.__name__, self.name)
+
+ def __enter__(self):
+ return self.name
+
+ def cleanup(self, _warn=False):
+ if self.name and not self._closed:
+ try:
+ self._rmtree(self.name)
+ except (TypeError, AttributeError) as ex:
+ # Issue #10188: Emit a warning on stderr
+ # if the directory could not be cleaned
+ # up due to missing globals
+ if "None" not in str(ex):
+ raise
+ print(
+ "ERROR: {!r} while cleaning up {!r}".format(ex, self),
+ file=sys.stderr,
+ )
+ return
+ self._closed = True
+ if _warn:
+ self._warn("Implicitly cleaning up {!r}".format(self), Warning)
+
+ def __exit__(self, exc, value, tb):
+ self.cleanup()
+
+ def __del__(self):
+ # Issue a Warning if implicit cleanup needed
+ self.cleanup(_warn=True)
+
+ # XXX (ncoghlan): The following code attempts to make
+ # this class tolerant of the module nulling out process
+ # that happens during CPython interpreter shutdown
+ # Alas, it doesn't actually manage it. See issue #10188
+ _listdir = staticmethod(_os.listdir)
+ _path_join = staticmethod(_os.path.join)
+ _isdir = staticmethod(_os.path.isdir)
+ _islink = staticmethod(_os.path.islink)
+ _remove = staticmethod(_os.remove)
+ _rmdir = staticmethod(_os.rmdir)
+ _warn = _warnings.warn
+
+ def _rmtree(self, path):
+ # Essentially a stripped down version of shutil.rmtree. We can't
+ # use globals because they may be None'ed out at shutdown.
+ for name in self._listdir(path):
+ fullname = self._path_join(path, name)
+ try:
+ isdir = self._isdir(fullname) and not self._islink(fullname)
+ except OSError:
+ isdir = False
+ if isdir:
+ self._rmtree(fullname)
+ else:
+ try:
+ self._remove(fullname)
+ except OSError:
+ pass
+ try:
+ self._rmdir(path)
+ except OSError:
+ pass
+
# ======= Compatibility for datasets that care about Python versions ========
_PY3_DATA_UPDATES = [os.path.join(*path_list) for path_list in DATA_UPDATES]
+
def add_py3_data(path):
- for item in _PY3_DATA_UPDATES:
- if item in str(path) and "/PY3" not in str(path):
- pos = path.index(item) + len(item)
- if path[pos : pos + 4] == ".zip":
- pos += 4
- path = path[:pos] + "/PY3" + path[pos:]
- break
+ if PY3:
+ for item in _PY3_DATA_UPDATES:
+ if item in str(path) and "/PY3" not in str(path):
+ pos = path.index(item) + len(item)
+ if path[pos : pos + 4] == ".zip":
+ pos += 4
+ path = path[:pos] + "/PY3" + path[pos:]
+ break
return path
return init_func(*args, **kwargs)
return wraps(init_func)(_decorator)
+
+
+# ======= Compatibility layer for __str__ and __repr__ ==========
+def remove_accents(text):
+
+ if isinstance(text, bytes):
+ text = text.decode('ascii')
+
+ category = unicodedata.category # this gives a small (~10%) speedup
+ return ''.join(
+ c for c in unicodedata.normalize('NFKD', text) if category(c) != 'Mn'
+ )
+
+
+# Select the best transliteration method:
+try:
+ # Older versions of Unidecode are licensed under Artistic License;
+ # assume an older version is installed.
+ from unidecode import unidecode as transliterate
+except ImportError:
+ try:
+ # text-unidecode implementation is worse than Unidecode
+ # implementation so Unidecode is preferred.
+ from text_unidecode import unidecode as transliterate
+ except ImportError:
+ # This transliteration method should be enough
+ # for many Western languages.
+ transliterate = remove_accents
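For example, the accent-stripping fallback simply decomposes the text and drops combining marks:

    from nltk.compat import remove_accents

    print(remove_accents(u'Ünïcödé tëxt'))  # -> Unicode text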
+
+
+def python_2_unicode_compatible(klass):
+ """
+ This decorator defines __unicode__ method and fixes
+ __repr__ and __str__ methods under Python 2.
+
+ To support Python 2 and 3 with a single code base,
+ define __str__ and __repr__ methods returning unicode
+ text and apply this decorator to the class.
+
+ Original __repr__ and __str__ would be available
+ as unicode_repr and __unicode__ (under both Python 2
+ and Python 3).
+ """
+
+ if not issubclass(klass, object):
+ raise ValueError("This decorator doesn't work for old-style classes")
+
+ # both __unicode__ and unicode_repr are public because they
+ # may be useful in console under Python 2.x
+
+    # if __str__ or __repr__ are not overridden in a subclass,
+    # they may already be fixed by this decorator in a parent class
+    # and we shouldn't fix them again
+
+ if not _was_fixed(klass.__str__):
+ klass.__unicode__ = klass.__str__
+ if not PY3:
+ klass.__str__ = _7bit(_transliterated(klass.__unicode__))
+
+ if not _was_fixed(klass.__repr__):
+ klass.unicode_repr = klass.__repr__
+ if not PY3:
+ klass.__repr__ = _7bit(klass.unicode_repr)
+
+ return klass
+
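A minimal sketch of the intended usage (the class is invented for illustration): define text-returning __str__ and __repr__ once, and let the decorator supply the Python 2 byte-string variants:

    from nltk.compat import python_2_unicode_compatible

    @python_2_unicode_compatible
    class Token(object):
        def __init__(self, text):
            self.text = text

        def __str__(self):
            return self.text                 # may contain non-ASCII characters

        def __repr__(self):
            return '<Token %s>' % self.text

    # On Python 3 this is essentially a no-op (apart from the __unicode__ and
    # unicode_repr aliases); on Python 2 the methods are rebound to
    # transliterated / backslash-escaped 7-bit versions.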
+
+def unicode_repr(obj):
+ """
+    For classes that were fixed with @python_2_unicode_compatible
+ ``unicode_repr`` returns ``obj.unicode_repr()``; for unicode strings
+ the result is returned without "u" letter (to make output the
+ same under Python 2.x and Python 3.x); for other variables
+ it is the same as ``repr``.
+ """
+ if PY3:
+ return repr(obj)
+
+ # Python 2.x
+ if hasattr(obj, 'unicode_repr'):
+ return obj.unicode_repr()
+
+ if isinstance(obj, text_type):
+ return repr(obj)[1:] # strip "u" letter from output
+
+ return repr(obj)
+
+
+def _transliterated(method):
+ def wrapper(self):
+ return transliterate(method(self))
+
+ update_wrapper(wrapper, method, ["__name__", "__doc__"])
+ if hasattr(method, "_nltk_compat_7bit"):
+ wrapper._nltk_compat_7bit = method._nltk_compat_7bit
+
+ wrapper._nltk_compat_transliterated = True
+ return wrapper
+
+
+def _7bit(method):
+ def wrapper(self):
+ return method(self).encode('ascii', 'backslashreplace')
+
+ update_wrapper(wrapper, method, ["__name__", "__doc__"])
+
+ if hasattr(method, "_nltk_compat_transliterated"):
+ wrapper._nltk_compat_transliterated = method._nltk_compat_transliterated
+
+ wrapper._nltk_compat_7bit = True
+ return wrapper
+
+
+def _was_fixed(method):
+ return getattr(method, "_nltk_compat_7bit", False) or getattr(
+ method, "_nltk_compat_transliterated", False
+ )
+
+
+class Fraction(fractions.Fraction):
+ """
+ This is a simplified backwards compatible version of fractions.Fraction
+    from Python >= 3.5. It adds the `_normalize` parameter so that the fraction
+    is not reduced by the greatest common divisor (gcd); in particular, a zero
+    numerator keeps its original denominator rather than collapsing to 0/1.
+
+    This is most probably only used by nltk.translate.bleu_score, where the
+    numerator and denominator of the different ngram precisions are mutable.
+    But the idea of a "mutable" fraction might not be applicable to other usages;
+    see http://stackoverflow.com/questions/34561265
+
+    This object should be deprecated once NLTK stops supporting Python < 3.5.
+ See https://github.com/nltk/nltk/issues/1330
+ """
+
+ def __new__(cls, numerator=0, denominator=None, _normalize=True):
+ cls = super(Fraction, cls).__new__(cls, numerator, denominator)
+ # To emulate fraction.Fraction.from_float across Python >=2.7,
+ # check that numerator is an integer and denominator is not None.
+ if not _normalize and type(numerator) == int and denominator:
+ cls._numerator = numerator
+ cls._denominator = denominator
+ return cls
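A short sketch of the `_normalize` behavior described in the docstring: with normalization disabled, a zero numerator keeps its original denominator instead of being reduced to 0/1:

    from nltk.compat import Fraction

    print(Fraction(0, 5))              # 0, reduced to 0/1 as usual
    p = Fraction(0, 5, _normalize=False)
    print(p.numerator, p.denominator)  # 0 5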
# Natural Language Toolkit: Corpus Readers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus.reader import *
abc = LazyCorpusLoader(
- "abc",
+ 'abc',
PlaintextCorpusReader,
- r"(?!\.).*\.txt",
- encoding=[("science", "latin_1"), ("rural", "utf8")],
+ r'(?!\.).*\.txt',
+ encoding=[('science', 'latin_1'), ('rural', 'utf8')],
)
-alpino = LazyCorpusLoader("alpino", AlpinoCorpusReader, tagset="alpino")
+alpino = LazyCorpusLoader('alpino', AlpinoCorpusReader, tagset='alpino')
brown = LazyCorpusLoader(
- "brown",
+ 'brown',
CategorizedTaggedCorpusReader,
- r"c[a-z]\d\d",
- cat_file="cats.txt",
- tagset="brown",
+ r'c[a-z]\d\d',
+ cat_file='cats.txt',
+ tagset='brown',
encoding="ascii",
)
cess_cat = LazyCorpusLoader(
- "cess_cat",
+ 'cess_cat',
BracketParseCorpusReader,
- r"(?!\.).*\.tbf",
- tagset="unknown",
- encoding="ISO-8859-15",
+ r'(?!\.).*\.tbf',
+ tagset='unknown',
+ encoding='ISO-8859-15',
)
cess_esp = LazyCorpusLoader(
- "cess_esp",
+ 'cess_esp',
BracketParseCorpusReader,
- r"(?!\.).*\.tbf",
- tagset="unknown",
- encoding="ISO-8859-15",
+ r'(?!\.).*\.tbf',
+ tagset='unknown',
+ encoding='ISO-8859-15',
)
-cmudict = LazyCorpusLoader("cmudict", CMUDictCorpusReader, ["cmudict"])
-comtrans = LazyCorpusLoader("comtrans", AlignedCorpusReader, r"(?!\.).*\.txt")
+cmudict = LazyCorpusLoader('cmudict', CMUDictCorpusReader, ['cmudict'])
+comtrans = LazyCorpusLoader('comtrans', AlignedCorpusReader, r'(?!\.).*\.txt')
comparative_sentences = LazyCorpusLoader(
- "comparative_sentences",
+ 'comparative_sentences',
ComparativeSentencesCorpusReader,
- r"labeledSentences\.txt",
- encoding="latin-1",
+ r'labeledSentences\.txt',
+ encoding='latin-1',
)
conll2000 = LazyCorpusLoader(
- "conll2000",
+ 'conll2000',
ConllChunkCorpusReader,
- ["train.txt", "test.txt"],
- ("NP", "VP", "PP"),
- tagset="wsj",
- encoding="ascii",
+ ['train.txt', 'test.txt'],
+ ('NP', 'VP', 'PP'),
+ tagset='wsj',
+ encoding='ascii',
)
conll2002 = LazyCorpusLoader(
- "conll2002",
+ 'conll2002',
ConllChunkCorpusReader,
- ".*\.(test|train).*",
- ("LOC", "PER", "ORG", "MISC"),
- encoding="utf-8",
+ '.*\.(test|train).*',
+ ('LOC', 'PER', 'ORG', 'MISC'),
+ encoding='utf-8',
)
conll2007 = LazyCorpusLoader(
- "conll2007",
+ 'conll2007',
DependencyCorpusReader,
- ".*\.(test|train).*",
- encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
+ '.*\.(test|train).*',
+ encoding=[('eus', 'ISO-8859-2'), ('esp', 'utf8')],
)
-crubadan = LazyCorpusLoader("crubadan", CrubadanCorpusReader, ".*\.txt")
+crubadan = LazyCorpusLoader('crubadan', CrubadanCorpusReader, '.*\.txt')
dependency_treebank = LazyCorpusLoader(
- "dependency_treebank", DependencyCorpusReader, ".*\.dp", encoding="ascii"
+ 'dependency_treebank', DependencyCorpusReader, '.*\.dp', encoding='ascii'
)
floresta = LazyCorpusLoader(
- "floresta",
+ 'floresta',
BracketParseCorpusReader,
- r"(?!\.).*\.ptb",
- "#",
- tagset="unknown",
- encoding="ISO-8859-15",
+ r'(?!\.).*\.ptb',
+ '#',
+ tagset='unknown',
+ encoding='ISO-8859-15',
)
framenet15 = LazyCorpusLoader(
- "framenet_v15",
+ 'framenet_v15',
FramenetCorpusReader,
[
- "frRelation.xml",
- "frameIndex.xml",
- "fulltextIndex.xml",
- "luIndex.xml",
- "semTypes.xml",
+ 'frRelation.xml',
+ 'frameIndex.xml',
+ 'fulltextIndex.xml',
+ 'luIndex.xml',
+ 'semTypes.xml',
],
)
framenet = LazyCorpusLoader(
- "framenet_v17",
+ 'framenet_v17',
FramenetCorpusReader,
[
- "frRelation.xml",
- "frameIndex.xml",
- "fulltextIndex.xml",
- "luIndex.xml",
- "semTypes.xml",
+ 'frRelation.xml',
+ 'frameIndex.xml',
+ 'fulltextIndex.xml',
+ 'luIndex.xml',
+ 'semTypes.xml',
],
)
gazetteers = LazyCorpusLoader(
- "gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2"
+ 'gazetteers', WordListCorpusReader, r'(?!LICENSE|\.).*\.txt', encoding='ISO-8859-2'
)
genesis = LazyCorpusLoader(
- "genesis",
+ 'genesis',
PlaintextCorpusReader,
- r"(?!\.).*\.txt",
+ r'(?!\.).*\.txt',
encoding=[
- ("finnish|french|german", "latin_1"),
- ("swedish", "cp865"),
- (".*", "utf_8"),
+ ('finnish|french|german', 'latin_1'),
+ ('swedish', 'cp865'),
+ ('.*', 'utf_8'),
],
)
gutenberg = LazyCorpusLoader(
- "gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
+ 'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1'
)
-ieer = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*")
+ieer = LazyCorpusLoader('ieer', IEERCorpusReader, r'(?!README|\.).*')
inaugural = LazyCorpusLoader(
- "inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
+ 'inaugural', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1'
)
# [XX] This should probably just use TaggedCorpusReader:
indian = LazyCorpusLoader(
- "indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8"
+ 'indian', IndianCorpusReader, r'(?!\.).*\.pos', tagset='unknown', encoding='utf8'
)
-jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8")
-knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp")
-lin_thesaurus = LazyCorpusLoader("lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp")
+jeita = LazyCorpusLoader('jeita', ChasenCorpusReader, r'.*\.chasen', encoding='utf-8')
+knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')
+lin_thesaurus = LazyCorpusLoader('lin_thesaurus', LinThesaurusCorpusReader, r'.*\.lsp')
mac_morpho = LazyCorpusLoader(
- "mac_morpho",
+ 'mac_morpho',
MacMorphoCorpusReader,
- r"(?!\.).*\.txt",
- tagset="unknown",
- encoding="latin-1",
+ r'(?!\.).*\.txt',
+ tagset='unknown',
+ encoding='latin-1',
)
machado = LazyCorpusLoader(
- "machado",
+ 'machado',
PortugueseCategorizedPlaintextCorpusReader,
- r"(?!\.).*\.txt",
- cat_pattern=r"([a-z]*)/.*",
- encoding="latin-1",
+ r'(?!\.).*\.txt',
+ cat_pattern=r'([a-z]*)/.*',
+ encoding='latin-1',
)
masc_tagged = LazyCorpusLoader(
- "masc_tagged",
+ 'masc_tagged',
CategorizedTaggedCorpusReader,
- r"(spoken|written)/.*\.txt",
- cat_file="categories.txt",
- tagset="wsj",
+ r'(spoken|written)/.*\.txt',
+ cat_file='categories.txt',
+ tagset='wsj',
encoding="utf-8",
sep="_",
)
movie_reviews = LazyCorpusLoader(
- "movie_reviews",
+ 'movie_reviews',
CategorizedPlaintextCorpusReader,
- r"(?!\.).*\.txt",
- cat_pattern=r"(neg|pos)/.*",
- encoding="ascii",
+ r'(?!\.).*\.txt',
+ cat_pattern=r'(neg|pos)/.*',
+ encoding='ascii',
)
multext_east = LazyCorpusLoader(
- "mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8"
+ 'mte_teip5', MTECorpusReader, r'(oana).*\.xml', encoding="utf-8"
)
names = LazyCorpusLoader(
- "names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii"
+ 'names', WordListCorpusReader, r'(?!\.).*\.txt', encoding='ascii'
)
nps_chat = LazyCorpusLoader(
- "nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj"
+ 'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml', tagset='wsj'
)
opinion_lexicon = LazyCorpusLoader(
- "opinion_lexicon",
+ 'opinion_lexicon',
OpinionLexiconCorpusReader,
- r"(\w+)\-words\.txt",
- encoding="ISO-8859-2",
+ r'(\w+)\-words\.txt',
+ encoding='ISO-8859-2',
)
ppattach = LazyCorpusLoader(
- "ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"]
+ 'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset']
)
product_reviews_1 = LazyCorpusLoader(
- "product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
+ 'product_reviews_1', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8'
)
product_reviews_2 = LazyCorpusLoader(
- "product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
+ 'product_reviews_2', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8'
)
pros_cons = LazyCorpusLoader(
- "pros_cons",
+ 'pros_cons',
ProsConsCorpusReader,
- r"Integrated(Cons|Pros)\.txt",
- cat_pattern=r"Integrated(Cons|Pros)\.txt",
- encoding="ISO-8859-2",
+ r'Integrated(Cons|Pros)\.txt',
+ cat_pattern=r'Integrated(Cons|Pros)\.txt',
+ encoding='ISO-8859-2',
)
ptb = LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions
- "ptb",
+ 'ptb',
CategorizedBracketParseCorpusReader,
- r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG",
- cat_file="allcats.txt",
- tagset="wsj",
+ r'(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG',
+ cat_file='allcats.txt',
+ tagset='wsj',
)
qc = LazyCorpusLoader(
- "qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2"
+ 'qc', StringCategoryCorpusReader, ['train.txt', 'test.txt'], encoding='ISO-8859-2'
)
reuters = LazyCorpusLoader(
- "reuters",
+ 'reuters',
CategorizedPlaintextCorpusReader,
- "(training|test).*",
- cat_file="cats.txt",
- encoding="ISO-8859-2",
+ '(training|test).*',
+ cat_file='cats.txt',
+ encoding='ISO-8859-2',
)
-rte = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml")
-senseval = LazyCorpusLoader("senseval", SensevalCorpusReader, r"(?!\.).*\.pos")
+rte = LazyCorpusLoader('rte', RTECorpusReader, r'(?!\.).*\.xml')
+senseval = LazyCorpusLoader('senseval', SensevalCorpusReader, r'(?!\.).*\.pos')
sentence_polarity = LazyCorpusLoader(
- "sentence_polarity",
+ 'sentence_polarity',
CategorizedSentencesCorpusReader,
- r"rt-polarity\.(neg|pos)",
- cat_pattern=r"rt-polarity\.(neg|pos)",
- encoding="utf-8",
+ r'rt-polarity\.(neg|pos)',
+ cat_pattern=r'rt-polarity\.(neg|pos)',
+ encoding='utf-8',
)
sentiwordnet = LazyCorpusLoader(
- "sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8"
+ 'sentiwordnet', SentiWordNetCorpusReader, 'SentiWordNet_3.0.0.txt', encoding='utf-8'
)
-shakespeare = LazyCorpusLoader("shakespeare", XMLCorpusReader, r"(?!\.).*\.xml")
+shakespeare = LazyCorpusLoader('shakespeare', XMLCorpusReader, r'(?!\.).*\.xml')
sinica_treebank = LazyCorpusLoader(
- "sinica_treebank",
+ 'sinica_treebank',
SinicaTreebankCorpusReader,
- ["parsed"],
- tagset="unknown",
- encoding="utf-8",
+ ['parsed'],
+ tagset='unknown',
+ encoding='utf-8',
)
state_union = LazyCorpusLoader(
- "state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2"
+ 'state_union', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='ISO-8859-2'
)
stopwords = LazyCorpusLoader(
- "stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8"
+ 'stopwords', WordListCorpusReader, r'(?!README|\.).*', encoding='utf8'
)
subjectivity = LazyCorpusLoader(
- "subjectivity",
+ 'subjectivity',
CategorizedSentencesCorpusReader,
- r"(quote.tok.gt9|plot.tok.gt9)\.5000",
- cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]},
- encoding="latin-1",
+ r'(quote.tok.gt9|plot.tok.gt9)\.5000',
+ cat_map={'quote.tok.gt9.5000': ['subj'], 'plot.tok.gt9.5000': ['obj']},
+ encoding='latin-1',
)
swadesh = LazyCorpusLoader(
- "swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8"
+ 'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8'
)
swadesh110 = LazyCorpusLoader(
- 'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8'
+ 'panlex_swadesh', SwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8'
)
swadesh207 = LazyCorpusLoader(
- 'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8'
+ 'panlex_swadesh', SwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8'
)
-switchboard = LazyCorpusLoader("switchboard", SwitchboardCorpusReader, tagset="wsj")
-timit = LazyCorpusLoader("timit", TimitCorpusReader)
+switchboard = LazyCorpusLoader('switchboard', SwitchboardCorpusReader, tagset='wsj')
+timit = LazyCorpusLoader('timit', TimitCorpusReader)
timit_tagged = LazyCorpusLoader(
- "timit", TimitTaggedCorpusReader, ".+\.tags", tagset="wsj", encoding="ascii"
+ 'timit', TimitTaggedCorpusReader, '.+\.tags', tagset='wsj', encoding='ascii'
)
toolbox = LazyCorpusLoader(
- "toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)"
+ 'toolbox', ToolboxCorpusReader, r'(?!.*(README|\.)).*\.(dic|txt)'
)
treebank = LazyCorpusLoader(
- "treebank/combined",
+ 'treebank/combined',
BracketParseCorpusReader,
- r"wsj_.*\.mrg",
- tagset="wsj",
- encoding="ascii",
+ r'wsj_.*\.mrg',
+ tagset='wsj',
+ encoding='ascii',
)
treebank_chunk = LazyCorpusLoader(
- "treebank/tagged",
+ 'treebank/tagged',
ChunkedCorpusReader,
- r"wsj_.*\.pos",
- sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True),
+ r'wsj_.*\.pos',
+ sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
para_block_reader=tagged_treebank_para_block_reader,
- tagset="wsj",
- encoding="ascii",
+ tagset='wsj',
+ encoding='ascii',
)
treebank_raw = LazyCorpusLoader(
- "treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
+ 'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2'
)
-twitter_samples = LazyCorpusLoader("twitter_samples", TwitterCorpusReader, ".*\.json")
-udhr = LazyCorpusLoader("udhr", UdhrCorpusReader)
-udhr2 = LazyCorpusLoader("udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8")
+twitter_samples = LazyCorpusLoader('twitter_samples', TwitterCorpusReader, '.*\.json')
+udhr = LazyCorpusLoader('udhr', UdhrCorpusReader)
+udhr2 = LazyCorpusLoader('udhr2', PlaintextCorpusReader, r'.*\.txt', encoding='utf8')
universal_treebanks = LazyCorpusLoader(
- "universal_treebanks_v20",
+ 'universal_treebanks_v20',
ConllCorpusReader,
- r".*\.conll",
+ r'.*\.conll',
columntypes=(
- "ignore",
- "words",
- "ignore",
- "ignore",
- "pos",
- "ignore",
- "ignore",
- "ignore",
- "ignore",
- "ignore",
+ 'ignore',
+ 'words',
+ 'ignore',
+ 'ignore',
+ 'pos',
+ 'ignore',
+ 'ignore',
+ 'ignore',
+ 'ignore',
+ 'ignore',
),
)
-verbnet = LazyCorpusLoader("verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml")
+verbnet = LazyCorpusLoader('verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
webtext = LazyCorpusLoader(
- "webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2"
+ 'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt', encoding='ISO-8859-2'
)
wordnet = LazyCorpusLoader(
- "wordnet",
+ 'wordnet',
WordNetCorpusReader,
- LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
+ LazyCorpusLoader('omw', CorpusReader, r'.*/wn-data-.*\.tab', encoding='utf8'),
)
-wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, ".*\.dat")
+wordnet_ic = LazyCorpusLoader('wordnet_ic', WordNetICCorpusReader, '.*\.dat')
words = LazyCorpusLoader(
- "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
+ 'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii'
)
# defined after treebank
propbank = LazyCorpusLoader(
- "propbank",
+ 'propbank',
PropbankCorpusReader,
- "prop.txt",
- "frames/.*\.xml",
- "verbs.txt",
- lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
+ 'prop.txt',
+ 'frames/.*\.xml',
+ 'verbs.txt',
+ lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
treebank,
) # Must be defined *after* treebank corpus.
nombank = LazyCorpusLoader(
- "nombank.1.0",
+ 'nombank.1.0',
NombankCorpusReader,
- "nombank.1.0",
- "frames/.*\.xml",
- "nombank.1.0.words",
- lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
+ 'nombank.1.0',
+ 'frames/.*\.xml',
+ 'nombank.1.0.words',
+ lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
treebank,
) # Must be defined *after* treebank corpus.
propbank_ptb = LazyCorpusLoader(
- "propbank",
+ 'propbank',
PropbankCorpusReader,
- "prop.txt",
- "frames/.*\.xml",
- "verbs.txt",
+ 'prop.txt',
+ 'frames/.*\.xml',
+ 'verbs.txt',
lambda filename: filename.upper(),
ptb,
) # Must be defined *after* ptb corpus.
nombank_ptb = LazyCorpusLoader(
- "nombank.1.0",
+ 'nombank.1.0',
NombankCorpusReader,
- "nombank.1.0",
- "frames/.*\.xml",
- "nombank.1.0.words",
+ 'nombank.1.0',
+ 'frames/.*\.xml',
+ 'nombank.1.0.words',
lambda filename: filename.upper(),
ptb,
) # Must be defined *after* ptb corpus.
semcor = LazyCorpusLoader(
- "semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet
+ 'semcor', SemcorCorpusReader, r'brown./tagfiles/br-.*\.xml', wordnet
) # Must be defined *after* wordnet corpus.
nonbreaking_prefixes = LazyCorpusLoader(
- "nonbreaking_prefixes",
+ 'nonbreaking_prefixes',
NonbreakingPrefixesCorpusReader,
- r"(?!README|\.).*",
- encoding="utf8",
+ r'(?!README|\.).*',
+ encoding='utf8',
)
perluniprops = LazyCorpusLoader(
- "perluniprops",
+ 'perluniprops',
UnicharsCorpusReader,
- r"(?!README|\.).*",
- nltk_data_subdir="misc",
- encoding="utf8",
+ r'(?!README|\.).*',
+ nltk_data_subdir='misc',
+ encoding='utf8',
)
# mwa_ppdb = LazyCorpusLoader(
# ycoe.demo()
-if __name__ == "__main__":
+if __name__ == '__main__':
# demo()
pass
for name in dir(nltk.corpus):
obj = getattr(nltk.corpus, name, None)
- if isinstance(obj, CorpusReader) and hasattr(obj, "_unload"):
+ if isinstance(obj, CorpusReader) and hasattr(obj, '_unload'):
obj._unload()
# Natural Language Toolkit: Europarl Corpus Readers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Nitin Madnani <nmadnani@umiacs.umd.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# Create a new corpus reader instance for each European language
danish = LazyCorpusLoader(
- "europarl_raw/danish", EuroparlCorpusReader, r"ep-.*\.da", encoding="utf-8"
+ 'europarl_raw/danish', EuroparlCorpusReader, r'ep-.*\.da', encoding='utf-8'
)
dutch = LazyCorpusLoader(
- "europarl_raw/dutch", EuroparlCorpusReader, r"ep-.*\.nl", encoding="utf-8"
+ 'europarl_raw/dutch', EuroparlCorpusReader, r'ep-.*\.nl', encoding='utf-8'
)
english = LazyCorpusLoader(
- "europarl_raw/english", EuroparlCorpusReader, r"ep-.*\.en", encoding="utf-8"
+ 'europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8'
)
finnish = LazyCorpusLoader(
- "europarl_raw/finnish", EuroparlCorpusReader, r"ep-.*\.fi", encoding="utf-8"
+ 'europarl_raw/finnish', EuroparlCorpusReader, r'ep-.*\.fi', encoding='utf-8'
)
french = LazyCorpusLoader(
- "europarl_raw/french", EuroparlCorpusReader, r"ep-.*\.fr", encoding="utf-8"
+ 'europarl_raw/french', EuroparlCorpusReader, r'ep-.*\.fr', encoding='utf-8'
)
german = LazyCorpusLoader(
- "europarl_raw/german", EuroparlCorpusReader, r"ep-.*\.de", encoding="utf-8"
+ 'europarl_raw/german', EuroparlCorpusReader, r'ep-.*\.de', encoding='utf-8'
)
greek = LazyCorpusLoader(
- "europarl_raw/greek", EuroparlCorpusReader, r"ep-.*\.el", encoding="utf-8"
+ 'europarl_raw/greek', EuroparlCorpusReader, r'ep-.*\.el', encoding='utf-8'
)
italian = LazyCorpusLoader(
- "europarl_raw/italian", EuroparlCorpusReader, r"ep-.*\.it", encoding="utf-8"
+ 'europarl_raw/italian', EuroparlCorpusReader, r'ep-.*\.it', encoding='utf-8'
)
portuguese = LazyCorpusLoader(
- "europarl_raw/portuguese", EuroparlCorpusReader, r"ep-.*\.pt", encoding="utf-8"
+ 'europarl_raw/portuguese', EuroparlCorpusReader, r'ep-.*\.pt', encoding='utf-8'
)
spanish = LazyCorpusLoader(
- "europarl_raw/spanish", EuroparlCorpusReader, r"ep-.*\.es", encoding="utf-8"
+ 'europarl_raw/spanish', EuroparlCorpusReader, r'ep-.*\.es', encoding='utf-8'
)
swedish = LazyCorpusLoader(
- "europarl_raw/swedish", EuroparlCorpusReader, r"ep-.*\.sv", encoding="utf-8"
+ 'europarl_raw/swedish', EuroparlCorpusReader, r'ep-.*\.sv', encoding='utf-8'
)
# Natural Language Toolkit: Corpus Readers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
from nltk.corpus.reader.categorized_sents import *
from nltk.corpus.reader.comparative_sents import *
from nltk.corpus.reader.panlex_lite import *
-from nltk.corpus.reader.panlex_swadesh import *
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
# the function bracket_parse() defined in nltk.tree:
'NonbreakingPrefixesCorpusReader',
'UnicharsCorpusReader',
'MWAPPDBCorpusReader',
- 'PanlexSwadeshCorpusReader',
]
# Natural Language Toolkit: Aligned Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# Author: Steven Bird <stevenbird1@gmail.com>
# For license information, see LICENSE.TXT
+from six import string_types
+
from nltk.tokenize import WhitespaceTokenizer, RegexpTokenizer
from nltk.translate import AlignedSent, Alignment
self,
root,
fileids,
- sep="/",
+ sep='/',
word_tokenizer=WhitespaceTokenizer(),
- sent_tokenizer=RegexpTokenizer("\n", gaps=True),
+ sent_tokenizer=RegexpTokenizer('\n', gaps=True),
alignedsent_block_reader=read_alignedsent_block,
- encoding="latin1",
+ encoding='latin1',
):
"""
Construct a new Aligned Corpus reader for a set of documents
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
# Natural Language Toolkit: API for Corpus Readers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
"""
API for corpus readers.
"""
+from __future__ import unicode_literals
import os
import re
from collections import defaultdict
from itertools import chain
+from six import string_types
+
+from nltk import compat
from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
from nltk.corpus.reader.util import *
+@compat.python_2_unicode_compatible
class CorpusReader(object):
"""
A base class for "corpus reader" classes, each of which can be
be used to select which portion of the corpus should be returned.
"""
- def __init__(self, root, fileids, encoding="utf8", tagset=None):
+ def __init__(self, root, fileids, encoding='utf8', tagset=None):
"""
:type root: PathPointer or str
:param root: A path pointer identifying the root directory for
tagged_...() methods.
"""
# Convert the root to a path pointer, if necessary.
- if isinstance(root, str) and not isinstance(root, PathPointer):
- m = re.match("(.*\.zip)/?(.*)$|", root)
+ if isinstance(root, string_types) and not isinstance(root, PathPointer):
+ m = re.match('(.*\.zip)/?(.*)$|', root)
zipfile, zipentry = m.groups()
if zipfile:
root = ZipFilePathPointer(zipfile, zipentry)
else:
root = FileSystemPathPointer(root)
elif not isinstance(root, PathPointer):
- raise TypeError("CorpusReader: expected a string or a PathPointer")
+ raise TypeError('CorpusReader: expected a string or a PathPointer')
# If `fileids` is a regexp, then expand it.
- if isinstance(fileids, str):
+ if isinstance(fileids, string_types):
fileids = find_corpus_fileids(root, fileids)
self._fileids = fileids
def __repr__(self):
if isinstance(self._root, ZipFilePathPointer):
- path = "%s/%s" % (self._root.zipfile.filename, self._root.entry)
+ path = '%s/%s' % (self._root.zipfile.filename, self._root.entry)
else:
- path = "%s" % self._root.path
- return "<%s in %r>" % (self.__class__.__name__, path)
+ path = '%s' % self._root.path
+ return '<%s in %r>' % (self.__class__.__name__, path)
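As the zip-handling branch above suggests, a reader can be pointed either at a plain directory or at a path inside a zip archive; a sketch, with a hypothetical local path:

    from nltk.corpus.reader.api import CorpusReader

    # '/home/user/nltk_data/corpora/abc.zip/abc/' is assumed to exist locally.
    reader = CorpusReader('/home/user/nltk_data/corpora/abc.zip/abc/', r'(?!\.).*\.txt')
    print(reader.fileids())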
def ensure_loaded(self):
"""
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
paths = [self._root.join(f) for f in fileids]
self._file = None #: fileid of file containing the mapping
self._delimiter = None #: delimiter for ``self._file``
- if "cat_pattern" in kwargs:
- self._pattern = kwargs["cat_pattern"]
- del kwargs["cat_pattern"]
- elif "cat_map" in kwargs:
- self._map = kwargs["cat_map"]
- del kwargs["cat_map"]
- elif "cat_file" in kwargs:
- self._file = kwargs["cat_file"]
- del kwargs["cat_file"]
- if "cat_delimiter" in kwargs:
- self._delimiter = kwargs["cat_delimiter"]
- del kwargs["cat_delimiter"]
+ if 'cat_pattern' in kwargs:
+ self._pattern = kwargs['cat_pattern']
+ del kwargs['cat_pattern']
+ elif 'cat_map' in kwargs:
+ self._map = kwargs['cat_map']
+ del kwargs['cat_map']
+ elif 'cat_file' in kwargs:
+ self._file = kwargs['cat_file']
+ del kwargs['cat_file']
+ if 'cat_delimiter' in kwargs:
+ self._delimiter = kwargs['cat_delimiter']
+ del kwargs['cat_delimiter']
else:
raise ValueError(
- "Expected keyword argument cat_pattern or " "cat_map or cat_file."
+ 'Expected keyword argument cat_pattern or ' 'cat_map or cat_file.'
)
- if "cat_pattern" in kwargs or "cat_map" in kwargs or "cat_file" in kwargs:
+ if 'cat_pattern' in kwargs or 'cat_map' in kwargs or 'cat_file' in kwargs:
raise ValueError(
- "Specify exactly one of: cat_pattern, " "cat_map, cat_file."
+ 'Specify exactly one of: cat_pattern, ' 'cat_map, cat_file.'
)
def _init(self):
file_id, categories = line.split(self._delimiter, 1)
if file_id not in self.fileids():
raise ValueError(
- "In category mapping file %s: %s "
- "not found" % (self._file, file_id)
+ 'In category mapping file %s: %s '
+ 'not found' % (self._file, file_id)
)
for category in categories.split(self._delimiter):
self._add(file_id, category)
self._init()
if fileids is None:
return sorted(self._c2f)
- if isinstance(fileids, str):
+ if isinstance(fileids, string_types):
fileids = [fileids]
return sorted(set.union(*[self._f2c[d] for d in fileids]))
"""
if categories is None:
return super(CategorizedCorpusReader, self).fileids()
- elif isinstance(categories, str):
+ elif isinstance(categories, string_types):
if self._f2c is None:
self._init()
if categories in self._c2f:
return sorted(self._c2f[categories])
else:
- raise ValueError("Category %s not found" % categories)
+ raise ValueError('Category %s not found' % categories)
else:
if self._f2c is None:
self._init()
def raw(self, fileids=None):
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
# Natural Language Toolkit: Plaintext Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
"""
- tag = "c5" if c5 else "pos"
+ tag = 'c5' if c5 else 'pos'
return self._views(fileids, False, tag, strip_space, stem)
def sents(self, fileids=None, strip_space=True, stem=False):
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
"""
- tag = "c5" if c5 else "pos"
+ tag = 'c5' if c5 else 'pos'
return self._views(
fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem
)
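# Illustrative sketch (the root path and fileid pattern are assumptions, not
# part of the original source): reading a local copy of the BNC XML texts
# with either simplified 'pos' tags or fine-grained C5 tags.
from nltk.corpus.reader.bnc import BNCCorpusReader
bnc = BNCCorpusReader(root='BNC/Texts', fileids=r'[A-K]/\w*/\w*\.xml')
print(bnc.tagged_words()[:5])          # ('word', 'pos') pairs
print(bnc.tagged_words(c5=True)[:5])   # ('word', 'C5') pairs
print(bnc.sents()[0])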
result = []
xmldoc = ElementTree.parse(fileid).getroot()
- for xmlsent in xmldoc.findall(".//s"):
+ for xmlsent in xmldoc.findall('.//s'):
sent = []
for xmlword in _all_xmlwords_in(xmlsent):
word = xmlword.text
if strip_space or stem:
word = word.strip()
if stem:
- word = xmlword.get("hw", word)
- if tag == "c5":
- word = (word, xmlword.get("c5"))
- elif tag == "pos":
- word = (word, xmlword.get("pos", xmlword.get("c5")))
+ word = xmlword.get('hw', word)
+ if tag == 'c5':
+ word = (word, xmlword.get('c5'))
+ elif tag == 'pos':
+ word = (word, xmlword.get('pos', xmlword.get('c5')))
sent.append(word)
if bracket_sent:
- result.append(BNCSentence(xmlsent.attrib["n"], sent))
+ result.append(BNCSentence(xmlsent.attrib['n'], sent))
else:
result.extend(sent)
if result is None:
result = []
for child in elt:
- if child.tag in ("c", "w"):
+ if child.tag in ('c', 'w'):
result.append(child)
else:
_all_xmlwords_in(child, result)
"""
tags_to_ignore = set(
- ["pb", "gap", "vocal", "event", "unclear", "shift", "pause", "align"]
+ ['pb', 'gap', 'vocal', 'event', 'unclear', 'shift', 'pause', 'align']
)
"""These tags are ignored. For their description refer to the
technical documentation, for example,
:param stem: If true, then substitute stems for words.
"""
if sent:
- tagspec = ".*/s"
+ tagspec = '.*/s'
else:
- tagspec = ".*/s/(.*/)?(c|w)"
+ tagspec = '.*/s/(.*/)?(c|w)'
self._sent = sent
self._tag = tag
self._strip_space = strip_space
# Read in a tasty header.
self._open()
- self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
+ self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
self.close()
# Reset tag context.
def handle_header(self, elt, context):
# Set up some metadata!
- titles = elt.findall("titleStmt/title")
+ titles = elt.findall('titleStmt/title')
if titles:
- self.title = "\n".join(title.text.strip() for title in titles)
+ self.title = '\n'.join(title.text.strip() for title in titles)
- authors = elt.findall("titleStmt/author")
+ authors = elt.findall('titleStmt/author')
if authors:
- self.author = "\n".join(author.text.strip() for author in authors)
+ self.author = '\n'.join(author.text.strip() for author in authors)
- editors = elt.findall("titleStmt/editor")
+ editors = elt.findall('titleStmt/editor')
if editors:
- self.editor = "\n".join(editor.text.strip() for editor in editors)
+ self.editor = '\n'.join(editor.text.strip() for editor in editors)
- resps = elt.findall("titleStmt/respStmt")
+ resps = elt.findall('titleStmt/respStmt')
if resps:
- self.resps = "\n\n".join(
- "\n".join(resp_elt.text.strip() for resp_elt in resp) for resp in resps
+ self.resps = '\n\n'.join(
+ '\n'.join(resp_elt.text.strip() for resp_elt in resp) for resp in resps
)
def handle_elt(self, elt, context):
if self._strip_space or self._stem:
word = word.strip()
if self._stem:
- word = elt.get("hw", word)
- if self._tag == "c5":
- word = (word, elt.get("c5"))
- elif self._tag == "pos":
- word = (word, elt.get("pos", elt.get("c5")))
+ word = elt.get('hw', word)
+ if self._tag == 'c5':
+ word = (word, elt.get('c5'))
+ elif self._tag == 'pos':
+ word = (word, elt.get('pos', elt.get('c5')))
return word
def handle_sent(self, elt):
sent = []
for child in elt:
- if child.tag in ("mw", "hi", "corr", "trunc"):
+ if child.tag in ('mw', 'hi', 'corr', 'trunc'):
sent += [self.handle_word(w) for w in child]
- elif child.tag in ("w", "c"):
+ elif child.tag in ('w', 'c'):
sent.append(self.handle_word(child))
elif child.tag not in self.tags_to_ignore:
- raise ValueError("Unexpected element %s" % child.tag)
- return BNCSentence(elt.attrib["n"], sent)
+ raise ValueError('Unexpected element %s' % child.tag)
+ return BNCSentence(elt.attrib['n'], sent)
# Natural Language Toolkit: Penn Treebank Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
from nltk.corpus.reader.api import *
# we use [^\s()]+ instead of \S+? to avoid matching ()
-SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)")
-TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)")
-WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)")
-EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(")
+SORTTAGWRD = re.compile(r'\((\d+) ([^\s()]+) ([^\s()]+)\)')
+TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')
+WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)')
+EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')
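# Illustrative sketch (not part of the original source): what the regexes
# above extract from a bracketed parse string.
import re
TAGWORD_DEMO = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')
WORD_DEMO = re.compile(r'\([^\s()]+ ([^\s()]+)\)')
parse = '(S (NP (DT the) (NN cat)) (VP (VBD sat)))'
print(TAGWORD_DEMO.findall(parse))  # [('DT', 'the'), ('NN', 'cat'), ('VBD', 'sat')]
print(WORD_DEMO.findall(parse))     # ['the', 'cat', 'sat']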
class BracketParseCorpusReader(SyntaxCorpusReader):
root,
fileids,
comment_char=None,
- detect_blocks="unindented_paren",
- encoding="utf8",
+ detect_blocks='unindented_paren',
+ encoding='utf8',
tagset=None,
):
"""
self._tagset = tagset
def _read_block(self, stream):
- if self._detect_blocks == "sexpr":
+ if self._detect_blocks == 'sexpr':
return read_sexpr_block(stream, comment_char=self._comment_char)
- elif self._detect_blocks == "blankline":
+ elif self._detect_blocks == 'blankline':
return read_blankline_block(stream)
- elif self._detect_blocks == "unindented_paren":
+ elif self._detect_blocks == 'unindented_paren':
# Tokens start with unindented left parens.
- toks = read_regexp_block(stream, start_re=r"^\(")
+ toks = read_regexp_block(stream, start_re=r'^\(')
# Strip any comments out of the tokens.
if self._comment_char:
toks = [
- re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok)
+ re.sub('(?m)^%s.*' % re.escape(self._comment_char), '', tok)
for tok in toks
]
return toks
else:
- assert 0, "bad block type"
+ assert 0, 'bad block type'
def _normalize(self, t):
+ # If there's an empty set of brackets surrounding the actual
+ # parse, then strip them off.
+ if EMPTY_BRACKETS.match(t):
+ t = t.strip()[1:-1]
# Replace leaves of the form (!), (,), with (! !), (, ,)
t = re.sub(r"\((.)\)", r"(\1 \1)", t)
# Replace leaves of the form (tag word root) with (tag word)
def _parse(self, t):
try:
- tree = Tree.fromstring(self._normalize(t))
- # If there's an empty node at the top, strip it off
- if tree.label() == '' and len(tree) == 1:
- return tree[0]
- else:
- return tree
+ return Tree.fromstring(self._normalize(t))
except ValueError as e:
sys.stderr.write("Bad tree detected; trying to recover...\n")
# Try to recover, if we can:
- if e.args == ("mismatched parens",):
+ if e.args == ('mismatched parens',):
for n in range(1, 5):
try:
- v = Tree(self._normalize(t + ")" * n))
+ v = Tree(self._normalize(t + ')' * n))
sys.stderr.write(
" Recovered by adding %d close " "paren(s)\n" % n
)
# Try something else:
sys.stderr.write(" Recovered by returning a flat parse.\n")
# sys.stderr.write(' '.join(t.split())+'\n')
- return Tree("S", self._tag(t))
+ return Tree('S', self._tag(t))
def _tag(self, t, tagset=None):
tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
def _resolve(self, fileids, categories):
if fileids is not None and categories is not None:
- raise ValueError("Specify fileids or categories, not both")
+ raise ValueError('Specify fileids or categories, not both')
if categories is not None:
return self.fileids(categories)
else:
untouched.
"""
- def __init__(self, root, encoding="ISO-8859-1", tagset=None):
+ def __init__(self, root, encoding='ISO-8859-1', tagset=None):
BracketParseCorpusReader.__init__(
self,
root,
- "alpino\.xml",
- detect_blocks="blankline",
+ 'alpino\.xml',
+ detect_blocks='blankline',
encoding=encoding,
tagset=tagset,
)
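# Illustrative sketch (assumes the NLTK 'treebank' sample has been installed,
# e.g. via nltk.download('treebank'); not part of the original source): the
# bracket-parse reader exposes fileids, tagged tokens and full parse trees.
from nltk.corpus import treebank
print(treebank.fileids()[:3])
print(treebank.tagged_sents()[0][:5])
print(treebank.parsed_sents('wsj_0001.mrg')[0])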
# Natural Language Toolkit: Categorized Sentences Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
sentiment categorization with respect to rating scales". Proceedings of the
ACL, 2005.
"""
+from six import string_types
from nltk.corpus.reader.api import *
from nltk.tokenize import *
fileids,
word_tokenizer=WhitespaceTokenizer(),
sent_tokenizer=None,
- encoding="utf8",
+ encoding='utf8',
**kwargs
):
"""
def _resolve(self, fileids, categories):
if fileids is not None and categories is not None:
- raise ValueError("Specify fileids or categories, not both")
+ raise ValueError('Specify fileids or categories, not both')
if categories is not None:
return self.fileids(categories)
else:
fileids = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
fileids = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat(
[
fileids = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat(
[
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
+from __future__ import print_function
import sys
+from six import string_types
+
from nltk.corpus.reader import util
from nltk.corpus.reader.util import *
class ChasenCorpusReader(CorpusReader):
- def __init__(self, root, fileids, encoding="utf8", sent_splitter=None):
+ def __init__(self, root, fileids, encoding='utf8', sent_splitter=None):
self._sent_splitter = sent_splitter
CorpusReader.__init__(self, root, fileids, encoding)
def raw(self, fileids=None):
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
sent = []
for line in para_str.splitlines():
- _eos = line.strip() == "EOS"
- _cells = line.split("\t")
- w = (_cells[0], "\t".join(_cells[1:]))
+ _eos = line.strip() == 'EOS'
+ _cells = line.split('\t')
+ w = (_cells[0], '\t'.join(_cells[1:]))
if not _eos:
sent.append(w)
import nltk
from nltk.corpus.util import LazyCorpusLoader
- jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
- print("/".join(jeita.words()[22100:22140]))
+ jeita = LazyCorpusLoader('jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
+ print('/'.join(jeita.words()[22100:22140]))
print(
- "\nEOS\n".join(
- "\n".join("%s/%s" % (w[0], w[1].split("\t")[2]) for w in sent)
+ '\nEOS\n'.join(
+ '\n'.join("%s/%s" % (w[0], w[1].split('\t')[2]) for w in sent)
for sent in jeita.tagged_sents()[2170:2173]
)
)
from nltk.corpus.util import LazyCorpusLoader
- jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
+ jeita = LazyCorpusLoader('jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
- assert isinstance(jeita.tagged_words()[0][1], str)
+ assert isinstance(jeita.tagged_words()[0][1], string_types)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
test()
# CHILDES XML Corpus Reader
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Tomonori Nagano <tnagano@gc.cuny.edu>
# Alexis Dimitriadis <A.Dimitriadis@uu.nl>
# URL: <http://nltk.org/>
"""
Corpus reader for the XML version of the CHILDES corpus.
"""
+from __future__ import print_function, division
-__docformat__ = "epytext en"
+__docformat__ = 'epytext en'
import re
from collections import defaultdict
+from six import string_types
from nltk.util import flatten, LazyMap, LazyConcatenation
from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
# to resolve the namespace issue
-NS = "http://www.talkbank.org/ns/talkbank"
+NS = 'http://www.talkbank.org/ns/talkbank'
class CHILDESCorpusReader(XMLCorpusReader):
"""
Corpus reader for the XML version of the CHILDES corpus.
- The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
- version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
+ The CHILDES corpus is available at ``http://childes.psy.cmu.edu/``. The XML
+ version of CHILDES is located at ``http://childes.psy.cmu.edu/data-xml/``.
Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
(``nltk_data/corpora/CHILDES/``).
def words(
self,
fileids=None,
- speaker="ALL",
+ speaker='ALL',
stem=False,
relation=False,
strip_space=True,
def tagged_words(
self,
fileids=None,
- speaker="ALL",
+ speaker='ALL',
stem=False,
relation=False,
strip_space=True,
def sents(
self,
fileids=None,
- speaker="ALL",
+ speaker='ALL',
stem=False,
relation=None,
strip_space=True,
def tagged_sents(
self,
fileids=None,
- speaker="ALL",
+ speaker='ALL',
stem=False,
relation=None,
strip_space=True,
# getting participants' data
pat = dictOfDicts()
for participant in xmldoc.findall(
- ".//{%s}Participants/{%s}participant" % (NS, NS)
+ './/{%s}Participants/{%s}participant' % (NS, NS)
):
for (key, value) in participant.items():
- pat[participant.get("id")][key] = value
+ pat[participant.get('id')][key] = value
return pat
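# Illustrative sketch (assumes a portion of the CHILDES XML corpus under
# nltk_data/corpora/childes/data-xml/Eng-USA/; not part of the original
# source): participant metadata and speaker-filtered word access.
import nltk
from nltk.corpus.reader import CHILDESCorpusReader
corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA/')
childes = CHILDESCorpusReader(corpus_root, r'.*\.xml')
fileid = childes.fileids()[0]
print(childes.participants(fileid)[0]['CHI'])       # target-child metadata
print(childes.words(fileid, speaker='CHI')[:10])    # child utterance tokens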
- def age(self, fileids=None, speaker="CHI", month=False):
+ def age(self, fileids=None, speaker='CHI', month=False):
"""
:return: the given file(s) as string or int
:rtype: list or int
def _get_age(self, fileid, speaker, month):
xmldoc = ElementTree.parse(fileid).getroot()
- for pat in xmldoc.findall(".//{%s}Participants/{%s}participant" % (NS, NS)):
+ for pat in xmldoc.findall('.//{%s}Participants/{%s}participant' % (NS, NS)):
try:
- if pat.get("id") == speaker:
- age = pat.get("age")
+ if pat.get('id') == speaker:
+ age = pat.get('age')
if month:
age = self.convert_age(age)
return age
pass
return age_month
- def MLU(self, fileids=None, speaker="CHI"):
+ def MLU(self, fileids=None, speaker='CHI'):
"""
:return: the given file(s) as a floating number
:rtype: list(float)
for sent in sents:
posList = [pos for (word, pos) in sent]
# if any part of the sentence is unintelligible, skip it
- if any(pos == "unk" for pos in posList):
+ if any(pos == 'unk' for pos in posList):
continue
# if the sentence is null
elif sent == []:
else:
results.append([word for (word, pos) in sent])
# count number of fillers
- if len(set(["co", None]).intersection(posList)) > 0:
- numFillers += posList.count("co")
+ if len(set(['co', None]).intersection(posList)) > 0:
+ numFillers += posList.count('co')
numFillers += posList.count(None)
sentDiscount += 1
lastSent = sent
# count number of morphemes
# (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
numWords = (
- len(flatten([word.split("-") for word in thisWordList])) - numFillers
+ len(flatten([word.split('-') for word in thisWordList])) - numFillers
)
numSents = len(results) - sentDiscount
mlu = numWords / numSents
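# Illustrative sketch (not part of the original source): the morpheme count
# above splits stems on '-', so 'read-PAST' counts as two morphemes.
from nltk.util import flatten
this_word_list = ['I', 'read-PAST', 'the', 'book-PL']
num_morphemes = len(flatten([w.split('-') for w in this_word_list]))
print(num_morphemes)  # 6 morphemes over one sentence -> MLU of 6.0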
self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
):
if (
- isinstance(speaker, str) and speaker != "ALL"
+ isinstance(speaker, string_types) and speaker != 'ALL'
): # ensure we have a list of speakers
speaker = [speaker]
xmldoc = ElementTree.parse(fileid).getroot()
# processing each xml doc
results = []
- for xmlsent in xmldoc.findall(".//{%s}u" % NS):
+ for xmlsent in xmldoc.findall('.//{%s}u' % NS):
sents = []
# select speakers
- if speaker == "ALL" or xmlsent.get("who") in speaker:
- for xmlword in xmlsent.findall(".//{%s}w" % NS):
+ if speaker == 'ALL' or xmlsent.get('who') in speaker:
+ for xmlword in xmlsent.findall('.//{%s}w' % NS):
infl = None
suffixStem = None
suffixTag = None
# getting replaced words
- if replace and xmlsent.find(".//{%s}w/{%s}replacement" % (NS, NS)):
+ if replace and xmlsent.find('.//{%s}w/{%s}replacement' % (NS, NS)):
xmlword = xmlsent.find(
- ".//{%s}w/{%s}replacement/{%s}w" % (NS, NS, NS)
+ './/{%s}w/{%s}replacement/{%s}w' % (NS, NS, NS)
)
- elif replace and xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS)):
- xmlword = xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS))
+ elif replace and xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS)):
+ xmlword = xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS))
# get text
if xmlword.text:
word = xmlword.text
else:
- word = ""
+ word = ''
# strip tailing space
if strip_space:
word = word.strip()
# stem
if relation or stem:
try:
- xmlstem = xmlword.find(".//{%s}stem" % NS)
+ xmlstem = xmlword.find('.//{%s}stem' % NS)
word = xmlstem.text
except AttributeError as e:
pass
# if there is an inflection
try:
xmlinfl = xmlword.find(
- ".//{%s}mor/{%s}mw/{%s}mk" % (NS, NS, NS)
+ './/{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS)
)
- word += "-" + xmlinfl.text
+ word += '-' + xmlinfl.text
except:
pass
# if there is a suffix
try:
xmlsuffix = xmlword.find(
- ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
+ './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
% (NS, NS, NS, NS)
)
suffixStem = xmlsuffix.text
tag = ""
try:
xmlsuffixpos = xmlword.findall(
- ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
+ './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c'
% (NS, NS, NS, NS, NS)
)
xmlsuffixpos2 = xmlword.findall(
- ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
+ './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s'
% (NS, NS, NS, NS, NS)
)
if xmlsuffixpos2:
# <mor></mor><mor type="trn"><gra type="grt">
if relation == True:
for xmlstem_rel in xmlword.findall(
- ".//{%s}mor/{%s}gra" % (NS, NS)
+ './/{%s}mor/{%s}gra' % (NS, NS)
):
- if not xmlstem_rel.get("type") == "grt":
+ if not xmlstem_rel.get('type') == 'grt':
word = (
word[0],
word[1],
- xmlstem_rel.get("index")
+ xmlstem_rel.get('index')
+ "|"
- + xmlstem_rel.get("head")
+ + xmlstem_rel.get('head')
+ "|"
- + xmlstem_rel.get("relation"),
+ + xmlstem_rel.get('relation'),
)
else:
word = (
word[2],
word[0],
word[1],
- xmlstem_rel.get("index")
+ xmlstem_rel.get('index')
+ "|"
- + xmlstem_rel.get("head")
+ + xmlstem_rel.get('head')
+ "|"
- + xmlstem_rel.get("relation"),
+ + xmlstem_rel.get('relation'),
)
try:
for xmlpost_rel in xmlword.findall(
- ".//{%s}mor/{%s}mor-post/{%s}gra" % (NS, NS, NS)
+ './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)
):
- if not xmlpost_rel.get("type") == "grt":
+ if not xmlpost_rel.get('type') == 'grt':
suffixStem = (
suffixStem[0],
suffixStem[1],
- xmlpost_rel.get("index")
+ xmlpost_rel.get('index')
+ "|"
- + xmlpost_rel.get("head")
+ + xmlpost_rel.get('head')
+ "|"
- + xmlpost_rel.get("relation"),
+ + xmlpost_rel.get('relation'),
)
else:
suffixStem = (
suffixStem[2],
suffixStem[0],
suffixStem[1],
- xmlpost_rel.get("index")
+ xmlpost_rel.get('index')
+ "|"
- + xmlpost_rel.get("head")
+ + xmlpost_rel.get('head')
+ "|"
- + xmlpost_rel.get("relation"),
+ + xmlpost_rel.get('relation'),
)
except:
pass
shouldn't need to be changed, unless CHILDES changes the configuration
of their server or unless the user sets up their own corpus webserver.
"""
- childes_url_base = r"https://childes.talkbank.org/browser/index.php?url="
+ childes_url_base = r'http://childes.psy.cmu.edu/browser/index.php?url='
def webview_file(self, fileid, urlbase=None):
"""Map a corpus file to its web version on the CHILDES website,
corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
"""
- import webbrowser
+ import webbrowser, re
if urlbase:
path = urlbase + "/" + fileid
else:
full = self.root + "/" + fileid
- full = re.sub(r"\\", "/", full)
- if "/childes/" in full.lower():
+ full = re.sub(r'\\', '/', full)
+ if '/childes/' in full.lower():
# Discard /data-xml/ if present
- path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0]
- elif "eng-usa" in full.lower():
- path = "Eng-USA/" + re.findall(r"/(?i)Eng-USA/(.*)\.xml", full)[0]
+ path = re.findall(r'(?i)/childes(?:/data-xml)?/(.*)\.xml', full)[0]
+ elif 'eng-usa' in full.lower():
+ path = 'Eng-USA/' + re.findall(r'/(?i)Eng-USA/(.*)\.xml', full)[0]
else:
path = fileid
# Strip ".xml" and add ".cha", as necessary:
- if path.endswith(".xml"):
+ if path.endswith('.xml'):
path = path[:-4]
- if not path.endswith(".cha"):
- path = path + ".cha"
+ if not path.endswith('.cha'):
+ path = path + '.cha'
url = self.childes_url_base + path
if not corpus_root:
from nltk.data import find
- corpus_root = find("corpora/childes/data-xml/Eng-USA/")
+ corpus_root = find('corpora/childes/data-xml/Eng-USA/')
try:
- childes = CHILDESCorpusReader(corpus_root, ".*.xml")
+ childes = CHILDESCorpusReader(corpus_root, '.*.xml')
# describe all corpus
for file in childes.fileids()[:5]:
- corpus = ""
- corpus_id = ""
+ corpus = ''
+ corpus_id = ''
for (key, value) in childes.corpus(file)[0].items():
if key == "Corpus":
corpus = value
if key == "Id":
corpus_id = value
- print("Reading", corpus, corpus_id, " .....")
+ print('Reading', corpus, corpus_id, ' .....')
print("words:", childes.words(file)[:7], "...")
print(
"words with replaced words:",
" ...",
)
print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
- print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...")
- print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...")
+ print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
+ print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
print(
"words with relations and pos-tag:",
except LookupError as e:
print(
"""The CHILDES corpus, or the parts you need, should be manually
- downloaded from https://childes.talkbank.org/data-xml/ and saved at
+ downloaded from http://childes.psy.cmu.edu/data-xml/ and saved at
[NLTK_Data_Dir]/corpora/childes/
Alternatively, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
    demo('/path/to/childes/data-xml/Eng-USA/')
"""
)
- # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
+ # corpus_root_http = urllib2.urlopen('http://childes.psy.cmu.edu/data-xml/Eng-USA/Bates.zip')
# corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
##this fails
# childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
# Natural Language Toolkit: Chunked Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
import os.path, codecs
+from six import string_types
+
import nltk
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.tree import Tree
self,
root,
fileids,
- extension="",
+ extension='',
str2chunktree=tagstr2tree,
- sent_tokenizer=RegexpTokenizer("\n", gaps=True),
+ sent_tokenizer=RegexpTokenizer('\n', gaps=True),
para_block_reader=read_blankline_block,
- encoding="utf8",
+ encoding='utf8',
tagset=None,
):
"""
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
elif isinstance(child, tuple):
tree[i] = child[0]
else:
- raise ValueError("expected child to be Tree or tuple")
+ raise ValueError('expected child to be Tree or tuple')
return tree
# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
ZH seizure S IY ZH ER
"""
+from nltk import compat
from nltk.util import Index
from nltk.corpus.reader.util import *
:return: the cmudict lexicon as a raw string.
"""
fileids = self._fileids
- if isinstance(fileids, str):
+ if isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
entries = []
while len(entries) < 100: # Read 100 at a time.
line = stream.readline()
- if line == "":
+ if line == '':
return entries # end of file.
pieces = line.split()
entries.append((pieces[0].lower(), pieces[2:]))
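# Illustrative sketch (not part of the original source): each dictionary line
# holds the word, one extra field that the reader skips, and the phoneme
# list, which is why the entry keeps pieces[0].lower() and pieces[2:].
line = 'SEIZURE  1  S IY ZH ER'
pieces = line.split()
print((pieces[0].lower(), pieces[2:]))
# -> ('seizure', ['S', 'IY', 'ZH', 'ER'])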
# Natural Language Toolkit: Comparative Sentence Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
import re
+from six import string_types
+
from nltk.corpus.reader.api import *
from nltk.tokenize import *
# Regular expressions for dataset components
-STARS = re.compile(r"^\*+$")
-COMPARISON = re.compile(r"<cs-[1234]>")
-CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")
-GRAD_COMPARISON = re.compile(r"<cs-[123]>")
-NON_GRAD_COMPARISON = re.compile(r"<cs-4>")
+STARS = re.compile(r'^\*+$')
+COMPARISON = re.compile(r'<cs-[1234]>')
+CLOSE_COMPARISON = re.compile(r'</cs-[1234]>')
+GRAD_COMPARISON = re.compile(r'<cs-[123]>')
+NON_GRAD_COMPARISON = re.compile(r'<cs-4>')
ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
-KEYWORD = re.compile(r"\((?!.*\()(.*)\)$")
+KEYWORD = re.compile(r'\((?!.*\()(.*)\)$')
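# Illustrative sketch (not part of the original source): the tag regexes
# above separate graded comparisons (<cs-1>..<cs-3>) from non-graded ones
# (<cs-4>).
import re
GRAD_DEMO = re.compile(r'<cs-[123]>')
NON_GRAD_DEMO = re.compile(r'<cs-4>')
line = '<cs-2> camera x is much better than camera y </cs-2>'
print(bool(GRAD_DEMO.search(line)))      # True
print(bool(NON_GRAD_DEMO.search(line)))  # False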
class Comparison(object):
def __repr__(self):
return (
- 'Comparison(text="{}", comp_type={}, entity_1="{}", entity_2="{}", '
- 'feature="{}", keyword="{}")'
+ "Comparison(text=\"{}\", comp_type={}, entity_1=\"{}\", entity_2=\"{}\", "
+ "feature=\"{}\", keyword=\"{}\")"
).format(
self.text,
self.comp_type,
fileids,
word_tokenizer=WhitespaceTokenizer(),
sent_tokenizer=None,
- encoding="utf8",
+ encoding='utf8',
):
"""
:param root: The root directory for this corpus.
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat(
[
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
if grad_comparisons:
# Each comparison tag has its own relations on a separate line
for comp in grad_comparisons:
- comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
+ comp_type = int(re.match(r'<cs-(\d)>', comp).group(1))
comparison = Comparison(
text=comparison_text, comp_type=comp_type
)
entities_feats = ENTITIES_FEATS.findall(line)
if entities_feats:
for (code, entity_feat) in entities_feats:
- if code == "1":
+ if code == '1':
comparison.entity_1 = entity_feat.strip()
- elif code == "2":
+ elif code == '2':
comparison.entity_2 = entity_feat.strip()
- elif code == "3":
+ elif code == '3':
comparison.feature = entity_feat.strip()
keyword = KEYWORD.findall(line)
if keyword:
if non_grad_comparisons:
for comp in non_grad_comparisons:
# comp_type in this case should always be 4.
- comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
+ comp_type = int(re.match(r'<cs-(\d)>', comp).group(1))
comparison = Comparison(
text=comparison_text, comp_type=comp_type
)
# Natural Language Toolkit: CONLL Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
Read CoNLL-style chunk fileids.
"""
+from __future__ import unicode_literals
+
import textwrap
+from nltk import compat
from nltk.tree import Tree
from nltk.util import LazyMap, LazyConcatenation
from nltk.tag import map_tag
# Column Types
# /////////////////////////////////////////////////////////////////
- WORDS = "words" #: column type for words
- POS = "pos" #: column type for part-of-speech tags
- TREE = "tree" #: column type for parse trees
- CHUNK = "chunk" #: column type for chunk structures
- NE = "ne" #: column type for named entities
- SRL = "srl" #: column type for semantic role labels
- IGNORE = "ignore" #: column type for column that should be ignored
+ WORDS = 'words' #: column type for words
+ POS = 'pos' #: column type for part-of-speech tags
+ TREE = 'tree' #: column type for parse trees
+ CHUNK = 'chunk' #: column type for chunk structures
+ NE = 'ne' #: column type for named entities
+ SRL = 'srl' #: column type for semantic role labels
+ IGNORE = 'ignore' #: column type for column that should be ignored
#: A list of all column types supported by the conll corpus reader.
COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)
fileids,
columntypes,
chunk_types=None,
- root_label="S",
+ root_label='S',
pos_in_tree=False,
srl_includes_roleset=True,
- encoding="utf8",
+ encoding='utf8',
tree_class=Tree,
tagset=None,
separator=None,
):
for columntype in columntypes:
if columntype not in self.COLUMN_TYPES:
- raise ValueError("Bad column type %r" % columntype)
- if isinstance(chunk_types, str):
+ raise ValueError('Bad column type %r' % columntype)
+ if isinstance(chunk_types, string_types):
chunk_types = [chunk_types]
self._chunk_types = chunk_types
self._colmap = dict((c, i) for (i, c) in enumerate(columntypes))
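# Illustrative sketch (directory and file names are assumptions, not part of
# the original source): a CoNLL-2000-style file has one token per line, so
# the reader is told which column holds words, POS tags and chunk tags.
from nltk.corpus.reader import ConllCorpusReader
reader = ConllCorpusReader(
    'conll_data', r'.*\.txt',
    columntypes=('words', 'pos', 'chunk'),
    chunk_types=('NP', 'VP', 'PP'))
print(reader.tagged_sents()[0][:5])
print(reader.chunked_sents()[0])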
def raw(self, fileids=None):
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
if not block:
continue
- grid = [line.split(self.sep) for line in block.split("\n")]
+ grid = [line.split(self.sep) for line in block.split('\n')]
# If there's a docstart row, then discard. ([xx] eventually it
# would be good to actually use it)
- if grid[0][self._colmap.get("words", 0)] == "-DOCSTART-":
+ if grid[0][self._colmap.get('words', 0)] == '-DOCSTART-':
del grid[0]
# Check that the grid is consistent.
for row in grid:
if len(row) != len(grid[0]):
- raise ValueError("Inconsistent number of columns:\n%s" % block)
+ raise ValueError('Inconsistent number of columns:\n%s' % block)
grids.append(grid)
return grids
# a list of words or a parse tree).
def _get_words(self, grid):
- return self._get_column(grid, self._colmap["words"])
+ return self._get_column(grid, self._colmap['words'])
def _get_tagged_words(self, grid, tagset=None):
- pos_tags = self._get_column(grid, self._colmap["pos"])
+ pos_tags = self._get_column(grid, self._colmap['pos'])
if tagset and tagset != self._tagset:
pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
- return list(zip(self._get_column(grid, self._colmap["words"]), pos_tags))
+ return list(zip(self._get_column(grid, self._colmap['words']), pos_tags))
def _get_iob_words(self, grid, tagset=None):
- pos_tags = self._get_column(grid, self._colmap["pos"])
+ pos_tags = self._get_column(grid, self._colmap['pos'])
if tagset and tagset != self._tagset:
pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
return list(
zip(
- self._get_column(grid, self._colmap["words"]),
+ self._get_column(grid, self._colmap['words']),
pos_tags,
- self._get_column(grid, self._colmap["chunk"]),
+ self._get_column(grid, self._colmap['chunk']),
)
)
def _get_chunked_words(self, grid, chunk_types, tagset=None):
# n.b.: this method is very similar to conllstr2tree.
- words = self._get_column(grid, self._colmap["words"])
- pos_tags = self._get_column(grid, self._colmap["pos"])
+ words = self._get_column(grid, self._colmap['words'])
+ pos_tags = self._get_column(grid, self._colmap['pos'])
if tagset and tagset != self._tagset:
pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
- chunk_tags = self._get_column(grid, self._colmap["chunk"])
+ chunk_tags = self._get_column(grid, self._colmap['chunk'])
stack = [Tree(self._root_label, [])]
for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
- if chunk_tag == "O":
- state, chunk_type = "O", ""
+ if chunk_tag == 'O':
+ state, chunk_type = 'O', ''
else:
- (state, chunk_type) = chunk_tag.split("-")
+ (state, chunk_type) = chunk_tag.split('-')
# If it's a chunk we don't care about, treat it as O.
if chunk_types is not None and chunk_type not in chunk_types:
- state = "O"
+ state = 'O'
# Treat a mismatching I like a B.
- if state == "I" and chunk_type != stack[-1].label():
- state = "B"
+ if state == 'I' and chunk_type != stack[-1].label():
+ state = 'B'
# For B or I: close any open chunks
- if state in "BO" and len(stack) == 2:
+ if state in 'BO' and len(stack) == 2:
stack.pop()
# For B: start a new chunk.
- if state == "B":
+ if state == 'B':
new_chunk = Tree(chunk_type, [])
stack[-1].append(new_chunk)
stack.append(new_chunk)
return stack[0]
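# Illustrative sketch (not part of the original source): the B/I/O walk above
# builds the same kind of shallow chunk tree as nltk.chunk.conlltags2tree.
from nltk.chunk import conlltags2tree
iob = [('He', 'PRP', 'B-NP'), ('saw', 'VBD', 'O'),
       ('the', 'DT', 'B-NP'), ('cat', 'NN', 'I-NP')]
print(conlltags2tree(iob))
# -> (S (NP He/PRP) saw/VBD (NP the/DT cat/NN))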
def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
- words = self._get_column(grid, self._colmap["words"])
- pos_tags = self._get_column(grid, self._colmap["pos"])
+ words = self._get_column(grid, self._colmap['words'])
+ pos_tags = self._get_column(grid, self._colmap['pos'])
if tagset and tagset != self._tagset:
pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
- parse_tags = self._get_column(grid, self._colmap["tree"])
+ parse_tags = self._get_column(grid, self._colmap['tree'])
- treestr = ""
+ treestr = ''
for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
- if word == "(":
- word = "-LRB-"
- if word == ")":
- word = "-RRB-"
- if pos_tag == "(":
- pos_tag = "-LRB-"
- if pos_tag == ")":
- pos_tag = "-RRB-"
- (left, right) = parse_tag.split("*")
- right = right.count(")") * ")" # only keep ')'.
- treestr += "%s (%s %s) %s" % (left, pos_tag, word, right)
+ if word == '(':
+ word = '-LRB-'
+ if word == ')':
+ word = '-RRB-'
+ if pos_tag == '(':
+ pos_tag = '-LRB-'
+ if pos_tag == ')':
+ pos_tag = '-RRB-'
+ (left, right) = parse_tag.split('*')
+ right = right.count(')') * ')' # only keep ')'.
+ treestr += '%s (%s %s) %s' % (left, pos_tag, word, right)
try:
tree = self._tree_class.fromstring(treestr)
except (ValueError, IndexError):
- tree = self._tree_class.fromstring("(%s %s)" % (self._root_label, treestr))
+ tree = self._tree_class.fromstring('(%s %s)' % (self._root_label, treestr))
if not pos_in_tree:
for subtree in tree.subtrees():
if (
isinstance(child, Tree)
and len(child) == 1
- and isinstance(child[0], str)
+ and isinstance(child[0], string_types)
):
subtree[i] = (child[0], child.label())
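# Illustrative sketch (not part of the original source): how the '*' parse
# column is stitched back into a bracketed string, as in the loop above.
from nltk.tree import Tree
rows = [('He', 'PRP', '(S(NP*)'), ('sat', 'VBD', '(VP*)'), ('.', '.', '*)')]
treestr = ''
for word, pos_tag, parse_tag in rows:
    left, right = parse_tag.split('*')
    right = right.count(')') * ')'
    treestr += '%s (%s %s) %s' % (left, pos_tag, word, right)
print(Tree.fromstring(treestr))
# -> (S (NP (PRP He)) (VP (VBD sat)) (. .))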
list of list of (start, end), tag) tuples
"""
if self._srl_includes_roleset:
- predicates = self._get_column(grid, self._colmap["srl"] + 1)
- start_col = self._colmap["srl"] + 2
+ predicates = self._get_column(grid, self._colmap['srl'] + 1)
+ start_col = self._colmap['srl'] + 2
else:
- predicates = self._get_column(grid, self._colmap["srl"])
- start_col = self._colmap["srl"] + 1
+ predicates = self._get_column(grid, self._colmap['srl'])
+ start_col = self._colmap['srl'] + 1
# Count how many predicates there are. This tells us how many
# columns to expect for SRL data.
- num_preds = len([p for p in predicates if p != "-"])
+ num_preds = len([p for p in predicates if p != '-'])
spanlists = []
for i in range(num_preds):
spanlist = []
stack = []
for wordnum, srl_tag in enumerate(col):
- (left, right) = srl_tag.split("*")
- for tag in left.split("("):
+ (left, right) = srl_tag.split('*')
+ for tag in left.split('('):
if tag:
stack.append((tag, wordnum))
- for i in range(right.count(")")):
+ for i in range(right.count(')')):
(tag, start) = stack.pop()
spanlist.append(((start, wordnum + 1), tag))
spanlists.append(spanlist)
tree = self._get_parsed_sent(grid, pos_in_tree)
spanlists = self._get_srl_spans(grid)
if self._srl_includes_roleset:
- predicates = self._get_column(grid, self._colmap["srl"] + 1)
- rolesets = self._get_column(grid, self._colmap["srl"])
+ predicates = self._get_column(grid, self._colmap['srl'] + 1)
+ rolesets = self._get_column(grid, self._colmap['srl'])
else:
- predicates = self._get_column(grid, self._colmap["srl"])
+ predicates = self._get_column(grid, self._colmap['srl'])
rolesets = [None] * len(predicates)
instances = ConllSRLInstanceList(tree)
for wordnum, predicate in enumerate(predicates):
- if predicate == "-":
+ if predicate == '-':
continue
# Decide which spanlist to use. Don't assume that they're
# sorted in the same order as the predicates (even though
# they usually are).
for spanlist in spanlists:
for (start, end), tag in spanlist:
- if wordnum in range(start, end) and tag in ("V", "C-V"):
+ if wordnum in range(start, end) and tag in ('V', 'C-V'):
break
else:
continue
break
else:
- raise ValueError("No srl column found for %r" % predicate)
+ raise ValueError('No srl column found for %r' % predicate)
instances.append(
ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist)
)
for columntype in columntypes:
if columntype not in self._colmap:
raise ValueError(
- "This corpus does not contain a %s " "column." % columntype
+ 'This corpus does not contain a %s ' 'column.' % columntype
)
@staticmethod
return [grid[i][column_index] for i in range(len(grid))]
+@compat.python_2_unicode_compatible
class ConllSRLInstance(object):
"""
An SRL instance from a CoNLL corpus, which identifies and
# Fill in the self.verb and self.arguments values.
for (start, end), tag in tagged_spans:
- if tag in ("V", "C-V"):
+ if tag in ('V', 'C-V'):
self.verb += list(range(start, end))
else:
self.arguments.append(((start, end), tag))
def __repr__(self):
# Originally, it was:
##plural = 's' if len(self.arguments) != 1 else ''
- plural = "s" if len(self.arguments) != 1 else ""
- return "<ConllSRLInstance for %r with %d argument%s>" % (
+ plural = 's' if len(self.arguments) != 1 else ''
+ return '<ConllSRLInstance for %r with %d argument%s>' % (
(self.verb_stem, len(self.arguments), plural)
)
def pprint(self):
- verbstr = " ".join(self.words[i][0] for i in self.verb)
- hdr = "SRL for %r (stem=%r):\n" % (verbstr, self.verb_stem)
- s = ""
+ verbstr = ' '.join(self.words[i][0] for i in self.verb)
+ hdr = 'SRL for %r (stem=%r):\n' % (verbstr, self.verb_stem)
+ s = ''
for i, word in enumerate(self.words):
if isinstance(word, tuple):
word = word[0]
for (start, end), argid in self.arguments:
if i == start:
- s += "[%s " % argid
+ s += '[%s ' % argid
if i == end:
- s += "] "
+ s += '] '
if i in self.verb:
- word = "<<%s>>" % word
- s += word + " "
+ word = '<<%s>>' % word
+ s += word + ' '
return hdr + textwrap.fill(
- s.replace(" ]", "]"), initial_indent=" ", subsequent_indent=" "
+ s.replace(' ]', ']'), initial_indent=' ', subsequent_indent=' '
)
+@compat.python_2_unicode_compatible
class ConllSRLInstanceList(list):
"""
Set of instances for a single sentence
# Sanity check: trees should be the same
for inst in self:
if inst.tree != self.tree:
- raise ValueError("Tree mismatch!")
+ raise ValueError('Tree mismatch!')
# If desired, add trees:
if include_tree:
words = self.tree.leaves()
pos = [None] * len(words)
- synt = ["*"] * len(words)
+ synt = ['*'] * len(words)
self._tree2conll(self.tree, 0, words, pos, synt)
- s = ""
+ s = ''
for i in range(len(words)):
# optional tree columns
if include_tree:
- s += "%-20s " % words[i]
- s += "%-8s " % pos[i]
- s += "%15s*%-8s " % tuple(synt[i].split("*"))
+ s += '%-20s ' % words[i]
+ s += '%-8s ' % pos[i]
+ s += '%15s*%-8s ' % tuple(synt[i].split('*'))
# verb head column
for inst in self:
if i == inst.verb_head:
- s += "%-20s " % inst.verb_stem
+ s += '%-20s ' % inst.verb_stem
break
else:
- s += "%-20s " % "-"
+ s += '%-20s ' % '-'
# Remaining columns: self
for inst in self:
- argstr = "*"
+ argstr = '*'
for (start, end), argid in inst.tagged_spans:
if i == start:
- argstr = "(%s%s" % (argid, argstr)
+ argstr = '(%s%s' % (argid, argstr)
if i == (end - 1):
- argstr += ")"
- s += "%-12s " % argstr
- s += "\n"
+ argstr += ')'
+ s += '%-12s ' % argstr
+ s += '\n'
return s
def _tree2conll(self, tree, wordnum, words, pos, synt):
assert isinstance(tree, Tree)
- if len(tree) == 1 and isinstance(tree[0], str):
+ if len(tree) == 1 and isinstance(tree[0], string_types):
pos[wordnum] = tree.label()
assert words[wordnum] == tree[0]
return wordnum + 1
pos[wordnum], pos[wordnum] = tree[0]
return wordnum + 1
else:
- synt[wordnum] = "(%s%s" % (tree.label(), synt[wordnum])
+ synt[wordnum] = '(%s%s' % (tree.label(), synt[wordnum])
for child in tree:
wordnum = self._tree2conll(child, wordnum, words, pos, synt)
- synt[wordnum - 1] += ")"
+ synt[wordnum - 1] += ')'
return wordnum
"""
def __init__(
- self, root, fileids, chunk_types, encoding="utf8", tagset=None, separator=None
+ self, root, fileids, chunk_types, encoding='utf8', tagset=None, separator=None
):
ConllCorpusReader.__init__(
self,
root,
fileids,
- ("words", "pos", "chunk"),
+ ('words', 'pos', 'chunk'),
chunk_types=chunk_types,
encoding=encoding,
tagset=tagset,
# -*- coding: utf-8 -*-
# Natural Language Toolkit: An Crubadan N-grams Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
http://borel.slu.edu/crubadan/index.html
"""
+from __future__ import print_function, unicode_literals
+
import re
from os import path
+from nltk.compat import PY3
from nltk.corpus.reader import CorpusReader
from nltk.probability import FreqDist
from nltk.data import ZipFilePathPointer
A corpus reader used to access language An Crubadan n-gram files.
"""
- _LANG_MAPPER_FILE = "table.txt"
+ _LANG_MAPPER_FILE = 'table.txt'
_all_lang_freq = {}
- def __init__(self, root, fileids, encoding="utf8", tagset=None):
- super(CrubadanCorpusReader, self).__init__(root, fileids, encoding="utf8")
+ def __init__(self, root, fileids, encoding='utf8', tagset=None):
+ super(CrubadanCorpusReader, self).__init__(root, fileids, encoding='utf8')
self._lang_mapping_data = []
self._load_lang_mapping_data()
def lang_freq(self, lang):
- """ Return n-gram FreqDist for a specific language
- given ISO 639-3 language code """
+ ''' Return n-gram FreqDist for a specific language
+ given ISO 639-3 language code '''
if lang not in self._all_lang_freq:
self._all_lang_freq[lang] = self._load_lang_ngrams(lang)
return self._all_lang_freq[lang]
def langs(self):
- """ Return a list of supported languages as ISO 639-3 codes """
+ ''' Return a list of supported languages as ISO 639-3 codes '''
return [row[1] for row in self._lang_mapping_data]
def iso_to_crubadan(self, lang):
- """ Return internal Crubadan code based on ISO 639-3 code """
+ ''' Return internal Crubadan code based on ISO 639-3 code '''
for i in self._lang_mapping_data:
if i[1].lower() == lang.lower():
return i[0]
def crubadan_to_iso(self, lang):
- """ Return ISO 639-3 code given internal Crubadan code """
+ ''' Return ISO 639-3 code given internal Crubadan code '''
for i in self._lang_mapping_data:
if i[0].lower() == lang.lower():
return i[1]
def _load_lang_mapping_data(self):
- """ Load language mappings between codes and description from table.txt """
+ ''' Load language mappings between codes and description from table.txt '''
if isinstance(self.root, ZipFilePathPointer):
raise RuntimeError(
"Please install the 'crubadan' corpus first, use nltk.download()"
if self._LANG_MAPPER_FILE not in self.fileids():
raise RuntimeError("Could not find language mapper file: " + mapper_file)
- raw = open(mapper_file, "r", encoding="utf-8").read().strip()
+ if PY3:
+ raw = open(mapper_file, 'r', encoding='utf-8').read().strip()
+ else:
+ raw = open(mapper_file, 'rU').read().decode('utf-8').strip()
- self._lang_mapping_data = [row.split("\t") for row in raw.split("\n")]
+ self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
def _load_lang_ngrams(self, lang):
- """ Load single n-gram language file given the ISO 639-3 language code
- and return its FreqDist """
+ ''' Load single n-gram language file given the ISO 639-3 language code
+ and return its FreqDist '''
if lang not in self.langs():
raise RuntimeError("Unsupported language.")
crubadan_code = self.iso_to_crubadan(lang)
- ngram_file = path.join(self.root, crubadan_code + "-3grams.txt")
+ ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')
if not path.isfile(ngram_file):
raise RuntimeError("No N-gram file found for requested language.")
counts = FreqDist()
- f = open(ngram_file, "r", encoding="utf-8")
+ if PY3:
+ f = open(ngram_file, 'r', encoding='utf-8')
+ else:
+ f = open(ngram_file, 'rU')
for line in f:
- data = line.split(" ")
+ if PY3:
+ data = line.split(' ')
+ else:
+ data = line.decode('utf8').split(' ')
- ngram = data[1].strip("\n")
+ ngram = data[1].strip('\n')
freq = int(data[0])
counts[ngram] = freq
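# Illustrative sketch (assumes the 'crubadan' corpus has been installed, e.g.
# via nltk.download('crubadan'); not part of the original source): language
# codes and the character 3-gram frequency distribution for one language.
from nltk.corpus import crubadan
print(crubadan.langs()[:5])          # ISO 639-3 codes
eng = crubadan.lang_freq('eng')      # FreqDist over character 3-grams
print(eng.most_common(5))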
# Natural Language Toolkit: Dependency Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
# Iker Manterola <returntothehangar@hotmail.com>
#
self,
root,
fileids,
- encoding="utf8",
+ encoding='utf8',
word_tokenizer=TabTokenizer(),
- sent_tokenizer=RegexpTokenizer("\n", gaps=True),
+ sent_tokenizer=RegexpTokenizer('\n', gaps=True),
para_block_reader=read_blankline_block,
):
# FIXME: Why is it inheriting from SyntaxCorpusReader but initializing
class DependencyCorpusView(StreamBackedCorpusView):
- _DOCSTART = "-DOCSTART- -DOCSTART- O\n" # dokumentu hasiera definitzen da
+ _DOCSTART = '-DOCSTART- -DOCSTART- O\n' # dokumentu hasiera definitzen da
def __init__(
self,
group_by_sent,
dependencies,
chunk_types=None,
- encoding="utf8",
+ encoding='utf8',
):
self._tagged = tagged
self._dependencies = dependencies
# extract word and tag from any of the formats
if not self._dependencies:
- lines = [line.split("\t") for line in sent.split("\n")]
+ lines = [line.split('\t') for line in sent.split('\n')]
if len(lines[0]) == 3 or len(lines[0]) == 4:
sent = [(line[0], line[1]) for line in lines]
elif len(lines[0]) == 10:
sent = [(line[1], line[4]) for line in lines]
else:
- raise ValueError("Unexpected number of fields in dependency tree file")
+ raise ValueError('Unexpected number of fields in dependency tree file')
# discard tags if they weren't requested
if not self._tagged:
# Natural Language Toolkit: Framenet Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Authors: Chuck Wooters <wooters@icsi.berkeley.edu>,
# Nathan Schneider <nathan.schneider@georgetown.edu>
# URL: <http://nltk.org/>
"""
Corpus reader for the FrameNet 1.7 lexicon and corpus.
"""
+from __future__ import print_function, unicode_literals
import os
import re
import types
from collections import defaultdict, OrderedDict
from operator import itemgetter
-from itertools import zip_longest
+from six import string_types, text_type
+from six.moves import zip_longest
from pprint import pprint
from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView
-
+from nltk.compat import python_2_unicode_compatible
from nltk.util import LazyConcatenation, LazyMap, LazyIteratorList
-__docformat__ = "epytext en"
+__docformat__ = 'epytext en'
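# Illustrative sketch (assumes the FrameNet data has been installed, e.g. via
# nltk.download('framenet_v17'); not part of the original source): basic
# frame and lexical-unit lookups through the bundled reader.
from nltk.corpus import framenet as fn
f = fn.frame('Motion')                 # look up a frame by name
print(f.name, len(f.FE))               # frame name and number of FEs
lus = fn.lus(r'(?i)run\.v')            # lexical units matching a regex
print([lu.name for lu in lus][:5])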
def mimic_wrap(lines, wrap_at=65, **kwargs):
Wrap the first of 'lines' with textwrap and the remaining lines at exactly the same
positions as the first.
"""
- l0 = textwrap.fill(lines[0], wrap_at, drop_whitespace=False).split("\n")
+ l0 = textwrap.fill(lines[0], wrap_at, drop_whitespace=False).split('\n')
yield l0
def _(line):
il0 += 1
if line: # Remaining stuff on this line past the end of the mimicked line.
# So just textwrap this line.
- for ln in textwrap.fill(line, wrap_at, drop_whitespace=False).split("\n"):
+ for ln in textwrap.fill(line, wrap_at, drop_whitespace=False).split('\n'):
yield ln
for l in lines[1:]:
yield list(_(l))
-def _pretty_longstring(defstr, prefix="", wrap_at=65):
+def _pretty_longstring(defstr, prefix='', wrap_at=65):
"""
Helper function for pretty-printing a long string.
"""
outstr = ""
- for line in textwrap.fill(defstr, wrap_at).split("\n"):
- outstr += prefix + line + "\n"
+ for line in textwrap.fill(defstr, wrap_at).split('\n'):
+ outstr += prefix + line + '\n'
return outstr
outstr = ""
for k in obj:
- if isinstance(obj[k], str) and len(obj[k]) > 65:
+ if isinstance(obj[k], string_types) and len(obj[k]) > 65:
outstr += "[{0}]\n".format(k)
- outstr += "{0}".format(_pretty_longstring(obj[k], prefix=" "))
- outstr += "\n"
+ outstr += "{0}".format(_pretty_longstring(obj[k], prefix=' '))
+ outstr += '\n'
else:
outstr += "[{0}] {1}\n".format(k, obj[k])
outstr = ""
outstr += "semantic type ({0.ID}): {0.name}\n".format(st)
- if "abbrev" in semkeys:
+ if 'abbrev' in semkeys:
outstr += "[abbrev] {0}\n".format(st.abbrev)
- if "definition" in semkeys:
+ if 'definition' in semkeys:
outstr += "[definition]\n"
- outstr += _pretty_longstring(st.definition, " ")
+ outstr += _pretty_longstring(st.definition, ' ')
outstr += "[rootType] {0}({1})\n".format(st.rootType.name, st.rootType.ID)
if st.superType is None:
outstr += "[superType] <None>\n"
outstr += "[subTypes] {0} subtypes\n".format(len(st.subTypes))
outstr += (
" "
- + ", ".join("{0}({1})".format(x.name, x.ID) for x in st.subTypes)
- + "\n" * (len(st.subTypes) > 0)
+ + ", ".join('{0}({1})'.format(x.name, x.ID) for x in st.subTypes)
+ + '\n' * (len(st.subTypes) > 0)
)
return outstr
lukeys = lu.keys()
outstr = ""
outstr += "lexical unit ({0.ID}): {0.name}\n\n".format(lu)
- if "definition" in lukeys:
+ if 'definition' in lukeys:
outstr += "[definition]\n"
- outstr += _pretty_longstring(lu.definition, " ")
- if "frame" in lukeys:
+ outstr += _pretty_longstring(lu.definition, ' ')
+ if 'frame' in lukeys:
outstr += "\n[frame] {0}({1})\n".format(lu.frame.name, lu.frame.ID)
- if "incorporatedFE" in lukeys:
+ if 'incorporatedFE' in lukeys:
outstr += "\n[incorporatedFE] {0}\n".format(lu.incorporatedFE)
- if "POS" in lukeys:
+ if 'POS' in lukeys:
outstr += "\n[POS] {0}\n".format(lu.POS)
- if "status" in lukeys:
+ if 'status' in lukeys:
outstr += "\n[status] {0}\n".format(lu.status)
- if "totalAnnotated" in lukeys:
+ if 'totalAnnotated' in lukeys:
outstr += "\n[totalAnnotated] {0} annotated examples\n".format(
lu.totalAnnotated
)
- if "lexemes" in lukeys:
+ if 'lexemes' in lukeys:
outstr += "\n[lexemes] {0}\n".format(
- " ".join("{0}/{1}".format(lex.name, lex.POS) for lex in lu.lexemes)
+ ' '.join('{0}/{1}'.format(lex.name, lex.POS) for lex in lu.lexemes)
)
- if "semTypes" in lukeys:
+ if 'semTypes' in lukeys:
outstr += "\n[semTypes] {0} semantic types\n".format(len(lu.semTypes))
outstr += (
" " * (len(lu.semTypes) > 0)
- + ", ".join("{0}({1})".format(x.name, x.ID) for x in lu.semTypes)
- + "\n" * (len(lu.semTypes) > 0)
+ + ", ".join('{0}({1})'.format(x.name, x.ID) for x in lu.semTypes)
+ + '\n' * (len(lu.semTypes) > 0)
)
- if "URL" in lukeys:
+ if 'URL' in lukeys:
outstr += "\n[URL] {0}\n".format(lu.URL)
- if "subCorpus" in lukeys:
+ if 'subCorpus' in lukeys:
subc = [x.name for x in lu.subCorpus]
outstr += "\n[subCorpus] {0} subcorpora\n".format(len(lu.subCorpus))
- for line in textwrap.fill(", ".join(sorted(subc)), 60).split("\n"):
+ for line in textwrap.fill(", ".join(sorted(subc)), 60).split('\n'):
outstr += " {0}\n".format(line)
- if "exemplars" in lukeys:
+ if 'exemplars' in lukeys:
outstr += "\n[exemplars] {0} sentences across all subcorpora\n".format(
len(lu.exemplars)
)
outstr = ""
outstr += "full-text sentence ({0.ID}) in {1}:\n\n".format(
- sent, sent.doc.get("name", sent.doc.description)
+ sent, sent.doc.get('name', sent.doc.description)
)
outstr += "\n[POS] {0} tags\n".format(len(sent.POS))
outstr += "\n[POS_tagset] {0}\n\n".format(sent.POS_tagset)
sent = aset.sent
s0 = sent.text
- s1 = ""
- s2 = ""
+ s1 = ''
+ s2 = ''
i = 0
adjust = 0
for j, k, lbl in overt:
- assert j >= i, ("Overlapping targets?", (j, k, lbl))
- s1 += " " * (j - i) + "-" * (k - j)
+ assert j >= i, ('Overlapping targets?', (j, k, lbl))
+ s1 += ' ' * (j - i) + '-' * (k - j)
if len(lbl) > (k - j):
# add space in the sentence to make room for the annotation index
amt = len(lbl) - (k - j)
s0 = (
- s0[: k + adjust] + "~" * amt + s0[k + adjust :]
+ s0[: k + adjust] + '~' * amt + s0[k + adjust :]
) # '~' to prevent line wrapping
- s1 = s1[: k + adjust] + " " * amt + s1[k + adjust :]
+ s1 = s1[: k + adjust] + ' ' * amt + s1[k + adjust :]
adjust += amt
- s2 += " " * (j - i) + lbl.ljust(k - j)
+ s2 += ' ' * (j - i) + lbl.ljust(k - j)
i = k
long_lines = [s0, s1, s2]
- outstr += "\n\n".join(
- map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" "))
- ).replace("~", " ")
+ outstr += '\n\n'.join(
+ map('\n'.join, zip_longest(*mimic_wrap(long_lines), fillvalue=' '))
+ ).replace('~', ' ')
outstr += "\n"
return outstr
outstr += " ({0.ID}):\n".format(sent)
if aset_level: # TODO: any UNANN exemplars?
outstr += "\n[status] {0}\n".format(sent.status)
- for k in ("corpID", "docID", "paragNo", "sentNo", "aPos"):
+ for k in ('corpID', 'docID', 'paragNo', 'sentNo', 'aPos'):
if k in sentkeys:
outstr += "[{0}] {1}\n".format(k, sent[k])
outstr += (
"\n[LU] ({0.ID}) {0.name} in {0.frame.name}\n".format(sent.LU)
if sent.LU
- else "\n[LU] Not found!"
+ else '\n[LU] Not found!'
)
outstr += "\n[frame] ({0.ID}) {0.name}\n".format(
sent.frame
- Scon: (none)
- Art: (none)
"""
- for lyr in ("NER", "WSL", "Other", "Sent"):
+ for lyr in ('NER', 'WSL', 'Other', 'Sent'):
if lyr in sent and sent[lyr]:
outstr += "\n[{0}] {1} entr{2}\n".format(
lyr, len(sent[lyr]), "ies" if len(sent[lyr]) != 1 else "y"
outstr += "\n[text] + [Target] + [FE]"
# POS-specific layers: syntactically important words that are neither the target
# nor the FEs. Include these along with the first FE layer but with '^' underlining.
- for lyr in ("Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art"):
+ for lyr in ('Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art'):
if lyr in sent and sent[lyr]:
outstr += " + [{0}]".format(lyr)
- if "FE2" in sentkeys:
+ if 'FE2' in sentkeys:
outstr += " + [FE2]"
- if "FE3" in sentkeys:
+ if 'FE3' in sentkeys:
outstr += " + [FE3]"
outstr += "\n\n"
outstr += sent._ascii() # -> _annotation_ascii()
def _annotation_ascii(sent):
- """
+ '''
Given a sentence or FE annotation set, construct the width-limited string showing
an ASCII visualization of the sentence's annotations, calling either
_annotation_ascii_frames() or _annotation_ascii_FEs() as appropriate.
This will be attached as a method to appropriate AttrDict instances
and called in the full pretty-printing of the instance.
- """
- if sent._type == "fulltext_sentence" or (
- "annotationSet" in sent and len(sent.annotationSet) > 2
+ '''
+ if sent._type == 'fulltext_sentence' or (
+ 'annotationSet' in sent and len(sent.annotationSet) > 2
):
# a full-text sentence OR sentence with multiple targets.
# (multiple targets = >2 annotation sets, because the first annotation set is POS.)
def _annotation_ascii_frames(sent):
- """
+ '''
ASCII string rendering of the sentence along with its targets and frame names.
Called for all full-text sentences, as well as the few LU sentences with multiple
targets (e.g., fn.lu(6412).exemplars[82] has two want.v targets).
Line-wrapped to limit the display width.
- """
+ '''
# list the target spans and their associated aset index
overt = []
for a, aset in enumerate(sent.annotationSet[1:]):
for j, k in aset.Target:
indexS = "[{0}]".format(a + 1)
- if aset.status == "UNANN" or aset.LU.status == "Problem":
+ if aset.status == 'UNANN' or aset.LU.status == 'Problem':
indexS += " "
- if aset.status == "UNANN":
+ if aset.status == 'UNANN':
indexS += (
"!"
) # warning indicator that there is a frame annotation but no FE annotation
- if aset.LU.status == "Problem":
+ if aset.LU.status == 'Problem':
indexS += (
"?"
) # warning indicator that there is a missing LU definition (because the LU has Problem status)
combinedIndex = (
overt[o - 1][3] + asetIndex
) # e.g., '[1][2]', '[1]! [2]'
- combinedIndex = combinedIndex.replace(" !", "! ").replace(" ?", "? ")
+ combinedIndex = combinedIndex.replace(' !', '! ').replace(' ?', '? ')
overt[o - 1] = overt[o - 1][:3] + (combinedIndex,)
duplicates.add(o)
else: # different frames, same or overlapping targets
s = sent.text
for j, k, fname, asetIndex in overt:
- s += "\n" + asetIndex + " " + sent.text[j:k] + " :: " + fname
- s += "\n(Unable to display sentence with targets marked inline due to overlap)"
+ s += '\n' + asetIndex + ' ' + sent.text[j:k] + ' :: ' + fname
+ s += '\n(Unable to display sentence with targets marked inline due to overlap)'
return s
for o in reversed(sorted(duplicates)):
del overt[o]
s0 = sent.text
- s1 = ""
- s11 = ""
- s2 = ""
+ s1 = ''
+ s11 = ''
+ s2 = ''
i = 0
adjust = 0
fAbbrevs = OrderedDict()
for j, k, fname, asetIndex in overt:
if not j >= i:
assert j >= i, (
- "Overlapping targets?"
+ 'Overlapping targets?'
+ (
- " UNANN"
- if any(aset.status == "UNANN" for aset in sent.annotationSet[1:])
- else ""
+ ' UNANN'
+ if any(aset.status == 'UNANN' for aset in sent.annotationSet[1:])
+ else ''
),
(j, k, asetIndex),
)
- s1 += " " * (j - i) + "*" * (k - j)
+ s1 += ' ' * (j - i) + '*' * (k - j)
short = fname[: k - j]
if (k - j) < len(fname):
r = 0
short = fname[: k - j - 1] + str(r)
else: # short not in fAbbrevs
fAbbrevs[short] = fname
- s11 += " " * (j - i) + short.ljust(k - j)
+ s11 += ' ' * (j - i) + short.ljust(k - j)
if len(asetIndex) > (k - j):
# add space in the sentence to make room for the annotation index
amt = len(asetIndex) - (k - j)
s0 = (
- s0[: k + adjust] + "~" * amt + s0[k + adjust :]
+ s0[: k + adjust] + '~' * amt + s0[k + adjust :]
) # '~' to prevent line wrapping
- s1 = s1[: k + adjust] + " " * amt + s1[k + adjust :]
- s11 = s11[: k + adjust] + " " * amt + s11[k + adjust :]
+ s1 = s1[: k + adjust] + ' ' * amt + s1[k + adjust :]
+ s11 = s11[: k + adjust] + ' ' * amt + s11[k + adjust :]
adjust += amt
- s2 += " " * (j - i) + asetIndex.ljust(k - j)
+ s2 += ' ' * (j - i) + asetIndex.ljust(k - j)
i = k
long_lines = [s0, s1, s11, s2]
- outstr = "\n\n".join(
- map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" "))
- ).replace("~", " ")
- outstr += "\n"
+ outstr = '\n\n'.join(
+ map('\n'.join, zip_longest(*mimic_wrap(long_lines), fillvalue=' '))
+ ).replace('~', ' ')
+ outstr += '\n'
if fAbbrevs:
- outstr += " (" + ", ".join("=".join(pair) for pair in fAbbrevs.items()) + ")"
- assert len(fAbbrevs) == len(dict(fAbbrevs)), "Abbreviation clash"
+ outstr += ' (' + ', '.join('='.join(pair) for pair in fAbbrevs.items()) + ')'
+ assert len(fAbbrevs) == len(dict(fAbbrevs)), 'Abbreviation clash'
return outstr
def _annotation_ascii_FE_layer(overt, ni, feAbbrevs):
- """Helper for _annotation_ascii_FEs()."""
- s1 = ""
- s2 = ""
+ '''Helper for _annotation_ascii_FEs().'''
+ s1 = ''
+ s2 = ''
i = 0
for j, k, fename in overt:
- s1 += " " * (j - i) + ("^" if fename.islower() else "-") * (k - j)
+ s1 += ' ' * (j - i) + ('^' if fename.islower() else '-') * (k - j)
short = fename[: k - j]
if len(fename) > len(short):
r = 0
short = fename[: k - j - 1] + str(r)
else: # short not in feAbbrevs
feAbbrevs[short] = fename
- s2 += " " * (j - i) + short.ljust(k - j)
+ s2 += ' ' * (j - i) + short.ljust(k - j)
i = k
- sNI = ""
+ sNI = ''
if ni:
- sNI += " [" + ", ".join(":".join(x) for x in sorted(ni.items())) + "]"
+ sNI += ' [' + ', '.join(':'.join(x) for x in sorted(ni.items())) + ']'
return [s1, s2, sNI]
def _annotation_ascii_FEs(sent):
- """
+ '''
ASCII string rendering of the sentence along with a single target and its FEs.
Secondary and tertiary FE layers are included if present.
'sent' can be an FE annotation set or an LU sentence with a single target.
Line-wrapped to limit the display width.
- """
+ '''
feAbbrevs = OrderedDict()
posspec = [] # POS-specific layer spans (e.g., Supp[ort], Cop[ula])
posspec_separate = False
- for lyr in ("Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art"):
+ for lyr in ('Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art'):
if lyr in sent and sent[lyr]:
for a, b, lbl in sent[lyr]:
if (
- lbl == "X"
+ lbl == 'X'
): # skip this, which covers an entire phrase typically containing the target and all its FEs
# (but do display the Gov)
continue
True
) # show POS-specific layers on a separate line
posspec.append(
- (a, b, lbl.lower().replace("-", ""))
+ (a, b, lbl.lower().replace('-', ''))
) # lowercase Cop=>cop, Non-Asp=>nonasp, etc. to distinguish from FE names
if posspec_separate:
POSSPEC = _annotation_ascii_FE_layer(posspec, {}, feAbbrevs)
feAbbrevs,
)
FE2 = FE3 = None
- if "FE2" in sent:
+ if 'FE2' in sent:
FE2 = _annotation_ascii_FE_layer(sent.FE2[0], sent.FE2[1], feAbbrevs)
- if "FE3" in sent:
+ if 'FE3' in sent:
FE3 = _annotation_ascii_FE_layer(sent.FE3[0], sent.FE3[1], feAbbrevs)
for i, j in sent.Target:
FE1span, FE1name, FE1exp = FE1
if len(FE1span) < j:
- FE1span += " " * (j - len(FE1span))
+ FE1span += ' ' * (j - len(FE1span))
if len(FE1name) < j:
- FE1name += " " * (j - len(FE1name))
+ FE1name += ' ' * (j - len(FE1name))
FE1[1] = FE1name
FE1[0] = (
- FE1span[:i] + FE1span[i:j].replace(" ", "*").replace("-", "=") + FE1span[j:]
+ FE1span[:i] + FE1span[i:j].replace(' ', '*').replace('-', '=') + FE1span[j:]
)
long_lines = [sent.text]
if posspec_separate:
long_lines.extend([FE2[0], FE2[1] + FE2[2]])
if FE3:
long_lines.extend([FE3[0], FE3[1] + FE3[2]])
- long_lines.append("")
- outstr = "\n".join(
- map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" "))
+ long_lines.append('')
+ outstr = '\n'.join(
+ map('\n'.join, zip_longest(*mimic_wrap(long_lines), fillvalue=' '))
)
if feAbbrevs:
- outstr += "(" + ", ".join("=".join(pair) for pair in feAbbrevs.items()) + ")"
- assert len(feAbbrevs) == len(dict(feAbbrevs)), "Abbreviation clash"
+ outstr += '(' + ', '.join('='.join(pair) for pair in feAbbrevs.items()) + ')'
+ assert len(feAbbrevs) == len(dict(feAbbrevs)), 'Abbreviation clash'
outstr += "\n"
return outstr
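# A minimal usage sketch (assumption: the FrameNet 1.7 data package is installed via
# nltk.download('framenet_v17')). Printing an annotated sentence calls sent._ascii(),
# which dispatches to _annotation_ascii_frames() for multi-target sentences or to
# _annotation_ascii_FEs() for single-target ones, producing the markup built above.
from nltk.corpus import framenet as fn
sent = fn.lu(6412).exemplars[82]  # an LU sentence with two want.v targets (see above)
print(sent)                       # renders targets, frame names and FEs inline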
outstr += "frame element ({0.ID}): {0.name}\n of {1.name}({1.ID})\n".format(
fe, fe.frame
)
- if "definition" in fekeys:
+ if 'definition' in fekeys:
outstr += "[definition]\n"
- outstr += _pretty_longstring(fe.definition, " ")
- if "abbrev" in fekeys:
+ outstr += _pretty_longstring(fe.definition, ' ')
+ if 'abbrev' in fekeys:
outstr += "[abbrev] {0}\n".format(fe.abbrev)
- if "coreType" in fekeys:
+ if 'coreType' in fekeys:
outstr += "[coreType] {0}\n".format(fe.coreType)
- if "requiresFE" in fekeys:
+ if 'requiresFE' in fekeys:
outstr += "[requiresFE] "
if fe.requiresFE is None:
outstr += "<None>\n"
else:
outstr += "{0}({1})\n".format(fe.requiresFE.name, fe.requiresFE.ID)
- if "excludesFE" in fekeys:
+ if 'excludesFE' in fekeys:
outstr += "[excludesFE] "
if fe.excludesFE is None:
outstr += "<None>\n"
else:
outstr += "{0}({1})\n".format(fe.excludesFE.name, fe.excludesFE.ID)
- if "semType" in fekeys:
+ if 'semType' in fekeys:
outstr += "[semType] "
if fe.semType is None:
outstr += "<None>\n"
else:
- outstr += "\n " + "{0}({1})".format(fe.semType.name, fe.semType.ID) + "\n"
+ outstr += "\n " + "{0}({1})".format(fe.semType.name, fe.semType.ID) + '\n'
return outstr
outstr += "frame ({0.ID}): {0.name}\n\n".format(frame)
outstr += "[URL] {0}\n\n".format(frame.URL)
outstr += "[definition]\n"
- outstr += _pretty_longstring(frame.definition, " ") + "\n"
+ outstr += _pretty_longstring(frame.definition, ' ') + '\n'
outstr += "[semTypes] {0} semantic types\n".format(len(frame.semTypes))
outstr += (
" " * (len(frame.semTypes) > 0)
+ ", ".join("{0}({1})".format(x.name, x.ID) for x in frame.semTypes)
- + "\n" * (len(frame.semTypes) > 0)
+ + '\n' * (len(frame.semTypes) > 0)
)
outstr += "\n[frameRelations] {0} frame relations\n".format(
len(frame.frameRelations)
)
- outstr += " " + "\n ".join(repr(frel) for frel in frame.frameRelations) + "\n"
+ outstr += ' ' + '\n '.join(repr(frel) for frel in frame.frameRelations) + '\n'
outstr += "\n[lexUnit] {0} lexical units\n".format(len(frame.lexUnit))
lustrs = []
for luName, lu in sorted(frame.lexUnit.items()):
- tmpstr = "{0} ({1})".format(luName, lu.ID)
+ tmpstr = '{0} ({1})'.format(luName, lu.ID)
lustrs.append(tmpstr)
- outstr += "{0}\n".format(_pretty_longstring(", ".join(lustrs), prefix=" "))
+ outstr += "{0}\n".format(_pretty_longstring(', '.join(lustrs), prefix=' '))
outstr += "\n[FE] {0} frame elements\n".format(len(frame.FE))
fes = {}
for ct in sorted(
fes.keys(),
key=lambda ct2: [
- "Core",
- "Core-Unexpressed",
- "Peripheral",
- "Extra-Thematic",
+ 'Core',
+ 'Core-Unexpressed',
+ 'Peripheral',
+ 'Extra-Thematic',
].index(ct2),
):
- outstr += "{0:>16}: {1}\n".format(ct, ", ".join(sorted(fes[ct])))
+ outstr += "{0:>16}: {1}\n".format(ct, ', '.join(sorted(fes[ct])))
outstr += "\n[FEcoreSets] {0} frame element core sets\n".format(
len(frame.FEcoreSets)
)
outstr += (
" "
- + "\n ".join(
+ + '\n '.join(
", ".join([x.name for x in coreSet]) for coreSet in frame.FEcoreSets
)
- + "\n"
+ + '\n'
)
return outstr
"""An exception class for framenet-related errors."""
+@python_2_unicode_compatible
class AttrDict(dict):
"""A class that wraps a dict and allows accessing the keys of the
self[name] = value
def __getattr__(self, name):
- if name == "_short_repr":
+ if name == '_short_repr':
return self._short_repr
return self[name]
return v
def _short_repr(self):
- if "_type" in self:
- if self["_type"].endswith("relation"):
+ if '_type' in self:
+ if self['_type'].endswith('relation'):
return self.__repr__()
try:
return "<{0} ID={1} name={2}>".format(
- self["_type"], self["ID"], self["name"]
+ self['_type'], self['ID'], self['name']
)
except KeyError:
try: # no ID--e.g., for _type=lusubcorpus
- return "<{0} name={1}>".format(self["_type"], self["name"])
+ return "<{0} name={1}>".format(self['_type'], self['name'])
except KeyError: # no name--e.g., for _type=lusentence
- return "<{0} ID={1}>".format(self["_type"], self["ID"])
+ return "<{0} ID={1}>".format(self['_type'], self['ID'])
else:
return self.__repr__()
def _str(self):
outstr = ""
- if "_type" not in self:
+ if '_type' not in self:
outstr = _pretty_any(self)
- elif self["_type"] == "frame":
+ elif self['_type'] == 'frame':
outstr = _pretty_frame(self)
- elif self["_type"] == "fe":
+ elif self['_type'] == 'fe':
outstr = _pretty_fe(self)
- elif self["_type"] == "lu":
+ elif self['_type'] == 'lu':
outstr = _pretty_lu(self)
- elif self["_type"] == "luexemplars": # list of ALL exemplars for LU
+ elif self['_type'] == 'luexemplars': # list of ALL exemplars for LU
outstr = _pretty_exemplars(self, self[0].LU)
elif (
- self["_type"] == "fulltext_annotation"
+ self['_type'] == 'fulltext_annotation'
): # list of all sentences for full-text doc
outstr = _pretty_fulltext_sentences(self)
- elif self["_type"] == "lusentence":
+ elif self['_type'] == 'lusentence':
outstr = _pretty_annotation(self)
- elif self["_type"] == "fulltext_sentence":
+ elif self['_type'] == 'fulltext_sentence':
outstr = _pretty_fulltext_sentence(self)
- elif self["_type"] in ("luannotationset", "fulltext_annotationset"):
+ elif self['_type'] in ('luannotationset', 'fulltext_annotationset'):
outstr = _pretty_annotation(self, aset_level=True)
- elif self["_type"] == "posannotationset":
+ elif self['_type'] == 'posannotationset':
outstr = _pretty_pos(self)
- elif self["_type"] == "semtype":
+ elif self['_type'] == 'semtype':
outstr = _pretty_semtype(self)
- elif self["_type"] == "framerelationtype":
+ elif self['_type'] == 'framerelationtype':
outstr = _pretty_frame_relation_type(self)
- elif self["_type"] == "framerelation":
+ elif self['_type'] == 'framerelation':
outstr = _pretty_frame_relation(self)
- elif self["_type"] == "ferelation":
+ elif self['_type'] == 'ferelation':
outstr = _pretty_fe_relation(self)
else:
outstr = _pretty_any(self)
# ensure result is unicode string prior to applying the
- # decorator (because non-ASCII characters
+ # @python_2_unicode_compatible decorator (because non-ASCII characters
# could in principle occur in the data and would trigger an encoding error when
# passed as arguments to str.format()).
# assert isinstance(outstr, unicode) # not in Python 3.2
return self.__str__()
+@python_2_unicode_compatible
class SpecialList(list):
"""
A list subclass which adds a '_type' attribute for special printing
assert self._type
if len(self) == 0:
outstr = "[]"
- elif self._type == "luexemplars": # list of ALL exemplars for LU
+ elif self._type == 'luexemplars': # list of ALL exemplars for LU
outstr = _pretty_exemplars(self, self[0].LU)
else:
assert False, self._type
return self._data().__repr__()
+@python_2_unicode_compatible
class PrettyDict(AttrDict):
"""
Displays an abbreviated repr of values where possible.
"""
def __init__(self, *args, **kwargs):
- _BREAK_LINES = kwargs.pop("breakLines", False)
+ _BREAK_LINES = kwargs.pop('breakLines', False)
super(PrettyDict, self).__init__(*args, **kwargs)
- dict.__setattr__(self, "_BREAK_LINES", _BREAK_LINES)
+ dict.__setattr__(self, '_BREAK_LINES', _BREAK_LINES)
def __repr__(self):
parts = []
for k, v in sorted(self.items()):
- kv = repr(k) + ": "
+ kv = repr(k) + ': '
try:
kv += v._short_repr()
except AttributeError:
kv += repr(v)
parts.append(kv)
- return "{" + (",\n " if self._BREAK_LINES else ", ").join(parts) + "}"
+ return '{' + (',\n ' if self._BREAK_LINES else ', ').join(parts) + '}'
+@python_2_unicode_compatible
class PrettyList(list):
"""
Displays an abbreviated repr of only the first several elements, not the whole list.
# from nltk.util
def __init__(self, *args, **kwargs):
- self._MAX_REPR_SIZE = kwargs.pop("maxReprSize", 60)
- self._BREAK_LINES = kwargs.pop("breakLines", False)
+ self._MAX_REPR_SIZE = kwargs.pop('maxReprSize', 60)
+ self._BREAK_LINES = kwargs.pop('breakLines', False)
super(PrettyList, self).__init__(*args, **kwargs)
def __repr__(self):
) # key difference from inherited version: call to _short_repr()
length += len(pieces[-1]) + 2
if self._MAX_REPR_SIZE and length > self._MAX_REPR_SIZE and len(pieces) > 2:
- return "[%s, ...]" % str(
- ",\n " if self._BREAK_LINES else ", "
+ return "[%s, ...]" % text_type(
+ ',\n ' if self._BREAK_LINES else ', '
).join(pieces[:-1])
- return "[%s]" % str(",\n " if self._BREAK_LINES else ", ").join(pieces)
+ return "[%s]" % text_type(',\n ' if self._BREAK_LINES else ', ').join(pieces)
+@python_2_unicode_compatible
class PrettyLazyMap(LazyMap):
"""
Displays an abbreviated repr of only the first several elements, not the whole list.
) # key difference from inherited version: call to _short_repr()
length += len(pieces[-1]) + 2
if length > self._MAX_REPR_SIZE and len(pieces) > 2:
- return "[%s, ...]" % str(", ").join(pieces[:-1])
- return "[%s]" % str(", ").join(pieces)
+ return "[%s, ...]" % text_type(', ').join(pieces[:-1])
+ return "[%s]" % text_type(', ').join(pieces)
+@python_2_unicode_compatible
class PrettyLazyIteratorList(LazyIteratorList):
"""
Displays an abbreviated repr of only the first several elements, not the whole list.
) # key difference from inherited version: call to _short_repr()
length += len(pieces[-1]) + 2
if length > self._MAX_REPR_SIZE and len(pieces) > 2:
- return "[%s, ...]" % str(", ").join(pieces[:-1])
- return "[%s]" % str(", ").join(pieces)
+ return "[%s, ...]" % text_type(', ').join(pieces[:-1])
+ return "[%s]" % text_type(', ').join(pieces)
+@python_2_unicode_compatible
class PrettyLazyConcatenation(LazyConcatenation):
"""
Displays an abbreviated repr of only the first several elements, not the whole list.
) # key difference from inherited version: call to _short_repr()
length += len(pieces[-1]) + 2
if length > self._MAX_REPR_SIZE and len(pieces) > 2:
- return "[%s, ...]" % str(", ").join(pieces[:-1])
- return "[%s]" % str(", ").join(pieces)
+ return "[%s, ...]" % text_type(', ').join(pieces[:-1])
+ return "[%s]" % text_type(', ').join(pieces)
def __add__(self, other):
"""Return a list concatenating self with other."""
True
"""
- _bad_statuses = ["Problem"]
+ _bad_statuses = ['Problem']
"""
When loading LUs for a frame, those whose status is in this list will be ignored.
Due to caching, if user code modifies this, it should do so before loading any data.
# otherwise weird ordering effects might result in incomplete information
self._frame_idx = {}
for f in XMLCorpusView(
- self.abspath("frameIndex.xml"), "frameIndex/frame", self._handle_elt
+ self.abspath("frameIndex.xml"), 'frameIndex/frame', self._handle_elt
):
- self._frame_idx[f["ID"]] = f
+ self._frame_idx[f['ID']] = f
def _buildcorpusindex(self):
# The total number of fulltext annotated documents in Framenet
self._fulltext_idx = {}
for doclist in XMLCorpusView(
self.abspath("fulltextIndex.xml"),
- "fulltextIndex/corpus",
+ 'fulltextIndex/corpus',
self._handle_fulltextindex_elt,
):
for doc in doclist:
# should not be very large
self._lu_idx = {}
for lu in XMLCorpusView(
- self.abspath("luIndex.xml"), "luIndex/lu", self._handle_elt
+ self.abspath("luIndex.xml"), 'luIndex/lu', self._handle_elt
):
self._lu_idx[
- lu["ID"]
+ lu['ID']
] = lu # populate with LU index entries. if any of these
# are looked up they will be replaced by full LU objects.
x
for x in XMLCorpusView(
self.abspath("frRelation.xml"),
- "frameRelations/frameRelationType",
+ 'frameRelations/frameRelationType',
self._handle_framerelationtype_elt,
)
)
def _warn(self, *message, **kwargs):
if self._warnings:
- kwargs.setdefault("file", sys.stderr)
+ kwargs.setdefault('file', sys.stderr)
print(*message, **kwargs)
def readme(self):
locpath = os.path.join("{0}".format(self._root), self._fulltext_dir, xmlfname)
# Grab the top-level xml element containing the fulltext annotation
- elt = XMLCorpusView(locpath, "fullTextAnnotation")[0]
+ elt = XMLCorpusView(locpath, 'fullTextAnnotation')[0]
info = self._handle_fulltextannotation_elt(elt)
# add metadata
for k, v in self._fulltext_idx[fn_docid].items():
# get the name of the frame with this id number
try:
fentry = self._frame_idx[fn_fid]
- if "_type" in fentry:
+ if '_type' in fentry:
return fentry # full frame object is cached
- name = fentry["name"]
+ name = fentry['name']
except TypeError:
self._buildframeindex()
- name = self._frame_idx[fn_fid]["name"]
+ name = self._frame_idx[fn_fid]['name']
except KeyError:
- raise FramenetError("Unknown frame id: {0}".format(fn_fid))
+ raise FramenetError('Unknown frame id: {0}'.format(fn_fid))
return self.frame_by_name(name, ignorekeys, check_cache=False)
# print(locpath, file=sys.stderr)
# Grab the xml for the frame
try:
- elt = XMLCorpusView(locpath, "frame")[0]
+ elt = XMLCorpusView(locpath, 'frame')[0]
except IOError:
- raise FramenetError("Unknown frame: {0}".format(fn_fname))
+ raise FramenetError('Unknown frame: {0}'.format(fn_fname))
fentry = self._handle_frame_elt(elt, ignorekeys)
assert fentry
- fentry.URL = self._fnweb_url + "/" + self._frame_dir + "/" + fn_fname + ".xml"
+ fentry.URL = self._fnweb_url + '/' + self._frame_dir + '/' + fn_fname + '.xml'
# INFERENCE RULE: propagate lexical semtypes from the frame to all its LUs
for st in fentry.semTypes:
- if st.rootType.name == "Lexical_type":
+ if st.rootType.name == 'Lexical_type':
for lu in fentry.lexUnit.values():
if not any(
x is st for x in lu.semTypes
self._frame_idx[fentry.ID] = fentry
self._cached_frames[fentry.name] = fentry.ID
- """
+ '''
# now set up callables to resolve the LU pointers lazily.
# (could also do this here--caching avoids infinite recursion.)
for luName,luinfo in fentry.lexUnit.items():
fentry.lexUnit[luName] = (lambda luID: Future(lambda: self.lu(luID)))(luinfo.ID)
- """
+ '''
return fentry
def frame(self, fn_fid_or_fname, ignorekeys=[]):
"""
# get the frame info by name or id number
- if isinstance(fn_fid_or_fname, str):
+ if isinstance(fn_fid_or_fname, string_types):
f = self.frame_by_name(fn_fid_or_fname, ignorekeys)
else:
f = self.frame_by_id(fn_fid_or_fname, ignorekeys)
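# Usage sketch: frame() accepts either a frame name or a numeric ID, dispatching to
# frame_by_name() or frame_by_id() as shown above. Assumes the FrameNet data package
# is installed; 'Medical_conditions' is a real frame name, used only as an illustration.
from nltk.corpus import framenet as fn
f1 = fn.frame('Medical_conditions')  # lookup by exact frame name
f2 = fn.frame(f1.ID)                 # lookup by numeric frame ID
assert f1.name == f2.name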
>>> lu # doctest: +ELLIPSIS
{'ID': 256,
'POS': 'V',
- 'URL': 'https://framenet2.icsi.berkeley.edu/fnReports/data/lu/lu256.xml',
+ 'URL': u'https://framenet2.icsi.berkeley.edu/fnReports/data/lu/lu256.xml',
'_type': 'lu',
'cBy': ...,
'cDate': '02/08/2001 01:27:50 PST Thu',
:return: Basic information about the lexical unit
:rtype: dict
"""
- return self.lu(fn_luid, ignorekeys=["subCorpus", "exemplars"])
+ return self.lu(fn_luid, ignorekeys=['subCorpus', 'exemplars'])
def lu(self, fn_luid, ignorekeys=[], luName=None, frameID=None, frameName=None):
"""
# luName, frameID, and frameName. However, this will not be listed
# among the LUs for its frame.
self._warn(
- "LU ID not found: {0} ({1}) in {2} ({3})".format(
+ 'LU ID not found: {0} ({1}) in {2} ({3})'.format(
luName, fn_luid, frameName, frameID
)
)
luinfo = AttrDict(
{
- "_type": "lu",
- "ID": fn_luid,
- "name": luName,
- "frameID": frameID,
- "status": "Problem",
+ '_type': 'lu',
+ 'ID': fn_luid,
+ 'name': luName,
+ 'frameID': frameID,
+ 'status': 'Problem',
}
)
f = self.frame_by_id(luinfo.frameID)
assert f.name == frameName, (f.name, frameName)
- luinfo["frame"] = f
+ luinfo['frame'] = f
self._lu_idx[fn_luid] = luinfo
- elif "_type" not in luinfo:
+ elif '_type' not in luinfo:
# we only have an index entry for the LU. loading the frame will replace this.
f = self.frame_by_id(luinfo.frameID)
luinfo = self._lu_idx[fn_luid]
self._buildluindex()
try:
- elt = XMLCorpusView(locpath, "lexUnit")[0]
+ elt = XMLCorpusView(locpath, 'lexUnit')[0]
except IOError:
- raise FramenetError("Unknown LU id: {0}".format(fn_luid))
+ raise FramenetError('Unknown LU id: {0}'.format(fn_luid))
lu2 = self._handle_lexunit_elt(elt, ignorekeys)
- lu.URL = self._fnweb_url + "/" + self._lu_dir + "/" + fname
+ lu.URL = self._fnweb_url + '/' + self._lu_dir + '/' + fname
lu.subCorpus = lu2.subCorpus
lu.exemplars = SpecialList(
- "luexemplars", [sent for subc in lu.subCorpus for sent in subc.sentence]
+ 'luexemplars', [sent for subc in lu.subCorpus for sent in subc.sentence]
)
for sent in lu.exemplars:
- sent["LU"] = lu
- sent["frame"] = lu.frame
+ sent['LU'] = lu
+ sent['frame'] = lu.frame
for aset in sent.annotationSet:
- aset["LU"] = lu
- aset["frame"] = lu.frame
+ aset['LU'] = lu
+ aset['frame'] = lu.frame
return lu
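# Usage sketch: lu_basic() (defined above) is lu() minus the heavyweight 'subCorpus'
# and 'exemplars' fields. LU 256 is the one shown in the doctest earlier; assumes the
# FrameNet data package is installed.
from nltk.corpus import framenet as fn
lu = fn.lu_basic(256)
print(lu.name, lu.POS, lu.frame.name)  # basic fields plus the frame backpointer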
x
for x in XMLCorpusView(
self.abspath("semTypes.xml"),
- "semTypes/semType",
+ 'semTypes/semType',
self._handle_semtype_elt,
)
]
for st in semtypeXML:
- n = st["name"]
- a = st["abbrev"]
- i = st["ID"]
+ n = st['name']
+ a = st['abbrev']
+ i = st['ID']
# Both name and abbrev should be able to retrieve the
# ID. The ID will retrieve the semantic type dict itself.
self._semtypes[n] = i
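# Usage sketch: because the index above maps both the name and the abbreviation of a
# semantic type to its ID, semtype() can be called with a name, an abbreviation, or a
# numeric ID. 'Temperature' is a real semantic type name, used only as an illustration.
from nltk.corpus import framenet as fn
print(fn.semtype('Temperature'))  # same record regardless of which key is used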
changed = True
nPropagations += 1
if (
- ferel.type.name in ["Perspective_on", "Subframe", "Precedes"]
+ ferel.type.name in ['Perspective_on', 'Subframe', 'Precedes']
and subST
and subST is not superST
):
)
def fes(self, name=None, frame=None):
- """
+ '''
Lists frame element objects. If 'name' is provided, this is treated as
a case-insensitive regular expression to filter by frame name.
(Case-insensitivity is because casing of frame element names is not always consistent across frames.)
:type name: str
:return: A list of matching frame elements
:rtype: list(AttrDict)
- """
+ '''
# what frames are we searching in?
if frame is not None:
if isinstance(frame, int):
frames = [self.frame(frame)]
- elif isinstance(frame, str):
+ elif isinstance(frame, string_types):
frames = self.frames(frame)
else:
frames = [frame]
if frame is not None:
if isinstance(frame, int):
frameIDs = {frame}
- elif isinstance(frame, str):
+ elif isinstance(frame, string_types):
frameIDs = {f.ID for f in self.frames(frame)}
else:
frameIDs = {frame.ID}
elif frame is not None: # all LUs in matching frames
if isinstance(frame, int):
frames = [self.frame(frame)]
- elif isinstance(frame, str):
+ elif isinstance(frame, string_types):
frames = self.frames(frame)
else:
frames = [frame]
return ftlist
else:
return PrettyList(
- x for x in ftlist if re.search(name, x["filename"]) is not None
+ x for x in ftlist if re.search(name, x['filename']) is not None
)
def docs(self, name=None):
aset
for sent in self.ft_sents()
for aset in sent.annotationSet[1:]
- if luNamePattern is None or aset.get("luID", "CXN_ASET") in matchedLUIDs
+ if luNamePattern is None or aset.get('luID', 'CXN_ASET') in matchedLUIDs
)
else:
ftpart = []
be specified to retrieve sentences with both overt FEs (in either order).
"""
if fe is None and fe2 is not None:
- raise FramenetError("exemplars(..., fe=None, fe2=<value>) is not allowed")
+ raise FramenetError('exemplars(..., fe=None, fe2=<value>) is not allowed')
elif fe is not None and fe2 is not None:
- if not isinstance(fe2, str):
- if isinstance(fe, str):
+ if not isinstance(fe2, string_types):
+ if isinstance(fe, string_types):
# fe2 is specific to a particular frame. swap fe and fe2 so fe is always used to determine the frame.
fe, fe2 = fe2, fe
elif fe.frame is not fe2.frame: # ensure frames match
raise FramenetError(
- "exemplars() call with inconsistent `fe` and `fe2` specification (frames must match)"
+ 'exemplars() call with inconsistent `fe` and `fe2` specification (frames must match)'
)
- if frame is None and fe is not None and not isinstance(fe, str):
+ if frame is None and fe is not None and not isinstance(fe, string_types):
frame = fe.frame
# narrow down to frames matching criteria
list
) # frame name -> matching LUs, if luNamePattern is specified
if frame is not None or luNamePattern is not None:
- if frame is None or isinstance(frame, str):
+ if frame is None or isinstance(frame, string_types):
if luNamePattern is not None:
frames = set()
for lu in self.lus(luNamePattern, frame=frame):
lusByFrame = {frame.name: self.lus(luNamePattern, frame=frame)}
if fe is not None: # narrow to frames that define this FE
- if isinstance(fe, str):
+ if isinstance(fe, string_types):
frames = PrettyLazyIteratorList(
f
for f in frames
else:
if fe.frame not in frames:
raise FramenetError(
- "exemplars() call with inconsistent `frame` and `fe` specification"
+ 'exemplars() call with inconsistent `frame` and `fe` specification'
)
frames = [fe.frame]
if fe2 is not None: # narrow to frames that ALSO define this FE
- if isinstance(fe2, str):
+ if isinstance(fe2, string_types):
frames = PrettyLazyIteratorList(
f
for f in frames
if fe is not None:
fes = (
{ffe for ffe in f.FE.keys() if re.search(fe, ffe, re.I)}
- if isinstance(fe, str)
+ if isinstance(fe, string_types)
else {fe.name}
)
if fe2 is not None:
fes2 = (
{ffe for ffe in f.FE.keys() if re.search(fe2, ffe, re.I)}
- if isinstance(fe2, str)
+ if isinstance(fe2, string_types)
else {fe2.name}
)
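# Usage sketch for exemplars(): annotated sentences can be filtered by an LU name
# pattern, a frame, and one or two overtly expressed FEs; as checked above, fe and fe2
# must belong to the same frame. 'Apply_heat'/'Food' are real frame/FE names used only
# as an illustration; assumes the FrameNet data package is installed.
from nltk.corpus import framenet as fn
for i, ex in enumerate(fn.exemplars(r'(?i)bake', frame='Apply_heat', fe='Food')):
    print(ex.text)
    if i >= 2:  # just show the first few matching exemplar sentences
        break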
If 'fes' is None, returns all overt FE names.
"""
overtNames = set(list(zip(*ex.FE[0]))[2]) if ex.FE[0] else set()
- if "FE2" in ex:
+ if 'FE2' in ex:
overtNames |= set(list(zip(*ex.FE2[0]))[2]) if ex.FE2[0] else set()
- if "FE3" in ex:
+ if 'FE3' in ex:
overtNames |= set(list(zip(*ex.FE3[0]))[2]) if ex.FE3[0] else set()
return overtNames & fes if fes is not None else overtNames
# lookup by 'frame'
if frame is not None:
- if isinstance(frame, dict) and "frameRelations" in frame:
+ if isinstance(frame, dict) and 'frameRelations' in frame:
rels = PrettyList(frame.frameRelations)
else:
if not isinstance(frame, int):
# Ignore these attributes when loading attributes from an xml node
ignore_attrs = [ #'cBy', 'cDate', 'mDate', # <-- annotation metadata that could be of interest
- "xsi",
- "schemaLocation",
- "xmlns",
- "bgColor",
- "fgColor",
+ 'xsi',
+ 'schemaLocation',
+ 'xmlns',
+ 'bgColor',
+ 'fgColor',
]
for attr in attr_dict:
"""
try:
- """
+ '''
# Look for boundary issues in markup. (Sometimes FEs are pluralized in definitions.)
m = re.search(r'\w[<][^/]|[<][/][^>]+[>](s\w|[a-rt-z0-9])', data)
if m:
print('Markup boundary:', data[max(0,m.start(0)-10):m.end(0)+10].replace('\n',' '), file=sys.stderr)
- """
-
- data = data.replace("<t>", "")
- data = data.replace("</t>", "")
- data = re.sub('<fex name="[^"]+">', "", data)
- data = data.replace("</fex>", "")
- data = data.replace("<fen>", "")
- data = data.replace("</fen>", "")
- data = data.replace("<m>", "")
- data = data.replace("</m>", "")
- data = data.replace("<ment>", "")
- data = data.replace("</ment>", "")
- data = data.replace("<ex>", "'")
- data = data.replace("</ex>", "'")
- data = data.replace("<gov>", "")
- data = data.replace("</gov>", "")
- data = data.replace("<x>", "")
- data = data.replace("</x>", "")
+ '''
+
+ data = data.replace('<t>', '')
+ data = data.replace('</t>', '')
+ data = re.sub('<fex name="[^"]+">', '', data)
+ data = data.replace('</fex>', '')
+ data = data.replace('<fen>', '')
+ data = data.replace('</fen>', '')
+ data = data.replace('<m>', '')
+ data = data.replace('</m>', '')
+ data = data.replace('<ment>', '')
+ data = data.replace('</ment>', '')
+ data = data.replace('<ex>', "'")
+ data = data.replace('</ex>', "'")
+ data = data.replace('<gov>', '')
+ data = data.replace('</gov>', '')
+ data = data.replace('<x>', '')
+ data = data.replace('</x>', '')
# Get rid of <def-root> and </def-root> tags
- data = data.replace("<def-root>", "")
- data = data.replace("</def-root>", "")
+ data = data.replace('<def-root>', '')
+ data = data.replace('</def-root>', '')
- data = data.replace("\n", " ")
+ data = data.replace('\n', ' ')
except AttributeError:
pass
corpid = ftinfo.ID
retlist = []
for sub in elt:
- if sub.tag.endswith("document"):
+ if sub.tag.endswith('document'):
doc = self._load_xml_attributes(AttrDict(), sub)
- if "name" in doc:
+ if 'name' in doc:
docname = doc.name
else:
docname = doc.description
doc.filename = "{0}__{1}.xml".format(corpname, docname)
doc.URL = (
- self._fnweb_url + "/" + self._fulltext_dir + "/" + doc.filename
+ self._fnweb_url + '/' + self._fulltext_dir + '/' + doc.filename
)
doc.corpname = corpname
doc.corpid = corpid
"""Load the info for a Frame from a frame xml file"""
frinfo = self._load_xml_attributes(AttrDict(), elt)
- frinfo["_type"] = "frame"
- frinfo["definition"] = ""
- frinfo["definitionMarkup"] = ""
- frinfo["FE"] = PrettyDict()
- frinfo["FEcoreSets"] = []
- frinfo["lexUnit"] = PrettyDict()
- frinfo["semTypes"] = []
+ frinfo['_type'] = 'frame'
+ frinfo['definition'] = ""
+ frinfo['definitionMarkup'] = ""
+ frinfo['FE'] = PrettyDict()
+ frinfo['FEcoreSets'] = []
+ frinfo['lexUnit'] = PrettyDict()
+ frinfo['semTypes'] = []
for k in ignorekeys:
if k in frinfo:
del frinfo[k]
for sub in elt:
- if sub.tag.endswith("definition") and "definition" not in ignorekeys:
- frinfo["definitionMarkup"] = sub.text
- frinfo["definition"] = self._strip_tags(sub.text)
- elif sub.tag.endswith("FE") and "FE" not in ignorekeys:
+ if sub.tag.endswith('definition') and 'definition' not in ignorekeys:
+ frinfo['definitionMarkup'] = sub.text
+ frinfo['definition'] = self._strip_tags(sub.text)
+ elif sub.tag.endswith('FE') and 'FE' not in ignorekeys:
feinfo = self._handle_fe_elt(sub)
- frinfo["FE"][feinfo.name] = feinfo
- feinfo["frame"] = frinfo # backpointer
- elif sub.tag.endswith("FEcoreSet") and "FEcoreSet" not in ignorekeys:
+ frinfo['FE'][feinfo.name] = feinfo
+ feinfo['frame'] = frinfo # backpointer
+ elif sub.tag.endswith('FEcoreSet') and 'FEcoreSet' not in ignorekeys:
coreset = self._handle_fecoreset_elt(sub)
# assumes all FEs have been loaded before coresets
- frinfo["FEcoreSets"].append(
- PrettyList(frinfo["FE"][fe.name] for fe in coreset)
+ frinfo['FEcoreSets'].append(
+ PrettyList(frinfo['FE'][fe.name] for fe in coreset)
)
- elif sub.tag.endswith("lexUnit") and "lexUnit" not in ignorekeys:
+ elif sub.tag.endswith('lexUnit') and 'lexUnit' not in ignorekeys:
luentry = self._handle_framelexunit_elt(sub)
- if luentry["status"] in self._bad_statuses:
+ if luentry['status'] in self._bad_statuses:
# problematic LU entry; ignore it
continue
- luentry["frame"] = frinfo
- luentry["URL"] = (
+ luentry['frame'] = frinfo
+ luentry['URL'] = (
self._fnweb_url
- + "/"
+ + '/'
+ self._lu_dir
- + "/"
- + "lu{0}.xml".format(luentry["ID"])
+ + '/'
+ + "lu{0}.xml".format(luentry['ID'])
)
- luentry["subCorpus"] = Future(
+ luentry['subCorpus'] = Future(
(lambda lu: lambda: self._lu_file(lu).subCorpus)(luentry)
)
- luentry["exemplars"] = Future(
+ luentry['exemplars'] = Future(
(lambda lu: lambda: self._lu_file(lu).exemplars)(luentry)
)
- frinfo["lexUnit"][luentry.name] = luentry
+ frinfo['lexUnit'][luentry.name] = luentry
if not self._lu_idx:
self._buildluindex()
self._lu_idx[luentry.ID] = luentry
- elif sub.tag.endswith("semType") and "semTypes" not in ignorekeys:
+ elif sub.tag.endswith('semType') and 'semTypes' not in ignorekeys:
semtypeinfo = self._load_xml_attributes(AttrDict(), sub)
- frinfo["semTypes"].append(self.semtype(semtypeinfo.ID))
+ frinfo['semTypes'].append(self.semtype(semtypeinfo.ID))
- frinfo["frameRelations"] = self.frame_relations(frame=frinfo)
+ frinfo['frameRelations'] = self.frame_relations(frame=frinfo)
# resolve 'requires' and 'excludes' links between FEs of this frame
for fe in frinfo.FE.values():
def _handle_framerelationtype_elt(self, elt, *args):
"""Load frame-relation element and its child fe-relation elements from frRelation.xml."""
info = self._load_xml_attributes(AttrDict(), elt)
- info["_type"] = "framerelationtype"
- info["frameRelations"] = PrettyList()
+ info['_type'] = 'framerelationtype'
+ info['frameRelations'] = PrettyList()
for sub in elt:
- if sub.tag.endswith("frameRelation"):
+ if sub.tag.endswith('frameRelation'):
frel = self._handle_framerelation_elt(sub)
- frel["type"] = info # backpointer
+ frel['type'] = info # backpointer
for ferel in frel.feRelations:
- ferel["type"] = info
- info["frameRelations"].append(frel)
+ ferel['type'] = info
+ info['frameRelations'].append(frel)
return info
def _handle_framerelation_elt(self, elt):
"""Load frame-relation element and its child fe-relation elements from frRelation.xml."""
info = self._load_xml_attributes(AttrDict(), elt)
- assert info["superFrameName"] != info["subFrameName"], (elt, info)
- info["_type"] = "framerelation"
- info["feRelations"] = PrettyList()
+ assert info['superFrameName'] != info['subFrameName'], (elt, info)
+ info['_type'] = 'framerelation'
+ info['feRelations'] = PrettyList()
for sub in elt:
- if sub.tag.endswith("FERelation"):
+ if sub.tag.endswith('FERelation'):
ferel = self._handle_elt(sub)
- ferel["_type"] = "ferelation"
- ferel["frameRelation"] = info # backpointer
- info["feRelations"].append(ferel)
+ ferel['_type'] = 'ferelation'
+ ferel['frameRelation'] = info # backpointer
+ info['feRelations'].append(ferel)
return info
element (which we ignore here) and a bunch of 'sentence'
elements."""
info = AttrDict()
- info["_type"] = "fulltext_annotation"
- info["sentence"] = []
+ info['_type'] = 'fulltext_annotation'
+ info['sentence'] = []
for sub in elt:
- if sub.tag.endswith("header"):
+ if sub.tag.endswith('header'):
continue # not used
- elif sub.tag.endswith("sentence"):
+ elif sub.tag.endswith('sentence'):
s = self._handle_fulltext_sentence_elt(sub)
s.doc = info
- info["sentence"].append(s)
+ info['sentence'].append(s)
return info
'sentence' element contains a "text" and "annotationSet" sub
elements."""
info = self._load_xml_attributes(AttrDict(), elt)
- info["_type"] = "fulltext_sentence"
- info["annotationSet"] = []
- info["targets"] = []
+ info['_type'] = "fulltext_sentence"
+ info['annotationSet'] = []
+ info['targets'] = []
target_spans = set()
- info["_ascii"] = types.MethodType(
+ info['_ascii'] = types.MethodType(
_annotation_ascii, info
) # attach a method for this instance
- info["text"] = ""
+ info['text'] = ""
for sub in elt:
- if sub.tag.endswith("text"):
- info["text"] = self._strip_tags(sub.text)
- elif sub.tag.endswith("annotationSet"):
+ if sub.tag.endswith('text'):
+ info['text'] = self._strip_tags(sub.text)
+ elif sub.tag.endswith('annotationSet'):
a = self._handle_fulltextannotationset_elt(
- sub, is_pos=(len(info["annotationSet"]) == 0)
+ sub, is_pos=(len(info['annotationSet']) == 0)
)
- if "cxnID" in a: # ignoring construction annotations for now
+ if 'cxnID' in a: # ignoring construction annotations for now
continue
a.sent = info
a.text = info.text
- info["annotationSet"].append(a)
- if "Target" in a:
+ info['annotationSet'].append(a)
+ if 'Target' in a:
for tspan in a.Target:
if tspan in target_spans:
self._warn(
info.text[slice(*tspan)]
),
tspan,
- "in sentence",
- info["ID"],
+ 'in sentence',
+ info['ID'],
info.text,
)
# this can happen in cases like "chemical and biological weapons"
# being annotated as "chemical weapons" and "biological weapons"
else:
target_spans.add(tspan)
- info["targets"].append((a.Target, a.luName, a.frameName))
+ info['targets'].append((a.Target, a.luName, a.frameName))
- assert info["annotationSet"][0].status == "UNANN"
- info["POS"] = info["annotationSet"][0].POS
- info["POS_tagset"] = info["annotationSet"][0].POS_tagset
+ assert info['annotationSet'][0].status == 'UNANN'
+ info['POS'] = info['annotationSet'][0].POS
+ info['POS_tagset'] = info['annotationSet'][0].POS_tagset
return info
def _handle_fulltextannotationset_elt(self, elt, is_pos=False):
info = self._handle_luannotationset_elt(elt, is_pos=is_pos)
if not is_pos:
- info["_type"] = "fulltext_annotationset"
- if "cxnID" not in info: # ignoring construction annotations for now
- info["LU"] = self.lu(
+ info['_type'] = 'fulltext_annotationset'
+ if 'cxnID' not in info: # ignoring construction annotations for now
+ info['LU'] = self.lu(
info.luID,
luName=info.luName,
frameID=info.frameID,
frameName=info.frameName,
)
- info["frame"] = info.LU.frame
+ info['frame'] = info.LU.frame
return info
def _handle_fulltextlayer_elt(self, elt):
"""Load information from the given 'layer' element. Each
'layer' contains several "label" elements."""
info = self._load_xml_attributes(AttrDict(), elt)
- info["_type"] = "layer"
- info["label"] = []
+ info['_type'] = 'layer'
+ info['label'] = []
for sub in elt:
- if sub.tag.endswith("label"):
+ if sub.tag.endswith('label'):
l = self._load_xml_attributes(AttrDict(), sub)
- info["label"].append(l)
+ info['label'].append(l)
return info
def _handle_framelexunit_elt(self, elt):
"""Load the lexical unit info from an xml element in a frame's xml file."""
luinfo = AttrDict()
- luinfo["_type"] = "lu"
+ luinfo['_type'] = 'lu'
luinfo = self._load_xml_attributes(luinfo, elt)
luinfo["definition"] = ""
luinfo["definitionMarkup"] = ""
luinfo["sentenceCount"] = PrettyDict()
- luinfo["lexemes"] = PrettyList() # multiword LUs have multiple lexemes
- luinfo["semTypes"] = PrettyList() # an LU can have multiple semtypes
+ luinfo['lexemes'] = PrettyList() # multiword LUs have multiple lexemes
+ luinfo['semTypes'] = PrettyList() # an LU can have multiple semtypes
for sub in elt:
- if sub.tag.endswith("definition"):
- luinfo["definitionMarkup"] = sub.text
- luinfo["definition"] = self._strip_tags(sub.text)
- elif sub.tag.endswith("sentenceCount"):
- luinfo["sentenceCount"] = self._load_xml_attributes(PrettyDict(), sub)
- elif sub.tag.endswith("lexeme"):
+ if sub.tag.endswith('definition'):
+ luinfo['definitionMarkup'] = sub.text
+ luinfo['definition'] = self._strip_tags(sub.text)
+ elif sub.tag.endswith('sentenceCount'):
+ luinfo['sentenceCount'] = self._load_xml_attributes(PrettyDict(), sub)
+ elif sub.tag.endswith('lexeme'):
lexemeinfo = self._load_xml_attributes(PrettyDict(), sub)
- if not isinstance(lexemeinfo.name, str):
+ if not isinstance(lexemeinfo.name, string_types):
# some lexeme names are ints by default: e.g.,
# thousand.num has lexeme with name="1000"
lexemeinfo.name = str(lexemeinfo.name)
- luinfo["lexemes"].append(lexemeinfo)
- elif sub.tag.endswith("semType"):
+ luinfo['lexemes'].append(lexemeinfo)
+ elif sub.tag.endswith('semType'):
semtypeinfo = self._load_xml_attributes(PrettyDict(), sub)
- luinfo["semTypes"].append(self.semtype(semtypeinfo.ID))
+ luinfo['semTypes'].append(self.semtype(semtypeinfo.ID))
# sort lexemes by 'order' attribute
# otherwise, e.g., 'write down.v' may have lexemes in wrong order
- luinfo["lexemes"].sort(key=lambda x: x.order)
+ luinfo['lexemes'].sort(key=lambda x: x.order)
return luinfo
(which are not included in frame files).
"""
luinfo = self._load_xml_attributes(AttrDict(), elt)
- luinfo["_type"] = "lu"
- luinfo["definition"] = ""
- luinfo["definitionMarkup"] = ""
- luinfo["subCorpus"] = PrettyList()
- luinfo["lexemes"] = PrettyList() # multiword LUs have multiple lexemes
- luinfo["semTypes"] = PrettyList() # an LU can have multiple semtypes
+ luinfo['_type'] = 'lu'
+ luinfo['definition'] = ""
+ luinfo['definitionMarkup'] = ""
+ luinfo['subCorpus'] = PrettyList()
+ luinfo['lexemes'] = PrettyList() # multiword LUs have multiple lexemes
+ luinfo['semTypes'] = PrettyList() # an LU can have multiple semtypes
for k in ignorekeys:
if k in luinfo:
del luinfo[k]
for sub in elt:
- if sub.tag.endswith("header"):
+ if sub.tag.endswith('header'):
continue # not used
- elif sub.tag.endswith("valences"):
+ elif sub.tag.endswith('valences'):
continue # not used
- elif sub.tag.endswith("definition") and "definition" not in ignorekeys:
- luinfo["definitionMarkup"] = sub.text
- luinfo["definition"] = self._strip_tags(sub.text)
- elif sub.tag.endswith("subCorpus") and "subCorpus" not in ignorekeys:
+ elif sub.tag.endswith('definition') and 'definition' not in ignorekeys:
+ luinfo['definitionMarkup'] = sub.text
+ luinfo['definition'] = self._strip_tags(sub.text)
+ elif sub.tag.endswith('subCorpus') and 'subCorpus' not in ignorekeys:
sc = self._handle_lusubcorpus_elt(sub)
if sc is not None:
- luinfo["subCorpus"].append(sc)
- elif sub.tag.endswith("lexeme") and "lexeme" not in ignorekeys:
- luinfo["lexemes"].append(self._load_xml_attributes(PrettyDict(), sub))
- elif sub.tag.endswith("semType") and "semType" not in ignorekeys:
+ luinfo['subCorpus'].append(sc)
+ elif sub.tag.endswith('lexeme') and 'lexeme' not in ignorekeys:
+ luinfo['lexemes'].append(self._load_xml_attributes(PrettyDict(), sub))
+ elif sub.tag.endswith('semType') and 'semType' not in ignorekeys:
semtypeinfo = self._load_xml_attributes(AttrDict(), sub)
- luinfo["semTypes"].append(self.semtype(semtypeinfo.ID))
+ luinfo['semTypes'].append(self.semtype(semtypeinfo.ID))
return luinfo
"""Load a subcorpus of a lexical unit from the given xml."""
sc = AttrDict()
try:
- sc["name"] = elt.get("name")
+ sc['name'] = elt.get('name')
except AttributeError:
return None
- sc["_type"] = "lusubcorpus"
- sc["sentence"] = []
+ sc['_type'] = "lusubcorpus"
+ sc['sentence'] = []
for sub in elt:
- if sub.tag.endswith("sentence"):
+ if sub.tag.endswith('sentence'):
s = self._handle_lusentence_elt(sub)
if s is not None:
- sc["sentence"].append(s)
+ sc['sentence'].append(s)
return sc
def _handle_lusentence_elt(self, elt):
"""Load a sentence from a subcorpus of an LU from xml."""
info = self._load_xml_attributes(AttrDict(), elt)
- info["_type"] = "lusentence"
- info["annotationSet"] = []
- info["_ascii"] = types.MethodType(
+ info['_type'] = 'lusentence'
+ info['annotationSet'] = []
+ info['_ascii'] = types.MethodType(
_annotation_ascii, info
) # attach a method for this instance
for sub in elt:
- if sub.tag.endswith("text"):
- info["text"] = self._strip_tags(sub.text)
- elif sub.tag.endswith("annotationSet"):
+ if sub.tag.endswith('text'):
+ info['text'] = self._strip_tags(sub.text)
+ elif sub.tag.endswith('annotationSet'):
annset = self._handle_luannotationset_elt(
- sub, is_pos=(len(info["annotationSet"]) == 0)
+ sub, is_pos=(len(info['annotationSet']) == 0)
)
if annset is not None:
- assert annset.status == "UNANN" or "FE" in annset, annset
- if annset.status != "UNANN":
- info["frameAnnotation"] = annset
+ assert annset.status == 'UNANN' or 'FE' in annset, annset
+ if annset.status != 'UNANN':
+ info['frameAnnotation'] = annset
# copy layer info up to current level
for k in (
- "Target",
- "FE",
- "FE2",
- "FE3",
- "GF",
- "PT",
- "POS",
- "POS_tagset",
- "Other",
- "Sent",
- "Verb",
- "Noun",
- "Adj",
- "Adv",
- "Prep",
- "Scon",
- "Art",
+ 'Target',
+ 'FE',
+ 'FE2',
+ 'FE3',
+ 'GF',
+ 'PT',
+ 'POS',
+ 'POS_tagset',
+ 'Other',
+ 'Sent',
+ 'Verb',
+ 'Noun',
+ 'Adj',
+ 'Adv',
+ 'Prep',
+ 'Scon',
+ 'Art',
):
if k in annset:
info[k] = annset[k]
- info["annotationSet"].append(annset)
- annset["sent"] = info
- annset["text"] = info.text
+ info['annotationSet'].append(annset)
+ annset['sent'] = info
+ annset['text'] = info.text
return info
def _handle_luannotationset_elt(self, elt, is_pos=False):
"""Load an annotation set from a sentence in an subcorpus of an LU"""
info = self._load_xml_attributes(AttrDict(), elt)
- info["_type"] = "posannotationset" if is_pos else "luannotationset"
- info["layer"] = []
- info["_ascii"] = types.MethodType(
+ info['_type'] = 'posannotationset' if is_pos else 'luannotationset'
+ info['layer'] = []
+ info['_ascii'] = types.MethodType(
_annotation_ascii, info
) # attach a method for this instance
- if "cxnID" in info: # ignoring construction annotations for now.
+ if 'cxnID' in info: # ignoring construction annotations for now.
return info
for sub in elt:
- if sub.tag.endswith("layer"):
+ if sub.tag.endswith('layer'):
l = self._handle_lulayer_elt(sub)
if l is not None:
overt = []
ni = {} # null instantiations
- info["layer"].append(l)
+ info['layer'].append(l)
for lbl in l.label:
- if "start" in lbl:
+ if 'start' in lbl:
thespan = (lbl.start, lbl.end + 1, lbl.name)
if l.name not in (
- "Sent",
- "Other",
+ 'Sent',
+ 'Other',
): # 'Sent' and 'Other' layers sometimes contain accidental duplicate spans
assert thespan not in overt, (info.ID, l.name, thespan)
overt.append(thespan)
else: # null instantiation
if lbl.name in ni:
self._warn(
- "FE with multiple NI entries:",
+ 'FE with multiple NI entries:',
lbl.name,
ni[lbl.name],
lbl.itype,
ni[lbl.name] = lbl.itype
overt = sorted(overt)
- if l.name == "Target":
+ if l.name == 'Target':
if not overt:
self._warn(
- "Skipping empty Target layer in annotation set ID={0}".format(
+ 'Skipping empty Target layer in annotation set ID={0}'.format(
info.ID
)
)
continue
- assert all(lblname == "Target" for i, j, lblname in overt)
- if "Target" in info:
+ assert all(lblname == 'Target' for i, j, lblname in overt)
+ if 'Target' in info:
self._warn(
- "Annotation set {0} has multiple Target layers".format(
+ 'Annotation set {0} has multiple Target layers'.format(
info.ID
)
)
else:
- info["Target"] = [(i, j) for (i, j, _) in overt]
- elif l.name == "FE":
+ info['Target'] = [(i, j) for (i, j, _) in overt]
+ elif l.name == 'FE':
if l.rank == 1:
- assert "FE" not in info
- info["FE"] = (overt, ni)
+ assert 'FE' not in info
+ info['FE'] = (overt, ni)
# assert False,info
else:
# sometimes there are 3 FE layers! e.g. Change_position_on_a_scale.fall.v
assert 2 <= l.rank <= 3, l.rank
- k = "FE" + str(l.rank)
+ k = 'FE' + str(l.rank)
assert k not in info
info[k] = (overt, ni)
- elif l.name in ("GF", "PT"):
+ elif l.name in ('GF', 'PT'):
assert l.rank == 1
info[l.name] = overt
- elif l.name in ("BNC", "PENN"):
+ elif l.name in ('BNC', 'PENN'):
assert l.rank == 1
- info["POS"] = overt
- info["POS_tagset"] = l.name
+ info['POS'] = overt
+ info['POS_tagset'] = l.name
else:
if is_pos:
- if l.name not in ("NER", "WSL"):
+ if l.name not in ('NER', 'WSL'):
self._warn(
- "Unexpected layer in sentence annotationset:",
+ 'Unexpected layer in sentence annotationset:',
l.name,
)
else:
if l.name not in (
- "Sent",
- "Verb",
- "Noun",
- "Adj",
- "Adv",
- "Prep",
- "Scon",
- "Art",
- "Other",
+ 'Sent',
+ 'Verb',
+ 'Noun',
+ 'Adj',
+ 'Adv',
+ 'Prep',
+ 'Scon',
+ 'Art',
+ 'Other',
):
self._warn(
- "Unexpected layer in frame annotationset:", l.name
+ 'Unexpected layer in frame annotationset:', l.name
)
info[l.name] = overt
- if not is_pos and "cxnID" not in info:
- if "Target" not in info:
- self._warn("Missing target in annotation set ID={0}".format(info.ID))
- assert "FE" in info
- if "FE3" in info:
- assert "FE2" in info
+ if not is_pos and 'cxnID' not in info:
+ if 'Target' not in info:
+ self._warn('Missing target in annotation set ID={0}'.format(info.ID))
+ assert 'FE' in info
+ if 'FE3' in info:
+ assert 'FE2' in info
return info
def _handle_lulayer_elt(self, elt):
"""Load a layer from an annotation set"""
layer = self._load_xml_attributes(AttrDict(), elt)
- layer["_type"] = "lulayer"
- layer["label"] = []
+ layer['_type'] = 'lulayer'
+ layer['label'] = []
for sub in elt:
- if sub.tag.endswith("label"):
+ if sub.tag.endswith('label'):
l = self._load_xml_attributes(AttrDict(), sub)
if l is not None:
- layer["label"].append(l)
+ layer['label'].append(l)
return layer
def _handle_fe_elt(self, elt):
feinfo = self._load_xml_attributes(AttrDict(), elt)
- feinfo["_type"] = "fe"
- feinfo["definition"] = ""
- feinfo["definitionMarkup"] = ""
- feinfo["semType"] = None
- feinfo["requiresFE"] = None
- feinfo["excludesFE"] = None
+ feinfo['_type'] = 'fe'
+ feinfo['definition'] = ""
+ feinfo['definitionMarkup'] = ""
+ feinfo['semType'] = None
+ feinfo['requiresFE'] = None
+ feinfo['excludesFE'] = None
for sub in elt:
- if sub.tag.endswith("definition"):
- feinfo["definitionMarkup"] = sub.text
- feinfo["definition"] = self._strip_tags(sub.text)
- elif sub.tag.endswith("semType"):
+ if sub.tag.endswith('definition'):
+ feinfo['definitionMarkup'] = sub.text
+ feinfo['definition'] = self._strip_tags(sub.text)
+ elif sub.tag.endswith('semType'):
stinfo = self._load_xml_attributes(AttrDict(), sub)
- feinfo["semType"] = self.semtype(stinfo.ID)
- elif sub.tag.endswith("requiresFE"):
- feinfo["requiresFE"] = self._load_xml_attributes(AttrDict(), sub)
- elif sub.tag.endswith("excludesFE"):
- feinfo["excludesFE"] = self._load_xml_attributes(AttrDict(), sub)
+ feinfo['semType'] = self.semtype(stinfo.ID)
+ elif sub.tag.endswith('requiresFE'):
+ feinfo['requiresFE'] = self._load_xml_attributes(AttrDict(), sub)
+ elif sub.tag.endswith('excludesFE'):
+ feinfo['excludesFE'] = self._load_xml_attributes(AttrDict(), sub)
return feinfo
def _handle_semtype_elt(self, elt, tagspec=None):
semt = self._load_xml_attributes(AttrDict(), elt)
- semt["_type"] = "semtype"
- semt["superType"] = None
- semt["subTypes"] = PrettyList()
+ semt['_type'] = 'semtype'
+ semt['superType'] = None
+ semt['subTypes'] = PrettyList()
for sub in elt:
if sub.text is not None:
- semt["definitionMarkup"] = sub.text
- semt["definition"] = self._strip_tags(sub.text)
+ semt['definitionMarkup'] = sub.text
+ semt['definition'] = self._strip_tags(sub.text)
else:
supertypeinfo = self._load_xml_attributes(AttrDict(), sub)
- semt["superType"] = supertypeinfo
+ semt['superType'] = supertypeinfo
# the supertype may not have been loaded yet
return semt
# buildindexes(). We do this here just for demo purposes. If the
# indexes are not built explicitly, they will be built as needed.
#
- print("Building the indexes...")
+ print('Building the indexes...')
fn.buildindexes()
#
# Get some statistics about the corpus
#
- print("Number of Frames:", len(fn.frames()))
- print("Number of Lexical Units:", len(fn.lus()))
- print("Number of annotated documents:", len(fn.docs()))
+ print('Number of Frames:', len(fn.frames()))
+ print('Number of Lexical Units:', len(fn.lus()))
+ print('Number of annotated documents:', len(fn.docs()))
print()
#
print(
'getting frames whose name matches the (case insensitive) regex: "(?i)medical"'
)
- medframes = fn.frames(r"(?i)medical")
+ medframes = fn.frames(r'(?i)medical')
print('Found {0} Frames whose name matches "(?i)medical":'.format(len(medframes)))
print([(f.name, f.ID) for f in medframes])
len(m_frame.frameRelations),
)
for fr in m_frame.frameRelations:
- print(" ", fr)
+ print(' ', fr)
#
# get the names of the Frame Elements
'\nNumber of Frame Elements in the "{0}" frame:'.format(m_frame.name),
len(m_frame.FE),
)
- print(" ", [x for x in m_frame.FE])
+ print(' ', [x for x in m_frame.FE])
#
# get the names of the "Core" Frame Elements
#
print('\nThe "core" Frame Elements in the "{0}" frame:'.format(m_frame.name))
- print(" ", [x.name for x in m_frame.FE.values() if x.coreType == "Core"])
+ print(' ', [x.name for x in m_frame.FE.values() if x.coreType == "Core"])
#
# get all of the Lexical Units that are incorporated in the
ailment_lus = [
x
for x in m_frame.lexUnit.values()
- if "incorporatedFE" in x and x.incorporatedFE == "Ailment"
+ if 'incorporatedFE' in x and x.incorporatedFE == 'Ailment'
]
- print(" ", [x.name for x in ailment_lus])
+ print(' ', [x.name for x in ailment_lus])
#
# get all of the Lexical Units for the frame
'\nNumber of Lexical Units in the "{0}" frame:'.format(m_frame.name),
len(m_frame.lexUnit),
)
- print(" ", [x.name for x in m_frame.lexUnit.values()][:5], "...")
+ print(' ', [x.name for x in m_frame.lexUnit.values()][:5], '...')
#
# get basic info on the second LU in the frame
#
- tmp_id = m_frame.lexUnit["ailment.n"].ID # grab the id of the specified LU
+ tmp_id = m_frame.lexUnit['ailment.n'].ID # grab the id of the specified LU
luinfo = fn.lu_basic(tmp_id) # get basic info on the LU
- print("\nInformation on the LU: {0}".format(luinfo.name))
+ print('\nInformation on the LU: {0}'.format(luinfo.name))
pprint(luinfo)
#
# Get a list of all of the corpora used for fulltext annotation
#
- print("\nNames of all of the corpora used for fulltext annotation:")
+ print('\nNames of all of the corpora used for fulltext annotation:')
allcorpora = set(x.corpname for x in fn.docs_metadata())
pprint(list(allcorpora))
print(
'\nSearching for all Frames that have a lemma that matches the regexp: "^run.v$":'
)
- pprint(fn.frames_by_lemma(r"^run.v$"))
+ pprint(fn.frames_by_lemma(r'^run.v$'))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: IEER Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
The corpus contains the following files: APW_19980314, APW_19980424,
APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.
"""
+from __future__ import unicode_literals
+
+from six import string_types
import nltk
+from nltk import compat
from nltk.corpus.reader.api import *
#: A dictionary whose keys are the names of documents in this corpus;
#: and whose values are descriptions of those documents' contents.
titles = {
- "APW_19980314": "Associated Press Weekly, 14 March 1998",
- "APW_19980424": "Associated Press Weekly, 24 April 1998",
- "APW_19980429": "Associated Press Weekly, 29 April 1998",
- "NYT_19980315": "New York Times, 15 March 1998",
- "NYT_19980403": "New York Times, 3 April 1998",
- "NYT_19980407": "New York Times, 7 April 1998",
+ 'APW_19980314': 'Associated Press Weekly, 14 March 1998',
+ 'APW_19980424': 'Associated Press Weekly, 24 April 1998',
+ 'APW_19980429': 'Associated Press Weekly, 29 April 1998',
+ 'NYT_19980315': 'New York Times, 15 March 1998',
+ 'NYT_19980403': 'New York Times, 3 April 1998',
+ 'NYT_19980407': 'New York Times, 7 April 1998',
}
#: A list of all documents in this corpus.
documents = sorted(titles)
-
+@compat.python_2_unicode_compatible
class IEERDocument(object):
- def __init__(self, text, docno=None, doctype=None, date_time=None, headline=""):
+ def __init__(self, text, docno=None, doctype=None, date_time=None, headline=''):
self.text = text
self.docno = docno
self.doctype = doctype
def __repr__(self):
if self.headline:
- headline = " ".join(self.headline.leaves())
+ headline = ' '.join(self.headline.leaves())
else:
headline = (
- " ".join([w for w in self.text.leaves() if w[:1] != "<"][:12]) + "..."
+ ' '.join([w for w in self.text.leaves() if w[:1] != '<'][:12]) + '...'
)
if self.docno is not None:
- return "<IEERDocument %s: %r>" % (self.docno, headline)
+ return '<IEERDocument %s: %r>' % (self.docno, headline)
else:
- return "<IEERDocument: %r>" % headline
+ return '<IEERDocument: %r>' % headline
class IEERCorpusReader(CorpusReader):
def raw(self, fileids=None):
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
line = stream.readline()
if not line:
break
- if line.strip() == "<DOC>":
+ if line.strip() == '<DOC>':
break
out.append(line)
# Read the document
if not line:
break
out.append(line)
- if line.strip() == "</DOC>":
+ if line.strip() == '</DOC>':
break
# Return the document
- return ["\n".join(out)]
+ return ['\n'.join(out)]
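# Usage sketch for the IEER reader above (assumption: the 'ieer' data package is
# installed via nltk.download('ieer')). parsed_docs() yields IEERDocument objects;
# their repr shows the headline, as defined in IEERDocument.__repr__ above.
from nltk.corpus import ieer
doc = ieer.parsed_docs('NYT_19980315')[0]
print(doc.docno)
print(doc)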
# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
- Telugu: IIIT Hyderabad
"""
+from six import string_types
+
from nltk.tag import str2tuple, map_tag
from nltk.corpus.reader.util import *
def raw(self, fileids=None):
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
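# Usage sketch for the Indian POS-tagged reader (assumption: the 'indian' data package
# is installed via nltk.download('indian')). Each line of a .pos file holds one
# sentence of word_TAG tokens, parsed by read_block() below with str2tuple(sep='_').
from nltk.corpus import indian
print(indian.tagged_words('hindi.pos')[:5])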
def read_block(self, stream):
line = stream.readline()
- if line.startswith("<"):
+ if line.startswith('<'):
return []
- sent = [str2tuple(word, sep="_") for word in line.split()]
+ sent = [str2tuple(word, sep='_') for word in line.split()]
if self._tag_mapping_function:
sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent]
if not self._tagged:
# Natural Language Toolkit: IPI PAN Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Konrad Goluchowski <kodie@mimuw.edu.pl>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import functools
+from six import string_types
+
from nltk.corpus.reader.util import StreamBackedCorpusView, concat
from nltk.corpus.reader.api import CorpusReader
def _parse_args(fun):
@functools.wraps(fun)
def decorator(self, fileids=None, **kwargs):
- kwargs.pop("tags", None)
+ kwargs.pop('tags', None)
if not fileids:
fileids = self.fileids()
return fun(self, fileids, **kwargs)
filecontents = []
for fileid in self._list_morph_files(fileids):
- with open(fileid, "r") as infile:
+ with open(fileid, 'r') as infile:
filecontents.append(infile.read())
- return "".join(filecontents)
+ return ''.join(filecontents)
def channels(self, fileids=None):
if not fileids:
fileids = self.fileids()
- return self._parse_header(fileids, "channel")
+ return self._parse_header(fileids, 'channel')
def domains(self, fileids=None):
if not fileids:
fileids = self.fileids()
- return self._parse_header(fileids, "domain")
+ return self._parse_header(fileids, 'domain')
def categories(self, fileids=None):
if not fileids:
fileids = self.fileids()
return [
- self._map_category(cat) for cat in self._parse_header(fileids, "keyTerm")
+ self._map_category(cat) for cat in self._parse_header(fileids, 'keyTerm')
]
def fileids(self, channels=None, domains=None, categories=None):
if channels is not None and domains is not None and categories is not None:
raise ValueError(
- "You can specify only one of channels, domains "
- "and categories parameter at once"
+ 'You can specify only one of channels, domains '
+ 'and categories parameter at once'
)
if channels is None and domains is None and categories is None:
return CorpusReader.fileids(self)
- if isinstance(channels, str):
+ if isinstance(channels, string_types):
channels = [channels]
- if isinstance(domains, str):
+ if isinstance(domains, string_types):
domains = [domains]
- if isinstance(categories, str):
+ if isinstance(categories, string_types):
categories = [categories]
if channels:
- return self._list_morph_files_by("channel", channels)
+ return self._list_morph_files_by('channel', channels)
elif domains:
- return self._list_morph_files_by("domain", domains)
+ return self._list_morph_files_by('domain', domains)
else:
return self._list_morph_files_by(
- "keyTerm", categories, map=self._map_category
+ 'keyTerm', categories, map=self._map_category
)
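# A minimal usage sketch, assuming the IPI PAN corpus sample is unpacked
# under nltk_data/corpora/ipipan; the channel names returned depend entirely
# on the installed data:
from nltk.corpus import ipipan

print(sorted(ipipan.channels())[:3])               # available channel values
first_channel = sorted(ipipan.channels())[0]
print(ipipan.fileids(channels=first_channel)[:3])  # morph.xml files in that channel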
@_parse_args
def _list_header_files(self, fileids):
return [
- f.replace("morph.xml", "header.xml")
+ f.replace('morph.xml', 'header.xml')
for f in self._list_morph_files(fileids)
]
fileids = self.fileids()
ret_fileids = set()
for f in fileids:
- fp = self.abspath(f).replace("morph.xml", "header.xml")
+ fp = self.abspath(f).replace('morph.xml', 'header.xml')
values_list = self._get_tag(fp, tag)
for value in values_list:
if map is not None:
def _get_tag(self, f, tag):
tags = []
- with open(f, "r") as infile:
+ with open(f, 'r') as infile:
header = infile.read()
tag_end = 0
while True:
- tag_pos = header.find("<" + tag, tag_end)
+ tag_pos = header.find('<' + tag, tag_end)
if tag_pos < 0:
return tags
- tag_end = header.find("</" + tag + ">", tag_pos)
+ tag_end = header.find('</' + tag + '>', tag_pos)
tags.append(header[tag_pos + len(tag) + 2 : tag_end])
def _map_category(self, cat):
- pos = cat.find(">")
+ pos = cat.find('>')
if pos == -1:
return cat
else:
return cat[pos + 1 :]
def _view(self, filename, **kwargs):
- tags = kwargs.pop("tags", True)
- mode = kwargs.pop("mode", 0)
- simplify_tags = kwargs.pop("simplify_tags", False)
- one_tag = kwargs.pop("one_tag", True)
- disamb_only = kwargs.pop("disamb_only", True)
- append_no_space = kwargs.pop("append_no_space", False)
- append_space = kwargs.pop("append_space", False)
- replace_xmlentities = kwargs.pop("replace_xmlentities", True)
+ tags = kwargs.pop('tags', True)
+ mode = kwargs.pop('mode', 0)
+ simplify_tags = kwargs.pop('simplify_tags', False)
+ one_tag = kwargs.pop('one_tag', True)
+ disamb_only = kwargs.pop('disamb_only', True)
+ append_no_space = kwargs.pop('append_no_space', False)
+ append_space = kwargs.pop('append_space', False)
+ replace_xmlentities = kwargs.pop('replace_xmlentities', True)
if len(kwargs) > 0:
- raise ValueError("Unexpected arguments: %s" % kwargs.keys())
+ raise ValueError('Unexpected arguments: %s' % kwargs.keys())
if not one_tag and not disamb_only:
raise ValueError(
- "You cannot specify both one_tag=False and " "disamb_only=False"
+ 'You cannot specify both one_tag=False and ' 'disamb_only=False'
)
if not tags and (simplify_tags or not one_tag or not disamb_only):
raise ValueError(
- "You cannot specify simplify_tags, one_tag or "
- "disamb_only with functions other than tagged_*"
+ 'You cannot specify simplify_tags, one_tag or '
+ 'disamb_only with functions other than tagged_*'
)
return IPIPANCorpusView(
self.in_sentence = False
self.position = 0
- self.show_tags = kwargs.pop("tags", True)
- self.disamb_only = kwargs.pop("disamb_only", True)
- self.mode = kwargs.pop("mode", IPIPANCorpusView.WORDS_MODE)
- self.simplify_tags = kwargs.pop("simplify_tags", False)
- self.one_tag = kwargs.pop("one_tag", True)
- self.append_no_space = kwargs.pop("append_no_space", False)
- self.append_space = kwargs.pop("append_space", False)
- self.replace_xmlentities = kwargs.pop("replace_xmlentities", True)
+ self.show_tags = kwargs.pop('tags', True)
+ self.disamb_only = kwargs.pop('disamb_only', True)
+ self.mode = kwargs.pop('mode', IPIPANCorpusView.WORDS_MODE)
+ self.simplify_tags = kwargs.pop('simplify_tags', False)
+ self.one_tag = kwargs.pop('one_tag', True)
+ self.append_no_space = kwargs.pop('append_no_space', False)
+ self.append_space = kwargs.pop('append_space', False)
+ self.replace_xmlentities = kwargs.pop('replace_xmlentities', True)
def read_block(self, stream):
sentence = []
self._seek(stream)
lines = self._read_data(stream)
- if lines == [""]:
+ if lines == ['']:
assert not sentences
return []
self.in_sentence = True
elif line.startswith('<chunk type="p"'):
pass
- elif line.startswith("<tok"):
+ elif line.startswith('<tok'):
if self.append_space and space and not no_space:
self._append_space(sentence)
space = True
no_space = False
orth = ""
tags = set()
- elif line.startswith("</chunk"):
+ elif line.startswith('</chunk'):
if self.in_sentence:
self.in_sentence = False
self._seek(stream)
elif self.mode == self.PARAS_MODE:
self._seek(stream)
return [sentences]
- elif line.startswith("<orth"):
+ elif line.startswith('<orth'):
orth = line[6:-7]
if self.replace_xmlentities:
- orth = orth.replace(""", '"').replace("&", "&")
- elif line.startswith("<lex"):
- if not self.disamb_only or line.find("disamb=") != -1:
- tag = line[line.index("<ctag") + 6 : line.index("</ctag")]
+ orth = orth.replace('"', '"').replace('&', '&')
+ elif line.startswith('<lex'):
+ if not self.disamb_only or line.find('disamb=') != -1:
+ tag = line[line.index('<ctag') + 6 : line.index('</ctag')]
tags.add(tag)
- elif line.startswith("</tok"):
+ elif line.startswith('</tok'):
if self.show_tags:
if self.simplify_tags:
- tags = [t.split(":")[0] for t in tags]
+ tags = [t.split(':')[0] for t in tags]
if not self.one_tag or not self.disamb_only:
sentence.append((orth, tuple(tags)))
else:
sentence.append((orth, tags.pop()))
else:
sentence.append(orth)
- elif line.startswith("<ns/>"):
+ elif line.startswith('<ns/>'):
if self.append_space:
no_space = True
if self.append_no_space:
if self.show_tags:
- sentence.append(("", "no-space"))
+ sentence.append(('', 'no-space'))
else:
- sentence.append("")
- elif line.startswith("</cesAna"):
+ sentence.append('')
+ elif line.startswith('</cesAna'):
pass
def _read_data(self, stream):
self.position = stream.tell()
buff = stream.read(4096)
- lines = buff.split("\n")
+ lines = buff.split('\n')
lines.reverse()
return lines
def _append_space(self, sentence):
if self.show_tags:
- sentence.append((" ", "space"))
+ sentence.append((' ', 'space'))
else:
- sentence.append(" ")
+ sentence.append(' ')
#! /usr/bin/env python
# KNB Corpus reader
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
+from __future__ import print_function
import re
+from six import string_types
from nltk.parse import DependencyGraph
from nltk.corpus.reader.api import SyntaxCorpusReader, CorpusReader
# default function to convert morphlist to str for tree representation
-_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
+_morphs2str_default = lambda morphs: '/'.join(m[0] for m in morphs if m[0] != 'EOS')
class KNBCorpusReader(SyntaxCorpusReader):
"""
- def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
+ def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
"""
Initialize KNBCorpusReader
morphs2str is a function to convert morphlist to str for tree representation
if not re.match(r"EOS|\*|\#|\+", line):
cells = line.strip().split(" ")
# convert cells to morph tuples
- res.append((cells[0], " ".join(cells[1:])))
+ res.append((cells[0], ' '.join(cells[1:])))
return res
dg = DependencyGraph()
i = 0
for line in t.splitlines():
- if line[0] in "*+":
+ if line[0] in '*+':
# start of bunsetsu or tag
cells = line.strip().split(" ", 3)
assert m is not None
node = dg.nodes[i]
- node.update({"address": i, "rel": m.group(2), "word": []})
+ node.update({'address': i, 'rel': m.group(2), 'word': []})
dep_parent = int(m.group(1))
if dep_parent == -1:
dg.root = node
else:
- dg.nodes[dep_parent]["deps"].append(i)
+ dg.nodes[dep_parent]['deps'].append(i)
i += 1
- elif line[0] != "#":
+ elif line[0] != '#':
# normal morph
cells = line.strip().split(" ")
# convert cells to morph tuples
- morph = cells[0], " ".join(cells[1:])
- dg.nodes[i - 1]["word"].append(morph)
+ morph = cells[0], ' '.join(cells[1:])
+ dg.nodes[i - 1]['word'].append(morph)
if self.morphs2str:
for node in dg.nodes.values():
- node["word"] = self.morphs2str(node["word"])
+ node['word'] = self.morphs2str(node['word'])
return dg.tree()
import nltk
from nltk.corpus.util import LazyCorpusLoader
- root = nltk.data.find("corpora/knbc/corpus1")
+ root = nltk.data.find('corpora/knbc/corpus1')
fileids = [
f
for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
]
def _knbc_fileids_sort(x):
- cells = x.split("-")
+ cells = x.split('-')
return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))
knbc = LazyCorpusLoader(
- "knbc/corpus1",
+ 'knbc/corpus1',
KNBCorpusReader,
sorted(fileids, key=_knbc_fileids_sort),
- encoding="euc-jp",
+ encoding='euc-jp',
)
print(knbc.fileids()[:10])
- print("".join(knbc.words()[:100]))
+ print(''.join(knbc.words()[:100]))
- print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))
+ print('\n\n'.join(str(tree) for tree in knbc.parsed_sents()[:2]))
- knbc.morphs2str = lambda morphs: "/".join(
- "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
- ).encode("utf-8")
+ knbc.morphs2str = lambda morphs: '/'.join(
+ "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
+ ).encode('utf-8')
- print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))
+ print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))
print(
- "\n".join(
- " ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent)
+ '\n'.join(
+ ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
for sent in knbc.tagged_sents()[0:2]
)
)
from nltk.corpus.util import LazyCorpusLoader
knbc = LazyCorpusLoader(
- "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
+ 'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp'
)
- assert isinstance(knbc.words()[0], str)
- assert isinstance(knbc.sents()[0][0], str)
+ assert isinstance(knbc.words()[0], string_types)
+ assert isinstance(knbc.sents()[0][0], string_types)
assert isinstance(knbc.tagged_words()[0], tuple)
assert isinstance(knbc.tagged_sents()[0][0], tuple)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Lin's Thesaurus
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Dan Blanchard <dblanchard@ets.org>
# URL: <http://nltk.org/>
# For license information, see LICENSE.txt
+from __future__ import print_function
import re
from collections import defaultdict
@staticmethod
def __defaultdict_factory():
- """ Factory for creating defaultdict of defaultdict(dict)s """
+ ''' Factory for creating defaultdict of defaultdict(dict)s '''
return defaultdict(dict)
def __init__(self, root, badscore=0.0):
- """
+ '''
Initialize the thesaurus.
:param root: root directory containing thesaurus LISP files
:type root: C{string}
:param badscore: the score to give to words which do not appear in each other's sets of synonyms
:type badscore: C{float}
- """
+ '''
- super(LinThesaurusCorpusReader, self).__init__(root, r"sim[A-Z]\.lsp")
+ super(LinThesaurusCorpusReader, self).__init__(root, r'sim[A-Z]\.lsp')
self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
self._badscore = badscore
for path, encoding, fileid in self.abspaths(
line = line.strip()
# Start of entry
if first:
- key = LinThesaurusCorpusReader._key_re.sub(r"\1", line)
+ key = LinThesaurusCorpusReader._key_re.sub(r'\1', line)
first = False
# End of entry
- elif line == "))":
+ elif line == '))':
first = True
# Lines with pairs of ngrams and scores
else:
- split_line = line.split("\t")
+ split_line = line.split('\t')
if len(split_line) == 2:
ngram, score = split_line
self._thesaurus[fileid][key][ngram.strip('"')] = float(
)
def similarity(self, ngram1, ngram2, fileid=None):
- """
+ '''
Returns the similarity score for two ngrams.
:param ngram1: first ngram to compare
:type fileid: C{string}
:return: If fileid is specified, just the score for the two ngrams; otherwise,
list of tuples of fileids and scores.
- """
+ '''
# Entries don't contain themselves, so make sure similarity between item and itself is 1.0
if ngram1 == ngram2:
if fileid:
]
def scored_synonyms(self, ngram, fileid=None):
- """
+ '''
Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram
:param ngram: ngram to lookup
:return: If fileid is specified, list of tuples of scores and synonyms; otherwise,
list of tuples of fileids and lists, where inner lists consist of tuples of
scores and synonyms.
- """
+ '''
if fileid:
return self._thesaurus[fileid][ngram].items()
else:
]
def synonyms(self, ngram, fileid=None):
- """
+ '''
Returns a list of synonyms for the current ngram.
:param ngram: ngram to lookup
:type fileid: C{string}
:return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and
lists, where inner lists contain synonyms.
- """
+ '''
if fileid:
return self._thesaurus[fileid][ngram].keys()
else:
]
def __contains__(self, ngram):
- """
+ '''
Determines whether or not the given ngram is in the thesaurus.
:param ngram: ngram to lookup
:type ngram: C{string}
:return: whether the given ngram is in the thesaurus.
- """
+ '''
return reduce(
lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]),
self._fileids,
print(thes.similarity(word1, word2))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
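# A minimal usage sketch, assuming the Lin thesaurus data has been downloaded
# (nltk.download('lin_thesaurus')); 'car' and 'automobile' are just example ngrams:
from nltk.corpus import lin_thesaurus as thes

print(thes.fileids())                        # e.g. ['simA.lsp', 'simN.lsp', 'simV.lsp']
print(thes.similarity('car', 'automobile'))  # (fileid, score) pairs across all files
print(list(thes.synonyms('car', fileid='simN.lsp'))[:5])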
import re
from functools import reduce
+from six import string_types
+
from nltk.corpus.reader import concat, TaggedCorpusReader
from nltk.corpus.reader.xmldocs import XMLCorpusView
"""
ns = {
- "tei": "http://www.tei-c.org/ns/1.0",
- "xml": "http://www.w3.org/XML/1998/namespace",
+ 'tei': 'http://www.tei-c.org/ns/1.0',
+ 'xml': 'http://www.w3.org/XML/1998/namespace',
}
- tag_ns = "{http://www.tei-c.org/ns/1.0}"
- xml_ns = "{http://www.w3.org/XML/1998/namespace}"
+ tag_ns = '{http://www.tei-c.org/ns/1.0}'
+ xml_ns = '{http://www.w3.org/XML/1998/namespace}'
word_path = "TEI/text/body/div/div/p/s/(w|c)"
sent_path = "TEI/text/body/div/div/p/s"
para_path = "TEI/text/body/div/div/p"
@classmethod
def _sent_elt(cls, elt, context):
- return [cls._word_elt(w, None) for w in xpath(elt, "*", cls.ns)]
+ return [cls._word_elt(w, None) for w in xpath(elt, '*', cls.ns)]
@classmethod
def _para_elt(cls, elt, context):
- return [cls._sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]
+ return [cls._sent_elt(s, None) for s in xpath(elt, '*', cls.ns)]
@classmethod
def _tagged_word_elt(cls, elt, context):
- if "ana" not in elt.attrib:
- return (elt.text, "")
+ if 'ana' not in elt.attrib:
+ return (elt.text, '')
if cls.__tags == "" and cls.__tagset == "msd":
- return (elt.text, elt.attrib["ana"])
+ return (elt.text, elt.attrib['ana'])
elif cls.__tags == "" and cls.__tagset == "universal":
- return (elt.text, MTETagConverter.msd_to_universal(elt.attrib["ana"]))
+ return (elt.text, MTETagConverter.msd_to_universal(elt.attrib['ana']))
else:
- tags = re.compile("^" + re.sub("-", ".", cls.__tags) + ".*$")
- if tags.match(elt.attrib["ana"]):
+ tags = re.compile('^' + re.sub("-", ".", cls.__tags) + '.*$')
+ if tags.match(elt.attrib['ana']):
if cls.__tagset == "msd":
- return (elt.text, elt.attrib["ana"])
+ return (elt.text, elt.attrib['ana'])
else:
return (
elt.text,
- MTETagConverter.msd_to_universal(elt.attrib["ana"]),
+ MTETagConverter.msd_to_universal(elt.attrib['ana']),
)
else:
return None
return list(
filter(
lambda x: x is not None,
- [cls._tagged_word_elt(w, None) for w in xpath(elt, "*", cls.ns)],
+ [cls._tagged_word_elt(w, None) for w in xpath(elt, '*', cls.ns)],
)
)
return list(
filter(
lambda x: x is not None,
- [cls._tagged_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)],
+ [cls._tagged_sent_elt(s, None) for s in xpath(elt, '*', cls.ns)],
)
)
@classmethod
def _lemma_word_elt(cls, elt, context):
- if "lemma" not in elt.attrib:
- return (elt.text, "")
+ if 'lemma' not in elt.attrib:
+ return (elt.text, '')
else:
- return (elt.text, elt.attrib["lemma"])
+ return (elt.text, elt.attrib['lemma'])
@classmethod
def _lemma_sent_elt(cls, elt, context):
- return [cls._lemma_word_elt(w, None) for w in xpath(elt, "*", cls.ns)]
+ return [cls._lemma_word_elt(w, None) for w in xpath(elt, '*', cls.ns)]
@classmethod
def _lemma_para_elt(cls, elt, context):
- return [cls._lemma_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]
+ return [cls._lemma_sent_elt(s, None) for s in xpath(elt, '*', cls.ns)]
def words(self):
return MTECorpusView(
"""
mapping_msd_universal = {
- "A": "ADJ",
- "S": "ADP",
- "R": "ADV",
- "C": "CONJ",
- "D": "DET",
- "N": "NOUN",
- "M": "NUM",
- "Q": "PRT",
- "P": "PRON",
- "V": "VERB",
- ".": ".",
- "-": "X",
+ 'A': 'ADJ',
+ 'S': 'ADP',
+ 'R': 'ADV',
+ 'C': 'CONJ',
+ 'D': 'DET',
+ 'N': 'NOUN',
+ 'M': 'NUM',
+ 'Q': 'PRT',
+ 'P': 'PRON',
+ 'V': 'VERB',
+ '.': '.',
+ '-': 'X',
}
@staticmethod
indicator = tag[0] if not tag[0] == "#" else tag[1]
if not indicator in MTETagConverter.mapping_msd_universal:
- indicator = "-"
+ indicator = '-'
return MTETagConverter.mapping_msd_universal[indicator]
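# A small, hedged illustration of the conversion above ('Ncmsn' and 'Vmip3s'
# are assumed example MSD tags, not drawn from the corpus); only the leading
# category letter matters, and unknown categories fall back to 'X':
from nltk.corpus.reader.mte import MTETagConverter

assert MTETagConverter.msd_to_universal('Ncmsn') == 'NOUN'
assert MTETagConverter.msd_to_universal('Vmip3s') == 'VERB'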
scheme. These tags can be converted to the Universal tagset
"""
- def __init__(self, root=None, fileids=None, encoding="utf8"):
+ def __init__(self, root=None, fileids=None, encoding='utf8'):
"""
Construct a new MTECorpusreader for a set of documents
located at the given root directory. Example usage:
def __fileids(self, fileids):
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
# filter wrong userinput
fileids = filter(lambda x: x in self._fileids, fileids)
# Natural Language Toolkit: NKJP Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Gabriela Kaczka
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import re
import tempfile
+from six import string_types
+
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
HEADER_MODE = 2
RAW_MODE = 3
- def __init__(self, root, fileids=".*"):
+ def __init__(self, root, fileids='.*'):
"""
Corpus reader designed to work with National Corpus of Polish.
See http://nkjp.pl/ for more details about NKJP.
x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
"""
- if isinstance(fileids, str):
- XMLCorpusReader.__init__(self, root, fileids + ".*/header.xml")
+ if isinstance(fileids, string_types):
+ XMLCorpusReader.__init__(self, root, fileids + '.*/header.xml')
else:
XMLCorpusReader.__init__(
- self, root, [fileid + "/header.xml" for fileid in fileids]
+ self, root, [fileid + '/header.xml' for fileid in fileids]
)
self._paths = self.get_paths()
"""
Returns a view specialised for use with particular corpus file.
"""
- mode = kwargs.pop("mode", NKJPCorpusReader.WORDS_MODE)
+ mode = kwargs.pop('mode', NKJPCorpusReader.WORDS_MODE)
if mode is NKJPCorpusReader.WORDS_MODE:
return NKJPCorpus_Morph_View(filename, tags=tags)
elif mode is NKJPCorpusReader.SENTS_MODE:
)
else:
- raise NameError("No such mode!")
+ raise NameError('No such mode!')
def add_root(self, fileid):
"""
Call with specified tags as a list, e.g. tags=['subst', 'comp'].
Returns tagged words in specified fileids.
"""
- tags = kwargs.pop("tags", [])
+ tags = kwargs.pop('tags', [])
return concat(
[
self._view(
header.xml files in NKJP corpus.
"""
self.tagspec = ".*/sourceDesc$"
- XMLCorpusView.__init__(self, filename + "header.xml", self.tagspec)
+ XMLCorpusView.__init__(self, filename + 'header.xml', self.tagspec)
def handle_query(self):
self._open()
return header
def handle_elt(self, elt, context):
- titles = elt.findall("bibl/title")
+ titles = elt.findall('bibl/title')
title = []
if titles:
- title = "\n".join(title.text.strip() for title in titles)
+ title = '\n'.join(title.text.strip() for title in titles)
- authors = elt.findall("bibl/author")
+ authors = elt.findall('bibl/author')
author = []
if authors:
- author = "\n".join(author.text.strip() for author in authors)
+ author = '\n'.join(author.text.strip() for author in authors)
- dates = elt.findall("bibl/date")
+ dates = elt.findall('bibl/date')
date = []
if dates:
- date = "\n".join(date.text.strip() for date in dates)
+ date = '\n'.join(date.text.strip() for date in dates)
- publishers = elt.findall("bibl/publisher")
+ publishers = elt.findall('bibl/publisher')
publisher = []
if publishers:
- publisher = "\n".join(publisher.text.strip() for publisher in publishers)
+ publisher = '\n'.join(publisher.text.strip() for publisher in publishers)
- idnos = elt.findall("bibl/idno")
+ idnos = elt.findall('bibl/idno')
idno = []
if idnos:
- idno = "\n".join(idno.text.strip() for idno in idnos)
+ idno = '\n'.join(idno.text.strip() for idno in idnos)
- notes = elt.findall("bibl/note")
+ notes = elt.findall('bibl/note')
note = []
if notes:
- note = "\n".join(note.text.strip() for note in notes)
+ note = '\n'.join(note.text.strip() for note in notes)
return {
- "title": title,
- "author": author,
- "date": date,
- "publisher": publisher,
- "idno": idno,
- "note": note,
+ 'title': title,
+ 'author': author,
+ 'date': date,
+ 'publisher': publisher,
+ 'idno': idno,
+ 'note': note,
}
def build_preprocessed_file(self):
try:
- fr = open(self.read_file, "r")
+ fr = open(self.read_file, 'r')
fw = self.write_file
- line = " "
+ line = ' '
while len(line):
line = fr.readline()
- x = re.split(r"nkjp:[^ ]* ", line) # in all files
- ret = " ".join(x)
- x = re.split("<nkjp:paren>", ret) # in ann_segmentation.xml
- ret = " ".join(x)
- x = re.split("</nkjp:paren>", ret) # in ann_segmentation.xml
- ret = " ".join(x)
- x = re.split("<choice>", ret) # in ann_segmentation.xml
- ret = " ".join(x)
- x = re.split("</choice>", ret) # in ann_segmentation.xml
- ret = " ".join(x)
+ x = re.split(r'nkjp:[^ ]* ', line) # in all files
+ ret = ' '.join(x)
+ x = re.split('<nkjp:paren>', ret) # in ann_segmentation.xml
+ ret = ' '.join(x)
+ x = re.split('</nkjp:paren>', ret) # in ann_segmentation.xml
+ ret = ' '.join(x)
+ x = re.split('<choice>', ret) # in ann_segmentation.xml
+ ret = ' '.join(x)
+ x = re.split('</choice>', ret) # in ann_segmentation.xml
+ ret = ' '.join(x)
fw.write(ret)
fr.close()
fw.close()
"""
def __init__(self, filename, **kwargs):
- self.tagspec = ".*p/.*s"
+ self.tagspec = '.*p/.*s'
# intersperse NKJPCorpus_Text_View
self.text_view = NKJPCorpus_Text_View(
filename, mode=NKJPCorpus_Text_View.SENTS_MODE
)
self.text_view.handle_query()
# xml preprocessing
- self.xml_tool = XML_Tool(filename, "ann_segmentation.xml")
+ self.xml_tool = XML_Tool(filename, 'ann_segmentation.xml')
# base class init
XMLCorpusView.__init__(
self, self.xml_tool.build_preprocessed_file(), self.tagspec
)
def get_segm_id(self, example_word):
- return example_word.split("(")[1].split(",")[0]
+ return example_word.split('(')[1].split(',')[0]
def get_sent_beg(self, beg_word):
# returns index of beginning letter in sentence
- return int(beg_word.split(",")[1])
+ return int(beg_word.split(',')[1])
def get_sent_end(self, end_word):
# returns index of end letter in sentence
- splitted = end_word.split(")")[0].split(",")
+ splitted = end_word.split(')')[0].split(',')
return int(splitted[1]) + int(splitted[2])
def get_sentences(self, sent_segm):
def handle_elt(self, elt, context):
ret = []
for seg in elt:
- ret.append(seg.get("corresp"))
+ ret.append(seg.get('corresp'))
return ret
RAW_MODE = 1
def __init__(self, filename, **kwargs):
- self.mode = kwargs.pop("mode", 0)
- self.tagspec = ".*/div/ab"
+ self.mode = kwargs.pop('mode', 0)
+ self.tagspec = '.*/div/ab'
self.segm_dict = dict()
# xml preprocessing
- self.xml_tool = XML_Tool(filename, "text.xml")
+ self.xml_tool = XML_Tool(filename, 'text.xml')
# base class init
XMLCorpusView.__init__(
self, self.xml_tool.build_preprocessed_file(), self.tagspec
for part in segm:
txt.append(part)
- return [" ".join([segm for segm in txt])]
+ return [' '.join([segm for segm in txt])]
def get_segm_id(self, elt):
for attr in elt.attrib:
- if attr.endswith("id"):
+ if attr.endswith('id'):
return elt.get(attr)
def handle_elt(self, elt, context):
"""
def __init__(self, filename, **kwargs):
- self.tags = kwargs.pop("tags", None)
- self.tagspec = ".*/seg/fs"
- self.xml_tool = XML_Tool(filename, "ann_morphosyntax.xml")
+ self.tags = kwargs.pop('tags', None)
+ self.tagspec = '.*/seg/fs'
+ self.xml_tool = XML_Tool(filename, 'ann_morphosyntax.xml')
XMLCorpusView.__init__(
self, self.xml_tool.build_preprocessed_file(), self.tagspec
)
raise Exception
def handle_elt(self, elt, context):
- word = ""
+ word = ''
flag = False
is_not_interp = True
# if tags not specified, then always return word
for child in elt:
# get word
- if "name" in child.keys() and child.attrib["name"] == "orth":
+ if 'name' in child.keys() and child.attrib['name'] == 'orth':
for symbol in child:
- if symbol.tag == "string":
+ if symbol.tag == 'string':
word = symbol.text
- elif "name" in child.keys() and child.attrib["name"] == "interps":
+ elif 'name' in child.keys() and child.attrib['name'] == 'interps':
for symbol in child:
- if "type" in symbol.keys() and symbol.attrib["type"] == "lex":
+ if 'type' in symbol.keys() and symbol.attrib['type'] == 'lex':
for symbol2 in symbol:
if (
- "name" in symbol2.keys()
- and symbol2.attrib["name"] == "ctag"
+ 'name' in symbol2.keys()
+ and symbol2.attrib['name'] == 'ctag'
):
for symbol3 in symbol2:
if (
- "value" in symbol3.keys()
+ 'value' in symbol3.keys()
and self.tags is not None
- and symbol3.attrib["value"] in self.tags
+ and symbol3.attrib['value'] in self.tags
):
flag = True
elif (
- "value" in symbol3.keys()
- and symbol3.attrib["value"] == "interp"
+ 'value' in symbol3.keys()
+ and symbol3.attrib['value'] == 'interp'
):
is_not_interp = False
if flag and is_not_interp:
# Natural Language Toolkit: NomBank Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Authors: Paul Bedaride <paul.bedaride@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
from xml.etree import ElementTree
from functools import total_ordering
+from six import string_types
+
from nltk.tree import Tree
from nltk.internals import raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
self,
root,
nomfile,
- framefiles="",
+ framefiles='',
nounsfile=None,
parse_fileid_xform=None,
parse_corpus=None,
- encoding="utf8",
+ encoding='utf8',
):
"""
:param root: The root directory for this corpus.
corresponding to this corpus. These parse trees are
necessary to resolve the tree pointers used by nombank.
"""
-
# If framefiles is specified as a regexp, expand it.
- if isinstance(framefiles, str):
- self._fileids = find_corpus_fileids(root, framefiles)
- self._fileids = list(framefiles)
+ if isinstance(framefiles, string_types):
+ framefiles = find_corpus_fileids(root, framefiles)
+ framefiles = list(framefiles)
# Initialze the corpus reader.
- CorpusReader.__init__(self, root, framefiles, encoding)
+ CorpusReader.__init__(self, root, [nomfile, nounsfile] + framefiles, encoding)
- # Record our nom file & nouns file.
+ # Record our frame fileids & nom file.
self._nomfile = nomfile
+ self._framefiles = framefiles
self._nounsfile = nounsfile
self._parse_fileid_xform = parse_fileid_xform
self._parse_corpus = parse_corpus
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
"""
kwargs = {}
if baseform is not None:
- kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
+ kwargs['instance_filter'] = lambda inst: inst.baseform == baseform
return StreamBackedCorpusView(
self.abspath(self._nomfile),
lambda stream: self._read_instance_block(stream, **kwargs),
"""
:return: the xml description for the given roleset.
"""
- baseform = roleset_id.split(".")[0]
- baseform = baseform.replace("perc-sign", "%")
- baseform = baseform.replace("oneslashonezero", "1/10").replace(
- "1/10", "1-slash-10"
+ baseform = roleset_id.split('.')[0]
+ baseform = baseform.replace('perc-sign', '%')
+ baseform = baseform.replace('oneslashonezero', '1/10').replace(
+ '1/10', '1-slash-10'
)
- framefile = "frames/%s.xml" % baseform
- if framefile not in self.fileids():
- raise ValueError("Frameset file for %s not found" % roleset_id)
+ framefile = 'frames/%s.xml' % baseform
+ if framefile not in self._framefiles:
+ raise ValueError('Frameset file for %s not found' % roleset_id)
# n.b.: The encoding for XML fileids is specified by the file
# itself; so we ignore self._encoding here.
etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
- for roleset in etree.findall("predicate/roleset"):
- if roleset.attrib["id"] == roleset_id:
+ for roleset in etree.findall('predicate/roleset'):
+ if roleset.attrib['id'] == roleset_id:
return roleset
- raise ValueError("Roleset %s not found in %s" % (roleset_id, framefile))
+ raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))
def rolesets(self, baseform=None):
"""
:return: list of xml descriptions for rolesets.
"""
if baseform is not None:
- framefile = "frames/%s.xml" % baseform
- if framefile not in self.fileids():
- raise ValueError("Frameset file for %s not found" % baseform)
+ framefile = 'frames/%s.xml' % baseform
+ if framefile not in self._framefiles:
+ raise ValueError('Frameset file for %s not found' % baseform)
framefiles = [framefile]
else:
- framefiles = self.fileids()
+ framefiles = self._framefiles
rsets = []
for framefile in framefiles:
# n.b.: The encoding for XML fileids is specified by the file
# itself; so we ignore self._encoding here.
etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
- rsets.append(etree.findall("predicate/roleset"))
+ rsets.append(etree.findall('predicate/roleset'))
return LazyConcatenation(rsets)
def nouns(self):
######################################################################
+@python_2_unicode_compatible
class NombankInstance(object):
def __init__(
self,
"""The name of the roleset used by this instance's predicate.
Use ``nombank.roleset() <NombankCorpusReader.roleset>`` to
look up information about the roleset."""
- r = self.baseform.replace("%", "perc-sign")
- r = r.replace("1/10", "1-slash-10").replace("1-slash-10", "oneslashonezero")
- return "%s.%s" % (r, self.sensenumber)
+ r = self.baseform.replace('%', 'perc-sign')
+ r = r.replace('1/10', '1-slash-10').replace('1-slash-10', 'oneslashonezero')
+ return '%s.%s' % (r, self.sensenumber)
def __repr__(self):
- return "<NombankInstance: %s, sent %s, word %s>" % (
+ return '<NombankInstance: %s, sent %s, word %s>' % (
self.fileid,
self.sentnum,
self.wordnum,
)
def __str__(self):
- s = "%s %s %s %s %s" % (
+ s = '%s %s %s %s %s' % (
self.fileid,
self.sentnum,
self.wordnum,
self.baseform,
self.sensenumber,
)
- items = self.arguments + ((self.predicate, "rel"),)
+ items = self.arguments + ((self.predicate, 'rel'),)
for (argloc, argid) in sorted(items):
- s += " %s-%s" % (argloc, argid)
+ s += ' %s-%s' % (argloc, argid)
return s
def _get_tree(self):
def parse(s, parse_fileid_xform=None, parse_corpus=None):
pieces = s.split()
if len(pieces) < 6:
- raise ValueError("Badly formatted nombank line: %r" % s)
+ raise ValueError('Badly formatted nombank line: %r' % s)
# Divide the line into its basic pieces.
(fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5]
args = pieces[5:]
- rel = [args.pop(i) for i, p in enumerate(args) if "-rel" in p]
+ rel = [args.pop(i) for i, p in enumerate(args) if '-rel' in p]
if len(rel) != 1:
- raise ValueError("Badly formatted nombank line: %r" % s)
+ raise ValueError('Badly formatted nombank line: %r' % s)
# Apply the fileid selector, if any.
if parse_fileid_xform is not None:
# Parse the predicate location.
- predloc, predid = rel[0].split("-", 1)
+ predloc, predid = rel[0].split('-', 1)
predicate = NombankTreePointer.parse(predloc)
# Parse the arguments.
arguments = []
for arg in args:
- argloc, argid = arg.split("-", 1)
+ argloc, argid = arg.split('-', 1)
arguments.append((NombankTreePointer.parse(argloc), argid))
# Put it all together.
raise NotImplementedError()
+@python_2_unicode_compatible
class NombankChainTreePointer(NombankPointer):
def __init__(self, pieces):
self.pieces = pieces
``NombankTreePointer`` pointers."""
def __str__(self):
- return "*".join("%s" % p for p in self.pieces)
+ return '*'.join('%s' % p for p in self.pieces)
def __repr__(self):
- return "<NombankChainTreePointer: %s>" % self
+ return '<NombankChainTreePointer: %s>' % self
def select(self, tree):
if tree is None:
- raise ValueError("Parse tree not avaialable")
- return Tree("*CHAIN*", [p.select(tree) for p in self.pieces])
+ raise ValueError('Parse tree not avaialable')
+ return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
+@python_2_unicode_compatible
class NombankSplitTreePointer(NombankPointer):
def __init__(self, pieces):
self.pieces = pieces
all ``NombankTreePointer`` pointers."""
def __str__(self):
- return ",".join("%s" % p for p in self.pieces)
+ return ','.join('%s' % p for p in self.pieces)
def __repr__(self):
- return "<NombankSplitTreePointer: %s>" % self
+ return '<NombankSplitTreePointer: %s>' % self
def select(self, tree):
if tree is None:
- raise ValueError("Parse tree not avaialable")
- return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])
+ raise ValueError('Parse tree not avaialable')
+ return Tree('*SPLIT*', [p.select(tree) for p in self.pieces])
@total_ordering
+@python_2_unicode_compatible
class NombankTreePointer(NombankPointer):
"""
wordnum:height*wordnum:height*...
@staticmethod
def parse(s):
# Deal with chains (xx*yy*zz)
- pieces = s.split("*")
+ pieces = s.split('*')
if len(pieces) > 1:
return NombankChainTreePointer(
[NombankTreePointer.parse(elt) for elt in pieces]
)
# Deal with split args (xx,yy,zz)
- pieces = s.split(",")
+ pieces = s.split(',')
if len(pieces) > 1:
return NombankSplitTreePointer(
[NombankTreePointer.parse(elt) for elt in pieces]
)
# Deal with normal pointers.
- pieces = s.split(":")
+ pieces = s.split(':')
if len(pieces) != 2:
- raise ValueError("bad nombank pointer %r" % s)
+ raise ValueError('bad nombank pointer %r' % s)
return NombankTreePointer(int(pieces[0]), int(pieces[1]))
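# A quick illustration of the pointer notation handled above (the pointer
# strings are made-up examples, not taken from the nombank data):
from nltk.corpus.reader.nombank import NombankTreePointer

print(repr(NombankTreePointer.parse('2:1')))   # NombankTreePointer(2, 1)
print(NombankTreePointer.parse('2:1*5:0'))     # 2:1*5:0  -> a NombankChainTreePointer
print(NombankTreePointer.parse('2:1,5:0'))     # 2:1,5:0  -> a NombankSplitTreePointer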
def __str__(self):
- return "%s:%s" % (self.wordnum, self.height)
+ return '%s:%s' % (self.wordnum, self.height)
def __repr__(self):
- return "NombankTreePointer(%d, %d)" % (self.wordnum, self.height)
+ return 'NombankTreePointer(%d, %d)' % (self.wordnum, self.height)
def __eq__(self, other):
while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
def select(self, tree):
if tree is None:
- raise ValueError("Parse tree not avaialable")
+ raise ValueError('Parse tree not avaialable')
return tree[self.treepos(tree)]
def treepos(self, tree):
given that it points to the given tree.
"""
if tree is None:
- raise ValueError("Parse tree not avaialable")
+ raise ValueError('Parse tree not avaialable')
stack = [tree]
treepos = []
wordnum = 0
while True:
+ # print treepos
+ # print stack[-1]
# tree node:
if isinstance(stack[-1], Tree):
# Select the next child.
# Natural Language Toolkit: NPS Chat Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
import re
import textwrap
if self._wrap_etree:
return concat(
[
- XMLCorpusView(fileid, "Session/Posts/Post", self._wrap_elt)
+ XMLCorpusView(fileid, 'Session/Posts/Post', self._wrap_elt)
for fileid in self.abspaths(fileids)
]
)
else:
return concat(
[
- XMLCorpusView(fileid, "Session/Posts/Post")
+ XMLCorpusView(fileid, 'Session/Posts/Post')
for fileid in self.abspaths(fileids)
]
)
return concat(
[
XMLCorpusView(
- fileid, "Session/Posts/Post/terminals", self._elt_to_words
+ fileid, 'Session/Posts/Post/terminals', self._elt_to_words
)
for fileid in self.abspaths(fileids)
]
return concat(
[
- XMLCorpusView(fileid, "Session/Posts/Post/terminals", reader)
+ XMLCorpusView(fileid, 'Session/Posts/Post/terminals', reader)
for fileid in self.abspaths(fileids)
]
)
return ElementWrapper(elt)
def _elt_to_words(self, elt, handler):
- return [self._simplify_username(t.attrib["word"]) for t in elt.findall("t")]
+ return [self._simplify_username(t.attrib['word']) for t in elt.findall('t')]
def _elt_to_tagged_words(self, elt, handler, tagset=None):
tagged_post = [
- (self._simplify_username(t.attrib["word"]), t.attrib["pos"])
- for t in elt.findall("t")
+ (self._simplify_username(t.attrib['word']), t.attrib['pos'])
+ for t in elt.findall('t')
]
if tagset and tagset != self._tagset:
tagged_post = [
@staticmethod
def _simplify_username(word):
- if "User" in word:
- word = "U" + word.split("User", 1)[1]
+ if 'User' in word:
+ word = 'U' + word.split('User', 1)[1]
elif isinstance(word, bytes):
- word = word.decode("ascii")
+ word = word.decode('ascii')
return word
# Natural Language Toolkit: Opinion Lexicon Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
Comparing Opinions on the Web". Proceedings of the 14th International World
Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.
"""
+from six import string_types
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus.reader.api import *
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat(
[
:return: a list of positive words.
:rtype: list(str)
"""
- return self.words("positive-words.txt")
+ return self.words('positive-words.txt')
def negative(self):
"""
:return: a list of negative words.
:rtype: list(str)
"""
- return self.words("negative-words.txt")
+ return self.words('negative-words.txt')
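# A minimal usage sketch, assuming the lexicon has been downloaded
# (nltk.download('opinion_lexicon')):
from nltk.corpus import opinion_lexicon

print(opinion_lexicon.positive()[:5])   # a few positive words
print(opinion_lexicon.negative()[:5])   # a few negative words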
def _read_word_block(self, stream):
words = []
# Natural Language Toolkit: PanLex Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: David Kamholz <kamholz@panlex.org>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
def __init__(self, root):
- self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor()
+ self._c = sqlite3.connect(os.path.join(root, 'db.sqlite')).cursor()
self._uid_lv = {}
self._lv_uid = {}
- for row in self._c.execute("SELECT uid, lv FROM lv"):
+ for row in self._c.execute('SELECT uid, lv FROM lv'):
self._uid_lv[row[0]] = row[1]
self._lv_uid[row[1]] = row[0]
"""
if lc is None:
- return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall()
+ return self._c.execute('SELECT uid, tt FROM lv ORDER BY uid').fetchall()
else:
return self._c.execute(
- "SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,)
+ 'SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid', (lc,)
).fetchall()
def meanings(self, expr_uid, expr_tt):
if not mn in mn_info:
mn_info[mn] = {
- "uq": i[1],
- "ap": i[2],
- "ui": i[3],
- "ex": {expr_uid: [expr_tt]},
+ 'uq': i[1],
+ 'ap': i[2],
+ 'ui': i[3],
+ 'ex': {expr_uid: [expr_tt]},
}
- if not uid in mn_info[mn]["ex"]:
- mn_info[mn]["ex"][uid] = []
+ if not uid in mn_info[mn]['ex']:
+ mn_info[mn]['ex'][uid] = []
- mn_info[mn]["ex"][uid].append(i[4])
+ mn_info[mn]['ex'][uid].append(i[4])
return [Meaning(mn, mn_info[mn]) for mn in mn_info]
def __init__(self, mn, attr):
super(Meaning, self).__init__(**attr)
- self["mn"] = mn
+ self['mn'] = mn
def id(self):
"""
:return: the meaning's id.
:rtype: int
"""
- return self["mn"]
+ return self['mn']
def quality(self):
"""
:return: the meaning's source's quality (0=worst, 9=best).
:rtype: int
"""
- return self["uq"]
+ return self['uq']
def source(self):
"""
:return: the meaning's source id.
:rtype: int
"""
- return self["ap"]
+ return self['ap']
def source_group(self):
"""
:return: the meaning's source group id.
:rtype: int
"""
- return self["ui"]
+ return self['ui']
def expressions(self):
"""
texts.
:rtype: dict
"""
- return self["ex"]
+ return self['ex']
+++ /dev/null
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: Word List Corpus Reader
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Steven Bird <stevenbird1@gmail.com>
-# Edward Loper <edloper@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-
-from collections import namedtuple, defaultdict
-import re
-
-from nltk.tokenize import line_tokenize
-
-from nltk.corpus.reader.wordlist import WordListCorpusReader
-from nltk.corpus.reader.util import *
-from nltk.corpus.reader.api import *
-
-PanlexLanguage = namedtuple('PanlexLanguage',
- ['panlex_uid', # (1) PanLex UID
- 'iso639', # (2) ISO 639 language code
- 'iso639_type', # (3) ISO 639 language type, see README
- 'script', # (4) normal scripts of expressions
- 'name', # (5) PanLex default name
- 'langvar_uid' # (6) UID of the language variety in which the default name is an expression
- ])
-
-class PanlexSwadeshCorpusReader(WordListCorpusReader):
- """
- This is a class to read the PanLex Swadesh list from
-
- David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
- PanLex: Building a Resource for Panlingual Lexical Translation.
- In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf
-
- License: CC0 1.0 Universal
- https://creativecommons.org/publicdomain/zero/1.0/legalcode
- """
- def __init__(self, *args, **kwargs):
- super(PanlexSwadeshCorpusReader, self).__init__(*args, **kwargs)
- # Find the swadesh size using the fileids' path.
- self.swadesh_size = re.match(r'swadesh([0-9].*)\/', self.fileids()[0]).group(1)
- self._languages = {lang.panlex_uid:lang for lang in self.get_languages()}
- self._macro_langauges = self.get_macrolanguages()
-
- def license(self):
- print('CC0 1.0 Universal')
-
- def readme(self):
- print(self.raw('README'))
-
- def language_codes(self):
- return self._languages.keys()
-
- def get_languages(self):
- for line in self.raw('langs{}.txt'.format(self.swadesh_size)).split('\n'):
- if not line.strip(): # Skip empty lines.
- continue
- yield PanlexLanguage(*line.strip().split('\t'))
-
- def get_macrolanguages(self):
- macro_langauges = defaultdict(list)
- for lang in self._languages.values():
- macro_langauges[lang.iso639].append(lang.panlex_uid)
- return macro_langauges
-
- def words_by_lang(self, lang_code):
- """
- :return: a list of list(str)
- """
- fileid = 'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
- return [concept.split('\t') for concept in self.words(fileid)]
-
- def words_by_iso639(self, iso63_code):
- """
- :return: a list of list(str)
- """
- fileids = ['swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
- for lang_code in self._macro_langauges[iso63_code]]
- return [concept.split('\t') for fileid in fileids for concept in self.words(fileid)]
-
- def entries(self, fileids=None):
- """
- :return: a tuple of words for the specified fileids.
- """
- if not fileids:
- fileids = self.fileids()
-
- wordlists = [self.words(f) for f in fileids]
- return list(zip(*wordlists))
# Natural Language Toolkit:
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from six import string_types
+
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader
-PARA = re.compile(r"<p(?: [^>]*){0,1}>(.*?)</p>")
-SENT = re.compile(r"<s(?: [^>]*){0,1}>(.*?)</s>")
+PARA = re.compile(r'<p(?: [^>]*){0,1}>(.*?)</p>')
+SENT = re.compile(r'<s(?: [^>]*){0,1}>(.*?)</s>')
-TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>")
-WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)</[wc]>")
+TAGGEDWORD = re.compile(r'<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>')
+WORD = re.compile(r'<[wc](?: [^>]*){0,1}>(.*?)</[wc]>')
TYPE = re.compile(r'type="(.*?)"')
ANA = re.compile(r'ana="(.*?)"')
def read_block(self, stream):
block = stream.readlines(self._pagesize)
block = concat(block)
- while (block.count("<text id") > block.count("</text>")) or block.count(
- "<text id"
+ while (block.count('<text id') > block.count('</text>')) or block.count(
+ '<text id'
) == 0:
tmp = stream.readline()
if len(tmp) <= 0:
break
block += tmp
- block = block.replace("\n", "")
+ block = block.replace('\n', '')
textids = TEXTID.findall(block)
if self._textids:
for tid in textids:
if tid not in self._textids:
beg = block.find(tid) - 1
- end = block[beg:].find("</text>") + len("</text>")
+ end = block[beg:].find('</text>') + len('</text>')
block = block[:beg] + block[beg + end :]
output = []
def _parse_tag(self, tag_word_tuple):
(tag, word) = tag_word_tuple
- if tag.startswith("w"):
+ if tag.startswith('w'):
tag = ANA.search(tag).group(1)
else: # tag.startswith('c')
tag = TYPE.search(tag).group(1)
head_len = 2770
def __init__(self, *args, **kwargs):
- if "textid_file" in kwargs:
- self._textids = kwargs["textid_file"]
+ if 'textid_file' in kwargs:
+ self._textids = kwargs['textid_file']
else:
self._textids = None
with open(self._textids) as fp:
for line in fp:
line = line.strip()
- file_id, text_ids = line.split(" ", 1)
+ file_id, text_ids = line.split(' ', 1)
if file_id not in self.fileids():
raise ValueError(
- "In text_id mapping file %s: %s not found"
+ 'In text_id mapping file %s: %s not found'
% (self._textids, file_id)
)
for text_id in text_ids.split(self._delimiter):
def _resolve(self, fileids, categories, textids=None):
tmp = None
if (
- len(list(
+ len(
filter(
lambda accessor: accessor is None, (fileids, categories, textids)
)
- ))
+ )
!= 1
):
raise ValueError(
- "Specify exactly one of: fileids, " "categories or textids"
+ 'Specify exactly one of: fileids, ' 'categories or textids'
)
if fileids is not None:
return self.fileids(categories), None
if textids is not None:
- if isinstance(textids, str):
+ if isinstance(textids, string_types):
textids = [textids]
files = sum((self._t2f[t] for t in textids), [])
tdict = dict()
if fileids is None:
return sorted(self._t2f)
- if isinstance(fileids, str):
+ if isinstance(fileids, string_types):
fileids = [fileids]
return sorted(sum((self._f2t[d] for d in fileids), []))
fileids, textids = self._resolve(fileids, categories, textids)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
if textids:
fileids, textids = self._resolve(fileids, categories, textids)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
if textids:
fileids, textids = self._resolve(fileids, categories, textids)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
if textids:
fileids, textids = self._resolve(fileids, categories, textids)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
if textids:
fileids, textids = self._resolve(fileids, categories, textids)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
if textids:
fileids, textids = self._resolve(fileids, categories, textids)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
if textids:
if len(fileids) == 1:
return XMLCorpusReader.xml(self, fileids[0])
else:
- raise TypeError("Expected a single file")
+ raise TypeError('Expected a single file')
def raw(self, fileids=None, categories=None):
fileids, _ = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
# Natural Language Toolkit: Plaintext Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# Nitin Madnani <nmadnani@umiacs.umd.edu>
root,
fileids,
word_tokenizer=WordPunctTokenizer(),
- sent_tokenizer=nltk.data.LazyLoader("tokenizers/punkt/english.pickle"),
+ sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/english.pickle'),
para_block_reader=read_blankline_block,
- encoding="utf8",
+ encoding='utf8',
):
"""
Construct a new plaintext corpus reader for a set of documents
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
raw_texts = []
for f in fileids:
:rtype: list(list(str))
"""
if self._sent_tokenizer is None:
- raise ValueError("No sentence tokenizer for this corpus")
+ raise ValueError('No sentence tokenizer for this corpus')
return concat(
[
:rtype: list(list(list(str)))
"""
if self._sent_tokenizer is None:
- raise ValueError("No sentence tokenizer for this corpus")
+ raise ValueError('No sentence tokenizer for this corpus')
return concat(
[
def _resolve(self, fileids, categories):
if fileids is not None and categories is not None:
- raise ValueError("Specify fileids or categories, not both")
+ raise ValueError('Specify fileids or categories, not both')
if categories is not None:
return self.fileids(categories)
else:
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
def __init__(self, *args, **kwargs):
CategorizedCorpusReader.__init__(self, kwargs)
- kwargs["sent_tokenizer"] = nltk.data.LazyLoader(
- "tokenizers/punkt/portuguese.pickle"
+ kwargs['sent_tokenizer'] = nltk.data.LazyLoader(
+ 'tokenizers/punkt/portuguese.pickle'
)
PlaintextCorpusReader.__init__(self, *args, **kwargs)
def paras(self, fileids=None):
raise NotImplementedError(
- "The Europarl corpus reader does not support paragraphs. Please use chapters() instead."
+ 'The Europarl corpus reader does not support paragraphs. Please use chapters() instead.'
)
# Natural Language Toolkit: PP Attachment Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
The PP Attachment Corpus is distributed with NLTK with the permission
of the author.
"""
+from __future__ import unicode_literals
+from six import string_types
+
+from nltk import compat
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
+@compat.python_2_unicode_compatible
class PPAttachment(object):
def __init__(self, sent, verb, noun1, prep, noun2, attachment):
self.sent = sent
def __repr__(self):
return (
- "PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, "
- "noun2=%r, attachment=%r)"
+ 'PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, '
+ 'noun2=%r, attachment=%r)'
% (self.sent, self.verb, self.noun1, self.prep, self.noun2, self.attachment)
)
def raw(self, fileids=None):
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
# Natural Language Toolkit: PropBank Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
import re
from functools import total_ordering
from xml.etree import ElementTree
+from six import string_types
+
from nltk.tree import Tree
from nltk.internals import raise_unorderable_types
self,
root,
propfile,
- framefiles="",
+ framefiles='',
verbsfile=None,
parse_fileid_xform=None,
parse_corpus=None,
- encoding="utf8",
+ encoding='utf8',
):
"""
:param root: The root directory for this corpus.
necessary to resolve the tree pointers used by propbank.
"""
# If framefiles is specified as a regexp, expand it.
- if isinstance(framefiles, str):
+ if isinstance(framefiles, string_types):
framefiles = find_corpus_fileids(root, framefiles)
framefiles = list(framefiles)
# Initialze the corpus reader.
"""
kwargs = {}
if baseform is not None:
- kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
+ kwargs['instance_filter'] = lambda inst: inst.baseform == baseform
return StreamBackedCorpusView(
self.abspath(self._propfile),
lambda stream: self._read_instance_block(stream, **kwargs),
"""
:return: the xml description for the given roleset.
"""
- baseform = roleset_id.split(".")[0]
- framefile = "frames/%s.xml" % baseform
+ baseform = roleset_id.split('.')[0]
+ framefile = 'frames/%s.xml' % baseform
if framefile not in self._framefiles:
- raise ValueError("Frameset file for %s not found" % roleset_id)
+ raise ValueError('Frameset file for %s not found' % roleset_id)
# n.b.: The encoding for XML fileids is specified by the file
# itself; so we ignore self._encoding here.
etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
- for roleset in etree.findall("predicate/roleset"):
- if roleset.attrib["id"] == roleset_id:
+ for roleset in etree.findall('predicate/roleset'):
+ if roleset.attrib['id'] == roleset_id:
return roleset
- raise ValueError("Roleset %s not found in %s" % (roleset_id, framefile))
+ raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))
def rolesets(self, baseform=None):
"""
:return: list of xml descriptions for rolesets.
"""
if baseform is not None:
- framefile = "frames/%s.xml" % baseform
+ framefile = 'frames/%s.xml' % baseform
if framefile not in self._framefiles:
- raise ValueError("Frameset file for %s not found" % baseform)
+ raise ValueError('Frameset file for %s not found' % baseform)
framefiles = [framefile]
else:
framefiles = self._framefiles
# n.b.: The encoding for XML fileids is specified by the file
# itself; so we ignore self._encoding here.
etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
- rsets.append(etree.findall("predicate/roleset"))
+ rsets.append(etree.findall('predicate/roleset'))
return LazyConcatenation(rsets)
def verbs(self):
######################################################################
-
+@compat.python_2_unicode_compatible
class PropbankInstance(object):
def __init__(
self,
@property
def baseform(self):
"""The baseform of the predicate."""
- return self.roleset.split(".")[0]
+ return self.roleset.split('.')[0]
@property
def sensenumber(self):
"""The sense number of the predicate."""
- return self.roleset.split(".")[1]
+ return self.roleset.split('.')[1]
@property
def predid(self):
"""Identifier of the predicate."""
- return "rel"
+ return 'rel'
def __repr__(self):
- return "<PropbankInstance: %s, sent %s, word %s>" % (
+ return '<PropbankInstance: %s, sent %s, word %s>' % (
self.fileid,
self.sentnum,
self.wordnum,
)
def __str__(self):
- s = "%s %s %s %s %s %s" % (
+ s = '%s %s %s %s %s %s' % (
self.fileid,
self.sentnum,
self.wordnum,
self.roleset,
self.inflection,
)
- items = self.arguments + ((self.predicate, "rel"),)
+ items = self.arguments + ((self.predicate, 'rel'),)
for (argloc, argid) in sorted(items):
- s += " %s-%s" % (argloc, argid)
+ s += ' %s-%s' % (argloc, argid)
return s
def _get_tree(self):
def parse(s, parse_fileid_xform=None, parse_corpus=None):
pieces = s.split()
if len(pieces) < 7:
- raise ValueError("Badly formatted propbank line: %r" % s)
+ raise ValueError('Badly formatted propbank line: %r' % s)
# Divide the line into its basic pieces.
(fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]
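# Sketch of the expected field layout (illustrative, not verbatim corpus data):
#   'wsj_0001.mrg 0 8 gold say.01 vp--a 0:2-ARG1 7:0-ARG0 8:0-rel'
# -- six fixed fields, then pointer-argument pairs, exactly one of which
# ends in '-rel'.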
- rel = [p for p in pieces[6:] if p.endswith("-rel")]
- args = [p for p in pieces[6:] if not p.endswith("-rel")]
+ rel = [p for p in pieces[6:] if p.endswith('-rel')]
+ args = [p for p in pieces[6:] if not p.endswith('-rel')]
if len(rel) != 1:
- raise ValueError("Badly formatted propbank line: %r" % s)
+ raise ValueError('Badly formatted propbank line: %r' % s)
# Apply the fileid selector, if any.
if parse_fileid_xform is not None:
# Parse the arguments.
arguments = []
for arg in args:
- argloc, argid = arg.split("-", 1)
+ argloc, argid = arg.split('-', 1)
arguments.append((PropbankTreePointer.parse(argloc), argid))
# Put it all together.
raise NotImplementedError()
-
+@compat.python_2_unicode_compatible
class PropbankChainTreePointer(PropbankPointer):
def __init__(self, pieces):
self.pieces = pieces
``PropbankTreePointer`` pointers."""
def __str__(self):
- return "*".join("%s" % p for p in self.pieces)
+ return '*'.join('%s' % p for p in self.pieces)
def __repr__(self):
- return "<PropbankChainTreePointer: %s>" % self
+ return '<PropbankChainTreePointer: %s>' % self
def select(self, tree):
if tree is None:
- raise ValueError("Parse tree not avaialable")
- return Tree("*CHAIN*", [p.select(tree) for p in self.pieces])
-
+ raise ValueError('Parse tree not available')
+ return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
+@compat.python_2_unicode_compatible
class PropbankSplitTreePointer(PropbankPointer):
def __init__(self, pieces):
self.pieces = pieces
all ``PropbankTreePointer`` pointers."""
def __str__(self):
- return ",".join("%s" % p for p in self.pieces)
+ return ','.join('%s' % p for p in self.pieces)
def __repr__(self):
- return "<PropbankSplitTreePointer: %s>" % self
+ return '<PropbankSplitTreePointer: %s>' % self
def select(self, tree):
if tree is None:
- raise ValueError("Parse tree not avaialable")
- return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])
+ raise ValueError('Parse tree not available')
+ return Tree('*SPLIT*', [p.select(tree) for p in self.pieces])
@total_ordering
-
+@compat.python_2_unicode_compatible
class PropbankTreePointer(PropbankPointer):
"""
wordnum:height*wordnum:height*...
@staticmethod
def parse(s):
# Deal with chains (xx*yy*zz)
- pieces = s.split("*")
+ pieces = s.split('*')
if len(pieces) > 1:
return PropbankChainTreePointer(
[PropbankTreePointer.parse(elt) for elt in pieces]
)
# Deal with split args (xx,yy,zz)
- pieces = s.split(",")
+ pieces = s.split(',')
if len(pieces) > 1:
return PropbankSplitTreePointer(
[PropbankTreePointer.parse(elt) for elt in pieces]
)
# Deal with normal pointers.
- pieces = s.split(":")
+ pieces = s.split(':')
if len(pieces) != 2:
- raise ValueError("bad propbank pointer %r" % s)
+ raise ValueError('bad propbank pointer %r' % s)
return PropbankTreePointer(int(pieces[0]), int(pieces[1]))
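# Illustrative behaviour of parse() as written above:
#   parse('1:2')     -> PropbankTreePointer(1, 2)
#   parse('1:1*5:0') -> PropbankChainTreePointer of two tree pointers
#   parse('1:0,3:0') -> PropbankSplitTreePointer of two tree pointers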
def __str__(self):
- return "%s:%s" % (self.wordnum, self.height)
+ return '%s:%s' % (self.wordnum, self.height)
def __repr__(self):
- return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)
+ return 'PropbankTreePointer(%d, %d)' % (self.wordnum, self.height)
def __eq__(self, other):
while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
def select(self, tree):
if tree is None:
- raise ValueError("Parse tree not avaialable")
+ raise ValueError('Parse tree not available')
return tree[self.treepos(tree)]
def treepos(self, tree):
given that it points to the given tree.
"""
if tree is None:
- raise ValueError("Parse tree not avaialable")
+ raise ValueError('Parse tree not available')
stack = [tree]
treepos = []
wordnum = 0
while True:
+ # print treepos
+ # print stack[-1]
# tree node:
if isinstance(stack[-1], Tree):
# Select the next child.
stack.pop()
-
+@compat.python_2_unicode_compatible
class PropbankInflection(object):
# { Inflection Form
- INFINITIVE = "i"
- GERUND = "g"
- PARTICIPLE = "p"
- FINITE = "v"
+ INFINITIVE = 'i'
+ GERUND = 'g'
+ PARTICIPLE = 'p'
+ FINITE = 'v'
# { Inflection Tense
- FUTURE = "f"
- PAST = "p"
- PRESENT = "n"
+ FUTURE = 'f'
+ PAST = 'p'
+ PRESENT = 'n'
# { Inflection Aspect
- PERFECT = "p"
- PROGRESSIVE = "o"
- PERFECT_AND_PROGRESSIVE = "b"
+ PERFECT = 'p'
+ PROGRESSIVE = 'o'
+ PERFECT_AND_PROGRESSIVE = 'b'
# { Inflection Person
- THIRD_PERSON = "3"
+ THIRD_PERSON = '3'
# { Inflection Voice
- ACTIVE = "a"
- PASSIVE = "p"
+ ACTIVE = 'a'
+ PASSIVE = 'p'
# { Inflection
- NONE = "-"
+ NONE = '-'
# }
- def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
+ def __init__(self, form='-', tense='-', aspect='-', person='-', voice='-'):
self.form = form
self.tense = tense
self.aspect = aspect
return self.form + self.tense + self.aspect + self.person + self.voice
def __repr__(self):
- return "<PropbankInflection: %s>" % self
+ return '<PropbankInflection: %s>' % self
- _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")
+ _VALIDATE = re.compile(r'[igpv\-][fpn\-][pob\-][3\-][ap\-]$')
@staticmethod
def parse(s):
- if not isinstance(s, str):
- raise TypeError("expected a string")
+ if not isinstance(s, string_types):
+ raise TypeError('expected a string')
if len(s) != 5 or not PropbankInflection._VALIDATE.match(s):
- raise ValueError("Bad propbank inflection string %r" % s)
+ raise ValueError('Bad propbank inflection string %r' % s)
return PropbankInflection(*s)
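# Illustrative use of parse(), based only on the constants above:
#   PropbankInflection.parse('vp--a') -> form FINITE ('v'), tense PAST ('p'),
#   no aspect or person, voice ACTIVE ('a').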
# Natural Language Toolkit: Pros and Cons Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
import re
+from six import string_types
+
from nltk.corpus.reader.api import *
from nltk.tokenize import *
root,
fileids,
word_tokenizer=WordPunctTokenizer(),
- encoding="utf8",
+ encoding='utf8',
**kwargs
):
"""
fileids = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat(
[
fileids = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat(
[
def _resolve(self, fileids, categories):
if fileids is not None and categories is not None:
- raise ValueError("Specify fileids or categories, not both")
+ raise ValueError('Specify fileids or categories, not both')
if categories is not None:
return self.fileids(categories)
else:
# Natural Language Toolkit: Product Reviews Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
consideration.
"""
+from __future__ import division
+
import re
+from six import string_types
+
from nltk.corpus.reader.api import *
from nltk.tokenize import *
-TITLE = re.compile(r"^\[t\](.*)$") # [t] Title
+TITLE = re.compile(r'^\[t\](.*)$') # [t] Title
FEATURES = re.compile(
- r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]"
+ r'((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]'
) # find 'feature' in feature[+3]
-NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]") # find 'p' in camera[+2][p]
-SENT = re.compile(r"##(.*)$") # find tokenized sentence
+NOTES = re.compile(r'\[(?!t)(p|u|s|cc|cs)\]') # find 'p' in camera[+2][p]
+SENT = re.compile(r'##(.*)$') # find tokenized sentence
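# Illustrative match (sketch): FEATURES.findall('picture quality[+2]') would
# yield [('picture quality', '+2')] -- the feature phrase and its signed
# score, as described in the comment above.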
+@compat.python_2_unicode_compatible
class Review(object):
"""
A Review is the main block of a ReviewsCorpusReader.
return [review_line.sent for review_line in self.review_lines]
def __repr__(self):
- return 'Review(title="{}", review_lines={})'.format(
+ return 'Review(title=\"{}\", review_lines={})'.format(
self.title, self.review_lines
)
+@compat.python_2_unicode_compatible
class ReviewLine(object):
"""
A ReviewLine represents a sentence of the review, together with (optional)
self.notes = notes
def __repr__(self):
- return "ReviewLine(features={}, notes={}, sent={})".format(
+ return 'ReviewLine(features={}, notes={}, sent={})'.format(
self.features, self.notes, self.sent
)
We can compute stats for specific product features:
+ >>> from __future__ import division
>>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
>>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
+ >>> # True division (enabled by the __future__ import above) keeps this result correct on Python 2.7
>>> mean = tot / n_reviews
>>> print(n_reviews, tot, mean)
15 24 1.6
CorpusView = StreamBackedCorpusView
def __init__(
- self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8"
+ self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding='utf8'
):
"""
:param root: The root directory for the corpus.
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat(
[
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
# Natural Language Toolkit: RTE Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
challenge number and 'n' is the pair ID.
"""
+from __future__ import unicode_literals
+
+from six import string_types
+
+from nltk import compat
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import *
return valdict[value_string.upper()]
+@compat.python_2_unicode_compatible
class RTEPair(object):
"""
Container for RTE text-hypothesis pairs.
def __repr__(self):
if self.challenge:
- return "<RTEPair: gid=%s-%s>" % (self.challenge, self.id)
+ return '<RTEPair: gid=%s-%s>' % (self.challenge, self.id)
else:
- return "<RTEPair: id=%s>" % self.id
+ return '<RTEPair: id=%s>' % self.id
class RTECorpusReader(XMLCorpusReader):
:rtype: list(RTEPair)
"""
try:
- challenge = doc.attrib["challenge"]
+ challenge = doc.attrib['challenge']
except KeyError:
challenge = None
return [RTEPair(pair, challenge=challenge) for pair in doc.getiterator("pair")]
:type: list
:rtype: list(RTEPair)
"""
- if isinstance(fileids, str):
+ if isinstance(fileids, string_types):
fileids = [fileids]
return concat([self._read_etree(self.xml(fileid)) for fileid in fileids])
# Natural Language Toolkit: SemCor Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the SemCor Corpus.
"""
+from __future__ import absolute_import, unicode_literals
-__docformat__ = "epytext en"
+__docformat__ = 'epytext en'
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
:return: the given file(s) as a list of words and punctuation symbols.
:rtype: list(str)
"""
- return self._items(fileids, "word", False, False, False)
+ return self._items(fileids, 'word', False, False, False)
def chunks(self, fileids=None):
"""
that form a unit.
:rtype: list(list(str))
"""
- return self._items(fileids, "chunk", False, False, False)
+ return self._items(fileids, 'chunk', False, False, False)
- def tagged_chunks(self, fileids=None, tag=("pos" or "sem" or "both")):
+ def tagged_chunks(self, fileids=None, tag=('pos' or 'sem' or 'both')):
"""
:return: the given file(s) as a list of tagged chunks, represented
in tree form.
have no lemma. Other chunks not in WordNet have no semantic tag.
Punctuation tokens have `None` for their part of speech tag.)
"""
- return self._items(fileids, "chunk", False, tag != "sem", tag != "pos")
+ return self._items(fileids, 'chunk', False, tag != 'sem', tag != 'pos')
def sents(self, fileids=None):
"""
as a list of word strings.
:rtype: list(list(str))
"""
- return self._items(fileids, "word", True, False, False)
+ return self._items(fileids, 'word', True, False, False)
def chunk_sents(self, fileids=None):
"""
as a list of chunks.
:rtype: list(list(list(str)))
"""
- return self._items(fileids, "chunk", True, False, False)
+ return self._items(fileids, 'chunk', True, False, False)
- def tagged_sents(self, fileids=None, tag=("pos" or "sem" or "both")):
+ def tagged_sents(self, fileids=None, tag=('pos' or 'sem' or 'both')):
"""
:return: the given file(s) as a list of sentences. Each sentence
is represented as a list of tagged chunks (in tree form).
have no lemma. Other chunks not in WordNet have no semantic tag.
Punctuation tokens have `None` for their part of speech tag.)
"""
- return self._items(fileids, "chunk", True, tag != "sem", tag != "pos")
+ return self._items(fileids, 'chunk', True, tag != 'sem', tag != 'pos')
def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
- if unit == "word" and not bracket_sent:
+ if unit == 'word' and not bracket_sent:
# the result of the SemcorWordView may be a multiword unit, so the
# LazyConcatenation will make sure the sentence is flattened
_ = lambda *args: LazyConcatenation(
:param sem_tag: Whether to include semantic tags, namely WordNet lemma
and OOV named entity status.
"""
- assert unit in ("token", "word", "chunk")
+ assert unit in ('token', 'word', 'chunk')
result = []
xmldoc = ElementTree.parse(fileid).getroot()
- for xmlsent in xmldoc.findall(".//s"):
+ for xmlsent in xmldoc.findall('.//s'):
sent = []
for xmlword in _all_xmlwords_in(xmlsent):
itm = SemcorCorpusReader._word(
xmlword, unit, pos_tag, sem_tag, self._wordnet
)
- if unit == "word":
+ if unit == 'word':
sent.extend(itm)
else:
sent.append(itm)
if bracket_sent:
- result.append(SemcorSentence(xmlsent.attrib["snum"], sent))
+ result.append(SemcorSentence(xmlsent.attrib['snum'], sent))
else:
result.extend(sent)
if not tkn:
tkn = "" # fixes issue 337?
- lemma = xmlword.get("lemma", tkn) # lemma or NE class
- lexsn = xmlword.get("lexsn") # lex_sense (locator for the lemma's sense)
+ lemma = xmlword.get('lemma', tkn) # lemma or NE class
+ lexsn = xmlword.get('lexsn') # lex_sense (locator for the lemma's sense)
if lexsn is not None:
- sense_key = lemma + "%" + lexsn
- wnpos = ("n", "v", "a", "r", "s")[
- int(lexsn.split(":")[0]) - 1
+ sense_key = lemma + '%' + lexsn
+ wnpos = ('n', 'v', 'a', 'r', 's')[
+ int(lexsn.split(':')[0]) - 1
] # see http://wordnet.princeton.edu/man/senseidx.5WN.html
else:
sense_key = wnpos = None
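# Sketch (hypothetical values): with lemma='live' and lexsn='2:42:08::',
# sense_key becomes 'live%2:42:08::' and wnpos is 'v', since the leading '2'
# in the lex_sense field selects the verb slot of ('n', 'v', 'a', 'r', 's').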
redef = xmlword.get(
- "rdf", tkn
+ 'rdf', tkn
) # redefinition--this indicates the lookup string
# does not exactly match the enclosed string, e.g. due to typographical adjustments
# or discontinuity of a multiword expression. If a redefinition has occurred,
# the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
# For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
- sensenum = xmlword.get("wnsn") # WordNet sense number
- isOOVEntity = "pn" in xmlword.keys() # a "personal name" (NE) not in WordNet
+ sensenum = xmlword.get('wnsn') # WordNet sense number
+ isOOVEntity = 'pn' in xmlword.keys() # a "personal name" (NE) not in WordNet
pos = xmlword.get(
- "pos"
+ 'pos'
) # part of speech for the whole chunk (None for punctuation)
- if unit == "token":
+ if unit == 'token':
if not pos_tag and not sem_tag:
itm = tkn
else:
)
return itm
else:
- ww = tkn.split("_") # TODO: case where punctuation intervenes in MWE
- if unit == "word":
+ ww = tkn.split('_') # TODO: case where punctuation intervenes in MWE
+ if unit == 'word':
return ww
else:
if sensenum is not None:
# nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
# solution: just use the lemma name as a string
try:
- sense = "%s.%s.%02d" % (
+ sense = '%s.%s.%02d' % (
lemma,
wnpos,
int(sensenum),
) # e.g.: reach.v.02
except ValueError:
sense = (
- lemma + "." + wnpos + "." + sensenum
+ lemma + '.' + wnpos + '.' + sensenum
) # e.g. the sense number may be "2;1"
bottom = [Tree(pos, ww)] if pos_tag else ww
if sem_tag and isOOVEntity:
if sensenum is not None:
- return Tree(sense, [Tree("NE", bottom)])
+ return Tree(sense, [Tree('NE', bottom)])
else: # 'other' NE
- return Tree("NE", bottom)
+ return Tree('NE', bottom)
elif sem_tag and sensenum is not None:
return Tree(sense, bottom)
elif pos_tag:
if result is None:
result = []
for child in elt:
- if child.tag in ("wf", "punc"):
+ if child.tag in ('wf', 'punc'):
result.append(child)
else:
_all_xmlwords_in(child, result)
and OOV named entity status.
"""
if bracket_sent:
- tagspec = ".*/s"
+ tagspec = '.*/s'
else:
- tagspec = ".*/s/(punc|wf)"
+ tagspec = '.*/s/(punc|wf)'
self._unit = unit
self._sent = bracket_sent
def handle_sent(self, elt):
sent = []
for child in elt:
- if child.tag in ("wf", "punc"):
+ if child.tag in ('wf', 'punc'):
itm = self.handle_word(child)
- if self._unit == "word":
+ if self._unit == 'word':
sent.extend(itm)
else:
sent.append(itm)
else:
- raise ValueError("Unexpected element %s" % child.tag)
- return SemcorSentence(elt.attrib["snum"], sent)
+ raise ValueError('Unexpected element %s' % child.tag)
+ return SemcorSentence(elt.attrib['snum'], sent)
# Natural Language Toolkit: Senseval 2 Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Steven Bird <stevenbird1@gmail.com> (modifications)
# URL: <http://nltk.org/>
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
is tagged with a sense identifier, and supplied with context.
"""
+from __future__ import print_function, unicode_literals
import re
from xml.etree import ElementTree
+from six import string_types
+
+from nltk import compat
from nltk.tokenize import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
+@compat.python_2_unicode_compatible
class SensevalInstance(object):
def __init__(self, word, position, context, senses):
self.word = word
self.context = context
def __repr__(self):
- return "SensevalInstance(word=%r, position=%r, " "context=%r, senses=%r)" % (
+ return 'SensevalInstance(word=%r, position=%r, ' 'context=%r, senses=%r)' % (
self.word,
self.position,
self.context,
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def _entry(self, tree):
elts = []
- for lexelt in tree.findall("lexelt"):
- for inst in lexelt.findall("instance"):
- sense = inst[0].attrib["senseid"]
- context = [(w.text, w.attrib["pos"]) for w in inst[1]]
+ for lexelt in tree.findall('lexelt'):
+ for inst in lexelt.findall('instance'):
+ sense = inst[0].attrib['senseid']
+ context = [(w.text, w.attrib['pos']) for w in inst[1]]
elts.append((sense, context))
return elts
in_instance = False
while True:
line = stream.readline()
- if line == "":
+ if line == '':
assert instance_lines == []
return []
# Start of a lexical element?
- if line.lstrip().startswith("<lexelt"):
+ if line.lstrip().startswith('<lexelt'):
lexelt_num += 1
- m = re.search("item=(\"[^\"]+\"|'[^']+')", line)
+ m = re.search('item=("[^"]+"|\'[^\']+\')', line)
assert m is not None # <lexelt> has no 'item=...'
lexelt = m.group(1)[1:-1]
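# e.g. (illustrative): a line such as '<lexelt item="hard-a">' yields
# lexelt == 'hard-a'; the surrounding quotes are stripped by the [1:-1] slice above.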
if lexelt_num < len(self._lexelts):
self._lexelt_starts.append(stream.tell())
# Start of an instance?
- if line.lstrip().startswith("<instance"):
+ if line.lstrip().startswith('<instance'):
assert instance_lines == []
in_instance = True
instance_lines.append(line)
# End of an instance?
- if line.lstrip().startswith("</instance"):
- xml_block = "\n".join(instance_lines)
+ if line.lstrip().startswith('</instance'):
+ xml_block = '\n'.join(instance_lines)
xml_block = _fixXML(xml_block)
inst = ElementTree.fromstring(xml_block)
return [self._parse_instance(inst, lexelt)]
context = []
position = None
for child in instance:
- if child.tag == "answer":
- senses.append(child.attrib["senseid"])
- elif child.tag == "context":
+ if child.tag == 'answer':
+ senses.append(child.attrib['senseid'])
+ elif child.tag == 'context':
context += self._word_tokenizer.tokenize(child.text)
for cword in child:
- if cword.tag == "compound":
+ if cword.tag == 'compound':
cword = cword[0] # is this ok to do?
- if cword.tag == "head":
+ if cword.tag == 'head':
# Some sanity checks:
- assert position is None, "head specified twice"
+ assert position is None, 'head specified twice'
assert cword.text.strip() or len(cword) == 1
assert not (cword.text.strip() and len(cword) == 1)
# Record the position of the head:
# Add on the head word itself:
if cword.text.strip():
context.append(cword.text.strip())
- elif cword[0].tag == "wf":
- context.append((cword[0].text, cword[0].attrib["pos"]))
+ elif cword[0].tag == 'wf':
+ context.append((cword[0].text, cword[0].attrib['pos']))
if cword[0].tail:
context += self._word_tokenizer.tokenize(cword[0].tail)
else:
- assert False, "expected CDATA or wf in <head>"
- elif cword.tag == "wf":
- context.append((cword.text, cword.attrib["pos"]))
- elif cword.tag == "s":
+ assert False, 'expected CDATA or wf in <head>'
+ elif cword.tag == 'wf':
+ context.append((cword.text, cword.attrib['pos']))
+ elif cword.tag == 's':
pass # Sentence boundary marker.
else:
- print("ACK", cword.tag)
- assert False, "expected CDATA or <wf> or <head>"
+ print('ACK', cword.tag)
+ assert False, 'expected CDATA or <wf> or <head>'
if cword.tail:
context += self._word_tokenizer.tokenize(cword.tail)
else:
- assert False, "unexpected tag %s" % child.tag
+ assert False, 'unexpected tag %s' % child.tag
return SensevalInstance(lexelt, position, context, senses)
Fix the various issues with Senseval pseudo-XML.
"""
# <~> or <^> => ~ or ^
- text = re.sub(r"<([~\^])>", r"\1", text)
+ text = re.sub(r'<([~\^])>', r'\1', text)
# fix lone &
- text = re.sub(r"(\s+)\&(\s+)", r"\1&\2", text)
+ text = re.sub(r'(\s+)\&(\s+)', r'\1&\2', text)
# fix """
- text = re.sub(r'"""', "'\"'", text)
+ text = re.sub(r'"""', '\'"\'', text)
# fix <s snum=dd> => <s snum="dd"/>
text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
# fix foreign word tag
- text = re.sub(r"<\&frasl>\s*<p[^>]*>", "FRASL", text)
+ text = re.sub(r'<\&frasl>\s*<p[^>]*>', 'FRASL', text)
# remove <&I .>
- text = re.sub(r"<\&I[^>]*>", "", text)
+ text = re.sub(r'<\&I[^>]*>', '', text)
# fix <{word}>
- text = re.sub(r"<{([^}]+)}>", r"\1", text)
+ text = re.sub(r'<{([^}]+)}>', r'\1', text)
# remove <@>, <p>, </p>
- text = re.sub(r"<(@|/?p)>", r"", text)
+ text = re.sub(r'<(@|/?p)>', r'', text)
# remove <&M .> and <&T .> and <&Ms .>
- text = re.sub(r"<&\w+ \.>", r"", text)
+ text = re.sub(r'<&\w+ \.>', r'', text)
# remove <!DOCTYPE... > lines
- text = re.sub(r"<!DOCTYPE[^>]*>", r"", text)
+ text = re.sub(r'<!DOCTYPE[^>]*>', r'', text)
# remove <[hi]> and <[/p]> etc
- text = re.sub(r"<\[\/?[^>]+\]*>", r"", text)
+ text = re.sub(r'<\[\/?[^>]+\]*>', r'', text)
# take the thing out of the brackets: <&hellip;>
- text = re.sub(r"<(\&\w+;)>", r"\1", text)
+ text = re.sub(r'<(\&\w+;)>', r'\1', text)
# and remove the & for those patterns that aren't regular XML
- text = re.sub(r"&(?!amp|gt|lt|apos|quot)", r"", text)
+ text = re.sub(r'&(?!amp|gt|lt|apos|quot)', r'', text)
# fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
text = re.sub(
r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>', text
# -*- coding: utf-8 -*-
# Natural Language Toolkit: SentiWordNet
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
import re
-
+from nltk.compat import python_2_unicode_compatible
from nltk.corpus.reader import CorpusReader
+@python_2_unicode_compatible
class SentiWordNetCorpusReader(CorpusReader):
- def __init__(self, root, fileids, encoding="utf-8"):
+ def __init__(self, root, fileids, encoding='utf-8'):
"""
Construct a new SentiWordNet Corpus Reader, using data from
the specified file.
"""
super(SentiWordNetCorpusReader, self).__init__(root, fileids, encoding=encoding)
if len(self._fileids) != 1:
- raise ValueError("Exactly one file must be specified")
+ raise ValueError('Exactly one file must be specified')
self._db = {}
self._parse_src_file()
try:
pos, offset, pos_score, neg_score, synset_terms, gloss = fields
except:
- raise ValueError("Line %s formatted incorrectly: %s\n" % (i, line))
+ raise ValueError('Line %s formatted incorrectly: %s\n' % (i, line))
if pos and offset:
offset = int(offset)
self._db[(pos, offset)] = (float(pos_score), float(neg_score))
if tuple(vals) in self._db:
pos_score, neg_score = self._db[tuple(vals)]
pos, offset = vals
- if pos == "s":
- pos = "a"
+ if pos == 's':
+ pos = 'a'
synset = wn.synset_from_pos_and_offset(pos, offset)
return SentiSynset(pos_score, neg_score, synset)
else:
synset = wn.synset(vals[0])
pos = synset.pos()
- if pos == "s":
- pos = "a"
+ if pos == 's':
+ pos = 'a'
offset = synset.offset()
if (pos, offset) in self._db:
pos_score, neg_score = self._db[(pos, offset)]
yield SentiSynset(pos_score, neg_score, synset)
+@python_2_unicode_compatible
class SentiSynset(object):
def __init__(self, pos_score, neg_score, synset):
self._pos_score = pos_score
# Natural Language Toolkit: Sinica Treebank Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
-IDENTIFIER = re.compile(r"^#\S+\s")
-APPENDIX = re.compile(r"(?<=\))#.*$")
-TAGWORD = re.compile(r":([^:()|]+):([^:()|]+)")
-WORD = re.compile(r":[^:()|]+:([^:()|]+)")
+IDENTIFIER = re.compile(r'^#\S+\s')
+APPENDIX = re.compile(r'(?<=\))#.*$')
+TAGWORD = re.compile(r':([^:()|]+):([^:()|]+)')
+WORD = re.compile(r':[^:()|]+:([^:()|]+)')
class SinicaTreebankCorpusReader(SyntaxCorpusReader):
def _read_block(self, stream):
sent = stream.readline()
- sent = IDENTIFIER.sub("", sent)
- sent = APPENDIX.sub("", sent)
+ sent = IDENTIFIER.sub('', sent)
+ sent = APPENDIX.sub('', sent)
return [sent]
def _parse(self, sent):
# Natural Language Toolkit: String Category Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
"""
# based on PPAttachmentCorpusReader
+from six import string_types
+
+from nltk import compat
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
# in nltk, we use the form (data, tag) -- e.g., tagged words and
# labeled texts for classifiers.
class StringCategoryCorpusReader(CorpusReader):
- def __init__(self, root, fileids, delimiter=" ", encoding="utf8"):
+ def __init__(self, root, fileids, delimiter=' ', encoding='utf8'):
"""
:param root: The root directory for this corpus.
:param fileids: A list or regexp specifying the fileids in this corpus.
def tuples(self, fileids=None):
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat(
[
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
# Natural Language Toolkit: Switchboard Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
import re
from nltk.tag import str2tuple, map_tag
+from nltk import compat
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
+@compat.python_2_unicode_compatible
class SwitchboardTurn(list):
"""
A specialized list object used to encode switchboard utterances.
def __repr__(self):
if len(self) == 0:
- text = ""
+ text = ''
elif isinstance(self[0], tuple):
- text = " ".join("%s/%s" % w for w in self)
+ text = ' '.join('%s/%s' % w for w in self)
else:
- text = " ".join(self)
- return "<%s.%s: %r>" % (self.speaker, self.id, text)
+ text = ' '.join(self)
+ return '<%s.%s: %r>' % (self.speaker, self.id, text)
class SwitchboardCorpusReader(CorpusReader):
- _FILES = ["tagged"]
+ _FILES = ['tagged']
# Use the "tagged" file even for non-tagged data methods, since
# it's tokenized.
self._tagset = tagset
def words(self):
- return StreamBackedCorpusView(self.abspath("tagged"), self._words_block_reader)
+ return StreamBackedCorpusView(self.abspath('tagged'), self._words_block_reader)
def tagged_words(self, tagset=None):
def tagged_words_block_reader(stream):
return self._tagged_words_block_reader(stream, tagset)
- return StreamBackedCorpusView(self.abspath("tagged"), tagged_words_block_reader)
+ return StreamBackedCorpusView(self.abspath('tagged'), tagged_words_block_reader)
def turns(self):
- return StreamBackedCorpusView(self.abspath("tagged"), self._turns_block_reader)
+ return StreamBackedCorpusView(self.abspath('tagged'), self._turns_block_reader)
def tagged_turns(self, tagset=None):
def tagged_turns_block_reader(stream):
return self._tagged_turns_block_reader(stream, tagset)
- return StreamBackedCorpusView(self.abspath("tagged"), tagged_turns_block_reader)
+ return StreamBackedCorpusView(self.abspath('tagged'), tagged_turns_block_reader)
def discourses(self):
return StreamBackedCorpusView(
- self.abspath("tagged"), self._discourses_block_reader
+ self.abspath('tagged'), self._discourses_block_reader
)
def tagged_discourses(self, tagset=False):
return self._tagged_discourses_block_reader(stream, tagset)
return StreamBackedCorpusView(
- self.abspath("tagged"), tagged_discourses_block_reader
+ self.abspath('tagged'), tagged_discourses_block_reader
)
def _discourses_block_reader(self, stream):
[
self._parse_utterance(u, include_tag=False)
for b in read_blankline_block(stream)
- for u in b.split("\n")
+ for u in b.split('\n')
if u.strip()
]
]
[
self._parse_utterance(u, include_tag=True, tagset=tagset)
for b in read_blankline_block(stream)
- for u in b.split("\n")
+ for u in b.split('\n')
if u.strip()
]
]
def _tagged_words_block_reader(self, stream, tagset=None):
return sum(self._tagged_discourses_block_reader(stream, tagset)[0], [])
- _UTTERANCE_RE = re.compile("(\w+)\.(\d+)\:\s*(.*)")
- _SEP = "/"
+ _UTTERANCE_RE = re.compile('(\w+)\.(\d+)\:\s*(.*)')
+ _SEP = '/'
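# Sketch of an utterance line (hypothetical content, layout per the regexp
# above): 'A.1: How/WRB are/VBP you/PRP ?/.' -- speaker 'A', id '1', and
# word/tag pairs split on _SEP by str2tuple() below.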
def _parse_utterance(self, utterance, include_tag, tagset=None):
m = self._UTTERANCE_RE.match(utterance)
if m is None:
- raise ValueError("Bad utterance %r" % utterance)
+ raise ValueError('Bad utterance %r' % utterance)
speaker, id, text = m.groups()
words = [str2tuple(s, self._SEP) for s in text.split()]
if not include_tag:
# Natural Language Toolkit: Tagged Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Jacob Perkins <japerk@gmail.com>
import os
+from six import string_types
+
from nltk.tag import str2tuple, map_tag
from nltk.tokenize import *
self,
root,
fileids,
- sep="/",
+ sep='/',
word_tokenizer=WhitespaceTokenizer(),
- sent_tokenizer=RegexpTokenizer("\n", gaps=True),
+ sent_tokenizer=RegexpTokenizer('\n', gaps=True),
para_block_reader=read_blankline_block,
- encoding="utf8",
+ encoding='utf8',
tagset=None,
):
"""
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def _resolve(self, fileids, categories):
if fileids is not None and categories is not None:
- raise ValueError("Specify fileids or categories, not both")
+ raise ValueError('Specify fileids or categories, not both')
if categories is not None:
return self.fileids(categories)
else:
sentence.
"""
- def __init__(self, root, fileids, encoding="utf8", tagset=None):
+ def __init__(self, root, fileids, encoding='utf8', tagset=None):
TaggedCorpusReader.__init__(
self,
root,
fileids,
- sep="_",
+ sep='_',
word_tokenizer=LineTokenizer(),
- sent_tokenizer=RegexpTokenizer(".*\n"),
+ sent_tokenizer=RegexpTokenizer('.*\n'),
para_block_reader=self._read_block,
encoding=encoding,
tagset=tagset,
)
def _read_block(self, stream):
- return read_regexp_block(stream, r".*", r".*_\.")
+ return read_regexp_block(stream, r'.*', r'.*_\.')
class TimitTaggedCorpusReader(TaggedCorpusReader):
)
def paras(self):
- raise NotImplementedError("use sents() instead")
+ raise NotImplementedError('use sents() instead')
def tagged_paras(self):
- raise NotImplementedError("use tagged_sents() instead")
+ raise NotImplementedError('use tagged_sents() instead')
timit.audiodata function.
"""
+from __future__ import print_function, unicode_literals
+
import sys
import os
import re
import tempfile
import time
+from six import string_types
+
+from nltk import compat
from nltk.tree import Tree
from nltk.internals import import_from_stdlib
- <utterance-id>.wav: utterance sound file
"""
- _FILE_RE = r"(\w+-\w+/\w+\.(phn|txt|wav|wrd))|" + r"timitdic\.txt|spkrinfo\.txt"
+ _FILE_RE = r'(\w+-\w+/\w+\.(phn|txt|wav|wrd))|' + r'timitdic\.txt|spkrinfo\.txt'
"""A regexp matching fileids that are used by this corpus reader."""
- _UTTERANCE_RE = r"\w+-\w+/\w+\.txt"
+ _UTTERANCE_RE = r'\w+-\w+/\w+\.txt'
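# Illustrative utterance id (assumed TIMIT layout): 'dr1-fvmh0/sx206', i.e.
# '<dialect-speaker>/<sentence id>', whose .txt file matches _UTTERANCE_RE.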
- def __init__(self, root, encoding="utf8"):
+ def __init__(self, root, encoding='utf8'):
"""
Construct a new TIMIT corpus reader in the given directory.
:param root: The root directory for this corpus.
"""
# Ensure that wave files don't get treated as unicode data:
- if isinstance(encoding, str):
- encoding = [(".*\.wav", None), (".*", encoding)]
+ if isinstance(encoding, string_types):
+ encoding = [('.*\.wav', None), ('.*', encoding)]
CorpusReader.__init__(
self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding
self._speakerinfo = None
self._root = root
- self.speakers = sorted(set(u.split("/")[0] for u in self._utterances))
+ self.speakers = sorted(set(u.split('/')[0] for u in self._utterances))
def fileids(self, filetype=None):
"""
"""
if filetype is None:
return CorpusReader.fileids(self)
- elif filetype in ("txt", "wrd", "phn", "wav"):
- return ["%s.%s" % (u, filetype) for u in self._utterances]
- elif filetype == "metadata":
- return ["timitdic.txt", "spkrinfo.txt"]
+ elif filetype in ('txt', 'wrd', 'phn', 'wav'):
+ return ['%s.%s' % (u, filetype) for u in self._utterances]
+ elif filetype == 'metadata':
+ return ['timitdic.txt', 'spkrinfo.txt']
else:
- raise ValueError("Bad value for filetype: %r" % filetype)
+ raise ValueError('Bad value for filetype: %r' % filetype)
def utteranceids(
self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None
region, gender, sentence type, or sentence number, if
specified.
"""
- if isinstance(dialect, str):
+ if isinstance(dialect, string_types):
dialect = [dialect]
- if isinstance(sex, str):
+ if isinstance(sex, string_types):
sex = [sex]
- if isinstance(spkrid, str):
+ if isinstance(spkrid, string_types):
spkrid = [spkrid]
- if isinstance(sent_type, str):
+ if isinstance(sent_type, string_types):
sent_type = [sent_type]
- if isinstance(sentid, str):
+ if isinstance(sentid, string_types):
sentid = [sentid]
utterances = self._utterances[:]
each word.
"""
_transcriptions = {}
- for line in self.open("timitdic.txt"):
- if not line.strip() or line[0] == ";":
+ for line in self.open('timitdic.txt'):
+ if not line.strip() or line[0] == ';':
continue
- m = re.match(r"\s*(\S+)\s+/(.*)/\s*$", line)
+ m = re.match(r'\s*(\S+)\s+/(.*)/\s*$', line)
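# Sketch of a timitdic.txt entry (illustrative, not verbatim):
# 'barbecue /b aa r b ix k y uw/' -- group(1) is the word and group(2) is the
# slash-delimited phone string, split into a list below.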
if not m:
- raise ValueError("Bad line: %r" % line)
+ raise ValueError('Bad line: %r' % line)
_transcriptions[m.group(1)] = m.group(2).split()
return _transcriptions
def spkrid(self, utterance):
- return utterance.split("/")[0]
+ return utterance.split('/')[0]
def sentid(self, utterance):
- return utterance.split("/")[1]
+ return utterance.split('/')[1]
def utterance(self, spkrid, sentid):
- return "%s/%s" % (spkrid, sentid)
+ return '%s/%s' % (spkrid, sentid)
def spkrutteranceids(self, speaker):
"""
return [
utterance
for utterance in self._utterances
- if utterance.startswith(speaker + "/")
+ if utterance.startswith(speaker + '/')
]
def spkrinfo(self, speaker):
if self._speakerinfo is None:
self._speakerinfo = {}
- for line in self.open("spkrinfo.txt"):
- if not line.strip() or line[0] == ";":
+ for line in self.open('spkrinfo.txt'):
+ if not line.strip() or line[0] == ';':
continue
rec = line.strip().split(None, 9)
key = "dr%s-%s%s" % (rec[2], rec[1].lower(), rec[0].lower())
def phones(self, utterances=None):
return [
line.split()[-1]
- for fileid in self._utterance_fileids(utterances, ".phn")
+ for fileid in self._utterance_fileids(utterances, '.phn')
for line in self.open(fileid)
if line.strip()
]
"""
return [
(line.split()[2], int(line.split()[0]), int(line.split()[1]))
- for fileid in self._utterance_fileids(utterances, ".phn")
+ for fileid in self._utterance_fileids(utterances, '.phn')
for line in self.open(fileid)
if line.strip()
]
def words(self, utterances=None):
return [
line.split()[-1]
- for fileid in self._utterance_fileids(utterances, ".wrd")
+ for fileid in self._utterance_fileids(utterances, '.wrd')
for line in self.open(fileid)
if line.strip()
]
def word_times(self, utterances=None):
return [
(line.split()[2], int(line.split()[0]), int(line.split()[1]))
- for fileid in self._utterance_fileids(utterances, ".wrd")
+ for fileid in self._utterance_fileids(utterances, '.wrd')
for line in self.open(fileid)
if line.strip()
]
def sents(self, utterances=None):
return [
[line.split()[-1] for line in self.open(fileid) if line.strip()]
- for fileid in self._utterance_fileids(utterances, ".wrd")
+ for fileid in self._utterance_fileids(utterances, '.wrd')
]
def sent_times(self, utterances=None):
int(line.split()[0]),
int(line.split()[1]),
)
- for fileid in self._utterance_fileids(utterances, ".txt")
+ for fileid in self._utterance_fileids(utterances, '.txt')
for line in self.open(fileid)
if line.strip()
]
def phone_trees(self, utterances=None):
if utterances is None:
utterances = self._utterances
- if isinstance(utterances, str):
+ if isinstance(utterances, string_types):
utterances = [utterances]
trees = []
while sent_times:
(sent, sent_start, sent_end) = sent_times.pop(0)
- trees.append(Tree("S", []))
+ trees.append(Tree('S', []))
while (
word_times and phone_times and phone_times[0][2] <= word_times[0][1]
):
# fileids.
def wav(self, utterance, start=0, end=None):
# nltk.chunk conflicts with the stdlib module 'chunk'
- wave = import_from_stdlib("wave")
+ wave = import_from_stdlib('wave')
- w = wave.open(self.open(utterance + ".wav"), "rb")
+ w = wave.open(self.open(utterance + '.wav'), 'rb')
if end is None:
end = w.getnframes()
# Open a new temporary file -- the wave module requires
# an actual file, and won't work w/ stringio. :(
tf = tempfile.TemporaryFile()
- out = wave.open(tf, "w")
+ out = wave.open(tf, 'w')
# Write the parameters & data to the new file.
out.setparams(w.getparams())
assert end is None or end > start
headersize = 44
if end is None:
- data = self.open(utterance + ".wav").read()
+ data = self.open(utterance + '.wav').read()
else:
- data = self.open(utterance + ".wav").read(headersize + end * 2)
+ data = self.open(utterance + '.wav').read(headersize + end * 2)
return data[headersize + start * 2 :]
def _utterance_fileids(self, utterances, extension):
if utterances is None:
utterances = self._utterances
- if isinstance(utterances, str):
+ if isinstance(utterances, string_types):
utterances = [utterances]
- return ["%s%s" % (u, extension) for u in utterances]
+ return ['%s%s' % (u, extension) for u in utterances]
def play(self, utterance, start=0, end=None):
"""
import ossaudiodev
try:
- dsp = ossaudiodev.open("w")
+ dsp = ossaudiodev.open('w')
dsp.setfmt(ossaudiodev.AFMT_S16_LE)
dsp.channels(1)
dsp.speed(16000)
)
+@compat.python_2_unicode_compatible
class SpeakerInfo(object):
def __init__(
self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None
self.comments = comments
def __repr__(self):
- attribs = "id sex dr use recdate birthdate ht race edu comments"
- args = ["%s=%r" % (attr, getattr(self, attr)) for attr in attribs.split()]
- return "SpeakerInfo(%s)" % (", ".join(args))
+ attribs = 'id sex dr use recdate birthdate ht race edu comments'
+ args = ['%s=%r' % (attr, getattr(self, attr)) for attr in attribs.split()]
+ return 'SpeakerInfo(%s)' % (', '.join(args))
def read_timit_block(stream):
line = stream.readline()
if not line:
return []
- n, sent = line.split(" ", 1)
+ n, sent = line.split(' ', 1)
return [sent]
# Natural Language Toolkit: Toolbox Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Greg Aumann <greg_aumann@sil.org>
# Stuart Robinson <Stuart.Robinson@mpi.nl>
# Steven Bird <stevenbird1@gmail.com>
fileids,
strip=True,
unwrap=True,
- encoding="utf8",
- errors="strict",
+ encoding='utf8',
+ errors='strict',
unicode_fields=None,
):
return concat(
# should probably be done lazily:
def entries(self, fileids, **kwargs):
- if "key" in kwargs:
- key = kwargs["key"]
- del kwargs["key"]
+ if 'key' in kwargs:
+ key = kwargs['key']
+ del kwargs['key']
else:
- key = "lx" # the default key in MDF
+ key = 'lx' # the default key in MDF
entries = []
for marker, contents in self.fields(fileids, **kwargs):
if marker == key:
pass
return entries
- def words(self, fileids, key="lx"):
+ def words(self, fileids, key='lx'):
return [contents for marker, contents in self.fields(fileids) if marker == key]
def raw(self, fileids):
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
pass
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Twitter Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import json
import os
+from six import string_types
+
from nltk.tokenize import TweetTokenizer
from nltk.corpus.reader.util import StreamBackedCorpusView, concat, ZipFilePathPointer
"""
def __init__(
- self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding="utf8"
+ self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding='utf8'
):
"""
tweets = []
for jsono in fulltweets:
try:
- text = jsono["text"]
+ text = jsono['text']
if isinstance(text, bytes):
text = text.decode(self.encoding)
tweets.append(text)
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
"""
UDHR corpus reader. It mostly deals with encodings.
"""
+from __future__ import absolute_import, unicode_literals
from nltk.corpus.reader.util import find_corpus_fileids
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
class UdhrCorpusReader(PlaintextCorpusReader):
ENCODINGS = [
- (".*-Latin1$", "latin-1"),
- (".*-Hebrew$", "hebrew"),
- (".*-Arabic$", "cp1256"),
- ("Czech_Cesky-UTF8", "cp1250"), # yeah
- (".*-Cyrillic$", "cyrillic"),
- (".*-SJIS$", "SJIS"),
- (".*-GB2312$", "GB2312"),
- (".*-Latin2$", "ISO-8859-2"),
- (".*-Greek$", "greek"),
- (".*-UTF8$", "utf-8"),
- ("Hungarian_Magyar-Unicode", "utf-16-le"),
- ("Amahuaca", "latin1"),
- ("Turkish_Turkce-Turkish", "latin5"),
- ("Lithuanian_Lietuviskai-Baltic", "latin4"),
- ("Japanese_Nihongo-EUC", "EUC-JP"),
- ("Japanese_Nihongo-JIS", "iso2022_jp"),
- ("Chinese_Mandarin-HZ", "hz"),
- ("Abkhaz\-Cyrillic\+Abkh", "cp1251"),
+ ('.*-Latin1$', 'latin-1'),
+ ('.*-Hebrew$', 'hebrew'),
+ ('.*-Arabic$', 'cp1256'),
+ ('Czech_Cesky-UTF8', 'cp1250'), # yeah
+ ('.*-Cyrillic$', 'cyrillic'),
+ ('.*-SJIS$', 'SJIS'),
+ ('.*-GB2312$', 'GB2312'),
+ ('.*-Latin2$', 'ISO-8859-2'),
+ ('.*-Greek$', 'greek'),
+ ('.*-UTF8$', 'utf-8'),
+ ('Hungarian_Magyar-Unicode', 'utf-16-le'),
+ ('Amahuaca', 'latin1'),
+ ('Turkish_Turkce-Turkish', 'latin5'),
+ ('Lithuanian_Lietuviskai-Baltic', 'latin4'),
+ ('Japanese_Nihongo-EUC', 'EUC-JP'),
+ ('Japanese_Nihongo-JIS', 'iso2022_jp'),
+ ('Chinese_Mandarin-HZ', 'hz'),
+ ('Abkhaz\-Cyrillic\+Abkh', 'cp1251'),
]
SKIP = set(
[
# The following files are not fully decodable because they
# were truncated at wrong bytes:
- "Burmese_Myanmar-UTF8",
- "Japanese_Nihongo-JIS",
- "Chinese_Mandarin-HZ",
- "Chinese_Mandarin-UTF8",
- "Gujarati-UTF8",
- "Hungarian_Magyar-Unicode",
- "Lao-UTF8",
- "Magahi-UTF8",
- "Marathi-UTF8",
- "Tamil-UTF8",
+ 'Burmese_Myanmar-UTF8',
+ 'Japanese_Nihongo-JIS',
+ 'Chinese_Mandarin-HZ',
+ 'Chinese_Mandarin-UTF8',
+ 'Gujarati-UTF8',
+ 'Hungarian_Magyar-Unicode',
+ 'Lao-UTF8',
+ 'Magahi-UTF8',
+ 'Marathi-UTF8',
+ 'Tamil-UTF8',
# Unfortunately, encodings required for reading
# the following files are not supported by Python:
- "Vietnamese-VPS",
- "Vietnamese-VIQR",
- "Vietnamese-TCVN",
- "Magahi-Agra",
- "Bhojpuri-Agra",
- "Esperanto-T61", # latin3 raises an exception
+ 'Vietnamese-VPS',
+ 'Vietnamese-VIQR',
+ 'Vietnamese-TCVN',
+ 'Magahi-Agra',
+ 'Bhojpuri-Agra',
+ 'Esperanto-T61', # latin3 raises an exception
# The following files are encoded for specific fonts:
- "Burmese_Myanmar-WinResearcher",
- "Armenian-DallakHelv",
- "Tigrinya_Tigrigna-VG2Main",
- "Amharic-Afenegus6..60375", # ?
- "Navaho_Dine-Navajo-Navaho-font",
+ 'Burmese_Myanmar-WinResearcher',
+ 'Armenian-DallakHelv',
+ 'Tigrinya_Tigrigna-VG2Main',
+ 'Amharic-Afenegus6..60375', # ?
+ 'Navaho_Dine-Navajo-Navaho-font',
# What are these?
- "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",
- "Azeri_Azerbaijani_Latin-Az.Times.Lat0117",
+ 'Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117',
+ 'Azeri_Azerbaijani_Latin-Az.Times.Lat0117',
# The following files are unintended:
- "Czech-Latin2-err",
- "Russian_Russky-UTF8~",
+ 'Czech-Latin2-err',
+ 'Russian_Russky-UTF8~',
]
)
- def __init__(self, root="udhr"):
- fileids = find_corpus_fileids(root, r"(?!README|\.).*")
+ def __init__(self, root='udhr'):
+ fileids = find_corpus_fileids(root, r'(?!README|\.).*')
super(UdhrCorpusReader, self).__init__(
root,
[fileid for fileid in fileids if fileid not in self.SKIP],
# Natural Language Toolkit: Corpus Reader Utilities
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
import bisect
import re
import tempfile
-import pickle
from functools import reduce
-from xml.etree import ElementTree
+
+try:
+ import cPickle as pickle
+except ImportError:
+ import pickle
+
+try: # Use the c version of ElementTree, which is faster, if possible.
+ from xml.etree import cElementTree as ElementTree
+except ImportError:
+ from xml.etree import ElementTree
+
+from six import string_types, text_type
from nltk.tokenize import wordpunct_tokenize
from nltk.internals import slice_bounds
block; and tokens is a list of the tokens in the block.
"""
- def __init__(self, fileid, block_reader=None, startpos=0, encoding="utf8"):
+ def __init__(self, fileid, block_reader=None, startpos=0, encoding='utf8'):
"""
Create a new corpus view, based on the file ``fileid``, and
read with ``block_reader``. See the class documentation
else:
self._eofpos = os.stat(self._fileid).st_size
except Exception as exc:
- raise ValueError("Unable to open or access %r -- %s" % (fileid, exc))
+ raise ValueError('Unable to open or access %r -- %s' % (fileid, exc))
# Maintain a cache of the most recently read block, to
# increase efficiency of random access.
:param stream: an input stream
:type stream: stream
"""
- raise NotImplementedError("Abstract Method")
+ raise NotImplementedError('Abstract Method')
def _open(self):
"""
self._stream = self._fileid.open(self._encoding)
elif self._encoding:
self._stream = SeekableUnicodeStreamReader(
- open(self._fileid, "rb"), self._encoding
+ open(self._fileid, 'rb'), self._encoding
)
else:
- self._stream = open(self._fileid, "rb")
+ self._stream = open(self._fileid, 'rb')
def close(self):
"""
if i < 0:
i += len(self)
if i < 0:
- raise IndexError("index out of range")
+ raise IndexError('index out of range')
# Check if it's in the cache.
offset = self._cache[0]
if offset <= i < self._cache[1]:
try:
return next(self.iterate_from(i))
except StopIteration:
- raise IndexError("index out of range")
+ raise IndexError('index out of range')
# If we wanted to be thread-safe, then this method would need to
# do some locking.
self._current_blocknum = block_index
tokens = self.read_block(self._stream)
assert isinstance(tokens, (tuple, list, AbstractLazySequence)), (
- "block reader %s() should return list or tuple."
+ 'block reader %s() should return list or tuple.'
% self.read_block.__name__
)
num_toks = len(tokens)
new_filepos = self._stream.tell()
assert new_filepos > filepos, (
- "block reader %s() should consume at least 1 byte (filepos=%d)"
+ 'block reader %s() should consume at least 1 byte (filepos=%d)'
% (self.read_block.__name__, filepos)
)
# Check for consistency:
assert (
new_filepos == self._filepos[block_index]
- ), "inconsistent block reader (num chars read)"
+ ), 'inconsistent block reader (num chars read)'
assert (
toknum + num_toks == self._toknum[block_index]
- ), "inconsistent block reader (num tokens returned)"
+ ), 'inconsistent block reader (num tokens returned)'
# If we reached the end of the file, then update self._len
if new_filepos == self._eofpos:
if len(docs) == 1:
return docs[0]
if len(docs) == 0:
- raise ValueError("concat() expects at least one object!")
+ raise ValueError('concat() expects at least one object!')
types = set(d.__class__ for d in docs)
# If they're all strings, use string concatenation.
- if all(isinstance(doc, str) for doc in docs):
- return "".join(docs)
+ if all(isinstance(doc, string_types) for doc in docs):
+ return ''.join(docs)
# If they're all corpus views, then use ConcatenatedCorpusView.
for typ in types:
return reduce((lambda a, b: a + b), docs, ())
if ElementTree.iselement(typ):
- xmltree = ElementTree.Element("documents")
+ xmltree = ElementTree.Element('documents')
for doc in docs:
xmltree.append(doc)
return xmltree
fileid. (This method is called whenever a
``PickledCorpusView`` is garbage-collected.)
"""
- if getattr(self, "_delete_on_gc"):
+ if getattr(self, '_delete_on_gc'):
if os.path.exists(self._fileid):
try:
os.remove(self._fileid)
@classmethod
def write(cls, sequence, output_file):
- if isinstance(output_file, str):
- output_file = open(output_file, "wb")
+ if isinstance(output_file, string_types):
+ output_file = open(output_file, 'wb')
for item in sequence:
pickle.dump(item, output_file, cls.PROTOCOL)
deleted whenever this object gets garbage-collected.
"""
try:
- fd, output_file_name = tempfile.mkstemp(".pcv", "nltk-")
- output_file = os.fdopen(fd, "wb")
+ fd, output_file_name = tempfile.mkstemp('.pcv', 'nltk-')
+ output_file = os.fdopen(fd, 'wb')
cls.write(sequence, output_file)
output_file.close()
return PickleCorpusView(output_file_name, delete_on_gc)
except (OSError, IOError) as e:
- raise ValueError("Error while creating temp file: %s" % e)
+ raise ValueError('Error while creating temp file: %s' % e)
######################################################################
line = stream.readline()
if not line:
return toks
- toks.append(line.rstrip("\n"))
+ toks.append(line.rstrip('\n'))
return toks
def read_blankline_block(stream):
- s = ""
+ s = ''
while True:
line = stream.readline()
# End of file:
def read_alignedsent_block(stream):
- s = ""
+ s = ''
while True:
line = stream.readline()
- if line[0] == "=" or line[0] == "\n" or line[:2] == "\r\n":
+ if line[0] == '=' or line[0] == '\n' or line[:2] == '\r\n':
continue
# End of file:
if not line:
# Other line:
else:
s += line
- if re.match("^\d+-\d+", line) is not None:
+ if re.match('^\d+-\d+', line) is not None:
return [s]
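# Sketch of one aligned block (assumed layout): a source line, a target line,
# then an alignment line such as '0-0 1-1 2-3 ...'; the '^\d+-\d+' match on
# that last line is what terminates and returns the block.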
line = stream.readline()
# End of file:
if not line:
- return ["".join(lines)]
+ return [''.join(lines)]
# End of token:
if end_re is not None and re.match(end_re, line):
- return ["".join(lines)]
+ return [''.join(lines)]
# Start of new token: backup to just before it starts, and
# return the token we've already collected.
if end_re is None and re.match(start_re, line):
stream.seek(oldpos)
- return ["".join(lines)]
+ return [''.join(lines)]
# Anything else is part of the token.
lines.append(line)
"""
start = stream.tell()
block = stream.read(block_size)
- encoding = getattr(stream, "encoding", None)
- assert encoding is not None or isinstance(block, str)
- if encoding not in (None, "utf-8"):
+ encoding = getattr(stream, 'encoding', None)
+ assert encoding is not None or isinstance(block, text_type)
+ if encoding not in (None, 'utf-8'):
import warnings
warnings.warn(
- "Parsing may fail, depending on the properties "
- "of the %s encoding!" % encoding
+ 'Parsing may fail, depending on the properties '
+ 'of the %s encoding!' % encoding
)
# (e.g., the utf-16 encoding does not work because it insists
# on adding BOMs to the beginning of encoded strings.)
if comment_char:
- COMMENT = re.compile("(?m)^%s.*$" % re.escape(comment_char))
+ COMMENT = re.compile('(?m)^%s.*$' % re.escape(comment_char))
while True:
try:
# If we're stripping comments, then make sure our block ends
# Read the block.
tokens, offset = _parse_sexpr_block(block)
# Skip whitespace
- offset = re.compile(r"\s*").search(block, offset).end()
+ offset = re.compile(r'\s*').search(block, offset).end()
# Move to the end position.
if encoding is None:
# Return the list of tokens we processed
return tokens
except ValueError as e:
- if e.args[0] == "Block too small":
+ if e.args[0] == 'Block too small':
next_block = stream.read(block_size)
if next_block:
block += next_block
def _sub_space(m):
"""Helper function: given a regexp match, return a string of
spaces that's the same length as the matched string."""
- return " " * (m.end() - m.start())
+ return ' ' * (m.end() - m.start())
def _parse_sexpr_block(block):
start = end = 0
while end < len(block):
- m = re.compile(r"\S").search(block, end)
+ m = re.compile(r'\S').search(block, end)
if not m:
return tokens, end
start = m.start()
# Case 1: sexpr is not parenthesized.
- if m.group() != "(":
- m2 = re.compile(r"[\s(]").search(block, start)
+ if m.group() != '(':
+ m2 = re.compile(r'[\s(]').search(block, start)
if m2:
end = m2.start()
else:
if tokens:
return tokens, end
- raise ValueError("Block too small")
+ raise ValueError('Block too small')
# Case 2: parenthesized sexpr.
else:
nesting = 0
- for m in re.compile(r"[()]").finditer(block, start):
- if m.group() == "(":
+ for m in re.compile(r'[()]').finditer(block, start):
+ if m.group() == '(':
nesting += 1
else:
nesting -= 1
else:
if tokens:
return tokens, end
- raise ValueError("Block too small")
+ raise ValueError('Block too small')
tokens.append(block[start:end])
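# Illustrative call (derived from the two cases above, assuming the function
# returns (tokens, end) once the loop finishes):
#   _parse_sexpr_block('(a b) (c d)') -> (['(a b)', '(c d)'], 11)
# case 2 balances parentheses with the nesting counter; case 1 handles bare
# atoms that end at the next whitespace or '('.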
def find_corpus_fileids(root, regexp):
if not isinstance(root, PathPointer):
- raise TypeError("find_corpus_fileids: expected a PathPointer")
- regexp += "$"
+ raise TypeError('find_corpus_fileids: expected a PathPointer')
+ regexp += '$'
# Find fileids in a zipfile: scan the zipfile's namelist. Filter
# out entries that end in '/' -- they're directories.
fileids = [
name[len(root.entry) :]
for name in root.zipfile.namelist()
- if not name.endswith("/")
+ if not name.endswith('/')
]
items = [name for name in fileids if re.match(regexp, name)]
return sorted(items)
# workaround for py25 which doesn't support followlinks
kwargs = {}
if not py25():
- kwargs = {"followlinks": True}
+ kwargs = {'followlinks': True}
for dirname, subdirs, fileids in os.walk(root.path, **kwargs):
- prefix = "".join("%s/" % p for p in _path_from(root.path, dirname))
+ prefix = ''.join('%s/' % p for p in _path_from(root.path, dirname))
items += [
prefix + fileid
for fileid in fileids
if re.match(regexp, prefix + fileid)
]
# Don't visit svn directories:
- if ".svn" in subdirs:
- subdirs.remove(".svn")
+ if '.svn' in subdirs:
+ subdirs.remove('.svn')
return sorted(items)
else:
def _path_from(parent, child):
- if os.path.split(parent)[1] == "":
+ if os.path.split(parent)[1] == '':
parent = os.path.split(parent)[0]
path = []
while parent != child:
def tagged_treebank_para_block_reader(stream):
# Read the next paragraph.
- para = ""
+ para = ''
while True:
line = stream.readline()
# End of paragraph:
- if re.match("======+\s*$", line):
+ if re.match('======+\s*$', line):
if para.strip():
return [para]
# End of file:
- elif line == "":
+ elif line == '':
if para.strip():
return [para]
else:
# Natural Language Toolkit: Verbnet Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
For details about VerbNet see:
https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
"""
+from __future__ import unicode_literals
import re
import textwrap
from collections import defaultdict
+from six import string_types
+
from nltk.corpus.reader.xmldocs import XMLCorpusReader
# runs 2-30 times faster.
self._quick_index()
- _LONGID_RE = re.compile(r"([^\-\.]*)-([\d+.\-]+)$")
+ _LONGID_RE = re.compile(r'([^\-\.]*)-([\d+.\-]+)$')
"""Regular expression that matches (and decomposes) longids"""
- _SHORTID_RE = re.compile(r"[\d+.\-]+$")
+ _SHORTID_RE = re.compile(r'[\d+.\-]+$')
"""Regular expression that matches shortids"""
_INDEX_RE = re.compile(
return sorted(self._lemma_to_class.keys())
else:
# [xx] should this include subclass members?
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
- return [member.get("name") for member in vnclass.findall("MEMBERS/MEMBER")]
+ return [member.get('name') for member in vnclass.findall('MEMBERS/MEMBER')]
def wordnetids(self, vnclass=None):
"""
return sorted(self._wordnet_to_class.keys())
else:
# [xx] should this include subclass members?
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
return sum(
[
- member.get("wn", "").split()
- for member in vnclass.findall("MEMBERS/MEMBER")
+ member.get('wn', '').split()
+ for member in vnclass.findall('MEMBERS/MEMBER')
],
[],
)
elif classid is not None:
xmltree = self.vnclass(classid)
return [
- subclass.get("ID")
- for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS")
+ subclass.get('ID')
+ for subclass in xmltree.findall('SUBCLASSES/VNSUBCLASS')
]
else:
return sorted(self._class_to_fileid.keys())
if classid in self._class_to_fileid:
fileid = self._class_to_fileid[self.longid(classid)]
tree = self.xml(fileid)
- if classid == tree.get("ID"):
+ if classid == tree.get('ID'):
return tree
else:
- for subclass in tree.findall(".//VNSUBCLASS"):
- if classid == subclass.get("ID"):
+ for subclass in tree.findall('.//VNSUBCLASS'):
+ if classid == subclass.get('ID'):
return subclass
else:
assert False # we saw it during _index()!
else:
- raise ValueError("Unknown identifier {}".format(fileid_or_classid))
+ raise ValueError('Unknown identifier {}'.format(fileid_or_classid))
def fileids(self, vnclass_ids=None):
"""
"""
if vnclass_ids is None:
return self._fileids
- elif isinstance(vnclass_ids, str):
+ elif isinstance(vnclass_ids, string_types):
return [self._class_to_fileid[self.longid(vnclass_ids)]]
else:
return [
containing the xml contents of a VerbNet class.
:return: frames - a list of frame dictionaries
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
frames = []
- vnframes = vnclass.findall("FRAMES/FRAME")
+ vnframes = vnclass.findall('FRAMES/FRAME')
for vnframe in vnframes:
frames.append(
{
- "example": self._get_example_within_frame(vnframe),
- "description": self._get_description_within_frame(vnframe),
- "syntax": self._get_syntactic_list_within_frame(vnframe),
- "semantics": self._get_semantics_within_frame(vnframe),
+ 'example': self._get_example_within_frame(vnframe),
+ 'description': self._get_description_within_frame(vnframe),
+ 'syntax': self._get_syntactic_list_within_frame(vnframe),
+ 'semantics': self._get_semantics_within_frame(vnframe),
}
)
return frames
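# Illustrative usage (a hedged sketch; 'give-13.1' is assumed to be a class id
# present in the installed VerbNet data): each frame dictionary built above
# carries exactly the four keys assembled in the loop:
#
#     >>> from nltk.corpus import verbnet
#     >>> frame = verbnet.frames('give-13.1')[0]
#     >>> sorted(frame.keys())
#     ['description', 'example', 'semantics', 'syntax']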
containing the xml contents of a VerbNet class.
:return: list of subclasses
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
subclasses = [
- subclass.get("ID") for subclass in vnclass.findall("SUBCLASSES/VNSUBCLASS")
+ subclass.get('ID') for subclass in vnclass.findall('SUBCLASSES/VNSUBCLASS')
]
return subclasses
containing the xml contents of a VerbNet class.
:return: themroles: A list of thematic roles in the VerbNet class
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
themroles = []
- for trole in vnclass.findall("THEMROLES/THEMROLE"):
+ for trole in vnclass.findall('THEMROLES/THEMROLE'):
themroles.append(
{
- "type": trole.get("type"),
- "modifiers": [
- {"value": restr.get("Value"), "type": restr.get("type")}
- for restr in trole.findall("SELRESTRS/SELRESTR")
+ 'type': trole.get('type'),
+ 'modifiers': [
+ {'value': restr.get('Value'), 'type': restr.get('type')}
+ for restr in trole.findall('SELRESTRS/SELRESTR')
],
}
)
"""
Initialize the indexes ``_lemma_to_class``,
``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
- through the corpus fileids. This is fast if ElementTree
- uses the C implementation (<0.1 secs), but quite slow (>10 secs)
- if only the python implementation is available.
+ through the corpus fileids. This is fast with cElementTree
+ (<0.1 secs), but quite slow (>10 secs) with the python
+ implementation of ElementTree.
"""
for fileid in self._fileids:
self._index_helper(self.xml(fileid), fileid)
def _index_helper(self, xmltree, fileid):
"""Helper for ``_index()``"""
- vnclass = xmltree.get("ID")
+ vnclass = xmltree.get('ID')
self._class_to_fileid[vnclass] = fileid
self._shortid_to_longid[self.shortid(vnclass)] = vnclass
- for member in xmltree.findall("MEMBERS/MEMBER"):
- self._lemma_to_class[member.get("name")].append(vnclass)
- for wn in member.get("wn", "").split():
+ for member in xmltree.findall('MEMBERS/MEMBER'):
+ self._lemma_to_class[member.get('name')].append(vnclass)
+ for wn in member.get('wn', '').split():
self._wordnet_to_class[wn].append(vnclass)
- for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS"):
+ for subclass in xmltree.findall('SUBCLASSES/VNSUBCLASS'):
self._index_helper(subclass, fileid)
def _quick_index(self):
through the corpus fileids. This doesn't do proper xml parsing,
but is good enough to find everything in the standard VerbNet
corpus -- and it runs about 30 times faster than xml parsing
- (with the python ElementTree; only 2-3 times faster
- if ElementTree uses the C implementation).
+ (with the python ElementTree; only 2-3 times faster with
+ cElementTree).
"""
# nb: if we got rid of wordnet_to_class, this would run 2-3
# times faster.
vnclass = groups[2] # for <MEMBER> elts.
self._shortid_to_longid[self.shortid(vnclass)] = vnclass
else:
- assert False, "unexpected match condition"
+ assert False, 'unexpected match condition'
######################################################################
# { Identifier conversion
if self._LONGID_RE.match(shortid):
return shortid # it's already a longid.
elif not self._SHORTID_RE.match(shortid):
- raise ValueError("vnclass identifier %r not found" % shortid)
+ raise ValueError('vnclass identifier %r not found' % shortid)
try:
return self._shortid_to_longid[shortid]
except KeyError:
- raise ValueError("vnclass identifier %r not found" % shortid)
+ raise ValueError('vnclass identifier %r not found' % shortid)
def shortid(self, longid):
"""Returns shortid of a VerbNet class
if m:
return m.group(2)
else:
- raise ValueError("vnclass identifier %r not found" % longid)
+ raise ValueError('vnclass identifier %r not found' % longid)
######################################################################
# { Frame access utility functions
:return: semantics: semantics dictionary
"""
semantics_within_single_frame = []
- for pred in vnframe.findall("SEMANTICS/PRED"):
+ for pred in vnframe.findall('SEMANTICS/PRED'):
arguments = [
- {"type": arg.get("type"), "value": arg.get("value")}
- for arg in pred.findall("ARGS/ARG")
+ {'type': arg.get('type'), 'value': arg.get('value')}
+ for arg in pred.findall('ARGS/ARG')
]
semantics_within_single_frame.append(
- {"predicate_value": pred.get("value"), "arguments": arguments}
+ {'predicate_value': pred.get('value'), 'arguments': arguments}
)
return semantics_within_single_frame
a VerbNet frame.
:return: example_text: The example sentence for this particular frame
"""
- example_element = vnframe.find("EXAMPLES/EXAMPLE")
+ example_element = vnframe.find('EXAMPLES/EXAMPLE')
if example_element is not None:
example_text = example_element.text
else:
a VerbNet frame.
:return: description: a description dictionary with members - primary and secondary
"""
- description_element = vnframe.find("DESCRIPTION")
+ description_element = vnframe.find('DESCRIPTION')
return {
- "primary": description_element.attrib["primary"],
- "secondary": description_element.get("secondary", ""),
+ 'primary': description_element.attrib['primary'],
+ 'secondary': description_element.get('secondary', ''),
}
def _get_syntactic_list_within_frame(self, vnframe):
:return: syntax_within_single_frame
"""
syntax_within_single_frame = []
- for elt in vnframe.find("SYNTAX"):
+ for elt in vnframe.find('SYNTAX'):
pos_tag = elt.tag
modifiers = dict()
- modifiers["value"] = elt.get("value") if "value" in elt.attrib else ""
- modifiers["selrestrs"] = [
- {"value": restr.get("Value"), "type": restr.get("type")}
- for restr in elt.findall("SELRESTRS/SELRESTR")
+ modifiers['value'] = elt.get('value') if 'value' in elt.attrib else ""
+ modifiers['selrestrs'] = [
+ {'value': restr.get('Value'), 'type': restr.get('type')}
+ for restr in elt.findall('SELRESTRS/SELRESTR')
]
- modifiers["synrestrs"] = [
- {"value": restr.get("Value"), "type": restr.get("type")}
- for restr in elt.findall("SYNRESTRS/SYNRESTR")
+ modifiers['synrestrs'] = [
+ {'value': restr.get('Value'), 'type': restr.get('type')}
+ for restr in elt.findall('SYNRESTRS/SYNRESTR')
]
syntax_within_single_frame.append(
- {"pos_tag": pos_tag, "modifiers": modifiers}
+ {'pos_tag': pos_tag, 'modifiers': modifiers}
)
return syntax_within_single_frame
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
- s = vnclass.get("ID") + "\n"
- s += self.pprint_subclasses(vnclass, indent=" ") + "\n"
- s += self.pprint_members(vnclass, indent=" ") + "\n"
- s += " Thematic roles:\n"
- s += self.pprint_themroles(vnclass, indent=" ") + "\n"
- s += " Frames:\n"
- s += self.pprint_frames(vnclass, indent=" ")
+ s = vnclass.get('ID') + '\n'
+ s += self.pprint_subclasses(vnclass, indent=' ') + '\n'
+ s += self.pprint_members(vnclass, indent=' ') + '\n'
+ s += ' Thematic roles:\n'
+ s += self.pprint_themroles(vnclass, indent=' ') + '\n'
+ s += ' Frames:\n'
+ s += self.pprint_frames(vnclass, indent=' ')
return s
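# Illustrative usage (a sketch; 'give-13.1' is again an assumed class id): the
# pretty-printed string starts with the class ID on its own line, followed by
# the subclasses, members, thematic roles and frames assembled above:
#
#     >>> from nltk.corpus import verbnet
#     >>> print(verbnet.pprint('give-13.1').splitlines()[0])
#     give-13.1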
- def pprint_subclasses(self, vnclass, indent=""):
+ def pprint_subclasses(self, vnclass, indent=''):
"""Returns pretty printed version of subclasses of VerbNet class
Return a string containing a pretty-printed representation of
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
subclasses = self.subclasses(vnclass)
if not subclasses:
- subclasses = ["(none)"]
- s = "Subclasses: " + " ".join(subclasses)
+ subclasses = ['(none)']
+ s = 'Subclasses: ' + ' '.join(subclasses)
return textwrap.fill(
- s, 70, initial_indent=indent, subsequent_indent=indent + " "
+ s, 70, initial_indent=indent, subsequent_indent=indent + ' '
)
- def pprint_members(self, vnclass, indent=""):
+ def pprint_members(self, vnclass, indent=''):
"""Returns pretty printed version of members in a VerbNet class
Return a string containing a pretty-printed representation of
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
members = self.lemmas(vnclass)
if not members:
- members = ["(none)"]
- s = "Members: " + " ".join(members)
+ members = ['(none)']
+ s = 'Members: ' + ' '.join(members)
return textwrap.fill(
- s, 70, initial_indent=indent, subsequent_indent=indent + " "
+ s, 70, initial_indent=indent, subsequent_indent=indent + ' '
)
- def pprint_themroles(self, vnclass, indent=""):
+ def pprint_themroles(self, vnclass, indent=''):
"""Returns pretty printed version of thematic roles in a VerbNet class
Return a string containing a pretty-printed representation of
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
pieces = []
for themrole in self.themroles(vnclass):
- piece = indent + "* " + themrole.get("type")
+ piece = indent + '* ' + themrole.get('type')
modifiers = [
- modifier["value"] + modifier["type"]
- for modifier in themrole["modifiers"]
+ modifier['value'] + modifier['type']
+ for modifier in themrole['modifiers']
]
if modifiers:
- piece += "[{}]".format(" ".join(modifiers))
+ piece += '[{}]'.format(' '.join(modifiers))
pieces.append(piece)
- return "\n".join(pieces)
+ return '\n'.join(pieces)
- def pprint_frames(self, vnclass, indent=""):
+ def pprint_frames(self, vnclass, indent=''):
"""Returns pretty version of all frames in a VerbNet class
Return a string containing a pretty-printed representation of
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
pieces = []
for vnframe in self.frames(vnclass):
pieces.append(self._pprint_single_frame(vnframe, indent))
- return "\n".join(pieces)
+ return '\n'.join(pieces)
- def _pprint_single_frame(self, vnframe, indent=""):
+ def _pprint_single_frame(self, vnframe, indent=''):
"""Returns pretty printed version of a single frame in a VerbNet class
Returns a string containing a pretty-printed representation of
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
"""
- frame_string = self._pprint_description_within_frame(vnframe, indent) + "\n"
- frame_string += self._pprint_example_within_frame(vnframe, indent + " ") + "\n"
+ frame_string = self._pprint_description_within_frame(vnframe, indent) + '\n'
+ frame_string += self._pprint_example_within_frame(vnframe, indent + ' ') + '\n'
frame_string += (
- self._pprint_syntax_within_frame(vnframe, indent + " Syntax: ") + "\n"
+ self._pprint_syntax_within_frame(vnframe, indent + ' Syntax: ') + '\n'
)
- frame_string += indent + " Semantics:\n"
- frame_string += self._pprint_semantics_within_frame(vnframe, indent + " ")
+ frame_string += indent + ' Semantics:\n'
+ frame_string += self._pprint_semantics_within_frame(vnframe, indent + ' ')
return frame_string
- def _pprint_example_within_frame(self, vnframe, indent=""):
+ def _pprint_example_within_frame(self, vnframe, indent=''):
"""Returns pretty printed version of example within frame in a VerbNet class
Return a string containing a pretty-printed representation of
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
"""
- if vnframe["example"]:
- return indent + " Example: " + vnframe["example"]
+ if vnframe['example']:
+ return indent + ' Example: ' + vnframe['example']
- def _pprint_description_within_frame(self, vnframe, indent=""):
+ def _pprint_description_within_frame(self, vnframe, indent=''):
"""Returns pretty printed version of a VerbNet frame description
Return a string containing a pretty-printed representation of
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
"""
- description = indent + vnframe["description"]["primary"]
- if vnframe["description"]["secondary"]:
- description += " ({})".format(vnframe["description"]["secondary"])
+ description = indent + vnframe['description']['primary']
+ if vnframe['description']['secondary']:
+ description += ' ({})'.format(vnframe['description']['secondary'])
return description
- def _pprint_syntax_within_frame(self, vnframe, indent=""):
+ def _pprint_syntax_within_frame(self, vnframe, indent=''):
"""Returns pretty printed version of syntax within a frame in a VerbNet class
Return a string containing a pretty-printed representation of
a VerbNet frame.
"""
pieces = []
- for element in vnframe["syntax"]:
- piece = element["pos_tag"]
+ for element in vnframe['syntax']:
+ piece = element['pos_tag']
modifier_list = []
- if "value" in element["modifiers"] and element["modifiers"]["value"]:
- modifier_list.append(element["modifiers"]["value"])
+ if 'value' in element['modifiers'] and element['modifiers']['value']:
+ modifier_list.append(element['modifiers']['value'])
modifier_list += [
- "{}{}".format(restr["value"], restr["type"])
+ '{}{}'.format(restr['value'], restr['type'])
for restr in (
- element["modifiers"]["selrestrs"]
- + element["modifiers"]["synrestrs"]
+ element['modifiers']['selrestrs']
+ + element['modifiers']['synrestrs']
)
]
if modifier_list:
- piece += "[{}]".format(" ".join(modifier_list))
+ piece += '[{}]'.format(' '.join(modifier_list))
pieces.append(piece)
- return indent + " ".join(pieces)
+ return indent + ' '.join(pieces)
- def _pprint_semantics_within_frame(self, vnframe, indent=""):
+ def _pprint_semantics_within_frame(self, vnframe, indent=''):
"""Returns a pretty printed version of semantics within frame in a VerbNet class
Return a string containing a pretty-printed representation of
a VerbNet frame.
"""
pieces = []
- for predicate in vnframe["semantics"]:
- arguments = [argument["value"] for argument in predicate["arguments"]]
+ for predicate in vnframe['semantics']:
+ arguments = [argument['value'] for argument in predicate['arguments']]
pieces.append(
- "{}({})".format(predicate["predicate_value"], ", ".join(arguments))
+ '{}({})'.format(predicate['predicate_value'], ', '.join(arguments))
)
- return "\n".join("{}* {}".format(indent, piece) for piece in pieces)
+ return '\n'.join('{}* {}'.format(indent, piece) for piece in pieces)
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Word List Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from six import string_types
+
from nltk.tokenize import line_tokenize
from nltk.corpus.reader.util import *
List of words, one per line. Blank lines are ignored.
"""
- def words(self, fileids=None, ignore_lines_startswith="\n"):
+ def words(self, fileids=None, ignore_lines_startswith='\n'):
return [
line
for line in line_tokenize(self.raw(fileids))
def raw(self, fileids=None):
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
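# Illustrative usage (a sketch; 'stopwords' is one of the word-list corpora
# that nltk.corpus exposes through this reader, assuming the corresponding
# data package has been downloaded):
#
#     >>> from nltk.corpus import stopwords
#     >>> stopwords.words('english')[:3]
#     ['i', 'me', 'my']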
"""
available_langs = {
- "catalan": "ca",
- "czech": "cs",
- "german": "de",
- "greek": "el",
- "english": "en",
- "spanish": "es",
- "finnish": "fi",
- "french": "fr",
- "hungarian": "hu",
- "icelandic": "is",
- "italian": "it",
- "latvian": "lv",
- "dutch": "nl",
- "polish": "pl",
- "portuguese": "pt",
- "romanian": "ro",
- "russian": "ru",
- "slovak": "sk",
- "slovenian": "sl",
- "swedish": "sv",
- "tamil": "ta",
+ 'catalan': 'ca',
+ 'czech': 'cs',
+ 'german': 'de',
+ 'greek': 'el',
+ 'english': 'en',
+ 'spanish': 'es',
+ 'finnish': 'fi',
+ 'french': 'fr',
+ 'hungarian': 'hu',
+ 'icelandic': 'is',
+ 'italian': 'it',
+ 'latvian': 'lv',
+ 'dutch': 'nl',
+ 'polish': 'pl',
+ 'portuguese': 'pt',
+ 'romanian': 'ro',
+ 'russian': 'ru',
+ 'slovak': 'sk',
+ 'slovenian': 'sl',
+ 'swedish': 'sv',
+ 'tamil': 'ta',
}
# Also, add the lang IDs as the keys.
available_langs.update({v: v for v in available_langs.values()})
- def words(self, lang=None, fileids=None, ignore_lines_startswith="#"):
+ def words(self, lang=None, fileids=None, ignore_lines_startswith='#'):
"""
This module returns a list of nonbreaking prefixes for the specified
language(s).
# all languages when fileids==None.
if lang in self.available_langs:
lang = self.available_langs[lang]
- fileids = ["nonbreaking_prefix." + lang]
+ fileids = ['nonbreaking_prefix.' + lang]
return [
line
for line in line_tokenize(self.raw(fileids))
# These are categories similar to the Perl Unicode Properties
available_categories = [
- "Close_Punctuation",
- "Currency_Symbol",
- "IsAlnum",
- "IsAlpha",
- "IsLower",
- "IsN",
- "IsSc",
- "IsSo",
- "IsUpper",
- "Line_Separator",
- "Number",
- "Open_Punctuation",
- "Punctuation",
- "Separator",
- "Symbol",
+ 'Close_Punctuation',
+ 'Currency_Symbol',
+ 'IsAlnum',
+ 'IsAlpha',
+ 'IsLower',
+ 'IsN',
+ 'IsSc',
+ 'IsSo',
+ 'IsUpper',
+ 'Line_Separator',
+ 'Number',
+ 'Open_Punctuation',
+ 'Punctuation',
+ 'Separator',
+ 'Symbol',
]
def chars(self, category=None, fileids=None):
:return: a list of characters in the given Unicode character category
"""
if category in self.available_categories:
- fileids = [category + ".txt"]
+ fileids = [category + '.txt']
return list(self.raw(fileids).strip())
:return: a list of tuples of similar lexical terms.
"""
- mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"
+ mwa_ppdb_xxxl_file = 'ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs'
def entries(self, fileids=mwa_ppdb_xxxl_file):
"""
:return: a list of tuples of synonym word pairs.
"""
- return [tuple(line.split("\t")) for line in line_tokenize(self.raw(fileids))]
+ return [tuple(line.split('\t')) for line in line_tokenize(self.raw(fileids))]
# -*- coding: utf-8 -*-
# Natural Language Toolkit: WordNet
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bethard <Steven.Bethard@colorado.edu>
# Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
"""
+from __future__ import print_function, unicode_literals
+
import math
import re
from itertools import islice, chain
from operator import itemgetter
from collections import defaultdict, deque
+from six import iteritems
+from six.moves import range
+
from nltk.corpus.reader import CorpusReader
from nltk.util import binary_search_file as _binary_search_file
from nltk.probability import FreqDist
+from nltk.compat import python_2_unicode_compatible
from nltk.internals import deprecated
######################################################################
_INF = 1e300
# { Part-of-speech constants
-ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
+ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
# }
POS_LIST = [NOUN, VERB, ADJ, ADV]
"Something %s INFINITIVE",
)
-SENSENUM_RE = re.compile(r"\.[\d]+\.")
+SENSENUM_RE = re.compile(r'\.[\d]+\.')
######################################################################
"""A common base class for lemmas and synsets."""
def hypernyms(self):
- return self._related("@")
+ return self._related('@')
def _hypernyms(self):
- return self._related("@")
+ return self._related('@')
def instance_hypernyms(self):
- return self._related("@i")
+ return self._related('@i')
def _instance_hypernyms(self):
- return self._related("@i")
+ return self._related('@i')
def hyponyms(self):
- return self._related("~")
+ return self._related('~')
def instance_hyponyms(self):
- return self._related("~i")
+ return self._related('~i')
def member_holonyms(self):
- return self._related("#m")
+ return self._related('#m')
def substance_holonyms(self):
- return self._related("#s")
+ return self._related('#s')
def part_holonyms(self):
- return self._related("#p")
+ return self._related('#p')
def member_meronyms(self):
- return self._related("%m")
+ return self._related('%m')
def substance_meronyms(self):
- return self._related("%s")
+ return self._related('%s')
def part_meronyms(self):
- return self._related("%p")
+ return self._related('%p')
def topic_domains(self):
- return self._related(";c")
+ return self._related(';c')
def in_topic_domains(self):
- return self._related("-c")
+ return self._related('-c')
def region_domains(self):
- return self._related(";r")
+ return self._related(';r')
def in_region_domains(self):
- return self._related("-r")
+ return self._related('-r')
def usage_domains(self):
- return self._related(";u")
+ return self._related(';u')
def in_usage_domains(self):
- return self._related("-u")
+ return self._related('-u')
def attributes(self):
- return self._related("=")
+ return self._related('=')
def entailments(self):
- return self._related("*")
+ return self._related('*')
def causes(self):
- return self._related(">")
+ return self._related('>')
def also_sees(self):
- return self._related("^")
+ return self._related('^')
def verb_groups(self):
- return self._related("$")
+ return self._related('$')
def similar_tos(self):
- return self._related("&")
+ return self._related('&')
def __hash__(self):
return hash(self._name)
return self._name < other._name
+@python_2_unicode_compatible
class Lemma(_WordNetObject):
"""
The lexical entry for a single morphological form of a
'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and
'salt.n.03.salinity'.
- Lemma attributes, accessible via methods with the same name:
+ Lemma attributes, accessible via methods with the same name::
- name: The canonical name of this lemma.
- synset: The synset that this lemma belongs to.
- syntactic_marker: For adjectives, the WordNet string identifying the
syntactic position relative to the modified noun. See:
- https://wordnet.princeton.edu/documentation/wninput5wn
+ http://wordnet.princeton.edu/man/wninput.5WN.html#sect10
For all other parts of speech, this attribute is None.
- count: The frequency of this lemma in wordnet.
Lemmas have the following methods for retrieving related Lemmas. They
correspond to the names for the pointer symbols defined here:
- https://wordnet.princeton.edu/documentation/wninput5wn
+ http://wordnet.princeton.edu/man/wninput.5WN.html#sect3
These methods all return lists of Lemmas:
- antonyms
"""
__slots__ = [
- "_wordnet_corpus_reader",
- "_name",
- "_syntactic_marker",
- "_synset",
- "_frame_strings",
- "_frame_ids",
- "_lexname_index",
- "_lex_id",
- "_lang",
- "_key",
+ '_wordnet_corpus_reader',
+ '_name',
+ '_syntactic_marker',
+ '_synset',
+ '_frame_strings',
+ '_frame_ids',
+ '_lexname_index',
+ '_lex_id',
+ '_lang',
+ '_key',
]
def __init__(
self._frame_ids = []
self._lexname_index = lexname_index
self._lex_id = lex_id
- self._lang = "eng"
+ self._lang = 'eng'
self._key = None # gets set later.
return self._wordnet_corpus_reader.lemma_count(self)
def antonyms(self):
- return self._related("!")
+ return self._related('!')
def derivationally_related_forms(self):
- return self._related("+")
+ return self._related('+')
def pertainyms(self):
- return self._related("\\")
+ return self._related('\\')
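# Illustrative usage (a sketch; outputs depend on the installed WordNet data):
# the three lemma-level relations defined above can be queried like this:
#
#     >>> from nltk.corpus import wordnet as wn
#     >>> wn.lemma('good.a.01.good').antonyms()
#     [Lemma('bad.a.01.bad')]
#     >>> wn.lemma('vocal.a.01.vocal').pertainyms()
#     [Lemma('voice.n.02.voice')]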
+@python_2_unicode_compatible
class Synset(_WordNetObject):
"""Create a Synset from a "<lemma>.<pos>.<number>" string where:
<lemma> is the word's morphological stem
Synsets have the following methods for retrieving related Synsets.
They correspond to the names for the pointer symbols defined here:
- https://wordnet.princeton.edu/documentation/wninput5wn
+ http://wordnet.princeton.edu/man/wninput.5WN.html#sect3
These methods all return lists of Synsets.
- hypernyms, instance_hypernyms
"""
__slots__ = [
- "_pos",
- "_offset",
- "_name",
- "_frame_ids",
- "_lemmas",
- "_lemma_names",
- "_definition",
- "_examples",
- "_lexname",
- "_pointers",
- "_lemma_pointers",
- "_max_depth",
- "_min_depth",
+ '_pos',
+ '_offset',
+ '_name',
+ '_frame_ids',
+ '_lemmas',
+ '_lemma_names',
+ '_definition',
+ '_examples',
+ '_lexname',
+ '_pointers',
+ '_lemma_pointers',
+ '_max_depth',
+ '_min_depth',
]
def __init__(self, wordnet_corpus_reader):
def _needs_root(self):
if self._pos == NOUN:
- if self._wordnet_corpus_reader.get_version() == "1.6":
+ if self._wordnet_corpus_reader.get_version() == '1.6':
return True
else:
return False
elif self._pos == VERB:
return True
- def lemma_names(self, lang="eng"):
- """Return all the lemma_names associated with the synset"""
- if lang == "eng":
+ def lemma_names(self, lang='eng'):
+ '''Return all the lemma_names associated with the synset'''
+ if lang == 'eng':
return self._lemma_names
else:
self._wordnet_corpus_reader._load_lang_data(lang)
else:
return []
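# Illustrative usage (a sketch; output depends on the installed WordNet data):
#
#     >>> from nltk.corpus import wordnet as wn
#     >>> wn.synset('dog.n.01').lemma_names()
#     ['dog', 'domestic_dog', 'Canis_familiaris']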
- def lemmas(self, lang="eng"):
- """Return all the lemma objects associated with the synset"""
- if lang == "eng":
+ def lemmas(self, lang='eng'):
+ '''Return all the lemma objects associated with the synset'''
+ if lang == 'eng':
return self._lemmas
else:
self._wordnet_corpus_reader._load_lang_data(lang)
synsets = self.common_hypernyms(other)
if simulate_root:
fake_synset = Synset(None)
- fake_synset._name = "*ROOT*"
+ fake_synset._name = '*ROOT*'
fake_synset.hypernyms = lambda: []
fake_synset.instance_hypernyms = lambda: []
synsets.append(fake_synset)
distances |= hypernym.hypernym_distances(distance + 1, simulate_root=False)
if simulate_root:
fake_synset = Synset(None)
- fake_synset._name = "*ROOT*"
+ fake_synset._name = '*ROOT*'
fake_synset_distance = max(distances, key=itemgetter(1))[1]
distances.add((fake_synset, fake_synset_distance + 1))
return distances
def _shortest_hypernym_paths(self, simulate_root):
- if self._name == "*ROOT*":
+ if self._name == '*ROOT*':
return {self: 0}
queue = deque([(self, 0)])
if simulate_root:
fake_synset = Synset(None)
- fake_synset._name = "*ROOT*"
+ fake_synset._name = '*ROOT*'
path[fake_synset] = max(path.values()) + 1
return path
# For each ancestor synset common to both subject synsets, find the
# connecting path length. Return the shortest of these.
- inf = float("inf")
+ inf = float('inf')
path_distance = inf
- for synset, d1 in dist_dict1.items():
+ for synset, d1 in iteritems(dist_dict1):
d2 = dist_dict2.get(synset, inf)
path_distance = min(path_distance, d1 + d2)
if self._pos != other._pos:
raise WordNetError(
- "Computing the lch similarity requires "
- "%s and %s to have the same part of speech." % (self, other)
+ 'Computing the lch similarity requires '
+ '%s and %s to have the same part of speech.' % (self, other)
)
need_root = self._needs_root()
A corpus reader used to access wordnet or its variants.
"""
- _ENCODING = "utf8"
+ _ENCODING = 'utf8'
# { Part-of-speech constants
- ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
+ ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
# }
# { Filename constants
- _FILEMAP = {ADJ: "adj", ADV: "adv", NOUN: "noun", VERB: "verb"}
+ _FILEMAP = {ADJ: 'adj', ADV: 'adv', NOUN: 'noun', VERB: 'verb'}
# }
# { Part of speech constants
#: A list of file identifiers for all the fileids used by this
#: corpus reader.
_FILES = (
- "cntlist.rev",
- "lexnames",
- "index.sense",
- "index.adj",
- "index.adv",
- "index.noun",
- "index.verb",
- "data.adj",
- "data.adv",
- "data.noun",
- "data.verb",
- "adj.exc",
- "adv.exc",
- "noun.exc",
- "verb.exc",
+ 'cntlist.rev',
+ 'lexnames',
+ 'index.sense',
+ 'index.adj',
+ 'index.adv',
+ 'index.noun',
+ 'index.verb',
+ 'data.adj',
+ 'data.adv',
+ 'data.noun',
+ 'data.verb',
+ 'adj.exc',
+ 'adv.exc',
+ 'noun.exc',
+ 'verb.exc',
)
def __init__(self, root, omw_reader):
self._key_synset_file = None
# Load the lexnames
- for i, line in enumerate(self.open("lexnames")):
+ for i, line in enumerate(self.open('lexnames')):
index, lexname, _ = line.split()
assert int(index) == i
self._lexnames.append(lexname)
# Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn
def of2ss(self, of):
- """ take an id and return the synsets """
+ ''' take an id and return the synsets '''
return self.synset_from_pos_and_offset(of[-1], int(of[:8]))
def ss2of(self, ss, lang=None):
- """ return the ID of the synset """
+ ''' return the ID of the synset '''
pos = ss.pos()
# Only these 3 WordNets retain the satellite pos tag
- if lang not in ["nld", "lit", "slk"] and pos == "s":
- pos = "a"
+ if lang not in ["nld", "lit", "slk"] and pos == 's':
+ pos = 'a'
return "{:08d}-{}".format(ss.offset(), pos)
def _load_lang_data(self, lang):
- """ load the wordnet data of the requested language from the file to
- the cache, _lang_data """
+ ''' load the wordnet data of the requested language from the file to
+ the cache, _lang_data '''
if lang in self._lang_data.keys():
return
if lang not in self.langs():
raise WordNetError("Language is not supported.")
- f = self._omw_reader.open("{0:}/wn-data-{0:}.tab".format(lang))
+ f = self._omw_reader.open('{0:}/wn-data-{0:}.tab'.format(lang))
self.custom_lemmas(f, lang)
f.close()
def langs(self):
- """ return a list of languages supported by Multilingual Wordnet """
+ ''' return a list of languages supported by Multilingual Wordnet '''
import os
- langs = ["eng"]
+ langs = ['eng']
fileids = self._omw_reader.fileids()
for fileid in fileids:
file_name, file_extension = os.path.splitext(fileid)
- if file_extension == ".tab":
- langs.append(file_name.split("-")[-1])
+ if file_extension == '.tab':
+ langs.append(file_name.split('-')[-1])
return langs
for suffix in self._FILEMAP.values():
# parse each line of the file (ignoring comment lines)
- for i, line in enumerate(self.open("index.%s" % suffix)):
- if line.startswith(" "):
+ for i, line in enumerate(self.open('index.%s' % suffix)):
+ if line.startswith(' '):
continue
_iter = iter(line.split())
# raise more informative error with file name and line number
except (AssertionError, ValueError) as e:
- tup = ("index.%s" % suffix), (i + 1), e
- raise WordNetError("file %s, line %i: %s" % tup)
+ tup = ('index.%s' % suffix), (i + 1), e
+ raise WordNetError('file %s, line %i: %s' % tup)
# map lemmas and parts of speech to synsets
self._lemma_pos_offset_map[lemma][pos] = synset_offsets
# load the exception file data into memory
for pos, suffix in self._FILEMAP.items():
self._exception_map[pos] = {}
- for line in self.open("%s.exc" % suffix):
+ for line in self.open('%s.exc' % suffix):
terms = line.split()
self._exception_map[pos][terms[0]] = terms[1:]
self._exception_map[ADJ_SAT] = self._exception_map[ADJ]
def get_version(self):
fh = self._data_file(ADJ)
for line in fh:
- match = re.search(r"WordNet (\d+\.\d+) Copyright", line)
+ match = re.search(r'WordNet (\d+\.\d+) Copyright', line)
if match is not None:
version = match.group(1)
fh.seek(0)
# Loading Lemmas
#############################################################
- def lemma(self, name, lang="eng"):
- """Return lemma object that matches the name"""
+ def lemma(self, name, lang='eng'):
+ '''Return lemma object that matches the name'''
# cannot simply split on first '.',
# e.g.: '.45_caliber.a.01..45_caliber'
separator = SENSENUM_RE.search(name).end()
for lemma in synset.lemmas(lang):
if lemma._name == lemma_name:
return lemma
- raise WordNetError("no lemma %r in %r" % (lemma_name, synset_name))
+ raise WordNetError('no lemma %r in %r' % (lemma_name, synset_name))
def lemma_from_key(self, key):
# Keys are case sensitive and always lower-case
key = key.lower()
- lemma_name, lex_sense = key.split("%")
- pos_number, lexname_index, lex_id, _, _ = lex_sense.split(":")
+ lemma_name, lex_sense = key.split('%')
+ pos_number, lexname_index, lex_id, _, _ = lex_sense.split(':')
pos = self._pos_names[int(pos_number)]
# open the key -> synset file if necessary
if self._key_synset_file is None:
- self._key_synset_file = self.open("index.sense")
+ self._key_synset_file = self.open('index.sense')
# Find the synset for the lemma.
synset_line = _binary_search_file(self._key_synset_file, key)
#############################################################
def synset(self, name):
# split name into lemma, part of speech and synset number
- lemma, pos, synset_index_str = name.lower().rsplit(".", 2)
+ lemma, pos, synset_index_str = name.lower().rsplit('.', 2)
synset_index = int(synset_index_str) - 1
# get the offset for this synset
try:
offset = self._lemma_pos_offset_map[lemma][pos][synset_index]
except KeyError:
- message = "no lemma %r with part of speech %r"
+ message = 'no lemma %r with part of speech %r'
raise WordNetError(message % (lemma, pos))
except IndexError:
n_senses = len(self._lemma_pos_offset_map[lemma][pos])
synset = self.synset_from_pos_and_offset(pos, offset)
# some basic sanity checks on loaded attributes
- if pos == "s" and synset._pos == "a":
+ if pos == 's' and synset._pos == 'a':
message = (
- "adjective satellite requested but only plain "
- "adjective found for lemma %r"
+ 'adjective satellite requested but only plain '
+ 'adjective found for lemma %r'
)
raise WordNetError(message % lemma)
- assert synset._pos == pos or (pos == "a" and synset._pos == "s")
+ assert synset._pos == pos or (pos == 'a' and synset._pos == 's')
# Return the synset object.
return synset
if pos == ADJ_SAT:
pos = ADJ
if self._data_file_map.get(pos) is None:
- fileid = "data.%s" % self._FILEMAP[pos]
+ fileid = 'data.%s' % self._FILEMAP[pos]
self._data_file_map[pos] = self.open(fileid)
return self._data_file_map[pos]
self._synset_offset_cache[pos][offset] = synset
return synset
- @deprecated("Use public method synset_from_pos_and_offset() instead")
+ @deprecated('Use public method synset_from_pos_and_offset() instead')
def _synset_from_pos_and_offset(self, *args, **kwargs):
"""
Hack to help people like the readers of
try:
# parse out the definitions and examples from the gloss
- columns_str, gloss = data_file_line.strip().split("|")
- definition = re.sub(r"[\"].*?[\"]", "", gloss).strip()
- examples = re.findall(r'"([^"]*)"', gloss)
- for example in examples:
- synset._examples.append(example)
-
- synset._definition = definition.strip("; ")
+ columns_str, gloss = data_file_line.split('|')
+ gloss = gloss.strip()
+ definitions = []
+ for gloss_part in gloss.split(';'):
+ gloss_part = gloss_part.strip()
+ if gloss_part.startswith('"'):
+ synset._examples.append(gloss_part.strip('"'))
+ else:
+ definitions.append(gloss_part)
+ synset._definition = '; '.join(definitions)
# split the other info into fields
_iter = iter(columns_str.split())
# get the lex_id (used for sense_keys)
lex_id = int(_next_token(), 16)
# If the lemma has a syntactic marker, extract it.
- m = re.match(r"(.*?)(\(.*\))?$", lemma_name)
+ m = re.match(r'(.*?)(\(.*\))?$', lemma_name)
lemma_name, syn_mark = m.groups()
# create the lemma object
lemma = Lemma(self, synset, lemma_name, lexname_index, lex_id, syn_mark)
offset = int(_next_token())
pos = _next_token()
lemma_ids_str = _next_token()
- if lemma_ids_str == "0000":
+ if lemma_ids_str == '0000':
synset._pointers[symbol].add((pos, offset))
else:
source_index = int(lemma_ids_str[:2], 16) - 1
for _ in range(frame_count):
# read the plus sign
plus = _next_token()
- assert plus == "+"
+ assert plus == '+'
# read the frame and lemma number
frame_number = int(_next_token())
frame_string_fmt = VERB_FRAME_STRINGS[frame_number]
# raise a more informative error with line text
except ValueError as e:
- raise WordNetError("line %r: %s" % (data_file_line, e))
+ raise WordNetError('line %r: %s' % (data_file_line, e))
# set sense keys for Lemma objects - note that this has to be
# done afterwards so that the relations are available
if synset._pos == ADJ_SAT:
head_lemma = synset.similar_tos()[0]._lemmas[0]
head_name = head_lemma._name
- head_id = "%02d" % head_lemma._lex_id
+ head_id = '%02d' % head_lemma._lex_id
else:
- head_name = head_id = ""
+ head_name = head_id = ''
tup = (
lemma._name,
WordNetCorpusReader._pos_numbers[synset._pos],
head_name,
head_id,
)
- lemma._key = ("%s%%%d:%02d:%02d:%s:%s" % tup).lower()
+ lemma._key = ('%s%%%d:%02d:%02d:%s:%s' % tup).lower()
# the canonical name is based on the first lemma
lemma_name = synset._lemmas[0]._name.lower()
offsets = self._lemma_pos_offset_map[lemma_name][synset._pos]
sense_index = offsets.index(synset._offset)
tup = lemma_name, synset._pos, sense_index + 1
- synset._name = "%s.%s.%02i" % tup
+ synset._name = '%s.%s.%02i' % tup
return synset
Retrieves synset based on a given sense_key. Sense keys can be
obtained from lemma.key()
- From https://wordnet.princeton.edu/documentation/senseidx5wn:
+ From https://wordnet.princeton.edu/wordnet/man/senseidx.5WN.html:
A sense_key is represented as:
lemma % lex_sense (e.g. 'dog%1:18:01::')
where lex_sense is encoded as:
"valid {} could not be extracted from the sense key".format(error)
)
- synset_id = ".".join([lemma, synset_types[int(ss_type)], lex_id])
+ synset_id = '.'.join([lemma, synset_types[int(ss_type)], lex_id])
return self.synset(synset_id)
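# Illustrative usage (a sketch; the exact key shown depends on the installed
# WordNet data): a sense key obtained from Lemma.key() can be mapped back
# through the reader, e.g. via lemma_from_key():
#
#     >>> from nltk.corpus import wordnet as wn
#     >>> key = wn.lemma('dog.n.01.dog').key()
#     >>> key
#     'dog%1:05:00::'
#     >>> wn.lemma_from_key(key).synset()
#     Synset('dog.n.01')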
#############################################################
# Retrieve synsets and lemmas.
#############################################################
- def synsets(self, lemma, pos=None, lang="eng", check_exceptions=True):
+ def synsets(self, lemma, pos=None, lang='eng', check_exceptions=True):
"""Load all synsets with a given lemma and part of speech tag.
If no pos is specified, all synsets for all parts of speech
will be loaded.
"""
lemma = lemma.lower()
- if lang == "eng":
+ if lang == 'eng':
get_synset = self.synset_from_pos_and_offset
index = self._lemma_pos_offset_map
if pos is None:
synset_list.append(self.of2ss(l))
return synset_list
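# Illustrative usage (a sketch; outputs depend on the installed WordNet data):
#
#     >>> from nltk.corpus import wordnet as wn
#     >>> wn.synsets('dog', pos=wn.VERB)
#     [Synset('chase.v.01')]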
- def lemmas(self, lemma, pos=None, lang="eng"):
+ def lemmas(self, lemma, pos=None, lang='eng'):
"""Return all Lemma objects with a name matching the specified lemma
name and part of speech tag. Matches any part of speech tag if none is
specified."""
lemma = lemma.lower()
- if lang == "eng":
+ if lang == 'eng':
return [
lemma_obj
for synset in self.synsets(lemma, pos)
lemmas.append(lemma_obj)
return lemmas
- def all_lemma_names(self, pos=None, lang="eng"):
+ def all_lemma_names(self, pos=None, lang='eng'):
"""Return all lemma names for all synsets for the given
part of speech tag and language or languages. If pos is
not specified, all synsets for all parts of speech will
be used."""
- if lang == "eng":
+ if lang == 'eng':
if pos is None:
return iter(self._lemma_pos_offset_map)
else:
continue
lemma.extend(self._lang_data[lang][0][i])
- lemma = iter(set(lemma))
+ lemma = list(set(lemma))
return lemma
def all_synsets(self, pos=None):
# be moved while we're not looking.
if pos_tag == ADJ_SAT:
pos_tag = ADJ
- fileid = "data.%s" % self._FILEMAP[pos_tag]
+ fileid = 'data.%s' % self._FILEMAP[pos_tag]
data_file = self.open(fileid)
try:
else:
data_file.close()
- def words(self, lang="eng"):
+ def words(self, lang='eng'):
"""return lemmas of the given language as list of words"""
return self.all_lemma_names(lang=lang)
- def license(self, lang="eng"):
+ def license(self, lang='eng'):
"""Return the contents of LICENSE (for omw)
use lang=lang to get the license for an individual language"""
- if lang == "eng":
+ if lang == 'eng':
return self.open("LICENSE").read()
elif lang in self.langs():
return self._omw_reader.open("{}/LICENSE".format(lang)).read()
- elif lang == "omw":
+ elif lang == 'omw':
# under the assumption you don't mean Omwunra-Toqura
return self._omw_reader.open("LICENSE").read()
elif lang in self._lang_data:
else:
raise WordNetError("Language is not supported.")
- def readme(self, lang="omw"):
+ def readme(self, lang='omw'):
"""Return the contents of README (for omw)
use lang=lang to get the readme for an individual language"""
- if lang == "eng":
+ if lang == 'eng':
return self.open("README").read()
elif lang in self.langs():
return self._omw_reader.open("{}/README".format(lang)).read()
- elif lang == "omw":
+ elif lang == 'omw':
# under the assumption you don't mean Omwunra-Toqura
return self._omw_reader.open("README").read()
elif lang in self._lang_data:
else:
raise WordNetError("Language is not supported.")
- def citation(self, lang="omw"):
+ def citation(self, lang='omw'):
"""Return the contents of citation.bib file (for omw)
use lang=lang to get the citation for an individual language"""
- if lang == "eng":
+ if lang == 'eng':
return self.open("citation.bib").read()
elif lang in self.langs():
return self._omw_reader.open("{}/citation.bib".format(lang)).read()
- elif lang == "omw":
+ elif lang == 'omw':
# under the assumption you don't mean Omwunra-Toqura
return self._omw_reader.open("citation.bib").read()
elif lang in self._lang_data:
def lemma_count(self, lemma):
"""Return the frequency count for this Lemma"""
# Currently, count only works for English
- if lemma._lang != "eng":
+ if lemma._lang != 'eng':
return 0
# open the count file if we haven't already
if self._key_count_file is None:
- self._key_count_file = self.open("cntlist.rev")
+ self._key_count_file = self.open('cntlist.rev')
# find the key in the counts file and return the count
line = _binary_search_file(self._key_count_file, lemma._key)
if line:
- return int(line.rsplit(" ", 1)[-1])
+ return int(line.rsplit(' ', 1)[-1])
else:
return 0
MORPHOLOGICAL_SUBSTITUTIONS = {
NOUN: [
- ("s", ""),
- ("ses", "s"),
- ("ves", "f"),
- ("xes", "x"),
- ("zes", "z"),
- ("ches", "ch"),
- ("shes", "sh"),
- ("men", "man"),
- ("ies", "y"),
+ ('s', ''),
+ ('ses', 's'),
+ ('ves', 'f'),
+ ('xes', 'x'),
+ ('zes', 'z'),
+ ('ches', 'ch'),
+ ('shes', 'sh'),
+ ('men', 'man'),
+ ('ies', 'y'),
],
VERB: [
- ("s", ""),
- ("ies", "y"),
- ("es", "e"),
- ("es", ""),
- ("ed", "e"),
- ("ed", ""),
- ("ing", "e"),
- ("ing", ""),
+ ('s', ''),
+ ('ies', 'y'),
+ ('es', 'e'),
+ ('es', ''),
+ ('ed', 'e'),
+ ('ed', ''),
+ ('ing', 'e'),
+ ('ing', ''),
],
- ADJ: [("er", ""), ("est", ""), ("er", "e"), ("est", "e")],
+ ADJ: [('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')],
ADV: [],
}
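# Illustrative usage (a sketch): these suffix substitutions, together with the
# *.exc exception files loaded above, drive the reader's morphy() lemmatizer
# (not shown in this excerpt); outputs depend on the installed WordNet data:
#
#     >>> from nltk.corpus import wordnet as wn
#     >>> wn.morphy('churches')          # 'ches' -> 'ch' substitution
#     'church'
#     >>> wn.morphy('denied', wn.VERB)   # handled by the verb.exc exceptions
#     'deny'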
:param lang ISO 639-3 code of the language of the tab file
"""
if len(lang) != 3:
- raise ValueError("lang should be a (3 character) ISO 639-3 code")
+ raise ValueError('lang should be a (3 character) ISO 639-3 code')
self._lang_data[lang] = [defaultdict(list), defaultdict(list)]
- for line in tab_file.readlines():
- if isinstance(line, bytes):
+ for l in tab_file.readlines():
+ if isinstance(l, bytes):
# Support byte-stream files (e.g. as returned by Python 2's
# open() function) as well as text-stream ones
- line = line.decode("utf-8")
- if not line.startswith("#"):
- offset_pos, lemma_type, lemma = line.strip().split("\t")
- lemma = lemma.strip().replace(" ", "_")
- self._lang_data[lang][0][offset_pos].append(lemma)
- self._lang_data[lang][1][lemma.lower()].append(offset_pos)
+ l = l.decode('utf-8')
+ l = l.replace('\n', '')
+ l = l.replace(' ', '_')
+ if l[0] != '#':
+ word = l.split('\t')
+ self._lang_data[lang][0][word[0]].append(word[2])
+ self._lang_data[lang][1][word[2].lower()].append(word[0])
# Make sure no more entries are accidentally added subsequently
self._lang_data[lang][0].default_factory = None
self._lang_data[lang][1].default_factory = None
"""
def __init__(self, root, fileids):
- CorpusReader.__init__(self, root, fileids, encoding="utf8")
+ CorpusReader.__init__(self, root, fileids, encoding='utf8')
# this load function would be more efficient if the data was pickled
# Note that we can't use NLTK's frequency distributions because
"""
if synset1._pos != synset2._pos:
raise WordNetError(
- "Computing the least common subsumer requires "
- "%s and %s to have the same part of speech." % (synset1, synset2)
+ 'Computing the least common subsumer requires '
+ '%s and %s to have the same part of speech.' % (synset1, synset2)
)
ic1 = information_content(synset1, ic)
try:
icpos = ic[synset._pos]
except KeyError:
- msg = "Information content file has no entries for part-of-speech: %s"
+ msg = 'Information content file has no entries for part-of-speech: %s'
raise WordNetError(msg % synset._pos)
counts = icpos[synset._offset]
def _get_pos(field):
- if field[-1] == "n":
+ if field[-1] == 'n':
return NOUN
- elif field[-1] == "v":
+ elif field[-1] == 'v':
return VERB
else:
msg = (
# Natural Language Toolkit: XML Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
(note -- not named 'xml' to avoid conflicting w/ standard xml package)
"""
+from __future__ import print_function, unicode_literals
import codecs
-from xml.etree import ElementTree
+
+# Use the c version of ElementTree, which is faster, if possible:
+try:
+ from xml.etree import cElementTree as ElementTree
+except ImportError:
+ from xml.etree import ElementTree
+
+from six import string_types
from nltk.data import SeekableUnicodeStreamReader
from nltk.tokenize import WordPunctTokenizer
# Make sure we have exactly one file -- no concatenating XML.
if fileid is None and len(self._fileids) == 1:
fileid = self._fileids[0]
- if not isinstance(fileid, str):
- raise TypeError("Expected a single file identifier string")
+ if not isinstance(fileid, string_types):
+ raise TypeError('Expected a single file identifier string')
# Read the XML in using ElementTree.
elt = ElementTree.parse(self.abspath(fileid).open()).getroot()
# If requested, wrap it.
def raw(self, fileids=None):
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
if elt_handler:
self.handle_elt = elt_handler
- self._tagspec = re.compile(tagspec + r"\Z")
+ self._tagspec = re.compile(tagspec + r'\Z')
"""The tag specification for this corpus view."""
self._tag_context = {0: ()}
finally:
infile.close()
else:
- with open(fileid, "rb") as infile:
+ with open(fileid, 'rb') as infile:
s = infile.readline()
if s.startswith(codecs.BOM_UTF16_BE):
- return "utf-16-be"
+ return 'utf-16-be'
if s.startswith(codecs.BOM_UTF16_LE):
- return "utf-16-le"
+ return 'utf-16-le'
if s.startswith(codecs.BOM_UTF32_BE):
- return "utf-32-be"
+ return 'utf-32-be'
if s.startswith(codecs.BOM_UTF32_LE):
- return "utf-32-le"
+ return 'utf-32-le'
if s.startswith(codecs.BOM_UTF8):
- return "utf-8"
+ return 'utf-8'
m = re.match(br'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
if m:
return m.group(1).decode()
if m:
return m.group(1).decode()
# No encoding found -- what should the default be?
- return "utf-8"
+ return 'utf-8'
def handle_elt(self, elt, context):
"""
#: A regular expression used to extract the tag name from a start tag,
#: end tag, or empty-elt tag string.
- _XML_TAG_NAME = re.compile("<\s*/?\s*([^\s>]+)")
+ _XML_TAG_NAME = re.compile('<\s*/?\s*([^\s>]+)')
#: A regular expression used to find all start-tags, end-tags, and
#: empty-elt tags in an XML file. This regexp is more lenient than
then this function either backtracks to the last '<', or reads
another block.
"""
- fragment = ""
+ fragment = ''
if isinstance(stream, SeekableUnicodeStreamReader):
startpos = stream.tell()
return fragment
# Do we have a fragment that will never be well-formed?
- if re.search("[<>]", fragment).group(0) == ">":
+ if re.search('[<>]', fragment).group(0) == '>':
pos = stream.tell() - (
- len(fragment) - re.search("[<>]", fragment).end()
+ len(fragment) - re.search('[<>]', fragment).end()
)
raise ValueError('Unexpected ">" near char %s' % pos)
# End of file?
if not xml_block:
- raise ValueError("Unexpected end of file: tag not closed")
+ raise ValueError('Unexpected end of file: tag not closed')
# If not, then we must be in the middle of a <..tag..>.
# If appropriate, backtrack to the most recent '<'
# character.
- last_open_bracket = fragment.rfind("<")
+ last_open_bracket = fragment.rfind('<')
if last_open_bracket > 0:
if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
if isinstance(stream, SeekableUnicodeStreamReader):
elt_start = None # where does the elt start
elt_depth = None # what context depth
- elt_text = ""
+ elt_text = ''
while elts == [] or elt_start is not None:
if isinstance(stream, SeekableUnicodeStreamReader):
if elt_start is None:
break
else:
- raise ValueError("Unexpected end of file")
+ raise ValueError('Unexpected end of file')
# Process each <tag> in the xml fragment.
for piece in self._XML_PIECE.finditer(xml_fragment):
if self._DEBUG:
- print("%25s %s" % ("/".join(context)[-20:], piece.group()))
+ print('%25s %s' % ('/'.join(context)[-20:], piece.group()))
- if piece.group("START_TAG"):
+ if piece.group('START_TAG'):
name = self._XML_TAG_NAME.match(piece.group()).group(1)
# Keep context up-to-date.
context.append(name)
# Is this one of the elts we're looking for?
if elt_start is None:
- if re.match(tagspec, "/".join(context)):
+ if re.match(tagspec, '/'.join(context)):
elt_start = piece.start()
elt_depth = len(context)
- elif piece.group("END_TAG"):
+ elif piece.group('END_TAG'):
name = self._XML_TAG_NAME.match(piece.group()).group(1)
# sanity checks:
if not context:
- raise ValueError("Unmatched tag </%s>" % name)
+ raise ValueError('Unmatched tag </%s>' % name)
if name != context[-1]:
raise ValueError(
- "Unmatched tag <%s>...</%s>" % (context[-1], name)
+ 'Unmatched tag <%s>...</%s>' % (context[-1], name)
)
# Is this the end of an element?
if elt_start is not None and elt_depth == len(context):
elt_text += xml_fragment[elt_start : piece.end()]
- elts.append((elt_text, "/".join(context)))
+ elts.append((elt_text, '/'.join(context)))
elt_start = elt_depth = None
- elt_text = ""
+ elt_text = ''
# Keep context up-to-date
context.pop()
- elif piece.group("EMPTY_ELT_TAG"):
+ elif piece.group('EMPTY_ELT_TAG'):
name = self._XML_TAG_NAME.match(piece.group()).group(1)
if elt_start is None:
- if re.match(tagspec, "/".join(context) + "/" + name):
- elts.append((piece.group(), "/".join(context) + "/" + name))
+ if re.match(tagspec, '/'.join(context) + '/' + name):
+ elts.append((piece.group(), '/'.join(context) + '/' + name))
if elt_start is not None:
# If we haven't found any elements yet, then keep
# take back the last start-tag, and return what
# we've gotten so far (elts is non-empty).
if self._DEBUG:
- print(" " * 36 + "(backtrack)")
+ print(' ' * 36 + '(backtrack)')
if isinstance(stream, SeekableUnicodeStreamReader):
stream.seek(startpos)
stream.char_seek_forward(elt_start)
stream.seek(-(len(xml_fragment) - elt_start), 1)
context = context[: elt_depth - 1]
elt_start = elt_depth = None
- elt_text = ""
+ elt_text = ''
# Update the _tag_context dict.
pos = stream.tell()
return [
elt_handler(
- ElementTree.fromstring(elt.encode("ascii", "xmlcharrefreplace")),
+ ElementTree.fromstring(elt.encode('ascii', 'xmlcharrefreplace')),
context,
)
for (elt, context) in elts
import os
import re
+from six import string_types
+
from nltk.tokenize import RegexpTokenizer
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.corpus.reader.tagged import TaggedCorpusReader
corpus of Old English prose texts.
"""
- def __init__(self, root, encoding="utf8"):
+ def __init__(self, root, encoding='utf8'):
CorpusReader.__init__(self, root, [], encoding)
self._psd_reader = YCOEParseCorpusReader(
- self.root.join("psd"), ".*", ".psd", encoding=encoding
+ self.root.join('psd'), '.*', '.psd', encoding=encoding
)
- self._pos_reader = YCOETaggedCorpusReader(self.root.join("pos"), ".*", ".pos")
+ self._pos_reader = YCOETaggedCorpusReader(self.root.join('pos'), '.*', '.pos')
# Make sure we have a consistent set of items:
documents = set(f[:-4] for f in self._psd_reader.fileids())
if set(f[:-4] for f in self._pos_reader.fileids()) != documents:
- raise ValueError('Items in "psd" and "pos" ' "subdirectories do not match.")
+ raise ValueError('Items in "psd" and "pos" ' 'subdirectories do not match.')
fileids = sorted(
- ["%s.psd" % doc for doc in documents]
- + ["%s.pos" % doc for doc in documents]
+ ['%s.psd' % doc for doc in documents]
+ + ['%s.pos' % doc for doc in documents]
)
CorpusReader.__init__(self, root, fileids, encoding)
self._documents = sorted(documents)
"""
if fileids is None:
return self._documents
- if isinstance(fileids, str):
+ if isinstance(fileids, string_types):
fileids = [fileids]
for f in fileids:
if f not in self._fileids:
- raise KeyError("File id %s not found" % fileids)
+ raise KeyError('File id %s not found' % fileids)
# Strip off the '.pos' and '.psd' extensions.
return sorted(set(f[:-4] for f in fileids))
"""
if documents is None:
return self._fileids
- elif isinstance(documents, str):
+ elif isinstance(documents, string_types):
documents = [documents]
return sorted(
set(
- ["%s.pos" % doc for doc in documents]
- + ["%s.psd" % doc for doc in documents]
+ ['%s.pos' % doc for doc in documents]
+ + ['%s.psd' % doc for doc in documents]
)
)
if documents is None:
documents = self._documents
else:
- if isinstance(documents, str):
+ if isinstance(documents, string_types):
documents = [documents]
for document in documents:
if document not in self._documents:
- if document[-4:] in (".pos", ".psd"):
+ if document[-4:] in ('.pos', '.psd'):
raise ValueError(
- "Expected a document identifier, not a file "
- "identifier. (Use corpus.documents() to get "
- "a list of document identifiers."
+ 'Expected a document identifier, not a file '
+ 'identifier. (Use corpus.documents() to get '
+ 'a list of document identifiers.)'
)
else:
- raise ValueError("Document identifier %s not found" % document)
- return ["%s.%s" % (d, subcorpus) for d in documents]
+ raise ValueError('Document identifier %s not found' % document)
+ return ['%s.%s' % (d, subcorpus) for d in documents]
# Delegate to one of our two sub-readers:
def words(self, documents=None):
- return self._pos_reader.words(self._getfileids(documents, "pos"))
+ return self._pos_reader.words(self._getfileids(documents, 'pos'))
def sents(self, documents=None):
- return self._pos_reader.sents(self._getfileids(documents, "pos"))
+ return self._pos_reader.sents(self._getfileids(documents, 'pos'))
def paras(self, documents=None):
- return self._pos_reader.paras(self._getfileids(documents, "pos"))
+ return self._pos_reader.paras(self._getfileids(documents, 'pos'))
def tagged_words(self, documents=None):
- return self._pos_reader.tagged_words(self._getfileids(documents, "pos"))
+ return self._pos_reader.tagged_words(self._getfileids(documents, 'pos'))
def tagged_sents(self, documents=None):
- return self._pos_reader.tagged_sents(self._getfileids(documents, "pos"))
+ return self._pos_reader.tagged_sents(self._getfileids(documents, 'pos'))
def tagged_paras(self, documents=None):
- return self._pos_reader.tagged_paras(self._getfileids(documents, "pos"))
+ return self._pos_reader.tagged_paras(self._getfileids(documents, 'pos'))
def parsed_sents(self, documents=None):
- return self._psd_reader.parsed_sents(self._getfileids(documents, "psd"))
+ return self._psd_reader.parsed_sents(self._getfileids(documents, 'psd'))
class YCOEParseCorpusReader(BracketParseCorpusReader):
that strips out (CODE ...) and (ID ...) nodes."""
def _parse(self, t):
- t = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t)
- if re.match(r"\s*\(\s*\)\s*$", t):
+ t = re.sub(r'(?u)\((CODE|ID)[^\)]*\)', '', t)
+ if re.match(r'\s*\(\s*\)\s*$', t):
return None
return BracketParseCorpusReader._parse(self, t)
class YCOETaggedCorpusReader(TaggedCorpusReader):
- def __init__(self, root, items, encoding="utf8"):
- gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*"
+ def __init__(self, root, items, encoding='utf8'):
+ gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
TaggedCorpusReader.__init__(
- self, root, items, sep="_", sent_tokenizer=sent_tokenizer
+ self, root, items, sep='_', sent_tokenizer=sent_tokenizer
)
#: A list of all documents and their titles in ycoe.
documents = {
- "coadrian.o34": "Adrian and Ritheus",
- "coaelhom.o3": "Ælfric, Supplemental Homilies",
- "coaelive.o3": "Ælfric's Lives of Saints",
- "coalcuin": "Alcuin De virtutibus et vitiis",
- "coalex.o23": "Alexander's Letter to Aristotle",
- "coapollo.o3": "Apollonius of Tyre",
- "coaugust": "Augustine",
- "cobede.o2": "Bede's History of the English Church",
- "cobenrul.o3": "Benedictine Rule",
- "coblick.o23": "Blickling Homilies",
- "coboeth.o2": "Boethius' Consolation of Philosophy",
- "cobyrhtf.o3": "Byrhtferth's Manual",
- "cocanedgD": "Canons of Edgar (D)",
- "cocanedgX": "Canons of Edgar (X)",
- "cocathom1.o3": "Ælfric's Catholic Homilies I",
- "cocathom2.o3": "Ælfric's Catholic Homilies II",
- "cochad.o24": "Saint Chad",
- "cochdrul": "Chrodegang of Metz, Rule",
- "cochristoph": "Saint Christopher",
- "cochronA.o23": "Anglo-Saxon Chronicle A",
- "cochronC": "Anglo-Saxon Chronicle C",
- "cochronD": "Anglo-Saxon Chronicle D",
- "cochronE.o34": "Anglo-Saxon Chronicle E",
- "cocura.o2": "Cura Pastoralis",
- "cocuraC": "Cura Pastoralis (Cotton)",
- "codicts.o34": "Dicts of Cato",
- "codocu1.o1": "Documents 1 (O1)",
- "codocu2.o12": "Documents 2 (O1/O2)",
- "codocu2.o2": "Documents 2 (O2)",
- "codocu3.o23": "Documents 3 (O2/O3)",
- "codocu3.o3": "Documents 3 (O3)",
- "codocu4.o24": "Documents 4 (O2/O4)",
- "coeluc1": "Honorius of Autun, Elucidarium 1",
- "coeluc2": "Honorius of Autun, Elucidarium 1",
- "coepigen.o3": "Ælfric's Epilogue to Genesis",
- "coeuphr": "Saint Euphrosyne",
- "coeust": "Saint Eustace and his companions",
- "coexodusP": "Exodus (P)",
- "cogenesiC": "Genesis (C)",
- "cogregdC.o24": "Gregory's Dialogues (C)",
- "cogregdH.o23": "Gregory's Dialogues (H)",
- "coherbar": "Pseudo-Apuleius, Herbarium",
- "coinspolD.o34": "Wulfstan's Institute of Polity (D)",
- "coinspolX": "Wulfstan's Institute of Polity (X)",
- "cojames": "Saint James",
- "colacnu.o23": "Lacnunga",
- "colaece.o2": "Leechdoms",
- "colaw1cn.o3": "Laws, Cnut I",
- "colaw2cn.o3": "Laws, Cnut II",
- "colaw5atr.o3": "Laws, Æthelred V",
- "colaw6atr.o3": "Laws, Æthelred VI",
- "colawaf.o2": "Laws, Alfred",
- "colawafint.o2": "Alfred's Introduction to Laws",
- "colawger.o34": "Laws, Gerefa",
- "colawine.ox2": "Laws, Ine",
- "colawnorthu.o3": "Northumbra Preosta Lagu",
- "colawwllad.o4": "Laws, William I, Lad",
- "coleofri.o4": "Leofric",
- "colsigef.o3": "Ælfric's Letter to Sigefyrth",
- "colsigewB": "Ælfric's Letter to Sigeweard (B)",
- "colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)",
- "colwgeat": "Ælfric's Letter to Wulfgeat",
- "colwsigeT": "Ælfric's Letter to Wulfsige (T)",
- "colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)",
- "colwstan1.o3": "Ælfric's Letter to Wulfstan I",
- "colwstan2.o3": "Ælfric's Letter to Wulfstan II",
- "comargaC.o34": "Saint Margaret (C)",
- "comargaT": "Saint Margaret (T)",
- "comart1": "Martyrology, I",
- "comart2": "Martyrology, II",
- "comart3.o23": "Martyrology, III",
- "comarvel.o23": "Marvels of the East",
- "comary": "Mary of Egypt",
- "coneot": "Saint Neot",
- "conicodA": "Gospel of Nicodemus (A)",
- "conicodC": "Gospel of Nicodemus (C)",
- "conicodD": "Gospel of Nicodemus (D)",
- "conicodE": "Gospel of Nicodemus (E)",
- "coorosiu.o2": "Orosius",
- "cootest.o3": "Heptateuch",
- "coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I",
- "coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II",
- "coprefcura.o2": "Preface to the Cura Pastoralis",
- "coprefgen.o3": "Ælfric's Preface to Genesis",
- "copreflives.o3": "Ælfric's Preface to Lives of Saints",
- "coprefsolilo": "Preface to Augustine's Soliloquies",
- "coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus",
- "corood": "History of the Holy Rood-Tree",
- "cosevensl": "Seven Sleepers",
- "cosolilo": "St. Augustine's Soliloquies",
- "cosolsat1.o4": "Solomon and Saturn I",
- "cosolsat2": "Solomon and Saturn II",
- "cotempo.o3": "Ælfric's De Temporibus Anni",
- "coverhom": "Vercelli Homilies",
- "coverhomE": "Vercelli Homilies (E)",
- "coverhomL": "Vercelli Homilies (L)",
- "covinceB": "Saint Vincent (Bodley 343)",
- "covinsal": "Vindicta Salvatoris",
- "cowsgosp.o3": "West-Saxon Gospels",
- "cowulf.o34": "Wulfstan's Homilies",
+ 'coadrian.o34': 'Adrian and Ritheus',
+ 'coaelhom.o3': 'Ælfric, Supplemental Homilies',
+ 'coaelive.o3': 'Ælfric\'s Lives of Saints',
+ 'coalcuin': 'Alcuin De virtutibus et vitiis',
+ 'coalex.o23': 'Alexander\'s Letter to Aristotle',
+ 'coapollo.o3': 'Apollonius of Tyre',
+ 'coaugust': 'Augustine',
+ 'cobede.o2': 'Bede\'s History of the English Church',
+ 'cobenrul.o3': 'Benedictine Rule',
+ 'coblick.o23': 'Blickling Homilies',
+ 'coboeth.o2': 'Boethius\' Consolation of Philosophy',
+ 'cobyrhtf.o3': 'Byrhtferth\'s Manual',
+ 'cocanedgD': 'Canons of Edgar (D)',
+ 'cocanedgX': 'Canons of Edgar (X)',
+ 'cocathom1.o3': 'Ælfric\'s Catholic Homilies I',
+ 'cocathom2.o3': 'Ælfric\'s Catholic Homilies II',
+ 'cochad.o24': 'Saint Chad',
+ 'cochdrul': 'Chrodegang of Metz, Rule',
+ 'cochristoph': 'Saint Christopher',
+ 'cochronA.o23': 'Anglo-Saxon Chronicle A',
+ 'cochronC': 'Anglo-Saxon Chronicle C',
+ 'cochronD': 'Anglo-Saxon Chronicle D',
+ 'cochronE.o34': 'Anglo-Saxon Chronicle E',
+ 'cocura.o2': 'Cura Pastoralis',
+ 'cocuraC': 'Cura Pastoralis (Cotton)',
+ 'codicts.o34': 'Dicts of Cato',
+ 'codocu1.o1': 'Documents 1 (O1)',
+ 'codocu2.o12': 'Documents 2 (O1/O2)',
+ 'codocu2.o2': 'Documents 2 (O2)',
+ 'codocu3.o23': 'Documents 3 (O2/O3)',
+ 'codocu3.o3': 'Documents 3 (O3)',
+ 'codocu4.o24': 'Documents 4 (O2/O4)',
+ 'coeluc1': 'Honorius of Autun, Elucidarium 1',
+ 'coeluc2': 'Honorius of Autun, Elucidarium 1',
+ 'coepigen.o3': 'Ælfric\'s Epilogue to Genesis',
+ 'coeuphr': 'Saint Euphrosyne',
+ 'coeust': 'Saint Eustace and his companions',
+ 'coexodusP': 'Exodus (P)',
+ 'cogenesiC': 'Genesis (C)',
+ 'cogregdC.o24': 'Gregory\'s Dialogues (C)',
+ 'cogregdH.o23': 'Gregory\'s Dialogues (H)',
+ 'coherbar': 'Pseudo-Apuleius, Herbarium',
+ 'coinspolD.o34': 'Wulfstan\'s Institute of Polity (D)',
+ 'coinspolX': 'Wulfstan\'s Institute of Polity (X)',
+ 'cojames': 'Saint James',
+ 'colacnu.o23': 'Lacnunga',
+ 'colaece.o2': 'Leechdoms',
+ 'colaw1cn.o3': 'Laws, Cnut I',
+ 'colaw2cn.o3': 'Laws, Cnut II',
+ 'colaw5atr.o3': 'Laws, Æthelred V',
+ 'colaw6atr.o3': 'Laws, Æthelred VI',
+ 'colawaf.o2': 'Laws, Alfred',
+ 'colawafint.o2': 'Alfred\'s Introduction to Laws',
+ 'colawger.o34': 'Laws, Gerefa',
+ 'colawine.ox2': 'Laws, Ine',
+ 'colawnorthu.o3': 'Northumbra Preosta Lagu',
+ 'colawwllad.o4': 'Laws, William I, Lad',
+ 'coleofri.o4': 'Leofric',
+ 'colsigef.o3': 'Ælfric\'s Letter to Sigefyrth',
+ 'colsigewB': 'Ælfric\'s Letter to Sigeweard (B)',
+ 'colsigewZ.o34': 'Ælfric\'s Letter to Sigeweard (Z)',
+ 'colwgeat': 'Ælfric\'s Letter to Wulfgeat',
+ 'colwsigeT': 'Ælfric\'s Letter to Wulfsige (T)',
+ 'colwsigeXa.o34': 'Ælfric\'s Letter to Wulfsige (Xa)',
+ 'colwstan1.o3': 'Ælfric\'s Letter to Wulfstan I',
+ 'colwstan2.o3': 'Ælfric\'s Letter to Wulfstan II',
+ 'comargaC.o34': 'Saint Margaret (C)',
+ 'comargaT': 'Saint Margaret (T)',
+ 'comart1': 'Martyrology, I',
+ 'comart2': 'Martyrology, II',
+ 'comart3.o23': 'Martyrology, III',
+ 'comarvel.o23': 'Marvels of the East',
+ 'comary': 'Mary of Egypt',
+ 'coneot': 'Saint Neot',
+ 'conicodA': 'Gospel of Nicodemus (A)',
+ 'conicodC': 'Gospel of Nicodemus (C)',
+ 'conicodD': 'Gospel of Nicodemus (D)',
+ 'conicodE': 'Gospel of Nicodemus (E)',
+ 'coorosiu.o2': 'Orosius',
+ 'cootest.o3': 'Heptateuch',
+ 'coprefcath1.o3': 'Ælfric\'s Preface to Catholic Homilies I',
+ 'coprefcath2.o3': 'Ælfric\'s Preface to Catholic Homilies II',
+ 'coprefcura.o2': 'Preface to the Cura Pastoralis',
+ 'coprefgen.o3': 'Ælfric\'s Preface to Genesis',
+ 'copreflives.o3': 'Ælfric\'s Preface to Lives of Saints',
+ 'coprefsolilo': 'Preface to Augustine\'s Soliloquies',
+ 'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus',
+ 'corood': 'History of the Holy Rood-Tree',
+ 'cosevensl': 'Seven Sleepers',
+ 'cosolilo': 'St. Augustine\'s Soliloquies',
+ 'cosolsat1.o4': 'Solomon and Saturn I',
+ 'cosolsat2': 'Solomon and Saturn II',
+ 'cotempo.o3': 'Ælfric\'s De Temporibus Anni',
+ 'coverhom': 'Vercelli Homilies',
+ 'coverhomE': 'Vercelli Homilies (E)',
+ 'coverhomL': 'Vercelli Homilies (L)',
+ 'covinceB': 'Saint Vincent (Bodley 343)',
+ 'covinsal': 'Vindicta Salvatoris',
+ 'cowsgosp.o3': 'West-Saxon Gospels',
+ 'cowulf.o34': 'Wulfstan\'s Homilies',
}
# Natural Language Toolkit: Corpus Reader Utility Functions
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# { Lazy Corpus Loader
######################################################################
+from __future__ import unicode_literals
import re
import gc
import nltk
+from nltk.compat import python_2_unicode_compatible
TRY_ZIPFILE_FIRST = False
+@python_2_unicode_compatible
class LazyCorpusLoader(object):
"""
To see the API documentation for this lazily loaded corpus, first
self.__name = self.__name__ = name
self.__reader_cls = reader_cls
# If nltk_data_subdir is set explicitly
- if "nltk_data_subdir" in kwargs:
+ if 'nltk_data_subdir' in kwargs:
# Use the specified subdirectory path
- self.subdir = kwargs["nltk_data_subdir"]
+ self.subdir = kwargs['nltk_data_subdir']
# Pop the `nltk_data_subdir` argument; we don't need it anymore.
- kwargs.pop("nltk_data_subdir", None)
+ kwargs.pop('nltk_data_subdir', None)
else: # Otherwise use 'nltk_data/corpora'
- self.subdir = "corpora"
+ self.subdir = 'corpora'
self.__args = args
self.__kwargs = kwargs
def __load(self):
# Find the corpus root directory.
- zip_name = re.sub(r"(([^/]+)(/.*)?)", r"\2.zip/\1/", self.__name)
+ zip_name = re.sub(r'(([^/]+)(/.*)?)', r'\2.zip/\1/', self.__name)
if TRY_ZIPFILE_FIRST:
try:
- root = nltk.data.find("{}/{}".format(self.subdir, zip_name))
+ root = nltk.data.find('{}/{}'.format(self.subdir, zip_name))
except LookupError as e:
try:
- root = nltk.data.find("{}/{}".format(self.subdir, self.__name))
+ root = nltk.data.find('{}/{}'.format(self.subdir, self.__name))
except LookupError:
raise e
else:
try:
- root = nltk.data.find("{}/{}".format(self.subdir, self.__name))
+ root = nltk.data.find('{}/{}'.format(self.subdir, self.__name))
except LookupError as e:
try:
- root = nltk.data.find("{}/{}".format(self.subdir, zip_name))
+ root = nltk.data.find('{}/{}'.format(self.subdir, zip_name))
except LookupError:
raise e
# (see http://bugs.python.org/issue1225107).
# Without this fix tests may take extra 1.5GB RAM
# because all corpora gets loaded during test collection.
- if attr == "__bases__":
+ if attr == '__bases__':
raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'")
self.__load()
return getattr(self, attr)
def __repr__(self):
- return "<%s in %r (not loaded yet)>" % (
+ return '<%s in %r (not loaded yet)>' % (
self.__reader_cls.__name__,
- ".../corpora/" + self.__name,
+ '.../corpora/' + self.__name,
)
def _unload(self):
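# Illustrative sketch (not part of the patch), mirroring how nltk.corpus
# declares its corpora: the reader below is only constructed, and nltk_data
# only searched, on first attribute access. The 'gutenberg' package must be
# installed for the final call to succeed.
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import PlaintextCorpusReader

gutenberg = LazyCorpusLoader(
    'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1'
)
print(gutenberg.fileids())  # triggers __load() and swaps in the real reader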
# Natural Language Toolkit: Utility functions
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
adds it to a resource cache; and ``retrieve()`` copies a given resource
to a local file.
"""
+from __future__ import print_function, unicode_literals, division
import functools
import textwrap
import io
-from io import BytesIO
import os
import re
import sys
import zipfile
import codecs
-import pickle
from abc import ABCMeta, abstractmethod
from gzip import GzipFile, WRITE as GZ_WRITE
-from urllib.request import urlopen, url2pathname
+from six import add_metaclass
+from six import string_types, text_type
+from six.moves.urllib.request import urlopen, url2pathname
+
+try:
+ import cPickle as pickle
+except ImportError:
+ import pickle
+
+try: # Python 3.
+ textwrap_indent = functools.partial(textwrap.indent, prefix=' ')
+except AttributeError: # Python 2; indent() not available for Python2.
+ textwrap_fill = functools.partial(
+ textwrap.fill,
+ initial_indent=' ',
+ subsequent_indent=' ',
+ replace_whitespace=False,
+ )
+
+ def textwrap_indent(text):
+ return '\n'.join(textwrap_fill(line) for line in text.splitlines())
+
try:
from zlib import Z_SYNC_FLUSH as FLUSH
# this import should be more specific:
import nltk
-from nltk.compat import py3_data, add_py3_data
-from nltk.internals import deprecated
-
-textwrap_indent = functools.partial(textwrap.indent, prefix=" ")
+from nltk.compat import py3_data, add_py3_data, BytesIO
######################################################################
# Search Path
(e.g., in their home directory under ~/nltk_data)."""
# User-specified locations:
-_paths_from_env = os.environ.get("NLTK_DATA", str("")).split(os.pathsep)
+_paths_from_env = os.environ.get('NLTK_DATA', str('')).split(os.pathsep)
path += [d for d in _paths_from_env if d]
-if "APPENGINE_RUNTIME" not in os.environ and os.path.expanduser("~/") != "~/":
- path.append(os.path.expanduser(str("~/nltk_data")))
+if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
+ path.append(os.path.expanduser(str('~/nltk_data')))
-if sys.platform.startswith("win"):
+if sys.platform.startswith('win'):
# Common locations on Windows:
path += [
- os.path.join(sys.prefix, str("nltk_data")),
- os.path.join(sys.prefix, str("share"), str("nltk_data")),
- os.path.join(sys.prefix, str("lib"), str("nltk_data")),
- os.path.join(os.environ.get(str("APPDATA"), str("C:\\")), str("nltk_data")),
- str(r"C:\nltk_data"),
- str(r"D:\nltk_data"),
- str(r"E:\nltk_data"),
+ os.path.join(sys.prefix, str('nltk_data')),
+ os.path.join(sys.prefix, str('share'), str('nltk_data')),
+ os.path.join(sys.prefix, str('lib'), str('nltk_data')),
+ os.path.join(os.environ.get(str('APPDATA'), str('C:\\')), str('nltk_data')),
+ str(r'C:\nltk_data'),
+ str(r'D:\nltk_data'),
+ str(r'E:\nltk_data'),
]
else:
# Common locations on UNIX & OS X:
path += [
- os.path.join(sys.prefix, str("nltk_data")),
- os.path.join(sys.prefix, str("share"), str("nltk_data")),
- os.path.join(sys.prefix, str("lib"), str("nltk_data")),
- str("/usr/share/nltk_data"),
- str("/usr/local/share/nltk_data"),
- str("/usr/lib/nltk_data"),
- str("/usr/local/lib/nltk_data"),
+ os.path.join(sys.prefix, str('nltk_data')),
+ os.path.join(sys.prefix, str('share'), str('nltk_data')),
+ os.path.join(sys.prefix, str('lib'), str('nltk_data')),
+ str('/usr/share/nltk_data'),
+ str('/usr/local/share/nltk_data'),
+ str('/usr/lib/nltk_data'),
+ str('/usr/local/lib/nltk_data'),
]
filename,
mode="rb",
compresslevel=9,
- encoding="utf-8",
+ encoding='utf-8',
fileobj=None,
errors=None,
newline=None,
>>> split_resource_url('file:///C:/home/nltk')
('file', '/C:/home/nltk')
"""
- protocol, path_ = resource_url.split(":", 1)
- if protocol == "nltk":
+ protocol, path_ = resource_url.split(':', 1)
+ if protocol == 'nltk':
pass
- elif protocol == "file":
- if path_.startswith("/"):
- path_ = "/" + path_.lstrip("/")
+ elif protocol == 'file':
+ if path_.startswith('/'):
+ path_ = '/' + path_.lstrip('/')
else:
- path_ = re.sub(r"^/{0,2}", "", path_)
+ path_ = re.sub(r'^/{0,2}', '', path_)
return protocol, path_
protocol, name = split_resource_url(resource_url)
except ValueError:
# the resource url has no protocol, use the nltk protocol by default
- protocol = "nltk"
+ protocol = 'nltk'
name = resource_url
# use file protocol if the path is an absolute path
- if protocol == "nltk" and os.path.isabs(name):
- protocol = "file://"
+ if protocol == 'nltk' and os.path.isabs(name):
+ protocol = 'file://'
name = normalize_resource_name(name, False, None)
- elif protocol == "file":
- protocol = "file://"
+ elif protocol == 'file':
+ protocol = 'file://'
# name is absolute
name = normalize_resource_name(name, False, None)
- elif protocol == "nltk":
- protocol = "nltk:"
+ elif protocol == 'nltk':
+ protocol = 'nltk:'
name = normalize_resource_name(name, True)
else:
# handled by urllib
- protocol += "://"
- return "".join([protocol, name])
+ protocol += '://'
+ return ''.join([protocol, name])
def normalize_resource_name(resource_name, allow_relative=True, relative_path=None):
>>> windows or normalize_resource_name('/dir/file', True, '/') == '/dir/file'
True
"""
- is_dir = bool(re.search(r"[\\/.]$", resource_name)) or resource_name.endswith(
+ is_dir = bool(re.search(r'[\\/.]$', resource_name)) or resource_name.endswith(
os.path.sep
)
- if sys.platform.startswith("win"):
- resource_name = resource_name.lstrip("/")
+ if sys.platform.startswith('win'):
+ resource_name = resource_name.lstrip('/')
else:
- resource_name = re.sub(r"^/+", "/", resource_name)
+ resource_name = re.sub(r'^/+', '/', resource_name)
if allow_relative:
resource_name = os.path.normpath(resource_name)
else:
if relative_path is None:
relative_path = os.curdir
resource_name = os.path.abspath(os.path.join(relative_path, resource_name))
- resource_name = resource_name.replace("\\", "/").replace(os.path.sep, "/")
- if sys.platform.startswith("win") and os.path.isabs(resource_name):
- resource_name = "/" + resource_name
- if is_dir and not resource_name.endswith("/"):
- resource_name += "/"
+ resource_name = resource_name.replace('\\', '/').replace(os.path.sep, '/')
+ if sys.platform.startswith('win') and os.path.isabs(resource_name):
+ resource_name = '/' + resource_name
+ if is_dir and not resource_name.endswith('/'):
+ resource_name += '/'
return resource_name
######################################################################
-class PathPointer(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class PathPointer(object):
"""
An abstract base class for 'path pointers,' used by NLTK's data
package to identify specific paths. Two subclasses exist:
"""
-class FileSystemPathPointer(PathPointer, str):
+class FileSystemPathPointer(PathPointer, text_type):
"""
A path pointer that identifies a file which can be accessed
directly via a given absolute path.
_path = os.path.abspath(_path)
if not os.path.exists(_path):
- raise IOError("No such file or directory: %r" % _path)
+ raise IOError('No such file or directory: %r' % _path)
self._path = _path
# There's no need to call str.__init__(), since it's a no-op;
return self._path
def open(self, encoding=None):
- stream = open(self._path, "rb")
+ stream = open(self._path, 'rb')
if encoding is not None:
stream = SeekableUnicodeStreamReader(stream, encoding)
return stream
return FileSystemPathPointer(_path)
def __repr__(self):
- return "FileSystemPathPointer(%r)" % self._path
+ # This should be a byte string under Python 2.x;
+ # we don't want transliteration here so
+ # @python_2_unicode_compatible is not used.
+ return str('FileSystemPathPointer(%r)' % self._path)
def __str__(self):
return self._path
-@deprecated("Use gzip.GzipFile instead as it also uses a buffer.")
+
class BufferedGzipFile(GzipFile):
- """A ``GzipFile`` subclass for compatibility with older nltk releases.
+ """
+ A ``GzipFile`` subclass that buffers calls to ``read()`` and ``write()``.
+ This allows faster reads and writes of data to and from gzip-compressed
+ files at the cost of using more memory.
+
+ The default buffer size is 2MB.
- Use ``GzipFile`` directly as it also buffers in all supported
- Python versions.
+ ``BufferedGzipFile`` is useful for loading large gzipped pickle objects
+ as well as writing large encoded feature files for classifier training.
"""
+ MB = 2 ** 20
+ SIZE = 2 * MB
+
@py3_data
def __init__(
self, filename=None, mode=None, compresslevel=9, fileobj=None, **kwargs
):
- """Return a buffered gzip file object."""
+ """
+ Return a buffered gzip file object.
+
+ :param filename: a filesystem path
+ :type filename: str
+ :param mode: a file mode which can be any of 'r', 'rb', 'a', 'ab',
+ 'w', or 'wb'
+ :type mode: str
+ :param compresslevel: The compresslevel argument is an integer from 1
+ to 9 controlling the level of compression; 1 is fastest and
+ produces the least compression, and 9 is slowest and produces the
+ most compression. The default is 9.
+ :type compresslevel: int
+ :param fileobj: a BytesIO stream to read from instead of a file.
+ :type fileobj: BytesIO
+ :param size: number of bytes to buffer during calls to read() and write()
+ :type size: int
+ :rtype: BufferedGzipFile
+ """
GzipFile.__init__(self, filename, mode, compresslevel, fileobj)
+ self._size = kwargs.get('size', self.SIZE)
+ self._nltk_buffer = BytesIO()
+ # cStringIO does not support len.
+ self._len = 0
+
+ def _reset_buffer(self):
+ # For some reason calling BytesIO.truncate() here will lead to
+ # inconsistent writes so just set _buffer to a new BytesIO object.
+ self._nltk_buffer = BytesIO()
+ self._len = 0
+
+ def _write_buffer(self, data):
+ # Simply write to the buffer and increment the buffer size.
+ if data is not None:
+ self._nltk_buffer.write(data)
+ self._len += len(data)
+
+ def _write_gzip(self, data):
+ # Write the current buffer to the GzipFile.
+ GzipFile.write(self, self._nltk_buffer.getvalue())
+ # Then reset the buffer and write the new data to the buffer.
+ self._reset_buffer()
+ self._write_buffer(data)
- def write(self, data):
- # This is identical to GzipFile.write but does not return
- # the bytes written to retain compatibility.
- super().write(data)
+ def close(self):
+ # GzipFile.close() doesn't actually close anything.
+ if self.mode == GZ_WRITE:
+ self._write_gzip(None)
+ self._reset_buffer()
+ return GzipFile.close(self)
+
+ def flush(self, lib_mode=FLUSH):
+ self._nltk_buffer.flush()
+ GzipFile.flush(self, lib_mode)
+
+ def read(self, size=None):
+ if not size:
+ size = self._size
+ contents = BytesIO()
+ while True:
+ blocks = GzipFile.read(self, size)
+ if not blocks:
+ contents.flush()
+ break
+ contents.write(blocks)
+ return contents.getvalue()
+ else:
+ return GzipFile.read(self, size)
+
+ def write(self, data, size=-1):
+ """
+ :param data: bytes to write to file or buffer
+ :type data: bytes
+ :param size: buffer at least size bytes before writing to file
+ :type size: int
+ """
+ if not size:
+ size = self._size
+ if self._len + len(data) <= size:
+ self._write_buffer(data)
+ else:
+ self._write_gzip(data)
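# Minimal usage sketch (illustrative, not part of the patch): read a gzipped
# pickle through the buffered file object documented above. The path
# 'model.pickle.gz' is an assumption; any gzip-compressed pickle will do.
import pickle
from nltk.data import BufferedGzipFile

with BufferedGzipFile('model.pickle.gz', mode='rb') as stream:
    model = pickle.load(stream)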
class GzipFileSystemPathPointer(FileSystemPathPointer):
"""
def open(self, encoding=None):
- stream = GzipFile(self._path, "rb")
+ # Note: In >= Python3.5, GzipFile is already using a
+ # buffered reader in the backend which has a variable self._buffer
+ # See https://github.com/nltk/nltk/issues/1308
+ if sys.version.startswith('2.7') or sys.version.startswith('3.4'):
+ stream = BufferedGzipFile(self._path, 'rb')
+ else:
+ stream = GzipFile(self._path, 'rb')
if encoding:
stream = SeekableUnicodeStreamReader(stream, encoding)
return stream
"""
@py3_data
- def __init__(self, zipfile, entry=""):
+ def __init__(self, zipfile, entry=''):
"""
Create a new path pointer pointing at the specified entry
in the given zipfile.
:raise IOError: If the given zipfile does not exist, or if it
does not contain the specified entry.
"""
- if isinstance(zipfile, str):
+ if isinstance(zipfile, string_types):
zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile))
# Check that the entry exists:
if entry:
# Normalize the entry string, it should be relative:
- entry = normalize_resource_name(entry, True, "/").lstrip("/")
+ entry = normalize_resource_name(entry, True, '/').lstrip('/')
try:
zipfile.getinfo(entry)
# the zip file. So if `entry` is a directory name,
# then check if the zipfile contains any files that
# are under the given directory.
- if entry.endswith("/") and [
+ if entry.endswith('/') and [
n for n in zipfile.namelist() if n.startswith(entry)
]:
pass # zipfile contains a file in that directory.
else:
# Otherwise, complain.
raise IOError(
- "Zipfile %r does not contain %r" % (zipfile.filename, entry)
+ 'Zipfile %r does not contain %r' % (zipfile.filename, entry)
)
self._zipfile = zipfile
self._entry = entry
def open(self, encoding=None):
data = self._zipfile.read(self._entry)
stream = BytesIO(data)
- if self._entry.endswith(".gz"):
- stream = GzipFile(self._entry, fileobj=stream)
+ if self._entry.endswith('.gz'):
+ # Note: In >= Python3.5, GzipFile is already using a
+ # buffered reader in the backend which has a variable self._buffer
+ # See https://github.com/nltk/nltk/issues/1308
+ if sys.version.startswith('2.7') or sys.version.startswith('3.4'):
+ stream = BufferedGzipFile(self._entry, fileobj=stream)
+ else:
+ stream = GzipFile(self._entry, fileobj=stream)
elif encoding is not None:
stream = SeekableUnicodeStreamReader(stream, encoding)
return stream
return self._zipfile.getinfo(self._entry).file_size
def join(self, fileid):
- entry = "%s/%s" % (self._entry, fileid)
+ entry = '%s/%s' % (self._entry, fileid)
return ZipFilePathPointer(self._zipfile, entry)
def __repr__(self):
- return str("ZipFilePathPointer(%r, %r)") % (self._zipfile.filename, self._entry)
+ return str('ZipFilePathPointer(%r, %r)') % (self._zipfile.filename, self._entry)
def __str__(self):
return os.path.normpath(os.path.join(self._zipfile.filename, self._entry))
paths = path
# Check if the resource name includes a zipfile name
- m = re.match(r"(.*\.zip)/?(.*)$|", resource_name)
+ m = re.match(r'(.*\.zip)/?(.*)$|', resource_name)
zipfile, zipentry = m.groups()
# Check each item in our path
for path_ in paths:
# Is the path item a zipfile?
- if path_ and (os.path.isfile(path_) and path_.endswith(".zip")):
+ if path_ and (os.path.isfile(path_) and path_.endswith('.zip')):
try:
return ZipFilePathPointer(path_, resource_name)
except IOError:
if zipfile is None:
p = os.path.join(path_, url2pathname(resource_name))
if os.path.exists(p):
- if p.endswith(".gz"):
+ if p.endswith('.gz'):
return GzipFileSystemPathPointer(p)
else:
return FileSystemPathPointer(p)
# again, assuming that one of the path components is inside a
# zipfile of the same name.
if zipfile is None:
- pieces = resource_name.split("/")
+ pieces = resource_name.split('/')
for i in range(len(pieces)):
- modified_name = "/".join(pieces[:i] + [pieces[i] + ".zip"] + pieces[i:])
+ modified_name = '/'.join(pieces[:i] + [pieces[i] + '.zip'] + pieces[i:])
try:
return find(modified_name, paths)
except LookupError:
pass
# Identify the package (i.e. the .zip file) to download.
- resource_zipname = resource_name.split("/")[1]
- if resource_zipname.endswith(".zip"):
- resource_zipname = resource_zipname.rpartition(".")[0]
+ resource_zipname = resource_name.split('/')[1]
+ if resource_zipname.endswith('.zip'):
+ resource_zipname = resource_zipname.rpartition('.')[0]
# Display a friendly error message if the resource wasn't found:
msg = str(
"Resource \33[93m{resource}\033[0m not found.\n"
"Please use the NLTK Downloader to obtain the resource:\n\n"
"\33[31m" # To display red text in terminal.
">>> import nltk\n"
- ">>> nltk.download('{resource}')\n"
+ ">>> nltk.download(\'{resource}\')\n"
"\033[0m"
).format(resource=resource_zipname)
msg = textwrap_indent(msg)
- msg += "\n For more information see: https://www.nltk.org/data.html\n"
-
- msg += "\n Attempted to load \33[93m{resource_name}\033[0m\n".format(
+ msg += '\n Attempted to load \33[93m{resource_name}\033[0m\n'.format(
resource_name=resource_name
)
- msg += "\n Searched in:" + "".join("\n - %r" % d for d in paths)
- sep = "*" * 70
- resource_not_found = "\n%s\n%s\n%s\n" % (sep, msg, sep)
+ msg += '\n Searched in:' + ''.join('\n - %r' % d for d in paths)
+ sep = '*' * 70
+ resource_not_found = '\n%s\n%s\n%s\n' % (sep, msg, sep)
raise LookupError(resource_not_found)
"""
resource_url = normalize_resource_url(resource_url)
if filename is None:
- if resource_url.startswith("file:"):
+ if resource_url.startswith('file:'):
filename = os.path.split(resource_url)[-1]
else:
- filename = re.sub(r"(^\w+:)?.*/", "", resource_url)
+ filename = re.sub(r'(^\w+:)?.*/', '', resource_url)
if os.path.exists(filename):
filename = os.path.abspath(filename)
raise ValueError("File %r already exists!" % filename)
if verbose:
- print("Retrieving %r, saving to %r" % (resource_url, filename))
+ print('Retrieving %r, saving to %r' % (resource_url, filename))
# Open the input & output streams.
infile = _open(resource_url)
#: load() method. Keys are format names, and values are format
#: descriptions.
FORMATS = {
- "pickle": "A serialized python object, stored using the pickle module.",
- "json": "A serialized python object, stored using the json module.",
- "yaml": "A serialized python object, stored using the yaml module.",
- "cfg": "A context free grammar.",
- "pcfg": "A probabilistic CFG.",
- "fcfg": "A feature CFG.",
- "fol": "A list of first order logic expressions, parsed with "
+ 'pickle': "A serialized python object, stored using the pickle module.",
+ 'json': "A serialized python object, stored using the json module.",
+ 'yaml': "A serialized python object, stored using the yaml module.",
+ 'cfg': "A context free grammar.",
+ 'pcfg': "A probabilistic CFG.",
+ 'fcfg': "A feature CFG.",
+ 'fol': "A list of first order logic expressions, parsed with "
"nltk.sem.logic.Expression.fromstring.",
- "logic": "A list of first order logic expressions, parsed with "
+ 'logic': "A list of first order logic expressions, parsed with "
"nltk.sem.logic.LogicParser. Requires an additional logic_parser "
"parameter",
- "val": "A semantic valuation, parsed by nltk.sem.Valuation.fromstring.",
- "raw": "The raw (byte string) contents of a file.",
- "text": "The raw (unicode string) contents of a file. ",
+ 'val': "A semantic valuation, parsed by nltk.sem.Valuation.fromstring.",
+ 'raw': "The raw (byte string) contents of a file.",
+ 'text': "The raw (unicode string) contents of a file. ",
}
#: A dictionary mapping from file extensions to format names, used
#: by load() when format="auto" to decide the format for a
#: given resource url.
AUTO_FORMATS = {
- "pickle": "pickle",
- "json": "json",
- "yaml": "yaml",
- "cfg": "cfg",
- "pcfg": "pcfg",
- "fcfg": "fcfg",
- "fol": "fol",
- "logic": "logic",
- "val": "val",
- "txt": "text",
- "text": "text",
+ 'pickle': 'pickle',
+ 'json': 'json',
+ 'yaml': 'yaml',
+ 'cfg': 'cfg',
+ 'pcfg': 'pcfg',
+ 'fcfg': 'fcfg',
+ 'fol': 'fol',
+ 'logic': 'logic',
+ 'val': 'val',
+ 'txt': 'text',
+ 'text': 'text',
}
def load(
resource_url,
- format="auto",
+ format='auto',
cache=True,
verbose=False,
logic_parser=None,
:type cache: bool
:param cache: If true, add this resource to a cache. If load()
finds a resource in its cache, then it will return it from the
- cache rather than loading it.
+ cache rather than loading it. The cache uses weak references,
+ so a resource will automatically be expunged from the cache
+ when no more objects are using it.
:type verbose: bool
:param verbose: If true, print a message when loading a resource.
Messages are not displayed when a resource is retrieved from
resource_url = add_py3_data(resource_url)
# Determine the format of the resource.
- if format == "auto":
- resource_url_parts = resource_url.split(".")
+ if format == 'auto':
+ resource_url_parts = resource_url.split('.')
ext = resource_url_parts[-1]
- if ext == "gz":
+ if ext == 'gz':
ext = resource_url_parts[-2]
format = AUTO_FORMATS.get(ext)
if format is None:
raise ValueError(
- "Could not determine format for %s based "
+ 'Could not determine format for %s based '
'on its file\nextension; use the "format" '
- "argument to specify the format explicitly." % resource_url
+ 'argument to specify the format explicitly.' % resource_url
)
if format not in FORMATS:
- raise ValueError("Unknown format type: %s!" % (format,))
+ raise ValueError('Unknown format type: %s!' % (format,))
# If we've cached the resource, then just return it.
if cache:
resource_val = _resource_cache.get((resource_url, format))
if resource_val is not None:
if verbose:
- print("<<Using cached copy of %s>>" % (resource_url,))
+ print('<<Using cached copy of %s>>' % (resource_url,))
return resource_val
# Let the user know what's going on.
if verbose:
- print("<<Loading %s>>" % (resource_url,))
+ print('<<Loading %s>>' % (resource_url,))
# Load the resource.
opened_resource = _open(resource_url)
- if format == "raw":
+ if format == 'raw':
resource_val = opened_resource.read()
- elif format == "pickle":
+ elif format == 'pickle':
resource_val = pickle.load(opened_resource)
- elif format == "json":
+ elif format == 'json':
import json
from nltk.jsontags import json_tags
if len(resource_val) != 1:
tag = next(resource_val.keys())
if tag not in json_tags:
- raise ValueError("Unknown json tag.")
- elif format == "yaml":
+ raise ValueError('Unknown json tag.')
+ elif format == 'yaml':
import yaml
- resource_val = yaml.safe_load(opened_resource)
+ resource_val = yaml.load(opened_resource)
else:
# The resource is a text format.
binary_data = opened_resource.read()
string_data = binary_data.decode(encoding)
else:
try:
- string_data = binary_data.decode("utf-8")
+ string_data = binary_data.decode('utf-8')
except UnicodeDecodeError:
- string_data = binary_data.decode("latin-1")
- if format == "text":
+ string_data = binary_data.decode('latin-1')
+ if format == 'text':
resource_val = string_data
- elif format == "cfg":
+ elif format == 'cfg':
resource_val = nltk.grammar.CFG.fromstring(string_data, encoding=encoding)
- elif format == "pcfg":
+ elif format == 'pcfg':
resource_val = nltk.grammar.PCFG.fromstring(string_data, encoding=encoding)
- elif format == "fcfg":
+ elif format == 'fcfg':
resource_val = nltk.grammar.FeatureGrammar.fromstring(
string_data,
logic_parser=logic_parser,
fstruct_reader=fstruct_reader,
encoding=encoding,
)
- elif format == "fol":
+ elif format == 'fol':
resource_val = nltk.sem.read_logic(
string_data,
logic_parser=nltk.sem.logic.LogicParser(),
encoding=encoding,
)
- elif format == "logic":
+ elif format == 'logic':
resource_val = nltk.sem.read_logic(
string_data, logic_parser=logic_parser, encoding=encoding
)
- elif format == "val":
+ elif format == 'val':
resource_val = nltk.sem.read_valuation(string_data, encoding=encoding)
else:
raise AssertionError(
return resource_val
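# Illustrative sketch (assumes the 'punkt' and 'gutenberg' packages are
# installed in nltk_data): load() infers the format from the file extension,
# so a .pickle resource is unpickled, while format='text' returns a decoded
# unicode string.
import nltk.data

sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
emma = nltk.data.load('corpora/gutenberg/austen-emma.txt', format='text')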
-def show_cfg(resource_url, escape="##"):
+def show_cfg(resource_url, escape='##'):
"""
Write out a grammar file, ignoring escaped and empty lines.
:param escape: Prepended string that signals lines to be ignored
"""
resource_url = normalize_resource_url(resource_url)
- resource_val = load(resource_url, format="text", cache=False)
+ resource_val = load(resource_url, format='text', cache=False)
lines = resource_val.splitlines()
for l in lines:
if l.startswith(escape):
continue
- if re.match("^$", l):
+ if re.match('^$', l):
continue
print(l)
resource_url = normalize_resource_url(resource_url)
protocol, path_ = split_resource_url(resource_url)
- if protocol is None or protocol.lower() == "nltk":
- return find(path_, path + [""]).open()
- elif protocol.lower() == "file":
+ if protocol is None or protocol.lower() == 'nltk':
+ return find(path_, path + ['']).open()
+ elif protocol.lower() == 'file':
# urllib might not use mode='rb', so handle this one ourselves:
- return find(path_, [""]).open()
+ return find(path_, ['']).open()
else:
return urlopen(resource_url)
# Lazy Resource Loader
######################################################################
+# We shouldn't apply @python_2_unicode_compatible
+# decorator to LazyLoader, this is resource.__class__ responsibility.
+
class LazyLoader(object):
@py3_data
@py3_data
def __init__(self, filename):
- if not isinstance(filename, str):
- raise TypeError("ReopenableZipFile filename must be a string")
+ if not isinstance(filename, string_types):
+ raise TypeError('OpenOnDemandZipFile filename must be a string')
zipfile.ZipFile.__init__(self, filename)
assert self.filename == filename
self.close()
def read(self, name):
assert self.fp is None
- self.fp = open(self.filename, "rb")
+ self.fp = open(self.filename, 'rb')
value = zipfile.ZipFile.read(self, name)
# _fileRefCnt needs to be set for Python 2 and 3 compatible code.
# Since we only opened one file here, we add 1.
def write(self, *args, **kwargs):
""":raise NotImplementedError: OpenOnDemandZipfile is read-only"""
- raise NotImplementedError("OpenOnDemandZipfile is read-only")
+ raise NotImplementedError('OpenOnDemandZipfile is read-only')
def writestr(self, *args, **kwargs):
""":raise NotImplementedError: OpenOnDemandZipfile is read-only"""
- raise NotImplementedError("OpenOnDemandZipfile is read-only")
+ raise NotImplementedError('OpenOnDemandZipfile is read-only')
def __repr__(self):
- return repr(str("OpenOnDemandZipFile(%r)") % self.filename)
+ return repr(str('OpenOnDemandZipFile(%r)') % self.filename)
######################################################################
DEBUG = True # : If true, then perform extra sanity checks.
@py3_data
- def __init__(self, stream, encoding, errors="strict"):
+ def __init__(self, stream, encoding, errors='strict'):
# Rewind the stream to its beginning.
stream.seek(0)
"""The function that is used to decode byte strings into
unicode strings."""
- self.bytebuffer = b""
+ self.bytebuffer = b''
"""A buffer to use bytes that have been read but have not yet
been decoded. This is only used when the final bytes from
a read do not form a complete encoding for a character."""
# If linebuffer is not empty, then include it in the result
if self.linebuffer:
- chars = "".join(self.linebuffer) + chars
+ chars = ''.join(self.linebuffer) + chars
self.linebuffer = None
self._rewind_numchars = None
return line
readsize = size or 72
- chars = ""
+ chars = ''
# If there's a remaining incomplete line in the buffer, add it.
if self.linebuffer:
# If we're at a '\r', then read one extra character, since
# it might be a '\n', to get the proper line ending.
- if new_chars and new_chars.endswith("\r"):
+ if new_chars and new_chars.endswith('\r'):
new_chars += self._read(1)
chars += new_chars
"""
if whence == 1:
raise ValueError(
- "Relative seek is not supported for "
- "SeekableUnicodeStreamReader -- consider "
- "using char_seek_forward() instead."
+ 'Relative seek is not supported for '
+ 'SeekableUnicodeStreamReader -- consider '
+ 'using char_seek_forward() instead.'
)
self.stream.seek(offset, whence)
self.linebuffer = None
- self.bytebuffer = b""
+ self.bytebuffer = b''
self._rewind_numchars = None
self._rewind_checkpoint = self.stream.tell()
Move the read pointer forward by ``offset`` characters.
"""
if offset < 0:
- raise ValueError("Negative offsets are not supported")
+ raise ValueError('Negative offsets are not supported')
# Clear all buffers.
self.seek(self.tell())
# Perform the seek operation.
"""
if est_bytes is None:
est_bytes = offset
- bytes = b""
+ bytes = b''
while True:
# Read in a block of bytes.
if self.DEBUG:
self.stream.seek(filepos)
check1 = self._incr_decode(self.stream.read(50))[0]
- check2 = "".join(self.linebuffer)
+ check2 = ''.join(self.linebuffer)
assert check1.startswith(check2) or check2.startswith(check1)
# Return to our original filepos (so we don't have to throw
unicode string. ``linebuffer`` is not included in the result.
"""
if size == 0:
- return ""
+ return ''
# Skip past the byte order marker, if present.
if self._bom and self.stream.tell() == 0:
"""
while True:
try:
- return self.decode(bytes, "strict")
+ return self.decode(bytes, 'strict')
except UnicodeDecodeError as exc:
# If the exception occurs at the end of the string,
# then assume that it's a truncation error.
return self.decode(bytes[: exc.start], self.errors)
# Otherwise, if we're being strict, then raise it.
- elif self.errors == "strict":
+ elif self.errors == 'strict':
raise
# If we're not strict, then re-process it with our
return self.decode(bytes, self.errors)
_BOM_TABLE = {
- "utf8": [(codecs.BOM_UTF8, None)],
- "utf16": [(codecs.BOM_UTF16_LE, "utf16-le"), (codecs.BOM_UTF16_BE, "utf16-be")],
- "utf16le": [(codecs.BOM_UTF16_LE, None)],
- "utf16be": [(codecs.BOM_UTF16_BE, None)],
- "utf32": [(codecs.BOM_UTF32_LE, "utf32-le"), (codecs.BOM_UTF32_BE, "utf32-be")],
- "utf32le": [(codecs.BOM_UTF32_LE, None)],
- "utf32be": [(codecs.BOM_UTF32_BE, None)],
+ 'utf8': [(codecs.BOM_UTF8, None)],
+ 'utf16': [(codecs.BOM_UTF16_LE, 'utf16-le'), (codecs.BOM_UTF16_BE, 'utf16-be')],
+ 'utf16le': [(codecs.BOM_UTF16_LE, None)],
+ 'utf16be': [(codecs.BOM_UTF16_BE, None)],
+ 'utf32': [(codecs.BOM_UTF32_LE, 'utf32-le'), (codecs.BOM_UTF32_BE, 'utf32-be')],
+ 'utf32le': [(codecs.BOM_UTF32_LE, None)],
+ 'utf32be': [(codecs.BOM_UTF32_BE, None)],
}
def _check_bom(self):
# Normalize our encoding name
- enc = re.sub("[ -]", "", self.encoding.lower())
+ enc = re.sub('[ -]', '', self.encoding.lower())
# Look up our encoding in the BOM table.
bom_info = self._BOM_TABLE.get(enc)
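# Illustrative sketch (the filename is an assumption): wrap a byte stream so
# it can be read as unicode while tell()/seek() keep working in byte offsets.
from nltk.data import SeekableUnicodeStreamReader

with open('corpus.txt', 'rb') as f:
    reader = SeekableUnicodeStreamReader(f, 'utf-8')
    first_line = reader.readline()
    pos = reader.tell()   # a byte offset that is safe to pass back to seek()
    reader.seek(pos)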
__all__ = [
- "path",
- "PathPointer",
- "FileSystemPathPointer",
- "BufferedGzipFile",
- "GzipFileSystemPathPointer",
- "GzipFileSystemPathPointer",
- "find",
- "retrieve",
- "FORMATS",
- "AUTO_FORMATS",
- "load",
- "show_cfg",
- "clear_cache",
- "LazyLoader",
- "OpenOnDemandZipFile",
- "GzipFileSystemPathPointer",
- "SeekableUnicodeStreamReader",
+ 'path',
+ 'PathPointer',
+ 'FileSystemPathPointer',
+ 'BufferedGzipFile',
+ 'GzipFileSystemPathPointer',
+ 'GzipFileSystemPathPointer',
+ 'find',
+ 'retrieve',
+ 'FORMATS',
+ 'AUTO_FORMATS',
+ 'load',
+ 'show_cfg',
+ 'clear_cache',
+ 'LazyLoader',
+ 'OpenOnDemandZipFile',
+ 'GzipFileSystemPathPointer',
+ 'SeekableUnicodeStreamReader',
]
Included in NLTK for its support of a nice memoization decorator.
"""
+from __future__ import print_function
-__docformat__ = "restructuredtext en"
+__docformat__ = 'restructuredtext en'
## The basic trick is to generate the source code for the decorated function
## with the right signature and to evaluate it.
# Hack to keep NLTK's "tokenize" module from colliding with the "tokenize" in
# the Python standard library.
-OLD_SYS_PATH = sys.path[:]
+old_sys_path = sys.path[:]
sys.path = [p for p in sys.path if p and "nltk" not in p]
import inspect
-sys.path = OLD_SYS_PATH
-
-def __legacysignature(signature):
- """
- For retrocompatibility reasons, we don't use a standard Signature.
- Instead, we use the string generated by this method.
- Basically, from a Signature we create a string and remove the default values.
- """
- listsignature = str(signature)[1:-1].split(",")
- for counter, param in enumerate(listsignature):
- if param.count("=") > 0:
- listsignature[counter] = param[0:param.index("=")].strip()
- else:
- listsignature[counter] = param.strip()
- return ", ".join(listsignature)
+sys.path = old_sys_path
def getinfo(func):
- argnames (the names of the arguments : list)
- defaults (the values of the default arguments : tuple)
- signature (the signature : str)
- - fullsignature (the full signature : Signature)
- doc (the docstring : str)
- module (the module name : str)
- dict (the function __dict__ : str)
>>> info["signature"]
'self, x, y, *args, **kw'
-
- >>> info["fullsignature"]
- <Signature (self, x=1, y=2, *args, **kw)>
"""
assert inspect.ismethod(func) or inspect.isfunction(func)
- argspec = inspect.getfullargspec(func)
- regargs, varargs, varkwargs = argspec[:3]
+ if sys.version_info[0] >= 3:
+ argspec = inspect.getfullargspec(func)
+ else:
+ argspec = inspect.getargspec(func)
+ regargs, varargs, varkwargs, defaults = argspec[:4]
argnames = list(regargs)
if varargs:
argnames.append(varargs)
if varkwargs:
argnames.append(varkwargs)
- fullsignature = inspect.signature(func)
- # Convert Signature to str
- signature = __legacysignature(fullsignature)
-
+ signature = inspect.formatargspec(
+ regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
+ )[1:-1]
# pypy compatibility
- if hasattr(func, "__closure__"):
+ if hasattr(func, '__closure__'):
_closure = func.__closure__
_globals = func.__globals__
else:
name=func.__name__,
argnames=argnames,
signature=signature,
- fullsignature=fullsignature,
defaults=func.__defaults__,
doc=func.__doc__,
module=func.__module__,
)
+# akin to functools.update_wrapper
def update_wrapper(wrapper, model, infodict=None):
- " akin to functools.update_wrapper "
infodict = infodict or getinfo(model)
- wrapper.__name__ = infodict["name"]
- wrapper.__doc__ = infodict["doc"]
- wrapper.__module__ = infodict["module"]
- wrapper.__dict__.update(infodict["dict"])
- wrapper.__defaults__ = infodict["defaults"]
+ wrapper.__name__ = infodict['name']
+ wrapper.__doc__ = infodict['doc']
+ wrapper.__module__ = infodict['module']
+ wrapper.__dict__.update(infodict['dict'])
+ wrapper.__defaults__ = infodict['defaults']
wrapper.undecorated = model
return wrapper
else: # assume model is a function
infodict = getinfo(model)
assert (
- not "_wrapper_" in infodict["argnames"]
+ not '_wrapper_' in infodict["argnames"]
), '"_wrapper_" is a reserved argument name!'
src = "lambda %(signature)s: _wrapper_(%(signature)s)" % infodict
funcopy = eval(src, dict(_wrapper_=wrapper))
method.
"""
attrs = set(dir(cls))
- if "__call__" in attrs:
+ if '__call__' in attrs:
raise TypeError(
- "You cannot decorate a class with a nontrivial " "__call__ method"
+ 'You cannot decorate a class with a nontrivial ' '__call__ method'
)
- if "call" not in attrs:
- raise TypeError("You cannot decorate a class without a " ".call method")
+ if 'call' not in attrs:
+ raise TypeError('You cannot decorate a class without a ' '.call method')
cls.__call__ = __call__
return cls
def _decorator(func): # the real meat is here
infodict = getinfo(func)
- argnames = infodict["argnames"]
+ argnames = infodict['argnames']
assert not (
- "_call_" in argnames or "_func_" in argnames
- ), "You cannot use _call_ or _func_ as argument names!"
+ '_call_' in argnames or '_func_' in argnames
+ ), 'You cannot use _call_ or _func_ as argument names!'
src = "lambda %(signature)s: _call_(_func_, %(signature)s)" % infodict
# import sys; print >> sys.stderr, src # for debugging purposes
dec_func = eval(src, dict(_func_=func, _call_=caller))
# memoize_dic is created at the first call
if args in dic:
return dic[args]
- result = func(*args)
- dic[args] = result
- return result
+ else:
+ result = func(*args)
+ dic[args] = result
+ return result
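# Usage sketch (illustrative only): memoize caches results keyed by the
# positional-argument tuple, so each distinct n is computed exactly once.
from nltk.decorators import memoize

@memoize
def fib(n):
    return n if n < 2 else fib(n - 1) + fib(n - 2)

print(fib(30))  # later calls with the same argument hit the cache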
########################## LEGALESE ###############################
# Natural Language Toolkit: Corpus & Model Downloader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
python -m nltk.downloader [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS
"""
# ----------------------------------------------------------------------
+from __future__ import print_function, division, unicode_literals
"""
try:
TKINTER = True
- from tkinter import (
+ from six.moves.tkinter import (
Tk,
Frame,
Label,
IntVar,
TclError,
)
- from tkinter.messagebox import showerror
+ from six.moves.tkinter_messagebox import showerror
from nltk.draw.table import Table
from nltk.draw.util import ShowText
except ImportError:
TKINTER = False
TclError = ValueError
-from urllib.request import urlopen
-from urllib.error import HTTPError, URLError
+from six import string_types, text_type
+from six.moves import input
+from six.moves.urllib.request import urlopen
+from six.moves.urllib.error import HTTPError, URLError
import nltk
+from nltk.compat import python_2_unicode_compatible
# urllib2 = nltk.internals.import_from_stdlib('urllib2')
# Directory entry objects (from the data server's index file)
######################################################################
+
+@python_2_unicode_compatible
class Package(object):
"""
A directory entry for a downloadable package. These entries are
id,
url,
name=None,
- subdir="",
+ subdir='',
size=None,
unzipped_size=None,
checksum=None,
svn_revision=None,
- copyright="Unknown",
- contact="Unknown",
- license="Unknown",
- author="Unknown",
+ copyright='Unknown',
+ contact='Unknown',
+ license='Unknown',
+ author='Unknown',
unzip=True,
**kw
):
self.author = author
"""Author of this package."""
- ext = os.path.splitext(url.split("/")[-1])[1]
+ ext = os.path.splitext(url.split('/')[-1])[1]
self.filename = os.path.join(subdir, id + ext)
"""The filename that should be used for this package's file. It
is formed by joining ``self.subdir`` with ``self.id``, and
@staticmethod
def fromxml(xml):
- if isinstance(xml, str):
+ if isinstance(xml, string_types):
xml = ElementTree.parse(xml)
for key in xml.attrib:
- xml.attrib[key] = str(xml.attrib[key])
+ xml.attrib[key] = text_type(xml.attrib[key])
return Package(**xml.attrib)
def __lt__(self, other):
return self.id < other.id
def __repr__(self):
- return "<Package %s>" % self.id
+ return '<Package %s>' % self.id
+@python_2_unicode_compatible
class Collection(object):
"""
A directory entry for a collection of downloadable packages.
@staticmethod
def fromxml(xml):
- if isinstance(xml, str):
+ if isinstance(xml, string_types):
xml = ElementTree.parse(xml)
for key in xml.attrib:
- xml.attrib[key] = str(xml.attrib[key])
- children = [child.get("ref") for child in xml.findall("item")]
+ xml.attrib[key] = text_type(xml.attrib[key])
+ children = [child.get('ref') for child in xml.findall('item')]
return Collection(children=children, **xml.attrib)
def __lt__(self, other):
return self.id < other.id
def __repr__(self):
- return "<Collection %s>" % self.id
+ return '<Collection %s>' % self.id
######################################################################
server index will be considered 'stale,' and will be
re-downloaded."""
- DEFAULT_URL = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml"
+ DEFAULT_URL = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml'
"""The default URL for the NLTK data server's index. An
alternative URL can be specified when creating a new
``Downloader`` object."""
# Status Constants
# /////////////////////////////////////////////////////////////////
- INSTALLED = "installed"
+ INSTALLED = 'installed'
"""A status string indicating that a package or collection is
installed and up-to-date."""
- NOT_INSTALLED = "not installed"
+ NOT_INSTALLED = 'not installed'
"""A status string indicating that a package or collection is
not installed."""
- STALE = "out of date"
+ STALE = 'out of date'
"""A status string indicating that a package or collection is
corrupt or out-of-date."""
- PARTIAL = "partial"
+ PARTIAL = 'partial'
"""A status string indicating that a collection is partially
installed (i.e., only some of its packages are installed.)"""
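# Illustrative sketch (the package id 'punkt' is an assumption): the status
# constants above can be checked programmatically before downloading.
from nltk.downloader import Downloader

dl = Downloader()
if dl.status('punkt') != Downloader.INSTALLED:
    dl.download('punkt')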
lines = 0 # for more_prompt
if download_dir is None:
download_dir = self._download_dir
- print("Using default data directory (%s)" % download_dir)
+ print('Using default data directory (%s)' % download_dir)
if header:
- print("=" * (26 + len(self._url)))
- print(" Data server index for <%s>" % self._url)
- print("=" * (26 + len(self._url)))
+ print('=' * (26 + len(self._url)))
+ print(' Data server index for <%s>' % self._url)
+ print('=' * (26 + len(self._url)))
lines += 3 # for more_prompt
stale = partial = False
categories = []
if show_packages:
- categories.append("packages")
+ categories.append('packages')
if show_collections:
- categories.append("collections")
+ categories.append('collections')
for category in categories:
- print("%s:" % category.capitalize())
+ print('%s:' % category.capitalize())
lines += 1 # for more_prompt
for info in sorted(getattr(self, category)(), key=str):
status = self.status(info, download_dir)
if status == self.PARTIAL:
partial = True
prefix = {
- self.INSTALLED: "*",
- self.STALE: "-",
- self.PARTIAL: "P",
- self.NOT_INSTALLED: " ",
+ self.INSTALLED: '*',
+ self.STALE: '-',
+ self.PARTIAL: 'P',
+ self.NOT_INSTALLED: ' ',
}[status]
name = textwrap.fill(
- "-" * 27 + (info.name or info.id), 75, subsequent_indent=27 * " "
+ '-' * 27 + (info.name or info.id), 75, subsequent_indent=27 * ' '
)[27:]
- print(" [%s] %s %s" % (prefix, info.id.ljust(20, "."), name))
- lines += len(name.split("\n")) # for more_prompt
+ print(' [%s] %s %s' % (prefix, info.id.ljust(20, '.'), name))
+ lines += len(name.split('\n')) # for more_prompt
if more_prompt and lines > 20:
user_input = input("Hit Enter to continue: ")
- if user_input.lower() in ("x", "q"):
+ if user_input.lower() in ('x', 'q'):
return
lines = 0
print()
- msg = "([*] marks installed packages"
+ msg = '([*] marks installed packages'
if stale:
- msg += "; [-] marks out-of-date or corrupt packages"
+ msg += '; [-] marks out-of-date or corrupt packages'
if partial:
- msg += "; [P] marks partially installed collections"
- print(textwrap.fill(msg + ")", subsequent_indent=" ", width=76))
+ msg += '; [P] marks partially installed collections'
+ print(textwrap.fill(msg + ')', subsequent_indent=' ', width=76))
def packages(self):
self._update_index()
def corpora(self):
self._update_index()
- return [pkg for (id, pkg) in self._packages.items() if pkg.subdir == "corpora"]
+ return [pkg for (id, pkg) in self._packages.items() if pkg.subdir == 'corpora']
def models(self):
self._update_index()
- return [pkg for (id, pkg) in self._packages.items() if pkg.subdir != "corpora"]
+ return [pkg for (id, pkg) in self._packages.items() if pkg.subdir != 'corpora']
def collections(self):
self._update_index()
# /////////////////////////////////////////////////////////////////
def _info_or_id(self, info_or_id):
- if isinstance(info_or_id, str):
+ if isinstance(info_or_id, string_types):
return self.info(info_or_id)
else:
return info_or_id
try:
info = self._info_or_id(info_or_id)
except (IOError, ValueError) as e:
- yield ErrorMessage(None, "Error loading %s: %s" % (info_or_id, e))
+ yield ErrorMessage(None, 'Error loading %s: %s' % (info_or_id, e))
return
# Handle collections.
yield ProgressMessage(5)
try:
infile = urlopen(info.url)
- with open(filepath, "wb") as outfile:
+ with open(filepath, 'wb') as outfile:
+ # print info.size
num_blocks = max(1, info.size / (1024 * 16))
for block in itertools.count():
s = infile.read(1024 * 16) # 16k blocks.
except IOError as e:
yield ErrorMessage(
info,
- "Error downloading %r from <%s>:" "\n %s" % (info.id, info.url, e),
+ 'Error downloading %r from <%s>:' '\n %s' % (info.id, info.url, e),
)
return
yield FinishDownloadMessage(info)
yield ProgressMessage(80)
# If it's a zipfile, uncompress it.
- if info.filename.endswith(".zip"):
+ if info.filename.endswith('.zip'):
zipdir = os.path.join(download_dir, info.subdir)
# Unzip if we're unzipping by default; *or* if it's already
# been unzipped (presumably a previous version).
download_dir=None,
quiet=False,
force=False,
- prefix="[nltk_data] ",
+ prefix='[nltk_data] ',
halt_on_error=True,
raise_on_error=False,
print_error_to=sys.stderr,
else:
# Define a helper function for displaying output:
- def show(s, prefix2=""):
+ def show(s, prefix2=''):
print_to(
textwrap.fill(
s,
initial_indent=prefix + prefix2,
- subsequent_indent=prefix + prefix2 + " " * 4,
+ subsequent_indent=prefix + prefix2 + ' ' * 4,
)
)
if not quiet:
print_to("Error installing package. Retry? [n/y/e]")
choice = input().strip()
- if choice in ["y", "Y"]:
+ if choice in ['y', 'Y']:
if not self.download(
msg.package.id,
download_dir,
raise_on_error,
):
return False
- elif choice in ["e", "E"]:
+ elif choice in ['e', 'E']:
return False
# All other messages
if not quiet:
# Collection downloading messages:
if isinstance(msg, StartCollectionMessage):
- show("Downloading collection %r" % msg.collection.id)
- prefix += " | "
+ show('Downloading collection %r' % msg.collection.id)
+ prefix += ' | '
print_to(prefix)
elif isinstance(msg, FinishCollectionMessage):
print_to(prefix)
prefix = prefix[:-4]
if self._errors:
show(
- "Downloaded collection %r with errors"
+ 'Downloaded collection %r with errors'
% msg.collection.id
)
else:
- show("Done downloading collection %s" % msg.collection.id)
+ show('Done downloading collection %s' % msg.collection.id)
# Package downloading messages:
elif isinstance(msg, StartPackageMessage):
show(
- "Downloading package %s to %s..."
+ 'Downloading package %s to %s...'
% (msg.package.id, download_dir)
)
elif isinstance(msg, UpToDateMessage):
- show("Package %s is already up-to-date!" % msg.package.id, " ")
+ show('Package %s is already up-to-date!' % msg.package.id, ' ')
# elif isinstance(msg, StaleMessage):
# show('Package %s is out-of-date or corrupt' %
# msg.package.id, ' ')
elif isinstance(msg, StartUnzipMessage):
- show("Unzipping %s." % msg.package.filename, " ")
+ show('Unzipping %s.' % msg.package.filename, ' ')
# Data directory message:
elif isinstance(msg, SelectDownloadDirMessage):
# If it's a zipfile, and it's been at least partially
# unzipped, then check if it's been fully unzipped.
- if filepath.endswith(".zip"):
+ if filepath.endswith('.zip'):
unzipdir = filepath[:-4]
if not os.path.exists(unzipdir):
return self.INSTALLED # but not unzipped -- ok!
# Otherwise, everything looks good.
return self.INSTALLED
- def update(self, quiet=False, prefix="[nltk_data] "):
+ def update(self, quiet=False, prefix='[nltk_data] '):
"""
Re-download any packages whose status is STALE.
"""
self._index_timestamp = time.time()
# Build a dictionary of packages.
- packages = [Package.fromxml(p) for p in self._index.findall("packages/package")]
+ packages = [Package.fromxml(p) for p in self._index.findall('packages/package')]
self._packages = dict((p.id, p) for p in packages)
# Build a dictionary of collections.
collections = [
- Collection.fromxml(c) for c in self._index.findall("collections/collection")
+ Collection.fromxml(c) for c in self._index.findall('collections/collection')
]
self._collections = dict((c.id, c) for c in collections)
collection.children[i] = self._collections[child_id]
else:
print(
- "removing collection member with no package: {}".format(
+ 'removing collection member with no package: {}'.format(
child_id
)
)
return self._packages[id]
if id in self._collections:
return self._collections[id]
- raise ValueError("Package %r not found in index" % id)
+ raise ValueError('Package %r not found in index' % id)
def xmlinfo(self, id):
"""Return the XML info record for the given item"""
self._update_index()
- for package in self._index.findall("packages/package"):
- if package.get("id") == id:
+ for package in self._index.findall('packages/package'):
+ if package.get('id') == id:
return package
- for collection in self._index.findall("collections/collection"):
- if collection.get("id") == id:
+ for collection in self._index.findall('collections/collection'):
+ if collection.get('id') == id:
return collection
- raise ValueError("Package %r not found in index" % id)
+ raise ValueError('Package %r not found in index' % id)
# /////////////////////////////////////////////////////////////////
# URL & Data Directory
``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
"""
# Check if we are on GAE where we cannot write into filesystem.
- if "APPENGINE_RUNTIME" in os.environ:
+ if 'APPENGINE_RUNTIME' in os.environ:
return
# Check if we have sufficient permissions to install in a
return nltkdir
# On Windows, use %APPDATA%
- if sys.platform == "win32" and "APPDATA" in os.environ:
- homedir = os.environ["APPDATA"]
+ if sys.platform == 'win32' and 'APPDATA' in os.environ:
+ homedir = os.environ['APPDATA']
# Otherwise, install in the user's home directory.
else:
- homedir = os.path.expanduser("~/")
- if homedir == "~/":
+ homedir = os.path.expanduser('~/')
+ if homedir == '~/':
raise ValueError("Could not find a default download directory")
# append "nltk_data" to the home directory
- return os.path.join(homedir, "nltk_data")
+ return os.path.join(homedir, 'nltk_data')
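# On Windows this typically resolves to %APPDATA%\nltk_data (i.e. somewhere
# under AppData\Roaming); on other platforms it is ~/nltk_data.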
def _get_download_dir(self):
"""
self._ds = dataserver
def _simple_interactive_menu(self, *options):
- print("-" * 75)
- spc = (68 - sum(len(o) for o in options)) // (len(options) - 1) * " "
- print(" " + spc.join(options))
- print("-" * 75)
+ print('-' * 75)
+ spc = (68 - sum(len(o) for o in options)) // (len(options) - 1) * ' '
+ print(' ' + spc.join(options))
+ # w = 76/len(options)
+ # fmt = ' ' + ('%-'+str(w)+'s')*(len(options)-1) + '%s'
+ # print fmt % options
+ print('-' * 75)
def run(self):
- print("NLTK Downloader")
+ print('NLTK Downloader')
while True:
self._simple_interactive_menu(
- "d) Download",
- "l) List",
- " u) Update",
- "c) Config",
- "h) Help",
- "q) Quit",
+ 'd) Download',
+ 'l) List',
+ ' u) Update',
+ 'c) Config',
+ 'h) Help',
+ 'q) Quit',
)
- user_input = input("Downloader> ").strip()
+ user_input = input('Downloader> ').strip()
if not user_input:
print()
continue
command = user_input.lower().split()[0]
args = user_input.split()[1:]
try:
- if command == "l":
+ if command == 'l':
print()
self._ds.list(self._ds.download_dir, header=False, more_prompt=True)
- elif command == "h":
+ elif command == 'h':
self._simple_interactive_help()
- elif command == "c":
+ elif command == 'c':
self._simple_interactive_config()
- elif command in ("q", "x"):
+ elif command in ('q', 'x'):
return
- elif command == "d":
+ elif command == 'd':
self._simple_interactive_download(args)
- elif command == "u":
+ elif command == 'u':
self._simple_interactive_update()
else:
- print("Command %r unrecognized" % user_input)
+ print('Command %r unrecognized' % user_input)
except HTTPError as e:
- print("Error reading from server: %s" % e)
+ print('Error reading from server: %s' % e)
except URLError as e:
- print("Error connecting to server: %s" % e.reason)
+ print('Error connecting to server: %s' % e.reason)
# try checking if user_input is a package name, &
# downloading it?
print()
if args:
for arg in args:
try:
- self._ds.download(arg, prefix=" ")
+ self._ds.download(arg, prefix=' ')
except (IOError, ValueError) as e:
print(e)
else:
while True:
print()
- print("Download which package (l=list; x=cancel)?")
- user_input = input(" Identifier> ")
- if user_input.lower() == "l":
+ print('Download which package (l=list; x=cancel)?')
+ user_input = input(' Identifier> ')
+ if user_input.lower() == 'l':
self._ds.list(
self._ds.download_dir,
header=False,
skip_installed=True,
)
continue
- elif user_input.lower() in ("x", "q", ""):
+ elif user_input.lower() in ('x', 'q', ''):
return
elif user_input:
for id in user_input.split():
try:
- self._ds.download(id, prefix=" ")
+ self._ds.download(id, prefix=' ')
except (IOError, ValueError) as e:
print(e)
break
while True:
stale_packages = []
stale = partial = False
- for info in sorted(getattr(self._ds, "packages")(), key=str):
+ for info in sorted(getattr(self._ds, 'packages')(), key=str):
if self._ds.status(info) == self._ds.STALE:
stale_packages.append((info.id, info.name))
print()
if stale_packages:
- print("Will update following packages (o=ok; x=cancel)")
+ print('Will update following packages (o=ok; x=cancel)')
for pid, pname in stale_packages:
name = textwrap.fill(
- "-" * 27 + (pname), 75, subsequent_indent=27 * " "
+ '-' * 27 + (pname), 75, subsequent_indent=27 * ' '
)[27:]
- print(" [ ] %s %s" % (pid.ljust(20, "."), name))
+ print(' [ ] %s %s' % (pid.ljust(20, '.'), name))
print()
- user_input = input(" Identifier> ")
- if user_input.lower() == "o":
+ user_input = input(' Identifier> ')
+ if user_input.lower() == 'o':
for pid, pname in stale_packages:
try:
- self._ds.download(pid, prefix=" ")
+ self._ds.download(pid, prefix=' ')
except (IOError, ValueError) as e:
print(e)
break
- elif user_input.lower() in ("x", "q", ""):
+ elif user_input.lower() in ('x', 'q', ''):
return
else:
- print("Nothing to update.")
+ print('Nothing to update.')
return
def _simple_interactive_help(self):
print()
- print("Commands:")
+ print('Commands:')
print(
- " d) Download a package or collection u) Update out of date packages"
+ ' d) Download a package or collection u) Update out of date packages'
)
- print(" l) List packages & collections h) Help")
- print(" c) View & Modify Configuration q) Quit")
+ print(' l) List packages & collections h) Help')
+ print(' c) View & Modify Configuration q) Quit')
def _show_config(self):
print()
- print("Data Server:")
- print(" - URL: <%s>" % self._ds.url)
- print((" - %d Package Collections Available" % len(self._ds.collections())))
- print((" - %d Individual Packages Available" % len(self._ds.packages())))
+ print('Data Server:')
+ print(' - URL: <%s>' % self._ds.url)
+ print((' - %d Package Collections Available' % len(self._ds.collections())))
+ print((' - %d Individual Packages Available' % len(self._ds.packages())))
print()
- print("Local Machine:")
- print(" - Data directory: %s" % self._ds.download_dir)
+ print('Local Machine:')
+ print(' - Data directory: %s' % self._ds.download_dir)
def _simple_interactive_config(self):
self._show_config()
while True:
print()
self._simple_interactive_menu(
- "s) Show Config", "u) Set Server URL", "d) Set Data Dir", "m) Main Menu"
+ 's) Show Config', 'u) Set Server URL', 'd) Set Data Dir', 'm) Main Menu'
)
- user_input = input("Config> ").strip().lower()
- if user_input == "s":
+ user_input = input('Config> ').strip().lower()
+ if user_input == 's':
self._show_config()
- elif user_input == "d":
- new_dl_dir = input(" New Directory> ").strip()
- if new_dl_dir in ("", "x", "q", "X", "Q"):
- print(" Cancelled!")
+ elif user_input == 'd':
+ new_dl_dir = input(' New Directory> ').strip()
+ if new_dl_dir in ('', 'x', 'q', 'X', 'Q'):
+ print(' Cancelled!')
elif os.path.isdir(new_dl_dir):
self._ds.download_dir = new_dl_dir
else:
- print(("Directory %r not found! Create it first." % new_dl_dir))
- elif user_input == "u":
- new_url = input(" New URL> ").strip()
- if new_url in ("", "x", "q", "X", "Q"):
- print(" Cancelled!")
+ print(('Directory %r not found! Create it first.' % new_dl_dir))
+ elif user_input == 'u':
+ new_url = input(' New URL> ').strip()
+ if new_url in ('', 'x', 'q', 'X', 'Q'):
+ print(' Cancelled!')
else:
- if not new_url.startswith(("http://", "https://")):
- new_url = "http://" + new_url
+ if not new_url.startswith(('http://', 'https://')):
+ new_url = 'http://' + new_url
try:
self._ds.url = new_url
except Exception as e:
- print("Error reading <%r>:\n %s" % (new_url, e))
- elif user_input == "m":
+ print('Error reading <%r>:\n %s' % (new_url, e))
+ elif user_input == 'm':
break
# /////////////////////////////////////////////////////////////////
COLUMNS = [
- "",
- "Identifier",
- "Name",
- "Size",
- "Status",
- "Unzipped Size",
- "Copyright",
- "Contact",
- "License",
- "Author",
- "Subdir",
- "Checksum",
+ '',
+ 'Identifier',
+ 'Name',
+ 'Size',
+ 'Status',
+ 'Unzipped Size',
+ 'Copyright',
+ 'Contact',
+ 'License',
+ 'Author',
+ 'Subdir',
+ 'Checksum',
]
"""A list of the names of columns. This controls the order in
which the columns will appear. If this is edited, then
``_package_to_columns()`` may need to be edited to match."""
- COLUMN_WEIGHTS = {"": 0, "Name": 5, "Size": 0, "Status": 0}
+ COLUMN_WEIGHTS = {'': 0, 'Name': 5, 'Size': 0, 'Status': 0}
"""A dictionary specifying how columns should be resized when the
table is resized. Columns with weight 0 will not be resized at
all; and columns with high weight will be resized more.
Default weight (for columns not explicitly listed) is 1."""
COLUMN_WIDTHS = {
- "": 1,
- "Identifier": 20,
- "Name": 45,
- "Size": 10,
- "Unzipped Size": 10,
- "Status": 12,
+ '': 1,
+ 'Identifier': 20,
+ 'Name': 45,
+ 'Size': 10,
+ 'Unzipped Size': 10,
+ 'Status': 12,
}
"""A dictionary specifying how wide each column should be, in
characters. The default width (for columns not explicitly
"""The default width for columns that are not explicitly listed
in ``COLUMN_WIDTHS``."""
- INITIAL_COLUMNS = ["", "Identifier", "Name", "Size", "Status"]
+ INITIAL_COLUMNS = ['', 'Identifier', 'Name', 'Size', 'Status']
"""The set of columns that should be displayed by default."""
# Perform a few import-time sanity checks to make sure that the
# Color Configuration
# /////////////////////////////////////////////////////////////////
- _BACKDROP_COLOR = ("#000", "#ccc")
+ _BACKDROP_COLOR = ('#000', '#ccc')
_ROW_COLOR = {
- Downloader.INSTALLED: ("#afa", "#080"),
- Downloader.PARTIAL: ("#ffa", "#880"),
- Downloader.STALE: ("#faa", "#800"),
- Downloader.NOT_INSTALLED: ("#fff", "#888"),
+ Downloader.INSTALLED: ('#afa', '#080'),
+ Downloader.PARTIAL: ('#ffa', '#880'),
+ Downloader.STALE: ('#faa', '#800'),
+ Downloader.NOT_INSTALLED: ('#fff', '#888'),
}
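# Each entry above is a (normal, selected) background color pair, keyed by
# package status; _color_table() applies them to the table rows.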
- _MARK_COLOR = ("#000", "#ccc")
+ _MARK_COLOR = ('#000', '#ccc')
# _FRONT_TAB_COLOR = ('#ccf', '#008')
# _BACK_TAB_COLOR = ('#88a', '#448')
- _FRONT_TAB_COLOR = ("#fff", "#45c")
- _BACK_TAB_COLOR = ("#aaa", "#67a")
+ _FRONT_TAB_COLOR = ('#fff', '#45c')
+ _BACK_TAB_COLOR = ('#aaa', '#67a')
- _PROGRESS_COLOR = ("#f00", "#aaa")
+ _PROGRESS_COLOR = ('#f00', '#aaa')
- _TAB_FONT = "helvetica -16 bold"
+ _TAB_FONT = 'helvetica -16 bold'
# /////////////////////////////////////////////////////////////////
# Constructor
# A message log.
self._log_messages = []
self._log_indent = 0
- self._log("NLTK Downloader Started!")
+ self._log('NLTK Downloader Started!')
# Create the main window.
top = self.top = Tk()
- top.geometry("+50+50")
- top.title("NLTK Downloader")
+ top.geometry('+50+50')
+ top.title('NLTK Downloader')
top.configure(background=self._BACKDROP_COLOR[1])
# Set up some bindings now, in case anything goes wrong.
- top.bind("<Control-q>", self.destroy)
- top.bind("<Control-x>", self.destroy)
+ top.bind('<Control-q>', self.destroy)
+ top.bind('<Control-x>', self.destroy)
self._destroyed = False
self._column_vars = {}
try:
self._fill_table()
except HTTPError as e:
- showerror("Error reading from server", e)
+ showerror('Error reading from server', e)
except URLError as e:
- showerror("Error connecting to server", e.reason)
+ showerror('Error connecting to server', e.reason)
self._show_info()
self._select_columns()
# Make sure we get notified when we're destroyed, so we can
# cancel any download in progress.
- self._table.bind("<Destroy>", self._destroy)
+ self._table.bind('<Destroy>', self._destroy)
def _log(self, msg):
self._log_messages.append(
- "%s %s%s" % (time.ctime(), " | " * self._log_indent, msg)
+ '%s %s%s' % (time.ctime(), ' | ' * self._log_indent, msg)
)
# /////////////////////////////////////////////////////////////////
def _init_widgets(self):
# Create the top-level frame structures
- f1 = Frame(self.top, relief="raised", border=2, padx=8, pady=0)
- f1.pack(sid="top", expand=True, fill="both")
+ f1 = Frame(self.top, relief='raised', border=2, padx=8, pady=0)
+ f1.pack(side='top', expand=True, fill='both')
f1.grid_rowconfigure(2, weight=1)
f1.grid_columnconfigure(0, weight=1)
Frame(f1, height=8).grid(column=0, row=0) # spacer
tabframe = Frame(f1)
- tabframe.grid(column=0, row=1, sticky="news")
+ tabframe.grid(column=0, row=1, sticky='news')
tableframe = Frame(f1)
- tableframe.grid(column=0, row=2, sticky="news")
+ tableframe.grid(column=0, row=2, sticky='news')
buttonframe = Frame(f1)
- buttonframe.grid(column=0, row=3, sticky="news")
+ buttonframe.grid(column=0, row=3, sticky='news')
Frame(f1, height=8).grid(column=0, row=4) # spacer
infoframe = Frame(f1)
- infoframe.grid(column=0, row=5, sticky="news")
+ infoframe.grid(column=0, row=5, sticky='news')
Frame(f1, height=8).grid(column=0, row=6) # spacer
progressframe = Frame(
self.top, padx=3, pady=3, background=self._BACKDROP_COLOR[1]
)
- progressframe.pack(side="bottom", fill="x")
- self.top["border"] = 0
- self.top["highlightthickness"] = 0
+ progressframe.pack(side='bottom', fill='x')
+ self.top['border'] = 0
+ self.top['highlightthickness'] = 0
# Create the tabs
- self._tab_names = ["Collections", "Corpora", "Models", "All Packages"]
+ self._tab_names = ['Collections', 'Corpora', 'Models', 'All Packages']
self._tabs = {}
for i, tab in enumerate(self._tab_names):
label = Label(tabframe, text=tab, font=self._TAB_FONT)
- label.pack(side="left", padx=((i + 1) % 2) * 10)
- label.bind("<Button-1>", self._select_tab)
+ label.pack(side='left', padx=((i + 1) % 2) * 10)
+ label.bind('<Button-1>', self._select_tab)
self._tabs[tab.lower()] = label
# Create the table.
for i, column in enumerate(self.COLUMNS):
width = self.COLUMN_WIDTHS.get(column, self.DEFAULT_COLUMN_WIDTH)
self._table.columnconfig(i, width=width)
- self._table.pack(expand=True, fill="both")
+ self._table.pack(expand=True, fill='both')
self._table.focus()
- self._table.bind_to_listboxes("<Double-Button-1>", self._download)
- self._table.bind("<space>", self._table_mark)
- self._table.bind("<Return>", self._download)
- self._table.bind("<Left>", self._prev_tab)
- self._table.bind("<Right>", self._next_tab)
- self._table.bind("<Control-a>", self._mark_all)
+ self._table.bind_to_listboxes('<Double-Button-1>', self._download)
+ self._table.bind('<space>', self._table_mark)
+ self._table.bind('<Return>', self._download)
+ self._table.bind('<Left>', self._prev_tab)
+ self._table.bind('<Right>', self._next_tab)
+ self._table.bind('<Control-a>', self._mark_all)
# Create entry boxes for URL & download_dir
infoframe.grid_columnconfigure(1, weight=1)
info = [
- ("url", "Server Index:", self._set_url),
- ("download_dir", "Download Directory:", self._set_download_dir),
+ ('url', 'Server Index:', self._set_url),
+ ('download_dir', 'Download Directory:', self._set_download_dir),
]
self._info = {}
for (i, (key, label, callback)) in enumerate(info):
- Label(infoframe, text=label).grid(column=0, row=i, sticky="e")
+ Label(infoframe, text=label).grid(column=0, row=i, sticky='e')
entry = Entry(
- infoframe, font="courier", relief="groove", disabledforeground="black"
+ infoframe, font='courier', relief='groove', disabledforeground='black'
)
self._info[key] = (entry, callback)
- entry.bind("<Return>", self._info_save)
- entry.bind("<Button-1>", lambda e, key=key: self._info_edit(key))
- entry.grid(column=1, row=i, sticky="ew")
+ entry.bind('<Return>', self._info_save)
+ entry.bind('<Button-1>', lambda e, key=key: self._info_edit(key))
+ entry.grid(column=1, row=i, sticky='ew')
# If the user edits url or download_dir, and then clicks outside
# the entry box, then save their results.
- self.top.bind("<Button-1>", self._info_save)
+ self.top.bind('<Button-1>', self._info_save)
# Create Download & Refresh buttons.
self._download_button = Button(
- buttonframe, text="Download", command=self._download, width=8
+ buttonframe, text='Download', command=self._download, width=8
)
- self._download_button.pack(side="left")
+ self._download_button.pack(side='left')
self._refresh_button = Button(
- buttonframe, text="Refresh", command=self._refresh, width=8
+ buttonframe, text='Refresh', command=self._refresh, width=8
)
- self._refresh_button.pack(side="right")
+ self._refresh_button.pack(side='right')
# Create Progress bar
self._progresslabel = Label(
progressframe,
- text="",
+ text='',
foreground=self._BACKDROP_COLOR[0],
background=self._BACKDROP_COLOR[1],
)
width=200,
height=16,
background=self._PROGRESS_COLOR[1],
- relief="sunken",
+ relief='sunken',
border=1,
)
self._init_progressbar()
- self._progressbar.pack(side="right")
- self._progresslabel.pack(side="left")
+ self._progressbar.pack(side='right')
+ self._progresslabel.pack(side='left')
def _init_menu(self):
menubar = Menu(self.top)
filemenu = Menu(menubar, tearoff=0)
filemenu.add_command(
- label="Download", underline=0, command=self._download, accelerator="Return"
+ label='Download', underline=0, command=self._download, accelerator='Return'
)
filemenu.add_separator()
filemenu.add_command(
- label="Change Server Index",
+ label='Change Server Index',
underline=7,
- command=lambda: self._info_edit("url"),
+ command=lambda: self._info_edit('url'),
)
filemenu.add_command(
- label="Change Download Directory",
+ label='Change Download Directory',
underline=0,
- command=lambda: self._info_edit("download_dir"),
+ command=lambda: self._info_edit('download_dir'),
)
filemenu.add_separator()
- filemenu.add_command(label="Show Log", underline=5, command=self._show_log)
+ filemenu.add_command(label='Show Log', underline=5, command=self._show_log)
filemenu.add_separator()
filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+ label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x'
)
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
# Create a menu to control which columns of the table are
# shown. n.b.: we never hide the first two columns (mark and
viewmenu.add_checkbutton(
label=column, underline=0, variable=var, command=self._select_columns
)
- menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+ menubar.add_cascade(label='View', underline=0, menu=viewmenu)
# Create a sort menu
# [xx] this should be selectbuttons; and it should include
sortmenu = Menu(menubar, tearoff=0)
for column in self._table.column_names[1:]:
sortmenu.add_command(
- label="Sort by %s" % column,
- command=(lambda c=column: self._table.sort_by(c, "ascending")),
+ label='Sort by %s' % column,
+ command=(lambda c=column: self._table.sort_by(c, 'ascending')),
)
sortmenu.add_separator()
# sortmenu.add_command(label='Descending Sort:')
for column in self._table.column_names[1:]:
sortmenu.add_command(
- label="Reverse sort by %s" % column,
- command=(lambda c=column: self._table.sort_by(c, "descending")),
+ label='Reverse sort by %s' % column,
+ command=(lambda c=column: self._table.sort_by(c, 'descending')),
)
- menubar.add_cascade(label="Sort", underline=0, menu=sortmenu)
+ menubar.add_cascade(label='Sort', underline=0, menu=sortmenu)
helpmenu = Menu(menubar, tearoff=0)
- helpmenu.add_command(label="About", underline=0, command=self.about)
+ helpmenu.add_command(label='About', underline=0, command=self.about)
helpmenu.add_command(
- label="Instructions", underline=0, command=self.help, accelerator="F1"
+ label='Instructions', underline=0, command=self.help, accelerator='F1'
)
- menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
- self.top.bind("<F1>", self.help)
+ menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
+ self.top.bind('<F1>', self.help)
self.top.config(menu=menubar)
try:
self._fill_table()
except HTTPError as e:
- showerror("Error reading from server", e)
+ showerror('Error reading from server', e)
except URLError as e:
- showerror("Error connecting to server", e.reason)
+ showerror('Error connecting to server', e.reason)
self._table.select(0)
def _info_edit(self, info_key):
self._info_save() # just in case.
(entry, callback) = self._info[info_key]
- entry["state"] = "normal"
- entry["relief"] = "sunken"
+ entry['state'] = 'normal'
+ entry['relief'] = 'sunken'
entry.focus()
def _info_save(self, e=None):
focus = self._table
for entry, callback in self._info.values():
- if entry["state"] == "disabled":
+ if entry['state'] == 'disabled':
continue
- if e is not None and e.widget is entry and e.keysym != "Return":
+ if e is not None and e.widget is entry and e.keysym != 'Return':
focus = entry
else:
- entry["state"] = "disabled"
- entry["relief"] = "groove"
+ entry['state'] = 'disabled'
+ entry['relief'] = 'groove'
callback(entry.get())
focus.focus()
def _table_reprfunc(self, row, col, val):
- if self._table.column_names[col].endswith("Size"):
- if isinstance(val, str):
- return " %s" % val
+ if self._table.column_names[col].endswith('Size'):
+ if isinstance(val, string_types):
+ return ' %s' % val
elif val < 1024 ** 2:
- return " %.1f KB" % (val / 1024.0 ** 1)
+ return ' %.1f KB' % (val / 1024.0 ** 1)
elif val < 1024 ** 3:
- return " %.1f MB" % (val / 1024.0 ** 2)
+ return ' %.1f MB' % (val / 1024.0 ** 2)
else:
- return " %.1f GB" % (val / 1024.0 ** 3)
+ return ' %.1f GB' % (val / 1024.0 ** 3)
- if col in (0, ""):
+ if col in (0, ''):
return str(val)
else:
- return " %s" % val
+ return ' %s' % val
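# _table_reprfunc examples: a numeric Size value of 2500000 renders as
# ' 2.4 MB', while string values such as 'n/a' are passed through with a
# leading space.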
def _set_url(self, url):
if url == self._ds.url:
self._ds.url = url
self._fill_table()
except IOError as e:
- showerror("Error Setting Server Index", str(e))
+ showerror('Error Setting Server Index', str(e))
self._show_info()
def _set_download_dir(self, download_dir):
try:
self._fill_table()
except HTTPError as e:
- showerror("Error reading from server", e)
+ showerror('Error reading from server', e)
except URLError as e:
- showerror("Error connecting to server", e.reason)
+ showerror('Error connecting to server', e.reason)
self._show_info()
def _show_info(self):
- print("showing info", self._ds.url)
+ print('showing info', self._ds.url)
for entry, cb in self._info.values():
- entry["state"] = "normal"
- entry.delete(0, "end")
- self._info["url"][0].insert(0, self._ds.url)
- self._info["download_dir"][0].insert(0, self._ds.download_dir)
+ entry['state'] = 'normal'
+ entry.delete(0, 'end')
+ self._info['url'][0].insert(0, self._ds.url)
+ self._info['download_dir'][0].insert(0, self._ds.download_dir)
for entry, cb in self._info.values():
- entry["state"] = "disabled"
+ entry['state'] = 'disabled'
def _prev_tab(self, *e):
for i, tab in enumerate(self._tab_names):
try:
return self._fill_table()
except HTTPError as e:
- showerror("Error reading from server", e)
+ showerror('Error reading from server', e)
except URLError as e:
- showerror("Error connecting to server", e.reason)
+ showerror('Error connecting to server', e.reason)
def _next_tab(self, *e):
for i, tab in enumerate(self._tab_names):
try:
return self._fill_table()
except HTTPError as e:
- showerror("Error reading from server", e)
+ showerror('Error reading from server', e)
except URLError as e:
- showerror("Error connecting to server", e.reason)
+ showerror('Error connecting to server', e.reason)
def _select_tab(self, event):
- self._tab = event.widget["text"].lower()
+ self._tab = event.widget['text'].lower()
try:
self._fill_table()
except HTTPError as e:
- showerror("Error reading from server", e)
+ showerror('Error reading from server', e)
except URLError as e:
- showerror("Error connecting to server", e.reason)
+ showerror('Error connecting to server', e.reason)
- _tab = "collections"
+ _tab = 'collections'
# _tab = 'corpora'
_rows = None
def _fill_table(self):
selected_row = self._table.selected_row()
self._table.clear()
- if self._tab == "all packages":
+ if self._tab == 'all packages':
items = self._ds.packages()
- elif self._tab == "corpora":
+ elif self._tab == 'corpora':
items = self._ds.corpora()
- elif self._tab == "models":
+ elif self._tab == 'models':
items = self._ds.models()
- elif self._tab == "collections":
+ elif self._tab == 'collections':
items = self._ds.collections()
else:
- assert 0, "bad tab value %r" % self._tab
+ assert 0, 'bad tab value %r' % self._tab
rows = [self._package_to_columns(item) for item in items]
self._table.extend(rows)
background=self._BACK_TAB_COLOR[1],
)
- self._table.sort_by("Identifier", order="ascending")
+ self._table.sort_by('Identifier', order='ascending')
self._color_table()
self._table.select(selected_row)
def _update_table_status(self):
for row_num in range(len(self._table)):
- status = self._ds.status(self._table[row_num, "Identifier"])
- self._table[row_num, "Status"] = status
+ status = self._ds.status(self._table[row_num, 'Identifier'])
+ self._table[row_num, 'Status'] = status
self._color_table()
def _download(self, *e):
return self._download_threaded(*e)
marked = [
- self._table[row, "Identifier"]
+ self._table[row, 'Identifier']
for row in range(len(self._table))
- if self._table[row, 0] != ""
+ if self._table[row, 0] != ''
]
selection = self._table.selected_row()
if not marked and selection is not None:
- marked = [self._table[selection, "Identifier"]]
+ marked = [self._table[selection, 'Identifier']]
download_iter = self._ds.incr_download(marked, self._ds.download_dir)
self._log_indent = 0
# self._fill_table(sort=False)
self._update_table_status()
afterid = self.top.after(10, self._show_progress, 0)
- self._afterid["_download_cb"] = afterid
+ self._afterid['_download_cb'] = afterid
return
def show(s):
- self._progresslabel["text"] = s
+ self._progresslabel['text'] = s
self._log(s)
if isinstance(msg, ProgressMessage):
self._show_progress(None)
return # halt progress.
elif isinstance(msg, StartCollectionMessage):
- show("Downloading collection %s" % msg.collection.id)
+ show('Downloading collection %s' % msg.collection.id)
self._log_indent += 1
elif isinstance(msg, StartPackageMessage):
- show("Downloading package %s" % msg.package.id)
+ show('Downloading package %s' % msg.package.id)
elif isinstance(msg, UpToDateMessage):
- show("Package %s is up-to-date!" % msg.package.id)
+ show('Package %s is up-to-date!' % msg.package.id)
# elif isinstance(msg, StaleMessage):
# show('Package %s is out-of-date or corrupt' % msg.package.id)
elif isinstance(msg, FinishDownloadMessage):
- show("Finished downloading %r." % msg.package.id)
+ show('Finished downloading %r.' % msg.package.id)
elif isinstance(msg, StartUnzipMessage):
- show("Unzipping %s" % msg.package.filename)
+ show('Unzipping %s' % msg.package.filename)
elif isinstance(msg, FinishCollectionMessage):
self._log_indent -= 1
- show("Finished downloading collection %r." % msg.collection.id)
+ show('Finished downloading collection %r.' % msg.collection.id)
self._clear_mark(msg.collection.id)
elif isinstance(msg, FinishPackageMessage):
self._clear_mark(msg.package.id)
afterid = self.top.after(self._DL_DELAY, self._download_cb, download_iter, ids)
- self._afterid["_download_cb"] = afterid
+ self._afterid['_download_cb'] = afterid
def _select(self, id):
for row in range(len(self._table)):
- if self._table[row, "Identifier"] == id:
+ if self._table[row, 'Identifier'] == id:
self._table.select(row)
return
def _color_table(self):
# Color rows according to status.
for row in range(len(self._table)):
- bg, sbg = self._ROW_COLOR[self._table[row, "Status"]]
- fg, sfg = ("black", "white")
+ bg, sbg = self._ROW_COLOR[self._table[row, 'Status']]
+ fg, sfg = ('black', 'white')
self._table.rowconfig(
row,
foreground=fg,
def _clear_mark(self, id):
for row in range(len(self._table)):
- if self._table[row, "Identifier"] == id:
- self._table[row, 0] = ""
+ if self._table[row, 'Identifier'] == id:
+ self._table[row, 0] = ''
def _mark_all(self, *e):
for row in range(len(self._table)):
- self._table[row, 0] = "X"
+ self._table[row, 0] = 'X'
def _table_mark(self, *e):
selection = self._table.selected_row()
if selection >= 0:
- if self._table[selection][0] != "":
- self._table[selection, 0] = ""
+ if self._table[selection][0] != '':
+ self._table[selection, 0] = ''
else:
- self._table[selection, 0] = "X"
+ self._table[selection, 0] = 'X'
self._table.select(delta=1)
def _show_log(self):
- text = "\n".join(self._log_messages)
- ShowText(self.top, "NLTK Downloader Log", text)
+ text = '\n'.join(self._log_messages)
+ ShowText(self.top, 'NLTK Downloader Log', text)
def _package_to_columns(self, pkg):
"""
row = []
for column_index, column_name in enumerate(self.COLUMNS):
if column_index == 0: # Mark:
- row.append("")
- elif column_name == "Identifier":
+ row.append('')
+ elif column_name == 'Identifier':
row.append(pkg.id)
- elif column_name == "Status":
+ elif column_name == 'Status':
row.append(self._ds.status(pkg))
else:
- attr = column_name.lower().replace(" ", "_")
- row.append(getattr(pkg, attr, "n/a"))
+ attr = column_name.lower().replace(' ', '_')
+ row.append(getattr(pkg, attr, 'n/a'))
return row
# /////////////////////////////////////////////////////////////////
try:
ShowText(
self.top,
- "Help: NLTK Dowloader",
+ 'Help: NLTK Downloader',
self.HELP.strip(),
width=75,
- font="fixed",
+ font='fixed',
)
except:
- ShowText(self.top, "Help: NLTK Downloader", self.HELP.strip(), width=75)
+ ShowText(self.top, 'Help: NLTK Downloader', self.HELP.strip(), width=75)
def about(self, *e):
ABOUT = "NLTK Downloader\n" + "Written by Edward Loper"
- TITLE = "About: NLTK Downloader"
+ TITLE = 'About: NLTK Downloader'
try:
- from tkinter.messagebox import Message
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except ImportError:
def _init_progressbar(self):
c = self._progressbar
- width, height = int(c["width"]), int(c["height"])
- for i in range(0, (int(c["width"]) * 2) // self._gradient_width):
+ width, height = int(c['width']), int(c['height'])
+ for i in range(0, (int(c['width']) * 2) // self._gradient_width):
c.create_line(
i * self._gradient_width + 20,
-20,
i * self._gradient_width - height - 20,
height + 20,
width=self._gradient_width,
- fill="#%02x0000" % (80 + abs(i % 6 - 3) * 12),
+ fill='#%02x0000' % (80 + abs(i % 6 - 3) * 12),
)
- c.addtag_all("gradient")
- c.itemconfig("gradient", state="hidden")
+ c.addtag_all('gradient')
+ c.itemconfig('gradient', state='hidden')
# This is used to display progress
c.addtag_withtag(
- "redbox", c.create_rectangle(0, 0, 0, 0, fill=self._PROGRESS_COLOR[0])
+ 'redbox', c.create_rectangle(0, 0, 0, 0, fill=self._PROGRESS_COLOR[0])
)
def _show_progress(self, percent):
c = self._progressbar
if percent is None:
- c.coords("redbox", 0, 0, 0, 0)
- c.itemconfig("gradient", state="hidden")
+ c.coords('redbox', 0, 0, 0, 0)
+ c.itemconfig('gradient', state='hidden')
else:
- width, height = int(c["width"]), int(c["height"])
+ width, height = int(c['width']), int(c['height'])
x = percent * int(width) // 100 + 1
- c.coords("redbox", 0, 0, x, height + 1)
+ c.coords('redbox', 0, 0, x, height + 1)
def _progress_alive(self):
c = self._progressbar
if not self._downloading:
- c.itemconfig("gradient", state="hidden")
+ c.itemconfig('gradient', state='hidden')
else:
- c.itemconfig("gradient", state="normal")
- x1, y1, x2, y2 = c.bbox("gradient")
+ c.itemconfig('gradient', state='normal')
+ x1, y1, x2, y2 = c.bbox('gradient')
if x1 <= -100:
- c.move("gradient", (self._gradient_width * 6) - 4, 0)
+ c.move('gradient', (self._gradient_width * 6) - 4, 0)
else:
- c.move("gradient", -4, 0)
+ c.move('gradient', -4, 0)
afterid = self.top.after(200, self._progress_alive)
- self._afterid["_progress_alive"] = afterid
+ self._afterid['_progress_alive'] = afterid
# /////////////////////////////////////////////////////////////////
# Threaded downloader
return
# Change the 'download' button to an 'abort' button.
- self._download_button["text"] = "Cancel"
+ self._download_button['text'] = 'Cancel'
marked = [
- self._table[row, "Identifier"]
+ self._table[row, 'Identifier']
for row in range(len(self._table))
- if self._table[row, 0] != ""
+ if self._table[row, 0] != ''
]
selection = self._table.selected_row()
if not marked and selection is not None:
- marked = [self._table[selection, "Identifier"]]
+ marked = [self._table[selection, 'Identifier']]
# Create a new data server object for the download operation,
# just in case the user modifies our data server during the
def _abort_download(self):
if self._downloading:
self._download_lock.acquire()
- self._download_abort_queue.append("abort")
+ self._download_abort_queue.append('abort')
self._download_lock.release()
class _DownloadThread(threading.Thread):
self.message_queue.append(msg)
# Check if we've been told to kill ourselves:
if self.abort:
- self.message_queue.append("aborted")
+ self.message_queue.append('aborted')
self.lock.release()
return
self.lock.release()
self.lock.acquire()
- self.message_queue.append("finished")
+ self.message_queue.append('finished')
self.lock.release()
_MONITOR_QUEUE_DELAY = 100
def _monitor_message_queue(self):
def show(s):
- self._progresslabel["text"] = s
+ self._progresslabel['text'] = s
self._log(s)
# Try to acquire the lock; if it's busy, then just try again later.
for msg in self._download_msg_queue:
# Done downloading?
- if msg == "finished" or msg == "aborted":
+ if msg == 'finished' or msg == 'aborted':
# self._fill_table(sort=False)
self._update_table_status()
self._downloading = False
- self._download_button["text"] = "Download"
+ self._download_button['text'] = 'Download'
del self._download_msg_queue[:]
del self._download_abort_queue[:]
self._download_lock.release()
- if msg == "aborted":
- show("Download aborted!")
+ if msg == 'aborted':
+ show('Download aborted!')
self._show_progress(None)
else:
afterid = self.top.after(100, self._show_progress, None)
- self._afterid["_monitor_message_queue"] = afterid
+ self._afterid['_monitor_message_queue'] = afterid
return
# All other messages
self._downloading = False
return # halt progress.
elif isinstance(msg, StartCollectionMessage):
- show("Downloading collection %r" % msg.collection.id)
+ show('Downloading collection %r' % msg.collection.id)
self._log_indent += 1
elif isinstance(msg, StartPackageMessage):
self._ds.clear_status_cache(msg.package.id)
- show("Downloading package %r" % msg.package.id)
+ show('Downloading package %r' % msg.package.id)
elif isinstance(msg, UpToDateMessage):
- show("Package %s is up-to-date!" % msg.package.id)
+ show('Package %s is up-to-date!' % msg.package.id)
# elif isinstance(msg, StaleMessage):
# show('Package %s is out-of-date or corrupt; updating it' %
# msg.package.id)
elif isinstance(msg, FinishDownloadMessage):
- show("Finished downloading %r." % msg.package.id)
+ show('Finished downloading %r.' % msg.package.id)
elif isinstance(msg, StartUnzipMessage):
- show("Unzipping %s" % msg.package.filename)
+ show('Unzipping %s' % msg.package.filename)
elif isinstance(msg, FinishUnzipMessage):
- show("Finished installing %s" % msg.package.id)
+ show('Finished installing %s' % msg.package.id)
elif isinstance(msg, FinishCollectionMessage):
self._log_indent -= 1
- show("Finished downloading collection %r." % msg.collection.id)
+ show('Finished downloading collection %r.' % msg.collection.id)
self._clear_mark(msg.collection.id)
elif isinstance(msg, FinishPackageMessage):
self._update_table_status()
# waiting for a good point to abort it, so we don't end up
# with a partially unzipped package or anything like that).
if self._download_abort_queue:
- self._progresslabel["text"] = "Aborting download..."
+ self._progresslabel['text'] = 'Aborting download...'
# Clear the message queue and then release the lock
del self._download_msg_queue[:]
# Check the queue again after MONITOR_QUEUE_DELAY msec.
afterid = self.top.after(self._MONITOR_QUEUE_DELAY, self._monitor_message_queue)
- self._afterid["_monitor_message_queue"] = afterid
+ self._afterid['_monitor_message_queue'] = afterid
######################################################################
Calculate and return the MD5 checksum for a given file.
``file`` may either be a filename or an open stream.
"""
- if isinstance(file, str):
- with open(file, "rb") as infile:
+ if isinstance(file, string_types):
+ with open(file, 'rb') as infile:
return _md5_hexdigest(infile)
return _md5_hexdigest(file)
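# Illustrative use (names follow the package index records): a downloaded file
# can be validated against its index entry with something like
#     if md5_hexdigest(filepath) != pkg.checksum:
#         raise ValueError('checksum mismatch for %s' % filepath)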
def _unzip_iter(filename, root, verbose=True):
if verbose:
- sys.stdout.write("Unzipping %s" % os.path.split(filename)[1])
+ sys.stdout.write('Unzipping %s' % os.path.split(filename)[1])
sys.stdout.flush()
try:
zf = zipfile.ZipFile(filename)
except zipfile.error as e:
- yield ErrorMessage(filename, "Error with downloaded zip file")
+ yield ErrorMessage(filename, 'Error with downloaded zip file')
return
except Exception as e:
yield ErrorMessage(filename, e)
return
- zf.extractall(root)
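+ # Extract the archive by hand rather than calling zf.extractall(), so that
+ # per-file failures can be reported as ErrorMessage objects and progress
+ # dots can be written as files are extracted.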
+ # Get lists of directories & files
+ namelist = zf.namelist()
+ dirlist = set()
+ for x in namelist:
+ if x.endswith('/'):
+ dirlist.add(x)
+ else:
+ dirlist.add(x.rsplit('/', 1)[0] + '/')
+ filelist = [x for x in namelist if not x.endswith('/')]
+
+ # Create the target directory if it doesn't exist
+ if not os.path.exists(root):
+ os.mkdir(root)
+
+ # Create the directory structure
+ for dirname in sorted(dirlist):
+ pieces = dirname[:-1].split('/')
+ for i in range(len(pieces)):
+ dirpath = os.path.join(root, *pieces[: i + 1])
+ if not os.path.exists(dirpath):
+ os.mkdir(dirpath)
+
+ # Extract files.
+ for i, filename in enumerate(filelist):
+ filepath = os.path.join(root, *filename.split('/'))
+
+ try:
+ with open(filepath, 'wb') as dstfile, zf.open(filename) as srcfile:
+ shutil.copyfileobj(srcfile, dstfile)
+ except Exception as e:
+ yield ErrorMessage(filename, e)
+ return
+ if verbose and (i * 10 / len(filelist) > (i - 1) * 10 / len(filelist)):
+ sys.stdout.write('.')
+ sys.stdout.flush()
if verbose:
print()
"""
# Find all packages.
packages = []
- for pkg_xml, zf, subdir in _find_packages(os.path.join(root, "packages")):
+ for pkg_xml, zf, subdir in _find_packages(os.path.join(root, 'packages')):
zipstat = os.stat(zf.filename)
- url = "%s/%s/%s" % (base_url, subdir, os.path.split(zf.filename)[1])
+ url = '%s/%s/%s' % (base_url, subdir, os.path.split(zf.filename)[1])
unzipped_size = sum(zf_info.file_size for zf_info in zf.infolist())
# Fill in several fields of the package xml with calculated values.
- pkg_xml.set("unzipped_size", "%s" % unzipped_size)
- pkg_xml.set("size", "%s" % zipstat.st_size)
- pkg_xml.set("checksum", "%s" % md5_hexdigest(zf.filename))
- pkg_xml.set("subdir", subdir)
+ pkg_xml.set('unzipped_size', '%s' % unzipped_size)
+ pkg_xml.set('size', '%s' % zipstat.st_size)
+ pkg_xml.set('checksum', '%s' % md5_hexdigest(zf.filename))
+ pkg_xml.set('subdir', subdir)
# pkg_xml.set('svn_revision', _svn_revision(zf.filename))
- if not pkg_xml.get("url"):
- pkg_xml.set("url", url)
+ if not pkg_xml.get('url'):
+ pkg_xml.set('url', url)
# Record the package.
packages.append(pkg_xml)
# Find all collections
- collections = list(_find_collections(os.path.join(root, "collections")))
+ collections = list(_find_collections(os.path.join(root, 'collections')))
# Check that all UIDs are unique
uids = set()
for item in packages + collections:
- if item.get("id") in uids:
- raise ValueError("Duplicate UID: %s" % item.get("id"))
- uids.add(item.get("id"))
+ if item.get('id') in uids:
+ raise ValueError('Duplicate UID: %s' % item.get('id'))
+ uids.add(item.get('id'))
# Put it all together
- top_elt = ElementTree.Element("nltk_data")
- top_elt.append(ElementTree.Element("packages"))
+ top_elt = ElementTree.Element('nltk_data')
+ top_elt.append(ElementTree.Element('packages'))
for package in packages:
top_elt[0].append(package)
- top_elt.append(ElementTree.Element("collections"))
+ top_elt.append(ElementTree.Element('collections'))
for collection in collections:
top_elt[1].append(collection)
return top_elt
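# A minimal sketch of building and writing an index with these helpers (the
# paths and URL are illustrative):
#     index = build_index('/path/to/data_source', 'https://example.org/nltk_data')
#     _indent_xml(index)
#     ElementTree.ElementTree(index).write('index.xml')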
-def _indent_xml(xml, prefix=""):
+def _indent_xml(xml, prefix=''):
"""
Helper for ``build_index()``: Given an XML ``ElementTree``, modify it
(and its descendents) ``text`` and ``tail`` attributes to generate
spaces with respect to its parent.
"""
if len(xml) > 0:
- xml.text = (xml.text or "").strip() + "\n" + prefix + " "
+ xml.text = (xml.text or '').strip() + '\n' + prefix + ' '
for child in xml:
- _indent_xml(child, prefix + " ")
+ _indent_xml(child, prefix + ' ')
for child in xml[:-1]:
- child.tail = (child.tail or "").strip() + "\n" + prefix + " "
- xml[-1].tail = (xml[-1].tail or "").strip() + "\n" + prefix
+ child.tail = (child.tail or '').strip() + '\n' + prefix + ' '
+ xml[-1].tail = (xml[-1].tail or '').strip() + '\n' + prefix
def _check_package(pkg_xml, zipfilename, zf):
"""
# The filename must match the id given in the XML file.
uid = os.path.splitext(os.path.split(zipfilename)[1])[0]
- if pkg_xml.get("id") != uid:
+ if pkg_xml.get('id') != uid:
raise ValueError(
- "package identifier mismatch (%s vs %s)" % (pkg_xml.get("id"), uid)
+ 'package identifier mismatch (%s vs %s)' % (pkg_xml.get('id'), uid)
)
# Zip file must expand to a subdir whose name matches uid.
- if sum((name != uid and not name.startswith(uid + "/")) for name in zf.namelist()):
+ if sum((name != uid and not name.startswith(uid + '/')) for name in zf.namelist()):
raise ValueError(
- "Zipfile %s.zip does not expand to a single "
- "subdirectory %s/" % (uid, uid)
+ 'Zipfile %s.zip does not expand to a single '
+ 'subdirectory %s/' % (uid, uid)
)
number for a given file (by using ``subprocess`` to run ``svn``).
"""
p = subprocess.Popen(
- ["svn", "status", "-v", filename],
+ ['svn', 'status', '-v', filename],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
(stdout, stderr) = p.communicate()
if p.returncode != 0 or stderr or not stdout:
raise ValueError(
- "Error determining svn_revision for %s: %s"
+ 'Error determining svn_revision for %s: %s'
% (os.path.split(filename)[1], textwrap.fill(stderr))
)
return stdout.split()[2]
packages = []
for dirname, subdirs, files in os.walk(root):
for filename in files:
- if filename.endswith(".xml"):
+ if filename.endswith('.xml'):
xmlfile = os.path.join(dirname, filename)
yield ElementTree.parse(xmlfile).getroot()
# Find all packages.
packages = []
for dirname, subdirs, files in os.walk(root):
- relpath = "/".join(_path_from(root, dirname))
+ relpath = '/'.join(_path_from(root, dirname))
for filename in files:
- if filename.endswith(".xml"):
+ if filename.endswith('.xml'):
xmlfilename = os.path.join(dirname, filename)
- zipfilename = xmlfilename[:-4] + ".zip"
+ zipfilename = xmlfilename[:-4] + '.zip'
try:
zf = zipfile.ZipFile(zipfilename)
except Exception as e:
- raise ValueError("Error reading file %r!\n%s" % (zipfilename, e))
+ raise ValueError('Error reading file %r!\n%s' % (zipfilename, e))
try:
pkg_xml = ElementTree.parse(xmlfilename).getroot()
except Exception as e:
- raise ValueError("Error reading file %r!\n%s" % (xmlfilename, e))
+ raise ValueError('Error reading file %r!\n%s' % (xmlfilename, e))
# Check that the UID matches the filename
uid = os.path.split(xmlfilename[:-4])[1]
- if pkg_xml.get("id") != uid:
+ if pkg_xml.get('id') != uid:
raise ValueError(
- "package identifier mismatch (%s "
- "vs %s)" % (pkg_xml.get("id"), uid)
+ 'package identifier mismatch (%s '
+ 'vs %s)' % (pkg_xml.get('id'), uid)
)
# Check that the zipfile expands to a subdir whose
# name matches the uid.
if sum(
- (name != uid and not name.startswith(uid + "/"))
+ (name != uid and not name.startswith(uid + '/'))
for name in zf.namelist()
):
raise ValueError(
- "Zipfile %s.zip does not expand to a "
- "single subdirectory %s/" % (uid, uid)
+ 'Zipfile %s.zip does not expand to a '
+ 'single subdirectory %s/' % (uid, uid)
)
yield pkg_xml, zf, relpath
# Don't recurse into svn subdirectories:
try:
- subdirs.remove(".svn")
+ subdirs.remove('.svn')
except ValueError:
pass
_downloader.update()
-if __name__ == "__main__":
+if __name__ == '__main__':
from optparse import OptionParser
parser = OptionParser()
"-u",
"--url",
dest="server_index_url",
- default=os.environ.get("NLTK_DOWNLOAD_URL"),
+ default=os.environ.get('NLTK_DOWNLOAD_URL'),
help="download server index url",
)
# Natural Language Toolkit: graphical representations package
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# Import Tkinter-based modules if Tkinter is installed
try:
- import tkinter
+ from six.moves import tkinter
except ImportError:
import warnings
# Natural Language Toolkit: CFG visualization
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import re
-from tkinter import (
+from six import string_types
+from six.moves.tkinter import (
Button,
Canvas,
Entry,
class ProductionList(ColorizedList):
- ARROW = SymbolWidget.SYMBOLS["rightarrow"]
+ ARROW = SymbolWidget.SYMBOLS['rightarrow']
def _init_colortags(self, textwidget, options):
- textwidget.tag_config("terminal", foreground="#006000")
- textwidget.tag_config("arrow", font="symbol", underline="0")
+ textwidget.tag_config('terminal', foreground='#006000')
+ textwidget.tag_config('arrow', font='symbol', underline='0')
textwidget.tag_config(
- "nonterminal", foreground="blue", font=("helvetica", -12, "bold")
+ 'nonterminal', foreground='blue', font=('helvetica', -12, 'bold')
)
def _item_repr(self, item):
contents = []
- contents.append(("%s\t" % item.lhs(), "nonterminal"))
- contents.append((self.ARROW, "arrow"))
+ contents.append(('%s\t' % item.lhs(), 'nonterminal'))
+ contents.append((self.ARROW, 'arrow'))
for elt in item.rhs():
if isinstance(elt, Nonterminal):
- contents.append((" %s" % elt.symbol(), "nonterminal"))
+ contents.append((' %s' % elt.symbol(), 'nonterminal'))
else:
- contents.append((" %r" % elt, "terminal"))
+ contents.append((' %r' % elt, 'terminal'))
return contents
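# For a production such as S -> NP VP, _item_repr returns tagged segments
# roughly of the form [('S\t', 'nonterminal'), (ARROW, 'arrow'),
# (' NP', 'nonterminal'), (' VP', 'nonterminal')].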
# Regular expressions used by _analyze_line. Precompile them, so
# we can process the text faster.
- ARROW = SymbolWidget.SYMBOLS["rightarrow"]
+ ARROW = SymbolWidget.SYMBOLS['rightarrow']
_LHS_RE = re.compile(r"(^\s*\w+\s*)(->|(" + ARROW + "))")
_ARROW_RE = re.compile("\s*(->|(" + ARROW + "))\s*")
_PRODUCTION_RE = re.compile(
+ r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$" # arrow
) # RHS
_TOKEN_RE = re.compile("\\w+|->|'[\\w ]+'|\"[\\w ]+\"|(" + ARROW + ")")
- _BOLD = ("helvetica", -12, "bold")
+ _BOLD = ('helvetica', -12, 'bold')
def __init__(self, parent, cfg=None, set_cfg_callback=None):
self._parent = parent
if cfg is not None:
self._cfg = cfg
else:
- self._cfg = CFG(Nonterminal("S"), [])
+ self._cfg = CFG(Nonterminal('S'), [])
self._set_cfg_callback = set_cfg_callback
self._highlight_matching_nonterminals = 1
self._init_bindings()
self._init_startframe()
- self._startframe.pack(side="top", fill="x", expand=0)
+ self._startframe.pack(side='top', fill='x', expand=0)
self._init_prodframe()
- self._prodframe.pack(side="top", fill="both", expand=1)
+ self._prodframe.pack(side='top', fill='both', expand=1)
self._init_buttons()
- self._buttonframe.pack(side="bottom", fill="x", expand=0)
+ self._buttonframe.pack(side='bottom', fill='x', expand=0)
self._textwidget.focus()
def _init_startframe(self):
frame = self._startframe = Frame(self._top)
self._start = Entry(frame)
- self._start.pack(side="right")
- Label(frame, text="Start Symbol:").pack(side="right")
- Label(frame, text="Productions:").pack(side="left")
+ self._start.pack(side='right')
+ Label(frame, text='Start Symbol:').pack(side='right')
+ Label(frame, text='Productions:').pack(side='left')
self._start.insert(0, self._cfg.start().symbol())
def _init_buttons(self):
frame = self._buttonframe = Frame(self._top)
- Button(frame, text="Ok", command=self._ok, underline=0, takefocus=0).pack(
- side="left"
+ Button(frame, text='Ok', command=self._ok, underline=0, takefocus=0).pack(
+ side='left'
)
- Button(frame, text="Apply", command=self._apply, underline=0, takefocus=0).pack(
- side="left"
+ Button(frame, text='Apply', command=self._apply, underline=0, takefocus=0).pack(
+ side='left'
)
- Button(frame, text="Reset", command=self._reset, underline=0, takefocus=0).pack(
- side="left"
+ Button(frame, text='Reset', command=self._reset, underline=0, takefocus=0).pack(
+ side='left'
)
Button(
- frame, text="Cancel", command=self._cancel, underline=0, takefocus=0
- ).pack(side="left")
- Button(frame, text="Help", command=self._help, underline=0, takefocus=0).pack(
- side="right"
+ frame, text='Cancel', command=self._cancel, underline=0, takefocus=0
+ ).pack(side='left')
+ Button(frame, text='Help', command=self._help, underline=0, takefocus=0).pack(
+ side='right'
)
def _init_bindings(self):
- self._top.title("CFG Editor")
- self._top.bind("<Control-q>", self._cancel)
- self._top.bind("<Alt-q>", self._cancel)
- self._top.bind("<Control-d>", self._cancel)
+ self._top.title('CFG Editor')
+ self._top.bind('<Control-q>', self._cancel)
+ self._top.bind('<Alt-q>', self._cancel)
+ self._top.bind('<Control-d>', self._cancel)
# self._top.bind('<Control-x>', self._cancel)
- self._top.bind("<Alt-x>", self._cancel)
- self._top.bind("<Escape>", self._cancel)
+ self._top.bind('<Alt-x>', self._cancel)
+ self._top.bind('<Escape>', self._cancel)
# self._top.bind('<Control-c>', self._cancel)
- self._top.bind("<Alt-c>", self._cancel)
-
- self._top.bind("<Control-o>", self._ok)
- self._top.bind("<Alt-o>", self._ok)
- self._top.bind("<Control-a>", self._apply)
- self._top.bind("<Alt-a>", self._apply)
- self._top.bind("<Control-r>", self._reset)
- self._top.bind("<Alt-r>", self._reset)
- self._top.bind("<Control-h>", self._help)
- self._top.bind("<Alt-h>", self._help)
- self._top.bind("<F1>", self._help)
+ self._top.bind('<Alt-c>', self._cancel)
+
+ self._top.bind('<Control-o>', self._ok)
+ self._top.bind('<Alt-o>', self._ok)
+ self._top.bind('<Control-a>', self._apply)
+ self._top.bind('<Alt-a>', self._apply)
+ self._top.bind('<Control-r>', self._reset)
+ self._top.bind('<Alt-r>', self._reset)
+ self._top.bind('<Control-h>', self._help)
+ self._top.bind('<Alt-h>', self._help)
+ self._top.bind('<F1>', self._help)
def _init_prodframe(self):
self._prodframe = Frame(self._top)
# Create the basic Text widget & scrollbar.
self._textwidget = Text(
- self._prodframe, background="#e0e0e0", exportselection=1
+ self._prodframe, background='#e0e0e0', exportselection=1
)
- self._textscroll = Scrollbar(self._prodframe, takefocus=0, orient="vertical")
+ self._textscroll = Scrollbar(self._prodframe, takefocus=0, orient='vertical')
self._textwidget.config(yscrollcommand=self._textscroll.set)
self._textscroll.config(command=self._textwidget.yview)
- self._textscroll.pack(side="right", fill="y")
- self._textwidget.pack(expand=1, fill="both", side="left")
+ self._textscroll.pack(side='right', fill='y')
+ self._textwidget.pack(expand=1, fill='both', side='left')
# Initialize the colorization tags. Each nonterminal gets its
# own tag, so they aren't listed here.
- self._textwidget.tag_config("terminal", foreground="#006000")
- self._textwidget.tag_config("arrow", font="symbol")
- self._textwidget.tag_config("error", background="red")
+ self._textwidget.tag_config('terminal', foreground='#006000')
+ self._textwidget.tag_config('arrow', font='symbol')
+ self._textwidget.tag_config('error', background='red')
# Keep track of what line they're on. We use that to remember
# to re-analyze a line whenever they leave it.
self._linenum = 0
# Expand "->" to an arrow.
- self._top.bind(">", self._replace_arrows)
+ self._top.bind('>', self._replace_arrows)
# Re-colorize lines when appropriate.
- self._top.bind("<<Paste>>", self._analyze)
- self._top.bind("<KeyPress>", self._check_analyze)
- self._top.bind("<ButtonPress>", self._check_analyze)
+ self._top.bind('<<Paste>>', self._analyze)
+ self._top.bind('<KeyPress>', self._check_analyze)
+ self._top.bind('<ButtonPress>', self._check_analyze)
# Tab cycles focus. (why doesn't this work??)
def cycle(e, textwidget=self._textwidget):
textwidget.tk_focusNext().focus()
- self._textwidget.bind("<Tab>", cycle)
+ self._textwidget.bind('<Tab>', cycle)
prod_tuples = [(p.lhs(), [p.rhs()]) for p in self._cfg.productions()]
for i in range(len(prod_tuples) - 1, 0, -1):
for lhs, rhss in prod_tuples:
print(lhs, rhss)
- s = "%s ->" % lhs
+ s = '%s ->' % lhs
for rhs in rhss:
for elt in rhs:
if isinstance(elt, Nonterminal):
- s += " %s" % elt
+ s += ' %s' % elt
else:
- s += " %r" % elt
- s += " |"
- s = s[:-2] + "\n"
- self._textwidget.insert("end", s)
+ s += ' %r' % elt
+ s += ' |'
+ s = s[:-2] + '\n'
+ self._textwidget.insert('end', s)
self._analyze()
Remove all tags (except ``arrow`` and ``sel``) from the given
line of the text widget used for editing the productions.
"""
- start = "%d.0" % linenum
- end = "%d.end" % linenum
+ start = '%d.0' % linenum
+ end = '%d.end' % linenum
for tag in self._textwidget.tag_names():
- if tag not in ("arrow", "sel"):
+ if tag not in ('arrow', 'sel'):
self._textwidget.tag_remove(tag, start, end)
def _check_analyze(self, *e):
all colorization from the line we moved to, and re-colorize
the line that we moved from.
"""
- linenum = int(self._textwidget.index("insert").split(".")[0])
+ linenum = int(self._textwidget.index('insert').split('.')[0])
if linenum != self._linenum:
self._clear_tags(linenum)
self._analyze_line(self._linenum)
symbol font). This searches the whole buffer, but is fast
enough to be done each time the user presses '>'.
"""
- arrow = "1.0"
+ arrow = '1.0'
while True:
- arrow = self._textwidget.search("->", arrow, "end+1char")
- if arrow == "":
+ arrow = self._textwidget.search('->', arrow, 'end+1char')
+ if arrow == '':
break
- self._textwidget.delete(arrow, arrow + "+2char")
- self._textwidget.insert(arrow, self.ARROW, "arrow")
- self._textwidget.insert(arrow, "\t")
+ self._textwidget.delete(arrow, arrow + '+2char')
+ self._textwidget.insert(arrow, self.ARROW, 'arrow')
+ self._textwidget.insert(arrow, '\t')
- arrow = "1.0"
+ arrow = '1.0'
while True:
- arrow = self._textwidget.search(self.ARROW, arrow + "+1char", "end+1char")
- if arrow == "":
+ arrow = self._textwidget.search(self.ARROW, arrow + '+1char', 'end+1char')
+ if arrow == '':
break
- self._textwidget.tag_add("arrow", arrow, arrow + "+1char")
+ self._textwidget.tag_add('arrow', arrow, arrow + '+1char')
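The loops above rely on Text.search(), which returns the "line.col" index of the next match or an empty string when there are no more, and on "+Nchar" index arithmetic to pick out the matched span. A small standalone sketch of that idiom (the Unicode arrow below merely stands in for the symbol-font glyph held in self.ARROW):

    from tkinter import Tk, Text

    root = Tk()
    text = Text(root)
    text.insert('end', "S -> NP VP\nVP -> V NP\n")
    pos = '1.0'
    while True:
        pos = text.search('->', pos, 'end')
        if pos == '':                      # no more matches
            break
        text.delete(pos, pos + '+2char')   # remove the two-character "->"
        text.insert(pos, '\u2192')         # insert a single arrow character
    print(text.get('1.0', 'end'))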
def _analyze_token(self, match, linenum):
"""
"""
# What type of token is it?
if match.group()[0] in "'\"":
- tag = "terminal"
- elif match.group() in ("->", self.ARROW):
- tag = "arrow"
+ tag = 'terminal'
+ elif match.group() in ('->', self.ARROW):
+ tag = 'arrow'
else:
# If it's a nonterminal, then set up new bindings, so we
# can highlight all instances of that nonterminal when we
# put the mouse over it.
- tag = "nonterminal_" + match.group()
+ tag = 'nonterminal_' + match.group()
if tag not in self._textwidget.tag_names():
self._init_nonterminal_tag(tag)
- start = "%d.%d" % (linenum, match.start())
- end = "%d.%d" % (linenum, match.end())
+ start = '%d.%d' % (linenum, match.start())
+ end = '%d.%d' % (linenum, match.end())
self._textwidget.tag_add(tag, start, end)
- def _init_nonterminal_tag(self, tag, foreground="blue"):
+ def _init_nonterminal_tag(self, tag, foreground='blue'):
self._textwidget.tag_config(tag, foreground=foreground, font=CFGEditor._BOLD)
if not self._highlight_matching_nonterminals:
return
def enter(e, textwidget=self._textwidget, tag=tag):
- textwidget.tag_config(tag, background="#80ff80")
+ textwidget.tag_config(tag, background='#80ff80')
def leave(e, textwidget=self._textwidget, tag=tag):
- textwidget.tag_config(tag, background="")
+ textwidget.tag_config(tag, background='')
- self._textwidget.tag_bind(tag, "<Enter>", enter)
- self._textwidget.tag_bind(tag, "<Leave>", leave)
+ self._textwidget.tag_bind(tag, '<Enter>', enter)
+ self._textwidget.tag_bind(tag, '<Leave>', leave)
def _analyze_line(self, linenum):
"""
self._clear_tags(linenum)
# Get the line's text string.
- line = self._textwidget.get(repr(linenum) + ".0", repr(linenum) + ".end")
+ line = self._textwidget.get(repr(linenum) + '.0', repr(linenum) + '.end')
# If it's a valid production, then colorize each token.
if CFGEditor._PRODUCTION_RE.match(line):
# and call analyze_token on each token.
def analyze_token(match, self=self, linenum=linenum):
self._analyze_token(match, linenum)
- return ""
+ return ''
CFGEditor._TOKEN_RE.sub(analyze_token, line)
- elif line.strip() != "":
+ elif line.strip() != '':
# It's invalid; show the user where the error is.
self._mark_error(linenum, line)
arrowmatch = CFGEditor._ARROW_RE.search(line)
if not arrowmatch:
# If there's no arrow at all, highlight the whole line.
- start = "%d.0" % linenum
- end = "%d.end" % linenum
+ start = '%d.0' % linenum
+ end = '%d.end' % linenum
elif not CFGEditor._LHS_RE.match(line):
# Otherwise, if the LHS is bad, highlight it.
- start = "%d.0" % linenum
- end = "%d.%d" % (linenum, arrowmatch.start())
+ start = '%d.0' % linenum
+ end = '%d.%d' % (linenum, arrowmatch.start())
else:
# Otherwise, highlight the RHS.
- start = "%d.%d" % (linenum, arrowmatch.end())
- end = "%d.end" % linenum
+ start = '%d.%d' % (linenum, arrowmatch.end())
+ end = '%d.end' % linenum
# If we're highlighting 0 chars, highlight the whole line.
- if self._textwidget.compare(start, "==", end):
- start = "%d.0" % linenum
- end = "%d.end" % linenum
- self._textwidget.tag_add("error", start, end)
+ if self._textwidget.compare(start, '==', end):
+ start = '%d.0' % linenum
+ end = '%d.end' % linenum
+ self._textwidget.tag_add('error', start, end)
def _analyze(self, *e):
"""
Replace ``->`` with arrows, and colorize the entire buffer.
"""
self._replace_arrows()
- numlines = int(self._textwidget.index("end").split(".")[0])
+ numlines = int(self._textwidget.index('end').split('.')[0])
for linenum in range(1, numlines + 1): # line numbers start at 1.
self._analyze_line(linenum)
productions = []
# Get the text, normalize it, and split it into lines.
- text = self._textwidget.get("1.0", "end")
- text = re.sub(self.ARROW, "->", text)
- text = re.sub("\t", " ", text)
- lines = text.split("\n")
+ text = self._textwidget.get('1.0', 'end')
+ text = re.sub(self.ARROW, '->', text)
+ text = re.sub('\t', ' ', text)
+ lines = text.split('\n')
# Convert each line to a CFG production
for line in lines:
line = line.strip()
- if line == "":
+ if line == '':
continue
productions += _read_cfg_production(line)
# if line.strip() == '': continue
self._set_cfg_callback(cfg)
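As a hedged aside: the per-line _read_cfg_production() calls above build the same kind of Production objects that the public CFG.fromstring() constructor produces when handed the whole buffer at once, so the text-to-grammar round trip can be sketched without the GUI:

    from nltk import CFG

    text = """
    S -> NP VP
    NP -> Det N | 'I'
    VP -> V NP
    Det -> 'the' | 'a'
    N -> 'man' | 'park'
    V -> 'saw'
    """
    grammar = CFG.fromstring(text)
    for production in grammar.productions():
        print(production)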
def _reset(self, *e):
- self._textwidget.delete("1.0", "end")
+ self._textwidget.delete('1.0', 'end')
for production in self._cfg.productions():
- self._textwidget.insert("end", "%s\n" % production)
+ self._textwidget.insert('end', '%s\n' % production)
self._analyze()
if self._set_cfg_callback is not None:
self._set_cfg_callback(self._cfg)
try:
ShowText(
self._parent,
- "Help: Chart Parser Demo",
+ 'Help: Chart Parser Demo',
(_CFGEditor_HELP).strip(),
width=75,
- font="fixed",
+ font='fixed',
)
except:
ShowText(
self._parent,
- "Help: Chart Parser Demo",
+ 'Help: Chart Parser Demo',
(_CFGEditor_HELP).strip(),
width=75,
)
# Set up the main window.
self._top = Tk()
- self._top.title("Context Free Grammar Demo")
+ self._top.title('Context Free Grammar Demo')
# Base font size
self._size = IntVar(self._top)
# Create the basic frames
frame1 = Frame(self._top)
- frame1.pack(side="left", fill="y", expand=0)
+ frame1.pack(side='left', fill='y', expand=0)
self._init_menubar(self._top)
self._init_buttons(self._top)
self._init_grammar(frame1)
# //////////////////////////////////////////////////
def _init_bindings(self, top):
- top.bind("<Control-q>", self.destroy)
+ top.bind('<Control-q>', self.destroy)
def _init_menubar(self, parent):
pass
def _init_grammar(self, parent):
self._prodlist = ProductionList(parent, self._grammar, width=20)
- self._prodlist.pack(side="top", fill="both", expand=1)
+ self._prodlist.pack(side='top', fill='both', expand=1)
self._prodlist.focus()
- self._prodlist.add_callback("select", self._selectprod_cb)
- self._prodlist.add_callback("move", self._selectprod_cb)
+ self._prodlist.add_callback('select', self._selectprod_cb)
+ self._prodlist.add_callback('move', self._selectprod_cb)
def _init_treelet(self, parent):
- self._treelet_canvas = Canvas(parent, background="white")
- self._treelet_canvas.pack(side="bottom", fill="x")
+ self._treelet_canvas = Canvas(parent, background='white')
+ self._treelet_canvas.pack(side='bottom', fill='x')
self._treelet = None
def _init_workspace(self, parent):
- self._workspace = CanvasFrame(parent, background="white")
- self._workspace.pack(side="right", fill="both", expand=1)
+ self._workspace = CanvasFrame(parent, background='white')
+ self._workspace.pack(side='right', fill='both', expand=1)
self._tree = None
self.reset_workspace()
def reset_workspace(self):
c = self._workspace.canvas()
fontsize = int(self._size.get())
- node_font = ("helvetica", -(fontsize + 4), "bold")
- leaf_font = ("helvetica", -(fontsize + 2))
+ node_font = ('helvetica', -(fontsize + 4), 'bold')
+ leaf_font = ('helvetica', -(fontsize + 2))
# Remove the old tree
if self._tree is not None:
leaves.append(TextWidget(c, word, font=leaf_font, draggable=1))
# Put it all together into one tree
- self._tree = TreeSegmentWidget(c, rootnode, leaves, color="white")
+ self._tree = TreeSegmentWidget(c, rootnode, leaves, color='white')
# Add it to the workspace.
self._workspace.add_widget(self._tree)
if tree is None:
tree = self._tree
for i in range(len(tree.subtrees()) - len(prod.rhs())):
- if tree["color", i] == "white":
+ if tree['color', i] == 'white':
self._markproduction # FIXME: Is this necessary at all?
for j, node in enumerate(prod.rhs()):
):
pass # matching nonterminal
elif (
- isinstance(node, str)
+ isinstance(node, string_types)
and isinstance(widget, TextWidget)
and node == widget.text()
):
break
else:
# Everything matched!
- print("MATCH AT", i)
+ print('MATCH AT', i)
# //////////////////////////////////////////////////
# Grammar
# Draw the tree in the treelet area.
fontsize = int(self._size.get())
- node_font = ("helvetica", -(fontsize + 4), "bold")
- leaf_font = ("helvetica", -(fontsize + 2))
+ node_font = ('helvetica', -(fontsize + 4), 'bold')
+ leaf_font = ('helvetica', -(fontsize + 2))
self._treelet = tree_to_treesegment(
canvas, tree, node_font=node_font, leaf_font=leaf_font
)
- self._treelet["draggable"] = 1
+ self._treelet['draggable'] = 1
# Center the treelet.
(x1, y1, x2, y2) = self._treelet.bbox()
- w, h = int(canvas["width"]), int(canvas["height"])
+ w, h = int(canvas['width']), int(canvas['height'])
self._treelet.move((w - x1 - x2) / 2, (h - y1 - y2) / 2)
# Mark the places where we can add it to the workspace.
def demo2():
from nltk import Nonterminal, Production, CFG
- nonterminals = "S VP NP PP P N Name V Det"
+ nonterminals = 'S VP NP PP P N Name V Det'
(S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s) for s in nonterminals.split()]
productions = (
# Syntactic Productions
Production(VP, [V, NP]),
Production(PP, [P, NP]),
Production(PP, []),
- Production(PP, ["up", "over", NP]),
+ Production(PP, ['up', 'over', NP]),
# Lexical Productions
- Production(NP, ["I"]),
- Production(Det, ["the"]),
- Production(Det, ["a"]),
- Production(N, ["man"]),
- Production(V, ["saw"]),
- Production(P, ["in"]),
- Production(P, ["with"]),
- Production(N, ["park"]),
- Production(N, ["dog"]),
- Production(N, ["statue"]),
- Production(Det, ["my"]),
+ Production(NP, ['I']),
+ Production(Det, ['the']),
+ Production(Det, ['a']),
+ Production(N, ['man']),
+ Production(V, ['saw']),
+ Production(P, ['in']),
+ Production(P, ['with']),
+ Production(N, ['park']),
+ Production(N, ['dog']),
+ Production(N, ['statue']),
+ Production(Det, ['my']),
)
grammar = CFG(S, productions)
- text = "I saw a man in the park".split()
+ text = 'I saw a man in the park'.split()
d = CFGDemo(grammar, text)
d.mainloop()
def demo():
from nltk import Nonterminal, CFG
- nonterminals = "S VP NP PP P N Name V Det"
+ nonterminals = 'S VP NP PP P N Name V Det'
(S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s) for s in nonterminals.split()]
grammar = CFG.fromstring(
top = Tk()
editor = CFGEditor(top, grammar, cb)
- Label(top, text="\nTesting CFG Editor\n").pack()
- Button(top, text="Quit", command=top.destroy).pack()
+ Label(top, text='\nTesting CFG Editor\n').pack()
+ Button(top, text='Quit', command=top.destroy).pack()
top.mainloop()
from nltk import Production
(S, VP, NP, PP, P, N, Name, V, Det) = nonterminals(
- "S, VP, NP, PP, P, N, Name, V, Det"
+ 'S, VP, NP, PP, P, N, Name, V, Det'
)
productions = (
Production(VP, [V, NP]),
Production(PP, [P, NP]),
Production(PP, []),
- Production(PP, ["up", "over", NP]),
+ Production(PP, ['up', 'over', NP]),
# Lexical Productions
- Production(NP, ["I"]),
- Production(Det, ["the"]),
- Production(Det, ["a"]),
- Production(N, ["man"]),
- Production(V, ["saw"]),
- Production(P, ["in"]),
- Production(P, ["with"]),
- Production(N, ["park"]),
- Production(N, ["dog"]),
- Production(N, ["statue"]),
- Production(Det, ["my"]),
+ Production(NP, ['I']),
+ Production(Det, ['the']),
+ Production(Det, ['a']),
+ Production(N, ['man']),
+ Production(V, ['saw']),
+ Production(P, ['in']),
+ Production(P, ['with']),
+ Production(N, ['park']),
+ Production(N, ['dog']),
+ Production(N, ['statue']),
+ Production(Det, ['my']),
)
t = Tk()
def destroy(e, t=t):
t.destroy()
- t.bind("q", destroy)
+ t.bind('q', destroy)
p = ProductionList(t, productions)
- p.pack(expand=1, fill="both")
- p.add_callback("select", p.markonly)
- p.add_callback("move", p.markonly)
+ p.pack(expand=1, fill='both')
+ p.add_callback('select', p.markonly)
+ p.add_callback('move', p.markonly)
p.focus()
p.mark(productions[2])
p.mark(productions[8])
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Dispersion Plots
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from matplotlib import pylab
except ImportError:
raise ValueError(
- "The plot function requires matplotlib to be installed."
- "See http://matplotlib.org/"
+ 'The plot function requires matplotlib to be installed. '
+ 'See http://matplotlib.org/'
)
text = list(text)
pylab.show()
-if __name__ == "__main__":
+if __name__ == '__main__':
+ import nltk.compat
from nltk.corpus import gutenberg
- words = ["Elinor", "Marianne", "Edward", "Willoughby"]
- dispersion_plot(gutenberg.words("austen-sense.txt"), words)
+ words = ['Elinor', 'Marianne', 'Edward', 'Willoughby']
+ dispersion_plot(gutenberg.words('austen-sense.txt'), words)
# Natural Language Toolkit: Table widget
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
Tkinter widgets for displaying multi-column listboxes and tables.
"""
+from __future__ import division
+
+
import operator
-from tkinter import Frame, Label, Listbox, Scrollbar, Tk
+from six.moves.tkinter import Frame, Label, Listbox, Scrollbar, Tk
######################################################################
# /////////////////////////////////////////////////////////////////
#: Default configuration values for the frame.
- FRAME_CONFIG = dict(background="#888", takefocus=True, highlightthickness=1)
+ FRAME_CONFIG = dict(background='#888', takefocus=True, highlightthickness=1)
#: Default configurations for the column labels.
LABEL_CONFIG = dict(
borderwidth=1,
- relief="raised",
- font="helvetica -16 bold",
- background="#444",
- foreground="white",
+ relief='raised',
+ font='helvetica -16 bold',
+ background='#444',
+ foreground='white',
)
#: Default configuration for the column listboxes.
selectborderwidth=0,
highlightthickness=0,
exportselection=False,
- selectbackground="#888",
- activestyle="none",
+ selectbackground='#888',
+ activestyle='none',
takefocus=False,
)
if column_weights is None:
column_weights = [1] * len(columns)
elif len(column_weights) != len(columns):
- raise ValueError("Expected one column_weight for each column")
+ raise ValueError('Expected one column_weight for each column')
self._column_weights = column_weights
# Configure our widgets.
if include_labels:
l = Label(self, text=label, **self.LABEL_CONFIG)
self._labels.append(l)
- l.grid(column=i, row=0, sticky="news", padx=0, pady=0)
+ l.grid(column=i, row=0, sticky='news', padx=0, pady=0)
l.column_index = i
# Create a listbox for the column
lb = Listbox(self, **self.LISTBOX_CONFIG)
self._listboxes.append(lb)
- lb.grid(column=i, row=1, sticky="news", padx=0, pady=0)
+ lb.grid(column=i, row=1, sticky='news', padx=0, pady=0)
lb.column_index = i
# Clicking or dragging selects:
- lb.bind("<Button-1>", self._select)
- lb.bind("<B1-Motion>", self._select)
+ lb.bind('<Button-1>', self._select)
+ lb.bind('<B1-Motion>', self._select)
# Scroll wheel scrolls:
- lb.bind("<Button-4>", lambda e: self._scroll(-1))
- lb.bind("<Button-5>", lambda e: self._scroll(+1))
- lb.bind("<MouseWheel>", lambda e: self._scroll(e.delta))
+ lb.bind('<Button-4>', lambda e: self._scroll(-1))
+ lb.bind('<Button-5>', lambda e: self._scroll(+1))
+ lb.bind('<MouseWheel>', lambda e: self._scroll(e.delta))
# Button 2 can be used to scan:
- lb.bind("<Button-2>", lambda e: self.scan_mark(e.x, e.y))
- lb.bind("<B2-Motion>", lambda e: self.scan_dragto(e.x, e.y))
+ lb.bind('<Button-2>', lambda e: self.scan_mark(e.x, e.y))
+ lb.bind('<B2-Motion>', lambda e: self.scan_dragto(e.x, e.y))
# Dragging outside the window has no effect (disable
# the default listbox behavior, which scrolls):
- lb.bind("<B1-Leave>", lambda e: "break")
+ lb.bind('<B1-Leave>', lambda e: 'break')
# Columns can be resized by dragging them:
- l.bind("<Button-1>", self._resize_column)
+ l.bind('<Button-1>', self._resize_column)
# Columns can be resized by dragging them. (This binding is
# used if they click on the grid between columns:)
- self.bind("<Button-1>", self._resize_column)
+ self.bind('<Button-1>', self._resize_column)
# Set up key bindings for the widget:
- self.bind("<Up>", lambda e: self.select(delta=-1))
- self.bind("<Down>", lambda e: self.select(delta=1))
- self.bind("<Prior>", lambda e: self.select(delta=-self._pagesize()))
- self.bind("<Next>", lambda e: self.select(delta=self._pagesize()))
+ self.bind('<Up>', lambda e: self.select(delta=-1))
+ self.bind('<Down>', lambda e: self.select(delta=1))
+ self.bind('<Prior>', lambda e: self.select(delta=-self._pagesize()))
+ self.bind('<Next>', lambda e: self.select(delta=self._pagesize()))
# Configuration customizations
self.configure(cnf, **kw)
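Stripped of configuration detail, the MultiListbox trick is just several ordinary Listbox widgets gridded side by side, with wheel and arrow events forwarded to every column so they scroll as one table. A minimal sketch of that idea (plain tkinter, hypothetical names; not the class defined here):

    from tkinter import Tk, Listbox

    root = Tk()
    columns = [Listbox(root, exportselection=False) for _ in range(3)]
    for col, lb in enumerate(columns):
        lb.grid(row=0, column=col, sticky='news')
        for row in range(50):
            lb.insert('end', 'cell %d,%d' % (row, col))

    def scroll_all(delta):
        for lb in columns:
            lb.yview_scroll(delta, 'units')
        return 'break'                      # stop the default per-widget scrolling

    for lb in columns:
        lb.bind('<Button-4>', lambda e: scroll_all(-1))   # X11 wheel up
        lb.bind('<Button-5>', lambda e: scroll_all(+1))   # X11 wheel down
    root.mainloop()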
"""
# If we're already waiting for a button release, then ignore
# the new button press.
- if event.widget.bind("<ButtonRelease>"):
+ if event.widget.bind('<ButtonRelease>'):
return False
# Decide which column (if any) to resize.
# Bind callbacks that are used to resize it.
if self._resize_column_index is not None:
- event.widget.bind("<Motion>", self._resize_column_motion_cb)
+ event.widget.bind('<Motion>', self._resize_column_motion_cb)
event.widget.bind(
- "<ButtonRelease-%d>" % event.num, self._resize_column_buttonrelease_cb
+ '<ButtonRelease-%d>' % event.num, self._resize_column_buttonrelease_cb
)
return True
else:
def _resize_column_motion_cb(self, event):
lb = self._listboxes[self._resize_column_index]
- charwidth = lb.winfo_width() / lb["width"]
+ charwidth = lb.winfo_width() / lb['width']
x1 = event.x + event.widget.winfo_x()
x2 = lb.winfo_x() + lb.winfo_width()
- lb["width"] = max(3, lb["width"] + (x1 - x2) // charwidth)
+ lb['width'] = max(3, lb['width'] + (x1 - x2) // charwidth)
def _resize_column_buttonrelease_cb(self, event):
- event.widget.unbind("<ButtonRelease-%d>" % event.num)
- event.widget.unbind("<Motion>")
+ event.widget.unbind('<ButtonRelease-%d>' % event.num)
+ event.widget.unbind('<Motion>')
# /////////////////////////////////////////////////////////////////
# Properties
def _select(self, e):
i = e.widget.nearest(e.y)
- self.selection_clear(0, "end")
+ self.selection_clear(0, 'end')
self.selection_set(i)
self.activate(i)
self.focus()
def _scroll(self, delta):
for lb in self._listboxes:
- lb.yview_scroll(delta, "unit")
- return "break"
+ lb.yview_scroll(delta, 'unit')
+ return 'break'
def _pagesize(self):
""":return: The number of rows that makes up one page"""
- return int(self.index("@0,1000000")) - int(self.index("@0,0"))
+ return int(self.index('@0,1000000')) - int(self.index('@0,0'))
# /////////////////////////////////////////////////////////////////
# Row selection
selected index, to ensure that it is visible.
"""
if (index is not None) and (delta is not None):
- raise ValueError("specify index or delta, but not both")
+ raise ValueError('specify index or delta, but not both')
# If delta was given, then calculate index.
if delta is not None:
index = int(self.curselection()[0]) + delta
# Clear all selected rows.
- self.selection_clear(0, "end")
+ self.selection_clear(0, 'end')
# Select the specified index
if index is not None:
"""
cnf = dict(list(cnf.items()) + list(kw.items()))
for (key, val) in list(cnf.items()):
- if key.startswith("label_") or key.startswith("label-"):
+ if key.startswith('label_') or key.startswith('label-'):
for label in self._labels:
label.configure({key[6:]: val})
- elif key.startswith("listbox_") or key.startswith("listbox-"):
+ elif key.startswith('listbox_') or key.startswith('listbox-'):
for listbox in self._listboxes:
listbox.configure({key[8:]: val})
else:
cnf = dict(list(cnf.items()) + list(kw.items()))
for (key, val) in list(cnf.items()):
if key in (
- "background",
- "bg",
- "foreground",
- "fg",
- "selectbackground",
- "selectforeground",
+ 'background',
+ 'bg',
+ 'foreground',
+ 'fg',
+ 'selectbackground',
+ 'selectforeground',
):
for i in range(lb.size()):
lb.itemconfigure(i, {key: val})
for elt in rows:
if len(elt) != len(self._column_names):
raise ValueError(
- "rows should be tuples whose length "
- "is equal to the number of columns"
+ 'rows should be tuples whose length '
+ 'is equal to the number of columns'
)
for (lb, elts) in zip(self._listboxes, list(zip(*rows))):
lb.insert(index, *elts)
weight = self._column_weights[col_index]
if self._labels:
self._labels[col_index].grid(
- column=col_index, row=0, sticky="news", padx=0, pady=0
+ column=col_index, row=0, sticky='news', padx=0, pady=0
)
self._listboxes[col_index].grid(
- column=col_index, row=1, sticky="news", padx=0, pady=0
+ column=col_index, row=1, sticky='news', padx=0, pady=0
)
self.grid_columnconfigure(col_index, weight=weight)
# Create our multi-list box.
self._mlb = MultiListbox(self._frame, column_names, column_weights, cnf, **kw)
- self._mlb.pack(side="left", expand=True, fill="both")
+ self._mlb.pack(side='left', expand=True, fill='both')
# Optional scrollbar
if scrollbar:
- sb = Scrollbar(self._frame, orient="vertical", command=self._mlb.yview)
- self._mlb.listboxes[0]["yscrollcommand"] = sb.set
+ sb = Scrollbar(self._frame, orient='vertical', command=self._mlb.yview)
+ self._mlb.listboxes[0]['yscrollcommand'] = sb.set
# for listbox in self._mlb.listboxes:
# listbox['yscrollcommand'] = sb.set
- sb.pack(side="right", fill="y")
+ sb.pack(side='right', fill='y')
self._scrollbar = sb
# Set up sorting
self._sortkey = None
if click_to_sort:
for i, l in enumerate(self._mlb.column_labels):
- l.bind("<Button-1>", self._sort)
+ l.bind('<Button-1>', self._sort)
# Fill in our multi-list box.
self._fill_table()
Delete all rows in this table.
"""
self._rows = []
- self._mlb.delete(0, "end")
+ self._mlb.delete(0, 'end')
if self._DEBUG:
self._check_table_vs_mlb()
``i``th row and the ``j``th column.
"""
if isinstance(index, slice):
- raise ValueError("Slicing not supported")
+ raise ValueError('Slicing not supported')
elif isinstance(index, tuple) and len(index) == 2:
return self._rows[index[0]][self.column_index(index[1])]
else:
``val``.
"""
if isinstance(index, slice):
- raise ValueError("Slicing not supported")
+ raise ValueError('Slicing not supported')
# table[i,j] = val
elif isinstance(index, tuple) and len(index) == 2:
Delete the ``row_index``th row from this table.
"""
if isinstance(row_index, slice):
- raise ValueError("Slicing not supported")
+ raise ValueError('Slicing not supported')
if isinstance(row_index, tuple) and len(row_index) == 2:
- raise ValueError("Cannot delete a single cell!")
+ raise ValueError('Cannot delete a single cell!')
del self._rows[row_index]
self._mlb.delete(row_index)
if self._DEBUG:
"""
if len(rowvalue) != self._num_columns:
raise ValueError(
- "Row %r has %d columns; expected %d"
+ 'Row %r has %d columns; expected %d'
% (rowvalue, len(rowvalue), self._num_columns)
)
# Sorting
# /////////////////////////////////////////////////////////////////
- def sort_by(self, column_index, order="toggle"):
+ def sort_by(self, column_index, order='toggle'):
"""
Sort the rows in this table, using the specified column's
values as a sort key.
then reverse the rows; otherwise sort in ascending
order.
"""
- if order not in ("ascending", "descending", "toggle"):
+ if order not in ('ascending', 'descending', 'toggle'):
raise ValueError(
'sort_by(): order should be "ascending", ' '"descending", or "toggle".'
)
config_cookie = self._save_config_info(index_by_id=True)
# Sort the rows.
- if order == "toggle" and column_index == self._sortkey:
+ if order == 'toggle' and column_index == self._sortkey:
self._rows.reverse()
else:
self._rows.sort(
- key=operator.itemgetter(column_index), reverse=(order == "descending")
+ key=operator.itemgetter(column_index), reverse=(order == 'descending')
)
self._sortkey = column_index
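The toggle behaviour is easiest to see outside the widget: rows are plain tuples, operator.itemgetter picks the sort key, and a second request for the column that is already the sort key simply reverses the current order. A small sketch (the rows/sortkey names here are hypothetical stand-ins for the instance attributes above):

    import operator

    rows = [('dog', 3), ('ant', 1), ('cat', 2)]
    sortkey = None

    def sort_by(column_index, order='toggle'):
        global sortkey
        if order == 'toggle' and column_index == sortkey:
            rows.reverse()
        else:
            rows.sort(key=operator.itemgetter(column_index),
                      reverse=(order == 'descending'))
            sortkey = column_index

    sort_by(0)        # ascending by name
    sort_by(0)        # same column again -> toggles to descending
    print(rows)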
# If they click on the far-left or far-right of a column's
# label, then resize rather than sorting.
if self._mlb._resize_column(event):
- return "continue"
+ return 'continue'
# Otherwise, sort.
else:
self.sort_by(column_index)
- return "continue"
+ return 'continue'
# /////////////////////////////////////////////////////////////////
# { Table Drawing Helpers
selection will also be lost -- i.e., no row will be selected
after this call completes.
"""
- self._mlb.delete(0, "end")
+ self._mlb.delete(0, 'end')
for i, row in enumerate(self._rows):
if self._reprfunc is not None:
row = [self._reprfunc(i, j, v) for (j, v) in enumerate(row)]
- self._mlb.insert("end", row)
+ self._mlb.insert('end', row)
def _get_itemconfig(self, r, c):
return dict(
(k, self._mlb.itemconfig(r, c, k)[-1])
for k in (
- "foreground",
- "selectforeground",
- "background",
- "selectbackground",
+ 'foreground',
+ 'selectforeground',
+ 'background',
+ 'selectbackground',
)
)
# Clear the selection.
if selection is None:
- self._mlb.selection_clear(0, "end")
+ self._mlb.selection_clear(0, 'end')
# Restore selection & color config
if index_by_id:
# update this to use new WordNet API
def demo():
root = Tk()
- root.bind("<Control-q>", lambda e: root.destroy())
+ root.bind('<Control-q>', lambda e: root.destroy())
table = Table(
root,
- "Word Synset Hypernym Hyponym".split(),
+ 'Word Synset Hypernym Hyponym'.split(),
column_weights=[0, 1, 1, 1],
- reprfunc=(lambda i, j, s: " %s" % s),
+ reprfunc=(lambda i, j, s: ' %s' % s),
)
- table.pack(expand=True, fill="both")
+ table.pack(expand=True, fill='both')
from nltk.corpus import wordnet
from nltk.corpus import brown
for word, pos in sorted(set(brown.tagged_words()[:500])):
- if pos[0] != "N":
+ if pos[0] != 'N':
continue
word = word.lower()
for synset in wordnet.synsets(word):
try:
hyper_def = synset.hypernyms()[0].definition()
except:
- hyper_def = "*none*"
+ hyper_def = '*none*'
try:
hypo_def = synset.hyponyms()[0].definition()
except:
- hypo_def = "*none*"
+ hypo_def = '*none*'
table.append([word, synset.definition(), hyper_def, hypo_def])
- table.columnconfig("Word", background="#afa")
- table.columnconfig("Synset", background="#efe")
- table.columnconfig("Hypernym", background="#fee")
- table.columnconfig("Hyponym", background="#ffe")
+ table.columnconfig('Word', background='#afa')
+ table.columnconfig('Synset', background='#efe')
+ table.columnconfig('Hypernym', background='#fee')
+ table.columnconfig('Hyponym', background='#ffe')
for row in range(len(table)):
- for column in ("Hypernym", "Hyponym"):
- if table[row, column] == "*none*":
+ for column in ('Hypernym', 'Hyponym'):
+ if table[row, column] == '*none*':
table.itemconfig(
- row, column, foreground="#666", selectforeground="#666"
+ row, column, foreground='#666', selectforeground='#666'
)
root.mainloop()
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Graphical Representations for Trees
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
Graphically display a Tree.
"""
-from tkinter import IntVar, Menu, Tk
+from six.moves.tkinter import IntVar, Menu, Tk
from nltk.util import in_idle
from nltk.tree import Tree
self._ordered = False
# Create canvas objects.
- self._lines = [canvas.create_line(0, 0, 0, 0, fill="#006060") for c in subtrees]
+ self._lines = [canvas.create_line(0, 0, 0, 0, fill='#006060') for c in subtrees]
self._polygon = canvas.create_polygon(
- 0, 0, fill="", state="hidden", outline="#006060"
+ 0, 0, fill='', state='hidden', outline='#006060'
)
# Register child widgets (label + subtrees)
def __setitem__(self, attr, value):
canvas = self.canvas()
- if attr == "roof":
+ if attr == 'roof':
self._roof = value
if self._roof:
for l in self._lines:
- canvas.itemconfig(l, state="hidden")
- canvas.itemconfig(self._polygon, state="normal")
+ canvas.itemconfig(l, state='hidden')
+ canvas.itemconfig(self._polygon, state='normal')
else:
for l in self._lines:
- canvas.itemconfig(l, state="normal")
- canvas.itemconfig(self._polygon, state="hidden")
- elif attr == "orientation":
- if value == "horizontal":
+ canvas.itemconfig(l, state='normal')
+ canvas.itemconfig(self._polygon, state='hidden')
+ elif attr == 'orientation':
+ if value == 'horizontal':
self._horizontal = 1
- elif value == "vertical":
+ elif value == 'vertical':
self._horizontal = 0
else:
- raise ValueError("orientation must be horizontal or vertical")
- elif attr == "color":
+ raise ValueError('orientation must be horizontal or vertical')
+ elif attr == 'color':
for l in self._lines:
canvas.itemconfig(l, fill=value)
canvas.itemconfig(self._polygon, outline=value)
- elif isinstance(attr, tuple) and attr[0] == "color":
+ elif isinstance(attr, tuple) and attr[0] == 'color':
# Set the color of an individual line.
l = self._lines[int(attr[1])]
canvas.itemconfig(l, fill=value)
- elif attr == "fill":
+ elif attr == 'fill':
canvas.itemconfig(self._polygon, fill=value)
- elif attr == "width":
+ elif attr == 'width':
canvas.itemconfig(self._polygon, {attr: value})
for l in self._lines:
canvas.itemconfig(l, {attr: value})
- elif attr in ("xspace", "yspace"):
- if attr == "xspace":
+ elif attr in ('xspace', 'yspace'):
+ if attr == 'xspace':
self._xspace = value
- elif attr == "yspace":
+ elif attr == 'yspace':
self._yspace = value
self.update(self._label)
- elif attr == "ordered":
+ elif attr == 'ordered':
self._ordered = value
else:
CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "roof":
+ if attr == 'roof':
return self._roof
- elif attr == "width":
+ elif attr == 'width':
return self.canvas().itemcget(self._polygon, attr)
- elif attr == "color":
- return self.canvas().itemcget(self._polygon, "outline")
- elif isinstance(attr, tuple) and attr[0] == "color":
+ elif attr == 'color':
+ return self.canvas().itemcget(self._polygon, 'outline')
+ elif isinstance(attr, tuple) and attr[0] == 'color':
l = self._lines[int(attr[1])]
- return self.canvas().itemcget(l, "fill")
- elif attr == "xspace":
+ return self.canvas().itemcget(l, 'fill')
+ elif attr == 'xspace':
return self._xspace
- elif attr == "yspace":
+ elif attr == 'yspace':
return self._yspace
- elif attr == "orientation":
+ elif attr == 'orientation':
if self._horizontal:
- return "horizontal"
+ return 'horizontal'
else:
- return "vertical"
- elif attr == "ordered":
+ return 'vertical'
+ elif attr == 'ordered':
return self._ordered
else:
return CanvasWidget.__getitem__(self, attr)
canvas = self.canvas()
self._subtrees.insert(index, child)
self._add_child_widget(child)
- self._lines.append(canvas.create_line(0, 0, 0, 0, fill="#006060"))
+ self._lines.append(canvas.create_line(0, 0, 0, 0, fill='#006060'))
self.update(self._label)
# but.. lines???
self._managing = False
def __repr__(self):
- return "[TreeSeg %s: %s]" % (self._label, self._subtrees)
+ return '[TreeSeg %s: %s]' % (self._label, self._subtrees)
def _tree_to_treeseg(
loc_attribs = {}
for (key, value) in list(attribs.items()):
- if key[:5] == "tree_":
+ if key[:5] == 'tree_':
tree_attribs[key[5:]] = value
- elif key[:5] == "node_":
+ elif key[:5] == 'node_':
node_attribs[key[5:]] = value
- elif key[:5] == "leaf_":
+ elif key[:5] == 'leaf_':
leaf_attribs[key[5:]] = value
- elif key[:4] == "loc_":
+ elif key[:4] == 'loc_':
loc_attribs[key[4:]] = value
else:
- raise ValueError("Bad attribute: %s" % key)
+ raise ValueError('Bad attribute: %s' % key)
return _tree_to_treeseg(
canvas,
t,
# Attributes.
self._nodeattribs = {}
self._leafattribs = {}
- self._locattribs = {"color": "#008000"}
- self._line_color = "#008080"
+ self._locattribs = {'color': '#008000'}
+ self._line_color = '#008080'
self._line_width = 1
- self._roof_color = "#008080"
- self._roof_fill = "#c0c0c0"
+ self._roof_color = '#008080'
+ self._roof_fill = '#c0c0c0'
self._shapeable = False
self._xspace = 10
self._yspace = 10
- self._orientation = "vertical"
+ self._orientation = 'vertical'
self._ordered = False
# Build trees.
return leaf
def __setitem__(self, attr, value):
- if attr[:5] == "node_":
+ if attr[:5] == 'node_':
for node in self._nodes:
node[attr[5:]] = value
- elif attr[:5] == "leaf_":
+ elif attr[:5] == 'leaf_':
for leaf in self._leaves:
leaf[attr[5:]] = value
- elif attr == "line_color":
+ elif attr == 'line_color':
self._line_color = value
for tseg in list(self._expanded_trees.values()):
- tseg["color"] = value
- elif attr == "line_width":
+ tseg['color'] = value
+ elif attr == 'line_width':
self._line_width = value
for tseg in list(self._expanded_trees.values()):
- tseg["width"] = value
+ tseg['width'] = value
for tseg in list(self._collapsed_trees.values()):
- tseg["width"] = value
- elif attr == "roof_color":
+ tseg['width'] = value
+ elif attr == 'roof_color':
self._roof_color = value
for tseg in list(self._collapsed_trees.values()):
- tseg["color"] = value
- elif attr == "roof_fill":
+ tseg['color'] = value
+ elif attr == 'roof_fill':
self._roof_fill = value
for tseg in list(self._collapsed_trees.values()):
- tseg["fill"] = value
- elif attr == "shapeable":
+ tseg['fill'] = value
+ elif attr == 'shapeable':
self._shapeable = value
for tseg in list(self._expanded_trees.values()):
- tseg["draggable"] = value
+ tseg['draggable'] = value
for tseg in list(self._collapsed_trees.values()):
- tseg["draggable"] = value
+ tseg['draggable'] = value
for leaf in self._leaves:
- leaf["draggable"] = value
- elif attr == "xspace":
+ leaf['draggable'] = value
+ elif attr == 'xspace':
self._xspace = value
for tseg in list(self._expanded_trees.values()):
- tseg["xspace"] = value
+ tseg['xspace'] = value
for tseg in list(self._collapsed_trees.values()):
- tseg["xspace"] = value
+ tseg['xspace'] = value
self.manage()
- elif attr == "yspace":
+ elif attr == 'yspace':
self._yspace = value
for tseg in list(self._expanded_trees.values()):
- tseg["yspace"] = value
+ tseg['yspace'] = value
for tseg in list(self._collapsed_trees.values()):
- tseg["yspace"] = value
+ tseg['yspace'] = value
self.manage()
- elif attr == "orientation":
+ elif attr == 'orientation':
self._orientation = value
for tseg in list(self._expanded_trees.values()):
- tseg["orientation"] = value
+ tseg['orientation'] = value
for tseg in list(self._collapsed_trees.values()):
- tseg["orientation"] = value
+ tseg['orientation'] = value
self.manage()
- elif attr == "ordered":
+ elif attr == 'ordered':
self._ordered = value
for tseg in list(self._expanded_trees.values()):
- tseg["ordered"] = value
+ tseg['ordered'] = value
for tseg in list(self._collapsed_trees.values()):
- tseg["ordered"] = value
+ tseg['ordered'] = value
else:
CanvasWidget.__setitem__(self, attr, value)
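The pattern here is prefix routing: a key such as 'node_color' is stripped of its prefix and fanned out to every node child, 'leaf_*' keys go to the leaves, and anything unrecognised falls through to the base class. A toy sketch of that dispatch (plain dicts stand in for the CanvasWidget children; all names hypothetical):

    def route_attribute(nodes, leaves, attr, value):
        if attr.startswith('node_'):
            for node in nodes:
                node[attr[5:]] = value
        elif attr.startswith('leaf_'):
            for leaf in leaves:
                leaf[attr[5:]] = value
        else:
            raise ValueError('Unknown attribute %r' % attr)

    nodes, leaves = [{}, {}], [{}]
    route_attribute(nodes, leaves, 'node_color', 'blue')
    route_attribute(nodes, leaves, 'leaf_font', ('helvetica', -12))
    print(nodes, leaves)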
def __getitem__(self, attr):
- if attr[:5] == "node_":
+ if attr[:5] == 'node_':
return self._nodeattribs.get(attr[5:], None)
- elif attr[:5] == "leaf_":
+ elif attr[:5] == 'leaf_':
return self._leafattribs.get(attr[5:], None)
- elif attr[:4] == "loc_":
+ elif attr[:4] == 'loc_':
return self._locattribs.get(attr[4:], None)
- elif attr == "line_color":
+ elif attr == 'line_color':
return self._line_color
- elif attr == "line_width":
+ elif attr == 'line_width':
return self._line_width
- elif attr == "roof_color":
+ elif attr == 'roof_color':
return self._roof_color
- elif attr == "roof_fill":
+ elif attr == 'roof_fill':
return self._roof_fill
- elif attr == "shapeable":
+ elif attr == 'shapeable':
return self._shapeable
- elif attr == "xspace":
+ elif attr == 'xspace':
return self._xspace
- elif attr == "yspace":
+ elif attr == 'yspace':
return self._yspace
- elif attr == "orientation":
+ elif attr == 'orientation':
return self._orientation
else:
return CanvasWidget.__getitem__(self, attr)
Collapse/expand a tree.
"""
old_treeseg = treeseg
- if old_treeseg["roof"]:
+ if old_treeseg['roof']:
new_treeseg = self._expanded_trees[self._keys[old_treeseg]]
else:
new_treeseg = self._collapsed_trees[self._keys[old_treeseg]]
self._trees = trees
self._top = Tk()
- self._top.title("NLTK")
- self._top.bind("<Control-x>", self.destroy)
- self._top.bind("<Control-q>", self.destroy)
+ self._top.title('NLTK')
+ self._top.bind('<Control-x>', self.destroy)
+ self._top.bind('<Control-q>', self.destroy)
cf = self._cframe = CanvasFrame(self._top)
- self._top.bind("<Control-p>", self._cframe.print_to_file)
+ self._top.bind('<Control-p>', self._cframe.print_to_file)
# Size is variable.
self._size = IntVar(self._top)
self._size.set(12)
- bold = ("helvetica", -self._size.get(), "bold")
- helv = ("helvetica", -self._size.get())
+ bold = ('helvetica', -self._size.get(), 'bold')
+ helv = ('helvetica', -self._size.get())
# Lay the trees out in a square.
self._width = int(ceil(sqrt(len(trees))))
cf.canvas(),
trees[i],
node_font=bold,
- leaf_color="#008040",
- node_color="#004080",
- roof_color="#004040",
- roof_fill="white",
- line_color="#004040",
+ leaf_color='#008040',
+ node_color='#004080',
+ roof_color='#004040',
+ roof_fill='white',
+ line_color='#004040',
draggable=1,
leaf_font=helv,
)
cf.add_widget(widget, 0, 0)
self._layout()
- self._cframe.pack(expand=1, fill="both")
+ self._cframe.pack(expand=1, fill='both')
self._init_menubar()
def _layout(self):
filemenu = Menu(menubar, tearoff=0)
filemenu.add_command(
- label="Print to Postscript",
+ label='Print to Postscript',
underline=0,
command=self._cframe.print_to_file,
- accelerator="Ctrl-p",
+ accelerator='Ctrl-p',
)
filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+ label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x'
)
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
zoommenu = Menu(menubar, tearoff=0)
zoommenu.add_radiobutton(
- label="Tiny",
+ label='Tiny',
variable=self._size,
underline=0,
value=10,
command=self.resize,
)
zoommenu.add_radiobutton(
- label="Small",
+ label='Small',
variable=self._size,
underline=0,
value=12,
command=self.resize,
)
zoommenu.add_radiobutton(
- label="Medium",
+ label='Medium',
variable=self._size,
underline=0,
value=14,
command=self.resize,
)
zoommenu.add_radiobutton(
- label="Large",
+ label='Large',
variable=self._size,
underline=0,
value=28,
command=self.resize,
)
zoommenu.add_radiobutton(
- label="Huge",
+ label='Huge',
variable=self._size,
underline=0,
value=50,
command=self.resize,
)
- menubar.add_cascade(label="Zoom", underline=0, menu=zoommenu)
+ menubar.add_cascade(label='Zoom', underline=0, menu=zoommenu)
self._top.config(menu=menubar)
def resize(self, *e):
- bold = ("helvetica", -self._size.get(), "bold")
- helv = ("helvetica", -self._size.get())
+ bold = ('helvetica', -self._size.get(), 'bold')
+ helv = ('helvetica', -self._size.get())
xspace = self._size.get()
yspace = self._size.get()
for widget in self._widgets:
- widget["node_font"] = bold
- widget["leaf_font"] = helv
- widget["xspace"] = xspace
- widget["yspace"] = yspace
+ widget['node_font'] = bold
+ widget['leaf_font'] = helv
+ widget['xspace'] = xspace
+ widget['yspace'] = yspace
if self._size.get() < 20:
- widget["line_width"] = 1
+ widget['line_width'] = 1
elif self._size.get() < 30:
- widget["line_width"] = 2
+ widget['line_width'] = 2
else:
- widget["line_width"] = 3
+ widget['line_width'] = 3
self._layout()
def destroy(self, *e):
import random
def fill(cw):
- cw["fill"] = "#%06d" % random.randint(0, 999999)
+ cw['fill'] = '#%06d' % random.randint(0, 999999)
cf = CanvasFrame(width=550, height=450, closeenough=2)
t = Tree.fromstring(
- """
+ '''
(S (NP the very big cat)
- (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))"""
+ (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))'''
)
tc = TreeWidget(
cf.canvas(),
t,
draggable=1,
- node_font=("helvetica", -14, "bold"),
- leaf_font=("helvetica", -12, "italic"),
- roof_fill="white",
- roof_color="black",
- leaf_color="green4",
- node_color="blue2",
+ node_font=('helvetica', -14, 'bold'),
+ leaf_font=('helvetica', -12, 'italic'),
+ roof_fill='white',
+ roof_color='black',
+ leaf_color='green4',
+ node_color='blue2',
)
cf.add_widget(tc, 10, 10)
def boxit(canvas, text):
- big = ("helvetica", -16, "bold")
- return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill="green")
+ big = ('helvetica', -16, 'bold')
+ return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill='green')
def ovalit(canvas, text):
- return OvalWidget(canvas, TextWidget(canvas, text), fill="cyan")
+ return OvalWidget(canvas, TextWidget(canvas, text), fill='cyan')
- treetok = Tree.fromstring("(S (NP this tree) (VP (V is) (AdjP shapeable)))")
+ treetok = Tree.fromstring('(S (NP this tree) (VP (V is) (AdjP shapeable)))')
tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)
def color(node):
- node["color"] = "#%04d00" % random.randint(0, 9999)
+ node['color'] = '#%04d00' % random.randint(0, 9999)
def color2(treeseg):
- treeseg.label()["fill"] = "#%06d" % random.randint(0, 9999)
- treeseg.label().child()["color"] = "white"
+ treeseg.label()['fill'] = '#%06d' % random.randint(0, 9999)
+ treeseg.label().child()['color'] = 'white'
tc.bind_click_trees(tc.toggle_collapsed)
tc2.bind_click_trees(tc2.toggle_collapsed)
cf.add_widget(paren, tc.bbox()[2] + 10, 10)
tree3 = Tree.fromstring(
- """
+ '''
(S (NP this tree) (AUX was)
- (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))"""
+ (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))'''
)
tc3 = tree_to_treesegment(
- cf.canvas(), tree3, tree_color="green4", tree_xspace=2, tree_width=2
+ cf.canvas(), tree3, tree_color='green4', tree_xspace=2, tree_width=2
)
- tc3["draggable"] = 1
+ tc3['draggable'] = 1
cf.add_widget(tc3, 10, tc.bbox()[3] + 10)
def orientswitch(treewidget):
- if treewidget["orientation"] == "horizontal":
- treewidget.expanded_tree(1, 1).subtrees()[0].set_text("vertical")
- treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("vertical")
- treewidget.collapsed_tree(1).subtrees()[1].set_text("vertical")
- treewidget.collapsed_tree().subtrees()[3].set_text("vertical")
- treewidget["orientation"] = "vertical"
+ if treewidget['orientation'] == 'horizontal':
+ treewidget.expanded_tree(1, 1).subtrees()[0].set_text('vertical')
+ treewidget.collapsed_tree(1, 1).subtrees()[0].set_text('vertical')
+ treewidget.collapsed_tree(1).subtrees()[1].set_text('vertical')
+ treewidget.collapsed_tree().subtrees()[3].set_text('vertical')
+ treewidget['orientation'] = 'vertical'
else:
- treewidget.expanded_tree(1, 1).subtrees()[0].set_text("horizontal")
- treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("horizontal")
- treewidget.collapsed_tree(1).subtrees()[1].set_text("horizontal")
- treewidget.collapsed_tree().subtrees()[3].set_text("horizontal")
- treewidget["orientation"] = "horizontal"
+ treewidget.expanded_tree(1, 1).subtrees()[0].set_text('horizontal')
+ treewidget.collapsed_tree(1, 1).subtrees()[0].set_text('horizontal')
+ treewidget.collapsed_tree(1).subtrees()[1].set_text('horizontal')
+ treewidget.collapsed_tree().subtrees()[3].set_text('horizontal')
+ treewidget['orientation'] = 'horizontal'
text = """
Try clicking, right clicking, and dragging
and OvalWidget). The bottom-left tree is
built from tree_to_treesegment."""
twidget = TextWidget(cf.canvas(), text.strip())
- textbox = BoxWidget(cf.canvas(), twidget, fill="white", draggable=1)
+ textbox = BoxWidget(cf.canvas(), twidget, fill='white', draggable=1)
cf.add_widget(textbox, tc3.bbox()[2] + 10, tc2.bbox()[3] + 10)
- tree4 = Tree.fromstring("(S (NP this tree) (VP (V is) (Adj horizontal)))")
+ tree4 = Tree.fromstring('(S (NP this tree) (VP (V is) (Adj horizontal)))')
tc4 = TreeWidget(
cf.canvas(),
tree4,
draggable=1,
- line_color="brown2",
- roof_color="brown2",
- node_font=("helvetica", -12, "bold"),
- node_color="brown4",
- orientation="horizontal",
+ line_color='brown2',
+ roof_color='brown2',
+ node_font=('helvetica', -12, 'bold'),
+ node_color='brown4',
+ orientation='horizontal',
)
tc4.manage()
cf.add_widget(tc4, tc3.bbox()[2] + 10, textbox.bbox()[3] + 10)
cf.mainloop()
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Drawing utilities
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
from abc import ABCMeta, abstractmethod
-from tkinter import (
+from six import add_metaclass
+from six.moves.tkinter import (
Button,
Canvas,
Entry,
Widget,
RAISED,
)
-from tkinter.filedialog import asksaveasfilename
+from six.moves.tkinter_tkfiledialog import asksaveasfilename
from nltk.util import in_idle
##//////////////////////////////////////////////////////
-class CanvasWidget(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class CanvasWidget(object):
"""
A collection of graphical elements and bindings used to display a
complex object on a Tkinter ``Canvas``. A canvas widget is
:param attribs: The new canvas widget's attributes.
"""
if self.__class__ == CanvasWidget:
- raise TypeError("CanvasWidget is an abstract base class")
+ raise TypeError('CanvasWidget is an abstract base class')
if not isinstance(canvas, Canvas):
- raise TypeError("Expected a canvas!")
+ raise TypeError('Expected a canvas!')
self.__canvas = canvas
self.__parent = parent
# If the subclass constructor called _add_child_widget, then
# self.__children will already exist.
- if not hasattr(self, "_CanvasWidget__children"):
+ if not hasattr(self, '_CanvasWidget__children'):
self.__children = []
# Is this widget hidden?
# Register any new bindings
for tag in self._tags():
- self.__canvas.tag_bind(tag, "<ButtonPress-1>", self.__press_cb)
- self.__canvas.tag_bind(tag, "<ButtonPress-2>", self.__press_cb)
- self.__canvas.tag_bind(tag, "<ButtonPress-3>", self.__press_cb)
+ self.__canvas.tag_bind(tag, '<ButtonPress-1>', self.__press_cb)
+ self.__canvas.tag_bind(tag, '<ButtonPress-2>', self.__press_cb)
+ self.__canvas.tag_bind(tag, '<ButtonPress-3>', self.__press_cb)
##//////////////////////////////////////////////////////
## Inherited methods.
if self.__hidden:
return (0, 0, 0, 0)
if len(self.tags()) == 0:
- raise ValueError("No tags")
+ raise ValueError('No tags')
return self.__canvas.bbox(*self.tags())
def width(self):
:rtype: int
"""
if len(self.tags()) == 0:
- raise ValueError("No tags")
+ raise ValueError('No tags')
bbox = self.__canvas.bbox(*self.tags())
return bbox[2] - bbox[0]
:rtype: int
"""
if len(self.tags()) == 0:
- raise ValueError("No tags")
+ raise ValueError('No tags')
bbox = self.__canvas.bbox(*self.tags())
return bbox[3] - bbox[1]
if self.__parent:
self.__parent.update(self)
- def moveto(self, x, y, anchor="NW"):
+ def moveto(self, x, y, anchor='NW'):
"""
Move this canvas widget to the given location. In particular,
shift the canvas widget such that the corner or side of the
corner; etc.
"""
x1, y1, x2, y2 = self.bbox()
- if anchor == "NW":
+ if anchor == 'NW':
self.move(x - x1, y - y1)
- if anchor == "N":
+ if anchor == 'N':
self.move(x - x1 / 2 - x2 / 2, y - y1)
- if anchor == "NE":
+ if anchor == 'NE':
self.move(x - x2, y - y1)
- if anchor == "E":
+ if anchor == 'E':
self.move(x - x2, y - y1 / 2 - y2 / 2)
- if anchor == "SE":
+ if anchor == 'SE':
self.move(x - x2, y - y2)
- if anchor == "S":
+ if anchor == 'S':
self.move(x - x1 / 2 - x2 / 2, y - y2)
- if anchor == "SW":
+ if anchor == 'SW':
self.move(x - x1, y - y2)
- if anchor == "W":
+ if anchor == 'W':
self.move(x - x1, y - y1 / 2 - y2 / 2)
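A worked check of the anchor arithmetic above, with concrete numbers: for anchor 'N' the shift x - x1/2 - x2/2 places the bounding box's horizontal centre on x, while y - y1 places its top edge on y.

    x1, y1, x2, y2 = 10, 20, 30, 60       # current bounding box
    x, y = 100, 100                       # requested position
    dx, dy = x - x1 / 2 - x2 / 2, y - y1  # the 'N' branch above
    print(dx, dy)                         # 80.0 80 -> new bbox (90, 100, 110, 140)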
def destroy(self):
return
for tag in self.tags():
- self.__canvas.tag_unbind(tag, "<ButtonPress-1>")
- self.__canvas.tag_unbind(tag, "<ButtonPress-2>")
- self.__canvas.tag_unbind(tag, "<ButtonPress-3>")
+ self.__canvas.tag_unbind(tag, '<ButtonPress-1>')
+ self.__canvas.tag_unbind(tag, '<ButtonPress-2>')
+ self.__canvas.tag_unbind(tag, '<ButtonPress-3>')
self.__canvas.delete(*self.tags())
self.__canvas = None
:rtype: list of int
"""
if self.__canvas is None:
- raise ValueError("Attempt to access a destroyed canvas widget")
+ raise ValueError('Attempt to access a destroyed canvas widget')
tags = []
tags += self._tags()
for child in self.__children:
:rtype: None
"""
- if attr == "draggable":
+ if attr == 'draggable':
self.__draggable = value
else:
- raise ValueError("Unknown attribute %r" % attr)
+ raise ValueError('Unknown attribute %r' % attr)
def __getitem__(self, attr):
"""
canvas widget.
:rtype: (any)
"""
- if attr == "draggable":
+ if attr == 'draggable':
return self.__draggable
else:
- raise ValueError("Unknown attribute %r" % attr)
+ raise ValueError('Unknown attribute %r' % attr)
def __repr__(self):
"""
:return: a string representation of this canvas widget.
:rtype: str
"""
- return "<%s>" % self.__class__.__name__
+ return '<%s>' % self.__class__.__name__
def hide(self):
"""
"""
self.__hidden = 1
for tag in self.tags():
- self.__canvas.itemconfig(tag, state="hidden")
+ self.__canvas.itemconfig(tag, state='hidden')
def show(self):
"""
"""
self.__hidden = 0
for tag in self.tags():
- self.__canvas.itemconfig(tag, state="normal")
+ self.__canvas.itemconfig(tag, state='normal')
def hidden(self):
"""
will be called with this ``CanvasWidget`` as its argument.
"""
self.__draggable = 1
- self.__callbacks["drag"] = callback
+ self.__callbacks['drag'] = callback
def unbind_click(self, button=1):
"""
Remove a callback that was registered with ``bind_drag``.
"""
try:
- del self.__callbacks["drag"]
+ del self.__callbacks['drag']
except:
pass
# If we're already waiting for a button release, then ignore
# this new button press.
if (
- self.__canvas.bind("<ButtonRelease-1>")
- or self.__canvas.bind("<ButtonRelease-2>")
- or self.__canvas.bind("<ButtonRelease-3>")
+ self.__canvas.bind('<ButtonRelease-1>')
+ or self.__canvas.bind('<ButtonRelease-2>')
+ or self.__canvas.bind('<ButtonRelease-3>')
):
return
# Unbind motion (just in case; this shouldn't be necessary)
- self.__canvas.unbind("<Motion>")
+ self.__canvas.unbind('<Motion>')
# Record the button press event.
self.__press = event
if event.num == 1:
widget = self
while widget is not None:
- if widget["draggable"]:
+ if widget['draggable']:
widget.__start_drag(event)
break
widget = widget.parent()
# Set up the button release callback.
- self.__canvas.bind("<ButtonRelease-%d>" % event.num, self.__release_cb)
+ self.__canvas.bind('<ButtonRelease-%d>' % event.num, self.__release_cb)
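The drag support uses dynamic bindings: the press handler installs <Motion> and <ButtonRelease> callbacks, and the release handler tears them down again, so motion is only tracked while a button is held. A self-contained sketch of that pattern on a bare Canvas (illustrative only; not the widget classes defined here):

    from tkinter import Tk, Canvas

    root = Tk()
    canvas = Canvas(root, width=300, height=200)
    canvas.pack()
    item = canvas.create_rectangle(20, 20, 80, 60, fill='gray')
    last = {}

    def on_press(event):
        last['x'], last['y'] = event.x, event.y
        canvas.bind('<Motion>', on_motion)
        canvas.bind('<ButtonRelease-1>', on_release)

    def on_motion(event):
        canvas.move(item, event.x - last['x'], event.y - last['y'])
        last['x'], last['y'] = event.x, event.y

    def on_release(event):
        canvas.unbind('<Motion>')
        canvas.unbind('<ButtonRelease-1>')

    canvas.tag_bind(item, '<ButtonPress-1>', on_press)
    root.mainloop()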
def __start_drag(self, event):
"""
- register a motion callback
- record the drag coordinates
"""
- self.__canvas.bind("<Motion>", self.__motion_cb)
+ self.__canvas.bind('<Motion>', self.__motion_cb)
self.__drag_x = event.x
self.__drag_y = event.y
- call the appropriate handler.
"""
# Unbind the button release & motion callbacks.
- self.__canvas.unbind("<ButtonRelease-%d>" % event.num)
- self.__canvas.unbind("<Motion>")
+ self.__canvas.unbind('<ButtonRelease-%d>' % event.num)
+ self.__canvas.unbind('<Motion>')
# Is it a click or a drag?
if (
call it. If no ancestors have a drag callback, do nothing.
"""
if self.__draggable:
- if "drag" in self.__callbacks:
- cb = self.__callbacks["drag"]
+ if 'drag' in self.__callbacks:
+ cb = self.__callbacks['drag']
try:
cb(self)
except:
- print("Error in drag callback for %r" % self)
+ print('Error in drag callback for %r' % self)
elif self.__parent is not None:
self.__parent.__drag()
# try:
cb(self)
# except:
- # print('Error in click callback for %r' % self)
+ # print 'Error in click callback for %r' % self
# raise
elif self.__parent is not None:
self.__parent.__click(button)
have a parent.
:type child: CanvasWidget
"""
- if not hasattr(self, "_CanvasWidget__children"):
+ if not hasattr(self, '_CanvasWidget__children'):
self.__children = []
if child.__parent is not None:
- raise ValueError("{} already has a parent".format(child))
+ raise ValueError('{} already has a parent'.format(child))
child.__parent = self
self.__children.append(child)
CanvasWidget.__init__(self, canvas, **attribs)
def __setitem__(self, attr, value):
- if attr in ("color", "font", "justify", "width"):
- if attr == "color":
- attr = "fill"
+ if attr in ('color', 'font', 'justify', 'width'):
+ if attr == 'color':
+ attr = 'fill'
self.canvas().itemconfig(self._tag, {attr: value})
else:
CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "width":
+ if attr == 'width':
return int(self.canvas().itemcget(self._tag, attr))
- elif attr in ("color", "font", "justify"):
- if attr == "color":
- attr = "fill"
+ elif attr in ('color', 'font', 'justify'):
+ if attr == 'color':
+ attr = 'fill'
return self.canvas().itemcget(self._tag, attr)
else:
return CanvasWidget.__getitem__(self, attr)
:return: The text displayed by this text widget.
:rtype: str
"""
- return self.canvas().itemcget(self._tag, "TEXT")
+ return self.canvas().itemcget(self._tag, 'TEXT')
def set_text(self, text):
"""
self.parent().update(self)
def __repr__(self):
- return "[Text: %r]" % self._text
+ return '[Text: %r]' % self._text
class SymbolWidget(TextWidget):
"""
SYMBOLS = {
- "neg": "\330",
- "disj": "\332",
- "conj": "\331",
- "lambda": "\154",
- "merge": "\304",
- "forall": "\042",
- "exists": "\044",
- "subseteq": "\315",
- "subset": "\314",
- "notsubset": "\313",
- "emptyset": "\306",
- "imp": "\336",
- "rightarrow": chr(222), #'\256',
- "equal": "\75",
- "notequal": "\271",
- "intersection": "\307",
- "union": "\310",
- "epsilon": "e",
+ 'neg': '\330',
+ 'disj': '\332',
+ 'conj': '\331',
+ 'lambda': '\154',
+ 'merge': '\304',
+ 'forall': '\042',
+ 'exists': '\044',
+ 'subseteq': '\315',
+ 'subset': '\314',
+ 'notsubset': '\313',
+ 'emptyset': '\306',
+ 'imp': '\336',
+ 'rightarrow': chr(222), #'\256',
+ 'equal': '\75',
+ 'notequal': '\271',
+ 'intersection': '\307',
+ 'union': '\310',
+ 'epsilon': 'e',
}
def __init__(self, canvas, symbol, **attribs):
:param symbol: The name of the symbol to display.
:param attribs: The new canvas widget's attributes.
"""
- attribs["font"] = "symbol"
- TextWidget.__init__(self, canvas, "", **attribs)
+ attribs['font'] = 'symbol'
+ TextWidget.__init__(self, canvas, '', **attribs)
self.set_symbol(symbol)
def symbol(self):
:param symbol: The name of the symbol to display.
"""
if symbol not in SymbolWidget.SYMBOLS:
- raise ValueError("Unknown symbol: %s" % symbol)
+ raise ValueError('Unknown symbol: %s' % symbol)
self._symbol = symbol
self.set_text(SymbolWidget.SYMBOLS[symbol])
def __repr__(self):
- return "[Symbol: %r]" % self._symbol
+ return '[Symbol: %r]' % self._symbol
@staticmethod
def symbolsheet(size=20):
def destroy(e, top=top):
top.destroy()
- top.bind("q", destroy)
- Button(top, text="Quit", command=top.destroy).pack(side="bottom")
- text = Text(top, font=("helvetica", -size), width=20, height=30)
- text.pack(side="left")
+ top.bind('q', destroy)
+ Button(top, text='Quit', command=top.destroy).pack(side='bottom')
+ text = Text(top, font=('helvetica', -size), width=20, height=30)
+ text.pack(side='left')
sb = Scrollbar(top, command=text.yview)
- text["yscrollcommand"] = sb.set
- sb.pack(side="right", fill="y")
- text.tag_config("symbol", font=("symbol", -size))
+ text['yscrollcommand'] = sb.set
+ sb.pack(side='right', fill='y')
+ text.tag_config('symbol', font=('symbol', -size))
for i in range(256):
if i in (0, 10):
continue # null and newline
for k, v in list(SymbolWidget.SYMBOLS.items()):
if v == chr(i):
- text.insert("end", "%-10s\t" % k)
+ text.insert('end', '%-10s\t' % k)
break
else:
- text.insert("end", "%-10d \t" % i)
- text.insert("end", "[%s]\n" % chr(i), "symbol")
+ text.insert('end', '%-10d \t' % i)
+ text.insert('end', '[%s]\n' % chr(i), 'symbol')
top.mainloop()
def __repr__(self):
name = self.__class__.__name__
- if name[-6:] == "Widget":
+ if name[-6:] == 'Widget':
name = name[:-6]
- return "[%s: %r]" % (name, self._child)
+ return '[%s: %r]' % (name, self._child)
class BoxWidget(AbstractContainerWidget):
AbstractContainerWidget.__init__(self, canvas, child, **attribs)
def __setitem__(self, attr, value):
- if attr == "margin":
+ if attr == 'margin':
self._margin = value
- elif attr in ("outline", "fill", "width"):
+ elif attr in ('outline', 'fill', 'width'):
self.canvas().itemconfig(self._box, {attr: value})
else:
CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "margin":
+ if attr == 'margin':
return self._margin
- elif attr == "width":
+ elif attr == 'width':
return float(self.canvas().itemcget(self._box, attr))
- elif attr in ("outline", "fill", "width"):
+ elif attr in ('outline', 'fill', 'width'):
return self.canvas().itemcget(self._box, attr)
else:
return CanvasWidget.__getitem__(self, attr)
def _update(self, child):
(x1, y1, x2, y2) = child.bbox()
- margin = self._margin + self["width"] / 2
+ margin = self._margin + self['width'] / 2
self.canvas().coords(
self._box, x1 - margin, y1 - margin, x2 + margin, y2 + margin
)
self._child = child
self._margin = 1
self._oval = canvas.create_oval(1, 1, 1, 1)
- self._circle = attribs.pop("circle", False)
- self._double = attribs.pop("double", False)
+ self._circle = attribs.pop('circle', False)
+ self._double = attribs.pop('double', False)
if self._double:
self._oval2 = canvas.create_oval(1, 1, 1, 1)
else:
def __setitem__(self, attr, value):
c = self.canvas()
- if attr == "margin":
+ if attr == 'margin':
self._margin = value
- elif attr == "double":
+ elif attr == 'double':
if value == True and self._oval2 is None:
# Copy attributes & position from self._oval.
x1, y1, x2, y2 = c.bbox(self._oval)
- w = self["width"] * 2
+ w = self['width'] * 2
self._oval2 = c.create_oval(
x1 - w,
y1 - w,
x2 + w,
y2 + w,
- outline=c.itemcget(self._oval, "outline"),
- width=c.itemcget(self._oval, "width"),
+ outline=c.itemcget(self._oval, 'outline'),
+ width=c.itemcget(self._oval, 'width'),
)
c.tag_lower(self._oval2)
if value == False and self._oval2 is not None:
c.delete(self._oval2)
self._oval2 = None
- elif attr in ("outline", "fill", "width"):
+ elif attr in ('outline', 'fill', 'width'):
c.itemconfig(self._oval, {attr: value})
- if self._oval2 is not None and attr != "fill":
+ if self._oval2 is not None and attr != 'fill':
c.itemconfig(self._oval2, {attr: value})
- if self._oval2 is not None and attr != "fill":
+ if self._oval2 is not None and attr != 'fill':
self.canvas().itemconfig(self._oval2, {attr: value})
else:
CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "margin":
+ if attr == 'margin':
return self._margin
- elif attr == "double":
+ elif attr == 'double':
return self._double is not None
- elif attr == "width":
+ elif attr == 'width':
return float(self.canvas().itemcget(self._oval, attr))
- elif attr in ("outline", "fill", "width"):
+ elif attr in ('outline', 'fill', 'width'):
return self.canvas().itemcget(self._oval, attr)
else:
return CanvasWidget.__getitem__(self, attr)
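# A small sketch of the attribute protocol above, assuming `canvas` and a child
# TextWidget already exist; 'double' adds or removes the second, outer oval:
#   >>> label = TextWidget(canvas, 'S')
#   >>> node = OvalWidget(canvas, label, outline='blue', margin=5)
#   >>> node['double'] = True     # draw a second oval around the first
#   >>> node['double'] = False    # and remove it again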
:param attribs: The new canvas widget's attributes.
"""
self._child = child
- self._oparen = canvas.create_arc(1, 1, 1, 1, style="arc", start=90, extent=180)
- self._cparen = canvas.create_arc(1, 1, 1, 1, style="arc", start=-90, extent=180)
+ self._oparen = canvas.create_arc(1, 1, 1, 1, style='arc', start=90, extent=180)
+ self._cparen = canvas.create_arc(1, 1, 1, 1, style='arc', start=-90, extent=180)
AbstractContainerWidget.__init__(self, canvas, child, **attribs)
def __setitem__(self, attr, value):
- if attr == "color":
+ if attr == 'color':
self.canvas().itemconfig(self._oparen, outline=value)
self.canvas().itemconfig(self._cparen, outline=value)
- elif attr == "width":
+ elif attr == 'width':
self.canvas().itemconfig(self._oparen, width=value)
self.canvas().itemconfig(self._cparen, width=value)
else:
CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "color":
- return self.canvas().itemcget(self._oparen, "outline")
- elif attr == "width":
- return self.canvas().itemcget(self._oparen, "width")
+ if attr == 'color':
+ return self.canvas().itemcget(self._oparen, 'outline')
+ elif attr == 'width':
+ return self.canvas().itemcget(self._oparen, 'width')
else:
return CanvasWidget.__getitem__(self, attr)
AbstractContainerWidget.__init__(self, canvas, child, **attribs)
def __setitem__(self, attr, value):
- if attr == "color":
+ if attr == 'color':
self.canvas().itemconfig(self._obrack, fill=value)
self.canvas().itemconfig(self._cbrack, fill=value)
- elif attr == "width":
+ elif attr == 'width':
self.canvas().itemconfig(self._obrack, width=value)
self.canvas().itemconfig(self._cbrack, width=value)
else:
CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "color":
- return self.canvas().itemcget(self._obrack, "outline")
- elif attr == "width":
- return self.canvas().itemcget(self._obrack, "width")
+ if attr == 'color':
+ return self.canvas().itemcget(self._obrack, 'outline')
+ elif attr == 'width':
+ return self.canvas().itemcget(self._obrack, 'width')
else:
return CanvasWidget.__getitem__(self, attr)
:type children: list(CanvasWidget)
:param attribs: The new canvas widget's attributes.
"""
- self._align = "center"
+ self._align = 'center'
self._space = 1
self._ordered = False
self._children = list(children)
CanvasWidget.__init__(self, canvas, **attribs)
def __setitem__(self, attr, value):
- if attr == "align":
- if value not in ("top", "bottom", "center"):
- raise ValueError("Bad alignment: %r" % value)
+ if attr == 'align':
+ if value not in ('top', 'bottom', 'center'):
+ raise ValueError('Bad alignment: %r' % value)
self._align = value
- elif attr == "space":
+ elif attr == 'space':
self._space = value
- elif attr == "ordered":
+ elif attr == 'ordered':
self._ordered = value
else:
CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "align":
+ if attr == 'align':
return self._align
- elif attr == "space":
+ elif attr == 'space':
return self._space
- elif attr == "ordered":
+ elif attr == 'ordered':
return self._ordered
else:
return CanvasWidget.__getitem__(self, attr)
return []
def _yalign(self, top, bot):
- if self._align == "top":
+ if self._align == 'top':
return top
- if self._align == "bottom":
+ if self._align == 'bottom':
return bot
- if self._align == "center":
+ if self._align == 'center':
return (top + bot) / 2
def _update(self, child):
x -= x2 - x1 + self._space
def __repr__(self):
- return "[Sequence: " + repr(self._children)[1:-1] + "]"
+ return '[Sequence: ' + repr(self._children)[1:-1] + ']'
# Provide an alias for the child_widgets() member.
children = CanvasWidget.child_widgets
:type children: list(CanvasWidget)
:param attribs: The new canvas widget's attributes.
"""
- self._align = "center"
+ self._align = 'center'
self._space = 1
self._ordered = False
self._children = list(children)
CanvasWidget.__init__(self, canvas, **attribs)
def __setitem__(self, attr, value):
- if attr == "align":
- if value not in ("left", "right", "center"):
- raise ValueError("Bad alignment: %r" % value)
+ if attr == 'align':
+ if value not in ('left', 'right', 'center'):
+ raise ValueError('Bad alignment: %r' % value)
self._align = value
- elif attr == "space":
+ elif attr == 'space':
self._space = value
- elif attr == "ordered":
+ elif attr == 'ordered':
self._ordered = value
else:
CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "align":
+ if attr == 'align':
return self._align
- elif attr == "space":
+ elif attr == 'space':
return self._space
- elif attr == "ordered":
+ elif attr == 'ordered':
return self._ordered
else:
return CanvasWidget.__getitem__(self, attr)
return []
def _xalign(self, left, right):
- if self._align == "left":
+ if self._align == 'left':
return left
- if self._align == "right":
+ if self._align == 'right':
return right
- if self._align == "center":
+ if self._align == 'center':
return (left + right) / 2
def _update(self, child):
y -= y2 - y1 + self._space
def __repr__(self):
- return "[Stack: " + repr(self._children)[1:-1] + "]"
+ return '[Stack: ' + repr(self._children)[1:-1] + ']'
# Provide an alias for the child_widgets() member.
children = CanvasWidget.child_widgets
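# A layout sketch tying the two containers together, assuming `canvas` exists;
# note the different alignment vocabularies: SequenceWidget (horizontal) takes
# 'top'/'bottom'/'center', while StackWidget (vertical) takes
# 'left'/'right'/'center'.
#   >>> np, vp = TextWidget(canvas, 'NP'), TextWidget(canvas, 'VP')
#   >>> row = SequenceWidget(canvas, np, vp, align='bottom', space=10)
#   >>> tree = StackWidget(canvas, TextWidget(canvas, 'S'), row, align='center')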
width -= 4
if height > 4:
height -= 4
- self._tag = canvas.create_line(1, 1, width, height, fill="")
+ self._tag = canvas.create_line(1, 1, width, height, fill='')
CanvasWidget.__init__(self, canvas, **attribs)
# note: width() and height() are already defined by CanvasWidget.
return [self._tag]
def __repr__(self):
- return "[Space]"
+ return '[Space]'
class ScrollWatcherWidget(CanvasWidget):
"""
bbox = self.bbox()
canvas = self.canvas()
- scrollregion = [int(n) for n in canvas["scrollregion"].split()]
+ scrollregion = [int(n) for n in canvas['scrollregion'].split()]
if len(scrollregion) != 4:
return
if (
or bbox[2] > scrollregion[2]
or bbox[3] > scrollregion[3]
):
- scrollregion = "%d %d %d %d" % (
+ scrollregion = '%d %d %d %d' % (
min(bbox[0], scrollregion[0]),
min(bbox[1], scrollregion[1]),
max(bbox[2], scrollregion[2]),
max(bbox[3], scrollregion[3]),
)
- canvas["scrollregion"] = scrollregion
+ canvas['scrollregion'] = scrollregion
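# In other words, the watcher only ever grows the scrollregion: with a current
# region of '0 0 100 100' and a child bounding box of (-10, 20, 150, 90), the
# mins and maxes above yield the new region '-10 0 150 100'.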
##//////////////////////////////////////////////////////
# If no parent was given, set up a top-level window.
if parent is None:
self._parent = Tk()
- self._parent.title("NLTK")
- self._parent.bind("<Control-p>", lambda e: self.print_to_file())
- self._parent.bind("<Control-x>", self.destroy)
- self._parent.bind("<Control-q>", self.destroy)
+ self._parent.title('NLTK')
+ self._parent.bind('<Control-p>', lambda e: self.print_to_file())
+ self._parent.bind('<Control-x>', self.destroy)
+ self._parent.bind('<Control-q>', self.destroy)
else:
self._parent = parent
# Create a frame for the canvas & scrollbars
self._frame = frame = Frame(self._parent)
self._canvas = canvas = Canvas(frame, **kw)
- xscrollbar = Scrollbar(self._frame, orient="horizontal")
- yscrollbar = Scrollbar(self._frame, orient="vertical")
- xscrollbar["command"] = canvas.xview
- yscrollbar["command"] = canvas.yview
- canvas["xscrollcommand"] = xscrollbar.set
- canvas["yscrollcommand"] = yscrollbar.set
- yscrollbar.pack(fill="y", side="right")
- xscrollbar.pack(fill="x", side="bottom")
- canvas.pack(expand=1, fill="both", side="left")
+ xscrollbar = Scrollbar(self._frame, orient='horizontal')
+ yscrollbar = Scrollbar(self._frame, orient='vertical')
+ xscrollbar['command'] = canvas.xview
+ yscrollbar['command'] = canvas.yview
+ canvas['xscrollcommand'] = xscrollbar.set
+ canvas['yscrollcommand'] = yscrollbar.set
+ yscrollbar.pack(fill='y', side='right')
+ xscrollbar.pack(fill='x', side='bottom')
+ canvas.pack(expand=1, fill='both', side='left')
# Set initial scroll region.
- scrollregion = "0 0 %s %s" % (canvas["width"], canvas["height"])
- canvas["scrollregion"] = scrollregion
+ scrollregion = '0 0 %s %s' % (canvas['width'], canvas['height'])
+ canvas['scrollregion'] = scrollregion
self._scrollwatcher = ScrollWatcherWidget(canvas)
# If no parent was given, pack the frame, and add a menu.
if parent is None:
- self.pack(expand=1, fill="both")
+ self.pack(expand=1, fill='both')
self._init_menubar()
def _init_menubar(self):
filemenu = Menu(menubar, tearoff=0)
filemenu.add_command(
- label="Print to Postscript",
+ label='Print to Postscript',
underline=0,
command=self.print_to_file,
- accelerator="Ctrl-p",
+ accelerator='Ctrl-p',
)
filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+ label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x'
)
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
self._parent.config(menu=menubar)
:rtype: None
"""
if filename is None:
- ftypes = [("Postscript files", ".ps"), ("All files", "*")]
- filename = asksaveasfilename(filetypes=ftypes, defaultextension=".ps")
+ ftypes = [('Postscript files', '.ps'), ('All files', '*')]
+ filename = asksaveasfilename(filetypes=ftypes, defaultextension='.ps')
if not filename:
return
(x0, y0, w, h) = self.scrollregion()
pagey=0,
)
# workaround for bug in Tk font handling
- postscript = postscript.replace(" 0 scalefont ", " 9 scalefont ")
- with open(filename, "wb") as f:
- f.write(postscript.encode("utf8"))
+ postscript = postscript.replace(' 0 scalefont ', ' 9 scalefont ')
+ with open(filename, 'wb') as f:
+ f.write(postscript.encode('utf8'))
def scrollregion(self):
"""
this ``CanvasFrame``.
:rtype: 4-tuple of int
"""
- (x1, y1, x2, y2) = self._canvas["scrollregion"].split()
+ (x1, y1, x2, y2) = self._canvas['scrollregion'].split()
return (int(x1), int(y1), int(x2), int(y2))
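# A minimal end-to-end sketch, assuming Tk is usable; 'diagram.ps' is just an
# illustrative filename (print_to_file() with no argument pops up the save
# dialog handled above instead):
#   >>> cf = CanvasFrame(width=300, height=200)
#   >>> cf.add_widget(TextWidget(cf.canvas(), 'hello'), 10, 10)
#   >>> cf.print_to_file('diagram.ps')   # Postscript for the whole scrollregion
#   >>> x0, y0, x1, y1 = cf.scrollregion()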
def canvas(self):
self._top = top = Toplevel(root)
top.title(title)
- b = Button(top, text="Ok", command=self.destroy)
- b.pack(side="bottom")
+ b = Button(top, text='Ok', command=self.destroy)
+ b.pack(side='bottom')
tbf = Frame(top)
- tbf.pack(expand=1, fill="both")
- scrollbar = Scrollbar(tbf, orient="vertical")
- scrollbar.pack(side="right", fill="y")
- textbox = Text(tbf, wrap="word", width=width, height=height, **textbox_options)
- textbox.insert("end", text)
- textbox["state"] = "disabled"
- textbox.pack(side="left", expand=1, fill="both")
- scrollbar["command"] = textbox.yview
- textbox["yscrollcommand"] = scrollbar.set
+ tbf.pack(expand=1, fill='both')
+ scrollbar = Scrollbar(tbf, orient='vertical')
+ scrollbar.pack(side='right', fill='y')
+ textbox = Text(tbf, wrap='word', width=width, height=height, **textbox_options)
+ textbox.insert('end', text)
+ textbox['state'] = 'disabled'
+ textbox.pack(side='left', expand=1, fill='both')
+ scrollbar['command'] = textbox.yview
+ textbox['yscrollcommand'] = scrollbar.set
# Make it easy to close the window.
- top.bind("q", self.destroy)
- top.bind("x", self.destroy)
- top.bind("c", self.destroy)
- top.bind("<Return>", self.destroy)
- top.bind("<Escape>", self.destroy)
+ top.bind('q', self.destroy)
+ top.bind('x', self.destroy)
+ top.bind('c', self.destroy)
+ top.bind('<Return>', self.destroy)
+ top.bind('<Escape>', self.destroy)
# Focus the scrollbar, so they can use up/down, etc.
scrollbar.focus()
def find_dimentions(self, text, width, height):
- lines = text.split("\n")
+ lines = text.split('\n')
if width is None:
maxwidth = max(len(line) for line in lines)
width = min(maxwidth, 80)
height = 0
for line in lines:
while len(line) > width:
- brk = line[:width].rfind(" ")
+ brk = line[:width].rfind(' ')
line = line[brk:]
height += 1
height += 1
"""
def __init__(
- self, parent, original_text="", instructions="", set_callback=None, title=None
+ self, parent, original_text='', instructions='', set_callback=None, title=None
):
self._parent = parent
self._original_text = original_text
# The text entry box.
entryframe = Frame(self._top)
- entryframe.pack(expand=1, fill="both", padx=5, pady=5, ipady=10)
+ entryframe.pack(expand=1, fill='both', padx=5, pady=5, ipady=10)
if instructions:
l = Label(entryframe, text=instructions)
- l.pack(side="top", anchor="w", padx=30)
+ l.pack(side='top', anchor='w', padx=30)
self._entry = Entry(entryframe, width=width)
- self._entry.pack(expand=1, fill="x", padx=30)
+ self._entry.pack(expand=1, fill='x', padx=30)
self._entry.insert(0, original_text)
# A divider
- divider = Frame(self._top, borderwidth=1, relief="sunken")
- divider.pack(fill="x", ipady=1, padx=10)
+ divider = Frame(self._top, borderwidth=1, relief='sunken')
+ divider.pack(fill='x', ipady=1, padx=10)
# The buttons.
buttons = Frame(self._top)
- buttons.pack(expand=0, fill="x", padx=5, pady=5)
- b = Button(buttons, text="Cancel", command=self._cancel, width=8)
- b.pack(side="right", padx=5)
- b = Button(buttons, text="Ok", command=self._ok, width=8, default="active")
- b.pack(side="left", padx=5)
- b = Button(buttons, text="Apply", command=self._apply, width=8)
- b.pack(side="left")
-
- self._top.bind("<Return>", self._ok)
- self._top.bind("<Control-q>", self._cancel)
- self._top.bind("<Escape>", self._cancel)
+ buttons.pack(expand=0, fill='x', padx=5, pady=5)
+ b = Button(buttons, text='Cancel', command=self._cancel, width=8)
+ b.pack(side='right', padx=5)
+ b = Button(buttons, text='Ok', command=self._ok, width=8, default='active')
+ b.pack(side='left', padx=5)
+ b = Button(buttons, text='Apply', command=self._apply, width=8)
+ b.pack(side='left')
+
+ self._top.bind('<Return>', self._ok)
+ self._top.bind('<Control-q>', self._cancel)
+ self._top.bind('<Escape>', self._cancel)
self._entry.focus()
def _reset(self, *e):
- self._entry.delete(0, "end")
+ self._entry.delete(0, 'end')
self._entry.insert(0, self._original_text)
if self._set_callback:
self._set_callback(self._original_text)
self._init_itemframe(options.copy())
# Set up key & mouse bindings.
- self._textwidget.bind("<KeyPress>", self._keypress)
- self._textwidget.bind("<ButtonPress>", self._buttonpress)
+ self._textwidget.bind('<KeyPress>', self._keypress)
+ self._textwidget.bind('<ButtonPress>', self._buttonpress)
# Fill in the given CFG's items.
self._items = None
return
self._items = list(items)
- self._textwidget["state"] = "normal"
- self._textwidget.delete("1.0", "end")
+ self._textwidget['state'] = 'normal'
+ self._textwidget.delete('1.0', 'end')
for item in items:
for (text, colortag) in self._item_repr(item):
- assert "\n" not in text, "item repr may not contain newline"
- self._textwidget.insert("end", text, colortag)
- self._textwidget.insert("end", "\n")
+ assert '\n' not in text, 'item repr may not contain newline'
+ self._textwidget.insert('end', text, colortag)
+ self._textwidget.insert('end', '\n')
# Remove the final newline
- self._textwidget.delete("end-1char", "end")
- self._textwidget.mark_set("insert", "1.0")
- self._textwidget["state"] = "disabled"
+ self._textwidget.delete('end-1char', 'end')
+ self._textwidget.mark_set('insert', '1.0')
+ self._textwidget['state'] = 'disabled'
# Clear all marks
self._marks.clear()
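# A hedged subclass sketch of the list widget above (ColorizedList in
# nltk.draw.util); the ProductionList name is purely illustrative.  _item_repr
# must return (text, colortag) pairs and, per the assertion above, the text may
# not contain a newline; the 'rule' tag is assumed to be set up by the
# subclass's own _init_colortags.
#   class ProductionList(ColorizedList):
#       def _init_colortags(self, textwidget, options):
#           textwidget.tag_config('rule', foreground='#006000')
#       def _item_repr(self, item):
#           return [('%s' % item, 'rule')]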
"""
if item is None:
self._marks.clear()
- self._textwidget.tag_remove("highlight", "1.0", "end+1char")
+ self._textwidget.tag_remove('highlight', '1.0', 'end+1char')
else:
index = self._items.index(item)
del self._marks[item]
- (start, end) = ("%d.0" % (index + 1), "%d.0" % (index + 2))
- self._textwidget.tag_remove("highlight", start, end)
+ (start, end) = ('%d.0' % (index + 1), '%d.0' % (index + 2))
+ self._textwidget.tag_remove('highlight', start, end)
def mark(self, item):
"""
"""
self._marks[item] = 1
index = self._items.index(item)
- (start, end) = ("%d.0" % (index + 1), "%d.0" % (index + 2))
- self._textwidget.tag_add("highlight", start, end)
+ (start, end) = ('%d.0' % (index + 1), '%d.0' % (index + 2))
+ self._textwidget.tag_add('highlight', start, end)
def markonly(self, item):
"""
the item is already visible, then do nothing.
"""
index = self._items.index(item)
- self._textwidget.see("%d.0" % (index + 1))
+ self._textwidget.see('%d.0' % (index + 1))
# ////////////////////////////////////////////////////////////
# Callbacks
single item as its argument. (The item selected
or the item moved to).
"""
- if event == "select":
- events = ["click1", "space", "return"]
- elif event == "move":
- events = ["up", "down", "next", "prior"]
+ if event == 'select':
+ events = ['click1', 'space', 'return']
+ elif event == 'move':
+ events = ['up', 'down', 'next', 'prior']
else:
events = [event]
"""
if event is None:
events = list(self._callbacks.keys())
- elif event == "select":
- events = ["click1", "space", "return"]
- elif event == "move":
- events = ["up", "down", "next", "prior"]
+ elif event == 'select':
+ events = ['click1', 'space', 'return']
+ elif event == 'move':
+ events = ['up', 'down', 'next', 'prior']
else:
events = [event]
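# A short sketch of the callback API above, assuming `plist` is an instance of
# this list widget: 'select' fans out to click1/space/return, and 'move' to
# up/down/next/prior, so one registration covers every way of choosing an item.
#   >>> def on_select(item):
#   ...     print('selected:', item)
#   >>> plist.add_callback('select', on_select)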
self._itemframe = Frame(self._parent)
# Create the basic Text widget & scrollbar.
- options.setdefault("background", "#e0e0e0")
+ options.setdefault('background', '#e0e0e0')
self._textwidget = Text(self._itemframe, **options)
- self._textscroll = Scrollbar(self._itemframe, takefocus=0, orient="vertical")
+ self._textscroll = Scrollbar(self._itemframe, takefocus=0, orient='vertical')
self._textwidget.config(yscrollcommand=self._textscroll.set)
self._textscroll.config(command=self._textwidget.yview)
- self._textscroll.pack(side="right", fill="y")
- self._textwidget.pack(expand=1, fill="both", side="left")
+ self._textscroll.pack(side='right', fill='y')
+ self._textwidget.pack(expand=1, fill='both', side='left')
# Initialize the colorization tags
self._textwidget.tag_config(
- "highlight", background="#e0ffff", border="1", relief="raised"
+ 'highlight', background='#e0ffff', border='1', relief='raised'
)
self._init_colortags(self._textwidget, options)
# How do I want to mark keyboard selection?
- self._textwidget.tag_config("sel", foreground="")
+ self._textwidget.tag_config('sel', foreground='')
self._textwidget.tag_config(
- "sel", foreground="", background="", border="", underline=1
+ 'sel', foreground='', background='', border='', underline=1
)
- self._textwidget.tag_lower("highlight", "sel")
+ self._textwidget.tag_lower('highlight', 'sel')
def _fire_callback(self, event, itemnum):
if event not in self._callbacks:
cb_func(item)
def _buttonpress(self, event):
- clickloc = "@%d,%d" % (event.x, event.y)
+ clickloc = '@%d,%d' % (event.x, event.y)
insert_point = self._textwidget.index(clickloc)
- itemnum = int(insert_point.split(".")[0]) - 1
- self._fire_callback("click%d" % event.num, itemnum)
+ itemnum = int(insert_point.split('.')[0]) - 1
+ self._fire_callback('click%d' % event.num, itemnum)
def _keypress(self, event):
- if event.keysym == "Return" or event.keysym == "space":
- insert_point = self._textwidget.index("insert")
- itemnum = int(insert_point.split(".")[0]) - 1
+ if event.keysym == 'Return' or event.keysym == 'space':
+ insert_point = self._textwidget.index('insert')
+ itemnum = int(insert_point.split('.')[0]) - 1
self._fire_callback(event.keysym.lower(), itemnum)
return
- elif event.keysym == "Down":
- delta = "+1line"
- elif event.keysym == "Up":
- delta = "-1line"
- elif event.keysym == "Next":
- delta = "+10lines"
- elif event.keysym == "Prior":
- delta = "-10lines"
+ elif event.keysym == 'Down':
+ delta = '+1line'
+ elif event.keysym == 'Up':
+ delta = '-1line'
+ elif event.keysym == 'Next':
+ delta = '+10lines'
+ elif event.keysym == 'Prior':
+ delta = '-10lines'
else:
- return "continue"
+ return 'continue'
- self._textwidget.mark_set("insert", "insert" + delta)
- self._textwidget.see("insert")
- self._textwidget.tag_remove("sel", "1.0", "end+1char")
- self._textwidget.tag_add("sel", "insert linestart", "insert lineend")
+ self._textwidget.mark_set('insert', 'insert' + delta)
+ self._textwidget.see('insert')
+ self._textwidget.tag_remove('sel', '1.0', 'end+1char')
+ self._textwidget.tag_add('sel', 'insert linestart', 'insert lineend')
- insert_point = self._textwidget.index("insert")
- itemnum = int(insert_point.split(".")[0]) - 1
+ insert_point = self._textwidget.index('insert')
+ itemnum = int(insert_point.split('.')[0]) - 1
self._fire_callback(event.keysym.lower(), itemnum)
- return "break"
+ return 'break'
##//////////////////////////////////////////////////////
class MutableOptionMenu(Menubutton):
def __init__(self, master, values, **options):
- self._callback = options.get("command")
- if "command" in options:
- del options["command"]
+ self._callback = options.get('command')
+ if 'command' in options:
+ del options['command']
# Create a variable
self._variable = variable = StringVar()
}
kw.update(options)
Widget.__init__(self, master, "menubutton", kw)
- self.widgetName = "tk_optionMenu"
+ self.widgetName = 'tk_optionMenu'
self._menu = Menu(self, name="menu", tearoff=0)
self.menuname = self._menu._w
self._menu.delete(i, i)
def __getitem__(self, name):
- if name == "menu":
+ if name == 'menu':
return self.__menu
return Widget.__getitem__(self, name)
def fill(cw):
from random import randint
- cw["fill"] = "#00%04d" % randint(0, 9999)
+ cw['fill'] = '#00%04d' % randint(0, 9999)
def color(cw):
from random import randint
- cw["color"] = "#ff%04d" % randint(0, 9999)
+ cw['color'] = '#ff%04d' % randint(0, 9999)
cf = CanvasFrame(closeenough=10, width=300, height=300)
c = cf.canvas()
- ct3 = TextWidget(c, "hiya there", draggable=1)
- ct2 = TextWidget(c, "o o\n||\n___\n U", draggable=1, justify="center")
- co = OvalWidget(c, ct2, outline="red")
- ct = TextWidget(c, "o o\n||\n\\___/", draggable=1, justify="center")
- cp = ParenWidget(c, ct, color="red")
- cb = BoxWidget(c, cp, fill="cyan", draggable=1, width=3, margin=10)
+ ct3 = TextWidget(c, 'hiya there', draggable=1)
+ ct2 = TextWidget(c, 'o o\n||\n___\n U', draggable=1, justify='center')
+ co = OvalWidget(c, ct2, outline='red')
+ ct = TextWidget(c, 'o o\n||\n\\___/', draggable=1, justify='center')
+ cp = ParenWidget(c, ct, color='red')
+ cb = BoxWidget(c, cp, fill='cyan', draggable=1, width=3, margin=10)
equation = SequenceWidget(
c,
- SymbolWidget(c, "forall"),
- TextWidget(c, "x"),
- SymbolWidget(c, "exists"),
- TextWidget(c, "y: "),
- TextWidget(c, "x"),
- SymbolWidget(c, "notequal"),
- TextWidget(c, "y"),
+ SymbolWidget(c, 'forall'),
+ TextWidget(c, 'x'),
+ SymbolWidget(c, 'exists'),
+ TextWidget(c, 'y: '),
+ TextWidget(c, 'x'),
+ SymbolWidget(c, 'notequal'),
+ TextWidget(c, 'y'),
)
space = SpaceWidget(c, 0, 30)
- cstack = StackWidget(c, cb, ct3, space, co, equation, align="center")
+ cstack = StackWidget(c, cb, ct3, space, co, equation, align='center')
prompt_msg = TextWidget(
- c, "try clicking\nand dragging", draggable=1, justify="center"
+ c, 'try clicking\nand dragging', draggable=1, justify='center'
)
cs = SequenceWidget(c, cstack, prompt_msg)
- zz = BracketWidget(c, cs, color="green4", width=3)
+ zz = BracketWidget(c, cs, color='green4', width=3)
cf.add_widget(zz, 60, 30)
cb.bind_click(fill)
# ShowText(None, 'title', ((('this is text'*150)+'\n')*5))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Feature Structures
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>,
# Rob Speer,
# Steven Bird <stevenbird1@gmail.com>
or if you plan to use them as dictionary keys, it is strongly
recommended that you use full-fledged ``FeatStruct`` objects.
"""
+from __future__ import print_function, unicode_literals, division
import re
import copy
from functools import total_ordering
+from six import integer_types, string_types
+
from nltk.internals import read_str, raise_unorderable_types
from nltk.sem.logic import (
Variable,
LogicParser,
LogicalExpressionException,
)
+from nltk.compat import python_2_unicode_compatible, unicode_repr
+
######################################################################
# Feature Structure
return FeatDict.__new__(FeatDict, features, **morefeatures)
elif morefeatures:
raise TypeError(
- "Keyword arguments may only be specified "
- "if features is None or is a mapping."
+ 'Keyword arguments may only be specified '
+ 'if features is None or is a mapping.'
)
- if isinstance(features, str):
+ if isinstance(features, string_types):
if FeatStructReader._START_FDICT_RE.match(features):
return FeatDict.__new__(FeatDict, features, **morefeatures)
else:
elif _is_sequence(features):
return FeatList.__new__(FeatList, features)
else:
- raise TypeError("Expected string or mapping or sequence")
+ raise TypeError('Expected string or mapping or sequence')
# Otherwise, construct the object as normal.
else:
otherwise, raise ``TypeError``.
"""
if not self._frozen:
- raise TypeError("FeatStructs must be frozen before they " "can be hashed.")
+ raise TypeError('FeatStructs must be frozen before they ' 'can be hashed.')
try:
return self._hash
except AttributeError:
_FROZEN_NOTICE = "\n%sIf self is frozen, raise ValueError."
-def _check_frozen(method, indent=""):
+def _check_frozen(method, indent=''):
"""
Given a method function, return a new method function that first
checks if ``self._frozen`` is true; and if so, raises ``ValueError``
return method(self, *args, **kwargs)
wrapped.__name__ = method.__name__
- wrapped.__doc__ = (method.__doc__ or "") + (_FROZEN_NOTICE % indent)
+ wrapped.__doc__ = (method.__doc__ or '') + (_FROZEN_NOTICE % indent)
return wrapped
######################################################################
-
+@python_2_unicode_compatible
class FeatDict(FeatStruct, dict):
"""
A feature structure that acts like a Python dictionary. I.e., a
``morefeatures``, then the value from ``morefeatures`` will be
used.
"""
- if isinstance(features, str):
+ if isinstance(features, string_types):
FeatStructReader().fromstring(features, self)
self.update(**morefeatures)
else:
def __getitem__(self, name_or_path):
"""If the feature with the given name or path exists, return
its value; otherwise, raise ``KeyError``."""
- if isinstance(name_or_path, (str, Feature)):
+ if isinstance(name_or_path, (string_types, Feature)):
return dict.__getitem__(self, name_or_path)
elif isinstance(name_or_path, tuple):
try:
its value; otherwise, raise ``KeyError``."""
if self._frozen:
raise ValueError(_FROZEN_ERROR)
- if isinstance(name_or_path, (str, Feature)):
+ if isinstance(name_or_path, (string_types, Feature)):
return dict.__delitem__(self, name_or_path)
elif isinstance(name_or_path, tuple):
if len(name_or_path) == 0:
``KeyError``."""
if self._frozen:
raise ValueError(_FROZEN_ERROR)
- if isinstance(name_or_path, (str, Feature)):
+ if isinstance(name_or_path, (string_types, Feature)):
return dict.__setitem__(self, name_or_path, value)
elif isinstance(name_or_path, tuple):
if len(name_or_path) == 0:
raise ValueError(_FROZEN_ERROR)
if features is None:
items = ()
- elif hasattr(features, "items") and callable(features.items):
+ elif hasattr(features, 'items') and callable(features.items):
items = features.items()
- elif hasattr(features, "__iter__"):
+ elif hasattr(features, '__iter__'):
items = features
else:
- raise ValueError("Expected mapping or list of tuples")
+ raise ValueError('Expected mapping or list of tuples')
for key, val in items:
- if not isinstance(key, (str, Feature)):
- raise TypeError("Feature names must be strings")
+ if not isinstance(key, (string_types, Feature)):
+ raise TypeError('Feature names must be strings')
self[key] = val
for key, val in morefeatures.items():
- if not isinstance(key, (str, Feature)):
- raise TypeError("Feature names must be strings")
+ if not isinstance(key, (string_types, Feature)):
+ raise TypeError('Feature names must be strings')
self[key] = val
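# A small sketch of the mapping interface above: feature names must be strings
# (or Feature objects), and tuple keys are treated as feature paths.
#   >>> fs = FeatDict('[agr=[num=pl]]')
#   >>> fs.update(case='nom')
#   >>> fs['agr', 'num']      # path access, equivalent to fs['agr']['num']
#   'pl'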
##////////////////////////////////////////////////////////////
Display a multi-line representation of this feature dictionary
as an FVM (feature value matrix).
"""
- return "\n".join(self._str(self._find_reentrances({}), {}))
+ return '\n'.join(self._str(self._find_reentrances({}), {}))
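# Roughly what the FVM rendering produces (the exact spacing comes from the
# alignment logic in _str below):
#   >>> print(FeatStruct(NUMBER='sg', PERSON=3))
#   [ NUMBER = 'sg' ]
#   [ PERSON = 3    ]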
def _repr(self, reentrances, reentrance_ids):
segments = []
- prefix = ""
- suffix = ""
+ prefix = ''
+ suffix = ''
# If this is the first time we've seen a reentrant structure,
# then assign it a unique identifier.
# sorting note: keys are unique strings, so we'll never fall
# through to comparing values.
for (fname, fval) in sorted(self.items()):
- display = getattr(fname, "display", None)
+ display = getattr(fname, 'display', None)
if id(fval) in reentrance_ids:
- segments.append("%s->(%s)" % (fname, reentrance_ids[id(fval)]))
+ segments.append('%s->(%s)' % (fname, reentrance_ids[id(fval)]))
elif (
- display == "prefix"
+ display == 'prefix'
and not prefix
- and isinstance(fval, (Variable, str))
+ and isinstance(fval, (Variable, string_types))
):
- prefix = "%s" % fval
- elif display == "slash" and not suffix:
+ prefix = '%s' % fval
+ elif display == 'slash' and not suffix:
if isinstance(fval, Variable):
- suffix = "/%s" % fval.name
+ suffix = '/%s' % fval.name
else:
- suffix = "/%s" % repr(fval)
+ suffix = '/%s' % unicode_repr(fval)
elif isinstance(fval, Variable):
- segments.append("%s=%s" % (fname, fval.name))
+ segments.append('%s=%s' % (fname, fval.name))
elif fval is True:
- segments.append("+%s" % fname)
+ segments.append('+%s' % fname)
elif fval is False:
- segments.append("-%s" % fname)
+ segments.append('-%s' % fname)
elif isinstance(fval, Expression):
- segments.append("%s=<%s>" % (fname, fval))
+ segments.append('%s=<%s>' % (fname, fval))
elif not isinstance(fval, FeatStruct):
- segments.append("%s=%s" % (fname, repr(fval)))
+ segments.append('%s=%s' % (fname, unicode_repr(fval)))
else:
fval_repr = fval._repr(reentrances, reentrance_ids)
- segments.append("%s=%s" % (fname, fval_repr))
+ segments.append('%s=%s' % (fname, fval_repr))
# If it's reentrant, then add on an identifier tag.
if reentrances[id(self)]:
- prefix = "(%s)%s" % (reentrance_ids[id(self)], prefix)
- return "%s[%s]%s" % (prefix, ", ".join(segments), suffix)
+ prefix = '(%s)%s' % (reentrance_ids[id(self)], prefix)
+ return '%s[%s]%s' % (prefix, ', '.join(segments), suffix)
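# Reentrance in a repr, as handled above: a shared value is printed once with a
# numeric tag and then referenced with '->':
#   >>> fs = FeatStruct('[A=(1)[B=b], C->(1)]')
#   >>> fs['A'] is fs['C']
#   True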
def _str(self, reentrances, reentrance_ids):
"""
# Special case: empty feature dict.
if len(self) == 0:
if reentrances[id(self)]:
- return ["(%s) []" % reentrance_ids[id(self)]]
+ return ['(%s) []' % reentrance_ids[id(self)]]
else:
- return ["[]"]
+ return ['[]']
# What's the longest feature name? Use this to align names.
maxfnamelen = max(len("%s" % k) for k in self.keys())
for (fname, fval) in sorted(self.items()):
fname = ("%s" % fname).ljust(maxfnamelen)
if isinstance(fval, Variable):
- lines.append("%s = %s" % (fname, fval.name))
+ lines.append('%s = %s' % (fname, fval.name))
elif isinstance(fval, Expression):
- lines.append("%s = <%s>" % (fname, fval))
+ lines.append('%s = <%s>' % (fname, fval))
elif isinstance(fval, FeatList):
fval_repr = fval._repr(reentrances, reentrance_ids)
- lines.append("%s = %s" % (fname, repr(fval_repr)))
+ lines.append('%s = %s' % (fname, unicode_repr(fval_repr)))
elif not isinstance(fval, FeatDict):
# It's not a nested feature structure -- just print it.
- lines.append("%s = %s" % (fname, repr(fval)))
+ lines.append('%s = %s' % (fname, unicode_repr(fval)))
elif id(fval) in reentrance_ids:
# It's a feature structure we've seen before -- print
# the reentrance id.
- lines.append("%s -> (%s)" % (fname, reentrance_ids[id(fval)]))
+ lines.append('%s -> (%s)' % (fname, reentrance_ids[id(fval)]))
else:
# It's a new feature structure. Separate it from
# other values by a blank line.
- if lines and lines[-1] != "":
- lines.append("")
+ if lines and lines[-1] != '':
+ lines.append('')
# Recursively print the feature's value (fval).
fval_lines = fval._str(reentrances, reentrance_ids)
# Indent each line to make room for fname.
- fval_lines = [(" " * (maxfnamelen + 3)) + l for l in fval_lines]
+ fval_lines = [(' ' * (maxfnamelen + 3)) + l for l in fval_lines]
# Pick which line we'll display fname on, & splice it in.
nameline = (len(fval_lines) - 1) // 2
fval_lines[nameline] = (
- fname + " =" + fval_lines[nameline][maxfnamelen + 2 :]
+ fname + ' =' + fval_lines[nameline][maxfnamelen + 2 :]
)
# Add the feature structure to the output.
lines += fval_lines
# Separate FeatStructs by a blank line.
- lines.append("")
+ lines.append('')
# Get rid of any excess blank lines.
- if lines[-1] == "":
+ if lines[-1] == '':
lines.pop()
# Add brackets around everything.
maxlen = max(len(line) for line in lines)
- lines = ["[ %s%s ]" % (line, " " * (maxlen - len(line))) for line in lines]
+ lines = ['[ %s%s ]' % (line, ' ' * (maxlen - len(line))) for line in lines]
# If it's reentrant, then add on an identifier tag.
if reentrances[id(self)]:
- idstr = "(%s) " % reentrance_ids[id(self)]
- lines = [(" " * len(idstr)) + l for l in lines]
+ idstr = '(%s) ' % reentrance_ids[id(self)]
+ lines = [(' ' * len(idstr)) + l for l in lines]
idline = (len(lines) - 1) // 2
lines[idline] = idstr + lines[idline][len(idstr) :]
``FeatStructReader``. Otherwise, it should be a sequence
of basic values and nested feature structures.
"""
- if isinstance(features, str):
+ if isinstance(features, string_types):
FeatStructReader().fromstring(features, self)
else:
list.__init__(self, features)
_INDEX_ERROR = "Expected int or feature path. Got %r."
def __getitem__(self, name_or_path):
- if isinstance(name_or_path, int):
+ if isinstance(name_or_path, integer_types):
return list.__getitem__(self, name_or_path)
elif isinstance(name_or_path, tuple):
try:
its value; otherwise, raise ``KeyError``."""
if self._frozen:
raise ValueError(_FROZEN_ERROR)
- if isinstance(name_or_path, (int, slice)):
+ if isinstance(name_or_path, (integer_types, slice)):
return list.__delitem__(self, name_or_path)
elif isinstance(name_or_path, tuple):
if len(name_or_path) == 0:
``KeyError``."""
if self._frozen:
raise ValueError(_FROZEN_ERROR)
- if isinstance(name_or_path, (int, slice)):
+ if isinstance(name_or_path, (integer_types, slice)):
return list.__setitem__(self, name_or_path, value)
elif isinstance(name_or_path, tuple):
if len(name_or_path) == 0:
if reentrances[id(self)]:
assert id(self) not in reentrance_ids
reentrance_ids[id(self)] = repr(len(reentrance_ids) + 1)
- prefix = "(%s)" % reentrance_ids[id(self)]
+ prefix = '(%s)' % reentrance_ids[id(self)]
else:
- prefix = ""
+ prefix = ''
segments = []
for fval in self:
if id(fval) in reentrance_ids:
- segments.append("->(%s)" % reentrance_ids[id(fval)])
+ segments.append('->(%s)' % reentrance_ids[id(fval)])
elif isinstance(fval, Variable):
segments.append(fval.name)
elif isinstance(fval, Expression):
- segments.append("%s" % fval)
+ segments.append('%s' % fval)
elif isinstance(fval, FeatStruct):
segments.append(fval._repr(reentrances, reentrance_ids))
else:
- segments.append("%s" % repr(fval))
+ segments.append('%s' % unicode_repr(fval))
- return "%s[%s]" % (prefix, ", ".join(segments))
+ return '%s[%s]' % (prefix, ', '.join(segments))
######################################################################
######################################################################
-def substitute_bindings(fstruct, bindings, fs_class="default"):
+def substitute_bindings(fstruct, bindings, fs_class='default'):
"""
Return the feature structure that is obtained by replacing each
variable bound by ``bindings`` with its binding. If a variable is
:type bindings: dict(Variable -> any)
:param bindings: A dictionary mapping from variables to values.
"""
- if fs_class == "default":
+ if fs_class == 'default':
fs_class = _default_fs_class(fstruct)
fstruct = copy.deepcopy(fstruct)
_substitute_bindings(fstruct, bindings, fs_class, set())
elif _is_sequence(fstruct):
items = enumerate(fstruct)
else:
- raise ValueError("Expected mapping or sequence")
+ raise ValueError('Expected mapping or sequence')
for (fname, fval) in items:
while isinstance(fval, Variable) and fval in bindings:
fval = fstruct[fname] = bindings[fval]
fstruct[fname] = fval.substitute_bindings(bindings)
-def retract_bindings(fstruct, bindings, fs_class="default"):
+def retract_bindings(fstruct, bindings, fs_class='default'):
"""
Return the feature structure that is obtained by replacing each
feature structure value that is bound by ``bindings`` with the
values in ``bindings`` may be modified if they are contained in
``fstruct``.
"""
- if fs_class == "default":
+ if fs_class == 'default':
fs_class = _default_fs_class(fstruct)
(fstruct, new_bindings) = copy.deepcopy((fstruct, bindings))
bindings.update(new_bindings)
elif _is_sequence(fstruct):
items = enumerate(fstruct)
else:
- raise ValueError("Expected mapping or sequence")
+ raise ValueError('Expected mapping or sequence')
for (fname, fval) in items:
if isinstance(fval, fs_class):
if id(fval) in inv_bindings:
_retract_bindings(fval, inv_bindings, fs_class, visited)
-def find_variables(fstruct, fs_class="default"):
+def find_variables(fstruct, fs_class='default'):
"""
:return: The set of variables used by this feature structure.
:rtype: set(Variable)
"""
- if fs_class == "default":
+ if fs_class == 'default':
fs_class = _default_fs_class(fstruct)
return _variables(fstruct, set(), fs_class, set())
elif _is_sequence(fstruct):
items = enumerate(fstruct)
else:
- raise ValueError("Expected mapping or sequence")
+ raise ValueError('Expected mapping or sequence')
for (fname, fval) in items:
if isinstance(fval, Variable):
vars.add(fval)
def rename_variables(
- fstruct, vars=None, used_vars=(), new_vars=None, fs_class="default"
+ fstruct, vars=None, used_vars=(), new_vars=None, fs_class='default'
):
"""
Return the feature structure that is obtained by replacing
If new_vars is not specified, then an empty dictionary is used.
"""
- if fs_class == "default":
+ if fs_class == 'default':
fs_class = _default_fs_class(fstruct)
# Default values:
elif _is_sequence(fstruct):
items = enumerate(fstruct)
else:
- raise ValueError("Expected mapping or sequence")
+ raise ValueError('Expected mapping or sequence')
for (fname, fval) in items:
if isinstance(fval, Variable):
# If it's in new_vars, then rebind it.
def _rename_variable(var, used_vars):
- name, n = re.sub("\d+$", "", var.name), 2
+ name, n = re.sub(r'\d+$', '', var.name), 2
if not name:
- name = "?"
- while Variable("%s%s" % (name, n)) in used_vars:
+ name = '?'
+ while Variable('%s%s' % (name, n)) in used_vars:
n += 1
- return Variable("%s%s" % (name, n))
+ return Variable('%s%s' % (name, n))
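# The helper above strips any trailing digits and counts upward until the name
# is unused, so renaming '?x' typically yields '?x2'; a short sketch:
#   >>> fs = FeatStruct('[A=?x, B=?x]')
#   >>> fs2 = rename_variables(fs)
#   >>> fs2['A'].name
#   '?x2'
#   >>> fs2['A'] == fs2['B']      # both occurrences are renamed consistently
#   True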
-def remove_variables(fstruct, fs_class="default"):
+def remove_variables(fstruct, fs_class='default'):
"""
:rtype: FeatStruct
:return: The feature structure that is obtained by deleting
all features whose values are ``Variables``.
"""
- if fs_class == "default":
+ if fs_class == 'default':
fs_class = _default_fs_class(fstruct)
return _remove_variables(copy.deepcopy(fstruct), fs_class, set())
elif _is_sequence(fstruct):
items = list(enumerate(fstruct))
else:
- raise ValueError("Expected mapping or sequence")
+ raise ValueError('Expected mapping or sequence')
for (fname, fval) in items:
if isinstance(fval, Variable):
######################################################################
-
+@python_2_unicode_compatible
class _UnificationFailure(object):
def __repr__(self):
- return "nltk.featstruct.UnificationFailure"
+ return 'nltk.featstruct.UnificationFailure'
UnificationFailure = _UnificationFailure()
trace=False,
fail=None,
rename_vars=True,
- fs_class="default",
+ fs_class='default',
):
"""
Unify ``fstruct1`` with ``fstruct2``, and return the resulting feature
"""
# Decide which class(es) will be treated as feature structures,
# for the purposes of unification.
- if fs_class == "default":
+ if fs_class == 'default':
fs_class = _default_fs_class(fstruct1)
if _default_fs_class(fstruct2) != fs_class:
raise ValueError(
# Unifying two mappings:
if _is_mapping(fstruct1) and _is_mapping(fstruct2):
for fname in fstruct1:
- if getattr(fname, "default", None) is not None:
+ if getattr(fname, 'default', None) is not None:
fstruct2.setdefault(fname, fname.default)
for fname in fstruct2:
- if getattr(fname, "default", None) is not None:
+ if getattr(fname, 'default', None) is not None:
fstruct1.setdefault(fname, fname.default)
# Unify any values that are defined in both fstruct1 and
return UnificationFailure
# Unifying anything else: not allowed!
- raise TypeError("Expected mappings or sequences")
+ raise TypeError('Expected mappings or sequences')
def _unify_feature_values(
# Sanity check: unify value should be symmetric
if isinstance(fval2, CustomFeatureValue) and result != fval2.unify(fval1):
raise AssertionError(
- "CustomFeatureValue objects %r and %r disagree "
- "about unification value: %r vs. %r"
+ 'CustomFeatureValue objects %r and %r disagree '
+ 'about unification value: %r vs. %r'
% (fval1, fval2, result, fval2.unify(fval1))
)
elif isinstance(fval2, CustomFeatureValue):
elif _is_sequence(fstruct):
items = enumerate(fstruct)
else:
- raise ValueError("Expected mapping or sequence")
+ raise ValueError('Expected mapping or sequence')
for fname, fval in items:
if isinstance(fval, fs_class):
# Replace w/ forwarded value.
def _trace_unify_start(path, fval1, fval2):
if path == ():
- print("\nUnification trace:")
+ print('\nUnification trace:')
else:
- fullname = ".".join("%s" % n for n in path)
- print(" " + "| " * (len(path) - 1) + "|")
- print(" " + "| " * (len(path) - 1) + "| Unify feature: %s" % fullname)
- print(" " + "| " * len(path) + " / " + _trace_valrepr(fval1))
- print(" " + "| " * len(path) + "|\\ " + _trace_valrepr(fval2))
+ fullname = '.'.join("%s" % n for n in path)
+ print(' ' + '| ' * (len(path) - 1) + '|')
+ print(' ' + '| ' * (len(path) - 1) + '| Unify feature: %s' % fullname)
+ print(' ' + '| ' * len(path) + ' / ' + _trace_valrepr(fval1))
+ print(' ' + '| ' * len(path) + '|\\ ' + _trace_valrepr(fval2))
def _trace_unify_identity(path, fval1):
- print(" " + "| " * len(path) + "|")
- print(" " + "| " * len(path) + "| (identical objects)")
- print(" " + "| " * len(path) + "|")
- print(" " + "| " * len(path) + "+-->" + repr(fval1))
+ print(' ' + '| ' * len(path) + '|')
+ print(' ' + '| ' * len(path) + '| (identical objects)')
+ print(' ' + '| ' * len(path) + '|')
+ print(' ' + '| ' * len(path) + '+-->' + unicode_repr(fval1))
def _trace_unify_fail(path, result):
if result is UnificationFailure:
- resume = ""
+ resume = ''
else:
- resume = " (nonfatal)"
- print(" " + "| " * len(path) + "| |")
- print(" " + "X " * len(path) + "X X <-- FAIL" + resume)
+ resume = ' (nonfatal)'
+ print(' ' + '| ' * len(path) + '| |')
+ print(' ' + 'X ' * len(path) + 'X X <-- FAIL' + resume)
def _trace_unify_succeed(path, fval1):
# Print the result.
- print(" " + "| " * len(path) + "|")
- print(" " + "| " * len(path) + "+-->" + repr(fval1))
+ print(' ' + '| ' * len(path) + '|')
+ print(' ' + '| ' * len(path) + '+-->' + unicode_repr(fval1))
def _trace_bindings(path, bindings):
# Print the bindings (if any).
if len(bindings) > 0:
binditems = sorted(bindings.items(), key=lambda v: v[0].name)
- bindstr = "{%s}" % ", ".join(
- "%s: %s" % (var, _trace_valrepr(val)) for (var, val) in binditems
+ bindstr = '{%s}' % ', '.join(
+ '%s: %s' % (var, _trace_valrepr(val)) for (var, val) in binditems
)
- print(" " + "| " * len(path) + " Bindings: " + bindstr)
+ print(' ' + '| ' * len(path) + ' Bindings: ' + bindstr)
def _trace_valrepr(val):
if isinstance(val, Variable):
- return "%s" % val
+ return '%s' % val
else:
- return "%s" % repr(val)
+ return '%s' % unicode_repr(val)
def subsumes(fstruct1, fstruct2):
def _is_mapping(v):
- return hasattr(v, "__contains__") and hasattr(v, "keys")
+ return hasattr(v, '__contains__') and hasattr(v, 'keys')
def _is_sequence(v):
return (
- hasattr(v, "__iter__")
- and hasattr(v, "__len__")
- and not isinstance(v, str)
+ hasattr(v, '__iter__')
+ and hasattr(v, '__len__')
+ and not isinstance(v, string_types)
)
return (dict, list)
else:
raise ValueError(
- "To unify objects of type %s, you must specify "
- "fs_class explicitly." % obj.__class__.__name__
+ 'To unify objects of type %s, you must specify '
+ 'fs_class explicitly.' % obj.__class__.__name__
)
return bindings.get(v, v)
-
+@python_2_unicode_compatible
class FeatureValueTuple(SubstituteBindingsSequence, tuple):
"""
A base feature value that is a tuple of other base feature values.
def __repr__(self): # [xx] really use %s here?
if len(self) == 0:
- return "()"
- return "(%s)" % ", ".join("%s" % (b,) for b in self)
-
+ return '()'
+ return '(%s)' % ', '.join('%s' % (b,) for b in self)
+@python_2_unicode_compatible
class FeatureValueSet(SubstituteBindingsSequence, frozenset):
"""
A base feature value that is a set of other base feature values.
def __repr__(self): # [xx] really use %s here?
if len(self) == 0:
- return "{/}" # distinguish from dict.
+ return '{/}' # distinguish from dict.
# n.b., we sort the string reprs of our elements, to ensure
# that our own repr is deterministic.
- return "{%s}" % ", ".join(sorted("%s" % (b,) for b in self))
+ return '{%s}' % ', '.join(sorted('%s' % (b,) for b in self))
__str__ = __repr__
-
+@python_2_unicode_compatible
class FeatureValueUnion(SubstituteBindingsSequence, frozenset):
"""
A base feature value that represents the union of two or more
# n.b., we sort the string reprs of our elements, to ensure
# that our own repr is deterministic. also, note that len(self)
# is guaranteed to be 2 or more.
- return "{%s}" % "+".join(sorted("%s" % (b,) for b in self))
-
+ return '{%s}' % '+'.join(sorted('%s' % (b,) for b in self))
+@python_2_unicode_compatible
class FeatureValueConcat(SubstituteBindingsSequence, tuple):
"""
A base feature value that represents the concatenation of two or
def __repr__(self):
# n.b.: len(self) is guaranteed to be 2 or more.
- return "(%s)" % "+".join("%s" % (b,) for b in self)
+ return '(%s)' % '+'.join('%s' % (b,) for b in self)
def _flatten(lst, cls):
@total_ordering
-
+@python_2_unicode_compatible
class Feature(object):
"""
A feature identifier that's specialized to put additional
"""
def __init__(self, name, default=None, display=None):
- assert display in (None, "prefix", "slash")
+ assert display in (None, 'prefix', 'slash')
self._name = name # [xx] rename to .identifier?
self._default = default # [xx] not implemented yet.
self._display = display
- if self._display == "prefix":
+ if self._display == 'prefix':
self._sortkey = (-1, self._name)
- elif self._display == "slash":
+ elif self._display == 'slash':
self._sortkey = (1, self._name)
else:
self._sortkey = (0, self._name)
return self._display
def __repr__(self):
- return "*%s*" % self.name
+ return '*%s*' % self.name
def __lt__(self, other):
- if isinstance(other, str):
+ if isinstance(other, string_types):
return True
if not isinstance(other, Feature):
raise_unorderable_types("<", self, other)
class RangeFeature(Feature):
- RANGE_RE = re.compile("(-?\d+):(-?\d+)")
+ RANGE_RE = re.compile(r'(-?\d+):(-?\d+)')
def read_value(self, s, position, reentrances, parser):
m = self.RANGE_RE.match(s, position)
if not m:
- raise ValueError("range", position)
+ raise ValueError('range', position)
return (int(m.group(1)), int(m.group(2))), m.end()
def unify_base_values(self, fval1, fval2, bindings):
return rng
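# A hedged sketch of the range feature above ('span' is just an illustrative
# name): values parse as 'lo:hi' pairs and unify by intersecting the ranges,
# failing when the intersection is empty.
#   >>> rf = RangeFeature('span')
#   >>> rf.unify_base_values((1, 5), (3, 8), {})
#   (3, 5)
#   >>> rf.unify_base_values((1, 2), (3, 4), {}) is UnificationFailure
#   True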
-SLASH = SlashFeature("slash", default=False, display="slash")
-TYPE = Feature("type", display="prefix")
+SLASH = SlashFeature('slash', default=False, display='slash')
+TYPE = Feature('type', display='prefix')
######################################################################
If this base value unifies with ``other``, then return the
unified value. Otherwise, return ``UnificationFailure``.
"""
- raise NotImplementedError("abstract base class")
+ raise NotImplementedError('abstract base class')
def __eq__(self, other):
- raise NotImplementedError("abstract base class")
+ raise NotImplementedError('abstract base class')
def __ne__(self, other):
return not self == other
def __lt__(self, other):
- raise NotImplementedError("abstract base class")
+ raise NotImplementedError('abstract base class')
def __hash__(self):
- raise TypeError("%s objects or unhashable" % self.__class__.__name__)
+ raise TypeError('%s objects are unhashable' % self.__class__.__name__)
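# A hedged subclass sketch: a custom base value only needs unify, __eq__ and
# __lt__, and unify should return UnificationFailure (not raise) on a clash.
# The Polarity name is purely illustrative.
#   class Polarity(CustomFeatureValue):
#       def __init__(self, value):
#           self.value = value
#       def unify(self, other):
#           if isinstance(other, Polarity) and other.value == self.value:
#               return self
#           return UnificationFailure
#       def __eq__(self, other):
#           return isinstance(other, Polarity) and other.value == self.value
#       def __lt__(self, other):
#           return isinstance(other, Polarity) and self.value < other.value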
######################################################################
self._prefix_feature = None
self._slash_feature = None
for feature in features:
- if feature.display == "slash":
+ if feature.display == 'slash':
if self._slash_feature:
- raise ValueError("Multiple features w/ display=slash")
+ raise ValueError('Multiple features w/ display=slash')
self._slash_feature = feature
- if feature.display == "prefix":
+ if feature.display == 'prefix':
if self._prefix_feature:
- raise ValueError("Multiple features w/ display=prefix")
+ raise ValueError('Multiple features w/ display=prefix')
self._prefix_feature = feature
self._features_with_defaults = [
feature for feature in features if feature.default is not None
s = s.strip()
value, position = self.read_partial(s, 0, {}, fstruct)
if position != len(s):
- self._error(s, "end of string", position)
+ self._error(s, 'end of string', position)
return value
- _START_FSTRUCT_RE = re.compile(r"\s*(?:\((\d+)\)\s*)?(\??[\w-]+)?(\[)")
- _END_FSTRUCT_RE = re.compile(r"\s*]\s*")
- _SLASH_RE = re.compile(r"/")
+ _START_FSTRUCT_RE = re.compile(r'\s*(?:\((\d+)\)\s*)?(\??[\w-]+)?(\[)')
+ _END_FSTRUCT_RE = re.compile(r'\s*]\s*')
+ _SLASH_RE = re.compile(r'/')
_FEATURE_NAME_RE = re.compile(r'\s*([+-]?)([^\s\(\)<>"\'\-=\[\],]+)\s*')
- _REENTRANCE_RE = re.compile(r"\s*->\s*")
- _TARGET_RE = re.compile(r"\s*\((\d+)\)\s*")
- _ASSIGN_RE = re.compile(r"\s*=\s*")
- _COMMA_RE = re.compile(r"\s*,\s*")
- _BARE_PREFIX_RE = re.compile(r"\s*(?:\((\d+)\)\s*)?(\??[\w-]+\s*)()")
+ _REENTRANCE_RE = re.compile(r'\s*->\s*')
+ _TARGET_RE = re.compile(r'\s*\((\d+)\)\s*')
+ _ASSIGN_RE = re.compile(r'\s*=\s*')
+ _COMMA_RE = re.compile(r'\s*,\s*')
+ _BARE_PREFIX_RE = re.compile(r'\s*(?:\((\d+)\)\s*)?(\??[\w-]+\s*)()')
# This one is used to distinguish fdicts from flists:
_START_FDICT_RE = re.compile(
- r"(%s)|(%s\s*(%s\s*(=|->)|[+-]%s|\]))"
+ r'(%s)|(%s\s*(%s\s*(=|->)|[+-]%s|\]))'
% (
_BARE_PREFIX_RE.pattern,
_START_FSTRUCT_RE.pattern,
if not match:
match = self._BARE_PREFIX_RE.match(s, position)
if not match:
- raise ValueError("open bracket or identifier", position)
+ raise ValueError('open bracket or identifier', position)
position = match.end()
# If there was an identifier, record it.
if match.group(1):
identifier = match.group(1)
if identifier in reentrances:
- raise ValueError("new identifier", match.start(1))
+ raise ValueError('new identifier', match.start(1))
reentrances[identifier] = fstruct
if isinstance(fstruct, FeatDict):
def _read_partial_featlist(self, s, position, match, reentrances, fstruct):
# Prefix features are not allowed:
if match.group(2):
- raise ValueError("open bracket")
+ raise ValueError('open bracket')
# Bare prefixes are not allowed:
if not match.group(3):
- raise ValueError("open bracket")
+ raise ValueError('open bracket')
# Build a list of the features defined by the structure.
while position < len(s):
position = match.end()
match = self._TARGET_RE.match(s, position)
if not match:
- raise ValueError("identifier", position)
+ raise ValueError('identifier', position)
target = match.group(1)
if target not in reentrances:
- raise ValueError("bound identifier", position)
+ raise ValueError('bound identifier', position)
position = match.end()
fstruct.append(reentrances[target])
# Otherwise, there should be a comma
match = self._COMMA_RE.match(s, position)
if match is None:
- raise ValueError("comma", position)
+ raise ValueError('comma', position)
position = match.end()
# We never saw a close bracket.
- raise ValueError("close bracket", position)
+ raise ValueError('close bracket', position)
def _read_partial_featdict(self, s, position, match, reentrances, fstruct):
# If there was a prefix feature, record it.
if match.group(2):
if self._prefix_feature is None:
- raise ValueError("open bracket or identifier", match.start(2))
+ raise ValueError('open bracket or identifier', match.start(2))
prefixval = match.group(2).strip()
- if prefixval.startswith("?"):
+ if prefixval.startswith('?'):
prefixval = Variable(prefixval)
fstruct[self._prefix_feature] = prefixval
# Get the feature name's name
match = self._FEATURE_NAME_RE.match(s, position)
if match is None:
- raise ValueError("feature name", position)
+ raise ValueError('feature name', position)
name = match.group(2)
position = match.end()
# Check if it's a special feature.
- if name[0] == "*" and name[-1] == "*":
+ if name[0] == '*' and name[-1] == '*':
name = self._features.get(name[1:-1])
if name is None:
- raise ValueError("known special feature", match.start(2))
+ raise ValueError('known special feature', match.start(2))
# Check if this feature has a value already.
if name in fstruct:
- raise ValueError("new name", match.start(2))
+ raise ValueError('new name', match.start(2))
# Boolean value ("+name" or "-name")
- if match.group(1) == "+":
+ if match.group(1) == '+':
value = True
- if match.group(1) == "-":
+ if match.group(1) == '-':
value = False
# Reentrance link ("-> (target)")
position = match.end()
match = self._TARGET_RE.match(s, position)
if not match:
- raise ValueError("identifier", position)
+ raise ValueError('identifier', position)
target = match.group(1)
if target not in reentrances:
- raise ValueError("bound identifier", position)
+ raise ValueError('bound identifier', position)
position = match.end()
value = reentrances[target]
value, position = self._read_value(name, s, position, reentrances)
# None of the above: error.
else:
- raise ValueError("equals sign", position)
+ raise ValueError('equals sign', position)
# Store the value.
fstruct[name] = value
# Otherwise, there should be a comma
match = self._COMMA_RE.match(s, position)
if match is None:
- raise ValueError("comma", position)
+ raise ValueError('comma', position)
position = match.end()
# We never saw a close bracket.
- raise ValueError("close bracket", position)
+ raise ValueError('close bracket', position)
def _finalize(self, s, pos, reentrances, fstruct):
"""
if match:
handler_func = getattr(self, handler)
return handler_func(s, position, reentrances, match)
- raise ValueError("value", position)
+ raise ValueError('value', position)
def _error(self, s, expected, position):
- lines = s.split("\n")
+ lines = s.split('\n')
while position > len(lines[0]):
position -= len(lines.pop(0)) + 1 # +1 for the newline.
estr = (
- "Error parsing feature structure\n "
+ 'Error parsing feature structure\n '
+ lines[0]
- + "\n "
- + " " * position
- + "^ "
- + "Expected %s" % expected
+ + '\n '
+ + ' ' * position
+ + '^ '
+ + 'Expected %s' % expected
)
raise ValueError(estr)
#: the string position where the value ended. (n.b.: order is
#: important here!)
VALUE_HANDLERS = [
- ("read_fstruct_value", _START_FSTRUCT_RE),
- ("read_var_value", re.compile(r"\?[a-zA-Z_][a-zA-Z0-9_]*")),
- ("read_str_value", re.compile("[uU]?[rR]?(['\"])")),
- ("read_int_value", re.compile(r"-?\d+")),
- ("read_sym_value", re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*")),
+ ('read_fstruct_value', _START_FSTRUCT_RE),
+ ('read_var_value', re.compile(r'\?[a-zA-Z_][a-zA-Z0-9_]*')),
+ ('read_str_value', re.compile("[uU]?[rR]?(['\"])")),
+ ('read_int_value', re.compile(r'-?\d+')),
+ ('read_sym_value', re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*')),
(
- "read_app_value",
- re.compile(r"<(app)\((\?[a-z][a-z]*)\s*," r"\s*(\?[a-z][a-z]*)\)>"),
+ 'read_app_value',
+ re.compile(r'<(app)\((\?[a-z][a-z]*)\s*,' r'\s*(\?[a-z][a-z]*)\)>'),
),
# ('read_logic_value', re.compile(r'<([^>]*)>')),
# lazily match any character after '<' until we hit a '>' not preceded by '-'
- ("read_logic_value", re.compile(r"<(.*?)(?<!-)>")),
- ("read_set_value", re.compile(r"{")),
- ("read_tuple_value", re.compile(r"\(")),
+ ('read_logic_value', re.compile(r'<(.*?)(?<!-)>')),
+ ('read_set_value', re.compile(r'{')),
+ ('read_tuple_value', re.compile(r'\(')),
]
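As a brief sketch of the value syntax these handlers accept, the reader behind FeatStruct parses strings like the ones used in the demo further below (illustrative only):

from nltk import FeatStruct

fs1 = FeatStruct('[agr=[number=sing, gender=masc]]')  # nested feature structure
fs2 = FeatStruct('[subj=[agr=(1)[]], agr->(1)]')      # reentrance: (1) ... ->(1)
fs3 = FeatStruct('[obj=?x]')                          # variable value
print(fs1)
print(fs2)
print(fs3)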
def read_fstruct_value(self, s, position, reentrances, match):
def read_var_value(self, s, position, reentrances, match):
return Variable(match.group()), match.end()
- _SYM_CONSTS = {"None": None, "True": True, "False": False}
+ _SYM_CONSTS = {'None': None, 'True': True, 'False': False}
def read_sym_value(self, s, position, reentrances, match):
val, end = match.group(), match.end()
def read_app_value(self, s, position, reentrances, match):
"""Mainly included for backwards compat."""
- return self._logic_parser.parse("%s(%s)" % match.group(2, 3)), match.end()
+ return self._logic_parser.parse('%s(%s)' % match.group(2, 3)), match.end()
def read_logic_value(self, s, position, reentrances, match):
try:
raise ValueError()
return expr, match.end()
except ValueError:
- raise ValueError("logic expression", match.start(1))
+ raise ValueError('logic expression', match.start(1))
def read_tuple_value(self, s, position, reentrances, match):
return self._read_seq_value(
- s, position, reentrances, match, ")", FeatureValueTuple, FeatureValueConcat
+ s, position, reentrances, match, ')', FeatureValueTuple, FeatureValueConcat
)
def read_set_value(self, s, position, reentrances, match):
return self._read_seq_value(
- s, position, reentrances, match, "}", FeatureValueSet, FeatureValueUnion
+ s, position, reentrances, match, '}', FeatureValueSet, FeatureValueUnion
)
def _read_seq_value(
cp = re.escape(close_paren)
position = match.end()
        # Special syntax for empty tuples:
- m = re.compile(r"\s*/?\s*%s" % cp).match(s, position)
+ m = re.compile(r'\s*/?\s*%s' % cp).match(s, position)
if m:
return seq_class(), m.end()
# Read values:
seen_plus = False
while True:
# Close paren: return value.
- m = re.compile(r"\s*%s" % cp).match(s, position)
+ m = re.compile(r'\s*%s' % cp).match(s, position)
if m:
if seen_plus:
return plus_class(values), m.end()
values.append(val)
# Comma or looking at close paren
- m = re.compile(r"\s*(,|\+|(?=%s))\s*" % cp).match(s, position)
+ m = re.compile(r'\s*(,|\+|(?=%s))\s*' % cp).match(s, position)
if not m:
raise ValueError("',' or '+' or '%s'" % cp, position)
- if m.group(1) == "+":
+ if m.group(1) == '+':
seen_plus = True
position = m.end()
######################################################################
-def display_unification(fs1, fs2, indent=" "):
+def display_unification(fs1, fs2, indent=' '):
# Print the two input feature structures, side by side.
- fs1_lines = ("%s" % fs1).split("\n")
- fs2_lines = ("%s" % fs2).split("\n")
+ fs1_lines = ("%s" % fs1).split('\n')
+ fs2_lines = ("%s" % fs2).split('\n')
if len(fs1_lines) > len(fs2_lines):
- blankline = "[" + " " * (len(fs2_lines[0]) - 2) + "]"
+ blankline = '[' + ' ' * (len(fs2_lines[0]) - 2) + ']'
fs2_lines += [blankline] * len(fs1_lines)
else:
- blankline = "[" + " " * (len(fs1_lines[0]) - 2) + "]"
+ blankline = '[' + ' ' * (len(fs1_lines[0]) - 2) + ']'
fs1_lines += [blankline] * len(fs2_lines)
for (fs1_line, fs2_line) in zip(fs1_lines, fs2_lines):
- print(indent + fs1_line + " " + fs2_line)
- print(indent + "-" * len(fs1_lines[0]) + " " + "-" * len(fs2_lines[0]))
+ print(indent + fs1_line + ' ' + fs2_line)
+ print(indent + '-' * len(fs1_lines[0]) + ' ' + '-' * len(fs2_lines[0]))
linelen = len(fs1_lines[0]) * 2 + 3
- print(indent + "| |".center(linelen))
- print(indent + "+-----UNIFY-----+".center(linelen))
- print(indent + "|".center(linelen))
- print(indent + "V".center(linelen))
+ print(indent + '| |'.center(linelen))
+ print(indent + '+-----UNIFY-----+'.center(linelen))
+ print(indent + '|'.center(linelen))
+ print(indent + 'V'.center(linelen))
bindings = {}
result = fs1.unify(fs2, bindings)
if result is None:
- print(indent + "(FAILED)".center(linelen))
+ print(indent + '(FAILED)'.center(linelen))
else:
print(
- "\n".join(indent + l.center(linelen) for l in ("%s" % result).split("\n"))
+ '\n'.join(indent + l.center(linelen) for l in ("%s" % result).split('\n'))
)
if bindings and len(bindings.bound_variables()) > 0:
print(repr(bindings).center(linelen))
def interactive_demo(trace=False):
import random, sys
- HELP = """
+ HELP = '''
1-%d: Select the corresponding feature structure
q: Quit
t: Turn tracing on or off
l: List all feature structures
?: Help
- """
+ '''
print(
- """
+ '''
This demo will repeatedly present you with a list of feature
structures, and ask you to choose two for unification. Whenever a
new feature structure is generated, it is added to the list of
random subset for you to choose between at a given time. If you
want to see the complete lists, type "l". For a list of valid
commands, type "?".
- """
+ '''
)
print('Press "Enter" to continue...')
sys.stdin.readline()
fstruct_strings = [
- "[agr=[number=sing, gender=masc]]",
- "[agr=[gender=masc, person=3]]",
- "[agr=[gender=fem, person=3]]",
- "[subj=[agr=(1)[]], agr->(1)]",
- "[obj=?x]",
- "[subj=?x]",
- "[/=None]",
- "[/=NP]",
- "[cat=NP]",
- "[cat=VP]",
- "[cat=PP]",
- "[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]",
- "[gender=masc, agr=?C]",
- "[gender=?S, agr=[gender=?S,person=3]]",
+ '[agr=[number=sing, gender=masc]]',
+ '[agr=[gender=masc, person=3]]',
+ '[agr=[gender=fem, person=3]]',
+ '[subj=[agr=(1)[]], agr->(1)]',
+ '[obj=?x]',
+ '[subj=?x]',
+ '[/=None]',
+ '[/=NP]',
+ '[cat=NP]',
+ '[cat=VP]',
+ '[cat=PP]',
+ '[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]',
+ '[gender=masc, agr=?C]',
+ '[gender=?S, agr=[gender=?S,person=3]]',
]
all_fstructs = [
def list_fstructs(fstructs):
for i, fstruct in fstructs:
print()
- lines = ("%s" % fstruct).split("\n")
- print("%3d: %s" % (i + 1, lines[0]))
+ lines = ("%s" % fstruct).split('\n')
+ print('%3d: %s' % (i + 1, lines[0]))
for line in lines[1:]:
- print(" " + line)
+ print(' ' + line)
print()
while True:
else:
fstructs = all_fstructs
- print("_" * 75)
+ print('_' * 75)
- print("Choose two feature structures to unify:")
+ print('Choose two feature structures to unify:')
list_fstructs(fstructs)
selected = [None, None]
- for (nth, i) in (("First", 0), ("Second", 1)):
+ for (nth, i) in (('First', 0), ('Second', 1)):
while selected[i] is None:
print(
(
- "%s feature structure (1-%d,q,t,l,?): "
+ '%s feature structure (1-%d,q,t,l,?): '
% (nth, len(all_fstructs))
),
- end=" ",
+ end=' ',
)
try:
input = sys.stdin.readline().strip()
- if input in ("q", "Q", "x", "X"):
+ if input in ('q', 'Q', 'x', 'X'):
return
- if input in ("t", "T"):
+ if input in ('t', 'T'):
trace = not trace
- print(" Trace = %s" % trace)
+ print(' Trace = %s' % trace)
continue
- if input in ("h", "H", "?"):
+ if input in ('h', 'H', '?'):
print(HELP % len(fstructs))
continue
- if input in ("l", "L"):
+ if input in ('l', 'L'):
list_fstructs(all_fstructs)
continue
num = int(input) - 1
selected[i] = all_fstructs[num][1]
print()
except:
- print("Bad sentence number")
+ print('Bad sentence number')
continue
if trace:
print('\nType "Enter" to continue unifying; or "q" to quit.')
input = sys.stdin.readline().strip()
- if input in ("q", "Q", "x", "X"):
+ if input in ('q', 'Q', 'x', 'X'):
return
# processor breaks with values like '3rd'
fstruct_strings = [
- "[agr=[number=sing, gender=masc]]",
- "[agr=[gender=masc, person=3]]",
- "[agr=[gender=fem, person=3]]",
- "[subj=[agr=(1)[]], agr->(1)]",
- "[obj=?x]",
- "[subj=?x]",
- "[/=None]",
- "[/=NP]",
- "[cat=NP]",
- "[cat=VP]",
- "[cat=PP]",
- "[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]",
- "[gender=masc, agr=?C]",
- "[gender=?S, agr=[gender=?S,person=3]]",
+ '[agr=[number=sing, gender=masc]]',
+ '[agr=[gender=masc, person=3]]',
+ '[agr=[gender=fem, person=3]]',
+ '[subj=[agr=(1)[]], agr->(1)]',
+ '[obj=?x]',
+ '[subj=?x]',
+ '[/=None]',
+ '[/=NP]',
+ '[cat=NP]',
+ '[cat=VP]',
+ '[cat=PP]',
+ '[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]',
+ '[gender=masc, agr=?C]',
+ '[gender=?S, agr=[gender=?S,person=3]]',
]
all_fstructs = [FeatStruct(fss) for fss in fstruct_strings]
# MAX_CHOICES = 5
)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
__all__ = [
- "FeatStruct",
- "FeatDict",
- "FeatList",
- "unify",
- "subsumes",
- "conflicts",
- "Feature",
- "SlashFeature",
- "RangeFeature",
- "SLASH",
- "TYPE",
- "FeatStructReader",
+ 'FeatStruct',
+ 'FeatDict',
+ 'FeatList',
+ 'unify',
+ 'subsumes',
+ 'conflicts',
+ 'Feature',
+ 'SlashFeature',
+ 'RangeFeature',
+ 'SLASH',
+ 'TYPE',
+ 'FeatStructReader',
]
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Context Free Grammars
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# Jason Narad <jason.narad@gmail.com>
with the right hand side (*rhs*) in a tree (*tree*) is known as
"expanding" *lhs* to *rhs* in *tree*.
"""
+from __future__ import print_function, unicode_literals, division
+
import re
from functools import total_ordering
+from six import string_types
+
from nltk.util import transitive_closure, invert_graph
+from nltk.compat import python_2_unicode_compatible, unicode_repr
from nltk.internals import raise_unorderable_types
from nltk.probability import ImmutableProbabilisticMixIn
@total_ordering
+@python_2_unicode_compatible
class Nonterminal(object):
"""
A non-terminal symbol for a context free grammar. ``Nonterminal``
:rtype: str
"""
- if isinstance(self._symbol, str):
- return "%s" % self._symbol
+ if isinstance(self._symbol, string_types):
+ return '%s' % self._symbol
else:
- return "%s" % repr(self._symbol)
+ return '%s' % unicode_repr(self._symbol)
def __str__(self):
"""
:rtype: str
"""
- if isinstance(self._symbol, str):
- return "%s" % self._symbol
+ if isinstance(self._symbol, string_types):
+ return '%s' % self._symbol
else:
- return "%s" % repr(self._symbol)
+ return '%s' % unicode_repr(self._symbol)
def __div__(self, rhs):
"""
:type rhs: Nonterminal
:rtype: Nonterminal
"""
- return Nonterminal("%s/%s" % (self._symbol, rhs._symbol))
+ return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))
def __truediv__(self, rhs):
"""
in the same order as the symbols names.
:rtype: list(Nonterminal)
"""
- if "," in symbols:
- symbol_list = symbols.split(",")
+ if ',' in symbols:
+ symbol_list = symbols.split(',')
else:
symbol_list = symbols.split()
return [Nonterminal(s.strip()) for s in symbol_list]
:rtype: bool
"""
- return hasattr(item, "__hash__") and not isinstance(item, Nonterminal)
+ return hasattr(item, '__hash__') and not isinstance(item, Nonterminal)
#################################################################
@total_ordering
-
+@python_2_unicode_compatible
class Production(object):
"""
A grammar production. Each production maps a single symbol
:param rhs: The right-hand side of the new ``Production``.
:type rhs: sequence(Nonterminal and terminal)
"""
- if isinstance(rhs, str):
+ if isinstance(rhs, string_types):
raise TypeError(
- "production right hand side should be a list, " "not a string"
+ 'production right hand side should be a list, ' 'not a string'
)
self._lhs = lhs
self._rhs = tuple(rhs)
:rtype: str
"""
- result = "%s -> " % repr(self._lhs)
- result += " ".join(repr(el) for el in self._rhs)
+ result = '%s -> ' % unicode_repr(self._lhs)
+ result += " ".join(unicode_repr(el) for el in self._rhs)
return result
def __repr__(self):
:rtype: str
"""
- return "%s" % self
+ return '%s' % self
def __eq__(self, other):
"""
return self._hash
-
+@python_2_unicode_compatible
class DependencyProduction(Production):
"""
A dependency grammar production. Each production maps a single
:rtype: str
"""
- result = "'%s' ->" % (self._lhs,)
+ result = '\'%s\' ->' % (self._lhs,)
for elt in self._rhs:
- result += " '%s'" % (elt,)
+ result += ' \'%s\'' % (elt,)
return result
-
+@python_2_unicode_compatible
class ProbabilisticProduction(Production, ImmutableProbabilisticMixIn):
"""
A probabilistic context free grammar production.
Production.__init__(self, lhs, rhs)
def __str__(self):
- return super().__str__() + (
- " [1.0]" if (self.prob() == 1.0) else " [%g]" % self.prob()
+ return Production.__unicode__(self) + (
+ ' [1.0]' if (self.prob() == 1.0) else ' [%g]' % self.prob()
)
def __eq__(self, other):
#################################################################
-
+@python_2_unicode_compatible
class CFG(object):
"""
A context-free grammar. A grammar consists of a start state and
"""
missing = [tok for tok in tokens if not self._lexical_index.get(tok)]
if missing:
- missing = ", ".join("%r" % (w,) for w in missing)
+ missing = ', '.join('%r' % (w,) for w in missing)
raise ValueError(
"Grammar does not cover some of the " "input words: %r." % missing
)
"""
return self.is_flexible_chomsky_normal_form() and self._all_unary_are_lexical
- def chomsky_normal_form(self, new_token_padding="@$@", flexible=False):
- """
-        Returns a new Grammar that is in Chomsky normal form.
-        :param new_token_padding:
-            Customise new rule formation during binarisation.
- """
- if self.is_chomsky_normal_form():
- return self
- if self.productions(empty=True):
- raise ValueError(
- ("Grammar has Empty rules. " "Cannot deal with them at the moment")
- )
-
- # check for mixed rules
- for rule in self.productions():
- if rule.is_lexical() and len(rule.rhs()) > 1:
- raise ValueError(
-                    "Cannot handle mixed rule {} => {}".format(rule.lhs(), rule.rhs())
- )
-
- step1 = CFG.eliminate_start(self)
- step2 = CFG.binarize(step1, new_token_padding)
- if flexible:
- return step2
- step3 = CFG.remove_unitary_rules(step2)
- return step3
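A minimal usage sketch for the conversion above; the toy grammar string is illustrative, not taken from this module:

from nltk import CFG

toy = CFG.fromstring("""
    S -> NP VP PP
    NP -> 'cats'
    VP -> 'sleep'
    PP -> 'soundly'
""")
cnf = toy.chomsky_normal_form()   # eliminate start, binarize, drop non-lexical unitary rules
for prod in cnf.productions():
    print(prod)   # the ternary rule becomes S -> NP S@$@NP and S@$@NP -> VP PP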
-
- @classmethod
- def remove_unitary_rules(cls, grammar):
- """
- Remove nonlexical unitary rules and convert them to
-        Remove nonlexical unitary rules and convert them to
-        lexical rules.
- result = []
- unitary = []
- for rule in grammar.productions():
- if len(rule) == 1 and rule.is_nonlexical():
- unitary.append(rule)
- else:
- result.append(rule)
-
- while unitary:
- rule = unitary.pop(0)
- for item in grammar.productions(lhs=rule.rhs()[0]):
- new_rule = Production(rule.lhs(), item.rhs())
- if len(new_rule) != 1 or new_rule.is_lexical():
- result.append(new_rule)
- else:
- unitary.append(new_rule)
-
- n_grammar = CFG(grammar.start(), result)
- return n_grammar
-
- @classmethod
- def binarize(cls, grammar, padding="@$@"):
- """
- Convert all non-binary rules into binary by introducing
- new tokens.
- Example::
- Original:
- A => B C D
- After Conversion:
- A => B A@$@B
- A@$@B => C D
- """
- result = []
-
- for rule in grammar.productions():
- if len(rule.rhs()) > 2:
- # this rule needs to be broken down
- left_side = rule.lhs()
- for k in range(0, len(rule.rhs()) - 2):
- tsym = rule.rhs()[k]
- new_sym = Nonterminal(left_side.symbol() + padding + tsym.symbol())
- new_production = Production(left_side, (tsym, new_sym))
- left_side = new_sym
- result.append(new_production)
- last_prd = Production(left_side, rule.rhs()[-2:])
- result.append(last_prd)
- else:
- result.append(rule)
-
- n_grammar = CFG(grammar.start(), result)
- return n_grammar
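An executable counterpart of the Example above (the grammar string is illustrative):

from nltk import CFG

g = CFG.fromstring("""
    A -> B C D
    B -> 'b'
    C -> 'c'
    D -> 'd'
""")
for prod in CFG.binarize(g).productions():
    print(prod)
# A -> B A@$@B
# A@$@B -> C D
# B -> 'b'
# C -> 'c'
# D -> 'd'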
-
- @classmethod
- def eliminate_start(cls, grammar):
- """
-        Eliminate the start symbol from right-hand sides.
-        Example: if S -> S0 S1 and S0 -> S1 S, then S appears on a RHS, so a new
-        rule S0_SIGMA -> S is added and S0_SIGMA becomes the start symbol.
- """
- start = grammar.start()
- result = []
- need_to_add = None
- for rule in grammar.productions():
- if start in rule.rhs():
- need_to_add = True
- result.append(rule)
- if need_to_add:
- start = Nonterminal("S0_SIGMA")
- result.append(Production(start, [grammar.start()]))
- n_grammar = CFG(start, result)
- return n_grammar
- return grammar
-
def __repr__(self):
- return "<Grammar with %d productions>" % len(self._productions)
+ return '<Grammar with %d productions>' % len(self._productions)
def __str__(self):
- result = "Grammar with %d productions" % len(self._productions)
- result += " (start state = %r)" % self._start
+ result = 'Grammar with %d productions' % len(self._productions)
+ result += ' (start state = %r)' % self._start
for production in self._productions:
- result += "\n %s" % production
+ result += '\n %s' % production
return result
)
elif logic_parser is not None:
raise Exception(
- "'logic_parser' and 'fstruct_reader' must " "not both be set"
+ '\'logic_parser\' and \'fstruct_reader\' must ' 'not both be set'
)
start, productions = read_grammar(
@total_ordering
-
+@python_2_unicode_compatible
class FeatureValueType(object):
"""
A helper class for ``FeatureGrammars``, designed to be different
self._hash = hash(value)
def __repr__(self):
- return "<%s>" % self._value
+ return '<%s>' % self._value
def __eq__(self, other):
return type(self) == type(other) and self._value == other._value
return self._hash
-
+@python_2_unicode_compatible
class DependencyGrammar(object):
"""
A dependency grammar. A DependencyGrammar consists of a set of
@classmethod
def fromstring(cls, input):
productions = []
- for linenum, line in enumerate(input.split("\n")):
+ for linenum, line in enumerate(input.split('\n')):
line = line.strip()
- if line.startswith("#") or line == "":
+ if line.startswith('#') or line == '':
continue
try:
productions += _read_dependency_production(line)
except ValueError:
- raise ValueError("Unable to parse line %s: %s" % (linenum, line))
+ raise ValueError('Unable to parse line %s: %s' % (linenum, line))
if len(productions) == 0:
- raise ValueError("No productions found!")
+ raise ValueError('No productions found!')
return cls(productions)
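A small usage sketch for fromstring; the productions are illustrative:

from nltk.grammar import DependencyGrammar

dg = DependencyGrammar.fromstring("""
    'taught' -> 'play' | 'man'
    'man' -> 'the'
    'play' -> 'golf' | 'to'
""")
print(dg.contains('taught', 'play'))   # True
print(dg)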
def contains(self, head, mod):
:rtype: str
"""
- str = "Dependency grammar with %d productions" % len(self._productions)
+ str = 'Dependency grammar with %d productions' % len(self._productions)
for production in self._productions:
- str += "\n %s" % production
+ str += '\n %s' % production
return str
def __repr__(self):
"""
Return a concise string representation of the ``DependencyGrammar``
"""
- return "Dependency grammar with %d productions" % len(self._productions)
-
+ return 'Dependency grammar with %d productions' % len(self._productions)
+@python_2_unicode_compatible
class ProbabilisticDependencyGrammar(object):
"""
:rtype: str
"""
- str = "Statistical dependency grammar with %d productions" % len(
+ str = 'Statistical dependency grammar with %d productions' % len(
self._productions
)
for production in self._productions:
- str += "\n %s" % production
- str += "\nEvents:"
+ str += '\n %s' % production
+ str += '\nEvents:'
for event in self._events:
- str += "\n %d:%s" % (self._events[event], event)
- str += "\nTags:"
+ str += '\n %d:%s' % (self._events[event], event)
+ str += '\nTags:'
for tag_word in self._tags:
- str += "\n %s:\t(%s)" % (tag_word, self._tags[tag_word])
+ str += '\n %s:\t(%s)' % (tag_word, self._tags[tag_word])
return str
def __repr__(self):
"""
Return a concise string representation of the ``ProbabilisticDependencyGrammar``
"""
- return "Statistical Dependency grammar with %d productions" % len(
+ return 'Statistical Dependency grammar with %d productions' % len(
self._productions
)
# Parsing generic grammars
-_ARROW_RE = re.compile(r"\s* -> \s*", re.VERBOSE)
-_PROBABILITY_RE = re.compile(r"( \[ [\d\.]+ \] ) \s*", re.VERBOSE)
+_ARROW_RE = re.compile(r'\s* -> \s*', re.VERBOSE)
+_PROBABILITY_RE = re.compile(r'( \[ [\d\.]+ \] ) \s*', re.VERBOSE)
_TERMINAL_RE = re.compile(r'( "[^"]+" | \'[^\']+\' ) \s*', re.VERBOSE)
-_DISJUNCTION_RE = re.compile(r"\| \s*", re.VERBOSE)
+_DISJUNCTION_RE = re.compile(r'\| \s*', re.VERBOSE)
def _read_production(line, nonterm_parser, probabilistic=False):
# Skip over the arrow.
m = _ARROW_RE.match(line, pos)
if not m:
- raise ValueError("Expected an arrow")
+ raise ValueError('Expected an arrow')
pos = m.end()
# Parse the right hand side.
probabilities[-1] = float(m.group(1)[1:-1])
if probabilities[-1] > 1.0:
raise ValueError(
- "Production probability %f, "
- "should not be greater than 1.0" % (probabilities[-1],)
+ 'Production probability %f, '
+ 'should not be greater than 1.0' % (probabilities[-1],)
)
# String -- add terminal.
- elif line[pos] in "'\"":
+ elif line[pos] in "\'\"":
m = _TERMINAL_RE.match(line, pos)
if not m:
- raise ValueError("Unterminated string")
+ raise ValueError('Unterminated string')
rhsides[-1].append(m.group(1)[1:-1])
pos = m.end()
# Vertical bar -- start new rhside.
- elif line[pos] == "|":
+ elif line[pos] == '|':
m = _DISJUNCTION_RE.match(line, pos)
probabilities.append(0.0)
rhsides.append([])
"""
if encoding is not None:
input = input.decode(encoding)
- if isinstance(input, str):
- lines = input.split("\n")
+ if isinstance(input, string_types):
+ lines = input.split('\n')
else:
lines = input
start = None
productions = []
- continue_line = ""
+ continue_line = ''
for linenum, line in enumerate(lines):
line = continue_line + line.strip()
- if line.startswith("#") or line == "":
+ if line.startswith('#') or line == '':
continue
- if line.endswith("\\"):
- continue_line = line[:-1].rstrip() + " "
+ if line.endswith('\\'):
+ continue_line = line[:-1].rstrip() + ' '
continue
- continue_line = ""
+ continue_line = ''
try:
- if line[0] == "%":
+ if line[0] == '%':
directive, args = line[1:].split(None, 1)
- if directive == "start":
+ if directive == 'start':
start, pos = nonterm_parser(args, 0)
if pos != len(args):
- raise ValueError("Bad argument to start directive")
+ raise ValueError('Bad argument to start directive')
else:
- raise ValueError("Bad directive")
+ raise ValueError('Bad directive')
else:
# expand out the disjunctions on the RHS
productions += _read_production(line, nonterm_parser, probabilistic)
except ValueError as e:
- raise ValueError("Unable to parse line %s: %s\n%s" % (linenum + 1, line, e))
+ raise ValueError('Unable to parse line %s: %s\n%s' % (linenum + 1, line, e))
if not productions:
- raise ValueError("No productions found!")
+ raise ValueError('No productions found!')
if not start:
start = productions[0].lhs()
return (start, productions)
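The directive and comment handling above can be exercised through CFG.fromstring; the grammar text is illustrative:

from nltk import CFG

g = CFG.fromstring("""
    % start S
    # comment lines and blank lines are skipped
    S -> NP VP
    NP -> 'cats'
    VP -> 'sleep'
""")
print(g.start())             # S
print(len(g.productions()))  # 3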
-_STANDARD_NONTERM_RE = re.compile("( [\w/][\w/^<>-]* ) \s*", re.VERBOSE)
+_STANDARD_NONTERM_RE = re.compile('( [\w/][\w/^<>-]* ) \s*', re.VERBOSE)
def standard_nonterm_parser(string, pos):
m = _STANDARD_NONTERM_RE.match(string, pos)
if not m:
- raise ValueError("Expected a nonterminal, found: " + string[pos:])
+ raise ValueError('Expected a nonterminal, found: ' + string[pos:])
return (Nonterminal(m.group(1)), m.end())
#################################################################
_READ_DG_RE = re.compile(
- r"""^\s* # leading whitespace
+ r'''^\s* # leading whitespace
('[^']+')\s* # single-quoted lhs
(?:[-=]+>)\s* # arrow
(?:( # rhs:
| \| # disjunction
)
\s*) # trailing space
- *$""", # zero or more copies
+ *$''', # zero or more copies
re.VERBOSE,
)
-_SPLIT_DG_RE = re.compile(r"""('[^']'|[-=]+>|"[^"]+"|'[^']+'|\|)""")
+_SPLIT_DG_RE = re.compile(r'''('[^']'|[-=]+>|"[^"]+"|'[^']+'|\|)''')
def _read_dependency_production(s):
if not _READ_DG_RE.match(s):
- raise ValueError("Bad production string")
+ raise ValueError('Bad production string')
pieces = _SPLIT_DG_RE.split(s)
pieces = [p for i, p in enumerate(pieces) if i % 2 == 1]
- lhside = pieces[0].strip("'\"")
+ lhside = pieces[0].strip('\'\"')
rhsides = [[]]
for piece in pieces[2:]:
- if piece == "|":
+ if piece == '|':
rhsides.append([])
else:
- rhsides[-1].append(piece.strip("'\""))
+ rhsides[-1].append(piece.strip('\'\"'))
return [DependencyProduction(lhside, rhside) for rhside in rhsides]
from nltk import nonterminals, Production, CFG
# Create some nonterminals
- S, NP, VP, PP = nonterminals("S, NP, VP, PP")
- N, V, P, Det = nonterminals("N, V, P, Det")
+ S, NP, VP, PP = nonterminals('S, NP, VP, PP')
+ N, V, P, Det = nonterminals('N, V, P, Det')
VP_slash_NP = VP / NP
- print("Some nonterminals:", [S, NP, VP, PP, N, V, P, Det, VP / NP])
- print(" S.symbol() =>", repr(S.symbol()))
+ print('Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP / NP])
+ print(' S.symbol() =>', repr(S.symbol()))
print()
print(Production(S, [NP]))
"""
)
- print("A Grammar:", repr(grammar))
- print(" grammar.start() =>", repr(grammar.start()))
- print(" grammar.productions() =>", end=" ")
+ print('A Grammar:', repr(grammar))
+ print(' grammar.start() =>', repr(grammar.start()))
+ print(' grammar.productions() =>', end=' ')
    # Use string.replace(...) to line-wrap the output.
- print(repr(grammar.productions()).replace(",", ",\n" + " " * 25))
+ print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 25))
print()
pcfg_prods = toy_pcfg1.productions()
pcfg_prod = pcfg_prods[2]
- print("A PCFG production:", repr(pcfg_prod))
- print(" pcfg_prod.lhs() =>", repr(pcfg_prod.lhs()))
- print(" pcfg_prod.rhs() =>", repr(pcfg_prod.rhs()))
- print(" pcfg_prod.prob() =>", repr(pcfg_prod.prob()))
+ print('A PCFG production:', repr(pcfg_prod))
+ print(' pcfg_prod.lhs() =>', repr(pcfg_prod.lhs()))
+ print(' pcfg_prod.rhs() =>', repr(pcfg_prod.rhs()))
+ print(' pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
print()
grammar = toy_pcfg2
- print("A PCFG grammar:", repr(grammar))
- print(" grammar.start() =>", repr(grammar.start()))
- print(" grammar.productions() =>", end=" ")
+ print('A PCFG grammar:', repr(grammar))
+ print(' grammar.start() =>', repr(grammar.start()))
+ print(' grammar.productions() =>', end=' ')
    # Use .replace(...) to line-wrap the output.
- print(repr(grammar.productions()).replace(",", ",\n" + " " * 26))
+ print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 26))
print()
# extract productions from three trees and induce the PCFG
productions += tree.productions()
- S = Nonterminal("S")
+ S = Nonterminal('S')
grammar = induce_pcfg(S, productions)
print(grammar)
print()
def fcfg_demo():
import nltk.data
- g = nltk.data.load("grammars/book_grammars/feat0.fcfg")
+ g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
print(g)
print()
sdg_demo()
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
__all__ = [
- "Nonterminal",
- "nonterminals",
- "CFG",
- "Production",
- "PCFG",
- "ProbabilisticProduction",
- "DependencyGrammar",
- "DependencyProduction",
- "ProbabilisticDependencyGrammar",
- "induce_pcfg",
- "read_grammar",
+ 'Nonterminal',
+ 'nonterminals',
+ 'CFG',
+ 'Production',
+ 'PCFG',
+ 'ProbabilisticProduction',
+ 'DependencyGrammar',
+ 'DependencyProduction',
+ 'ProbabilisticDependencyGrammar',
+ 'induce_pcfg',
+ 'read_grammar',
]
# Natural Language Toolkit (NLTK) Help
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Provide structured access to documentation.
"""
+from __future__ import print_function
import re
from textwrap import wrap
entry = tagdict[tag]
defn = [tag + ": " + entry[0]]
examples = wrap(
- entry[1], width=75, initial_indent=" ", subsequent_indent=" "
+ entry[1], width=75, initial_indent=' ', subsequent_indent=' '
)
print("\n".join(defn + examples))
print("No matching tags found.")
-if __name__ == "__main__":
- brown_tagset(r"NN.*")
- upenn_tagset(r".*\$")
- claws5_tagset("UNDEFINED")
- brown_tagset(r"NN")
+if __name__ == '__main__':
+ brown_tagset(r'NN.*')
+ upenn_tagset(r'.*\$')
+ claws5_tagset('UNDEFINED')
+ brown_tagset(r'NN')
# Natural Language Toolkit: Inference
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Dan Garrette <dhgarrette@gmail.com>
# Ewan Klein <ewan@inf.ed.ac.uk>
#
goal *G*, the model builder tries to find a counter-model, in the sense of a model that will satisfy
the assumptions plus the negation of *G*.
"""
+from __future__ import print_function
from abc import ABCMeta, abstractmethod
import threading
import time
+from six import add_metaclass
-class Prover(metaclass=ABCMeta):
+
+@add_metaclass(ABCMeta)
+class Prover(object):
"""
Interface for trying to prove a goal from assumptions. Both the goal and
the assumptions are constrained to be formulas of ``logic.Expression``.
"""
-class ModelBuilder(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class ModelBuilder(object):
"""
    Interface for trying to build a model of a set of formulas.
Open formulas are assumed to be universally quantified.
"""
-class TheoremToolCommand(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class TheoremToolCommand(object):
"""
This class holds a goal and a list of assumptions to be used in proving
or model building.
:return: str
"""
if self._result is None:
- raise LookupError("You have to call build_model() first to " "get a model!")
+ raise LookupError('You have to call build_model() first to ' 'get a model!')
else:
return self._decorate_model(self._model, format)
:return: str
"""
if self._result is None:
- raise LookupError("You have to call build_model() first to " "get a model!")
+ raise LookupError('You have to call build_model() first to ' 'get a model!')
else:
return self._decorate_model(self._model, format)
self._modelbuilder = modelbuilder
def _prove(self, goal=None, assumptions=None, verbose=False):
- return self._run(goal, assumptions, verbose), ""
+ return self._run(goal, assumptions, verbose), ''
def _build_model(self, goal=None, assumptions=None, verbose=False):
- return not self._run(goal, assumptions, verbose), ""
+ return not self._run(goal, assumptions, verbose), ''
def _run(self, goal, assumptions, verbose):
        # Set up two threads, Prover and ModelBuilder, to run in parallel
tp_thread = TheoremToolThread(
- lambda: self._prover.prove(goal, assumptions, verbose), verbose, "TP"
+ lambda: self._prover.prove(goal, assumptions, verbose), verbose, 'TP'
)
mb_thread = TheoremToolThread(
lambda: self._modelbuilder.build_model(goal, assumptions, verbose),
verbose,
- "MB",
+ 'MB',
)
tp_thread.start()
def _run(self, verbose):
        # Set up two threads, Prover and ModelBuilder, to run in parallel
tp_thread = TheoremToolThread(
- lambda: BaseProverCommand.prove(self, verbose), verbose, "TP"
+ lambda: BaseProverCommand.prove(self, verbose), verbose, 'TP'
)
mb_thread = TheoremToolThread(
- lambda: BaseModelBuilderCommand.build_model(self, verbose), verbose, "MB"
+ lambda: BaseModelBuilderCommand.build_model(self, verbose), verbose, 'MB'
)
tp_thread.start()
self._result = self._command()
if self._verbose:
print(
- "Thread %s finished with result %s at %s"
+ 'Thread %s finished with result %s at %s'
% (self._name, self._result, time.localtime(time.time()))
)
except Exception as e:
print(e)
- print("Thread %s completed abnormally" % (self._name))
+ print('Thread %s completed abnormally' % (self._name))
@property
def result(self):
(This is not intended to scale beyond very short discourses!) The method ``readings(filter=True)`` will only show
those threads which are consistent (taking into account any background assumptions).
"""
+from __future__ import print_function
import os
from abc import ABCMeta, abstractmethod
from operator import and_, add
from functools import reduce
+from six import add_metaclass
from nltk.data import show_cfg
from nltk.tag import RegexpTagger
from nltk.inference.prover9 import Prover9Command
-class ReadingCommand(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class ReadingCommand(object):
@abstractmethod
def parse_to_readings(self, sentence):
"""
:type gramfile: str
"""
self._gramfile = (
- gramfile if gramfile else "grammars/book_grammars/discourse.fcfg"
+ gramfile if gramfile else 'grammars/book_grammars/discourse.fcfg'
)
self._parser = load_parser(self._gramfile)
"""
if semtype_file is None:
semtype_file = os.path.join(
- "grammars", "sample_grammars", "drt_glue.semtype"
+ 'grammars', 'sample_grammars', 'drt_glue.semtype'
)
self._glue = DrtGlue(
semtype_file=semtype_file,
:type background: list(Expression)
"""
self._input = input
- self._sentences = dict([("s%s" % i, sent) for i, sent in enumerate(input)])
+ self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(input)])
self._models = None
self._readings = {}
self._reading_command = (
self._input.append(sentence)
self._sentences = dict(
- [("s%s" % i, sent) for i, sent in enumerate(self._input)]
+ [('s%s' % i, sent) for i, sent in enumerate(self._input)]
)
        # check whether adding the new sentence to the discourse preserves
        # consistency (i.e. a model can be found for the combined set of assumptions)
self.sentences()
return None
self._sentences = dict(
- [("s%s" % i, sent) for i, sent in enumerate(self._input)]
+ [('s%s' % i, sent) for i, sent in enumerate(self._input)]
)
self.readings(verbose=False)
if verbose:
else:
for sid in sorted(self._readings):
print()
- print("%s readings:" % sid)
+ print('%s readings:' % sid)
print() #'-' * 30
for rid in sorted(self._readings[sid]):
lf = self._readings[sid][rid]
for tid in sorted(threads):
if show_thread_readings:
readings = [
- self._readings[rid.split("-")[0]][rid] for rid in self._threads[tid]
+ self._readings[rid.split('-')[0]][rid] for rid in self._threads[tid]
]
try:
thread_reading = (
% self._reading_command.combine_readings(readings).normalize()
)
except Exception as e:
- thread_reading = ": INVALID: %s" % e.__class__.__name__
+ thread_reading = ': INVALID: %s' % e.__class__.__name__
else:
- thread_reading = ""
+ thread_reading = ''
print("%s:" % tid, self._threads[tid], thread_reading)
return [
(rid, self._readings[sid][rid])
for rid in threads[thread_id]
- for sid in rid.split("-")[:1]
+ for sid in rid.split('-')[:1]
]
###############################
print(a)
spacer(80)
if modelfound:
- print(mb.model(format="cooked"))
+ print(mb.model(format='cooked'))
else:
print("No model found!\n")
return results
return result
+# multiply = DiscourseTester.multiply
+# L1 = [['A'], ['B']]
+# L2 = ['a', 'b', 'c']
+# print multiply(L1,L2)
+
+
def load_fol(s):
"""
Temporarily duplicated from ``nltk.sem.util``.
statements = []
for linenum, line in enumerate(s.splitlines()):
line = line.strip()
- if line.startswith("#") or line == "":
+ if line.startswith('#') or line == '':
continue
try:
statements.append(Expression.fromstring(line))
except Exception:
- raise ValueError("Unable to parse line %s: %s" % (linenum, line))
+ raise ValueError('Unable to parse line %s: %s' % (linenum, line))
return statements
Illustrate the various methods of ``DiscourseTester``
"""
dt = DiscourseTester(
- ["A boxer walks", "Every boxer chases a girl"], reading_command
+ ['A boxer walks', 'Every boxer chases a girl'], reading_command
)
dt.models()
print()
print()
dt.readings(threaded=True)
print()
- dt.models("d1")
- dt.add_sentence("John is a boxer")
+ dt.models('d1')
+ dt.add_sentence('John is a boxer')
print()
dt.sentences()
print()
dt.readings(threaded=True)
print()
dt = DiscourseTester(
- ["A student dances", "Every student is a person"], reading_command
+ ['A student dances', 'Every student is a person'], reading_command
)
print()
- dt.add_sentence("No person dances", consistchk=True)
+ dt.add_sentence('No person dances', consistchk=True)
print()
dt.readings()
print()
- dt.retract_sentence("No person dances", verbose=True)
+ dt.retract_sentence('No person dances', verbose=True)
print()
dt.models()
print()
- dt.readings("A person dances")
+ dt.readings('A person dances')
print()
- dt.add_sentence("A person dances", informchk=True)
+ dt.add_sentence('A person dances', informchk=True)
dt = DiscourseTester(
- ["Vincent is a boxer", "Fido is a boxer", "Vincent is married", "Fido barks"],
+ ['Vincent is a boxer', 'Fido is a boxer', 'Vincent is married', 'Fido barks'],
reading_command,
)
dt.readings(filter=True)
import nltk.data
- background_file = os.path.join("grammars", "book_grammars", "background.fol")
+ background_file = os.path.join('grammars', 'book_grammars', 'background.fol')
background = nltk.data.load(background_file)
print()
"""
Illustrate the various methods of ``DiscourseTester``
"""
- dt = DiscourseTester(["every dog chases a boy", "he runs"], reading_command)
+ dt = DiscourseTester(['every dog chases a boy', 'he runs'], reading_command)
dt.models()
print()
dt.sentences()
def spacer(num=30):
- print("-" * num)
+ print('-' * num)
def demo():
tagger = RegexpTagger(
[
- ("^(chases|runs)$", "VB"),
- ("^(a)$", "ex_quant"),
- ("^(every)$", "univ_quant"),
- ("^(dog|boy)$", "NN"),
- ("^(he)$", "PRP"),
+ ('^(chases|runs)$', 'VB'),
+ ('^(a)$', 'ex_quant'),
+ ('^(every)$', 'univ_quant'),
+ ('^(dog|boy)$', 'NN'),
+ ('^(he)$', 'PRP'),
]
)
depparser = MaltParser(tagger=tagger)
)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
"""
A model builder that makes use of the external 'Mace4' package.
"""
+from __future__ import print_function
import os
import tempfile
@property
def valuation(mbc):
- return mbc.model("valuation")
+ return mbc.model('valuation')
def _convert2val(self, valuation_str):
"""
:return: A model if one is generated; None otherwise.
:rtype: sem.Valuation
"""
- valuation_standard_format = self._transform_output(valuation_str, "standard")
+ valuation_standard_format = self._transform_output(valuation_str, 'standard')
val = []
for line in valuation_standard_format.splitlines(False):
l = line.strip()
- if l.startswith("interpretation"):
+ if l.startswith('interpretation'):
# find the number of entities in the model
- num_entities = int(l[l.index("(") + 1 : l.index(",")].strip())
+ num_entities = int(l[l.index('(') + 1 : l.index(',')].strip())
- elif l.startswith("function") and l.find("_") == -1:
+ elif l.startswith('function') and l.find('_') == -1:
# replace the integer identifier with a corresponding alphabetic character
- name = l[l.index("(") + 1 : l.index(",")].strip()
+ name = l[l.index('(') + 1 : l.index(',')].strip()
if is_indvar(name):
name = name.upper()
- value = int(l[l.index("[") + 1 : l.index("]")].strip())
+ value = int(l[l.index('[') + 1 : l.index(']')].strip())
val.append((name, MaceCommand._make_model_var(value)))
- elif l.startswith("relation"):
- l = l[l.index("(") + 1 :]
- if "(" in l:
+ elif l.startswith('relation'):
+ l = l[l.index('(') + 1 :]
+ if '(' in l:
# relation is not nullary
- name = l[: l.index("(")].strip()
+ name = l[: l.index('(')].strip()
values = [
int(v.strip())
- for v in l[l.index("[") + 1 : l.index("]")].split(",")
+ for v in l[l.index('[') + 1 : l.index(']')].split(',')
]
val.append(
(name, MaceCommand._make_relation_set(num_entities, values))
)
else:
# relation is nullary
- name = l[: l.index(",")].strip()
- value = int(l[l.index("[") + 1 : l.index("]")].strip())
+ name = l[: l.index(',')].strip()
+ value = int(l[l.index('[') + 1 : l.index(']')].strip())
val.append((name, value == 1))
return Valuation(val)
:type value: int
"""
letter = [
- "a",
- "b",
- "c",
- "d",
- "e",
- "f",
- "g",
- "h",
- "i",
- "j",
- "k",
- "l",
- "m",
- "n",
- "o",
- "p",
- "q",
- "r",
- "s",
- "t",
- "u",
- "v",
- "w",
- "x",
- "y",
- "z",
+ 'a',
+ 'b',
+ 'c',
+ 'd',
+ 'e',
+ 'f',
+ 'g',
+ 'h',
+ 'i',
+ 'j',
+ 'k',
+ 'l',
+ 'm',
+ 'n',
+ 'o',
+ 'p',
+ 'q',
+ 'r',
+ 's',
+ 't',
+ 'u',
+ 'v',
+ 'w',
+ 'x',
+ 'y',
+ 'z',
][value]
num = value // 26
return letter + str(num) if num > 0 else letter
"""
if not format:
return valuation_str
- elif format == "valuation":
+ elif format == 'valuation':
return self._convert2val(valuation_str)
else:
return self._transform_output(valuation_str, format)
:type format: str
"""
if format in [
- "standard",
- "standard2",
- "portable",
- "tabular",
- "raw",
- "cooked",
- "xml",
- "tex",
+ 'standard',
+ 'standard2',
+ 'portable',
+ 'tabular',
+ 'raw',
+ 'cooked',
+ 'xml',
+ 'tex',
]:
return self._call_interpformat(valuation_str, [format])[0]
else:
"""
if self._interpformat_bin is None:
self._interpformat_bin = self._modelbuilder._find_binary(
- "interpformat", verbose
+ 'interpformat', verbose
)
return self._modelbuilder._call(
:see: ``config_prover9``
"""
if self._mace4_bin is None:
- self._mace4_bin = self._find_binary("mace4", verbose)
+ self._mace4_bin = self._find_binary('mace4', verbose)
- updated_input_str = ""
+ updated_input_str = ''
if self._end_size > 0:
- updated_input_str += "assign(end_size, %d).\n\n" % self._end_size
+ updated_input_str += 'assign(end_size, %d).\n\n' % self._end_size
updated_input_str += input_str
return self._call(updated_input_str, self._mace4_bin, args, verbose)
def spacer(num=30):
- print("-" * num)
+ print('-' * num)
def decode_result(found):
:param found: The output of model_found()
:type found: bool
"""
- return {True: "Countermodel found", False: "No countermodel found", None: "None"}[
+ return {True: 'Countermodel found', False: 'No countermodel found', None: 'None'}[
found
]
m = MaceCommand(g, assumptions=alist, max_models=50)
found = m.build_model()
for a in alist:
- print(" %s" % a)
- print("|- %s: %s\n" % (g, decode_result(found)))
+ print(' %s' % a)
+ print('|- %s: %s\n' % (g, decode_result(found)))
def test_build_model(arguments):
"""
Try to build a ``nltk.sem.Valuation``.
"""
- g = Expression.fromstring("all x.man(x)")
+ g = Expression.fromstring('all x.man(x)')
alist = [
Expression.fromstring(a)
for a in [
- "man(John)",
- "man(Socrates)",
- "man(Bill)",
- "some x.(-(x = John) & man(x) & sees(John,x))",
- "some x.(-(x = Bill) & man(x))",
- "all x.some y.(man(x) -> gives(Socrates,x,y))",
+ 'man(John)',
+ 'man(Socrates)',
+ 'man(Bill)',
+ 'some x.(-(x = John) & man(x) & sees(John,x))',
+ 'some x.(-(x = Bill) & man(x))',
+ 'all x.some y.(man(x) -> gives(Socrates,x,y))',
]
]
print("Assumptions and Goal")
spacer()
for a in alist:
- print(" %s" % a)
- print("|- %s: %s\n" % (g, decode_result(m.build_model())))
+ print(' %s' % a)
+ print('|- %s: %s\n' % (g, decode_result(m.build_model())))
spacer()
- # print(m.model('standard'))
- # print(m.model('cooked'))
+ # print m.model('standard')
+ # print m.model('cooked')
print("Valuation")
spacer()
- print(m.valuation, "\n")
+ print(m.valuation, '\n')
def test_transform_output(argument_pair):
m = MaceCommand(g, assumptions=alist)
m.build_model()
for a in alist:
- print(" %s" % a)
- print("|- %s: %s\n" % (g, m.build_model()))
- for format in ["standard", "portable", "xml", "cooked"]:
+ print(' %s' % a)
+ print('|- %s: %s\n' % (g, m.build_model()))
+ for format in ['standard', 'portable', 'xml', 'cooked']:
spacer()
print("Using '%s' format" % format)
spacer()
def test_make_relation_set():
print(
MaceCommand._make_relation_set(num_entities=3, values=[1, 0, 1])
- == set([("c",), ("a",)])
+ == set([('c',), ('a',)])
)
print(
MaceCommand._make_relation_set(
num_entities=3, values=[0, 0, 0, 0, 0, 0, 1, 0, 0]
)
- == set([("c", "a")])
+ == set([('c', 'a')])
)
print(
MaceCommand._make_relation_set(num_entities=2, values=[0, 0, 1, 0, 0, 0, 1, 0])
- == set([("a", "b", "a"), ("b", "b", "a")])
+ == set([('a', 'b', 'a'), ('b', 'b', 'a')])
)
arguments = [
- ("mortal(Socrates)", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]),
- ("(not mortal(Socrates))", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]),
+ ('mortal(Socrates)', ['all x.(man(x) -> mortal(x))', 'man(Socrates)']),
+ ('(not mortal(Socrates))', ['all x.(man(x) -> mortal(x))', 'man(Socrates)']),
]
test_transform_output(arguments[1])
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
#
# Author: Daniel H. Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
this module are based on "Logical Foundations of Artificial Intelligence" by
Michael R. Genesereth and Nils J. Nilsson.
"""
+from __future__ import print_function, unicode_literals
from collections import defaultdict
from functools import reduce
)
from nltk.inference.api import Prover, ProverCommandDecorator
+from nltk.compat import python_2_unicode_compatible
class ProverParseError(Exception):
predDict[func1].validate_sig_len(sig)
+@python_2_unicode_compatible
class PredHolder(object):
"""
This class will be used by a dictionary that will store information
raise Exception("Signature lengths do not match")
def __str__(self):
- return "(%s,%s,%s)" % (self.signatures, self.properties, self.signature_len)
+ return '(%s,%s,%s)' % (self.signatures, self.properties, self.signature_len)
def __repr__(self):
return "%s" % self
def closed_domain_demo():
lexpr = Expression.fromstring
- p1 = lexpr(r"exists x.walk(x)")
- p2 = lexpr(r"man(Socrates)")
- c = lexpr(r"walk(Socrates)")
+ p1 = lexpr(r'exists x.walk(x)')
+ p2 = lexpr(r'man(Socrates)')
+ c = lexpr(r'walk(Socrates)')
prover = Prover9Command(c, [p1, p2])
print(prover.prove())
cdp = ClosedDomainProver(prover)
- print("assumptions:")
+ print('assumptions:')
for a in cdp.assumptions():
- print(" ", a)
- print("goal:", cdp.goal())
+ print(' ', a)
+ print('goal:', cdp.goal())
print(cdp.prove())
- p1 = lexpr(r"exists x.walk(x)")
- p2 = lexpr(r"man(Socrates)")
- p3 = lexpr(r"-walk(Bill)")
- c = lexpr(r"walk(Socrates)")
+ p1 = lexpr(r'exists x.walk(x)')
+ p2 = lexpr(r'man(Socrates)')
+ p3 = lexpr(r'-walk(Bill)')
+ c = lexpr(r'walk(Socrates)')
prover = Prover9Command(c, [p1, p2, p3])
print(prover.prove())
cdp = ClosedDomainProver(prover)
- print("assumptions:")
+ print('assumptions:')
for a in cdp.assumptions():
- print(" ", a)
- print("goal:", cdp.goal())
+ print(' ', a)
+ print('goal:', cdp.goal())
print(cdp.prove())
- p1 = lexpr(r"exists x.walk(x)")
- p2 = lexpr(r"man(Socrates)")
- p3 = lexpr(r"-walk(Bill)")
- c = lexpr(r"walk(Socrates)")
+ p1 = lexpr(r'exists x.walk(x)')
+ p2 = lexpr(r'man(Socrates)')
+ p3 = lexpr(r'-walk(Bill)')
+ c = lexpr(r'walk(Socrates)')
prover = Prover9Command(c, [p1, p2, p3])
print(prover.prove())
cdp = ClosedDomainProver(prover)
- print("assumptions:")
+ print('assumptions:')
for a in cdp.assumptions():
- print(" ", a)
- print("goal:", cdp.goal())
+ print(' ', a)
+ print('goal:', cdp.goal())
print(cdp.prove())
- p1 = lexpr(r"walk(Socrates)")
- p2 = lexpr(r"walk(Bill)")
- c = lexpr(r"all x.walk(x)")
+ p1 = lexpr(r'walk(Socrates)')
+ p2 = lexpr(r'walk(Bill)')
+ c = lexpr(r'all x.walk(x)')
prover = Prover9Command(c, [p1, p2])
print(prover.prove())
cdp = ClosedDomainProver(prover)
- print("assumptions:")
+ print('assumptions:')
for a in cdp.assumptions():
- print(" ", a)
- print("goal:", cdp.goal())
+ print(' ', a)
+ print('goal:', cdp.goal())
print(cdp.prove())
- p1 = lexpr(r"girl(mary)")
- p2 = lexpr(r"dog(rover)")
- p3 = lexpr(r"all x.(girl(x) -> -dog(x))")
- p4 = lexpr(r"all x.(dog(x) -> -girl(x))")
- p5 = lexpr(r"chase(mary, rover)")
- c = lexpr(r"exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))")
+ p1 = lexpr(r'girl(mary)')
+ p2 = lexpr(r'dog(rover)')
+ p3 = lexpr(r'all x.(girl(x) -> -dog(x))')
+ p4 = lexpr(r'all x.(dog(x) -> -girl(x))')
+ p5 = lexpr(r'chase(mary, rover)')
+ c = lexpr(r'exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))')
prover = Prover9Command(c, [p1, p2, p3, p4, p5])
print(prover.prove())
cdp = ClosedDomainProver(prover)
- print("assumptions:")
+ print('assumptions:')
for a in cdp.assumptions():
- print(" ", a)
- print("goal:", cdp.goal())
+ print(' ', a)
+ print('goal:', cdp.goal())
print(cdp.prove())
def unique_names_demo():
lexpr = Expression.fromstring
- p1 = lexpr(r"man(Socrates)")
- p2 = lexpr(r"man(Bill)")
- c = lexpr(r"exists x.exists y.(x != y)")
+ p1 = lexpr(r'man(Socrates)')
+ p2 = lexpr(r'man(Bill)')
+ c = lexpr(r'exists x.exists y.(x != y)')
prover = Prover9Command(c, [p1, p2])
print(prover.prove())
unp = UniqueNamesProver(prover)
- print("assumptions:")
+ print('assumptions:')
for a in unp.assumptions():
- print(" ", a)
- print("goal:", unp.goal())
+ print(' ', a)
+ print('goal:', unp.goal())
print(unp.prove())
- p1 = lexpr(r"all x.(walk(x) -> (x = Socrates))")
- p2 = lexpr(r"Bill = William")
- p3 = lexpr(r"Bill = Billy")
- c = lexpr(r"-walk(William)")
+ p1 = lexpr(r'all x.(walk(x) -> (x = Socrates))')
+ p2 = lexpr(r'Bill = William')
+ p3 = lexpr(r'Bill = Billy')
+ c = lexpr(r'-walk(William)')
prover = Prover9Command(c, [p1, p2, p3])
print(prover.prove())
unp = UniqueNamesProver(prover)
- print("assumptions:")
+ print('assumptions:')
for a in unp.assumptions():
- print(" ", a)
- print("goal:", unp.goal())
+ print(' ', a)
+ print('goal:', unp.goal())
print(unp.prove())
def closed_world_demo():
lexpr = Expression.fromstring
- p1 = lexpr(r"walk(Socrates)")
- p2 = lexpr(r"(Socrates != Bill)")
- c = lexpr(r"-walk(Bill)")
+ p1 = lexpr(r'walk(Socrates)')
+ p2 = lexpr(r'(Socrates != Bill)')
+ c = lexpr(r'-walk(Bill)')
prover = Prover9Command(c, [p1, p2])
print(prover.prove())
cwp = ClosedWorldProver(prover)
- print("assumptions:")
+ print('assumptions:')
for a in cwp.assumptions():
- print(" ", a)
- print("goal:", cwp.goal())
+ print(' ', a)
+ print('goal:', cwp.goal())
print(cwp.prove())
- p1 = lexpr(r"see(Socrates, John)")
- p2 = lexpr(r"see(John, Mary)")
- p3 = lexpr(r"(Socrates != John)")
- p4 = lexpr(r"(John != Mary)")
- c = lexpr(r"-see(Socrates, Mary)")
+ p1 = lexpr(r'see(Socrates, John)')
+ p2 = lexpr(r'see(John, Mary)')
+ p3 = lexpr(r'(Socrates != John)')
+ p4 = lexpr(r'(John != Mary)')
+ c = lexpr(r'-see(Socrates, Mary)')
prover = Prover9Command(c, [p1, p2, p3, p4])
print(prover.prove())
cwp = ClosedWorldProver(prover)
- print("assumptions:")
+ print('assumptions:')
for a in cwp.assumptions():
- print(" ", a)
- print("goal:", cwp.goal())
+ print(' ', a)
+ print('goal:', cwp.goal())
print(cwp.prove())
- p1 = lexpr(r"all x.(ostrich(x) -> bird(x))")
- p2 = lexpr(r"bird(Tweety)")
- p3 = lexpr(r"-ostrich(Sam)")
- p4 = lexpr(r"Sam != Tweety")
- c = lexpr(r"-bird(Sam)")
+ p1 = lexpr(r'all x.(ostrich(x) -> bird(x))')
+ p2 = lexpr(r'bird(Tweety)')
+ p3 = lexpr(r'-ostrich(Sam)')
+ p4 = lexpr(r'Sam != Tweety')
+ c = lexpr(r'-bird(Sam)')
prover = Prover9Command(c, [p1, p2, p3, p4])
print(prover.prove())
cwp = ClosedWorldProver(prover)
- print("assumptions:")
+ print('assumptions:')
for a in cwp.assumptions():
- print(" ", a)
- print("goal:", cwp.goal())
+ print(' ', a)
+ print('goal:', cwp.goal())
print(cwp.prove())
def combination_prover_demo():
lexpr = Expression.fromstring
- p1 = lexpr(r"see(Socrates, John)")
- p2 = lexpr(r"see(John, Mary)")
- c = lexpr(r"-see(Socrates, Mary)")
+ p1 = lexpr(r'see(Socrates, John)')
+ p2 = lexpr(r'see(John, Mary)')
+ c = lexpr(r'-see(Socrates, Mary)')
prover = Prover9Command(c, [p1, p2])
print(prover.prove())
command = ClosedDomainProver(UniqueNamesProver(ClosedWorldProver(prover)))
premises = []
# define taxonomy
- premises.append(lexpr(r"all x.(elephant(x) -> animal(x))"))
- premises.append(lexpr(r"all x.(bird(x) -> animal(x))"))
- premises.append(lexpr(r"all x.(dove(x) -> bird(x))"))
- premises.append(lexpr(r"all x.(ostrich(x) -> bird(x))"))
- premises.append(lexpr(r"all x.(flying_ostrich(x) -> ostrich(x))"))
+ premises.append(lexpr(r'all x.(elephant(x) -> animal(x))'))
+ premises.append(lexpr(r'all x.(bird(x) -> animal(x))'))
+ premises.append(lexpr(r'all x.(dove(x) -> bird(x))'))
+ premises.append(lexpr(r'all x.(ostrich(x) -> bird(x))'))
+ premises.append(lexpr(r'all x.(flying_ostrich(x) -> ostrich(x))'))
# default properties
premises.append(
- lexpr(r"all x.((animal(x) & -Ab1(x)) -> -fly(x))")
+ lexpr(r'all x.((animal(x) & -Ab1(x)) -> -fly(x))')
) # normal animals don't fly
premises.append(
- lexpr(r"all x.((bird(x) & -Ab2(x)) -> fly(x))")
+ lexpr(r'all x.((bird(x) & -Ab2(x)) -> fly(x))')
) # normal birds fly
premises.append(
- lexpr(r"all x.((ostrich(x) & -Ab3(x)) -> -fly(x))")
+ lexpr(r'all x.((ostrich(x) & -Ab3(x)) -> -fly(x))')
) # normal ostriches don't fly
# specify abnormal entities
- premises.append(lexpr(r"all x.(bird(x) -> Ab1(x))")) # flight
- premises.append(lexpr(r"all x.(ostrich(x) -> Ab2(x))")) # non-flying bird
- premises.append(lexpr(r"all x.(flying_ostrich(x) -> Ab3(x))")) # flying ostrich
+ premises.append(lexpr(r'all x.(bird(x) -> Ab1(x))')) # flight
+ premises.append(lexpr(r'all x.(ostrich(x) -> Ab2(x))')) # non-flying bird
+ premises.append(lexpr(r'all x.(flying_ostrich(x) -> Ab3(x))')) # flying ostrich
# define entities
- premises.append(lexpr(r"elephant(E)"))
- premises.append(lexpr(r"dove(D)"))
- premises.append(lexpr(r"ostrich(O)"))
+ premises.append(lexpr(r'elephant(E)'))
+ premises.append(lexpr(r'dove(D)'))
+ premises.append(lexpr(r'ostrich(O)'))
# print the assumptions
prover = Prover9Command(None, premises)
for a in command.assumptions():
print(a)
- print_proof("-fly(E)", premises)
- print_proof("fly(D)", premises)
- print_proof("-fly(O)", premises)
+ print_proof('-fly(E)', premises)
+ print_proof('fly(D)', premises)
+ print_proof('-fly(O)', premises)
def print_proof(goal, premises):
default_reasoning_demo()
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Interface to the Prover9 Theorem Prover
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Dan Garrette <dhgarrette@gmail.com>
# Ewan Klein <ewan@inf.ed.ac.uk>
#
"""
A theorem prover that makes use of the external 'Prover9' package.
"""
+from __future__ import print_function
import os
import subprocess
and generating prover9-style input files from them.
"""
- def print_assumptions(self, output_format="nltk"):
+ def print_assumptions(self, output_format='nltk'):
"""
Print the list of the current assumptions.
"""
- if output_format.lower() == "nltk":
+ if output_format.lower() == 'nltk':
for a in self.assumptions():
print(a)
- elif output_format.lower() == "prover9":
+ elif output_format.lower() == 'prover9':
for a in convert_to_prover9(self.assumptions()):
print(a)
else:
:see BaseProverCommand.decorate_proof()
"""
if simplify:
- return self._prover._call_prooftrans(proof_string, ["striplabels"])[
+ return self._prover._call_prooftrans(proof_string, ['striplabels'])[
0
].rstrip()
else:
self._binary_location = None
self._prover9_bin = None
else:
- name = "prover9"
+ name = 'prover9'
self._prover9_bin = nltk.internals.find_binary(
name,
path_to_bin=binary_location,
- env_vars=["PROVER9"],
- url="http://www.cs.unm.edu/~mccune/prover9/",
- binary_names=[name, name + ".exe"],
+ env_vars=['PROVER9'],
+ url='http://www.cs.unm.edu/~mccune/prover9/',
+ binary_names=[name, name + '.exe'],
verbose=verbose,
)
self._binary_location = self._prover9_bin.rsplit(os.path.sep, 1)
prover9 binary. This string is formed based on the goal,
assumptions, and timeout value of this object.
"""
- s = ""
+ s = ''
if assumptions:
- s += "formulas(assumptions).\n"
+ s += 'formulas(assumptions).\n'
for p9_assumption in convert_to_prover9(assumptions):
- s += " %s.\n" % p9_assumption
- s += "end_of_list.\n\n"
+ s += ' %s.\n' % p9_assumption
+ s += 'end_of_list.\n\n'
if goal:
- s += "formulas(goals).\n"
- s += " %s.\n" % convert_to_prover9(goal)
- s += "end_of_list.\n\n"
+ s += 'formulas(goals).\n'
+ s += ' %s.\n' % convert_to_prover9(goal)
+ s += 'end_of_list.\n\n'
return s
for the prover9 executables.
"""
return [
- "/usr/local/bin/prover9",
- "/usr/local/bin/prover9/bin",
- "/usr/local/bin",
- "/usr/bin",
- "/usr/local/prover9",
- "/usr/local/share/prover9",
+ '/usr/local/bin/prover9',
+ '/usr/local/bin/prover9/bin',
+ '/usr/local/bin',
+ '/usr/bin',
+ '/usr/local/prover9',
+ '/usr/local/share/prover9',
]
def _find_binary(self, name, verbose=False):
return nltk.internals.find_binary(
name,
searchpath=binary_locations,
- env_vars=["PROVER9"],
- url="http://www.cs.unm.edu/~mccune/prover9/",
- binary_names=[name, name + ".exe"],
+ env_vars=['PROVER9'],
+ url='http://www.cs.unm.edu/~mccune/prover9/',
+ binary_names=[name, name + '.exe'],
verbose=verbose,
)
:see: ``config_prover9``
"""
if verbose:
- print("Calling:", binary)
- print("Args:", args)
- print("Input:\n", input_str, "\n")
+ print('Calling:', binary)
+ print('Args:', args)
+ print('Input:\n', input_str, '\n')
# Call prover9 via a subprocess
cmd = [binary] + args
(stdout, stderr) = p.communicate(input=input_str)
if verbose:
- print("Return code:", p.returncode)
+ print('Return code:', p.returncode)
if stdout:
- print("stdout:\n", stdout, "\n")
+ print('stdout:\n', stdout, '\n')
if stderr:
- print("stderr:\n", stderr, "\n")
+ print('stderr:\n', stderr, '\n')
return (stdout.decode("utf-8"), p.returncode)
try:
result.append(_convert_to_prover9(s.simplify()))
except:
- print("input %s cannot be converted to Prover9 input syntax" % input)
+ print('input %s cannot be converted to Prover9 input syntax' % input)
raise
return result
else:
try:
return _convert_to_prover9(input.simplify())
except:
- print("input %s cannot be converted to Prover9 input syntax" % input)
+ print('input %s cannot be converted to Prover9 input syntax' % input)
raise
"""
if isinstance(expression, ExistsExpression):
return (
- "exists "
+ 'exists '
+ str(expression.variable)
- + " "
+ + ' '
+ _convert_to_prover9(expression.term)
)
elif isinstance(expression, AllExpression):
return (
- "all "
+ 'all '
+ str(expression.variable)
- + " "
+ + ' '
+ _convert_to_prover9(expression.term)
)
elif isinstance(expression, NegatedExpression):
- return "-(" + _convert_to_prover9(expression.term) + ")"
+ return '-(' + _convert_to_prover9(expression.term) + ')'
elif isinstance(expression, AndExpression):
return (
- "("
+ '('
+ _convert_to_prover9(expression.first)
- + " & "
+ + ' & '
+ _convert_to_prover9(expression.second)
- + ")"
+ + ')'
)
elif isinstance(expression, OrExpression):
return (
- "("
+ '('
+ _convert_to_prover9(expression.first)
- + " | "
+ + ' | '
+ _convert_to_prover9(expression.second)
- + ")"
+ + ')'
)
elif isinstance(expression, ImpExpression):
return (
- "("
+ '('
+ _convert_to_prover9(expression.first)
- + " -> "
+ + ' -> '
+ _convert_to_prover9(expression.second)
- + ")"
+ + ')'
)
elif isinstance(expression, IffExpression):
return (
- "("
+ '('
+ _convert_to_prover9(expression.first)
- + " <-> "
+ + ' <-> '
+ _convert_to_prover9(expression.second)
- + ")"
+ + ')'
)
elif isinstance(expression, EqualityExpression):
return (
- "("
+ '('
+ _convert_to_prover9(expression.first)
- + " = "
+ + ' = '
+ _convert_to_prover9(expression.second)
- + ")"
+ + ')'
)
else:
return str(expression)
"""
:see: Prover9Parent.prover9_input
"""
- s = "clear(auto_denials).\n" # only one proof required
+ s = 'clear(auto_denials).\n' # only one proof required
return s + Prover9Parent.prover9_input(self, goal, assumptions)
def _call_prover9(self, input_str, args=[], verbose=False):
:see: ``config_prover9``
"""
if self._prover9_bin is None:
- self._prover9_bin = self._find_binary("prover9", verbose)
+ self._prover9_bin = self._find_binary('prover9', verbose)
- updated_input_str = ""
+ updated_input_str = ''
if self._timeout > 0:
- updated_input_str += "assign(max_seconds, %d).\n\n" % self._timeout
+ updated_input_str += 'assign(max_seconds, %d).\n\n' % self._timeout
updated_input_str += input_str
stdout, returncode = self._call(
)
if returncode not in [0, 2]:
- errormsgprefix = "%%ERROR:"
+ errormsgprefix = '%%ERROR:'
if errormsgprefix in stdout:
msgstart = stdout.index(errormsgprefix)
errormsg = stdout[msgstart:].strip()
:see: ``config_prover9``
"""
if self._prooftrans_bin is None:
- self._prooftrans_bin = self._find_binary("prooftrans", verbose)
+ self._prooftrans_bin = self._find_binary('prooftrans', verbose)
return self._call(input_str, self._prooftrans_bin, args, verbose)
def __init__(self, returncode, message):
msg = p9_return_codes[returncode]
if message:
- msg += "\n%s" % message
+ msg += '\n%s' % message
Exception.__init__(self, msg)
def test_config():
- a = Expression.fromstring("(walk(j) & sing(j))")
- g = Expression.fromstring("walk(j)")
+ a = Expression.fromstring('(walk(j) & sing(j))')
+ g = Expression.fromstring('walk(j)')
p = Prover9Command(g, assumptions=[a])
p._executable_path = None
p.prover9_search = []
alist = [Expression.fromstring(a) for a in assumptions]
p = Prover9Command(g, assumptions=alist).prove()
for a in alist:
- print(" %s" % a)
- print("|- %s: %s\n" % (g, p))
+ print(' %s' % a)
+ print('|- %s: %s\n' % (g, p))
arguments = [
- ("(man(x) <-> (not (not man(x))))", []),
- ("(not (man(x) & (not man(x))))", []),
- ("(man(x) | (not man(x)))", []),
- ("(man(x) & (not man(x)))", []),
- ("(man(x) -> man(x))", []),
- ("(not (man(x) & (not man(x))))", []),
- ("(man(x) | (not man(x)))", []),
- ("(man(x) -> man(x))", []),
- ("(man(x) <-> man(x))", []),
- ("(not (man(x) <-> (not man(x))))", []),
- ("mortal(Socrates)", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]),
- ("((all x.(man(x) -> walks(x)) & man(Socrates)) -> some y.walks(y))", []),
- ("(all x.man(x) -> all x.man(x))", []),
- ("some x.all y.sees(x,y)", []),
+ ('(man(x) <-> (not (not man(x))))', []),
+ ('(not (man(x) & (not man(x))))', []),
+ ('(man(x) | (not man(x)))', []),
+ ('(man(x) & (not man(x)))', []),
+ ('(man(x) -> man(x))', []),
+ ('(not (man(x) & (not man(x))))', []),
+ ('(man(x) | (not man(x)))', []),
+ ('(man(x) -> man(x))', []),
+ ('(man(x) <-> man(x))', []),
+ ('(not (man(x) <-> (not man(x))))', []),
+ ('mortal(Socrates)', ['all x.(man(x) -> mortal(x))', 'man(Socrates)']),
+ ('((all x.(man(x) -> walks(x)) & man(Socrates)) -> some y.walks(y))', []),
+ ('(all x.man(x) -> all x.man(x))', []),
+ ('some x.all y.sees(x,y)', []),
(
- "some e3.(walk(e3) & subj(e3, mary))",
+ 'some e3.(walk(e3) & subj(e3, mary))',
[
- "some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))"
+ 'some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))'
],
),
(
- "some x e1.(see(e1) & subj(e1, x) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))",
+ 'some x e1.(see(e1) & subj(e1, x) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))',
[
- "some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))"
+ 'some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))'
],
),
]
expressions = [
- r"some x y.sees(x,y)",
- r"some x.(man(x) & walks(x))",
- r"\x.(man(x) & walks(x))",
- r"\x y.sees(x,y)",
- r"walks(john)",
- r"\x.big(x, \y.mouse(y))",
- r"(walks(x) & (runs(x) & (threes(x) & fours(x))))",
- r"(walks(x) -> runs(x))",
- r"some x.(PRO(x) & sees(John, x))",
- r"some x.(man(x) & (not walks(x)))",
- r"all x.(man(x) -> walks(x))",
+ r'some x y.sees(x,y)',
+ r'some x.(man(x) & walks(x))',
+ r'\x.(man(x) & walks(x))',
+ r'\x y.sees(x,y)',
+ r'walks(john)',
+ r'\x.big(x, \y.mouse(y))',
+ r'(walks(x) & (runs(x) & (threes(x) & fours(x))))',
+ r'(walks(x) -> runs(x))',
+ r'some x.(PRO(x) & sees(John, x))',
+ r'some x.(man(x) & (not walks(x)))',
+ r'all x.(man(x) -> walks(x))',
]
def spacer(num=45):
- print("-" * num)
+ print('-' * num)
def demo():
test_prove(arguments)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
"""
Module for a resolution-based First Order theorem prover.
"""
+from __future__ import print_function, unicode_literals
import operator
from collections import defaultdict
)
from nltk.inference.api import Prover, BaseProverCommand
+from nltk.compat import python_2_unicode_compatible
class ProverParseError(Exception):
class ResolutionProver(Prover):
- ANSWER_KEY = "ANSWER"
+ ANSWER_KEY = 'ANSWER'
_assume_false = True
def _prove(self, goal=None, assumptions=None, verbose=False):
print(ResolutionProverCommand._decorate_clauses(clauses))
except RuntimeError as e:
if self._assume_false and str(e).startswith(
- "maximum recursion depth exceeded"
+ 'maximum recursion depth exceeded'
):
result = False
clauses = []
"""
Decorate the proof output.
"""
- out = ""
+ out = ''
max_clause_len = max([len(str(clause)) for clause in clauses])
max_seq_len = len(str(len(clauses)))
for i in range(len(clauses)):
- parents = "A"
- taut = ""
+ parents = 'A'
+ taut = ''
if clauses[i].is_tautology():
- taut = "Tautology"
+ taut = 'Tautology'
if clauses[i]._parents:
parents = str(clauses[i]._parents)
- parents = " " * (max_clause_len - len(str(clauses[i])) + 1) + parents
- seq = " " * (max_seq_len - len(str(i + 1))) + str(i + 1)
- out += "[%s] %s %s %s\n" % (seq, clauses[i], parents, taut)
+ parents = ' ' * (max_clause_len - len(str(clauses[i])) + 1) + parents
+ seq = ' ' * (max_seq_len - len(str(i + 1))) + str(i + 1)
+ out += '[%s] %s %s %s\n' % (seq, clauses[i], parents, taut)
return out
+@python_2_unicode_compatible
class Clause(list):
def __init__(self, data):
list.__init__(self, data)
return Clause([atom.substitute_bindings(bindings) for atom in self])
def __str__(self):
- return "{" + ", ".join("%s" % item for item in self) + "}"
+ return '{' + ', '.join("%s" % item for item in self) + '}'
def __repr__(self):
return "%s" % self
"""
This method facilitates movement through the terms of 'self'
"""
- debug.line("unify(%s,%s) %s" % (first, second, bindings))
+ debug.line('unify(%s,%s) %s' % (first, second, bindings))
if not len(first) or not len(second): # if no more recursions can be performed
return finalize_method(first, second, bindings, used, skipped, debug)
"""
This method facilitates movement through the terms of 'other'
"""
- debug.line("unify(%s,%s) %s" % (first, second, bindings))
+ debug.line('unify(%s,%s) %s' % (first, second, bindings))
if not len(first) or not len(second): # if no more recursions can be performed
return finalize_method(first, second, bindings, used, skipped, debug)
def _complete_unify_path(first, second, bindings, used, skipped, debug):
if used[0] or used[1]: # if bindings were made along the path
newclause = Clause(skipped[0] + skipped[1] + first + second)
- debug.line(" -> New Clause: %s" % newclause)
+ debug.line(' -> New Clause: %s' % newclause)
return [newclause.substitute_bindings(bindings)]
else: # no bindings made means no unification occurred. so no result
- debug.line(" -> End")
+ debug.line(' -> End')
return []
raise ProverParseError()
+@python_2_unicode_compatible
class BindingDict(object):
def __init__(self, binding_list=None):
"""
self.d[binding.variable] = binding2
else:
raise BindingException(
- "Variable %s already bound to another " "value" % (variable)
+ 'Variable %s already bound to another ' 'value' % (variable)
)
else:
raise BindingException(
- "Variable %s already bound to another " "value" % (variable)
+ 'Variable %s already bound to another ' 'value' % (variable)
)
def __getitem__(self, variable):
return len(self.d)
def __str__(self):
- data_str = ", ".join("%s: %s" % (v, self.d[v]) for v in sorted(self.d.keys()))
- return "{" + data_str + "}"
+ data_str = ', '.join('%s: %s' % (v, self.d[v]) for v in sorted(self.d.keys()))
+ return '{' + data_str + '}'
def __repr__(self):
return "%s" % self
def line(self, line):
if self.enabled:
- print(" " * self.indent + line)
+ print(' ' * self.indent + line)
def testResolutionProver():
- resolution_test(r"man(x)")
- resolution_test(r"(man(x) -> man(x))")
- resolution_test(r"(man(x) -> --man(x))")
- resolution_test(r"-(man(x) and -man(x))")
- resolution_test(r"(man(x) or -man(x))")
- resolution_test(r"(man(x) -> man(x))")
- resolution_test(r"-(man(x) and -man(x))")
- resolution_test(r"(man(x) or -man(x))")
- resolution_test(r"(man(x) -> man(x))")
- resolution_test(r"(man(x) iff man(x))")
- resolution_test(r"-(man(x) iff -man(x))")
- resolution_test("all x.man(x)")
- resolution_test("-all x.some y.F(x,y) & some x.all y.(-F(x,y))")
- resolution_test("some x.all y.sees(x,y)")
-
- p1 = Expression.fromstring(r"all x.(man(x) -> mortal(x))")
- p2 = Expression.fromstring(r"man(Socrates)")
- c = Expression.fromstring(r"mortal(Socrates)")
- print("%s, %s |- %s: %s" % (p1, p2, c, ResolutionProver().prove(c, [p1, p2])))
-
- p1 = Expression.fromstring(r"all x.(man(x) -> walks(x))")
- p2 = Expression.fromstring(r"man(John)")
- c = Expression.fromstring(r"some y.walks(y)")
- print("%s, %s |- %s: %s" % (p1, p2, c, ResolutionProver().prove(c, [p1, p2])))
-
- p = Expression.fromstring(r"some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))")
- c = Expression.fromstring(r"some e0.walk(e0,mary)")
- print("%s |- %s: %s" % (p, c, ResolutionProver().prove(c, [p])))
+ resolution_test(r'man(x)')
+ resolution_test(r'(man(x) -> man(x))')
+ resolution_test(r'(man(x) -> --man(x))')
+ resolution_test(r'-(man(x) and -man(x))')
+ resolution_test(r'(man(x) or -man(x))')
+ resolution_test(r'(man(x) -> man(x))')
+ resolution_test(r'-(man(x) and -man(x))')
+ resolution_test(r'(man(x) or -man(x))')
+ resolution_test(r'(man(x) -> man(x))')
+ resolution_test(r'(man(x) iff man(x))')
+ resolution_test(r'-(man(x) iff -man(x))')
+ resolution_test('all x.man(x)')
+ resolution_test('-all x.some y.F(x,y) & some x.all y.(-F(x,y))')
+ resolution_test('some x.all y.sees(x,y)')
+
+ p1 = Expression.fromstring(r'all x.(man(x) -> mortal(x))')
+ p2 = Expression.fromstring(r'man(Socrates)')
+ c = Expression.fromstring(r'mortal(Socrates)')
+ print('%s, %s |- %s: %s' % (p1, p2, c, ResolutionProver().prove(c, [p1, p2])))
+
+ p1 = Expression.fromstring(r'all x.(man(x) -> walks(x))')
+ p2 = Expression.fromstring(r'man(John)')
+ c = Expression.fromstring(r'some y.walks(y)')
+ print('%s, %s |- %s: %s' % (p1, p2, c, ResolutionProver().prove(c, [p1, p2])))
+
+ p = Expression.fromstring(r'some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))')
+ c = Expression.fromstring(r'some e0.walk(e0,mary)')
+ print('%s |- %s: %s' % (p, c, ResolutionProver().prove(c, [p])))
def resolution_test(e):
f = Expression.fromstring(e)
t = ResolutionProver().prove(f)
- print("|- %s: %s" % (f, t))
+ print('|- %s: %s' % (f, t))
def test_clausify():
lexpr = Expression.fromstring
- print(clausify(lexpr("P(x) | Q(x)")))
- print(clausify(lexpr("(P(x) & Q(x)) | R(x)")))
- print(clausify(lexpr("P(x) | (Q(x) & R(x))")))
- print(clausify(lexpr("(P(x) & Q(x)) | (R(x) & S(x))")))
+ print(clausify(lexpr('P(x) | Q(x)')))
+ print(clausify(lexpr('(P(x) & Q(x)) | R(x)')))
+ print(clausify(lexpr('P(x) | (Q(x) & R(x))')))
+ print(clausify(lexpr('(P(x) & Q(x)) | (R(x) & S(x))')))
- print(clausify(lexpr("P(x) | Q(x) | R(x)")))
- print(clausify(lexpr("P(x) | (Q(x) & R(x)) | S(x)")))
+ print(clausify(lexpr('P(x) | Q(x) | R(x)')))
+ print(clausify(lexpr('P(x) | (Q(x) & R(x)) | S(x)')))
- print(clausify(lexpr("exists x.P(x) | Q(x)")))
+ print(clausify(lexpr('exists x.P(x) | Q(x)')))
- print(clausify(lexpr("-(-P(x) & Q(x))")))
- print(clausify(lexpr("P(x) <-> Q(x)")))
- print(clausify(lexpr("-(P(x) <-> Q(x))")))
- print(clausify(lexpr("-(all x.P(x))")))
- print(clausify(lexpr("-(some x.P(x))")))
+ print(clausify(lexpr('-(-P(x) & Q(x))')))
+ print(clausify(lexpr('P(x) <-> Q(x)')))
+ print(clausify(lexpr('-(P(x) <-> Q(x))')))
+ print(clausify(lexpr('-(all x.P(x))')))
+ print(clausify(lexpr('-(some x.P(x))')))
- print(clausify(lexpr("some x.P(x)")))
- print(clausify(lexpr("some x.all y.P(x,y)")))
- print(clausify(lexpr("all y.some x.P(x,y)")))
- print(clausify(lexpr("all z.all y.some x.P(x,y,z)")))
- print(clausify(lexpr("all x.(all y.P(x,y) -> -all y.(Q(x,y) -> R(x,y)))")))
+ print(clausify(lexpr('some x.P(x)')))
+ print(clausify(lexpr('some x.all y.P(x,y)')))
+ print(clausify(lexpr('all y.some x.P(x,y)')))
+ print(clausify(lexpr('all z.all y.some x.P(x,y,z)')))
+ print(clausify(lexpr('all x.(all y.P(x,y) -> -all y.(Q(x,y) -> R(x,y)))')))
def demo():
testResolutionProver()
print()
- p = Expression.fromstring("man(x)")
+ p = Expression.fromstring('man(x)')
print(ResolutionProverCommand(p, [p]).prove())
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: First-Order Tableau Theorem Prover
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Dan Garrette <dhgarrette@gmail.com>
#
# URL: <http://nltk.org/>
"""
Module for a tableau-based First Order theorem prover.
"""
+from __future__ import print_function, unicode_literals
from nltk.internals import Counter
result = self._attempt_proof(agenda, set(), set(), debugger)
except RuntimeError as e:
if self._assume_false and str(e).startswith(
- "maximum recursion depth exceeded"
+ 'maximum recursion depth exceeded'
):
result = False
else:
print(e)
else:
raise e
- return (result, "\n".join(debugger.lines))
+ return (result, '\n'.join(debugger.lines))
def _attempt_proof(self, agenda, accessible_vars, atoms, debug):
(current, context), category = agenda.pop_first()
# if there's nothing left in the agenda, and we haven't closed the path
if not current:
- debug.line("AGENDA EMPTY")
+ debug.line('AGENDA EMPTY')
return False
proof_method = {
):
# Check if the branch is closed. Return 'True' if it is
if (current, True) in atoms:
- debug.line("CLOSED", 1)
+ debug.line('CLOSED', 1)
return True
if context:
):
# Check if the branch is closed. Return 'True' if it is
if (current.term, False) in atoms:
- debug.line("CLOSED", 1)
+ debug.line('CLOSED', 1)
return True
if context:
):
# Check if the branch is closed. Return 'True' if it is
if (current, True) in atoms:
- debug.line("CLOSED", 1)
+ debug.line('CLOSED', 1)
return True
# mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars
):
# Check if the branch is closed. Return 'True' if it is
if (current.term, False) in atoms:
- debug.line("CLOSED", 1)
+ debug.line('CLOSED', 1)
return True
# mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars
for i, arg in enumerate(args):
if not TableauProver.is_atom(arg):
ctx = f
- nv = Variable("X%s" % _counter.get())
+ nv = Variable('X%s' % _counter.get())
for j, a in enumerate(args):
ctx = ctx(VariableExpression(nv)) if i == j else ctx(a)
if context:
ctx = LambdaExpression(nv, ctx)
agenda.put(arg, ctx)
return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
- raise Exception("If this method is called, there must be a non-atomic argument")
+ raise Exception('If this method is called, there must be a non-atomic argument')
def _attempt_proof_n_app(
self, current, context, agenda, accessible_vars, atoms, debug
for i, arg in enumerate(args):
if not TableauProver.is_atom(arg):
ctx = f
- nv = Variable("X%s" % _counter.get())
+ nv = Variable('X%s' % _counter.get())
for j, a in enumerate(args):
ctx = ctx(VariableExpression(nv)) if i == j else ctx(a)
if context:
ctx = LambdaExpression(nv, -ctx)
agenda.put(-arg, ctx)
return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
- raise Exception("If this method is called, there must be a non-atomic argument")
+ raise Exception('If this method is called, there must be a non-atomic argument')
def _attempt_proof_n_eq(
self, current, context, agenda, accessible_vars, atoms, debug
# Since 'current' is of type '~(a=b)', the path is closed if 'a' == 'b'
###########################################################################
if current.term.first == current.term.second:
- debug.line("CLOSED", 1)
+ debug.line('CLOSED', 1)
return True
agenda[Categories.N_EQ].add((current, context))
if bv_available:
variable_to_use = list(bv_available)[0]
- debug.line("--> Using '%s'" % variable_to_use, 2)
+ debug.line('--> Using \'%s\'' % variable_to_use, 2)
current._used_vars |= set([variable_to_use])
agenda.put(
current.term.replace(current.variable, variable_to_use), context
else:
# no more available variables to substitute
- debug.line("--> Variables Exhausted", 2)
+ debug.line('--> Variables Exhausted', 2)
current._exhausted = True
agenda[Categories.ALL].add((current, context))
return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
else:
new_unique_variable = VariableExpression(unique_variable())
- debug.line("--> Using '%s'" % new_unique_variable, 2)
+ debug.line('--> Using \'%s\'' % new_unique_variable, 2)
current._used_vars |= set([new_unique_variable])
agenda.put(
current.term.replace(current.variable, new_unique_variable), context
if isinstance(data, tuple):
ex, ctx = data
if ctx:
- data = "%s, %s" % (ex, ctx)
+ data = '%s, %s' % (ex, ctx)
else:
- data = "%s" % ex
+ data = '%s' % ex
if isinstance(ex, AllExpression):
try:
used_vars = "[%s]" % (
",".join("%s" % ve.variable.name for ve in ex._used_vars)
)
- data += ": %s" % used_vars
+ data += ': %s' % used_vars
except AttributeError:
- data += ": []"
+ data += ': []'
- newline = "%s%s" % (" " * (self.indent + indent), data)
+ newline = '%s%s' % (' ' * (self.indent + indent), data)
self.lines.append(newline)
if self.verbose:
def testTableauProver():
- tableau_test("P | -P")
- tableau_test("P & -P")
- tableau_test("Q", ["P", "(P -> Q)"])
- tableau_test("man(x)")
- tableau_test("(man(x) -> man(x))")
- tableau_test("(man(x) -> --man(x))")
- tableau_test("-(man(x) and -man(x))")
- tableau_test("(man(x) or -man(x))")
- tableau_test("(man(x) -> man(x))")
- tableau_test("-(man(x) and -man(x))")
- tableau_test("(man(x) or -man(x))")
- tableau_test("(man(x) -> man(x))")
- tableau_test("(man(x) iff man(x))")
- tableau_test("-(man(x) iff -man(x))")
- tableau_test("all x.man(x)")
- tableau_test("all x.all y.((x = y) -> (y = x))")
- tableau_test("all x.all y.all z.(((x = y) & (y = z)) -> (x = z))")
+ tableau_test('P | -P')
+ tableau_test('P & -P')
+ tableau_test('Q', ['P', '(P -> Q)'])
+ tableau_test('man(x)')
+ tableau_test('(man(x) -> man(x))')
+ tableau_test('(man(x) -> --man(x))')
+ tableau_test('-(man(x) and -man(x))')
+ tableau_test('(man(x) or -man(x))')
+ tableau_test('(man(x) -> man(x))')
+ tableau_test('-(man(x) and -man(x))')
+ tableau_test('(man(x) or -man(x))')
+ tableau_test('(man(x) -> man(x))')
+ tableau_test('(man(x) iff man(x))')
+ tableau_test('-(man(x) iff -man(x))')
+ tableau_test('all x.man(x)')
+ tableau_test('all x.all y.((x = y) -> (y = x))')
+ tableau_test('all x.all y.all z.(((x = y) & (y = z)) -> (x = z))')
# tableau_test('-all x.some y.F(x,y) & some x.all y.(-F(x,y))')
# tableau_test('some x.all y.sees(x,y)')
- p1 = "all x.(man(x) -> mortal(x))"
- p2 = "man(Socrates)"
- c = "mortal(Socrates)"
+ p1 = 'all x.(man(x) -> mortal(x))'
+ p2 = 'man(Socrates)'
+ c = 'mortal(Socrates)'
tableau_test(c, [p1, p2])
- p1 = "all x.(man(x) -> walks(x))"
- p2 = "man(John)"
- c = "some y.walks(y)"
+ p1 = 'all x.(man(x) -> walks(x))'
+ p2 = 'man(John)'
+ c = 'some y.walks(y)'
tableau_test(c, [p1, p2])
- p = "((x = y) & walks(y))"
- c = "walks(x)"
+ p = '((x = y) & walks(y))'
+ c = 'walks(x)'
tableau_test(c, [p])
- p = "((x = y) & ((y = z) & (z = w)))"
- c = "(x = w)"
+ p = '((x = y) & ((y = z) & (z = w)))'
+ c = '(x = w)'
tableau_test(c, [p])
- p = "some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))"
- c = "some e0.walk(e0,mary)"
+ p = 'some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))'
+ c = 'some e0.walk(e0,mary)'
tableau_test(c, [p])
- c = "(exists x.exists z3.((x = Mary) & ((z3 = John) & sees(z3,x))) <-> exists x.exists z4.((x = John) & ((z4 = Mary) & sees(x,z4))))"
+ c = '(exists x.exists z3.((x = Mary) & ((z3 = John) & sees(z3,x))) <-> exists x.exists z4.((x = John) & ((z4 = Mary) & sees(x,z4))))'
tableau_test(c)
def testHigherOrderTableauProver():
- tableau_test("believe(j, -lie(b))", ["believe(j, -lie(b) & -cheat(b))"])
- tableau_test("believe(j, lie(b) & cheat(b))", ["believe(j, lie(b))"])
+ tableau_test('believe(j, -lie(b))', ['believe(j, -lie(b) & -cheat(b))'])
+ tableau_test('believe(j, lie(b) & cheat(b))', ['believe(j, lie(b))'])
tableau_test(
- "believe(j, lie(b))", ["lie(b)"]
+ 'believe(j, lie(b))', ['lie(b)']
) # how do we capture that John believes all things that are true
tableau_test(
- "believe(j, know(b, cheat(b)))",
- ["believe(j, know(b, lie(b)) & know(b, steals(b) & cheat(b)))"],
+ 'believe(j, know(b, cheat(b)))',
+ ['believe(j, know(b, lie(b)) & know(b, steals(b) & cheat(b)))'],
)
- tableau_test("P(Q(y), R(y) & R(z))", ["P(Q(x) & Q(y), R(y) & R(z))"])
+ tableau_test('P(Q(y), R(y) & R(z))', ['P(Q(x) & Q(y), R(y) & R(z))'])
- tableau_test("believe(j, cheat(b) & lie(b))", ["believe(j, lie(b) & cheat(b))"])
- tableau_test("believe(j, -cheat(b) & -lie(b))", ["believe(j, -lie(b) & -cheat(b))"])
+ tableau_test('believe(j, cheat(b) & lie(b))', ['believe(j, lie(b) & cheat(b))'])
+ tableau_test('believe(j, -cheat(b) & -lie(b))', ['believe(j, -lie(b) & -cheat(b))'])
def tableau_test(c, ps=None, verbose=False):
if not ps:
ps = []
print(
- "%s |- %s: %s"
- % (", ".join(ps), pc, TableauProver().prove(pc, pps, verbose=verbose))
+ '%s |- %s: %s'
+ % (', '.join(ps), pc, TableauProver().prove(pc, pps, verbose=verbose))
)
testHigherOrderTableauProver()
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Internal utility functions
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# Nitin Madnani <nmadnani@ets.org>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
import subprocess
import os
import sys
import stat
import locale
-from xml.etree import ElementTree
+
+# Use the c version of ElementTree, which is faster, if possible:
+try:
+ from xml.etree import cElementTree as ElementTree
+except ImportError:
+ from xml.etree import ElementTree
+
+from six import string_types
+
+from nltk import compat
##########################################################################
# Java Via Command-Line
"""
global _java_bin, _java_options
_java_bin = find_binary(
- "java",
+ 'java',
bin,
- env_vars=["JAVAHOME", "JAVA_HOME"],
+ env_vars=['JAVAHOME', 'JAVA_HOME'],
verbose=verbose,
- binary_names=["java.exe"],
+ binary_names=['java.exe'],
)
if options is not None:
- if isinstance(options, str):
+ if isinstance(options, string_types):
options = options.split()
_java_options = list(options)
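A minimal usage sketch for this helper (the JVM path below is made up):

from nltk.internals import config_java

# Point NLTK at a specific java binary and give the JVM more heap before
# running any of the Java-backed tools.
config_java(bin='/usr/lib/jvm/java-8-openjdk/bin/java', options='-Xmx1g')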
standard input, standard output and standard error file
handles, respectively. Valid values are ``subprocess.PIPE``,
an existing file descriptor (a positive integer), an existing
- file object, 'pipe', 'stdout', 'devnull' and None. ``subprocess.PIPE`` indicates that a
+ file object, and None. ``subprocess.PIPE`` indicates that a
new pipe to the child should be created. With None, no
redirection will occur; the child's file handles will be
inherited from the parent. Additionally, stderr can be
:raise OSError: If the java command returns a nonzero return code.
"""
-
- subprocess_output_dict = {
- "pipe": subprocess.PIPE,
- "stdout": subprocess.STDOUT,
- "devnull": subprocess.DEVNULL,
- }
-
- stdin = subprocess_output_dict.get(stdin, stdin)
- stdout = subprocess_output_dict.get(stdout, stdout)
- stderr = subprocess_output_dict.get(stderr, stderr)
-
- if isinstance(cmd, str):
- raise TypeError("cmd should be a list of strings")
+ if stdin == 'pipe':
+ stdin = subprocess.PIPE
+ if stdout == 'pipe':
+ stdout = subprocess.PIPE
+ if stderr == 'pipe':
+ stderr = subprocess.PIPE
+ if isinstance(cmd, string_types):
+ raise TypeError('cmd should be a list of strings')
# Make sure we know where a java binary is.
if _java_bin is None:
config_java()
# Set up the classpath.
- if isinstance(classpath, str):
+ if isinstance(classpath, string_types):
classpaths = [classpath]
else:
classpaths = list(classpath)
# Construct the full command string.
cmd = list(cmd)
- cmd = ["-cp", classpath] + cmd
+ cmd = ['-cp', classpath] + cmd
cmd = [_java_bin] + _java_options + cmd
# Call java via a subprocess
# Check the return code.
if p.returncode != 0:
print(_decode_stdoutdata(stderr))
- raise OSError("Java command failed : " + str(cmd))
+ raise OSError('Java command failed : ' + str(cmd))
return (stdout, stderr)
# Read:
(a, b) = java(
[
- "weka.classifiers.bayes.NaiveBayes",
- "-l",
- "/tmp/names.model",
- "-T",
- "/tmp/test.arff",
- "-p",
- "0",
+ 'weka.classifiers.bayes.NaiveBayes',
+ '-l',
+ '/tmp/names.model',
+ '-T',
+ '/tmp/test.arff',
+ '-p',
+ '0',
], # , '-distribution'],
- classpath="/Users/edloper/Desktop/weka/weka.jar",
+ classpath='/Users/edloper/Desktop/weka/weka.jar',
)
self.position = position
def __str__(self):
- return "Expected %s at %s" % (self.expected, self.position)
+ return 'Expected %s at %s' % (self.expected, self.position)
_STRING_START_RE = re.compile(r"[uU]?[rR]?(\"\"\"|\'\'\'|\"|\')")
# Read the open quote, and any modifiers.
m = _STRING_START_RE.match(s, start_position)
if not m:
- raise ReadError("open quote", start_position)
+ raise ReadError('open quote', start_position)
quotemark = m.group(1)
# Find the close quote.
- _STRING_END_RE = re.compile(r"\\|%s" % quotemark)
+ _STRING_END_RE = re.compile(r'\\|%s' % quotemark)
position = m.end()
while True:
match = _STRING_END_RE.search(s, position)
if not match:
- raise ReadError("close quote", position)
- if match.group(0) == "\\":
+ raise ReadError('close quote', position)
+ if match.group(0) == '\\':
position = match.end() + 1
else:
break
try:
return eval(s[start_position : match.end()]), match.end()
except ValueError as e:
- raise ReadError("invalid string (%s)" % e)
+ raise ReadError('invalid string (%s)' % e)
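A quick sketch of how the helper behaves, following the code above (the value and end offset are what it computes):

from nltk.internals import read_str

# Parses the quoted literal starting at position 0 and returns the value
# together with the index just past the closing quote.
read_str('"Hello", World!', 0)   # -> ('Hello', 7)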
-_READ_INT_RE = re.compile(r"-?\d+")
+_READ_INT_RE = re.compile(r'-?\d+')
def read_int(s, start_position):
"""
m = _READ_INT_RE.match(s, start_position)
if not m:
- raise ReadError("integer", start_position)
+ raise ReadError('integer', start_position)
return int(m.group()), m.end()
-_READ_NUMBER_VALUE = re.compile(r"-?(\d*)([.]?\d*)?")
+_READ_NUMBER_VALUE = re.compile(r'-?(\d*)([.]?\d*)?')
def read_number(s, start_position):
"""
m = _READ_NUMBER_VALUE.match(s, start_position)
if not m or not (m.group(1) or m.group(2)):
- raise ReadError("number", start_position)
+ raise ReadError('number', start_position)
if m.group(2):
return float(m.group()), m.end()
else:
:type method: instance method
"""
- if isinstance(method, types.MethodType) and method.__self__.__class__ is not None:
+ # [xx] breaks on classic classes!
+ if isinstance(method, types.MethodType) and compat.get_im_class(method) is not None:
name = method.__name__
funcs = [
cls.__dict__[name]
- for cls in _mro(method.__self__.__class__)
+ for cls in _mro(compat.get_im_class(method))
if name in cls.__dict__
]
return len(funcs) > 1
else:
- raise TypeError("Expected an instance method.")
+ raise TypeError('Expected an instance method.')
def _mro(cls):
def _add_epytext_field(obj, field, message):
"""Add an epytext @field to a given object's docstring."""
- indent = ""
+ indent = ''
# If we already have a docstring, then add a blank line to separate
# it from the new field, and check its indentation.
if obj.__doc__:
- obj.__doc__ = obj.__doc__.rstrip() + "\n\n"
- indents = re.findall(r"(?<=\n)[ ]+(?!\s)", obj.__doc__.expandtabs())
+ obj.__doc__ = obj.__doc__.rstrip() + '\n\n'
+ indents = re.findall(r'(?<=\n)[ ]+(?!\s)', obj.__doc__.expandtabs())
if indents:
indent = min(indents)
# If we don't have a docstring, add an empty one.
else:
- obj.__doc__ = ""
+ obj.__doc__ = ''
obj.__doc__ += textwrap.fill(
- "@%s: %s" % (field, message),
+ '@%s: %s' % (field, message),
initial_indent=indent,
- subsequent_indent=indent + " ",
+ subsequent_indent=indent + ' ',
)
def decorator(func):
msg = "Function %s() has been deprecated. %s" % (func.__name__, message)
- msg = "\n" + textwrap.fill(msg, initial_indent=" ", subsequent_indent=" ")
+ msg = '\n' + textwrap.fill(msg, initial_indent=' ', subsequent_indent=' ')
def newFunc(*args, **kwargs):
warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
newFunc.__doc__ = func.__doc__
newFunc.__deprecated__ = True
# Add a @deprecated field to the docstring.
- _add_epytext_field(newFunc, "deprecated", message)
+ _add_epytext_field(newFunc, 'deprecated', message)
return newFunc
return decorator
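A usage sketch for the decorator (old_tokenize is a made-up function, not NLTK API):

from nltk.internals import deprecated

@deprecated("Use nltk.word_tokenize() instead.")
def old_tokenize(text):
    return text.split()

# Calling it emits a DeprecationWarning, and the message above is also
# appended to old_tokenize.__doc__ as an @deprecated field.
old_tokenize('a b c')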
if Deprecated in base.__bases__:
dep_cls = base
break
- assert dep_cls, "Unable to determine which base is deprecated."
+ assert dep_cls, 'Unable to determine which base is deprecated.'
# Construct an appropriate warning.
- doc = dep_cls.__doc__ or "".strip()
+ doc = (dep_cls.__doc__ or '').strip()
# If there's a @deprecated field, strip off the field marker.
- doc = re.sub(r"\A\s*@deprecated:", r"", doc)
+ doc = re.sub(r'\A\s*@deprecated:', r'', doc)
# Strip off any indentation.
- doc = re.sub(r"(?m)^\s*", "", doc)
+ doc = re.sub(r'(?m)^\s*', '', doc)
# Construct a 'name' string.
- name = "Class %s" % dep_cls.__name__
+ name = 'Class %s' % dep_cls.__name__
if cls != dep_cls:
- name += " (base class for %s)" % cls.__name__
+ name += ' (base class for %s)' % cls.__name__
# Put it all together.
- msg = "%s has been deprecated. %s" % (name, doc)
+ msg = '%s has been deprecated. %s' % (name, doc)
# Wrap it.
- msg = "\n" + textwrap.fill(msg, initial_indent=" ", subsequent_indent=" ")
+ msg = '\n' + textwrap.fill(msg, initial_indent=' ', subsequent_indent=' ')
warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
# Do the actual work of __new__.
return object.__new__(cls)
:param verbose: Whether or not to print path when a file is found.
"""
file_names = [filename] + (file_names or [])
- assert isinstance(filename, str)
- assert not isinstance(file_names, str)
- assert not isinstance(searchpath, str)
- if isinstance(env_vars, str):
+ assert isinstance(filename, string_types)
+ assert not isinstance(file_names, string_types)
+ assert not isinstance(searchpath, string_types)
+ if isinstance(env_vars, string_types):
env_vars = env_vars.split()
yielded = False
path_to_file = os.path.join(filename, alternative)
if os.path.isfile(path_to_file):
if verbose:
- print("[Found %s: %s]" % (filename, path_to_file))
+ print('[Found %s: %s]' % (filename, path_to_file))
yielded = True
yield path_to_file
# Check the bare alternatives
if os.path.isfile(alternative):
if verbose:
- print("[Found %s: %s]" % (filename, alternative))
+ print('[Found %s: %s]' % (filename, alternative))
yielded = True
yield alternative
# Check if the alternative is inside a 'file' directory
- path_to_file = os.path.join(filename, "file", alternative)
+ path_to_file = os.path.join(filename, 'file', alternative)
if os.path.isfile(path_to_file):
if verbose:
- print("[Found %s: %s]" % (filename, path_to_file))
+ print('[Found %s: %s]' % (filename, path_to_file))
yielded = True
yield path_to_file
# Check if the environment variable contains a direct path to the bin
if os.path.isfile(env_dir):
if verbose:
- print("[Found %s: %s]" % (filename, env_dir))
+ print('[Found %s: %s]' % (filename, env_dir))
yielded = True
yield env_dir
# Check if the possible bin names exist inside the environment variable directories
path_to_file = os.path.join(env_dir, alternative)
if os.path.isfile(path_to_file):
if verbose:
- print("[Found %s: %s]" % (filename, path_to_file))
+ print('[Found %s: %s]' % (filename, path_to_file))
yielded = True
yield path_to_file
# Check if the alternative is inside a 'file' directory
# path_to_file = os.path.join(env_dir, 'file', alternative)
# Check if the alternative is inside a 'bin' directory
- path_to_file = os.path.join(env_dir, "bin", alternative)
+ path_to_file = os.path.join(env_dir, 'bin', alternative)
if os.path.isfile(path_to_file):
if verbose:
- print("[Found %s: %s]" % (filename, path_to_file))
+ print('[Found %s: %s]' % (filename, path_to_file))
yielded = True
yield path_to_file
# If we're on a POSIX system, then try using the 'which' command
# to find the file.
- if os.name == "posix":
+ if os.name == 'posix':
for alternative in file_names:
try:
p = subprocess.Popen(
- ["which", alternative],
+ ['which', alternative],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
path = _decode_stdoutdata(stdout).strip()
if path.endswith(alternative) and os.path.exists(path):
if verbose:
- print("[Found %s: %s]" % (filename, path))
+ print('[Found %s: %s]' % (filename, path))
yielded = True
yield path
except (KeyboardInterrupt, SystemExit, OSError):
"configuration paramaters" % filename
)
if env_vars:
- msg += " or set the %s environment variable" % env_vars[0]
- msg += "."
+ msg += ' or set the %s environment variable' % env_vars[0]
+ msg += '.'
if searchpath:
- msg += "\n\n Searched in:"
- msg += "".join("\n - %s" % d for d in searchpath)
+ msg += '\n\n Searched in:'
+ msg += ''.join('\n - %s' % d for d in searchpath)
if url:
- msg += "\n\n For more information on %s, see:\n <%s>" % (filename, url)
- div = "=" * 75
- raise LookupError("\n\n%s\n%s\n%s" % (div, msg, div))
+ msg += '\n\n For more information on %s, see:\n <%s>' % (filename, url)
+ div = '=' * 75
+ raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
def find_file(
:param is_regex: Whether name is a regular expression.
"""
- assert isinstance(name_pattern, str)
- assert not isinstance(searchpath, str)
- if isinstance(env_vars, str):
+ assert isinstance(name_pattern, string_types)
+ assert not isinstance(searchpath, string_types)
+ if isinstance(env_vars, string_types):
env_vars = env_vars.split()
yielded = False
# Make sure we check the CLASSPATH first
- env_vars = ["CLASSPATH"] + list(env_vars)
+ env_vars = ['CLASSPATH'] + list(env_vars)
# If an explicit location was given, then check it, and yield it if
# it's present; otherwise, complain.
yield path_to_jar
else:
raise LookupError(
- "Could not find %s jar file at %s" % (name_pattern, path_to_jar)
+ 'Could not find %s jar file at %s' % (name_pattern, path_to_jar)
)
# Check environment variables
for env_var in env_vars:
if env_var in os.environ:
- if env_var == "CLASSPATH":
- classpath = os.environ["CLASSPATH"]
+ if env_var == 'CLASSPATH':
+ classpath = os.environ['CLASSPATH']
for cp in classpath.split(os.path.pathsep):
if os.path.isfile(cp):
filename = os.path.basename(cp)
or (not is_regex and filename == name_pattern)
):
if verbose:
- print("[Found %s: %s]" % (name_pattern, cp))
+ print('[Found %s: %s]' % (name_pattern, cp))
yielded = True
yield cp
# The case where user put directory containing the jar file in the classpath
if not is_regex:
if os.path.isfile(os.path.join(cp, name_pattern)):
if verbose:
- print("[Found %s: %s]" % (name_pattern, cp))
+ print('[Found %s: %s]' % (name_pattern, cp))
yielded = True
yield os.path.join(cp, name_pattern)
else:
if re.match(name_pattern, file_name):
if verbose:
print(
- "[Found %s: %s]"
+ '[Found %s: %s]'
% (
name_pattern,
os.path.join(cp, file_name),
or (not is_regex and filename == name_pattern)
):
if verbose:
- print("[Found %s: %s]" % (name_pattern, path_to_jar))
+ print('[Found %s: %s]' % (name_pattern, path_to_jar))
yielded = True
yield path_to_jar
if os.path.isfile(path_to_jar):
if re.match(name_pattern, filename):
if verbose:
- print("[Found %s: %s]" % (filename, path_to_jar))
+ print('[Found %s: %s]' % (filename, path_to_jar))
yielded = True
yield path_to_jar
else:
path_to_jar = os.path.join(directory, name_pattern)
if os.path.isfile(path_to_jar):
if verbose:
- print("[Found %s: %s]" % (name_pattern, path_to_jar))
+ print('[Found %s: %s]' % (name_pattern, path_to_jar))
yielded = True
yield path_to_jar
# If nothing was found, raise an error
msg = "NLTK was unable to find %s!" % name_pattern
if env_vars:
- msg += " Set the %s environment variable" % env_vars[0]
- msg = textwrap.fill(msg + ".", initial_indent=" ", subsequent_indent=" ")
+ msg += ' Set the %s environment variable' % env_vars[0]
+ msg = textwrap.fill(msg + '.', initial_indent=' ', subsequent_indent=' ')
if searchpath:
- msg += "\n\n Searched in:"
- msg += "".join("\n - %s" % d for d in searchpath)
+ msg += '\n\n Searched in:'
+ msg += ''.join('\n - %s' % d for d in searchpath)
if url:
- msg += "\n\n For more information, on %s, see:\n <%s>" % (
+ msg += '\n\n For more information on %s, see:\n <%s>' % (
name_pattern,
url,
)
- div = "=" * 75
- raise LookupError("\n\n%s\n%s\n%s" % (div, msg, div))
+ div = '=' * 75
+ raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
def find_jar(
return [
os.path.join(root, filename)
for root, dirnames, filenames in os.walk(path_to_jars)
- for filename in fnmatch.filter(filenames, "*.jar")
+ for filename in fnmatch.filter(filenames, '*.jar')
]
instead (causing the import to fail).
"""
old_path = sys.path
- sys.path = [d for d in sys.path if d not in ("", ".")]
+ sys.path = [d for d in sys.path if d not in ('', '.')]
m = __import__(module)
sys.path = old_path
return m
##########################################################################
-
+@compat.python_2_unicode_compatible
class ElementWrapper(object):
"""
A wrapper around ElementTree Element objects whose main purpose is
<Element "<?xml version='1.0' encoding='utf8'?>\n<test />">
"""
- if isinstance(etree, str):
+ if isinstance(etree, string_types):
etree = ElementTree.fromstring(etree)
- self.__dict__["_etree"] = etree
+ self.__dict__['_etree'] = etree
def unwrap(self):
"""
##////////////////////////////////////////////////////////////
def __repr__(self):
- s = ElementTree.tostring(self._etree, encoding="utf8").decode("utf8")
+ s = ElementTree.tostring(self._etree, encoding='utf8').decode('utf8')
if len(s) > 60:
- e = s.rfind("<")
+ e = s.rfind('<')
if (len(s) - e) > 30:
e = -20
- s = "%s...%s" % (s[:30], s[e:])
- return "<Element %r>" % s
+ s = '%s...%s' % (s[:30], s[e:])
+ return '<Element %r>' % s
def __str__(self):
"""
the wrapped Element object.
"""
return (
- ElementTree.tostring(self._etree, encoding="utf8").decode("utf8").rstrip()
+ ElementTree.tostring(self._etree, encoding='utf8').decode('utf8').rstrip()
)
##////////////////////////////////////////////////////////////
# Otherwise, make sure that no non-default step value is used.
elif slice_obj.step not in (None, 1):
raise ValueError(
- "slices with steps are not supported by %s" % sequence.__class__.__name__
+ 'slices with steps are not supported by %s' % sequence.__class__.__name__
)
# Supply default offsets.
return False
# If we're on a posix system, check its permissions.
- if hasattr(os, "getuid"):
+ if hasattr(os, 'getuid'):
statdata = os.stat(path)
perm = stat.S_IMODE(statdata.st_mode)
# is it world-writable?
# -*- coding: utf-8 -*-
# Natural Language Toolkit: JSON Encoder/Decoder Helpers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Xu <xxu@student.unimelb.edu.au>
#
# URL: <http://nltk.org/>
json_tags = {}
-TAG_PREFIX = "!"
+TAG_PREFIX = '!'
def register_tag(cls):
"""
Decorates a class to register its json tag.
"""
- json_tags[TAG_PREFIX + getattr(cls, "json_tag")] = cls
+ json_tags[TAG_PREFIX + getattr(cls, 'json_tag')] = cls
return cls
class JSONTaggedEncoder(json.JSONEncoder):
def default(self, obj):
- obj_tag = getattr(obj, "json_tag", None)
+ obj_tag = getattr(obj, 'json_tag', None)
if obj_tag is None:
return super(JSONTaggedEncoder, self).default(obj)
obj_tag = TAG_PREFIX + obj_tag
if not isinstance(obj, dict) or len(obj) != 1:
return obj
obj_tag = next(iter(obj.keys()))
- if not obj_tag.startswith("!"):
+ if not obj_tag.startswith('!'):
return obj
if obj_tag not in json_tags:
- raise ValueError("Unknown tag", obj_tag)
+ raise ValueError('Unknown tag', obj_tag)
obj_cls = json_tags[obj_tag]
return obj_cls.decode_json_obj(obj[obj_tag])
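A round-trip sketch of the tagging protocol (the encode_json_obj method is assumed as the counterpart of the decode_json_obj call above; Dummy is invented for illustration):

import json
from nltk.jsontags import register_tag, JSONTaggedEncoder, JSONTaggedDecoder

@register_tag
class Dummy(object):
    json_tag = 'Dummy'                 # registered in json_tags as '!Dummy'

    def __init__(self, value):
        self.value = value

    def encode_json_obj(self):         # assumed counterpart (not shown above)
        return self.value

    @classmethod
    def decode_json_obj(cls, obj):
        return cls(obj)

blob = json.dumps(Dummy(42), cls=JSONTaggedEncoder)   # roughly '{"!Dummy": 42}'
restored = json.loads(blob, cls=JSONTaggedDecoder)    # a Dummy with value 42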
-__all__ = ["register_tag", "json_tags", "JSONTaggedEncoder", "JSONTaggedDecoder"]
+__all__ = ['register_tag', 'json_tags', 'JSONTaggedEncoder', 'JSONTaggedDecoder']
See the documentation for further information on copyrights,
or contact the author. All Rights Reserved.
"""
+from __future__ import print_function
### Constants
__lazymodule_init = 0
# Name of the module to load
- __lazymodule_name = ""
+ __lazymodule_name = ''
# Flag which indicates whether the module was loaded or not
__lazymodule_loaded = 0
if globals is None:
globals = locals
self.__lazymodule_globals = globals
- mainname = globals.get("__name__", "")
+ mainname = globals.get('__name__', '')
if mainname:
- self.__name__ = mainname + "." + name
+ self.__name__ = mainname + '.' + name
self.__lazymodule_name = name
else:
self.__name__ = self.__lazymodule_name = name
if self.__lazymodule_loaded:
return self.__lazymodule_locals[name]
if _debug:
- print("LazyModule: Loading module %r" % name)
+ print('LazyModule: Loading module %r' % name)
self.__lazymodule_locals[name] = module = __import__(
- name, self.__lazymodule_locals, self.__lazymodule_globals, "*"
+ name, self.__lazymodule_locals, self.__lazymodule_globals, '*'
)
# Fill namespace with all symbols from original module to
self.__dict__.update(module.__dict__)
# Set import flag
- self.__dict__["__lazymodule_loaded"] = 1
+ self.__dict__['__lazymodule_loaded'] = 1
if _debug:
- print("LazyModule: Module %r loaded" % name)
+ print('LazyModule: Module %r loaded' % name)
return module
def __getattr__(self, name):
raise AttributeError(name)
if _debug:
print(
- "LazyModule: "
- "Module load triggered by attribute %r read access" % name
+ 'LazyModule: '
+ 'Module load triggered by attribute %r read access' % name
)
module = self.__lazymodule_import()
return getattr(module, name)
return
if _debug:
print(
- "LazyModule: "
- "Module load triggered by attribute %r write access" % name
+ 'LazyModule: '
+ 'Module load triggered by attribute %r write access' % name
)
module = self.__lazymodule_import()
setattr(module, name, value)
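A usage sketch, assuming the usual LazyModule(name, locals, globals) signature from nltk.lazyimport:

from nltk.lazyimport import LazyModule

# The module object is created immediately, but the real import only happens
# on the first attribute read or write (when the debug lines above fire).
numpy = LazyModule('numpy', locals(), globals())
numpy.zeros(3)   # triggers the deferred import of numpy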
+# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Models
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Authors: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
>>> lm.generate(1, random_seed=3)
'<s>'
>>> lm.generate(5, random_seed=3)
- ['<s>', 'a', 'b', 'c', 'd']
+ ['<s>', 'a', 'b', 'c', '</s>']
Provide `random_seed` if you want to consistently reproduce the same text all
other things being equal. Here we are using it to test the examples.
argument.
>>> lm.generate(5, text_seed=['c'], random_seed=3)
- ['</s>', 'c', 'd', 'c', 'd']
+ ['</s>', '<s>', 'a', 'b', 'c']
Note that an ngram model is restricted in how much preceding context it can
take into account. For example, a trigram model can only condition its output
+# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Models
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Authors: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""Language Model Interface."""
+from __future__ import division, unicode_literals
import random
from abc import ABCMeta, abstractmethod
from bisect import bisect
+from six import add_metaclass
from nltk.lm.counter import NgramCounter
from nltk.lm.util import log_base2
from nltk.lm.vocabulary import Vocabulary
-from itertools import accumulate
-
-
-class Smoothing(metaclass=ABCMeta):
+try:
+ from itertools import accumulate
+except ImportError:
+ import operator
+
+ def accumulate(iterable, func=operator.add):
+ """Return running totals"""
+ # accumulate([1,2,3,4,5]) --> 1 3 6 10 15
+ # accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120
+ it = iter(iterable)
+ try:
+ total = next(it)
+ except StopIteration:
+ return
+ yield total
+ for element in it:
+ total = func(total, element)
+ yield total
+
+
+@add_metaclass(ABCMeta)
+class Smoothing(object):
"""Ngram Smoothing Interface
Implements Chen & Goodman 1995's idea that all smoothing algorithms have
- certain features in common. This should ideally allow smoothing algorithms to
+ certain features in common. This should ideally allow smoothing algorithms to
work both with Backoff and Interpolation.
+
+ counter holds the ngram counts used when computing the smoothing estimates.
"""
def __init__(self, vocabulary, counter):
- """
- :param vocabulary: The Ngram vocabulary object.
- :type vocabulary: nltk.lm.vocab.Vocabulary
- :param counter: The counts of the vocabulary items.
- :type counter: nltk.lm.counter.NgramCounter
- """
self.vocab = vocabulary
self.counts = counter
return random.Random(seed_or_generator)
-def _weighted_choice(population, weights, random_generator=None):
+def _weighted_choice(population, weights, random_seed=None):
"""Like random.choice, but with weights.
Heavily inspired by python 3.6 `random.choices`.
raise ValueError("The number of weights does not match the population")
cum_weights = list(accumulate(weights))
total = cum_weights[-1]
- threshold = random_generator.random()
+ threshold = _random_generator(random_seed).random()
return population[bisect(cum_weights, total * threshold)]
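A call-shape sketch after the revert to the random_seed argument (the population and weights are invented):

# Picks 'c' with probability 0.8; passing a seed makes the draw reproducible.
_weighted_choice(['a', 'b', 'c'], (0.1, 0.1, 0.8), random_seed=3)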
-class LanguageModel(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class LanguageModel(object):
"""ABC for Language Models.
Cannot be directly instantiated itself.
if not self.vocab:
if vocabulary_text is None:
raise ValueError(
- "Cannot fit without a vocabulary or text to create it from."
+ "Cannot fit without a vocabulary or text to " "create it from."
)
self.vocab.update(vocabulary_text)
self.counts.update(self.vocab.lookup(sent) for sent in text)
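The usual training flow for these models, as a short sketch (the toy corpus is invented; MLE and padded_everygram_pipeline are the public nltk.lm helpers):

from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

train_data, vocab_text = padded_everygram_pipeline(2, [['a', 'b', 'c'], ['a', 'c', 'd']])
lm = MLE(2)
lm.fit(train_data, vocab_text)   # builds the vocabulary, then counts the ngrams
lm.score('b', ['a'])             # relative frequency of 'b' after 'a'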
:param int num_words: How many words to generate. By default 1.
:param text_seed: Generation can be conditioned on preceding context.
- :param random_seed: A random seed or an instance of `random.Random`. If provided,
- makes the random sampling part of generation reproducible.
+ :param random_seed: If provided, makes the random sampling part of
+ generation reproducible.
:return: One (str) word or a list of words generated from model.
Examples:
"""
text_seed = [] if text_seed is None else list(text_seed)
- random_generator = _random_generator(random_seed)
- # This is the base recursion case.
+ # base recursion case
if num_words == 1:
context = (
text_seed[-self.order + 1 :]
while context and not samples:
context = context[1:] if len(context) > 1 else []
samples = self.context_counts(self.vocab.lookup(context))
- # Sorting samples achieves two things:
+ # sorting achieves two things:
# - reproducible randomness when sampling
- # - turns Mapping into Sequence which `_weighted_choice` expects
+ # - turning Mapping into Sequence which _weighted_choice expects
samples = sorted(samples)
return _weighted_choice(
- samples,
- tuple(self.score(w, context) for w in samples),
- random_generator,
+ samples, tuple(self.score(w, context) for w in samples), random_seed
)
- # We build up text one word at a time using the preceding context.
+ # build up text one word at a time
generated = []
for _ in range(num_words):
generated.append(
self.generate(
num_words=1,
text_seed=text_seed + generated,
- random_seed=random_generator,
+ random_seed=random_seed,
)
)
return generated
+# -*- coding: utf-8 -*-
# Natural Language Toolkit
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
----------------------
"""
-from collections import defaultdict
-from collections.abc import Sequence
+from __future__ import unicode_literals
+from collections import Sequence, defaultdict
+
+from six import string_types
+from nltk import compat
from nltk.probability import ConditionalFreqDist, FreqDist
-class NgramCounter:
+@compat.python_2_unicode_compatible
+class NgramCounter(object):
"""Class for counting ngrams.
Will count any ngram sequence you give it ;)
"""User-friendly access to ngram counts."""
if isinstance(item, int):
return self._counts[item]
- elif isinstance(item, str):
+ elif isinstance(item, string_types):
return self._counts.__getitem__(1)[item]
elif isinstance(item, Sequence):
return self._counts.__getitem__(len(item) + 1)[tuple(item)]
+# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Models
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""Language Models"""
+from __future__ import division, unicode_literals
+from nltk import compat
from nltk.lm.api import LanguageModel, Smoothing
from nltk.lm.smoothing import KneserNey, WittenBell
+@compat.python_2_unicode_compatible
class MLE(LanguageModel):
"""Class for providing MLE ngram model scores.
return self.context_counts(context).freq(word)
+@compat.python_2_unicode_compatible
class Lidstone(LanguageModel):
"""Provides Lidstone-smoothed scores.
"""
def __init__(self, gamma, *args, **kwargs):
- super().__init__(*args, **kwargs)
+ super(Lidstone, self).__init__(*args, **kwargs)
self.gamma = gamma
def unmasked_score(self, word, context=None):
return (word_count + self.gamma) / (norm_count + len(self.vocab) * self.gamma)
+@compat.python_2_unicode_compatible
class Laplace(Lidstone):
"""Implements Laplace (add one) smoothing.
"""
def __init__(self, *args, **kwargs):
- super().__init__(1, *args, **kwargs)
+ super(Laplace, self).__init__(1, *args, **kwargs)
class InterpolatedLanguageModel(LanguageModel):
"""Logic common to all interpolated language models.
The idea to abstract this comes from Chen & Goodman 1995.
- Do not instantiate this class directly!
"""
def __init__(self, smoothing_cls, order, **kwargs):
assert issubclass(smoothing_cls, Smoothing)
params = kwargs.pop("params", {})
- super().__init__(order, **kwargs)
+ super(InterpolatedLanguageModel, self).__init__(order, **kwargs)
self.estimator = smoothing_cls(self.vocab, self.counts, **params)
def unmasked_score(self, word, context=None):
if not context:
- # The base recursion case: no context, we only have a unigram.
return self.estimator.unigram_score(word)
- if not self.counts[context]:
- # It can also happen that we have no data for this context.
- # In that case we defer to the lower-order ngram.
- # This is the same as setting alpha to 0 and gamma to 1.
- return self.unmasked_score(word, context[1:])
alpha, gamma = self.estimator.alpha_gamma(word, context)
return alpha + gamma * self.unmasked_score(word, context[1:])
"""Interpolated version of Witten-Bell smoothing."""
def __init__(self, order, **kwargs):
- super().__init__(WittenBell, order, **kwargs)
+ super(WittenBellInterpolated, self).__init__(WittenBell, order, **kwargs)
class KneserNeyInterpolated(InterpolatedLanguageModel):
"""Interpolated version of Kneser-Ney smoothing."""
def __init__(self, order, discount=0.1, **kwargs):
- super().__init__(KneserNey, order, params={"discount": discount}, **kwargs)
+ super(KneserNeyInterpolated, self).__init__(
+ KneserNey, order, params={"discount": discount}, **kwargs
+ )
+# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Model Unit Tests
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Model Unit Tests
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
class WittenBell(Smoothing):
"""Witten-Bell smoothing."""
- def __init__(self, vocabulary, counter, **kwargs):
- super().__init__(vocabulary, counter, **kwargs)
+ def __init__(self, vocabulary, counter, discount=0.1, **kwargs):
+ super(WittenBell, self).__init__(vocabulary, counter, **kwargs)
+ self.counts = counter
def alpha_gamma(self, word, context):
- alpha = self.counts[context].freq(word)
- gamma = self._gamma(context)
- return (1.0 - gamma) * alpha, gamma
-
- def _gamma(self, context):
- n_plus = _count_non_zero_vals(self.counts[context])
- return n_plus / (n_plus + self.counts[len(context) + 1].N())
+ gamma = self.gamma(context)
+ return (1.0 - gamma) * self.alpha(word, context), gamma
def unigram_score(self, word):
return self.counts.unigrams.freq(word)
+ def alpha(self, word, context):
+ return self.counts[context].freq(word)
+
+ def gamma(self, context):
+ n_plus = _count_non_zero_vals(self.counts[context])
+ return n_plus / (n_plus + self.counts[len(context) + 1].N())
+
class KneserNey(Smoothing):
"""Kneser-Ney Smoothing."""
def __init__(self, vocabulary, counter, discount=0.1, **kwargs):
- super().__init__(vocabulary, counter, **kwargs)
+ super(KneserNey, self).__init__(vocabulary, counter, **kwargs)
self.discount = discount
+ self.vocabulary = vocabulary
def unigram_score(self, word):
- return 1.0 / len(self.vocab)
+ return 1.0 / len(self.vocabulary)
def alpha_gamma(self, word, context):
prefix_counts = self.counts[context]
- prefix_total_ngrams = prefix_counts.N()
- alpha = max(prefix_counts[word] - self.discount, 0.0) / prefix_total_ngrams
- gamma = (
- self.discount * _count_non_zero_vals(prefix_counts) / prefix_total_ngrams
- )
- return alpha, gamma
+ return self.alpha(word, prefix_counts), self.gamma(prefix_counts)
+
+ def alpha(self, word, prefix_counts):
+ return max(prefix_counts[word] - self.discount, 0.0) / prefix_counts.N()
+
+ def gamma(self, prefix_counts):
+ return self.discount * _count_non_zero_vals(prefix_counts) / prefix_counts.N()
+
+
+class GoodTuring(Smoothing):
+ """Good-Turing Smoothing"""
+ def __init__(self, vocabulary, counter, **kwargs):
+ super(GoodTuring, self).__init__(vocabulary, counter, **kwargs)
+ self.counts = counter
+ self.vocabulary = vocabulary
+
+ def unigram_score(self, word):
+ # A word observed c times is scored via N_{c+1}: the number of word
+ # types observed exactly c + 1 times among the unigram counts.
+ word_count = self.counts[word]
+ count_plus_1 = sum(
+ 1.0 for count in self.counts.unigrams.values() if count == word_count + 1
+ )
+ return count_plus_1 / len(self.vocabulary)
+# -*- coding: utf-8 -*-
# Natural Language Toolkit
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+# -*- coding: utf-8 -*-
# Natural Language Toolkit
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""Language Model Vocabulary"""
+from __future__ import unicode_literals
+
import sys
-from collections import Counter
-from collections.abc import Iterable
+from collections import Counter, Iterable
from itertools import chain
-from functools import singledispatch
+
+from nltk import compat
+
+try:
+ # Python >= 3.4
+ from functools import singledispatch
+except ImportError:
+ # Python < 3.4
+ from singledispatch import singledispatch
@singledispatch
return tuple(_dispatched_lookup(w, vocab) for w in words)
-@_dispatched_lookup.register(str)
+try:
+ # Python 2 unicode + str type
+ basestring
+except NameError:
+ # Python 3 unicode + str type
+ basestring = str
+
+
+@_dispatched_lookup.register(basestring)
def _string_lookup(word, vocab):
"""Looks up one word in the vocabulary."""
return word if word in vocab else vocab.unk_label
-class Vocabulary:
+@compat.python_2_unicode_compatible
+class Vocabulary(object):
"""Stores language model vocabulary.
Satisfies two common language modeling requirements for a vocabulary:
('<UNK>', 'a', '<UNK>', 'd', '<UNK>', 'c')
It's possible to update the counts after the vocabulary has been created.
- In general, the interface is the same as that of `collections.Counter`.
+ The interface follows that of `collections.Counter`.
>>> vocab['b']
1
and self.counts == other.counts
)
+ if sys.version_info[0] == 2:
+ # see https://stackoverflow.com/a/35781654/4501212
+ def __ne__(self, other):
+ equal = self.__eq__(other)
+ return equal if equal is NotImplemented else not equal
+
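# Usage sketch (editorial note, not part of this changeset): membership and
# lookup honour unk_cutoff, while raw counts remain accessible:
#
#     >>> from nltk.lm import Vocabulary
#     >>> vocab = Vocabulary(['a', 'b', 'a', 'c'], unk_cutoff=2)
#     >>> vocab['a']        # raw count, unaffected by the cutoff
#     2
#     >>> 'b' in vocab      # count 1 < cutoff 2
#     False
#     >>> vocab.lookup('b')
#     '<UNK>'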
def __str__(self):
return "<{0} with cutoff={1} unk_label='{2}' and {3} items>".format(
self.__class__.__name__, self.cutoff, self.unk_label, len(self)
# Natural Language Toolkit: Metrics
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
from nltk.metrics.confusionmatrix import ConfusionMatrix
from nltk.metrics.distance import (
edit_distance,
- edit_distance_align,
binary_distance,
jaccard_distance,
masi_distance,
NgramAssocMeasures,
BigramAssocMeasures,
TrigramAssocMeasures,
- QuadgramAssocMeasures,
ContingencyMeasures,
)
from nltk.metrics.spearman import (
# Natural Language Toolkit: Agreement Metrics
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Tom Lippincott <tom@cs.columbia.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
1.0
"""
+from __future__ import print_function, unicode_literals, division
import logging
from itertools import groupby
from operator import itemgetter
+from six import iteritems
+
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.internals import deprecated
+from nltk.compat import python_2_unicode_compatible
from nltk.metrics.distance import binary_distance
log = logging.getLogger(__name__)
+@python_2_unicode_compatible
class AnnotationTask(object):
"""Represents an annotation task, i.e. people assign labels to items.
return "\r\n".join(
map(
lambda x: "%s\t%s\t%s"
- % (x["coder"], x["item"].replace("_", "\t"), ",".join(x["labels"])),
+ % (x['coder'], x['item'].replace('_', "\t"), ",".join(x['labels'])),
self.data,
)
)
self.C.add(coder)
self.K.add(labels)
self.I.add(item)
- self.data.append({"coder": coder, "labels": labels, "item": item})
+ self.data.append({'coder': coder, 'labels': labels, 'item': item})
def agr(self, cA, cB, i, data=None):
"""Agreement between two coders on a given item
# cfedermann: we don't know what combination of coder/item will come
# first in x; to avoid StopIteration problems due to assuming an order
# cA,cB, we allow either for k1 and then look up the missing as k2.
- k1 = next((x for x in data if x["coder"] in (cA, cB) and x["item"] == i))
- if k1["coder"] == cA:
- k2 = next((x for x in data if x["coder"] == cB and x["item"] == i))
+ k1 = next((x for x in data if x['coder'] in (cA, cB) and x['item'] == i))
+ if k1['coder'] == cA:
+ k2 = next((x for x in data if x['coder'] == cB and x['item'] == i))
else:
- k2 = next((x for x in data if x["coder"] == cA and x["item"] == i))
+ k2 = next((x for x in data if x['coder'] == cA and x['item'] == i))
- ret = 1.0 - float(self.distance(k1["labels"], k2["labels"]))
+ ret = 1.0 - float(self.distance(k1['labels'], k2['labels']))
log.debug("Observed agreement between %s and %s on %s: %f", cA, cB, i, ret)
log.debug(
- 'Distance between "%r" and "%r": %f', k1["labels"], k2["labels"], 1.0 - ret
+ "Distance between \"%r\" and \"%r\": %f",
+ k1['labels'],
+ k2['labels'],
+ 1.0 - ret,
)
return ret
def Nk(self, k):
- return float(sum(1 for x in self.data if x["labels"] == k))
+ return float(sum(1 for x in self.data if x['labels'] == k))
def Nik(self, i, k):
- return float(sum(1 for x in self.data if x["item"] == i and x["labels"] == k))
+ return float(sum(1 for x in self.data if x['item'] == i and x['labels'] == k))
def Nck(self, c, k):
- return float(sum(1 for x in self.data if x["coder"] == c and x["labels"] == k))
+ return float(sum(1 for x in self.data if x['coder'] == c and x['labels'] == k))
- @deprecated("Use Nk, Nik or Nck instead")
+ @deprecated('Use Nk, Nik or Nck instead')
def N(self, k=None, i=None, c=None):
"""Implements the "n-notation" used in Artstein and Poesio (2007)
"""
data = self._grouped_data(
- "item", (x for x in self.data if x["coder"] in (cA, cB))
+ 'item', (x for x in self.data if x['coder'] in (cA, cB))
)
ret = sum(self.agr(cA, cB, item, item_data) for item, item_data in data) / len(
self.I
"""
total = 0.0
- data = (x for x in self.data if x["coder"] in (cA, cB))
- for i, itemdata in self._grouped_data("item", data):
+ data = (x for x in self.data if x['coder'] in (cA, cB))
+ for i, itemdata in self._grouped_data('item', data):
# we should have two items; distance doesn't care which comes first
- total += self.distance(next(itemdata)["labels"], next(itemdata)["labels"])
+ total += self.distance(next(itemdata)['labels'], next(itemdata)['labels'])
ret = total / (len(self.I) * max_distance)
log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret)
"""
total = 0.0
- label_freqs = FreqDist(x["labels"] for x in self.data)
- for k, f in label_freqs.items():
+ label_freqs = FreqDist(x['labels'] for x in self.data)
+ for k, f in iteritems(label_freqs):
total += f ** 2
Ae = total / ((len(self.I) * len(self.C)) ** 2)
return (self.avg_Ao() - Ae) / (1 - Ae)
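# Usage sketch (editorial note, not part of this changeset): an AnnotationTask
# is built from (coder, item, label) triples, after which the agreement
# coefficients defined in this class are available as methods. With assumed
# toy data:
#
#     >>> from nltk.metrics.agreement import AnnotationTask
#     >>> triples = [('c1', 'item1', 'v1'), ('c2', 'item1', 'v1'),
#     ...            ('c1', 'item2', 'v1'), ('c2', 'item2', 'v2')]
#     >>> task = AnnotationTask(data=triples)
#     >>> task.kappa()    # chance-corrected pairwise agreement
#     >>> task.alpha()    # Krippendorff's alpha over all coders and items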
def Ae_kappa(self, cA, cB):
Ae = 0.0
nitems = float(len(self.I))
- label_freqs = ConditionalFreqDist((x["labels"], x["coder"]) for x in self.data)
+ label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
for k in label_freqs.conditions():
Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
return Ae
def Disagreement(self, label_freqs):
total_labels = sum(label_freqs.values())
pairs = 0.0
- for j, nj in label_freqs.items():
- for l, nl in label_freqs.items():
+ for j, nj in iteritems(label_freqs):
+ for l, nl in iteritems(label_freqs):
pairs += float(nj * nl) * self.distance(l, j)
return 1.0 * pairs / (total_labels * (total_labels - 1))
total_ratings = 0
all_valid_labels_freq = FreqDist([])
- total_do = 0.0 # Total observed disagreement for all items.
- for i, itemdata in self._grouped_data("item"):
- label_freqs = FreqDist(x["labels"] for x in itemdata)
+ total_do = 0.0 # Total observed disagreement for all items.
+ for i, itemdata in self._grouped_data('item'):
+ label_freqs = FreqDist(x['labels'] for x in itemdata)
labels_count = sum(label_freqs.values())
if labels_count < 2:
# Ignore the item.
do = total_do / sum(all_valid_labels_freq.values())
- de = self.Disagreement(all_valid_labels_freq) # Expected disagreement.
+ de = self.Disagreement(all_valid_labels_freq) # Expected disagreement.
k_alpha = 1.0 - do / de
return k_alpha
"""
total = 0.0
label_freqs = ConditionalFreqDist(
- (x["coder"], x["labels"]) for x in self.data if x["coder"] in (cA, cB)
+ (x['coder'], x['labels']) for x in self.data if x['coder'] in (cA, cB)
)
for j in self.K:
for l in self.K:
)
-if __name__ == "__main__":
+if __name__ == '__main__':
import re
import optparse
"-v",
"--verbose",
dest="verbose",
- default="0",
+ default='0',
help="how much debugging to print on stderr (0-4)",
)
parser.add_option(
# read in data from the specified file
data = []
- with open(options.file, "r") as infile:
+ with open(options.file, 'r') as infile:
for l in infile:
toks = l.split(options.columnsep)
coder, object_, labels = (
# -*- coding: utf-8 -*-
# Natural Language Toolkit: ALINE
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Greg Kondrak <gkondrak@ualberta.ca>
# Geoff Bacon <bacon@berkeley.edu> (Python port)
# URL: <http://nltk.org/>
University of Toronto.
"""
+from __future__ import unicode_literals
+
try:
import numpy as np
except ImportError:
# === Constants ===
-inf = float("inf")
+inf = float('inf')
# Default values for maximum similarity scores (Kondrak 2002: 54)
C_skip = 10 # Indels
C_vwl = 5 # Vowel/consonant relative weight (decreased from 10)
consonants = [
- "B",
- "N",
- "R",
- "b",
- "c",
- "d",
- "f",
- "g",
- "h",
- "j",
- "k",
- "l",
- "m",
- "n",
- "p",
- "q",
- "r",
- "s",
- "t",
- "v",
- "x",
- "z",
- "ç",
- "ð",
- "ħ",
- "ŋ",
- "ɖ",
- "ɟ",
- "ɢ",
- "ɣ",
- "ɦ",
- "ɬ",
- "ɮ",
- "ɰ",
- "ɱ",
- "ɲ",
- "ɳ",
- "ɴ",
- "ɸ",
- "ɹ",
- "ɻ",
- "ɽ",
- "ɾ",
- "ʀ",
- "ʁ",
- "ʂ",
- "ʃ",
- "ʈ",
- "ʋ",
- "ʐ ",
- "ʒ",
- "ʔ",
- "ʕ",
- "ʙ",
- "ʝ",
- "β",
- "θ",
- "χ",
- "ʐ",
- "w",
+ 'B',
+ 'N',
+ 'R',
+ 'b',
+ 'c',
+ 'd',
+ 'f',
+ 'g',
+ 'h',
+ 'j',
+ 'k',
+ 'l',
+ 'm',
+ 'n',
+ 'p',
+ 'q',
+ 'r',
+ 's',
+ 't',
+ 'v',
+ 'x',
+ 'z',
+ 'ç',
+ 'ð',
+ 'ħ',
+ 'ŋ',
+ 'ɖ',
+ 'ɟ',
+ 'ɢ',
+ 'ɣ',
+ 'ɦ',
+ 'ɬ',
+ 'ɮ',
+ 'ɰ',
+ 'ɱ',
+ 'ɲ',
+ 'ɳ',
+ 'ɴ',
+ 'ɸ',
+ 'ɹ',
+ 'ɻ',
+ 'ɽ',
+ 'ɾ',
+ 'ʀ',
+ 'ʁ',
+ 'ʂ',
+ 'ʃ',
+ 'ʈ',
+ 'ʋ',
+ 'ʐ ',
+ 'ʒ',
+ 'ʔ',
+ 'ʕ',
+ 'ʙ',
+ 'ʝ',
+ 'β',
+ 'θ',
+ 'χ',
+ 'ʐ',
+ 'w',
]
# Relevant features for comparing consonants and vowels
R_c = [
- "aspirated",
- "lateral",
- "manner",
- "nasal",
- "place",
- "retroflex",
- "syllabic",
- "voice",
+ 'aspirated',
+ 'lateral',
+ 'manner',
+ 'nasal',
+ 'place',
+ 'retroflex',
+ 'syllabic',
+ 'voice',
]
# 'high' taken out of R_v because same as manner
R_v = [
- "back",
- "lateral",
- "long",
- "manner",
- "nasal",
- "place",
- "retroflex",
- "round",
- "syllabic",
- "voice",
+ 'back',
+ 'lateral',
+ 'long',
+ 'manner',
+ 'nasal',
+ 'place',
+ 'retroflex',
+ 'round',
+ 'syllabic',
+ 'voice',
]
# Flattened feature matrix (Kondrak 2002: 56)
similarity_matrix = {
# place
- "bilabial": 1.0,
- "labiodental": 0.95,
- "dental": 0.9,
- "alveolar": 0.85,
- "retroflex": 0.8,
- "palato-alveolar": 0.75,
- "palatal": 0.7,
- "velar": 0.6,
- "uvular": 0.5,
- "pharyngeal": 0.3,
- "glottal": 0.1,
- "labiovelar": 1.0,
- "vowel": -1.0, # added 'vowel'
+ 'bilabial': 1.0,
+ 'labiodental': 0.95,
+ 'dental': 0.9,
+ 'alveolar': 0.85,
+ 'retroflex': 0.8,
+ 'palato-alveolar': 0.75,
+ 'palatal': 0.7,
+ 'velar': 0.6,
+ 'uvular': 0.5,
+ 'pharyngeal': 0.3,
+ 'glottal': 0.1,
+ 'labiovelar': 1.0,
+ 'vowel': -1.0, # added 'vowel'
# manner
- "stop": 1.0,
- "affricate": 0.9,
- "fricative": 0.85, # increased fricative from 0.8
- "trill": 0.7,
- "tap": 0.65,
- "approximant": 0.6,
- "high vowel": 0.4,
- "mid vowel": 0.2,
- "low vowel": 0.0,
- "vowel2": 0.5, # added vowel
+ 'stop': 1.0,
+ 'affricate': 0.9,
+ 'fricative': 0.85, # increased fricative from 0.8
+ 'trill': 0.7,
+ 'tap': 0.65,
+ 'approximant': 0.6,
+ 'high vowel': 0.4,
+ 'mid vowel': 0.2,
+ 'low vowel': 0.0,
+ 'vowel2': 0.5, # added vowel
# high
- "high": 1.0,
- "mid": 0.5,
- "low": 0.0,
+ 'high': 1.0,
+ 'mid': 0.5,
+ 'low': 0.0,
# back
- "front": 1.0,
- "central": 0.5,
- "back": 0.0,
+ 'front': 1.0,
+ 'central': 0.5,
+ 'back': 0.0,
# binary features
- "plus": 1.0,
- "minus": 0.0,
+ 'plus': 1.0,
+ 'minus': 0.0,
}
# Relative weights of phonetic features (Kondrak 2002: 55)
salience = {
- "syllabic": 5,
- "place": 40,
- "manner": 50,
- "voice": 5, # decreased from 10
- "nasal": 20, # increased from 10
- "retroflex": 10,
- "lateral": 10,
- "aspirated": 5,
- "long": 0, # decreased from 1
- "high": 3, # decreased from 5
- "back": 2, # decreased from 5
- "round": 2, # decreased from 5
+ 'syllabic': 5,
+ 'place': 40,
+ 'manner': 50,
+ 'voice': 5, # decreased from 10
+ 'nasal': 20, # increased from 10
+ 'retroflex': 10,
+ 'lateral': 10,
+ 'aspirated': 5,
+ 'long': 0, # decreased from 1
+ 'high': 3, # decreased from 5
+ 'back': 2, # decreased from 5
+ 'round': 2, # decreased from 5
}
# (Kondrak 2002: 59-60)
feature_matrix = {
# Consonants
- "p": {
- "place": "bilabial",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "b": {
- "place": "bilabial",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "t": {
- "place": "alveolar",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "d": {
- "place": "alveolar",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʈ": {
- "place": "retroflex",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɖ": {
- "place": "retroflex",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "c": {
- "place": "palatal",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɟ": {
- "place": "palatal",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "k": {
- "place": "velar",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "g": {
- "place": "velar",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "q": {
- "place": "uvular",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɢ": {
- "place": "uvular",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʔ": {
- "place": "glottal",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "m": {
- "place": "bilabial",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɱ": {
- "place": "labiodental",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "n": {
- "place": "alveolar",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɳ": {
- "place": "retroflex",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɲ": {
- "place": "palatal",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ŋ": {
- "place": "velar",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɴ": {
- "place": "uvular",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "N": {
- "place": "uvular",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʙ": {
- "place": "bilabial",
- "manner": "trill",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "B": {
- "place": "bilabial",
- "manner": "trill",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "r": {
- "place": "alveolar",
- "manner": "trill",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʀ": {
- "place": "uvular",
- "manner": "trill",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "R": {
- "place": "uvular",
- "manner": "trill",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɾ": {
- "place": "alveolar",
- "manner": "tap",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɽ": {
- "place": "retroflex",
- "manner": "tap",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɸ": {
- "place": "bilabial",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "β": {
- "place": "bilabial",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "f": {
- "place": "labiodental",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "v": {
- "place": "labiodental",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "θ": {
- "place": "dental",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ð": {
- "place": "dental",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "s": {
- "place": "alveolar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "z": {
- "place": "alveolar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʃ": {
- "place": "palato-alveolar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʒ": {
- "place": "palato-alveolar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʂ": {
- "place": "retroflex",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʐ": {
- "place": "retroflex",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ç": {
- "place": "palatal",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʝ": {
- "place": "palatal",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "x": {
- "place": "velar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɣ": {
- "place": "velar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "χ": {
- "place": "uvular",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʁ": {
- "place": "uvular",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ħ": {
- "place": "pharyngeal",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʕ": {
- "place": "pharyngeal",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "h": {
- "place": "glottal",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɦ": {
- "place": "glottal",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɬ": {
- "place": "alveolar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "plus",
- "aspirated": "minus",
- },
- "ɮ": {
- "place": "alveolar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "plus",
- "aspirated": "minus",
- },
- "ʋ": {
- "place": "labiodental",
- "manner": "approximant",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɹ": {
- "place": "alveolar",
- "manner": "approximant",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɻ": {
- "place": "retroflex",
- "manner": "approximant",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "j": {
- "place": "palatal",
- "manner": "approximant",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɰ": {
- "place": "velar",
- "manner": "approximant",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "l": {
- "place": "alveolar",
- "manner": "approximant",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "plus",
- "aspirated": "minus",
- },
- "w": {
- "place": "labiovelar",
- "manner": "approximant",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
+ 'p': {
+ 'place': 'bilabial',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'b': {
+ 'place': 'bilabial',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 't': {
+ 'place': 'alveolar',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'd': {
+ 'place': 'alveolar',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ʈ': {
+ 'place': 'retroflex',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'plus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɖ': {
+ 'place': 'retroflex',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'plus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'c': {
+ 'place': 'palatal',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɟ': {
+ 'place': 'palatal',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'k': {
+ 'place': 'velar',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'g': {
+ 'place': 'velar',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'q': {
+ 'place': 'uvular',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɢ': {
+ 'place': 'uvular',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ʔ': {
+ 'place': 'glottal',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'm': {
+ 'place': 'bilabial',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'plus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɱ': {
+ 'place': 'labiodental',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'plus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'n': {
+ 'place': 'alveolar',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'plus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɳ': {
+ 'place': 'retroflex',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'plus',
+ 'retroflex': 'plus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɲ': {
+ 'place': 'palatal',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'plus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ŋ': {
+ 'place': 'velar',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'plus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɴ': {
+ 'place': 'uvular',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'plus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'N': {
+ 'place': 'uvular',
+ 'manner': 'stop',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'plus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ʙ': {
+ 'place': 'bilabial',
+ 'manner': 'trill',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'B': {
+ 'place': 'bilabial',
+ 'manner': 'trill',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'r': {
+ 'place': 'alveolar',
+ 'manner': 'trill',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'plus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ʀ': {
+ 'place': 'uvular',
+ 'manner': 'trill',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'R': {
+ 'place': 'uvular',
+ 'manner': 'trill',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɾ': {
+ 'place': 'alveolar',
+ 'manner': 'tap',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɽ': {
+ 'place': 'retroflex',
+ 'manner': 'tap',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'plus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɸ': {
+ 'place': 'bilabial',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'β': {
+ 'place': 'bilabial',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'f': {
+ 'place': 'labiodental',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'v': {
+ 'place': 'labiodental',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'θ': {
+ 'place': 'dental',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ð': {
+ 'place': 'dental',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 's': {
+ 'place': 'alveolar',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'z': {
+ 'place': 'alveolar',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ʃ': {
+ 'place': 'palato-alveolar',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ʒ': {
+ 'place': 'palato-alveolar',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ʂ': {
+ 'place': 'retroflex',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'plus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ʐ': {
+ 'place': 'retroflex',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'plus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ç': {
+ 'place': 'palatal',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ʝ': {
+ 'place': 'palatal',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'x': {
+ 'place': 'velar',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɣ': {
+ 'place': 'velar',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'χ': {
+ 'place': 'uvular',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ʁ': {
+ 'place': 'uvular',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ħ': {
+ 'place': 'pharyngeal',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ʕ': {
+ 'place': 'pharyngeal',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'h': {
+ 'place': 'glottal',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɦ': {
+ 'place': 'glottal',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɬ': {
+ 'place': 'alveolar',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'minus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'plus',
+ 'aspirated': 'minus',
+ },
+ 'ɮ': {
+ 'place': 'alveolar',
+ 'manner': 'fricative',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'plus',
+ 'aspirated': 'minus',
+ },
+ 'ʋ': {
+ 'place': 'labiodental',
+ 'manner': 'approximant',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɹ': {
+ 'place': 'alveolar',
+ 'manner': 'approximant',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɻ': {
+ 'place': 'retroflex',
+ 'manner': 'approximant',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'plus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'j': {
+ 'place': 'palatal',
+ 'manner': 'approximant',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɰ': {
+ 'place': 'velar',
+ 'manner': 'approximant',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'l': {
+ 'place': 'alveolar',
+ 'manner': 'approximant',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'plus',
+ 'aspirated': 'minus',
+ },
+ 'w': {
+ 'place': 'labiovelar',
+ 'manner': 'approximant',
+ 'syllabic': 'minus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'aspirated': 'minus',
},
# Vowels
- "i": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "high",
- "back": "front",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "y": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "high",
- "back": "front",
- "round": "plus",
- "long": "minus",
- "aspirated": "minus",
- },
- "e": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "front",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "E": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "front",
- "round": "minus",
- "long": "plus",
- "aspirated": "minus",
- },
- "ø": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "front",
- "round": "plus",
- "long": "minus",
- "aspirated": "minus",
- },
- "ɛ": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "front",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "œ": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "front",
- "round": "plus",
- "long": "minus",
- "aspirated": "minus",
- },
- "æ": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "low",
- "back": "front",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "a": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "low",
- "back": "front",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "A": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "low",
- "back": "front",
- "round": "minus",
- "long": "plus",
- "aspirated": "minus",
- },
- "ɨ": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "high",
- "back": "central",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "ʉ": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "high",
- "back": "central",
- "round": "plus",
- "long": "minus",
- "aspirated": "minus",
- },
- "ə": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "central",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "u": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "high",
- "back": "back",
- "round": "plus",
- "long": "minus",
- "aspirated": "minus",
- },
- "U": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "high",
- "back": "back",
- "round": "plus",
- "long": "plus",
- "aspirated": "minus",
- },
- "o": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "back",
- "round": "plus",
- "long": "minus",
- "aspirated": "minus",
- },
- "O": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "back",
- "round": "plus",
- "long": "plus",
- "aspirated": "minus",
- },
- "ɔ": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "back",
- "round": "plus",
- "long": "minus",
- "aspirated": "minus",
- },
- "ɒ": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "low",
- "back": "back",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "I": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "high",
- "back": "front",
- "round": "minus",
- "long": "plus",
- "aspirated": "minus",
+ 'i': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'high',
+ 'back': 'front',
+ 'round': 'minus',
+ 'long': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'y': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'high',
+ 'back': 'front',
+ 'round': 'plus',
+ 'long': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'e': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'mid',
+ 'back': 'front',
+ 'round': 'minus',
+ 'long': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'E': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'mid',
+ 'back': 'front',
+ 'round': 'minus',
+ 'long': 'plus',
+ 'aspirated': 'minus',
+ },
+ 'ø': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'mid',
+ 'back': 'front',
+ 'round': 'plus',
+ 'long': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɛ': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'mid',
+ 'back': 'front',
+ 'round': 'minus',
+ 'long': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'œ': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'mid',
+ 'back': 'front',
+ 'round': 'plus',
+ 'long': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'æ': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'low',
+ 'back': 'front',
+ 'round': 'minus',
+ 'long': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'a': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'low',
+ 'back': 'front',
+ 'round': 'minus',
+ 'long': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'A': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'low',
+ 'back': 'front',
+ 'round': 'minus',
+ 'long': 'plus',
+ 'aspirated': 'minus',
+ },
+ 'ɨ': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'high',
+ 'back': 'central',
+ 'round': 'minus',
+ 'long': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ʉ': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'high',
+ 'back': 'central',
+ 'round': 'plus',
+ 'long': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ə': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'mid',
+ 'back': 'central',
+ 'round': 'minus',
+ 'long': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'u': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'high',
+ 'back': 'back',
+ 'round': 'plus',
+ 'long': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'U': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'high',
+ 'back': 'back',
+ 'round': 'plus',
+ 'long': 'plus',
+ 'aspirated': 'minus',
+ },
+ 'o': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'mid',
+ 'back': 'back',
+ 'round': 'plus',
+ 'long': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'O': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'mid',
+ 'back': 'back',
+ 'round': 'plus',
+ 'long': 'plus',
+ 'aspirated': 'minus',
+ },
+ 'ɔ': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'mid',
+ 'back': 'back',
+ 'round': 'plus',
+ 'long': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'ɒ': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'low',
+ 'back': 'back',
+ 'round': 'minus',
+ 'long': 'minus',
+ 'aspirated': 'minus',
+ },
+ 'I': {
+ 'place': 'vowel',
+ 'manner': 'vowel2',
+ 'syllabic': 'plus',
+ 'voice': 'plus',
+ 'nasal': 'minus',
+ 'retroflex': 'minus',
+ 'lateral': 'minus',
+ 'high': 'high',
+ 'back': 'front',
+ 'round': 'minus',
+ 'long': 'plus',
+ 'aspirated': 'minus',
},
}
(Kondrak 2002: 51)
"""
if np is None:
- raise ImportError("You need numpy in order to use the align function")
+ raise ImportError('You need numpy in order to use the align function')
assert 0.0 <= epsilon <= 1.0, "Epsilon must be between 0.0 and 1.0."
m = len(str1)
out,
)
elif S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T:
- out.insert(0, ("-", str2[j - 1]))
+ out.insert(0, ('-', str2[j - 1]))
_retrieve(i, j - 1, s + sigma_skip(str2[j - 1]), S, T, str1, str2, out)
elif S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T:
- out.insert(0, (str1[i - 1], "-"))
+ out.insert(0, (str1[i - 1], '-'))
_retrieve(i - 1, j, s + sigma_skip(str1[i - 1]), S, T, str1, str2, out)
elif S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T:
out.insert(0, (str1[i - 1], str2[j - 1]))
A demonstration of the result of aligning phonetic sequences
used in Kondrak's (2002) dissertation.
"""
- data = [pair.split(",") for pair in cognate_data.split("\n")]
+ data = [pair.split(',') for pair in cognate_data.split('\n')]
for pair in data:
alignment = align(pair[0], pair[1])[0]
- alignment = ["({}, {})".format(a[0], a[1]) for a in alignment]
- alignment = " ".join(alignment)
- print("{} ~ {} : {}".format(pair[0], pair[1], alignment))
+ alignment = ['({}, {})'.format(a[0], a[1]) for a in alignment]
+ alignment = ' '.join(alignment)
+ print('{} ~ {} : {}'.format(pair[0], pair[1], alignment))
cognate_data = """jo,ʒə
pematesiweni,pematesewen
asenja,aʔsɛn"""
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
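# Usage sketch (editorial note, not part of this changeset): align() scores the
# phonetic similarity of two IPA-transcribed words and returns the optimal
# alignment(s) as lists of (segment, segment) pairs, with '-' marking a skip:
#
#     >>> from nltk.metrics import aline
#     >>> alignment = aline.align('θin', 'tenwis')[0]   # list of (segment, segment) pairs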
# Natural Language Toolkit: Ngram Association Measures
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
"""
+from __future__ import division
+
import math as _math
from abc import ABCMeta, abstractmethod
from functools import reduce
+from six import add_metaclass
_log2 = lambda x: _math.log(x, 2.0)
_ln = _math.log
"""Marginals index for the number of words in the data"""
-class NgramAssocMeasures(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class NgramAssocMeasures(object):
"""
An abstract class defining a collection of generic association measures.
Each public method returns a score, taking the following arguments::
argument power sets an exponent (default 3) for the numerator. No
logarithm of the result is calculated.
"""
- return marginals[NGRAM] ** kwargs.get("power", 3) / _product(
+ return marginals[NGRAM] ** kwargs.get('power', 3) / _product(
marginals[UNIGRAMS]
)
n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
- (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative="less")
+ (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
return pvalue
@staticmethod
def __init__(self, measures):
"""Constructs a ContingencyMeasures given a NgramAssocMeasures class"""
- self.__class__.__name__ = "Contingency" + measures.__class__.__name__
+ self.__class__.__name__ = 'Contingency' + measures.__class__.__name__
for k in dir(measures):
- if k.startswith("__"):
+ if k.startswith('__'):
continue
v = getattr(measures, k)
- if not k.startswith("_"):
+ if not k.startswith('_'):
v = self._make_contingency_fn(measures, v)
setattr(self, k, v)
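# Usage sketch (editorial note, not part of this changeset): the association
# measures defined above are usually applied through a collocation finder.
# Assuming the nltk.collocations API and the genesis corpus:
#
#     >>> from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
#     >>> from nltk.corpus import genesis
#     >>> finder = BigramCollocationFinder.from_words(genesis.words('english-web.txt'))
#     >>> finder.nbest(BigramAssocMeasures.pmi, 5)   # five highest-PMI bigrams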
# Natural Language Toolkit: Confusion Matrices
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-
+from __future__ import print_function, unicode_literals
from nltk.probability import FreqDist
+from nltk.compat import python_2_unicode_compatible
+@python_2_unicode_compatible
class ConfusionMatrix(object):
"""
The confusion matrix between a list of reference values and a
the same length.
"""
if len(reference) != len(test):
- raise ValueError("Lists must have the same length.")
+ raise ValueError('Lists must have the same length.')
# Get a list of all values.
if sort_by_count:
return self._confusion[i][j]
def __repr__(self):
- return "<ConfusionMatrix: %s/%s correct>" % (self._correct, self._total)
+ return '<ConfusionMatrix: %s/%s correct>' % (self._correct, self._total)
def __str__(self):
return self.pretty_format()
# Construct a format string for row values
valuelen = max(len(val) for val in value_strings)
- value_format = "%" + repr(valuelen) + "s | "
+ value_format = '%' + repr(valuelen) + 's | '
# Construct a format string for matrix entries
if show_percents:
entrylen = 6
- entry_format = "%5.1f%%"
- zerostr = " ."
+ entry_format = '%5.1f%%'
+ zerostr = ' .'
else:
entrylen = len(repr(self._max_conf))
- entry_format = "%" + repr(entrylen) + "d"
- zerostr = " " * (entrylen - 1) + "."
+ entry_format = '%' + repr(entrylen) + 'd'
+ zerostr = ' ' * (entrylen - 1) + '.'
# Write the column values.
- s = ""
+ s = ''
for i in range(valuelen):
- s += (" " * valuelen) + " |"
+ s += (' ' * valuelen) + ' |'
for val in value_strings:
if i >= valuelen - len(val):
s += val[i - valuelen + len(val)].rjust(entrylen + 1)
else:
- s += " " * (entrylen + 1)
- s += " |\n"
+ s += ' ' * (entrylen + 1)
+ s += ' |\n'
# Write a dividing line
- s += "%s-+-%s+\n" % ("-" * valuelen, "-" * ((entrylen + 1) * len(values)))
+ s += '%s-+-%s+\n' % ('-' * valuelen, '-' * ((entrylen + 1) * len(values)))
# Write the entries.
for val, li in zip(value_strings, values):
else:
s += entry_format % confusion[i][j]
if i == j:
- prevspace = s.rfind(" ")
- s = s[:prevspace] + "<" + s[prevspace + 1 :] + ">"
+ prevspace = s.rfind(' ')
+ s = s[:prevspace] + '<' + s[prevspace + 1 :] + '>'
else:
- s += " "
- s += "|\n"
+ s += ' '
+ s += '|\n'
# Write a dividing line
- s += "%s-+-%s+\n" % ("-" * valuelen, "-" * ((entrylen + 1) * len(values)))
+ s += '%s-+-%s+\n' % ('-' * valuelen, '-' * ((entrylen + 1) * len(values)))
# Write a key
- s += "(row = reference; col = test)\n"
+ s += '(row = reference; col = test)\n'
if not values_in_chart:
- s += "Value key:\n"
+ s += 'Value key:\n'
for i, value in enumerate(values):
- s += "%6d: %s\n" % (i + 1, value)
+ s += '%6d: %s\n' % (i + 1, value)
return s
def key(self):
values = self._values
- str = "Value key:\n"
+ str = 'Value key:\n'
indexlen = len(repr(len(values) - 1))
- key_format = " %" + repr(indexlen) + "d: %s\n"
+ key_format = ' %' + repr(indexlen) + 'd: %s\n'
for i in range(len(values)):
str += key_format % (i, values[i])
def demo():
- reference = "DET NN VB DET JJ NN NN IN DET NN".split()
- test = "DET VB VB DET NN NN NN IN DET NN".split()
- print("Reference =", reference)
- print("Test =", test)
- print("Confusion matrix:")
+ reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
+ test = 'DET VB VB DET NN NN NN IN DET NN'.split()
+ print('Reference =', reference)
+ print('Test =', test)
+ print('Confusion matrix:')
print(ConfusionMatrix(reference, test))
print(ConfusionMatrix(reference, test).pretty_format(sort_by_count=True))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
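# Editorial note (not part of this changeset): besides the pretty-printed table,
# individual cells can be read directly via __getitem__; cm['NN', 'VB'] is the
# number of tokens whose reference value was 'NN' but whose test value was 'VB'.
# With the demo data above:
#
#     >>> cm = ConfusionMatrix(reference, test)
#     >>> cm['NN', 'VB']
#     1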
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Distance Metrics
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Tom Lippincott <tom@cs.columbia.edu>
3. d(a, c) <= d(a, b) + d(b, c)
"""
+from __future__ import print_function
+from __future__ import division
+
import warnings
-import operator
def _edit_dist_init(len1, len2):
return lev[len1][len2]
-def _edit_dist_backtrace(lev):
- i, j = len(lev) - 1, len(lev[0]) - 1
- alignment = [(i, j)]
-
- while (i, j) != (0, 0):
- directions = [
- (i - 1, j), # skip s1
- (i, j - 1), # skip s2
- (i - 1, j - 1), # substitution
- ]
-
- direction_costs = (
- (lev[i][j] if (i >= 0 and j >= 0) else float("inf"), (i, j))
- for i, j in directions
- )
- _, (i, j) = min(direction_costs, key=operator.itemgetter(0))
-
- alignment.append((i, j))
- return list(reversed(alignment))
-
-
-def edit_distance_align(s1, s2, substitution_cost=1):
- """
- Calculate the minimum Levenshtein edit-distance based alignment
- mapping between two strings. The alignment finds the mapping
- from string s1 to s2 that minimizes the edit distance cost.
- For example, mapping "rain" to "shine" would involve 2
- substitutions, 2 matches and an insertion resulting in
- the following mapping:
- [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5)]
- NB: (0, 0) is the start state without any letters associated
- See more: https://web.stanford.edu/class/cs124/lec/med.pdf
-
- In case of multiple valid minimum-distance alignments, the
- backtrace has the following operation precedence:
- 1. Skip s1 character
- 2. Skip s2 character
- 3. Substitute s1 and s2 characters
- The backtrace is carried out in reverse string order.
-
- This function does not support transposition.
-
- :param s1, s2: The strings to be aligned
- :type s1: str
- :type s2: str
- :type substitution_cost: int
- :rtype List[Tuple(int, int)]
- """
- # set up a 2-D array
- len1 = len(s1)
- len2 = len(s2)
- lev = _edit_dist_init(len1 + 1, len2 + 1)
-
- # iterate over the array
- for i in range(len1):
- for j in range(len2):
- _edit_dist_step(
- lev,
- i + 1,
- j + 1,
- s1,
- s2,
- substitution_cost=substitution_cost,
- transpositions=False,
- )
-
- # backtrace to find alignment
- alignment = _edit_dist_backtrace(lev)
- return alignment
-
-
def binary_distance(label1, label2):
"""Simple equality test.
def custom_distance(file):
data = {}
- with open(file, "r") as infile:
+ with open(file, 'r') as infile:
for l in infile:
labelA, labelB, dist = l.strip().split("\t")
labelA = frozenset([labelA])
print("MASI distance:", masi_distance(s1, s2))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
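# Usage sketch (editorial note, not part of this changeset): the module-level
# distance functions can also be called directly, e.g.:
#
#     >>> from nltk.metrics.distance import edit_distance, jaccard_distance
#     >>> edit_distance('rain', 'shine')    # two substitutions plus one insertion
#     3
#     >>> jaccard_distance(set('rain'), set('shine'))   # 1 - |intersection| / |union| = 5/7
#     0.7142857142857143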
# Natural Language Toolkit: Agreement Metrics
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Lauri Hallila <laurihallila@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
def get_words_from_dictionary(lemmas):
- """
+ '''
Get original set of words used for analysis.
:param lemmas: A dictionary where keys are lemmas and values are sets
:type lemmas: dict(str): list(str)
:return: Set of words that exist as values in the dictionary
:rtype: set(str)
- """
+ '''
words = set()
for lemma in lemmas:
words.update(set(lemmas[lemma]))
def _truncate(words, cutlength):
- """Group words by stems defined by truncating them at given length.
+ '''Group words by stems defined by truncating them at given length.
:param words: Set of words used for analysis
:param cutlength: Words are stemmed by cutting at this length.
:return: Dictionary where keys are stems and values are sets of words
corresponding to that stem.
:rtype: dict(str): set(str)
- """
+ '''
stems = {}
for word in words:
stem = word[:cutlength]
# Reference: http://en.wikipedia.org/wiki/Line-line_intersection
def _count_intersection(l1, l2):
- """Count intersection between two line segments defined by coordinate pairs.
+ '''Count intersection between two line segments defined by coordinate pairs.
:param l1: Tuple of two coordinate pairs defining the first line segment
:param l2: Tuple of two coordinate pairs defining the second line segment
:type l2: tuple(float, float)
:return: Coordinates of the intersection
:rtype: tuple(float, float)
- """
+ '''
x1, y1 = l1[0]
x2, y2 = l1[1]
x3, y3 = l2[0]
def _get_derivative(coordinates):
- """Get derivative of the line from (0,0) to given coordinates.
+ '''Get derivative of the line from (0,0) to given coordinates.
:param coordinates: A coordinate pair
:type coordinates: tuple(float, float)
:return: Derivative; inf if x is zero
:rtype: float
- """
+ '''
try:
return coordinates[1] / coordinates[0]
except ZeroDivisionError:
- return float("inf")
+ return float('inf')
def _calculate_cut(lemmawords, stems):
- """Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.
+ '''Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.
:param lemmawords: Set or list of words corresponding to certain lemma.
:param stems: A dictionary where keys are stems and values are sets
:return: Amount of understemmed and overstemmed pairs contributed by words
existing in both lemmawords and stems.
:rtype: tuple(float, float)
- """
+ '''
umt, wmt = 0.0, 0.0
for stem in stems:
cut = set(lemmawords) & set(stems[stem])
def _calculate(lemmas, stems):
- """Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.
+ '''Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.
:param lemmas: A dictionary where keys are lemmas and values are sets
or lists of words corresponding to that lemma.
global wrongly merged total (gwmt) and
global desired non-merge total (gdnt).
:rtype: tuple(float, float, float, float)
- """
+ '''
n = sum(len(lemmas[word]) for word in lemmas)
def _indexes(gumt, gdmt, gwmt, gdnt):
- """Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).
+ '''Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).
:param gumt, gdmt, gwmt, gdnt: Global unachieved merge total (gumt),
global desired merge total (gdmt),
Overstemming Index (OI) and
Stemming Weight (SW).
:rtype: tuple(float, float, float)
- """
+ '''
# Calculate Understemming Index (UI),
# Overstemming Index (OI) and Stemming Weight (SW)
try:
except ZeroDivisionError:
if oi == 0.0:
# OI and UI are 0, define SW as 'not a number'
- sw = float("nan")
+ sw = float('nan')
else:
# UI is 0, define SW as infinity
- sw = float("inf")
+ sw = float('inf')
return (ui, oi, sw)
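# Illustrative recap of the ratios computed above, with hypothetical totals
# (the labels match Paice.__str__ below: UI = GUMT/GDMT, OI = GWMT/GDNT, SW = OI/UI):

gumt, gdmt, gwmt, gdnt = 2.0, 5.0, 1.0, 16.0
ui = gumt / gdmt  # Understemming Index  = 0.4
oi = gwmt / gdnt  # Overstemming Index   = 0.0625
sw = oi / ui      # Stemming Weight      = 0.15625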
class Paice(object):
- """Class for storing lemmas, stems and evaluation metrics."""
+ '''Class for storing lemmas, stems and evaluation metrics.'''
def __init__(self, lemmas, stems):
- """
+ '''
:param lemmas: A dictionary where keys are lemmas and values are sets
or lists of words corresponding to that lemma.
:param stems: A dictionary where keys are stems and values are sets
or lists of words corresponding to that stem.
:type lemmas: dict(str): list(str)
:type stems: dict(str): set(str)
- """
+ '''
self.lemmas = lemmas
self.stems = stems
self.coords = []
self.update()
def __str__(self):
- text = ["Global Unachieved Merge Total (GUMT): %s\n" % self.gumt]
- text.append("Global Desired Merge Total (GDMT): %s\n" % self.gdmt)
- text.append("Global Wrongly-Merged Total (GWMT): %s\n" % self.gwmt)
- text.append("Global Desired Non-merge Total (GDNT): %s\n" % self.gdnt)
- text.append("Understemming Index (GUMT / GDMT): %s\n" % self.ui)
- text.append("Overstemming Index (GWMT / GDNT): %s\n" % self.oi)
- text.append("Stemming Weight (OI / UI): %s\n" % self.sw)
- text.append("Error-Rate Relative to Truncation (ERRT): %s\r\n" % self.errt)
- coordinates = " ".join(["(%s, %s)" % item for item in self.coords])
- text.append("Truncation line: %s" % coordinates)
- return "".join(text)
+ text = ['Global Unachieved Merge Total (GUMT): %s\n' % self.gumt]
+ text.append('Global Desired Merge Total (GDMT): %s\n' % self.gdmt)
+ text.append('Global Wrongly-Merged Total (GWMT): %s\n' % self.gwmt)
+ text.append('Global Desired Non-merge Total (GDNT): %s\n' % self.gdnt)
+ text.append('Understemming Index (GUMT / GDMT): %s\n' % self.ui)
+ text.append('Overstemming Index (GWMT / GDNT): %s\n' % self.oi)
+ text.append('Stemming Weight (OI / UI): %s\n' % self.sw)
+ text.append('Error-Rate Relative to Truncation (ERRT): %s\r\n' % self.errt)
+ coordinates = ' '.join(['(%s, %s)' % item for item in self.coords])
+ text.append('Truncation line: %s' % coordinates)
+ return ''.join(text)
def _get_truncation_indexes(self, words, cutlength):
- """Count (UI, OI) when stemming is done by truncating words at \'cutlength\'.
+ '''Count (UI, OI) when stemming is done by truncating words at \'cutlength\'.
:param words: Words used for the analysis
:param cutlength: Words are stemmed by cutting them at this length
:type cutlength: int
:return: Understemming and overstemming indexes
:rtype: tuple(int, int)
- """
+ '''
truncated = _truncate(words, cutlength)
gumt, gdmt, gwmt, gdnt = _calculate(self.lemmas, truncated)
return (ui, oi)
def _get_truncation_coordinates(self, cutlength=0):
- """Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.
+ '''Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.
:param cutlength: Optional parameter to start counting from (ui, oi)
coordinates gotten by stemming at this length. Useful for speeding up
:type cutlength: int
:return: List of coordinate pairs that define the truncation line
:rtype: list(tuple(float, float))
- """
+ '''
words = get_words_from_dictionary(self.lemmas)
maxlength = max(len(word) for word in words)
return coords
def _errt(self):
- """Count Error-Rate Relative to Truncation (ERRT).
+ '''Count Error-Rate Relative to Truncation (ERRT).
:return: ERRT, the length of the line from the origin to (UI, OI) divided by
the length of the line from the origin to the point defined by the same
line when extended until it meets the truncation line.
:rtype: float
- """
+ '''
# Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line
self.coords = self._get_truncation_coordinates()
if (0.0, 0.0) in self.coords:
# Truncation line goes through the origin, so ERRT cannot be counted
if (self.ui, self.oi) != (0.0, 0.0):
- return float("inf")
+ return float('inf')
else:
- return float("nan")
+ return float('nan')
if (self.ui, self.oi) == (0.0, 0.0):
# (ui, oi) is the origin; define errt as 0.0
return 0.0
return op / ot
def update(self):
- """Update statistics after lemmas and stems have been set."""
+ '''Update statistics after lemmas and stems have been set.'''
self.gumt, self.gdmt, self.gwmt, self.gdnt = _calculate(self.lemmas, self.stems)
self.ui, self.oi, self.sw = _indexes(self.gumt, self.gdmt, self.gwmt, self.gdnt)
self.errt = self._errt()
def demo():
- """Demonstration of the module."""
+ '''Demonstration of the module.'''
# Some words with their real lemmas
lemmas = {
- "kneel": ["kneel", "knelt"],
- "range": ["range", "ranged"],
- "ring": ["ring", "rang", "rung"],
+ 'kneel': ['kneel', 'knelt'],
+ 'range': ['range', 'ranged'],
+ 'ring': ['ring', 'rang', 'rung'],
}
# Same words with stems from a stemming algorithm
stems = {
- "kneel": ["kneel"],
- "knelt": ["knelt"],
- "rang": ["rang", "range", "ranged"],
- "ring": ["ring"],
- "rung": ["rung"],
+ 'kneel': ['kneel'],
+ 'knelt': ['knelt'],
+ 'rang': ['rang', 'range', 'ranged'],
+ 'ring': ['ring'],
+ 'rung': ['rung'],
}
- print("Words grouped by their lemmas:")
+ print('Words grouped by their lemmas:')
for lemma in sorted(lemmas):
- print("%s => %s" % (lemma, " ".join(lemmas[lemma])))
+ print('%s => %s' % (lemma, ' '.join(lemmas[lemma])))
print()
- print("Same words grouped by a stemming algorithm:")
+ print('Same words grouped by a stemming algorithm:')
for stem in sorted(stems):
- print("%s => %s" % (stem, " ".join(stems[stem])))
+ print('%s => %s' % (stem, ' '.join(stems[stem])))
print()
p = Paice(lemmas, stems)
print(p)
print()
# Let's "change" results from a stemming algorithm
stems = {
- "kneel": ["kneel"],
- "knelt": ["knelt"],
- "rang": ["rang"],
- "range": ["range", "ranged"],
- "ring": ["ring"],
- "rung": ["rung"],
+ 'kneel': ['kneel'],
+ 'knelt': ['knelt'],
+ 'rang': ['rang'],
+ 'range': ['range', 'ranged'],
+ 'ring': ['ring'],
+ 'rung': ['rung'],
}
- print("Counting stats after changing stemming results:")
+ print('Counting stats after changing stemming results:')
for stem in sorted(stems):
- print("%s => %s" % (stem, " ".join(stems[stem])))
+ print('%s => %s' % (stem, ' '.join(stems[stem])))
print()
p.stems = stems
p.update()
print(p)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Evaluation
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, division
from math import fabs
import operator
from random import shuffle
from functools import reduce
+from six.moves import range, zip
+
try:
from scipy.stats.stats import betai
except ImportError:
:param test: A set of values to compare against the reference set.
:rtype: float or None
"""
- if not hasattr(reference, "intersection") or not hasattr(test, "intersection"):
- raise TypeError("reference and test should be sets")
+ if not hasattr(reference, 'intersection') or not hasattr(test, 'intersection'):
+ raise TypeError('reference and test should be sets')
if len(test) == 0:
return None
:param test: A set of values to compare against the reference set.
:rtype: float or None
"""
- if not hasattr(reference, "intersection") or not hasattr(test, "intersection"):
- raise TypeError("reference and test should be sets")
+ if not hasattr(reference, 'intersection') or not hasattr(test, 'intersection'):
+ raise TypeError('reference and test should be sets')
if len(reference) == 0:
return None
:param b: another list of independently generated test values
:type b: list
"""
- shuffles = kwargs.get("shuffles", 999)
+ shuffles = kwargs.get('shuffles', 999)
# there's no point in trying to shuffle beyond all possible permutations
shuffles = min(shuffles, reduce(operator.mul, range(1, len(a) + len(b) + 1)))
- stat = kwargs.get("statistic", lambda lst: sum(lst) / len(lst))
- verbose = kwargs.get("verbose", False)
+ stat = kwargs.get('statistic', lambda lst: sum(lst) / len(lst))
+ verbose = kwargs.get('verbose', False)
if verbose:
- print("shuffles: %d" % shuffles)
+ print('shuffles: %d' % shuffles)
actual_stat = fabs(stat(a) - stat(b))
if verbose:
- print("actual statistic: %f" % actual_stat)
- print("-" * 60)
+ print('actual statistic: %f' % actual_stat)
+ print('-' * 60)
c = 1e-100
lst = LazyConcatenation([a, b])
for i in range(shuffles):
if verbose and i % 10 == 0:
- print("shuffle: %d" % i)
+ print('shuffle: %d' % i)
shuffle(indices)
c += 1
if verbose and i % 10 == 0:
- print("pseudo-statistic: %f" % pseudo_stat)
- print("significance: %f" % ((c + 1) / (i + 1)))
- print("-" * 60)
+ print('pseudo-statistic: %f' % pseudo_stat)
+ print('significance: %f' % ((c + 1) / (i + 1)))
+ print('-' * 60)
significance = (c + 1) / (shuffles + 1)
if verbose:
- print("significance: %f" % significance)
+ print('significance: %f' % significance)
if betai:
for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
print("prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi)))
def demo():
- print("-" * 75)
- reference = "DET NN VB DET JJ NN NN IN DET NN".split()
- test = "DET VB VB DET NN NN NN IN DET NN".split()
- print("Reference =", reference)
- print("Test =", test)
- print("Accuracy:", accuracy(reference, test))
-
- print("-" * 75)
+ print('-' * 75)
+ reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
+ test = 'DET VB VB DET NN NN NN IN DET NN'.split()
+ print('Reference =', reference)
+ print('Test =', test)
+ print('Accuracy:', accuracy(reference, test))
+
+ print('-' * 75)
reference_set = set(reference)
test_set = set(test)
- print("Reference =", reference_set)
- print("Test = ", test_set)
- print("Precision:", precision(reference_set, test_set))
- print(" Recall:", recall(reference_set, test_set))
- print("F-Measure:", f_measure(reference_set, test_set))
- print("-" * 75)
+ print('Reference =', reference_set)
+ print('Test = ', test_set)
+ print('Precision:', precision(reference_set, test_set))
+ print(' Recall:', recall(reference_set, test_set))
+ print('F-Measure:', f_measure(reference_set, test_set))
+ print('-' * 75)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Text Segmentation Metrics
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# David Doukhan <david.doukhan@gmail.com>
except ImportError:
pass
+from six.moves import range
+
def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
"""
mat[i + 1, j + 1] = min(tcost, shift_cost)
-def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1"):
+def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary='1'):
"""
Compute the Generalized Hamming Distance for a reference and a hypothetical
segmentation, corresponding to the cost related to the transformation
# Beeferman's Pk text segmentation evaluation metric
-def pk(ref, hyp, k=None, boundary="1"):
+def pk(ref, hyp, k=None, boundary='1'):
"""
Compute the Pk metric for a pair of segmentations. A segmentation
is any sequence over a vocabulary of two items (e.g. "0", "1"),
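# Usage sketch for the two segmentation metrics above (segmentations are
# strings over "0"/"1"; k is the windowdiff window size, and pk chooses a
# default window when k is None; outputs are not asserted here):

from nltk.metrics.segmentation import windowdiff, pk

ref = '0100100000'
hyp = '0100010000'
print(windowdiff(ref, hyp, 3))
print(pk(ref, hyp))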
# Natural Language Toolkit: Spearman Rank Correlation
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
+from __future__ import division
"""
Tools for comparing ranked lists.
# Natural Language Toolkit: Miscellaneous modules
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
module is kept in NLTK source code in order to provide better error
messages for people following the NLTK Book 2.0.
"""
+from __future__ import print_function
def babelize_shell():
(CHOMSKY n) -- for example
(CHOMSKY 5) generates half a screen of linguistic truth.
"""
+from __future__ import print_function
leadins = """To characterize a linguistic level L,
On the other hand,
import textwrap, random
from itertools import chain, islice
+from six.moves import zip
+
def generate_chomsky(times=5, line_length=72):
parts = []
print(textwrap.fill(" ".join(output), line_length))
-if __name__ == "__main__":
+if __name__ == '__main__':
generate_chomsky()
# Natural Language Toolkit: Minimal Sets
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
# Natural Language Toolkit: List Sorting
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
illustrate the many different algorithms (recipes) for solving a
problem, and how to analyze algorithms experimentally.
"""
+from __future__ import print_function, division
+
# These algorithms are taken from:
# Levitin (2004) The Design and Analysis of Algorithms
)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Word Finder
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# Simplified from PHP version by Robert Klein <brathna@gmail.com>
# http://fswordfinder.sourceforge.net/
+from __future__ import print_function
import random
return step(word, x, lambda i: x, y, lambda i: y - i, grid)
-def wordfinder(words, rows=20, cols=20, attempts=50, alph="ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
+def wordfinder(words, rows=20, cols=20, attempts=50, alph='ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
"""
Attempt to arrange words into a letter-grid with the specified
number of rows and columns. Try each word in several positions
# Fill up the remaining spaces
for i in range(rows):
for j in range(cols):
- if grid[i][j] == "":
+ if grid[i][j] == '':
grid[i][j] = random.choice(alph)
return grid, used
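# Usage sketch for wordfinder above (hypothetical word list; the function
# returns the filled letter grid and the subset of words it managed to place):

grid, used = wordfinder(["PYTHON", "GRAMMAR", "CORPUS"], rows=10, cols=10)
print(len(grid), len(grid[0]))  # 10 10
print(used)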
print("Word Finder\n")
for i in range(len(grid)):
for j in range(len(grid[i])):
- print(grid[i][j], end=" ")
+ print(grid[i][j], end=' ')
print()
print()
print("%d:" % (i + 1), used[i])
-if __name__ == "__main__":
+if __name__ == '__main__':
word_finder()
# Natural Language Toolkit: Parsers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# Natural Language Toolkit: Parser API
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
#
# Author: David McClosky <dmcc@bigasterisk.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
+
from nltk.parse.api import ParserI
from nltk.tree import Tree
on BLLIP Parser's Python interface.
"""
-__all__ = ["BllipParser"]
+__all__ = ['BllipParser']
# this block allows this module to be imported even if bllipparser isn't
# available
def _ensure_ascii(words):
try:
for i, word in enumerate(words):
- word.decode("ascii")
+ word.decode('ascii')
except UnicodeDecodeError:
raise ValueError(
"Token %d (%r) is non-ASCII. BLLIP Parser "
from nltk.data import find
- model_dir = find("models/bllip_wsj_no_aux").path
+ model_dir = find('models/bllip_wsj_no_aux').path
- print("Loading BLLIP Parsing models...")
+ print('Loading BLLIP Parsing models...')
# the easiest way to get started is to use a unified model
bllip = BllipParser.from_unified_model_dir(model_dir)
- print("Done.")
+ print('Done.')
- sentence1 = "British left waffles on Falklands .".split()
- sentence2 = "I saw the man with the telescope .".split()
+ sentence1 = 'British left waffles on Falklands .'.split()
+ sentence2 = 'I saw the man with the telescope .'.split()
# this sentence is known to fail under the WSJ parsing model
- fail1 = "# ! ? : -".split()
+ fail1 = '# ! ? : -'.split()
for sentence in (sentence1, sentence2, fail1):
- print("Sentence: %r" % " ".join(sentence))
+ print('Sentence: %r' % ' '.join(sentence))
try:
tree = next(bllip.parse(sentence))
print(tree)
# n-best parsing demo
for i, parse in enumerate(bllip.parse(sentence1)):
- print("parse %d:\n%s" % (i, parse))
+ print('parse %d:\n%s' % (i, parse))
# using external POS tag constraints
print(
"forcing 'tree' to be 'NN':",
- next(bllip.tagged_parse([("A", None), ("tree", "NN")])),
+ next(bllip.tagged_parse([('A', None), ('tree', 'NN')])),
)
print(
"forcing 'A' to be 'DT' and 'tree' to be 'NNP':",
- next(bllip.tagged_parse([("A", "DT"), ("tree", "NNP")])),
+ next(bllip.tagged_parse([('A', 'DT'), ('tree', 'NNP')])),
)
# constraints don't have to make sense... (though on more complicated
# sentences, they may cause the parse to fail)
print(
"forcing 'A' to be 'NNP':",
- next(bllip.tagged_parse([("A", "NNP"), ("tree", None)])),
+ next(bllip.tagged_parse([('A', 'NNP'), ('tree', None)])),
)
_ensure_bllip_import_or_error()
except ImportError:
raise SkipTest(
- "doctests from nltk.parse.bllip are skipped because "
- "the bllipparser module is not installed"
+ 'doctests from nltk.parse.bllip are skipped because '
+ 'the bllipparser module is not installed'
)
# -*- coding: utf-8 -*-
# Natural Language Toolkit: A Chart Parser
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Jean Mark Gawron <gawron@mail.sdsu.edu>
- ``SteppingChartParser`` is a subclass of ``ChartParser`` that can
be used to step through the parsing process.
"""
+from __future__ import print_function, division, unicode_literals
import itertools
import re
import warnings
from functools import total_ordering
+from six.moves import range
+
from nltk.tree import Tree
from nltk.grammar import PCFG, is_nonterminal, is_terminal
from nltk.util import OrderedDict
from nltk.internals import raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible, unicode_repr
from nltk.parse.api import ParserI
def __init__(self):
if self.__class__ == EdgeI:
- raise TypeError("Edge is an abstract interface")
+ raise TypeError('Edge is an abstract interface')
# ////////////////////////////////////////////////////////////
# Span
return self._hash
+@python_2_unicode_compatible
class TreeEdge(EdgeI):
"""
An edge that records the fact that a tree is (partially)
# String representation
def __str__(self):
- str = "[%s:%s] " % (self._span[0], self._span[1])
- str += "%-2r ->" % (self._lhs,)
+ str = '[%s:%s] ' % (self._span[0], self._span[1])
+ str += '%-2r ->' % (self._lhs,)
for i in range(len(self._rhs)):
if i == self._dot:
- str += " *"
- str += " %s" % repr(self._rhs[i])
+ str += ' *'
+ str += ' %s' % unicode_repr(self._rhs[i])
if len(self._rhs) == self._dot:
- str += " *"
+ str += ' *'
return str
def __repr__(self):
- return "[Edge: %s]" % self
+ return '[Edge: %s]' % self
+@python_2_unicode_compatible
class LeafEdge(EdgeI):
"""
An edge that records the fact that a leaf value is consistent with
# String representations
def __str__(self):
- return "[%s:%s] %s" % (self._index, self._index + 1, repr(self._leaf))
+ return '[%s:%s] %s' % (self._index, self._index + 1, unicode_repr(self._leaf))
def __repr__(self):
- return "[Edge: %s]" % (self)
+ return '[Edge: %s]' % (self)
########################################################################
# Make sure it's a valid index.
for key in restr_keys:
if not hasattr(EdgeI, key):
- raise ValueError("Bad restriction: %s" % key)
+ raise ValueError('Bad restriction: %s' % key)
# Create the index.
index = self._indexes[restr_keys] = {}
width = 50 // (self.num_leaves() + 1)
(start, end) = (edge.start(), edge.end())
- str = "|" + ("." + " " * (width - 1)) * start
+ str = '|' + ('.' + ' ' * (width - 1)) * start
# Zero-width edges are "#" if complete, ">" if incomplete
if start == end:
if edge.is_complete():
- str += "#"
+ str += '#'
else:
- str += ">"
+ str += '>'
# Spanning complete edges are "[===]"; Other edges are
# "[---]" if complete, "[--->" if incomplete
elif edge.is_complete() and edge.span() == (0, self._num_leaves):
- str += "[" + ("=" * width) * (end - start - 1) + "=" * (width - 1) + "]"
+ str += '[' + ('=' * width) * (end - start - 1) + '=' * (width - 1) + ']'
elif edge.is_complete():
- str += "[" + ("-" * width) * (end - start - 1) + "-" * (width - 1) + "]"
+ str += '[' + ('-' * width) * (end - start - 1) + '-' * (width - 1) + ']'
else:
- str += "[" + ("-" * width) * (end - start - 1) + "-" * (width - 1) + ">"
+ str += '[' + ('-' * width) * (end - start - 1) + '-' * (width - 1) + '>'
- str += (" " * (width - 1) + ".") * (self._num_leaves - end)
- return str + "| %s" % edge
+ str += (' ' * (width - 1) + '.') * (self._num_leaves - end)
+ return str + '| %s' % edge
def pretty_format_leaves(self, width=None):
"""
width = 50 // (self.num_leaves() + 1)
if self._tokens is not None and width > 1:
- header = "|."
+ header = '|.'
for tok in self._tokens:
- header += tok[: width - 1].center(width - 1) + "."
- header += "|"
+ header += tok[: width - 1].center(width - 1) + '.'
+ header += '|'
else:
- header = ""
+ header = ''
return header
return (
self.pretty_format_leaves(width)
- + "\n"
- + "\n".join(self.pretty_format_edge(edge, width) for edge in edges)
+ + '\n'
+ + '\n'.join(self.pretty_format_edge(edge, width) for edge in edges)
)
# ////////////////////////////////////////////////////////////
def dot_digraph(self):
# Header
- s = "digraph nltk_chart {\n"
+ s = 'digraph nltk_chart {\n'
# s += ' size="5,5";\n'
- s += " rankdir=LR;\n"
- s += " node [height=0.1,width=0.1];\n"
+ s += ' rankdir=LR;\n'
+ s += ' node [height=0.1,width=0.1];\n'
s += ' node [style=filled, color="lightgray"];\n'
# Set up the nodes
s += ' %04d.%04d [label=""];\n' % (x, y)
# Add a spacer
- s += " x [style=invis]; x->0000.0000 [style=invis];\n"
+ s += ' x [style=invis]; x->0000.0000 [style=invis];\n'
# Declare ranks.
for x in range(self.num_leaves() + 1):
- s += " {rank=same;"
+ s += ' {rank=same;'
for y in range(self.num_edges() + 1):
if y == 0 or (
x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end()
):
- s += " %04d.%04d" % (x, y)
- s += "}\n"
+ s += ' %04d.%04d' % (x, y)
+ s += '}\n'
# Add the leaves
- s += " edge [style=invis, weight=100];\n"
- s += " node [shape=plaintext]\n"
- s += " 0000.0000"
+ s += ' edge [style=invis, weight=100];\n'
+ s += ' node [shape=plaintext]\n'
+ s += ' 0000.0000'
for x in range(self.num_leaves()):
- s += "->%s->%04d.0000" % (self.leaf(x), x + 1)
- s += ";\n\n"
+ s += '->%s->%04d.0000' % (self.leaf(x), x + 1)
+ s += ';\n\n'
# Add the edges
- s += " edge [style=solid, weight=1];\n"
+ s += ' edge [style=solid, weight=1];\n'
for y, edge in enumerate(self):
for x in range(edge.start()):
s += ' %04d.%04d -> %04d.%04d [style="invis"];\n' % (
x + 1,
y + 1,
)
- s += "}\n"
+ s += '}\n'
return s
raise NotImplementedError()
+@python_2_unicode_compatible
class AbstractChartRule(ChartRuleI):
"""
An abstract base class for chart rules. ``AbstractChartRule``
yield new_edge
else:
- raise AssertionError("NUM_EDGES>3 is not currently supported")
+ raise AssertionError('NUM_EDGES>3 is not currently supported')
# Default: return a name based on the class name.
def __str__(self):
# Add spaces between InitialCapsWords.
- return re.sub("([a-z])([A-Z])", r"\1 \2", self.__class__.__name__)
+ return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
# ////////////////////////////////////////////////////////////
print_rule_header = trace > 1
for edge in new_edges:
if print_rule_header:
- print("%s:" % rule)
+ print('%s:' % rule)
print_rule_header = False
print(chart.pretty_format_edge(edge, edge_width))
added with the current strategy and grammar.
"""
if self._chart is None:
- raise ValueError("Parser must be initialized first")
+ raise ValueError('Parser must be initialized first')
while True:
self._restart = False
w = 50 // (self._chart.num_leaves() + 1)
print_grammar=False,
print_trees=True,
trace=2,
- sent="I saw John with a dog with my cookie",
+ sent='I saw John with a dog with my cookie',
numparses=5,
):
"""
# Ask the user which parser to test,
# if the parser wasn't provided as an argument
if choice is None:
- print(" 1: Top-down chart parser")
- print(" 2: Bottom-up chart parser")
- print(" 3: Bottom-up left-corner chart parser")
- print(" 4: Left-corner chart parser with bottom-up filter")
- print(" 5: Stepping chart parser (alternating top-down & bottom-up)")
- print(" 6: All parsers")
- print("\nWhich parser (1-6)? ", end=" ")
+ print(' 1: Top-down chart parser')
+ print(' 2: Bottom-up chart parser')
+ print(' 3: Bottom-up left-corner chart parser')
+ print(' 4: Left-corner chart parser with bottom-up filter')
+ print(' 5: Stepping chart parser (alternating top-down & bottom-up)')
+ print(' 6: All parsers')
+ print('\nWhich parser (1-6)? ', end=' ')
choice = sys.stdin.readline().strip()
print()
choice = str(choice)
if choice not in "123456":
- print("Bad parser number")
+ print('Bad parser number')
return
# Keep track of how long each parser takes.
times = {}
strategies = {
- "1": ("Top-down", TD_STRATEGY),
- "2": ("Bottom-up", BU_STRATEGY),
- "3": ("Bottom-up left-corner", BU_LC_STRATEGY),
- "4": ("Filtered left-corner", LC_STRATEGY),
+ '1': ('Top-down', TD_STRATEGY),
+ '2': ('Bottom-up', BU_STRATEGY),
+ '3': ('Bottom-up left-corner', BU_LC_STRATEGY),
+ '4': ('Filtered left-corner', LC_STRATEGY),
}
choices = []
if choice in strategies:
choices = [choice]
- if choice == "6":
+ if choice == '6':
choices = "1234"
# Run the requested chart parser(s), except the stepping parser.
times[strategies[strategy][0]] = time.time() - t
print("Nr edges in chart:", len(chart.edges()))
if numparses:
- assert len(parses) == numparses, "Not all parses found"
+ assert len(parses) == numparses, 'Not all parses found'
if print_trees:
for tree in parses:
print(tree)
cp = SteppingChartParser(grammar, trace=trace)
cp.initialize(tokens)
for i in range(5):
- print("*** SWITCH TO TOP DOWN")
+ print('*** SWITCH TO TOP DOWN')
cp.set_strategy(TD_STRATEGY)
for j, e in enumerate(cp.step()):
if j > 20 or e is None:
break
- print("*** SWITCH TO BOTTOM UP")
+ print('*** SWITCH TO BOTTOM UP')
cp.set_strategy(BU_STRATEGY)
for j, e in enumerate(cp.step()):
if j > 20 or e is None:
break
- times["Stepping"] = time.time() - t
+ times['Stepping'] = time.time() - t
print("Nr edges in chart:", len(cp.chart().edges()))
if numparses:
- assert len(list(cp.parses())) == numparses, "Not all parses found"
+ assert len(list(cp.parses())) == numparses, 'Not all parses found'
if print_trees:
for tree in cp.parses():
print(tree)
print("* Parsing times")
print()
maxlen = max(len(key) for key in times)
- format = "%" + repr(maxlen) + "s parser: %6.3fsec"
+ format = '%' + repr(maxlen) + 's parser: %6.3fsec'
times_items = times.items()
for (parser, t) in sorted(times_items, key=lambda a: a[1]):
print(format % (parser, t))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
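# Usage sketch for the chart parsers above (toy grammar; nltk.ChartParser
# uses a bottom-up left-corner strategy by default):

from nltk import CFG, ChartParser

toy_grammar = CFG.fromstring("""
    S -> NP VP
    NP -> 'I' | Det N
    VP -> V NP
    Det -> 'a'
    N -> 'dog'
    V -> 'saw'
""")
for tree in ChartParser(toy_grammar).parse("I saw a dog".split()):
    print(tree)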
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the CoreNLP REST API.
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Dmitrijs Milajevs <dimazest@gmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
+
import re
import json
import time
from unittest import skip
-_stanford_url = "http://stanfordnlp.github.io/CoreNLP/"
+_stanford_url = 'http://stanfordnlp.github.io/CoreNLP/'
class CoreNLPServerError(EnvironmentError):
def try_port(port=0):
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- sock.bind(("", port))
+ sock.bind(('', port))
p = sock.getsockname()[1]
sock.close()
class CoreNLPServer(object):
- _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar"
- _JAR = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar"
+ _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar'
+ _JAR = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar'
def __init__(
self,
):
if corenlp_options is None:
- corenlp_options = ["-preload", "tokenize,ssplit,pos,lemma,parse,depparse"]
+ corenlp_options = ['-preload', 'tokenize,ssplit,pos,lemma,parse,depparse']
jars = list(
find_jar_iter(
self._JAR,
path_to_jar,
- env_vars=("CORENLP",),
+ env_vars=('CORENLP',),
searchpath=(),
url=_stanford_url,
verbose=verbose,
else:
try_port(port)
- self.url = "http://localhost:{}".format(port)
+ self.url = 'http://localhost:{}'.format(port)
model_jar = max(
find_jar_iter(
self._MODEL_JAR_PATTERN,
path_to_models_jar,
- env_vars=("CORENLP_MODELS",),
+ env_vars=('CORENLP_MODELS',),
searchpath=(),
url=_stanford_url,
verbose=verbose,
self._classpath = stanford_jar, model_jar
self.corenlp_options = corenlp_options
- self.java_options = java_options or ["-mx2g"]
-
- def start(self, stdout="devnull", stderr="devnull"):
- """ Starts the CoreNLP server
+ self.java_options = java_options or ['-mx2g']
- :param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe'
- """
+ def start(self):
import requests
- cmd = ["edu.stanford.nlp.pipeline.StanfordCoreNLPServer"]
+ cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer']
if self.corenlp_options:
cmd.extend(self.corenlp_options)
# Configure java.
- default_options = " ".join(_java_options)
+ default_options = ' '.join(_java_options)
config_java(options=self.java_options, verbose=self.verbose)
try:
+ # TODO: it's probably a bad idea to pipe stdout, as it will
+ # accumulate when lots of text is being parsed.
self.popen = java(
cmd,
classpath=self._classpath,
blocking=False,
- stdout=stdout,
- stderr=stderr,
+ stdout='pipe',
+ stderr='pipe',
)
finally:
# Return java configurations to their default values.
_, stderrdata = self.popen.communicate()
raise CoreNLPServerError(
returncode,
- "Could not start the server. "
- "The error was: {}".format(stderrdata.decode("ascii")),
+ 'Could not start the server. '
+ 'The error was: {}'.format(stderrdata.decode('ascii')),
)
for i in range(30):
try:
- response = requests.get(requests.compat.urljoin(self.url, "live"))
+ response = requests.get(requests.compat.urljoin(self.url, 'live'))
except requests.exceptions.ConnectionError:
time.sleep(1)
else:
if response.ok:
break
else:
- raise CoreNLPServerError("Could not connect to the server.")
+ raise CoreNLPServerError('Could not connect to the server.')
for i in range(60):
try:
- response = requests.get(requests.compat.urljoin(self.url, "ready"))
+ response = requests.get(requests.compat.urljoin(self.url, 'ready'))
except requests.exceptions.ConnectionError:
time.sleep(1)
else:
if response.ok:
break
else:
- raise CoreNLPServerError("The server is not ready.")
+ raise CoreNLPServerError('The server is not ready.')
def stop(self):
self.popen.terminate()
class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
"""Interface to the CoreNLP Parser."""
- def __init__(self, url="http://localhost:9000", encoding="utf8", tagtype=None):
+ def __init__(self, url='http://localhost:9000', encoding='utf8', tagtype=None):
import requests
self.url = url
self.encoding = encoding
- if tagtype not in ["pos", "ner", None]:
+ if tagtype not in ['pos', 'ner', None]:
raise ValueError("tagtype must be either 'pos', 'ner' or None")
self.tagtype = tagtype
:rtype: iter(iter(Tree))
"""
# Converting list(list(str)) -> list(str)
- sentences = (" ".join(words) for words in sentences)
+ sentences = (' '.join(words) for words in sentences)
return self.raw_parse_sents(sentences, *args, **kwargs)
def raw_parse(self, sentence, properties=None, *args, **kwargs):
:type sentence: str
:rtype: iter(Tree)
"""
- default_properties = {"tokenize.whitespace": "false"}
+ default_properties = {'tokenize.whitespace': 'false'}
default_properties.update(properties or {})
return next(
)
)
- def api_call(self, data, properties=None, timeout=60):
+ def api_call(self, data, properties=None):
default_properties = {
- "outputFormat": "json",
- "annotators": "tokenize,pos,lemma,ssplit,{parser_annotator}".format(
+ 'outputFormat': 'json',
+ 'annotators': 'tokenize,pos,lemma,ssplit,{parser_annotator}'.format(
parser_annotator=self.parser_annotator
),
}
response = self.session.post(
self.url,
- params={"properties": json.dumps(default_properties)},
+ params={'properties': json.dumps(default_properties)},
data=data.encode(self.encoding),
- timeout=timeout,
+ timeout=60,
)
response.raise_for_status()
"""
default_properties = {
# Only splits on '\n', never inside the sentence.
- "ssplit.eolonly": "true"
+ 'ssplit.eolonly': 'true'
}
default_properties.update(properties or {})
tree = self.make_tree(parse)
yield iter([tree])
"""
- parsed_data = self.api_call("\n".join(sentences), properties=default_properties)
- for parsed_sent in parsed_data["sentences"]:
+ parsed_data = self.api_call('\n'.join(sentences), properties=default_properties)
+ for parsed_sent in parsed_data['sentences']:
tree = self.make_tree(parsed_sent)
yield iter([tree])
"""
parsed_data = self.api_call(text, *args, **kwargs)
- for parse in parsed_data["sentences"]:
+ for parse in parsed_data['sentences']:
yield self.make_tree(parse)
def tokenize(self, text, properties=None):
['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
"""
- default_properties = {"annotators": "tokenize,ssplit"}
+ default_properties = {'annotators': 'tokenize,ssplit'}
default_properties.update(properties or {})
result = self.api_call(text, properties=default_properties)
- for sentence in result["sentences"]:
- for token in sentence["tokens"]:
- yield token["originalText"] or token["word"]
+ for sentence in result['sentences']:
+ for token in sentence['tokens']:
+ yield token['originalText'] or token['word']
def tag_sents(self, sentences):
"""
Takes multiple sentences as a list where each sentence is a list of
tokens.
-
+
:param sentences: Input sentences to tag
:type sentences: list(list(str))
:rtype: list(list(tuple(str, str))
"""
# Converting list(list(str)) -> list(str)
- sentences = (" ".join(words) for words in sentences)
+ sentences = (' '.join(words) for words in sentences)
return [sentences[0] for sentences in self.raw_tag_sents(sentences)]
def tag(self, sentence):
Tag multiple sentences.
Takes multiple sentences as a list where each sentence is a string.
-
+
:param sentences: Input sentences to tag
:type sentences: list(str)
:rtype: list(list(list(tuple(str, str)))
"""
default_properties = {
- "ssplit.isOneSentence": "true",
- "annotators": "tokenize,ssplit,",
+ 'ssplit.isOneSentence': 'true',
+ 'annotators': 'tokenize,ssplit,',
}
# Supports only 'pos' or 'ner' tags.
- assert self.tagtype in ["pos", "ner"]
- default_properties["annotators"] += self.tagtype
+ assert self.tagtype in ['pos', 'ner']
+ default_properties['annotators'] += self.tagtype
for sentence in sentences:
tagged_data = self.api_call(sentence, properties=default_properties)
yield [
[
- (token["word"], token[self.tagtype])
- for token in tagged_sentence["tokens"]
+ (token['word'], token[self.tagtype])
+ for token in tagged_sentence['tokens']
]
- for tagged_sentence in tagged_data["sentences"]
+ for tagged_sentence in tagged_data['sentences']
]
"""
- _OUTPUT_FORMAT = "penn"
- parser_annotator = "parse"
+ _OUTPUT_FORMAT = 'penn'
+ parser_annotator = 'parse'
def make_tree(self, result):
- return Tree.fromstring(result["parse"])
+ return Tree.fromstring(result['parse'])
class CoreNLPDependencyParser(GenericCoreNLPParser):
"""
- _OUTPUT_FORMAT = "conll2007"
- parser_annotator = "depparse"
+ _OUTPUT_FORMAT = 'conll2007'
+ parser_annotator = 'depparse'
def make_tree(self, result):
return DependencyGraph(
(
- " ".join(n_items[1:]) # NLTK expects an iterable of strings...
+ ' '.join(n_items[1:]) # NLTK expects an iterable of strings...
for n_items in sorted(transform(result))
),
- cell_separator=" ", # To make sure that a non-breaking space is kept inside of a token.
+ cell_separator=' ', # To make sure that a non-breaking space is kept inside of a token.
)
def transform(sentence):
- for dependency in sentence["basicDependencies"]:
+ for dependency in sentence['basicDependencies']:
- dependent_index = dependency["dependent"]
- token = sentence["tokens"][dependent_index - 1]
+ dependent_index = dependency['dependent']
+ token = sentence['tokens'][dependent_index - 1]
# Return values that we don't know as '_'. Also, consider tag and ctag
# to be equal.
yield (
dependent_index,
- "_",
- token["word"],
- token["lemma"],
- token["pos"],
- token["pos"],
- "_",
- str(dependency["governor"]),
- dependency["dep"],
- "_",
- "_",
+ '_',
+ token['word'],
+ token['lemma'],
+ token['pos'],
+ token['pos'],
+ '_',
+ str(dependency['governor']),
+ dependency['dep'],
+ '_',
+ '_',
)
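# Usage sketch for the two concrete parsers built on GenericCoreNLPParser
# (assumes a CoreNLP server is already listening on localhost:9000, e.g. one
# started with CoreNLPServer above):

from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser

parser = CoreNLPParser(url='http://localhost:9000')
print(next(parser.raw_parse('The quick brown fox jumps over the lazy dog .')))

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parse, = dep_parser.raw_parse('The quick brown fox jumps over the lazy dog .')
print(parse.to_conll(4))

pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
print(pos_tagger.tag('What is the airspeed of an unladen swallow ?'.split()))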
-@skip("Skipping all CoreNLP tests.")
+@skip('Skipping all CoreNLP tests.')
def setup_module(module):
from nose import SkipTest
try:
server = CoreNLPServer(port=9000)
except LookupError as e:
- raise SkipTest("Could not instantiate CoreNLPServer.")
+ raise SkipTest('Could not instantiate CoreNLPServer.')
try:
server.start()
except CoreNLPServerError as e:
raise SkipTest(
- "Skipping CoreNLP tests because the server could not be started. "
- "Make sure that the 9000 port is free. "
- "{}".format(e.strerror)
+ 'Skipping CoreNLP tests because the server could not be started. '
+ 'Make sure that the 9000 port is free. '
+ '{}'.format(e.strerror)
)
-@skip("Skipping all CoreNLP tests.")
+@skip('Skipping all CoreNLP tests.')
def teardown_module(module):
server.stop()
# Natural Language Toolkit: Dependency Grammars
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Jason Narad <jason.narad@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (modifications)
#
The input is assumed to be in Malt-TAB format
(http://stp.lingfil.uu.se/~nivre/research/MaltXML.html).
"""
+from __future__ import print_function, unicode_literals
from collections import defaultdict
from itertools import chain
import subprocess
import warnings
+from six import string_types
+
from nltk.tree import Tree
+from nltk.compat import python_2_unicode_compatible
+
#################################################################
# DependencyGraph Class
#################################################################
+@python_2_unicode_compatible
class DependencyGraph(object):
"""
A container for the nodes and labelled edges of a dependency structure.
cell_extractor=None,
zero_based=False,
cell_separator=None,
- top_relation_label="ROOT",
+ top_relation_label='ROOT',
):
"""Dependency graph.
"""
self.nodes = defaultdict(
lambda: {
- "address": None,
- "word": None,
- "lemma": None,
- "ctag": None,
- "tag": None,
- "feats": None,
- "head": None,
- "deps": defaultdict(list),
- "rel": None,
+ 'address': None,
+ 'word': None,
+ 'lemma': None,
+ 'ctag': None,
+ 'tag': None,
+ 'feats': None,
+ 'head': None,
+ 'deps': defaultdict(list),
+ 'rel': None,
}
)
- self.nodes[0].update({"ctag": "TOP", "tag": "TOP", "address": 0})
+ self.nodes[0].update({'ctag': 'TOP', 'tag': 'TOP', 'address': 0})
self.root = None
"""
for node in self.nodes.values():
new_deps = []
- for dep in node["deps"]:
+ for dep in node['deps']:
if dep in originals:
new_deps.append(redirect)
else:
new_deps.append(dep)
- node["deps"] = new_deps
+ node['deps'] = new_deps
def add_arc(self, head_address, mod_address):
"""
Adds an arc from the node specified by head_address to the
node specified by the mod address.
"""
- relation = self.nodes[mod_address]["rel"]
- self.nodes[head_address]["deps"].setdefault(relation, [])
- self.nodes[head_address]["deps"][relation].append(mod_address)
+ relation = self.nodes[mod_address]['rel']
+ self.nodes[head_address]['deps'].setdefault(relation, [])
+ self.nodes[head_address]['deps'][relation].append(mod_address)
# self.nodes[head_address]['deps'].append(mod_address)
def connect_graph(self):
"""
for node1 in self.nodes.values():
for node2 in self.nodes.values():
- if node1["address"] != node2["address"] and node2["rel"] != "TOP":
- relation = node2["rel"]
- node1["deps"].setdefault(relation, [])
- node1["deps"][relation].append(node2["address"])
+ if node1['address'] != node2['address'] and node2['rel'] != 'TOP':
+ relation = node2['rel']
+ node1['deps'].setdefault(relation, [])
+ node1['deps'][relation].append(node2['address'])
# node1['deps'].append(node2['address'])
def get_by_address(self, node_address):
"""
# Start the digraph specification
- s = "digraph G{\n"
- s += "edge [dir=forward]\n"
- s += "node [shape=plaintext]\n"
+ s = 'digraph G{\n'
+ s += 'edge [dir=forward]\n'
+ s += 'node [shape=plaintext]\n'
# Draw the remaining nodes
- for node in sorted(self.nodes.values(), key=lambda v: v["address"]):
+ for node in sorted(self.nodes.values(), key=lambda v: v['address']):
s += '\n%s [label="%s (%s)"]' % (
- node["address"],
- node["address"],
- node["word"],
+ node['address'],
+ node['address'],
+ node['word'],
)
- for rel, deps in node["deps"].items():
+ for rel, deps in node['deps'].items():
for dep in deps:
if rel is not None:
- s += '\n%s -> %s [label="%s"]' % (node["address"], dep, rel)
+ s += '\n%s -> %s [label="%s"]' % (node['address'], dep, rel)
else:
- s += "\n%s -> %s " % (node["address"], dep)
+ s += '\n%s -> %s ' % (node['address'], dep)
s += "\n}"
return s
try:
process = subprocess.Popen(
- ["dot", "-Tsvg"],
+ ['dot', '-Tsvg'],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
)
except OSError:
- raise Exception("Cannot find the dot binary from Graphviz package")
+ raise Exception('Cannot find the dot binary from Graphviz package')
out, err = process.communicate(dot_string)
if err:
raise Exception(
- "Cannot create svg representation by running dot from string: {}"
- "".format(dot_string)
+ 'Cannot create svg representation by running dot from string: {}'
+ ''.format(dot_string)
)
return out
@staticmethod
def load(
- filename, zero_based=False, cell_separator=None, top_relation_label="ROOT"
+ filename, zero_based=False, cell_separator=None, top_relation_label='ROOT'
):
"""
:param filename: a name of a file in Malt-TAB format
cell_separator=cell_separator,
top_relation_label=top_relation_label,
)
- for tree_str in infile.read().split("\n\n")
+ for tree_str in infile.read().split('\n\n')
]
def left_children(self, node_index):
Returns the number of left children under the node specified
by the given address.
"""
- children = chain.from_iterable(self.nodes[node_index]["deps"].values())
- index = self.nodes[node_index]["address"]
+ children = chain.from_iterable(self.nodes[node_index]['deps'].values())
+ index = self.nodes[node_index]['address']
return sum(1 for c in children if c < index)
def right_children(self, node_index):
Returns the number of right children under the node specified
by the given address.
"""
- children = chain.from_iterable(self.nodes[node_index]["deps"].values())
- index = self.nodes[node_index]["address"]
+ children = chain.from_iterable(self.nodes[node_index]['deps'].values())
+ index = self.nodes[node_index]['address']
return sum(1 for c in children if c > index)
def add_node(self, node):
- if not self.contains_address(node["address"]):
- self.nodes[node["address"]].update(node)
+ if not self.contains_address(node['address']):
+ self.nodes[node['address']].update(node)
def _parse(
self,
cell_extractor=None,
zero_based=False,
cell_separator=None,
- top_relation_label="ROOT",
+ top_relation_label='ROOT',
):
"""Parse a sentence.
def extract_3_cells(cells, index):
word, tag, head = cells
- return index, word, word, tag, tag, "", head, ""
+ return index, word, word, tag, tag, '', head, ''
def extract_4_cells(cells, index):
word, tag, head, rel = cells
- return index, word, word, tag, tag, "", head, rel
+ return index, word, word, tag, tag, '', head, rel
def extract_7_cells(cells, index):
line_index, word, lemma, tag, _, head, rel = cells
except ValueError:
# index can't be parsed as an integer, use default
pass
- return index, word, lemma, tag, tag, "", head, rel
+ return index, word, lemma, tag, tag, '', head, rel
def extract_10_cells(cells, index):
line_index, word, lemma, ctag, tag, feats, head, rel, _, _ = cells
10: extract_10_cells,
}
- if isinstance(input_, str):
- input_ = (line for line in input_.split("\n"))
+ if isinstance(input_, string_types):
+ input_ = (line for line in input_.split('\n'))
lines = (l.rstrip() for l in input_)
lines = (l for l in lines if l)
cell_extractor = extractors[cell_number]
except KeyError:
raise ValueError(
- "Number of tab-delimited fields ({0}) not supported by "
- "CoNLL(10) or Malt-Tab(4) format".format(cell_number)
+ 'Number of tab-delimited fields ({0}) not supported by '
+ 'CoNLL(10) or Malt-Tab(4) format'.format(cell_number)
)
try:
# extractor and doesn't accept or return an index.
word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells)
- if head == "_":
+ if head == '_':
continue
head = int(head)
self.nodes[index].update(
{
- "address": index,
- "word": word,
- "lemma": lemma,
- "ctag": ctag,
- "tag": tag,
- "feats": feats,
- "head": head,
- "rel": rel,
+ 'address': index,
+ 'word': word,
+ 'lemma': lemma,
+ 'ctag': ctag,
+ 'tag': tag,
+ 'feats': feats,
+ 'head': head,
+ 'rel': rel,
}
)
# Make sure that the fake root node has labeled dependencies.
if (cell_number == 3) and (head == 0):
rel = top_relation_label
- self.nodes[head]["deps"][rel].append(index)
+ self.nodes[head]['deps'][rel].append(index)
- if self.nodes[0]["deps"][top_relation_label]:
- root_address = self.nodes[0]["deps"][top_relation_label][0]
+ if self.nodes[0]['deps'][top_relation_label]:
+ root_address = self.nodes[0]['deps'][top_relation_label][0]
self.root = self.nodes[root_address]
self.top_relation_label = top_relation_label
else:
)
def _word(self, node, filter=True):
- w = node["word"]
+ w = node['word']
if filter:
- if w != ",":
+ if w != ',':
return w
return w
:return: either a word (if the indexed node is a leaf) or a ``Tree``.
"""
node = self.get_by_address(i)
- word = node["word"]
- deps = sorted(chain.from_iterable(node["deps"].values()))
+ word = node['word']
+ deps = sorted(chain.from_iterable(node['deps'].values()))
if deps:
return Tree(word, [self._tree(dep) for dep in deps])
"""
node = self.root
- word = node["word"]
- deps = sorted(chain.from_iterable(node["deps"].values()))
+ word = node['word']
+ deps = sorted(chain.from_iterable(node['deps'].values()))
return Tree(word, [self._tree(dep) for dep in deps])
def triples(self, node=None):
if not node:
node = self.root
- head = (node["word"], node["ctag"])
- for i in sorted(chain.from_iterable(node["deps"].values())):
+ head = (node['word'], node['ctag'])
+ for i in sorted(chain.from_iterable(node['deps'].values())):
dep = self.get_by_address(i)
- yield (head, dep["rel"], (dep["word"], dep["ctag"]))
+ yield (head, dep['rel'], (dep['word'], dep['ctag']))
for triple in self.triples(node=dep):
yield triple
def _hd(self, i):
try:
- return self.nodes[i]["head"]
+ return self.nodes[i]['head']
except IndexError:
return None
def _rel(self, i):
try:
- return self.nodes[i]["rel"]
+ return self.nodes[i]['rel']
except IndexError:
return None
distances = {}
for node in self.nodes.values():
- for dep in node["deps"]:
- key = tuple([node["address"], dep])
+ for dep in node['deps']:
+ key = tuple([node['address'], dep])
distances[key] = 1
for _ in self.nodes:
return False # return []?
def get_cycle_path(self, curr_node, goal_node_index):
- for dep in curr_node["deps"]:
+ for dep in curr_node['deps']:
if dep == goal_node_index:
- return [curr_node["address"]]
- for dep in curr_node["deps"]:
+ return [curr_node['address']]
+ for dep in curr_node['deps']:
path = self.get_cycle_path(self.get_by_address(dep), goal_node_index)
if len(path) > 0:
- path.insert(0, curr_node["address"])
+ path.insert(0, curr_node['address'])
return path
return []
"""
if style == 3:
- template = "{word}\t{tag}\t{head}\n"
+ template = '{word}\t{tag}\t{head}\n'
elif style == 4:
- template = "{word}\t{tag}\t{head}\t{rel}\n"
+ template = '{word}\t{tag}\t{head}\t{rel}\n'
elif style == 10:
template = (
- "{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n"
+ '{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n'
)
else:
raise ValueError(
- "Number of tab-delimited fields ({0}) not supported by "
- "CoNLL(10) or Malt-Tab(4) format".format(style)
+ 'Number of tab-delimited fields ({0}) not supported by '
+ 'CoNLL(10) or Malt-Tab(4) format'.format(style)
)
- return "".join(
+ return ''.join(
template.format(i=i, **node)
for i, node in sorted(self.nodes.items())
- if node["tag"] != "TOP"
+ if node['tag'] != 'TOP'
)
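# Usage sketch for the Malt-TAB input handled by _parse above (four
# tab-separated fields per token: word, tag, head, rel; to_conll(4) emits the
# same four-field format):

from nltk.parse.dependencygraph import DependencyGraph

dg = DependencyGraph('John\tNNP\t2\tSUB\nloves\tVBZ\t0\tROOT\nMary\tNNP\t2\tOBJ\n')
dg.tree().pprint()
print(dg.to_conll(4))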
def nx_graph(self):
]
self.nx_labels = {}
for n in nx_nodelist:
- self.nx_labels[n] = self.nodes[n]["word"]
+ self.nx_labels[n] = self.nodes[n]['word']
g = networkx.MultiDiGraph()
g.add_nodes_from(nx_nodelist)
networkx.draw_networkx_labels(g, pos, dg.nx_labels)
pylab.xticks([])
pylab.yticks([])
- pylab.savefig("tree.png")
+ pylab.savefig('tree.png')
pylab.show()
def conll_file_demo():
- print("Mass conll_read demo...")
- graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
+ print('Mass conll_read demo...')
+ graphs = [DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry]
for graph in graphs:
tree = graph.tree()
- print("\n")
+ print('\n')
tree.pprint()
dg = DependencyGraph(treebank_data)
print(dg.contains_cycle())
cyclic_dg = DependencyGraph()
- cyclic_dg.add_node({"word": None, "deps": [1], "rel": "TOP", "address": 0})
- cyclic_dg.add_node({"word": None, "deps": [2], "rel": "NTOP", "address": 1})
- cyclic_dg.add_node({"word": None, "deps": [4], "rel": "NTOP", "address": 2})
- cyclic_dg.add_node({"word": None, "deps": [1], "rel": "NTOP", "address": 3})
- cyclic_dg.add_node({"word": None, "deps": [3], "rel": "NTOP", "address": 4})
+ cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0})
+ cyclic_dg.add_node({'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1})
+ cyclic_dg.add_node({'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2})
+ cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3})
+ cyclic_dg.add_node({'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4})
print(cyclic_dg.contains_cycle())
16 . . Punc Punc punt 15 punct _ _
"""
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# -*- coding: utf-8 -*-
# Natural Language Toolkit: An Incremental Earley Chart Parser
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
# Rob Speer <rspeer@mit.edu>
# Edward Loper <edloper@gmail.com>
The main parser class is ``EarleyChartParser``, which is a top-down
algorithm, originally formulated by Jay Earley (1970).
"""
+from __future__ import print_function, division
-from time import perf_counter
+from six.moves import range
from nltk.parse.chart import (
Chart,
# Make sure it's a valid index.
for key in restr_keys:
if not hasattr(EdgeI, key):
- raise ValueError("Bad restriction: %s" % key)
+ raise ValueError('Bad restriction: %s' % key)
# Create the index.
index = self._indexes[restr_keys] = tuple({} for x in self._positions())
# Make sure it's a valid index.
for key in restr_keys:
if not hasattr(EdgeI, key):
- raise ValueError("Bad restriction: %s" % key)
+ raise ValueError('Bad restriction: %s' % key)
# Create the index.
index = self._indexes[restr_keys] = tuple({} for x in self._positions())
print_grammar=False,
print_trees=True,
trace=2,
- sent="I saw John with a dog with my cookie",
+ sent='I saw John with a dog with my cookie',
numparses=5,
):
"""
# Do the parsing.
earley = EarleyChartParser(grammar, trace=trace)
- t = perf_counter()
+ t = time.clock()
chart = earley.chart_parse(tokens)
parses = list(chart.parses(grammar.start()))
- t = perf_counter() - t
+ t = time.clock() - t
# Print results.
if numparses:
- assert len(parses) == numparses, "Not all parses found"
+ assert len(parses) == numparses, 'Not all parses found'
if print_trees:
for tree in parses:
print(tree)
print("Time:", t)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
#
# Author: Long Duong <longdt219@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import division
+
import unicodedata
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Chart Parser for Feature-Based Grammars
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Rob Speer <rspeer@mit.edu>
# Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
# URL: <http://nltk.org/>
Extension of chart parsing implementation to handle grammars with
feature structures as nodes.
"""
-from time import perf_counter
+from __future__ import print_function, unicode_literals
+from six.moves import range
+
+from nltk.compat import python_2_unicode_compatible
from nltk.featstruct import FeatStruct, unify, TYPE, find_variables
from nltk.sem import logic
from nltk.tree import Tree
# ////////////////////////////////////////////////////////////
+@python_2_unicode_compatible
class FeatureTreeEdge(TreeEdge):
"""
A specialized tree edge that allows shared variable bindings
def __str__(self):
if self.is_complete():
- return super().__str__()
+ return TreeEdge.__unicode__(self)
else:
- bindings = "{%s}" % ", ".join(
- "%s: %r" % item for item in sorted(self._bindings.items())
+ bindings = '{%s}' % ', '.join(
+ '%s: %r' % item for item in sorted(self._bindings.items())
)
- return "%s %s" % (super().__str__(), bindings)
+ return '%s %s' % (TreeEdge.__unicode__(self), bindings)
# ////////////////////////////////////////////////////////////
# Make sure it's a valid index.
for key in restr_keys:
if not hasattr(EdgeI, key):
- raise ValueError("Bad restriction: %s" % key)
+ raise ValueError('Bad restriction: %s' % key)
# Create the index.
index = self._indexes[restr_keys] = {}
return dict(
(var, logic.unique_variable())
for var in edge.lhs().variables()
- if var.name.startswith("@")
+ if var.name.startswith('@')
)
print_sentence=True,
trace=1,
parser=FeatureChartParser,
- sent="I saw John with a dog with my cookie",
+ sent='I saw John with a dog with my cookie',
):
import sys, time
if print_sentence:
print("Sentence:", sent)
tokens = sent.split()
- t = perf_counter()
+ t = time.clock()
cp = parser(grammar, trace=trace)
chart = cp.chart_parse(tokens)
trees = list(chart.parses(grammar.start()))
if print_times:
- print("Time: %s" % (perf_counter() - t))
+ print("Time: %s" % (time.clock() - t))
if print_trees:
for tree in trees:
print(tree)
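# ////////////////////////////////////////////////////////////
# Hedged usage sketch (not part of the module): the demo above loads
# grammars/book_grammars/feat0.fcfg from nltk_data; the tiny feature grammar
# below is written inline for illustration so the snippet is self-contained.
from nltk.grammar import FeatureGrammar
from nltk.parse.featurechart import FeatureChartParser

fgrammar = FeatureGrammar.fromstring("""
% start S
S -> NP[NUM=?n] VP[NUM=?n]
NP[NUM=sg] -> 'Kim'
NP[NUM=pl] -> 'children'
VP[NUM=?n] -> V[NUM=?n] NP
V[NUM=sg] -> 'likes'
""")
fparser = FeatureChartParser(fgrammar)
for tree in fparser.parse('Kim likes children'.split()):
    print(tree)  # one tree, with NUM=sg propagated from 'Kim' and 'likes'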
def run_profile():
import profile
- profile.run("for i in range(1): demo()", "/tmp/profile.out")
+ profile.run('for i in range(1): demo()', '/tmp/profile.out')
import pstats
- p = pstats.Stats("/tmp/profile.out")
- p.strip_dirs().sort_stats("time", "cum").print_stats(60)
- p.strip_dirs().sort_stats("cum", "time").print_stats(60)
+ p = pstats.Stats('/tmp/profile.out')
+ p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
+ p.strip_dirs().sort_stats('cum', 'time').print_stats(60)
-if __name__ == "__main__":
+if __name__ == '__main__':
from nltk.data import load
demo()
print()
- grammar = load("grammars/book_grammars/feat0.fcfg")
+ grammar = load('grammars/book_grammars/feat0.fcfg')
cp = FeatureChartParser(grammar, trace=2)
- sent = "Kim likes children"
+ sent = 'Kim likes children'
tokens = sent.split()
trees = cp.parse(tokens)
for tree in trees:
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Generating from a CFG
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
+from __future__ import print_function
import itertools
import sys
def demo(N=23):
from nltk.grammar import CFG
- print("Generating the first %d sentences for demo grammar:" % (N,))
+ print('Generating the first %d sentences for demo grammar:' % (N,))
print(demo_grammar)
grammar = CFG.fromstring(demo_grammar)
for n, sent in enumerate(generate(grammar, n=N), 1):
- print("%3d. %s" % (n, " ".join(sent)))
+ print('%3d. %s' % (n, ' '.join(sent)))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
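# ////////////////////////////////////////////////////////////
# Hedged usage sketch (not part of the module): generate() enumerates the
# sentences licensed by a CFG, up to an optional limit n. The mini grammar
# here is invented for illustration.
from nltk import CFG
from nltk.parse.generate import generate

mini_grammar = CFG.fromstring("""
S -> NP VP
NP -> 'the' N
N -> 'cat' | 'dog'
VP -> 'sleeps' | 'barks'
""")
for i, words in enumerate(generate(mini_grammar, n=4), 1):
    print('%d. %s' % (i, ' '.join(words)))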
# Author: Dan Garrette <dhgarrette@gmail.com>
# Contributors: Liling Tan, Mustufain, osamamukhtar11
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+
import os
import sys
import tempfile
import subprocess
import inspect
+from six import text_type
+
from nltk.data import ZipFilePathPointer
from nltk.internals import find_dir, find_file, find_jars_within_path
_tagger = RegexpTagger(
[
- (r"\.$", "."),
- (r"\,$", ","),
- (r"\?$", "?"), # fullstop, comma, Qmark
- (r"\($", "("),
- (r"\)$", ")"), # round brackets
- (r"\[$", "["),
- (r"\]$", "]"), # square brackets
- (r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers
- (r"(The|the|A|a|An|an)$", "DT"), # articles
- (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns
- (r"(His|his|Her|her|Its|its)$", "PRP$"), # possesive
- (r"(my|Your|your|Yours|yours)$", "PRP$"), # possesive
- (r"(on|On|in|In|at|At|since|Since)$", "IN"), # time prepopsitions
- (r"(for|For|ago|Ago|before|Before)$", "IN"), # time prepopsitions
- (r"(till|Till|until|Until)$", "IN"), # time prepopsitions
- (r"(by|By|beside|Beside)$", "IN"), # space prepopsitions
- (r"(under|Under|below|Below)$", "IN"), # space prepopsitions
- (r"(over|Over|above|Above)$", "IN"), # space prepopsitions
- (r"(across|Across|through|Through)$", "IN"), # space prepopsitions
- (r"(into|Into|towards|Towards)$", "IN"), # space prepopsitions
- (r"(onto|Onto|from|From)$", "IN"), # space prepopsitions
- (r".*able$", "JJ"), # adjectives
- (r".*ness$", "NN"), # nouns formed from adjectives
- (r".*ly$", "RB"), # adverbs
- (r".*s$", "NNS"), # plural nouns
- (r".*ing$", "VBG"), # gerunds
- (r".*ed$", "VBD"), # past tense verbs
- (r".*", "NN"), # nouns (default)
+ (r'\.$', '.'),
+ (r'\,$', ','),
+ (r'\?$', '?'), # fullstop, comma, Qmark
+ (r'\($', '('),
+ (r'\)$', ')'), # round brackets
+ (r'\[$', '['),
+ (r'\]$', ']'), # square brackets
+ (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
+ (r'(The|the|A|a|An|an)$', 'DT'), # articles
+ (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'), # pronouns
+ (r'(His|his|Her|her|Its|its)$', 'PRP$'), # possessive
+ (r'(my|Your|your|Yours|yours)$', 'PRP$'), # possessive
+ (r'(on|On|in|In|at|At|since|Since)$', 'IN'), # time prepositions
+ (r'(for|For|ago|Ago|before|Before)$', 'IN'), # time prepositions
+ (r'(till|Till|until|Until)$', 'IN'), # time prepositions
+ (r'(by|By|beside|Beside)$', 'IN'), # space prepositions
+ (r'(under|Under|below|Below)$', 'IN'), # space prepositions
+ (r'(over|Over|above|Above)$', 'IN'), # space prepositions
+ (r'(across|Across|through|Through)$', 'IN'), # space prepositions
+ (r'(into|Into|towards|Towards)$', 'IN'), # space prepositions
+ (r'(onto|Onto|from|From)$', 'IN'), # space prepositions
+ (r'.*able$', 'JJ'), # adjectives
+ (r'.*ness$', 'NN'), # nouns formed from adjectives
+ (r'.*ly$', 'RB'), # adverbs
+ (r'.*s$', 'NNS'), # plural nouns
+ (r'.*ing$', 'VBG'), # gerunds
+ (r'.*ed$', 'VBD'), # past tense verbs
+ (r'.*', 'NN'), # nouns (default)
]
)
return _tagger.tag
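# ////////////////////////////////////////////////////////////
# Hedged usage sketch (not part of the module): malt_regex_tagger() is just a
# RegexpTagger whose patterns are tried in order, with '.*' -> 'NN' as the
# catch-all. A much shorter tagger built the same way, for illustration only:
from nltk.tag import RegexpTagger

tiny_tagger = RegexpTagger(
    [
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'.*', 'NN'),                     # default: everything else
    ]
)
print(tiny_tagger.tag('two 2 dogs running'.split()))
# -> [('two', 'NN'), ('2', 'CD'), ('dogs', 'NNS'), ('running', 'VBG')]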
if os.path.exists(parser_dirname): # If a full path is given.
_malt_dir = parser_dirname
else: # Try to find path to maltparser directory in environment variables.
- _malt_dir = find_dir(parser_dirname, env_vars=("MALT_PARSER",))
+ _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))
# Check that the found directory contains all the necessary .jar files.
- malt_dependencies = ["", "", ""]
+ malt_dependencies = ['', '', '']
_malt_jars = set(find_jars_within_path(_malt_dir))
_jars = set(os.path.split(jar)[1] for jar in _malt_jars)
- malt_dependencies = set(["log4j.jar", "libsvm.jar", "liblinear-1.8.jar"])
+ malt_dependencies = set(['log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'])
assert malt_dependencies.issubset(_jars)
assert any(
- filter(lambda i: i.startswith("maltparser-") and i.endswith(".jar"), _jars)
+ filter(lambda i: i.startswith('maltparser-') and i.endswith('.jar'), _jars)
)
return list(_malt_jars)
A helper function to find a pre-trained MaltParser model.
"""
if model_filename is None:
- return "malt_temp.mco"
+ return 'malt_temp.mco'
elif os.path.exists(model_filename): # If a full path is given.
return model_filename
else: # Try to find path to malt model in environment variables.
- return find_file(model_filename, env_vars=("MALT_MODEL",), verbose=False)
+ return find_file(model_filename, env_vars=('MALT_MODEL',), verbose=False)
class MaltParser(ParserI):
)
# Initialize model.
self.model = find_malt_model(model_filename)
- self._trained = self.model != "malt_temp.mco"
+ self._trained = self.model != 'malt_temp.mco'
# Set the working_dir parameter, i.e. `-w` from MaltParser's options.
self.working_dir = tempfile.gettempdir()
# Initialize POS tagger.
self.tagger = tagger if tagger is not None else malt_regex_tagger()
- def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"):
+ def parse_tagged_sents(self, sentences, verbose=False, top_relation_label='null'):
"""
Use MaltParser to parse multiple POS tagged sentences. Takes multiple
sentences where each sentence is a list of (word, tag) tuples.
raise Exception("Parser has not been trained. Call train() first.")
with tempfile.NamedTemporaryFile(
- prefix="malt_input.conll.", dir=self.working_dir, mode="w", delete=False
+ prefix='malt_input.conll.', dir=self.working_dir, mode='w', delete=False
) as input_file:
with tempfile.NamedTemporaryFile(
- prefix="malt_output.conll.",
+ prefix='malt_output.conll.',
dir=self.working_dir,
- mode="w",
+ mode='w',
delete=False,
) as output_file:
# Convert list of sentences to CONLL format.
for line in taggedsents_to_conll(sentences):
- input_file.write(str(line))
+ input_file.write(text_type(line))
input_file.close()
# Generate command to run maltparser.
ret = self._execute(cmd, verbose) # Run command.
os.chdir(_current_path) # Change back to current path.
if ret != 0:
raise Exception(
"MaltParser parsing (%s) failed with exit "
- "code %d" % (" ".join(cmd), ret)
+ "code %d" % (' '.join(cmd), ret)
)
# Must return iter(iter(Tree))
with open(output_file.name) as infile:
- for tree_str in infile.read().split("\n\n"):
+ for tree_str in infile.read().split('\n\n'):
yield (
iter(
[
os.remove(input_file.name)
os.remove(output_file.name)
- def parse_sents(self, sentences, verbose=False, top_relation_label="null"):
+ def parse_sents(self, sentences, verbose=False, top_relation_label='null'):
"""
Use MaltParser to parse multiple sentences.
Takes a list of sentences, where each sentence is a list of words.
:type outputfilename: str
"""
- cmd = ["java"]
+ cmd = ['java']
cmd += self.additional_java_args # Adds additional java arguments
# Join classpaths with ';' on Windows and with ':' on Linux/Mac.
- classpaths_separator = ";" if sys.platform.startswith("win") else ":"
+ classpaths_separator = ';' if sys.platform.startswith('win') else ':'
cmd += [
- "-cp",
+ '-cp',
classpaths_separator.join(self.malt_jars),
] # Adds classpaths for jars
- cmd += ["org.maltparser.Malt"] # Adds the main function.
+ cmd += ['org.maltparser.Malt'] # Adds the main function.
# Adds the model file.
if os.path.exists(self.model): # when parsing
- cmd += ["-c", os.path.split(self.model)[-1]]
+ cmd += ['-c', os.path.split(self.model)[-1]]
else: # when learning
- cmd += ["-c", self.model]
+ cmd += ['-c', self.model]
- cmd += ["-i", inputfilename]
- if mode == "parse":
- cmd += ["-o", outputfilename]
- cmd += ["-m", mode] # mode use to generate parses.
+ cmd += ['-i', inputfilename]
+ if mode == 'parse':
+ cmd += ['-o', outputfilename]
+ cmd += ['-m', mode] # mode used to generate parses.
return cmd
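# Illustration only (jar names, model name and temp-file suffixes are made up):
# with mode='parse', the command list assembled above amounts to something like
#   java -cp maltparser-1.9.2.jar:liblinear-1.8.jar:libsvm.jar:log4j.jar \
#        org.maltparser.Malt -c engmalt.linear-1.7.mco \
#        -i /tmp/malt_input.conll.XXXX -o /tmp/malt_output.conll.XXXX -m parse
# (';' instead of ':' between classpath entries on Windows).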
@staticmethod
# Write the conll_str to malt_train.conll file in /tmp/
with tempfile.NamedTemporaryFile(
- prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
+ prefix='malt_train.conll.', dir=self.working_dir, mode='w', delete=False
) as input_file:
- input_str = "\n".join(dg.to_conll(10) for dg in depgraphs)
- input_file.write(str(input_str))
+ input_str = '\n'.join(dg.to_conll(10) for dg in depgraphs)
+ input_file.write(text_type(input_str))
# Trains the model with the malt_train.conll
self.train_from_file(input_file.name, verbose=verbose)
# Removes the malt_train.conll once training finishes.
# then we need to do some extra massaging
if isinstance(conll_file, ZipFilePathPointer):
with tempfile.NamedTemporaryFile(
- prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
+ prefix='malt_train.conll.', dir=self.working_dir, mode='w', delete=False
) as input_file:
with conll_file.open() as conll_input_file:
conll_str = conll_input_file.read()
- input_file.write(str(conll_str))
+ input_file.write(text_type(conll_str))
return self.train_from_file(input_file.name, verbose=verbose)
# Generate command to run maltparser.
if ret != 0:
raise Exception(
"MaltParser training (%s) failed with exit "
- "code %d" % (" ".join(cmd), ret)
+ "code %d" % (' '.join(cmd), ret)
)
self._trained = True
if __name__ == '__main__':
- """
- A demonstration function to show how NLTK users can use the malt parser API.
+ '''
+ A demonstration function to show how NLTK users can use the malt parser API.
>>> from nltk import pos_tag
>>> assert 'MALT_PARSER' in os.environ, str(
>>> # Parse a single sentence.
>>> parsed_sent1 = mp.parse_one(sent1)
>>> parsed_sent2 = mp.parse_one(sent2)
- >>> print(parsed_sent1.tree())
+ >>> print (parsed_sent1.tree())
(sees John Mary .)
- >>> print(parsed_sent2.tree())
+ >>> print (parsed_sent2.tree())
(walks John (dog a) .)
>>>
>>> # Parsing multiple sentences.
(shot I (elephant an) (in (pajamas my)) .)
>>> print(next(next(parsed_sents)).tree())
(flies Time (like banana) .)
- """
-
+ '''
import doctest
+
doctest.testmod()
# Natural Language Toolkit: Dependency Grammars
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Jason Narad <jason.narad@gmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
+from __future__ import print_function
import math
import logging
+from six.moves import range
+
from nltk.parse.dependencygraph import DependencyGraph
logger = logging.getLogger(__name__)
def __init__(self):
if self.__class__ == DependencyScorerI:
- raise TypeError("DependencyScorerI is an abstract interface")
+ raise TypeError('DependencyScorerI is an abstract interface')
def train(self, graphs):
"""
for graph in graphs:
for head_node in graph.nodes.values():
for child_index, child_node in graph.nodes.items():
- if child_index in head_node["deps"]:
+ if child_index in head_node['deps']:
label = "T"
else:
label = "F"
labeled_examples.append(
(
dict(
- a=head_node["word"],
- b=head_node["tag"],
- c=child_node["word"],
- d=child_node["tag"],
+ a=head_node['word'],
+ b=head_node['tag'],
+ c=child_node['word'],
+ d=child_node['tag'],
),
label,
)
edges.append(
(
dict(
- a=head_node["word"],
- b=head_node["tag"],
- c=child_node["word"],
- d=child_node["tag"],
+ a=head_node['word'],
+ b=head_node['tag'],
+ c=child_node['word'],
+ d=child_node['tag'],
)
)
)
row = []
count = 0
for pdist in self.classifier.prob_classify_many(edges):
- logger.debug("%.4f %.4f", pdist.prob("T"), pdist.prob("F"))
+ logger.debug('%.4f %.4f', pdist.prob('T'), pdist.prob('F'))
# smoothing in case the probability = 0
row.append([math.log(pdist.prob("T") + 0.00000000001)])
count += 1
# A short class needed to show the parsing example from the paper
class DemoScorer(DependencyScorerI):
def train(self, graphs):
- print("Training...")
+ print('Training...')
def score(self, graph):
# scores for Keith Hall 'K-best Spanning Tree Parsing' paper
"""
Creates a new non-projective parser.
"""
- logging.debug("initializing prob. nonprojective...")
+ logging.debug('initializing prob. nonprojective...')
def train(self, graphs, dependency_scorer):
"""
:type g_graph, b_graph, c_graph: DependencyGraph
:param g_graph, b_graph, c_graph: Graphs which need to be updated.
"""
- logger.debug("Collapsing nodes...")
+ logger.debug('Collapsing nodes...')
# Collapse all cycle nodes into v_n+1 in G_Graph
for cycle_node_index in cycle_path:
g_graph.remove_by_address(cycle_node_index)
g_graph.add_node(new_node)
- g_graph.redirect_arcs(cycle_path, new_node["address"])
+ g_graph.redirect_arcs(cycle_path, new_node['address'])
def update_edge_scores(self, new_node, cycle_path):
"""
:type cycle_path: A list of integers.
:param cycle_path: A list of node addresses that belong to the cycle.
"""
- logger.debug("cycle %s", cycle_path)
+ logger.debug('cycle %s', cycle_path)
cycle_path = self.compute_original_indexes(cycle_path)
- logger.debug("old cycle %s", cycle_path)
- logger.debug("Prior to update: %s", self.scores)
+ logger.debug('old cycle %s', cycle_path)
+ logger.debug('Prior to update: %s', self.scores)
for i, row in enumerate(self.scores):
for j, column in enumerate(self.scores[i]):
if j in cycle_path and i not in cycle_path and self.scores[i][j]:
subtract_val = self.compute_max_subtract_score(j, cycle_path)
- logger.debug("%s - %s", self.scores[i][j], subtract_val)
+ logger.debug('%s - %s', self.scores[i][j], subtract_val)
new_vals = []
for cur_val in self.scores[i][j]:
if i in cycle_path and j in cycle_path:
self.scores[i][j] = []
- logger.debug("After update: %s", self.scores)
+ logger.debug('After update: %s', self.scores)
def compute_original_indexes(self, new_indexes):
"""
the node that is arced to.
"""
originals = self.compute_original_indexes([node_index])
- logger.debug("originals: %s", originals)
+ logger.debug('originals: %s', originals)
max_arc = None
max_score = None
for row_index in range(len(self.scores)):
for col_index in range(len(self.scores[row_index])):
+ # print self.scores[row_index][col_index]
if col_index in originals and (
max_score is None or self.scores[row_index][col_index] > max_score
):
max_score = self.scores[row_index][col_index]
max_arc = row_index
- logger.debug("%s, %s", row_index, col_index)
+ logger.debug('%s, %s', row_index, col_index)
logger.debug(max_score)
g_graph = DependencyGraph()
for index, token in enumerate(tokens):
g_graph.nodes[index + 1].update(
- {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
+ {'word': token, 'tag': tags[index], 'rel': 'NTOP', 'address': index + 1}
)
+ # print (g_graph.nodes)
# Fully connect non-root nodes in g_graph
g_graph.connect_graph()
original_graph = DependencyGraph()
for index, token in enumerate(tokens):
original_graph.nodes[index + 1].update(
- {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
+ {'word': token, 'tag': tags[index], 'rel': 'NTOP', 'address': index + 1}
)
b_graph = DependencyGraph()
for index, token in enumerate(tokens):
c_graph.nodes[index + 1].update(
- {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
+ {'word': token, 'tag': tags[index], 'rel': 'NTOP', 'address': index + 1}
)
# Assign initial scores to g_graph edges
self.initialize_edge_scores(g_graph)
logger.debug(self.scores)
# Initialize a list of unvisited vertices (by node address)
- unvisited_vertices = [vertex["address"] for vertex in c_graph.nodes.values()]
+ unvisited_vertices = [vertex['address'] for vertex in c_graph.nodes.values()]
# Iterate over unvisited vertices
nr_vertices = len(tokens)
betas = {}
while unvisited_vertices:
# Mark current node as visited
current_vertex = unvisited_vertices.pop(0)
- logger.debug("current_vertex: %s", current_vertex)
+ logger.debug('current_vertex: %s', current_vertex)
# Get corresponding node n_i to vertex v_i
current_node = g_graph.get_by_address(current_vertex)
- logger.debug("current_node: %s", current_node)
+ logger.debug('current_node: %s', current_node)
# Get best in-edge node b for current node
best_in_edge = self.best_incoming_arc(current_vertex)
betas[current_vertex] = self.original_best_arc(current_vertex)
- logger.debug("best in arc: %s --> %s", best_in_edge, current_vertex)
+ logger.debug('best in arc: %s --> %s', best_in_edge, current_vertex)
# b_graph = Union(b_graph, b)
for new_vertex in [current_vertex, best_in_edge]:
b_graph.nodes[new_vertex].update(
- {"word": "TEMP", "rel": "NTOP", "address": new_vertex}
+ {'word': 'TEMP', 'rel': 'NTOP', 'address': new_vertex}
)
b_graph.add_arc(best_in_edge, current_vertex)
# Beta(current node) = b - stored for parse recovery
cycle_path = b_graph.contains_cycle()
if cycle_path:
# Create a new node v_n+1 with address = len(nodes) + 1
- new_node = {"word": "NONE", "rel": "NTOP", "address": nr_vertices + 1}
+ new_node = {'word': 'NONE', 'rel': 'NTOP', 'address': nr_vertices + 1}
# c_graph = Union(c_graph, v_n+1)
c_graph.add_node(new_node)
# Collapse all nodes in cycle C into v_n+1
self.update_edge_scores(new_node, cycle_path)
self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
for cycle_index in cycle_path:
- c_graph.add_arc(new_node["address"], cycle_index)
+ c_graph.add_arc(new_node['address'], cycle_index)
# self.replaced_by[cycle_index] = new_node['address']
- self.inner_nodes[new_node["address"]] = cycle_path
+ self.inner_nodes[new_node['address']] = cycle_path
# Add v_n+1 to list of unvisited vertices
unvisited_vertices.insert(0, nr_vertices + 1)
for cycle_node_address in cycle_path:
b_graph.remove_by_address(cycle_node_address)
- logger.debug("g_graph: %s", g_graph)
- logger.debug("b_graph: %s", b_graph)
- logger.debug("c_graph: %s", c_graph)
- logger.debug("Betas: %s", betas)
- logger.debug("replaced nodes %s", self.inner_nodes)
+ logger.debug('g_graph: %s', g_graph)
+ logger.debug('b_graph: %s', b_graph)
+ logger.debug('c_graph: %s', c_graph)
+ logger.debug('Betas: %s', betas)
+ logger.debug('replaced nodes %s', self.inner_nodes)
# Recover parse tree
- logger.debug("Final scores: %s", self.scores)
+ logger.debug('Final scores: %s', self.scores)
- logger.debug("Recovering parse...")
+ logger.debug('Recovering parse...')
for i in range(len(tokens) + 1, nr_vertices + 1):
betas[betas[i][1]] = betas[i]
- logger.debug("Betas: %s", betas)
+ logger.debug('Betas: %s', betas)
for node in original_graph.nodes.values():
# TODO: It's dangerous to assume that deps is a dictionary
# because it's a default dictionary. Ideally, we should not be
# concerned here with how dependencies are stored inside a
# dependency graph.
- node["deps"] = {}
+ node['deps'] = {}
for i in range(1, len(tokens) + 1):
original_graph.add_arc(betas[i][0], betas[i][1])
- logger.debug("Done.")
+ logger.debug('Done.')
yield original_graph
for index, token in enumerate(tokens):
self._graph.nodes[index] = {
- "word": token,
- "deps": [],
- "rel": "NTOP",
- "address": index,
+ 'word': token,
+ 'deps': [],
+ 'rel': 'NTOP',
+ 'address': index,
}
for head_node in self._graph.nodes.values():
deps = []
for dep_node in self._graph.nodes.values():
if (
- self._grammar.contains(head_node["word"], dep_node["word"])
- and head_node["word"] != dep_node["word"]
+ self._grammar.contains(head_node['word'], dep_node['word'])
+ and head_node['word'] != dep_node['word']
):
- deps.append(dep_node["address"])
- head_node["deps"] = deps
+ deps.append(dep_node['address'])
+ head_node['deps'] = deps
# Create lattice of possible heads
roots = []
head_address = head_index + 1
node = graph.nodes[address]
- node.update({"word": token, "address": address})
+ node.update({'word': token, 'address': address})
if head_address == 0:
- rel = "ROOT"
+ rel = 'ROOT'
else:
- rel = ""
- graph.nodes[head_index + 1]["deps"][rel].append(address)
+ rel = ''
+ graph.nodes[head_index + 1]['deps'][rel].append(address)
# TODO: check for cycles
yield graph
def hall_demo():
npp = ProbabilisticNonprojectiveParser()
npp.train([], DemoScorer())
- for parse_graph in npp.parse(["v1", "v2", "v3"], [None, None, None]):
+ for parse_graph in npp.parse(['v1', 'v2', 'v3'], [None, None, None]):
print(parse_graph)
def nonprojective_conll_parse_demo():
from nltk.parse.dependencygraph import conll_data2
- graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
+ graphs = [DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry]
npp = ProbabilisticNonprojectiveParser()
npp.train(graphs, NaiveBayesDependencyScorer())
for parse_graph in npp.parse(
- ["Cathy", "zag", "hen", "zwaaien", "."], ["N", "V", "Pron", "Adj", "N", "Punc"]
+ ['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc']
):
print(parse_graph)
ndp = NonprojectiveDependencyParser(grammar)
graphs = ndp.parse(
[
- "the",
- "man",
- "in",
- "the",
- "corner",
- "taught",
- "his",
- "dachshund",
- "to",
- "play",
- "golf",
+ 'the',
+ 'man',
+ 'in',
+ 'the',
+ 'corner',
+ 'taught',
+ 'his',
+ 'dachshund',
+ 'to',
+ 'play',
+ 'golf',
]
)
- print("Graphs:")
+ print('Graphs:')
for graph in graphs:
print(graph)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
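# ////////////////////////////////////////////////////////////
# Hedged usage sketch (not part of the module): the rule-based
# NonprojectiveDependencyParser driven by a toy grammar in the spirit of the
# demo above; each analysis it yields is a DependencyGraph.
from nltk.grammar import DependencyGrammar
from nltk.parse.nonprojectivedependencyparser import NonprojectiveDependencyParser

dgrammar = DependencyGrammar.fromstring("""
'taught' -> 'play' | 'man'
'man' -> 'the' | 'in'
'in' -> 'corner'
'corner' -> 'the'
'play' -> 'golf' | 'dachshund' | 'to'
'dachshund' -> 'his'
""")
ndparser = NonprojectiveDependencyParser(dgrammar)
sentence = 'the man in the corner taught his dachshund to play golf'.split()
for analysis in ndparser.parse(sentence):
    print(analysis)  # a DependencyGraph per licensed head assignment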
# Natural Language Toolkit: Probabilistic Chart Parsers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
argument beam_size. If non-zero, this controls the size of the beam
(aka the edge queue). This option is most useful with InsideChartParser.
"""
+from __future__ import print_function, unicode_literals
##//////////////////////////////////////////////////////
## Bottom-Up PCFG Chart Parser
from nltk.parse.api import ParserI
from nltk.parse.chart import Chart, LeafEdge, TreeEdge, AbstractChartRule
+from nltk.compat import python_2_unicode_compatible
# Probabilistic edges
class ProbabilisticLeafEdge(LeafEdge):
yield new_edge
+@python_2_unicode_compatible
class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule):
NUM_EDGES = 1
yield new_edge
def __str__(self):
- return "Fundamental Rule"
+ return 'Fundamental Rule'
class BottomUpProbabilisticChartParser(ParserI):
for edge in bu_init.apply(chart, grammar):
if self._trace > 1:
print(
- " %-50s [%s]"
+ ' %-50s [%s]'
% (chart.pretty_format_edge(edge, width=2), edge.prob())
)
queue.append(edge)
edge = queue.pop()
if self._trace > 0:
print(
- " %-50s [%s]"
+ ' %-50s [%s]'
% (chart.pretty_format_edge(edge, width=2), edge.prob())
)
split = len(queue) - self.beam_size
if self._trace > 2:
for edge in queue[:split]:
- print(" %-50s [DISCARDED]" % chart.pretty_format_edge(edge, 2))
+ print(' %-50s [DISCARDED]' % chart.pretty_format_edge(edge, 2))
del queue[:split]
# bestp.get(elt,0))
#
# self._bestp = bestp
-# for (k,v) in self._bestp.items(): print(k,v)
+# for (k,v) in self._bestp.items(): print k,v
#
# def _sortkey(self, edge):
# return edge.structure()[PROB] * self._bestp[edge.lhs()]
)
demos = [
- ("I saw John with my telescope", toy_pcfg1),
- ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
+ ('I saw John with my telescope', toy_pcfg1),
+ ('the boy saw Jack with Bob under the table with a telescope', toy_pcfg2),
]
if choice is None:
# Ask the user which demo they want to use.
print()
for i in range(len(demos)):
- print("%3s: %s" % (i + 1, demos[i][0]))
- print(" %r" % demos[i][1])
+ print('%3s: %s' % (i + 1, demos[i][0]))
+ print(' %r' % demos[i][1])
print()
- print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
+ print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
choice = int(sys.stdin.readline().strip()) - 1
try:
sent, grammar = demos[choice]
except:
- print("Bad sentence number")
+ print('Bad sentence number')
return
# Tokenize the sentence.
num_parses = []
all_parses = {}
for parser in parsers:
- print("\ns: %s\nparser: %s\ngrammar: %s" % (sent, parser, grammar))
+ print('\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
parser.trace(3)
t = time.time()
parses = list(parser.parse(tokens))
# Print some summary statistics
print()
- print(" Parser Beam | Time (secs) # Parses Average P(parse)")
- print("------------------------+------------------------------------------")
+ print(' Parser Beam | Time (secs) # Parses Average P(parse)')
+ print('------------------------+------------------------------------------')
for i in range(len(parsers)):
print(
- "%18s %4d |%11.4f%11d%19.14f"
+ '%18s %4d |%11.4f%11d%19.14f'
% (
parsers[i].__class__.__name__,
parsers[i].beam_size,
p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
else:
p = 0
- print("------------------------+------------------------------------------")
- print("%18s |%11s%11d%19.14f" % ("(All Parses)", "n/a", len(parses), p))
+ print('------------------------+------------------------------------------')
+ print('%18s |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p))
if draw_parses is None:
# Ask the user if we should draw the parses.
print()
- print("Draw parses (y/n)? ", end=" ")
- draw_parses = sys.stdin.readline().strip().lower().startswith("y")
+ print('Draw parses (y/n)? ', end=' ')
+ draw_parses = sys.stdin.readline().strip().lower().startswith('y')
if draw_parses:
from nltk.draw.tree import draw_trees
- print(" please wait...")
+ print(' please wait...')
draw_trees(*parses)
if print_parses is None:
# Ask the user if we should print the parses.
print()
- print("Print parses (y/n)? ", end=" ")
- print_parses = sys.stdin.readline().strip().lower().startswith("y")
+ print('Print parses (y/n)? ', end=' ')
+ print_parses = sys.stdin.readline().strip().lower().startswith('y')
if print_parses:
for parse in parses:
print(parse)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
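# ////////////////////////////////////////////////////////////
# Hedged usage sketch (not part of the module): InsideChartParser orders the
# edge queue by the probability of the structures built so far; beam_size
# (if non-zero) prunes that queue. The PCFG below is invented for illustration.
from nltk import PCFG
from nltk.parse.pchart import InsideChartParser

toy_pcfg = PCFG.fromstring("""
S -> NP VP [1.0]
NP -> 'John' [0.5] | 'Mary' [0.5]
VP -> V NP [1.0]
V -> 'saw' [1.0]
""")
inside_parser = InsideChartParser(toy_pcfg, beam_size=20)
for ptree in inside_parser.parse('John saw Mary'.split()):
    print(ptree.prob(), ptree)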
# Natural Language Toolkit: Dependency Grammars
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Jason Narad <jason.narad@gmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
+from __future__ import print_function, unicode_literals
from collections import defaultdict
from itertools import chain
)
from nltk.parse.dependencygraph import DependencyGraph
from nltk.internals import raise_unorderable_types
-
+from nltk.compat import python_2_unicode_compatible
#################################################################
# Dependency Span
@total_ordering
+@python_2_unicode_compatible
class DependencySpan(object):
"""
A contiguous span over some part of the input string representing
:return: A concise string representation of the ``DependencySpan``.
:rtype: str.
"""
- return "Span %d-%d; Head Index: %d" % (
+ return 'Span %d-%d; Head Index: %d' % (
self._start_index,
self._end_index,
self._head_index,
:return: A verbose string representation of the ``DependencySpan``.
:rtype: str
"""
- str = "Span %d-%d; Head Index: %d" % (
+ str = 'Span %d-%d; Head Index: %d' % (
self._start_index,
self._end_index,
self._head_index,
)
for i in range(len(self._arcs)):
- str += "\n%d <- %d, %s" % (i, self._arcs[i], self._tags[i])
+ str += '\n%d <- %d, %s' % (i, self._arcs[i], self._tags[i])
return str
def __eq__(self, other):
#################################################################
+@python_2_unicode_compatible
class ChartCell(object):
"""
A cell from the parse chart formed when performing the CYK algorithm.
:return: A verbose string representation of this ``ChartCell``.
:rtype: str.
"""
- return "CC[%d,%d]: %s" % (self._x, self._y, self._entries)
+ return 'CC[%d,%d]: %s' % (self._x, self._y, self._entries)
def __repr__(self):
"""
:return: A concise string representation of this ``ChartCell``.
:rtype: str.
"""
- return "%s" % self
+ return '%s' % self
#################################################################
for j in range(0, len(self._tokens) + 1):
chart[i].append(ChartCell(i, j))
if i == j + 1:
- chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], ["null"]))
+ chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], ['null']))
for i in range(1, len(self._tokens) + 1):
for j in range(i - 2, -1, -1):
# malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null')
# conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'null', '-', '-')
# Modified to comply with the new DependencyGraph requirement (there must be at least a ROOT element).
- conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
+ conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (
i + 1,
tokens[i],
tokens[i],
- "null",
- "null",
- "null",
+ 'null',
+ 'null',
+ 'null',
parse._arcs[i] + 1,
- "ROOT",
- "-",
- "-",
+ 'ROOT',
+ '-',
+ '-',
)
dg = DependencyGraph(conll_format)
# if self.meets_arity(dg):
"""
spans = []
if span1._start_index == span2._start_index:
- print("Error: Mismatched spans - replace this with thrown error")
+ print('Error: Mismatched spans - replace this with thrown error')
if span1._start_index > span2._start_index:
temp_span = span1
span1 = span2
if self._grammar.contains(
self._tokens[span1._head_index], self._tokens[span2._head_index]
):
- # print('Performing rightward cover %d to %d' % (span1._head_index, span2._head_index))
+ # print 'Performing rightward cover %d to %d' % (span1._head_index, span2._head_index)
new_arcs[span2._head_index - span1._start_index] = span1._head_index
spans.append(
DependencySpan(
if self._grammar.contains(
self._tokens[span2._head_index], self._tokens[span1._head_index]
):
- # print('performing leftward cover %d to %d' % (span2._head_index, span1._head_index))
+ # print 'performing leftward cover %d to %d' % (span2._head_index, span1._head_index)
new_arcs[span1._head_index - span1._start_index] = span2._head_index
spans.append(
DependencySpan(
)
else:
print(
- "No tag found for input token '%s', parse is impossible."
+ 'No tag found for input token \'%s\', parse is impossible.'
% tokens[i - 1]
)
return []
conll_format = ""
malt_format = ""
for i in range(len(tokens)):
- malt_format += "%s\t%s\t%d\t%s\n" % (
+ malt_format += '%s\t%s\t%d\t%s\n' % (
tokens[i],
- "null",
+ 'null',
parse._arcs[i] + 1,
- "null",
+ 'null',
)
# conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'null', '-', '-')
# Modified to comply with a recent change in DependencyGraph: there must be a ROOT element.
- conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
+ conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (
i + 1,
tokens[i],
tokens[i],
parse._tags[i],
parse._tags[i],
- "null",
+ 'null',
parse._arcs[i] + 1,
- "ROOT",
- "-",
- "-",
+ 'ROOT',
+ '-',
+ '-',
)
dg = DependencyGraph(conll_format)
score = self.compute_prob(dg)
"""
spans = []
if span1._start_index == span2._start_index:
- print("Error: Mismatched spans - replace this with thrown error")
+ print('Error: Mismatched spans - replace this with thrown error')
if span1._start_index > span2._start_index:
temp_span = span1
span1 = span2
for dg in graphs:
for node_index in range(1, len(dg.nodes)):
# children = dg.nodes[node_index]['deps']
- children = list(chain(*dg.nodes[node_index]["deps"].values()))
+ children = list(chain(*dg.nodes[node_index]['deps'].values()))
nr_left_children = dg.left_children(node_index)
nr_right_children = dg.right_children(node_index)
for child_index in range(
0 - (nr_left_children + 1), nr_right_children + 2
):
- head_word = dg.nodes[node_index]["word"]
- head_tag = dg.nodes[node_index]["tag"]
+ head_word = dg.nodes[node_index]['word']
+ head_tag = dg.nodes[node_index]['tag']
if head_word in tags:
tags[head_word].add(head_tag)
else:
tags[head_word] = set([head_tag])
- child = "STOP"
- child_tag = "STOP"
- prev_word = "START"
- prev_tag = "START"
+ child = 'STOP'
+ child_tag = 'STOP'
+ prev_word = 'START'
+ prev_tag = 'START'
if child_index < 0:
array_index = child_index + nr_left_children
if array_index >= 0:
- child = dg.nodes[children[array_index]]["word"]
- child_tag = dg.nodes[children[array_index]]["tag"]
+ child = dg.nodes[children[array_index]]['word']
+ child_tag = dg.nodes[children[array_index]]['tag']
if child_index != -1:
- prev_word = dg.nodes[children[array_index + 1]]["word"]
- prev_tag = dg.nodes[children[array_index + 1]]["tag"]
- if child != "STOP":
+ prev_word = dg.nodes[children[array_index + 1]]['word']
+ prev_tag = dg.nodes[children[array_index + 1]]['tag']
+ if child != 'STOP':
productions.append(DependencyProduction(head_word, [child]))
- head_event = "(head (%s %s) (mods (%s, %s, %s) left))" % (
+ head_event = '(head (%s %s) (mods (%s, %s, %s) left))' % (
child,
child_tag,
prev_tag,
head_word,
head_tag,
)
- mod_event = "(mods (%s, %s, %s) left))" % (
+ mod_event = '(mods (%s, %s, %s) left))' % (
prev_tag,
head_word,
head_tag,
elif child_index > 0:
array_index = child_index + nr_left_children - 1
if array_index < nr_children:
- child = dg.nodes[children[array_index]]["word"]
- child_tag = dg.nodes[children[array_index]]["tag"]
+ child = dg.nodes[children[array_index]]['word']
+ child_tag = dg.nodes[children[array_index]]['tag']
if child_index != 1:
- prev_word = dg.nodes[children[array_index - 1]]["word"]
- prev_tag = dg.nodes[children[array_index - 1]]["tag"]
- if child != "STOP":
+ prev_word = dg.nodes[children[array_index - 1]]['word']
+ prev_tag = dg.nodes[children[array_index - 1]]['tag']
+ if child != 'STOP':
productions.append(DependencyProduction(head_word, [child]))
- head_event = "(head (%s %s) (mods (%s, %s, %s) right))" % (
+ head_event = '(head (%s %s) (mods (%s, %s, %s) right))' % (
child,
child_tag,
prev_tag,
head_word,
head_tag,
)
- mod_event = "(mods (%s, %s, %s) right))" % (
+ mod_event = '(mods (%s, %s, %s) right))' % (
prev_tag,
head_word,
head_tag,
prob = 1.0
for node_index in range(1, len(dg.nodes)):
# children = dg.nodes[node_index]['deps']
- children = list(chain(*dg.nodes[node_index]["deps"].values()))
+ children = list(chain(*dg.nodes[node_index]['deps'].values()))
nr_left_children = dg.left_children(node_index)
nr_right_children = dg.right_children(node_index)
nr_children = nr_left_children + nr_right_children
for child_index in range(0 - (nr_left_children + 1), nr_right_children + 2):
- head_word = dg.nodes[node_index]["word"]
- head_tag = dg.nodes[node_index]["tag"]
- child = "STOP"
- child_tag = "STOP"
- prev_word = "START"
- prev_tag = "START"
+ head_word = dg.nodes[node_index]['word']
+ head_tag = dg.nodes[node_index]['tag']
+ child = 'STOP'
+ child_tag = 'STOP'
+ prev_word = 'START'
+ prev_tag = 'START'
if child_index < 0:
array_index = child_index + nr_left_children
if array_index >= 0:
- child = dg.nodes[children[array_index]]["word"]
- child_tag = dg.nodes[children[array_index]]["tag"]
+ child = dg.nodes[children[array_index]]['word']
+ child_tag = dg.nodes[children[array_index]]['tag']
if child_index != -1:
- prev_word = dg.nodes[children[array_index + 1]]["word"]
- prev_tag = dg.nodes[children[array_index + 1]]["tag"]
- head_event = "(head (%s %s) (mods (%s, %s, %s) left))" % (
+ prev_word = dg.nodes[children[array_index + 1]]['word']
+ prev_tag = dg.nodes[children[array_index + 1]]['tag']
+ head_event = '(head (%s %s) (mods (%s, %s, %s) left))' % (
child,
child_tag,
prev_tag,
head_word,
head_tag,
)
- mod_event = "(mods (%s, %s, %s) left))" % (
+ mod_event = '(mods (%s, %s, %s) left))' % (
prev_tag,
head_word,
head_tag,
elif child_index > 0:
array_index = child_index + nr_left_children - 1
if array_index < nr_children:
- child = dg.nodes[children[array_index]]["word"]
- child_tag = dg.nodes[children[array_index]]["tag"]
+ child = dg.nodes[children[array_index]]['word']
+ child_tag = dg.nodes[children[array_index]]['tag']
if child_index != 1:
- prev_word = dg.nodes[children[array_index - 1]]["word"]
- prev_tag = dg.nodes[children[array_index - 1]]["tag"]
- head_event = "(head (%s %s) (mods (%s, %s, %s) right))" % (
+ prev_word = dg.nodes[children[array_index - 1]]['word']
+ prev_tag = dg.nodes[children[array_index - 1]]['tag']
+ head_event = '(head (%s %s) (mods (%s, %s, %s) right))' % (
child,
child_tag,
prev_tag,
head_word,
head_tag,
)
- mod_event = "(mods (%s, %s, %s) right))" % (
+ mod_event = '(mods (%s, %s, %s) right))' % (
prev_tag,
head_word,
head_tag,
)
print(grammar)
pdp = ProjectiveDependencyParser(grammar)
- trees = pdp.parse(["the", "cats", "scratch", "the", "walls"])
+ trees = pdp.parse(['the', 'cats', 'scratch', 'the', 'walls'])
for tree in trees:
print(tree)
created by a ``ProjectiveDependencyParser``.
"""
print()
- print("A grammar with no arity constraints. Each DependencyProduction")
- print("specifies a relationship between one head word and only one")
- print("modifier word.")
+ print('A grammar with no arity constraints. Each DependencyProduction')
+ print('specifies a relationship between one head word and only one')
+ print('modifier word.')
grammar = DependencyGrammar.fromstring(
"""
'fell' -> 'price' | 'stock'
print(grammar)
print()
- print("For the sentence 'The price of the stock fell', this grammar")
- print("will produce the following three parses:")
+ print('For the sentence \'The price of the stock fell\', this grammar')
+ print('will produce the following three parses:')
pdp = ProjectiveDependencyParser(grammar)
- trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"])
+ trees = pdp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])
for tree in trees:
print(tree)
print()
- print("By contrast, the following grammar contains a ")
- print("DependencyProduction that specifies a relationship")
- print("between a single head word, 'price', and two modifier")
- print("words, 'of' and 'the'.")
+ print('By contrast, the following grammar contains a ')
+ print('DependencyProduction that specifies a relationship')
+ print('between a single head word, \'price\', and two modifier')
+ print('words, \'of\' and \'the\'.')
grammar = DependencyGrammar.fromstring(
"""
'fell' -> 'price' | 'stock'
print()
print(
- "This constrains the number of possible parses to just one:"
+ 'This constrains the number of possible parses to just one:'
) # unimplemented, soon to replace
pdp = ProjectiveDependencyParser(grammar)
- trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"])
+ trees = pdp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])
for tree in trees:
print(tree)
"""
from nltk.parse.dependencygraph import conll_data2
- graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
+ graphs = [DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry]
ppdp = ProbabilisticProjectiveDependencyParser()
- print("Training Probabilistic Projective Dependency Parser...")
+ print('Training Probabilistic Projective Dependency Parser...')
ppdp.train(graphs)
- sent = ["Cathy", "zag", "hen", "wild", "zwaaien", "."]
- print("Parsing '", " ".join(sent), "'...")
- print("Parse:")
+ sent = ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.']
+ print('Parsing \'', " ".join(sent), '\'...')
+ print('Parse:')
for tree in ppdp.parse(sent):
print(tree)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
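# ////////////////////////////////////////////////////////////
# Hedged usage sketch (not part of the module): ProjectiveDependencyParser with
# a small hand-written DependencyGrammar; each parse is returned as a Tree.
from nltk.grammar import DependencyGrammar
from nltk.parse.projectivedependencyparser import ProjectiveDependencyParser

scratch_grammar = DependencyGrammar.fromstring("""
'scratch' -> 'cats' | 'walls'
'walls' -> 'the'
'cats' -> 'the'
""")
pdparser = ProjectiveDependencyParser(scratch_grammar)
for tree in pdparser.parse('the cats scratch the walls'.split()):
    print(tree)  # e.g. (scratch (cats the) (walls the))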
# Natural Language Toolkit: Recursive Descent Parser
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
from nltk.grammar import Nonterminal
from nltk.tree import Tree, ImmutableTree
+from nltk.compat import unicode_repr
from nltk.parse.api import ParserI
"""
if treeloc == ():
- print("*", end=" ")
+ print("*", end=' ')
if isinstance(tree, Tree):
if len(tree) == 0:
- print(repr(Nonterminal(tree.label())), end=" ")
+ print(unicode_repr(Nonterminal(tree.label())), end=' ')
for i in range(len(tree)):
if treeloc is not None and i == treeloc[0]:
self._trace_fringe(tree[i], treeloc[1:])
else:
self._trace_fringe(tree[i])
else:
- print(repr(tree), end=" ")
+ print(unicode_repr(tree), end=' ')
def _trace_tree(self, tree, frontier, operation):
"""
:rtype: None
"""
if self._trace == 2:
- print(" %c [" % operation, end=" ")
+ print(' %c [' % operation, end=' ')
else:
- print(" [", end=" ")
+ print(' [', end=' ')
if len(frontier) > 0:
self._trace_fringe(tree, frontier[0])
else:
self._trace_fringe(tree)
- print("]")
+ print(']')
def _trace_start(self, tree, frontier, text):
- print("Parsing %r" % " ".join(text))
+ print('Parsing %r' % " ".join(text))
if self._trace > 2:
- print("Start:")
+ print('Start:')
if self._trace > 1:
- self._trace_tree(tree, frontier, " ")
+ self._trace_tree(tree, frontier, ' ')
def _trace_expand(self, tree, frontier, production):
if self._trace > 2:
- print("Expand: %s" % production)
+ print('Expand: %s' % production)
if self._trace > 1:
- self._trace_tree(tree, frontier, "E")
+ self._trace_tree(tree, frontier, 'E')
def _trace_match(self, tree, frontier, tok):
if self._trace > 2:
- print("Match: %r" % tok)
+ print('Match: %r' % tok)
if self._trace > 1:
- self._trace_tree(tree, frontier, "M")
+ self._trace_tree(tree, frontier, 'M')
def _trace_succeed(self, tree, frontier):
if self._trace > 2:
- print("GOOD PARSE:")
+ print('GOOD PARSE:')
if self._trace == 1:
- print("Found a parse:\n%s" % tree)
+ print('Found a parse:\n%s' % tree)
if self._trace > 1:
- self._trace_tree(tree, frontier, "+")
+ self._trace_tree(tree, frontier, '+')
def _trace_backtrack(self, tree, frontier, toks=None):
if self._trace > 2:
if toks:
- print("Backtrack: %r match failed" % toks[0])
+ print('Backtrack: %r match failed' % toks[0])
else:
- print("Backtrack")
+ print('Backtrack')
##//////////////////////////////////////////////////////
for prod in grammar.productions():
print(prod)
- sent = "I saw a man in the park".split()
+ sent = 'I saw a man in the park'.split()
parser = parse.RecursiveDescentParser(grammar, trace=2)
for p in parser.parse(sent):
print(p)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
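# ////////////////////////////////////////////////////////////
# Hedged usage sketch (not part of the module): RecursiveDescentParser expands
# top-down and backtracks, so the grammar must avoid left recursion. The toy
# grammar below is written for illustration with that restriction in mind.
from nltk import CFG
from nltk.parse import RecursiveDescentParser

rd_grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N | Det N PP | 'I'
VP -> V NP | V NP PP
PP -> P NP
Det -> 'a' | 'the'
N -> 'man' | 'park'
V -> 'saw'
P -> 'in'
""")
rd_parser = RecursiveDescentParser(rd_grammar)
for tree in rd_parser.parse('I saw a man in the park'.split()):
    print(tree)  # two parses: PP attached to the NP or to the VP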
# Natural Language Toolkit: Shift-Reduce Parser
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
from nltk.grammar import Nonterminal
from nltk.tree import Tree
+from nltk.compat import unicode_repr
from nltk.parse.api import ParserI
# Trace output.
if self._trace:
- print("Parsing %r" % " ".join(tokens))
+ print('Parsing %r' % " ".join(tokens))
self._trace_stack(stack, remaining_text)
# iterate through the text, pushing the token onto
# 3: display which tokens & productions are shifted/reduced
self._trace = trace
- def _trace_stack(self, stack, remaining_text, marker=" "):
+ def _trace_stack(self, stack, remaining_text, marker=' '):
"""
Print trace output displaying the given stack and text.
stack. This is used with trace level 2 to print 'S'
before shifted stacks and 'R' before reduced stacks.
"""
- s = " " + marker + " [ "
+ s = ' ' + marker + ' [ '
for elt in stack:
if isinstance(elt, Tree):
- s += repr(Nonterminal(elt.label())) + " "
+ s += unicode_repr(Nonterminal(elt.label())) + ' '
else:
- s += repr(elt) + " "
- s += "* " + " ".join(remaining_text) + "]"
+ s += unicode_repr(elt) + ' '
+ s += '* ' + ' '.join(remaining_text) + ']'
print(s)
def _trace_shift(self, stack, remaining_text):
:rtype: None
"""
if self._trace > 2:
- print("Shift %r:" % stack[-1])
+ print('Shift %r:' % stack[-1])
if self._trace == 2:
- self._trace_stack(stack, remaining_text, "S")
+ self._trace_stack(stack, remaining_text, 'S')
elif self._trace > 0:
self._trace_stack(stack, remaining_text)
"""
if self._trace > 2:
rhs = " ".join(production.rhs())
- print("Reduce %r <- %s" % (production.lhs(), rhs))
+ print('Reduce %r <- %s' % (production.lhs(), rhs))
if self._trace == 2:
- self._trace_stack(stack, remaining_text, "R")
+ self._trace_stack(stack, remaining_text, 'R')
elif self._trace > 1:
self._trace_stack(stack, remaining_text)
rhs1 = productions[i].rhs()
rhs2 = productions[j].rhs()
if rhs1[: len(rhs2)] == rhs2:
- print("Warning: %r will never be used" % productions[i])
+ print('Warning: %r will never be used' % productions[i])
##//////////////////////////////////////////////////////
"""
)
- sent = "I saw a man in the park".split()
+ sent = 'I saw a man in the park'.split()
parser = parse.ShiftReduceParser(grammar, trace=2)
for p in parser.parse(sent):
print(p)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
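# ////////////////////////////////////////////////////////////
# Hedged usage sketch (not part of the module): ShiftReduceParser reduces
# greedily and returns at most one parse (and may miss one that exists); the
# grammar below is invented for illustration and is simple enough to succeed.
from nltk import CFG
from nltk.parse import ShiftReduceParser

sr_grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N
VP -> V NP
Det -> 'the'
N -> 'dog' | 'cat'
V -> 'chased'
""")
sr_parser = ShiftReduceParser(sr_grammar)
for tree in sr_parser.parse('the dog chased the cat'.split()):
    print(tree)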
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Parser
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Xu <xxu@student.unimelb.edu.au>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
+
import tempfile
import os
import warnings
from unittest import skip
from subprocess import PIPE
+from six import text_type
+
from nltk.internals import (
find_jar_iter,
config_java,
from nltk.parse.dependencygraph import DependencyGraph
from nltk.tree import Tree
-_stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml"
+_stanford_url = 'https://nlp.stanford.edu/software/lex-parser.shtml'
class GenericStanfordParser(ParserI):
"""Interface to the Stanford Parser"""
- _MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar"
- _JAR = r"stanford-parser\.jar"
- _MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser"
+ _MODEL_JAR_PATTERN = r'stanford-parser-(\d+)(\.(\d+))+-models\.jar'
+ _JAR = r'stanford-parser\.jar'
+ _MAIN_CLASS = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
_USE_STDIN = False
_DOUBLE_SPACED_OUTPUT = False
self,
path_to_jar=None,
path_to_models_jar=None,
- model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
- encoding="utf8",
+ model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
+ encoding='utf8',
verbose=False,
- java_options="-mx4g",
- corenlp_options="",
+ java_options='-mx4g',
+ corenlp_options='',
):
# find the most recent code and model jar
find_jar_iter(
self._JAR,
path_to_jar,
- env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"),
+ env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'),
searchpath=(),
url=_stanford_url,
verbose=verbose,
find_jar_iter(
self._MODEL_JAR_PATTERN,
path_to_models_jar,
- env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"),
+ env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'),
searchpath=(),
url=_stanford_url,
verbose=verbose,
cur_trees = []
blank = False
for line in output_.splitlines(False):
- if line == "":
+ if line == '':
if blank:
res.append(iter(cur_trees))
cur_trees = []
blank = False
elif self._DOUBLE_SPACED_OUTPUT:
- cur_trees.append(self._make_tree("\n".join(cur_lines)))
+ cur_trees.append(self._make_tree('\n'.join(cur_lines)))
cur_lines = []
blank = True
else:
- res.append(iter([self._make_tree("\n".join(cur_lines))]))
+ res.append(iter([self._make_tree('\n'.join(cur_lines))]))
cur_lines = []
else:
cur_lines.append(line)
"""
cmd = [
self._MAIN_CLASS,
- "-model",
+ '-model',
self.model_path,
- "-sentences",
- "newline",
- "-outputFormat",
+ '-sentences',
+ 'newline',
+ '-outputFormat',
self._OUTPUT_FORMAT,
- "-tokenized",
- "-escaper",
- "edu.stanford.nlp.process.PTBEscapingProcessor",
+ '-tokenized',
+ '-escaper',
+ 'edu.stanford.nlp.process.PTBEscapingProcessor',
]
return self._parse_trees_output(
self._execute(
- cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose
+ cmd, '\n'.join(' '.join(sentence) for sentence in sentences), verbose
)
)
"""
cmd = [
self._MAIN_CLASS,
- "-model",
+ '-model',
self.model_path,
- "-sentences",
- "newline",
- "-outputFormat",
+ '-sentences',
+ 'newline',
+ '-outputFormat',
self._OUTPUT_FORMAT,
]
return self._parse_trees_output(
- self._execute(cmd, "\n".join(sentences), verbose)
+ self._execute(cmd, '\n'.join(sentences), verbose)
)
def tagged_parse(self, sentence, verbose=False):
:type sentences: list(list(tuple(str, str)))
:rtype: iter(iter(Tree))
"""
- tag_separator = "/"
+ tag_separator = '/'
cmd = [
self._MAIN_CLASS,
- "-model",
+ '-model',
self.model_path,
- "-sentences",
- "newline",
- "-outputFormat",
+ '-sentences',
+ 'newline',
+ '-outputFormat',
self._OUTPUT_FORMAT,
- "-tokenized",
- "-tagSeparator",
+ '-tokenized',
+ '-tagSeparator',
tag_separator,
- "-tokenizerFactory",
- "edu.stanford.nlp.process.WhitespaceTokenizer",
- "-tokenizerMethod",
- "newCoreLabelTokenizerFactory",
+ '-tokenizerFactory',
+ 'edu.stanford.nlp.process.WhitespaceTokenizer',
+ '-tokenizerMethod',
+ 'newCoreLabelTokenizerFactory',
]
# We don't need to escape slashes as "splitting is done on the last instance of the character in the token"
return self._parse_trees_output(
self._execute(
cmd,
- "\n".join(
- " ".join(tag_separator.join(tagged) for tagged in sentence)
+ '\n'.join(
+ ' '.join(tag_separator.join(tagged) for tagged in sentence)
for sentence in sentences
),
verbose,
def _execute(self, cmd, input_, verbose=False):
encoding = self._encoding
- cmd.extend(["-encoding", encoding])
+ cmd.extend(['-encoding', encoding])
if self.corenlp_options:
cmd.append(self.corenlp_options)
- default_options = " ".join(_java_options)
+ default_options = ' '.join(_java_options)
# Configure java.
config_java(options=self.java_options, verbose=verbose)
# Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
- with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
+ with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
# Write the actual sentences to the temporary input file
- if isinstance(input_, str) and encoding:
+ if isinstance(input_, text_type) and encoding:
input_ = input_.encode(encoding)
input_file.write(input_)
input_file.flush()
cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
)
- stdout = stdout.replace(b"\xc2\xa0", b" ")
- stdout = stdout.replace(b"\x00\xa0", b" ")
+ stdout = stdout.replace(b'\xc2\xa0', b' ')
+ stdout = stdout.replace(b'\x00\xa0', b' ')
stdout = stdout.decode(encoding)
os.unlink(input_file.name)
[Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
"""
- _OUTPUT_FORMAT = "penn"
+ _OUTPUT_FORMAT = 'penn'
def __init__(self, *args, **kwargs):
warnings.warn(
"""
- _OUTPUT_FORMAT = "conll2007"
+ _OUTPUT_FORMAT = 'conll2007'
def __init__(self, *args, **kwargs):
warnings.warn(
super(StanfordDependencyParser, self).__init__(*args, **kwargs)
def _make_tree(self, result):
- return DependencyGraph(result, top_relation_label="root")
+ return DependencyGraph(result, top_relation_label='root')
class StanfordNeuralDependencyParser(GenericStanfordParser):
- """
+ '''
>>> from nltk.parse.stanford import StanfordNeuralDependencyParser
>>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')
... ))], []) # doctest: +NORMALIZE_WHITESPACE
[Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',
['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
- """
+ '''
- _OUTPUT_FORMAT = "conll"
- _MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
- _JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar"
- _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar"
+ _OUTPUT_FORMAT = 'conll'
+ _MAIN_CLASS = 'edu.stanford.nlp.pipeline.StanfordCoreNLP'
+ _JAR = r'stanford-corenlp-(\d+)(\.(\d+))+\.jar'
+ _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)(\.(\d+))+-models\.jar'
_USE_STDIN = True
_DOUBLE_SPACED_OUTPUT = True
)
super(StanfordNeuralDependencyParser, self).__init__(*args, **kwargs)
- self.corenlp_options += "-annotators tokenize,ssplit,pos,depparse"
+ self.corenlp_options += '-annotators tokenize,ssplit,pos,depparse'
def tagged_parse_sents(self, sentences, verbose=False):
- """
+ '''
Currently unimplemented because the neural dependency parser (and
the StanfordCoreNLP pipeline class) doesn't support passing in pre-
tagged tokens.
- """
+ '''
raise NotImplementedError(
- "tagged_parse[_sents] is not supported by "
- "StanfordNeuralDependencyParser; use "
- "parse[_sents] or raw_parse[_sents] instead."
+ 'tagged_parse[_sents] is not supported by '
+ 'StanfordNeuralDependencyParser; use '
+ 'parse[_sents] or raw_parse[_sents] instead.'
)
def _make_tree(self, result):
- return DependencyGraph(result, top_relation_label="ROOT")
+ return DependencyGraph(result, top_relation_label='ROOT')
@skip("doctests from nltk.parse.stanford are skipped because it's deprecated")
try:
StanfordParser(
- model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
+ model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
)
StanfordNeuralDependencyParser()
except LookupError:
raise SkipTest(
- "doctests from nltk.parse.stanford are skipped because one of the stanford parser or CoreNLP jars doesn't exist"
+ 'doctests from nltk.parse.stanford are skipped because one of the stanford parser or CoreNLP jars doesn\'t exist'
)
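# ////////////////////////////////////////////////////////////
# Hedged note (not part of the module): the classes above are deprecated in
# favour of the CoreNLP server interface. A minimal sketch, assuming a CoreNLP
# server is already running locally on port 9000:
from nltk.parse.corenlp import CoreNLPParser

corenlp_parser = CoreNLPParser(url='http://localhost:9000')
for tree in corenlp_parser.parse('The quick brown fox jumped over the lazy dog .'.split()):
    tree.pretty_print()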
#
# Author: Long Duong <longdt219@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
import tempfile
import pickle
def __str__(self):
return (
- "Stack : "
+ 'Stack : '
+ str(self.stack)
- + " Buffer : "
+ + ' Buffer : '
+ str(self.buffer)
- + " Arcs : "
+ + ' Arcs : '
+ str(self.arcs)
)
"""
if feat is None:
return False
- if feat == "":
+ if feat == '':
return False
if flag is False:
- if feat == "_":
+ if feat == '_':
return False
return True
# Stack 0
stack_idx0 = self.stack[len(self.stack) - 1]
token = self._tokens[stack_idx0]
- if self._check_informative(token["word"], True):
- result.append("STK_0_FORM_" + token["word"])
- if "lemma" in token and self._check_informative(token["lemma"]):
- result.append("STK_0_LEMMA_" + token["lemma"])
- if self._check_informative(token["tag"]):
- result.append("STK_0_POS_" + token["tag"])
- if "feats" in token and self._check_informative(token["feats"]):
- feats = token["feats"].split("|")
+ if self._check_informative(token['word'], True):
+ result.append('STK_0_FORM_' + token['word'])
+ if 'lemma' in token and self._check_informative(token['lemma']):
+ result.append('STK_0_LEMMA_' + token['lemma'])
+ if self._check_informative(token['tag']):
+ result.append('STK_0_POS_' + token['tag'])
+ if 'feats' in token and self._check_informative(token['feats']):
+ feats = token['feats'].split("|")
for feat in feats:
- result.append("STK_0_FEATS_" + feat)
+ result.append('STK_0_FEATS_' + feat)
# Stack 1
if len(self.stack) > 1:
stack_idx1 = self.stack[len(self.stack) - 2]
token = self._tokens[stack_idx1]
- if self._check_informative(token["tag"]):
- result.append("STK_1_POS_" + token["tag"])
+ if self._check_informative(token['tag']):
+ result.append('STK_1_POS_' + token['tag'])
# Left most, right most dependency of stack[0]
left_most = 1000000
right_most = -1
- dep_left_most = ""
- dep_right_most = ""
+ dep_left_most = ''
+ dep_right_most = ''
for (wi, r, wj) in self.arcs:
if wi == stack_idx0:
if (wj > wi) and (wj > right_most):
left_most = wj
dep_left_most = r
if self._check_informative(dep_left_most):
- result.append("STK_0_LDEP_" + dep_left_most)
+ result.append('STK_0_LDEP_' + dep_left_most)
if self._check_informative(dep_right_most):
- result.append("STK_0_RDEP_" + dep_right_most)
+ result.append('STK_0_RDEP_' + dep_right_most)
# Check Buffered 0
if len(self.buffer) > 0:
# Buffer 0
buffer_idx0 = self.buffer[0]
token = self._tokens[buffer_idx0]
- if self._check_informative(token["word"], True):
- result.append("BUF_0_FORM_" + token["word"])
- if "lemma" in token and self._check_informative(token["lemma"]):
- result.append("BUF_0_LEMMA_" + token["lemma"])
- if self._check_informative(token["tag"]):
- result.append("BUF_0_POS_" + token["tag"])
- if "feats" in token and self._check_informative(token["feats"]):
- feats = token["feats"].split("|")
+ if self._check_informative(token['word'], True):
+ result.append('BUF_0_FORM_' + token['word'])
+ if 'lemma' in token and self._check_informative(token['lemma']):
+ result.append('BUF_0_LEMMA_' + token['lemma'])
+ if self._check_informative(token['tag']):
+ result.append('BUF_0_POS_' + token['tag'])
+ if 'feats' in token and self._check_informative(token['feats']):
+ feats = token['feats'].split("|")
for feat in feats:
- result.append("BUF_0_FEATS_" + feat)
+ result.append('BUF_0_FEATS_' + feat)
# Buffer 1
if len(self.buffer) > 1:
buffer_idx1 = self.buffer[1]
token = self._tokens[buffer_idx1]
- if self._check_informative(token["word"], True):
- result.append("BUF_1_FORM_" + token["word"])
- if self._check_informative(token["tag"]):
- result.append("BUF_1_POS_" + token["tag"])
+ if self._check_informative(token['word'], True):
+ result.append('BUF_1_FORM_' + token['word'])
+ if self._check_informative(token['tag']):
+ result.append('BUF_1_POS_' + token['tag'])
if len(self.buffer) > 2:
buffer_idx2 = self.buffer[2]
token = self._tokens[buffer_idx2]
- if self._check_informative(token["tag"]):
- result.append("BUF_2_POS_" + token["tag"])
+ if self._check_informative(token['tag']):
+ result.append('BUF_2_POS_' + token['tag'])
if len(self.buffer) > 3:
buffer_idx3 = self.buffer[3]
token = self._tokens[buffer_idx3]
- if self._check_informative(token["tag"]):
- result.append("BUF_3_POS_" + token["tag"])
+ if self._check_informative(token['tag']):
+ result.append('BUF_3_POS_' + token['tag'])
# Left most, right most dependency of buffer[0]
left_most = 1000000
right_most = -1
- dep_left_most = ""
- dep_right_most = ""
+ dep_left_most = ''
+ dep_right_most = ''
for (wi, r, wj) in self.arcs:
if wi == buffer_idx0:
if (wj > wi) and (wj > right_most):
    right_most = wj
    dep_right_most = r
if (wj < wi) and (wj < left_most):
    left_most = wj
    dep_left_most = r
if self._check_informative(dep_left_most):
- result.append("BUF_0_LDEP_" + dep_left_most)
+ result.append('BUF_0_LDEP_' + dep_left_most)
if self._check_informative(dep_right_most):
- result.append("BUF_0_RDEP_" + dep_right_most)
+ result.append('BUF_0_RDEP_' + dep_right_most)
return result
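# A rough illustration (invented tokens and tags, not taken from the source
# above) of the kind of feature strings extract_features() returns for one
# configuration. Each string is later mapped to an integer id and emitted as a
# sparse "<id>:1.0" entry for the SVM trainer.
example_features = [
    'STK_0_FORM_dog', 'STK_0_LEMMA_dog', 'STK_0_POS_NN', 'STK_1_POS_DT',
    'BUF_0_FORM_barks', 'BUF_0_POS_VBZ', 'BUF_1_POS_.',
    'STK_0_LDEP_det',
]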
"""
# Define set of transitions
- LEFT_ARC = "LEFTARC"
- RIGHT_ARC = "RIGHTARC"
- SHIFT = "SHIFT"
- REDUCE = "REDUCE"
+ LEFT_ARC = 'LEFTARC'
+ RIGHT_ARC = 'RIGHTARC'
+ SHIFT = 'SHIFT'
+ REDUCE = 'REDUCE'
def __init__(self, alg_option):
"""
Class for a transition-based parser. Implements two algorithms: "arc-standard" and "arc-eager".
"""
- ARC_STANDARD = "arc-standard"
- ARC_EAGER = "arc-eager"
+ ARC_STANDARD = 'arc-standard'
+ ARC_EAGER = 'arc-eager'
def __init__(self, algorithm):
"""
p_node = depgraph.nodes[idx_parent]
c_node = depgraph.nodes[idx_child]
- if c_node["word"] is None:
+ if c_node['word'] is None:
return None # Root word
- if c_node["head"] == p_node["address"]:
- return c_node["rel"]
+ if c_node['head'] == p_node['address']:
+ return c_node['rel']
else:
return None
unsorted_result.append(self._dictionary[feature])
# Default value of each feature is 1.0
- return " ".join(
- str(featureID) + ":1.0" for featureID in sorted(unsorted_result)
+ return ' '.join(
+ str(featureID) + ':1.0' for featureID in sorted(unsorted_result)
)
def _is_projective(self, depgraph):
for key in depgraph.nodes:
node = depgraph.nodes[key]
- if "head" in node:
- childIdx = node["address"]
- parentIdx = node["head"]
+ if 'head' in node:
+ childIdx = node['address']
+ parentIdx = node['head']
if parentIdx is not None:
arc_list.append((parentIdx, childIdx))
self._transition.setdefault(key, len(self._transition) + 1)
self._match_transition[self._transition[key]] = key
- input_str = str(self._transition[key]) + " " + binary_features + "\n"
- input_file.write(input_str.encode("utf-8"))
+ input_str = str(self._transition[key]) + ' ' + binary_features + '\n'
+ input_file.write(input_str.encode('utf-8'))
def _create_training_examples_arc_std(self, depgraphs, input_file):
"""
# Left-arc operation
rel = self._get_dep_relation(b0, s0, depgraph)
if rel is not None:
- key = Transition.LEFT_ARC + ":" + rel
+ key = Transition.LEFT_ARC + ':' + rel
self._write_to_file(key, binary_features, input_file)
operation.left_arc(conf, rel)
training_seq.append(key)
precondition = False
if precondition:
- key = Transition.RIGHT_ARC + ":" + rel
+ key = Transition.RIGHT_ARC + ':' + rel
self._write_to_file(key, binary_features, input_file)
operation.right_arc(conf, rel)
training_seq.append(key)
# Left-arc operation
rel = self._get_dep_relation(b0, s0, depgraph)
if rel is not None:
- key = Transition.LEFT_ARC + ":" + rel
+ key = Transition.LEFT_ARC + ':' + rel
self._write_to_file(key, binary_features, input_file)
operation.left_arc(conf, rel)
training_seq.append(key)
# Right-arc operation
rel = self._get_dep_relation(s0, b0, depgraph)
if rel is not None:
- key = Transition.RIGHT_ARC + ":" + rel
+ key = Transition.RIGHT_ARC + ':' + rel
self._write_to_file(key, binary_features, input_file)
operation.right_arc(conf, rel)
training_seq.append(key)
try:
input_file = tempfile.NamedTemporaryFile(
- prefix="transition_parse.train", dir=tempfile.gettempdir(), delete=False
+ prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False
)
if self._algorithm == self.ARC_STANDARD:
# TODO: probability=True makes training very slow because it triggers
# internal cross-validation; the speed here needs to be improved.
model = svm.SVC(
- kernel="poly",
+ kernel='poly',
degree=2,
coef0=0,
gamma=0.2,
model.fit(x_train, y_train)
# Save the model to file name (as pickle)
- pickle.dump(model, open(modelfile, "wb"))
+ pickle.dump(model, open(modelfile, 'wb'))
finally:
remove(input_file.name)
"""
result = []
# First load the model
- model = pickle.load(open(modelFile, "rb"))
+ model = pickle.load(open(modelFile, 'rb'))
operation = Transition(self._algorithm)
for depgraph in depgraphs:
new_depgraph = deepcopy(depgraph)
for key in new_depgraph.nodes:
node = new_depgraph.nodes[key]
- node["rel"] = ""
+ node['rel'] = ''
# By default, every token depends on the Root
- node["head"] = 0
+ node['head'] = 0
for (head, rel, child) in conf.arcs:
c_node = new_depgraph.nodes[child]
- c_node["head"] = head
- c_node["rel"] = rel
+ c_node['head'] = head
+ c_node['rel'] = rel
result.append(new_depgraph)
return result
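# A minimal usage sketch of the transition parser above. It assumes
# scikit-learn/numpy/scipy are installed and the dependency_treebank corpus has
# been downloaded; 'arc_standard.model' is just a placeholder file name.
from nltk.corpus import dependency_treebank
from nltk.parse.transitionparser import TransitionParser

graphs = dependency_treebank.parsed_sents()[:50]
parser = TransitionParser('arc-standard')
parser.train(graphs, 'arc_standard.model')           # writes the pickled SVM
parsed = parser.parse(graphs[:3], 'arc_standard.model')
print(parsed[0].to_conll(10))                        # CONLL view of the result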
#
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Utility functions for parsers.
"""
+from __future__ import print_function
from nltk.grammar import CFG, FeatureGrammar, PCFG
from nltk.data import load
:return: a generator yielding a single sentence in CONLL format.
"""
for (i, (word, tag)) in enumerate(sentence, start=1):
- input_str = [str(i), word, "_", tag, tag, "_", "0", "a", "_", "_"]
+ input_str = [str(i), word, '_', tag, tag, '_', '0', 'a', '_', '_']
input_str = "\t".join(input_str) + "\n"
yield input_str
for sentence in sentences:
for input_str in taggedsent_to_conll(sentence):
yield input_str
- yield "\n\n"
+ yield '\n\n'
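# A small sketch of the CONLL conversion above; it assumes the punkt tokenizer
# and the default POS tagger models have been downloaded, and the sentence is
# arbitrary.
from nltk import pos_tag, word_tokenize
from nltk.parse.util import taggedsent_to_conll

tagged = pos_tag(word_tokenize('The quick brown fox jumps over the lazy dog.'))
for line in taggedsent_to_conll(tagged):
    print(line, end='')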
######################################################################
according to the grammar, then the value of ``trees`` will be None.
"""
for test in self.suite:
- print(test["doc"] + ":", end=" ")
- for key in ["accept", "reject"]:
+ print(test['doc'] + ":", end=' ')
+ for key in ['accept', 'reject']:
for sent in test[key]:
tokens = sent.split()
trees = list(self.cp.parse(tokens))
print(sent)
for tree in trees:
print(tree)
- if key == "accept":
+ if key == 'accept':
if trees == []:
raise ValueError("Sentence '%s' failed to parse'" % sent)
else:
if encoding is not None:
string = string.decode(encoding)
sentences = []
- for sentence in string.split("\n"):
- if sentence == "" or sentence[0] in comment_chars:
+ for sentence in string.split('\n'):
+ if sentence == '' or sentence[0] in comment_chars:
continue
- split_info = sentence.split(":", 1)
+ split_info = sentence.split(':', 1)
result = None
if len(split_info) == 2:
- if split_info[0] in ["True", "true", "False", "false"]:
- result = split_info[0] in ["True", "true"]
+ if split_info[0] in ['True', 'true', 'False', 'false']:
+ result = split_info[0] in ['True', 'true']
sentence = split_info[1]
else:
result = int(split_info[0])
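# A sketch of the sentence-file format read above: each non-comment line may be
# prefixed with a parse count or a True/False judgement followed by ':'. The
# judgements below are invented for illustration.
from nltk.parse.util import extract_test_sentences

demo_suite = """
# comment lines are skipped
True: the dog barks
False: barks dog the
2: I saw the man with the telescope
"""
for tokens, expected in extract_test_sentences(demo_suite):
    print(expected, tokens)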
# Natural Language Toolkit: Viterbi Probabilistic Parser
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
from functools import reduce
from nltk.tree import Tree, ProbabilisticTree
+from nltk.compat import python_2_unicode_compatible
from nltk.parse.api import ParserI
##//////////////////////////////////////////////////////
+@python_2_unicode_compatible
class ViterbiParser(ParserI):
"""
A bottom-up ``PCFG`` parser that uses dynamic programming to find
# Initialize the constituents dictionary with the words from
# the text.
if self._trace:
- print(("Inserting tokens into the most likely" + " constituents table..."))
+ print(('Inserting tokens into the most likely' + ' constituents table...'))
for index in range(len(tokens)):
token = tokens[index]
constituents[index, index + 1, token] = token
if self._trace:
print(
(
- "Finding the most likely constituents"
- + " spanning %d text elements..." % length
+ 'Finding the most likely constituents'
+ + ' spanning %d text elements...' % length
)
)
for start in range(len(tokens) - length + 1):
if self._trace > 1:
if c is None or c != tree:
if c is None or c.prob() < tree.prob():
- print(" Insert:", end=" ")
+ print(' Insert:', end=' ')
else:
- print(" Discard:", end=" ")
+ print(' Discard:', end=' ')
self._trace_production(production, p, span, len(tokens))
if c is None or c.prob() < tree.prob():
constituents[span[0], span[1], production.lhs()] = tree
:rtype: None
"""
- str = "|" + "." * span[0]
- str += "=" * (span[1] - span[0])
- str += "." * (width - span[1]) + "| "
- str += "%s" % production
+ str = '|' + '.' * span[0]
+ str += '=' * (span[1] - span[0])
+ str += '.' * (width - span[1]) + '| '
+ str += '%s' % production
if self._trace > 2:
- str = "%-40s %12.10f " % (str, p)
+ str = '%-40s %12.10f ' % (str, p)
print(str)
def _trace_lexical_insertion(self, token, index, width):
- str = " Insert: |" + "." * index + "=" + "." * (width - index - 1) + "| "
- str += "%s" % (token,)
+ str = ' Insert: |' + '.' * index + '=' + '.' * (width - index - 1) + '| '
+ str += '%s' % (token,)
print(str)
def __repr__(self):
- return "<ViterbiParser for %r>" % self._grammar
+ return '<ViterbiParser for %r>' % self._grammar
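# A minimal sketch of running the parser outside of the demo below, using one
# of the toy grammars shipped in nltk.grammar.
from nltk.grammar import toy_pcfg1
from nltk.parse.viterbi import ViterbiParser

viterbi = ViterbiParser(toy_pcfg1)
for tree in viterbi.parse('I saw the man with my telescope'.split()):
    print(tree.prob(), tree)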
##//////////////////////////////////////////////////////
# Define two demos. Each demo has a sentence and a grammar.
demos = [
- ("I saw the man with my telescope", toy_pcfg1),
- ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
+ ('I saw the man with my telescope', toy_pcfg1),
+ ('the boy saw Jack with Bob under the table with a telescope', toy_pcfg2),
]
# Ask the user which demo they want to use.
print()
for i in range(len(demos)):
- print("%3s: %s" % (i + 1, demos[i][0]))
- print(" %r" % demos[i][1])
+ print('%3s: %s' % (i + 1, demos[i][0]))
+ print(' %r' % demos[i][1])
print()
- print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
+ print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
try:
snum = int(sys.stdin.readline().strip()) - 1
sent, grammar = demos[snum]
except:
- print("Bad sentence number")
+ print('Bad sentence number')
return
# Tokenize the sentence.
parser = ViterbiParser(grammar)
all_parses = {}
- print("\nsent: %s\nparser: %s\ngrammar: %s" % (sent, parser, grammar))
+ print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
parser.trace(3)
t = time.time()
parses = parser.parse_all(tokens)
# Print some summary statistics
print()
- print("Time (secs) # Parses Average P(parse)")
- print("-----------------------------------------")
- print("%11.4f%11d%19.14f" % (time, num_parses, average))
+ print('Time (secs) # Parses Average P(parse)')
+ print('-----------------------------------------')
+ print('%11.4f%11d%19.14f' % (time, num_parses, average))
parses = all_parses.keys()
if parses:
p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
else:
p = 0
- print("------------------------------------------")
- print("%11s%11d%19.14f" % ("n/a", len(parses), p))
+ print('------------------------------------------')
+ print('%11s%11d%19.14f' % ('n/a', len(parses), p))
# Ask the user if we should draw the parses.
print()
- print("Draw parses (y/n)? ", end=" ")
- if sys.stdin.readline().strip().lower().startswith("y"):
+ print('Draw parses (y/n)? ', end=' ')
+ if sys.stdin.readline().strip().lower().startswith('y'):
from nltk.draw.tree import draw_trees
- print(" please wait...")
+ print(' please wait...')
draw_trees(*parses)
# Ask the user if we should print the parses.
print()
- print("Print parses (y/n)? ", end=" ")
- if sys.stdin.readline().strip().lower().startswith("y"):
+ print('Print parses (y/n)? ', end=' ')
+ if sys.stdin.readline().strip().lower().startswith('y'):
for parse in parses:
print(parse)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Probability and Statistics
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (additions)
# Trevor Cohn <tacohn@cs.mu.oz.au> (additions)
``ConditionalProbDist``, a derived distribution.
"""
+from __future__ import print_function, unicode_literals, division
import math
import random
from functools import reduce
from abc import ABCMeta, abstractmethod
+from six import itervalues, text_type, add_metaclass
+
+from nltk import compat
from nltk.internals import raise_unorderable_types
-_NINF = float("-1e300")
+_NINF = float('-1e300')
##//////////////////////////////////////////////////////
## Frequency Distributions
##//////////////////////////////////////////////////////
-
+@compat.python_2_unicode_compatible
class FreqDist(Counter):
"""
A frequency distribution for the outcomes of an experiment. A
"""
if len(self) == 0:
raise ValueError(
- "A FreqDist must have at least one sample before max is defined."
+ 'A FreqDist must have at least one sample before max is defined.'
)
return self.most_common(1)[0][0]
:type title: bool
"""
try:
- import matplotlib.pyplot as plt
+ from matplotlib import pylab
except ImportError:
raise ValueError(
- "The plot function requires matplotlib to be installed."
- "See http://matplotlib.org/"
+ 'The plot function requires matplotlib to be installed.'
+ 'See http://matplotlib.org/'
)
if len(args) == 0:
args = [len(self)]
samples = [item for item, _ in self.most_common(*args)]
- cumulative = _get_kwarg(kwargs, "cumulative", False)
- percents = _get_kwarg(kwargs, "percents", False)
+ cumulative = _get_kwarg(kwargs, 'cumulative', False)
+ percents = _get_kwarg(kwargs, 'percents', False)
if cumulative:
freqs = list(self._cumulative_frequencies(samples))
ylabel = "Cumulative Counts"
ylabel = "Counts"
# percents = [f * 100 for f in freqs] only in ProbDist?
- ax = plt.gca()
- ax.grid(True, color="silver")
-
+ pylab.grid(True, color="silver")
if "linewidth" not in kwargs:
kwargs["linewidth"] = 2
if "title" in kwargs:
- ax.set_title(kwargs["title"])
+ pylab.title(kwargs["title"])
del kwargs["title"]
-
- ax.plot(freqs, **kwargs)
- ax.set_xticks(range(len(samples)))
- ax.set_xticklabels([str(s) for s in samples], rotation=90)
- ax.set_xlabel("Samples")
- ax.set_ylabel(ylabel)
-
- plt.show()
-
- return ax
+ pylab.plot(freqs, **kwargs)
+ pylab.xticks(range(len(samples)), [text_type(s) for s in samples], rotation=90)
+ pylab.xlabel("Samples")
+ pylab.ylabel(ylabel)
+ pylab.show()
def tabulate(self, *args, **kwargs):
"""
args = [len(self)]
samples = [item for item, _ in self.most_common(*args)]
- cumulative = _get_kwarg(kwargs, "cumulative", False)
+ cumulative = _get_kwarg(kwargs, 'cumulative', False)
if cumulative:
freqs = list(self._cumulative_frequencies(samples))
else:
freqs = [self[sample] for sample in samples]
# percents = [f * 100 for f in freqs] only in ProbDist?
- width = max(len("{}".format(s)) for s in samples)
+ width = max(len("%s" % s) for s in samples)
width = max(width, max(len("%d" % f) for f in freqs))
for i in range(len(samples)):
- print("%*s" % (width, samples[i]), end=" ")
+ print("%*s" % (width, samples[i]), end=' ')
print()
for i in range(len(samples)):
- print("%*d" % (width, freqs[i]), end=" ")
+ print("%*d" % (width, freqs[i]), end=' ')
print()
def copy(self):
return self.__class__(super(FreqDist, self).__and__(other))
def __le__(self, other):
- """
- Returns True if this frequency distribution is a subset of the other
- and for no key the value exceeds the value of the same key from
- the other frequency distribution.
-
- The <= operator forms partial order and satisfying the axioms
- reflexivity, antisymmetry and transitivity.
-
- >>> FreqDist('a') <= FreqDist('a')
- True
- >>> a = FreqDist('abc')
- >>> b = FreqDist('aabc')
- >>> (a <= b, b <= a)
- (True, False)
- >>> FreqDist('a') <= FreqDist('abcd')
- True
- >>> FreqDist('abc') <= FreqDist('xyz')
- False
- >>> FreqDist('xyz') <= FreqDist('abc')
- False
- >>> c = FreqDist('a')
- >>> d = FreqDist('aa')
- >>> e = FreqDist('aaa')
- >>> c <= d and d <= e and c <= e
- True
- """
if not isinstance(other, FreqDist):
raise_unorderable_types("<=", self, other)
return set(self).issubset(other) and all(
self[key] <= other[key] for key in self
)
- def __ge__(self, other):
- if not isinstance(other, FreqDist):
- raise_unorderable_types(">=", self, other)
- return set(self).issuperset(other) and all(
- self[key] >= other[key] for key in other
- )
-
+ # @total_ordering doesn't work here, since the class inherits from a builtin class
+ __ge__ = lambda self, other: not self <= other or self == other
__lt__ = lambda self, other: self <= other and not self == other
- __gt__ = lambda self, other: self >= other and not self == other
+ __gt__ = lambda self, other: not self <= other
def __repr__(self):
"""
:type maxlen: int
:rtype: string
"""
- items = ["{0!r}: {1!r}".format(*item) for item in self.most_common(maxlen)]
+ items = ['{0!r}: {1!r}'.format(*item) for item in self.most_common(maxlen)]
if len(self) > maxlen:
- items.append("...")
- return "FreqDist({{{0}}})".format(", ".join(items))
+ items.append('...')
+ return 'FreqDist({{{0}}})'.format(', '.join(items))
def __str__(self):
"""
:rtype: string
"""
- return "<FreqDist with %d samples and %d outcomes>" % (len(self), self.N())
-
- def __iter__(self):
- """
- Return an iterator which yields tokens ordered by frequency.
-
- :rtype: iterator
- """
- for token, _ in self.most_common(self.B()):
- yield token
+ return '<FreqDist with %d samples and %d outcomes>' % (len(self), self.N())
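# A short sketch of FreqDist in use; the sample text is arbitrary.
from nltk.probability import FreqDist

fdist = FreqDist('abracadabra')
print(fdist.N(), fdist.B())       # 11 outcomes, 5 distinct samples
print(fdist.most_common(3))       # e.g. [('a', 5), ('b', 2), ('r', 2)]
print('%.3f' % fdist.freq('a'))   # relative frequency of 'a' (5/11)
fdist.tabulate()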
##//////////////////////////////////////////////////////
##//////////////////////////////////////////////////////
-class ProbDistI(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class ProbDistI(object):
"""
A probability distribution for the outcomes of an experiment. A
probability distribution specifies how likely it is that an
return random.choice(list(self.samples()))
-
+@compat.python_2_unicode_compatible
class UniformProbDist(ProbDistI):
"""
A probability distribution that assigns equal probability to each
"""
if len(samples) == 0:
raise ValueError(
- "A Uniform probability distribution must " + "have at least one sample."
+ 'A Uniform probability distribution must ' + 'have at least one sample.'
)
self._sampleset = set(samples)
self._prob = 1.0 / len(self._sampleset)
return self._samples
def __repr__(self):
- return "<UniformProbDist with %d samples>" % len(self._sampleset)
-
+ return '<UniformProbDist with %d samples>' % len(self._sampleset)
+@compat.python_2_unicode_compatible
class RandomProbDist(ProbDistI):
"""
Generates a random probability distribution whereby each sample
def __init__(self, samples):
if len(samples) == 0:
raise ValueError(
- "A probability distribution must " + "have at least one sample."
+ 'A probability distribution must ' + 'have at least one sample.'
)
self._probs = self.unirand(samples)
self._samples = list(self._probs.keys())
return dict((s, randrow[i]) for i, s in enumerate(samples))
def max(self):
- if not hasattr(self, "_max"):
+ if not hasattr(self, '_max'):
self._max = max((p, v) for (v, p) in self._probs.items())[1]
return self._max
return self._samples
def __repr__(self):
- return "<RandomUniformProbDist with %d samples>" % len(self._probs)
-
+ return '<RandomUniformProbDist with %d samples>' % len(self._probs)
+@compat.python_2_unicode_compatible
class DictionaryProbDist(ProbDistI):
"""
A probability distribution whose probabilities are directly
if normalize:
if len(prob_dict) == 0:
raise ValueError(
- "A DictionaryProbDist must have at least one sample "
- + "before it can be normalized."
+ 'A DictionaryProbDist must have at least one sample '
+ + 'before it can be normalized.'
)
if log:
value_sum = sum_logs(list(self._prob_dict.values()))
return math.log(self._prob_dict[sample], 2)
def max(self):
- if not hasattr(self, "_max"):
+ if not hasattr(self, '_max'):
self._max = max((p, v) for (v, p) in self._prob_dict.items())[1]
return self._max
return self._prob_dict.keys()
def __repr__(self):
- return "<ProbDist with %d samples>" % len(self._prob_dict)
-
+ return '<ProbDist with %d samples>' % len(self._prob_dict)
+@compat.python_2_unicode_compatible
class MLEProbDist(ProbDistI):
"""
The maximum likelihood estimate for the probability distribution
:rtype: str
:return: A string representation of this ``ProbDist``.
"""
- return "<MLEProbDist based on %d samples>" % self._freqdist.N()
-
+ return '<MLEProbDist based on %d samples>' % self._freqdist.N()
+@compat.python_2_unicode_compatible
class LidstoneProbDist(ProbDistI):
"""
The Lidstone estimate for the probability distribution of the
if (bins == 0) or (bins is None and freqdist.N() == 0):
name = self.__class__.__name__[:-8]
raise ValueError(
- "A %s probability distribution " % name + "must have at least one bin."
+ 'A %s probability distribution ' % name + 'must have at least one bin.'
)
if (bins is not None) and (bins < freqdist.B()):
name = self.__class__.__name__[:-8]
raise ValueError(
- "\nThe number of bins in a %s distribution " % name
- + "(%d) must be greater than or equal to\n" % bins
- + "the number of bins in the FreqDist used "
- + "to create it (%d)." % freqdist.B()
+ '\nThe number of bins in a %s distribution ' % name
+ + '(%d) must be greater than or equal to\n' % bins
+ + 'the number of bins in the FreqDist used '
+ + 'to create it (%d).' % freqdist.B()
)
self._freqdist = freqdist
:rtype: str
"""
- return "<LidstoneProbDist based on %d samples>" % self._freqdist.N()
-
+ return '<LidstoneProbDist based on %d samples>' % self._freqdist.N()
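# A worked sketch of the Lidstone estimate: with smoothing constant gamma and
# `bins` possible outcomes, prob(s) = (c(s) + gamma) / (N + bins * gamma). The
# counts below are invented.
from nltk.probability import FreqDist, LidstoneProbDist, MLEProbDist

fd = FreqDist({'cat': 3, 'dog': 1})       # N = 4, B = 2
mle = MLEProbDist(fd)
lid = LidstoneProbDist(fd, 0.5, bins=3)   # leave room for one unseen outcome
print(mle.prob('cat'), mle.prob('fish'))  # 0.75, 0.0
print(lid.prob('cat'), lid.prob('fish'))  # 3.5/5.5 = 0.636..., 0.5/5.5 = 0.0909...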
+@compat.python_2_unicode_compatible
class LaplaceProbDist(LidstoneProbDist):
"""
The Laplace estimate for the probability distribution of the
:rtype: str
:return: A string representation of this ``ProbDist``.
"""
- return "<LaplaceProbDist based on %d samples>" % self._freqdist.N()
-
+ return '<LaplaceProbDist based on %d samples>' % self._freqdist.N()
+@compat.python_2_unicode_compatible
class ELEProbDist(LidstoneProbDist):
"""
The expected likelihood estimate for the probability distribution
:rtype: str
"""
- return "<ELEProbDist based on %d samples>" % self._freqdist.N()
-
+ return '<ELEProbDist based on %d samples>' % self._freqdist.N()
+@compat.python_2_unicode_compatible
class HeldoutProbDist(ProbDistI):
"""
The heldout estimate for the probability distribution of the
:rtype: str
:return: A string representation of this ``ProbDist``.
"""
- s = "<HeldoutProbDist: %d base samples; %d heldout samples>"
+ s = '<HeldoutProbDist: %d base samples; %d heldout samples>'
return s % (self._base_fdist.N(), self._heldout_fdist.N())
-
+@compat.python_2_unicode_compatible
class CrossValidationProbDist(ProbDistI):
"""
The cross-validation estimate for the probability distribution of
:rtype: str
"""
- return "<CrossValidationProbDist: %d-way>" % len(self._freqdists)
-
+ return '<CrossValidationProbDist: %d-way>' % len(self._freqdists)
+@compat.python_2_unicode_compatible
class WittenBellProbDist(ProbDistI):
"""
The Witten-Bell estimate of a probability distribution. This distribution
:type bins: int
"""
assert bins is None or bins >= freqdist.B(), (
- "bins parameter must not be less than %d=freqdist.B()" % freqdist.B()
+ 'bins parameter must not be less than %d=freqdist.B()' % freqdist.B()
)
if bins is None:
bins = freqdist.B()
:rtype: str
"""
- return "<WittenBellProbDist based on %d samples>" % self._freqdist.N()
+ return '<WittenBellProbDist based on %d samples>' % self._freqdist.N()
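# A quick sketch of the Witten-Bell estimate: with T observed types, N observed
# tokens and Z = bins - T unseen bins, a seen sample gets c / (N + T) and each
# unseen sample gets T / (Z * (N + T)). The counts below are invented.
from nltk.probability import FreqDist, WittenBellProbDist

fd = FreqDist({'cat': 3, 'dog': 1})   # N = 4, T = 2
wb = WittenBellProbDist(fd, bins=4)   # Z = 4 - 2 = 2 unseen bins
print(wb.prob('cat'))                 # 3 / (4 + 2) = 0.5
print(wb.prob('fish'))                # 2 / (2 * (4 + 2)) = 0.1666...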
##//////////////////////////////////////////////////////
##//////////////////////////////////////////////////////
-
+@compat.python_2_unicode_compatible
class SimpleGoodTuringProbDist(ProbDistI):
"""
SimpleGoodTuring ProbDist approximates from frequency to frequency of
"""
assert (
bins is None or bins > freqdist.B()
- ), "bins parameter must not be less than %d=freqdist.B()+1" % (freqdist.B() + 1)
+ ), 'bins parameter must not be less than %d=freqdist.B()+1' % (freqdist.B() + 1)
if bins is None:
bins = freqdist.B() + 1
self._freqdist = freqdist
self._slope = xy_cov / x_var if x_var != 0 else 0.0
if self._slope >= -1:
warnings.warn(
- "SimpleGoodTuring did not find a proper best fit "
- "line for smoothing probabilities of occurrences. "
- "The probability estimates are likely to be "
- "unreliable."
+ 'SimpleGoodTuring did not find a proper best fit '
+ 'line for smoothing probabilities of occurrences. '
+ 'The probability estimates are likely to be '
+ 'unreliable.'
)
self._intercept = y_mean - self._slope * x_mean
:rtype: str
"""
- return "<SimpleGoodTuringProbDist based on %d samples>" % self._freqdist.N()
+ return '<SimpleGoodTuringProbDist based on %d samples>' % self._freqdist.N()
class MutableProbDist(ProbDistI):
# inherit documentation
i = self._sample_dict.get(sample)
if i is None:
- return float("-inf")
+ return float('-inf')
return self._data[i] if self._logs else math.log(self._data[i], 2)
def update(self, sample, prob, log=True):
# where possible.
-
+@compat.python_2_unicode_compatible
class KneserNeyProbDist(ProbDistI):
"""
Kneser-Ney estimate of a probability distribution. This is a version of
def prob(self, trigram):
# sample must be a triple
if len(trigram) != 3:
- raise ValueError("Expected an iterable with 3 members.")
+ raise ValueError('Expected an iterable with 3 members.')
trigram = tuple(trigram)
w0, w1, w2 = trigram
return self._trigrams.max()
def __repr__(self):
- """
+ '''
Return a string representation of this ProbDist
:rtype: str
- """
- return "<KneserNeyProbDist based on {0} trigrams".format(self._trigrams.N())
+ '''
+ return '<KneserNeyProbDist based on {0} trigrams'.format(self._trigrams.N())
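# A minimal sketch of Kneser-Ney smoothing over trigram counts; the corpus
# slice is arbitrary and assumes the gutenberg corpus has been downloaded.
from nltk.corpus import gutenberg
from nltk.probability import FreqDist, KneserNeyProbDist
from nltk.util import trigrams

words = gutenberg.words('austen-emma.txt')[:20000]
kn = KneserNeyProbDist(FreqDist(trigrams(words)))
best = kn.max()               # most probable trigram seen in the sample
print(best, kn.prob(best))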
##//////////////////////////////////////////////////////
def log_likelihood(test_pdist, actual_pdist):
if not isinstance(test_pdist, ProbDistI) or not isinstance(actual_pdist, ProbDistI):
- raise ValueError("expected a ProbDist.")
+ raise ValueError('expected a ProbDist.')
# Is this right?
return sum(
actual_pdist.prob(s) * math.log(test_pdist.prob(s), 2) for s in actual_pdist
##//////////////////////////////////////////////////////
-
+@compat.python_2_unicode_compatible
class ConditionalFreqDist(defaultdict):
"""
A collection of frequency distributions for a single experiment
:rtype: int
"""
- return sum(fdist.N() for fdist in self.values())
+ return sum(fdist.N() for fdist in itervalues(self))
def plot(self, *args, **kwargs):
"""
:type conditions: list
"""
try:
- import matplotlib.pyplot as plt #import statment fix
+ from matplotlib import pylab
except ImportError:
raise ValueError(
- "The plot function requires matplotlib to be installed."
- "See http://matplotlib.org/"
+ 'The plot function requires matplotlib to be installed.'
+ 'See http://matplotlib.org/'
)
cumulative = _get_kwarg(kwargs, 'cumulative', False)
percents = _get_kwarg(kwargs, 'percents', False)
- conditions = [c for c in _get_kwarg(kwargs, 'conditions', self.conditions()) if c in self] # conditions should be in self
+ conditions = _get_kwarg(kwargs, 'conditions', sorted(self.conditions()))
title = _get_kwarg(kwargs, 'title', '')
samples = _get_kwarg(
- kwargs, 'samples', sorted(set(v
- for c in conditions
- for v in self[c]))
+ kwargs, 'samples', sorted(set(v for c in conditions for v in self[c]))
) # this computation could be wasted
if "linewidth" not in kwargs:
kwargs["linewidth"] = 2
- ax = plt.gca()
- if (len(conditions) != 0):
- freqs = []
- for condition in conditions:
- if cumulative:
- # freqs should be a list of list where each sub list will be a frequency of a condition
- freqs.append(list(self[condition]._cumulative_frequencies(samples)))
- ylabel = "Cumulative Counts"
- legend_loc = 'lower right'
- if percents:
- freqs[-1] = [f / freqs[len(freqs) - 1] * 100 for f in freqs]
- ylabel = "Cumulative Percents"
- else:
- freqs.append([self[condition][sample] for sample in samples])
- ylabel = "Counts"
- legend_loc = 'upper right'
- # percents = [f * 100 for f in freqs] only in ConditionalProbDist?
-
- i = 0
- for freq in freqs:
- kwargs['label'] = conditions[i] #label for each condition
- i += 1
- ax.plot(freq, *args, **kwargs)
- ax.legend(loc=legend_loc)
- ax.grid(True, color="silver")
- ax.set_xticks(range(len(samples)))
- ax.set_xticklabels([str(s) for s in samples], rotation=90)
- if title:
- ax.set_title(title)
- ax.set_xlabel("Samples")
- ax.set_ylabel(ylabel)
- plt.show()
-
- return ax
+
+ for condition in conditions:
+ if cumulative:
+ freqs = list(self[condition]._cumulative_frequencies(samples))
+ ylabel = "Cumulative Counts"
+ legend_loc = 'lower right'
+ if percents:
+ freqs = [f / freqs[len(freqs) - 1] * 100 for f in freqs]
+ ylabel = "Cumulative Percents"
+ else:
+ freqs = [self[condition][sample] for sample in samples]
+ ylabel = "Counts"
+ legend_loc = 'upper right'
+ # percents = [f * 100 for f in freqs] only in ConditionalProbDist?
+ kwargs['label'] = "%s" % condition
+ pylab.plot(freqs, *args, **kwargs)
+
+ pylab.legend(loc=legend_loc)
+ pylab.grid(True, color="silver")
+ pylab.xticks(range(len(samples)), [text_type(s) for s in samples], rotation=90)
+ if title:
+ pylab.title(title)
+ pylab.xlabel("Samples")
+ pylab.ylabel(ylabel)
+ pylab.show()
def tabulate(self, *args, **kwargs):
"""
:type title: bool
"""
- cumulative = _get_kwarg(kwargs, "cumulative", False)
- conditions = _get_kwarg(kwargs, "conditions", sorted(self.conditions()))
+ cumulative = _get_kwarg(kwargs, 'cumulative', False)
+ conditions = _get_kwarg(kwargs, 'conditions', sorted(self.conditions()))
samples = _get_kwarg(
- kwargs,
- "samples",
- sorted(set(v for c in conditions if c in self for v in self[c])),
+ kwargs, 'samples', sorted(set(v for c in conditions for v in self[c]))
) # this computation could be wasted
width = max(len("%s" % s) for s in samples)
width = max(width, max(len("%d" % f) for f in freqs[c]))
condition_size = max(len("%s" % c) for c in conditions)
- print(" " * condition_size, end=" ")
+ print(' ' * condition_size, end=' ')
for s in samples:
- print("%*s" % (width, s), end=" ")
+ print("%*s" % (width, s), end=' ')
print()
for c in conditions:
- print("%*s" % (condition_size, c), end=" ")
+ print("%*s" % (condition_size, c), end=' ')
for f in freqs[c]:
- print("%*d" % (width, f), end=" ")
+ print("%*d" % (width, f), end=' ')
print()
# Mathematical operators
:rtype: str
"""
- return "<ConditionalFreqDist with %d conditions>" % len(self)
-
+ return '<ConditionalFreqDist with %d conditions>' % len(self)
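# A small sketch of ConditionalFreqDist: condition on word length and count the
# words observed under each length. The sentence is arbitrary.
from nltk.probability import ConditionalFreqDist

words = 'the cat sat on the mat with the other cat'.split()
cfd = ConditionalFreqDist((len(w), w) for w in words)
print(sorted(cfd.conditions()))   # [2, 3, 4, 5]
print(cfd[3].most_common(2))      # e.g. [('the', 3), ('cat', 2)]
cfd.tabulate()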
-class ConditionalProbDistI(dict, metaclass=ABCMeta):
+@compat.python_2_unicode_compatible
+@add_metaclass(ABCMeta)
+class ConditionalProbDistI(dict):
"""
A collection of probability distributions for a single experiment
run under different conditions. Conditional probability
:rtype: str
"""
- return "<%s with %d conditions>" % (type(self).__name__, len(self))
+ return '<%s with %d conditions>' % (type(self).__name__, len(self))
class ConditionalProbDist(ConditionalProbDistI):
the object.
:type logprob: float
"""
- if "prob" in kwargs:
- if "logprob" in kwargs:
- raise TypeError("Must specify either prob or logprob " "(not both)")
+ if 'prob' in kwargs:
+ if 'logprob' in kwargs:
+ raise TypeError('Must specify either prob or logprob ' '(not both)')
else:
- ProbabilisticMixIn.set_prob(self, kwargs["prob"])
- elif "logprob" in kwargs:
- ProbabilisticMixIn.set_logprob(self, kwargs["logprob"])
+ ProbabilisticMixIn.set_prob(self, kwargs['prob'])
+ elif 'logprob' in kwargs:
+ ProbabilisticMixIn.set_logprob(self, kwargs['logprob'])
else:
self.__prob = self.__logprob = None
class ImmutableProbabilisticMixIn(ProbabilisticMixIn):
def set_prob(self, prob):
- raise ValueError("%s is immutable" % self.__class__.__name__)
+ raise ValueError('%s is immutable' % self.__class__.__name__)
def set_logprob(self, prob):
- raise ValueError("%s is immutable" % self.__class__.__name__)
+ raise ValueError('%s is immutable' % self.__class__.__name__)
## Helper function for processing keyword arguments
samples are numbers from 1 to ``numsamples``, and are generated by
summing two numbers, each of which has a uniform distribution.
"""
+ import random
fdist = FreqDist()
for x in range(numoutcomes):
# Print the results in a formatted table.
print(
(
- "%d samples (1-%d); %d outcomes were sampled for each FreqDist"
+ '%d samples (1-%d); %d outcomes were sampled for each FreqDist'
% (numsamples, numsamples, numoutcomes)
)
)
- print("=" * 9 * (len(pdists) + 2))
- FORMATSTR = " FreqDist " + "%8s " * (len(pdists) - 1) + "| Actual"
+ print('=' * 9 * (len(pdists) + 2))
+ FORMATSTR = ' FreqDist ' + '%8s ' * (len(pdists) - 1) + '| Actual'
print(FORMATSTR % tuple(repr(pdist)[1:9] for pdist in pdists[:-1]))
- print("-" * 9 * (len(pdists) + 2))
- FORMATSTR = "%3d %8.6f " + "%8.6f " * (len(pdists) - 1) + "| %8.6f"
+ print('-' * 9 * (len(pdists) + 2))
+ FORMATSTR = '%3d %8.6f ' + '%8.6f ' * (len(pdists) - 1) + '| %8.6f'
for val in vals:
print(FORMATSTR % val)
# Print the totals for each column (should all be 1.0)
zvals = list(zip(*vals))
sums = [sum(val) for val in zvals[1:]]
- print("-" * 9 * (len(pdists) + 2))
- FORMATSTR = "Total " + "%8.6f " * (len(pdists)) + "| %8.6f"
+ print('-' * 9 * (len(pdists) + 2))
+ FORMATSTR = 'Total ' + '%8.6f ' * (len(pdists)) + '| %8.6f'
print(FORMATSTR % tuple(sums))
- print("=" * 9 * (len(pdists) + 2))
+ print('=' * 9 * (len(pdists) + 2))
# Display the distributions themselves, if they're short enough.
if len("%s" % fdist1) < 70:
- print(" fdist1: %s" % fdist1)
- print(" fdist2: %s" % fdist2)
- print(" fdist3: %s" % fdist3)
+ print(' fdist1: %s' % fdist1)
+ print(' fdist2: %s' % fdist2)
+ print(' fdist3: %s' % fdist3)
print()
- print("Generating:")
+ print('Generating:')
for pdist in pdists:
fdist = FreqDist(pdist.generate() for i in range(5000))
- print("%20s %s" % (pdist.__class__.__name__[:20], ("%s" % fdist)[:55]))
+ print('%20s %s' % (pdist.__class__.__name__[:20], ("%s" % fdist)[:55]))
print()
def gt_demo():
from nltk import corpus
- emma_words = corpus.gutenberg.words("austen-emma.txt")
+ emma_words = corpus.gutenberg.words('austen-emma.txt')
fd = FreqDist(emma_words)
sgt = SimpleGoodTuringProbDist(fd)
- print("%18s %8s %14s" % ("word", "frequency", "SimpleGoodTuring"))
+ print('%18s %8s %14s' % ("word", "frequency", "SimpleGoodTuring"))
fd_keys_sorted = (
key for key, value in sorted(fd.items(), key=lambda item: item[1], reverse=True)
)
for key in fd_keys_sorted:
- print("%18s %8d %14e" % (key, fd[key], sgt.prob(key)))
+ print('%18s %8d %14e' % (key, fd[key], sgt.prob(key)))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo(6, 10)
demo(5, 5000)
gt_demo()
__all__ = [
- "ConditionalFreqDist",
- "ConditionalProbDist",
- "ConditionalProbDistI",
- "CrossValidationProbDist",
- "DictionaryConditionalProbDist",
- "DictionaryProbDist",
- "ELEProbDist",
- "FreqDist",
- "SimpleGoodTuringProbDist",
- "HeldoutProbDist",
- "ImmutableProbabilisticMixIn",
- "LaplaceProbDist",
- "LidstoneProbDist",
- "MLEProbDist",
- "MutableProbDist",
- "KneserNeyProbDist",
- "ProbDistI",
- "ProbabilisticMixIn",
- "UniformProbDist",
- "WittenBellProbDist",
- "add_logs",
- "log_likelihood",
- "sum_logs",
- "entropy",
+ 'ConditionalFreqDist',
+ 'ConditionalProbDist',
+ 'ConditionalProbDistI',
+ 'CrossValidationProbDist',
+ 'DictionaryConditionalProbDist',
+ 'DictionaryProbDist',
+ 'ELEProbDist',
+ 'FreqDist',
+ 'SimpleGoodTuringProbDist',
+ 'HeldoutProbDist',
+ 'ImmutableProbabilisticMixIn',
+ 'LaplaceProbDist',
+ 'LidstoneProbDist',
+ 'MLEProbDist',
+ 'MutableProbDist',
+ 'KneserNeyProbDist',
+ 'ProbDistI',
+ 'ProbabilisticMixIn',
+ 'UniformProbDist',
+ 'WittenBellProbDist',
+ 'add_logs',
+ 'log_likelihood',
+ 'sum_logs',
+ 'entropy',
]
# Natural Language Toolkit: Semantic Interpretation
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
models/
boxer/
"""
+from __future__ import print_function, unicode_literals
import os
import re
DrtVariableExpression,
)
+from nltk.compat import python_2_unicode_compatible
+
class Boxer(object):
"""
self.set_bin_dir(bin_dir, verbose)
def set_bin_dir(self, bin_dir, verbose=False):
- self._candc_bin = self._find_binary("candc", bin_dir, verbose)
+ self._candc_bin = self._find_binary('candc', bin_dir, verbose)
self._candc_models_path = os.path.normpath(
- os.path.join(self._candc_bin[:-5], "../models")
+ os.path.join(self._candc_bin[:-5], '../models')
)
- self._boxer_bin = self._find_binary("boxer", bin_dir, verbose)
+ self._boxer_bin = self._find_binary('boxer', bin_dir, verbose)
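# A rough usage sketch of the class above. Boxer depends on the external C&C
# tools and boxer binaries (e.g. located via the CANDC environment variable or
# bin_dir), so it only runs where those are installed; the sentence is
# arbitrary.
from nltk.sem.boxer import Boxer

boxer = Boxer()
drs = boxer.interpret('The dog barks.')
print(drs)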
def interpret(self, input, discourse_id=None, question=False, verbose=False):
"""
:return: stdout
"""
args = [
- "--models",
- os.path.join(self._candc_models_path, ["boxer", "questions"][question]),
- "--candc-printer",
- "boxer",
+ '--models',
+ os.path.join(self._candc_models_path, ['boxer', 'questions'][question]),
+ '--candc-printer',
+ 'boxer',
]
return self._call(
- "\n".join(
+ '\n'.join(
sum(
(
["<META>'{0}'".format(id)] + d
f = None
try:
fd, temp_filename = tempfile.mkstemp(
- prefix="boxer-", suffix=".in", text=True
+ prefix='boxer-', suffix='.in', text=True
)
- f = os.fdopen(fd, "w")
+ f = os.fdopen(fd, 'w')
f.write(candc_out)
finally:
if f:
f.close()
args = [
- "--box",
- "false",
- "--semantics",
- "drs",
+ '--box',
+ 'false',
+ '--semantics',
+ 'drs',
#'--flat', 'false', # removed from boxer
- "--resolve",
- ["false", "true"][self._resolve],
- "--elimeq",
- ["false", "true"][self._elimeq],
- "--format",
- "prolog",
- "--instantiate",
- "true",
- "--input",
+ '--resolve',
+ ['false', 'true'][self._resolve],
+ '--elimeq',
+ ['false', 'true'][self._elimeq],
+ '--format',
+ 'prolog',
+ '--instantiate',
+ 'true',
+ '--input',
temp_filename,
]
stdout = self._call(None, self._boxer_bin, args, verbose)
return find_binary(
name,
path_to_bin=bin_dir,
- env_vars=["CANDC"],
- url="http://svn.ask.it.usyd.edu.au/trac/candc/",
- binary_names=[name, name + ".exe"],
+ env_vars=['CANDC'],
+ url='http://svn.ask.it.usyd.edu.au/trac/candc/',
+ binary_names=[name, name + '.exe'],
verbose=verbose,
)
:return: stdout
"""
if verbose:
- print("Calling:", binary)
- print("Args:", args)
- print("Input:", input_str)
- print("Command:", binary + " " + " ".join(args))
+ print('Calling:', binary)
+ print('Args:', args)
+ print('Input:', input_str)
+ print('Command:', binary + ' ' + ' '.join(args))
# Call via a subprocess
if input_str is None:
cmd = [binary] + args
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
else:
- cmd = 'echo "{0}" | {1} {2}'.format(input_str, binary, " ".join(args))
+ cmd = 'echo "{0}" | {1} {2}'.format(input_str, binary, ' '.join(args))
p = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
)
stdout, stderr = p.communicate()
if verbose:
- print("Return code:", p.returncode)
+ print('Return code:', p.returncode)
if stdout:
- print("stdout:\n", stdout, "\n")
+ print('stdout:\n', stdout, '\n')
if stderr:
- print("stderr:\n", stderr, "\n")
+ print('stderr:\n', stderr, '\n')
if p.returncode != 0:
raise Exception(
- "ERROR CALLING: {0} {1}\nReturncode: {2}\n{3}".format(
- binary, " ".join(args), p.returncode, stderr
+ 'ERROR CALLING: {0} {1}\nReturncode: {2}\n{3}'.format(
+ binary, ' '.join(args), p.returncode, stderr
)
)
return stdout
def _parse_to_drs_dict(self, boxer_out, use_disc_id):
- lines = boxer_out.split("\n")
+ lines = boxer_out.split('\n')
drs_dict = {}
i = 0
while i < len(lines):
line = lines[i]
- if line.startswith("id("):
- comma_idx = line.index(",")
+ if line.startswith('id('):
+ comma_idx = line.index(',')
discourse_id = line[3:comma_idx]
if discourse_id[0] == "'" and discourse_id[-1] == "'":
discourse_id = discourse_id[1:-1]
- drs_id = line[comma_idx + 1 : line.index(")")]
+ drs_id = line[comma_idx + 1 : line.index(')')]
i += 1
line = lines[i]
- assert line.startswith("sem({0},".format(drs_id))
+ assert line.startswith('sem({0},'.format(drs_id))
if line[-4:] == "').'":
line = line[:-4] + ")."
- assert line.endswith(")."), "can't parse line: {0}".format(line)
+ assert line.endswith(').'), "can't parse line: {0}".format(line)
- search_start = len("sem({0},[".format(drs_id))
+ search_start = len('sem({0},['.format(drs_id))
brace_count = 1
drs_start = -1
for j, c in enumerate(line[search_start:]):
- if c == "[":
+ if c == '[':
brace_count += 1
- if c == "]":
+ if c == ']':
brace_count -= 1
if brace_count == 0:
drs_start = search_start + j + 1
return DrtParser.parse(self, data, signature)
def get_all_symbols(self):
- return ["(", ")", ",", "[", "]", ":"]
+ return ['(', ')', ',', '[', ']', ':']
def handle(self, tok, context):
return self.handle_drs(tok)
return accum
def handle_drs(self, tok):
- if tok == "drs":
+ if tok == 'drs':
return self.parse_drs()
- elif tok in ["merge", "smerge"]:
+ elif tok in ['merge', 'smerge']:
return self._handle_binary_expression(self._make_merge_expression)(None, [])
- elif tok in ["alfa"]:
+ elif tok in ['alfa']:
return self._handle_alfa(self._make_merge_expression)(None, [])
def handle_condition(self, tok, indices):
:param indices: list of int
:return: list of ``DrtExpression``
"""
- if tok == "not":
+ if tok == 'not':
return [self._handle_not()]
- if tok == "or":
+ if tok == 'or':
conds = [self._handle_binary_expression(self._make_or_expression)]
- elif tok == "imp":
+ elif tok == 'imp':
conds = [self._handle_binary_expression(self._make_imp_expression)]
- elif tok == "eq":
+ elif tok == 'eq':
conds = [self._handle_eq()]
- elif tok == "prop":
+ elif tok == 'prop':
conds = [self._handle_prop()]
- elif tok == "pred":
+ elif tok == 'pred':
conds = [self._handle_pred()]
- elif tok == "named":
+ elif tok == 'named':
conds = [self._handle_named()]
- elif tok == "rel":
+ elif tok == 'rel':
conds = [self._handle_rel()]
- elif tok == "timex":
+ elif tok == 'timex':
conds = self._handle_timex()
- elif tok == "card":
+ elif tok == 'card':
conds = [self._handle_card()]
- elif tok == "whq":
+ elif tok == 'whq':
conds = [self._handle_whq()]
- elif tok == "duplex":
+ elif tok == 'duplex':
conds = [self._handle_duplex()]
else:
)
def _handle_not(self):
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
drs = self.process_next_expression(None)
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
return BoxerNot(drs)
def _handle_pred(self):
# pred(_G3943, dog, n, 0)
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
variable = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
name = self.token()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
pos = self.token()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
sense = int(self.token())
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
def _handle_pred_f(sent_index, word_indices):
return BoxerPred(
def _handle_duplex(self):
# duplex(whq, drs(...), var, drs(...))
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
# self.assertToken(self.token(), '[')
ans_types = []
# while self.token(0) != ']':
# ans_types.append(self.token())
# self.token() #swallow the ']'
- self.assertToken(self.token(), "whq")
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), 'whq')
+ self.assertToken(self.token(), ',')
d1 = self.process_next_expression(None)
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
ref = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
d2 = self.process_next_expression(None)
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
return lambda sent_index, word_indices: BoxerWhq(
self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
)
def _handle_named(self):
# named(x0, john, per, 0)
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
variable = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
name = self.token()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
type = self.token()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
sense = self.token() # as per boxer rev 2554
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
return lambda sent_index, word_indices: BoxerNamed(
self.discourse_id, sent_index, word_indices, variable, name, type, sense
)
def _handle_rel(self):
# rel(_G3993, _G3943, agent, 0)
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
var1 = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
var2 = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
rel = self.token()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
sense = int(self.token())
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
return lambda sent_index, word_indices: BoxerRel(
self.discourse_id, sent_index, word_indices, var1, var2, rel, sense
)
def _handle_timex(self):
# timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX'))
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
arg = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
new_conds = self._handle_time_expression(arg)
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
return new_conds
def _handle_time_expression(self, arg):
# date([]: (+), []:'XXXX', [1004]:'04', []:'XX')
tok = self.token()
- self.assertToken(self.token(), "(")
- if tok == "date":
+ self.assertToken(self.token(), '(')
+ if tok == 'date':
conds = self._handle_date(arg)
- elif tok == "time":
+ elif tok == 'time':
conds = self._handle_time(arg)
else:
return None
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
return [
lambda sent_index, word_indices: BoxerPred(
- self.discourse_id, sent_index, word_indices, arg, tok, "n", 0
+ self.discourse_id, sent_index, word_indices, arg, tok, 'n', 0
)
] + [lambda sent_index, word_indices: cond for cond in conds]
(sent_index, word_indices), = self._sent_and_word_indices(
self._parse_index_list()
)
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
pol = self.token()
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
conds.append(
BoxerPred(
self.discourse_id,
sent_index,
word_indices,
arg,
- "date_pol_{0}".format(pol),
- "a",
+ 'date_pol_{0}'.format(pol),
+ 'a',
0,
)
)
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
(sent_index, word_indices), = self._sent_and_word_indices(
self._parse_index_list()
)
year = self.token()
- if year != "XXXX":
- year = year.replace(":", "_")
+ if year != 'XXXX':
+ year = year.replace(':', '_')
conds.append(
BoxerPred(
self.discourse_id,
sent_index,
word_indices,
arg,
- "date_year_{0}".format(year),
- "a",
+ 'date_year_{0}'.format(year),
+ 'a',
0,
)
)
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
(sent_index, word_indices), = self._sent_and_word_indices(
self._parse_index_list()
)
month = self.token()
- if month != "XX":
+ if month != 'XX':
conds.append(
BoxerPred(
self.discourse_id,
sent_index,
word_indices,
arg,
- "date_month_{0}".format(month),
- "a",
+ 'date_month_{0}'.format(month),
+ 'a',
0,
)
)
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
(sent_index, word_indices), = self._sent_and_word_indices(
self._parse_index_list()
)
day = self.token()
- if day != "XX":
+ if day != 'XX':
conds.append(
BoxerPred(
self.discourse_id,
sent_index,
word_indices,
arg,
- "date_day_{0}".format(day),
- "a",
+ 'date_day_{0}'.format(day),
+ 'a',
0,
)
)
conds = []
self._parse_index_list()
hour = self.token()
- if hour != "XX":
- conds.append(self._make_atom("r_hour_2", arg, hour))
- self.assertToken(self.token(), ",")
+ if hour != 'XX':
+ conds.append(self._make_atom('r_hour_2', arg, hour))
+ self.assertToken(self.token(), ',')
self._parse_index_list()
min = self.token()
- if min != "XX":
- conds.append(self._make_atom("r_min_2", arg, min))
- self.assertToken(self.token(), ",")
+ if min != 'XX':
+ conds.append(self._make_atom('r_min_2', arg, min))
+ self.assertToken(self.token(), ',')
self._parse_index_list()
sec = self.token()
- if sec != "XX":
- conds.append(self._make_atom("r_sec_2", arg, sec))
+ if sec != 'XX':
+ conds.append(self._make_atom('r_sec_2', arg, sec))
return conds
def _handle_card(self):
# card(_G18535, 28, ge)
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
variable = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
value = self.token()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
type = self.token()
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
return lambda sent_index, word_indices: BoxerCard(
self.discourse_id, sent_index, word_indices, variable, value, type
)
def _handle_prop(self):
# prop(_G15949, drs(...))
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
variable = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
drs = self.process_next_expression(None)
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
return lambda sent_index, word_indices: BoxerProp(
self.discourse_id, sent_index, word_indices, variable, drs
)
def _parse_index_list(self):
# [1001,1002]:
indices = []
- self.assertToken(self.token(), "[")
- while self.token(0) != "]":
+ self.assertToken(self.token(), '[')
+ while self.token(0) != ']':
indices.append(self.parse_index())
- if self.token(0) == ",":
+ if self.token(0) == ',':
self.token() # swallow ','
self.token() # swallow ']'
- self.assertToken(self.token(), ":")
+ self.assertToken(self.token(), ':')
return indices
def parse_drs(self):
# drs([[1001]:_G3943],
# [[1002]:pred(_G3943, dog, n, 0)]
# )
- self.assertToken(self.token(), "(")
- self.assertToken(self.token(), "[")
+ self.assertToken(self.token(), '(')
+ self.assertToken(self.token(), '[')
refs = set()
- while self.token(0) != "]":
+ while self.token(0) != ']':
indices = self._parse_index_list()
refs.add(self.parse_variable())
- if self.token(0) == ",":
+ if self.token(0) == ',':
self.token() # swallow ','
self.token() # swallow ']'
- self.assertToken(self.token(), ",")
- self.assertToken(self.token(), "[")
+ self.assertToken(self.token(), ',')
+ self.assertToken(self.token(), '[')
conds = []
- while self.token(0) != "]":
+ while self.token(0) != ']':
indices = self._parse_index_list()
conds.extend(self.parse_condition(indices))
- if self.token(0) == ",":
+ if self.token(0) == ',':
self.token() # swallow ','
self.token() # swallow ']'
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
return BoxerDrs(list(refs), conds)
def _handle_binary_expression(self, make_callback):
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
drs1 = self.process_next_expression(None)
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
drs2 = self.process_next_expression(None)
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
return lambda sent_index, word_indices: make_callback(
sent_index, word_indices, drs1, drs2
)
def _handle_alfa(self, make_callback):
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
type = self.token()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
drs1 = self.process_next_expression(None)
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
drs2 = self.process_next_expression(None)
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
return lambda sent_index, word_indices: make_callback(
sent_index, word_indices, drs1, drs2
)
def _handle_eq(self):
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
var1 = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
var2 = self.parse_variable()
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
return lambda sent_index, word_indices: BoxerEq(
self.discourse_id, sent_index, word_indices, var1, var2
)
def _handle_whq(self):
- self.assertToken(self.token(), "(")
- self.assertToken(self.token(), "[")
+ self.assertToken(self.token(), '(')
+ self.assertToken(self.token(), '[')
ans_types = []
- while self.token(0) != "]":
+ while self.token(0) != ']':
cat = self.token()
- self.assertToken(self.token(), ":")
- if cat == "des":
+ self.assertToken(self.token(), ':')
+ if cat == 'des':
ans_types.append(self.token())
- elif cat == "num":
- ans_types.append("number")
+ elif cat == 'num':
+ ans_types.append('number')
typ = self.token()
- if typ == "cou":
- ans_types.append("count")
+ if typ == 'cou':
+ ans_types.append('count')
else:
ans_types.append(typ)
else:
ans_types.append(self.token())
self.token() # swallow the ']'
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
d1 = self.process_next_expression(None)
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
ref = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
d2 = self.process_next_expression(None)
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
return lambda sent_index, word_indices: BoxerWhq(
self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
)
def parse_variable(self):
var = self.token()
- assert re.match("^[exps]\d+$", var), var
+ assert re.match('^[exps]\d+$', var), var
return var
def parse_index(self):
# conds = self.handle_conds(None)
# self.assertNextToken(DrtTokens.CLOSE)
# return BoxerDrs(label, refs, conds)
- if tok == "pred":
+ if tok == 'pred':
self.assertNextToken(DrtTokens.OPEN)
disc_id = (
self.discourse_id if self.discourse_id is not None else self.token()
sense = int(self.token())
self.assertNextToken(DrtTokens.CLOSE)
return BoxerPred(disc_id, sent_id, word_ids, variable, name, pos, sense)
- elif tok == "named":
+ elif tok == 'named':
self.assertNextToken(DrtTokens.OPEN)
disc_id = (
self.discourse_id if self.discourse_id is not None else self.token()
return BoxerNamed(
disc_id, sent_id, word_ids, variable, name, type, sense
)
- elif tok == "rel":
+ elif tok == 'rel':
self.assertNextToken(DrtTokens.OPEN)
disc_id = (
self.discourse_id if self.discourse_id is not None else self.token()
sense = int(self.token())
self.assertNextToken(DrtTokens.CLOSE)
return BoxerRel(disc_id, sent_id, word_ids, var1, var2, rel, sense)
- elif tok == "prop":
+ elif tok == 'prop':
self.assertNextToken(DrtTokens.OPEN)
disc_id = (
self.discourse_id if self.discourse_id is not None else self.token()
drs = self.process_next_expression(None)
self.assertNextToken(DrtTokens.CLOSE)
return BoxerProp(disc_id, sent_id, word_ids, variable, drs)
- elif tok == "not":
+ elif tok == 'not':
self.assertNextToken(DrtTokens.OPEN)
drs = self.process_next_expression(None)
self.assertNextToken(DrtTokens.CLOSE)
return BoxerNot(drs)
- elif tok == "imp":
+ elif tok == 'imp':
self.assertNextToken(DrtTokens.OPEN)
drs1 = self.process_next_expression(None)
self.assertNextToken(DrtTokens.COMMA)
drs2 = self.process_next_expression(None)
self.assertNextToken(DrtTokens.CLOSE)
return BoxerDrs(drs1.refs, drs1.conds, drs2)
- elif tok == "or":
+ elif tok == 'or':
self.assertNextToken(DrtTokens.OPEN)
disc_id = (
self.discourse_id if self.discourse_id is not None else self.token()
drs2 = self.process_next_expression(None)
self.assertNextToken(DrtTokens.CLOSE)
return BoxerOr(disc_id, sent_id, word_ids, drs1, drs2)
- elif tok == "eq":
+ elif tok == 'eq':
self.assertNextToken(DrtTokens.OPEN)
disc_id = (
self.discourse_id if self.discourse_id is not None else self.token()
var2 = int(self.token())
self.assertNextToken(DrtTokens.CLOSE)
return BoxerEq(disc_id, sent_id, word_ids, var1, var2)
- elif tok == "card":
+ elif tok == 'card':
self.assertNextToken(DrtTokens.OPEN)
disc_id = (
self.discourse_id if self.discourse_id is not None else self.token()
type = self.token()
self.assertNextToken(DrtTokens.CLOSE)
return BoxerCard(disc_id, sent_id, word_ids, var, value, type)
- elif tok == "whq":
+ elif tok == 'whq':
self.assertNextToken(DrtTokens.OPEN)
disc_id = (
self.discourse_id if self.discourse_id is not None else self.token()
def nullableIntToken(self):
t = self.token()
- return int(t) if t != "None" else None
+ return int(t) if t != 'None' else None
def get_next_token_variable(self, description):
try:
return self.token()
except ExpectedMoreTokensException as e:
- raise ExpectedMoreTokensException(e.index, "Variable expected.")
+ raise ExpectedMoreTokensException(e.index, 'Variable expected.')
class AbstractBoxerDrs(object):
def variable_types(self):
vartypes = {}
- for t, vars in zip(("z", "e", "p"), self.variables()):
+ for t, vars in zip(('z', 'e', 'p'), self.variables()):
for v in vars:
vartypes[v] = t
return vartypes
return self
def _clean_name(self, name):
- return name.replace("-", "_").replace("'", "_")
+ return name.replace('-', '_').replace("'", "_")
def renumber_sentences(self, f):
return self
return hash("{0}".format(self))
+@python_2_unicode_compatible
class BoxerDrs(AbstractBoxerDrs):
def __init__(self, refs, conds, consequent=None):
AbstractBoxerDrs.__init__(self)
)
def __repr__(self):
- s = "drs([%s], [%s])" % (
- ", ".join("%s" % r for r in self.refs),
- ", ".join("%s" % c for c in self.conds),
+ s = 'drs([%s], [%s])' % (
+ ', '.join("%s" % r for r in self.refs),
+ ', '.join("%s" % c for c in self.conds),
)
if self.consequent is not None:
- s = "imp(%s, %s)" % (s, self.consequent)
+ s = 'imp(%s, %s)' % (s, self.consequent)
return s
def __eq__(self, other):
__hash__ = AbstractBoxerDrs.__hash__
+@python_2_unicode_compatible
class BoxerNot(AbstractBoxerDrs):
def __init__(self, drs):
AbstractBoxerDrs.__init__(self)
return BoxerNot(self.drs.renumber_sentences(f))
def __repr__(self):
- return "not(%s)" % (self.drs)
+ return 'not(%s)' % (self.drs)
def __eq__(self, other):
return self.__class__ == other.__class__ and self.drs == other.drs
__hash__ = AbstractBoxerDrs.__hash__
+@python_2_unicode_compatible
class BoxerIndexed(AbstractBoxerDrs):
def __init__(self, discourse_id, sent_index, word_indices):
AbstractBoxerDrs.__init__(self)
__hash__ = AbstractBoxerDrs.__hash__
def __repr__(self):
- s = "%s(%s, %s, [%s]" % (
+ s = '%s(%s, %s, [%s]' % (
self._pred(),
self.discourse_id,
self.sent_index,
- ", ".join("%s" % wi for wi in self.word_indices),
+ ', '.join("%s" % wi for wi in self.word_indices),
)
for v in self:
- s += ", %s" % v
- return s + ")"
+ s += ', %s' % v
+ return s + ')'
class BoxerPred(BoxerIndexed):
return iter((self.var, self.name, self.pos, self.sense))
def _pred(self):
- return "pred"
+ return 'pred'
class BoxerNamed(BoxerIndexed):
return iter((self.var, self.name, self.type, self.sense))
def _pred(self):
- return "named"
+ return 'named'
class BoxerRel(BoxerIndexed):
return iter((self.var1, self.var2, self.rel, self.sense))
def _pred(self):
- return "rel"
+ return 'rel'
class BoxerProp(BoxerIndexed):
return iter((self.var, self.drs))
def _pred(self):
- return "prop"
+ return 'prop'
class BoxerEq(BoxerIndexed):
return iter((self.var1, self.var2))
def _pred(self):
- return "eq"
+ return 'eq'
class BoxerCard(BoxerIndexed):
return iter((self.var, self.value, self.type))
def _pred(self):
- return "card"
+ return 'card'
class BoxerOr(BoxerIndexed):
return iter((self.drs1, self.drs2))
def _pred(self):
- return "or"
+ return 'or'
class BoxerWhq(BoxerIndexed):
def __iter__(self):
return iter(
- ("[" + ",".join(self.ans_types) + "]", self.drs1, self.variable, self.drs2)
+ ('[' + ','.join(self.ans_types) + ']', self.drs1, self.variable, self.drs2)
)
def _pred(self):
- return "whq"
+ return 'whq'
class PassthroughBoxerDrsInterpreter(object):
elif isinstance(ex, BoxerNot):
return DrtNegatedExpression(self.interpret(ex.drs))
elif isinstance(ex, BoxerPred):
- pred = self._add_occur_indexing("%s_%s" % (ex.pos, ex.name), ex)
+ pred = self._add_occur_indexing('%s_%s' % (ex.pos, ex.name), ex)
return self._make_atom(pred, ex.var)
elif isinstance(ex, BoxerNamed):
- pred = self._add_occur_indexing("ne_%s_%s" % (ex.type, ex.name), ex)
+ pred = self._add_occur_indexing('ne_%s_%s' % (ex.type, ex.name), ex)
return self._make_atom(pred, ex.var)
elif isinstance(ex, BoxerRel):
- pred = self._add_occur_indexing("%s" % (ex.rel), ex)
+ pred = self._add_occur_indexing('%s' % (ex.rel), ex)
return self._make_atom(pred, ex.var1, ex.var2)
elif isinstance(ex, BoxerProp):
return DrtProposition(Variable(ex.var), self.interpret(ex.drs))
DrtVariableExpression(Variable(ex.var2)),
)
elif isinstance(ex, BoxerCard):
- pred = self._add_occur_indexing("card_%s_%s" % (ex.type, ex.value), ex)
+ pred = self._add_occur_indexing('card_%s_%s' % (ex.type, ex.value), ex)
return self._make_atom(pred, ex.var)
elif isinstance(ex, BoxerOr):
return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2))
drs1 = self.interpret(ex.drs1)
drs2 = self.interpret(ex.drs2)
return DRS(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
- assert False, "%s: %s" % (ex.__class__.__name__, ex)
+ assert False, '%s: %s' % (ex.__class__.__name__, ex)
def _make_atom(self, pred, *args):
accum = DrtVariableExpression(Variable(pred))
def _add_occur_indexing(self, base, ex):
if self._occur_index and ex.sent_index is not None:
if ex.discourse_id:
- base += "_%s" % ex.discourse_id
- base += "_s%s" % ex.sent_index
- base += "_w%s" % sorted(ex.word_indices)[0]
+ base += '_%s' % ex.discourse_id
+ base += '_s%s' % ex.sent_index
+ base += '_w%s' % sorted(ex.word_indices)[0]
return base
pass
-if __name__ == "__main__":
+if __name__ == '__main__':
opts = OptionParser("usage: %prog TEXT [options]")
opts.add_option(
"--verbose",
interpreter = NltkDrtBoxerDrsInterpreter(occur_index=options.occur_index)
drs = Boxer(interpreter).interpret_multi(
- args[0].split(r"\n"), question=options.question, verbose=options.verbose
+ args[0].split(r'\n'), question=options.question, verbose=options.verbose
)
if drs is None:
print(None)
# Natural Language Toolkit: Chat-80 KB Reader
# See http://www.w3.org/TR/swbp-skos-core-guide/
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>,
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
current directory.
"""
+from __future__ import print_function, unicode_literals
import re
import shelve
import os
import sys
+from six import string_types
+
import nltk.data
+from nltk.compat import python_2_unicode_compatible
###########################################################################
# Chat-80 relation metadata bundles needed to build the valuation
###########################################################################
borders = {
- "rel_name": "borders",
- "closures": ["symmetric"],
- "schema": ["region", "border"],
- "filename": "borders.pl",
+ 'rel_name': 'borders',
+ 'closures': ['symmetric'],
+ 'schema': ['region', 'border'],
+ 'filename': 'borders.pl',
}
contains = {
- "rel_name": "contains0",
- "closures": ["transitive"],
- "schema": ["region", "contain"],
- "filename": "contain.pl",
+ 'rel_name': 'contains0',
+ 'closures': ['transitive'],
+ 'schema': ['region', 'contain'],
+ 'filename': 'contain.pl',
}
city = {
- "rel_name": "city",
- "closures": [],
- "schema": ["city", "country", "population"],
- "filename": "cities.pl",
+ 'rel_name': 'city',
+ 'closures': [],
+ 'schema': ['city', 'country', 'population'],
+ 'filename': 'cities.pl',
}
country = {
- "rel_name": "country",
- "closures": [],
- "schema": [
- "country",
- "region",
- "latitude",
- "longitude",
- "area",
- "population",
- "capital",
- "currency",
+ 'rel_name': 'country',
+ 'closures': [],
+ 'schema': [
+ 'country',
+ 'region',
+ 'latitude',
+ 'longitude',
+ 'area',
+ 'population',
+ 'capital',
+ 'currency',
],
- "filename": "countries.pl",
+ 'filename': 'countries.pl',
}
circle_of_lat = {
- "rel_name": "circle_of_latitude",
- "closures": [],
- "schema": ["circle_of_latitude", "degrees"],
- "filename": "world1.pl",
+ 'rel_name': 'circle_of_latitude',
+ 'closures': [],
+ 'schema': ['circle_of_latitude', 'degrees'],
+ 'filename': 'world1.pl',
}
circle_of_long = {
- "rel_name": "circle_of_longitude",
- "closures": [],
- "schema": ["circle_of_longitude", "degrees"],
- "filename": "world1.pl",
+ 'rel_name': 'circle_of_longitude',
+ 'closures': [],
+ 'schema': ['circle_of_longitude', 'degrees'],
+ 'filename': 'world1.pl',
}
continent = {
- "rel_name": "continent",
- "closures": [],
- "schema": ["continent"],
- "filename": "world1.pl",
+ 'rel_name': 'continent',
+ 'closures': [],
+ 'schema': ['continent'],
+ 'filename': 'world1.pl',
}
region = {
- "rel_name": "in_continent",
- "closures": [],
- "schema": ["region", "continent"],
- "filename": "world1.pl",
+ 'rel_name': 'in_continent',
+ 'closures': [],
+ 'schema': ['region', 'continent'],
+ 'filename': 'world1.pl',
}
ocean = {
- "rel_name": "ocean",
- "closures": [],
- "schema": ["ocean"],
- "filename": "world1.pl",
+ 'rel_name': 'ocean',
+ 'closures': [],
+ 'schema': ['ocean'],
+ 'filename': 'world1.pl',
}
-sea = {"rel_name": "sea", "closures": [], "schema": ["sea"], "filename": "world1.pl"}
+sea = {'rel_name': 'sea', 'closures': [], 'schema': ['sea'], 'filename': 'world1.pl'}
items = [
- "borders",
- "contains",
- "city",
- "country",
- "circle_of_lat",
- "circle_of_long",
- "continent",
- "region",
- "ocean",
- "sea",
+ 'borders',
+ 'contains',
+ 'city',
+ 'country',
+ 'circle_of_lat',
+ 'circle_of_long',
+ 'continent',
+ 'region',
+ 'ocean',
+ 'sea',
]
items = tuple(sorted(items))
item_metadata = {
- "borders": borders,
- "contains": contains,
- "city": city,
- "country": country,
- "circle_of_lat": circle_of_lat,
- "circle_of_long": circle_of_long,
- "continent": continent,
- "region": region,
- "ocean": ocean,
- "sea": sea,
+ 'borders': borders,
+ 'contains': contains,
+ 'city': city,
+ 'country': country,
+ 'circle_of_lat': circle_of_lat,
+ 'circle_of_long': circle_of_long,
+ 'continent': continent,
+ 'region': region,
+ 'ocean': ocean,
+ 'sea': sea,
}
rels = item_metadata.values()
-not_unary = ["borders.pl", "contain.pl"]
+not_unary = ['borders.pl', 'contain.pl']
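# Illustrative sketch (editorial addition, not part of the diff): each bundle
# above names the Prolog clause file to read, the relation to extract, the
# closures to apply, and a schema labelling the argument positions.  Using the
# helpers defined later in this module, a single bundle can be expanded into
# Concept objects roughly as follows:
#
#     bundle = item_metadata['city']
#     concepts = clause2concepts(bundle['filename'], bundle['rel_name'],
#                                bundle['schema'], bundle['closures'])
#     val = make_valuation(concepts, read=True)  # Valuation over the Chat-80 domain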
###########################################################################
+@python_2_unicode_compatible
class Concept(object):
"""
A Concept class, loosely based on SKOS
from nltk.sem import is_rel
assert is_rel(self._extension)
- if "symmetric" in self.closures:
+ if 'symmetric' in self.closures:
pairs = []
for (x, y) in self._extension:
pairs.append((y, x))
sym = set(pairs)
self._extension = self._extension.union(sym)
- if "transitive" in self.closures:
+ if 'transitive' in self.closures:
all = self._make_graph(self._extension)
closed = self._transclose(all)
trans = self._make_pairs(closed)
self._extension = self._extension.union(trans)
self.extension = sorted(list(self._extension))
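        # Editorial note: the 'symmetric' closure mirrors every pair, so a
        # borders(a, b) fact also yields borders(b, a); the 'transitive'
        # closure chains pairs, so contains0(a, b) and contains0(b, c) also
        # yield contains0(a, c).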
cur = connection.cursor()
if setup:
cur.execute(
- """CREATE TABLE city_table
- (City text, Country text, Population int)"""
+ '''CREATE TABLE city_table
+ (City text, Country text, Population int)'''
)
table_name = "city_table"
for t in records:
- cur.execute("insert into %s values (?,?,?)" % table_name, t)
+ cur.execute('insert into %s values (?,?,?)' % table_name, t)
if verbose:
print("inserting values into %s: " % table_name, t)
connection.commit()
contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
for line in contents.splitlines():
if line.startswith(rel):
- line = re.sub(rel + r"\(", "", line)
- line = re.sub(r"\)\.$", "", line)
- record = line.split(",")
+ line = re.sub(rel + r'\(', '', line)
+ line = re.sub(r'\)\.$', '', line)
+ record = line.split(',')
recs.append(record)
return recs
:return: ``Concept`` of arity 2
:rtype: Concept
"""
- if not label == "border" and not label == "contain":
- label = label + "_of"
+ if not label == 'border' and not label == 'contain':
+ label = label + '_of'
c = Concept(label, arity=2, closures=closures, extension=set())
for record in records:
c.augment((record[subj], record[obj]))
"""
concepts = {}
for rel in rels:
- rel_name = rel["rel_name"]
- closures = rel["closures"]
- schema = rel["schema"]
- filename = rel["filename"]
+ rel_name = rel['rel_name']
+ closures = rel['closures']
+ schema = rel['schema']
+ filename = rel['filename']
concept_list = clause2concepts(filename, rel_name, schema, closures)
for c in concept_list:
"""
concepts = process_bundle(rels).values()
valuation = make_valuation(concepts, read=True)
- db_out = shelve.open(db, "n")
+ db_out = shelve.open(db, 'n')
db_out.update(valuation)
pairs = [(e, e) for e in domain]
if lexicon:
lex = make_lex(domain)
- with open("chat_pnames.cfg", "w") as outfile:
+ with open("chat_pnames.cfg", 'w') as outfile:
outfile.writelines(lex)
# read the pairs into the valuation
valuation.update(pairs)
template = "PropN[num=sg, sem=<\P.(P %s)>] -> '%s'\n"
for s in symbols:
- parts = s.split("_")
+ parts = s.split('_')
caps = [p.capitalize() for p in parts]
- pname = "_".join(caps)
+ pname = '_'.join(caps)
rule = template % (s, pname)
lex.append(rule)
return lex
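# Illustrative example (editorial addition): for a symbol such as
# 'washington_dc', the template above produces the grammar rule
#
#     PropN[num=sg, sem=<\P.(P washington_dc)>] -> 'Washington_Dc'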
:return: the ``Concept`` objects which are extracted from the relations
:rtype: list(Concept)
"""
- if isinstance(items, str):
+ if isinstance(items, string_types):
items = (items,)
rels = [item_metadata[r] for r in items]
"""
print()
print("Using SQL to extract rows from 'city.db' RDB.")
- for row in sql_query("corpora/city_database/city.db", "SELECT * FROM city_table"):
+ for row in sql_query('corpora/city_database/city.db', "SELECT * FROM city_table"):
print(row)
-if __name__ == "__main__":
+if __name__ == '__main__':
main()
sql_demo()
# Natural Language Toolkit: Cooper storage for Quantifier Ambiguity
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
from nltk.sem.logic import LambdaExpression, ApplicationExpression, Variable
from nltk.parse import load_parser
self.featstruct = featstruct
self.readings = []
try:
- self.core = featstruct["CORE"]
- self.store = featstruct["STORE"]
+ self.core = featstruct['CORE']
+ self.store = featstruct['STORE']
except KeyError:
print("%s is not a Cooper storage structure" % featstruct)
Use a grammar with Binding Operators to parse a sentence.
"""
if not grammar:
- grammar = "grammars/book_grammars/storage.fcfg"
+ grammar = 'grammars/book_grammars/storage.fcfg'
parser = load_parser(grammar, trace=trace, chart_class=InstantiateVarsChart)
# Parse the sentence.
tokens = sentence.split()
print("=" * 50)
trees = cs.parse_with_bindops(sentence, trace=0)
for tree in trees:
- semrep = cs.CooperStore(tree.label()["SEM"])
+ semrep = cs.CooperStore(tree.label()['SEM'])
print()
print("Binding operators:")
print("-" * 15)
print("%s: %s" % (i + 1, reading))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
import operator
from functools import reduce
from itertools import chain
+from six import string_types
+
+from nltk.compat import python_2_unicode_compatible
from nltk.sem.logic import (
APP,
AbstractVariableExpression,
# Import Tkinter-based modules if they are available
try:
- from tkinter import Canvas, Tk
- from tkinter.font import Font
+ from six.moves.tkinter import Canvas, Tk
+ from six.moves.tkinter_font import Font
from nltk.util import in_idle
except ImportError:
class DrtTokens(Tokens):
- DRS = "DRS"
- DRS_CONC = "+"
- PRONOUN = "PRO"
- OPEN_BRACKET = "["
- CLOSE_BRACKET = "]"
- COLON = ":"
+ DRS = 'DRS'
+ DRS_CONC = '+'
+ PRONOUN = 'PRO'
+ OPEN_BRACKET = '['
+ CLOSE_BRACKET = ']'
+ COLON = ':'
PUNCT = [DRS_CONC, OPEN_BRACKET, CLOSE_BRACKET, COLON]
# Support expressions like: DRS([x y],C) == DRS([x,y],C)
if refs and self.token(0) == DrtTokens.COMMA:
self.token() # swallow the comma
- refs.append(self.get_next_token_variable("quantified"))
+ refs.append(self.get_next_token_variable('quantified'))
self.assertNextToken(DrtTokens.CLOSE_BRACKET)
return refs
def handle_prop(self, tok, context):
variable = self.make_VariableExpression(tok)
- self.assertNextToken(":")
+ self.assertNextToken(':')
drs = self.process_next_expression(DrtTokens.COLON)
return DrtProposition(variable, drs)
return DRS(first.refs, first.conds, second)
if isinstance(first, DrtConcatenation):
return DrtConcatenation(first.first, first.second, second)
- raise Exception("Antecedent of implication must be a DRS")
+ raise Exception('Antecedent of implication must be a DRS')
return make_imp_expression
else:
return DRS(self.refs, self.conds, other)
if isinstance(self, DrtConcatenation):
return DrtConcatenation(self.first, self.second, other)
- raise Exception("Antecedent of implication must be a DRS")
+ raise Exception('Antecedent of implication must be a DRS')
def equiv(self, other, prover=None):
"""
Draw the DRS
:return: the pretty print string
"""
- return "\n".join(self._pretty())
+ return '\n'.join(self._pretty())
def pretty_print(self):
print(self.pretty_format())
DrsDrawer(self).draw()
+@python_2_unicode_compatible
class DRS(DrtExpression, Expression):
"""A Discourse Representation Structure."""
return accum
def _pretty(self):
- refs_line = " ".join(self._order_ref_strings(self.refs))
+ refs_line = ' '.join(self._order_ref_strings(self.refs))
cond_lines = [
cond
length = max([len(refs_line)] + list(map(len, cond_lines)))
drs = (
[
- " _" + "_" * length + "_ ",
- "| " + refs_line.ljust(length) + " |",
- "|-" + "-" * length + "-|",
+ ' _' + '_' * length + '_ ',
+ '| ' + refs_line.ljust(length) + ' |',
+ '|-' + '-' * length + '-|',
]
- + ["| " + line.ljust(length) + " |" for line in cond_lines]
- + ["|_" + "_" * length + "_|"]
+ + ['| ' + line.ljust(length) + ' |' for line in cond_lines]
+ + ['|_' + '_' * length + '_|']
)
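        # Editorial illustration: for the DRS ([x],[man(x), walks(x)]) the
        # lines assembled above render roughly as
        #
        #      __________
        #     | x        |
        #     |----------|
        #     | man(x)   |
        #     | walks(x) |
        #     |__________|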
if self.consequent:
return DrtBinaryExpression._assemble_pretty(
__hash__ = Expression.__hash__
def __str__(self):
- drs = "([%s],[%s])" % (
- ",".join(self._order_ref_strings(self.refs)),
- ", ".join("%s" % cond for cond in self.conds),
+ drs = '([%s],[%s])' % (
+ ','.join(self._order_ref_strings(self.refs)),
+ ', '.join("%s" % cond for cond in self.conds),
) # map(str, self.conds)))
if self.consequent:
return (
DrtTokens.OPEN
+ drs
- + " "
+ + ' '
+ DrtTokens.IMP
- + " "
+ + ' '
+ "%s" % self.consequent
+ DrtTokens.CLOSE
)
def _pretty(self):
s = "%s" % self
- blank = " " * len(s)
+ blank = ' ' * len(s)
return [blank, blank, s, blank]
def eliminate_equality(self):
pass
+@python_2_unicode_compatible
class DrtProposition(DrtExpression, Expression):
def __init__(self, variable, drs):
self.variable = variable
def _pretty(self):
drs_s = self.drs._pretty()
- blank = " " * len("%s" % self.variable)
+ blank = ' ' * len("%s" % self.variable)
return (
- [blank + " " + line for line in drs_s[:1]]
- + ["%s" % self.variable + ":" + line for line in drs_s[1:2]]
- + [blank + " " + line for line in drs_s[2:]]
+ [blank + ' ' + line for line in drs_s[:1]]
+ + ["%s" % self.variable + ':' + line for line in drs_s[1:2]]
+ + [blank + ' ' + line for line in drs_s[2:]]
)
def visit(self, function, combinator):
return combinator(self.variable, function(self.drs))
def __str__(self):
- return "prop(%s, %s)" % (self.variable, self.drs)
+ return 'prop(%s, %s)' % (self.variable, self.drs)
class DrtNegatedExpression(DrtExpression, NegatedExpression):
def _pretty(self):
term_lines = self.term._pretty()
return (
- [" " + line for line in term_lines[:2]]
- + ["__ " + line for line in term_lines[2:3]]
- + [" | " + line for line in term_lines[3:4]]
- + [" " + line for line in term_lines[4:]]
+ [' ' + line for line in term_lines[:2]]
+ + ['__ ' + line for line in term_lines[2:3]]
+ + [' | ' + line for line in term_lines[3:4]]
+ + [' ' + line for line in term_lines[4:]]
)
while term.__class__ == self.__class__:
variables.append(term.variable)
term = term.term
- var_string = " ".join("%s" % v for v in variables) + DrtTokens.DOT
+ var_string = ' '.join("%s" % v for v in variables) + DrtTokens.DOT
term_lines = term._pretty()
- blank = " " * len(var_string)
+ blank = ' ' * len(var_string)
return (
- [" " + blank + line for line in term_lines[:1]]
- + [" \ " + blank + line for line in term_lines[1:2]]
- + [" /\ " + var_string + line for line in term_lines[2:3]]
- + [" " + blank + line for line in term_lines[3:]]
+ [' ' + blank + line for line in term_lines[:1]]
+ + [' \ ' + blank + line for line in term_lines[1:2]]
+ + [' /\ ' + var_string + line for line in term_lines[2:3]]
+ + [' ' + blank + line for line in term_lines[3:]]
)
max_lines = max(len(first_lines), len(second_lines))
first_lines = _pad_vertically(first_lines, max_lines)
second_lines = _pad_vertically(second_lines, max_lines)
- blank = " " * len(op)
+ blank = ' ' * len(op)
first_second_lines = list(zip(first_lines, second_lines))
return (
[
- " " + first_line + " " + blank + " " + second_line + " "
+ ' ' + first_line + ' ' + blank + ' ' + second_line + ' '
for first_line, second_line in first_second_lines[:2]
]
+ [
- "(" + first_line + " " + op + " " + second_line + ")"
+ '(' + first_line + ' ' + op + ' ' + second_line + ')'
for first_line, second_line in first_second_lines[2:3]
]
+ [
- " " + first_line + " " + blank + " " + second_line + " "
+ ' ' + first_line + ' ' + blank + ' ' + second_line + ' '
for first_line, second_line in first_second_lines[3:]
]
)
return EqualityExpression(self.first.fol(), self.second.fol())
+@python_2_unicode_compatible
class DrtConcatenation(DrtBooleanExpression):
"""DRS of the form '(DRS + DRS)'"""
def __str__(self):
first = self._str_subex(self.first)
second = self._str_subex(self.second)
- drs = Tokens.OPEN + first + " " + self.getOp() + " " + second + Tokens.CLOSE
+ drs = Tokens.OPEN + first + ' ' + self.getOp() + ' ' + second + Tokens.CLOSE
if self.consequent:
return (
DrtTokens.OPEN
+ drs
- + " "
+ + ' '
+ DrtTokens.IMP
- + " "
+ + ' '
+ "%s" % self.consequent
+ DrtTokens.CLOSE
)
func_args_lines = list(zip(function_lines, list(zip(*args_lines))))
return (
[
- func_line + " " + " ".join(args_line) + " "
+ func_line + ' ' + ' '.join(args_line) + ' '
for func_line, args_line in func_args_lines[:2]
]
+ [
- func_line + "(" + ",".join(args_line) + ")"
+ func_line + '(' + ','.join(args_line) + ')'
for func_line, args_line in func_args_lines[2:3]
]
+ [
- func_line + " " + " ".join(args_line) + " "
+ func_line + ' ' + ' '.join(args_line) + ' '
for func_line, args_line in func_args_lines[3:]
]
)
def _pad_vertically(lines, max_lines):
- pad_line = [" " * len(lines[0])]
+ pad_line = [' ' * len(lines[0])]
return lines + pad_line * (max_lines - len(lines))
+@python_2_unicode_compatible
class PossibleAntecedents(list, DrtExpression, Expression):
def free(self):
"""Set of free variables."""
def _pretty(self):
s = "%s" % self
- blank = " " * len(s)
+ blank = ' ' * len(s)
return [blank, blank, s]
def __str__(self):
- return "[" + ",".join("%s" % it for it in self) + "]"
+ return '[' + ','.join("%s" % it for it in self) + ']'
class AnaphoraResolutionException(Exception):
master = Tk()
master.title("DRT")
- font = Font(family="helvetica", size=12)
+ font = Font(family='helvetica', size=12)
if size_canvas:
canvas = Canvas(master, width=0, height=0)
:param y: the left side of the current drawing area
:return: the bottom-rightmost point
"""
- if isinstance(item, str):
- self.canvas.create_text(x, y, anchor="nw", font=self.canvas.font, text=item)
+ if isinstance(item, string_types):
+ self.canvas.create_text(x, y, anchor='nw', font=self.canvas.font, text=item)
elif isinstance(item, tuple):
# item is the lower-right of a box
(right, bottom) = item
:param y: the left side of the current drawing area
:return: the bottom-rightmost point
"""
- if isinstance(item, str):
+ if isinstance(item, string_types):
return (x + self.canvas.font.measure(item), y + self._get_text_height())
elif isinstance(item, tuple):
return item
# Handle Discourse Referents
if expression.refs:
- refs = " ".join("%s" % r for r in expression.refs)
+ refs = ' '.join("%s" % r for r in expression.refs)
else:
- refs = " "
+ refs = ' '
(max_right, bottom) = command(refs, left, bottom)
bottom += self.BUFFER * 2
if i + 1 < len(args):
# since it's not the last arg, add a comma
- right = command(DrtTokens.COMMA + " ", right, centred_string_top)[0]
+ right = command(DrtTokens.COMMA + ' ', right, centred_string_top)[0]
# Handle close paren
right = command(DrtTokens.CLOSE, right, centred_string_top)[0]
)
# Handle the operator
- right = command(" %s " % expression.getOp(), right, centred_string_top)[0]
+ right = command(' %s ' % expression.getOp(), right, centred_string_top)[0]
# Handle the second operand
second_height = expression.second._drawing_height
def demo():
- print("=" * 20 + "TEST PARSE" + "=" * 20)
+ print('=' * 20 + 'TEST PARSE' + '=' * 20)
dexpr = DrtExpression.fromstring
- print(dexpr(r"([x,y],[sees(x,y)])"))
- print(dexpr(r"([x],[man(x), walks(x)])"))
- print(dexpr(r"\x.\y.([],[sees(x,y)])"))
- print(dexpr(r"\x.([],[walks(x)])(john)"))
- print(dexpr(r"(([x],[walks(x)]) + ([y],[runs(y)]))"))
- print(dexpr(r"(([],[walks(x)]) -> ([],[runs(x)]))"))
- print(dexpr(r"([x],[PRO(x), sees(John,x)])"))
- print(dexpr(r"([x],[man(x), -([],[walks(x)])])"))
- print(dexpr(r"([],[(([x],[man(x)]) -> ([],[walks(x)]))])"))
-
- print("=" * 20 + "Test fol()" + "=" * 20)
- print(dexpr(r"([x,y],[sees(x,y)])").fol())
-
- print("=" * 20 + "Test alpha conversion and lambda expression equality" + "=" * 20)
- e1 = dexpr(r"\x.([],[P(x)])")
+ print(dexpr(r'([x,y],[sees(x,y)])'))
+ print(dexpr(r'([x],[man(x), walks(x)])'))
+ print(dexpr(r'\x.\y.([],[sees(x,y)])'))
+ print(dexpr(r'\x.([],[walks(x)])(john)'))
+ print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))'))
+ print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))'))
+ print(dexpr(r'([x],[PRO(x), sees(John,x)])'))
+ print(dexpr(r'([x],[man(x), -([],[walks(x)])])'))
+ print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])'))
+
+ print('=' * 20 + 'Test fol()' + '=' * 20)
+ print(dexpr(r'([x,y],[sees(x,y)])').fol())
+
+ print('=' * 20 + 'Test alpha conversion and lambda expression equality' + '=' * 20)
+ e1 = dexpr(r'\x.([],[P(x)])')
print(e1)
- e2 = e1.alpha_convert(Variable("z"))
+ e2 = e1.alpha_convert(Variable('z'))
print(e2)
print(e1 == e2)
- print("=" * 20 + "Test resolve_anaphora()" + "=" * 20)
- print(resolve_anaphora(dexpr(r"([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])")))
+ print('=' * 20 + 'Test resolve_anaphora()' + '=' * 20)
+ print(resolve_anaphora(dexpr(r'([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])')))
print(
- resolve_anaphora(dexpr(r"([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])"))
+ resolve_anaphora(dexpr(r'([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])'))
)
- print(resolve_anaphora(dexpr(r"(([x,y],[]) + ([],[PRO(x)]))")))
+ print(resolve_anaphora(dexpr(r'(([x,y],[]) + ([],[PRO(x)]))')))
- print("=" * 20 + "Test pretty_print()" + "=" * 20)
+ print('=' * 20 + 'Test pretty_print()' + '=' * 20)
dexpr(r"([],[])").pretty_print()
dexpr(
r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])"
def test_draw():
try:
- from tkinter import Tk
+ from six.moves.tkinter import Tk
except ImportError:
from nose import SkipTest
raise SkipTest("tkinter is required, but it's not available.")
expressions = [
- r"x",
- r"([],[])",
- r"([x],[])",
- r"([x],[man(x)])",
- r"([x,y],[sees(x,y)])",
- r"([x],[man(x), walks(x)])",
- r"\x.([],[man(x), walks(x)])",
- r"\x y.([],[sees(x,y)])",
- r"([],[(([],[walks(x)]) + ([],[runs(x)]))])",
- r"([x],[man(x), -([],[walks(x)])])",
- r"([],[(([x],[man(x)]) -> ([],[walks(x)]))])",
+ r'x',
+ r'([],[])',
+ r'([x],[])',
+ r'([x],[man(x)])',
+ r'([x,y],[sees(x,y)])',
+ r'([x],[man(x), walks(x)])',
+ r'\x.([],[man(x), walks(x)])',
+ r'\x y.([],[sees(x,y)])',
+ r'([],[(([],[walks(x)]) + ([],[runs(x)]))])',
+ r'([x],[man(x), -([],[walks(x)])])',
+ r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])',
]
for e in expressions:
d.draw()
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
try:
- from tkinter import (
+ from six.moves.tkinter import (
Button,
Frame,
IntVar,
Scrollbar,
Tk,
)
- from tkinter.font import Font
+ from six.moves.tkinter_font import Font
from nltk.draw.util import CanvasFrame, ShowText
except ImportError:
def __init__(self, examples):
# Set up the main window.
self._top = Tk()
- self._top.title("DRT Glue Demo")
+ self._top.title('DRT Glue Demo')
# Set up key bindings.
self._init_bindings()
self._init_canvas(self._top)
# Resize callback
- self._canvas.bind("<Configure>", self._configure)
+ self._canvas.bind('<Configure>', self._configure)
#########################################
## Initialization Helpers
def _init_glue(self):
tagger = RegexpTagger(
[
- ("^(David|Mary|John)$", "NNP"),
+ ('^(David|Mary|John)$', 'NNP'),
(
- "^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
- "VB",
+                    '^(walks|sees|eats|chases|believes|gives|sleeps|persuades|tries|seems|leaves)$',
+ 'VB',
),
- ("^(go|order|vanish|find|approach)$", "VB"),
- ("^(a)$", "ex_quant"),
- ("^(every)$", "univ_quant"),
- ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
- ("^(big|gray|former)$", "JJ"),
- ("^(him|himself)$", "PRP"),
+ ('^(go|order|vanish|find|approach)$', 'VB'),
+ ('^(a)$', 'ex_quant'),
+ ('^(every)$', 'univ_quant'),
+ ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'),
+ ('^(big|gray|former)$', 'JJ'),
+ ('^(him|himself)$', 'PRP'),
]
)
        # What's our font size (default=same as sysfont)
self._size = IntVar(root)
- self._size.set(self._sysfont.cget("size"))
+ self._size.set(self._sysfont.cget('size'))
- self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
- self._font = Font(family="helvetica", size=self._size.get())
+ self._boldfont = Font(family='helvetica', weight='bold', size=self._size.get())
+ self._font = Font(family='helvetica', size=self._size.get())
if self._size.get() < 0:
big = self._size.get() - 2
else:
big = self._size.get() + 2
- self._bigfont = Font(family="helvetica", weight="bold", size=big)
+ self._bigfont = Font(family='helvetica', weight='bold', size=big)
def _init_exampleListbox(self, parent):
self._exampleFrame = listframe = Frame(parent)
- self._exampleFrame.pack(fill="both", side="left", padx=2)
+ self._exampleFrame.pack(fill='both', side='left', padx=2)
self._exampleList_label = Label(
- self._exampleFrame, font=self._boldfont, text="Examples"
+ self._exampleFrame, font=self._boldfont, text='Examples'
)
self._exampleList_label.pack()
self._exampleList = Listbox(
self._exampleFrame,
- selectmode="single",
- relief="groove",
- background="white",
- foreground="#909090",
+ selectmode='single',
+ relief='groove',
+ background='white',
+ foreground='#909090',
font=self._font,
- selectforeground="#004040",
- selectbackground="#c0f0c0",
+ selectforeground='#004040',
+ selectbackground='#c0f0c0',
)
- self._exampleList.pack(side="right", fill="both", expand=1)
+ self._exampleList.pack(side='right', fill='both', expand=1)
for example in self._examples:
- self._exampleList.insert("end", (" %s" % example))
+ self._exampleList.insert('end', (' %s' % example))
self._exampleList.config(height=min(len(self._examples), 25), width=40)
# Add a scrollbar if there are more than 25 examples.
if len(self._examples) > 25:
- listscroll = Scrollbar(self._exampleFrame, orient="vertical")
+ listscroll = Scrollbar(self._exampleFrame, orient='vertical')
self._exampleList.config(yscrollcommand=listscroll.set)
listscroll.config(command=self._exampleList.yview)
- listscroll.pack(side="left", fill="y")
+ listscroll.pack(side='left', fill='y')
# If they select a example, apply it.
- self._exampleList.bind("<<ListboxSelect>>", self._exampleList_select)
+ self._exampleList.bind('<<ListboxSelect>>', self._exampleList_select)
def _init_readingListbox(self, parent):
self._readingFrame = listframe = Frame(parent)
- self._readingFrame.pack(fill="both", side="left", padx=2)
+ self._readingFrame.pack(fill='both', side='left', padx=2)
self._readingList_label = Label(
- self._readingFrame, font=self._boldfont, text="Readings"
+ self._readingFrame, font=self._boldfont, text='Readings'
)
self._readingList_label.pack()
self._readingList = Listbox(
self._readingFrame,
- selectmode="single",
- relief="groove",
- background="white",
- foreground="#909090",
+ selectmode='single',
+ relief='groove',
+ background='white',
+ foreground='#909090',
font=self._font,
- selectforeground="#004040",
- selectbackground="#c0f0c0",
+ selectforeground='#004040',
+ selectbackground='#c0f0c0',
)
- self._readingList.pack(side="right", fill="both", expand=1)
+ self._readingList.pack(side='right', fill='both', expand=1)
# Add a scrollbar if there are more than 25 examples.
- listscroll = Scrollbar(self._readingFrame, orient="vertical")
+ listscroll = Scrollbar(self._readingFrame, orient='vertical')
self._readingList.config(yscrollcommand=listscroll.set)
listscroll.config(command=self._readingList.yview)
- listscroll.pack(side="right", fill="y")
+ listscroll.pack(side='right', fill='y')
self._populate_readingListbox()
def _populate_readingListbox(self):
# Populate the listbox with integers
- self._readingList.delete(0, "end")
+ self._readingList.delete(0, 'end')
for i in range(len(self._readings)):
- self._readingList.insert("end", (" %s" % (i + 1)))
+ self._readingList.insert('end', (' %s' % (i + 1)))
self._readingList.config(height=min(len(self._readings), 25), width=5)
# If they select a example, apply it.
- self._readingList.bind("<<ListboxSelect>>", self._readingList_select)
+ self._readingList.bind('<<ListboxSelect>>', self._readingList_select)
def _init_bindings(self):
# Key bindings are a good thing.
- self._top.bind("<Control-q>", self.destroy)
- self._top.bind("<Control-x>", self.destroy)
- self._top.bind("<Escape>", self.destroy)
- self._top.bind("n", self.next)
- self._top.bind("<space>", self.next)
- self._top.bind("p", self.prev)
- self._top.bind("<BackSpace>", self.prev)
+ self._top.bind('<Control-q>', self.destroy)
+ self._top.bind('<Control-x>', self.destroy)
+ self._top.bind('<Escape>', self.destroy)
+ self._top.bind('n', self.next)
+ self._top.bind('<space>', self.next)
+ self._top.bind('p', self.prev)
+ self._top.bind('<BackSpace>', self.prev)
def _init_buttons(self, parent):
# Set up the frames.
self._buttonframe = buttonframe = Frame(parent)
- buttonframe.pack(fill="none", side="bottom", padx=3, pady=2)
+ buttonframe.pack(fill='none', side='bottom', padx=3, pady=2)
Button(
buttonframe,
- text="Prev",
- background="#90c0d0",
- foreground="black",
+ text='Prev',
+ background='#90c0d0',
+ foreground='black',
command=self.prev,
- ).pack(side="left")
+ ).pack(side='left')
Button(
buttonframe,
- text="Next",
- background="#90c0d0",
- foreground="black",
+ text='Next',
+ background='#90c0d0',
+ foreground='black',
command=self.next,
- ).pack(side="left")
+ ).pack(side='left')
def _configure(self, event):
self._autostep = 0
(x1, y1, x2, y2) = self._cframe.scrollregion()
y2 = event.height - 6
- self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2)
+ self._canvas['scrollregion'] = '%d %d %d %d' % (x1, y1, x2, y2)
self._redraw()
def _init_canvas(self, parent):
self._cframe = CanvasFrame(
parent,
- background="white",
+ background='white',
# width=525, height=250,
closeenough=10,
border=2,
- relief="sunken",
+ relief='sunken',
)
- self._cframe.pack(expand=1, fill="both", side="top", pady=2)
+ self._cframe.pack(expand=1, fill='both', side='top', pady=2)
canvas = self._canvas = self._cframe.canvas()
# Initially, there's no tree or text
filemenu = Menu(menubar, tearoff=0)
filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="q"
+ label='Exit', underline=1, command=self.destroy, accelerator='q'
)
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
actionmenu = Menu(menubar, tearoff=0)
actionmenu.add_command(
- label="Next", underline=0, command=self.next, accelerator="n, Space"
+ label='Next', underline=0, command=self.next, accelerator='n, Space'
)
actionmenu.add_command(
- label="Previous", underline=0, command=self.prev, accelerator="p, Backspace"
+ label='Previous', underline=0, command=self.prev, accelerator='p, Backspace'
)
- menubar.add_cascade(label="Action", underline=0, menu=actionmenu)
+ menubar.add_cascade(label='Action', underline=0, menu=actionmenu)
optionmenu = Menu(menubar, tearoff=0)
optionmenu.add_checkbutton(
- label="Remove Duplicates",
+ label='Remove Duplicates',
underline=0,
variable=self._glue.remove_duplicates,
command=self._toggle_remove_duplicates,
- accelerator="r",
+ accelerator='r',
)
- menubar.add_cascade(label="Options", underline=0, menu=optionmenu)
+ menubar.add_cascade(label='Options', underline=0, menu=optionmenu)
viewmenu = Menu(menubar, tearoff=0)
viewmenu.add_radiobutton(
- label="Tiny",
+ label='Tiny',
variable=self._size,
underline=0,
value=10,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Small",
+ label='Small',
variable=self._size,
underline=0,
value=12,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Medium",
+ label='Medium',
variable=self._size,
underline=0,
value=14,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Large",
+ label='Large',
variable=self._size,
underline=0,
value=18,
command=self.resize,
)
viewmenu.add_radiobutton(
- label="Huge",
+ label='Huge',
variable=self._size,
underline=0,
value=24,
command=self.resize,
)
- menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+ menubar.add_cascade(label='View', underline=0, menu=viewmenu)
helpmenu = Menu(menubar, tearoff=0)
- helpmenu.add_command(label="About", underline=0, command=self.about)
- menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+ helpmenu.add_command(label='About', underline=0, command=self.about)
+ menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
parent.config(menu=menubar)
"NLTK Discourse Representation Theory (DRT) Glue Semantics Demo\n"
+ "Written by Daniel H. Garrette"
)
- TITLE = "About: NLTK DRT Glue Demo"
+ TITLE = 'About: NLTK DRT Glue Demo'
try:
- from tkinter.messagebox import Message
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except:
def _toggle_remove_duplicates(self):
self._glue.remove_duplicates = not self._glue.remove_duplicates
- self._exampleList.selection_clear(0, "end")
+ self._exampleList.selection_clear(0, 'end')
self._readings = []
self._populate_readingListbox()
self._readingCache = [None for ex in self._examples]
self._curExample = index
example = self._examples[index]
- self._exampleList.selection_clear(0, "end")
+ self._exampleList.selection_clear(0, 'end')
if example:
cache = self._readingCache[index]
if cache:
self._readingCache[index] = self._readings
except Exception as e:
self._readings = []
- self._error = DrtVariableExpression(Variable("Error: " + str(e)))
+ self._error = DrtVariableExpression(Variable('Error: ' + str(e)))
self._readingCache[index] = self._error
# add a star to the end of the example
self._exampleList.delete(index)
- self._exampleList.insert(index, (" %s *" % example))
+ self._exampleList.insert(index, (' %s *' % example))
self._exampleList.config(
height=min(len(self._examples), 25), width=40
)
def _readingList_store_selection(self, index):
reading = self._readings[index]
- self._readingList.selection_clear(0, "end")
+ self._readingList.selection_clear(0, 'end')
if reading:
self._readingList.selection_set(index)
self._drs = drs
self._canvas = canvas
canvas.font = Font(
- font=canvas.itemcget(canvas.create_text(0, 0, text=""), "font")
+ font=canvas.itemcget(canvas.create_text(0, 0, text=''), 'font')
)
canvas._BUFFER = 3
self.bbox = (0, 0, 0, 0)
def demo():
examples = [
- "John walks",
- "David sees Mary",
- "David eats a sandwich",
- "every man chases a dog",
+ 'John walks',
+ 'David sees Mary',
+ 'David eats a sandwich',
+ 'every man chases a dog',
# 'every man believes a dog yawns',
# 'John gives David a sandwich',
- "John chases himself",
+ 'John chases himself',
# 'John persuades David to order a pizza',
# 'John tries to go',
# 'John tries to find a unicorn',
DrtGlueDemo(examples).mainloop()
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Models for first-order languages with lambda
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>,
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
This module provides data structures for representing first-order
models.
"""
+from __future__ import print_function, unicode_literals
from pprint import pformat
import inspect
import re
import sys
+from six import string_types
+
from nltk.decorators import decorator # this used in code that is commented out
+from nltk.compat import python_2_unicode_compatible
from nltk.sem.logic import (
AbstractVariableExpression,
def trace(f, *args, **kw):
- argspec = inspect.getfullargspec(f)
+ if sys.version_info[0] >= 3:
+ argspec = inspect.getfullargspec(f)
+ else:
+ argspec = inspect.getargspec(f)
d = dict(zip(argspec[0], args))
- if d.pop("trace", None):
+ if d.pop('trace', None):
print()
for item in d.items():
print("%s => %s" % item)
"""
new = set()
for elem in s:
- if isinstance(elem, str):
+ if isinstance(elem, string_types):
new.add((elem,))
elif isinstance(elem, int):
            new.add((str(elem),))  # wrap the int in a 1-tuple, as in the string case
return len(list(rel)[0])
+@python_2_unicode_compatible
class Valuation(dict):
"""
A dictionary which represents a model-theoretic Valuation of non-logical constants.
"""
super(Valuation, self).__init__()
for (sym, val) in xs:
- if isinstance(val, str) or isinstance(val, bool):
+ if isinstance(val, string_types) or isinstance(val, bool):
self[sym] = val
elif isinstance(val, set):
self[sym] = set2rel(val)
"""Set-theoretic domain of the value-space of a Valuation."""
dom = []
for val in self.values():
- if isinstance(val, str):
+ if isinstance(val, string_types):
dom.append(val)
elif not isinstance(val, bool):
dom.extend(
##########################################
# REs used by the _read_valuation function
##########################################
-_VAL_SPLIT_RE = re.compile(r"\s*=+>\s*")
-_ELEMENT_SPLIT_RE = re.compile(r"\s*,\s*")
+_VAL_SPLIT_RE = re.compile(r'\s*=+>\s*')
+_ELEMENT_SPLIT_RE = re.compile(r'\s*,\s*')
_TUPLES_RE = re.compile(
r"""\s*
(\([^)]+\)) # tuple-expression
symbol = pieces[0]
value = pieces[1]
# check whether the value is meant to be a set
- if value.startswith("{"):
+ if value.startswith('{'):
value = value[1:-1]
tuple_strings = _TUPLES_RE.findall(value)
# are the set elements tuples?
statements = []
for linenum, line in enumerate(s.splitlines()):
line = line.strip()
- if line.startswith("#") or line == "":
+ if line.startswith('#') or line == '':
continue
try:
statements.append(_read_valuation_line(line))
except ValueError:
- raise ValueError("Unable to parse line %s: %s" % (linenum, line))
+ raise ValueError('Unable to parse line %s: %s' % (linenum, line))
return Valuation(statements)
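# Illustrative example (editorial addition): the reader above expects one
# "symbol => value" statement per line, where a value is an individual, a set
# of individuals, or a set of tuples; blank lines and lines starting with '#'
# are skipped.  For instance:
#
#     adam => b1
#     girl => {g1, g2}
#     love => {(b1, g1), (b2, g2)}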
+@python_2_unicode_compatible
class Assignment(dict):
"""
A dictionary which represents an assignment of values to variables.
return self
+@python_2_unicode_compatible
class Model(object):
"""
A first order model is a domain *D* of discourse and a valuation *V*.
if trace:
print()
print("'%s' is undefined under M, %s" % (expr, g))
- return "Undefined"
+ return 'Undefined'
def satisfy(self, parsed, g, trace=None):
"""
:return: a set of the entities that satisfy ``parsed``.
"""
- spacer = " "
+ spacer = ' '
indent = spacer + (spacer * nesting)
candidates = []
- if isinstance(varex, str):
+ if isinstance(varex, string_types):
var = Variable(varex)
else:
var = varex
"""Example of a propositional model."""
global val1, dom1, m1, g1
- val1 = Valuation([("P", True), ("Q", True), ("R", False)])
+ val1 = Valuation([('P', True), ('Q', True), ('R', False)])
dom1 = set([])
m1 = Model(dom1, val1)
g1 = Assignment(dom1)
print()
- print("*" * mult)
+ print('*' * mult)
print("Propositional Formulas Demo")
- print("*" * mult)
- print("(Propositional constants treated as nullary predicates)")
+ print('*' * mult)
+ print('(Propositional constants treated as nullary predicates)')
print()
print("Model m1:\n", m1)
- print("*" * mult)
+ print('*' * mult)
sentences = [
- "(P & Q)",
- "(P & R)",
- "- P",
- "- R",
- "- - P",
- "- (P & R)",
- "(P | R)",
- "(R | P)",
- "(R | R)",
- "(- P | R)",
- "(P | - P)",
- "(P -> Q)",
- "(P -> R)",
- "(R -> P)",
- "(P <-> P)",
- "(R <-> R)",
- "(P <-> R)",
+ '(P & Q)',
+ '(P & R)',
+ '- P',
+ '- R',
+ '- - P',
+ '- (P & R)',
+ '(P | R)',
+ '(R | P)',
+ '(R | R)',
+ '(- P | R)',
+ '(P | - P)',
+ '(P -> Q)',
+ '(P -> R)',
+ '(R -> P)',
+ '(P <-> P)',
+ '(R <-> R)',
+ '(P <-> R)',
]
for sent in sentences:
global val2, v2, dom2, m2, g2
v2 = [
- ("adam", "b1"),
- ("betty", "g1"),
- ("fido", "d1"),
- ("girl", set(["g1", "g2"])),
- ("boy", set(["b1", "b2"])),
- ("dog", set(["d1"])),
- ("love", set([("b1", "g1"), ("b2", "g2"), ("g1", "b1"), ("g2", "b1")])),
+ ('adam', 'b1'),
+ ('betty', 'g1'),
+ ('fido', 'd1'),
+ ('girl', set(['g1', 'g2'])),
+ ('boy', set(['b1', 'b2'])),
+ ('dog', set(['d1'])),
+ ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')])),
]
val2 = Valuation(v2)
dom2 = val2.domain
m2 = Model(dom2, val2)
- g2 = Assignment(dom2, [("x", "b1"), ("y", "g2")])
+ g2 = Assignment(dom2, [('x', 'b1'), ('y', 'g2')])
if not quiet:
print()
- print("*" * mult)
+ print('*' * mult)
print("Models Demo")
print("*" * mult)
print("Model m2:\n", "-" * 14, "\n", m2)
print("Variable assignment = ", g2)
- exprs = ["adam", "boy", "love", "walks", "x", "y", "z"]
+ exprs = ['adam', 'boy', 'love', 'walks', 'x', 'y', 'z']
parsed_exprs = [Expression.fromstring(e) for e in exprs]
print()
print("The interpretation of '%s' in m2 is Undefined" % parsed)
applications = [
- ("boy", ("adam")),
- ("walks", ("adam",)),
- ("love", ("adam", "y")),
- ("love", ("y", "adam")),
+        ('boy', ('adam',)),
+ ('walks', ('adam',)),
+ ('love', ('adam', 'y')),
+ ('love', ('y', 'adam')),
]
for (fun, args) in applications:
folmodel(quiet=True)
print()
- print("*" * mult)
+ print('*' * mult)
print("FOL Formulas Demo")
- print("*" * mult)
+ print('*' * mult)
formulas = [
- "love (adam, betty)",
- "(adam = mia)",
- "\\x. (boy(x) | girl(x))",
- "\\x. boy(x)(adam)",
- "\\x y. love(x, y)",
- "\\x y. love(x, y)(adam)(betty)",
- "\\x y. love(x, y)(adam, betty)",
- "\\x y. (boy(x) & love(x, y))",
- "\\x. exists y. (boy(x) & love(x, y))",
- "exists z1. boy(z1)",
- "exists x. (boy(x) & -(x = adam))",
- "exists x. (boy(x) & all y. love(y, x))",
- "all x. (boy(x) | girl(x))",
- "all x. (girl(x) -> exists y. boy(y) & love(x, y))", # Every girl loves exists boy.
- "exists x. (boy(x) & all y. (girl(y) -> love(y, x)))", # There is exists boy that every girl loves.
- "exists x. (boy(x) & all y. (girl(y) -> love(x, y)))", # exists boy loves every girl.
- "all x. (dog(x) -> - girl(x))",
- "exists x. exists y. (love(x, y) & love(x, y))",
+ 'love (adam, betty)',
+ '(adam = mia)',
+ '\\x. (boy(x) | girl(x))',
+ '\\x. boy(x)(adam)',
+ '\\x y. love(x, y)',
+ '\\x y. love(x, y)(adam)(betty)',
+ '\\x y. love(x, y)(adam, betty)',
+ '\\x y. (boy(x) & love(x, y))',
+ '\\x. exists y. (boy(x) & love(x, y))',
+ 'exists z1. boy(z1)',
+ 'exists x. (boy(x) & -(x = adam))',
+ 'exists x. (boy(x) & all y. love(y, x))',
+ 'all x. (boy(x) | girl(x))',
+        'all x. (girl(x) -> exists y. boy(y) & love(x, y))',  # Every girl loves some boy.
+        'exists x. (boy(x) & all y. (girl(y) -> love(y, x)))',  # There is some boy that every girl loves.
+        'exists x. (boy(x) & all y. (girl(y) -> love(x, y)))',  # Some boy loves every girl.
+ 'all x. (dog(x) -> - girl(x))',
+ 'exists x. exists y. (love(x, y) & love(x, y))',
]
for fmla in formulas:
"""Satisfiers of an open formula in a first order model."""
print()
- print("*" * mult)
+ print('*' * mult)
print("Satisfiers Demo")
- print("*" * mult)
+ print('*' * mult)
folmodel(quiet=True)
formulas = [
- "boy(x)",
- "(x = x)",
- "(boy(x) | girl(x))",
- "(boy(x) & girl(x))",
- "love(adam, x)",
- "love(x, adam)",
- "-(x = adam)",
- "exists z22. love(x, z22)",
- "exists y. love(y, x)",
- "all y. (girl(y) -> love(x, y))",
- "all y. (girl(y) -> love(y, x))",
- "all y. (girl(y) -> (boy(x) & love(y, x)))",
- "(boy(x) & all y. (girl(y) -> love(x, y)))",
- "(boy(x) & all y. (girl(y) -> love(y, x)))",
- "(boy(x) & exists y. (girl(y) & love(y, x)))",
- "(girl(x) -> dog(x))",
- "all y. (dog(y) -> (x = y))",
- "exists y. love(y, x)",
- "exists y. (love(adam, y) & love(y, x))",
+ 'boy(x)',
+ '(x = x)',
+ '(boy(x) | girl(x))',
+ '(boy(x) & girl(x))',
+ 'love(adam, x)',
+ 'love(x, adam)',
+ '-(x = adam)',
+ 'exists z22. love(x, z22)',
+ 'exists y. love(y, x)',
+ 'all y. (girl(y) -> love(x, y))',
+ 'all y. (girl(y) -> love(y, x))',
+ 'all y. (girl(y) -> (boy(x) & love(y, x)))',
+ '(boy(x) & all y. (girl(y) -> love(x, y)))',
+ '(boy(x) & all y. (girl(y) -> love(y, x)))',
+ '(boy(x) & exists y. (girl(y) & love(y, x)))',
+ '(girl(x) -> dog(x))',
+ 'all y. (dog(y) -> (x = y))',
+ 'exists y. love(y, x)',
+ 'exists y. (love(adam, y) & love(y, x))',
]
if trace:
for p in parsed:
g2.purge()
- print("The satisfiers of '%s' are: %s" % (p, m2.satisfiers(p, "x", g2, trace)))
+ print("The satisfiers of '%s' are: %s" % (p, m2.satisfiers(p, 'x', g2, trace)))
def demo(num=0, trace=None):
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, division, unicode_literals
import os
from itertools import chain
+from six import string_types
+
import nltk
from nltk.internals import Counter
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, RegexpTagger
LambdaExpression,
AbstractVariableExpression,
)
+from nltk.compat import python_2_unicode_compatible
from nltk.sem import drt
from nltk.sem import linearlogic
SPEC_SEMTYPES = {
- "a": "ex_quant",
- "an": "ex_quant",
- "every": "univ_quant",
- "the": "def_art",
- "no": "no_quant",
- "default": "ex_quant",
+ 'a': 'ex_quant',
+ 'an': 'ex_quant',
+ 'every': 'univ_quant',
+ 'the': 'def_art',
+ 'no': 'no_quant',
+ 'default': 'ex_quant',
}
-OPTIONAL_RELATIONSHIPS = ["nmod", "vmod", "punct"]
+OPTIONAL_RELATIONSHIPS = ['nmod', 'vmod', 'punct']
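# Editorial note: dependencies bearing these relations are ignored when a
# node's relationship set is matched against a GlueDict entry (see
# _lookup_semtype_option below).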
+@python_2_unicode_compatible
class GlueFormula(object):
def __init__(self, meaning, glue, indices=None):
if not indices:
indices = set()
- if isinstance(meaning, str):
+ if isinstance(meaning, string_types):
self.meaning = Expression.fromstring(meaning)
elif isinstance(meaning, Expression):
self.meaning = meaning
else:
raise RuntimeError(
- "Meaning term neither string or expression: %s, %s"
+                'Meaning term neither string nor expression: %s, %s'
% (meaning, meaning.__class__)
)
- if isinstance(glue, str):
+ if isinstance(glue, string_types):
self.glue = linearlogic.LinearLogicParser().parse(glue)
elif isinstance(glue, linearlogic.Expression):
self.glue = glue
else:
raise RuntimeError(
- "Glue term neither string or expression: %s, %s"
+                'Glue term neither string nor expression: %s, %s'
% (glue, glue.__class__)
)
::-1
]: # if self.glue is (A -o B), dep is in A.dependencies
arg_meaning_abstracted = self.make_LambdaExpression(
- Variable("v%s" % dep), arg_meaning_abstracted
+ Variable('v%s' % dep), arg_meaning_abstracted
)
return_meaning = self.meaning.applyto(arg_meaning_abstracted)
def __str__(self):
assert isinstance(self.indices, set)
- accum = "%s : %s" % (self.meaning, self.glue)
+ accum = '%s : %s' % (self.meaning, self.glue)
if self.indices:
- accum += " : {" + ", ".join(str(index) for index in self.indices) + "}"
+ accum += ' : {' + ', '.join(str(index) for index in self.indices) + '}'
return accum
def __repr__(self):
return "%s" % self
+@python_2_unicode_compatible
class GlueDict(dict):
def __init__(self, filename, encoding=None):
self.filename = filename
try:
contents = nltk.data.load(
- self.filename, format="text", encoding=self.file_encoding
+ self.filename, format='text', encoding=self.file_encoding
)
# TODO: the above can't handle zip files, but this should anyway be fixed in nltk.data.load()
except LookupError as e:
try:
contents = nltk.data.load(
- "file:" + self.filename, format="text", encoding=self.file_encoding
+ 'file:' + self.filename, format='text', encoding=self.file_encoding
)
except LookupError:
raise e
line = line.strip() # remove trailing newline
if not len(line):
continue # skip empty lines
- if line[0] == "#":
+ if line[0] == '#':
continue # skip commented out lines
parts = line.split(
- " : ", 2
+ ' : ', 2
) # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']
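                # Editorial example: a (non-comment) lexicon line therefore has the shape
                #
                #     verb : (\x.(<word> x), ( subj -o f )) : [subj]
                #
                # i.e. a sem type, one or more (meaning, glue) tuples, and an optional
                # [relationship] list; '<word>' is later replaced by the actual token.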
glue_formulas = []
if len(parts) > 1:
for (i, c) in enumerate(parts[1]):
- if c == "(":
+ if c == '(':
if paren_count == 0: # if it's the first '(' of a tuple
tuple_start = i + 1 # then save the index
paren_count += 1
- elif c == ")":
+ elif c == ')':
paren_count -= 1
if paren_count == 0: # if it's the last ')' of a tuple
meaning_term = parts[1][
glue_formulas.append(
[meaning_term, glue_term]
) # add the GlueFormula to the list
- elif c == ",":
+ elif c == ',':
if (
paren_count == 1
): # if it's a comma separating the parts of the tuple
tuple_comma = i # then save the index
- elif c == "#": # skip comments at the ends of lines
+ elif c == '#': # skip comments at the ends of lines
if (
paren_count != 0
): # if the line hasn't parsed correctly so far
raise RuntimeError(
- "Formula syntax is incorrect for entry " + line
+ 'Formula syntax is incorrect for entry ' + line
)
break # break to the next line
if len(parts) > 2: # if there is a relationship entry at the end
- rel_start = parts[2].index("[") + 1
- rel_end = parts[2].index("]")
+ rel_start = parts[2].index('[') + 1
+ rel_end = parts[2].index(']')
if rel_start == rel_end:
relationships = frozenset()
else:
relationships = frozenset(
- r.strip() for r in parts[2][rel_start:rel_end].split(",")
+ r.strip() for r in parts[2][rel_start:rel_end].split(',')
)
try:
- start_inheritance = parts[0].index("(")
- end_inheritance = parts[0].index(")")
+ start_inheritance = parts[0].index('(')
+ end_inheritance = parts[0].index(')')
sem = parts[0][:start_inheritance].strip()
supertype = parts[0][start_inheritance + 1 : end_inheritance]
except:
) # add the glue entry to the dictionary
def __str__(self):
- accum = ""
+ accum = ''
for pos in self:
str_pos = "%s" % pos
for relset in self[pos]:
i = 1
for gf in self[pos][relset]:
if i == 1:
- accum += str_pos + ": "
+ accum += str_pos + ': '
else:
- accum += " " * (len(str_pos) + 2)
+ accum += ' ' * (len(str_pos) + 2)
accum += "%s" % gf
if relset and i == len(self[pos][relset]):
- accum += " : %s" % relset
- accum += "\n"
+ accum += ' : %s' % relset
+ accum += '\n'
i += 1
return accum
if node is None:
# TODO: should it be depgraph.root? Is this code tested?
top = depgraph.nodes[0]
- depList = list(chain(*top["deps"].values()))
+ depList = list(chain(*top['deps'].values()))
root = depgraph.nodes[depList[0]]
return self.to_glueformula_list(depgraph, root, Counter(), verbose)
glueformulas = self.lookup(node, depgraph, counter)
- for dep_idx in chain(*node["deps"].values()):
+ for dep_idx in chain(*node['deps'].values()):
dep = depgraph.nodes[dep_idx]
glueformulas.extend(
self.to_glueformula_list(depgraph, dep, counter, verbose)
if not len(lookup):
raise KeyError(
"There is no GlueDict entry for sem type of '%s' "
- "with tag '%s', and rel '%s'" % (node["word"], node["tag"], node["rel"])
+ "with tag '%s', and rel '%s'" % (node['word'], node['tag'], node['rel'])
)
return self.get_glueformulas_from_semtype_entry(
- lookup, node["word"], node, depgraph, counter
+ lookup, node['word'], node, depgraph, counter
)
def add_missing_dependencies(self, node, depgraph):
- rel = node["rel"].lower()
-
- if rel == "main":
- headnode = depgraph.nodes[node["head"]]
- subj = self.lookup_unique("subj", headnode, depgraph)
- relation = subj["rel"]
- node["deps"].setdefault(relation, [])
- node["deps"][relation].append(subj["address"])
+ rel = node['rel'].lower()
+
+ if rel == 'main':
+ headnode = depgraph.nodes[node['head']]
+ subj = self.lookup_unique('subj', headnode, depgraph)
+ relation = subj['rel']
+ node['deps'].setdefault(relation, [])
+ node['deps'][relation].append(subj['address'])
# node['deps'].append(subj['address'])
def _lookup_semtype_option(self, semtype, node, depgraph):
relationships = frozenset(
- depgraph.nodes[dep]["rel"].lower()
- for dep in chain(*node["deps"].values())
- if depgraph.nodes[dep]["rel"].lower() not in OPTIONAL_RELATIONSHIPS
+ depgraph.nodes[dep]['rel'].lower()
+ for dep in chain(*node['deps'].values())
+ if depgraph.nodes[dep]['rel'].lower() not in OPTIONAL_RELATIONSHIPS
)
try:
Based on the node, return a list of plausible semtypes in order of
plausibility.
"""
- rel = node["rel"].lower()
- word = node["word"].lower()
+ rel = node['rel'].lower()
+ word = node['word'].lower()
- if rel == "spec":
+ if rel == 'spec':
if word in SPEC_SEMTYPES:
return [SPEC_SEMTYPES[word]]
else:
- return [SPEC_SEMTYPES["default"]]
- elif rel in ["nmod", "vmod"]:
- return [node["tag"], rel]
+ return [SPEC_SEMTYPES['default']]
+ elif rel in ['nmod', 'vmod']:
+ return [node['tag'], rel]
else:
- return [node["tag"]]
+ return [node['tag']]
def get_glueformulas_from_semtype_entry(
self, lookup, word, node, depgraph, counter
if not len(glueformulas):
gf.word = word
else:
- gf.word = "%s%s" % (word, len(glueformulas) + 1)
+ gf.word = '%s%s' % (word, len(glueformulas) + 1)
gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get())
parameter "<word>"
:param word: The actual word that will replace "<word>"
"""
- word = word.replace(".", "")
- return generic.replace("<word>", word)
+ word = word.replace('.', '')
+ return generic.replace('<word>', word)
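# Illustration (a sketch, not part of the library): given the generic meaning template
# '\x.(<word> x)' from a glue dictionary entry and the word 'walks', the two lines above
# yield '\x.(walks x)'; periods are stripped first, so a token like 'Mr.' becomes 'Mr'.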
def initialize_labels(self, expr, node, depgraph, unique_index):
if isinstance(expr, linearlogic.AtomicExpression):
def find_label_name(self, name, node, depgraph, unique_index):
try:
- dot = name.index(".")
+ dot = name.index('.')
before_dot = name[:dot]
after_dot = name[dot + 1 :]
- if before_dot == "super":
+ if before_dot == 'super':
return self.find_label_name(
- after_dot, depgraph.nodes[node["head"]], depgraph, unique_index
+ after_dot, depgraph.nodes[node['head']], depgraph, unique_index
)
else:
return self.find_label_name(
)
except ValueError:
lbl = self.get_label(node)
- if name == "f":
+ if name == 'f':
return lbl
- elif name == "v":
- return "%sv" % lbl
- elif name == "r":
- return "%sr" % lbl
- elif name == "super":
- return self.get_label(depgraph.nodes[node["head"]])
- elif name == "var":
- return "%s%s" % (lbl.upper(), unique_index)
- elif name == "a":
- return self.get_label(self.lookup_unique("conja", node, depgraph))
- elif name == "b":
- return self.get_label(self.lookup_unique("conjb", node, depgraph))
+ elif name == 'v':
+ return '%sv' % lbl
+ elif name == 'r':
+ return '%sr' % lbl
+ elif name == 'super':
+ return self.get_label(depgraph.nodes[node['head']])
+ elif name == 'var':
+ return '%s%s' % (lbl.upper(), unique_index)
+ elif name == 'a':
+ return self.get_label(self.lookup_unique('conja', node, depgraph))
+ elif name == 'b':
+ return self.get_label(self.lookup_unique('conjb', node, depgraph))
else:
return self.get_label(self.lookup_unique(name, node, depgraph))
:param value: where to index into the list of characters
:type value: int
"""
- value = node["address"]
+ value = node['address']
letter = [
- "f",
- "g",
- "h",
- "i",
- "j",
- "k",
- "l",
- "m",
- "n",
- "o",
- "p",
- "q",
- "r",
- "s",
- "t",
- "u",
- "v",
- "w",
- "x",
- "y",
- "z",
- "a",
- "b",
- "c",
- "d",
- "e",
+ 'f',
+ 'g',
+ 'h',
+ 'i',
+ 'j',
+ 'k',
+ 'l',
+ 'm',
+ 'n',
+ 'o',
+ 'p',
+ 'q',
+ 'r',
+ 's',
+ 't',
+ 'u',
+ 'v',
+ 'w',
+ 'x',
+ 'y',
+ 'z',
+ 'a',
+ 'b',
+ 'c',
+ 'd',
+ 'e',
][value - 1]
num = int(value) // 26
if num > 0:
"""
deps = [
depgraph.nodes[dep]
- for dep in chain(*node["deps"].values())
- if depgraph.nodes[dep]["rel"].lower() == rel.lower()
+ for dep in chain(*node['deps'].values())
+ if depgraph.nodes[dep]['rel'].lower() == rel.lower()
]
if len(deps) == 0:
- raise KeyError("'%s' doesn't contain a feature '%s'" % (node["word"], rel))
+ raise KeyError("'%s' doesn't contain a feature '%s'" % (node['word'], rel))
elif len(deps) > 1:
raise KeyError(
- "'%s' should only have one feature '%s'" % (node["word"], rel)
+ "'%s' should only have one feature '%s'" % (node['word'], rel)
)
else:
return deps[0]
self.semtype_file = semtype_file
else:
self.semtype_file = os.path.join(
- "grammars", "sample_grammars", "glue.semtype"
+ 'grammars', 'sample_grammars', 'glue.semtype'
)
def train_depparser(self, depgraphs=None):
else:
self.depparser.train_from_file(
nltk.data.find(
- os.path.join("grammars", "sample_grammars", "glue_train.conll")
+ os.path.join('grammars', 'sample_grammars', 'glue_train.conll')
)
)
# if there is an exception, the syntax of the formula
# may not be understandable by the prover, so don't
# throw out the reading.
- print("Error when checking logical equality of statements", e)
-
+ print('Error when checking logical equality of statements', e)
+
if add_reading:
reading_list.append(glueformula.meaning)
return_list.extend(gf.compile(index_counter))
if self.verbose:
- print("Compiled Glue Premises:")
+ print('Compiled Glue Premises:')
for cgf in return_list:
print(cgf)
regexp_tagger = RegexpTagger(
[
- (r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers
- (r"(The|the|A|a|An|an)$", "AT"), # articles
- (r".*able$", "JJ"), # adjectives
- (r".*ness$", "NN"), # nouns formed from adjectives
- (r".*ly$", "RB"), # adverbs
- (r".*s$", "NNS"), # plural nouns
- (r".*ing$", "VBG"), # gerunds
- (r".*ed$", "VBD"), # past tense verbs
- (r".*", "NN"), # nouns (default)
+ (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
+ (r'(The|the|A|a|An|an)$', 'AT'), # articles
+ (r'.*able$', 'JJ'), # adjectives
+ (r'.*ness$', 'NN'), # nouns formed from adjectives
+ (r'.*ly$', 'RB'), # adverbs
+ (r'.*s$', 'NNS'), # plural nouns
+ (r'.*ing$', 'VBG'), # gerunds
+ (r'.*ed$', 'VBD'), # past tense verbs
+ (r'.*', 'NN'), # nouns (default)
]
)
- brown_train = brown.tagged_sents(categories="news")
+ brown_train = brown.tagged_sents(categories='news')
unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)
# Override particular words
main_tagger = RegexpTagger(
- [(r"(A|a|An|an)$", "ex_quant"), (r"(Every|every|All|all)$", "univ_quant")],
+ [(r'(A|a|An|an)$', 'ex_quant'), (r'(Every|every|All|all)$', 'univ_quant')],
backoff=trigram_tagger,
)
if not indices:
indices = set()
- if isinstance(meaning, str):
+ if isinstance(meaning, string_types):
self.meaning = drt.DrtExpression.fromstring(meaning)
elif isinstance(meaning, drt.DrtExpression):
self.meaning = meaning
else:
raise RuntimeError(
- "Meaning term neither string or expression: %s, %s"
+ 'Meaning term neither string nor expression: %s, %s'
% (meaning, meaning.__class__)
)
- if isinstance(glue, str):
+ if isinstance(glue, string_types):
self.glue = linearlogic.LinearLogicParser().parse(glue)
elif isinstance(glue, linearlogic.Expression):
self.glue = glue
else:
raise RuntimeError(
- "Glue term neither string or expression: %s, %s"
+ 'Glue term neither string nor expression: %s, %s'
% (glue, glue.__class__)
)
):
if not semtype_file:
semtype_file = os.path.join(
- "grammars", "sample_grammars", "drt_glue.semtype"
+ 'grammars', 'sample_grammars', 'drt_glue.semtype'
)
Glue.__init__(self, semtype_file, remove_duplicates, depparser, verbose)
from nltk.parse import MaltParser
examples = [
- "David sees Mary",
- "David eats a sandwich",
- "every man chases a dog",
- "every man believes a dog sleeps",
- "John gives David a sandwich",
- "John chases himself",
+ 'David sees Mary',
+ 'David eats a sandwich',
+ 'every man chases a dog',
+ 'every man believes a dog sleeps',
+ 'John gives David a sandwich',
+ 'John chases himself',
]
# 'John persuades David to order a pizza',
# 'John tries to go',
# 'every big gray cat leaves',
# 'a former senator leaves',
- print("============== DEMO ==============")
+ print('============== DEMO ==============')
tagger = RegexpTagger(
[
- ("^(David|Mary|John)$", "NNP"),
+ ('^(David|Mary|John)$', 'NNP'),
(
- "^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
- "VB",
+ '^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$',
+ 'VB',
),
- ("^(go|order|vanish|find|approach)$", "VB"),
- ("^(a)$", "ex_quant"),
- ("^(every)$", "univ_quant"),
- ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
- ("^(big|gray|former)$", "JJ"),
- ("^(him|himself)$", "PRP"),
+ ('^(go|order|vanish|find|approach)$', 'VB'),
+ ('^(a)$', 'ex_quant'),
+ ('^(every)$', 'univ_quant'),
+ ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'),
+ ('^(big|gray|former)$', 'JJ'),
+ ('^(him|himself)$', 'PRP'),
]
)
for (i, sentence) in enumerate(examples):
if i == show_example or show_example == -1:
- print("[[[Example %s]]] %s" % (i, sentence))
+ print('[[[Example %s]]] %s' % (i, sentence))
for reading in glue.parse_to_meaning(sentence.split()):
print(reading.simplify())
- print("")
+ print('')
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Author: Peter Wang
# Updated by: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
representation that is not easy to read. We use a "plugging" algorithm to
convert that representation into first-order logic formulas.
"""
+from __future__ import print_function, unicode_literals
from functools import reduce
+from six import itervalues
+
+from nltk import compat
from nltk.parse import load_parser
from nltk.sem.skolemize import skolemize
class Constants(object):
- ALL = "ALL"
- EXISTS = "EXISTS"
- NOT = "NOT"
- AND = "AND"
- OR = "OR"
- IMP = "IMP"
- IFF = "IFF"
- PRED = "PRED"
- LEQ = "LEQ"
- HOLE = "HOLE"
- LABEL = "LABEL"
+ ALL = 'ALL'
+ EXISTS = 'EXISTS'
+ NOT = 'NOT'
+ AND = 'AND'
+ OR = 'OR'
+ IMP = 'IMP'
+ IFF = 'IFF'
+ PRED = 'PRED'
+ LEQ = 'LEQ'
+ HOLE = 'HOLE'
+ LABEL = 'LABEL'
MAP = {
ALL: lambda v, e: AllExpression(v.variable, e),
def _find_top_nodes(self, node_list):
top_nodes = node_list.copy()
- for f in self.fragments.values():
+ for f in itervalues(self.fragments):
# the label is the first argument of the predicate
args = f[1]
for arg in args:
head = [(a, ancestors) for a in args if self.is_node(a)]
self._plug_nodes(head + queue[1:], potential_labels, plug_acc, record)
else:
- raise Exception("queue empty")
+ raise Exception('queue empty')
def _plug_hole(self, hole, ancestors0, queue, potential_labels0, plug_acc0, record):
"""
return node
+@compat.python_2_unicode_compatible
class Constraint(object):
"""
This class represents a constraint of the form (L =< N),
return hash(repr(self))
def __repr__(self):
- return "(%s < %s)" % (self.lhs, self.rhs)
+ return '(%s < %s)' % (self.lhs, self.rhs)
def hole_readings(sentence, grammar_filename=None, verbose=False):
if not grammar_filename:
- grammar_filename = "grammars/sample_grammars/hole.fcfg"
+ grammar_filename = 'grammars/sample_grammars/hole.fcfg'
if verbose:
- print("Reading grammar file", grammar_filename)
+ print('Reading grammar file', grammar_filename)
parser = load_parser(grammar_filename)
tokens = sentence.split()
trees = list(parser.parse(tokens))
if verbose:
- print("Got %d different parses" % len(trees))
+ print('Got %d different parses' % len(trees))
all_readings = []
for tree in trees:
# Get the semantic feature from the top of the parse tree.
- sem = tree.label()["SEM"].simplify()
+ sem = tree.label()['SEM'].simplify()
# Print the raw semantic representation.
if verbose:
- print("Raw: ", sem)
+ print('Raw: ', sem)
# Skolemize away all quantifiers. All variables become unique.
while isinstance(sem, LambdaExpression):
skolemized = skolemize(sem)
if verbose:
- print("Skolemized:", skolemized)
+ print('Skolemized:', skolemized)
# Break the hole semantics representation down into its components
# i.e. holes, labels, formula fragments and constraints.
# Maybe show the details of the semantic representation.
if verbose:
- print("Holes: ", hole_sem.holes)
- print("Labels: ", hole_sem.labels)
- print("Constraints: ", hole_sem.constraints)
- print("Top hole: ", hole_sem.top_hole)
- print("Top labels: ", hole_sem.top_most_labels)
- print("Fragments:")
+ print('Holes: ', hole_sem.holes)
+ print('Labels: ', hole_sem.labels)
+ print('Constraints: ', hole_sem.constraints)
+ print('Top hole: ', hole_sem.top_hole)
+ print('Top labels: ', hole_sem.top_most_labels)
+ print('Fragments:')
for l, f in hole_sem.fragments.items():
- print("\t%s: %s" % (l, f))
+ print('\t%s: %s' % (l, f))
# Find all the possible ways to plug the formulas together.
pluggings = hole_sem.pluggings()
if verbose:
for i, r in enumerate(readings):
print()
- print("%d. %s" % (i, r))
+ print('%d. %s' % (i, r))
print()
all_readings.extend(readings)
return all_readings
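# Usage sketch: hole_readings('every girl chases a dog') returns one first-order
# formula per admissible plugging (one per quantifier scoping); the demo below prints
# these readings.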
-if __name__ == "__main__":
- for r in hole_readings("a dog barks"):
+if __name__ == '__main__':
+ for r in hole_readings('a dog barks'):
print(r)
print()
- for r in hole_readings("every girl chases a dog"):
+ for r in hole_readings('every girl chases a dog'):
print(r)
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, division, unicode_literals
from itertools import chain
from nltk.internals import Counter
+from nltk.compat import python_2_unicode_compatible
+@python_2_unicode_compatible
class FStructure(dict):
def safeappend(self, key, item):
"""
depgraph = DependencyGraph()
nodes = depgraph.nodes
- self._to_depgraph(nodes, 0, "ROOT")
+ self._to_depgraph(nodes, 0, 'ROOT')
# Add all the dependencies for all the nodes
for address, node in nodes.items():
- for n2 in (n for n in nodes.values() if n["rel"] != "TOP"):
- if n2["head"] == address:
- relation = n2["rel"]
- node["deps"].setdefault(relation, [])
- node["deps"][relation].append(n2["address"])
+ for n2 in (n for n in nodes.values() if n['rel'] != 'TOP'):
+ if n2['head'] == address:
+ relation = n2['rel']
+ node['deps'].setdefault(relation, [])
+ node['deps'][relation].append(n2['address'])
depgraph.root = nodes[1]
nodes[index].update(
{
- "address": index,
- "word": self.pred[0],
- "tag": self.pred[1],
- "head": head,
- "rel": rel,
+ 'address': index,
+ 'word': self.pred[0],
+ 'tag': self.pred[1],
+ 'head': head,
+ 'rel': rel,
}
)
new_index = len(nodes)
nodes[new_index].update(
{
- "address": new_index,
- "word": item[0],
- "tag": item[1],
- "head": index,
- "rel": feature,
+ 'address': new_index,
+ 'word': item[0],
+ 'tag': item[1],
+ 'head': index,
+ 'rel': feature,
}
)
elif isinstance(item, list):
n._to_depgraph(nodes, index, feature)
else:
raise Exception(
- "feature %s is not an FStruct, a list, or a tuple" % feature
+ 'feature %s is not an FStruct, a list, or a tuple' % feature
)
@staticmethod
if not label_counter:
label_counter = Counter()
- if node["rel"].lower() in ["spec", "punct"]:
+ if node['rel'].lower() in ['spec', 'punct']:
# the value of a 'spec' entry is a word, not an FStructure
- return (node["word"], node["tag"])
+ return (node['word'], node['tag'])
else:
fstruct = FStructure()
fstruct.parent = parent
- word, tag = node["word"], node["tag"]
- if tag[:2] == "VB":
- if tag[2:3] == "D":
- fstruct.safeappend("tense", ("PAST", "tense"))
+ word, tag = node['word'], node['tag']
+ if tag[:2] == 'VB':
+ if tag[2:3] == 'D':
+ fstruct.safeappend('tense', ('PAST', 'tense'))
fstruct.pred = (word, tag[:2])
if not fstruct.pred:
fstruct.pred = (word, tag)
- children = [depgraph.nodes[idx] for idx in chain(*node["deps"].values())]
+ children = [depgraph.nodes[idx] for idx in chain(*node['deps'].values())]
for child in children:
fstruct.safeappend(
- child["rel"],
+ child['rel'],
FStructure._read_depgraph(child, depgraph, label_counter, fstruct),
)
:type value: int
"""
letter = [
- "f",
- "g",
- "h",
- "i",
- "j",
- "k",
- "l",
- "m",
- "n",
- "o",
- "p",
- "q",
- "r",
- "s",
- "t",
- "u",
- "v",
- "w",
- "x",
- "y",
- "z",
- "a",
- "b",
- "c",
- "d",
- "e",
+ 'f',
+ 'g',
+ 'h',
+ 'i',
+ 'j',
+ 'k',
+ 'l',
+ 'm',
+ 'n',
+ 'o',
+ 'p',
+ 'q',
+ 'r',
+ 's',
+ 't',
+ 'u',
+ 'v',
+ 'w',
+ 'x',
+ 'y',
+ 'z',
+ 'a',
+ 'b',
+ 'c',
+ 'd',
+ 'e',
][value - 1]
num = int(value) // 26
if num > 0:
return letter
def __repr__(self):
- return self.__str__().replace("\n", "")
+ return self.__unicode__().replace('\n', '')
def __str__(self):
return self.pretty_format()
def pretty_format(self, indent=3):
try:
- accum = "%s:[" % self.label
+ accum = '%s:[' % self.label
except NameError:
- accum = "["
+ accum = '['
try:
- accum += "pred '%s'" % (self.pred[0])
+ accum += 'pred \'%s\'' % (self.pred[0])
except NameError:
pass
for item in self[feature]:
if isinstance(item, FStructure):
next_indent = indent + len(feature) + 3 + len(self.label)
- accum += "\n%s%s %s" % (
- " " * (indent),
+ accum += '\n%s%s %s' % (
+ ' ' * (indent),
feature,
item.pretty_format(next_indent),
)
elif isinstance(item, tuple):
- accum += "\n%s%s '%s'" % (" " * (indent), feature, item[0])
+ accum += '\n%s%s \'%s\'' % (' ' * (indent), feature, item[0])
elif isinstance(item, list):
- accum += "\n%s%s {%s}" % (
- " " * (indent),
+ accum += '\n%s%s {%s}' % (
+ ' ' * (indent),
feature,
- ("\n%s" % (" " * (indent + len(feature) + 2))).join(item),
+ ('\n%s' % (' ' * (indent + len(feature) + 2))).join(item),
)
else: # ERROR
raise Exception(
- "feature %s is not an FStruct, a list, or a tuple" % feature
+ 'feature %s is not an FStruct, a list, or a tuple' % feature
)
- return accum + "]"
+ return accum + ']'
def demo_read_depgraph():
print(FStructure.read_depgraph(dg))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo_read_depgraph()
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+
+from six import string_types
from nltk.internals import Counter
+from nltk.compat import python_2_unicode_compatible
from nltk.sem.logic import LogicParser, APP
_counter = Counter()
class Tokens(object):
# Punctuation
- OPEN = "("
- CLOSE = ")"
+ OPEN = '('
+ CLOSE = ')'
# Operations
- IMP = "-o"
+ IMP = '-o'
PUNCT = [OPEN, CLOSE]
TOKENS = PUNCT + [IMP]
return ConstantExpression(name)
+@python_2_unicode_compatible
class Expression(object):
_linear_logic_parser = LinearLogicParser()
return self.applyto(other)
def __repr__(self):
- return "<%s %s>" % (self.__class__.__name__, self)
+ return '<%s %s>' % (self.__class__.__name__, self)
+@python_2_unicode_compatible
class AtomicExpression(Expression):
def __init__(self, name, dependencies=None):
"""
:param name: str for the constant name
:param dependencies: list of int for the indices on which this atom is dependent
"""
- assert isinstance(name, str)
+ assert isinstance(name, string_types)
self.name = name
if not dependencies:
raise UnificationException(self, other, bindings)
+@python_2_unicode_compatible
class ImpExpression(Expression):
def __init__(self, antecedent, consequent):
"""
(c, c_new) = self.consequent.compile_neg(index_counter, glueFormulaFactory)
fresh_index = index_counter.get()
c.dependencies.append(fresh_index)
- new_v = glueFormulaFactory("v%s" % fresh_index, a, set([fresh_index]))
+ new_v = glueFormulaFactory('v%s' % fresh_index, a, set([fresh_index]))
return (c, a_new + c_new + [new_v])
def initialize_labels(self, fstruct):
def __hash__(self):
return hash(
- "%s%s%s" % (hash(self.antecedent), Tokens.IMP, hash(self.consequent))
+ '%s%s%s' % (hash(self.antecedent), Tokens.IMP, hash(self.consequent))
)
+@python_2_unicode_compatible
class ApplicationExpression(Expression):
def __init__(self, function, argument, argument_indices=None):
"""
bindings += function_simp.antecedent.unify(argument_simp, bindings)
except UnificationException as e:
raise LinearLogicApplicationException(
- "Cannot apply %s to %s. %s" % (function_simp, argument_simp, e)
+ 'Cannot apply %s to %s. %s' % (function_simp, argument_simp, e)
)
# If you are running it on compiled premises, more conditions apply
# A.dependencies of (A -o (B -o C)) must be a proper subset of argument_indices
if not set(function_simp.antecedent.dependencies) < argument_indices:
raise LinearLogicApplicationException(
- "Dependencies unfulfilled when attempting to apply Linear Logic formula %s to %s"
+ 'Dependencies unfulfilled when attempting to apply Linear Logic formula %s to %s'
% (function_simp, argument_simp)
)
if set(function_simp.antecedent.dependencies) == argument_indices:
raise LinearLogicApplicationException(
- "Dependencies not a proper subset of indices when attempting to apply Linear Logic formula %s to %s"
+ 'Dependencies not a proper subset of indices when attempting to apply Linear Logic formula %s to %s'
% (function_simp, argument_simp)
)
def __hash__(self):
return hash(
- "%s%s%s" % (hash(self.antecedent), Tokens.OPEN, hash(self.consequent))
+ '%s%s%s' % (hash(self.antecedent), Tokens.OPEN, hash(self.consequent))
)
+@python_2_unicode_compatible
class BindingDict(object):
def __init__(self, bindings=None):
"""
self.d[variable] = binding
else:
raise VariableBindingException(
- "Variable %s already bound to another value" % (variable)
+ 'Variable %s already bound to another value' % (variable)
)
def __getitem__(self, variable):
return combined
except VariableBindingException:
raise VariableBindingException(
- "Attempting to add two contradicting"
- " VariableBindingsLists: %s, %s" % (self, other)
+ 'Attempting to add two contradicting'
+ ' VariableBindingsLists: %s, %s' % (self, other)
)
def __ne__(self, other):
return not self == other
def __str__(self):
- return "{" + ", ".join("%s: %s" % (v, self.d[v]) for v in self.d) + "}"
+ return '{' + ', '.join('%s: %s' % (v, self.d[v]) for v in self.d) + '}'
def __repr__(self):
- return "BindingDict: %s" % self
+ return 'BindingDict: %s' % self
class VariableBindingException(Exception):
class UnificationException(Exception):
def __init__(self, a, b, bindings):
- Exception.__init__(self, "Cannot unify %s with %s given %s" % (a, b, bindings))
+ Exception.__init__(self, 'Cannot unify %s with %s given %s' % (a, b, bindings))
class LinearLogicApplicationException(Exception):
def demo():
lexpr = Expression.fromstring
- print(lexpr(r"f"))
- print(lexpr(r"(g -o f)"))
- print(lexpr(r"((g -o G) -o G)"))
- print(lexpr(r"g -o h -o f"))
- print(lexpr(r"(g -o f)(g)").simplify())
- print(lexpr(r"(H -o f)(g)").simplify())
- print(lexpr(r"((g -o G) -o G)((g -o f))").simplify())
- print(lexpr(r"(H -o H)((g -o f))").simplify())
+ print(lexpr(r'f'))
+ print(lexpr(r'(g -o f)'))
+ print(lexpr(r'((g -o G) -o G)'))
+ print(lexpr(r'g -o h -o f'))
+ print(lexpr(r'(g -o f)(g)').simplify())
+ print(lexpr(r'(H -o f)(g)').simplify())
+ print(lexpr(r'((g -o G) -o G)((g -o f))').simplify())
+ print(lexpr(r'(H -o H)((g -o f))').simplify())
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
A version of first order predicate logic, built on
top of the typed lambda calculus.
"""
+from __future__ import print_function, unicode_literals
import re
import operator
from collections import defaultdict
from functools import reduce, total_ordering
+from six import string_types
+
from nltk.util import Trie
from nltk.internals import Counter
+from nltk.compat import python_2_unicode_compatible
-APP = "APP"
+APP = 'APP'
_counter = Counter()
class Tokens(object):
- LAMBDA = "\\"
- LAMBDA_LIST = ["\\"]
+ LAMBDA = '\\'
+ LAMBDA_LIST = ['\\']
# Quantifiers
- EXISTS = "exists"
- EXISTS_LIST = ["some", "exists", "exist"]
- ALL = "all"
- ALL_LIST = ["all", "forall"]
+ EXISTS = 'exists'
+ EXISTS_LIST = ['some', 'exists', 'exist']
+ ALL = 'all'
+ ALL_LIST = ['all', 'forall']
# Punctuation
- DOT = "."
- OPEN = "("
- CLOSE = ")"
- COMMA = ","
+ DOT = '.'
+ OPEN = '('
+ CLOSE = ')'
+ COMMA = ','
# Operations
- NOT = "-"
- NOT_LIST = ["not", "-", "!"]
- AND = "&"
- AND_LIST = ["and", "&", "^"]
- OR = "|"
- OR_LIST = ["or", "|"]
- IMP = "->"
- IMP_LIST = ["implies", "->", "=>"]
- IFF = "<->"
- IFF_LIST = ["iff", "<->", "<=>"]
- EQ = "="
- EQ_LIST = ["=", "=="]
- NEQ = "!="
- NEQ_LIST = ["!="]
+ NOT = '-'
+ NOT_LIST = ['not', '-', '!']
+ AND = '&'
+ AND_LIST = ['and', '&', '^']
+ OR = '|'
+ OR_LIST = ['or', '|']
+ IMP = '->'
+ IMP_LIST = ['implies', '->', '=>']
+ IFF = '<->'
+ IFF_LIST = ['iff', '<->', '<=>']
+ EQ = '='
+ EQ_LIST = ['=', '==']
+ NEQ = '!='
+ NEQ_LIST = ['!=']
# Collections of tokens
BINOPS = AND_LIST + OR_LIST + IMP_LIST + IFF_LIST
TOKENS = BINOPS + EQ_LIST + NEQ_LIST + QUANTS + LAMBDA_LIST + PUNCT + NOT_LIST
# Special
- SYMBOLS = [x for x in TOKENS if re.match(r"^[-\\.(),!&^|>=<]*$", x)]
+ SYMBOLS = [x for x in TOKENS if re.match(r'^[-\\.(),!&^|>=<]*$', x)]
def boolean_ops():
print("%-15s\t%s" % pair)
+@python_2_unicode_compatible
class LogicParser(object):
"""A lambda calculus expression parser."""
if self.inRange(0):
raise UnexpectedTokenException(self._currentIndex + 1, self.token(0))
except LogicalExpressionException as e:
- msg = "%s\n%s\n%s^" % (e, data, " " * mapping[e.index - 1])
+ msg = '%s\n%s\n%s^' % (e, data, ' ' * mapping[e.index - 1])
raise LogicalExpressionException(None, msg)
if self.type_check:
out = []
mapping = {}
tokenTrie = Trie(self.get_all_symbols())
- token = ""
+ token = ''
data_idx = 0
token_start_idx = data_idx
while data_idx < len(data):
st = tokenTrie
c = data[data_idx]
- symbol = ""
+ symbol = ''
while c in st:
symbol += c
st = st[c]
if token:
mapping[len(out)] = token_start_idx
out.append(token)
- token = ""
+ token = ''
mapping[len(out)] = data_idx
out.append(symbol)
data_idx += len(symbol)
else:
- if data[data_idx] in " \t\n": # any whitespace
+ if data[data_idx] in ' \t\n': # any whitespace
if token:
mapping[len(out)] = token_start_idx
out.append(token)
- token = ""
+ token = ''
else:
if not token:
token_start_idx = data_idx
return out, mapping
def process_quoted_token(self, data_idx, data):
- token = ""
+ token = ''
c = data[data_idx]
i = data_idx
for start, end, escape, incl_quotes in self.quote_chars:
token += data[i]
i += 1
if not token:
- raise LogicalExpressionException(None, "Empty quoted token found")
+ raise LogicalExpressionException(None, 'Empty quoted token found')
break
return token, i
tok = self.token()
except ExpectedMoreTokensException:
raise ExpectedMoreTokensException(
- self._currentIndex + 1, message="Expression expected."
+ self._currentIndex + 1, message='Expression expected.'
)
accum = self.handle(tok, context)
if not accum:
raise UnexpectedTokenException(
- self._currentIndex, tok, message="Expression expected."
+ self._currentIndex, tok, message='Expression expected.'
)
return self.attempt_adjuncts(accum, context)
try:
tok = self.token()
except ExpectedMoreTokensException as e:
- raise ExpectedMoreTokensException(e.index, "Variable expected.")
+ raise ExpectedMoreTokensException(e.index, 'Variable expected.')
if isinstance(self.make_VariableExpression(tok), ConstantExpression):
raise LogicalExpressionException(
self._currentIndex,
self._currentIndex + 2,
message="Variable and Expression expected following lambda operator.",
)
- vars = [self.get_next_token_variable("abstracted")]
+ vars = [self.get_next_token_variable('abstracted')]
while True:
if not self.inRange(0) or (
self.token(0) == Tokens.DOT and not self.inRange(1)
if not self.isvariable(self.token(0)):
break
# Support expressions like: \x y.M == \x.\y.M
- vars.append(self.get_next_token_variable("abstracted"))
+ vars.append(self.get_next_token_variable('abstracted'))
if self.inRange(0) and self.token(0) == Tokens.DOT:
self.token() # swallow the dot
message="Variable and Expression expected following quantifier '%s'."
% tok,
)
- vars = [self.get_next_token_variable("quantified")]
+ vars = [self.get_next_token_variable('quantified')]
while True:
if not self.inRange(0) or (
self.token(0) == Tokens.DOT and not self.inRange(1)
if not self.isvariable(self.token(0)):
break
# Support expressions like: some x y.M == some x.some y.M
- vars.append(self.get_next_token_variable("quantified"))
+ vars.append(self.get_next_token_variable('quantified'))
if self.inRange(0) and self.token(0) == Tokens.DOT:
self.token() # swallow the dot
def __repr__(self):
if self.inRange(0):
- msg = "Next token: " + self.token(0)
+ msg = 'Next token: ' + self.token(0)
else:
- msg = "No more tokens"
- return "<" + self.__class__.__name__ + ": " + msg + ">"
+ msg = 'No more tokens'
+ return '<' + self.__class__.__name__ + ': ' + msg + '>'
def read_logic(s, logic_parser=None, encoding=None):
statements = []
for linenum, line in enumerate(s.splitlines()):
line = line.strip()
- if line.startswith("#") or line == "":
+ if line.startswith('#') or line == '':
continue
try:
statements.append(logic_parser.parse(line))
except LogicalExpressionException:
- raise ValueError("Unable to parse line %s: %s" % (linenum, line))
+ raise ValueError('Unable to parse line %s: %s' % (linenum, line))
return statements
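# Usage sketch (assuming the default LogicParser when none is supplied):
#     read_logic('# axioms\nall x.(man(x) -> mortal(x))\nman(socrates)')
# returns a list of two parsed Expression objects; blank lines and '#' lines are skipped.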
@total_ordering
+@python_2_unicode_compatible
class Variable(object):
def __init__(self, name):
"""
:param name: the name of the variable
"""
- assert isinstance(name, str), "%s is not a string" % name
+ assert isinstance(name, string_types), "%s is not a string" % name
self.name = name
def __eq__(self, other):
"""
if pattern is not None:
if is_indvar(pattern.name):
- prefix = "z"
+ prefix = 'z'
elif is_funcvar(pattern.name):
- prefix = "F"
+ prefix = 'F'
elif is_eventvar(pattern.name):
- prefix = "e0"
+ prefix = 'e0'
else:
assert False, "Cannot generate a unique constant"
else:
- prefix = "z"
+ prefix = 'z'
v = Variable("%s%s" % (prefix, _counter.get()))
while ignore is not None and v in ignore:
Return a skolem function over the variables in univ_scope
:param univ_scope: the universally quantified variables that the Skolem term is applied to
"""
- skolem = VariableExpression(Variable("F%s" % _counter.get()))
+ skolem = VariableExpression(Variable('F%s' % _counter.get()))
if univ_scope:
for v in list(univ_scope):
skolem = skolem(VariableExpression(v))
return skolem
+@python_2_unicode_compatible
class Type(object):
def __repr__(self):
return "%s" % self
return read_type(s)
+@python_2_unicode_compatible
class ComplexType(Type):
def __init__(self, first, second):
assert isinstance(first, Type), "%s is not a Type" % first
if self == ANY_TYPE:
return "%s" % ANY_TYPE
else:
- return "<%s,%s>" % (self.first, self.second)
+ return '<%s,%s>' % (self.first, self.second)
def str(self):
if self == ANY_TYPE:
return ANY_TYPE.str()
else:
- return "(%s -> %s)" % (self.first.str(), self.second.str())
+ return '(%s -> %s)' % (self.first.str(), self.second.str())
class BasicType(Type):
return None
+@python_2_unicode_compatible
class EntityType(BasicType):
def __str__(self):
- return "e"
+ return 'e'
def str(self):
- return "IND"
+ return 'IND'
+@python_2_unicode_compatible
class TruthValueType(BasicType):
def __str__(self):
- return "t"
+ return 't'
def str(self):
- return "BOOL"
+ return 'BOOL'
+@python_2_unicode_compatible
class EventType(BasicType):
def __str__(self):
- return "v"
+ return 'v'
def str(self):
- return "EVENT"
+ return 'EVENT'
+@python_2_unicode_compatible
class AnyType(BasicType, ComplexType):
def __init__(self):
pass
return other
def __str__(self):
- return "?"
+ return '?'
def str(self):
- return "ANY"
+ return 'ANY'
TRUTH_TYPE = TruthValueType()
def read_type(type_string):
- assert isinstance(type_string, str)
- type_string = type_string.replace(" ", "") # remove spaces
+ assert isinstance(type_string, string_types)
+ type_string = type_string.replace(' ', '') # remove spaces
- if type_string[0] == "<":
- assert type_string[-1] == ">"
+ if type_string[0] == '<':
+ assert type_string[-1] == '>'
paren_count = 0
for i, char in enumerate(type_string):
- if char == "<":
+ if char == '<':
paren_count += 1
- elif char == ">":
+ elif char == '>':
paren_count -= 1
assert paren_count > 0
- elif char == ",":
+ elif char == ',':
if paren_count == 1:
break
return ComplexType(
elif type_string[0] == "%s" % ANY_TYPE:
return ANY_TYPE
else:
- raise LogicalExpressionException(
- None, "Unexpected character: '%s'." % type_string[0]
- )
+ raise LogicalExpressionException(None, "Unexpected character: '%s'." % type_string[0])
class TypeException(Exception):
raise NotImplementedError()
+@python_2_unicode_compatible
class Expression(SubstituteBindingsI):
"""This is the base abstract object for all logical expressions"""
val = self.make_VariableExpression(val)
elif not isinstance(val, Expression):
raise ValueError(
- "Can not substitute a non-expression "
- "value into an expression: %r" % (val,)
+ 'Can not substitute a non-expression '
+ 'value into an expression: %r' % (val,)
)
# Substitute bindings in the target value.
val = val.substitute_bindings(bindings)
result = self
for i, e in enumerate(sorted(get_indiv_vars(self), key=lambda e: e.variable)):
if isinstance(e, EventVariableExpression):
- newVar = e.__class__(Variable("e0%s" % (i + 1)))
+ newVar = e.__class__(Variable('e0%s' % (i + 1)))
elif isinstance(e, IndividualVariableExpression):
- newVar = e.__class__(Variable("z%s" % (i + 1)))
+ newVar = e.__class__(Variable('z%s' % (i + 1)))
else:
newVar = e
result = result.replace(e.variable, newVar, True)
return self.visit(function, lambda parts: combinator(*parts))
def __repr__(self):
- return "<%s %s>" % (self.__class__.__name__, self)
+ return '<%s %s>' % (self.__class__.__name__, self)
def __str__(self):
return self.str()
:return: set of ``Variable`` objects
"""
return self.free() | set(
- p for p in self.predicates() | self.constants() if re.match("^[?@]", p.name)
+ p for p in self.predicates() | self.constants() if re.match('^[?@]', p.name)
)
def free(self):
return VariableExpression(variable)
+@python_2_unicode_compatible
class ApplicationExpression(Expression):
r"""
This class is used to represent two related types of logical expressions.
# uncurry the arguments and find the base function
if self.is_atom():
function, args = self.uncurry()
- arg_str = ",".join("%s" % arg for arg in args)
+ arg_str = ','.join("%s" % arg for arg in args)
else:
# Leave arguments curried
function = self.function
@total_ordering
+@python_2_unicode_compatible
class AbstractVariableExpression(Expression):
"""This class represents a variable to be used as a predicate or entity"""
__hash__ = Expression.__hash__
+@python_2_unicode_compatible
class LambdaExpression(VariableBinderExpression):
@property
def type(self):
term = term.term
return (
Tokens.LAMBDA
- + " ".join("%s" % v for v in variables)
+ + ' '.join("%s" % v for v in variables)
+ Tokens.DOT
+ "%s" % term
)
+@python_2_unicode_compatible
class QuantifiedExpression(VariableBinderExpression):
@property
def type(self):
term = term.term
return (
self.getQuantifier()
- + " "
- + " ".join("%s" % v for v in variables)
+ + ' '
+ + ' '.join("%s" % v for v in variables)
+ Tokens.DOT
+ "%s" % term
)
return Tokens.ALL
+@python_2_unicode_compatible
class NegatedExpression(Expression):
def __init__(self, term):
assert isinstance(term, Expression), "%s is not an Expression" % term
return Tokens.NOT + "%s" % self.term
+@python_2_unicode_compatible
class BinaryExpression(Expression):
def __init__(self, first, second):
assert isinstance(first, Expression), "%s is not an Expression" % first
def __str__(self):
first = self._str_subex(self.first)
second = self._str_subex(self.second)
- return Tokens.OPEN + first + " " + self.getOp() + " " + second + Tokens.CLOSE
+ return Tokens.OPEN + first + ' ' + self.getOp() + ' ' + second + Tokens.CLOSE
def _str_subex(self, subex):
return "%s" % subex
elif unexpected:
msg = "Unexpected token: '%s'." % unexpected
if message:
- msg += " " + message
+ msg += ' ' + message
else:
msg = "Expected token '%s'." % expected
LogicalExpressionException.__init__(self, index, msg)
class ExpectedMoreTokensException(LogicalExpressionException):
def __init__(self, index, message=None):
if not message:
- message = "More tokens expected."
+ message = 'More tokens expected.'
LogicalExpressionException.__init__(
- self, index, "End of input found. " + message
+ self, index, 'End of input found. ' + message
)
:param expr: str
:return: bool True if expr is of the correct form
"""
- assert isinstance(expr, str), "%s is not a string" % expr
- return re.match(r"^[a-df-z]\d*$", expr) is not None
+ assert isinstance(expr, string_types), "%s is not a string" % expr
+ return re.match(r'^[a-df-z]\d*$', expr) is not None
def is_funcvar(expr):
:param expr: str
:return: bool True if expr is of the correct form
"""
- assert isinstance(expr, str), "%s is not a string" % expr
- return re.match(r"^[A-Z]\d*$", expr) is not None
+ assert isinstance(expr, string_types), "%s is not a string" % expr
+ return re.match(r'^[A-Z]\d*$', expr) is not None
def is_eventvar(expr):
:param expr: str
:return: bool True if expr is of the correct form
"""
- assert isinstance(expr, str), "%s is not a string" % expr
- return re.match(r"^e\d*$", expr) is not None
+ assert isinstance(expr, string_types), "%s is not a string" % expr
+ return re.match(r'^e\d*$', expr) is not None
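# Quick illustration of the three predicates above:
#     is_indvar('x'), is_indvar('z12')     -> True;   is_indvar('e2')  -> False
#     is_funcvar('F'), is_funcvar('P3')    -> True;   is_funcvar('fx') -> False
#     is_eventvar('e'), is_eventvar('e01') -> True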
def demo():
lexpr = Expression.fromstring
- print("=" * 20 + "Test reader" + "=" * 20)
- print(lexpr(r"john"))
- print(lexpr(r"man(x)"))
- print(lexpr(r"-man(x)"))
- print(lexpr(r"(man(x) & tall(x) & walks(x))"))
- print(lexpr(r"exists x.(man(x) & tall(x) & walks(x))"))
- print(lexpr(r"\x.man(x)"))
- print(lexpr(r"\x.man(x)(john)"))
- print(lexpr(r"\x y.sees(x,y)"))
- print(lexpr(r"\x y.sees(x,y)(a,b)"))
- print(lexpr(r"(\x.exists y.walks(x,y))(x)"))
- print(lexpr(r"exists x.x = y"))
- print(lexpr(r"exists x.(x = y)"))
- print(lexpr("P(x) & x=y & P(y)"))
- print(lexpr(r"\P Q.exists x.(P(x) & Q(x))"))
- print(lexpr(r"man(x) <-> tall(x)"))
-
- print("=" * 20 + "Test simplify" + "=" * 20)
- print(lexpr(r"\x.\y.sees(x,y)(john)(mary)").simplify())
- print(lexpr(r"\x.\y.sees(x,y)(john, mary)").simplify())
- print(lexpr(r"all x.(man(x) & (\x.exists y.walks(x,y))(x))").simplify())
- print(lexpr(r"(\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x))(\x.bark(x))").simplify())
-
- print("=" * 20 + "Test alpha conversion and binder expression equality" + "=" * 20)
- e1 = lexpr("exists x.P(x)")
+ print('=' * 20 + 'Test reader' + '=' * 20)
+ print(lexpr(r'john'))
+ print(lexpr(r'man(x)'))
+ print(lexpr(r'-man(x)'))
+ print(lexpr(r'(man(x) & tall(x) & walks(x))'))
+ print(lexpr(r'exists x.(man(x) & tall(x) & walks(x))'))
+ print(lexpr(r'\x.man(x)'))
+ print(lexpr(r'\x.man(x)(john)'))
+ print(lexpr(r'\x y.sees(x,y)'))
+ print(lexpr(r'\x y.sees(x,y)(a,b)'))
+ print(lexpr(r'(\x.exists y.walks(x,y))(x)'))
+ print(lexpr(r'exists x.x = y'))
+ print(lexpr(r'exists x.(x = y)'))
+ print(lexpr('P(x) & x=y & P(y)'))
+ print(lexpr(r'\P Q.exists x.(P(x) & Q(x))'))
+ print(lexpr(r'man(x) <-> tall(x)'))
+
+ print('=' * 20 + 'Test simplify' + '=' * 20)
+ print(lexpr(r'\x.\y.sees(x,y)(john)(mary)').simplify())
+ print(lexpr(r'\x.\y.sees(x,y)(john, mary)').simplify())
+ print(lexpr(r'all x.(man(x) & (\x.exists y.walks(x,y))(x))').simplify())
+ print(lexpr(r'(\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x))(\x.bark(x))').simplify())
+
+ print('=' * 20 + 'Test alpha conversion and binder expression equality' + '=' * 20)
+ e1 = lexpr('exists x.P(x)')
print(e1)
- e2 = e1.alpha_convert(Variable("z"))
+ e2 = e1.alpha_convert(Variable('z'))
print(e2)
print(e1 == e2)
def demo_errors():
- print("=" * 20 + "Test reader errors" + "=" * 20)
- demoException("(P(x) & Q(x)")
- demoException("((P(x) &) & Q(x))")
- demoException("P(x) -> ")
- demoException("P(x")
- demoException("P(x,")
- demoException("P(x,)")
- demoException("exists")
- demoException("exists x.")
- demoException("\\")
- demoException("\\ x y.")
- demoException("P(x)Q(x)")
- demoException("(P(x)Q(x)")
- demoException("exists x -> y")
+ print('=' * 20 + 'Test reader errors' + '=' * 20)
+ demoException('(P(x) & Q(x)')
+ demoException('((P(x) &) & Q(x))')
+ demoException('P(x) -> ')
+ demoException('P(x')
+ demoException('P(x,')
+ demoException('P(x,)')
+ demoException('exists')
+ demoException('exists x.')
+ demoException('\\')
+ demoException('\\ x y.')
+ demoException('P(x)Q(x)')
+ demoException('(P(x)Q(x)')
+ demoException('exists x -> y')
def demoException(s):
print("%s : %s" % (ex.str(), ex.type))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# demo_errors()
# Natural Language Toolkit: Relation Extraction
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
- A clause is an atom of the form ``relsym(subjsym, objsym)``,
where the relation, subject and object have been canonicalized to single strings.
"""
+from __future__ import print_function
# todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs?
from collections import defaultdict
-import html
import re
+from six.moves import html_entities
+
# Dictionary that associates corpora with NE classes
NE_CLASSES = {
- "ieer": [
- "LOCATION",
- "ORGANIZATION",
- "PERSON",
- "DURATION",
- "DATE",
- "CARDINAL",
- "PERCENT",
- "MONEY",
- "MEASURE",
+ 'ieer': [
+ 'LOCATION',
+ 'ORGANIZATION',
+ 'PERSON',
+ 'DURATION',
+ 'DATE',
+ 'CARDINAL',
+ 'PERCENT',
+ 'MONEY',
+ 'MEASURE',
],
- "conll2002": ["LOC", "PER", "ORG"],
- "ace": [
- "LOCATION",
- "ORGANIZATION",
- "PERSON",
- "DURATION",
- "DATE",
- "CARDINAL",
- "PERCENT",
- "MONEY",
- "MEASURE",
- "FACILITY",
- "GPE",
+ 'conll2002': ['LOC', 'PER', 'ORG'],
+ 'ace': [
+ 'LOCATION',
+ 'ORGANIZATION',
+ 'PERSON',
+ 'DURATION',
+ 'DATE',
+ 'CARDINAL',
+ 'PERCENT',
+ 'MONEY',
+ 'MEASURE',
+ 'FACILITY',
+ 'GPE',
],
}
# Allow abbreviated class labels
-short2long = dict(LOC="LOCATION", ORG="ORGANIZATION", PER="PERSON")
-long2short = dict(LOCATION="LOC", ORGANIZATION="ORG", PERSON="PER")
+short2long = dict(LOC='LOCATION', ORG='ORGANIZATION', PER='PERSON')
+long2short = dict(LOCATION='LOC', ORGANIZATION='ORG', PERSON='PER')
def _expand(type):
return type
-def _join(lst, sep=" ", untag=False):
+def _join(lst, sep=' ', untag=False):
"""
Join a list into a string, turning tagged tuples into tag strings or just words.
:param untag: if ``True``, omit the tag from tagged input strings.
return sep.join(tuple2str(tup) for tup in lst)
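# Example (sketch, assuming the default '/' separator of nltk.tag.tuple2str):
#     _join([('the', 'DT'), ('cat', 'NN')])             -> 'the/DT cat/NN'
#     _join([('the', 'DT'), ('cat', 'NN')], untag=True) -> 'the cat'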
-def descape_entity(m, defs=html.entities.entitydefs):
+def descape_entity(m, defs=html_entities.entitydefs):
"""
Translate one entity to its ISO Latin value.
Inspired by example from effbot.org
"""
+ # s = 'mcglashan_&_sarrail'
+ # l = ['mcglashan', '&', 'sarrail']
+ # pattern = re.compile("&(\w+?);")
+ # new = list2sym(l)
+ # s = pattern.sub(descape_entity, s)
+ # print s, new
try:
return defs[m.group(1)]
:return: a Unicode string without whitespace
:rtype: unicode
"""
- sym = _join(lst, "_", untag=True)
+ sym = _join(lst, '_', untag=True)
sym = sym.lower()
ENT = re.compile("&(\w+?);")
sym = ENT.sub(descape_entity, sym)
- sym = sym.replace(".", "")
+ sym = sym.replace('.', '')
return sym
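# Example (sketch): list2sym([('New', 'NNP'), ('York', 'NNP')]) -> 'new_york'
# (joined with '_', lowercased, HTML entities descaped, and periods removed).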
result = []
while len(pairs) > 2:
reldict = defaultdict(str)
- reldict["lcon"] = _join(pairs[0][0][-window:])
- reldict["subjclass"] = pairs[0][1].label()
- reldict["subjtext"] = _join(pairs[0][1].leaves())
- reldict["subjsym"] = list2sym(pairs[0][1].leaves())
- reldict["filler"] = _join(pairs[1][0])
- reldict["untagged_filler"] = _join(pairs[1][0], untag=True)
- reldict["objclass"] = pairs[1][1].label()
- reldict["objtext"] = _join(pairs[1][1].leaves())
- reldict["objsym"] = list2sym(pairs[1][1].leaves())
- reldict["rcon"] = _join(pairs[2][0][:window])
+ reldict['lcon'] = _join(pairs[0][0][-window:])
+ reldict['subjclass'] = pairs[0][1].label()
+ reldict['subjtext'] = _join(pairs[0][1].leaves())
+ reldict['subjsym'] = list2sym(pairs[0][1].leaves())
+ reldict['filler'] = _join(pairs[1][0])
+ reldict['untagged_filler'] = _join(pairs[1][0], untag=True)
+ reldict['objclass'] = pairs[1][1].label()
+ reldict['objtext'] = _join(pairs[1][1].leaves())
+ reldict['objsym'] = list2sym(pairs[1][1].leaves())
+ reldict['rcon'] = _join(pairs[2][0][:window])
if trace:
print(
"(%s(%s, %s)"
% (
- reldict["untagged_filler"],
- reldict["subjclass"],
- reldict["objclass"],
+ reldict['untagged_filler'],
+ reldict['subjclass'],
+ reldict['objclass'],
)
)
result.append(reldict)
return result
-def extract_rels(subjclass, objclass, doc, corpus="ace", pattern=None, window=10):
+def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10):
"""
Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern.
"your value for the object type has not been recognized: %s" % objclass
)
- if corpus == "ace" or corpus == "conll2002":
+ if corpus == 'ace' or corpus == 'conll2002':
pairs = tree2semi_rel(doc)
- elif corpus == "ieer":
+ elif corpus == 'ieer':
pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline)
else:
raise ValueError("corpus type not recognized")
reldicts = semi_rel2reldict(pairs)
relfilter = lambda x: (
- x["subjclass"] == subjclass
- and len(x["filler"].split()) <= window
- and pattern.match(x["filler"])
- and x["objclass"] == objclass
+ x['subjclass'] == subjclass
+ and len(x['filler'].split()) <= window
+ and pattern.match(x['filler'])
+ and x['objclass'] == objclass
)
return list(filter(relfilter, reldicts))
:type reldict: defaultdict
"""
items = [
- class_abbrev(reldict["subjclass"]),
- reldict["subjtext"],
- reldict["filler"],
- class_abbrev(reldict["objclass"]),
- reldict["objtext"],
+ class_abbrev(reldict['subjclass']),
+ reldict['subjtext'],
+ reldict['filler'],
+ class_abbrev(reldict['objclass']),
+ reldict['objtext'],
]
- format = "[%s: %r] %r [%s: %r]"
+ format = '[%s: %r] %r [%s: %r]'
if lcon:
- items = [reldict["lcon"]] + items
- format = "...%r)" + format
+ items = [reldict['lcon']] + items
+ format = '...%r)' + format
if rcon:
- items.append(reldict["rcon"])
- format = format + "(%r..."
+ items.append(reldict['rcon'])
+ format = format + '(%r...'
printargs = tuple(items)
return format % printargs
:param relsym: a label for the relation
:type relsym: str
"""
- items = (relsym, reldict["subjsym"], reldict["objsym"])
+ items = (relsym, reldict['subjsym'], reldict['objsym'])
return "%s(%r, %r)" % items
warnings.warn("Cannot import sqlite; sql flag will be ignored.")
- IN = re.compile(r".*\bin\b(?!\b.+ing)")
+ IN = re.compile(r'.*\bin\b(?!\b.+ing)')
print()
print("IEER: in(ORG, LOC) -- just the clauses:")
if trace:
print(doc.docno)
print("=" * 15)
- for rel in extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN):
- print(clause(rel, relsym="IN"))
+ for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
+ print(clause(rel, relsym='IN'))
if sql:
try:
- rtuple = (rel["subjtext"], rel["objtext"], doc.docno)
+ rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
cur.execute(
"""insert into Locations
values (?, ?, ?)""",
print(doc.docno)
print("=" * 15)
lcon = rcon = True
- for rel in extract_rels("PER", "ORG", doc, corpus="ieer", pattern=ROLES):
+ for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
print(rtuple(rel, lcon=lcon, rcon=rcon))
print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
print("=" * 45)
- for doc in conll2002.chunked_sents("ned.train"):
+ for doc in conll2002.chunked_sents('ned.train'):
lcon = rcon = False
if trace:
lcon = rcon = True
for rel in extract_rels(
- "PER", "ORG", doc, corpus="conll2002", pattern=VAN, window=10
+ 'PER', 'ORG', doc, corpus='conll2002', pattern=VAN, window=10
):
print(rtuple(rel, lcon=lcon, rcon=rcon))
print("=" * 45)
rels = [
rel
- for doc in conll2002.chunked_sents("esp.train")
- for rel in extract_rels("ORG", "LOC", doc, corpus="conll2002", pattern=DE)
+ for doc in conll2002.chunked_sents('esp.train')
+ for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern=DE)
]
for r in rels[:10]:
- print(clause(r, relsym="DE"))
+ print(clause(r, relsym='DE'))
print()
print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
print("=" * 45)
ROLE = re.compile(
- r".*(chairman|president|trader|scientist|economist|analyst|partner).*"
+ r'.*(chairman|president|trader|scientist|economist|analyst|partner).*'
)
rels = []
for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
sent = nltk.ne_chunk(sent)
- rels = extract_rels("PER", "ORG", sent, corpus="ace", pattern=ROLE, window=7)
+ rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
for rel in rels:
- print("{0:<5}{1}".format(i, rtuple(rel)))
+ print('{0:<5}{1}'.format(i, rtuple(rel)))
-if __name__ == "__main__":
+if __name__ == '__main__':
import nltk
from nltk.sem import relextract
#
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
elif isinstance(negated, ApplicationExpression):
return expression
else:
- raise Exception("'%s' cannot be skolemized" % expression)
+ raise Exception('\'%s\' cannot be skolemized' % expression)
elif isinstance(expression, ExistsExpression):
term = skolemize(
expression.term, univ_scope, used_variables | set([expression.variable])
elif isinstance(expression, ApplicationExpression):
return expression
else:
- raise Exception("'%s' cannot be skolemized" % expression)
+ raise Exception('\'%s\' cannot be skolemized' % expression)
def to_cnf(first, second):
#
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
syntax tree, followed by evaluation of the semantic representation in
a first-order model.
"""
+from __future__ import print_function, unicode_literals
import codecs
from nltk.sem import evaluate
return parses
-def root_semrep(syntree, semkey="SEM"):
+def root_semrep(syntree, semkey='SEM'):
"""
Find the semantic representation at the root of a tree.
try:
return node[semkey]
except KeyError:
- print(node, end=" ")
+ print(node, end=' ')
print("has no specification for the feature %s" % semkey)
raise
-def interpret_sents(inputs, grammar, semkey="SEM", trace=0):
+def interpret_sents(inputs, grammar, semkey='SEM', trace=0):
"""
Add the semantic representation to each syntactic parse tree
of each input sentence.
global m0, g0
# Initialize a valuation of non-logical constants."""
v = [
- ("john", "b1"),
- ("mary", "g1"),
- ("suzie", "g2"),
- ("fido", "d1"),
- ("tess", "d2"),
- ("noosa", "n"),
- ("girl", set(["g1", "g2"])),
- ("boy", set(["b1", "b2"])),
- ("dog", set(["d1", "d2"])),
- ("bark", set(["d1", "d2"])),
- ("walk", set(["b1", "g2", "d1"])),
- ("chase", set([("b1", "g1"), ("b2", "g1"), ("g1", "d1"), ("g2", "d2")])),
+ ('john', 'b1'),
+ ('mary', 'g1'),
+ ('suzie', 'g2'),
+ ('fido', 'd1'),
+ ('tess', 'd2'),
+ ('noosa', 'n'),
+ ('girl', set(['g1', 'g2'])),
+ ('boy', set(['b1', 'b2'])),
+ ('dog', set(['d1', 'd2'])),
+ ('bark', set(['d1', 'd2'])),
+ ('walk', set(['b1', 'g2', 'd1'])),
+ ('chase', set([('b1', 'g1'), ('b2', 'g1'), ('g1', 'd1'), ('g2', 'd2')])),
(
- "see",
- set([("b1", "g1"), ("b2", "d2"), ("g1", "b1"), ("d2", "b1"), ("g2", "n")]),
+ 'see',
+ set([('b1', 'g1'), ('b2', 'd2'), ('g1', 'b1'), ('d2', 'b1'), ('g2', 'n')]),
),
- ("in", set([("b1", "n"), ("b2", "n"), ("d2", "n")])),
- ("with", set([("b1", "g1"), ("g1", "b1"), ("d1", "b1"), ("b1", "d1")])),
+ ('in', set([('b1', 'n'), ('b2', 'n'), ('d2', 'n')])),
+ ('with', set([('b1', 'g1'), ('g1', 'b1'), ('d1', 'b1'), ('b1', 'd1')])),
]
# Read in the data from ``v``
val = evaluate.Valuation(v)
g0 = evaluate.Assignment(dom)
-def read_sents(filename, encoding="utf8"):
- with codecs.open(filename, "r", encoding) as fp:
+def read_sents(filename, encoding='utf8'):
+ with codecs.open(filename, 'r', encoding) as fp:
sents = [l.rstrip() for l in fp]
# get rid of blank lines
sents = [l for l in sents if len(l) > 0]
- sents = [l for l in sents if not l[0] == "#"]
+ sents = [l for l in sents if not l[0] == '#']
return sents
)
print("Reading grammar: %s" % g)
print("*" * 20)
- for reading in interpret_sents(["hello"], g, semkey="sem"):
+ for reading in interpret_sents(['hello'], g, semkey='sem'):
syn, sem = reading[0]
print()
print("output: ", sem)
beta=True,
syntrace=0,
semtrace=0,
- demo="default",
- grammar="",
- sentences="",
+ demo='default',
+ grammar='',
+ sentences='',
)
opts.add_option(
(options, args) = opts.parse_args()
- SPACER = "-" * 30
+ SPACER = '-' * 30
demo_model0()
sents = [
- "Fido sees a boy with Mary",
- "John sees Mary",
- "every girl chases a dog",
- "every boy chases a girl",
- "John walks with a girl in Noosa",
- "who walks",
+ 'Fido sees a boy with Mary',
+ 'John sees Mary',
+ 'every girl chases a dog',
+ 'every boy chases a girl',
+ 'John walks with a girl in Noosa',
+ 'who walks',
]
- gramfile = "grammars/sample_grammars/sem2.fcfg"
+ gramfile = 'grammars/sample_grammars/sem2.fcfg'
if options.sentences:
sentsfile = options.sentences
for i, sent in enumerate(sents):
n = 1
- print("\nSentence: %s" % sent)
+ print('\nSentence: %s' % sent)
print(SPACER)
if options.evaluate:
for (syntree, semrep, value) in evaluations[i]:
if isinstance(value, dict):
value = set(value.keys())
- print("%d: %s" % (n, semrep))
+ print('%d: %s' % (n, semrep))
print(value)
n += 1
else:
for (syntree, semrep) in semreps[i]:
- print("%d: %s" % (n, semrep))
+ print('%d: %s' % (n, semrep))
n += 1
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Sentiment Analysis
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
# Natural Language Toolkit: Sentiment Analyzer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
purposes.
"""
-import sys
+from __future__ import print_function
from collections import defaultdict
from nltk.classify.util import apply_features, accuracy as eval_accuracy
from nltk.probability import FreqDist
+from nltk.sentiment.util import save_file, timer
+
class SentimentAnalyzer(object):
"""
print("Training classifier")
self.classifier = trainer(training_set, **kwargs)
if save_classifier:
- self.save_file(self.classifier, save_classifier)
+ save_file(self.classifier, save_classifier)
return self.classifier
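# Typical usage (a sketch; variable names are illustrative):
#     sentim_analyzer = SentimentAnalyzer()
#     training_set = sentim_analyzer.apply_features(training_docs)
#     classifier = sentim_analyzer.train(NaiveBayesClassifier.train, training_set)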
- def save_file(self, content, filename):
- """
- Store `content` in `filename`. Can be used to store a SentimentAnalyzer.
- """
- print("Saving", filename, file=sys.stderr)
- with open(filename, 'wb') as storage_file:
- # The protocol=2 parameter is for python2 compatibility
- pickle.dump(content, storage_file, protocol=2)
-
def evaluate(
self,
test_set,
metrics_results = {}
if accuracy == True:
accuracy_score = eval_accuracy(classifier, test_set)
- metrics_results["Accuracy"] = accuracy_score
+ metrics_results['Accuracy'] = accuracy_score
gold_results = defaultdict(set)
test_results = defaultdict(set)
precision_score = eval_precision(
gold_results[label], test_results[label]
)
- metrics_results["Precision [{0}]".format(label)] = precision_score
+ metrics_results['Precision [{0}]'.format(label)] = precision_score
if recall == True:
recall_score = eval_recall(gold_results[label], test_results[label])
- metrics_results["Recall [{0}]".format(label)] = recall_score
+ metrics_results['Recall [{0}]'.format(label)] = recall_score
if f_measure == True:
f_measure_score = eval_f_measure(
gold_results[label], test_results[label]
)
- metrics_results["F-measure [{0}]".format(label)] = f_measure_score
+ metrics_results['F-measure [{0}]'.format(label)] = f_measure_score
# Print evaluation results (in alphabetical order)
if verbose == True:
for result in sorted(metrics_results):
- print("{0}: {1}".format(result, metrics_results[result]))
+ print('{0}: {1}'.format(result, metrics_results[result]))
return metrics_results
#
# Natural Language Toolkit: Sentiment Analyzer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Utility methods for Sentiment Analysis.
"""
+from __future__ import division
import codecs
import csv
import sys
import time
from copy import deepcopy
+from itertools import tee
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader
NEGATION_RE = re.compile(NEGATION, re.VERBOSE)
-CLAUSE_PUNCT = r"^[.:;!?]$"
+CLAUSE_PUNCT = r'^[.:;!?]$'
CLAUSE_PUNCT_RE = re.compile(CLAUSE_PUNCT)
# Happy and sad emoticons
HAPPY = set(
[
- ":-)",
- ":)",
- ";)",
- ":o)",
- ":]",
- ":3",
- ":c)",
- ":>",
- "=]",
- "8)",
- "=)",
- ":}",
- ":^)",
- ":-D",
- ":D",
- "8-D",
- "8D",
- "x-D",
- "xD",
- "X-D",
- "XD",
- "=-D",
- "=D",
- "=-3",
- "=3",
- ":-))",
+ ':-)',
+ ':)',
+ ';)',
+ ':o)',
+ ':]',
+ ':3',
+ ':c)',
+ ':>',
+ '=]',
+ '8)',
+ '=)',
+ ':}',
+ ':^)',
+ ':-D',
+ ':D',
+ '8-D',
+ '8D',
+ 'x-D',
+ 'xD',
+ 'X-D',
+ 'XD',
+ '=-D',
+ '=D',
+ '=-3',
+ '=3',
+ ':-))',
":'-)",
":')",
- ":*",
- ":^*",
- ">:P",
- ":-P",
- ":P",
- "X-P",
- "x-p",
- "xp",
- "XP",
- ":-p",
- ":p",
- "=p",
- ":-b",
- ":b",
- ">:)",
- ">;)",
- ">:-)",
- "<3",
+ ':*',
+ ':^*',
+ '>:P',
+ ':-P',
+ ':P',
+ 'X-P',
+ 'x-p',
+ 'xp',
+ 'XP',
+ ':-p',
+ ':p',
+ '=p',
+ ':-b',
+ ':b',
+ '>:)',
+ '>;)',
+ '>:-)',
+ '<3',
]
)
SAD = set(
[
- ":L",
- ":-/",
- ">:/",
- ":S",
- ">:[",
- ":@",
- ":-(",
- ":[",
- ":-||",
- "=L",
- ":<",
- ":-[",
- ":-<",
- "=\\",
- "=/",
- ">:(",
- ":(",
- ">.<",
+ ':L',
+ ':-/',
+ '>:/',
+ ':S',
+ '>:[',
+ ':@',
+ ':-(',
+ ':[',
+ ':-||',
+ '=L',
+ ':<',
+ ':-[',
+ ':-<',
+ '=\\',
+ '=/',
+ '>:(',
+ ':(',
+ '>.<',
":'-(",
":'(",
- ":\\",
- ":-c",
- ":c",
- ":{",
- ">:\\",
- ";(",
+ ':\\',
+ ':-c',
+ ':c',
+ ':{',
+ '>:\\',
+ ';(',
]
)
# in Python 2.x round() will return a float, so we convert it to int
secs = int(round(tot_time % 60))
if hours == 0 and mins == 0 and secs < 10:
-        print("[TIMER] {0}(): {1:.3f} seconds".format(method.__name__, tot_time))
+        print('[TIMER] {0}(): {1:.3f} seconds'.format(method.__name__, tot_time))
else:
print(
- "[TIMER] {0}(): {1}h {2}m {3}s".format(
+ '[TIMER] {0}(): {1}h {2}m {3}s'.format(
method.__name__, hours, mins, secs
)
)
return timed
+def pairwise(iterable):
+ """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
+ a, b = tee(iterable)
+ next(b, None)
+ return zip(a, b)
+
+
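# Self-contained sketch of the sliding window produced by pairwise() above:
# tee() duplicates the iterator, next(b, None) advances the copy by one step,
# and zip() then yields overlapping pairs.
from itertools import tee

def _pairwise_sketch(iterable):
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)

assert list(_pairwise_sketch(['at', 'least', 'once'])) == [('at', 'least'), ('least', 'once')]
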
# ////////////////////////////////////////////////////////////
# { Feature extractor functions
# ////////////////////////////////////////////////////////////
if handle_negation:
document = mark_negation(document)
for word in unigrams:
- features["contains({0})".format(word)] = word in set(document)
+ features['contains({0})'.format(word)] = word in set(document)
return features
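
# Sketch of the feature dictionary built above (toy values, names illustrative):
# each candidate unigram maps to a boolean presence flag for the document.
_unigrams = ['good', 'terrible']
_document = ['a', 'good', 'movie']
_feats = {'contains({0})'.format(w): w in set(_document) for w in _unigrams}
assert _feats == {'contains(good)': True, 'contains(terrible)': False}
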
"""
features = {}
for bigr in bigrams:
- features["contains({0} - {1})".format(bigr[0], bigr[1])] = bigr in nltk.bigrams(
+ features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(
document
)
return features
neg_scope = not neg_scope
continue
else:
- doc[i] += "_NEG"
+ doc[i] += '_NEG'
elif neg_scope and CLAUSE_PUNCT_RE.search(word):
neg_scope = not neg_scope
elif neg_scope and not CLAUSE_PUNCT_RE.search(word):
- doc[i] += "_NEG"
+ doc[i] += '_NEG'
return document
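
# Sketch of the effect of the negation-scope loop above, following the example
# used in NLTK's own documentation for mark_negation(): every token between a
# negation word and the next clause punctuation receives a "_NEG" suffix.
_sent = "I didn't like this movie . It was bad .".split()
assert mark_negation(_sent) == [
    'I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.'
]
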
"""
Write the output of an analysis to a file.
"""
- with codecs.open(filename, "at") as outfile:
- text = "\n*** \n\n"
- text += "{0} \n\n".format(time.strftime("%d/%m/%Y, %H:%M"))
+ with codecs.open(filename, 'at') as outfile:
+ text = '\n*** \n\n'
+ text += '{0} \n\n'.format(time.strftime("%d/%m/%Y, %H:%M"))
for k in sorted(kwargs):
if isinstance(kwargs[k], dict):
dictionary = kwargs[k]
- text += " - **{0}:**\n".format(k)
+ text += ' - **{0}:**\n'.format(k)
for entry in sorted(dictionary):
- text += " - {0}: {1} \n".format(entry, dictionary[entry])
+ text += ' - {0}: {1} \n'.format(entry, dictionary[entry])
elif isinstance(kwargs[k], list):
- text += " - **{0}:**\n".format(k)
+ text += ' - **{0}:**\n'.format(k)
for entry in kwargs[k]:
- text += " - {0}\n".format(entry)
+ text += ' - {0}\n'.format(entry)
else:
- text += " - **{0}:** {1} \n".format(k, kwargs[k])
+ text += ' - **{0}:** {1} \n'.format(k, kwargs[k])
outfile.write(text)
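
# Sketch of the markdown produced by the loop above: keys are written in sorted
# order and lists/dicts become nested bullet items. Stand-alone toy version for
# a single dict-valued entry:
_k, _v = 'Results', {'Accuracy': 0.91}
_text = ' - **{0}:**\n'.format(_k)
for _entry in sorted(_v):
    _text += '   - {0}: {1} \n'.format(_entry, _v[_entry])
assert _text == ' - **Results:**\n   - Accuracy: 0.91 \n'
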
+def save_file(content, filename):
+ """
+ Store `content` in `filename`. Can be used to store a SentimentAnalyzer.
+ """
+ print("Saving", filename)
+ with codecs.open(filename, 'wb') as storage_file:
+ # The protocol=2 parameter is for python2 compatibility
+ pickle.dump(content, storage_file, protocol=2)
+
+
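# Minimal sketch of the round trip intended for save_file() above: anything
# picklable (here a toy dict standing in for a trained SentimentAnalyzer) can
# be restored later with pickle.load. The file name is illustrative only.
import pickle

save_file({'trained': True}, 'toy_analyzer.pickle')
with open('toy_analyzer.pickle', 'rb') as _storage:
    assert pickle.load(_storage) == {'trained': True}
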
def split_train_test(all_instances, n=None):
"""
Randomly split `n` instances of the dataset into train and test sets.
import matplotlib.pyplot as plt
except ImportError:
raise ImportError(
- "The plot function requires matplotlib to be installed."
- "See http://matplotlib.org/"
+ 'The plot function requires matplotlib to be installed.'
+ 'See http://matplotlib.org/'
)
- plt.locator_params(axis="y", nbins=3)
+ plt.locator_params(axis='y', nbins=3)
axes = plt.axes()
axes.yaxis.grid()
- plt.plot(x_values, y_values, "ro", color="red")
+ plt.plot(x_values, y_values, 'ro', color='red')
plt.ylim(ymin=-1.2, ymax=1.2)
plt.tight_layout(pad=5)
if x_labels:
- plt.xticks(x_values, x_labels, rotation="vertical")
+ plt.xticks(x_values, x_labels, rotation='vertical')
if y_labels:
- plt.yticks([-1, 0, 1], y_labels, rotation="horizontal")
+ plt.yticks([-1, 0, 1], y_labels, rotation='horizontal')
# Pad margins so that markers are not clipped by the axes
plt.margins(0.2)
plt.show()
json_file,
outfile,
fields,
- encoding="utf8",
- errors="replace",
+ encoding='utf8',
+ errors='replace',
gzip_compress=False,
skip_retweets=True,
skip_tongue_tweets=True,
subsets of the original tweets json data.
"""
with codecs.open(json_file, encoding=encoding) as fp:
- (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
+ (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
# write the list of fields as header
writer.writerow(fields)
tweet = json.loads(line)
row = extract_fields(tweet, fields)
try:
- text = row[fields.index("text")]
+ text = row[fields.index('text')]
# Remove retweets
if skip_retweets == True:
- if re.search(r"\bRT\b", text):
+ if re.search(r'\bRT\b', text):
continue
# Remove tweets containing ":P" and ":-P" emoticons
if skip_tongue_tweets == True:
- if re.search(r"\:\-?P\b", text):
+ if re.search(r'\:\-?P\b', text):
continue
# Remove tweets containing both happy and sad emoticons
if skip_ambiguous_tweets == True:
continue
# Strip off emoticons from all tweets
if strip_off_emoticons == True:
- row[fields.index("text")] = re.sub(
- r"(?!\n)\s+", " ", EMOTICON_RE.sub("", text)
+ row[fields.index('text')] = re.sub(
+ r'(?!\n)\s+', ' ', EMOTICON_RE.sub('', text)
)
# Remove duplicate tweets
if remove_duplicates == True:
- if row[fields.index("text")] in tweets_cache:
+ if row[fields.index('text')] in tweets_cache:
continue
else:
- tweets_cache.append(row[fields.index("text")])
+ tweets_cache.append(row[fields.index('text')])
except ValueError:
pass
writer.writerow(row)
"""
tweets = []
if not sent_tokenizer:
- sent_tokenizer = load("tokenizers/punkt/english.pickle")
-
- with codecs.open(filename, "rt") as csvfile:
- reader = csv.reader(csvfile)
- if skip_header == True:
- next(reader, None) # skip the header
- i = 0
- for tweet_id, text in reader:
- # text = text[1]
- i += 1
- sys.stdout.write("Loaded {0} tweets\r".format(i))
- # Apply sentence and word tokenizer to text
- if word_tokenizer:
- tweet = [
- w
- for sent in sent_tokenizer.tokenize(text)
- for w in word_tokenizer.tokenize(sent)
- ]
- else:
- tweet = text
- tweets.append((tweet, label))
-
+ sent_tokenizer = load('tokenizers/punkt/english.pickle')
+
+ # If we use Python3.x we can proceed using the 'rt' flag
+ if sys.version_info[0] == 3:
+ with codecs.open(filename, 'rt') as csvfile:
+ reader = csv.reader(csvfile)
+ if skip_header == True:
+ next(reader, None) # skip the header
+ i = 0
+ for tweet_id, text in reader:
+ # text = text[1]
+ i += 1
+ sys.stdout.write('Loaded {0} tweets\r'.format(i))
+ # Apply sentence and word tokenizer to text
+ if word_tokenizer:
+ tweet = [
+ w
+ for sent in sent_tokenizer.tokenize(text)
+ for w in word_tokenizer.tokenize(sent)
+ ]
+ else:
+ tweet = text
+ tweets.append((tweet, label))
+ # If we use Python2.x we need to handle encoding problems
+ elif sys.version_info[0] < 3:
+ with codecs.open(filename) as csvfile:
+ reader = csv.reader(csvfile)
+ if skip_header == True:
+ next(reader, None) # skip the header
+ i = 0
+ for row in reader:
+ unicode_row = [x.decode('utf8') for x in row]
+ text = unicode_row[1]
+ i += 1
+ sys.stdout.write('Loaded {0} tweets\r'.format(i))
+ # Apply sentence and word tokenizer to text
+ if word_tokenizer:
+ tweet = [
+ w.encode('utf8')
+ for sent in sent_tokenizer.tokenize(text)
+ for w in word_tokenizer.tokenize(sent)
+ ]
+ else:
+ tweet = text
+ tweets.append((tweet, label))
print("Loaded {0} tweets".format(i))
return tweets
if n_instances is not None:
n_instances = int(n_instances / 2)
- fields = ["id", "text"]
+ fields = ['id', 'text']
positive_json = twitter_samples.abspath("positive_tweets.json")
- positive_csv = "positive_tweets.csv"
+ positive_csv = 'positive_tweets.csv'
json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)
negative_json = twitter_samples.abspath("negative_tweets.json")
- negative_csv = "negative_tweets.csv"
+ negative_csv = 'negative_tweets.csv'
json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)
- neg_docs = parse_tweets_set(negative_csv, label="neg", word_tokenizer=tokenizer)
- pos_docs = parse_tweets_set(positive_csv, label="pos", word_tokenizer=tokenizer)
+ neg_docs = parse_tweets_set(negative_csv, label='neg', word_tokenizer=tokenizer)
+ pos_docs = parse_tweets_set(positive_csv, label='pos', word_tokenizer=tokenizer)
# We separately split subjective and objective instances to keep a balanced
# uniform class distribution in both train and test sets.
classifier.show_most_informative_features()
except AttributeError:
print(
- "Your classifier does not provide a show_most_informative_features() method."
+ 'Your classifier does not provide a show_most_informative_features() method.'
)
results = sentim_analyzer.evaluate(test_set)
extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
output_markdown(
output,
- Dataset="labeled_tweets",
+ Dataset='labeled_tweets',
Classifier=type(classifier).__name__,
Tokenizer=tokenizer.__class__.__name__,
Feats=extr,
n_instances = int(n_instances / 2)
pos_docs = [
- (list(movie_reviews.words(pos_id)), "pos")
- for pos_id in movie_reviews.fileids("pos")[:n_instances]
+ (list(movie_reviews.words(pos_id)), 'pos')
+ for pos_id in movie_reviews.fileids('pos')[:n_instances]
]
neg_docs = [
- (list(movie_reviews.words(neg_id)), "neg")
- for neg_id in movie_reviews.fileids("neg")[:n_instances]
+ (list(movie_reviews.words(neg_id)), 'neg')
+ for neg_id in movie_reviews.fileids('neg')[:n_instances]
]
# We separately split positive and negative instances to keep a balanced
# uniform class distribution in both train and test sets.
classifier.show_most_informative_features()
except AttributeError:
print(
- "Your classifier does not provide a show_most_informative_features() method."
+ 'Your classifier does not provide a show_most_informative_features() method.'
)
results = sentim_analyzer.evaluate(test_set)
extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
output_markdown(
output,
- Dataset="Movie_reviews",
+ Dataset='Movie_reviews',
Classifier=type(classifier).__name__,
- Tokenizer="WordPunctTokenizer",
+ Tokenizer='WordPunctTokenizer',
Feats=extr,
Results=results,
Instances=n_instances,
n_instances = int(n_instances / 2)
subj_docs = [
- (sent, "subj") for sent in subjectivity.sents(categories="subj")[:n_instances]
+ (sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]
]
obj_docs = [
- (sent, "obj") for sent in subjectivity.sents(categories="obj")[:n_instances]
+ (sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]
]
# We separately split subjective and objective instances to keep a balanced
classifier.show_most_informative_features()
except AttributeError:
print(
- "Your classifier does not provide a show_most_informative_features() method."
+ 'Your classifier does not provide a show_most_informative_features() method.'
)
results = sentim_analyzer.evaluate(test_set)
if save_analyzer == True:
- save_file(sentim_analyzer, "sa_subjectivity.pickle")
+ save_file(sentim_analyzer, 'sa_subjectivity.pickle')
if output:
extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
output_markdown(
output,
- Dataset="subjectivity",
+ Dataset='subjectivity',
Classifier=type(classifier).__name__,
- Tokenizer="WhitespaceTokenizer",
+ Tokenizer='WhitespaceTokenizer',
Feats=extr,
Instances=n_instances,
Results=results,
word_tokenizer = regexp.WhitespaceTokenizer()
try:
- sentim_analyzer = load("sa_subjectivity.pickle")
+ sentim_analyzer = load('sa_subjectivity.pickle')
except LookupError:
- print("Cannot find the sentiment analyzer you want to load.")
- print("Training a new one using NaiveBayesClassifier.")
+ print('Cannot find the sentiment analyzer you want to load.')
+ print('Training a new one using NaiveBayesClassifier.')
sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)
# Tokenize and convert to lower case
y.append(0) # neutral
if pos_words > neg_words:
- print("Positive")
+ print('Positive')
elif pos_words < neg_words:
- print("Negative")
+ print('Negative')
elif pos_words == neg_words:
- print("Neutral")
+ print('Neutral')
if plot == True:
_show_plot(
- x, y, x_labels=tokenized_sent, y_labels=["Negative", "Neutral", "Positive"]
+ x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive']
)
if n_instances is not None:
n_instances = int(n_instances / 2)
- fields = ["id", "text"]
+ fields = ['id', 'text']
positive_json = twitter_samples.abspath("positive_tweets.json")
- positive_csv = "positive_tweets.csv"
+ positive_csv = 'positive_tweets.csv'
json2csv_preprocess(
positive_json,
positive_csv,
)
negative_json = twitter_samples.abspath("negative_tweets.json")
- negative_csv = "negative_tweets.csv"
+ negative_csv = 'negative_tweets.csv'
json2csv_preprocess(
negative_json,
negative_csv,
limit=n_instances,
)
- pos_docs = parse_tweets_set(positive_csv, label="pos")
- neg_docs = parse_tweets_set(negative_csv, label="neg")
+ pos_docs = parse_tweets_set(positive_csv, label='pos')
+ neg_docs = parse_tweets_set(negative_csv, label='neg')
# We separately split subjective and objective instances to keep a balanced
# uniform class distribution in both train and test sets.
labels.add(label)
gold_results[label].add(i)
acc_gold_results.append(label)
- score = vader_analyzer.polarity_scores(text)["compound"]
+ score = vader_analyzer.polarity_scores(text)['compound']
if score > 0:
- observed = "pos"
+ observed = 'pos'
else:
- observed = "neg"
+ observed = 'neg'
num += 1
acc_test_results.append(observed)
test_results[observed].add(i)
metrics_results = {}
for label in labels:
accuracy_score = eval_accuracy(acc_gold_results, acc_test_results)
- metrics_results["Accuracy"] = accuracy_score
+ metrics_results['Accuracy'] = accuracy_score
precision_score = eval_precision(gold_results[label], test_results[label])
- metrics_results["Precision [{0}]".format(label)] = precision_score
+ metrics_results['Precision [{0}]'.format(label)] = precision_score
recall_score = eval_recall(gold_results[label], test_results[label])
- metrics_results["Recall [{0}]".format(label)] = recall_score
+ metrics_results['Recall [{0}]'.format(label)] = recall_score
f_measure_score = eval_f_measure(gold_results[label], test_results[label])
- metrics_results["F-measure [{0}]".format(label)] = f_measure_score
+ metrics_results['F-measure [{0}]'.format(label)] = f_measure_score
for result in sorted(metrics_results):
- print("{0}: {1}".format(result, metrics_results[result]))
+ print('{0}: {1}'.format(result, metrics_results[result]))
if output:
output_markdown(
output,
- Approach="Vader",
- Dataset="labeled_tweets",
+ Approach='Vader',
+ Dataset='labeled_tweets',
Instances=n_instances,
Results=metrics_results,
)
-if __name__ == "__main__":
+if __name__ == '__main__':
from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC
- from nltk.twitter.common import _outf_writer, extract_fields
+ from nltk.twitter.common import outf_writer_compat, extract_fields
naive_bayes = NaiveBayesClassifier.train
svm = SklearnClassifier(LinearSVC()).train
# coding: utf-8
# Natural Language Toolkit: vader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: C.J. Hutto <Clayton.Hutto@gtri.gatech.edu>
# Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
# Pierpaolo Pantone <24alsecondo@gmail.com> (modifications)
# George Berry <geb97@cornell.edu> (modifications)
-# Malavika Suresh <malavika.suresh0794@gmail.com> (modifications)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
import re
import string
from itertools import product
-
import nltk.data
-from nltk.util import pairwise
-
-class VaderConstants:
+from .util import pairwise
+
+##Constants##
+
+# (empirically derived mean sentiment intensity rating increase for booster words)
+B_INCR = 0.293
+B_DECR = -0.293
+
+# (empirically derived mean sentiment intensity rating increase for using
+# ALLCAPs to emphasize a word)
+C_INCR = 0.733
+
+N_SCALAR = -0.74
+
+# for removing punctuation
+REGEX_REMOVE_PUNCTUATION = re.compile('[{0}]'.format(re.escape(string.punctuation)))
+
+PUNC_LIST = [
+ ".",
+ "!",
+ "?",
+ ",",
+ ";",
+ ":",
+ "-",
+ "'",
+ "\"",
+ "!!",
+ "!!!",
+ "??",
+ "???",
+ "?!?",
+ "!?!",
+ "?!?!",
+ "!?!?",
+]
+NEGATE = {
+ "aint",
+ "arent",
+ "cannot",
+ "cant",
+ "couldnt",
+ "darent",
+ "didnt",
+ "doesnt",
+ "ain't",
+ "aren't",
+ "can't",
+ "couldn't",
+ "daren't",
+ "didn't",
+ "doesn't",
+ "dont",
+ "hadnt",
+ "hasnt",
+ "havent",
+ "isnt",
+ "mightnt",
+ "mustnt",
+ "neither",
+ "don't",
+ "hadn't",
+ "hasn't",
+ "haven't",
+ "isn't",
+ "mightn't",
+ "mustn't",
+ "neednt",
+ "needn't",
+ "never",
+ "none",
+ "nope",
+ "nor",
+ "not",
+ "nothing",
+ "nowhere",
+ "oughtnt",
+ "shant",
+ "shouldnt",
+ "uhuh",
+ "wasnt",
+ "werent",
+ "oughtn't",
+ "shan't",
+ "shouldn't",
+ "uh-uh",
+ "wasn't",
+ "weren't",
+ "without",
+ "wont",
+ "wouldnt",
+ "won't",
+ "wouldn't",
+ "rarely",
+ "seldom",
+ "despite",
+}
+
+# booster/dampener 'intensifiers' or 'degree adverbs'
+# http://en.wiktionary.org/wiki/Category:English_degree_adverbs
+
+BOOSTER_DICT = {
+ "absolutely": B_INCR,
+ "amazingly": B_INCR,
+ "awfully": B_INCR,
+ "completely": B_INCR,
+ "considerably": B_INCR,
+ "decidedly": B_INCR,
+ "deeply": B_INCR,
+ "effing": B_INCR,
+ "enormously": B_INCR,
+ "entirely": B_INCR,
+ "especially": B_INCR,
+ "exceptionally": B_INCR,
+ "extremely": B_INCR,
+ "fabulously": B_INCR,
+ "flipping": B_INCR,
+ "flippin": B_INCR,
+ "fricking": B_INCR,
+ "frickin": B_INCR,
+ "frigging": B_INCR,
+ "friggin": B_INCR,
+ "fully": B_INCR,
+ "fucking": B_INCR,
+ "greatly": B_INCR,
+ "hella": B_INCR,
+ "highly": B_INCR,
+ "hugely": B_INCR,
+ "incredibly": B_INCR,
+ "intensely": B_INCR,
+ "majorly": B_INCR,
+ "more": B_INCR,
+ "most": B_INCR,
+ "particularly": B_INCR,
+ "purely": B_INCR,
+ "quite": B_INCR,
+ "really": B_INCR,
+ "remarkably": B_INCR,
+ "so": B_INCR,
+ "substantially": B_INCR,
+ "thoroughly": B_INCR,
+ "totally": B_INCR,
+ "tremendously": B_INCR,
+ "uber": B_INCR,
+ "unbelievably": B_INCR,
+ "unusually": B_INCR,
+ "utterly": B_INCR,
+ "very": B_INCR,
+ "almost": B_DECR,
+ "barely": B_DECR,
+ "hardly": B_DECR,
+ "just enough": B_DECR,
+ "kind of": B_DECR,
+ "kinda": B_DECR,
+ "kindof": B_DECR,
+ "kind-of": B_DECR,
+ "less": B_DECR,
+ "little": B_DECR,
+ "marginally": B_DECR,
+ "occasionally": B_DECR,
+ "partly": B_DECR,
+ "scarcely": B_DECR,
+ "slightly": B_DECR,
+ "somewhat": B_DECR,
+ "sort of": B_DECR,
+ "sorta": B_DECR,
+ "sortof": B_DECR,
+ "sort-of": B_DECR,
+}
+
+# check for special case idioms using a sentiment-laden keyword known to SAGE
+SPECIAL_CASE_IDIOMS = {
+ "the shit": 3,
+ "the bomb": 3,
+ "bad ass": 1.5,
+ "yeah right": -2,
+ "cut the mustard": 2,
+ "kiss of death": -1.5,
+ "hand to mouth": -2,
+}
+
+
+##Static methods##
+
+
+def negated(input_words, include_nt=True):
"""
- A class to keep the Vader lists and constants.
+ Determine if input contains negation words
"""
- ##Constants##
- # (empirically derived mean sentiment intensity rating increase for booster words)
- B_INCR = 0.293
- B_DECR = -0.293
-
- # (empirically derived mean sentiment intensity rating increase for using
- # ALLCAPs to emphasize a word)
- C_INCR = 0.733
-
- N_SCALAR = -0.74
-
- NEGATE = {
- "aint",
- "arent",
- "cannot",
- "cant",
- "couldnt",
- "darent",
- "didnt",
- "doesnt",
- "ain't",
- "aren't",
- "can't",
- "couldn't",
- "daren't",
- "didn't",
- "doesn't",
- "dont",
- "hadnt",
- "hasnt",
- "havent",
- "isnt",
- "mightnt",
- "mustnt",
- "neither",
- "don't",
- "hadn't",
- "hasn't",
- "haven't",
- "isn't",
- "mightn't",
- "mustn't",
- "neednt",
- "needn't",
- "never",
- "none",
- "nope",
- "nor",
- "not",
- "nothing",
- "nowhere",
- "oughtnt",
- "shant",
- "shouldnt",
- "uhuh",
- "wasnt",
- "werent",
- "oughtn't",
- "shan't",
- "shouldn't",
- "uh-uh",
- "wasn't",
- "weren't",
- "without",
- "wont",
- "wouldnt",
- "won't",
- "wouldn't",
- "rarely",
- "seldom",
- "despite",
- }
-
- # booster/dampener 'intensifiers' or 'degree adverbs'
- # http://en.wiktionary.org/wiki/Category:English_degree_adverbs
-
- BOOSTER_DICT = {
- "absolutely": B_INCR,
- "amazingly": B_INCR,
- "awfully": B_INCR,
- "completely": B_INCR,
- "considerably": B_INCR,
- "decidedly": B_INCR,
- "deeply": B_INCR,
- "effing": B_INCR,
- "enormously": B_INCR,
- "entirely": B_INCR,
- "especially": B_INCR,
- "exceptionally": B_INCR,
- "extremely": B_INCR,
- "fabulously": B_INCR,
- "flipping": B_INCR,
- "flippin": B_INCR,
- "fricking": B_INCR,
- "frickin": B_INCR,
- "frigging": B_INCR,
- "friggin": B_INCR,
- "fully": B_INCR,
- "fucking": B_INCR,
- "greatly": B_INCR,
- "hella": B_INCR,
- "highly": B_INCR,
- "hugely": B_INCR,
- "incredibly": B_INCR,
- "intensely": B_INCR,
- "majorly": B_INCR,
- "more": B_INCR,
- "most": B_INCR,
- "particularly": B_INCR,
- "purely": B_INCR,
- "quite": B_INCR,
- "really": B_INCR,
- "remarkably": B_INCR,
- "so": B_INCR,
- "substantially": B_INCR,
- "thoroughly": B_INCR,
- "totally": B_INCR,
- "tremendously": B_INCR,
- "uber": B_INCR,
- "unbelievably": B_INCR,
- "unusually": B_INCR,
- "utterly": B_INCR,
- "very": B_INCR,
- "almost": B_DECR,
- "barely": B_DECR,
- "hardly": B_DECR,
- "just enough": B_DECR,
- "kind of": B_DECR,
- "kinda": B_DECR,
- "kindof": B_DECR,
- "kind-of": B_DECR,
- "less": B_DECR,
- "little": B_DECR,
- "marginally": B_DECR,
- "occasionally": B_DECR,
- "partly": B_DECR,
- "scarcely": B_DECR,
- "slightly": B_DECR,
- "somewhat": B_DECR,
- "sort of": B_DECR,
- "sorta": B_DECR,
- "sortof": B_DECR,
- "sort-of": B_DECR,
- }
-
- # check for special case idioms using a sentiment-laden keyword known to SAGE
- SPECIAL_CASE_IDIOMS = {
- "the shit": 3,
- "the bomb": 3,
- "bad ass": 1.5,
- "yeah right": -2,
- "cut the mustard": 2,
- "kiss of death": -1.5,
- "hand to mouth": -2,
- }
-
- # for removing punctuation
- REGEX_REMOVE_PUNCTUATION = re.compile("[{0}]".format(re.escape(string.punctuation)))
-
- PUNC_LIST = [
- ".",
- "!",
- "?",
- ",",
- ";",
- ":",
- "-",
- "'",
- '"',
- "!!",
- "!!!",
- "??",
- "???",
- "?!?",
- "!?!",
- "?!?!",
- "!?!?",
- ]
-
- def __init__(self):
- pass
-
- def negated(self, input_words, include_nt=True):
- """
- Determine if input contains negation words
- """
- neg_words = self.NEGATE
- if any(word.lower() in neg_words for word in input_words):
+ neg_words = NEGATE
+ if any(word.lower() in neg_words for word in input_words):
+ return True
+ if include_nt:
+ if any("n't" in word.lower() for word in input_words):
return True
- if include_nt:
- if any("n't" in word.lower() for word in input_words):
- return True
- for first, second in pairwise(input_words):
- if second.lower() == "least" and first.lower() != "at":
- return True
- return False
-
- def normalize(self, score, alpha=15):
- """
- Normalize the score to be between -1 and 1 using an alpha that
- approximates the max expected value
- """
- norm_score = score / math.sqrt((score * score) + alpha)
- return norm_score
+ for first, second in pairwise(input_words):
+ if second.lower() == "least" and first.lower() != 'at':
+ return True
+ return False
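
# Sketch of negated() above: any NEGATE word, any "n't" token (when include_nt
# is True), or a "least" that is not preceded by "at" counts as negation.
assert negated(['this', 'is', 'not', 'good'])        # 'not' is in NEGATE
assert negated(['I', 'least', 'liked', 'it'])        # 'least' not after 'at'
assert not negated(['at', 'least', 'it', 'ended'])   # 'at least' is allowed
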
- def scalar_inc_dec(self, word, valence, is_cap_diff):
- """
- Check if the preceding words increase, decrease, or negate/nullify the
- valence
- """
- scalar = 0.0
- word_lower = word.lower()
- if word_lower in self.BOOSTER_DICT:
- scalar = self.BOOSTER_DICT[word_lower]
- if valence < 0:
- scalar *= -1
- # check if booster/dampener word is in ALLCAPS (while others aren't)
- if word.isupper() and is_cap_diff:
- if valence > 0:
- scalar += self.C_INCR
- else:
- scalar -= self.C_INCR
- return scalar
+def normalize(score, alpha=15):
+ """
+ Normalize the score to be between -1 and 1 using an alpha that
+ approximates the max expected value
+ """
+ norm_score = score / math.sqrt((score * score) + alpha)
+ return norm_score
+
+
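# Worked example for normalize() above: with the default alpha of 15, a raw
# valence sum of 4 maps to 4 / sqrt(4*4 + 15) = 4 / sqrt(31), roughly 0.718,
# keeping the compound score strictly inside (-1, 1).
import math

assert abs(normalize(4) - 4 / math.sqrt(31)) < 1e-12
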
+def allcap_differential(words):
+ """
+ Check whether just some words in the input are ALL CAPS
+
+ :param list words: The words to inspect
+ :returns: `True` if some but not all items in `words` are ALL CAPS
+ """
+ is_different = False
+ allcap_words = 0
+ for word in words:
+ if word.isupper():
+ allcap_words += 1
+ cap_differential = len(words) - allcap_words
+ if 0 < cap_differential < len(words):
+ is_different = True
+ return is_different
+
+
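# Sketch of allcap_differential() above: True only for mixed casing, i.e. when
# some but not all tokens are ALL CAPS.
assert allcap_differential(['GREAT', 'movie'])        # mixed -> emphasis signal
assert not allcap_differential(['GREAT', 'MOVIE'])    # all caps -> no contrast
assert not allcap_differential(['great', 'movie'])    # no caps -> no contrast
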
+def scalar_inc_dec(word, valence, is_cap_diff):
+ """
+ Check if the preceding words increase, decrease, or negate/nullify the
+ valence
+ """
+ scalar = 0.0
+ word_lower = word.lower()
+ if word_lower in BOOSTER_DICT:
+ scalar = BOOSTER_DICT[word_lower]
+ if valence < 0:
+ scalar *= -1
+ # check if booster/dampener word is in ALLCAPS (while others aren't)
+ if word.isupper() and is_cap_diff:
+ if valence > 0:
+ scalar += C_INCR
+ else:
+ scalar -= C_INCR
+ return scalar
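
# Sketch of scalar_inc_dec() above: a booster such as 'really' shifts the
# valence by B_INCR (0.293), the shift flips sign for negative valence, and an
# ALL-CAPS booster in mixed-case text gains an extra C_INCR (0.733).
assert scalar_inc_dec('really', 2.0, False) == B_INCR
assert scalar_inc_dec('really', -2.0, False) == -B_INCR
assert scalar_inc_dec('REALLY', 2.0, True) == B_INCR + C_INCR
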
-class SentiText:
+class SentiText(object):
"""
Identify sentiment-relevant string-level properties of input text.
"""
- def __init__(self, text, punc_list, regex_remove_punctuation):
+ def __init__(self, text):
if not isinstance(text, str):
- text = str(text.encode("utf-8"))
+ text = str(text.encode('utf-8'))
self.text = text
- self.PUNC_LIST = punc_list
- self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation
self.words_and_emoticons = self._words_and_emoticons()
- # doesn't separate words from
+        # doesn't separate words from
# adjacent punctuation (keeps emoticons & contractions)
- self.is_cap_diff = self.allcap_differential(self.words_and_emoticons)
+ self.is_cap_diff = allcap_differential(self.words_and_emoticons)
def _words_plus_punc(self):
"""
',cat': 'cat',
}
"""
- no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text)
+ no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
# removes punctuation (but loses emoticons & contractions)
words_only = no_punc_text.split()
# remove singletons
words_only = set(w for w in words_only if len(w) > 1)
# the product gives ('cat', ',') and (',', 'cat')
- punc_before = {"".join(p): p[1] for p in product(self.PUNC_LIST, words_only)}
- punc_after = {"".join(p): p[0] for p in product(words_only, self.PUNC_LIST)}
+ punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
+ punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}
words_punc_dict = punc_before
words_punc_dict.update(punc_after)
return words_punc_dict
wes[i] = words_punc_dict[we]
return wes
- def allcap_differential(self, words):
- """
- Check whether just some words in the input are ALL CAPS
- :param list words: The words to inspect
- :returns: `True` if some but not all items in `words` are ALL CAPS
- """
- is_different = False
- allcap_words = 0
- for word in words:
- if word.isupper():
- allcap_words += 1
- cap_differential = len(words) - allcap_words
- if 0 < cap_differential < len(words):
- is_different = True
- return is_different
-
-
-class SentimentIntensityAnalyzer:
+class SentimentIntensityAnalyzer(object):
"""
Give a sentiment intensity score to sentences.
"""
def __init__(
- self, lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt",
+ self, lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt"
):
self.lexicon_file = nltk.data.load(lexicon_file)
self.lexicon = self.make_lex_dict()
- self.constants = VaderConstants()
def make_lex_dict(self):
"""
Convert lexicon file to a dictionary
"""
lex_dict = {}
- for line in self.lexicon_file.split("\n"):
- (word, measure) = line.strip().split("\t")[0:2]
+ for line in self.lexicon_file.split('\n'):
+ (word, measure) = line.strip().split('\t')[0:2]
lex_dict[word] = float(measure)
return lex_dict
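
# Sketch of the lexicon format parsed by make_lex_dict() above: tab-separated
# lines with the word first and its mean valence second; only the first two
# fields are used. The toy values below are invented for illustration.
_toy_lexicon = 'good\t1.9\t0.9\t[2, 2, 2]\nawful\t-2.0\t0.5\t[-2, -2, -2]'
_lex = {}
for _line in _toy_lexicon.split('\n'):
    _word, _measure = _line.strip().split('\t')[0:2]
    _lex[_word] = float(_measure)
assert _lex == {'good': 1.9, 'awful': -2.0}
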
Positive values are positive valence, negative value are negative
valence.
"""
+ sentitext = SentiText(text)
# text, words_and_emoticons, is_cap_diff = self.preprocess(text)
- sentitext = SentiText(text, self.constants.PUNC_LIST,
- self.constants.REGEX_REMOVE_PUNCTUATION)
+
sentiments = []
words_and_emoticons = sentitext.words_and_emoticons
for item in words_and_emoticons:
i < len(words_and_emoticons) - 1
and item.lower() == "kind"
and words_and_emoticons[i + 1].lower() == "of"
- ) or item.lower() in self.constants.BOOSTER_DICT:
+ ) or item.lower() in BOOSTER_DICT:
sentiments.append(valence)
continue
# check if sentiment laden word is in ALL CAPS (while others aren't)
if item.isupper() and is_cap_diff:
if valence > 0:
- valence += self.constants.C_INCR
+ valence += C_INCR
else:
- valence -= self.constants.C_INCR
+ valence -= C_INCR
for start_i in range(0, 3):
if (
# dampen the scalar modifier of preceding words and emoticons
                    # (excluding the ones that immediately precede the item) based
# on their distance from the current item.
- s = self.constants.scalar_inc_dec(
+ s = scalar_inc_dec(
words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
)
if start_i == 1 and s != 0:
words_and_emoticons[i - 2].lower() != "at"
and words_and_emoticons[i - 2].lower() != "very"
):
- valence = valence * self.constants.N_SCALAR
+ valence = valence * N_SCALAR
elif (
i > 0
and words_and_emoticons[i - 1].lower() not in self.lexicon
and words_and_emoticons[i - 1].lower() == "least"
):
- valence = valence * self.constants.N_SCALAR
+ valence = valence * N_SCALAR
return valence
def _but_check(self, words_and_emoticons, sentiments):
- but = {"but", "BUT"} & set(words_and_emoticons)
- if but:
- bi = words_and_emoticons.index(next(iter(but)))
- for sidx, sentiment in enumerate(sentiments):
- if sidx < bi:
- sentiments[sidx] = sentiment * 0.5
- elif sidx > bi:
- sentiments[sidx] = sentiment * 1.5
+ # check for modification in sentiment due to contrastive conjunction 'but'
+ if 'but' in words_and_emoticons or 'BUT' in words_and_emoticons:
+ try:
+ bi = words_and_emoticons.index('but')
+ except ValueError:
+ bi = words_and_emoticons.index('BUT')
+ for sentiment in sentiments:
+ si = sentiments.index(sentiment)
+ if si < bi:
+ sentiments.pop(si)
+ sentiments.insert(si, sentiment * 0.5)
+ elif si > bi:
+ sentiments.pop(si)
+ sentiments.insert(si, sentiment * 1.5)
return sentiments
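
# Stand-alone sketch of the 'but' rule implemented above: valences before the
# conjunction are dampened (x 0.5) and valences after it are amplified (x 1.5),
# so the clause following 'but' dominates the sentence score. Toy numbers:
_words = ['great', 'but', 'awful']
_valences = [3.0, 0.0, -2.0]
_bi = _words.index('but')
_valences = [v * 0.5 if i < _bi else v * 1.5 if i > _bi else v
             for i, v in enumerate(_valences)]
assert _valences == [1.5, 0.0, -3.0]
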
def _idioms_check(self, valence, words_and_emoticons, i):
sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]
for seq in sequences:
- if seq in self.constants.SPECIAL_CASE_IDIOMS:
- valence = self.constants.SPECIAL_CASE_IDIOMS[seq]
+ if seq in SPECIAL_CASE_IDIOMS:
+ valence = SPECIAL_CASE_IDIOMS[seq]
break
if len(words_and_emoticons) - 1 > i:
zeroone = "{0} {1}".format(
words_and_emoticons[i], words_and_emoticons[i + 1]
)
- if zeroone in self.constants.SPECIAL_CASE_IDIOMS:
- valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone]
+ if zeroone in SPECIAL_CASE_IDIOMS:
+ valence = SPECIAL_CASE_IDIOMS[zeroone]
if len(words_and_emoticons) - 1 > i + 1:
zeroonetwo = "{0} {1} {2}".format(
words_and_emoticons[i],
words_and_emoticons[i + 1],
words_and_emoticons[i + 2],
)
- if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS:
- valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo]
+ if zeroonetwo in SPECIAL_CASE_IDIOMS:
+ valence = SPECIAL_CASE_IDIOMS[zeroonetwo]
# check for booster/dampener bi-grams such as 'sort of' or 'kind of'
- if threetwo in self.constants.BOOSTER_DICT or twoone in self.constants.BOOSTER_DICT:
- valence = valence + self.constants.B_DECR
+ if threetwo in BOOSTER_DICT or twoone in BOOSTER_DICT:
+ valence = valence + B_DECR
return valence
def _never_check(self, valence, words_and_emoticons, start_i, i):
if start_i == 0:
- if self.constants.negated([words_and_emoticons[i - 1]]):
- valence = valence * self.constants.N_SCALAR
+ if negated([words_and_emoticons[i - 1]]):
+ valence = valence * N_SCALAR
if start_i == 1:
if words_and_emoticons[i - 2] == "never" and (
words_and_emoticons[i - 1] == "so"
or words_and_emoticons[i - 1] == "this"
):
valence = valence * 1.5
- elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
- valence = valence * self.constants.N_SCALAR
+ elif negated([words_and_emoticons[i - (start_i + 1)]]):
+ valence = valence * N_SCALAR
if start_i == 2:
if (
words_and_emoticons[i - 3] == "never"
)
):
valence = valence * 1.25
- elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
- valence = valence * self.constants.N_SCALAR
+ elif negated([words_and_emoticons[i - (start_i + 1)]]):
+ valence = valence * N_SCALAR
return valence
def _punctuation_emphasis(self, sum_s, text):
elif sum_s < 0:
sum_s -= punct_emph_amplifier
- compound = self.constants.normalize(sum_s)
+ compound = normalize(sum_s)
# discriminate between positive, negative and neutral sentiment scores
pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)
# Natural Language Toolkit: Stemmers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Natural Language Toolkit: Stemmer Interface
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# For license information, see LICENSE.TXT
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
-class StemmerI(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class StemmerI(object):
"""
A processing interface for removing morphological affixes from
words. This process is known as stemming.
#
# Natural Language Toolkit: ARLSTem Stemmer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
#
# Author: Kheireddine Abainia (x-programer) <k.abainia@gmail.com>
# Algorithms: Kheireddine Abainia <k.abainia@gmail.com>
ARLSTem is promising and produces high performance. This stemmer is not
based on any dictionary and can be used on-line effectively.
"""
+from __future__ import unicode_literals
import re
from nltk.stem.api import StemmerI
class ARLSTem(StemmerI):
- """
+ '''
ARLSTem stemmer : a light Arabic Stemming algorithm without any dictionary.
Department of Telecommunication & Information Processing. USTHB University,
Algiers, Algeria.
ARLSTem.stem(token) returns the Arabic stem for the input token.
The ARLSTem Stemmer requires that all tokens are encoded using Unicode
encoding.
- """
+ '''
def __init__(self):
# different Alif with hamza
- self.re_hamzated_alif = re.compile(r"[\u0622\u0623\u0625]")
- self.re_alifMaqsura = re.compile(r"[\u0649]")
- self.re_diacritics = re.compile(r"[\u064B-\u065F]")
+ self.re_hamzated_alif = re.compile(r'[\u0622\u0623\u0625]')
+ self.re_alifMaqsura = re.compile(r'[\u0649]')
+ self.re_diacritics = re.compile(r'[\u064B-\u065F]')
# Alif Laam, Laam Laam, Fa Laam, Fa Ba
- self.pr2 = ["\u0627\u0644", "\u0644\u0644", "\u0641\u0644", "\u0641\u0628"]
+ self.pr2 = ['\u0627\u0644', '\u0644\u0644', '\u0641\u0644', '\u0641\u0628']
# Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam
- self.pr3 = ["\u0628\u0627\u0644", "\u0643\u0627\u0644", "\u0648\u0627\u0644"]
+ self.pr3 = ['\u0628\u0627\u0644', '\u0643\u0627\u0644', '\u0648\u0627\u0644']
# Fa Laam Laam, Waaw Laam Laam
- self.pr32 = ["\u0641\u0644\u0644", "\u0648\u0644\u0644"]
+ self.pr32 = ['\u0641\u0644\u0644', '\u0648\u0644\u0644']
# Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam
self.pr4 = [
- "\u0641\u0628\u0627\u0644",
- "\u0648\u0628\u0627\u0644",
- "\u0641\u0643\u0627\u0644",
+ '\u0641\u0628\u0627\u0644',
+ '\u0648\u0628\u0627\u0644',
+ '\u0641\u0643\u0627\u0644',
]
# Kaf Yaa, Kaf Miim
- self.su2 = ["\u0643\u064A", "\u0643\u0645"]
+ self.su2 = ['\u0643\u064A', '\u0643\u0645']
# Ha Alif, Ha Miim
- self.su22 = ["\u0647\u0627", "\u0647\u0645"]
+ self.su22 = ['\u0647\u0627', '\u0647\u0645']
# Kaf Miim Alif, Kaf Noon Shadda
- self.su3 = ["\u0643\u0645\u0627", "\u0643\u0646\u0651"]
+ self.su3 = ['\u0643\u0645\u0627', '\u0643\u0646\u0651']
# Ha Miim Alif, Ha Noon Shadda
- self.su32 = ["\u0647\u0645\u0627", "\u0647\u0646\u0651"]
+ self.su32 = ['\u0647\u0645\u0627', '\u0647\u0646\u0651']
# Alif Noon, Ya Noon, Waaw Noon
- self.pl_si2 = ["\u0627\u0646", "\u064A\u0646", "\u0648\u0646"]
+ self.pl_si2 = ['\u0627\u0646', '\u064A\u0646', '\u0648\u0646']
# Taa Alif Noon, Taa Ya Noon
- self.pl_si3 = ["\u062A\u0627\u0646", "\u062A\u064A\u0646"]
+ self.pl_si3 = ['\u062A\u0627\u0646', '\u062A\u064A\u0646']
# Alif Noon, Waaw Noon
- self.verb_su2 = ["\u0627\u0646", "\u0648\u0646"]
+ self.verb_su2 = ['\u0627\u0646', '\u0648\u0646']
# Siin Taa, Siin Yaa
- self.verb_pr2 = ["\u0633\u062A", "\u0633\u064A"]
+ self.verb_pr2 = ['\u0633\u062A', '\u0633\u064A']
# Siin Alif, Siin Noon
- self.verb_pr22 = ["\u0633\u0627", "\u0633\u0646"]
+ self.verb_pr22 = ['\u0633\u0627', '\u0633\u0646']
# Lam Noon, Lam Taa, Lam Yaa, Lam Hamza
self.verb_pr33 = [
- "\u0644\u0646",
- "\u0644\u062A",
- "\u0644\u064A",
- "\u0644\u0623",
+ '\u0644\u0646',
+ '\u0644\u062A',
+ '\u0644\u064A',
+ '\u0644\u0623',
]
# Taa Miim Alif, Taa Noon Shadda
- self.verb_suf3 = ["\u062A\u0645\u0627", "\u062A\u0646\u0651"]
+ self.verb_suf3 = ['\u062A\u0645\u0627', '\u062A\u0646\u0651']
# Noon Alif, Taa Miim, Taa Alif, Waaw Alif
self.verb_suf2 = [
- "\u0646\u0627",
- "\u062A\u0645",
- "\u062A\u0627",
- "\u0648\u0627",
+ '\u0646\u0627',
+ '\u062A\u0645',
+ '\u062A\u0627',
+ '\u0648\u0627',
]
# Taa, Alif, Noon
- self.verb_suf1 = ["\u062A", "\u0627", "\u0646"]
+ self.verb_suf1 = ['\u062A', '\u0627', '\u0646']
def stem(self, token):
"""
beginning.
"""
# strip Arabic diacritics
- token = self.re_diacritics.sub("", token)
+ token = self.re_diacritics.sub('', token)
# replace Hamzated Alif with Alif bare
- token = self.re_hamzated_alif.sub("\u0627", token)
+ token = self.re_hamzated_alif.sub('\u0627', token)
# replace alifMaqsura with Yaa
- token = self.re_alifMaqsura.sub("\u064A", token)
+ token = self.re_alifMaqsura.sub('\u064A', token)
# strip the Waaw from the word beginning if the remaining is 3 letters
# at least
- if token.startswith("\u0648") and len(token) > 3:
+ if token.startswith('\u0648') and len(token) > 3:
token = token[1:]
return token
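
# Worked example of the normalisation regexes above (stand-alone, not calling
# the class): the character class [\u064B-\u065F] strips Arabic diacritics and
# the hamzated Alif forms collapse to a bare Alif.
import re

assert re.sub(r'[\u064B-\u065F]', '', '\u0643\u064E\u062A\u064E\u0628\u064E') == '\u0643\u062A\u0628'
assert re.sub(r'[\u0622\u0623\u0625]', '\u0627', '\u0623\u0643\u0644') == '\u0627\u0643\u0644'
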
"""
remove suffixes from the word's end.
"""
- if token.endswith("\u0643") and len(token) > 3:
+ if token.endswith('\u0643') and len(token) > 3:
return token[:-1]
if len(token) > 4:
for s2 in self.su2:
for s3 in self.su3:
if token.endswith(s3):
return token[:-3]
- if token.endswith("\u0647") and len(token) > 3:
+ if token.endswith('\u0647') and len(token) > 3:
token = token[:-1]
return token
if len(token) > 4:
for s3 in self.su32:
if token.endswith(s3):
return token[:-3]
- if token.endswith("\u0646\u0627") and len(token) > 4:
+ if token.endswith('\u0646\u0627') and len(token) > 4:
return token[:-2]
return token
"""
transform the word from the feminine form to the masculine form.
"""
- if token.endswith("\u0629") and len(token) > 3:
+ if token.endswith('\u0629') and len(token) > 3:
return token[:-1]
def plur2sing(self, token):
for ps3 in self.pl_si3:
if token.endswith(ps3):
return token[:-3]
- if len(token) > 3 and token.endswith("\u0627\u062A"):
+ if len(token) > 3 and token.endswith('\u0627\u062A'):
return token[:-2]
- if len(token) > 3 and token.startswith("\u0627") and token[2] == "\u0627":
+ if len(token) > 3 and token.startswith('\u0627') and token[2] == '\u0627':
return token[:2] + token[3:]
- if len(token) > 4 and token.startswith("\u0627") and token[-2] == "\u0627":
+ if len(token) > 4 and token.startswith('\u0627') and token[-2] == '\u0627':
return token[1:-2] + token[-1]
def verb(self, token):
"""
stem the present prefixes and suffixes
"""
- if len(token) > 5 and token.startswith("\u062A"): # Taa
+ if len(token) > 5 and token.startswith('\u062A'): # Taa
for s2 in self.pl_si2:
if token.endswith(s2):
return token[1:-2]
- if len(token) > 5 and token.startswith("\u064A"): # Yaa
+ if len(token) > 5 and token.startswith('\u064A'): # Yaa
for s2 in self.verb_su2:
if token.endswith(s2):
return token[1:-2]
- if len(token) > 4 and token.startswith("\u0627"): # Alif
+ if len(token) > 4 and token.startswith('\u0627'): # Alif
# Waaw Alif
- if len(token) > 5 and token.endswith("\u0648\u0627"):
+ if len(token) > 5 and token.endswith('\u0648\u0627'):
return token[1:-2]
# Yaa
- if token.endswith("\u064A"):
+ if token.endswith('\u064A'):
return token[1:-1]
# Alif
- if token.endswith("\u0627"):
+ if token.endswith('\u0627'):
return token[1:-1]
# Noon
- if token.endswith("\u0646"):
+ if token.endswith('\u0646'):
return token[1:-1]
# ^Yaa, Noon$
- if len(token) > 4 and token.startswith("\u064A") and token.endswith("\u0646"):
+ if len(token) > 4 and token.startswith('\u064A') and token.endswith('\u0646'):
return token[1:-1]
# ^Taa, Noon$
- if len(token) > 4 and token.startswith("\u062A") and token.endswith("\u0646"):
+ if len(token) > 4 and token.startswith('\u062A') and token.endswith('\u0646'):
return token[1:-1]
def verb_t2(self, token):
if (
len(token) > 5
and token.startswith(self.verb_pr2[0])
- and token.endswith("\u0646")
+ and token.endswith('\u0646')
):
return token[2:-1]
# ^Siin Yaa, Noon$
if (
len(token) > 5
and token.startswith(self.verb_pr2[1])
- and token.endswith("\u0646")
+ and token.endswith('\u0646')
):
return token[2:-1]
for pr1 in self.verb_suf1:
if token.startswith(pr1):
return token[1:]
- if token.startswith("\u064A"):
+ if token.startswith('\u064A'):
return token[1:]
def verb_t5(self, token):
# -*- coding: utf-8 -*-
# Natural Language Toolkit: CISTEM Stemmer for German
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Leonie Weissweiler <l.weissweiler@outlook.de>
# Algorithm: Leonie Weissweiler <l.weissweiler@outlook.de>
# Alexander Fraser <fraser@cis.lmu.de>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
import re
from nltk.stem.api import StemmerI
+from nltk.compat import python_2_unicode_compatible
-
+@python_2_unicode_compatible
class Cistem(StemmerI):
"""
CISTEM Stemmer for German
is thrice as fast as the Snowball stemmer for German while being about as fast
as most other stemmers.
- case_insensitive is a a boolean specifying if case-insensitive stemming
+    case_insensitive is a boolean specifying if case-insensitive stemming
should be used. Case insensitivity improves performance only if words in the
text may be incorrectly upper case. For all-lowercase and correctly cased
    text, best performance is achieved by setting case_insensitive to False.
:param case_insensitive: if True, the stemming is case insensitive. False by default.
:type case_insensitive: bool
"""
-
strip_ge = re.compile(r"^ge(.{4,})")
repl_xx = re.compile(r"(.)\1")
strip_emr = re.compile(r"e[mr]$")
return word
+
def segment(self, word):
"""
        This method works very similarly to stem (:func:`cistem.stem`). The difference is that in
#
# Natural Language Toolkit: The ISRI Arabic Stemmer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Algorithm: Kazem Taghva, Rania Elkhoury, and Jeffrey Coombs (2005)
# Author: Hosam Algasaier <hosam_hme@yahoo.com>
# URL: <http://nltk.org/>
increases the word ambiguities and changes the original root.
"""
+from __future__ import unicode_literals
import re
from nltk.stem.api import StemmerI
class ISRIStemmer(StemmerI):
- """
+ '''
ISRI Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary.
Information Science Research Institute. University of Nevada, Las Vegas, USA.
The ISRI Stemmer requires that all tokens have Unicode string types.
If you use Python IDLE on Arabic Windows you have to decode text first
using Arabic '1256' coding.
- """
+ '''
def __init__(self):
# length three prefixes
self.p3 = [
- "\u0643\u0627\u0644",
- "\u0628\u0627\u0644",
- "\u0648\u0644\u0644",
- "\u0648\u0627\u0644",
+ '\u0643\u0627\u0644',
+ '\u0628\u0627\u0644',
+ '\u0648\u0644\u0644',
+ '\u0648\u0627\u0644',
]
# length two prefixes
- self.p2 = ["\u0627\u0644", "\u0644\u0644"]
+ self.p2 = ['\u0627\u0644', '\u0644\u0644']
# length one prefixes
self.p1 = [
- "\u0644",
- "\u0628",
- "\u0641",
- "\u0633",
- "\u0648",
- "\u064a",
- "\u062a",
- "\u0646",
- "\u0627",
+ '\u0644',
+ '\u0628',
+ '\u0641',
+ '\u0633',
+ '\u0648',
+ '\u064a',
+ '\u062a',
+ '\u0646',
+ '\u0627',
]
# length three suffixes
self.s3 = [
- "\u062a\u0645\u0644",
- "\u0647\u0645\u0644",
- "\u062a\u0627\u0646",
- "\u062a\u064a\u0646",
- "\u0643\u0645\u0644",
+ '\u062a\u0645\u0644',
+ '\u0647\u0645\u0644',
+ '\u062a\u0627\u0646',
+ '\u062a\u064a\u0646',
+ '\u0643\u0645\u0644',
]
# length two suffixes
self.s2 = [
- "\u0648\u0646",
- "\u0627\u062a",
- "\u0627\u0646",
- "\u064a\u0646",
- "\u062a\u0646",
- "\u0643\u0645",
- "\u0647\u0646",
- "\u0646\u0627",
- "\u064a\u0627",
- "\u0647\u0627",
- "\u062a\u0645",
- "\u0643\u0646",
- "\u0646\u064a",
- "\u0648\u0627",
- "\u0645\u0627",
- "\u0647\u0645",
+ '\u0648\u0646',
+ '\u0627\u062a',
+ '\u0627\u0646',
+ '\u064a\u0646',
+ '\u062a\u0646',
+ '\u0643\u0645',
+ '\u0647\u0646',
+ '\u0646\u0627',
+ '\u064a\u0627',
+ '\u0647\u0627',
+ '\u062a\u0645',
+ '\u0643\u0646',
+ '\u0646\u064a',
+ '\u0648\u0627',
+ '\u0645\u0627',
+ '\u0647\u0645',
]
# length one suffixes
- self.s1 = ["\u0629", "\u0647", "\u064a", "\u0643", "\u062a", "\u0627", "\u0646"]
+ self.s1 = ['\u0629', '\u0647', '\u064a', '\u0643', '\u062a', '\u0627', '\u0646']
# groups of length four patterns
self.pr4 = {
- 0: ["\u0645"],
- 1: ["\u0627"],
- 2: ["\u0627", "\u0648", "\u064A"],
- 3: ["\u0629"],
+ 0: ['\u0645'],
+ 1: ['\u0627'],
+ 2: ['\u0627', '\u0648', '\u064A'],
+ 3: ['\u0629'],
}
# Groups of length five patterns and length three roots
self.pr53 = {
- 0: ["\u0627", "\u062a"],
- 1: ["\u0627", "\u064a", "\u0648"],
- 2: ["\u0627", "\u062a", "\u0645"],
- 3: ["\u0645", "\u064a", "\u062a"],
- 4: ["\u0645", "\u062a"],
- 5: ["\u0627", "\u0648"],
- 6: ["\u0627", "\u0645"],
+ 0: ['\u0627', '\u062a'],
+ 1: ['\u0627', '\u064a', '\u0648'],
+ 2: ['\u0627', '\u062a', '\u0645'],
+ 3: ['\u0645', '\u064a', '\u062a'],
+ 4: ['\u0645', '\u062a'],
+ 5: ['\u0627', '\u0648'],
+ 6: ['\u0627', '\u0645'],
}
- self.re_short_vowels = re.compile(r"[\u064B-\u0652]")
- self.re_hamza = re.compile(r"[\u0621\u0624\u0626]")
- self.re_initial_hamza = re.compile(r"^[\u0622\u0623\u0625]")
+ self.re_short_vowels = re.compile(r'[\u064B-\u0652]')
+ self.re_hamza = re.compile(r'[\u0621\u0624\u0626]')
+ self.re_initial_hamza = re.compile(r'^[\u0622\u0623\u0625]')
self.stop_words = [
- "\u064a\u0643\u0648\u0646",
- "\u0648\u0644\u064a\u0633",
- "\u0648\u0643\u0627\u0646",
- "\u0643\u0630\u0644\u0643",
- "\u0627\u0644\u062a\u064a",
- "\u0648\u0628\u064a\u0646",
- "\u0639\u0644\u064a\u0647\u0627",
- "\u0645\u0633\u0627\u0621",
- "\u0627\u0644\u0630\u064a",
- "\u0648\u0643\u0627\u0646\u062a",
- "\u0648\u0644\u0643\u0646",
- "\u0648\u0627\u0644\u062a\u064a",
- "\u062a\u0643\u0648\u0646",
- "\u0627\u0644\u064a\u0648\u0645",
- "\u0627\u0644\u0644\u0630\u064a\u0646",
- "\u0639\u0644\u064a\u0647",
- "\u0643\u0627\u0646\u062a",
- "\u0644\u0630\u0644\u0643",
- "\u0623\u0645\u0627\u0645",
- "\u0647\u0646\u0627\u0643",
- "\u0645\u0646\u0647\u0627",
- "\u0645\u0627\u0632\u0627\u0644",
- "\u0644\u0627\u0632\u0627\u0644",
- "\u0644\u0627\u064a\u0632\u0627\u0644",
- "\u0645\u0627\u064a\u0632\u0627\u0644",
- "\u0627\u0635\u0628\u062d",
- "\u0623\u0635\u0628\u062d",
- "\u0623\u0645\u0633\u0649",
- "\u0627\u0645\u0633\u0649",
- "\u0623\u0636\u062d\u0649",
- "\u0627\u0636\u062d\u0649",
- "\u0645\u0627\u0628\u0631\u062d",
- "\u0645\u0627\u0641\u062a\u0626",
- "\u0645\u0627\u0627\u0646\u0641\u0643",
- "\u0644\u0627\u0633\u064a\u0645\u0627",
- "\u0648\u0644\u0627\u064a\u0632\u0627\u0644",
- "\u0627\u0644\u062d\u0627\u0644\u064a",
- "\u0627\u0644\u064a\u0647\u0627",
- "\u0627\u0644\u0630\u064a\u0646",
- "\u0641\u0627\u0646\u0647",
- "\u0648\u0627\u0644\u0630\u064a",
- "\u0648\u0647\u0630\u0627",
- "\u0644\u0647\u0630\u0627",
- "\u0641\u0643\u0627\u0646",
- "\u0633\u062a\u0643\u0648\u0646",
- "\u0627\u0644\u064a\u0647",
- "\u064a\u0645\u0643\u0646",
- "\u0628\u0647\u0630\u0627",
- "\u0627\u0644\u0630\u0649",
+ '\u064a\u0643\u0648\u0646',
+ '\u0648\u0644\u064a\u0633',
+ '\u0648\u0643\u0627\u0646',
+ '\u0643\u0630\u0644\u0643',
+ '\u0627\u0644\u062a\u064a',
+ '\u0648\u0628\u064a\u0646',
+ '\u0639\u0644\u064a\u0647\u0627',
+ '\u0645\u0633\u0627\u0621',
+ '\u0627\u0644\u0630\u064a',
+ '\u0648\u0643\u0627\u0646\u062a',
+ '\u0648\u0644\u0643\u0646',
+ '\u0648\u0627\u0644\u062a\u064a',
+ '\u062a\u0643\u0648\u0646',
+ '\u0627\u0644\u064a\u0648\u0645',
+ '\u0627\u0644\u0644\u0630\u064a\u0646',
+ '\u0639\u0644\u064a\u0647',
+ '\u0643\u0627\u0646\u062a',
+ '\u0644\u0630\u0644\u0643',
+ '\u0623\u0645\u0627\u0645',
+ '\u0647\u0646\u0627\u0643',
+ '\u0645\u0646\u0647\u0627',
+ '\u0645\u0627\u0632\u0627\u0644',
+ '\u0644\u0627\u0632\u0627\u0644',
+ '\u0644\u0627\u064a\u0632\u0627\u0644',
+ '\u0645\u0627\u064a\u0632\u0627\u0644',
+ '\u0627\u0635\u0628\u062d',
+ '\u0623\u0635\u0628\u062d',
+ '\u0623\u0645\u0633\u0649',
+ '\u0627\u0645\u0633\u0649',
+ '\u0623\u0636\u062d\u0649',
+ '\u0627\u0636\u062d\u0649',
+ '\u0645\u0627\u0628\u0631\u062d',
+ '\u0645\u0627\u0641\u062a\u0626',
+ '\u0645\u0627\u0627\u0646\u0641\u0643',
+ '\u0644\u0627\u0633\u064a\u0645\u0627',
+ '\u0648\u0644\u0627\u064a\u0632\u0627\u0644',
+ '\u0627\u0644\u062d\u0627\u0644\u064a',
+ '\u0627\u0644\u064a\u0647\u0627',
+ '\u0627\u0644\u0630\u064a\u0646',
+ '\u0641\u0627\u0646\u0647',
+ '\u0648\u0627\u0644\u0630\u064a',
+ '\u0648\u0647\u0630\u0627',
+ '\u0644\u0647\u0630\u0627',
+ '\u0641\u0643\u0627\u0646',
+ '\u0633\u062a\u0643\u0648\u0646',
+ '\u0627\u0644\u064a\u0647',
+ '\u064a\u0645\u0643\u0646',
+ '\u0628\u0647\u0630\u0627',
+ '\u0627\u0644\u0630\u0649',
]
def stem(self, token):
num=3 both 1&2
"""
if num == 1:
- word = self.re_short_vowels.sub("", word)
+ word = self.re_short_vowels.sub('', word)
elif num == 2:
- word = self.re_initial_hamza.sub("\u0627", word)
+ word = self.re_initial_hamza.sub('\u0627', word)
elif num == 3:
- word = self.re_short_vowels.sub("", word)
- word = self.re_initial_hamza.sub("\u0627", word)
+ word = self.re_short_vowels.sub('', word)
+ word = self.re_initial_hamza.sub('\u0627', word)
return word
def pre32(self, word):
def waw(self, word):
"""remove connective ‘و’ if it precedes a word beginning with ‘و’ """
- if len(word) >= 4 and word[:2] == "\u0648\u0648":
+ if len(word) >= 4 and word[:2] == '\u0648\u0648':
word = word[1:]
return word
def pro_w53(self, word):
"""process length five patterns and extract length three roots"""
- if word[2] in self.pr53[0] and word[0] == "\u0627": # افتعل - افاعل
+ if word[2] in self.pr53[0] and word[0] == '\u0627': # افتعل - افاعل
word = word[1] + word[3:]
- elif word[3] in self.pr53[1] and word[0] == "\u0645": # مفعول - مفعال - مفعيل
+ elif word[3] in self.pr53[1] and word[0] == '\u0645': # مفعول - مفعال - مفعيل
word = word[1:3] + word[4]
- elif word[0] in self.pr53[2] and word[4] == "\u0629": # مفعلة - تفعلة - افعلة
+ elif word[0] in self.pr53[2] and word[4] == '\u0629': # مفعلة - تفعلة - افعلة
word = word[1:4]
- elif word[0] in self.pr53[3] and word[2] == "\u062a": # مفتعل - يفتعل - تفتعل
+ elif word[0] in self.pr53[3] and word[2] == '\u062a': # مفتعل - يفتعل - تفتعل
word = word[1] + word[3:]
- elif word[0] in self.pr53[4] and word[2] == "\u0627": # مفاعل - تفاعل
+ elif word[0] in self.pr53[4] and word[2] == '\u0627': # مفاعل - تفاعل
word = word[1] + word[3:]
- elif word[2] in self.pr53[5] and word[4] == "\u0629": # فعولة - فعالة
+ elif word[2] in self.pr53[5] and word[4] == '\u0629': # فعولة - فعالة
word = word[:2] + word[3]
- elif word[0] in self.pr53[6] and word[1] == "\u0646": # انفعل - منفعل
+ elif word[0] in self.pr53[6] and word[1] == '\u0646': # انفعل - منفعل
word = word[2:]
- elif word[3] == "\u0627" and word[0] == "\u0627": # افعال
+ elif word[3] == '\u0627' and word[0] == '\u0627': # افعال
word = word[1:3] + word[4]
- elif word[4] == "\u0646" and word[3] == "\u0627": # فعلان
+ elif word[4] == '\u0646' and word[3] == '\u0627': # فعلان
word = word[:3]
- elif word[3] == "\u064a" and word[0] == "\u062a": # تفعيل
+ elif word[3] == '\u064a' and word[0] == '\u062a': # تفعيل
word = word[1:3] + word[4]
- elif word[3] == "\u0648" and word[1] == "\u0627": # فاعول
+ elif word[3] == '\u0648' and word[1] == '\u0627': # فاعول
word = word[0] + word[2] + word[4]
- elif word[2] == "\u0627" and word[1] == "\u0648": # فواعل
+ elif word[2] == '\u0627' and word[1] == '\u0648': # فواعل
word = word[0] + word[3:]
- elif word[3] == "\u0626" and word[2] == "\u0627": # فعائل
+ elif word[3] == '\u0626' and word[2] == '\u0627': # فعائل
word = word[:2] + word[4]
- elif word[4] == "\u0629" and word[1] == "\u0627": # فاعلة
+ elif word[4] == '\u0629' and word[1] == '\u0627': # فاعلة
word = word[0] + word[2:4]
- elif word[4] == "\u064a" and word[2] == "\u0627": # فعالي
+ elif word[4] == '\u064a' and word[2] == '\u0627': # فعالي
word = word[:2] + word[3]
else:
            word = self.suf1(word) # do - normalize short suffix
"""process length five patterns and extract length four roots"""
if word[0] in self.pr53[2]: # تفعلل - افعلل - مفعلل
word = word[1:]
- elif word[4] == "\u0629": # فعللة
+ elif word[4] == '\u0629': # فعللة
word = word[:4]
- elif word[2] == "\u0627": # فعالل
+ elif word[2] == '\u0627': # فعالل
word = word[:2] + word[3:]
return word
def pro_w6(self, word):
"""process length six patterns and extract length three roots"""
- if word.startswith("\u0627\u0633\u062a") or word.startswith(
- "\u0645\u0633\u062a"
+ if word.startswith('\u0627\u0633\u062a') or word.startswith(
+ '\u0645\u0633\u062a'
): # مستفعل - استفعل
word = word[3:]
elif (
- word[0] == "\u0645" and word[3] == "\u0627" and word[5] == "\u0629"
+ word[0] == '\u0645' and word[3] == '\u0627' and word[5] == '\u0629'
): # مفعالة
word = word[1:3] + word[4]
elif (
- word[0] == "\u0627" and word[2] == "\u062a" and word[4] == "\u0627"
+ word[0] == '\u0627' and word[2] == '\u062a' and word[4] == '\u0627'
): # افتعال
word = word[1] + word[3] + word[5]
elif (
- word[0] == "\u0627" and word[3] == "\u0648" and word[2] == word[4]
+ word[0] == '\u0627' and word[3] == '\u0648' and word[2] == word[4]
): # افعوعل
word = word[1] + word[4:]
elif (
- word[0] == "\u062a" and word[2] == "\u0627" and word[4] == "\u064a"
+ word[0] == '\u062a' and word[2] == '\u0627' and word[4] == '\u064a'
): # تفاعيل new pattern
word = word[1] + word[3] + word[5]
else:
def pro_w64(self, word):
"""process length six patterns and extract length four roots"""
- if word[0] == "\u0627" and word[4] == "\u0627": # افعلال
+ if word[0] == '\u0627' and word[4] == '\u0627': # افعلال
word = word[1:4] + word[5]
- elif word.startswith("\u0645\u062a"): # متفعلل
+ elif word.startswith('\u0645\u062a'): # متفعلل
word = word[2:]
return word
# Natural Language Toolkit: Stemmers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Tomcavage <stomcava@law.upenn.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
"""
+from __future__ import unicode_literals
import re
from nltk.stem.api import StemmerI
+from nltk.compat import python_2_unicode_compatible
+@python_2_unicode_compatible
class LancasterStemmer(StemmerI):
"""
Lancaster Stemmer
word, remove_total, append_string
)
rule_was_applied = True
- if cont_flag == ".":
+ if cont_flag == '.':
proceed = False
break
elif self.__isAcceptable(word, remove_total):
word, remove_total, append_string
)
rule_was_applied = True
- if cont_flag == ".":
+ if cont_flag == '.':
proceed = False
break
# If no rules apply, the word doesn't need any more stemming
return word
def __repr__(self):
- return "<LancasterStemmer>"
+ return '<LancasterStemmer>'
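# --- Editor's illustrative sketch (not part of the diff above) ---
# Minimal usage of the Lancaster (Paice/Husk) stemmer defined above; the
# sample words follow the examples in NLTK's own documentation.
from nltk.stem import LancasterStemmer

lancaster = LancasterStemmer()
print(lancaster.stem('maximum'))  # -> 'maxim' (aggressive stemming is typical)
print(lancaster.stem('running'))  # -> 'run'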
in many languages.
"""
-__docformat__ = "plaintext"
+from __future__ import print_function, unicode_literals
+
+__docformat__ = 'plaintext'
import re
from nltk.stem.api import StemmerI
+from nltk.compat import python_2_unicode_compatible
+@python_2_unicode_compatible
class PorterStemmer(StemmerI):
"""
A word stemmer based on the Porter stemming algorithm.
For the best stemming, you should use the default NLTK_EXTENSIONS
version. However, if you need to get the same results as either the
original algorithm or one of Martin Porter's hosted versions for
- compatibility with an existing implementation or dataset, you can use
+ compatibility with an existing implementation or dataset, you can use
one of the other modes instead.
"""
# Modes the Stemmer can be instantiated in
- NLTK_EXTENSIONS = "NLTK_EXTENSIONS"
- MARTIN_EXTENSIONS = "MARTIN_EXTENSIONS"
- ORIGINAL_ALGORITHM = "ORIGINAL_ALGORITHM"
+ NLTK_EXTENSIONS = 'NLTK_EXTENSIONS'
+ MARTIN_EXTENSIONS = 'MARTIN_EXTENSIONS'
+ ORIGINAL_ALGORITHM = 'ORIGINAL_ALGORITHM'
def __init__(self, mode=NLTK_EXTENSIONS):
if mode not in (
for val in irregular_forms[key]:
self.pool[val] = key
- self.vowels = frozenset(["a", "e", "i", "o", "u"])
+ self.vowels = frozenset(['a', 'e', 'i', 'o', 'u'])
def _is_consonant(self, word, i):
"""Returns True if word[i] is a consonant, False otherwise
"""
if word[i] in self.vowels:
return False
- if word[i] == "y":
+ if word[i] == 'y':
if i == 0:
return True
else:
m=1 TROUBLE, OATS, TREES, IVY.
m=2 TROUBLES, PRIVATE, OATEN, ORRERY.
"""
- cv_sequence = ""
+ cv_sequence = ''
# Construct a string of 'c's and 'v's representing whether each
# character in `stem` is a consonant or a vowel.
# 'architecture' becomes 'vcccvcvccvcv'
for i in range(len(stem)):
if self._is_consonant(stem, i):
- cv_sequence += "c"
+ cv_sequence += 'c'
else:
- cv_sequence += "v"
+ cv_sequence += 'v'
# Count the number of 'vc' occurrences, which is equivalent to
# the number of 'VC' occurrences in Porter's reduced form in the
# docstring above, which is in turn equivalent to `m`
- return cv_sequence.count("vc")
+ return cv_sequence.count('vc')
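# Editor's note (worked example): for 'trouble' the consonant/vowel string is
# 'ccvvccv', which contains a single 'vc' transition, so the measure m is 1 --
# matching the 'm=1 TROUBLE' case listed in the docstring above.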
def _has_positive_measure(self, stem):
return self._measure(stem) > 0
and self._is_consonant(word, len(word) - 3)
and not self._is_consonant(word, len(word) - 2)
and self._is_consonant(word, len(word) - 1)
- and word[-1] not in ("w", "x", "y")
+ and word[-1] not in ('w', 'x', 'y')
) or (
self.mode == self.NLTK_EXTENSIONS
and len(word) == 2
def _replace_suffix(self, word, suffix, replacement):
"""Replaces `suffix` of `word` with `replacement"""
assert word.endswith(suffix), "Given word doesn't end with given suffix"
- if suffix == "":
+ if suffix == '':
return word + replacement
else:
return word[: -len(suffix)] + replacement
"""
for rule in rules:
suffix, replacement, condition = rule
- if suffix == "*d" and self._ends_double_consonant(word):
+ if suffix == '*d' and self._ends_double_consonant(word):
stem = word[:-2]
if condition is None or condition(stem):
return stem + replacement
# Don't try any further rules
return word
if word.endswith(suffix):
- stem = self._replace_suffix(word, suffix, "")
+ stem = self._replace_suffix(word, suffix, '')
if condition is None or condition(stem):
return stem + replacement
else:
# this NLTK-only rule extends the original algorithm, so
# that 'flies'->'fli' but 'dies'->'die' etc
if self.mode == self.NLTK_EXTENSIONS:
- if word.endswith("ies") and len(word) == 4:
- return self._replace_suffix(word, "ies", "ie")
+ if word.endswith('ies') and len(word) == 4:
+ return self._replace_suffix(word, 'ies', 'ie')
return self._apply_rule_list(
word,
[
- ("sses", "ss", None), # SSES -> SS
- ("ies", "i", None), # IES -> I
- ("ss", "ss", None), # SS -> SS
- ("s", "", None), # S ->
+ ('sses', 'ss', None), # SSES -> SS
+ ('ies', 'i', None), # IES -> I
+ ('ss', 'ss', None), # SS -> SS
+ ('s', '', None), # S ->
],
)
# this NLTK-only block extends the original algorithm, so that
# 'spied'->'spi' but 'died'->'die' etc
if self.mode == self.NLTK_EXTENSIONS:
- if word.endswith("ied"):
+ if word.endswith('ied'):
if len(word) == 4:
- return self._replace_suffix(word, "ied", "ie")
+ return self._replace_suffix(word, 'ied', 'ie')
else:
- return self._replace_suffix(word, "ied", "i")
+ return self._replace_suffix(word, 'ied', 'i')
# (m>0) EED -> EE
- if word.endswith("eed"):
- stem = self._replace_suffix(word, "eed", "")
+ if word.endswith('eed'):
+ stem = self._replace_suffix(word, 'eed', '')
if self._measure(stem) > 0:
- return stem + "ee"
+ return stem + 'ee'
else:
return word
rule_2_or_3_succeeded = False
- for suffix in ["ed", "ing"]:
+ for suffix in ['ed', 'ing']:
if word.endswith(suffix):
- intermediate_stem = self._replace_suffix(word, suffix, "")
+ intermediate_stem = self._replace_suffix(word, suffix, '')
if self._contains_vowel(intermediate_stem):
rule_2_or_3_succeeded = True
break
return self._apply_rule_list(
intermediate_stem,
[
- ("at", "ate", None), # AT -> ATE
- ("bl", "ble", None), # BL -> BLE
- ("iz", "ize", None), # IZ -> IZE
+ ('at', 'ate', None), # AT -> ATE
+ ('bl', 'ble', None), # BL -> BLE
+ ('iz', 'ize', None), # IZ -> IZE
# (*d and not (*L or *S or *Z))
# -> single letter
(
- "*d",
+ '*d',
intermediate_stem[-1],
- lambda stem: intermediate_stem[-1] not in ("l", "s", "z"),
+ lambda stem: intermediate_stem[-1] not in ('l', 's', 'z'),
),
# (m=1 and *o) -> E
(
- "",
- "e",
+ '',
+ 'e',
lambda stem: (self._measure(stem) == 1 and self._ends_cvc(stem)),
),
],
word,
[
(
- "y",
- "i",
+ 'y',
+ 'i',
nltk_condition
if self.mode == self.NLTK_EXTENSIONS
else original_condition,
# Instead of applying the ALLI -> AL rule after '(a)bli' per
# the published algorithm, we apply it first, and,
# if it succeeds, run the result through step2 again.
- if word.endswith("alli") and self._has_positive_measure(
- self._replace_suffix(word, "alli", "")
+ if word.endswith('alli') and self._has_positive_measure(
+ self._replace_suffix(word, 'alli', '')
):
- return self._step2(self._replace_suffix(word, "alli", "al"))
+ return self._step2(self._replace_suffix(word, 'alli', 'al'))
- bli_rule = ("bli", "ble", self._has_positive_measure)
- abli_rule = ("abli", "able", self._has_positive_measure)
+ bli_rule = ('bli', 'ble', self._has_positive_measure)
+ abli_rule = ('abli', 'able', self._has_positive_measure)
rules = [
- ("ational", "ate", self._has_positive_measure),
- ("tional", "tion", self._has_positive_measure),
- ("enci", "ence", self._has_positive_measure),
- ("anci", "ance", self._has_positive_measure),
- ("izer", "ize", self._has_positive_measure),
+ ('ational', 'ate', self._has_positive_measure),
+ ('tional', 'tion', self._has_positive_measure),
+ ('enci', 'ence', self._has_positive_measure),
+ ('anci', 'ance', self._has_positive_measure),
+ ('izer', 'ize', self._has_positive_measure),
abli_rule if self.mode == self.ORIGINAL_ALGORITHM else bli_rule,
- ("alli", "al", self._has_positive_measure),
- ("entli", "ent", self._has_positive_measure),
- ("eli", "e", self._has_positive_measure),
- ("ousli", "ous", self._has_positive_measure),
- ("ization", "ize", self._has_positive_measure),
- ("ation", "ate", self._has_positive_measure),
- ("ator", "ate", self._has_positive_measure),
- ("alism", "al", self._has_positive_measure),
- ("iveness", "ive", self._has_positive_measure),
- ("fulness", "ful", self._has_positive_measure),
- ("ousness", "ous", self._has_positive_measure),
- ("aliti", "al", self._has_positive_measure),
- ("iviti", "ive", self._has_positive_measure),
- ("biliti", "ble", self._has_positive_measure),
+ ('alli', 'al', self._has_positive_measure),
+ ('entli', 'ent', self._has_positive_measure),
+ ('eli', 'e', self._has_positive_measure),
+ ('ousli', 'ous', self._has_positive_measure),
+ ('ization', 'ize', self._has_positive_measure),
+ ('ation', 'ate', self._has_positive_measure),
+ ('ator', 'ate', self._has_positive_measure),
+ ('alism', 'al', self._has_positive_measure),
+ ('iveness', 'ive', self._has_positive_measure),
+ ('fulness', 'ful', self._has_positive_measure),
+ ('ousness', 'ous', self._has_positive_measure),
+ ('aliti', 'al', self._has_positive_measure),
+ ('iviti', 'ive', self._has_positive_measure),
+ ('biliti', 'ble', self._has_positive_measure),
]
if self.mode == self.NLTK_EXTENSIONS:
- rules.append(("fulli", "ful", self._has_positive_measure))
+ rules.append(('fulli', 'ful', self._has_positive_measure))
# The 'l' of the 'logi' -> 'log' rule is put with the stem,
# so that short stems like 'geo' 'theo' etc work like
return self._apply_rule_list(
word,
[
- ("icate", "ic", self._has_positive_measure),
- ("ative", "", self._has_positive_measure),
- ("alize", "al", self._has_positive_measure),
- ("iciti", "ic", self._has_positive_measure),
- ("ical", "ic", self._has_positive_measure),
- ("ful", "", self._has_positive_measure),
- ("ness", "", self._has_positive_measure),
+ ('icate', 'ic', self._has_positive_measure),
+ ('ative', '', self._has_positive_measure),
+ ('alize', 'al', self._has_positive_measure),
+ ('iciti', 'ic', self._has_positive_measure),
+ ('ical', 'ic', self._has_positive_measure),
+ ('ful', '', self._has_positive_measure),
+ ('ness', '', self._has_positive_measure),
],
)
return self._apply_rule_list(
word,
[
- ("al", "", measure_gt_1),
- ("ance", "", measure_gt_1),
- ("ence", "", measure_gt_1),
- ("er", "", measure_gt_1),
- ("ic", "", measure_gt_1),
- ("able", "", measure_gt_1),
- ("ible", "", measure_gt_1),
- ("ant", "", measure_gt_1),
- ("ement", "", measure_gt_1),
- ("ment", "", measure_gt_1),
- ("ent", "", measure_gt_1),
+ ('al', '', measure_gt_1),
+ ('ance', '', measure_gt_1),
+ ('ence', '', measure_gt_1),
+ ('er', '', measure_gt_1),
+ ('ic', '', measure_gt_1),
+ ('able', '', measure_gt_1),
+ ('ible', '', measure_gt_1),
+ ('ant', '', measure_gt_1),
+ ('ement', '', measure_gt_1),
+ ('ment', '', measure_gt_1),
+ ('ent', '', measure_gt_1),
# (m>1 and (*S or *T)) ION ->
(
- "ion",
- "",
- lambda stem: self._measure(stem) > 1 and stem[-1] in ("s", "t"),
+ 'ion',
+ '',
+ lambda stem: self._measure(stem) > 1 and stem[-1] in ('s', 't'),
),
- ("ou", "", measure_gt_1),
- ("ism", "", measure_gt_1),
- ("ate", "", measure_gt_1),
- ("iti", "", measure_gt_1),
- ("ous", "", measure_gt_1),
- ("ive", "", measure_gt_1),
- ("ize", "", measure_gt_1),
+ ('ou', '', measure_gt_1),
+ ('ism', '', measure_gt_1),
+ ('ate', '', measure_gt_1),
+ ('iti', '', measure_gt_1),
+ ('ous', '', measure_gt_1),
+ ('ive', '', measure_gt_1),
+ ('ize', '', measure_gt_1),
],
)
# no explicit mention of the inconsistency; you have to infer it
# from the examples.
# For this reason, we can't use _apply_rule_list here.
- if word.endswith("e"):
- stem = self._replace_suffix(word, "e", "")
+ if word.endswith('e'):
+ stem = self._replace_suffix(word, 'e', '')
if self._measure(stem) > 1:
return stem
if self._measure(stem) == 1 and not self._ends_cvc(stem):
roll -> roll
"""
return self._apply_rule_list(
- word, [("ll", "l", lambda stem: self._measure(word[:-1]) > 1)]
+ word, [('ll', 'l', lambda stem: self._measure(word[:-1]) > 1)]
)
def stem(self, word):
return stem
def __repr__(self):
- return "<PorterStemmer>"
+ return '<PorterStemmer>'
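# --- Editor's illustrative sketch (not part of the diff above) ---
# A hedged example of the PorterStemmer modes described in the class
# docstring. NLTK_EXTENSIONS is the default; ORIGINAL_ALGORITHM follows the
# 1980 paper, so outputs can differ for some words.
from nltk.stem.porter import PorterStemmer

default_stemmer = PorterStemmer()  # mode=PorterStemmer.NLTK_EXTENSIONS
original = PorterStemmer(mode=PorterStemmer.ORIGINAL_ALGORITHM)
# the NLTK-only 'ied'/'ies' rule keeps 'dies' -> 'die'; the original gives 'di'
print(default_stemmer.stem('dies'), original.stem('dies'))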
def demo():
stemmed.append(stemmer.stem(word))
# Convert the results to a string, and word-wrap them.
- results = " ".join(stemmed)
- results = re.sub(r"(.{,70})\s", r"\1\n", results + " ").rstrip()
+ results = ' '.join(stemmed)
+ results = re.sub(r"(.{,70})\s", r'\1\n', results + ' ').rstrip()
# Convert the original to a string, and word wrap it.
- original = " ".join(orig)
- original = re.sub(r"(.{,70})\s", r"\1\n", original + " ").rstrip()
+ original = ' '.join(orig)
+ original = re.sub(r"(.{,70})\s", r'\1\n', original + ' ').rstrip()
# Print the results.
- print("-Original-".center(70).replace(" ", "*").replace("-", " "))
+ print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
print(original)
- print("-Results-".center(70).replace(" ", "*").replace("-", " "))
+ print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
print(results)
- print("*" * 70)
+ print('*' * 70)
# Natural Language Toolkit: Stemmers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
import re
from nltk.stem.api import StemmerI
+from nltk.compat import python_2_unicode_compatible
+@python_2_unicode_compatible
class RegexpStemmer(StemmerI):
"""
A stemmer that uses regular expressions to identify morphological
def __init__(self, regexp, min=0):
- if not hasattr(regexp, "pattern"):
+ if not hasattr(regexp, 'pattern'):
regexp = re.compile(regexp)
self._regexp = regexp
self._min = min
if len(word) < self._min:
return word
else:
- return self._regexp.sub("", word)
+ return self._regexp.sub('', word)
def __repr__(self):
- return "<RegexpStemmer: {!r}>".format(self._regexp.pattern)
+ return '<RegexpStemmer: {!r}>'.format(self._regexp.pattern)
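# --- Editor's illustrative sketch (not part of the diff above) ---
# RegexpStemmer simply deletes whatever its pattern matches, subject to the
# `min` length guard; the pattern below mirrors the example in NLTK's docs.
from nltk.stem import RegexpStemmer

rs = RegexpStemmer('ing$|s$|e$|able$', min=4)
print(rs.stem('cars'))     # -> 'car'
print(rs.stem('running'))  # -> 'runn' (purely pattern-based, no linguistics)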
# Natural Language Toolkit: RSLP Stemmer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Tiago Tresoldi <tresoldi@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# comments, including on the development of a different and/or better stemmer
# for Portuguese. I also suggest using the NLTK discussion list for Portuguese
# for any debate.
-
+from __future__ import print_function, unicode_literals
from nltk.data import load
from nltk.stem.api import StemmerI
self._model.append(self.read_rule("step6.pt"))
def read_rule(self, filename):
- rules = load("nltk:stemmers/rslp/" + filename, format="raw").decode("utf8")
+ rules = load('nltk:stemmers/rslp/' + filename, format='raw').decode("utf8")
lines = rules.split("\n")
lines = [line for line in lines if line != ""] # remove blank lines
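# --- Editor's illustrative sketch (not part of the diff above) ---
# The RSLP stemmer loads its rule tables from the 'rslp' data package via
# nltk.data.load() as shown above; usage is then a single stem() call.
# (Assumes nltk.download('rslp') has already been run.)
from nltk.stem import RSLPStemmer

rslp = RSLPStemmer()
print(rslp.stem('amigos'))  # Portuguese plural reduced to a root such as 'amig'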
#
# Natural Language Toolkit: Snowball Stemmer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Peter Michael Stahl <pemistahl@gmail.com>
# Peter Ljunglof <peter.ljunglof@heatherleaf.se> (revisions)
# Lakhdar Benzahia <lakhdar.benzahia@gmail.com> (co-writer)
There is also a demo function: `snowball.demo()`.
"""
+from __future__ import unicode_literals, print_function
import re
+from six.moves import input
+from nltk import compat
from nltk.corpus import stopwords
from nltk.stem import porter
from nltk.stem.util import suffix_replace, prefix_replace
return self.stemmer.stem(self, token)
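# --- Editor's illustrative sketch (not part of the diff above) ---
# The SnowballStemmer front end above dispatches to a language-specific
# stemmer class; the examples mirror NLTK's documentation. Stopword
# filtering requires the 'stopwords' corpus to be downloaded.
from nltk.stem.snowball import SnowballStemmer

snow = SnowballStemmer('english', ignore_stopwords=True)
print(snow.stem('generously'))  # -> 'generous'
print(snow.stem('having'))      # stopwords are passed through unchanged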
+@compat.python_2_unicode_compatible
class _LanguageSpecificStemmer(StemmerI):
"""
r1 = ""
for i in range(1, len(word)):
if word[i] not in vowels and word[i - 1] in vowels:
- if 3 > len(word[: i + 1]) > 0:
+ if len(word[: i + 1]) < 3 and len(word[: i + 1]) > 0:
r1 = word[3:]
elif len(word[: i + 1]) >= 3:
r1 = word[i + 1 :]
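# Editor's note (worked example): for 'beautiful' the first non-vowel that
# follows a vowel is the 't', so R1 is the remainder 'iful'; the length guard
# above only keeps R1 from starting before the third character.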
# Normalize_pre steps
__vocalization = re.compile(
- r"[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]"
+ r'[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]'
) # ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ
- __kasheeda = re.compile(r"[\u0640]") # ـ tatweel/kasheeda
+ __kasheeda = re.compile(r'[\u0640]') # ـ tatweel/kasheeda
- __arabic_punctuation_marks = re.compile(r"[\u060C-\u061B-\u061F]") # ؛ ، ؟
+ __arabic_punctuation_marks = re.compile(r'[\u060C-\u061B-\u061F]') # ؛ ، ؟
# Normalize_post
- __last_hamzat = ("\u0623", "\u0625", "\u0622", "\u0624", "\u0626") # أ، إ، آ، ؤ، ئ
+ __last_hamzat = ('\u0623', '\u0625', '\u0622', '\u0624', '\u0626') # أ، إ، آ، ؤ، ئ
# normalize other hamza's
- __initial_hamzat = re.compile(r"^[\u0622\u0623\u0625]") # أ، إ، آ
+ __initial_hamzat = re.compile(r'^[\u0622\u0623\u0625]') # أ، إ، آ
- __waw_hamza = re.compile(r"[\u0624]") # ؤ
+ __waw_hamza = re.compile(r'[\u0624]') # ؤ
- __yeh_hamza = re.compile(r"[\u0626]") # ئ
+ __yeh_hamza = re.compile(r'[\u0626]') # ئ
- __alefat = re.compile(r"[\u0623\u0622\u0625]") # أ، إ، آ
+ __alefat = re.compile(r'[\u0623\u0622\u0625]') # أ، إ، آ
# Checks
__checks1 = (
- "\u0643\u0627\u0644",
- "\u0628\u0627\u0644", # بال، كال
- "\u0627\u0644",
- "\u0644\u0644", # لل، ال
+ '\u0643\u0627\u0644',
+ '\u0628\u0627\u0644', # بال، كال
+ '\u0627\u0644',
+ '\u0644\u0644', # لل، ال
)
- __checks2 = ("\u0629", "\u0627\u062a") # ة # female plural ات
+ __checks2 = ('\u0629', '\u0627\u062a') # ة # feminine plural ات
# Suffixes
__suffix_noun_step1a = (
- "\u064a",
- "\u0643",
- "\u0647", # ي، ك، ه
- "\u0646\u0627",
- "\u0643\u0645",
- "\u0647\u0627",
- "\u0647\u0646",
- "\u0647\u0645", # نا، كم، ها، هن، هم
- "\u0643\u0645\u0627",
- "\u0647\u0645\u0627", # كما، هما
+ '\u064a',
+ '\u0643',
+ '\u0647', # ي، ك، ه
+ '\u0646\u0627',
+ '\u0643\u0645',
+ '\u0647\u0627',
+ '\u0647\u0646',
+ '\u0647\u0645', # نا، كم، ها، هن، هم
+ '\u0643\u0645\u0627',
+ '\u0647\u0645\u0627', # كما، هما
)
- __suffix_noun_step1b = "\u0646" # ن
+ __suffix_noun_step1b = '\u0646' # ن
- __suffix_noun_step2a = ("\u0627", "\u064a", "\u0648") # ا، ي، و
+ __suffix_noun_step2a = ('\u0627', '\u064a', '\u0648') # ا، ي، و
- __suffix_noun_step2b = "\u0627\u062a" # ات
+ __suffix_noun_step2b = '\u0627\u062a' # ات
- __suffix_noun_step2c1 = "\u062a" # ت
+ __suffix_noun_step2c1 = '\u062a' # ت
- __suffix_noun_step2c2 = "\u0629" # ة
+ __suffix_noun_step2c2 = '\u0629' # ة
- __suffix_noun_step3 = "\u064a" # ي
+ __suffix_noun_step3 = '\u064a' # ي
__suffix_verb_step1 = (
- "\u0647",
- "\u0643", # ه، ك
- "\u0646\u064a",
- "\u0646\u0627",
- "\u0647\u0627",
- "\u0647\u0645", # ني، نا، ها، هم
- "\u0647\u0646",
- "\u0643\u0645",
- "\u0643\u0646", # هن، كم، كن
- "\u0647\u0645\u0627",
- "\u0643\u0645\u0627",
- "\u0643\u0645\u0648", # هما، كما، كمو
+ '\u0647',
+ '\u0643', # ه، ك
+ '\u0646\u064a',
+ '\u0646\u0627',
+ '\u0647\u0627',
+ '\u0647\u0645', # ني، نا، ها، هم
+ '\u0647\u0646',
+ '\u0643\u0645',
+ '\u0643\u0646', # هن، كم، كن
+ '\u0647\u0645\u0627',
+ '\u0643\u0645\u0627',
+ '\u0643\u0645\u0648', # هما، كما، كمو
)
__suffix_verb_step2a = (
- "\u062a",
- "\u0627",
- "\u0646",
- "\u064a", # ت، ا، ن، ي
- "\u0646\u0627",
- "\u062a\u0627",
- "\u062a\u0646", # نا، تا، تن Past
- "\u0627\u0646",
- "\u0648\u0646",
- "\u064a\u0646", # ان، هن، ين Present
- "\u062a\u0645\u0627", # تما
+ '\u062a',
+ '\u0627',
+ '\u0646',
+ '\u064a', # ت، ا، ن، ي
+ '\u0646\u0627',
+ '\u062a\u0627',
+ '\u062a\u0646', # نا، تا، تن Past
+ '\u0627\u0646',
+ '\u0648\u0646',
+ '\u064a\u0646', # ان، ون، ين Present
+ '\u062a\u0645\u0627', # تما
)
- __suffix_verb_step2b = ("\u0648\u0627", "\u062a\u0645") # وا، تم
+ __suffix_verb_step2b = ('\u0648\u0627', '\u062a\u0645') # وا، تم
- __suffix_verb_step2c = ("\u0648", "\u062a\u0645\u0648") # و # تمو
+ __suffix_verb_step2c = ('\u0648', '\u062a\u0645\u0648') # و # تمو
- __suffix_all_alef_maqsura = "\u0649" # ى
+ __suffix_all_alef_maqsura = '\u0649' # ى
# Prefixes
__prefix_step1 = (
- "\u0623", # أ
- "\u0623\u0623",
- "\u0623\u0622",
- "\u0623\u0624",
- "\u0623\u0627",
- "\u0623\u0625", # أأ، أآ، أؤ، أا، أإ
+ '\u0623', # أ
+ '\u0623\u0623',
+ '\u0623\u0622',
+ '\u0623\u0624',
+ '\u0623\u0627',
+ '\u0623\u0625', # أأ، أآ، أؤ، أا، أإ
)
- __prefix_step2a = ("\u0641\u0627\u0644", "\u0648\u0627\u0644") # فال، وال
+ __prefix_step2a = ('\u0641\u0627\u0644', '\u0648\u0627\u0644') # فال، وال
- __prefix_step2b = ("\u0641", "\u0648") # ف، و
+ __prefix_step2b = ('\u0641', '\u0648') # ف، و
__prefix_step3a_noun = (
- "\u0627\u0644",
- "\u0644\u0644", # لل، ال
- "\u0643\u0627\u0644",
- "\u0628\u0627\u0644", # بال، كال
+ '\u0627\u0644',
+ '\u0644\u0644', # لل، ال
+ '\u0643\u0627\u0644',
+ '\u0628\u0627\u0644', # بال، كال
)
__prefix_step3b_noun = (
- "\u0628",
- "\u0643",
- "\u0644", # ب، ك، ل
- "\u0628\u0628",
- "\u0643\u0643", # بب، كك
+ '\u0628',
+ '\u0643',
+ '\u0644', # ب، ك، ل
+ '\u0628\u0628',
+ '\u0643\u0643', # بب، كك
)
__prefix_step3_verb = (
- "\u0633\u064a",
- "\u0633\u062a",
- "\u0633\u0646",
- "\u0633\u0623",
+ '\u0633\u064a',
+ '\u0633\u062a',
+ '\u0633\u0646',
+ '\u0633\u0623',
) # سي، ست، سن، سأ
__prefix_step4_verb = (
- "\u064a\u0633\u062a",
- "\u0646\u0633\u062a",
- "\u062a\u0633\u062a",
+ '\u064a\u0633\u062a',
+ '\u0646\u0633\u062a',
+ '\u062a\u0633\u062a',
) # يست، نست، تست
# Suffixes added due to verb conjugation
- __conjugation_suffix_verb_1 = ("\u0647", "\u0643") # ه، ك
+ __conjugation_suffix_verb_1 = ('\u0647', '\u0643') # ه، ك
__conjugation_suffix_verb_2 = (
- "\u0646\u064a",
- "\u0646\u0627",
- "\u0647\u0627", # ني، نا، ها
- "\u0647\u0645",
- "\u0647\u0646",
- "\u0643\u0645", # هم، هن، كم
- "\u0643\u0646", # كن
+ '\u0646\u064a',
+ '\u0646\u0627',
+ '\u0647\u0627', # ني، نا، ها
+ '\u0647\u0645',
+ '\u0647\u0646',
+ '\u0643\u0645', # هم، هن، كم
+ '\u0643\u0646', # كن
)
__conjugation_suffix_verb_3 = (
- "\u0647\u0645\u0627",
- "\u0643\u0645\u0627",
- "\u0643\u0645\u0648",
+ '\u0647\u0645\u0627',
+ '\u0643\u0645\u0627',
+ '\u0643\u0645\u0648',
) # هما، كما، كمو
- __conjugation_suffix_verb_4 = ("\u0627", "\u0646", "\u064a") # ا، ن، ي
+ __conjugation_suffix_verb_4 = ('\u0627', '\u0646', '\u064a') # ا، ن، ي
__conjugation_suffix_verb_past = (
- "\u0646\u0627",
- "\u062a\u0627",
- "\u062a\u0646",
+ '\u0646\u0627',
+ '\u062a\u0627',
+ '\u062a\u0646',
) # نا، تا، تن
__conjugation_suffix_verb_present = (
- "\u0627\u0646",
- "\u0648\u0646",
- "\u064a\u0646",
+ '\u0627\u0646',
+ '\u0648\u0646',
+ '\u064a\u0646',
) # ان، ون، ين
# Suffixes added due to noun derivation
- __conjugation_suffix_noun_1 = ("\u064a", "\u0643", "\u0647") # ي، ك، ه
+ __conjugation_suffix_noun_1 = ('\u064a', '\u0643', '\u0647') # ي، ك، ه
__conjugation_suffix_noun_2 = (
- "\u0646\u0627",
- "\u0643\u0645", # نا، كم
- "\u0647\u0627",
- "\u0647\u0646",
- "\u0647\u0645", # ها، هن، هم
+ '\u0646\u0627',
+ '\u0643\u0645', # نا، كم
+ '\u0647\u0627',
+ '\u0647\u0646',
+ '\u0647\u0645', # ها، هن، هم
)
__conjugation_suffix_noun_3 = (
- "\u0643\u0645\u0627",
- "\u0647\u0645\u0627",
+ '\u0643\u0645\u0627',
+ '\u0647\u0645\u0627',
) # كما، هما
# Prefixes added due to noun derivation
- __prefixes1 = ("\u0648\u0627", "\u0641\u0627") # فا، وا
+ __prefixes1 = ('\u0648\u0627', '\u0641\u0627') # فا، وا
- __articles_3len = ("\u0643\u0627\u0644", "\u0628\u0627\u0644") # بال كال
+ __articles_3len = ('\u0643\u0627\u0644', '\u0628\u0627\u0644') # بال كال
- __articles_2len = ("\u0627\u0644", "\u0644\u0644") # ال لل
+ __articles_2len = ('\u0627\u0644', '\u0644\u0644') # ال لل
# Prepositions letters
- __prepositions1 = ("\u0643", "\u0644") # ك، ل
- __prepositions2 = ("\u0628\u0628", "\u0643\u0643") # بب، كك
+ __prepositions1 = ('\u0643', '\u0644') # ك، ل
+ __prepositions2 = ('\u0628\u0628', '\u0643\u0643') # بب، كك
is_verb = True
is_noun = True
:return: normalized token type string
"""
# strip diacritics
- token = self.__vocalization.sub("", token)
+ token = self.__vocalization.sub('', token)
# strip kasheeda
- token = self.__kasheeda.sub("", token)
+ token = self.__kasheeda.sub('', token)
# strip punctuation marks
- token = self.__arabic_punctuation_marks.sub("", token)
+ token = self.__arabic_punctuation_marks.sub('', token)
return token
def __normalize_post(self, token):
# normalize last hamza
for hamza in self.__last_hamzat:
if token.endswith(hamza):
- token = suffix_replace(token, hamza, "\u0621")
+ token = suffix_replace(token, hamza, '\u0621')
break
# normalize other hamzat
- token = self.__initial_hamzat.sub("\u0627", token)
- token = self.__waw_hamza.sub("\u0648", token)
- token = self.__yeh_hamza.sub("\u064a", token)
- token = self.__alefat.sub("\u0627", token)
+ token = self.__initial_hamzat.sub('\u0627', token)
+ token = self.__waw_hamza.sub('\u0648', token)
+ token = self.__yeh_hamza.sub('\u064a', token)
+ token = self.__alefat.sub('\u0627', token)
return token
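# --- Editor's illustrative sketch (not part of the diff above) ---
# The Arabic Snowball stemmer normalises hamza/alef variants and strips
# diacritics as above before removing affixes; it is reached through the
# generic SnowballStemmer front end. The sample word and output are
# illustrative only.
from nltk.stem.snowball import SnowballStemmer

arabic = SnowballStemmer('arabic')
print(arabic.stem('الكتابة'))  # article and ta-marbuta removed, e.g. 'كتاب'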
def __checks_1(self, token):
def __checks_2(self, token):
for suffix in self.__checks2:
if token.endswith(suffix):
- if suffix == "\u0629" and len(token) > 2:
+ if suffix == '\u0629' and len(token) > 2:
self.is_noun = True
self.is_verb = False
break
- if suffix == "\u0627\u062a" and len(token) > 3:
+ if suffix == '\u0627\u062a' and len(token) > 3:
self.is_noun = True
self.is_verb = False
break
def __Suffix_Verb_Step2a(self, token):
for suffix in self.__suffix_verb_step2a:
if token.endswith(suffix) and len(token) > 3:
- if suffix == "\u062a" and len(token) >= 4:
+ if suffix == '\u062a' and len(token) >= 4:
token = token[:-1]
self.suffix_verb_step2a_success = True
break
self.suffix_verb_step2a_success = True
break
- if suffix == "\u062a\u0645\u0627" and len(token) >= 6:
+ if suffix == '\u062a\u0645\u0627' and len(token) >= 6:
token = token[:-3]
self.suffix_verb_step2a_success = True
break
def __Suffix_Verb_Step2c(self, token):
for suffix in self.__suffix_verb_step2c:
if token.endswith(suffix):
- if suffix == "\u062a\u0645\u0648" and len(token) >= 6:
+ if suffix == '\u062a\u0645\u0648' and len(token) >= 6:
token = token[:-3]
break
- if suffix == "\u0648" and len(token) >= 4:
+ if suffix == '\u0648' and len(token) >= 4:
token = token[:-1]
break
return token
def __Suffix_All_alef_maqsura(self, token):
for suffix in self.__suffix_all_alef_maqsura:
if token.endswith(suffix):
- token = suffix_replace(token, suffix, "\u064a")
+ token = suffix_replace(token, suffix, '\u064a')
return token
def __Prefix_Step1(self, token):
for prefix in self.__prefix_step1:
if token.startswith(prefix) and len(token) > 3:
- if prefix == "\u0623\u0623":
- token = prefix_replace(token, prefix, "\u0623")
+ if prefix == '\u0623\u0623':
+ token = prefix_replace(token, prefix, '\u0623')
break
- elif prefix == "\u0623\u0622":
- token = prefix_replace(token, prefix, "\u0622")
+ elif prefix == '\u0623\u0622':
+ token = prefix_replace(token, prefix, '\u0622')
break
- elif prefix == "\u0623\u0624":
- token = prefix_replace(token, prefix, "\u0624")
+ elif prefix == '\u0623\u0624':
+ token = prefix_replace(token, prefix, '\u0624')
break
- elif prefix == "\u0623\u0627":
- token = prefix_replace(token, prefix, "\u0627")
+ elif prefix == '\u0623\u0627':
+ token = prefix_replace(token, prefix, '\u0627')
break
- elif prefix == "\u0623\u0625":
- token = prefix_replace(token, prefix, "\u0625")
+ elif prefix == '\u0623\u0625':
+ token = prefix_replace(token, prefix, '\u0625')
break
return token
for prefix in self.__prefix_step3b_noun:
if token.startswith(prefix):
if len(token) > 3:
- if prefix == "\u0628":
+ if prefix == '\u0628':
token = token[len(prefix) :]
self.prefix_step3b_noun_success = True
break
def __Prefix_Step4_Verb(self, token):
for prefix in self.__prefix_step4_verb:
if token.startswith(prefix) and len(token) > 4:
- token = prefix_replace(token, prefix, "\u0627\u0633\u062a")
+ token = prefix_replace(token, prefix, '\u0627\u0633\u062a')
self.is_verb = True
self.is_noun = False
break
# contains at least 3 letters.
for i in range(1, len(word)):
if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
- if 3 > len(word[: i + 1]) > 0:
+ if len(word[: i + 1]) < 3 and len(word[: i + 1]) > 0:
r1 = word[3:]
elif len(word[: i + 1]) == 0:
return word
__step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s")
__step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed")
__step2_suffixes = (
- "ization",
- "ational",
- "fulness",
- "ousness",
- "iveness",
- "tional",
- "biliti",
- "lessli",
- "entli",
- "ation",
- "alism",
- "aliti",
- "ousli",
- "iviti",
- "fulli",
- "enci",
- "anci",
- "abli",
- "izer",
- "ator",
- "alli",
- "bli",
- "ogi",
- "li",
+ 'ization',
+ 'ational',
+ 'fulness',
+ 'ousness',
+ 'iveness',
+ 'tional',
+ 'biliti',
+ 'lessli',
+ 'entli',
+ 'ation',
+ 'alism',
+ 'aliti',
+ 'ousli',
+ 'iviti',
+ 'fulli',
+ 'enci',
+ 'anci',
+ 'abli',
+ 'izer',
+ 'ator',
+ 'alli',
+ 'bli',
+ 'ogi',
+ 'li',
)
__step3_suffixes = (
- "ational",
- "tional",
- "alize",
- "icate",
- "iciti",
- "ative",
- "ical",
- "ness",
- "ful",
+ 'ational',
+ 'tional',
+ 'alize',
+ 'icate',
+ 'iciti',
+ 'ative',
+ 'ical',
+ 'ness',
+ 'ful',
)
__step4_suffixes = (
- "ement",
- "ance",
- "ence",
- "able",
- "ible",
- "ment",
- "ant",
- "ent",
- "ism",
- "ate",
- "iti",
- "ous",
- "ive",
- "ize",
- "ion",
- "al",
- "er",
- "ic",
+ 'ement',
+ 'ance',
+ 'ence',
+ 'able',
+ 'ible',
+ 'ment',
+ 'ant',
+ 'ent',
+ 'ism',
+ 'ate',
+ 'iti',
+ 'ous',
+ 'ive',
+ 'ize',
+ 'ion',
+ 'al',
+ 'er',
+ 'ic',
)
__step5_suffixes = ("e", "l")
__special_words = {
"zz",
)
__step1_suffixes = (
- "kaan",
- "k\xE4\xE4n",
- "sti",
- "kin",
- "han",
- "h\xE4n",
- "ko",
- "k\xF6",
- "pa",
- "p\xE4",
+ 'kaan',
+ 'k\xE4\xE4n',
+ 'sti',
+ 'kin',
+ 'han',
+ 'h\xE4n',
+ 'ko',
+ 'k\xF6',
+ 'pa',
+ 'p\xE4',
)
- __step2_suffixes = ("nsa", "ns\xE4", "mme", "nne", "si", "ni", "an", "\xE4n", "en")
+ __step2_suffixes = ('nsa', 'ns\xE4', 'mme', 'nne', 'si', 'ni', 'an', '\xE4n', 'en')
__step3_suffixes = (
- "siin",
- "tten",
- "seen",
- "han",
- "hen",
- "hin",
- "hon",
- "h\xE4n",
- "h\xF6n",
- "den",
- "tta",
- "tt\xE4",
- "ssa",
- "ss\xE4",
- "sta",
- "st\xE4",
- "lla",
- "ll\xE4",
- "lta",
- "lt\xE4",
- "lle",
- "ksi",
- "ine",
- "ta",
- "t\xE4",
- "na",
- "n\xE4",
- "a",
- "\xE4",
- "n",
+ 'siin',
+ 'tten',
+ 'seen',
+ 'han',
+ 'hen',
+ 'hin',
+ 'hon',
+ 'h\xE4n',
+ 'h\xF6n',
+ 'den',
+ 'tta',
+ 'tt\xE4',
+ 'ssa',
+ 'ss\xE4',
+ 'sta',
+ 'st\xE4',
+ 'lla',
+ 'll\xE4',
+ 'lta',
+ 'lt\xE4',
+ 'lle',
+ 'ksi',
+ 'ine',
+ 'ta',
+ 't\xE4',
+ 'na',
+ 'n\xE4',
+ 'a',
+ '\xE4',
+ 'n',
)
__step4_suffixes = (
- "impi",
- "impa",
- "imp\xE4",
- "immi",
- "imma",
- "imm\xE4",
- "mpi",
- "mpa",
- "mp\xE4",
- "mmi",
- "mma",
- "mm\xE4",
- "eja",
- "ej\xE4",
+ 'impi',
+ 'impa',
+ 'imp\xE4',
+ 'immi',
+ 'imma',
+ 'imm\xE4',
+ 'mpi',
+ 'mpa',
+ 'mp\xE4',
+ 'mmi',
+ 'mma',
+ 'mm\xE4',
+ 'eja',
+ 'ej\xE4',
)
def stem(self, word):
__vowels = "aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9"
__step1_suffixes = (
- "issements",
- "issement",
- "atrices",
- "atrice",
- "ateurs",
- "ations",
- "logies",
- "usions",
- "utions",
- "ements",
- "amment",
- "emment",
- "ances",
- "iqUes",
- "ismes",
- "ables",
- "istes",
- "ateur",
- "ation",
- "logie",
- "usion",
- "ution",
- "ences",
- "ement",
- "euses",
- "ments",
- "ance",
- "iqUe",
- "isme",
- "able",
- "iste",
- "ence",
- "it\xE9s",
- "ives",
- "eaux",
- "euse",
- "ment",
- "eux",
- "it\xE9",
- "ive",
- "ifs",
- "aux",
- "if",
+ 'issements',
+ 'issement',
+ 'atrices',
+ 'atrice',
+ 'ateurs',
+ 'ations',
+ 'logies',
+ 'usions',
+ 'utions',
+ 'ements',
+ 'amment',
+ 'emment',
+ 'ances',
+ 'iqUes',
+ 'ismes',
+ 'ables',
+ 'istes',
+ 'ateur',
+ 'ation',
+ 'logie',
+ 'usion',
+ 'ution',
+ 'ences',
+ 'ement',
+ 'euses',
+ 'ments',
+ 'ance',
+ 'iqUe',
+ 'isme',
+ 'able',
+ 'iste',
+ 'ence',
+ 'it\xE9s',
+ 'ives',
+ 'eaux',
+ 'euse',
+ 'ment',
+ 'eux',
+ 'it\xE9',
+ 'ive',
+ 'ifs',
+ 'aux',
+ 'if',
)
__step2a_suffixes = (
- "issaIent",
- "issantes",
- "iraIent",
- "issante",
- "issants",
- "issions",
- "irions",
- "issais",
- "issait",
- "issant",
- "issent",
- "issiez",
- "issons",
- "irais",
- "irait",
- "irent",
- "iriez",
- "irons",
- "iront",
- "isses",
- "issez",
- "\xEEmes",
- "\xEEtes",
- "irai",
- "iras",
- "irez",
- "isse",
- "ies",
- "ira",
- "\xEEt",
- "ie",
- "ir",
- "is",
- "it",
- "i",
+ 'issaIent',
+ 'issantes',
+ 'iraIent',
+ 'issante',
+ 'issants',
+ 'issions',
+ 'irions',
+ 'issais',
+ 'issait',
+ 'issant',
+ 'issent',
+ 'issiez',
+ 'issons',
+ 'irais',
+ 'irait',
+ 'irent',
+ 'iriez',
+ 'irons',
+ 'iront',
+ 'isses',
+ 'issez',
+ '\xEEmes',
+ '\xEEtes',
+ 'irai',
+ 'iras',
+ 'irez',
+ 'isse',
+ 'ies',
+ 'ira',
+ '\xEEt',
+ 'ie',
+ 'ir',
+ 'is',
+ 'it',
+ 'i',
)
__step2b_suffixes = (
- "eraIent",
- "assions",
- "erions",
- "assent",
- "assiez",
- "\xE8rent",
- "erais",
- "erait",
- "eriez",
- "erons",
- "eront",
- "aIent",
- "antes",
- "asses",
- "ions",
- "erai",
- "eras",
- "erez",
- "\xE2mes",
- "\xE2tes",
- "ante",
- "ants",
- "asse",
- "\xE9es",
- "era",
- "iez",
- "ais",
- "ait",
- "ant",
- "\xE9e",
- "\xE9s",
- "er",
- "ez",
- "\xE2t",
- "ai",
- "as",
- "\xE9",
- "a",
+ 'eraIent',
+ 'assions',
+ 'erions',
+ 'assent',
+ 'assiez',
+ '\xE8rent',
+ 'erais',
+ 'erait',
+ 'eriez',
+ 'erons',
+ 'eront',
+ 'aIent',
+ 'antes',
+ 'asses',
+ 'ions',
+ 'erai',
+ 'eras',
+ 'erez',
+ '\xE2mes',
+ '\xE2tes',
+ 'ante',
+ 'ants',
+ 'asse',
+ '\xE9es',
+ 'era',
+ 'iez',
+ 'ais',
+ 'ait',
+ 'ant',
+ '\xE9e',
+ '\xE9s',
+ 'er',
+ 'ez',
+ '\xE2t',
+ 'ai',
+ 'as',
+ '\xE9',
+ 'a',
)
- __step4_suffixes = ("i\xE8re", "I\xE8re", "ion", "ier", "Ier", "e", "\xEB")
+ __step4_suffixes = ('i\xE8re', 'I\xE8re', 'ion', 'ier', 'Ier', 'e', '\xEB')
def stem(self, word):
"""
step2b_success = True
elif suffix in (
- "eraIent",
- "erions",
- "\xE8rent",
- "erais",
- "erait",
- "eriez",
- "erons",
- "eront",
- "erai",
- "eras",
- "erez",
- "\xE9es",
- "era",
- "iez",
- "\xE9e",
- "\xE9s",
- "er",
- "ez",
- "\xE9",
+ 'eraIent',
+ 'erions',
+ '\xE8rent',
+ 'erais',
+ 'erait',
+ 'eriez',
+ 'erons',
+ 'eront',
+ 'erai',
+ 'eras',
+ 'erez',
+ '\xE9es',
+ 'era',
+ 'iez',
+ '\xE9e',
+ '\xE9s',
+ 'er',
+ 'ez',
+ '\xE9',
):
word = word[: -len(suffix)]
step2b_success = True
elif suffix in (
- "assions",
- "assent",
- "assiez",
- "aIent",
- "antes",
- "asses",
- "\xE2mes",
- "\xE2tes",
- "ante",
- "ants",
- "asse",
- "ais",
- "ait",
- "ant",
- "\xE2t",
- "ai",
- "as",
- "a",
+ 'assions',
+ 'assent',
+ 'assiez',
+ 'aIent',
+ 'antes',
+ 'asses',
+ '\xE2mes',
+ '\xE2tes',
+ 'ante',
+ 'ants',
+ 'asse',
+ 'ais',
+ 'ait',
+ 'ant',
+ '\xE2t',
+ 'ai',
+ 'as',
+ 'a',
):
word = word[: -len(suffix)]
rv = rv[: -len(suffix)]
# contains at least 3 letters.
for i in range(1, len(word)):
if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
- if 3 > len(word[: i + 1]) > 0:
+ if len(word[: i + 1]) < 3 and len(word[: i + 1]) > 0:
r1 = word[3:]
elif len(word[: i + 1]) == 0:
return word
__step1_suffixes = ("al", "el")
__step2_suffixes = (
- "k\xE9ppen",
- "onk\xE9nt",
- "enk\xE9nt",
- "ank\xE9nt",
- "k\xE9pp",
- "k\xE9nt",
- "ban",
- "ben",
- "nak",
- "nek",
- "val",
- "vel",
- "t\xF3l",
- "t\xF5l",
- "r\xF3l",
- "r\xF5l",
- "b\xF3l",
- "b\xF5l",
- "hoz",
- "hez",
- "h\xF6z",
- "n\xE1l",
- "n\xE9l",
- "\xE9rt",
- "kor",
- "ba",
- "be",
- "ra",
- "re",
- "ig",
- "at",
- "et",
- "ot",
- "\xF6t",
- "ul",
- "\xFCl",
- "v\xE1",
- "v\xE9",
- "en",
- "on",
- "an",
- "\xF6n",
- "n",
- "t",
+ 'k\xE9ppen',
+ 'onk\xE9nt',
+ 'enk\xE9nt',
+ 'ank\xE9nt',
+ 'k\xE9pp',
+ 'k\xE9nt',
+ 'ban',
+ 'ben',
+ 'nak',
+ 'nek',
+ 'val',
+ 'vel',
+ 't\xF3l',
+ 't\xF5l',
+ 'r\xF3l',
+ 'r\xF5l',
+ 'b\xF3l',
+ 'b\xF5l',
+ 'hoz',
+ 'hez',
+ 'h\xF6z',
+ 'n\xE1l',
+ 'n\xE9l',
+ '\xE9rt',
+ 'kor',
+ 'ba',
+ 'be',
+ 'ra',
+ 're',
+ 'ig',
+ 'at',
+ 'et',
+ 'ot',
+ '\xF6t',
+ 'ul',
+ '\xFCl',
+ 'v\xE1',
+ 'v\xE9',
+ 'en',
+ 'on',
+ 'an',
+ '\xF6n',
+ 'n',
+ 't',
)
__step3_suffixes = ("\xE1nk\xE9nt", "\xE1n", "\xE9n")
__step4_suffixes = (
- "astul",
- "est\xFCl",
- "\xE1stul",
- "\xE9st\xFCl",
- "stul",
- "st\xFCl",
+ 'astul',
+ 'est\xFCl',
+ '\xE1stul',
+ '\xE9st\xFCl',
+ 'stul',
+ 'st\xFCl',
)
__step5_suffixes = ("\xE1", "\xE9")
__step6_suffixes = (
- "ok\xE9",
- "\xF6k\xE9",
- "ak\xE9",
- "ek\xE9",
- "\xE1k\xE9",
- "\xE1\xE9i",
- "\xE9k\xE9",
- "\xE9\xE9i",
- "k\xE9",
- "\xE9i",
- "\xE9\xE9",
- "\xE9",
+ 'ok\xE9',
+ '\xF6k\xE9',
+ 'ak\xE9',
+ 'ek\xE9',
+ '\xE1k\xE9',
+ '\xE1\xE9i',
+ '\xE9k\xE9',
+ '\xE9\xE9i',
+ 'k\xE9',
+ '\xE9i',
+ '\xE9\xE9',
+ '\xE9',
)
__step7_suffixes = (
- "\xE1juk",
- "\xE9j\xFCk",
- "\xFCnk",
- "unk",
- "juk",
- "j\xFCk",
- "\xE1nk",
- "\xE9nk",
- "nk",
- "uk",
- "\xFCk",
- "em",
- "om",
- "am",
- "od",
- "ed",
- "ad",
- "\xF6d",
- "ja",
- "je",
- "\xE1m",
- "\xE1d",
- "\xE9m",
- "\xE9d",
- "m",
- "d",
- "a",
- "e",
- "o",
- "\xE1",
- "\xE9",
+ '\xE1juk',
+ '\xE9j\xFCk',
+ '\xFCnk',
+ 'unk',
+ 'juk',
+ 'j\xFCk',
+ '\xE1nk',
+ '\xE9nk',
+ 'nk',
+ 'uk',
+ '\xFCk',
+ 'em',
+ 'om',
+ 'am',
+ 'od',
+ 'ed',
+ 'ad',
+ '\xF6d',
+ 'ja',
+ 'je',
+ '\xE1m',
+ '\xE1d',
+ '\xE9m',
+ '\xE9d',
+ 'm',
+ 'd',
+ 'a',
+ 'e',
+ 'o',
+ '\xE1',
+ '\xE9',
)
__step8_suffixes = (
- "jaitok",
- "jeitek",
- "jaink",
- "jeink",
- "aitok",
- "eitek",
- "\xE1itok",
- "\xE9itek",
- "jaim",
- "jeim",
- "jaid",
- "jeid",
- "eink",
- "aink",
- "itek",
- "jeik",
- "jaik",
- "\xE1ink",
- "\xE9ink",
- "aim",
- "eim",
- "aid",
- "eid",
- "jai",
- "jei",
- "ink",
- "aik",
- "eik",
- "\xE1im",
- "\xE1id",
- "\xE1ik",
- "\xE9im",
- "\xE9id",
- "\xE9ik",
- "im",
- "id",
- "ai",
- "ei",
- "ik",
- "\xE1i",
- "\xE9i",
- "i",
+ 'jaitok',
+ 'jeitek',
+ 'jaink',
+ 'jeink',
+ 'aitok',
+ 'eitek',
+ '\xE1itok',
+ '\xE9itek',
+ 'jaim',
+ 'jeim',
+ 'jaid',
+ 'jeid',
+ 'eink',
+ 'aink',
+ 'itek',
+ 'jeik',
+ 'jaik',
+ '\xE1ink',
+ '\xE9ink',
+ 'aim',
+ 'eim',
+ 'aid',
+ 'eid',
+ 'jai',
+ 'jei',
+ 'ink',
+ 'aik',
+ 'eik',
+ '\xE1im',
+ '\xE1id',
+ '\xE1ik',
+ '\xE9im',
+ '\xE9id',
+ '\xE9ik',
+ 'im',
+ 'id',
+ 'ai',
+ 'ei',
+ 'ik',
+ '\xE1i',
+ '\xE9i',
+ 'i',
)
__step9_suffixes = ("\xE1k", "\xE9k", "\xF6k", "ok", "ek", "ak", "k")
__vowels = "aeiou\xE0\xE8\xEC\xF2\xF9"
__step0_suffixes = (
- "gliela",
- "gliele",
- "glieli",
- "glielo",
- "gliene",
- "sene",
- "mela",
- "mele",
- "meli",
- "melo",
- "mene",
- "tela",
- "tele",
- "teli",
- "telo",
- "tene",
- "cela",
- "cele",
- "celi",
- "celo",
- "cene",
- "vela",
- "vele",
- "veli",
- "velo",
- "vene",
- "gli",
- "ci",
- "la",
- "le",
- "li",
- "lo",
- "mi",
- "ne",
- "si",
- "ti",
- "vi",
+ 'gliela',
+ 'gliele',
+ 'glieli',
+ 'glielo',
+ 'gliene',
+ 'sene',
+ 'mela',
+ 'mele',
+ 'meli',
+ 'melo',
+ 'mene',
+ 'tela',
+ 'tele',
+ 'teli',
+ 'telo',
+ 'tene',
+ 'cela',
+ 'cele',
+ 'celi',
+ 'celo',
+ 'cene',
+ 'vela',
+ 'vele',
+ 'veli',
+ 'velo',
+ 'vene',
+ 'gli',
+ 'ci',
+ 'la',
+ 'le',
+ 'li',
+ 'lo',
+ 'mi',
+ 'ne',
+ 'si',
+ 'ti',
+ 'vi',
)
__step1_suffixes = (
- "atrice",
- "atrici",
- "azione",
- "azioni",
- "uzione",
- "uzioni",
- "usione",
- "usioni",
- "amento",
- "amenti",
- "imento",
- "imenti",
- "amente",
- "abile",
- "abili",
- "ibile",
- "ibili",
- "mente",
- "atore",
- "atori",
- "logia",
- "logie",
- "anza",
- "anze",
- "iche",
- "ichi",
- "ismo",
- "ismi",
- "ista",
- "iste",
- "isti",
- "ist\xE0",
- "ist\xE8",
- "ist\xEC",
- "ante",
- "anti",
- "enza",
- "enze",
- "ico",
- "ici",
- "ica",
- "ice",
- "oso",
- "osi",
- "osa",
- "ose",
- "it\xE0",
- "ivo",
- "ivi",
- "iva",
- "ive",
+ 'atrice',
+ 'atrici',
+ 'azione',
+ 'azioni',
+ 'uzione',
+ 'uzioni',
+ 'usione',
+ 'usioni',
+ 'amento',
+ 'amenti',
+ 'imento',
+ 'imenti',
+ 'amente',
+ 'abile',
+ 'abili',
+ 'ibile',
+ 'ibili',
+ 'mente',
+ 'atore',
+ 'atori',
+ 'logia',
+ 'logie',
+ 'anza',
+ 'anze',
+ 'iche',
+ 'ichi',
+ 'ismo',
+ 'ismi',
+ 'ista',
+ 'iste',
+ 'isti',
+ 'ist\xE0',
+ 'ist\xE8',
+ 'ist\xEC',
+ 'ante',
+ 'anti',
+ 'enza',
+ 'enze',
+ 'ico',
+ 'ici',
+ 'ica',
+ 'ice',
+ 'oso',
+ 'osi',
+ 'osa',
+ 'ose',
+ 'it\xE0',
+ 'ivo',
+ 'ivi',
+ 'iva',
+ 'ive',
)
__step2_suffixes = (
- "erebbero",
- "irebbero",
- "assero",
- "assimo",
- "eranno",
- "erebbe",
- "eremmo",
- "ereste",
- "eresti",
- "essero",
- "iranno",
- "irebbe",
- "iremmo",
- "ireste",
- "iresti",
- "iscano",
- "iscono",
- "issero",
- "arono",
- "avamo",
- "avano",
- "avate",
- "eremo",
- "erete",
- "erono",
- "evamo",
- "evano",
- "evate",
- "iremo",
- "irete",
- "irono",
- "ivamo",
- "ivano",
- "ivate",
- "ammo",
- "ando",
- "asse",
- "assi",
- "emmo",
- "enda",
- "ende",
- "endi",
- "endo",
- "erai",
- "erei",
- "Yamo",
- "iamo",
- "immo",
- "irai",
- "irei",
- "isca",
- "isce",
- "isci",
- "isco",
- "ano",
- "are",
- "ata",
- "ate",
- "ati",
- "ato",
- "ava",
- "avi",
- "avo",
- "er\xE0",
- "ere",
- "er\xF2",
- "ete",
- "eva",
- "evi",
- "evo",
- "ir\xE0",
- "ire",
- "ir\xF2",
- "ita",
- "ite",
- "iti",
- "ito",
- "iva",
- "ivi",
- "ivo",
- "ono",
- "uta",
- "ute",
- "uti",
- "uto",
- "ar",
- "ir",
+ 'erebbero',
+ 'irebbero',
+ 'assero',
+ 'assimo',
+ 'eranno',
+ 'erebbe',
+ 'eremmo',
+ 'ereste',
+ 'eresti',
+ 'essero',
+ 'iranno',
+ 'irebbe',
+ 'iremmo',
+ 'ireste',
+ 'iresti',
+ 'iscano',
+ 'iscono',
+ 'issero',
+ 'arono',
+ 'avamo',
+ 'avano',
+ 'avate',
+ 'eremo',
+ 'erete',
+ 'erono',
+ 'evamo',
+ 'evano',
+ 'evate',
+ 'iremo',
+ 'irete',
+ 'irono',
+ 'ivamo',
+ 'ivano',
+ 'ivate',
+ 'ammo',
+ 'ando',
+ 'asse',
+ 'assi',
+ 'emmo',
+ 'enda',
+ 'ende',
+ 'endi',
+ 'endo',
+ 'erai',
+ 'erei',
+ 'Yamo',
+ 'iamo',
+ 'immo',
+ 'irai',
+ 'irei',
+ 'isca',
+ 'isce',
+ 'isci',
+ 'isco',
+ 'ano',
+ 'are',
+ 'ata',
+ 'ate',
+ 'ati',
+ 'ato',
+ 'ava',
+ 'avi',
+ 'avo',
+ 'er\xE0',
+ 'ere',
+ 'er\xF2',
+ 'ete',
+ 'eva',
+ 'evi',
+ 'evo',
+ 'ir\xE0',
+ 'ire',
+ 'ir\xF2',
+ 'ita',
+ 'ite',
+ 'iti',
+ 'ito',
+ 'iva',
+ 'ivi',
+ 'ivo',
+ 'ono',
+ 'uta',
+ 'ute',
+ 'uti',
+ 'uto',
+ 'ar',
+ 'ir',
)
def stem(self, word):
__vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4"
__step1_suffixes = (
- "amentos",
- "imentos",
- "uço~es",
- "amento",
- "imento",
- "adoras",
- "adores",
- "a\xE7o~es",
- "logias",
- "\xEAncias",
- "amente",
- "idades",
- "an\xE7as",
- "ismos",
- "istas",
- "adora",
- "a\xE7a~o",
- "antes",
- "\xE2ncia",
- "logia",
- "uça~o",
- "\xEAncia",
- "mente",
- "idade",
- "an\xE7a",
- "ezas",
- "icos",
- "icas",
- "ismo",
- "\xE1vel",
- "\xEDvel",
- "ista",
- "osos",
- "osas",
- "ador",
- "ante",
- "ivas",
- "ivos",
- "iras",
- "eza",
- "ico",
- "ica",
- "oso",
- "osa",
- "iva",
- "ivo",
- "ira",
+ 'amentos',
+ 'imentos',
+ 'uço~es',
+ 'amento',
+ 'imento',
+ 'adoras',
+ 'adores',
+ 'a\xE7o~es',
+ 'logias',
+ '\xEAncias',
+ 'amente',
+ 'idades',
+ 'an\xE7as',
+ 'ismos',
+ 'istas',
+ 'adora',
+ 'a\xE7a~o',
+ 'antes',
+ '\xE2ncia',
+ 'logia',
+ 'uça~o',
+ '\xEAncia',
+ 'mente',
+ 'idade',
+ 'an\xE7a',
+ 'ezas',
+ 'icos',
+ 'icas',
+ 'ismo',
+ '\xE1vel',
+ '\xEDvel',
+ 'ista',
+ 'osos',
+ 'osas',
+ 'ador',
+ 'ante',
+ 'ivas',
+ 'ivos',
+ 'iras',
+ 'eza',
+ 'ico',
+ 'ica',
+ 'oso',
+ 'osa',
+ 'iva',
+ 'ivo',
+ 'ira',
)
__step2_suffixes = (
- "ar\xEDamos",
- "er\xEDamos",
- "ir\xEDamos",
- "\xE1ssemos",
- "\xEAssemos",
- "\xEDssemos",
- "ar\xEDeis",
- "er\xEDeis",
- "ir\xEDeis",
- "\xE1sseis",
- "\xE9sseis",
- "\xEDsseis",
- "\xE1ramos",
- "\xE9ramos",
- "\xEDramos",
- "\xE1vamos",
- "aremos",
- "eremos",
- "iremos",
- "ariam",
- "eriam",
- "iriam",
- "assem",
- "essem",
- "issem",
- "ara~o",
- "era~o",
- "ira~o",
- "arias",
- "erias",
- "irias",
- "ardes",
- "erdes",
- "irdes",
- "asses",
- "esses",
- "isses",
- "astes",
- "estes",
- "istes",
- "\xE1reis",
- "areis",
- "\xE9reis",
- "ereis",
- "\xEDreis",
- "ireis",
- "\xE1veis",
- "\xEDamos",
- "armos",
- "ermos",
- "irmos",
- "aria",
- "eria",
- "iria",
- "asse",
- "esse",
- "isse",
- "aste",
- "este",
- "iste",
- "arei",
- "erei",
- "irei",
- "aram",
- "eram",
- "iram",
- "avam",
- "arem",
- "erem",
- "irem",
- "ando",
- "endo",
- "indo",
- "adas",
- "idas",
- "ar\xE1s",
- "aras",
- "er\xE1s",
- "eras",
- "ir\xE1s",
- "avas",
- "ares",
- "eres",
- "ires",
- "\xEDeis",
- "ados",
- "idos",
- "\xE1mos",
- "amos",
- "emos",
- "imos",
- "iras",
- "ada",
- "ida",
- "ar\xE1",
- "ara",
- "er\xE1",
- "era",
- "ir\xE1",
- "ava",
- "iam",
- "ado",
- "ido",
- "ias",
- "ais",
- "eis",
- "ira",
- "ia",
- "ei",
- "am",
- "em",
- "ar",
- "er",
- "ir",
- "as",
- "es",
- "is",
- "eu",
- "iu",
- "ou",
+ 'ar\xEDamos',
+ 'er\xEDamos',
+ 'ir\xEDamos',
+ '\xE1ssemos',
+ '\xEAssemos',
+ '\xEDssemos',
+ 'ar\xEDeis',
+ 'er\xEDeis',
+ 'ir\xEDeis',
+ '\xE1sseis',
+ '\xE9sseis',
+ '\xEDsseis',
+ '\xE1ramos',
+ '\xE9ramos',
+ '\xEDramos',
+ '\xE1vamos',
+ 'aremos',
+ 'eremos',
+ 'iremos',
+ 'ariam',
+ 'eriam',
+ 'iriam',
+ 'assem',
+ 'essem',
+ 'issem',
+ 'ara~o',
+ 'era~o',
+ 'ira~o',
+ 'arias',
+ 'erias',
+ 'irias',
+ 'ardes',
+ 'erdes',
+ 'irdes',
+ 'asses',
+ 'esses',
+ 'isses',
+ 'astes',
+ 'estes',
+ 'istes',
+ '\xE1reis',
+ 'areis',
+ '\xE9reis',
+ 'ereis',
+ '\xEDreis',
+ 'ireis',
+ '\xE1veis',
+ '\xEDamos',
+ 'armos',
+ 'ermos',
+ 'irmos',
+ 'aria',
+ 'eria',
+ 'iria',
+ 'asse',
+ 'esse',
+ 'isse',
+ 'aste',
+ 'este',
+ 'iste',
+ 'arei',
+ 'erei',
+ 'irei',
+ 'aram',
+ 'eram',
+ 'iram',
+ 'avam',
+ 'arem',
+ 'erem',
+ 'irem',
+ 'ando',
+ 'endo',
+ 'indo',
+ 'adas',
+ 'idas',
+ 'ar\xE1s',
+ 'aras',
+ 'er\xE1s',
+ 'eras',
+ 'ir\xE1s',
+ 'avas',
+ 'ares',
+ 'eres',
+ 'ires',
+ '\xEDeis',
+ 'ados',
+ 'idos',
+ '\xE1mos',
+ 'amos',
+ 'emos',
+ 'imos',
+ 'iras',
+ 'ada',
+ 'ida',
+ 'ar\xE1',
+ 'ara',
+ 'er\xE1',
+ 'era',
+ 'ir\xE1',
+ 'ava',
+ 'iam',
+ 'ado',
+ 'ido',
+ 'ias',
+ 'ais',
+ 'eis',
+ 'ira',
+ 'ia',
+ 'ei',
+ 'am',
+ 'em',
+ 'ar',
+ 'er',
+ 'ir',
+ 'as',
+ 'es',
+ 'is',
+ 'eu',
+ 'iu',
+ 'ou',
)
__step4_suffixes = ("os", "a", "i", "o", "\xE1", "\xED", "\xF3")
__vowels = "aeiou\u0103\xE2\xEE"
__step0_suffixes = (
- "iilor",
- "ului",
- "elor",
- "iile",
- "ilor",
- "atei",
- "a\u0163ie",
- "a\u0163ia",
- "aua",
- "ele",
- "iua",
- "iei",
- "ile",
- "ul",
- "ea",
- "ii",
+ 'iilor',
+ 'ului',
+ 'elor',
+ 'iile',
+ 'ilor',
+ 'atei',
+ 'a\u0163ie',
+ 'a\u0163ia',
+ 'aua',
+ 'ele',
+ 'iua',
+ 'iei',
+ 'ile',
+ 'ul',
+ 'ea',
+ 'ii',
)
__step1_suffixes = (
- "abilitate",
- "abilitati",
- "abilit\u0103\u0163i",
- "ibilitate",
- "abilit\u0103i",
- "ivitate",
- "ivitati",
- "ivit\u0103\u0163i",
- "icitate",
- "icitati",
- "icit\u0103\u0163i",
- "icatori",
- "ivit\u0103i",
- "icit\u0103i",
- "icator",
- "a\u0163iune",
- "atoare",
- "\u0103toare",
- "i\u0163iune",
- "itoare",
- "iciva",
- "icive",
- "icivi",
- "iciv\u0103",
- "icala",
- "icale",
- "icali",
- "ical\u0103",
- "ativa",
- "ative",
- "ativi",
- "ativ\u0103",
- "atori",
- "\u0103tori",
- "itiva",
- "itive",
- "itivi",
- "itiv\u0103",
- "itori",
- "iciv",
- "ical",
- "ativ",
- "ator",
- "\u0103tor",
- "itiv",
- "itor",
+ 'abilitate',
+ 'abilitati',
+ 'abilit\u0103\u0163i',
+ 'ibilitate',
+ 'abilit\u0103i',
+ 'ivitate',
+ 'ivitati',
+ 'ivit\u0103\u0163i',
+ 'icitate',
+ 'icitati',
+ 'icit\u0103\u0163i',
+ 'icatori',
+ 'ivit\u0103i',
+ 'icit\u0103i',
+ 'icator',
+ 'a\u0163iune',
+ 'atoare',
+ '\u0103toare',
+ 'i\u0163iune',
+ 'itoare',
+ 'iciva',
+ 'icive',
+ 'icivi',
+ 'iciv\u0103',
+ 'icala',
+ 'icale',
+ 'icali',
+ 'ical\u0103',
+ 'ativa',
+ 'ative',
+ 'ativi',
+ 'ativ\u0103',
+ 'atori',
+ '\u0103tori',
+ 'itiva',
+ 'itive',
+ 'itivi',
+ 'itiv\u0103',
+ 'itori',
+ 'iciv',
+ 'ical',
+ 'ativ',
+ 'ator',
+ '\u0103tor',
+ 'itiv',
+ 'itor',
)
__step2_suffixes = (
- "abila",
- "abile",
- "abili",
- "abil\u0103",
- "ibila",
- "ibile",
- "ibili",
- "ibil\u0103",
- "atori",
- "itate",
- "itati",
- "it\u0103\u0163i",
- "abil",
- "ibil",
- "oasa",
- "oas\u0103",
- "oase",
- "anta",
- "ante",
- "anti",
- "ant\u0103",
- "ator",
- "it\u0103i",
- "iune",
- "iuni",
- "isme",
- "ista",
- "iste",
- "isti",
- "ist\u0103",
- "i\u015Fti",
- "ata",
- "at\u0103",
- "ati",
- "ate",
- "uta",
- "ut\u0103",
- "uti",
- "ute",
- "ita",
- "it\u0103",
- "iti",
- "ite",
- "ica",
- "ice",
- "ici",
- "ic\u0103",
- "osi",
- "o\u015Fi",
- "ant",
- "iva",
- "ive",
- "ivi",
- "iv\u0103",
- "ism",
- "ist",
- "at",
- "ut",
- "it",
- "ic",
- "os",
- "iv",
+ 'abila',
+ 'abile',
+ 'abili',
+ 'abil\u0103',
+ 'ibila',
+ 'ibile',
+ 'ibili',
+ 'ibil\u0103',
+ 'atori',
+ 'itate',
+ 'itati',
+ 'it\u0103\u0163i',
+ 'abil',
+ 'ibil',
+ 'oasa',
+ 'oas\u0103',
+ 'oase',
+ 'anta',
+ 'ante',
+ 'anti',
+ 'ant\u0103',
+ 'ator',
+ 'it\u0103i',
+ 'iune',
+ 'iuni',
+ 'isme',
+ 'ista',
+ 'iste',
+ 'isti',
+ 'ist\u0103',
+ 'i\u015Fti',
+ 'ata',
+ 'at\u0103',
+ 'ati',
+ 'ate',
+ 'uta',
+ 'ut\u0103',
+ 'uti',
+ 'ute',
+ 'ita',
+ 'it\u0103',
+ 'iti',
+ 'ite',
+ 'ica',
+ 'ice',
+ 'ici',
+ 'ic\u0103',
+ 'osi',
+ 'o\u015Fi',
+ 'ant',
+ 'iva',
+ 'ive',
+ 'ivi',
+ 'iv\u0103',
+ 'ism',
+ 'ist',
+ 'at',
+ 'ut',
+ 'it',
+ 'ic',
+ 'os',
+ 'iv',
)
__step3_suffixes = (
- "seser\u0103\u0163i",
- "aser\u0103\u0163i",
- "iser\u0103\u0163i",
- "\xE2ser\u0103\u0163i",
- "user\u0103\u0163i",
- "seser\u0103m",
- "aser\u0103m",
- "iser\u0103m",
- "\xE2ser\u0103m",
- "user\u0103m",
- "ser\u0103\u0163i",
- "sese\u015Fi",
- "seser\u0103",
- "easc\u0103",
- "ar\u0103\u0163i",
- "ur\u0103\u0163i",
- "ir\u0103\u0163i",
- "\xE2r\u0103\u0163i",
- "ase\u015Fi",
- "aser\u0103",
- "ise\u015Fi",
- "iser\u0103",
- "\xe2se\u015Fi",
- "\xE2ser\u0103",
- "use\u015Fi",
- "user\u0103",
- "ser\u0103m",
- "sesem",
- "indu",
- "\xE2ndu",
- "eaz\u0103",
- "e\u015Fti",
- "e\u015Fte",
- "\u0103\u015Fti",
- "\u0103\u015Fte",
- "ea\u0163i",
- "ia\u0163i",
- "ar\u0103m",
- "ur\u0103m",
- "ir\u0103m",
- "\xE2r\u0103m",
- "asem",
- "isem",
- "\xE2sem",
- "usem",
- "se\u015Fi",
- "ser\u0103",
- "sese",
- "are",
- "ere",
- "ire",
- "\xE2re",
- "ind",
- "\xE2nd",
- "eze",
- "ezi",
- "esc",
- "\u0103sc",
- "eam",
- "eai",
- "eau",
- "iam",
- "iai",
- "iau",
- "a\u015Fi",
- "ar\u0103",
- "u\u015Fi",
- "ur\u0103",
- "i\u015Fi",
- "ir\u0103",
- "\xE2\u015Fi",
- "\xe2r\u0103",
- "ase",
- "ise",
- "\xE2se",
- "use",
- "a\u0163i",
- "e\u0163i",
- "i\u0163i",
- "\xe2\u0163i",
- "sei",
- "ez",
- "am",
- "ai",
- "au",
- "ea",
- "ia",
- "ui",
- "\xE2i",
- "\u0103m",
- "em",
- "im",
- "\xE2m",
- "se",
+ 'seser\u0103\u0163i',
+ 'aser\u0103\u0163i',
+ 'iser\u0103\u0163i',
+ '\xE2ser\u0103\u0163i',
+ 'user\u0103\u0163i',
+ 'seser\u0103m',
+ 'aser\u0103m',
+ 'iser\u0103m',
+ '\xE2ser\u0103m',
+ 'user\u0103m',
+ 'ser\u0103\u0163i',
+ 'sese\u015Fi',
+ 'seser\u0103',
+ 'easc\u0103',
+ 'ar\u0103\u0163i',
+ 'ur\u0103\u0163i',
+ 'ir\u0103\u0163i',
+ '\xE2r\u0103\u0163i',
+ 'ase\u015Fi',
+ 'aser\u0103',
+ 'ise\u015Fi',
+ 'iser\u0103',
+ '\xe2se\u015Fi',
+ '\xE2ser\u0103',
+ 'use\u015Fi',
+ 'user\u0103',
+ 'ser\u0103m',
+ 'sesem',
+ 'indu',
+ '\xE2ndu',
+ 'eaz\u0103',
+ 'e\u015Fti',
+ 'e\u015Fte',
+ '\u0103\u015Fti',
+ '\u0103\u015Fte',
+ 'ea\u0163i',
+ 'ia\u0163i',
+ 'ar\u0103m',
+ 'ur\u0103m',
+ 'ir\u0103m',
+ '\xE2r\u0103m',
+ 'asem',
+ 'isem',
+ '\xE2sem',
+ 'usem',
+ 'se\u015Fi',
+ 'ser\u0103',
+ 'sese',
+ 'are',
+ 'ere',
+ 'ire',
+ '\xE2re',
+ 'ind',
+ '\xE2nd',
+ 'eze',
+ 'ezi',
+ 'esc',
+ '\u0103sc',
+ 'eam',
+ 'eai',
+ 'eau',
+ 'iam',
+ 'iai',
+ 'iau',
+ 'a\u015Fi',
+ 'ar\u0103',
+ 'u\u015Fi',
+ 'ur\u0103',
+ 'i\u015Fi',
+ 'ir\u0103',
+ '\xE2\u015Fi',
+ '\xe2r\u0103',
+ 'ase',
+ 'ise',
+ '\xE2se',
+ 'use',
+ 'a\u0163i',
+ 'e\u0163i',
+ 'i\u0163i',
+ '\xe2\u0163i',
+ 'sei',
+ 'ez',
+ 'am',
+ 'ai',
+ 'au',
+ 'ea',
+ 'ia',
+ 'ui',
+ '\xE2i',
+ '\u0103m',
+ 'em',
+ 'im',
+ '\xE2m',
+ 'se',
)
def stem(self, word):
if word.endswith(suffix):
if suffix in rv:
if suffix in (
- "seser\u0103\u0163i",
- "seser\u0103m",
- "ser\u0103\u0163i",
- "sese\u015Fi",
- "seser\u0103",
- "ser\u0103m",
- "sesem",
- "se\u015Fi",
- "ser\u0103",
- "sese",
- "a\u0163i",
- "e\u0163i",
- "i\u0163i",
- "\xE2\u0163i",
- "sei",
- "\u0103m",
- "em",
- "im",
- "\xE2m",
- "se",
+ 'seser\u0103\u0163i',
+ 'seser\u0103m',
+ 'ser\u0103\u0163i',
+ 'sese\u015Fi',
+ 'seser\u0103',
+ 'ser\u0103m',
+ 'sesem',
+ 'se\u015Fi',
+ 'ser\u0103',
+ 'sese',
+ 'a\u0163i',
+ 'e\u0163i',
+ 'i\u0163i',
+ '\xE2\u0163i',
+ 'sei',
+ '\u0103m',
+ 'em',
+ 'im',
+ '\xE2m',
+ 'se',
):
word = word[: -len(suffix)]
rv = rv[: -len(suffix)]
"v",
)
__adjectival_suffixes = (
- "ui^ushchi^ui^u",
- "ui^ushchi^ai^a",
- "ui^ushchimi",
- "ui^ushchymi",
- "ui^ushchego",
- "ui^ushchogo",
- "ui^ushchemu",
- "ui^ushchomu",
- "ui^ushchikh",
- "ui^ushchykh",
- "ui^ushchui^u",
- "ui^ushchaia",
- "ui^ushchoi^u",
- "ui^ushchei^u",
- "i^ushchi^ui^u",
- "i^ushchi^ai^a",
- "ui^ushchee",
- "ui^ushchie",
- "ui^ushchye",
- "ui^ushchoe",
- "ui^ushchei`",
- "ui^ushchii`",
- "ui^ushchyi`",
- "ui^ushchoi`",
- "ui^ushchem",
- "ui^ushchim",
- "ui^ushchym",
- "ui^ushchom",
- "i^ushchimi",
- "i^ushchymi",
- "i^ushchego",
- "i^ushchogo",
- "i^ushchemu",
- "i^ushchomu",
- "i^ushchikh",
- "i^ushchykh",
- "i^ushchui^u",
- "i^ushchai^a",
- "i^ushchoi^u",
- "i^ushchei^u",
- "i^ushchee",
- "i^ushchie",
- "i^ushchye",
- "i^ushchoe",
- "i^ushchei`",
- "i^ushchii`",
- "i^ushchyi`",
- "i^ushchoi`",
- "i^ushchem",
- "i^ushchim",
- "i^ushchym",
- "i^ushchom",
- "shchi^ui^u",
- "shchi^ai^a",
- "ivshi^ui^u",
- "ivshi^ai^a",
- "yvshi^ui^u",
- "yvshi^ai^a",
- "shchimi",
- "shchymi",
- "shchego",
- "shchogo",
- "shchemu",
- "shchomu",
- "shchikh",
- "shchykh",
- "shchui^u",
- "shchai^a",
- "shchoi^u",
- "shchei^u",
- "ivshimi",
- "ivshymi",
- "ivshego",
- "ivshogo",
- "ivshemu",
- "ivshomu",
- "ivshikh",
- "ivshykh",
- "ivshui^u",
- "ivshai^a",
- "ivshoi^u",
- "ivshei^u",
- "yvshimi",
- "yvshymi",
- "yvshego",
- "yvshogo",
- "yvshemu",
- "yvshomu",
- "yvshikh",
- "yvshykh",
- "yvshui^u",
- "yvshai^a",
- "yvshoi^u",
- "yvshei^u",
- "vshi^ui^u",
- "vshi^ai^a",
- "shchee",
- "shchie",
- "shchye",
- "shchoe",
- "shchei`",
- "shchii`",
- "shchyi`",
- "shchoi`",
- "shchem",
- "shchim",
- "shchym",
- "shchom",
- "ivshee",
- "ivshie",
- "ivshye",
- "ivshoe",
- "ivshei`",
- "ivshii`",
- "ivshyi`",
- "ivshoi`",
- "ivshem",
- "ivshim",
- "ivshym",
- "ivshom",
- "yvshee",
- "yvshie",
- "yvshye",
- "yvshoe",
- "yvshei`",
- "yvshii`",
- "yvshyi`",
- "yvshoi`",
- "yvshem",
- "yvshim",
- "yvshym",
- "yvshom",
- "vshimi",
- "vshymi",
- "vshego",
- "vshogo",
- "vshemu",
- "vshomu",
- "vshikh",
- "vshykh",
- "vshui^u",
- "vshai^a",
- "vshoi^u",
- "vshei^u",
- "emi^ui^u",
- "emi^ai^a",
- "nni^ui^u",
- "nni^ai^a",
- "vshee",
- "vshie",
- "vshye",
- "vshoe",
- "vshei`",
- "vshii`",
- "vshyi`",
- "vshoi`",
- "vshem",
- "vshim",
- "vshym",
- "vshom",
- "emimi",
- "emymi",
- "emego",
- "emogo",
- "ememu",
- "emomu",
- "emikh",
- "emykh",
- "emui^u",
- "emai^a",
- "emoi^u",
- "emei^u",
- "nnimi",
- "nnymi",
- "nnego",
- "nnogo",
- "nnemu",
- "nnomu",
- "nnikh",
- "nnykh",
- "nnui^u",
- "nnai^a",
- "nnoi^u",
- "nnei^u",
- "emee",
- "emie",
- "emye",
- "emoe",
- "emei`",
- "emii`",
- "emyi`",
- "emoi`",
- "emem",
- "emim",
- "emym",
- "emom",
- "nnee",
- "nnie",
- "nnye",
- "nnoe",
- "nnei`",
- "nnii`",
- "nnyi`",
- "nnoi`",
- "nnem",
- "nnim",
- "nnym",
- "nnom",
- "i^ui^u",
- "i^ai^a",
- "imi",
- "ymi",
- "ego",
- "ogo",
- "emu",
- "omu",
- "ikh",
- "ykh",
- "ui^u",
- "ai^a",
- "oi^u",
- "ei^u",
- "ee",
- "ie",
- "ye",
- "oe",
- "ei`",
- "ii`",
- "yi`",
- "oi`",
- "em",
- "im",
- "ym",
- "om",
+ 'ui^ushchi^ui^u',
+ 'ui^ushchi^ai^a',
+ 'ui^ushchimi',
+ 'ui^ushchymi',
+ 'ui^ushchego',
+ 'ui^ushchogo',
+ 'ui^ushchemu',
+ 'ui^ushchomu',
+ 'ui^ushchikh',
+ 'ui^ushchykh',
+ 'ui^ushchui^u',
+ 'ui^ushchaia',
+ 'ui^ushchoi^u',
+ 'ui^ushchei^u',
+ 'i^ushchi^ui^u',
+ 'i^ushchi^ai^a',
+ 'ui^ushchee',
+ 'ui^ushchie',
+ 'ui^ushchye',
+ 'ui^ushchoe',
+ 'ui^ushchei`',
+ 'ui^ushchii`',
+ 'ui^ushchyi`',
+ 'ui^ushchoi`',
+ 'ui^ushchem',
+ 'ui^ushchim',
+ 'ui^ushchym',
+ 'ui^ushchom',
+ 'i^ushchimi',
+ 'i^ushchymi',
+ 'i^ushchego',
+ 'i^ushchogo',
+ 'i^ushchemu',
+ 'i^ushchomu',
+ 'i^ushchikh',
+ 'i^ushchykh',
+ 'i^ushchui^u',
+ 'i^ushchai^a',
+ 'i^ushchoi^u',
+ 'i^ushchei^u',
+ 'i^ushchee',
+ 'i^ushchie',
+ 'i^ushchye',
+ 'i^ushchoe',
+ 'i^ushchei`',
+ 'i^ushchii`',
+ 'i^ushchyi`',
+ 'i^ushchoi`',
+ 'i^ushchem',
+ 'i^ushchim',
+ 'i^ushchym',
+ 'i^ushchom',
+ 'shchi^ui^u',
+ 'shchi^ai^a',
+ 'ivshi^ui^u',
+ 'ivshi^ai^a',
+ 'yvshi^ui^u',
+ 'yvshi^ai^a',
+ 'shchimi',
+ 'shchymi',
+ 'shchego',
+ 'shchogo',
+ 'shchemu',
+ 'shchomu',
+ 'shchikh',
+ 'shchykh',
+ 'shchui^u',
+ 'shchai^a',
+ 'shchoi^u',
+ 'shchei^u',
+ 'ivshimi',
+ 'ivshymi',
+ 'ivshego',
+ 'ivshogo',
+ 'ivshemu',
+ 'ivshomu',
+ 'ivshikh',
+ 'ivshykh',
+ 'ivshui^u',
+ 'ivshai^a',
+ 'ivshoi^u',
+ 'ivshei^u',
+ 'yvshimi',
+ 'yvshymi',
+ 'yvshego',
+ 'yvshogo',
+ 'yvshemu',
+ 'yvshomu',
+ 'yvshikh',
+ 'yvshykh',
+ 'yvshui^u',
+ 'yvshai^a',
+ 'yvshoi^u',
+ 'yvshei^u',
+ 'vshi^ui^u',
+ 'vshi^ai^a',
+ 'shchee',
+ 'shchie',
+ 'shchye',
+ 'shchoe',
+ 'shchei`',
+ 'shchii`',
+ 'shchyi`',
+ 'shchoi`',
+ 'shchem',
+ 'shchim',
+ 'shchym',
+ 'shchom',
+ 'ivshee',
+ 'ivshie',
+ 'ivshye',
+ 'ivshoe',
+ 'ivshei`',
+ 'ivshii`',
+ 'ivshyi`',
+ 'ivshoi`',
+ 'ivshem',
+ 'ivshim',
+ 'ivshym',
+ 'ivshom',
+ 'yvshee',
+ 'yvshie',
+ 'yvshye',
+ 'yvshoe',
+ 'yvshei`',
+ 'yvshii`',
+ 'yvshyi`',
+ 'yvshoi`',
+ 'yvshem',
+ 'yvshim',
+ 'yvshym',
+ 'yvshom',
+ 'vshimi',
+ 'vshymi',
+ 'vshego',
+ 'vshogo',
+ 'vshemu',
+ 'vshomu',
+ 'vshikh',
+ 'vshykh',
+ 'vshui^u',
+ 'vshai^a',
+ 'vshoi^u',
+ 'vshei^u',
+ 'emi^ui^u',
+ 'emi^ai^a',
+ 'nni^ui^u',
+ 'nni^ai^a',
+ 'vshee',
+ 'vshie',
+ 'vshye',
+ 'vshoe',
+ 'vshei`',
+ 'vshii`',
+ 'vshyi`',
+ 'vshoi`',
+ 'vshem',
+ 'vshim',
+ 'vshym',
+ 'vshom',
+ 'emimi',
+ 'emymi',
+ 'emego',
+ 'emogo',
+ 'ememu',
+ 'emomu',
+ 'emikh',
+ 'emykh',
+ 'emui^u',
+ 'emai^a',
+ 'emoi^u',
+ 'emei^u',
+ 'nnimi',
+ 'nnymi',
+ 'nnego',
+ 'nnogo',
+ 'nnemu',
+ 'nnomu',
+ 'nnikh',
+ 'nnykh',
+ 'nnui^u',
+ 'nnai^a',
+ 'nnoi^u',
+ 'nnei^u',
+ 'emee',
+ 'emie',
+ 'emye',
+ 'emoe',
+ 'emei`',
+ 'emii`',
+ 'emyi`',
+ 'emoi`',
+ 'emem',
+ 'emim',
+ 'emym',
+ 'emom',
+ 'nnee',
+ 'nnie',
+ 'nnye',
+ 'nnoe',
+ 'nnei`',
+ 'nnii`',
+ 'nnyi`',
+ 'nnoi`',
+ 'nnem',
+ 'nnim',
+ 'nnym',
+ 'nnom',
+ 'i^ui^u',
+ 'i^ai^a',
+ 'imi',
+ 'ymi',
+ 'ego',
+ 'ogo',
+ 'emu',
+ 'omu',
+ 'ikh',
+ 'ykh',
+ 'ui^u',
+ 'ai^a',
+ 'oi^u',
+ 'ei^u',
+ 'ee',
+ 'ie',
+ 'ye',
+ 'oe',
+ 'ei`',
+ 'ii`',
+ 'yi`',
+ 'oi`',
+ 'em',
+ 'im',
+ 'ym',
+ 'om',
)
__reflexive_suffixes = ("si^a", "s'")
__verb_suffixes = (
"esh'",
- "ei`te",
- "ui`te",
- "ui^ut",
+ 'ei`te',
+ 'ui`te',
+ 'ui^ut',
"ish'",
- "ete",
- "i`te",
- "i^ut",
- "nno",
- "ila",
- "yla",
- "ena",
- "ite",
- "ili",
- "yli",
- "ilo",
- "ylo",
- "eno",
- "i^at",
- "uet",
- "eny",
+ 'ete',
+ 'i`te',
+ 'i^ut',
+ 'nno',
+ 'ila',
+ 'yla',
+ 'ena',
+ 'ite',
+ 'ili',
+ 'yli',
+ 'ilo',
+ 'ylo',
+ 'eno',
+ 'i^at',
+ 'uet',
+ 'eny',
"it'",
"yt'",
- "ui^u",
- "la",
- "na",
- "li",
- "em",
- "lo",
- "no",
- "et",
- "ny",
+ 'ui^u',
+ 'la',
+ 'na',
+ 'li',
+ 'em',
+ 'lo',
+ 'no',
+ 'et',
+ 'ny',
"t'",
- "ei`",
- "ui`",
- "il",
- "yl",
- "im",
- "ym",
- "en",
- "it",
- "yt",
- "i^u",
- "i`",
- "l",
- "n",
+ 'ei`',
+ 'ui`',
+ 'il',
+ 'yl',
+ 'im',
+ 'ym',
+ 'en',
+ 'it',
+ 'yt',
+ 'i^u',
+ 'i`',
+ 'l',
+ 'n',
)
__noun_suffixes = (
- "ii^ami",
- "ii^akh",
- "i^ami",
- "ii^am",
- "i^akh",
- "ami",
- "iei`",
- "i^am",
- "iem",
- "akh",
- "ii^u",
+ 'ii^ami',
+ 'ii^akh',
+ 'i^ami',
+ 'ii^am',
+ 'i^akh',
+ 'ami',
+ 'iei`',
+ 'i^am',
+ 'iem',
+ 'akh',
+ 'ii^u',
"'i^u",
- "ii^a",
+ 'ii^a',
"'i^a",
- "ev",
- "ov",
- "ie",
+ 'ev',
+ 'ov',
+ 'ie',
"'e",
- "ei",
- "ii",
- "ei`",
- "oi`",
- "ii`",
- "em",
- "am",
- "om",
- "i^u",
- "i^a",
- "a",
- "e",
- "i",
- "i`",
- "o",
- "u",
- "y",
+ 'ei',
+ 'ii',
+ 'ei`',
+ 'oi`',
+ 'ii`',
+ 'em',
+ 'am',
+ 'om',
+ 'i^u',
+ 'i^a',
+ 'a',
+ 'e',
+ 'i',
+ 'i`',
+ 'o',
+ 'u',
+ 'y',
"'",
)
__superlative_suffixes = ("ei`she", "ei`sh")
chr_exceeded = True
break
- if not chr_exceeded:
- return word
-
- word = self.__cyrillic_to_roman(word)
+ if chr_exceeded:
+ word = self.__cyrillic_to_roman(word)
step1_success = False
adjectival_removed = False
for suffix in self.__adjectival_suffixes:
if rv.endswith(suffix):
if suffix in (
- "i^ushchi^ui^u",
- "i^ushchi^ai^a",
- "i^ushchui^u",
- "i^ushchai^a",
- "i^ushchoi^u",
- "i^ushchei^u",
- "i^ushchimi",
- "i^ushchymi",
- "i^ushchego",
- "i^ushchogo",
- "i^ushchemu",
- "i^ushchomu",
- "i^ushchikh",
- "i^ushchykh",
- "shchi^ui^u",
- "shchi^ai^a",
- "i^ushchee",
- "i^ushchie",
- "i^ushchye",
- "i^ushchoe",
- "i^ushchei`",
- "i^ushchii`",
- "i^ushchyi`",
- "i^ushchoi`",
- "i^ushchem",
- "i^ushchim",
- "i^ushchym",
- "i^ushchom",
- "vshi^ui^u",
- "vshi^ai^a",
- "shchui^u",
- "shchai^a",
- "shchoi^u",
- "shchei^u",
- "emi^ui^u",
- "emi^ai^a",
- "nni^ui^u",
- "nni^ai^a",
- "shchimi",
- "shchymi",
- "shchego",
- "shchogo",
- "shchemu",
- "shchomu",
- "shchikh",
- "shchykh",
- "vshui^u",
- "vshai^a",
- "vshoi^u",
- "vshei^u",
- "shchee",
- "shchie",
- "shchye",
- "shchoe",
- "shchei`",
- "shchii`",
- "shchyi`",
- "shchoi`",
- "shchem",
- "shchim",
- "shchym",
- "shchom",
- "vshimi",
- "vshymi",
- "vshego",
- "vshogo",
- "vshemu",
- "vshomu",
- "vshikh",
- "vshykh",
- "emui^u",
- "emai^a",
- "emoi^u",
- "emei^u",
- "nnui^u",
- "nnai^a",
- "nnoi^u",
- "nnei^u",
- "vshee",
- "vshie",
- "vshye",
- "vshoe",
- "vshei`",
- "vshii`",
- "vshyi`",
- "vshoi`",
- "vshem",
- "vshim",
- "vshym",
- "vshom",
- "emimi",
- "emymi",
- "emego",
- "emogo",
- "ememu",
- "emomu",
- "emikh",
- "emykh",
- "nnimi",
- "nnymi",
- "nnego",
- "nnogo",
- "nnemu",
- "nnomu",
- "nnikh",
- "nnykh",
- "emee",
- "emie",
- "emye",
- "emoe",
- "emei`",
- "emii`",
- "emyi`",
- "emoi`",
- "emem",
- "emim",
- "emym",
- "emom",
- "nnee",
- "nnie",
- "nnye",
- "nnoe",
- "nnei`",
- "nnii`",
- "nnyi`",
- "nnoi`",
- "nnem",
- "nnim",
- "nnym",
- "nnom",
+ 'i^ushchi^ui^u',
+ 'i^ushchi^ai^a',
+ 'i^ushchui^u',
+ 'i^ushchai^a',
+ 'i^ushchoi^u',
+ 'i^ushchei^u',
+ 'i^ushchimi',
+ 'i^ushchymi',
+ 'i^ushchego',
+ 'i^ushchogo',
+ 'i^ushchemu',
+ 'i^ushchomu',
+ 'i^ushchikh',
+ 'i^ushchykh',
+ 'shchi^ui^u',
+ 'shchi^ai^a',
+ 'i^ushchee',
+ 'i^ushchie',
+ 'i^ushchye',
+ 'i^ushchoe',
+ 'i^ushchei`',
+ 'i^ushchii`',
+ 'i^ushchyi`',
+ 'i^ushchoi`',
+ 'i^ushchem',
+ 'i^ushchim',
+ 'i^ushchym',
+ 'i^ushchom',
+ 'vshi^ui^u',
+ 'vshi^ai^a',
+ 'shchui^u',
+ 'shchai^a',
+ 'shchoi^u',
+ 'shchei^u',
+ 'emi^ui^u',
+ 'emi^ai^a',
+ 'nni^ui^u',
+ 'nni^ai^a',
+ 'shchimi',
+ 'shchymi',
+ 'shchego',
+ 'shchogo',
+ 'shchemu',
+ 'shchomu',
+ 'shchikh',
+ 'shchykh',
+ 'vshui^u',
+ 'vshai^a',
+ 'vshoi^u',
+ 'vshei^u',
+ 'shchee',
+ 'shchie',
+ 'shchye',
+ 'shchoe',
+ 'shchei`',
+ 'shchii`',
+ 'shchyi`',
+ 'shchoi`',
+ 'shchem',
+ 'shchim',
+ 'shchym',
+ 'shchom',
+ 'vshimi',
+ 'vshymi',
+ 'vshego',
+ 'vshogo',
+ 'vshemu',
+ 'vshomu',
+ 'vshikh',
+ 'vshykh',
+ 'emui^u',
+ 'emai^a',
+ 'emoi^u',
+ 'emei^u',
+ 'nnui^u',
+ 'nnai^a',
+ 'nnoi^u',
+ 'nnei^u',
+ 'vshee',
+ 'vshie',
+ 'vshye',
+ 'vshoe',
+ 'vshei`',
+ 'vshii`',
+ 'vshyi`',
+ 'vshoi`',
+ 'vshem',
+ 'vshim',
+ 'vshym',
+ 'vshom',
+ 'emimi',
+ 'emymi',
+ 'emego',
+ 'emogo',
+ 'ememu',
+ 'emomu',
+ 'emikh',
+ 'emykh',
+ 'nnimi',
+ 'nnymi',
+ 'nnego',
+ 'nnogo',
+ 'nnemu',
+ 'nnomu',
+ 'nnikh',
+ 'nnykh',
+ 'emee',
+ 'emie',
+ 'emye',
+ 'emoe',
+ 'emei`',
+ 'emii`',
+ 'emyi`',
+ 'emoi`',
+ 'emem',
+ 'emim',
+ 'emym',
+ 'emom',
+ 'nnee',
+ 'nnie',
+ 'nnye',
+ 'nnoe',
+ 'nnei`',
+ 'nnii`',
+ 'nnyi`',
+ 'nnoi`',
+ 'nnem',
+ 'nnim',
+ 'nnym',
+ 'nnom',
):
if (
rv[-len(suffix) - 3 : -len(suffix)] == "i^a"
if word.endswith("'"):
word = word[:-1]
- word = self.__roman_to_cyrillic(word)
+ if chr_exceeded:
+ word = self.__roman_to_cyrillic(word)
return word
"lo",
)
__step1_suffixes = (
- "amientos",
- "imientos",
- "amiento",
- "imiento",
- "aciones",
- "uciones",
- "adoras",
- "adores",
- "ancias",
- "log\xEDas",
- "encias",
- "amente",
- "idades",
- "anzas",
- "ismos",
- "ables",
- "ibles",
- "istas",
- "adora",
- "aci\xF3n",
- "antes",
- "ancia",
- "log\xEDa",
- "uci\xf3n",
- "encia",
- "mente",
- "anza",
- "icos",
- "icas",
- "ismo",
- "able",
- "ible",
- "ista",
- "osos",
- "osas",
- "ador",
- "ante",
- "idad",
- "ivas",
- "ivos",
- "ico",
- "ica",
- "oso",
- "osa",
- "iva",
- "ivo",
+ 'amientos',
+ 'imientos',
+ 'amiento',
+ 'imiento',
+ 'aciones',
+ 'uciones',
+ 'adoras',
+ 'adores',
+ 'ancias',
+ 'log\xEDas',
+ 'encias',
+ 'amente',
+ 'idades',
+ 'anzas',
+ 'ismos',
+ 'ables',
+ 'ibles',
+ 'istas',
+ 'adora',
+ 'aci\xF3n',
+ 'antes',
+ 'ancia',
+ 'log\xEDa',
+ 'uci\xf3n',
+ 'encia',
+ 'mente',
+ 'anza',
+ 'icos',
+ 'icas',
+ 'ismo',
+ 'able',
+ 'ible',
+ 'ista',
+ 'osos',
+ 'osas',
+ 'ador',
+ 'ante',
+ 'idad',
+ 'ivas',
+ 'ivos',
+ 'ico',
+ 'ica',
+ 'oso',
+ 'osa',
+ 'iva',
+ 'ivo',
)
__step2a_suffixes = (
- "yeron",
- "yendo",
- "yamos",
- "yais",
- "yan",
- "yen",
- "yas",
- "yes",
- "ya",
- "ye",
- "yo",
- "y\xF3",
+ 'yeron',
+ 'yendo',
+ 'yamos',
+ 'yais',
+ 'yan',
+ 'yen',
+ 'yas',
+ 'yes',
+ 'ya',
+ 'ye',
+ 'yo',
+ 'y\xF3',
)
__step2b_suffixes = (
- "ar\xEDamos",
- "er\xEDamos",
- "ir\xEDamos",
- "i\xE9ramos",
- "i\xE9semos",
- "ar\xEDais",
- "aremos",
- "er\xEDais",
- "eremos",
- "ir\xEDais",
- "iremos",
- "ierais",
- "ieseis",
- "asteis",
- "isteis",
- "\xE1bamos",
- "\xE1ramos",
- "\xE1semos",
- "ar\xEDan",
- "ar\xEDas",
- "ar\xE9is",
- "er\xEDan",
- "er\xEDas",
- "er\xE9is",
- "ir\xEDan",
- "ir\xEDas",
- "ir\xE9is",
- "ieran",
- "iesen",
- "ieron",
- "iendo",
- "ieras",
- "ieses",
- "abais",
- "arais",
- "aseis",
- "\xE9amos",
- "ar\xE1n",
- "ar\xE1s",
- "ar\xEDa",
- "er\xE1n",
- "er\xE1s",
- "er\xEDa",
- "ir\xE1n",
- "ir\xE1s",
- "ir\xEDa",
- "iera",
- "iese",
- "aste",
- "iste",
- "aban",
- "aran",
- "asen",
- "aron",
- "ando",
- "abas",
- "adas",
- "idas",
- "aras",
- "ases",
- "\xEDais",
- "ados",
- "idos",
- "amos",
- "imos",
- "emos",
- "ar\xE1",
- "ar\xE9",
- "er\xE1",
- "er\xE9",
- "ir\xE1",
- "ir\xE9",
- "aba",
- "ada",
- "ida",
- "ara",
- "ase",
- "\xEDan",
- "ado",
- "ido",
- "\xEDas",
- "\xE1is",
- "\xE9is",
- "\xEDa",
- "ad",
- "ed",
- "id",
- "an",
- "i\xF3",
- "ar",
- "er",
- "ir",
- "as",
- "\xEDs",
- "en",
- "es",
+ 'ar\xEDamos',
+ 'er\xEDamos',
+ 'ir\xEDamos',
+ 'i\xE9ramos',
+ 'i\xE9semos',
+ 'ar\xEDais',
+ 'aremos',
+ 'er\xEDais',
+ 'eremos',
+ 'ir\xEDais',
+ 'iremos',
+ 'ierais',
+ 'ieseis',
+ 'asteis',
+ 'isteis',
+ '\xE1bamos',
+ '\xE1ramos',
+ '\xE1semos',
+ 'ar\xEDan',
+ 'ar\xEDas',
+ 'ar\xE9is',
+ 'er\xEDan',
+ 'er\xEDas',
+ 'er\xE9is',
+ 'ir\xEDan',
+ 'ir\xEDas',
+ 'ir\xE9is',
+ 'ieran',
+ 'iesen',
+ 'ieron',
+ 'iendo',
+ 'ieras',
+ 'ieses',
+ 'abais',
+ 'arais',
+ 'aseis',
+ '\xE9amos',
+ 'ar\xE1n',
+ 'ar\xE1s',
+ 'ar\xEDa',
+ 'er\xE1n',
+ 'er\xE1s',
+ 'er\xEDa',
+ 'ir\xE1n',
+ 'ir\xE1s',
+ 'ir\xEDa',
+ 'iera',
+ 'iese',
+ 'aste',
+ 'iste',
+ 'aban',
+ 'aran',
+ 'asen',
+ 'aron',
+ 'ando',
+ 'abas',
+ 'adas',
+ 'idas',
+ 'aras',
+ 'ases',
+ '\xEDais',
+ 'ados',
+ 'idos',
+ 'amos',
+ 'imos',
+ 'emos',
+ 'ar\xE1',
+ 'ar\xE9',
+ 'er\xE1',
+ 'er\xE9',
+ 'ir\xE1',
+ 'ir\xE9',
+ 'aba',
+ 'ada',
+ 'ida',
+ 'ara',
+ 'ase',
+ '\xEDan',
+ 'ado',
+ 'ido',
+ '\xEDas',
+ '\xE1is',
+ '\xE9is',
+ '\xEDa',
+ 'ad',
+ 'ed',
+ 'id',
+ 'an',
+ 'i\xF3',
+ 'ar',
+ 'er',
+ 'ir',
+ 'as',
+ '\xEDs',
+ 'en',
+ 'es',
)
__step3_suffixes = ("os", "a", "e", "o", "\xE1", "\xE9", "\xED", "\xF3")
"""
+ import re
from nltk.corpus import udhr
udhr_corpus = {
excerpt = udhr.words(udhr_corpus[language])[:300]
stemmed = " ".join(stemmer.stem(word) for word in excerpt)
- stemmed = re.sub(r"(.{,70})\s", r"\1\n", stemmed + " ").rstrip()
+ stemmed = re.sub(r"(.{,70})\s", r'\1\n', stemmed + ' ').rstrip()
excerpt = " ".join(excerpt)
- excerpt = re.sub(r"(.{,70})\s", r"\1\n", excerpt + " ").rstrip()
+ excerpt = re.sub(r"(.{,70})\s", r'\1\n', excerpt + ' ').rstrip()
print("\n")
- print("-" * 70)
- print("ORIGINAL".center(70))
+ print('-' * 70)
+ print('ORIGINAL'.center(70))
print(excerpt)
print("\n\n")
- print("STEMMED RESULTS".center(70))
+ print('STEMMED RESULTS'.center(70))
print(stemmed)
- print("-" * 70)
+ print('-' * 70)
print("\n")
# Natural Language Toolkit: Stemmer Utilities
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Helder <he7d3r@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# Natural Language Toolkit: WordNet stemmer interface
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
from nltk.corpus.reader.wordnet import NOUN
from nltk.corpus import wordnet
+from nltk.compat import python_2_unicode_compatible
+@python_2_unicode_compatible
class WordNetLemmatizer(object):
"""
WordNet Lemmatizer
return min(lemmas, key=len) if lemmas else word
def __repr__(self):
- return "<WordNetLemmatizer>"
+ return '<WordNetLemmatizer>'
# unload wordnet
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Taggers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
For more information, please consult chapter 5 of the NLTK Book.
"""
+from __future__ import print_function
from nltk.tag.api import TaggerI
from nltk.tag.util import str2tuple, tuple2str, untag
from nltk.data import load, find
RUS_PICKLE = (
- "taggers/averaged_perceptron_tagger_ru/averaged_perceptron_tagger_ru.pickle"
+ 'taggers/averaged_perceptron_tagger_ru/averaged_perceptron_tagger_ru.pickle'
)
def _get_tagger(lang=None):
- if lang == "rus":
+ if lang == 'rus':
tagger = PerceptronTagger(False)
- ap_russian_model_loc = "file:" + str(find(RUS_PICKLE))
+ ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
tagger.load(ap_russian_model_loc)
else:
tagger = PerceptronTagger()
def _pos_tag(tokens, tagset=None, tagger=None, lang=None):
# Currently only supports English and Russian.
- if lang not in ["eng", "rus"]:
+ if lang not in ['eng', 'rus']:
raise NotImplementedError(
"Currently, NLTK pos_tag only supports English and Russian "
"(i.e. lang='eng' or lang='rus')"
else:
tagged_tokens = tagger.tag(tokens)
if tagset: # Maps to the specified tagset.
- if lang == "eng":
+ if lang == 'eng':
tagged_tokens = [
- (token, map_tag("en-ptb", tagset, tag))
+ (token, map_tag('en-ptb', tagset, tag))
for (token, tag) in tagged_tokens
]
- elif lang == "rus":
+ elif lang == 'rus':
# Note that the new Russian pos tags from the model contain suffixes,
# see https://github.com/nltk/nltk/issues/2151#issuecomment-430709018
tagged_tokens = [
- (token, map_tag("ru-rnc-new", tagset, tag.partition("=")[0]))
+ (token, map_tag('ru-rnc-new', tagset, tag.partition('=')[0]))
for (token, tag) in tagged_tokens
]
return tagged_tokens
-def pos_tag(tokens, tagset=None, lang="eng"):
+def pos_tag(tokens, tagset=None, lang='eng'):
"""
Use NLTK's currently recommended part of speech tagger to
tag the given list of tokens.
return _pos_tag(tokens, tagset, tagger, lang)
-def pos_tag_sents(sentences, tagset=None, lang="eng"):
+def pos_tag_sents(sentences, tagset=None, lang='eng'):
"""
Use NLTK's currently recommended part of speech tagger to tag the
given list of sentences, each consisting of a list of tokens.
- :param sentences: List of sentences to be tagged
- :type sentences: list(list(str))
+ :param tokens: List of sentences to be tagged
+ :type tokens: list(list(str))
:param tagset: the tagset to be used, e.g. universal, wsj, brown
:type tagset: str
:param lang: the ISO 639 code of the language, e.g. 'eng' for English, 'rus' for Russian
:rtype: list(list(tuple(str, str)))
"""
tagger = _get_tagger(lang)
- return [_pos_tag(sent, tagset, tagger, lang) for sent in sentences]
+ return [_pos_tag(sent, tagset, tagger) for sent in sentences]
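# A minimal usage sketch for pos_tag / pos_tag_sents, assuming the
# 'averaged_perceptron_tagger' resource has already been fetched with
# nltk.download(); the tags shown in the comment are indicative, not guaranteed.
from nltk import pos_tag, pos_tag_sents

print(pos_tag("And now for something completely different".split()))
# roughly: [('And', 'CC'), ('now', 'RB'), ('for', 'IN'),
#           ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')]

# pos_tag_sents builds the tagger once and reuses it for every sentence:
print(pos_tag_sents([["Time", "flies"], ["Fruit", "flies", "like", "a", "banana"]]))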
# Natural Language Toolkit: Tagger Interface
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
from abc import ABCMeta, abstractmethod
from itertools import chain
+from six import add_metaclass
from nltk.internals import overridden
from nltk.metrics import accuracy
from nltk.tag.util import untag
-class TaggerI(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class TaggerI(object):
"""
A processing interface for assigning a tag to each token in a list.
Tags are case sensitive strings that identify some property of each
def _check_params(self, train, model):
if (train and model) or (not train and not model):
- raise ValueError("Must specify either training data or trained model.")
+ raise ValueError('Must specify either training data or trained model.')
class FeaturesetTaggerI(TaggerI):
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, division
+
from collections import defaultdict, Counter
from nltk.tag import TaggerI
Feature which examines the text (word) of nearby tokens.
"""
- json_tag = "nltk.tag.brill.Word"
+ json_tag = 'nltk.tag.brill.Word'
@staticmethod
def extract_property(tokens, index):
Feature which examines the tags of nearby tokens.
"""
- json_tag = "nltk.tag.brill.Pos"
+ json_tag = 'nltk.tag.brill.Pos'
@staticmethod
def extract_property(tokens, index):
of the TaggerTrainers available.
"""
- json_tag = "nltk.tag.BrillTagger"
+ json_tag = 'nltk.tag.BrillTagger'
def __init__(self, initial_tagger, rules, training_stats=None):
"""
tids = [r.templateid for r in self._rules]
train_stats = self.train_stats()
- trainscores = train_stats["rulescores"]
+ trainscores = train_stats['rulescores']
assert len(trainscores) == len(tids), (
"corrupt statistics: "
"{0} train scores for {1} rules".format(trainscores, tids)
print(s)
def print_testtrain_stats():
- testscores = test_stats["rulescores"]
+ testscores = test_stats['rulescores']
print(
"TEMPLATE STATISTICS (TEST AND TRAIN) ({0} templates, {1} rules)".format(
len(template_counts), len(tids)
return sum(t[1] != g[1] for pair in zip(xs, gold) for (t, g) in zip(*pair))
testing_stats = {}
- testing_stats["tokencount"] = sum(len(t) for t in sequences)
- testing_stats["sequencecount"] = len(sequences)
+ testing_stats['tokencount'] = sum(len(t) for t in sequences)
+ testing_stats['sequencecount'] = len(sequences)
tagged_tokenses = [self._initial_tagger.tag(tokens) for tokens in sequences]
- testing_stats["initialerrors"] = counterrors(tagged_tokenses)
- testing_stats["initialacc"] = (
- 1 - testing_stats["initialerrors"] / testing_stats["tokencount"]
+ testing_stats['initialerrors'] = counterrors(tagged_tokenses)
+ testing_stats['initialacc'] = (
+ 1 - testing_stats['initialerrors'] / testing_stats['tokencount']
)
# Apply each rule to the entire corpus, in order
- errors = [testing_stats["initialerrors"]]
+ errors = [testing_stats['initialerrors']]
for rule in self._rules:
for tagged_tokens in tagged_tokenses:
rule.apply(tagged_tokens)
errors.append(counterrors(tagged_tokenses))
- testing_stats["rulescores"] = [
+ testing_stats['rulescores'] = [
err0 - err1 for (err0, err1) in zip(errors, errors[1:])
]
- testing_stats["finalerrors"] = errors[-1]
- testing_stats["finalacc"] = (
- 1 - testing_stats["finalerrors"] / testing_stats["tokencount"]
+ testing_stats['finalerrors'] = errors[-1]
+ testing_stats['finalacc'] = (
+ 1 - testing_stats['finalerrors'] / testing_stats['tokencount']
)
return (tagged_tokenses, testing_stats)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, division
+
import bisect
import textwrap
from collections import defaultdict
# Collect some statistics on the training process
trainstats = {}
- trainstats["min_acc"] = min_acc
- trainstats["min_score"] = min_score
- trainstats["tokencount"] = sum(len(t) for t in test_sents)
- trainstats["sequencecount"] = len(test_sents)
- trainstats["templatecount"] = len(self._templates)
- trainstats["rulescores"] = []
- trainstats["initialerrors"] = sum(
+ trainstats['min_acc'] = min_acc
+ trainstats['min_score'] = min_score
+ trainstats['tokencount'] = sum(len(t) for t in test_sents)
+ trainstats['sequencecount'] = len(test_sents)
+ trainstats['templatecount'] = len(self._templates)
+ trainstats['rulescores'] = []
+ trainstats['initialerrors'] = sum(
tag[1] != truth[1]
for paired in zip(test_sents, train_sents)
for (tag, truth) in zip(*paired)
)
- trainstats["initialacc"] = (
- 1 - trainstats["initialerrors"] / trainstats["tokencount"]
+ trainstats['initialacc'] = (
+ 1 - trainstats['initialerrors'] / trainstats['tokencount']
)
if self._trace > 0:
print(
print("Finding initial useful rules...")
self._init_mappings(test_sents, train_sents)
if self._trace:
- print((" Found {} useful rules.".format(len(self._rule_scores))))
+ print((" Found %d useful rules." % len(self._rule_scores)))
# Let the user know what we're up to.
if self._trace > 2:
if rule:
rules.append(rule)
score = self._rule_scores[rule]
- trainstats["rulescores"].append(score)
+ trainstats['rulescores'].append(score)
else:
break # No more good rules left!
# The user can cancel training manually:
except KeyboardInterrupt:
- print("Training stopped manually -- {} rules found".format(len(rules)))
+ print("Training stopped manually -- %d rules found" % len(rules))
# Discard our tag position mapping & rule mappings.
self._clean()
- trainstats["finalerrors"] = trainstats["initialerrors"] - sum(
- trainstats["rulescores"]
+ trainstats['finalerrors'] = trainstats['initialerrors'] - sum(
+ trainstats['rulescores']
)
- trainstats["finalacc"] = (
- 1 - trainstats["finalerrors"] / trainstats["tokencount"]
+ trainstats['finalacc'] = (
+ 1 - trainstats['finalerrors'] / trainstats['tokencount']
)
# Create and return a tagger from the rules we found.
return BrillTagger(self._initial_tagger, rules, trainstats)
rulestr = rule.format(self._ruleformat)
if self._trace > 2:
print(
- "{:4d}{:4d}{:4d}{:4d} |".format(score, num_fixed, num_broken, num_other), end=" "
+ '%4d%4d%4d%4d |' % (score, num_fixed, num_broken, num_other), end=' '
)
print(
textwrap.fill(
rulestr,
- initial_indent=" " * 20,
+ initial_indent=' ' * 20,
width=79,
- subsequent_indent=" " * 18 + "| ",
+ subsequent_indent=' ' * 18 + '| ',
).strip()
)
else:
print(rulestr)
def _trace_apply(self, num_updates):
- prefix = " " * 18 + "|"
+ prefix = ' ' * 18 + '|'
print(prefix)
- print(prefix, "Applying rule to {} positions.".format(num_updates))
+ print(prefix, 'Applying rule to %d positions.' % num_updates)
def _trace_update_rules(self, num_obsolete, num_new, num_unseen):
- prefix = " " * 18 + "|"
- print(prefix, "Updated rule tables:")
- print(prefix, (" - {} rule applications removed".format(num_obsolete)))
+ prefix = ' ' * 18 + '|'
+ print(prefix, 'Updated rule tables:')
+ print(prefix, (' - %d rule applications removed' % num_obsolete))
print(
prefix,
- (" - {} rule applications added ({} novel)".format(num_new, num_unseen)),
+ (' - %d rule applications added (%d novel)' % (num_new, num_unseen)),
)
print(prefix)
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the CRFSuite Tagger
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Long Duong <longdt219@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A module for POS tagging using CRFSuite
"""
-
+from __future__ import absolute_import
+from __future__ import unicode_literals
import unicodedata
import re
from nltk.tag.api import TaggerI
"""
- self._model_file = ""
+ self._model_file = ''
self._tagger = pycrfsuite.Tagger()
if feature_func is None:
self._verbose = verbose
self._training_options = training_opt
- self._pattern = re.compile(r"\d")
+ self._pattern = re.compile(r'\d')
def set_model_file(self, model_file):
self._model_file = model_file
# Capitalization
if token[0].isupper():
- feature_list.append("CAPITALIZATION")
+ feature_list.append('CAPITALIZATION')
# Number
if re.search(self._pattern, token) is not None:
- feature_list.append("HAS_NUM")
+ feature_list.append('HAS_NUM')
# Punctuation
punc_cat = set(["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"])
if all(unicodedata.category(x) in punc_cat for x in token):
- feature_list.append("PUNCTUATION")
+ feature_list.append('PUNCTUATION')
# Suffix up to length 3
if len(token) > 1:
- feature_list.append("SUF_" + token[-1:])
+ feature_list.append('SUF_' + token[-1:])
if len(token) > 2:
- feature_list.append("SUF_" + token[-2:])
+ feature_list.append('SUF_' + token[-2:])
if len(token) > 3:
- feature_list.append("SUF_" + token[-3:])
+ feature_list.append('SUF_' + token[-3:])
- feature_list.append("WORD_" + token)
+ feature_list.append('WORD_' + token)
return feature_list
def tag_sents(self, sents):
- """
+ '''
Tag a list of sentences. NB before using this function, the user should specify the model_file either by
- Train a new model using ``train'' function
- Use the pre-trained model which is set via ``set_model_file'' function
:type sentences : list(list(str))
:return : list of tagged sentences.
:rtype : list (list (tuple(str,str)))
- """
- if self._model_file == "":
+ '''
+ if self._model_file == '':
raise Exception(
- " No model file is found !! Please use train or set_model_file function"
+ ' No model file is found !! Please use train or set_model_file function'
)
# We need the list of sentences instead of the list generator for matching the input and output
labels = self._tagger.tag(features)
if len(labels) != len(tokens):
- raise Exception(" Predicted Length Not Matched, Expect Errors !")
+ raise Exception(' Predicted Length Not Matched, Expect Errors !')
tagged_sent = list(zip(tokens, labels))
result.append(tagged_sent)
return result
def train(self, train_data, model_file):
- """
+ '''
Train the CRF tagger using CRFSuite
:params train_data : is the list of annotated sentences.
:type train_data : list (list(tuple(str,str)))
:params model_file : the model will be saved to this file.
- """
+ '''
trainer = pycrfsuite.Trainer(verbose=self._verbose)
trainer.set_params(self._training_options)
self.set_model_file(model_file)
def tag(self, tokens):
- """
+ '''
Tag a sentence using Python CRFSuite Tagger. NB before using this function, the user should specify the model_file either by
- Train a new model using ``train'' function
- Use the pre-trained model which is set via ``set_model_file'' function
:type tokens : list(str)
:return : list of tagged tokens.
:rtype : list (tuple(str,str))
- """
+ '''
return self.tag_sents([tokens])[0]
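# A minimal train/tag sketch for CRFTagger, assuming the optional python-crfsuite
# dependency is installed; the toy corpus and the 'model.crf.tagger' filename are
# purely illustrative.
from nltk.tag import CRFTagger

train_data = [
    [("University", "Noun"), ("is", "Verb"), ("a", "Det"), ("good", "Adj"), ("place", "Noun")],
    [("dog", "Noun"), ("eat", "Verb"), ("meat", "Noun")],
]
ct = CRFTagger()
ct.train(train_data, "model.crf.tagger")     # writes the model file and calls set_model_file()
print(ct.tag_sents([["dog", "is", "good"], ["Cat", "eat", "meat"]]))
print(ct.tag(["University", "is", "good"]))  # tag() wraps tag_sents() for a single sentence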
# Natural Language Toolkit: Hidden Markov Model
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Trevor Cohn <tacohn@csse.unimelb.edu.au>
# Philip Blunsom <pcbl@csse.unimelb.edu.au>
# Tiago Tresoldi <tiago@tresoldi.pro.br> (fixes)
For more information, please consult the source code for this module,
which includes extensive demonstration code.
"""
+from __future__ import print_function, unicode_literals, division
import re
import itertools
+from six.moves import map, zip
+
try:
import numpy as np
except ImportError:
)
from nltk.metrics import accuracy
from nltk.util import LazyMap, unique_list
+from nltk.compat import python_2_unicode_compatible
from nltk.tag.api import TaggerI
return labeled_symbols
+@python_2_unicode_compatible
class HiddenMarkovModelTagger(TaggerI):
"""
Hidden Markov model class, a generative model for labelling sequence data.
)
if test_sequence:
- hmm.test(test_sequence, verbose=kwargs.get("verbose", False))
+ hmm.test(test_sequence, verbose=kwargs.get('verbose', False))
if unlabeled_sequence:
- max_iterations = kwargs.get("max_iterations", 5)
+ max_iterations = kwargs.get('max_iterations', 5)
hmm = trainer.train_unsupervised(
unlabeled_sequence, model=hmm, max_iterations=max_iterations
)
if test_sequence:
- hmm.test(test_sequence, verbose=kwargs.get("verbose", False))
+ hmm.test(test_sequence, verbose=kwargs.get('verbose', False))
return hmm
if cum_p <= p <= cum_p + add_p:
return sample
cum_p += add_p
- raise Exception("Invalid probability distribution - " "does not sum to one")
+ raise Exception('Invalid probability distribution - ' 'does not sum to one')
def entropy(self, unlabeled_sequence):
"""
for i, state in enumerate(self._states):
p = 2 ** (alpha[0, i] + beta[0, i] - normalisation)
entropy -= p * self._priors.logprob(state)
- # print('p(s_0 = %s) =' % state, p)
+ # print 'p(s_0 = %s) =' % state, p
# state transitions
for t0 in range(T - 1):
- normalisation
)
entropy -= p * self._transitions[s0].logprob(s1)
- # print('p(s_%d = %s, s_%d = %s) =' % (t0, s0, t1, s1), p)
+ # print 'p(s_%d = %s, s_%d = %s) =' % (t0, s0, t1, s1), p
# symbol emissions
for t in range(T):
entropy -= p * self._outputs[state].logprob(
unlabeled_sequence[t][_TEXT]
)
- # print('p(s_%d = %s) =' % (t, state), p)
+ # print 'p(s_%d = %s) =' % (t, state), p
return entropy
log_probs.append(lp)
normalisation = _log_add(*log_probs)
+ # ps = zeros((T, N), float64)
+ # for labelling, lp in zip(labellings, log_probs):
+ # for t in range(T):
+ # ps[t, self._states.index(labelling[t])] += \
+ # 2**(lp - normalisation)
+
+ # for t in range(T):
+ # print 'prob[%d] =' % t, ps[t]
+
entropy = 0
for lp in log_probs:
lp -= normalisation
if verbose:
for test_sent, predicted_sent in zip(test_sequence, predicted_sequence):
print(
- "Test:",
- " ".join("%s/%s" % (token, tag) for (token, tag) in test_sent),
+ 'Test:',
+ ' '.join('%s/%s' % (token, tag) for (token, tag) in test_sent),
)
print()
- print("Untagged:", " ".join("%s" % token for (token, tag) in test_sent))
+ print('Untagged:', ' '.join("%s" % token for (token, tag) in test_sent))
print()
print(
- "HMM-tagged:",
- " ".join("%s/%s" % (token, tag) for (token, tag) in predicted_sent),
+ 'HMM-tagged:',
+ ' '.join('%s/%s' % (token, tag) for (token, tag) in predicted_sent),
)
print()
print(
- "Entropy:",
+ 'Entropy:',
self.entropy([(token, None) for (token, tag) in predicted_sent]),
)
print()
- print("-" * 60)
+ print('-' * 60)
test_tags = flatten(map(tags, test_sequence))
predicted_tags = flatten(map(tags, predicted_sequence))
acc = accuracy(test_tags, predicted_tags)
count = sum(len(sent) for sent in test_sequence)
- print("accuracy over %d tokens: %.2f" % (count, acc * 100))
+ print('accuracy over %d tokens: %.2f' % (count, acc * 100))
def __repr__(self):
- return "<HiddenMarkovModelTagger %d states and %d output symbols>" % (
+ return '<HiddenMarkovModelTagger %d states and %d output symbols>' % (
len(self._states),
len(self._symbols),
)
model = self.train_supervised(labeled_sequences, **kwargs)
if unlabeled_sequences:
if model:
- kwargs["model"] = model
+ kwargs['model'] = model
model = self.train_unsupervised(unlabeled_sequences, **kwargs)
return model
# create a uniform HMM, which will be iteratively refined, unless
# given an existing model
- model = kwargs.get("model")
+ model = kwargs.get('model')
if not model:
priors = RandomProbDist(self._states)
transitions = DictionaryConditionalProbDist(
converged = False
last_logprob = None
iteration = 0
- max_iterations = kwargs.get("max_iterations", 1000)
- epsilon = kwargs.get("convergence_logprob", 1e-6)
+ max_iterations = kwargs.get('max_iterations', 1000)
+ epsilon = kwargs.get('convergence_logprob', 1e-6)
while not converged and iteration < max_iterations:
A_numer = _ninf_array((N, N))
if iteration > 0 and abs(logprob - last_logprob) < epsilon:
converged = True
- print("iteration", iteration, "logprob", logprob)
+ print('iteration', iteration, 'logprob', logprob)
iteration += 1
last_logprob = logprob
"""
Return an example HMM (described on page 381 of Huang et al.)
"""
- states = ["bull", "bear", "static"]
- symbols = ["up", "down", "unchanged"]
+ states = ['bull', 'bear', 'static']
+ symbols = ['up', 'down', 'unchanged']
A = np.array([[0.6, 0.2, 0.2], [0.5, 0.3, 0.2], [0.4, 0.1, 0.5]], np.float64)
B = np.array([[0.7, 0.1, 0.2], [0.1, 0.6, 0.3], [0.3, 0.3, 0.4]], np.float64)
pi = np.array([0.5, 0.2, 0.3], np.float64)
model, states, symbols = _market_hmm_example()
- print("Testing", model)
+ print('Testing', model)
for test in [
- ["up", "up"],
- ["up", "down", "up"],
- ["down"] * 5,
- ["unchanged"] * 5 + ["up"],
+ ['up', 'up'],
+ ['up', 'down', 'up'],
+ ['down'] * 5,
+ ['unchanged'] * 5 + ['up'],
]:
sequence = [(t, None) for t in test]
- print("Testing with state sequence", test)
- print("probability =", model.probability(sequence))
- print("tagging = ", model.tag([word for (word, tag) in sequence]))
- print("p(tagged) = ", model.probability(sequence))
- print("H = ", model.entropy(sequence))
- print("H_exh = ", model._exhaustive_entropy(sequence))
- print("H(point) = ", model.point_entropy(sequence))
- print("H_exh(point)=", model._exhaustive_point_entropy(sequence))
+ print('Testing with state sequence', test)
+ print('probability =', model.probability(sequence))
+ print('tagging = ', model.tag([word for (word, tag) in sequence]))
+ print('p(tagged) = ', model.probability(sequence))
+ print('H = ', model.entropy(sequence))
+ print('H_exh = ', model._exhaustive_entropy(sequence))
+ print('H(point) = ', model.point_entropy(sequence))
+ print('H_exh(point)=', model._exhaustive_point_entropy(sequence))
print()
def load_pos(num_sents):
from nltk.corpus import brown
- sentences = brown.tagged_sents(categories="news")[:num_sents]
+ sentences = brown.tagged_sents(categories='news')[:num_sents]
- tag_re = re.compile(r"[*]|--|[^+*-]+")
+ tag_re = re.compile(r'[*]|--|[^+*-]+')
tag_set = set()
symbols = set()
print("HMM POS tagging demo")
print()
- print("Training HMM...")
+ print('Training HMM...')
labelled_sequences, tag_set, symbols = load_pos(20000)
trainer = HiddenMarkovModelTrainer(tag_set, symbols)
hmm = trainer.train_supervised(
estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins),
)
- print("Testing...")
+ print('Testing...')
hmm.test(labelled_sequences[:10], verbose=True)
print("Baum-Welch demo for POS tagging")
print()
- print("Training HMM (supervised, %d sentences)..." % supervised)
+ print('Training HMM (supervised, %d sentences)...' % supervised)
sentences, tag_set, symbols = load_pos(test + supervised + unsupervised)
hmm.test(sentences[:test], verbose=verbose)
- print("Training (unsupervised, %d sentences)..." % unsupervised)
+ print('Training (unsupervised, %d sentences)...' % unsupervised)
# it's rather slow - so only use 10 samples by default
unlabeled = _untag(sentences[test + supervised :])
hmm = trainer.train_unsupervised(
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the HunPos POS-tagger
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
# Dávid Márk Nemeskey <nemeskeyd@gmail.com> (modifications)
# Attila Zséder <zseder@gmail.com> (modifications)
import os
from subprocess import Popen, PIPE
+from six import text_type
+
from nltk.internals import find_binary, find_file
from nltk.tag.api import TaggerI
-_hunpos_url = "http://code.google.com/p/hunpos/"
+_hunpos_url = 'http://code.google.com/p/hunpos/'
-_hunpos_charset = "ISO-8859-1"
+_hunpos_charset = 'ISO-8859-1'
"""The default encoding used by hunpos: ISO-8859-1."""
"""
self._closed = True
hunpos_paths = [
- ".",
- "/usr/bin",
- "/usr/local/bin",
- "/opt/local/bin",
- "/Applications/bin",
- "~/bin",
- "~/Applications/bin",
+ '.',
+ '/usr/bin',
+ '/usr/local/bin',
+ '/opt/local/bin',
+ '/Applications/bin',
+ '~/bin',
+ '~/Applications/bin',
]
hunpos_paths = list(map(os.path.expanduser, hunpos_paths))
self._hunpos_bin = find_binary(
- "hunpos-tag",
+ 'hunpos-tag',
path_to_bin,
- env_vars=("HUNPOS_TAGGER",),
+ env_vars=('HUNPOS_TAGGER',),
searchpath=hunpos_paths,
url=_hunpos_url,
verbose=verbose,
)
self._hunpos_model = find_file(
- path_to_model, env_vars=("HUNPOS_TAGGER",), verbose=verbose
+ path_to_model, env_vars=('HUNPOS_TAGGER',), verbose=verbose
)
self._encoding = encoding
self._hunpos = Popen(
"""
for token in tokens:
assert "\n" not in token, "Tokens should not contain newlines"
- if isinstance(token, str):
+ if isinstance(token, text_type):
token = token.encode(self._encoding)
self._hunpos.stdin.write(token + b"\n")
# We write a final empty line to tell hunpos that the sentence is finished:
from nose import SkipTest
try:
- HunposTagger("en_wsj.model")
+ HunposTagger('en_wsj.model')
except LookupError:
raise SkipTest("HunposTagger is not available")
# Natural Language Toolkit: Tagset Mapping
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Nathan Schneider <nathan@cmu.edu>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
"""
+from __future__ import print_function, unicode_literals, division
from collections import defaultdict
from os.path import join
_UNIVERSAL_DATA = "taggers/universal_tagset"
_UNIVERSAL_TAGS = (
- "VERB",
- "NOUN",
- "PRON",
- "ADJ",
- "ADV",
- "ADP",
- "CONJ",
- "DET",
- "NUM",
- "PRT",
- "X",
- ".",
+ 'VERB',
+ 'NOUN',
+ 'PRON',
+ 'ADJ',
+ 'ADV',
+ 'ADP',
+ 'CONJ',
+ 'DET',
+ 'NUM',
+ 'PRT',
+ 'X',
+ '.',
)
# _MAPPINGS = defaultdict(lambda: defaultdict(dict))
# the mapping between tagset T1 and T2 returns UNK if applied to an unrecognized tag
-_MAPPINGS = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: "UNK")))
+_MAPPINGS = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 'UNK')))
def _load_universal_map(fileid):
- contents = load(join(_UNIVERSAL_DATA, fileid + ".map"), format="text")
+ contents = load(join(_UNIVERSAL_DATA, fileid + '.map'), format="text")
# When mapping to the Universal Tagset,
# map unknown inputs to 'X' not 'UNK'
- _MAPPINGS[fileid]["universal"].default_factory = lambda: "X"
+ _MAPPINGS[fileid]['universal'].default_factory = lambda: 'X'
for line in contents.splitlines():
line = line.strip()
- if line == "":
+ if line == '':
continue
- fine, coarse = line.split("\t")
+ fine, coarse = line.split('\t')
- assert coarse in _UNIVERSAL_TAGS, "Unexpected coarse tag: {}".format(coarse)
+ assert coarse in _UNIVERSAL_TAGS, 'Unexpected coarse tag: {}'.format(coarse)
assert (
- fine not in _MAPPINGS[fileid]["universal"]
- ), "Multiple entries for original tag: {}".format(fine)
+ fine not in _MAPPINGS[fileid]['universal']
+ ), 'Multiple entries for original tag: {}'.format(fine)
- _MAPPINGS[fileid]["universal"][fine] = coarse
+ _MAPPINGS[fileid]['universal'][fine] = coarse
def tagset_mapping(source, target):
"""
if source not in _MAPPINGS or target not in _MAPPINGS[source]:
- if target == "universal":
+ if target == 'universal':
_load_universal_map(source)
# Added the new Russian National Corpus mappings because the
# Russian model for nltk.pos_tag() uses it.
- _MAPPINGS["ru-rnc-new"]["universal"] = {
- "A": "ADJ",
- "A-PRO": "PRON",
- "ADV": "ADV",
- "ADV-PRO": "PRON",
- "ANUM": "ADJ",
- "CONJ": "CONJ",
- "INTJ": "X",
- "NONLEX": ".",
- "NUM": "NUM",
- "PARENTH": "PRT",
- "PART": "PRT",
- "PR": "ADP",
- "PRAEDIC": "PRT",
- "PRAEDIC-PRO": "PRON",
- "S": "NOUN",
- "S-PRO": "PRON",
- "V": "VERB",
+ _MAPPINGS['ru-rnc-new']['universal'] = {
+ 'A': 'ADJ',
+ 'A-PRO': 'PRON',
+ 'ADV': 'ADV',
+ 'ADV-PRO': 'PRON',
+ 'ANUM': 'ADJ',
+ 'CONJ': 'CONJ',
+ 'INTJ': 'X',
+ 'NONLEX': '.',
+ 'NUM': 'NUM',
+ 'PARENTH': 'PRT',
+ 'PART': 'PRT',
+ 'PR': 'ADP',
+ 'PRAEDIC': 'PRT',
+ 'PRAEDIC-PRO': 'PRON',
+ 'S': 'NOUN',
+ 'S-PRO': 'PRON',
+ 'V': 'VERB',
}
return _MAPPINGS[source][target]
"""
# we need a systematic approach to naming
- if target == "universal":
- if source == "wsj":
- source = "en-ptb"
- if source == "brown":
- source = "en-brown"
+ if target == 'universal':
+ if source == 'wsj':
+ source = 'en-ptb'
+ if source == 'brown':
+ source = 'en-brown'
return tagset_mapping(source, target)[source_tag]
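# A minimal sketch of the tagset mapping helpers defined above, assuming the
# 'universal_tagset' mapping files have been downloaded via nltk.download():
from nltk.tag.mapping import map_tag, tagset_mapping

print(map_tag("en-ptb", "universal", "NNS"))        # Penn Treebank plural noun -> 'NOUN'
print(map_tag("wsj", "universal", "VBZ"))           # 'wsj' is aliased to 'en-ptb' above -> 'VERB'
print(tagset_mapping("en-ptb", "universal")["DT"])  # the full mapping behaves like a dict -> 'DET'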
#
# This module is provided under the terms of the MIT License.
+from __future__ import absolute_import
+from __future__ import print_function, division
+
import random
from collections import defaultdict
import pickle
from nltk.tag.api import TaggerI
from nltk.data import find, load
-
-from nltk import jsontags
-
-try:
- import numpy as np
-except ImportError:
- pass
+from nltk.compat import python_2_unicode_compatible
PICKLE = "averaged_perceptron_tagger.pickle"
-@jsontags.register_tag
-class AveragedPerceptron:
- """An averaged perceptron, as implemented by Matthew Honnibal.
+class AveragedPerceptron(object):
+
+ '''An averaged perceptron, as implemented by Matthew Honnibal.
See more implementation details here:
https://explosion.ai/blog/part-of-speech-pos-tagger-in-python
- """
-
- json_tag = "nltk.tag.perceptron.AveragedPerceptron"
+ '''
- def __init__(self, weights=None):
+ def __init__(self):
# Each feature gets its own weight vector, so weights is a dict-of-dicts
- self.weights = weights if weights else {}
+ self.weights = {}
self.classes = set()
# The accumulated values, for the averaging. These will be keyed by
# feature/clas tuples
# Number of instances seen
self.i = 0
- def _softmax(self, scores):
- s = np.fromiter(scores.values(), dtype=float)
- exps = np.exp(s)
- return exps / np.sum(exps)
-
- def predict(self, features, return_conf=False):
- """Dot-product the features and current weights and return the best label."""
+ def predict(self, features):
+ '''Dot-product the features and current weights and return the best label.'''
scores = defaultdict(float)
for feat, value in features.items():
if feat not in self.weights or value == 0:
weights = self.weights[feat]
for label, weight in weights.items():
scores[label] += value * weight
-
# Do a secondary alphabetic sort, for stability
- best_label = max(self.classes, key=lambda label: (scores[label], label))
- # compute the confidence
- conf = max(self._softmax(scores)) if return_conf == True else None
-
- return best_label, conf
+ return max(self.classes, key=lambda label: (scores[label], label))
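# A hand-traced sketch of how predict() consumes the dict-of-dicts weights;
# the feature names and weight values below are made up for illustration:
#
#     ap = AveragedPerceptron()
#     ap.classes = {"NN", "VB"}
#     ap.weights = {"bias": {"NN": 1.0, "VB": 0.5}, "i suffix ing": {"VB": 2.0}}
#     ap.predict({"bias": 1, "i suffix ing": 1})
#     # scores: NN = 1.0, VB = 0.5 + 2.0 = 2.5, so the best label is 'VB'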
def update(self, truth, guess, features):
- """Update the feature weights."""
+ '''Update the feature weights.'''
def upd_feat(c, f, w, v):
param = (f, c)
upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
def average_weights(self):
- """Average weights from all iterations."""
+ '''Average weights from all iterations.'''
for feat, weights in self.weights.items():
new_feat_weights = {}
for clas, weight in weights.items():
self.weights[feat] = new_feat_weights
def save(self, path):
- """Save the pickled model weights."""
- with open(path, "wb") as fout:
+ '''Save the pickled model weights.'''
+ with open(path, 'wb') as fout:
return pickle.dump(dict(self.weights), fout)
def load(self, path):
- """Load the pickled model weights."""
+ '''Load the pickled model weights.'''
self.weights = load(path)
- def encode_json_obj(self):
- return self.weights
-
- @classmethod
- def decode_json_obj(cls, obj):
- return cls(obj)
-
-@jsontags.register_tag
+@python_2_unicode_compatible
class PerceptronTagger(TaggerI):
- """
+ '''
Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
See more implementation details here:
https://explosion.ai/blog/part-of-speech-pos-tagger-in-python
>>> pretrain.tag("The red cat".split())
[('The', 'DT'), ('red', 'JJ'), ('cat', 'NN')]
- """
-
- json_tag = "nltk.tag.sequential.PerceptronTagger"
+ '''
- START = ["-START-", "-START2-"]
- END = ["-END-", "-END2-"]
+ START = ['-START-', '-START2-']
+ END = ['-END-', '-END2-']
def __init__(self, load=True):
- """
+ '''
:param load: Load the pickled model upon instantiation.
- """
+ '''
self.model = AveragedPerceptron()
self.tagdict = {}
self.classes = set()
if load:
- AP_MODEL_LOC = "file:" + str(
- find("taggers/averaged_perceptron_tagger/" + PICKLE)
+ AP_MODEL_LOC = 'file:' + str(
+ find('taggers/averaged_perceptron_tagger/' + PICKLE)
)
self.load(AP_MODEL_LOC)
- def tag(self, tokens, return_conf=False, use_tagdict=True):
- """
+ def tag(self, tokens):
+ '''
Tag tokenized sentences.
:params tokens: list of word
:type tokens: list(str)
- """
+ '''
prev, prev2 = self.START
output = []
context = self.START + [self.normalize(w) for w in tokens] + self.END
for i, word in enumerate(tokens):
- tag, conf = (
- (self.tagdict.get(word), 1.0) if use_tagdict == True else (None, None)
- )
+ tag = self.tagdict.get(word)
if not tag:
features = self._get_features(i, word, context, prev, prev2)
- tag, conf = self.model.predict(features, return_conf)
- output.append((word, tag, conf) if return_conf == True else (word, tag))
-
+ tag = self.model.predict(features)
+ output.append((word, tag))
prev2 = prev
prev = tag
return output
def train(self, sentences, save_loc=None, nr_iter=5):
- """Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
+ '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
controls the number of Perceptron training iterations.
:param sentences: A list or iterator of sentences, where each sentence
is a list of (words, tags) tuples.
:param save_loc: If not ``None``, saves a pickled model in this location.
:param nr_iter: Number of training iterations.
- """
+ '''
# We'd like to allow ``sentences`` to be either a list or an iterator,
# the latter being especially important for a large training dataset.
# Because ``self._make_tagdict(sentences)`` runs regardless, we make
guess = self.tagdict.get(word)
if not guess:
feats = self._get_features(i, word, context, prev, prev2)
- guess, _ = self.model.predict(feats)
+ guess = self.model.predict(feats)
self.model.update(tags[i], guess, feats)
prev2 = prev
prev = guess
self.model.average_weights()
# Pickle as a binary file
if save_loc is not None:
- with open(save_loc, "wb") as fout:
+ with open(save_loc, 'wb') as fout:
# changed protocol from -1 to 2 to make pickling Python 2 compatible
pickle.dump((self.model.weights, self.tagdict, self.classes), fout, 2)
def load(self, loc):
- """
+ '''
:param loc: Load a pickled model at location.
:type loc: str
- """
+ '''
self.model.weights, self.tagdict, self.classes = load(loc)
self.model.classes = self.classes
- def encode_json_obj(self):
- return self.model.weights, self.tagdict, list(self.classes)
-
- @classmethod
- def decode_json_obj(cls, obj):
- tagger = cls(load=False)
- tagger.model.weights, tagger.tagdict, tagger.classes = obj
- tagger.classes = set(tagger.classes)
- tagger.model.classes = tagger.classes
- return tagger
-
def normalize(self, word):
- """
+ '''
Normalization used in pre-processing.
- All words are lower cased
- Groups of digits of length 4 are represented as !YEAR;
- Other digits are represented as !DIGITS
:rtype: str
- """
- if "-" in word and word[0] != "-":
- return "!HYPHEN"
+ '''
+ if '-' in word and word[0] != '-':
+ return '!HYPHEN'
elif word.isdigit() and len(word) == 4:
- return "!YEAR"
+ return '!YEAR'
elif word[0].isdigit():
- return "!DIGITS"
+ return '!DIGITS'
else:
return word.lower()
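# Hand-traced examples of the normalization rules documented above
# (load=False skips fetching the pickled model; the words are illustrative):
#
#     tagger = PerceptronTagger(load=False)
#     tagger.normalize("state-of-the-art")  # -> '!HYPHEN' (contains '-', does not start with one)
#     tagger.normalize("1984")              # -> '!YEAR'   (all digits, length 4)
#     tagger.normalize("3rd")               # -> '!DIGITS' (leading digit)
#     tagger.normalize("Cat")               # -> 'cat'     (default: lowercased)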
def _get_features(self, i, word, context, prev, prev2):
- """Map tokens into a feature representation, implemented as a
+ '''Map tokens into a feature representation, implemented as a
{hashable: int} dict. If the features change, a new model must be
trained.
- """
+ '''
def add(name, *args):
- features[" ".join((name,) + tuple(args))] += 1
+ features[' '.join((name,) + tuple(args))] += 1
i += len(self.START)
features = defaultdict(int)
# It's useful to have a constant feature, which acts sort of like a prior
- add("bias")
- add("i suffix", word[-3:])
- add("i pref1", word[0])
- add("i-1 tag", prev)
- add("i-2 tag", prev2)
- add("i tag+i-2 tag", prev, prev2)
- add("i word", context[i])
- add("i-1 tag+i word", prev, context[i])
- add("i-1 word", context[i - 1])
- add("i-1 suffix", context[i - 1][-3:])
- add("i-2 word", context[i - 2])
- add("i+1 word", context[i + 1])
- add("i+1 suffix", context[i + 1][-3:])
- add("i+2 word", context[i + 2])
+ add('bias')
+ add('i suffix', word[-3:])
+ add('i pref1', word[0])
+ add('i-1 tag', prev)
+ add('i-2 tag', prev2)
+ add('i tag+i-2 tag', prev, prev2)
+ add('i word', context[i])
+ add('i-1 tag+i word', prev, context[i])
+ add('i-1 word', context[i - 1])
+ add('i-1 suffix', context[i - 1][-3:])
+ add('i-2 word', context[i - 2])
+ add('i+1 word', context[i + 1])
+ add('i+1 suffix', context[i + 1][-3:])
+ add('i+2 word', context[i + 2])
return features
def _make_tagdict(self, sentences):
- """
+ '''
Make a tag dictionary for single-tag words.
:param sentences: A list of list of (word, tag) tuples.
- """
+ '''
counts = defaultdict(lambda: defaultdict(int))
for sentence in sentences:
self._sentences.append(sentence)
def _load_data_conll_format(filename):
- print("Read from file: ", filename)
- with open(filename, "rb") as fin:
+ print('Read from file: ', filename)
+ with open(filename, 'rb') as fin:
sentences = []
sentence = []
for line in fin.readlines():
sentences.append(sentence)
sentence = []
continue
- tokens = line.split("\t")
+ tokens = line.split('\t')
word = tokens[1]
tag = tokens[4]
sentence.append((word, tag))
# Train: section 2-11
# Test : section 23
tagger = PerceptronTagger()
- training = _load_data_conll_format("english_ptb_train.conll")
- testing = _load_data_conll_format("english_ptb_test.conll")
- print("Size of training and testing (sentence)", len(training), len(testing))
+ training = _load_data_conll_format('english_ptb_train.conll')
+ testing = _load_data_conll_format('english_ptb_test.conll')
+ print('Size of training and testing (sentence)', len(training), len(testing))
# Train and save the model
tagger.train(training, PICKLE)
- print("Accuracy : ", tagger.evaluate(testing))
+ print('Accuracy : ', tagger.evaluate(testing))
-if __name__ == "__main__":
+if __name__ == '__main__':
# _get_pretrain_model()
pass
# encoding: utf-8
# Natural Language Toolkit: Senna POS Tagger
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
('NY', 'B-LOC'), (',', 'O'), ('USA', 'B-LOC'), ('.', 'O')]
"""
+from nltk.compat import python_2_unicode_compatible
from nltk.classify import Senna
-
+@python_2_unicode_compatible
class SennaTagger(Senna):
- def __init__(self, path, encoding="utf-8"):
- super(SennaTagger, self).__init__(path, ["pos"], encoding)
+ def __init__(self, path, encoding='utf-8'):
+ super(SennaTagger, self).__init__(path, ['pos'], encoding)
def tag_sents(self, sentences):
"""
for i in range(len(tagged_sents)):
for j in range(len(tagged_sents[i])):
annotations = tagged_sents[i][j]
- tagged_sents[i][j] = (annotations["word"], annotations["pos"])
+ tagged_sents[i][j] = (annotations['word'], annotations['pos'])
return tagged_sents
-
+@python_2_unicode_compatible
class SennaChunkTagger(Senna):
- def __init__(self, path, encoding="utf-8"):
- super(SennaChunkTagger, self).__init__(path, ["chk"], encoding)
+ def __init__(self, path, encoding='utf-8'):
+ super(SennaChunkTagger, self).__init__(path, ['chk'], encoding)
def tag_sents(self, sentences):
"""
for i in range(len(tagged_sents)):
for j in range(len(tagged_sents[i])):
annotations = tagged_sents[i][j]
- tagged_sents[i][j] = (annotations["word"], annotations["chk"])
+ tagged_sents[i][j] = (annotations['word'], annotations['chk'])
return tagged_sents
def bio_to_chunks(self, tagged_sent, chunk_type):
current_chunk_position = []
for idx, word_pos in enumerate(tagged_sent):
word, pos = word_pos
- if "-" + chunk_type in pos: # Append the word to the current_chunk.
+ if '-' + chunk_type in pos: # Append the word to the current_chunk.
current_chunk.append((word))
current_chunk_position.append((idx))
else:
if current_chunk: # Flush the full chunk when out of an NP.
- _chunk_str = " ".join(current_chunk)
- _chunk_pos_str = "-".join(map(str, current_chunk_position))
+ _chunk_str = ' '.join(current_chunk)
+ _chunk_pos_str = '-'.join(map(str, current_chunk_position))
yield _chunk_str, _chunk_pos_str
current_chunk = []
current_chunk_position = []
if current_chunk: # Flush the last chunk.
- yield " ".join(current_chunk), "-".join(map(str, current_chunk_position))
-
+ yield ' '.join(current_chunk), '-'.join(map(str, current_chunk_position))
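# A hand-traced sketch of bio_to_chunks() on a hypothetical BIO-tagged sentence;
# in practice the tags come from tag()/tag_sents(), which require the Senna binary,
# and chunk_tagger below stands in for a SennaChunkTagger instance:
#
#     tagged = [('the', 'B-NP'), ('little', 'I-NP'), ('dog', 'I-NP'),
#               ('barked', 'B-VP'), ('at', 'B-PP'), ('the', 'B-NP'), ('cat', 'I-NP')]
#     list(chunk_tagger.bio_to_chunks(tagged, chunk_type='NP'))
#     # -> [('the little dog', '0-1-2'), ('the cat', '5-6')]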
+@python_2_unicode_compatible
class SennaNERTagger(Senna):
- def __init__(self, path, encoding="utf-8"):
- super(SennaNERTagger, self).__init__(path, ["ner"], encoding)
+ def __init__(self, path, encoding='utf-8'):
+ super(SennaNERTagger, self).__init__(path, ['ner'], encoding)
def tag_sents(self, sentences):
"""
for i in range(len(tagged_sents)):
for j in range(len(tagged_sents[i])):
annotations = tagged_sents[i][j]
- tagged_sents[i][j] = (annotations["word"], annotations["ner"])
+ tagged_sents[i][j] = (annotations['word'], annotations['ner'])
return tagged_sents
from nose import SkipTest
try:
- tagger = Senna("/usr/share/senna-v3.0", ["pos", "chk", "ner"])
+ tagger = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
except OSError:
raise SkipTest("Senna executable not found")
# Natural Language Toolkit: Sequential Backoff Taggers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# Tiago Tresoldi <tresoldi@users.sf.net> (original affix tagger)
consulted instead. Any SequentialBackoffTagger may serve as a
backoff tagger for any other SequentialBackoffTagger.
"""
-import ast
+from __future__ import print_function, unicode_literals
from abc import abstractmethod
import re
from nltk.probability import ConditionalFreqDist
from nltk.classify import NaiveBayesClassifier
+from nltk.compat import python_2_unicode_compatible
from nltk.tag.api import TaggerI, FeaturesetTaggerI
"""
+@python_2_unicode_compatible
class ContextTagger(SequentialBackoffTagger):
"""
An abstract base class for sequential backoff taggers that choose
:param context_to_tag: A dictionary mapping contexts to tags.
:param backoff: The backoff tagger that should be used for this tagger.
"""
- super().__init__(backoff)
+ SequentialBackoffTagger.__init__(self, backoff)
self._context_to_tag = context_to_tag if context_to_tag else {}
@abstractmethod
return len(self._context_to_tag)
def __repr__(self):
- return "<{}: size={}>".format(self.__class__.__name__, self.size())
+ return '<%s: size=%d>' % (self.__class__.__name__, self.size())
def _train(self, tagged_corpus, cutoff=0, verbose=False):
"""
size = len(self._context_to_tag)
backoff = 100 - (hit_count * 100.0) / token_count
pruning = 100 - (size * 100.0) / len(fd.conditions())
- print("[Trained Unigram tagger:", end=" ")
- print("size={}, backoff={:.2f}%, pruning={:.2f}%]".format(size, backoff, pruning))
+ print("[Trained Unigram tagger:", end=' ')
+ print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning))
######################################################################
# Tagger Classes
######################################################################
-
-
+@python_2_unicode_compatible
@jsontags.register_tag
class DefaultTagger(SequentialBackoffTagger):
"""
:type tag: str
"""
- json_tag = "nltk.tag.sequential.DefaultTagger"
+ json_tag = 'nltk.tag.sequential.DefaultTagger'
def __init__(self, tag):
self._tag = tag
- super().__init__(None)
+ SequentialBackoffTagger.__init__(self, None)
def encode_json_obj(self):
return self._tag
return self._tag # ignore token and history
def __repr__(self):
- return "<DefaultTagger: tag={}>".format(self._tag)
+ return '<DefaultTagger: tag=%s>' % self._tag
@jsontags.register_tag
context-to-tag table for the new tagger.
"""
- json_tag = "nltk.tag.sequential.NgramTagger"
+ json_tag = 'nltk.tag.sequential.NgramTagger'
def __init__(
self, n, train=None, model=None, backoff=None, cutoff=0, verbose=False
self._n = n
self._check_params(train, model)
- super().__init__(model, backoff)
+ ContextTagger.__init__(self, model, backoff)
if train:
self._train(train, cutoff, verbose)
def encode_json_obj(self):
- _context_to_tag = {repr(k): v for k, v in self._context_to_tag.items()}
- if "NgramTagger" in self.__class__.__name__:
- return self._n, _context_to_tag, self.backoff
- else:
- return _context_to_tag, self.backoff
+ return self._n, self._context_to_tag, self.backoff
@classmethod
def decode_json_obj(cls, obj):
- try:
- _n, _context_to_tag, backoff = obj
- except ValueError:
- _context_to_tag, backoff = obj
-
- if not _context_to_tag:
- return backoff
-
- _context_to_tag = {ast.literal_eval(k): v for k, v in _context_to_tag.items()}
-
- if "NgramTagger" in cls.__name__:
- return cls(_n, model=_context_to_tag, backoff=backoff)
- else:
- return cls(model=_context_to_tag, backoff=backoff)
+ _n, _context_to_tag, backoff = obj
+ return cls(_n, model=_context_to_tag, backoff=backoff)
def context(self, tokens, index, history):
tag_context = tuple(history[max(0, index - self._n + 1) : index])
>>> test_sent = brown.sents(categories='news')[0]
>>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
>>> for tok, tag in unigram_tagger.tag(test_sent):
- ... print("({}, {}), ".format(tok, tag))
+ ... print("(%s, %s), " % (tok, tag))
(The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL),
(Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT),
(investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ),
:type cutoff: int
"""
- json_tag = "nltk.tag.sequential.UnigramTagger"
+ json_tag = 'nltk.tag.sequential.UnigramTagger'
def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
- super().__init__(1, train, model, backoff, cutoff, verbose)
+ NgramTagger.__init__(self, 1, train, model, backoff, cutoff, verbose)
+
+ def encode_json_obj(self):
+ return self._context_to_tag, self.backoff
+
+ @classmethod
+ def decode_json_obj(cls, obj):
+ _context_to_tag, backoff = obj
+ return cls(model=_context_to_tag, backoff=backoff)
def context(self, tokens, index, history):
return tokens[index]
:type cutoff: int
"""
- json_tag = "nltk.tag.sequential.BigramTagger"
+ json_tag = 'nltk.tag.sequential.BigramTagger'
def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
- super().__init__(2, train, model, backoff, cutoff, verbose)
+ NgramTagger.__init__(self, 2, train, model, backoff, cutoff, verbose)
+
+ def encode_json_obj(self):
+ return self._context_to_tag, self.backoff
+
+ @classmethod
+ def decode_json_obj(cls, obj):
+ _context_to_tag, backoff = obj
+ return cls(model=_context_to_tag, backoff=backoff)
@jsontags.register_tag
:type cutoff: int
"""
- json_tag = "nltk.tag.sequential.TrigramTagger"
+ json_tag = 'nltk.tag.sequential.TrigramTagger'
def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
- super().__init__(3, train, model, backoff, cutoff, verbose)
+ NgramTagger.__init__(self, 3, train, model, backoff, cutoff, verbose)
+
+ def encode_json_obj(self):
+ return self._context_to_tag, self.backoff
+
+ @classmethod
+ def decode_json_obj(cls, obj):
+ _context_to_tag, backoff = obj
+ return cls(model=_context_to_tag, backoff=backoff)
@jsontags.register_tag
tag of None by this tagger.
"""
- json_tag = "nltk.tag.sequential.AffixTagger"
+ json_tag = 'nltk.tag.sequential.AffixTagger'
def __init__(
self,
self._check_params(train, model)
- super().__init__(model, backoff)
+ ContextTagger.__init__(self, model, backoff)
self._affix_length = affix_length
self._min_word_length = min_stem_length + abs(affix_length)
return token[self._affix_length :]
+@python_2_unicode_compatible
@jsontags.register_tag
class RegexpTagger(SequentialBackoffTagger):
"""
assigned the tag None.
"""
- json_tag = "nltk.tag.sequential.RegexpTagger"
+ json_tag = 'nltk.tag.sequential.RegexpTagger'
def __init__(self, regexps, backoff=None):
"""
"""
- super().__init__(backoff)
- try:
- self._regexps = [(re.compile(regexp), tag,) for regexp, tag in regexps]
- except Exception as e:
- raise Exception(
- 'Invalid RegexpTagger regexp:', str(e), 'regexp:', regexp, 'tag:', tag)
+ SequentialBackoffTagger.__init__(self, backoff)
+ self._regexs = [(re.compile(regexp), tag) for regexp, tag in regexps]
def encode_json_obj(self):
- return [(regexp.pattern, tag) for regexp, tag in self._regexps], self.backoff
+ return [(regexp.pattern, tag) for regexp, tag in self._regexs], self.backoff
@classmethod
def decode_json_obj(cls, obj):
regexps, backoff = obj
- return cls(regexps, backoff)
+ self = cls(())
+ self._regexs = [(re.compile(regexp), tag) for regexp, tag in regexps]
+ SequentialBackoffTagger.__init__(self, backoff)
+ return self
def choose_tag(self, tokens, index, history):
- for regexp, tag in self._regexps:
+ for regexp, tag in self._regexs:
if re.match(regexp, tokens[index]):
return tag
return None
def __repr__(self):
- return "<Regexp Tagger: size={}>".format(len(self._regexps))
+ return '<Regexp Tagger: size=%d>' % len(self._regexs)
+@python_2_unicode_compatible
class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
"""
A sequential tagger that uses a classifier to choose the tag for
):
self._check_params(train, classifier)
- super().__init__(backoff)
+ SequentialBackoffTagger.__init__(self, backoff)
if (train and classifier) or (not train and not classifier):
raise ValueError(
- "Must specify either training data or " "trained classifier."
+ 'Must specify either training data or ' 'trained classifier.'
)
if feature_detector is not None:
classifier_corpus = []
if verbose:
- print("Constructing training corpus for classifier.")
+ print('Constructing training corpus for classifier.')
for sentence in tagged_corpus:
history = []
history.append(tags[index])
if verbose:
- print("Training classifier ({} instances)".format(len(classifier_corpus)))
+ print('Training classifier (%d instances)' % len(classifier_corpus))
self._classifier = classifier_builder(classifier_corpus)
def __repr__(self):
- return "<ClassifierBasedTagger: {}>".format(self._classifier)
+ return '<ClassifierBasedTagger: %r>' % self._classifier
def feature_detector(self, tokens, index, history):
"""
prevtag = history[index - 1]
prevprevtag = history[index - 2]
- if re.match("[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word):
- shape = "number"
- elif re.match("\W+$", word):
- shape = "punct"
- elif re.match("[A-Z][a-z]+$", word):
- shape = "upcase"
- elif re.match("[a-z]+$", word):
- shape = "downcase"
- elif re.match("\w+$", word):
- shape = "mixedcase"
+ if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word):
+ shape = 'number'
+ elif re.match('\W+$', word):
+ shape = 'punct'
+ elif re.match('[A-Z][a-z]+$', word):
+ shape = 'upcase'
+ elif re.match('[a-z]+$', word):
+ shape = 'downcase'
+ elif re.match('\w+$', word):
+ shape = 'mixedcase'
else:
- shape = "other"
+ shape = 'other'
features = {
- "prevtag": prevtag,
- "prevprevtag": prevprevtag,
- "word": word,
- "word.lower": word.lower(),
- "suffix3": word.lower()[-3:],
- "suffix2": word.lower()[-2:],
- "suffix1": word.lower()[-1:],
- "prevprevword": prevprevword,
- "prevword": prevword,
- "prevtag+word": "{}+{}".format(prevtag, word.lower()),
- "prevprevtag+word": "{}+{}".format(prevprevtag, word.lower()),
- "prevword+word": "{}+{}".format(prevword, word.lower()),
- "shape": shape,
+ 'prevtag': prevtag,
+ 'prevprevtag': prevprevtag,
+ 'word': word,
+ 'word.lower': word.lower(),
+ 'suffix3': word.lower()[-3:],
+ 'suffix2': word.lower()[-2:],
+ 'suffix1': word.lower()[-1:],
+ 'prevprevword': prevprevword,
+ 'prevword': prevword,
+ 'prevtag+word': '%s+%s' % (prevtag, word.lower()),
+ 'prevprevtag+word': '%s+%s' % (prevprevtag, word.lower()),
+ 'prevword+word': '%s+%s' % (prevword, word.lower()),
+ 'shape': shape,
}
return features
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Nitin Madnani <nmadnani@ets.org>
# Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
# URL: <http://nltk.org/>
from subprocess import PIPE
import warnings
+from six import text_type
+
from nltk.internals import find_file, find_jar, config_java, java, _java_options
from nltk.tag.api import TaggerI
-_stanford_url = "https://nlp.stanford.edu/software"
+_stanford_url = 'https://nlp.stanford.edu/software'
class StanfordTagger(TaggerI):
- ``_JAR`` file: Class constant that represents the jar file name.
"""
- _SEPARATOR = ""
- _JAR = ""
+ _SEPARATOR = ''
+ _JAR = ''
def __init__(
self,
model_filename,
path_to_jar=None,
- encoding="utf8",
+ encoding='utf8',
verbose=False,
- java_options="-mx1000m",
+ java_options='-mx1000m',
):
# Raise deprecation warning.
warnings.warn(
if not self._JAR:
warnings.warn(
- "The StanfordTagger class is not meant to be "
- "instantiated directly. Did you mean "
- "StanfordPOSTagger or StanfordNERTagger?"
+ 'The StanfordTagger class is not meant to be '
+ 'instantiated directly. Did you mean '
+ 'StanfordPOSTagger or StanfordNERTagger?'
)
self._stanford_jar = find_jar(
self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose
)
self._stanford_model = find_file(
- model_filename, env_vars=("STANFORD_MODELS",), verbose=verbose
+ model_filename, env_vars=('STANFORD_MODELS',), verbose=verbose
)
self._encoding = encoding
def tag_sents(self, sentences):
encoding = self._encoding
- default_options = " ".join(_java_options)
+ default_options = ' '.join(_java_options)
config_java(options=self.java_options, verbose=False)
# Create a temporary input file
_input_fh, self._input_file_path = tempfile.mkstemp(text=True)
cmd = list(self._cmd)
- cmd.extend(["-encoding", encoding])
+ cmd.extend(['-encoding', encoding])
# Write the actual sentences to the temporary input file
- _input_fh = os.fdopen(_input_fh, "wb")
- _input = "\n".join((" ".join(x) for x in sentences))
- if isinstance(_input, str) and encoding:
+ _input_fh = os.fdopen(_input_fh, 'wb')
+ _input = '\n'.join((' '.join(x) for x in sentences))
+ if isinstance(_input, text_type) and encoding:
_input = _input.encode(encoding)
_input_fh.write(_input)
_input_fh.close()
sentence = []
for tagged_word in tagged_sentence.strip().split():
word_tags = tagged_word.strip().split(self._SEPARATOR)
- sentence.append(("".join(word_tags[:-1]), word_tags[-1]))
+ sentence.append((''.join(word_tags[:-1]), word_tags[-1]))
tagged_sentences.append(sentence)
return tagged_sentences
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
"""
- _SEPARATOR = "_"
- _JAR = "stanford-postagger.jar"
+ _SEPARATOR = '_'
+ _JAR = 'stanford-postagger.jar'
def __init__(self, *args, **kwargs):
super(StanfordPOSTagger, self).__init__(*args, **kwargs)
@property
def _cmd(self):
return [
- "edu.stanford.nlp.tagger.maxent.MaxentTagger",
- "-model",
+ 'edu.stanford.nlp.tagger.maxent.MaxentTagger',
+ '-model',
self._stanford_model,
- "-textFile",
+ '-textFile',
self._input_file_path,
- "-tokenize",
- "false",
- "-outputFormatOptions",
- "keepEmptySentences",
+ '-tokenize',
+ 'false',
+ '-outputFormatOptions',
+ 'keepEmptySentences',
]
('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'LOCATION')]
"""
- _SEPARATOR = "/"
- _JAR = "stanford-ner.jar"
- _FORMAT = "slashTags"
+ _SEPARATOR = '/'
+ _JAR = 'stanford-ner.jar'
+ _FORMAT = 'slashTags'
def __init__(self, *args, **kwargs):
super(StanfordNERTagger, self).__init__(*args, **kwargs)
def _cmd(self):
# Add -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions tokenizeNLs=false so that the Stanford tokenizer is not used
return [
- "edu.stanford.nlp.ie.crf.CRFClassifier",
- "-loadClassifier",
+ 'edu.stanford.nlp.ie.crf.CRFClassifier',
+ '-loadClassifier',
self._stanford_model,
- "-textFile",
+ '-textFile',
self._input_file_path,
- "-outputFormat",
+ '-outputFormat',
self._FORMAT,
- "-tokenizerFactory",
- "edu.stanford.nlp.process.WhitespaceTokenizer",
- "-tokenizerOptions",
- '"tokenizeNLs=false"',
+ '-tokenizerFactory',
+ 'edu.stanford.nlp.process.WhitespaceTokenizer',
+ '-tokenizerOptions',
+ '\"tokenizeNLs=false\"',
]
def parse_output(self, text, sentences):
- if self._FORMAT == "slashTags":
+ if self._FORMAT == 'slashTags':
# Join everything together into one big list
tagged_sentences = []
for tagged_sentence in text.strip().split("\n"):
for tagged_word in tagged_sentence.strip().split():
word_tags = tagged_word.strip().split(self._SEPARATOR)
- tagged_sentences.append(("".join(word_tags[:-1]), word_tags[-1]))
+ tagged_sentences.append((''.join(word_tags[:-1]), word_tags[-1]))
# Separate it according to the input
result = []
from nose import SkipTest
try:
- StanfordPOSTagger("english-bidirectional-distsim.tagger")
+ StanfordPOSTagger('english-bidirectional-distsim.tagger')
except LookupError:
raise SkipTest(
- "Doctests from nltk.tag.stanford are skipped because one \
- of the stanford jars cannot be found."
+ 'Doctests from nltk.tag.stanford are skipped because one \
+ of the stanford jars cannot be found.'
)
# Natural Language Toolkit: TnT Tagger
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Sam Huston <sjh900@gmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-"""
+'''
Implementation of 'TnT - A Statistical Part of Speech Tagger'
by Thorsten Brants
http://acl.ldc.upenn.edu/A/A00/A00-1031.pdf
-"""
-
+'''
+from __future__ import print_function, division
from math import log
from operator import itemgetter
class TnT(TaggerI):
- """
+ '''
TnT - Statistical POS tagger
IMPORTANT NOTES:
It is possible to differentiate the tags which are assigned to
capitalized words. However this does not result in a significant
gain in the accuracy of the results.
- """
+ '''
def __init__(self, unk=None, Trained=False, N=1000, C=False):
- """
+ '''
Construct a TnT statistical tagger. Tagger must be trained
before being used to tag input.
information for tagging.
NOTE: using capitalization may not increase the accuracy
of the tagger
- """
+ '''
self._uni = FreqDist()
self._bi = ConditionalFreqDist()
self.known = 0
def train(self, data):
- """
+ '''
Uses a set of tagged data to train the tagger.
If an unknown word tagger is specified,
it is trained on the same data.
:param data: List of lists of (word, tag) tuples
:type data: tuple(str)
- """
+ '''
# Ensure that local C flag is initialized before use
C = False
self._unk.train(data)
for sent in data:
- history = [("BOS", False), ("BOS", False)]
+ history = [('BOS', False), ('BOS', False)]
for w, t in sent:
# if capitalization is requested,
# set local flag C to false for the next word
C = False
- self._eos[t]["EOS"] += 1
+ self._eos[t]['EOS'] += 1
# compute lambda values from the trained frequency distributions
self._compute_lambda()
+ # (debugging -- ignore or delete me)
+ # print "lambdas"
+ # print i, self._l1, i, self._l2, i, self._l3
+
def _compute_lambda(self):
- """
+ '''
creates lambda values based upon training data
NOTE: no need to explicitly reference C,
ISSUES -- Resolutions:
if 2 values are equal, increment both lambda values
by (f(t1,t2,t3) / 2)
- """
+ '''
# temporary lambda variables
tl1 = 0.0
# otherwise there might be a problem
# eg: all values = 0
else:
+ # print "Problem", c1, c2 ,c3
pass
# Lambda normalisation:
self._l3 = tl3 / (tl1 + tl2 + tl3)
def _safe_div(self, v1, v2):
- """
+ '''
Safe floating point division function, does not allow division by 0
returns -1 if the denominator is 0
- """
+ '''
if v2 == 0:
return -1
else:
return v1 / v2
def tagdata(self, data):
- """
+ '''
Tags each sentence in a list of sentences
:param data:list of list of words
Invokes tag(sent) function for each sentence
compiles the results into a list of tagged sentences
each tagged sentence is a list of (word, tag) tuples
- """
+ '''
res = []
for sent in data:
res1 = self.tag(sent)
return res
def tag(self, data):
- """
+ '''
Tags a single sentence
:param data: list of words
with the correct words in the input sequence
returns a list of (word, tag) tuples
- """
+ '''
- current_state = [(["BOS", "BOS"], 0.0)]
+ current_state = [(['BOS', 'BOS'], 0.0)]
sent = list(data)
return res
def _tagword(self, sent, current_states):
- """
+ '''
:param sent : List of words remaining in the sentence
:type sent : [word,]
:param current_states : List of possible tag combinations for
Uses formula specified above to calculate the probability
of a particular tag
- """
+ '''
# if this word marks the end of the sentence,
# return the most probable tag
# if no unknown word tagger has been specified
# then use the tag 'Unk'
if self._unk is None:
- tag = ("Unk", C)
+ tag = ('Unk', C)
# otherwise apply the unknown word tagger
else:
def basic_sent_chop(data, raw=True):
- """
+ '''
Basic method for tokenizing input into sentences
for this tagger:
This is a simple method which enhances the performance of the TnT
tagger. Better sentence tokenization will further enhance the results.
- """
+ '''
new_data = []
curr_sent = []
- sent_mark = [",", ".", "?", "!"]
+ sent_mark = [',', '.', '?', '!']
if raw:
for word in data:
sents = list(brown.tagged_sents())
test = list(brown.sents())
+ # create and train the tagger
tagger = TnT()
tagger.train(sents[200:1000])
+ # tag some data
tagged_data = tagger.tagdata(test[100:120])
+ # print results
for j in range(len(tagged_data)):
s = tagged_data[j]
t = sents[j + 100]
for i in range(len(s)):
- print(s[i], "--", t[i])
+ print(s[i], '--', t[i])
print()
t.unknown = 0
t.known = 0
- print("Capitalization off:")
- print("Accuracy:", tacc)
- print("Percentage known:", tp_kn)
- print("Percentage unknown:", tp_un)
- print("Accuracy over known words:", (tacc / tp_kn))
+ print('Capitalization off:')
+ print('Accuracy:', tacc)
+ print('Percentage known:', tp_kn)
+ print('Percentage unknown:', tp_un)
+ print('Accuracy over known words:', (tacc / tp_kn))
sacc = s.evaluate(d[i * 100 : ((i + 1) * 100)])
sp_un = s.unknown / (s.known + s.unknown)
s.unknown = 0
s.known = 0
- print("Capitalization on:")
- print("Accuracy:", sacc)
- print("Percentage known:", sp_kn)
- print("Percentage unknown:", sp_un)
- print("Accuracy over known words:", (sacc / sp_kn))
+ print('Capitalization on:')
+ print('Accuracy:', sacc)
+ print('Percentage known:', sp_kn)
+ print('Percentage unknown:', sp_un)
+ print('Accuracy over known words:', (sacc / sp_kn))
def demo3():
tallacc += tacc
sallacc += sacc
- # print(i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc)
+ # print i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc
print("brown: acc over words known:", 10 * tknacc)
print(" : overall accuracy:", 10 * tallacc)
# Natural Language Toolkit: Tagger Utilities
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-def str2tuple(s, sep="/"):
+def str2tuple(s, sep='/'):
"""
Given the string representation of a tagged token, return the
corresponding tuple representation. The rightmost occurrence of
return (s, None)
-def tuple2str(tagged_token, sep="/"):
+def tuple2str(tagged_token, sep='/'):
"""
Given the tuple representation of a tagged token, return the
corresponding string representation. This representation is
if tag is None:
return word
else:
- assert sep not in tag, "tag may not contain sep!"
- return "%s%s%s" % (word, sep, tag)
+ assert sep not in tag, 'tag may not contain sep!'
+ return '%s%s%s' % (word, sep, tag)
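# A quick illustration (added note, not from the original module): with the
# default '/' separator, str2tuple('fly/NN') gives ('fly', 'NN'), and
# tuple2str(('fly', 'NN')) gives back 'fly/NN'.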
def untag(tagged_sentence):
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, absolute_import, division
import os
import pickle
baseline_tagger = UnigramTagger(
baseline_data, backoff=baseline_backoff_tagger
)
- with open(cache_baseline_tagger, "w") as print_rules:
+ with open(cache_baseline_tagger, 'w') as print_rules:
pickle.dump(baseline_tagger, print_rules)
print(
"Trained baseline tagger, pickled it to {0}".format(
# writing error analysis to file
if error_output is not None:
- with open(error_output, "w") as f:
- f.write("Errors for Brill Tagger %r\n\n" % serialize_output)
+ with open(error_output, 'w') as f:
+ f.write('Errors for Brill Tagger %r\n\n' % serialize_output)
f.write(
- u"\n".join(error_list(gold_data, taggedtest)).encode("utf-8") + "\n"
+ u'\n'.join(error_list(gold_data, taggedtest)).encode('utf-8') + '\n'
)
print("Wrote tagger errors including context to {0}".format(error_output))
# serializing the tagger to a pickle file and reloading (just to see it works)
if serialize_output is not None:
taggedtest = brill_tagger.tag_sents(testing_data)
- with open(serialize_output, "w") as print_rules:
+ with open(serialize_output, 'w') as print_rules:
pickle.dump(brill_tagger, print_rules)
print("Wrote pickled tagger to {0}".format(serialize_output))
with open(serialize_output, "r") as print_rules:
def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None):
- testcurve = [teststats["initialerrors"]]
- for rulescore in teststats["rulescores"]:
+ testcurve = [teststats['initialerrors']]
+ for rulescore in teststats['rulescores']:
testcurve.append(testcurve[-1] - rulescore)
- testcurve = [1 - x / teststats["tokencount"] for x in testcurve[:take]]
+ testcurve = [1 - x / teststats['tokencount'] for x in testcurve[:take]]
- traincurve = [trainstats["initialerrors"]]
- for rulescore in trainstats["rulescores"]:
+ traincurve = [trainstats['initialerrors']]
+ for rulescore in trainstats['rulescores']:
traincurve.append(traincurve[-1] - rulescore)
- traincurve = [1 - x / trainstats["tokencount"] for x in traincurve[:take]]
+ traincurve = [1 - x / trainstats['tokencount'] for x in traincurve[:take]]
import matplotlib.pyplot as plt
plt.savefig(learning_curve_output)
-NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(.[0-9]+)?$", "CD"), (r".*", "NN")])
+NN_CD_TAGGER = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')])
REGEXP_TAGGER = RegexpTagger(
[
- (r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers
- (r"(The|the|A|a|An|an)$", "AT"), # articles
- (r".*able$", "JJ"), # adjectives
- (r".*ness$", "NN"), # nouns formed from adjectives
- (r".*ly$", "RB"), # adverbs
- (r".*s$", "NNS"), # plural nouns
- (r".*ing$", "VBG"), # gerunds
- (r".*ed$", "VBD"), # past tense verbs
- (r".*", "NN"), # nouns (default)
+ (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
+ (r'(The|the|A|a|An|an)$', 'AT'), # articles
+ (r'.*able$', 'JJ'), # adjectives
+ (r'.*ness$', 'NN'), # nouns formed from adjectives
+ (r'.*ly$', 'RB'), # adverbs
+ (r'.*s$', 'NNS'), # plural nouns
+ (r'.*ing$', 'VBG'), # gerunds
+ (r'.*ed$', 'VBD'), # past tense verbs
+ (r'.*', 'NN'), # nouns (default)
]
)
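# Illustrative check (added comment, not part of the original demo driver):
# the patterns above are tried in order, so for example
#   REGEXP_TAGGER.tag(['The', '7', 'dogs', 'walked', 'quickly'])
# would yield [('The', 'AT'), ('7', 'CD'), ('dogs', 'NNS'),
#              ('walked', 'VBD'), ('quickly', 'RB')].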
return (len(seqs), sum(len(x) for x in seqs))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo_learning_curve()
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
+
+
# returns a list of errors in string format
:param test_sents: The tagged corpus
:type test_sents: list(tuple)
"""
- hdr = ("%25s | %s | %s\n" + "-" * 26 + "+" + "-" * 24 + "+" + "-" * 26) % (
- "left context",
- "word/test->gold".center(22),
- "right context",
+ hdr = ('%25s | %s | %s\n' + '-' * 26 + '+' + '-' * 24 + '+' + '-' * 26) % (
+ 'left context',
+ 'word/test->gold'.center(22),
+ 'right context',
)
errors = [hdr]
for (train_sent, test_sent) in zip(train_sents, test_sents):
for wordnum, (word, train_pos) in enumerate(train_sent):
test_pos = test_sent[wordnum][1]
if train_pos != test_pos:
- left = " ".join("%s/%s" % w for w in train_sent[:wordnum])
- right = " ".join("%s/%s" % w for w in train_sent[wordnum + 1 :])
- mid = "%s/%s->%s" % (word, test_pos, train_pos)
+ left = ' '.join('%s/%s' % w for w in train_sent[:wordnum])
+ right = ' '.join('%s/%s' % w for w in train_sent[wordnum + 1 :])
+ mid = '%s/%s->%s' % (word, test_pos, train_pos)
errors.append(
- "%25s | %s | %s" % (left[-25:], mid.center(22), right[:25])
+ '%25s | %s | %s' % (left[-25:], mid.center(22), right[:25])
)
return errors
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import division, print_function, unicode_literals
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
-class Feature(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class Feature(object):
"""
An abstract base class for Features. A Feature is a combination of
a specific property-computing method and a list of relative positions
"""
- json_tag = "nltk.tbl.Feature"
+ json_tag = 'nltk.tbl.Feature'
PROPERTY_NAME = None
def __init__(self, positions, end=None):
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+from nltk.compat import python_2_unicode_compatible, unicode_repr
from nltk import jsontags
######################################################################
# Tag Rules
######################################################################
-class TagRule(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class TagRule(object):
"""
An interface for tag transformations on a tagged corpus, as
performed by tbl taggers. Each transformation finds all tokens
raise TypeError("Rules must implement __hash__()")
+@python_2_unicode_compatible
@jsontags.register_tag
class Rule(TagRule):
"""
"""
- json_tag = "nltk.tbl.Rule"
+ json_tag = 'nltk.tbl.Rule'
def __init__(self, templateid, original_tag, replacement_tag, conditions):
"""
def encode_json_obj(self):
return {
- "templateid": self.templateid,
- "original": self.original_tag,
- "replacement": self.replacement_tag,
- "conditions": self._conditions,
+ 'templateid': self.templateid,
+ 'original': self.original_tag,
+ 'replacement': self.replacement_tag,
+ 'conditions': self._conditions,
}
@classmethod
def decode_json_obj(cls, obj):
return cls(
- obj["templateid"],
- obj["original"],
- obj["replacement"],
- tuple(tuple(feat) for feat in obj["conditions"])
+ obj['templateid'], obj['original'], obj['replacement'], obj['conditions']
)
def applies(self, tokens, index):
self.__repr = "{0}('{1}', {2}, {3}, [{4}])".format(
self.__class__.__name__,
self.templateid,
- repr(self.original_tag),
- repr(self.replacement_tag),
+ unicode_repr(self.original_tag),
+ unicode_repr(self.replacement_tag),
# list(self._conditions) would be simpler but will not generate
# the same Rule.__repr__ in python 2 and 3 and thus break some tests
- ", ".join(
- "({0},{1})".format(f, repr(v))
+ ', '.join(
+ "({0},{1})".format(f, unicode_repr(v))
for (f, v) in self._conditions
),
)
Return a compact, predicate-logic styled string representation
of the given condition.
"""
- return "{0}:{1}@[{2}]".format(
+ return '{0}:{1}@[{2}]'.format(
feature.PROPERTY_NAME,
value,
",".join(str(w) for w in feature.positions),
)
- conditions = " & ".join(
+ conditions = ' & '.join(
[_condition_to_logic(f, v) for (f, v) in self._conditions]
)
- s = "{0}->{1} if {2}".format(
+ s = '{0}->{1} if {2}'.format(
self.original_tag, self.replacement_tag, conditions
)
if len(positions) == 1:
p = positions[0]
if p == 0:
- return "this word"
+ return 'this word'
if p == -1:
- return "the preceding word"
+ return 'the preceding word'
elif p == 1:
- return "the following word"
+ return 'the following word'
elif p < 0:
- return "word i-%d" % -p
+ return 'word i-%d' % -p
elif p > 0:
- return "word i+%d" % p
+ return 'word i+%d' % p
else:
# for complete compatibility with the wordy format of nltk2
mx = max(positions)
mn = min(positions)
if mx - mn == len(positions) - 1:
- return "words i%+d...i%+d" % (mn, mx)
+ return 'words i%+d...i%+d' % (mn, mx)
else:
- return "words {%s}" % (",".join("i%+d" % d for d in positions),)
+ return 'words {%s}' % (",".join("i%+d" % d for d in positions),)
- replacement = "%s -> %s" % (self.original_tag, self.replacement_tag)
- conditions = (" if " if self._conditions else "") + ", and ".join(
+ replacement = '%s -> %s' % (self.original_tag, self.replacement_tag)
+ conditions = (' if ' if self._conditions else "") + ', and '.join(
condition_to_str(f, v) for (f, v) in self._conditions
)
return replacement + conditions
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
import itertools as it
from nltk.tbl.feature import Feature
from nltk.tbl.rule import Rule
-class BrillTemplateI(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class BrillTemplateI(object):
"""
An interface for generating lists of transformational rules that
apply at given sentence positions. ``BrillTemplateI`` is used by
# Natural Language Toolkit: Unit Tests
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
def additional_tests():
- # print("here-000000000000000")
- # print("-----", glob(os.path.join(os.path.dirname(__file__), '*.doctest')))
+ # print "here-000000000000000"
+ # print "-----", glob(os.path.join(os.path.dirname(__file__), '*.doctest'))
dir = os.path.dirname(__file__)
- paths = glob(os.path.join(dir, "*.doctest"))
+ paths = glob(os.path.join(dir, '*.doctest'))
files = [os.path.basename(path) for path in paths]
return unittest.TestSuite([doctest.DocFileSuite(file) for file in files])
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
>>> import os.path
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
==============================
Lexicons for the tests:
- >>> lex = lexicon.parseLexicon('''
+ >>> lex = lexicon.parseLexicon(u'''
... :- S, N, NP, PP
...
... AdjI :: N\\N
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
==============================================
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
=======
# -*- coding: utf-8 -*-
+from __future__ import absolute_import
def setup_module(module):
import nltk.data
try:
- nltk.data.find("corpora/childes/data-xml/Eng-USA-MOR/")
+ nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
except LookupError as e:
print(e)
raise SkipTest(
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
==========
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
=============
... (dict(a=0,b=1,c=0), 'x'),
... (dict(a=0,b=0,c=0), 'x'),
... (dict(a=0,b=1,c=1), 'y'),
- ... (dict(a=None,b=1,c=0), 'x'),
... ]
>>> test = [
... (dict(a=1,b=0,c=1)), # unseen
['y', 'x', 'y', 'x']
>>> for pdist in classifier.prob_classify_many(test):
... print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y')))
- 0.2500 0.7500
- 0.5833 0.4167
- 0.3571 0.6429
- 0.7000 0.3000
+ 0.3203 0.6797
+ 0.5857 0.4143
+ 0.3792 0.6208
+ 0.6470 0.3530
>>> classifier.show_most_informative_features()
Most Informative Features
- c = 0 x : y = 2.3 : 1.0
- c = 1 y : x = 1.8 : 1.0
- a = 1 y : x = 1.7 : 1.0
- a = 0 x : y = 1.0 : 1.0
- b = 0 x : y = 1.0 : 1.0
- b = 1 x : y = 1.0 : 1.0
+ c = 0 x : y = 2.0 : 1.0
+ c = 1 y : x = 1.5 : 1.0
+ a = 1 y : x = 1.4 : 1.0
+ b = 0 x : y = 1.2 : 1.0
+ a = 0 x : y = 1.2 : 1.0
+ b = 1 y : x = 1.1 : 1.0
-Test the Decision Tree classifier (without None):
+Test the Decision Tree classifier:
>>> classifier = nltk.classify.DecisionTreeClassifier.train(
- ... train[:-1], entropy_cutoff=0,
- ... support_cutoff=0)
+ ... train, entropy_cutoff=0,
+ ... support_cutoff=0)
>>> sorted(classifier.labels())
['x', 'y']
>>> print(classifier)
Traceback (most recent call last):
. . .
NotImplementedError
-
-
-Test the Decision Tree classifier (with None):
-
- >>> classifier = nltk.classify.DecisionTreeClassifier.train(
- ... train, entropy_cutoff=0,
- ... support_cutoff=0)
- >>> sorted(classifier.labels())
- ['x', 'y']
- >>> print(classifier)
- c=0? .................................................. x
- a=0? ................................................ x
- a=1? ................................................ y
- a=None? ............................................. x
- c=1? .................................................. y
- <BLANKLINE>
-
Test SklearnClassifier, which requires the scikit-learn package.
# -*- coding: utf-8 -*-
+from __future__ import absolute_import
# most of classify.doctest requires numpy
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
===========
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
==============
>>> from nltk.collocations import *
>>> bigram_measures = nltk.collocations.BigramAssocMeasures()
>>> trigram_measures = nltk.collocations.TrigramAssocMeasures()
- >>> fourgram_measures = nltk.collocations.QuadgramAssocMeasures()
>>> finder = BigramCollocationFinder.from_words(
... nltk.corpus.genesis.words('english-web.txt'))
>>> finder.nbest(bigram_measures.pmi, 10) # doctest: +NORMALIZE_WHITESPACE
- [('Allon', 'Bacuth'), ('Ashteroth', 'Karnaim'), ('Ben', 'Ammi'),
- ('En', 'Mishpat'), ('Jegar', 'Sahadutha'), ('Salt', 'Sea'),
- ('Whoever', 'sheds'), ('appoint', 'overseers'), ('aromatic', 'resin'),
- ('cutting', 'instrument')]
+ [(u'Allon', u'Bacuth'), (u'Ashteroth', u'Karnaim'), (u'Ben', u'Ammi'),
+ (u'En', u'Mishpat'), (u'Jegar', u'Sahadutha'), (u'Salt', u'Sea'),
+ (u'Whoever', u'sheds'), (u'appoint', u'overseers'), (u'aromatic', u'resin'),
+ (u'cutting', u'instrument')]
While these words are highly collocated, the expressions are also very
infrequent. Therefore it is useful to apply filters, such as ignoring all
>>> finder.apply_freq_filter(3)
>>> finder.nbest(bigram_measures.pmi, 10) # doctest: +NORMALIZE_WHITESPACE
- [('Beer', 'Lahai'), ('Lahai', 'Roi'), ('gray', 'hairs'),
- ('Most', 'High'), ('ewe', 'lambs'), ('many', 'colors'),
- ('burnt', 'offering'), ('Paddan', 'Aram'), ('east', 'wind'),
- ('living', 'creature')]
+ [(u'Beer', u'Lahai'), (u'Lahai', u'Roi'), (u'gray', u'hairs'),
+ (u'Most', u'High'), (u'ewe', u'lambs'), (u'many', u'colors'),
+ (u'burnt', u'offering'), (u'Paddan', u'Aram'), (u'east', u'wind'),
+ (u'living', u'creature')]
We may similarly find collocations among tagged words:
>>> ignored_words = nltk.corpus.stopwords.words('english')
>>> finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
>>> finder.nbest(bigram_measures.likelihood_ratio, 10) # doctest: +NORMALIZE_WHITESPACE
- [('chief', 'chief'), ('became', 'father'), ('years', 'became'),
- ('hundred', 'years'), ('lived', 'became'), ('king', 'king'),
- ('lived', 'years'), ('became', 'became'), ('chief', 'chiefs'),
- ('hundred', 'became')]
+ [(u'chief', u'chief'), (u'became', u'father'), (u'years', u'became'),
+ (u'hundred', u'years'), (u'lived', u'became'), (u'king', u'king'),
+ (u'lived', u'years'), (u'became', u'became'), (u'chief', u'chiefs'),
+ (u'hundred', u'became')]
Finders
~~~~~~~
((',', 'do', 'not'), 1), (('I', 'am', '!'), 1), (('Sam', 'I', '!'), 1),
(('Sam', 'I', 'am'), 1)]
-A similar interface is provided for fourgrams:
-
- >>> finder_4grams = QuadgramCollocationFinder.from_words(tokens)
- >>> scored_4grams = finder_4grams.score_ngrams(fourgram_measures.raw_freq)
- >>> set(fourgram for fourgram, score in scored_4grams) == set(nltk.ngrams(tokens, n=4))
- True
Filtering candidates
~~~~~~~~~~~~~~~~~~~~
--- /dev/null
+
+=========================================
+NLTK Python 2.x - 3.x Compatibility Layer
+=========================================
+
+NLTK comes with a Python 2.x/3.x compatibility layer, nltk.compat
+(which is loosely based on `six <http://packages.python.org/six/>`_)::
+
+ >>> from nltk import compat
+ >>> compat.PY3
+ False
+ >>> # and so on
+
+@python_2_unicode_compatible
+----------------------------
+
+Under Python 2.x ``__str__`` and ``__repr__`` methods must
+return bytestrings.
+
+The ``@python_2_unicode_compatible`` decorator allows writing these methods
+in a way compatible with Python 3.x:
+
+1) wrap a class with this decorator,
+2) define ``__str__`` and ``__repr__`` methods returning unicode text
+ (that's what they must return under Python 3.x),
+
+and they would be fixed under Python 2.x to return byte strings::
+
+ >>> from nltk.compat import python_2_unicode_compatible
+
+ >>> @python_2_unicode_compatible
+ ... class Foo(object):
+ ... def __str__(self):
+ ... return u'__str__ is called'
+ ... def __repr__(self):
+ ... return u'__repr__ is called'
+
+ >>> foo = Foo()
+ >>> foo.__str__().__class__
+ <type 'str'>
+ >>> foo.__repr__().__class__
+ <type 'str'>
+ >>> print(foo)
+ __str__ is called
+ >>> foo
+ __repr__ is called
+
+Original versions of ``__str__`` and ``__repr__`` are available as
+``__unicode__`` and ``unicode_repr``::
+
+ >>> foo.__unicode__().__class__
+ <type 'unicode'>
+ >>> foo.unicode_repr().__class__
+ <type 'unicode'>
+ >>> unicode(foo)
+ u'__str__ is called'
+ >>> foo.unicode_repr()
+ u'__repr__ is called'
+
+There is no need to wrap a subclass with ``@python_2_unicode_compatible``
+if it doesn't override ``__str__`` and ``__repr__``::
+
+ >>> class Bar(Foo):
+ ... pass
+ >>> bar = Bar()
+ >>> bar.__str__().__class__
+ <type 'str'>
+
+However, if a subclass overrides ``__str__`` or ``__repr__``,
+wrap it again::
+
+ >>> class BadBaz(Foo):
+ ... def __str__(self):
+ ... return u'Baz.__str__'
+ >>> baz = BadBaz()
+ >>> baz.__str__().__class__ # this is incorrect!
+ <type 'unicode'>
+
+ >>> @python_2_unicode_compatible
+ ... class GoodBaz(Foo):
+ ... def __str__(self):
+ ... return u'Baz.__str__'
+ >>> baz = GoodBaz()
+ >>> baz.__str__().__class__
+ <type 'str'>
+ >>> baz.__unicode__().__class__
+ <type 'unicode'>
+
+Applying ``@python_2_unicode_compatible`` to a subclass
+shouldn't break methods that were not overridden::
+
+ >>> baz.__repr__().__class__
+ <type 'str'>
+ >>> baz.unicode_repr().__class__
+ <type 'unicode'>
+
+unicode_repr
+------------
+
+Under Python 3.x ``repr(unicode_string)`` doesn't have a leading "u" letter.
+
+The ``nltk.compat.unicode_repr`` function may be used instead of ``repr`` and
+``"%r" % obj`` to make the output more consistent under Python 2.x and 3.x::
+
+ >>> from nltk.compat import unicode_repr
+ >>> print(repr(u"test"))
+ u'test'
+ >>> print(unicode_repr(u"test"))
+ 'test'
+
+It may also be used to get the original unescaped repr (as unicode)
+of objects whose class was fixed by the ``@python_2_unicode_compatible``
+decorator::
+
+ >>> @python_2_unicode_compatible
+ ... class Foo(object):
+ ... def __repr__(self):
+ ... return u'<Foo: foo>'
+
+ >>> foo = Foo()
+ >>> repr(foo)
+ '<Foo: foo>'
+ >>> unicode_repr(foo)
+ u'<Foo: foo>'
+
+For other objects it returns the same value as ``repr``::
+
+ >>> unicode_repr(5)
+ '5'
+
+It may be a good idea to use ``unicode_repr`` instead of the ``%r``
+string formatting specifier inside ``__repr__`` or ``__str__``
+methods of classes fixed by ``@python_2_unicode_compatible``
+to make the output consistent between Python 2.x and 3.x.
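+
+As a small illustrative sketch (the ``Wrapper`` class below is hypothetical,
+not part of NLTK), combining both tools keeps nested reprs free of stray
+"u" prefixes under Python 2.x::
+
+    >>> @python_2_unicode_compatible
+    ... class Wrapper(object):
+    ...     def __init__(self, inner):
+    ...         self.inner = inner
+    ...     def __repr__(self):
+    ...         return u'<Wrapper: %s>' % unicode_repr(self.inner)
+
+    >>> repr(Wrapper(Foo()))
+    '<Wrapper: <Foo: foo>>'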
--- /dev/null
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+from nltk.compat import PY3
+
+
+def setup_module(module):
+ from nose import SkipTest
+
+ if PY3:
+ raise SkipTest("compat.doctest is for Python 2.x")
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
================
typically load all documents in the corpus.
>>> len(inaugural.words())
- 149797
+ 145735
If a corpus contains a README file, it can be accessed with a ``readme()`` method:
>>> nltk.corpus.abc.words()
['PM', 'denies', 'knowledge', 'of', 'AWB', ...]
>>> nltk.corpus.genesis.words()
- ['In', 'the', 'beginning', 'God', 'created', ...]
+ [u'In', u'the', u'beginning', u'God', u'created', ...]
>>> nltk.corpus.gutenberg.words(fileids='austen-emma.txt')
['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ...]
>>> nltk.corpus.inaugural.words()
(NP the/DT Exchequer/NNP)
...)
>>> print(conll2002.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [['Sao', 'Paulo', '(', 'Brasil', ')', ',', ...], ['-'], ...]
+ [[u'Sao', u'Paulo', u'(', u'Brasil', u')', u',', ...], [u'-'], ...]
>>> for tree in conll2002.chunked_sents()[:2]:
... print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
(S
>>> stopwords.fileids() # doctest: +ELLIPSIS
['arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', ...]
- >>> sorted(stopwords.words('portuguese')) # doctest: +ELLIPSIS
- ['a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', ...]
+ >>> stopwords.words('portuguese') # doctest: +ELLIPSIS
+ ['de', 'a', 'o', 'que', 'e', 'do', 'da', 'em', 'um', 'para', ...]
>>> names.fileids()
['female.txt', 'male.txt']
>>> names.words('male.txt') # doctest: +ELLIPSIS
>>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
>>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
- >>> mean = tot / n_reviews
+ >>> # We use float for backward compatibility with division in Python2.7
+ >>> mean = tot/float(n_reviews)
>>> print(n_reviews, tot, mean)
15 24 1.6
At a high level, corpora can be divided into three basic types:
-- A *token corpus* contains information about specific occurrences of
+- A *token corpus* contains information about specific occurrences of
language use (or linguistic tokens), such as dialogues or written
texts. Examples of token corpora are collections of written text
and collections of speech.
>>> nltk.corpus.treebank.words()
['Pierre', 'Vinken', ',', '61', 'years', 'old', ...]
>>> nltk.corpus.conll2002.words()
- ['Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', ...]
+ [u'Sao', u'Paulo', u'(', u'Brasil', u')', u',', u'23', ...]
>>> nltk.corpus.genesis.words()
- ['In', 'the', 'beginning', 'God', 'created', ...]
+ [u'In', u'the', u'beginning', u'God', u'created', ...]
On the other hand, the `tagged_words()` method is only supported by
corpora that include part-of-speech annotations:
>>> nltk.corpus.treebank.tagged_words()
[('Pierre', 'NNP'), ('Vinken', 'NNP'), ...]
>>> nltk.corpus.conll2002.tagged_words()
- [('Sao', 'NC'), ('Paulo', 'VMI'), ('(', 'Fpa'), ...]
+ [(u'Sao', u'NC'), (u'Paulo', u'VMI'), (u'(', u'Fpa'), ...]
>>> nltk.corpus.genesis.tagged_words()
Traceback (most recent call last):
...
return ``unicode`` objects (not ``str`` objects).
>>> reader.read() # read the entire file.
- 'This is a test file.\nIt is encoded in ascii.\n'
+ u'This is a test file.\nIt is encoded in ascii.\n'
>>> reader.seek(0) # rewind to the start.
>>> reader.read(5) # read at most 5 bytes.
- 'This '
+ u'This '
>>> reader.readline() # read to the end of the line.
- 'is a test file.\n'
+ u'is a test file.\n'
>>> reader.seek(0) # rewind to the start.
>>> for line in reader:
... print(repr(line)) # iterate over lines
- 'This is a test file.\n'
- 'It is encoded in ascii.\n'
+ u'This is a test file.\n'
+ u'It is encoded in ascii.\n'
>>> reader.seek(0) # rewind to the start.
>>> reader.readlines() # read a list of line strings
- ['This is a test file.\n', 'It is encoded in ascii.\n']
+ [u'This is a test file.\n', u'It is encoded in ascii.\n']
>>> reader.close()
Size argument to ``read()``
... """.decode('ascii').encode('utf-16'))
>>> reader = SeekableUnicodeStreamReader(stream, 'utf-16')
>>> reader.read(10)
- 'This '
+ u'This '
If a read block ends in the middle of the byte string encoding a
single character, then that byte string is stored in an internal
>>> reader.seek(0) # rewind to the start.
>>> reader.read(1) # we actually need to read 4 bytes
- 'T'
+ u'T'
>>> int(reader.tell())
4
>>> reader.seek(0) # rewind to the start.
>>> reader.readline() # stores extra text in a buffer
- 'This is a test file.\n'
+ u'This is a test file.\n'
>>> print(reader.linebuffer) # examine the buffer contents
- ['It is encoded i']
+ [u'It is encoded i']
>>> reader.read(0) # returns the contents of the buffer
- 'It is encoded i'
+ u'It is encoded i'
>>> print(reader.linebuffer) # examine the buffer contents
None
... """.decode('ascii').encode('utf-16'))
>>> reader = SeekableUnicodeStreamReader(stream, 'utf-16')
>>> reader.read(20)
- 'This is a '
+ u'This is a '
>>> pos = reader.tell(); print(pos)
22
>>> reader.read(20)
- 'test file.'
+ u'test file.'
>>> reader.seek(pos) # rewind to the position from tell.
>>> reader.read(20)
- 'test file.'
+ u'test file.'
The ``seek()`` and ``tell()`` methods work properly even when
``readline()`` is used.
... """.decode('ascii').encode('utf-16'))
>>> reader = SeekableUnicodeStreamReader(stream, 'utf-16')
>>> reader.readline()
- 'This is a test file.\n'
+ u'This is a test file.\n'
>>> pos = reader.tell(); print(pos)
44
>>> reader.readline()
- 'It is encoded in utf-16.\n'
+ u'It is encoded in utf-16.\n'
>>> reader.seek(pos) # rewind to the position from tell.
>>> reader.readline()
- 'It is encoded in utf-16.\n'
+ u'It is encoded in utf-16.\n'
Squashed Bugs
# -*- coding: utf-8 -*-
+from __future__ import absolute_import
from nltk.corpus import teardown_module
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
Crubadan Corpus Reader
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
=========================================
loaded. The ``nltk:`` protocol loads files from the NLTK data
distribution:
+ >>> from __future__ import print_function
>>> tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
>>> tokenizer.tokenize('Hello. This is a test. It works!')
['Hello.', 'This is a test.', 'It works!']
and writing work as intended and does not test how much improvement buffering
provides.
- >>> from io import StringIO
+ >>> from nltk.compat import StringIO
>>> test = nltk.data.BufferedGzipFile('testbuf.gz', 'wb', size=2**10)
>>> ans = []
>>> for i in range(10000):
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
===================
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
==================
# -*- coding: utf-8 -*-
+from __future__ import absolute_import
# FIXME: the entire discourse.doctest is skipped if Prover9/Mace4 is
try:
m = Mace()
- m._find_binary("mace4")
+ m._find_binary('mace4')
except LookupError:
raise SkipTest("Mace4/Prover9 is not available so discourse.doctest is skipped")
--- /dev/null
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+import re
+import sys
+import os
+import codecs
+import doctest
+from nose.util import tolist, anyp
+from nose.plugins.base import Plugin
+from nose.suite import ContextList
+from nose.plugins.doctests import Doctest, log, DocFileCase
+
+ALLOW_UNICODE = doctest.register_optionflag('ALLOW_UNICODE')
+
+
+class _UnicodeOutputChecker(doctest.OutputChecker):
+ _literal_re = re.compile(r"(\W|^)[uU]([rR]?[\'\"])", re.UNICODE)
+
+ def _remove_u_prefixes(self, txt):
+ return re.sub(self._literal_re, r'\1\2', txt)
+
+ def check_output(self, want, got, optionflags):
+ res = doctest.OutputChecker.check_output(self, want, got, optionflags)
+ if res:
+ return True
+ if not (optionflags & ALLOW_UNICODE):
+ return False
+
+ # ALLOW_UNICODE is active and want != got
+ cleaned_want = self._remove_u_prefixes(want)
+ cleaned_got = self._remove_u_prefixes(got)
+ res = doctest.OutputChecker.check_output(
+ self, cleaned_want, cleaned_got, optionflags
+ )
+ return res
+
+
+_checker = _UnicodeOutputChecker()
+
+
+class DoctestPluginHelper(object):
+ """
+ This mixin adds print_function future import to all test cases.
+
+ It also adds support for:
+ '#doctest +ALLOW_UNICODE' option that
+ makes DocTestCase think u'foo' == 'foo'.
+
+ '#doctest doctestencoding=utf-8' option that
+ changes the encoding of doctest files
+ """
+
+ OPTION_BY_NAME = ('doctestencoding',)
+
+ def loadTestsFromFileUnicode(self, filename):
+ if self.extension and anyp(filename.endswith, self.extension):
+ name = os.path.basename(filename)
+ dh = codecs.open(filename, 'r', self.options.get('doctestencoding'))
+ try:
+ doc = dh.read()
+ finally:
+ dh.close()
+
+ fixture_context = None
+ globs = {'__file__': filename}
+ if self.fixtures:
+ base, ext = os.path.splitext(name)
+ dirname = os.path.dirname(filename)
+ sys.path.append(dirname)
+ fixt_mod = base + self.fixtures
+ try:
+ fixture_context = __import__(fixt_mod, globals(), locals(), ["nop"])
+ except ImportError as e:
+ log.debug("Could not import %s: %s (%s)", fixt_mod, e, sys.path)
+ log.debug("Fixture module %s resolved to %s", fixt_mod, fixture_context)
+ if hasattr(fixture_context, 'globs'):
+ globs = fixture_context.globs(globs)
+ parser = doctest.DocTestParser()
+ test = parser.get_doctest(
+ doc, globs=globs, name=name, filename=filename, lineno=0
+ )
+ if test.examples:
+ case = DocFileCase(
+ test,
+ optionflags=self.optionflags,
+ setUp=getattr(fixture_context, 'setup_test', None),
+ tearDown=getattr(fixture_context, 'teardown_test', None),
+ result_var=self.doctest_result_var,
+ )
+ if fixture_context:
+ yield ContextList((case,), context=fixture_context)
+ else:
+ yield case
+ else:
+ yield False # no tests to load
+
+ def loadTestsFromFile(self, filename):
+
+ cases = self.loadTestsFromFileUnicode(filename)
+
+ for case in cases:
+ if isinstance(case, ContextList):
+ yield ContextList([self._patchTestCase(c) for c in case], case.context)
+ else:
+ yield self._patchTestCase(case)
+
+ def loadTestsFromModule(self, module):
+ """Load doctests from the module.
+ """
+ for suite in super(DoctestPluginHelper, self).loadTestsFromModule(module):
+ cases = [self._patchTestCase(case) for case in suite._get_tests()]
+ yield self.suiteClass(cases, context=module, can_split=False)
+
+ def _patchTestCase(self, case):
+ if case:
+ case._dt_test.globs['print_function'] = print_function
+ case._dt_checker = _checker
+ return case
+
+ def configure(self, options, config):
+ # it is overridden in order to fix doctest options discovery
+
+ Plugin.configure(self, options, config)
+ self.doctest_result_var = options.doctest_result_var
+ self.doctest_tests = options.doctest_tests
+ self.extension = tolist(options.doctestExtension)
+ self.fixtures = options.doctestFixtures
+ self.finder = doctest.DocTestFinder()
+
+ # super(DoctestPluginHelper, self).configure(options, config)
+ self.optionflags = 0
+ self.options = {}
+
+ if options.doctestOptions:
+ stroptions = ",".join(options.doctestOptions).split(',')
+ for stroption in stroptions:
+ try:
+ if stroption.startswith('+'):
+ self.optionflags |= doctest.OPTIONFLAGS_BY_NAME[stroption[1:]]
+ continue
+ elif stroption.startswith('-'):
+ self.optionflags &= ~doctest.OPTIONFLAGS_BY_NAME[stroption[1:]]
+ continue
+ try:
+ key, value = stroption.split('=')
+ except ValueError:
+ pass
+ else:
+ if key not in self.OPTION_BY_NAME:
+ raise ValueError()
+ self.options[key] = value
+ continue
+ except (AttributeError, ValueError, KeyError):
+ raise ValueError("Unknown doctest option {}".format(stroption))
+ else:
+ raise ValueError(
+ "Doctest option is not a flag or a key/value pair: {} ".format(
+ stroption
+ )
+ )
+
+
+class DoctestFix(DoctestPluginHelper, Doctest):
+ pass
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
================================
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
=========================
Grammars can be parsed from strings.
+ >>> from __future__ import print_function
>>> import nltk
>>> from nltk import grammar, parse
>>> g = """
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
==================================
Feature Structures & Unification
==================================
+ >>> from __future__ import print_function
>>> from nltk.featstruct import FeatStruct
>>> from nltk.sem.logic import Variable, VariableExpression, Expression
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
========
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
===============================================
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
=======================================
Finding the top n words that are similar to a target word is simple. The result is the list of n words with the score.
>>> model.most_similar(positive=['university'], topn = 3)
- [('universities', 0.70039...), ('faculty', 0.67809...), ('undergraduate', 0.65870...)]
+ [(u'universities', 0.70039...), (u'faculty', 0.67809...), (u'undergraduate', 0.65870...)]
Finding a word that is not in a list is also supported, although implementing this yourself is simple.
the vector 'King - Man + Woman' is close to 'Queen' and 'Germany - Berlin + Paris' is close to 'France'.
>>> model.most_similar(positive=['woman','king'], negative=['man'], topn = 1)
- [('queen', 0.71181...)]
+ [(u'queen', 0.71181...)]
>>> model.most_similar(positive=['Paris','Germany'], negative=['Berlin'], topn = 1)
- [('France', 0.78840...)]
+ [(u'France', 0.78840...)]
We can visualize the word embeddings using t-SNE (http://lvdmaaten.github.io/tsne/). For this demonstration, we visualize the first 1000 words.
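A minimal sketch of such a visualization (not part of the original doctest;
the ``index2word`` attribute is assumed from gensim 3.x, and scikit-learn and
matplotlib are assumed to be installed)::

    import numpy as np
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    words = model.index2word[:1000]                 # first 1000 vocabulary entries
    vectors = np.array([model[word] for word in words])
    coords = TSNE(n_components=2, random_state=0).fit_transform(vectors)

    plt.figure(figsize=(12, 12))
    plt.scatter(coords[:, 0], coords[:, 1], s=2)
    for (x, y), word in zip(coords, words):
        plt.annotate(word, xy=(x, y), fontsize=6)
    plt.savefig('word_embeddings_tsne.png')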
# -*- coding: utf-8 -*-
+from __future__ import absolute_import
def setup_module(module):
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
==============================================================================
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
.. see also: gluesemantics.doctest
# -*- coding: utf-8 -*-
+from __future__ import absolute_import
def setup_module(module):
from nltk.parse.malt import MaltParser
try:
- depparser = MaltParser("maltparser-1.7.2")
+ depparser = MaltParser('maltparser-1.7.2')
except LookupError:
raise SkipTest("MaltParser is not available")
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
===============
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
==========================
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
.. _align howto: align.html
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
====================================
# -*- coding: utf-8 -*-
+from __future__ import absolute_import
def setup_module(module):
try:
m = Mace()
- m._find_binary("mace4")
+ m._find_binary('mace4')
except LookupError:
raise SkipTest(
"Mace4/Prover9 is not available so inference.doctest was skipped"
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
==========================================
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
============================
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
.. -*- coding: utf-8 -*-
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
=======================
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-.. -*- coding: utf-8 -*-
-
-=============
-METEOR tests
-=============
-
-No Allignment test
-------------------
-
- >>> from nltk.translate import meteor
-
-If the candidate has no alignment to any of the references, the METEOR score is 0.
-
- >>> round(meteor(
- ... ['The candidate has no alignment to any of the references'],
- ... 'John loves Mary'
- ... ),4)
- 0.0
-
-Tests based on wikipedia examples
----------------------------------
-
-Testing on `wikipedia examples <https://en.wikipedia.org/wiki/METEOR#Examples>`_
-
- >>> same_res = round(meteor(
- ... ['The cat sat on the mat'],
- ... 'The cat sat on the mat'
- ... ),4)
- >>> abs(same_res - 0.9977) < 1e-2
- True
-
- >>> meteor(
- ... ['The cat sat on the mat'],
- ... 'on the mat sat the cat'
- ... )
- 0.5
-
- >>> round(meteor(
- ... ['The cat sat on the mat'],
- ... 'The cat was sat on the mat'
- ... ),4)
- 0.9654
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
=======
The `nltk.metrics` package provides a variety of *evaluation measures*
which can be used for a wide variety of NLP tasks.
+ >>> from __future__ import print_function
>>> from nltk.metrics import *
------------------
>>> edit_distance("rain", "shine")
3
- >>> edit_distance_align("shine", "shine")
- [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)]
- >>> edit_distance_align("rain", "brainy")
- [(0, 0), (1, 1), (1, 2), (2, 3), (3, 4), (4, 5), (4, 6)]
- >>> edit_distance_align("", "brainy")
- [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6)]
- >>> edit_distance_align("", "")
- [(0, 0)]
Other distance measures:
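For instance, the set-based Jaccard distance is also available (a minimal sketch, assuming `jaccard_distance` is among the names exported above):

    >>> jaccard_distance(set('rain'), set('shine'))  # doctest: +ELLIPSIS
    0.71...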
True
>>> tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts, N)
True
-
-
-For fourgrams, we have to provide more count information:
-
- >>> n_w1_w2_w3_w4 = 5
- >>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40
- >>> n_w1_w2_w3, n_w2_w3_w4 = 20, 10
- >>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3)
- >>> triplet_counts = (n_w1_w2_w3, n_w2_w3_w4)
- >>> n_w1, n_w2, n_w3, n_w4 = 100, 200, 300, 400
- >>> uni_counts = (n_w1, n_w2, n_w3, n_w4)
- >>> N = 14307668
- >>> qam = QuadgramAssocMeasures
- >>> qam.raw_freq(n_w1_w2_w3_w4, pair_counts, triplet_counts, uni_counts, N) == 1. * n_w1_w2_w3_w4 / N
- True
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
--------------------------------------------------------------------------------
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
======================
# -*- coding: utf-8 -*-
+from __future__ import absolute_import
def setup_module(module):
try:
m = Mace()
- m._find_binary("mace4")
+ m._find_binary('mace4')
except LookupError:
raise SkipTest(
"Mace4/Prover9 is not available so nonmonotonic.doctest was skipped"
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
=========
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
==================================
>>> [w for w in psent1 if w.endswith('a')]
['da', 'gl\xf3ria', 'era', 'a', 'coisa', 'humana', 'a', 'sua', 'genu\xedna']
>>> [w for w in ptext4 if len(w) > 15]
- ['norte-irlandeses', 'pan-nacionalismo', 'predominatemente', 'primeiro-ministro',
- 'primeiro-ministro', 'irlandesa-americana', 'responsabilidades', 'significativamente']
+ [u'norte-irlandeses', u'pan-nacionalismo', u'predominatemente', u'primeiro-ministro',
+ u'primeiro-ministro', u'irlandesa-americana', u'responsabilidades', u'significativamente']
We can examine the relative frequency of words in a text, using ``FreqDist``:
>>> fd1['olhos']
137
>>> fd1.max()
- ','
+ u','
>>> fd1.samples()[:100]
- [',', '.', 'a', 'que', 'de', 'e', '-', 'o', ';', 'me', 'um', 'n\xe3o',
- '\x97', 'se', 'do', 'da', 'uma', 'com', 'os', '\xe9', 'era', 'as', 'eu',
- 'lhe', 'ao', 'em', 'para', 'mas', '...', '!', '\xe0', 'na', 'mais', '?',
- 'no', 'como', 'por', 'N\xe3o', 'dos', 'o', 'ele', ':', 'Virg\xedlia',
- 'me', 'disse', 'minha', 'das', 'O', '/', 'A', 'CAP\xcdTULO', 'muito',
- 'depois', 'coisa', 'foi', 'sem', 'olhos', 'ela', 'nos', 'tinha', 'nem',
- 'E', 'outro', 'vida', 'nada', 'tempo', 'menos', 'outra', 'casa', 'homem',
- 'porque', 'quando', 'mim', 'mesmo', 'ser', 'pouco', 'estava', 'dia',
- 't\xe3o', 'tudo', 'Mas', 'at\xe9', 'D', 'ainda', 's\xf3', 'alguma',
- 'la', 'vez', 'anos', 'h\xe1', 'Era', 'pai', 'esse', 'lo', 'dizer', 'assim',
- 'ent\xe3o', 'dizia', 'aos', 'Borba']
+ [u',', u'.', u'a', u'que', u'de', u'e', u'-', u'o', u';', u'me', u'um', u'n\xe3o',
+ u'\x97', u'se', u'do', u'da', u'uma', u'com', u'os', u'\xe9', u'era', u'as', u'eu',
+ u'lhe', u'ao', u'em', u'para', u'mas', u'...', u'!', u'\xe0', u'na', u'mais', u'?',
+ u'no', u'como', u'por', u'N\xe3o', u'dos', u'ou', u'ele', u':', u'Virg\xedlia',
+ u'meu', u'disse', u'minha', u'das', u'O', u'/', u'A', u'CAP\xcdTULO', u'muito',
+ u'depois', u'coisa', u'foi', u'sem', u'olhos', u'ela', u'nos', u'tinha', u'nem',
+ u'E', u'outro', u'vida', u'nada', u'tempo', u'menos', u'outra', u'casa', u'homem',
+ u'porque', u'quando', u'mim', u'mesmo', u'ser', u'pouco', u'estava', u'dia',
+ u't\xe3o', u'tudo', u'Mas', u'at\xe9', u'D', u'ainda', u's\xf3', u'alguma',
+ u'la', u'vez', u'anos', u'h\xe1', u'Era', u'pai', u'esse', u'lo', u'dizer', u'assim',
+ u'ent\xe3o', u'dizia', u'aos', u'Borba']
---------------
Reading Corpora
[['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', 'milh\xe3o',
'em', 'a', 'venda', 'de', 'a', 'Pinhal', 'em', 'S\xe3o', 'Paulo'],
['Programe', 'sua', 'viagem', 'a', 'a', 'Exposi\xe7\xe3o', 'Nacional',
- 'do', 'Zeb', ',', 'que', 'come\xe7a', 'dia', '25'], ...]
+ 'do', 'Zebu', ',', 'que', 'come\xe7a', 'dia', '25'], ...]
>>> nltk.corpus.mac_morpho.tagged_words()
[('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ...]
('Paulo', 'NPROP')],
[('Programe', 'V'), ('sua', 'PROADJ'), ('viagem', 'N'), ('a', 'PREP|+'),
('a', 'ART'), ('Exposi\xe7\xe3o', 'NPROP'), ('Nacional', 'NPROP'),
- ('do', 'NPROP'), ('Zeb', 'NPROP'), (',', ','), ('que', 'PRO-KS-REL'),
+ ('do', 'NPROP'), ('Zebu', 'NPROP'), (',', ','), ('que', 'PRO-KS-REL'),
('come\xe7a', 'V'), ('dia', 'N'), ('25', 'N|AP')], ...]
This data can be used to train taggers (examples below for the Floresta treebank).
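As a quick sketch of that idea on `mac_morpho` itself (hypothetical; the examples referenced below use the Floresta treebank), a unigram tagger can be trained on the tagged sentences, with a default-tag backoff for unseen tokens:

    >>> import nltk
    >>> tsents = nltk.corpus.mac_morpho.tagged_sents()
    >>> tagger = nltk.UnigramTagger(tsents, backoff=nltk.DefaultTagger('N'))
    >>> tagger.tag(['fooquux'])  # an unseen token falls back to the default tag
    [('fooquux', 'N')]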
# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+from nltk.compat import PY3
+
from nltk.corpus import teardown_module
raise SkipTest(
"portuguese_en.doctest imports nltk.examples.pt which doesn't exist!"
)
+
+ if not PY3:
+ raise SkipTest(
+ "portuguese_en.doctest was skipped because non-ascii doctests are not supported under Python 2.x"
+ )
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
===========
# -*- coding: utf-8 -*-
+from __future__ import absolute_import
# probability.doctest uses HMM which requires numpy;
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
========
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
======================
>>> from nltk.corpus import conll2002
>>> for doc in conll2002.chunked_sents('ned.train')[27]:
... print(doc)
- ('Het', 'Art')
+ (u'Het', u'Art')
(ORG Hof/N van/Prep Cassatie/N)
- ('verbrak', 'V')
- ('het', 'Art')
- ('arrest', 'N')
- ('zodat', 'Conj')
- ('het', 'Pron')
- ('moest', 'V')
- ('worden', 'V')
- ('overgedaan', 'V')
- ('door', 'Prep')
- ('het', 'Art')
- ('hof', 'N')
- ('van', 'Prep')
- ('beroep', 'N')
- ('van', 'Prep')
+ (u'verbrak', u'V')
+ (u'het', u'Art')
+ (u'arrest', u'N')
+ (u'zodat', u'Conj')
+ (u'het', u'Pron')
+ (u'moest', u'V')
+ (u'worden', u'V')
+ (u'overgedaan', u'V')
+ (u'door', u'Prep')
+ (u'het', u'Art')
+ (u'hof', u'N')
+ (u'van', u'Prep')
+ (u'beroep', u'N')
+ (u'van', u'Prep')
(LOC Antwerpen/N)
- ('.', 'Punc')
+ (u'.', u'Punc')
Relation Extraction
~~~~~~~~~~~~~~~~~~~
... for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
>>> for r in rels[:10]:
... print(relextract.clause(r, relsym='DE')) # doctest: +NORMALIZE_WHITESPACE
- DE('tribunal_supremo', 'victoria')
- DE('museo_de_arte', 'alcorc\xf3n')
- DE('museo_de_bellas_artes', 'a_coru\xf1a')
- DE('siria', 'l\xedbano')
- DE('uni\xf3n_europea', 'pek\xedn')
- DE('ej\xe9rcito', 'rogberi')
- DE('juzgado_de_instrucci\xf3n_n\xfamero_1', 'san_sebasti\xe1n')
- DE('psoe', 'villanueva_de_la_serena')
- DE('ej\xe9rcito', 'l\xedbano')
- DE('juzgado_de_lo_penal_n\xfamero_2', 'ceuta')
+ DE(u'tribunal_supremo', u'victoria')
+ DE(u'museo_de_arte', u'alcorc\xf3n')
+ DE(u'museo_de_bellas_artes', u'a_coru\xf1a')
+ DE(u'siria', u'l\xedbano')
+ DE(u'uni\xf3n_europea', u'pek\xedn')
+ DE(u'ej\xe9rcito', u'rogberi')
+ DE(u'juzgado_de_instrucci\xf3n_n\xfamero_1', u'san_sebasti\xe1n')
+ DE(u'psoe', u'villanueva_de_la_serena')
+ DE(u'ej\xe9rcito', u'l\xedbano')
+ DE(u'juzgado_de_lo_penal_n\xfamero_2', u'ceuta')
>>> vnv = """
... (
... is/V|
>>> for doc in conll2002.chunked_sents('ned.train'):
... for r in relextract.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
... print(relextract.clause(r, relsym="VAN"))
- VAN("cornet_d'elzius", 'buitenlandse_handel')
- VAN('johan_rottiers', 'kardinaal_van_roey_instituut')
- VAN('annie_lennox', 'eurythmics')
+ VAN(u"cornet_d'elzius", u'buitenlandse_handel')
+ VAN(u'johan_rottiers', u'kardinaal_van_roey_instituut')
+ VAN(u'annie_lennox', u'eurythmics')
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
=========================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+from __future__ import absolute_import, print_function
import sys
import os
import nose
from nose.plugins.doctests import Doctest
from nose.plugins import builtin
-NLTK_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+NLTK_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
sys.path.insert(0, NLTK_ROOT)
-NLTK_TEST_DIR = os.path.join(NLTK_ROOT, "nltk")
+NLTK_TEST_DIR = os.path.join(NLTK_ROOT, 'nltk')
-if __name__ == "__main__":
+if __name__ == '__main__':
# there shouldn't be any imports from NLTK yet, for coverage to work properly
+ from doctest_nose_plugin import DoctestFix
+
try:
# Import RedNose plugin for colored test output
from rednose import RedNose
def loadPlugins(self):
for plug in builtin.plugins:
- self.addPlugin(plug())
+ if plug != Doctest:
+ self.addPlugin(plug())
+ self.addPlugin(DoctestFix())
if rednose_available:
self.addPlugin(RedNose())
if not args:
args = [NLTK_TEST_DIR]
- if all(arg.startswith("-") for arg in args):
+ if all(arg.startswith('-') for arg in args):
# only extra options were passed
args += [NLTK_TEST_DIR]
# Activate RedNose and hide skipped test messages from output
if rednose_available:
- args += ["--rednose", "--hide-skips"]
+ args += ['--rednose', '--hide-skips']
arguments = [
- "--exclude=", # why is this needed?
+ '--exclude=', # why is this needed?
# '--with-xunit',
# '--xunit-file=$WORKSPACE/nosetests.xml',
# '--nocapture',
- "--with-doctest",
+ '--with-doctest',
# '--doctest-tests',
# '--debug=nose,nose.importer,nose.inspector,nose.plugins,nose.result,nose.selector',
- "--doctest-extension=.doctest",
- "--doctest-fixtures=_fixt",
- "--doctest-options=+ELLIPSIS,+NORMALIZE_WHITESPACE,+IGNORE_EXCEPTION_DETAIL",
+ '--doctest-extension=.doctest',
+ '--doctest-fixtures=_fixt',
+ '--doctest-options=+ELLIPSIS,+NORMALIZE_WHITESPACE,+IGNORE_EXCEPTION_DETAIL,+ALLOW_UNICODE,'
+ 'doctestencoding=utf-8',
# '--verbosity=3',
] + args
# -*- coding: utf-8 -*-
+from __future__ import absolute_import
# skip segmentation.doctest if numpy is not available
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
=========
# -*- coding: utf-8 -*-
+from __future__ import absolute_import
# reset the variables counter before running tests
def setup_module(module):
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
===================
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
======================
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
=================
This file contains some simple tests that will be run by EasyInstall in
order to test the installation when NLTK-Data is absent.
-
+ >>> from __future__ import print_function
+
------------
Tokenization
------------
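Because these checks must run even when NLTK-Data is absent, the regular-expression tokenizer, which needs no downloaded models, makes a reasonable smoke test (a minimal sketch):

    >>> from nltk.tokenize import regexp_tokenize
    >>> regexp_tokenize("Good muffins cost $3.88 in New York.", pattern=r'\w+|\$[\d\.]+|\S+')
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.']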
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
==========
Stemmers remove morphological affixes from words, leaving only the
word stem.
+ >>> from __future__ import print_function
>>> from nltk.stem import *
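For example (a minimal sketch using the Porter stemmer exported above):

    >>> stemmer = PorterStemmer()
    >>> stemmer.stem('running')
    'run'
    >>> stemmer.stem('caresses')
    'caress'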
Unit tests for the Porter stemmer
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
Regression Tests
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
+ >>> from __future__ import print_function
>>> from nltk.tokenize import *
Regression Tests: Treebank Tokenizer
Testing improvements made to the TreebankWordTokenizer
- >>> sx1 = '\xabNow that I can do.\xbb'
- >>> expected = ['\xab', 'Now', 'that', 'I', 'can', 'do', '.', '\xbb']
+ >>> sx1 = u'\xabNow that I can do.\xbb'
+ >>> expected = [u'\xab', u'Now', u'that', u'I', u'can', u'do', u'.', u'\xbb']
>>> word_tokenize(sx1) == expected
True
- >>> sx2 = 'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.'
- >>> expected = ['The', 'unicode', '201C', 'and', '201D', '\u201c', 'LEFT', '(', 'RIGHT', ')', 'DOUBLE', 'QUOTATION', 'MARK', '\u201d', 'is', 'also', 'OPEN_PUNCT', 'and', 'CLOSE_PUNCT', '.']
+ >>> sx2 = u'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.'
+ >>> expected = [u'The', u'unicode', u'201C', u'and', u'201D', u'\u201c', u'LEFT', u'(', u'RIGHT', u')', u'DOUBLE', u'QUOTATION', u'MARK', u'\u201d', u'is', u'also', u'OPEN_PUNCT', u'and', u'CLOSE_PUNCT', u'.']
>>> word_tokenize(sx2) == expected
True
>>> tknzr = TweetTokenizer()
>>> s10 = "Photo: Aujourd'hui sur http://t.co/0gebOFDUzn Projet... http://t.co/bKfIUbydz2.............................. http://fb.me/3b6uXpz0L"
>>> tknzr.tokenize(s10)
- ['Photo', ':', "Aujourd'hui", 'sur', 'http://t.co/0gebOFDUzn', 'Projet', '...', 'http://t.co/bKfIUbydz2', '...', 'http://fb.me/3b6uXpz0L']
+ [u'Photo', u':', u"Aujourd'hui", u'sur', u'http://t.co/0gebOFDUzn', u'Projet', u'...', u'http://t.co/bKfIUbydz2', u'...', u'http://fb.me/3b6uXpz0L']
Regression Tests: PunktSentenceTokenizer
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
===============================
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
.. -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
+from __future__ import absolute_import
from nltk.corpus import teardown_module
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
===============================
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
========================================================
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
-------------------------------------------
+# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Model Unit Tests
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import unittest
+import six
+
from nltk import FreqDist
from nltk.lm import NgramCounter
from nltk.util import everygrams
bigrams = self.trigram_counter[2]
trigrams = self.trigram_counter[3]
- self.assertCountEqual(expected_bigram_contexts, bigrams.conditions())
- self.assertCountEqual(expected_trigram_contexts, trigrams.conditions())
+ six.assertCountEqual(self, expected_bigram_contexts, bigrams.conditions())
+ six.assertCountEqual(self, expected_trigram_contexts, trigrams.conditions())
def test_bigram_counts_seen_ngrams(self):
b_given_a_count = 1
self.assertFalse(bool(counter[3]))
self.assertFalse(bool(counter[2]))
- self.assertCountEqual(words, counter[1].keys())
+ six.assertCountEqual(self, words, counter[1].keys())
def test_train_on_illegal_sentences(self):
str_sent = ["Check", "this", "out", "!"]
bigram_contexts = [("a",), ("c",)]
trigram_contexts = [("e", "f")]
- self.assertCountEqual(unigrams, counter[1].keys())
- self.assertCountEqual(bigram_contexts, counter[2].keys())
- self.assertCountEqual(trigram_contexts, counter[3].keys())
+ six.assertCountEqual(self, unigrams, counter[1].keys())
+ six.assertCountEqual(self, bigram_contexts, counter[2].keys())
+ six.assertCountEqual(self, trigram_contexts, counter[3].keys())
+# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Model Unit Tests
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import division
import math
+import sys
import unittest
+from six import add_metaclass
from nltk.lm import (
Vocabulary,
dct["test_score_{0}".format(i)] = cls.add_score_test(
word, context, expected_score
)
- return super().__new__(cls, name, bases, dct)
+ return super(ParametrizeTestsMeta, cls).__new__(cls, name, bases, dct)
@classmethod
def add_score_test(cls, word, context, expected_score):
- message = "word='{word}', context={context}"
+ if sys.version_info > (3, 5):
+ message = "word='{word}', context={context}"
+ else:
+ # Python 2 doesn't report the mismatched values if we pass a custom
+ # message, so we have to report them manually.
+ message = (
+ "{score} != {expected_score} within 4 places, "
+ "word='{word}', context={context}"
+ )
def test_method(self):
score = self.model.score(word, context)
return test
-class MleBigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
- """Unit tests for MLE ngram model."""
+@add_metaclass(ParametrizeTestsMeta)
+class MleBigramTests(unittest.TestCase):
+ """unit tests for MLENgramModel class"""
score_tests = [
("d", ["c"], 1),
self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)
-class MleTrigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
+@add_metaclass(ParametrizeTestsMeta)
+class MleTrigramTests(unittest.TestCase):
"""MLE trigram model tests"""
score_tests = [
self.model.fit(training_text)
-class LidstoneBigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
- """Unit tests for Lidstone class"""
+@add_metaclass(ParametrizeTestsMeta)
+class LidstoneBigramTests(unittest.TestCase):
+ """unit tests for Lidstone class"""
score_tests = [
# count(d | c) = 1
self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)
-class LidstoneTrigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
+@add_metaclass(ParametrizeTestsMeta)
+class LidstoneTrigramTests(unittest.TestCase):
score_tests = [
# Logic behind this is the same as for bigram model
("d", ["c"], 1.1 / 1.8),
self.model.fit(training_text)
-class LaplaceBigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
- """Unit tests for Laplace class"""
+@add_metaclass(ParametrizeTestsMeta)
+class LaplaceBigramTests(unittest.TestCase):
+ """unit tests for Laplace class"""
score_tests = [
# basic sanity-check:
self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)
-class WittenBellInterpolatedTrigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
+@add_metaclass(ParametrizeTestsMeta)
+class WittenBellInterpolatedTrigramTests(unittest.TestCase):
def setUp(self):
vocab, training_text = _prepare_test_data(3)
self.model = WittenBellInterpolated(3, vocabulary=vocab)
# gamma(['a', 'b']) = 0.0667
# mle("c", ["a", "b"]) = 1
("c", ["a", "b"], (1 - 0.0667) + 0.0667 * ((1 - 0.1111) * 0.5 + 0.1111 / 18)),
- # The ngram 'z b c' was not seen, so we should simply revert to
- # the score of the ngram 'b c'. See issue #2332.
- ("c", ["z", "b"], ((1 - 0.1111) * 0.5 + 0.1111 / 18)),
]
-class KneserNeyInterpolatedTrigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
+@add_metaclass(ParametrizeTestsMeta)
+class KneserNeyInterpolatedTrigramTests(unittest.TestCase):
def setUp(self):
vocab, training_text = _prepare_test_data(3)
self.model = KneserNeyInterpolated(3, vocabulary=vocab)
# gamma(['a', 'b']) = 0.1 * 1
# normalizer = total number of trigrams with prefix "ab" = 1 => we can ignore it!
("c", ["a", "b"], 0.9 + 0.1 * ((0.9 + 0.2 * (1 / 8)) / 2)),
- # The ngram 'z b c' was not seen, so we should simply revert to
- # the score of the ngram 'b c'. See issue #2332.
- ("c", ["z", "b"], ((0.9 + 0.2 * (1 / 8)) / 2)),
]
class NgramModelTextGenerationTests(unittest.TestCase):
- """Using MLE model, generate some text."""
+ """Using MLE estimator, generate some text."""
def setUp(self):
vocab, training_text = _prepare_test_data(3)
self.model.generate(text_seed=("a", "<s>"), random_seed=2), "a"
)
- def test_generate_cycle(self):
- # Add a cycle to the model: bd -> b, db -> d
- more_training_text = [list(padded_everygrams(self.model.order, list("bdbdbd")))]
- self.model.fit(more_training_text)
- # Test that we can escape the cycle
+ def test_generate_no_seed_unigrams(self):
self.assertEqual(
- self.model.generate(7, text_seed=("b", "d"), random_seed=5),
- ["b", "d", "b", "d", "b", "d", "</s>"],
+ self.model.generate(5, random_seed=3),
+ ["<UNK>", "</s>", "</s>", "</s>", "</s>"],
)
def test_generate_with_text_seed(self):
+# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Model Unit Tests
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Model Unit Tests
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import unittest
from collections import Counter
+import six
from nltk.lm import Vocabulary
vocab_counts = ["a", "b", "c", "d", "e", "f", "g", "w", "z"]
vocab_items = ["a", "b", "d", "e", "<UNK>"]
- self.assertCountEqual(vocab_counts, list(self.vocab.counts.keys()))
- self.assertCountEqual(vocab_items, list(self.vocab))
+ six.assertCountEqual(self, vocab_counts, list(self.vocab.counts.keys()))
+ six.assertCountEqual(self, vocab_items, list(self.vocab))
def test_update_empty_vocab(self):
empty = Vocabulary(unk_cutoff=2)
def test_str(self):
self.assertEqual(
- str(self.vocab), "<Vocabulary with cutoff=2 unk_label='<UNK>' and 5 items>"
+ str(self.vocab),
+ ("<Vocabulary with cutoff=2 " "unk_label='<UNK>' and 5 items>"),
)
def test_creation_with_counter(self):
--- /dev/null
+# -*- coding: utf-8 -*-
+"""
+Unit tests for nltk.compat.
+See also nltk/test/compat.doctest.
+"""
+from __future__ import absolute_import, unicode_literals
+import unittest
+
+from nltk.text import Text
+from nltk.compat import PY3, python_2_unicode_compatible
+
+
+def setup_module(module):
+ from nose import SkipTest
+
+ if PY3:
+ raise SkipTest("test_2x_compat is for testing nltk.compat under Python 2.x")
+
+
+class TestTextTransliteration(unittest.TestCase):
+ txt = Text(["São", "Tomé", "and", "Príncipe"])
+
+ def test_repr(self):
+ self.assertEqual(repr(self.txt), br"<Text: S\xe3o Tom\xe9 and Pr\xedncipe...>")
+
+ def test_str(self):
+ self.assertEqual(str(self.txt), b"<Text: Sao Tome and Principe...>")
+
+
+class TestFraction(unittest.TestCase):
+ def test_unnormalized_fraction(self):
+ from fractions import Fraction as NativePythonFraction
+ from nltk.compat import Fraction as NLTKFraction
+
+ # The native fraction should throw a TypeError in Python < 3.5
+ with self.assertRaises(TypeError):
+ NativePythonFraction(0, 1000, _normalize=False)
+
+ # Using nltk.compat.Fraction in Python < 3.5
+ compat_frac = NLTKFraction(0, 1000, _normalize=False)
+ # The numerator and denominator do not change.
+ assert compat_frac.numerator == 0
+ assert compat_frac.denominator == 1000
+ # The floating point value remains normalized.
+ assert float(compat_frac) == 0.0
+
+ # Checks that the fraction is not reduced by the
+ # greatest common divisor (gcd).
+ six_twelve = NLTKFraction(6, 12, _normalize=False)
+ assert six_twelve.numerator == 6
+ assert six_twelve.denominator == 12
+
+ one_two = NLTKFraction(1, 2, _normalize=False)
+ assert one_two.numerator == 1
+ assert one_two.denominator == 2
+
+ # Checks against the native fraction.
+ six_twelve_original = NativePythonFraction(6, 12)
+ # Checks that the rational values of one_two and six_twelve are the same.
+ assert float(one_two) == float(six_twelve) == float(six_twelve_original)
+
+ # Checks that the fraction still gets normalized, even with
+ # _normalize=False, when it is constructed from a float
+ # (via the native fractions.Fraction.from_float).
+ assert NLTKFraction(3.142, _normalize=False) == NativePythonFraction(3.142)
Unit tests for nltk.metrics.aline
"""
+from __future__ import unicode_literals
import unittest
+++ /dev/null
-import unittest
-from nltk import ConditionalFreqDist, tokenize
-
-class TestEmptyCondFreq(unittest.TestCase):
- def test_tabulate(self):
- empty = ConditionalFreqDist()
- self.assertEqual(empty.conditions(),[])
- try:
- empty.tabulate(conditions="BUG") # nonexistent keys shouldn't be added
- except:
- pass
- self.assertEqual(empty.conditions(), [])
-
-
- def test_plot(self):
- empty = ConditionalFreqDist()
- self.assertEqual(empty.conditions(),[])
- try:
- empty.plot(conditions=["BUG"]) # nonexistent keys shouldn't be added
- except:
- pass
- self.assertEqual(empty.conditions(),[])
-
- def test_increment(self):
- # make sure that we can still mutate cfd normally
- text = "cow cat mouse cat tiger"
- cfd = ConditionalFreqDist()
-
- # create cfd with word length as condition
- for word in tokenize.word_tokenize(text):
- condition = len(word)
- cfd[condition][word] += 1
-
- self.assertEqual(cfd.conditions(), [3,5])
-
- # incrementing previously unseen key is still possible
- cfd[2]['hi'] += 1
- self.assertEqual(set(cfd.conditions()),set([3,5,2])) # new condition added
- self.assertEqual(cfd[2]['hi'], 1) # key's frequency incremented from 0 (unseen) to 1
+++ /dev/null
-# -*- coding: utf-8 -*-
-import unittest
-import nltk
-from nltk.grammar import CFG
-
-
-class ChomskyNormalFormForCFGTest(unittest.TestCase):
- def test_simple(self):
- grammar = CFG.fromstring(
- """
- S -> NP VP
- PP -> P NP
- NP -> Det N | NP PP P
- VP -> V NP | VP PP
- VP -> Det
- Det -> 'a' | 'the'
- N -> 'dog' | 'cat'
- V -> 'chased' | 'sat'
- P -> 'on' | 'in'
- """
- )
- self.assertFalse(grammar.is_flexible_chomsky_normal_form())
- self.assertFalse(grammar.is_chomsky_normal_form())
- grammar = grammar.chomsky_normal_form(flexible=True)
- self.assertTrue(grammar.is_flexible_chomsky_normal_form())
- self.assertFalse(grammar.is_chomsky_normal_form())
-
- grammar2 = CFG.fromstring(
- """
- S -> NP VP
- NP -> VP N P
- VP -> P
- N -> 'dog' | 'cat'
- P -> 'on' | 'in'
- """
- )
- self.assertFalse(grammar2.is_flexible_chomsky_normal_form())
- self.assertFalse(grammar2.is_chomsky_normal_form())
- grammar2 = grammar2.chomsky_normal_form()
- self.assertTrue(grammar2.is_flexible_chomsky_normal_form())
- self.assertTrue(grammar2.is_chomsky_normal_form())
-
- def test_complex(self):
- grammar = nltk.data.load('grammars/large_grammars/atis.cfg')
- self.assertFalse(grammar.is_flexible_chomsky_normal_form())
- self.assertFalse(grammar.is_chomsky_normal_form())
- grammar = grammar.chomsky_normal_form(flexible=True)
- self.assertTrue(grammar.is_flexible_chomsky_normal_form())
- self.assertFalse(grammar.is_chomsky_normal_form())
# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
import unittest
from nltk import RegexpParser
"""
Unit tests for nltk.classify. See also: nltk/test/classify.doctest
"""
+from __future__ import absolute_import
from nose import SkipTest
from nltk import classify
# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
import unittest
from nltk.collocations import BigramCollocationFinder
# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
import unittest
import contextlib
import sys
-from io import StringIO
from nose import with_setup
from nltk.corpus import gutenberg
from nltk.text import Text
+try:
+ from StringIO import StringIO
+except ImportError as e:
+ from io import StringIO
+
@contextlib.contextmanager
def stdout_redirect(where):
import sys
from itertools import chain
from unittest import TestCase, SkipTest
-from unittest.mock import MagicMock
+try:
+ from unittest.mock import MagicMock
+except ImportError:
+ raise SkipTest('unittest.mock is not supported in Python 2')
from nltk.tree import Tree
from nltk.parse import corenlp
corenlp_parser.api_call.assert_called_once_with(
"The quick brown fox jumps over the lazy dog",
- properties={'ssplit.eolonly': 'true'},
+ properties={'ssplit.ssplit.eolonly': 'true'},
)
self.assertEqual(expected_output, parsed_data)
corenlp_parser.api_call.assert_called_once_with(
"The quick brown fox jumps over the lazy dog",
- properties={'ssplit.eolonly': 'true'},
+ properties={'ssplit.ssplit.eolonly': 'true'},
)
self.assertEqual(expected_output, parsed_data.tree())
# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
import unittest
from nltk.corpus import (
udhr,
) # mwa_ppdb
+from nltk.compat import python_2_unicode_compatible
from nltk.tree import Tree
from nltk.test.unit.utils import skipIf
"""
Corpus View Regression Tests
"""
+from __future__ import absolute_import, unicode_literals
import unittest
import nltk.data
from nltk.corpus.reader.util import (
# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
import unittest
from nltk.metrics.agreement import AnnotationTask
+++ /dev/null
-import unittest
-import nltk
-
-
-class TestFreqDist(unittest.TestCase):
-
- def test_iterating_returns_an_iterator_ordered_by_frequency(self):
-
- samples = ['one', 'two', 'two']
-
- distribution = nltk.FreqDist(samples)
-
- most_frequent, less_frequent = [entry for entry in distribution]
-
- self.assertEqual(most_frequent, 'two')
- self.assertEqual(less_frequent, 'one')
# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
from nltk.tag import hmm
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter client
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import os
import unittest
-from tempfile import TemporaryDirectory
+from six.moves import zip
+
+from nltk.compat import TemporaryDirectory
from nltk.corpus import twitter_samples
from nltk.twitter.common import json2csv, json2csv_entities
+++ /dev/null
-import unittest
-
-from nltk.corpus import brown
-from nltk.jsontags import JSONTaggedDecoder, JSONTaggedEncoder
-from nltk.tag import DefaultTagger, RegexpTagger, AffixTagger
-from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, NgramTagger
-from nltk.tag import PerceptronTagger
-from nltk.tag import BrillTaggerTrainer, BrillTagger
-from nltk.tag.brill import nltkdemo18
-
-
-class TestJSONSerialization(unittest.TestCase):
- def setUp(self):
- self.corpus = brown.tagged_sents()[:35]
- self.decoder = JSONTaggedDecoder()
- self.encoder = JSONTaggedEncoder()
- self.default_tagger = DefaultTagger("NN")
-
- def test_default_tagger(self):
- encoded = self.encoder.encode(self.default_tagger)
- decoded = self.decoder.decode(encoded)
-
- self.assertEqual(repr(self.default_tagger), repr(decoded))
- self.assertEqual(self.default_tagger._tag, decoded._tag)
-
- def test_regexp_tagger(self):
- tagger = RegexpTagger([(r".*", "NN")], backoff=self.default_tagger)
-
- encoded = self.encoder.encode(tagger)
- decoded = self.decoder.decode(encoded)
-
- self.assertEqual(repr(tagger), repr(decoded))
- self.assertEqual(repr(tagger.backoff), repr(decoded.backoff))
- self.assertEqual(tagger._regexps, decoded._regexps)
-
- def test_affix_tagger(self):
- tagger = AffixTagger(self.corpus, backoff=self.default_tagger)
-
- encoded = self.encoder.encode(tagger)
- decoded = self.decoder.decode(encoded)
-
- self.assertEqual(repr(tagger), repr(decoded))
- self.assertEqual(repr(tagger.backoff), repr(decoded.backoff))
- self.assertEqual(tagger._affix_length, decoded._affix_length)
- self.assertEqual(tagger._min_word_length, decoded._min_word_length)
- self.assertEqual(tagger._context_to_tag, decoded._context_to_tag)
-
- def test_ngram_taggers(self):
- unitagger = UnigramTagger(self.corpus, backoff=self.default_tagger)
- bitagger = BigramTagger(self.corpus, backoff=unitagger)
- tritagger = TrigramTagger(self.corpus, backoff=bitagger)
- ntagger = NgramTagger(4, self.corpus, backoff=tritagger)
-
- encoded = self.encoder.encode(ntagger)
- decoded = self.decoder.decode(encoded)
-
- self.assertEqual(repr(ntagger), repr(decoded))
- self.assertEqual(repr(tritagger), repr(decoded.backoff))
- self.assertEqual(repr(bitagger), repr(decoded.backoff.backoff))
- self.assertEqual(repr(unitagger), repr(decoded.backoff.backoff.backoff))
- self.assertEqual(repr(self.default_tagger),
- repr(decoded.backoff.backoff.backoff.backoff))
-
- def test_perceptron_tagger(self):
- tagger = PerceptronTagger(load=False)
- tagger.train(self.corpus)
-
- encoded = self.encoder.encode(tagger)
- decoded = self.decoder.decode(encoded)
-
- self.assertEqual(tagger.model.weights, decoded.model.weights)
- self.assertEqual(tagger.tagdict, decoded.tagdict)
- self.assertEqual(tagger.classes, decoded.classes)
-
- def test_brill_tagger(self):
- trainer = BrillTaggerTrainer(self.default_tagger, nltkdemo18(),
- deterministic=True)
- tagger = trainer.train(self.corpus, max_rules=30)
-
- encoded = self.encoder.encode(tagger)
- decoded = self.decoder.decode(encoded)
-
- self.assertEqual(repr(tagger._initial_tagger),
- repr(decoded._initial_tagger))
- self.assertEqual(tagger._rules, decoded._rules)
- self.assertEqual(tagger._training_stats, decoded._training_stats)
-
# -*- coding: utf-8 -*-
+from __future__ import print_function, unicode_literals
import unittest
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Unit tests for nltk.corpus.nombank
-"""
-
-import unittest
-
-from nltk.corpus import nombank
-# Load the nombank once.
-nombank.nouns()
-
-class NombankDemo(unittest.TestCase):
- def test_numbers(self):
- # No. of instances.
- self.assertEqual(len(nombank.instances()), 114574)
- # No. of rolesets
- self.assertEqual(len(nombank.rolesets()), 5577)
- # No. of nouns.
- self.assertEqual(len(nombank.nouns()), 4704)
-
-
- def test_instance(self):
- self.assertEqual(nombank.instances()[0].roleset, 'perc-sign.01')
-
- def test_framefiles_fileids(self):
- self.assertEqual(len(nombank.fileids()), 4705)
- self.assertTrue(all(fileid.endswith('.xml') for fileid in nombank.fileids()))
+++ /dev/null
-import unittest
-
-import nltk
-from nltk.corpus.reader import pl196x
-
-
-class TestCorpusViews(unittest.TestCase):
-
- def test_corpus_reader(self):
- pl196x_dir = nltk.data.find('corpora/pl196x')
- pl = pl196x.Pl196xCorpusReader(pl196x_dir, r'.*\.xml',
- textids='textids.txt',
- cat_file='cats.txt')
- pl.tagged_words(fileids=pl.fileids(), categories='cats.txt')
Tests for nltk.pos_tag
"""
+from __future__ import unicode_literals
import unittest
# -*- coding: utf-8 -*-
+from __future__ import print_function, unicode_literals
import unittest
The following test performs a random series of reads, seeks, and
tells, and checks that the results are consistent.
"""
+from __future__ import absolute_import, unicode_literals
import random
import functools
from io import BytesIO
Unit tests for Senna
"""
+from __future__ import unicode_literals
from os import environ, path, sep
import logging
# -*- coding: utf-8 -*-
+from __future__ import print_function, unicode_literals
import os
import unittest
assert ar_stemmer.stem("الكلمات") == "كلم"
def test_russian(self):
+ # Russian words written in Cyrillic as well as in
+ # Roman transliteration can be stemmed.
stemmer_russian = SnowballStemmer("russian")
assert stemmer_russian.stem("авантненькая") == "авантненьк"
+ assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k"
def test_german(self):
stemmer_german = SnowballStemmer("german")
# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
def test_basic():
#
# Natural Language Toolkit: TGrep search
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Will Roberts <wildwilhelm@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
Unit tests for nltk.tgrep.
'''
+from __future__ import absolute_import, print_function, unicode_literals
import unittest
+from six import b
+
from nltk.tree import ParentedTree
from nltk import tgrep
Test that tokenization handles bytes and strs the same way.
'''
self.assertEqual(
- tgrep.tgrep_tokenize(b'A .. (B !< C . D) | ![<< (E , F) $ G]'),
+ tgrep.tgrep_tokenize(b('A .. (B !< C . D) | ![<< (E , F) $ G]')),
tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]'),
)
'(S (NP (DT the) (JJ big) (NN dog)) ' '(VP bit) (NP (DT a) (NN cat)))'
)
self.assertEqual(
- list(tgrep.tgrep_positions(b'NN', [tree])),
- list(tgrep.tgrep_positions(b'NN', [tree])),
+ list(tgrep.tgrep_positions(b('NN'), [tree])),
+ list(tgrep.tgrep_positions('NN', [tree])),
)
self.assertEqual(
- list(tgrep.tgrep_nodes(b'NN', [tree])),
+ list(tgrep.tgrep_nodes(b('NN'), [tree])),
list(tgrep.tgrep_nodes('NN', [tree])),
)
self.assertEqual(
- list(tgrep.tgrep_positions(b'NN|JJ', [tree])),
+ list(tgrep.tgrep_positions(b('NN|JJ'), [tree])),
list(tgrep.tgrep_positions('NN|JJ', [tree])),
)
See also nltk/test/tokenize.doctest
"""
+from __future__ import unicode_literals
+import os
import unittest
from nose import SkipTest
-from nose.tools import assert_equal
-from nltk.tokenize import (
- punkt,
- word_tokenize,
- TweetTokenizer,
- StanfordSegmenter,
- TreebankWordTokenizer,
- SyllableTokenizer,
-)
+from nltk.tokenize import word_tokenize
+from nltk.tokenize import TweetTokenizer, StanfordSegmenter, TreebankWordTokenizer
class TestTokenize(unittest.TestCase):
'français',
]
self.assertEqual(tokens, expected)
-
- def test_sonority_sequencing_syllable_tokenizer(self):
- """
- Test SyllableTokenizer tokenizer.
- """
- tokenizer = SyllableTokenizer()
- tokens = tokenizer.tokenize('justification')
- self.assertEqual(tokens, ['jus', 'ti', 'fi', 'ca', 'tion'])
def test_stanford_segmenter_arabic(self):
"""
expected = ['(', '393', ')', "928 -3010"]
result = tokenizer.tokenize(test2)
self.assertEqual(result, expected)
-
- def test_pad_asterisk(self):
- """
- Test padding of asterisk for word tokenization.
- """
- text = "This is a, *weird sentence with *asterisks in it."
- expected = ['This', 'is', 'a', ',', '*', 'weird', 'sentence',
- 'with', '*', 'asterisks', 'in', 'it', '.']
- self.assertEqual(word_tokenize(text), expected)
-
- def test_pad_dotdot(self):
- """
- Test padding of dotdot* for word tokenization.
- """
- text = "Why did dotdot.. not get tokenized but dotdotdot... did? How about manydots....."
- expected = ['Why', 'did', 'dotdot', '..', 'not', 'get',
- 'tokenized', 'but', 'dotdotdot', '...', 'did', '?',
- 'How', 'about', 'manydots', '.....']
- self.assertEqual(word_tokenize(text), expected)
def test_remove_handle(self):
"""
result = list(tokenizer.span_tokenize(test3))
self.assertEqual(result, expected)
+
def test_word_tokenize(self):
"""
Test word_tokenize function
sentence = "'v' 're'"
expected = ["'", 'v', "'", "'re", "'"]
self.assertEqual(word_tokenize(sentence), expected)
-
- def test_punkt_pair_iter(self):
-
- test_cases = [
- ('12', [('1', '2'), ('2', None)]),
- ('123', [('1', '2'), ('2', '3'), ('3', None)]),
- ('1234', [('1', '2'), ('2', '3'), ('3', '4'), ('4', None)]),
- ]
-
- for (test_input, expected_output) in test_cases:
- actual_output = [x for x in punkt._pair_iter(test_input)]
-
- assert_equal(actual_output, expected_output)
-
- def test_punkt_pair_iter_handles_stop_iteration_exception(self):
- # test input to trigger StopIteration from next()
- it = iter([])
- # call method under test and produce a generator
- gen = punkt._pair_iter(it)
- # unpack generator, ensure that no error is raised
- list(gen)
-
- def test_punkt_tokenize_words_handles_stop_iteration_exception(self):
- obj = punkt.PunktBaseClass()
-
- class TestPunktTokenizeWordsMock:
- def word_tokenize(self, s):
- return iter([])
-
- obj._lang_vars = TestPunktTokenizeWordsMock()
- # unpack generator, ensure that no error is raised
- list(obj._tokenize_words('test'))
See also nltk/test/wordnet.doctest
"""
+from __future__ import unicode_literals
-import collections
import os
import unittest
self.assertAlmostEqual(
S('dog.n.01').lin_similarity(S('cat.n.01'), semcor_ic), 0.8863, places=3
)
-
- def test_omw_lemma_no_trailing_underscore(self):
- expected = sorted([
- u'popolna_sprememba_v_mišljenju',
- u'popoln_obrat',
- u'preobrat',
- u'preobrat_v_mišljenju'
- ])
- self.assertEqual(sorted(S('about-face.n.02').lemma_names(lang='slv')), expected)
-
- def test_iterable_type_for_all_lemma_names(self):
- # Duck-test for iterables.
- # See https://stackoverflow.com/a/36230057/610569
- cat_lemmas = wn.all_lemma_names(lang='cat')
- eng_lemmas = wn.all_lemma_names(lang='eng')
-
- self.assertTrue(hasattr(eng_lemmas, '__iter__'))
- self.assertTrue(hasattr(eng_lemmas, '__next__') or hasattr(eng_lemmas, 'next'))
- self.assertTrue(eng_lemmas.__iter__() is eng_lemmas)
-
- self.assertTrue(hasattr(cat_lemmas, '__iter__'))
- self.assertTrue(hasattr(cat_lemmas, '__next__') or hasattr(eng_lemmas, 'next'))
- self.assertTrue(cat_lemmas.__iter__() is cat_lemmas)
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Stack decoder
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# -*- coding: utf-8 -*-
+from __future__ import absolute_import
from unittest import TestCase
from functools import wraps
from nose.plugins.skip import SkipTest
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
=================
Utility functions
=================
+ >>> from __future__ import print_function
>>> from nltk.util import *
>>> from nltk.tree import Tree
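As a small illustration (a sketch assuming the n-gram helpers are among the utilities exported above), `bigrams` and `ngrams` turn a sequence into successive tuples:

    >>> list(bigrams([1, 2, 3, 4]))
    [(1, 2), (2, 3), (3, 4)]
    >>> list(ngrams([1, 2, 3, 4], 3))
    [(1, 2, 3), (2, 3, 4)]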
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
=================
=================
WordNet is just another NLTK corpus reader, and can be imported like this:
+ >>> from __future__ import print_function, unicode_literals
>>> from nltk.corpus import wordnet
For more compact code, we recommend:
'nob', 'pol', 'por', 'qcn', 'slv', 'spa', 'swe', 'tha', 'zsm']
>>> wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn')
[Synset('dog.n.01'), Synset('spy.n.01')]
-
+
wn.synset('spy.n.01').lemma_names('jpn') # doctest: +NORMALIZE_WHITESPACE
['\u3044\u306c', '\u307e\u308f\u3057\u8005', '\u30b9\u30d1\u30a4', '\u56de\u3057\u8005',
'\u56de\u8005', '\u5bc6\u5075', '\u5de5\u4f5c\u54e1', '\u5efb\u3057\u8005',
'\u5efb\u8005', '\u63a2', '\u63a2\u308a', '\u72ac', '\u79d8\u5bc6\u635c\u67fb\u54e1',
'\u8adc\u5831\u54e1', '\u8adc\u8005', '\u9593\u8005', '\u9593\u8adc', '\u96a0\u5bc6']
-
+
>>> wn.synset('dog.n.01').lemma_names('ita')
['cane', 'Canis_familiaris']
>>> wn.lemmas('cane', lang='ita') # doctest: +NORMALIZE_WHITESPACE
- [Lemma('dog.n.01.cane'), Lemma('cramp.n.02.cane'), Lemma('hammer.n.01.cane'), Lemma('bad_person.n.01.cane'),
+ [Lemma('dog.n.01.cane'), Lemma('cramp.n.02.cane'), Lemma('hammer.n.01.cane'), Lemma('bad_person.n.01.cane'),
Lemma('incompetent.n.01.cane')]
>>> sorted(wn.synset('dog.n.01').lemmas('dan')) # doctest: +NORMALIZE_WHITESPACE
[Lemma('dog.n.01.hund'), Lemma('dog.n.01.k\xf8ter'),
Lemma('dog.n.01.vovhund'), Lemma('dog.n.01.vovse')]
-
+
sorted(wn.synset('dog.n.01').lemmas('por'))
[Lemma('dog.n.01.cachorra'), Lemma('dog.n.01.cachorro'), Lemma('dog.n.01.cadela'), Lemma('dog.n.01.c\xe3o')]
-
+
>>> dog_lemma = wn.lemma(b'dog.n.01.c\xc3\xa3o'.decode('utf-8'), lang='por')
>>> dog_lemma
Lemma('dog.n.01.c\xe3o')
>>> dog_lemma.lang()
'por'
- >>> len(list(wordnet.all_lemma_names(pos='n', lang='jpn')))
+ >>> len(wordnet.all_lemma_names(pos='n', lang='jpn'))
64797
-------
Synset('leonberg.n.01'), Synset('mexican_hairless.n.01'),
Synset('newfoundland.n.01'), Synset('pooch.n.01'), Synset('poodle.n.01'), ...]
>>> list(dog.closure(hyper)) # doctest: +NORMALIZE_WHITESPACE
- [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'), Synset('animal.n.01'),
+ [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'), Synset('animal.n.01'),
Synset('placental.n.01'), Synset('organism.n.01'), Synset('mammal.n.01'), Synset('living_thing.n.01'),
Synset('vertebrate.n.01'), Synset('whole.n.02'), Synset('chordate.n.01'), Synset('object.n.01'),
Synset('physical_entity.n.01'), Synset('entity.n.01')]
# -*- coding: utf-8 -*-
+from __future__ import absolute_import
def teardown_module(module=None):
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
===============================
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT
.. -*- coding: utf-8 -*-
# Natural Language Toolkit: Texts
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
regular expression search over tokenized strings, and
distributional similarity.
"""
+from __future__ import print_function, division, unicode_literals, absolute_import
from math import log
from collections import defaultdict, Counter, namedtuple
from functools import reduce
import re
-import sys
-from nltk.lm import MLE
-from nltk.lm.preprocessing import padded_everygram_pipeline
+from six import text_type
+
from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist as CFD
from nltk.util import tokenwrap, LazyConcatenation
from nltk.metrics import f_measure, BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
-from nltk.tokenize import sent_tokenize
+from nltk.compat import python_2_unicode_compatible
ConcordanceLine = namedtuple(
- "ConcordanceLine",
- ["left", "query", "right", "offset", "left_print", "right_print", "line"],
+ 'ConcordanceLine',
+ ['left', 'query', 'right', 'offset', 'left_print', 'right_print', 'line'],
)
@staticmethod
def _default_context(tokens, i):
"""One left token and one right token, normalized to lowercase"""
- left = tokens[i - 1].lower() if i != 0 else "*START*"
- right = tokens[i + 1].lower() if i != len(tokens) - 1 else "*END*"
+ left = tokens[i - 1].lower() if i != 0 else '*START*'
+ right = tokens[i + 1].lower() if i != len(tokens) - 1 else '*END*'
return (left, right)
def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
return fd
-
+@python_2_unicode_compatible
class ConcordanceIndex(object):
"""
An index that can be used to look up the offset locations at which
return self._offsets[word]
def __repr__(self):
- return "<ConcordanceIndex for %d tokens (%d types)>" % (
+ return '<ConcordanceIndex for %d tokens (%d types)>' % (
len(self._tokens),
len(self._offsets),
)
left_context = self._tokens[max(0, i - context) : i]
right_context = self._tokens[i + 1 : i + context]
# Create the pretty lines with the query_word in the middle.
- left_print = " ".join(left_context)[-half_width:]
- right_print = " ".join(right_context)[:half_width]
+ left_print = ' '.join(left_context)[-half_width:]
+ right_print = ' '.join(right_context)[:half_width]
# The WYSIWYG line of the concordance.
- line_print = " ".join([left_print, query_word, right_print])
+ line_print = ' '.join([left_print, query_word, right_print])
# Create the ConcordanceLine
concordance_line = ConcordanceLine(
left_context,
"""
def __init__(self, tokens):
- self._raw = "".join("<" + w + ">" for w in tokens)
+ self._raw = ''.join('<' + w + '>' for w in tokens)
def findall(self, regexp):
"""
:type regexp: str
"""
# preprocess the regular expression
- regexp = re.sub(r"\s", "", regexp)
- regexp = re.sub(r"<", "(?:<(?:", regexp)
- regexp = re.sub(r">", ")>)", regexp)
- regexp = re.sub(r"(?<!\\)\.", "[^>]", regexp)
+ regexp = re.sub(r'\s', '', regexp)
+ regexp = re.sub(r'<', '(?:<(?:', regexp)
+ regexp = re.sub(r'>', ')>)', regexp)
+ regexp = re.sub(r'(?<!\\)\.', '[^>]', regexp)
# perform the search
hits = re.findall(regexp, self._raw)
# Sanity check
for h in hits:
- if not h.startswith("<") and h.endswith(">"):
- raise ValueError("Bad regexp for TokenSearcher.findall")
+ if not h.startswith('<') and h.endswith('>'):
+ raise ValueError('Bad regexp for TokenSearcher.findall')
# postprocess the output
- hits = [h[1:-1].split("><") for h in hits]
+ hits = [h[1:-1].split('><') for h in hits]
return hits
-
+@python_2_unicode_compatible
class Text(object):
"""
A wrapper around a sequence of simple (string) tokens, which is
if name:
self.name = name
- elif "]" in tokens[:20]:
- end = tokens[:20].index("]")
- self.name = " ".join(str(tok) for tok in tokens[1:end])
+ elif ']' in tokens[:20]:
+ end = tokens[:20].index(']')
+ self.name = " ".join(text_type(tok) for tok in tokens[1:end])
else:
- self.name = " ".join(str(tok) for tok in tokens[:8]) + "..."
+ self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."
# ////////////////////////////////////////////////////////////
# Support item & slice access
:seealso: ``ConcordanceIndex``
"""
- if "_concordance_index" not in self.__dict__:
+ if '_concordance_index' not in self.__dict__:
self._concordance_index = ConcordanceIndex(
self.tokens, key=lambda s: s.lower()
)
:seealso: ``ConcordanceIndex``
"""
- if "_concordance_index" not in self.__dict__:
+ if '_concordance_index' not in self.__dict__:
self._concordance_index = ConcordanceIndex(
self.tokens, key=lambda s: s.lower()
)
return self._concordance_index.find_concordance(word, width)[:lines]
- def collocation_list(self, num=20, window_size=2):
+ def collocations(self, num=20, window_size=2):
"""
- Return collocations derived from the text, ignoring stopwords.
-
- >>> from nltk.book import text4
- >>> text4.collocation_list()[:2]
- [('United', 'States'), ('fellow', 'citizens')]
+ Print collocations derived from the text, ignoring stopwords.
- :param num: The maximum number of collocations to return.
+ :seealso: find_collocations
+ :param num: The maximum number of collocations to print.
:type num: int
:param window_size: The number of tokens spanned by a collocation (default=2)
:type window_size: int
- :rtype: list(tuple(str, str))
"""
if not (
- "_collocations" in self.__dict__
+ '_collocations' in self.__dict__
and self._num == num
and self._window_size == window_size
):
# print("Building collocations list")
from nltk.corpus import stopwords
- ignored_words = stopwords.words("english")
+ ignored_words = stopwords.words('english')
finder = BigramCollocationFinder.from_words(self.tokens, window_size)
finder.apply_freq_filter(2)
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
bigram_measures = BigramAssocMeasures()
- self._collocations = list(finder.nbest(bigram_measures.likelihood_ratio, num))
- return self._collocations
-
- def collocations(self, num=20, window_size=2):
- """
- Print collocations derived from the text, ignoring stopwords.
-
- >>> from nltk.book import text4
- >>> text4.collocations() # doctest: +ELLIPSIS
- United States; fellow citizens; four years; ...
-
- :param num: The maximum number of collocations to print.
- :type num: int
- :param window_size: The number of tokens spanned by a collocation (default=2)
- :type window_size: int
- """
-
- collocation_strings = [
- w1 + " " + w2 for w1, w2 in self.collocation_list(num, window_size)
- ]
- print(tokenwrap(collocation_strings, separator="; "))
+ self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
+ colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
+ print(tokenwrap(colloc_strings, separator="; "))
def count(self, word):
"""
:type num: int
:seealso: ContextIndex.similar_words()
"""
- if "_word_context_index" not in self.__dict__:
+ if '_word_context_index' not in self.__dict__:
# print('Building word-context index...')
self._word_context_index = ContextIndex(
self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower()
Find contexts where the specified words appear; list
most frequent common contexts first.
- :param words: The words used to seed the similarity search
- :type words: str
+ :param word: The word used to seed the similarity search
+ :type word: str
:param num: The number of words to generate (default=20)
:type num: int
:seealso: ContextIndex.common_contexts()
"""
- if "_word_context_index" not in self.__dict__:
+ if '_word_context_index' not in self.__dict__:
# print('Building word-context index...')
self._word_context_index = ContextIndex(
self.tokens, key=lambda s: s.lower()
dispersion_plot(self, words)
- def _train_default_ngram_lm(self, tokenized_sents, n=3):
- train_data, padded_sents = padded_everygram_pipeline(n, tokenized_sents)
- model = MLE(order=n)
- model.fit(train_data, padded_sents)
- return model
-
- def generate(self, length=100, text_seed=None, random_seed=42):
+ def generate(self, words):
"""
- Print random text, generated using a trigram language model.
- See also `help(nltk.lm)`.
-
- :param length: The length of text to generate (default=100)
- :type length: int
-
- :param text_seed: Generation can be conditioned on preceding context.
- :type text_seed: list(str)
-
- :param random_seed: A random seed or an instance of `random.Random`. If provided,
- makes the random sampling part of generation reproducible. (default=42)
- :type random_seed: int
-
+ Issues a reminder to users following the book online
"""
- # Create the model when using it the first time.
- self._tokenized_sents = [
- sent.split(" ") for sent in sent_tokenize(" ".join(self.tokens))
- ]
- if not hasattr(self, "trigram_model"):
- print("Building ngram index...", file=sys.stderr)
- self._trigram_model = self._train_default_ngram_lm(
- self._tokenized_sents, n=3
- )
+ import warnings
- generated_tokens = []
-
- assert length > 0, "The `length` must be more than 0."
- while len(generated_tokens) < length:
- for idx, token in enumerate(
- self._trigram_model.generate(
- length, text_seed=text_seed, random_seed=random_seed
- )
- ):
- if token == "<s>":
- continue
- if token == "</s>":
- break
- generated_tokens.append(token)
- random_seed += 1
-
- prefix = " ".join(text_seed) + " " if text_seed else ""
- output_str = prefix + tokenwrap(generated_tokens[:length])
- print(output_str)
- return output_str
+ warnings.warn(
+ 'The generate() method is no longer available.', DeprecationWarning
+ )
def plot(self, *args):
"""
self._token_searcher = TokenSearcher(self)
hits = self._token_searcher.findall(regexp)
- hits = [" ".join(h) for h in hits]
+ hits = [' '.join(h) for h in hits]
print(tokenwrap(hits, "; "))
# ////////////////////////////////////////////////////////////
# Helper Methods
# ////////////////////////////////////////////////////////////
- _CONTEXT_RE = re.compile("\w+|[\.\!\?]")
+ _CONTEXT_RE = re.compile('\w+|[\.\!\?]')
def _context(self, tokens, i):
"""
j = i - 1
while j >= 0 and not self._CONTEXT_RE.match(tokens[j]):
j -= 1
- left = tokens[j] if j != 0 else "*START*"
+ left = tokens[j] if j != 0 else '*START*'
# Right context
j = i + 1
while j < len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
j += 1
- right = tokens[j] if j != len(tokens) else "*END*"
+ right = tokens[j] if j != len(tokens) else '*END*'
return (left, right)
# ////////////////////////////////////////////////////////////
def __str__(self):
- return "<Text: %s>" % self.name
+ return '<Text: %s>' % self.name
def __repr__(self):
- return "<Text: %s>" % self.name
+ return '<Text: %s>' % self.name
# Prototype only; this approach will be slow to load
"""
def __init__(self, source):
- if hasattr(source, "words"): # bridge to the text corpus reader
+ if hasattr(source, 'words'): # bridge to the text corpus reader
source = [source.words(f) for f in source.fileids()]
self._texts = source
if idf is None:
matches = len([True for text in self._texts if term in text])
if len(self._texts) == 0:
- raise ValueError("IDF undefined for empty document collection")
+ raise ValueError('IDF undefined for empty document collection')
idf = log(len(self._texts) / matches) if matches else 0.0
self._idf_cache[term] = idf
return idf
def demo():
from nltk.corpus import brown
- text = Text(brown.words(categories="news"))
+ text = Text(brown.words(categories='news'))
print(text)
print()
print("Concordance:")
- text.concordance("news")
+ text.concordance('news')
print()
print("Distributionally similar words:")
- text.similar("news")
+ text.similar('news')
print()
print("Collocations:")
text.collocations()
# text.generate()
# print()
print("Dispersion plot:")
- text.dispersion_plot(["news", "report", "said", "announced"])
+ text.dispersion_plot(['news', 'report', 'said', 'announced'])
print()
print("Vocabulary plot:")
text.plot(50)
print("Indexing:")
print("text[3]:", text[3])
print("text[3:5]:", text[3:5])
- print("text.vocab()['news']:", text.vocab()["news"])
+ print("text.vocab()['news']:", text.vocab()['news'])
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
__all__ = [
#
# Natural Language Toolkit: TGrep search
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Will Roberts <wildwilhelm@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-"""
+'''
============================================
TGrep search implementation for NLTK trees
============================================
predicates must always pass the value of these arguments on. The
top-level predicate (constructed by ``_tgrep_exprs_action``) binds the
macro definitions to ``m`` and initialises ``l`` to an empty dictionary.
-"""
+'''
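+# Illustrative usage sketch (an editorial note, not part of this patch; names as
+# assumed from the public nltk.tgrep API): a compiled pattern is just a predicate
+# over ParentedTree nodes, and the top-level predicate threads the macro dict
+# ``m`` and the label dict ``l`` through every sub-predicate for you.
+#
+#     >>> from nltk.tree import ParentedTree
+#     >>> from nltk.tgrep import tgrep_positions
+#     >>> t = ParentedTree.fromstring('(S (NP (DT the) (NN dog)) (VP (VBZ barks)))')
+#     >>> list(tgrep_positions('NP < NN', [t]))  # expected (roughly): [[(0,)]]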
+
+from __future__ import absolute_import, print_function, unicode_literals
import functools
import re
+from six import binary_type, text_type
+
try:
import pyparsing
except ImportError:
- print("Warning: nltk.tgrep will not work without the `pyparsing` package")
- print("installed.")
+ print('Warning: nltk.tgrep will not work without the `pyparsing` package')
+ print('installed.')
import nltk.tree
class TgrepException(Exception):
- """Tgrep exception type."""
+ '''Tgrep exception type.'''
pass
def ancestors(node):
- """
+ '''
Returns the list of all nodes dominating the given tree node.
This method will not work with leaf nodes, since there is no way
to recover the parent.
- """
+ '''
results = []
try:
current = node.parent()
def unique_ancestors(node):
- """
+ '''
Returns the list of all nodes dominating the given node, where
there is only a single path of descent.
- """
+ '''
results = []
try:
current = node.parent()
def _descendants(node):
- """
+ '''
Returns the list of all nodes which are descended from the given
tree node in some way.
- """
+ '''
try:
treepos = node.treepositions()
except AttributeError:
def _leftmost_descendants(node):
- """
+ '''
Returns the set of all nodes descended in some way through
left branches from this node.
- """
+ '''
try:
treepos = node.treepositions()
except AttributeError:
def _rightmost_descendants(node):
- """
+ '''
Returns the set of all nodes descended in some way through
right branches from this node.
- """
+ '''
try:
rightmost_leaf = max(node.treepositions())
except AttributeError:
def _istree(obj):
- """Predicate to check whether `obj` is a nltk.tree.Tree."""
+ '''Predicate to check whether `obj` is a nltk.tree.Tree.'''
return isinstance(obj, nltk.tree.Tree)
def _unique_descendants(node):
- """
+ '''
Returns the list of all nodes descended from the given node, where
there is only a single path of descent.
- """
+ '''
results = []
current = node
while current and _istree(current) and len(current) == 1:
def _before(node):
- """
+ '''
Returns the set of all nodes that are before the given node.
- """
+ '''
try:
pos = node.treeposition()
tree = node.root()
def _immediately_before(node):
- """
+ '''
Returns the set of all nodes that are immediately before the given
node.
Tree node A immediately precedes node B if the last terminal
symbol (word) produced by A immediately precedes the first
terminal symbol produced by B.
- """
+ '''
try:
pos = node.treeposition()
tree = node.root()
def _after(node):
- """
+ '''
Returns the set of all nodes that are after the given node.
- """
+ '''
try:
pos = node.treeposition()
tree = node.root()
def _immediately_after(node):
- """
+ '''
Returns the set of all nodes that are immediately after the given
node.
Tree node A immediately follows node B if the first terminal
symbol (word) produced by A immediately follows the last
terminal symbol produced by B.
- """
+ '''
try:
pos = node.treeposition()
tree = node.root()
def _tgrep_node_literal_value(node):
- """
+ '''
Gets the string value of a given parse tree node, for comparison
using the tgrep node literal predicates.
- """
- return node.label() if _istree(node) else str(node)
+ '''
+ return node.label() if _istree(node) else text_type(node)
def _tgrep_macro_use_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function which looks up the macro name used.
- """
+ '''
assert len(tokens) == 1
- assert tokens[0][0] == "@"
+ assert tokens[0][0] == '@'
macro_name = tokens[0][1:]
def macro_use(n, m=None, l=None):
if m is None or macro_name not in m:
- raise TgrepException("macro {0} not defined".format(macro_name))
+ raise TgrepException('macro {0} not defined'.format(macro_name))
return m[macro_name](n, m, l)
return macro_use
def _tgrep_node_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a predicate on a tree node
depending on the name of its node.
- """
+ '''
+ # print 'node tokens: ', tokens
if tokens[0] == "'":
# strip initial apostrophe (tgrep2 print command)
tokens = tokens[1:]
if len(tokens) > 1:
# disjunctive definition of a node name
- assert list(set(tokens[1::2])) == ["|"]
+ assert list(set(tokens[1::2])) == ['|']
# recursively call self to interpret each node name definition
tokens = [_tgrep_node_action(None, None, [node]) for node in tokens[::2]]
# capture tokens and return the disjunction
return (lambda t: lambda n, m=None, l=None: any(f(n, m, l) for f in t))(tokens)
else:
- if hasattr(tokens[0], "__call__"):
+ if hasattr(tokens[0], '__call__'):
# this is a previously interpreted parenthetical node
# definition (lambda function)
return tokens[0]
- elif tokens[0] == "*" or tokens[0] == "__":
+ elif tokens[0] == '*' or tokens[0] == '__':
return lambda n, m=None, l=None: True
elif tokens[0].startswith('"'):
assert tokens[0].endswith('"')
- node_lit = tokens[0][1:-1].replace('\\"', '"').replace("\\\\", "\\")
+ node_lit = tokens[0][1:-1].replace('\\"', '"').replace('\\\\', '\\')
return (
lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s
)(node_lit)
- elif tokens[0].startswith("/"):
- assert tokens[0].endswith("/")
+ elif tokens[0].startswith('/'):
+ assert tokens[0].endswith('/')
node_lit = tokens[0][1:-1]
return (
lambda r: lambda n, m=None, l=None: r.search(
_tgrep_node_literal_value(n)
)
)(re.compile(node_lit))
- elif tokens[0].startswith("i@"):
+ elif tokens[0].startswith('i@'):
node_func = _tgrep_node_action(_s, _l, [tokens[0][2:].lower()])
return (
lambda f: lambda n, m=None, l=None: f(
def _tgrep_parens_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a predicate on a tree node
from a parenthetical notation.
- """
+ '''
+ # print 'parenthetical tokens: ', tokens
assert len(tokens) == 3
- assert tokens[0] == "("
- assert tokens[2] == ")"
+ assert tokens[0] == '('
+ assert tokens[2] == ')'
return tokens[1]
def _tgrep_nltk_tree_pos_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a predicate on a tree node
which returns true if the node is located at a specific tree
position.
- """
+ '''
# recover the tuple from the parsed string
node_tree_position = tuple(int(x) for x in tokens if x.isdigit())
# capture the node's tree position
return (
lambda i: lambda n, m=None, l=None: (
- hasattr(n, "treeposition") and n.treeposition() == i
+ hasattr(n, 'treeposition') and n.treeposition() == i
)
)(node_tree_position)
def _tgrep_relation_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a predicate on a tree node
depending on its relation to other nodes in the tree.
- """
+ '''
+ # print 'relation tokens: ', tokens
# process negation first if needed
negated = False
- if tokens[0] == "!":
+ if tokens[0] == '!':
negated = True
tokens = tokens[1:]
- if tokens[0] == "[":
+ if tokens[0] == '[':
# process square-bracketed relation expressions
assert len(tokens) == 3
- assert tokens[2] == "]"
+ assert tokens[2] == ']'
retval = tokens[1]
else:
# process operator-node relation expressions
assert len(tokens) == 2
operator, predicate = tokens
# A < B A is the parent of (immediately dominates) B.
- if operator == "<":
+ if operator == '<':
retval = lambda n, m=None, l=None: (
_istree(n) and any(predicate(x, m, l) for x in n)
)
# A > B A is the child of B.
- elif operator == ">":
+ elif operator == '>':
retval = lambda n, m=None, l=None: (
- hasattr(n, "parent")
+ hasattr(n, 'parent')
and bool(n.parent())
and predicate(n.parent(), m, l)
)
# A <, B Synonymous with A <1 B.
- elif operator == "<," or operator == "<1":
+ elif operator == '<,' or operator == '<1':
retval = lambda n, m=None, l=None: (
_istree(n) and bool(list(n)) and predicate(n[0], m, l)
)
# A >, B Synonymous with A >1 B.
- elif operator == ">," or operator == ">1":
+ elif operator == '>,' or operator == '>1':
retval = lambda n, m=None, l=None: (
- hasattr(n, "parent")
+ hasattr(n, 'parent')
and bool(n.parent())
and (n is n.parent()[0])
and predicate(n.parent(), m, l)
)
# A <N B B is the Nth child of A (the first child is <1).
- elif operator[0] == "<" and operator[1:].isdigit():
+ elif operator[0] == '<' and operator[1:].isdigit():
idx = int(operator[1:])
# capture the index parameter
retval = (
)
)(idx - 1)
# A >N B A is the Nth child of B (the first child is >1).
- elif operator[0] == ">" and operator[1:].isdigit():
+ elif operator[0] == '>' and operator[1:].isdigit():
idx = int(operator[1:])
# capture the index parameter
retval = (
lambda i: lambda n, m=None, l=None: (
- hasattr(n, "parent")
+ hasattr(n, 'parent')
and bool(n.parent())
and 0 <= i < len(n.parent())
and (n is n.parent()[i])
)(idx - 1)
# A <' B B is the last child of A (also synonymous with A <-1 B).
# A <- B B is the last child of A (synonymous with A <-1 B).
- elif operator == "<'" or operator == "<-" or operator == "<-1":
+ elif operator == '<\'' or operator == '<-' or operator == '<-1':
retval = lambda n, m=None, l=None: (
_istree(n) and bool(list(n)) and predicate(n[-1], m, l)
)
# A >' B A is the last child of B (also synonymous with A >-1 B).
# A >- B A is the last child of B (synonymous with A >-1 B).
- elif operator == ">'" or operator == ">-" or operator == ">-1":
+ elif operator == '>\'' or operator == '>-' or operator == '>-1':
retval = lambda n, m=None, l=None: (
- hasattr(n, "parent")
+ hasattr(n, 'parent')
and bool(n.parent())
and (n is n.parent()[-1])
and predicate(n.parent(), m, l)
)
# A <-N B B is the N th-to-last child of A (the last child is <-1).
- elif operator[:2] == "<-" and operator[2:].isdigit():
+ elif operator[:2] == '<-' and operator[2:].isdigit():
idx = -int(operator[2:])
# capture the index parameter
retval = (
)
)(idx)
# A >-N B A is the N th-to-last child of B (the last child is >-1).
- elif operator[:2] == ">-" and operator[2:].isdigit():
+ elif operator[:2] == '>-' and operator[2:].isdigit():
idx = -int(operator[2:])
# capture the index parameter
retval = (
lambda i: lambda n, m=None, l=None: (
- hasattr(n, "parent")
+ hasattr(n, 'parent')
and bool(n.parent())
and 0 <= (i + len(n.parent())) < len(n.parent())
and (n is n.parent()[i + len(n.parent())])
)
)(idx)
# A <: B B is the only child of A
- elif operator == "<:":
+ elif operator == '<:':
retval = lambda n, m=None, l=None: (
_istree(n) and len(n) == 1 and predicate(n[0], m, l)
)
# A >: B A is the only child of B.
- elif operator == ">:":
+ elif operator == '>:':
retval = lambda n, m=None, l=None: (
- hasattr(n, "parent")
+ hasattr(n, 'parent')
and bool(n.parent())
and len(n.parent()) == 1
and predicate(n.parent(), m, l)
)
# A << B A dominates B (A is an ancestor of B).
- elif operator == "<<":
+ elif operator == '<<':
retval = lambda n, m=None, l=None: (
_istree(n) and any(predicate(x, m, l) for x in _descendants(n))
)
# A >> B A is dominated by B (A is a descendant of B).
- elif operator == ">>":
+ elif operator == '>>':
retval = lambda n, m=None, l=None: any(
predicate(x, m, l) for x in ancestors(n)
)
# A <<, B B is a left-most descendant of A.
- elif operator == "<<," or operator == "<<1":
+ elif operator == '<<,' or operator == '<<1':
retval = lambda n, m=None, l=None: (
_istree(n) and any(predicate(x, m, l) for x in _leftmost_descendants(n))
)
# A >>, B A is a left-most descendant of B.
- elif operator == ">>,":
+ elif operator == '>>,':
retval = lambda n, m=None, l=None: any(
(predicate(x, m, l) and n in _leftmost_descendants(x))
for x in ancestors(n)
)
# A <<' B B is a right-most descendant of A.
- elif operator == "<<'":
+ elif operator == '<<\'':
retval = lambda n, m=None, l=None: (
_istree(n)
and any(predicate(x, m, l) for x in _rightmost_descendants(n))
)
# A >>' B A is a right-most descendant of B.
- elif operator == ">>'":
+ elif operator == '>>\'':
retval = lambda n, m=None, l=None: any(
(predicate(x, m, l) and n in _rightmost_descendants(x))
for x in ancestors(n)
)
# A <<: B There is a single path of descent from A and B is on it.
- elif operator == "<<:":
+ elif operator == '<<:':
retval = lambda n, m=None, l=None: (
_istree(n) and any(predicate(x, m, l) for x in _unique_descendants(n))
)
# A >>: B There is a single path of descent from B and A is on it.
- elif operator == ">>:":
+ elif operator == '>>:':
retval = lambda n, m=None, l=None: any(
predicate(x, m, l) for x in unique_ancestors(n)
)
# A . B A immediately precedes B.
- elif operator == ".":
+ elif operator == '.':
retval = lambda n, m=None, l=None: any(
predicate(x, m, l) for x in _immediately_after(n)
)
# A , B A immediately follows B.
- elif operator == ",":
+ elif operator == ',':
retval = lambda n, m=None, l=None: any(
predicate(x, m, l) for x in _immediately_before(n)
)
# A .. B A precedes B.
- elif operator == "..":
+ elif operator == '..':
retval = lambda n, m=None, l=None: any(
predicate(x, m, l) for x in _after(n)
)
# A ,, B A follows B.
- elif operator == ",,":
+ elif operator == ',,':
retval = lambda n, m=None, l=None: any(
predicate(x, m, l) for x in _before(n)
)
# A $ B A is a sister of B (and A != B).
- elif operator == "$" or operator == "%":
+ elif operator == '$' or operator == '%':
retval = lambda n, m=None, l=None: (
- hasattr(n, "parent")
+ hasattr(n, 'parent')
and bool(n.parent())
and any(predicate(x, m, l) for x in n.parent() if x is not n)
)
# A $. B A is a sister of and immediately precedes B.
- elif operator == "$." or operator == "%.":
+ elif operator == '$.' or operator == '%.':
retval = lambda n, m=None, l=None: (
- hasattr(n, "right_sibling")
+ hasattr(n, 'right_sibling')
and bool(n.right_sibling())
and predicate(n.right_sibling(), m, l)
)
# A $, B A is a sister of and immediately follows B.
- elif operator == "$," or operator == "%,":
+ elif operator == '$,' or operator == '%,':
retval = lambda n, m=None, l=None: (
- hasattr(n, "left_sibling")
+ hasattr(n, 'left_sibling')
and bool(n.left_sibling())
and predicate(n.left_sibling(), m, l)
)
# A $.. B A is a sister of and precedes B.
- elif operator == "$.." or operator == "%..":
+ elif operator == '$..' or operator == '%..':
retval = lambda n, m=None, l=None: (
- hasattr(n, "parent")
- and hasattr(n, "parent_index")
+ hasattr(n, 'parent')
+ and hasattr(n, 'parent_index')
and bool(n.parent())
and any(predicate(x, m, l) for x in n.parent()[n.parent_index() + 1 :])
)
# A $,, B A is a sister of and follows B.
- elif operator == "$,," or operator == "%,,":
+ elif operator == '$,,' or operator == '%,,':
retval = lambda n, m=None, l=None: (
- hasattr(n, "parent")
- and hasattr(n, "parent_index")
+ hasattr(n, 'parent')
+ and hasattr(n, 'parent_index')
and bool(n.parent())
and any(predicate(x, m, l) for x in n.parent()[: n.parent_index()])
)
return retval
-def _tgrep_conjunction_action(_s, _l, tokens, join_char="&"):
- """
+def _tgrep_conjunction_action(_s, _l, tokens, join_char='&'):
+ '''
Builds a lambda function representing a predicate on a tree node
from the conjunction of several other such lambda functions.
tokens[0] is a tgrep_expr predicate; tokens[1:] are an (optional)
list of segmented patterns (`tgrep_expr_labeled`, processed by
`_tgrep_segmented_pattern_action`).
- """
+ '''
# filter out the ampersand
tokens = [x for x in tokens if x != join_char]
+ # print 'relation conjunction tokens: ', tokens
if len(tokens) == 1:
return tokens[0]
else:
def _tgrep_segmented_pattern_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a segmented pattern.
Called for expressions like (`tgrep_expr_labeled`)::
parse action to the pred use inside a node_expr. See
`_tgrep_node_label_use_action` and
`_tgrep_node_label_pred_use_action`.
- """
+ '''
# tokens[0] is a string containing the node label
node_label = tokens[0]
# tokens[1:] is an (optional) list of predicates which must all
reln_preds = tokens[1:]
def pattern_segment_pred(n, m=None, l=None):
- """This predicate function ignores its node argument."""
+ '''This predicate function ignores its node argument.'''
# look up the bound node using its label
if l is None or node_label not in l:
raise TgrepException(
- "node_label ={0} not bound in pattern".format(node_label)
+ 'node_label ={0} not bound in pattern'.format(node_label)
)
node = l[node_label]
# match the relation predicates against the node
def _tgrep_node_label_use_action(_s, _l, tokens):
- """
+ '''
Returns the node label used to begin a tgrep_expr_labeled. See
`_tgrep_segmented_pattern_action`.
expression (see `_tgrep_segmented_pattern_action`).
It returns the node label.
- """
+ '''
assert len(tokens) == 1
- assert tokens[0].startswith("=")
+ assert tokens[0].startswith('=')
return tokens[0][1:]
def _tgrep_node_label_pred_use_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a predicate on a tree node
which describes the use of a previously bound node label.
relation). The predicate returns true if and only if its node
argument is identical to the node looked up in the node label
dictionary using the node's label.
- """
+ '''
assert len(tokens) == 1
- assert tokens[0].startswith("=")
+ assert tokens[0].startswith('=')
node_label = tokens[0][1:]
def node_label_use_pred(n, m=None, l=None):
# look up the bound node using its label
if l is None or node_label not in l:
raise TgrepException(
- "node_label ={0} not bound in pattern".format(node_label)
+ 'node_label ={0} not bound in pattern'.format(node_label)
)
node = l[node_label]
# truth means the given node is this node
def _tgrep_bind_node_label_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a predicate on a tree node
which can optionally bind a matching node into the tgrep2 string's
label_dict.
/NP/
@NP=n
- """
+ '''
# tokens[0] is a tgrep_node_expr
if len(tokens) == 1:
return tokens[0]
# if present, tokens[1] is the character '=', and tokens[2] is
# a tgrep_node_label, a string value containing the node label
assert len(tokens) == 3
- assert tokens[1] == "="
+ assert tokens[1] == '='
node_pred = tokens[0]
node_label = tokens[2]
# bind `n` into the dictionary `l`
if l is None:
raise TgrepException(
- "cannot bind node_label {0}: label_dict is None".format(
+ 'cannot bind node_label {0}: label_dict is None'.format(
node_label
)
)
def _tgrep_rel_disjunction_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a predicate on a tree node
from the disjunction of several other such lambda functions.
- """
+ '''
# filter out the pipe
- tokens = [x for x in tokens if x != "|"]
+ tokens = [x for x in tokens if x != '|']
+ # print 'relation disjunction tokens: ', tokens
if len(tokens) == 1:
return tokens[0]
elif len(tokens) == 2:
def _macro_defn_action(_s, _l, tokens):
- """
+ '''
Builds a dictionary structure which defines the given macro.
- """
+ '''
assert len(tokens) == 3
- assert tokens[0] == "@"
+ assert tokens[0] == '@'
return {tokens[1]: tokens[2]}
def _tgrep_exprs_action(_s, _l, tokens):
- """
+ '''
This is the top-level node in a tgrep2 search string; the
predicate function it returns binds together all the state of a
tgrep2 search string.
from the disjunction of several tgrep expressions. Also handles
macro definitions and macro name binding, and node label
definitions and node label binding.
- """
+ '''
if len(tokens) == 1:
return lambda n, m=None, l=None: tokens[0](n, None, {})
# filter out all the semicolons
- tokens = [x for x in tokens if x != ";"]
+ tokens = [x for x in tokens if x != ';']
# collect all macro definitions
macro_dict = {}
macro_defs = [tok for tok in tokens if isinstance(tok, dict)]
def _build_tgrep_parser(set_parse_actions=True):
- """
+ '''
Builds a pyparsing-based parser object for tokenizing and
interpreting tgrep search strings.
- """
- tgrep_op = pyparsing.Optional("!") + pyparsing.Regex("[$%,.<>][%,.<>0-9-':]*")
+ '''
+ tgrep_op = pyparsing.Optional('!') + pyparsing.Regex('[$%,.<>][%,.<>0-9-\':]*')
tgrep_qstring = pyparsing.QuotedString(
- quoteChar='"', escChar="\\", unquoteResults=False
+ quoteChar='"', escChar='\\', unquoteResults=False
)
tgrep_node_regex = pyparsing.QuotedString(
- quoteChar="/", escChar="\\", unquoteResults=False
+ quoteChar='/', escChar='\\', unquoteResults=False
)
tgrep_qstring_icase = pyparsing.Regex('i@\\"(?:[^"\\n\\r\\\\]|(?:\\\\.))*\\"')
- tgrep_node_regex_icase = pyparsing.Regex("i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/")
- tgrep_node_literal = pyparsing.Regex("[^][ \r\t\n;:.,&|<>()$!@%'^=]+")
+ tgrep_node_regex_icase = pyparsing.Regex('i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/')
+ tgrep_node_literal = pyparsing.Regex('[^][ \r\t\n;:.,&|<>()$!@%\'^=]+')
tgrep_expr = pyparsing.Forward()
tgrep_relations = pyparsing.Forward()
- tgrep_parens = pyparsing.Literal("(") + tgrep_expr + ")"
+ tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')'
tgrep_nltk_tree_pos = (
- pyparsing.Literal("N(")
+ pyparsing.Literal('N(')
+ pyparsing.Optional(
pyparsing.Word(pyparsing.nums)
- + ","
+ + ','
+ pyparsing.Optional(
- pyparsing.delimitedList(pyparsing.Word(pyparsing.nums), delim=",")
- + pyparsing.Optional(",")
+ pyparsing.delimitedList(pyparsing.Word(pyparsing.nums), delim=',')
+ + pyparsing.Optional(',')
)
)
- + ")"
+ + ')'
)
- tgrep_node_label = pyparsing.Regex("[A-Za-z0-9]+")
- tgrep_node_label_use = pyparsing.Combine("=" + tgrep_node_label)
+ tgrep_node_label = pyparsing.Regex('[A-Za-z0-9]+')
+ tgrep_node_label_use = pyparsing.Combine('=' + tgrep_node_label)
# see _tgrep_segmented_pattern_action
tgrep_node_label_use_pred = tgrep_node_label_use.copy()
- macro_name = pyparsing.Regex("[^];:.,&|<>()[$!@%'^=\r\t\n ]+")
- macro_name.setWhitespaceChars("")
- macro_use = pyparsing.Combine("@" + macro_name)
+ macro_name = pyparsing.Regex('[^];:.,&|<>()[$!@%\'^=\r\t\n ]+')
+ macro_name.setWhitespaceChars('')
+ macro_use = pyparsing.Combine('@' + macro_name)
tgrep_node_expr = (
tgrep_node_label_use_pred
| macro_use
| tgrep_node_regex_icase
| tgrep_qstring
| tgrep_node_regex
- | "*"
+ | '*'
| tgrep_node_literal
)
tgrep_node_expr2 = (
tgrep_node_expr
- + pyparsing.Literal("=").setWhitespaceChars("")
- + tgrep_node_label.copy().setWhitespaceChars("")
+ + pyparsing.Literal('=').setWhitespaceChars('')
+ + tgrep_node_label.copy().setWhitespaceChars('')
) | tgrep_node_expr
tgrep_node = tgrep_parens | (
pyparsing.Optional("'")
+ tgrep_node_expr2
+ pyparsing.ZeroOrMore("|" + tgrep_node_expr)
)
- tgrep_brackets = pyparsing.Optional("!") + "[" + tgrep_relations + "]"
+ tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']'
tgrep_relation = tgrep_brackets | (tgrep_op + tgrep_node)
tgrep_rel_conjunction = pyparsing.Forward()
tgrep_rel_conjunction << (
tgrep_relation
- + pyparsing.ZeroOrMore(pyparsing.Optional("&") + tgrep_rel_conjunction)
+ + pyparsing.ZeroOrMore(pyparsing.Optional('&') + tgrep_rel_conjunction)
)
tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore(
"|" + tgrep_relations
)
tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations)
tgrep_expr_labeled = tgrep_node_label_use + pyparsing.Optional(tgrep_relations)
- tgrep_expr2 = tgrep_expr + pyparsing.ZeroOrMore(":" + tgrep_expr_labeled)
+ tgrep_expr2 = tgrep_expr + pyparsing.ZeroOrMore(':' + tgrep_expr_labeled)
macro_defn = (
- pyparsing.Literal("@") + pyparsing.White().suppress() + macro_name + tgrep_expr2
+ pyparsing.Literal('@') + pyparsing.White().suppress() + macro_name + tgrep_expr2
)
tgrep_exprs = (
- pyparsing.Optional(macro_defn + pyparsing.ZeroOrMore(";" + macro_defn) + ";")
+ pyparsing.Optional(macro_defn + pyparsing.ZeroOrMore(';' + macro_defn) + ';')
+ tgrep_expr2
- + pyparsing.ZeroOrMore(";" + (macro_defn | tgrep_expr2))
- + pyparsing.ZeroOrMore(";").suppress()
+ + pyparsing.ZeroOrMore(';' + (macro_defn | tgrep_expr2))
+ + pyparsing.ZeroOrMore(';').suppress()
)
if set_parse_actions:
tgrep_node_label_use.setParseAction(_tgrep_node_label_use_action)
tgrep_expr.setParseAction(_tgrep_conjunction_action)
tgrep_expr_labeled.setParseAction(_tgrep_segmented_pattern_action)
tgrep_expr2.setParseAction(
- functools.partial(_tgrep_conjunction_action, join_char=":")
+ functools.partial(_tgrep_conjunction_action, join_char=':')
)
tgrep_exprs.setParseAction(_tgrep_exprs_action)
- return tgrep_exprs.ignore("#" + pyparsing.restOfLine)
+ return tgrep_exprs.ignore('#' + pyparsing.restOfLine)
def tgrep_tokenize(tgrep_string):
- """
+ '''
Tokenizes a TGrep search string into separate tokens.
- """
+ '''
parser = _build_tgrep_parser(False)
- if isinstance(tgrep_string, bytes):
+ if isinstance(tgrep_string, binary_type):
tgrep_string = tgrep_string.decode()
return list(parser.parseString(tgrep_string))
def tgrep_compile(tgrep_string):
- """
+ '''
Parses (and tokenizes, if necessary) a TGrep search string into a
lambda function.
- """
+ '''
parser = _build_tgrep_parser(True)
- if isinstance(tgrep_string, bytes):
+ if isinstance(tgrep_string, binary_type):
tgrep_string = tgrep_string.decode()
return list(parser.parseString(tgrep_string, parseAll=True))[0]
def treepositions_no_leaves(tree):
- """
+ '''
Returns all the tree positions in the given tree which are not
leaf nodes.
- """
+ '''
treepositions = tree.treepositions()
# leaves are treeposition tuples that are not prefixes of any
# other treeposition
:rtype: iter(tree positions)
"""
- if isinstance(pattern, (bytes, str)):
+ if isinstance(pattern, (binary_type, text_type)):
pattern = tgrep_compile(pattern)
for tree in trees:
:rtype: iter(tree nodes)
"""
- if isinstance(pattern, (bytes, str)):
+ if isinstance(pattern, (binary_type, text_type)):
pattern = tgrep_compile(pattern)
for tree in trees:
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Tokenizers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# Contributors: matthewmc, clouds56
from nltk.data import load
from nltk.tokenize.casual import TweetTokenizer, casual_tokenize
from nltk.tokenize.mwe import MWETokenizer
-from nltk.tokenize.destructive import NLTKWordTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.regexp import (
RegexpTokenizer,
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
-from nltk.tokenize.sonority_sequencing import SyllableTokenizer
# Standard sentence tokenizer.
-def sent_tokenize(text, language="english"):
+def sent_tokenize(text, language='english'):
"""
Return a sentence-tokenized copy of *text*,
using NLTK's recommended sentence tokenizer
:param text: text to split into sentences
:param language: the model name in the Punkt corpus
"""
- tokenizer = load("tokenizers/punkt/{0}.pickle".format(language))
+ tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
return tokenizer.tokenize(text)
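# Illustrative behaviour of the English Punkt model (assumed, not part of this patch):
#     >>> sent_tokenize("Hello world. How are you? Fine, thanks.")
#     ['Hello world.', 'How are you?', 'Fine, thanks.']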
# Standard word tokenizer.
-_treebank_word_tokenizer = NLTKWordTokenizer()
-
-
-def word_tokenize(text, language="english", preserve_line=False):
+_treebank_word_tokenizer = TreebankWordTokenizer()
+
+# See discussion on https://github.com/nltk/nltk/pull/1437
+# Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on
+# - chevron quotes u'\xab' and u'\xbb'.
+# - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
+# See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608
+# Also, behavior of splitting on clitics now follows Stanford CoreNLP
+# - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b
+improved_open_quote_regex = re.compile(u'([«“‘„]|[`]+)', re.U)
+improved_open_single_quote_regex = re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d)(\w)\b", re.U)
+improved_close_quote_regex = re.compile(u'([»”’])', re.U)
+improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
+_treebank_word_tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
+_treebank_word_tokenizer.STARTING_QUOTES.append((improved_open_single_quote_regex, r'\1 \2'))
+_treebank_word_tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
+_treebank_word_tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))
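+# Illustrative effect of the extra rules above (assumed, not part of this patch):
+#     >>> word_tokenize(u"«Hello», it's me")
+#     ['«', 'Hello', '»', ',', 'it', "'s", 'me']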
+
+
+def word_tokenize(text, language='english', preserve_line=False):
"""
Return a tokenized copy of *text*,
using NLTK's recommended word tokenizer
# Natural Language Toolkit: Tokenizer Interface
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
Tokenizer Interface
"""
-from abc import ABC, abstractmethod
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
from nltk.internals import overridden
from nltk.tokenize.util import string_span_tokenize
-class TokenizerI(ABC):
+@add_metaclass(ABCMeta)
+class TokenizerI(object):
"""
A processing interface for tokenizing a string.
Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
on the specified string (defined in subclasses).
"""
- @property
- @abstractmethod
- def _string(self):
- raise NotImplementedError
-
def tokenize(self, s):
return s.split(self._string)
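# Illustrative sketch (an assumption, not part of this patch): per the interface
# docstring, a concrete tokenizer only needs to define tokenize() or
# tokenize_sents(), e.g.
#
#     class CommaTokenizer(TokenizerI):
#         def tokenize(self, s):
#             return s.split(',')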
#
# Natural Language Toolkit: Twitter Tokenizer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
# Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
# Pierpaolo Pantone <> (modifications)
######################################################################
-import regex # https://github.com/nltk/nltk/issues/2409
-import html
+from __future__ import unicode_literals
+import re
+
+from six import int2byte, unichr
+from six.moves import html_entities
######################################################################
# The following strings are components in the regular expression
######################################################################
# This is the core tokenizing regex:
-WORD_RE = regex.compile(r"""(%s)""" % "|".join(REGEXPS), regex.VERBOSE | regex.I | regex.UNICODE)
+WORD_RE = re.compile(r"""(%s)""" % "|".join(REGEXPS), re.VERBOSE | re.I | re.UNICODE)
# WORD_RE performs poorly on these patterns:
-HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")
+HANG_RE = re.compile(r'([^a-zA-Z0-9])\1{3,}')
# The emoticon string gets its own regex so that we can preserve case for
# them as needed:
-EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)
+EMOTICON_RE = re.compile(EMOTICONS, re.VERBOSE | re.I | re.UNICODE)
# These are for regularizing HTML entities to Unicode:
-ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")
+ENT_RE = re.compile(r'&(#?(x?))([^&;\s]+);')
######################################################################
######################################################################
-def _str_to_unicode(text, encoding=None, errors="strict"):
+def _str_to_unicode(text, encoding=None, errors='strict'):
if encoding is None:
- encoding = "utf-8"
+ encoding = 'utf-8'
if isinstance(text, bytes):
return text.decode(encoding, errors)
return text
-def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
+def _replace_html_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
"""
Remove entities from text by converting them to their
corresponding unicode character.
# Numeric character references in the 80-9F range are typically
# interpreted by browsers as representing the characters mapped
# to bytes 80-9F in the Windows-1252 encoding. For more info
- # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
+ # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
if 0x80 <= number <= 0x9F:
- return bytes((number,)).decode("cp1252")
+ return int2byte(number).decode('cp1252')
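# e.g. (illustrative) "&#147;" gives number 0x93, which cp1252 maps to '\u201c' ("“")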
except ValueError:
number = None
else:
if entity_body in keep:
return match.group(0)
else:
- number = html.entities.name2codepoint.get(entity_body)
+ number = html_entities.name2codepoint.get(entity_body)
if number is not None:
try:
- return chr(number)
+ return unichr(number)
except ValueError:
pass
if self.reduce_len:
text = reduce_lengthening(text)
# Shorten problematic sequences of characters
- safe_text = HANG_RE.sub(r"\1\1\1", text)
+ safe_text = HANG_RE.sub(r'\1\1\1', text)
# Tokenize:
words = WORD_RE.findall(safe_text)
# Possibly alter the case, but avoid changing emoticons like :D into :d:
Replace repeated character sequences of length 3 or greater with sequences
of length 3.
"""
- pattern = regex.compile(r"(.)\1{2,}")
+ pattern = re.compile(r"(.)\1{2,}")
return pattern.sub(r"\1\1\1", text)
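# e.g. (illustrative) reduce_lengthening("waaaaayyyy") -> "waaayyy"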
"""
Remove Twitter username handles from text.
"""
- pattern = regex.compile(
+ pattern = re.compile(
r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
)
- # Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly
- return pattern.sub(" ", text)
+ # Substitute handles with ' ' to ensure that text on either side of removed handles is tokenized correctly
+ return pattern.sub(' ', text)
######################################################################
+++ /dev/null
-# Natural Language Toolkit: NLTK's very own tokenizer.
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author:
-# URL: <http://nltk.sourceforge.net>
-# For license information, see LICENSE.TXT
-
-
-import re
-from nltk.tokenize.api import TokenizerI
-
-
-class MacIntyreContractions:
- """
- List of contractions adapted from Robert MacIntyre's tokenizer.
- """
-
- CONTRACTIONS2 = [
- r"(?i)\b(can)(?#X)(not)\b",
- r"(?i)\b(d)(?#X)('ye)\b",
- r"(?i)\b(gim)(?#X)(me)\b",
- r"(?i)\b(gon)(?#X)(na)\b",
- r"(?i)\b(got)(?#X)(ta)\b",
- r"(?i)\b(lem)(?#X)(me)\b",
- r"(?i)\b(mor)(?#X)('n)\b",
- r"(?i)\b(wan)(?#X)(na)\s",
- ]
- CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
- CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
-
-
-class NLTKWordTokenizer(TokenizerI):
- """
- The NLTK tokenizer that has improved upon the TreebankWordTokenizer.
-
- The tokenizer is "destructive" such that the regexes applied will munge the
- input string to a state beyond re-construction. It is possible to apply
- `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
- `NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to
- revert to the original string.
- """
-
- # Starting quotes.
- STARTING_QUOTES = [
- (re.compile(u"([«“‘„]|[`]+)", re.U), r" \1 "),
- (re.compile(r"^\""), r"``"),
- (re.compile(r"(``)"), r" \1 "),
- (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
- (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d)(\w)\b", re.U), r"\1 \2"),
- ]
-
- # Ending quotes.
- ENDING_QUOTES = [
- (re.compile(u"([»”’])", re.U), r" \1 "),
- (re.compile(r'"'), " '' "),
- (re.compile(r"(\S)(\'\')"), r"\1 \2 "),
- (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
- (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
- ]
-
- # For improvements for starting/closing quotes from TreebankWordTokenizer,
- # see discussion on https://github.com/nltk/nltk/pull/1437
- # Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on
- # - chervon quotes u'\xab' and u'\xbb' .
- # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
- # See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608
- # Also, behavior of splitting on clitics now follows Stanford CoreNLP
- # - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b
-
- # Punctuation.
- PUNCTUATION = [
- (re.compile(r'([^\.])(\.)([\]\)}>"\'' u"»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "),
- (re.compile(r"([:,])([^\d])"), r" \1 \2"),
- (re.compile(r"([:,])$"), r" \1 "),
- (re.compile(r"\.{2,}", re.U), r" \g<0> "), # See https://github.com/nltk/nltk/pull/2322
- (re.compile(r"[;@#$%&]"), r" \g<0> "),
- (
- re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
- r"\1 \2\3 ",
- ), # Handles the final period.
- (re.compile(r"[?!]"), r" \g<0> "),
- (re.compile(r"([^'])' "), r"\1 ' "),
- (re.compile(r"[*]", re.U), r" \g<0> "), # See https://github.com/nltk/nltk/pull/2322
- ]
-
- # Pads parentheses
- PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
-
- # Optionally: Convert parentheses, brackets and converts them to PTB symbols.
- CONVERT_PARENTHESES = [
- (re.compile(r"\("), "-LRB-"),
- (re.compile(r"\)"), "-RRB-"),
- (re.compile(r"\["), "-LSB-"),
- (re.compile(r"\]"), "-RSB-"),
- (re.compile(r"\{"), "-LCB-"),
- (re.compile(r"\}"), "-RCB-"),
- ]
-
- DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
-
- # List of contractions adapted from Robert MacIntyre's tokenizer.
- _contractions = MacIntyreContractions()
- CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
- CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
-
- def tokenize(self, text, convert_parentheses=False, return_str=False):
- for regexp, substitution in self.STARTING_QUOTES:
- text = regexp.sub(substitution, text)
-
- for regexp, substitution in self.PUNCTUATION:
- text = regexp.sub(substitution, text)
-
- # Handles parentheses.
- regexp, substitution = self.PARENS_BRACKETS
- text = regexp.sub(substitution, text)
- # Optionally convert parentheses
- if convert_parentheses:
- for regexp, substitution in self.CONVERT_PARENTHESES:
- text = regexp.sub(substitution, text)
-
- # Handles double dash.
- regexp, substitution = self.DOUBLE_DASHES
- text = regexp.sub(substitution, text)
-
- # add extra space to make things easier
- text = " " + text + " "
-
- for regexp, substitution in self.ENDING_QUOTES:
- text = regexp.sub(substitution, text)
-
- for regexp in self.CONTRACTIONS2:
- text = regexp.sub(r" \1 \2 ", text)
- for regexp in self.CONTRACTIONS3:
- text = regexp.sub(r" \1 \2 ", text)
-
- # We are not using CONTRACTIONS4 since
- # they are also commented out in the SED scripts
- # for regexp in self._contractions.CONTRACTIONS4:
- # text = regexp.sub(r' \1 \2 \3 ', text)
-
- return text if return_str else text.split()
# Multi-Word Expression tokenizer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Rob Malouf <rmalouf@mail.sdsu.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
into single tokens.
"""
- def __init__(self, mwes=None, separator="_"):
+ def __init__(self, mwes=None, separator='_'):
"""Initialize the multi-word tokenizer with a list of expressions and a
separator
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
"""
+from __future__ import unicode_literals
import io
import re
+from six import text_type
from nltk.corpus import perluniprops
from nltk.tokenize.api import TokenizerI
paragraph-based tokenization from mteval-14.pl; The sentence-based
tokenization is consistent with the other tokenizers available in NLTK.
+ >>> from six import text_type
>>> from nltk.tokenize.nist import NISTTokenizer
>>> nist = NISTTokenizer()
>>> s = "Good muffins cost $3.88 in New York."
"""
# Strip "skipped" tags
- STRIP_SKIP = re.compile("<skipped>"), ""
+ STRIP_SKIP = re.compile('<skipped>'), ''
# Strip end-of-line hyphenation and join lines
- STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
+ STRIP_EOL_HYPHEN = re.compile(u'\u2028'), ' '
# Tokenize punctuation.
- PUNCT = re.compile("([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 "
+ PUNCT = re.compile('([\{-\~\[-\` -\&\(-\+\:-\@\/])'), ' \\1 '
# Tokenize period and comma unless preceded by a digit.
- PERIOD_COMMA_PRECEED = re.compile("([^0-9])([\.,])"), "\\1 \\2 "
+ PERIOD_COMMA_PRECEED = re.compile('([^0-9])([\.,])'), '\\1 \\2 '
# Tokenize period and comma unless followed by a digit.
- PERIOD_COMMA_FOLLOW = re.compile("([\.,])([^0-9])"), " \\1 \\2"
+ PERIOD_COMMA_FOLLOW = re.compile('([\.,])([^0-9])'), ' \\1 \\2'
# Tokenize dash when preceded by a digit
- DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 "
+ DASH_PRECEED_DIGIT = re.compile('([0-9])(-)'), '\\1 \\2 '
LANG_DEPENDENT_REGEXES = [
PUNCT,
]
# Perluniprops characters used in NIST tokenizer.
- pup_number = str("".join(set(perluniprops.chars("Number")))) # i.e. \p{N}
- pup_punct = str("".join(set(perluniprops.chars("Punctuation")))) # i.e. \p{P}
- pup_symbol = str("".join(set(perluniprops.chars("Symbol")))) # i.e. \p{S}
+ pup_number = text_type(''.join(set(perluniprops.chars('Number')))) # i.e. \p{N}
+ pup_punct = text_type(''.join(set(perluniprops.chars('Punctuation')))) # i.e. \p{P}
+ pup_symbol = text_type(''.join(set(perluniprops.chars('Symbol')))) # i.e. \p{S}
# Python regexes need to escape some special symbols;
# see https://stackoverflow.com/q/45670950/610569
- number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
- punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
- symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)
+ number_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_number)
+ punct_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_punct)
+ symbol_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_symbol)
# Note: In the original perl implementation, \p{Z} and \p{Zl} were used to
# (i) strip trailing and leading spaces and
# (ii) de-duplicate spaces.
# In Python, this would do: ' '.join(str.strip().split())
# Thus, the next two lines were commented out.
- # Line_Separator = str(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl}
- # Separator = str(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z}
+ # Line_Separator = text_type(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl}
+ # Separator = text_type(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z}
# Pads runs of ASCII characters with spaces, separating them from adjacent non-ASCII text.
- NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
+ NONASCII = re.compile('([\x00-\x7f]+)'), r' \1 '
# Tokenize any punctuation unless followed AND preceded by a digit.
PUNCT_1 = (
- re.compile("([{n}])([{p}])".format(n=number_regex, p=punct_regex)),
- "\\1 \\2 ",
+ re.compile(u"([{n}])([{p}])".format(n=number_regex, p=punct_regex)),
+ '\\1 \\2 ',
)
PUNCT_2 = (
- re.compile("([{p}])([{n}])".format(n=number_regex, p=punct_regex)),
- " \\1 \\2",
+ re.compile(u"([{p}])([{n}])".format(n=number_regex, p=punct_regex)),
+ ' \\1 \\2',
)
# Tokenize symbols
- SYMBOLS = re.compile("([{s}])".format(s=symbol_regex)), " \\1 "
+ SYMBOLS = re.compile(u"([{s}])".format(s=symbol_regex)), ' \\1 '
INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]
return text
def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
- text = str(text)
+ text = text_type(text)
# Language independent regex.
text = self.lang_independent_sub(text)
# Language dependent regex.
if western_lang:
# Pad string with whitespace.
- text = " " + text + " "
+ text = ' ' + text + ' '
if lowercase:
text = text.lower()
for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
text = regexp.sub(substitution, text)
# Remove contiguous whitespaces.
- text = " ".join(text.split())
+ text = ' '.join(text.split())
# Finally, strips leading and trailing spaces
# and converts output string into unicode.
- text = str(text.strip())
+ text = text_type(text.strip())
return text if return_str else text.split()
def international_tokenize(
self, text, lowercase=False, split_non_ascii=True, return_str=False
):
- text = str(text)
+ text = text_type(text)
# Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
# first before unescaping.
regexp, substitution = self.STRIP_SKIP
# Make sure that there's only one space only between words.
# Strip leading and trailing spaces.
- text = " ".join(text.strip().split())
+ text = ' '.join(text.strip().split())
return text if return_str else text.split()
# Natural Language Toolkit: Punkt sentence tokenizer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Algorithm: Kiss & Strunk (2006)
# Author: Willy <willy@csse.unimelb.edu.au> (original Python port)
# Steven Bird <stevenbird1@gmail.com> (additions)
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence
Boundary Detection. Computational Linguistics 32: 485-525.
"""
+from __future__ import print_function, unicode_literals, division
# TODO: Make orthographic heuristic less susceptible to overtraining
# TODO: Frequent sentence starters optionally exclude always-capitalised words
import math
from collections import defaultdict
+from six import string_types
+
+from nltk.compat import unicode_repr, python_2_unicode_compatible
from nltk.probability import FreqDist
from nltk.tokenize.api import TokenizerI
"""Orthographic context: occurs with lower case."""
_ORTHO_MAP = {
- ("initial", "upper"): _ORTHO_BEG_UC,
- ("internal", "upper"): _ORTHO_MID_UC,
- ("unknown", "upper"): _ORTHO_UNK_UC,
- ("initial", "lower"): _ORTHO_BEG_LC,
- ("internal", "lower"): _ORTHO_MID_LC,
- ("unknown", "lower"): _ORTHO_UNK_LC,
+ ('initial', 'upper'): _ORTHO_BEG_UC,
+ ('internal', 'upper'): _ORTHO_MID_UC,
+ ('unknown', 'upper'): _ORTHO_UNK_UC,
+ ('initial', 'lower'): _ORTHO_BEG_LC,
+ ('internal', 'lower'): _ORTHO_MID_LC,
+ ('unknown', 'lower'): _ORTHO_UNK_LC,
}
"""A map from context position and first-letter case to the
appropriate orthographic context flag."""
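# e.g. (illustrative) _ORTHO_MAP[('initial', 'upper')] is _ORTHO_BEG_UC; the values
# are bit flags (cf. the `c & _ORTHO_*` tests in _debug_ortho_context below).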
# { Decision reasons for debugging
######################################################################
-REASON_DEFAULT_DECISION = "default decision"
-REASON_KNOWN_COLLOCATION = "known collocation (both words)"
-REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC = "abbreviation + orthographic heuristic"
-REASON_ABBR_WITH_SENTENCE_STARTER = "abbreviation + frequent sentence starter"
-REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC = "initial + orthographic heuristic"
-REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC = "initial + orthographic heuristic"
+REASON_DEFAULT_DECISION = 'default decision'
+REASON_KNOWN_COLLOCATION = 'known collocation (both words)'
+REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC = 'abbreviation + orthographic heuristic'
+REASON_ABBR_WITH_SENTENCE_STARTER = 'abbreviation + frequent sentence starter'
+REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC = 'initial + orthographic heuristic'
+REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC = 'initial + orthographic heuristic'
REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC = (
- "initial + special orthographic heuristic"
+ 'initial + special orthographic heuristic'
)
constructors.
"""
- __slots__ = ("_re_period_context", "_re_word_tokenizer")
+ __slots__ = ('_re_period_context', '_re_word_tokenizer')
def __getstate__(self):
# All modifications to the class are performed by inheritance.
def __setstate__(self, state):
return 1
- sent_end_chars = (".", "?", "!")
+ sent_end_chars = ('.', '?', '!')
"""Characters which are candidates for sentence boundaries"""
@property
def _re_sent_end_chars(self):
- return "[%s]" % re.escape("".join(self.sent_end_chars))
+ return '[%s]' % re.escape(''.join(self.sent_end_chars))
- internal_punctuation = ",:;" # might want to extend this..
+ internal_punctuation = ',:;' # might want to extend this..
"""sentence internal punctuation, which indicates an abbreviation if
preceded by a period-final token."""
_re_multi_char_punct = r"(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)"
"""Hyphen and ellipsis are multi-character punctuation"""
- _word_tokenize_fmt = r"""(
+ _word_tokenize_fmt = r'''(
%(MultiChar)s
|
(?=%(WordStart)s)\S+? # Accept word characters until end is found
)
|
\S
- )"""
+ )'''
"""Format of a regular expression to split punctuation from words,
excluding period."""
self._re_word_tokenizer = re.compile(
self._word_tokenize_fmt
% {
- "NonWord": self._re_non_word_chars,
- "MultiChar": self._re_multi_char_punct,
- "WordStart": self._re_word_start,
+ 'NonWord': self._re_non_word_chars,
+ 'MultiChar': self._re_multi_char_punct,
+ 'WordStart': self._re_word_start,
},
re.UNICODE | re.VERBOSE,
)
self._re_period_context = re.compile(
self._period_context_fmt
% {
- "NonWord": self._re_non_word_chars,
- "SentEndChars": self._re_sent_end_chars,
+ 'NonWord': self._re_non_word_chars,
+ 'SentEndChars': self._re_sent_end_chars,
},
re.UNICODE | re.VERBOSE,
)
return self._re_period_context
-_re_non_punct = re.compile(r"[^\W\d]", re.UNICODE)
+_re_non_punct = re.compile(r'[^\W\d]', re.UNICODE)
"""Matches token types that are not merely punctuation. (Types for
numeric tokens are changed to ##number## and hence contain alpha.)"""
pair will have None as its second element.
"""
it = iter(it)
- try:
- prev = next(it)
- except StopIteration:
- return
+ prev = next(it)
for el in it:
yield (prev, el)
prev = el
def _debug_ortho_context(self, typ):
c = self.ortho_context[typ]
if c & _ORTHO_BEG_UC:
- yield "BEG-UC"
+ yield 'BEG-UC'
if c & _ORTHO_MID_UC:
- yield "MID-UC"
+ yield 'MID-UC'
if c & _ORTHO_UNK_UC:
- yield "UNK-UC"
+ yield 'UNK-UC'
if c & _ORTHO_BEG_LC:
- yield "BEG-LC"
+ yield 'BEG-LC'
if c & _ORTHO_MID_LC:
- yield "MID-LC"
+ yield 'MID-LC'
if c & _ORTHO_UNK_LC:
- yield "UNK-LC"
+ yield 'UNK-LC'
######################################################################
######################################################################
+@python_2_unicode_compatible
class PunktToken(object):
"""Stores a token of text with annotations produced during
sentence boundary detection."""
- _properties = ["parastart", "linestart", "sentbreak", "abbr", "ellipsis"]
- __slots__ = ["tok", "type", "period_final"] + _properties
+ _properties = ['parastart', 'linestart', 'sentbreak', 'abbr', 'ellipsis']
+ __slots__ = ['tok', 'type', 'period_final'] + _properties
def __init__(self, tok, **params):
self.tok = tok
self.type = self._get_type(tok)
- self.period_final = tok.endswith(".")
+ self.period_final = tok.endswith('.')
for p in self._properties:
setattr(self, p, None)
# { Regular expressions for properties
# ////////////////////////////////////////////////////////////
# Note: [A-Za-z] is approximated by [^\W\d] in the general case.
- _RE_ELLIPSIS = re.compile(r"\.\.+$")
- _RE_NUMERIC = re.compile(r"^-?[\.,]?\d[\d,\.-]*\.?$")
- _RE_INITIAL = re.compile(r"[^\W\d]\.$", re.UNICODE)
- _RE_ALPHA = re.compile(r"[^\W\d]+$", re.UNICODE)
+ _RE_ELLIPSIS = re.compile(r'\.\.+$')
+ _RE_NUMERIC = re.compile(r'^-?[\.,]?\d[\d,\.-]*\.?$')
+ _RE_INITIAL = re.compile(r'[^\W\d]\.$', re.UNICODE)
+ _RE_ALPHA = re.compile(r'[^\W\d]+$', re.UNICODE)
# ////////////////////////////////////////////////////////////
# { Derived properties
def _get_type(self, tok):
"""Returns a case-normalized representation of the token."""
- return self._RE_NUMERIC.sub("##number##", tok.lower())
+ return self._RE_NUMERIC.sub('##number##', tok.lower())
@property
def type_no_period(self):
"""
The type with its final period removed if it has one.
"""
- if len(self.type) > 1 and self.type[-1] == ".":
+ if len(self.type) > 1 and self.type[-1] == '.':
return self.type[:-1]
return self.type
@property
def first_case(self):
if self.first_lower:
- return "lower"
+ return 'lower'
elif self.first_upper:
- return "upper"
- return "none"
+ return 'upper'
+ return 'none'
@property
def is_ellipsis(self):
@property
def is_number(self):
"""True if the token text is that of a number."""
- return self.type.startswith("##number##")
+ return self.type.startswith('##number##')
@property
def is_initial(self):
with eval(), which lists all the token's non-default
annotations.
"""
- typestr = " type=%s," % repr(self.type) if self.type != self.tok else ""
+ typestr = ' type=%s,' % unicode_repr(self.type) if self.type != self.tok else ''
- propvals = ", ".join(
- "%s=%s" % (p, repr(getattr(self, p)))
+ propvals = ', '.join(
+ '%s=%s' % (p, unicode_repr(getattr(self, p)))
for p in self._properties
if getattr(self, p)
)
- return "%s(%s,%s %s)" % (
+ return '%s(%s,%s %s)' % (
self.__class__.__name__,
- repr(self.tok),
+ unicode_repr(self.tok),
typestr,
propvals,
)
"""
res = self.tok
if self.abbr:
- res += "<A>"
+ res += '<A>'
if self.ellipsis:
- res += "<E>"
+ res += '<E>'
if self.sentbreak:
- res += "<S>"
+ res += '<S>'
return res
respectively.
"""
parastart = False
- for line in plaintext.split("\n"):
+ for line in plaintext.split('\n'):
if line.strip():
line_toks = iter(self._lang_vars.word_tokenize(line))
- try:
- tok = next(line_toks)
- except StopIteration:
- continue
-
- yield self._Token(tok, parastart=parastart, linestart=True)
+ yield self._Token(next(line_toks), parastart=parastart, linestart=True)
parastart = False
for t in line_toks:
aug_tok.sentbreak = True
elif aug_tok.is_ellipsis:
aug_tok.ellipsis = True
- elif aug_tok.period_final and not tok.endswith(".."):
+ elif aug_tok.period_final and not tok.endswith('..'):
if (
tok[:-1].lower() in self._params.abbrev_types
- or tok[:-1].lower().split("-")[-1] in self._params.abbrev_types
+ or tok[:-1].lower().split('-')[-1] in self._params.abbrev_types
):
aug_tok.abbr = True
if is_add:
self._params.abbrev_types.add(abbr)
if verbose:
- print((" Abbreviation: [%6.4f] %s" % (score, abbr)))
+ print((' Abbreviation: [%6.4f] %s' % (score, abbr)))
else:
if not is_add:
self._params.abbrev_types.remove(abbr)
if verbose:
- print((" Removed abbreviation: [%6.4f] %s" % (score, abbr)))
+ print((' Removed abbreviation: [%6.4f] %s' % (score, abbr)))
# Make a preliminary pass through the document, marking likely
# sentence breaks, abbreviations, and ellipsis tokens.
if self._is_rare_abbrev_type(aug_tok1, aug_tok2):
self._params.abbrev_types.add(aug_tok1.type_no_period)
if verbose:
- print((" Rare Abbrev: %s" % aug_tok1.type))
+ print((' Rare Abbrev: %s' % aug_tok1.type))
# Does second token have a high likelihood of starting a sentence?
if self._is_potential_sent_starter(aug_tok2, aug_tok1):
for typ, ll in self._find_sent_starters():
self._params.sent_starters.add(typ)
if verbose:
- print((" Sent Starter: [%6.4f] %r" % (ll, typ)))
+ print((' Sent Starter: [%6.4f] %r' % (ll, typ)))
self._params.clear_collocations()
for (typ1, typ2), ll in self._find_collocations():
self._params.collocations.add((typ1, typ2))
if verbose:
- print((" Collocation: [%6.4f] %r+%r" % (ll, typ1, typ2)))
+ print((' Collocation: [%6.4f] %r+%r' % (ll, typ1, typ2)))
self._finalized = True
positions.
"""
# 'initial' or 'internal' or 'unknown'
- context = "internal"
+ context = 'internal'
tokens = list(tokens)
for aug_tok in tokens:
# that it's a sentence break. But err on the side of
# caution (by not positing a sentence break) if we just
# saw an abbreviation.
- if aug_tok.parastart and context != "unknown":
- context = "initial"
+ if aug_tok.parastart and context != 'unknown':
+ context = 'initial'
# If we're at the beginning of a line, then we can't decide
# between 'internal' and 'initial'.
- if aug_tok.linestart and context == "internal":
- context = "unknown"
+ if aug_tok.linestart and context == 'internal':
+ context = 'unknown'
# Find the case-normalized type of the token. If it's a
# sentence-final token, strip off the period.
# Decide whether the next word is at a sentence boundary.
if aug_tok.sentbreak:
if not (aug_tok.is_number or aug_tok.is_initial):
- context = "initial"
+ context = 'initial'
else:
- context = "unknown"
+ context = 'unknown'
elif aug_tok.ellipsis or aug_tok.abbr:
- context = "unknown"
+ context = 'unknown'
else:
- context = "internal"
+ context = 'internal'
# ////////////////////////////////////////////////////////////
# { Abbreviations
for typ in types:
# Check some basic conditions, to rule out words that are
# clearly not abbrev_types.
- if not _re_non_punct.search(typ) or typ == "##number##":
+ if not _re_non_punct.search(typ) or typ == '##number##':
continue
- if typ.endswith("."):
+ if typ.endswith('.'):
if typ in self._params.abbrev_types:
continue
typ = typ[:-1]
# Count how many periods & nonperiods are in the
# candidate.
- num_periods = typ.count(".") + 1
+ num_periods = typ.count('.') + 1
num_nonperiods = len(typ) - num_periods + 1
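# e.g. (illustrative) for the candidate 'u.s.a' (from the token type 'u.s.a.'):
# num_periods = 3, num_nonperiods = 3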
# Let <a> be the candidate without the period, and <b>
# indicates whether <ab> occurs as a single unit (high
# value of ll), or as two independent units <a> and
# <b> (low value of ll).
- count_with_period = self._type_fdist[typ + "."]
+ count_with_period = self._type_fdist[typ + '.']
count_without_period = self._type_fdist[typ]
ll = self._dunning_log_likelihood(
count_with_period + count_without_period,
This fails to include abbreviations otherwise found as "rare".
"""
self._params.clear_abbrevs()
- tokens = (typ for typ in self._type_fdist if typ and typ.endswith("."))
+ tokens = (typ for typ in self._type_fdist if typ and typ.endswith('.'))
for abbr, score, is_add in self._reclassify_abbrev_types(tokens):
if score >= self.ABBREV:
self._params.abbrev_types.add(abbr)
continue
col_count = self._collocation_fdist[types]
- typ1_count = self._type_fdist[typ1] + self._type_fdist[typ1 + "."]
- typ2_count = self._type_fdist[typ2] + self._type_fdist[typ2 + "."]
+ typ1_count = self._type_fdist[typ1] + self._type_fdist[typ1 + '.']
+ typ2_count = self._type_fdist[typ2] + self._type_fdist[typ2 + '.']
if (
typ1_count > 1
and typ2_count > 1
continue
typ_at_break_count = self._sent_starter_fdist[typ]
- typ_count = self._type_fdist[typ] + self._type_fdist[typ + "."]
+ typ_count = self._type_fdist[typ] + self._type_fdist[typ + '.']
if typ_count < typ_at_break_count:
# needed after freq_threshold
continue
given. Repeated calls to this method destroy previous parameters. For
incremental training, instantiate a separate PunktTrainer instance.
"""
- if not isinstance(train_text, str):
+ if not isinstance(train_text, string_types):
return train_text
return PunktTrainer(
train_text, lang_vars=self._lang_vars, token_cls=self._Token
"""
for match in self._lang_vars.period_context_re().finditer(text):
- decision_text = match.group() + match.group("after_tok")
+ decision_text = match.group() + match.group('after_tok')
tokens = self._tokenize_words(decision_text)
tokens = list(self._annotate_first_pass(tokens))
while not tokens[0].period_final:
def _slices_from_text(self, text):
last_break = 0
for match in self._lang_vars.period_context_re().finditer(text):
- context = match.group() + match.group("after_tok")
+ context = match.group() + match.group('after_tok')
if self.text_contains_sentbreak(context):
yield slice(last_break, match.end())
- if match.group("next_tok"):
+ if match.group('next_tok'):
# next sentence starts after whitespace
- last_break = match.start("next_tok")
+ last_break = match.start('next_tok')
else:
# next sentence starts at following punctuation
last_break = match.end()
pos = 0
# A regular expression that finds pieces of whitespace:
- WS_REGEXP = re.compile(r"\s*")
+ WS_REGEXP = re.compile(r'\s*')
- sentence = ""
+ sentence = ''
for aug_tok in tokens:
tok = aug_tok.tok
# token doesn't match, see if adding whitespace helps.
# If so, then use the version with whitespace.
if text[pos : pos + len(tok)] != tok:
- pat = "\s*".join(re.escape(c) for c in tok)
+ pat = '\s*'.join(re.escape(c) for c in tok)
m = re.compile(pat).match(text, pos)
if m:
tok = m.group()
# If we're at a sentence break, then start a new sentence.
if aug_tok.sentbreak:
yield sentence
- sentence = ""
+ sentence = ''
# If the last sentence is empty, discard it.
if sentence:
# [XX] TESTING
def dump(self, tokens):
- print("writing to /tmp/punkt.new...")
- with open("/tmp/punkt.new", "w") as outfile:
+ print('writing to /tmp/punkt.new...')
+ with open('/tmp/punkt.new', 'w') as outfile:
for aug_tok in tokens:
if aug_tok.parastart:
- outfile.write("\n\n")
+ outfile.write('\n\n')
elif aug_tok.linestart:
- outfile.write("\n")
+ outfile.write('\n')
else:
- outfile.write(" ")
+ outfile.write(' ')
outfile.write(str(aug_tok))
# { Customization Variables
# ////////////////////////////////////////////////////////////
- PUNCTUATION = tuple(";:,.!?")
+ PUNCTUATION = tuple(';:,.!?')
# ////////////////////////////////////////////////////////////
# { Annotation Procedures
# [4.3. Token-Based Detection of Initials and Ordinals]
# Check if any initials or ordinals tokens that are marked
# as sentbreaks should be reclassified as abbreviations.
- if tok_is_initial or typ == "##number##":
+ if tok_is_initial or typ == '##number##':
# [4.1.1. Orthographic Heuristic] Check if there's
# orthographic evidence about whether the next word
# heuristic is unknown, and next word is always
# capitalized, then mark as abbrev (eg: J. Bach).
if (
- is_sent_starter == "unknown"
+ is_sent_starter == 'unknown'
and tok_is_initial
and aug_tok2.first_upper
and not (self._params.ortho_context[next_typ] & _ORTHO_LC)
return False
# Otherwise, we're not sure.
- return "unknown"
+ return 'unknown'
-DEBUG_DECISION_FMT = """Text: %(text)r (at offset %(period_index)d)
+DEBUG_DECISION_FMT = '''Text: %(text)r (at offset %(period_index)d)
Sentence break? %(break_decision)s (%(reason)s)
Collocation? %(collocation)s
%(type1)r:
known sentence starter: %(type2_is_sent_starter)s
orthographic heuristic suggests is a sentence starter? %(type2_ortho_heuristic)s
orthographic contexts in training: %(type2_ortho_contexts)s
-"""
+'''
def format_debug_decision(d):
def demo(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
"""Builds a punkt model and applies it to the same text"""
cleanup = (
- lambda s: re.compile(r"(?:\r|^\s+)", re.MULTILINE).sub("", s).replace("\n", " ")
+ lambda s: re.compile(r'(?:\r|^\s+)', re.MULTILINE).sub('', s).replace('\n', ' ')
)
trainer = train_cls()
trainer.INCLUDE_ALL_COLLOCS = True
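Before moving on to the regexp-based tokenizers, here is a minimal usage sketch of the Punkt machinery above (hedged: "corpus.txt" is a hypothetical training file; only the PunktTrainer / PunktSentenceTokenizer API shown in this module is assumed):

from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer

# Hypothetical plain-text training corpus; Punkt is unsupervised, so any
# large sample of the target language will do.
train_text = open("corpus.txt", encoding="utf8").read()

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(train_text)

# Reuse the learned parameters for sentence splitting.
tokenizer = PunktSentenceTokenizer(trainer.get_params())
print(tokenizer.tokenize("Dr. Brown arrived at 9 a.m. He left soon after."))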
# Natural Language Toolkit: Tokenizers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Trevor Cohn <tacohn@csse.unimelb.edu.au>
``re`` functions, where the pattern is always the first argument.
(This is for consistency with the other NLTK tokenizers.)
"""
+from __future__ import unicode_literals
import re
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import regexp_span_tokenize
+from nltk.compat import python_2_unicode_compatible
+@python_2_unicode_compatible
class RegexpTokenizer(TokenizerI):
"""
A tokenizer that splits a string using a regular expression, which
flags=re.UNICODE | re.MULTILINE | re.DOTALL,
):
# If they gave us a regexp object, extract the pattern.
- pattern = getattr(pattern, "pattern", pattern)
+ pattern = getattr(pattern, 'pattern', pattern)
self._pattern = pattern
self._gaps = gaps
yield m.span()
def __repr__(self):
- return "%s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)" % (
+ return '%s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)' % (
self.__class__.__name__,
self._pattern,
self._gaps,
"""
def __init__(self):
- RegexpTokenizer.__init__(self, r"\s+", gaps=True)
+ RegexpTokenizer.__init__(self, r'\s+', gaps=True)
class BlanklineTokenizer(RegexpTokenizer):
"""
def __init__(self):
- RegexpTokenizer.__init__(self, r"\s*\n\s*\n\s*", gaps=True)
+ RegexpTokenizer.__init__(self, r'\s*\n\s*\n\s*', gaps=True)
class WordPunctTokenizer(RegexpTokenizer):
"""
def __init__(self):
- RegexpTokenizer.__init__(self, r"\w+|[^\w\s]+")
+ RegexpTokenizer.__init__(self, r'\w+|[^\w\s]+')
######################################################################
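A short sketch of the regexp-based tokenizers defined above, using the usual muffins example (the outputs follow from the patterns in this module):

>>> from nltk.tokenize import RegexpTokenizer, WordPunctTokenizer
>>> s = "Good muffins cost $3.88\nin New York."
>>> RegexpTokenizer(r'\w+|\$[\d\.]+|\S+').tokenize(s)
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.']
>>> WordPunctTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']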
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals, print_function
+
import os
import re
import sys
import subprocess
import tempfile
+from six import text_type
+
from nltk.data import ZipFilePathPointer
from nltk.internals import find_dir
(u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
>>> for sent in tokenizer.tokenize_sents(sents): # doctest: +SKIP
- ... print(sent) # doctest: +SKIP
+ ... print sent # doctest: +SKIP
...
(u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
(u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
(u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
>>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True): # doctest: +SKIP
- ... print(sent) # doctest: +SKIP
+ ... print sent # doctest: +SKIP
...
[(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)]
[(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)]
[(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)]
"""
- def __init__(self, repp_dir, encoding="utf8"):
+ def __init__(self, repp_dir, encoding='utf8'):
self.repp_dir = self.find_repptokenizer(repp_dir)
# Set a directory to store the temporary files.
self.working_dir = tempfile.gettempdir()
:rtype: iter(tuple(str))
"""
with tempfile.NamedTemporaryFile(
- prefix="repp_input.", dir=self.working_dir, mode="w", delete=False
+ prefix='repp_input.', dir=self.working_dir, mode='w', delete=False
) as input_file:
# Write sentences to temporary input file.
for sent in sentences:
- input_file.write(str(sent) + "\n")
+ input_file.write(text_type(sent) + '\n')
input_file.close()
# Generate command to run REPP.
cmd = self.generate_repp_command(input_file.name)
:param inputfilename: path to the input file
:type inputfilename: str
"""
- cmd = [self.repp_dir + "/src/repp"]
- cmd += ["-c", self.repp_dir + "/erg/repp.set"]
- cmd += ["--format", "triple"]
+ cmd = [self.repp_dir + '/src/repp']
+ cmd += ['-c', self.repp_dir + '/erg/repp.set']
+ cmd += ['--format', 'triple']
cmd += [inputfilename]
return cmd
:return: an iterable of the tokenized sentences as tuples of strings
:rtype: iter(tuple)
"""
- line_regex = re.compile("^\((\d+), (\d+), (.+)\)$", re.MULTILINE)
- for section in repp_output.split("\n\n"):
+ line_regex = re.compile('^\((\d+), (\d+), (.+)\)$', re.MULTILINE)
+ for section in repp_output.split('\n\n'):
words_with_positions = [
(token, int(start), int(end))
for start, end, token in line_regex.findall(section)
if os.path.exists(repp_dirname): # If a full path is given.
_repp_dir = repp_dirname
else: # Try to find path to REPP directory in environment variables.
- _repp_dir = find_dir(repp_dirname, env_vars=("REPP_TOKENIZER",))
+ _repp_dir = find_dir(repp_dirname, env_vars=('REPP_TOKENIZER',))
# Checks for the REPP binary and erg/repp.set config file.
- assert os.path.exists(_repp_dir + "/src/repp")
- assert os.path.exists(_repp_dir + "/erg/repp.set")
+ assert os.path.exists(_repp_dir + '/src/repp')
+ assert os.path.exists(_repp_dir + '/erg/repp.set')
return _repp_dir
# Natural Language Toolkit: Tokenizers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Yoav Goldberg <yoavg@cs.bgu.ac.il>
# Steven Bird <stevenbird1@gmail.com> (minor edits)
# URL: <http://nltk.sourceforge.net>
:param strict: If true, then raise an exception when tokenizing an ill-formed sexpr.
"""
- def __init__(self, parens="()", strict=True):
+ def __init__(self, parens='()', strict=True):
if len(parens) != 2:
- raise ValueError("parens must contain exactly two strings")
+ raise ValueError('parens must contain exactly two strings')
self._strict = strict
self._open_paren = parens[0]
self._close_paren = parens[1]
self._paren_regexp = re.compile(
- "%s|%s" % (re.escape(parens[0]), re.escape(parens[1]))
+ '%s|%s' % (re.escape(parens[0]), re.escape(parens[1]))
)
def tokenize(self, text):
depth += 1
if paren == self._close_paren:
if self._strict and depth == 0:
- raise ValueError("Un-matched close paren at char %d" % m.start())
+ raise ValueError('Un-matched close paren at char %d' % m.start())
depth = max(0, depth - 1)
if depth == 0:
result.append(text[pos : m.end()])
pos = m.end()
if self._strict and depth > 0:
- raise ValueError("Un-matched open paren at char %d" % pos)
+ raise ValueError('Un-matched open paren at char %d' % pos)
if pos < len(text):
result.append(text[pos:])
return result
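A brief illustration of the s-expression tokenizer above, with the default '()' parens and strict mode:

>>> from nltk.tokenize import SExprTokenizer
>>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
['(a b (c d))', 'e', 'f', '(g)']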
# Natural Language Toolkit: Simple Tokenizers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.sourceforge.net>
to specify the tokenization conventions when building a `CorpusReader`.
"""
-
+from __future__ import unicode_literals
from nltk.tokenize.api import TokenizerI, StringTokenizer
from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
"""
- _string = " "
+ _string = ' '
class TabTokenizer(StringTokenizer):
['a', 'b c\n', ' d']
"""
- _string = "\t"
+ _string = '\t'
class CharTokenizer(StringTokenizer):
a corresponding token ``''`` after that newline.
"""
- def __init__(self, blanklines="discard"):
- valid_blanklines = ("discard", "keep", "discard-eof")
+ def __init__(self, blanklines='discard'):
+ valid_blanklines = ('discard', 'keep', 'discard-eof')
if blanklines not in valid_blanklines:
raise ValueError(
- "Blank lines must be one of: %s" % " ".join(valid_blanklines)
+ 'Blank lines must be one of: %s' % ' '.join(valid_blanklines)
)
self._blanklines = blanklines
def tokenize(self, s):
lines = s.splitlines()
# If requested, strip off blank lines.
- if self._blanklines == "discard":
+ if self._blanklines == 'discard':
lines = [l for l in lines if l.rstrip()]
- elif self._blanklines == "discard-eof":
+ elif self._blanklines == 'discard-eof':
if lines and not lines[-1].strip():
lines.pop()
return lines
# discard-eof not implemented
def span_tokenize(self, s):
- if self._blanklines == "keep":
- for span in string_span_tokenize(s, r"\n"):
+ if self._blanklines == 'keep':
+ for span in string_span_tokenize(s, r'\n'):
yield span
else:
- for span in regexp_span_tokenize(s, r"\n(\s+\n)*"):
+ for span in regexp_span_tokenize(s, r'\n(\s+\n)*'):
yield span
# XXX: it is stated in module docs that there are no function versions
-def line_tokenize(text, blanklines="discard"):
+def line_tokenize(text, blanklines='discard'):
return LineTokenizer(blanklines).tokenize(text)
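To make the blanklines modes concrete, a small sketch of LineTokenizer on the muffins example (note how 'keep' preserves the empty line that 'discard' drops):

>>> from nltk.tokenize import LineTokenizer
>>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
>>> LineTokenizer(blanklines='keep').tokenize(s)
['Good muffins cost $3.88', 'in New York.  Please buy me', 'two of them.', '', 'Thanks.']
>>> LineTokenizer(blanklines='discard').tokenize(s)
['Good muffins cost $3.88', 'in New York.  Please buy me', 'two of them.', 'Thanks.']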
+++ /dev/null
-# Natural Language Toolkit: Tokenizers
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Christopher Hench <chris.l.hench@gmail.com>
-# Alex Estes
-# URL: <http://nltk.sourceforge.net>
-# For license information, see LICENSE.TXT
-
-"""
-The Sonority Sequencing Principle (SSP) is a language agnostic algorithm proposed
-by Otto Jesperson in 1904. The sonorous quality of a phoneme is judged by the
-openness of the lips. Syllable breaks occur before troughs in sonority. For more
-on the SSP see Selkirk (1984).
-
-The default implementation uses the English alphabet, but the `sonority_hierarchy`
-can be modified to IPA or any other alphabet for the use-case. The SSP is a
-universal syllabification algorithm, but that does not mean it performs equally
-across languages. Bartlett et al. (2009) is a good benchmark for English accuracy
-if utilizing IPA (pg. 311).
-
-Importantly, if a custom hierarchy is supplied and vowels span across more than
-one level, they should be given separately to the `vowels` class attribute.
-
-References:
-- Otto Jespersen. 1904. Lehrbuch der Phonetik.
- Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
-- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
- In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
- Cambridge, MIT Press. pp. 107-136.
-- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
- In HLT-NAACL. pp. 308-316.
-"""
-
-import warnings
-
-import re
-from string import punctuation
-
-from nltk.tokenize.api import TokenizerI
-from nltk.util import ngrams
-
-
-class SyllableTokenizer(TokenizerI):
- """
- Syllabifies words based on the Sonority Sequencing Principle (SSP).
-
- >>> from nltk.tokenize import SyllableTokenizer
- >>> from nltk import word_tokenize
- >>> SSP = SyllableTokenizer()
- >>> SSP.tokenize('justification')
- ['jus', 'ti', 'fi', 'ca', 'tion']
- >>> text = "This is a foobar-like sentence."
- >>> [SSP.tokenize(token) for token in word_tokenize(text)]
- [['This'], ['is'], ['a'], ['foo', 'bar', '-', 'li', 'ke'], ['sen', 'ten', 'ce'], ['.']]
- """
-
- def __init__(self, lang="en", sonority_hierarchy=False):
- """
- :param lang: Language parameter, default is English, 'en'
- :type lang: str
- :param sonority_hierarchy: Sonority hierarchy according to the
- Sonority Sequencing Principle.
- :type sonority_hierarchy: list(str)
- """
- # Sonority hierarchy should be provided in descending order.
- # If vowels are spread across multiple levels, they should be
- # assigned to the self.vowels class attribute together; otherwise they
- # should be placed in the first index of the hierarchy.
- if not sonority_hierarchy and lang == "en":
- sonority_hierarchy = [
- "aeiouy", # vowels.
- "lmnrw", # nasals.
- "zvsf", # fricatives.
- "bcdgtkpqxhj", # stops.
- ]
-
- self.vowels = sonority_hierarchy[0]
- self.phoneme_map = {}
- for i, level in enumerate(sonority_hierarchy):
- for c in level:
- sonority_level = len(sonority_hierarchy) - i
- self.phoneme_map[c] = sonority_level
- self.phoneme_map[c.upper()] = sonority_level
-
- def assign_values(self, token):
- """
- Assigns each phoneme its value from the sonority hierarchy.
- Note: Sentence/text has to be tokenized first.
-
- :param token: Single word or token
- :type token: str
- :return: List of tuples, first element is character/phoneme and
- second is the sonority value.
- :rtype: list(tuple(str, int))
- """
- syllables_values = []
- for c in token:
- try:
- syllables_values.append((c, self.phoneme_map[c]))
- except KeyError:
- if c not in punctuation:
- warnings.warn(
- "Character not defined in sonority_hierarchy,"
- " assigning as vowel: '{}'".format(c)
- )
- syllables_values.append((c, max(self.phoneme_map.values())))
- self.vowels += c
- else: # If it's punctuation, assign -1.
- syllables_values.append((c, -1))
- return syllables_values
-
- def validate_syllables(self, syllable_list):
- """
- Ensures each syllable has at least one vowel.
- If the following syllable doesn't have a vowel, add it to the current one.
-
- :param syllable_list: Single word or token broken up into syllables.
- :type syllable_list: list(str)
- :return: Single word or token broken up into syllables
- (with added syllables if necessary)
- :rtype: list(str)
- """
- valid_syllables = []
- front = ""
- for i, syllable in enumerate(syllable_list):
- if syllable in punctuation:
- valid_syllables.append(syllable)
- continue
- if not re.search("|".join(self.vowels), syllable):
- if len(valid_syllables) == 0:
- front += syllable
- else:
- valid_syllables = valid_syllables[:-1] + [
- valid_syllables[-1] + syllable
- ]
- else:
- if len(valid_syllables) == 0:
- valid_syllables.append(front + syllable)
- else:
- valid_syllables.append(syllable)
-
- return valid_syllables
-
- def tokenize(self, token):
- """
- Apply the SSP to return a list of syllables.
- Note: Sentence/text has to be tokenized first.
-
- :param token: Single word or token
- :type token: str
- :return syllable_list: Single word or token broken up into syllables.
- :rtype: list(str)
- """
- # assign values from hierarchy
- syllables_values = self.assign_values(token)
-
- # if only one vowel return word
- if sum(token.count(x) for x in self.vowels) <= 1:
- return [token]
-
- syllable_list = []
- syllable = syllables_values[0][0] # start syllable with first phoneme
- for trigram in ngrams(syllables_values, n=3):
- phonemes, values = zip(*trigram)
- # Sonority of previous, focal and following phoneme
- prev_value, focal_value, next_value = values
- # Focal phoneme.
- focal_phoneme = phonemes[1]
-
- # These cases trigger syllable break.
- if focal_value == -1: # If it's a punctuation, just break.
- syllable_list.append(syllable)
- syllable_list.append(focal_phoneme)
- syllable = ""
- elif prev_value >= focal_value == next_value:
- syllable += focal_phoneme
- syllable_list.append(syllable)
- syllable = ""
-
- elif prev_value > focal_value < next_value:
- syllable_list.append(syllable)
- syllable = ""
- syllable += focal_phoneme
-
- # no syllable break
- else:
- syllable += focal_phoneme
-
- syllable += syllables_values[-1][0] # append last phoneme
- syllable_list.append(syllable)
-
- return self.validate_syllables(syllable_list)
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Tokenizer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Xu <xxu@student.unimelb.edu.au>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals, print_function
+
import tempfile
import os
import json
from subprocess import PIPE
import warnings
+from six import text_type
+
from nltk.internals import find_jar, config_java, java, _java_options
from nltk.tokenize.api import TokenizerI
from nltk.parse.corenlp import CoreNLPParser
-_stanford_url = "https://nlp.stanford.edu/software/tokenizer.shtml"
+_stanford_url = 'https://nlp.stanford.edu/software/tokenizer.shtml'
class StanfordTokenizer(TokenizerI):
['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
"""
- _JAR = "stanford-postagger.jar"
+ _JAR = 'stanford-postagger.jar'
def __init__(
self,
path_to_jar=None,
- encoding="utf8",
+ encoding='utf8',
options=None,
verbose=False,
- java_options="-mx1000m",
+ java_options='-mx1000m',
):
# Raise deprecation warning.
warnings.warn(
self._stanford_jar = find_jar(
self._JAR,
path_to_jar,
- env_vars=("STANFORD_POSTAGGER",),
+ env_vars=('STANFORD_POSTAGGER',),
searchpath=(),
url=_stanford_url,
verbose=verbose,
self.java_options = java_options
options = {} if options is None else options
- self._options_cmd = ",".join(
- "{0}={1}".format(key, val) for key, val in options.items()
+ self._options_cmd = ','.join(
+ '{0}={1}'.format(key, val) for key, val in options.items()
)
@staticmethod
"""
Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences.
"""
- cmd = ["edu.stanford.nlp.process.PTBTokenizer"]
+ cmd = ['edu.stanford.nlp.process.PTBTokenizer']
return self._parse_tokenized_output(self._execute(cmd, s))
def _execute(self, cmd, input_, verbose=False):
encoding = self._encoding
- cmd.extend(["-charset", encoding])
+ cmd.extend(['-charset', encoding])
_options_cmd = self._options_cmd
if _options_cmd:
- cmd.extend(["-options", self._options_cmd])
+ cmd.extend(['-options', self._options_cmd])
- default_options = " ".join(_java_options)
+ default_options = ' '.join(_java_options)
# Configure java.
config_java(options=self.java_options, verbose=verbose)
# Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
- with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
+ with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
# Write the actual sentences to the temporary input file
- if isinstance(input_, str) and encoding:
+ if isinstance(input_, text_type) and encoding:
input_ = input_.encode(encoding)
input_file.write(input_)
input_file.flush()
StanfordTokenizer()
except LookupError:
raise SkipTest(
- "doctests from nltk.tokenize.stanford are skipped because the stanford postagger jar doesn't exist"
+ 'doctests from nltk.tokenize.stanford are skipped because the stanford postagger jar doesn\'t exist'
)
# Natural Language Toolkit: Interface to the Stanford Segmenter
# for Chinese and Arabic
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: 52nlp <52nlpcn@gmail.com>
# Casper Lehmann-Strøm <casperlehmann@gmail.com>
# Alex Constantin <alex@keyworder.ch>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals, print_function
+
import tempfile
import os
import json
import warnings
from subprocess import PIPE
+from six import text_type
+
+from nltk import compat
from nltk.internals import (
find_jar,
find_file,
from nltk.tokenize.api import TokenizerI
-_stanford_url = "https://nlp.stanford.edu/software"
+_stanford_url = 'https://nlp.stanford.edu/software'
class StanfordSegmenter(TokenizerI):
<BLANKLINE>
"""
- _JAR = "stanford-segmenter.jar"
+ _JAR = 'stanford-segmenter.jar'
def __init__(
self,
path_to_model=None,
path_to_dict=None,
path_to_sihan_corpora_dict=None,
- sihan_post_processing="false",
- keep_whitespaces="false",
- encoding="UTF-8",
+ sihan_post_processing='false',
+ keep_whitespaces='false',
+ encoding='UTF-8',
options=None,
verbose=False,
- java_options="-mx2g",
+ java_options='-mx2g',
):
# Raise deprecation warning.
- warnings.simplefilter("always", DeprecationWarning)
+ warnings.simplefilter('always', DeprecationWarning)
warnings.warn(
str(
"\nThe StanfordTokenizer will "
DeprecationWarning,
stacklevel=2,
)
- warnings.simplefilter("ignore", DeprecationWarning)
+ warnings.simplefilter('ignore', DeprecationWarning)
stanford_segmenter = find_jar(
self._JAR,
path_to_jar,
- env_vars=("STANFORD_SEGMENTER",),
+ env_vars=('STANFORD_SEGMENTER',),
searchpath=(),
url=_stanford_url,
verbose=verbose,
)
if path_to_slf4j is not None:
slf4j = find_jar(
- "slf4j-api.jar",
+ 'slf4j-api.jar',
path_to_slf4j,
- env_vars=("SLF4J", "STANFORD_SEGMENTER"),
+ env_vars=('SLF4J', 'STANFORD_SEGMENTER'),
searchpath=(),
url=_stanford_url,
verbose=verbose,
self._encoding = encoding
self.java_options = java_options
options = {} if options is None else options
- self._options_cmd = ",".join(
- "{0}={1}".format(key, json.dumps(val)) for key, val in options.items()
+ self._options_cmd = ','.join(
+ '{0}={1}'.format(key, json.dumps(val)) for key, val in options.items()
)
def default_config(self, lang):
"""
search_path = ()
- if os.environ.get("STANFORD_SEGMENTER"):
- search_path = {os.path.join(os.environ.get("STANFORD_SEGMENTER"), "data")}
+ if os.environ.get('STANFORD_SEGMENTER'):
+ search_path = {os.path.join(os.environ.get('STANFORD_SEGMENTER'), 'data')}
# init for Chinese-specific files
self._dict = None
self._sihan_corpora_dict = None
- self._sihan_post_processing = "false"
+ self._sihan_post_processing = 'false'
- if lang == "ar":
+ if lang == 'ar':
self._java_class = (
- "edu.stanford.nlp.international.arabic.process.ArabicSegmenter"
+ 'edu.stanford.nlp.international.arabic.process.ArabicSegmenter'
)
- model = "arabic-segmenter-atb+bn+arztrain.ser.gz"
+ model = 'arabic-segmenter-atb+bn+arztrain.ser.gz'
- elif lang == "zh":
- self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier"
- model = "pku.gz"
- self._sihan_post_processing = "true"
+ elif lang == 'zh':
+ self._java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
+ model = 'pku.gz'
+ self._sihan_post_processing = 'true'
- path_to_dict = "dict-chris6.ser.gz"
+ path_to_dict = 'dict-chris6.ser.gz'
try:
self._dict = find_file(
path_to_dict,
searchpath=search_path,
url=_stanford_url,
verbose=False,
- env_vars=("STANFORD_MODELS",),
+ env_vars=('STANFORD_MODELS',),
)
except LookupError:
raise LookupError(
% path_to_dict
)
- sihan_dir = "./data/"
+ sihan_dir = './data/'
try:
path_to_sihan_dir = find_dir(
sihan_dir,
url=_stanford_url,
verbose=False,
- env_vars=("STANFORD_SEGMENTER",),
+ env_vars=('STANFORD_SEGMENTER',),
)
self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
except LookupError:
searchpath=search_path,
url=_stanford_url,
verbose=False,
- env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"),
+ env_vars=('STANFORD_MODELS', 'STANFORD_SEGMENTER'),
)
except LookupError:
raise LookupError(
"""
cmd = [
self._java_class,
- "-loadClassifier",
+ '-loadClassifier',
self._model,
- "-keepAllWhitespaces",
+ '-keepAllWhitespaces',
self._keep_whitespaces,
- "-textFile",
+ '-textFile',
input_file_path,
]
if self._sihan_corpora_dict is not None:
cmd.extend(
[
- "-serDictionary",
+ '-serDictionary',
self._dict,
- "-sighanCorporaDict",
+ '-sighanCorporaDict',
self._sihan_corpora_dict,
- "-sighanPostProcessing",
+ '-sighanPostProcessing',
self._sihan_post_processing,
]
)
_input_fh, self._input_file_path = tempfile.mkstemp(text=True)
# Write the actual sentences to the temporary input file
- _input_fh = os.fdopen(_input_fh, "wb")
- _input = "\n".join((" ".join(x) for x in sentences))
- if isinstance(_input, str) and encoding:
+ _input_fh = os.fdopen(_input_fh, 'wb')
+ _input = '\n'.join((' '.join(x) for x in sentences))
+ if isinstance(_input, text_type) and encoding:
_input = _input.encode(encoding)
_input_fh.write(_input)
_input_fh.close()
cmd = [
self._java_class,
- "-loadClassifier",
+ '-loadClassifier',
self._model,
- "-keepAllWhitespaces",
+ '-keepAllWhitespaces',
self._keep_whitespaces,
- "-textFile",
+ '-textFile',
self._input_file_path,
]
if self._sihan_corpora_dict is not None:
cmd.extend(
[
- "-serDictionary",
+ '-serDictionary',
self._dict,
- "-sighanCorporaDict",
+ '-sighanCorporaDict',
self._sihan_corpora_dict,
- "-sighanPostProcessing",
+ '-sighanPostProcessing',
self._sihan_post_processing,
]
)
def _execute(self, cmd, verbose=False):
encoding = self._encoding
- cmd.extend(["-inputEncoding", encoding])
+ cmd.extend(['-inputEncoding', encoding])
_options_cmd = self._options_cmd
if _options_cmd:
- cmd.extend(["-options", self._options_cmd])
+ cmd.extend(['-options', self._options_cmd])
- default_options = " ".join(_java_options)
+ default_options = ' '.join(_java_options)
# Configure java.
config_java(options=self.java_options, verbose=verbose)
try:
seg = StanfordSegmenter()
- seg.default_config("ar")
- seg.default_config("zh")
+ seg.default_config('ar')
+ seg.default_config('zh')
except LookupError as e:
raise SkipTest(
- "Tests for nltk.tokenize.stanford_segmenter skipped: %s" % str(e)
+ 'Tests for nltk.tokenize.stanford_segmenter skipped: %s' % str(e)
)
# Natural Language Toolkit: TextTiling
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: George Boutsioukis
#
# URL: <http://nltk.org/>
if stopwords is None:
from nltk.corpus import stopwords
- stopwords = stopwords.words("english")
+ stopwords = stopwords.words('english')
self.__dict__.update(locals())
- del self.__dict__["self"]
+ del self.__dict__['self']
def tokenize(self, text):
"""Return a tokenized copy of *text*, where each "token" represents
# Tokenization step starts here
# Remove punctuation
- nopunct_text = "".join(
- c for c in lowercase_text if re.match("[a-z\-' \n\t]", c)
+ nopunct_text = ''.join(
+ c for c in lowercase_text if re.match("[a-z\-\' \n\t]", c)
)
nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text)
last_tok_seq=None,
):
self.__dict__.update(locals())
- del self.__dict__["self"]
+ del self.__dict__['self']
class TokenSequence(object):
def __init__(self, index, wrdindex_list, original_length=None):
original_length = original_length or len(wrdindex_list)
self.__dict__.update(locals())
- del self.__dict__["self"]
+ del self.__dict__['self']
# Pasted from the SciPy cookbook: http://www.scipy.org/Cookbook/SignalSmooth
-def smooth(x, window_len=11, window="flat"):
+def smooth(x, window_len=11, window='flat'):
"""smooth the data using a window with requested size.
This method is based on the convolution of a scaled window with the signal.
if window_len < 3:
return x
- if window not in ["flat", "hanning", "hamming", "bartlett", "blackman"]:
+ if window not in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
raise ValueError(
"Window must be one of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'"
)
s = numpy.r_[2 * x[0] - x[window_len:1:-1], x, 2 * x[-1] - x[-1:-window_len:-1]]
# print(len(s))
- if window == "flat": # moving average
- w = numpy.ones(window_len, "d")
+ if window == 'flat': # moving average
+ w = numpy.ones(window_len, 'd')
else:
- w = eval("numpy." + window + "(window_len)")
+ w = eval('numpy.' + window + '(window_len)')
- y = numpy.convolve(w / w.sum(), s, mode="same")
+ y = numpy.convolve(w / w.sum(), s, mode='same')
return y[window_len - 1 : -window_len + 1]
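To make the flat-window ('moving average') case concrete, a minimal sketch assuming only numpy: each point in the smoothed signal becomes the mean of a small neighbourhood, which is what smooth() applies to the TextTiling gap scores before boundary detection.

import numpy

x = numpy.array([0.0, 1.0, 0.0, 1.0, 0.0])   # toy gap-score series
w = numpy.ones(3, 'd')                        # flat window, i.e. window='flat'
y = numpy.convolve(w / w.sum(), x, mode='same')
# y is roughly [0.33, 0.33, 0.67, 0.33, 0.33]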
"""
import re
+from six import text_type
from nltk.tokenize.api import TokenizerI
>>> toktok = ToktokTokenizer()
>>> text = u'Is 9.5 or 525,600 my favorite number?'
- >>> print(toktok.tokenize(text, return_str=True))
+ >>> print (toktok.tokenize(text, return_str=True))
Is 9.5 or 525,600 my favorite number ?
>>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
- >>> print(toktok.tokenize(text, return_str=True))
+ >>> print (toktok.tokenize(text, return_str=True))
The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
>>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
>>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
# Pad some funky punctuation.
FUNKY_PUNCT_1 = re.compile(u'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 "
# Pad more funky punctuation.
- FUNKY_PUNCT_2 = re.compile(u"([({\[“‘„‚«‹「『])"), r" \1 "
+ FUNKY_PUNCT_2 = re.compile(u'([({\[“‘„‚«‹「『])'), r" \1 "
# Pad En dash and em dash
- EN_EM_DASHES = re.compile(u"([–—])"), r" \1 "
+ EN_EM_DASHES = re.compile(u'([–—])'), r" \1 "
# Replace problematic character with numeric character reference.
- AMPERCENT = re.compile("& "), "&amp; "
- TAB = re.compile("\t"), " &#9; "
- PIPE = re.compile("\|"), " &#124; "
+ AMPERCENT = re.compile('& '), '&amp; '
+ TAB = re.compile('\t'), ' &#9; '
+ PIPE = re.compile('\|'), ' &#124; '
# Pad numbers with commas to keep them from further tokenization.
- COMMA_IN_NUM = re.compile(r"(?<!,)([,،])(?![,\d])"), r" \1 "
+ COMMA_IN_NUM = re.compile(r'(?<!,)([,،])(?![,\d])'), r' \1 '
# Just pad problematic (often neurotic) hyphen/single quote, etc.
- PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r" \1 "
+ PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r' \1 '
# Group ` ` stupid quotes ' ' into a single token.
STUPID_QUOTES_1 = re.compile(r" ` ` "), r" `` "
STUPID_QUOTES_2 = re.compile(r" ' ' "), r" '' "
FINAL_PERIOD_2 = re.compile(r"""(?<!\.)\.\s*(["'’»›”]) *$"""), r" . \1"
# Treat continuous commas as fake German, Czech, etc.: „
- MULTI_COMMAS = re.compile(r"(,{2,})"), r" \1 "
+ MULTI_COMMAS = re.compile(r'(,{2,})'), r' \1 '
# Treat continuous dashes as fake en-dash, etc.
- MULTI_DASHES = re.compile(r"(-{2,})"), r" \1 "
+ MULTI_DASHES = re.compile(r'(-{2,})'), r' \1 '
# Treat multiple periods as a thing (eg. ellipsis)
- MULTI_DOTS = re.compile(r"(\.{2,})"), r" \1 "
+ MULTI_DOTS = re.compile(r'(\.{2,})'), r' \1 '
# This is the \p{Open_Punctuation} from Perl's perluniprops
# see http://perldoc.perl.org/perluniprops.html
- OPEN_PUNCT = str(
- u"([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d"
- u"\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772"
- u"\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983"
- u"\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993"
- u"\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26"
- u"\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016"
- u"\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39"
- u"\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b"
- u"\ufe5d\uff08\uff3b\uff5b\uff5f\uff62"
+ OPEN_PUNCT = text_type(
+ u'([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d'
+ u'\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772'
+ u'\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983'
+ u'\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993'
+ u'\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26'
+ u'\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016'
+ u'\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39'
+ u'\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b'
+ u'\ufe5d\uff08\uff3b\uff5b\uff5f\uff62'
)
# This is the \p{Close_Punctuation} from Perl's perluniprops
- CLOSE_PUNCT = str(
- u")]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a"
- u"\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6"
- u"\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988"
- u"\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998"
- u"\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009"
- u"\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b"
- u"\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c"
- u"\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e"
- u"\uff09\uff3d\uff5d\uff60\uff63"
+ CLOSE_PUNCT = text_type(
+ u')]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a'
+ u'\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6'
+ u'\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988'
+ u'\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998'
+ u'\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009'
+ u'\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b'
+ u'\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c'
+ u'\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e'
+ u'\uff09\uff3d\uff5d\uff60\uff63'
)
# This is the \p{Close_Punctuation} from Perl's perluniprops
- CURRENCY_SYM = str(
- u"$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb"
- u"\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3"
- u"\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab"
- u"\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3"
- u"\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838"
- u"\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6"
+ CURRENCY_SYM = text_type(
+ u'$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb'
+ u'\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3'
+ u'\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab'
+ u'\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3'
+ u'\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838'
+ u'\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6'
)
# Pad spaces after opening punctuations.
- OPEN_PUNCT_RE = re.compile(u"([{}])".format(OPEN_PUNCT)), r"\1 "
+ OPEN_PUNCT_RE = re.compile(u'([{}])'.format(OPEN_PUNCT)), r'\1 '
# Pad spaces before closing punctuations.
- CLOSE_PUNCT_RE = re.compile(u"([{}])".format(CLOSE_PUNCT)), r"\1 "
+ CLOSE_PUNCT_RE = re.compile(u'([{}])'.format(CLOSE_PUNCT)), r'\1 '
# Pad spaces after currency symbols.
- CURRENCY_SYM_RE = re.compile(u"([{}])".format(CURRENCY_SYM)), r"\1 "
+ CURRENCY_SYM_RE = re.compile(u'([{}])'.format(CURRENCY_SYM)), r'\1 '
# Use for tokenizing URL-unfriendly characters: [:/?#]
- URL_FOE_1 = re.compile(r":(?!//)"), r" : " # in perl s{:(?!//)}{ : }g;
- URL_FOE_2 = re.compile(r"\?(?!\S)"), r" ? " # in perl s{\?(?!\S)}{ ? }g;
+ URL_FOE_1 = re.compile(r':(?!//)'), r' : ' # in perl s{:(?!//)}{ : }g;
+ URL_FOE_2 = re.compile(r'\?(?!\S)'), r' ? ' # in perl s{\?(?!\S)}{ ? }g;
# in perl: m{://} or m{\S+\.\S+/\S+} or s{/}{ / }g;
- URL_FOE_3 = re.compile(r"(:\/\/)[\S+\.\S+\/\S+][\/]"), " / "
- URL_FOE_4 = re.compile(r" /"), r" / " # s{ /}{ / }g;
+ URL_FOE_3 = re.compile(r'(:\/\/)[\S+\.\S+\/\S+][\/]'), ' / '
+ URL_FOE_4 = re.compile(r' /'), r' / ' # s{ /}{ / }g;
# Left/Right strip, i.e. remove leading/trailing spaces.
# These strip regexes should NOT be used,
# instead use str.lstrip(), str.rstrip() or str.strip()
# (They are kept for reference purposes to the original toktok.pl code)
- LSTRIP = re.compile(r"^ +"), ""
- RSTRIP = re.compile(r"\s+$"), "\n"
+ LSTRIP = re.compile(r'^ +'), ''
+ RSTRIP = re.compile(r'\s+$'), '\n'
# Merge multiple spaces.
- ONE_SPACE = re.compile(r" {2,}"), " "
+ ONE_SPACE = re.compile(r' {2,}'), ' '
TOKTOK_REGEXES = [
NON_BREAKING,
]
def tokenize(self, text, return_str=False):
- text = str(text) # Converts input string into unicode.
+ text = text_type(text) # Converts input string into unicode.
for regexp, substitution in self.TOKTOK_REGEXES:
text = regexp.sub(substitution, text)
# Finally, strips leading and trailing spaces
# and converts output string into unicode.
- text = str(text.strip())
+ text = text_type(text.strip())
return text if return_str else text.split()
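With return_str left at its default, tokenize() returns the token list instead of the padded string; a small sketch reusing the example from the doctests above:

>>> toktok = ToktokTokenizer()
>>> toktok.tokenize(u'Is 9.5 or 525,600 my favorite number?')
['Is', '9.5', 'or', '525,600', 'my', 'favorite', 'number', '?']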
# Natural Language Toolkit: Tokenizers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
#
import re
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens
-from nltk.tokenize.destructive import MacIntyreContractions
+
+
+class MacIntyreContractions:
+ """
+ List of contractions adapted from Robert MacIntyre's tokenizer.
+ """
+
+ CONTRACTIONS2 = [
+ r"(?i)\b(can)(?#X)(not)\b",
+ r"(?i)\b(d)(?#X)('ye)\b",
+ r"(?i)\b(gim)(?#X)(me)\b",
+ r"(?i)\b(gon)(?#X)(na)\b",
+ r"(?i)\b(got)(?#X)(ta)\b",
+ r"(?i)\b(lem)(?#X)(me)\b",
+ r"(?i)\b(mor)(?#X)('n)\b",
+ r"(?i)\b(wan)(?#X)(na)\s",
+ ]
+ CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
+ CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
class TreebankWordTokenizer(TokenizerI):
# starting quotes
STARTING_QUOTES = [
- (re.compile(r"^\""), r"``"),
- (re.compile(r"(``)"), r" \1 "),
- (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
+ (re.compile(r'^\"'), r'``'),
+ (re.compile(r'(``)'), r' \1 '),
+ (re.compile(r"([ \(\[{<])(\"|\'{2})"), r'\1 `` '),
]
# punctuation
PUNCTUATION = [
- (re.compile(r"([:,])([^\d])"), r" \1 \2"),
- (re.compile(r"([:,])$"), r" \1 "),
- (re.compile(r"\.\.\."), r" ... "),
- (re.compile(r"[;@#$%&]"), r" \g<0> "),
+ (re.compile(r'([:,])([^\d])'), r' \1 \2'),
+ (re.compile(r'([:,])$'), r' \1 '),
+ (re.compile(r'\.\.\.'), r' ... '),
+ (re.compile(r'[;@#$%&]'), r' \g<0> '),
(
re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
- r"\1 \2\3 ",
+ r'\1 \2\3 ',
), # Handles the final period.
- (re.compile(r"[?!]"), r" \g<0> "),
+ (re.compile(r'[?!]'), r' \g<0> '),
(re.compile(r"([^'])' "), r"\1 ' "),
]
# Pads parentheses
- PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
+ PARENS_BRACKETS = (re.compile(r'[\]\[\(\)\{\}\<\>]'), r' \g<0> ')
# Optionally: Convert parentheses and brackets to PTB symbols.
CONVERT_PARENTHESES = [
- (re.compile(r"\("), "-LRB-"),
- (re.compile(r"\)"), "-RRB-"),
- (re.compile(r"\["), "-LSB-"),
- (re.compile(r"\]"), "-RSB-"),
- (re.compile(r"\{"), "-LCB-"),
- (re.compile(r"\}"), "-RCB-"),
+ (re.compile(r'\('), '-LRB-'),
+ (re.compile(r'\)'), '-RRB-'),
+ (re.compile(r'\['), '-LSB-'),
+ (re.compile(r'\]'), '-RSB-'),
+ (re.compile(r'\{'), '-LCB-'),
+ (re.compile(r'\}'), '-RCB-'),
]
- DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
+ DOUBLE_DASHES = (re.compile(r'--'), r' -- ')
# ending quotes
ENDING_QUOTES = [
(re.compile(r'"'), " '' "),
- (re.compile(r"(\S)(\'\')"), r"\1 \2 "),
+ (re.compile(r'(\S)(\'\')'), r'\1 \2 '),
(re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
(re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
]
text = regexp.sub(substitution, text)
for regexp in self.CONTRACTIONS2:
- text = regexp.sub(r" \1 \2 ", text)
+ text = regexp.sub(r' \1 \2 ', text)
for regexp in self.CONTRACTIONS3:
- text = regexp.sub(r" \1 \2 ", text)
+ text = regexp.sub(r' \1 \2 ', text)
# We are not using CONTRACTIONS4 since
# they are also commented out in the SED scripts
_contractions = MacIntyreContractions()
CONTRACTIONS2 = [
- re.compile(pattern.replace("(?#X)", "\s"))
+ re.compile(pattern.replace('(?#X)', '\s'))
for pattern in _contractions.CONTRACTIONS2
]
CONTRACTIONS3 = [
- re.compile(pattern.replace("(?#X)", "\s"))
+ re.compile(pattern.replace('(?#X)', '\s'))
for pattern in _contractions.CONTRACTIONS3
]
ENDING_QUOTES = [
(re.compile(r"([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1\2 "),
(re.compile(r"([^' ])\s('[sS]|'[mM]|'[dD]|') "), r"\1\2 "),
- (re.compile(r"(\S)(\'\')"), r"\1\2 "),
+ (re.compile(r'(\S)(\'\')'), r'\1\2 '),
(re.compile(r" '' "), '"'),
]
# Handles double dashes
- DOUBLE_DASHES = (re.compile(r" -- "), r"--")
+ DOUBLE_DASHES = (re.compile(r' -- '), r'--')
# Optionally: Convert parentheses and brackets back from PTB symbols.
CONVERT_PARENTHESES = [
- (re.compile("-LRB-"), "("),
- (re.compile("-RRB-"), ")"),
- (re.compile("-LSB-"), "["),
- (re.compile("-RSB-"), "]"),
- (re.compile("-LCB-"), "{"),
- (re.compile("-RCB-"), "}"),
+ (re.compile('-LRB-'), '('),
+ (re.compile('-RRB-'), ')'),
+ (re.compile('-LSB-'), '['),
+ (re.compile('-RSB-'), ']'),
+ (re.compile('-LCB-'), '{'),
+ (re.compile('-RCB-'), '}'),
]
# Undo padding on parentheses.
PARENS_BRACKETS = [
- (re.compile(r"\s([\[\(\{\<])\s"), r" \g<1>"),
- (re.compile(r"\s([\]\)\}\>])\s"), r"\g<1> "),
- (re.compile(r"([\]\)\}\>])\s([:;,.])"), r"\1\2"),
+ (re.compile(r'\s([\[\(\{\<])\s'), r' \g<1>'),
+ (re.compile(r'\s([\]\)\}\>])\s'), r'\g<1> '),
+ (re.compile(r'([\]\)\}\>])\s([:;,.])'), r'\1\2'),
]
# punctuation
PUNCTUATION = [
(re.compile(r"([^'])\s'\s"), r"\1' "),
- (re.compile(r"\s([?!])"), r"\g<1>"), # Strip left pad for [?!]
+ (re.compile(r'\s([?!])'), r'\g<1>'), # Strip left pad for [?!]
# (re.compile(r'\s([?!])\s'), r'\g<1>'),
- (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r"\1\2\3"),
+ (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r'\1\2\3'),
# When tokenizing, [;@#$%&] are padded with whitespace regardless of
# whether there are spaces before or after them.
# But during detokenization, we need to distinguish between left/right
# pad, so we split this up.
- (re.compile(r"\s([#$])\s"), r" \g<1>"), # Left pad.
- (re.compile(r"\s([;%])\s"), r"\g<1> "), # Right pad.
- (re.compile(r"\s([&*])\s"), r" \g<1> "), # Unknown pad.
- (re.compile(r"\s\.\.\.\s"), r"..."),
- (re.compile(r"\s([:,])\s$"), r"\1"),
+ (re.compile(r'\s([#$])\s'), r' \g<1>'), # Left pad.
+ (re.compile(r'\s([;%])\s'), r'\g<1> '), # Right pad.
+ (re.compile(r'\s([&])\s'), r' \g<1> '), # Unknown pad.
+ (re.compile(r'\s\.\.\.\s'), r'...'),
+ (re.compile(r'\s([:,])\s$'), r'\1'),
(
- re.compile(r"\s([:,])\s([^\d])"),
- r"\1 \2",
+ re.compile(r'\s([:,])\s([^\d])'),
+ r'\1 \2',
) # Keep right pad after comma/colon before non-digits.
# (re.compile(r'\s([:,])\s([^\d])'), r'\1\2')
]
# starting quotes
STARTING_QUOTES = [
- (re.compile(r"([ (\[{<])\s``"), r'\1"'),
- (re.compile(r"\s(``)\s"), r"\1"),
- (re.compile(r"^``"), r"\""),
+ (re.compile(r'([ (\[{<])\s``'), r'\1"'),
+ (re.compile(r'\s(``)\s'), r'\1'),
+ (re.compile(r'^``'), r'\"'),
]
def tokenize(self, tokens, convert_parentheses=False):
"""
- Treebank detokenizer, created by undoing the regexes from
- the TreebankWordTokenizer.tokenize.
+ Python port of the Moses detokenizer.
:param tokens: A list of strings, i.e. tokenized text.
:type tokens: list(str)
:return: str
"""
- text = " ".join(tokens)
+ text = ' '.join(tokens)
# Reverse the contractions regexes.
# Note: CONTRACTIONS4 are not used in tokenization.
for regexp in self.CONTRACTIONS3:
- text = regexp.sub(r"\1\2", text)
+ text = regexp.sub(r'\1\2', text)
for regexp in self.CONTRACTIONS2:
- text = regexp.sub(r"\1\2", text)
+ text = regexp.sub(r'\1\2', text)
# Reverse the regexes applied for ending quotes.
for regexp, substitution in self.ENDING_QUOTES:
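Rounding off the Treebank tokenizer/detokenizer pair, a minimal round-trip sketch (assuming only the two classes defined in this module):

>>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
>>> toks = TreebankWordTokenizer().tokenize("Good muffins cost $3.88 in New York.")
>>> toks
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']
>>> TreebankWordDetokenizer().detokenize(toks)
'Good muffins cost $3.88 in New York.'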
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Tokenizer Utilities
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
# coding: utf-8
# Natural Language Toolkit: Toolbox Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Greg Aumann <greg_aumann@sil.org>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
Module for reading, writing and manipulating
Toolbox databases and settings files.
"""
+from __future__ import print_function
import re, codecs
from xml.etree.ElementTree import ElementTree, TreeBuilder, Element, SubElement
-from io import StringIO
+from six import u
+
+from nltk.compat import StringIO, PY3
from nltk.data import PathPointer, find
# (PathPointer.open doesn't take a mode option)
self._file = sfm_file.open(self._encoding)
else:
- self._file = codecs.open(sfm_file, "rU", self._encoding)
+ self._file = codecs.open(sfm_file, 'rU', self._encoding)
def open_string(self, s):
"""
:rtype: iter(tuple(str, str))
"""
- join_string = "\n"
- line_regexp = r"^%s(?:\\(\S+)\s*)?(.*)$"
+ join_string = '\n'
+ line_regexp = r'^%s(?:\\(\S+)\s*)?(.*)$'
# discard a BOM in the first line
- first_line_pat = re.compile(line_regexp % "(?:\xef\xbb\xbf)?")
- line_pat = re.compile(line_regexp % "")
+ first_line_pat = re.compile(line_regexp % '(?:\xef\xbb\xbf)?')
+ line_pat = re.compile(line_regexp % '')
# need to get first line outside the loop for correct handling
# of the first marker if it spans multiple lines
file_iter = iter(self._file)
strip=True,
unwrap=True,
encoding=None,
- errors="strict",
+ errors='strict',
unicode_fields=None,
):
"""
:rtype: iter(tuple(str, str))
"""
if encoding is None and unicode_fields is not None:
- raise ValueError("unicode_fields is set but not encoding.")
- unwrap_pat = re.compile(r"\n+")
+ raise ValueError('unicode_fields is set but not encoding.')
+ unwrap_pat = re.compile(r'\n+')
for mkr, val in self.raw_fields():
+ if encoding and not PY3: # kludge - already decoded in PY3?
+ if unicode_fields is not None and mkr in unicode_fields:
+ val = val.decode('utf8', errors)
+ else:
+ val = val.decode(encoding, errors)
+ mkr = mkr.decode(encoding, errors)
if unwrap:
- val = unwrap_pat.sub(" ", val)
+ val = unwrap_pat.sub(' ', val)
if strip:
val = val.rstrip()
yield (mkr, val)
:return: contents of toolbox data divided into header and records
"""
builder = TreeBuilder()
- builder.start("toolbox_data", {})
- builder.start("header", {})
+ builder.start('toolbox_data', {})
+ builder.start('header', {})
in_records = False
for mkr, value in self.fields(**kwargs):
- if key is None and not in_records and mkr[0] != "_":
+ if key is None and not in_records and mkr[0] != '_':
key = mkr
if mkr == key:
if in_records:
- builder.end("record")
+ builder.end('record')
else:
- builder.end("header")
+ builder.end('header')
in_records = True
- builder.start("record", {})
+ builder.start('record', {})
builder.start(mkr, {})
builder.data(value)
builder.end(mkr)
if in_records:
- builder.end("record")
+ builder.end('record')
else:
- builder.end("header")
- builder.end("toolbox_data")
+ builder.end('header')
+ builder.end('toolbox_data')
return builder.close()
def _tree2etree(self, parent):
e.text = text
return root
- def _chunk_parse(self, grammar=None, root_label="record", trace=0, **kwargs):
+ def _chunk_parse(self, grammar=None, root_label='record', trace=0, **kwargs):
"""
Returns an element tree structure corresponding to a toolbox data file
parsed according to the chunk grammar.
cp = chunk.RegexpParser(grammar, root_label=root_label, trace=trace)
db = self.parse(**kwargs)
- tb_etree = Element("toolbox_data")
- header = db.find("header")
+ tb_etree = Element('toolbox_data')
+ header = db.find('header')
tb_etree.append(header)
- for record in db.findall("record"):
+ for record in db.findall('record'):
parsed = cp.parse([(elem.text, elem.tag) for elem in record])
tb_etree.append(self._tree2etree(parsed))
return tb_etree
_is_value = re.compile(r"\S")
-def to_sfm_string(tree, encoding=None, errors="strict", unicode_fields=None):
+def to_sfm_string(tree, encoding=None, errors='strict', unicode_fields=None):
"""
Return a string with a standard format representation of the toolbox
data in tree (tree can be a toolbox database or a single record).
:type unicode_fields: dict(str) or set(str)
:rtype: str
"""
- if tree.tag == "record":
- root = Element("toolbox_data")
+ if tree.tag == 'record':
+ root = Element('toolbox_data')
root.append(tree)
tree = root
- if tree.tag != "toolbox_data":
+ if tree.tag != 'toolbox_data':
raise ValueError("not a toolbox_data element structure")
if encoding is None and unicode_fields is not None:
raise ValueError(
)
l = []
for rec in tree:
- l.append("\n")
+ l.append('\n')
for field in rec:
mkr = field.tag
value = field.text
if encoding is not None:
if unicode_fields is not None and mkr in unicode_fields:
- cur_encoding = "utf8"
+ cur_encoding = 'utf8'
else:
cur_encoding = encoding
if re.search(_is_value, value):
l.append(
- ("\\%s %s\n" % (mkr, value)).encode(cur_encoding, errors)
+ (u("\\%s %s\n") % (mkr, value)).encode(cur_encoding, errors)
)
else:
l.append(
- ("\\%s%s\n" % (mkr, value)).encode(cur_encoding, errors)
+ (u("\\%s%s\n") % (mkr, value)).encode(cur_encoding, errors)
)
else:
if re.search(_is_value, value):
l.append("\\%s %s\n" % (mkr, value))
else:
l.append("\\%s%s\n" % (mkr, value))
- return "".join(l[1:])
+ return ''.join(l[1:])
class ToolboxSettings(StandardFormat):
def __init__(self):
super(ToolboxSettings, self).__init__()
- def parse(self, encoding=None, errors="strict", **kwargs):
+ def parse(self, encoding=None, errors='strict', **kwargs):
"""
Return the contents of toolbox settings file with a nested structure.
if block == "+":
builder.start(mkr, {})
builder.data(value)
- elif block == "-":
+ elif block == '-':
builder.end(mkr)
else:
builder.start(mkr, {})
return builder.close()
-def to_settings_string(tree, encoding=None, errors="strict", unicode_fields=None):
+def to_settings_string(tree, encoding=None, errors='strict', unicode_fields=None):
# write XML to file
l = list()
_to_settings_string(
errors=errors,
unicode_fields=unicode_fields,
)
- return "".join(l)
+ return ''.join(l)
def _to_settings_string(node, l, **kwargs):
text = node.text
if len(node) == 0:
if text:
- l.append("\\%s %s\n" % (tag, text))
+ l.append('\\%s %s\n' % (tag, text))
else:
- l.append("\\%s\n" % tag)
+ l.append('\\%s\n' % tag)
else:
if text:
- l.append("\\+%s %s\n" % (tag, text))
+ l.append('\\+%s %s\n' % (tag, text))
else:
- l.append("\\+%s\n" % tag)
+ l.append('\\+%s\n' % tag)
for n in node:
_to_settings_string(n, l, **kwargs)
- l.append("\\-%s\n" % tag)
+ l.append('\\-%s\n' % tag)
return
# zip_path = find('corpora/toolbox.zip')
# lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
- file_path = find("corpora/toolbox/rotokas.dic")
+ file_path = find('corpora/toolbox/rotokas.dic')
lexicon = ToolboxData(file_path).parse()
- print("first field in fourth record:")
+ print('first field in fourth record:')
print(lexicon[3][0].tag)
print(lexicon[3][0].text)
- print("\nfields in sequential order:")
- for field in islice(lexicon.find("record"), 10):
+ print('\nfields in sequential order:')
+ for field in islice(lexicon.find('record'), 10):
print(field.tag, field.text)
- print("\nlx fields:")
- for field in islice(lexicon.findall("record/lx"), 10):
+ print('\nlx fields:')
+ for field in islice(lexicon.findall('record/lx'), 10):
print(field.text)
settings = ToolboxSettings()
- file_path = find("corpora/toolbox/MDF/MDF_AltH.typ")
+ file_path = find('corpora/toolbox/MDF/MDF_AltH.typ')
settings.open(file_path)
# settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
- tree = settings.parse(unwrap=False, encoding="cp1252")
- print(tree.find("expset/expMDF/rtfPageSetup/paperSize").text)
+ tree = settings.parse(unwrap=False, encoding='cp1252')
+ print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
settings_tree = ElementTree(tree)
- print(to_settings_string(settings_tree).encode("utf8"))
+ print(to_settings_string(settings_tree).encode('utf8'))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Machine Translation
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>, Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from nltk.translate.ibm5 import IBMModel5
from nltk.translate.bleu_score import sentence_bleu as bleu
from nltk.translate.ribes_score import sentence_ribes as ribes
-from nltk.translate.meteor_score import meteor_score as meteor
from nltk.translate.metrics import alignment_error_rate
from nltk.translate.stack_decoder import StackDecoder
# Natural Language Toolkit: API for alignment and translation objects
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Will Zhang <wilzzha@gmail.com>
# Guan Gui <ggui@student.unimelb.edu.au>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
import subprocess
from collections import namedtuple
+from nltk.compat import python_2_unicode_compatible
+
+@python_2_unicode_compatible
class AlignedSent(object):
"""
Return an aligned sentence object, which encapsulates two sentences
"""
Dot representation of the aligned sentence
"""
- s = "graph align {\n"
- s += "node[shape=plaintext]\n"
+ s = 'graph align {\n'
+ s += 'node[shape=plaintext]\n'
# Declare node
for w in self._words:
)
# Put it in the same rank
- s += "{rank = same; %s}\n" % (" ".join('"%s_source"' % w for w in self._words))
- s += "{rank = same; %s}\n" % (" ".join('"%s_target"' % w for w in self._mots))
+ s += '{rank = same; %s}\n' % (' '.join('"%s_source"' % w for w in self._words))
+ s += '{rank = same; %s}\n' % (' '.join('"%s_target"' % w for w in self._mots))
- s += "}"
+ s += '}'
return s
"""
IPython magic: show the SVG representation of this ``AlignedSent``.
"""
- dot_string = self._to_dot().encode("utf8")
- output_format = "svg"
+ dot_string = self._to_dot().encode('utf8')
+ output_format = 'svg'
try:
process = subprocess.Popen(
- ["dot", "-T%s" % output_format],
+ ['dot', '-T%s' % output_format],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
except OSError:
- raise Exception("Cannot find the dot binary from Graphviz package")
+ raise Exception('Cannot find the dot binary from Graphviz package')
out, err = process.communicate(dot_string)
- return out.decode("utf8")
+ return out.decode('utf8')
def __str__(self):
"""
return AlignedSent(self._mots, self._words, self._alignment.invert())
+@python_2_unicode_compatible
class Alignment(frozenset):
"""
A storage class for representing alignment between two sequences, s1, s2.
raise IndexError("Alignment is outside boundary of mots")
-PhraseTableEntry = namedtuple("PhraseTableEntry", ["trg_phrase", "log_prob"])
+PhraseTableEntry = namedtuple('PhraseTableEntry', ['trg_phrase', 'log_prob'])
class PhraseTable(object):
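# A minimal usage sketch for the api classes above. It relies only on what is
# visible in this module: AlignedSent(words, mots, alignment), Alignment as a
# frozenset of (source_index, target_index) pairs, and invert() swapping the
# two sides; treat it as illustrative rather than exhaustive.
from nltk.translate import AlignedSent, Alignment

links = Alignment([(0, 0), (1, 2), (2, 3), (3, 1)])
asent = AlignedSent(['klein', 'ist', 'das', 'haus'],
                    ['the', 'house', 'is', 'small'], links)
print(asent.words)               # ['klein', 'ist', 'das', 'haus']
print(asent.invert().alignment)  # the same links with indices swapped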
# -*- coding: utf-8 -*-
# Natural Language Toolkit: BLEU Score
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""BLEU score implementation."""
+from __future__ import division
import math
import sys
-from fractions import Fraction
+import fractions
import warnings
from collections import Counter
from nltk.util import ngrams
+try:
+ fractions.Fraction(0, 1000, _normalize=False)
+ from fractions import Fraction
+except TypeError:
+ from nltk.compat import Fraction
+
def sentence_bleu(
references,
... 'Party', 'commands']
>>> chencherry = SmoothingFunction()
- >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
0.4118...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
0.4118...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
0.4118...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
0.4489...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
0.4118...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
0.4118...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
0.4905...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
0.4135...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
0.4905...
:param epsilon: the epsilon value use in method 1
incvnt += 1
return p_n
- def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+ def method4(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
"""
Smoothing method 4:
Shorter translations may have inflated precision values due to having
smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
suggest dividing by 1/ln(len(T)), where T is the length of the translation.
"""
- hyp_len = hyp_len if hyp_len else len(hypothesis)
for i, p_i in enumerate(p_n):
if p_i.numerator == 0 and hyp_len != 0:
incvnt = i + 1 * self.k / math.log(
hyp_len
) # Note that this K is different from the K from NIST.
- p_n[i] = incvnt / p_i.denominator
+ p_n[i] = 1 / incvnt
return p_n
- def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+ def method5(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
"""
Smoothing method 5:
The matched counts for similar values of n should be similar. To
calculate the n-gram matched count, it averages the n−1, n and n+1 gram
matched counts.
"""
- hyp_len = hyp_len if hyp_len else len(hypothesis)
m = {}
# Requires a precision value for an additional ngram order.
p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
m[i] = p_n[i]
return p_n
- def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+ def method6(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
"""
Smoothing method 6:
Interpolates the maximum likelihood estimate of the precision *p_n* with
Gao and He (2013) Training MRF-Based Phrase Translation Models using
Gradient Ascent. In NAACL.
"""
- hyp_len = hyp_len if hyp_len else len(hypothesis)
# This smoothing only works when p_1 and p_2 are non-zero.
# Raise an error with an appropriate message when the input is too short
# to use this smoothing technique.
p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
return p_n
- def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+ def method7(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
"""
- Smoothing method 7:
- Interpolates methods 5 and 6.
+ Smoothing method 7:
+ Interpolates methods 4 and 5.
"""
- hyp_len = hyp_len if hyp_len else len(hypothesis)
p_n = self.method4(p_n, references, hypothesis, hyp_len)
p_n = self.method5(p_n, references, hypothesis, hyp_len)
return p_n
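# The doctest above drives sentence_bleu with the Chen & Cherry smoothing
# methods; a small self-contained sketch of the same call pattern (tokenized
# reference/hypothesis lists, an optional SmoothingFunction method) follows.
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = ['the', 'cat', 'is', 'on', 'the', 'mat']
hypothesis = ['the', 'cat', 'sat', 'on', 'the', 'mat']
chencherry = SmoothingFunction()

# p_4 is zero for this pair, so the unsmoothed score collapses to a vanishing
# value (with a warning) while method4 backs off to something usable.
print(sentence_bleu([reference], hypothesis))
print(sentence_bleu([reference], hypothesis,
                    smoothing_function=chencherry.method4))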
# -*- coding: utf-8 -*-
# Natural Language Toolkit: ChrF score
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Authors: Maja Popovic
# Contributors: Liling Tan, Aleš Tamchyna (Memsource)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
""" ChrF score implementation """
+from __future__ import division
from collections import Counter, defaultdict
import re
def _preprocess(sent, ignore_whitespace):
if type(sent) != str:
# turn list of tokens into a string
- sent = " ".join(sent)
+ sent = ' '.join(sent)
if ignore_whitespace:
- sent = re.sub(r"\s+", "", sent)
+ sent = re.sub(r'\s+', '', sent)
return sent
# Natural Language Toolkit: Gale-Church Aligner
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Torsten Marek <marek@ifi.uzh.ch>
# Contributor: Cassidy Laidlaw, Liling Tan
# URL: <http://nltk.org/>
"""
+from __future__ import division
import math
try:
try:
return math.log(1 - norm_cdf(x))
except ValueError:
- return float("-inf")
+ return float('-inf')
LOG2 = math.log(2)
m * params.VARIANCE_CHARACTERS
)
except ZeroDivisionError:
- return float("-inf")
+ return float('-inf')
return -(LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment]))
for i in range(len(source_sents_lens) + 1):
for j in range(len(target_sents_lens) + 1):
- min_dist = float("inf")
+ min_dist = float('inf')
min_align = None
for a in alignment_types:
prev_i = -1 - a[0]
min_dist = p
min_align = a
- if min_dist == float("inf"):
+ if min_dist == float('inf'):
min_dist = 0
backlinks[(i, j)] = min_align
for block_it in split_at(stream, hard_delimiter)
]
+
+# Code for test files in nltk_contrib/align/data/*.tok
+# import sys
+# from contextlib import nested
+# with nested(open(sys.argv[1], "r"), open(sys.argv[2], "r")) as (s, t):
+# source = parse_token_stream((l.strip() for l in s), ".EOS", ".EOP")
+# target = parse_token_stream((l.strip() for l in t), ".EOS", ".EOP")
+# print align_texts(source, target)
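# A shorter sketch of the same aligner, assuming the module's
# align_blocks(source_sents_lens, target_sents_lens) entry point that consumes
# sentence lengths in characters (as the loops over source_sents_lens and
# target_sents_lens above suggest); the exact return format is illustrative.
from nltk.translate.gale_church import align_blocks

source_sents_lens = [120, 35, 70]   # character lengths of source sentences
target_sents_lens = [110, 40, 75]   # character lengths of target sentences

# Expected: a list of (source_index, target_index) pairs for aligned sentences.
print(align_blocks(source_sents_lens, target_sents_lens))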
# -*- coding: utf-8 -*-
# Natural Language Toolkit: GDFA word alignment symmetrization
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Authors: Liling Tan
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
# Converts pharaoh text format into list of tuples.
- e2f = [tuple(map(int, a.split("-"))) for a in e2f.split()]
- f2e = [tuple(map(int, a.split("-"))) for a in f2e.split()]
+ e2f = [tuple(map(int, a.split('-'))) for a in e2f.split()]
+ f2e = [tuple(map(int, a.split('-'))) for a in f2e.split()]
neighbors = [(-1, 0), (0, -1), (1, 0), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1)]
alignment = set(e2f).intersection(set(f2e)) # Find the intersection.
# *aligned* is used to check if neighbors are aligned in grow_diag()
aligned = defaultdict(set)
for i, j in alignment:
- aligned["e"].add(i)
- aligned["f"].add(j)
+ aligned['e'].add(i)
+ aligned['f'].add(j)
def grow_diag():
"""
e_new not in aligned and f_new not in aligned
) and neighbor in union:
alignment.add(neighbor)
- aligned["e"].add(e_new)
- aligned["f"].add(f_new)
+ aligned['e'].add(e_new)
+ aligned['f'].add(f_new)
prev_len += 1
no_new_points = False
# iterate until no new points added
and (e_new, f_new) in union
):
alignment.add((e_new, f_new))
- aligned["e"].add(e_new)
- aligned["f"].add(f_new)
+ aligned['e'].add(e_new)
+ aligned['f'].add(f_new)
grow_diag()
final_and(e2f)
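# A hedged usage sketch for the symmetrizer above, assuming the
# grow_diag_final_and(srclen, trglen, e2f, f2e) entry point that wraps
# grow_diag() and final_and(); e2f and f2e are Pharaoh-format strings of
# "srcindex-trgindex" pairs from the two directional aligners.
from nltk.translate.gdfa import grow_diag_final_and

e2f = '0-0 1-1 2-2 3-3'
f2e = '0-0 1-1 2-3 3-2'
srclen = trglen = 4

# The intersection {(0, 0), (1, 1)} is grown towards the union of both
# directions, exactly as the code above describes.
print(grow_diag_final_and(srclen, trglen, e2f, f2e))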
# -*- coding: utf-8 -*-
# Natural Language Toolkit: GLEU Score
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Authors:
# Contributors: Mike Schuster, Michael Wayne Goodman, Liling Tan
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
""" GLEU score implementation. """
-
+from __future__ import division
from collections import Counter
from nltk.util import ngrams, everygrams
263-311.
"""
+from __future__ import division
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import Alignment
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
- self.translation_table = probability_tables["translation_table"]
+ self.translation_table = probability_tables['translation_table']
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
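# A small training sketch for IBM Model 1 on a toy bitext. The corpus and the
# translation_table[trg_word][src_word] lookup are illustrative; only the
# IBMModel1(bitext, iterations) call and the probability_tables branch above
# come from the module itself.
from nltk.translate import AlignedSent
from nltk.translate.ibm1 import IBMModel1

bitext = [
    AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']),
    AlignedSent(['das', 'haus', 'ist', 'ja', 'gross'], ['the', 'house', 'is', 'big']),
    AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']),
]

ibm1 = IBMModel1(bitext, 5)  # 5 EM iterations
print(ibm1.translation_table['buch']['book'])  # learned P('buch' | 'book')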
263-311.
"""
+from __future__ import division
+
import warnings
from collections import defaultdict
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
- self.translation_table = probability_tables["translation_table"]
- self.alignment_table = probability_tables["alignment_table"]
+ self.translation_table = probability_tables['translation_table']
+ self.alignment_table = probability_tables['alignment_table']
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
counts = Model2Counts()
for aligned_sentence in parallel_corpus:
src_sentence = [None] + aligned_sentence.mots
- trg_sentence = ["UNUSED"] + aligned_sentence.words # 1-indexed
+ trg_sentence = ['UNUSED'] + aligned_sentence.words # 1-indexed
l = len(aligned_sentence.mots)
m = len(aligned_sentence.words)
263-311.
"""
+from __future__ import division
+
import warnings
from collections import defaultdict
from math import factorial
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
- self.translation_table = probability_tables["translation_table"]
- self.alignment_table = probability_tables["alignment_table"]
- self.fertility_table = probability_tables["fertility_table"]
- self.p1 = probability_tables["p1"]
- self.distortion_table = probability_tables["distortion_table"]
+ self.translation_table = probability_tables['translation_table']
+ self.alignment_table = probability_tables['alignment_table']
+ self.fertility_table = probability_tables['fertility_table']
+ self.p1 = probability_tables['p1']
+ self.distortion_table = probability_tables['distortion_table']
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model 4
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
263-311.
"""
+from __future__ import division
+
import warnings
from collections import defaultdict
from math import factorial
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
- self.translation_table = probability_tables["translation_table"]
- self.alignment_table = probability_tables["alignment_table"]
- self.fertility_table = probability_tables["fertility_table"]
- self.p1 = probability_tables["p1"]
- self.head_distortion_table = probability_tables["head_distortion_table"]
+ self.translation_table = probability_tables['translation_table']
+ self.alignment_table = probability_tables['alignment_table']
+ self.fertility_table = probability_tables['fertility_table']
+ self.p1 = probability_tables['p1']
+ self.head_distortion_table = probability_tables['head_distortion_table']
self.non_head_distortion_table = probability_tables[
- "non_head_distortion_table"
+ 'non_head_distortion_table'
]
for n in range(0, iterations):
# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model 5
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
263-311.
"""
+from __future__ import division
+
import warnings
from collections import defaultdict
from math import factorial
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
- self.translation_table = probability_tables["translation_table"]
- self.alignment_table = probability_tables["alignment_table"]
- self.fertility_table = probability_tables["fertility_table"]
- self.p1 = probability_tables["p1"]
- self.head_distortion_table = probability_tables["head_distortion_table"]
+ self.translation_table = probability_tables['translation_table']
+ self.alignment_table = probability_tables['alignment_table']
+ self.fertility_table = probability_tables['fertility_table']
+ self.p1 = probability_tables['p1']
+ self.head_distortion_table = probability_tables['head_distortion_table']
self.non_head_distortion_table = probability_tables[
- "non_head_distortion_table"
+ 'non_head_distortion_table'
]
- self.head_vacancy_table = probability_tables["head_vacancy_table"]
- self.non_head_vacancy_table = probability_tables["non_head_vacancy_table"]
+ self.head_vacancy_table = probability_tables['head_vacancy_table']
+ self.non_head_vacancy_table = probability_tables['non_head_vacancy_table']
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
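# The assignments above spell out every table a pre-trained Model 5 expects
# when probability_tables is supplied; a sketch of that dict's shape (values
# are placeholders for the nested tables a previous training run produces):
probability_tables = {
    'translation_table': {},          # t(trg_word | src_word)
    'alignment_table': {},            # Model 2 alignment probabilities
    'fertility_table': {},            # n(fertility | src_word)
    'p1': 0.5,                        # probability of NULL-generated insertion
    'head_distortion_table': {},      # Model 4 head distortion
    'non_head_distortion_table': {},  # Model 4 non-head distortion
    'head_vacancy_table': {},         # Model 5 head vacancy
    'non_head_vacancy_table': {},     # Model 5 non-head vacancy
}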
# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model Core
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""
-
+from __future__ import division
from bisect import insort_left
from collections import defaultdict
from copy import deepcopy
:type i_pegged: int
"""
src_sentence = [None] + sentence_pair.mots
- trg_sentence = ["UNUSED"] + sentence_pair.words # 1-indexed
+ trg_sentence = ['UNUSED'] + sentence_pair.words # 1-indexed
l = len(src_sentence) - 1 # exclude NULL
m = len(trg_sentence) - 1
+++ /dev/null
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: Machine Translation
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Uday Krishna <udaykrishna5@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-
-from nltk.stem.porter import PorterStemmer
-from nltk.corpus import wordnet
-from itertools import chain, product
-
-
-def _generate_enums(hypothesis, reference, preprocess=str.lower):
- """
- Takes in string inputs for hypothesis and reference and returns
- enumerated word lists for each of them
-
- :param hypothesis: hypothesis string
- :type hypothesis: str
- :param reference: reference string
- :type reference: str
- :preprocess: preprocessing method (default str.lower)
- :type preprocess: method
- :return: enumerated words list
- :rtype: list of 2D tuples, list of 2D tuples
- """
- hypothesis_list = list(enumerate(preprocess(hypothesis).split()))
- reference_list = list(enumerate(preprocess(reference).split()))
- return hypothesis_list, reference_list
-
-
-def exact_match(hypothesis, reference):
- """
- matches exact words in hypothesis and reference
- and returns a word mapping based on the enumerated
- word id between hypothesis and reference
-
- :param hypothesis: hypothesis string
- :type hypothesis: str
- :param reference: reference string
- :type reference: str
- :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
- enumerated unmatched reference tuples
- :rtype: list of 2D tuples, list of 2D tuples, list of 2D tuples
- """
- hypothesis_list, reference_list = _generate_enums(hypothesis, reference)
- return _match_enums(hypothesis_list, reference_list)
-
-
-def _match_enums(enum_hypothesis_list, enum_reference_list):
- """
- matches exact words in hypothesis and reference and returns
- a word mapping between enum_hypothesis_list and enum_reference_list
- based on the enumerated word id.
-
- :param enum_hypothesis_list: enumerated hypothesis list
- :type enum_hypothesis_list: list of tuples
- :param enum_reference_list: enumerated reference list
- :type enum_reference_list: list of 2D tuples
- :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
- enumerated unmatched reference tuples
- :rtype: list of 2D tuples, list of 2D tuples, list of 2D tuples
- """
- word_match = []
- for i in range(len(enum_hypothesis_list))[::-1]:
- for j in range(len(enum_reference_list))[::-1]:
- if enum_hypothesis_list[i][1] == enum_reference_list[j][1]:
- word_match.append(
- (enum_hypothesis_list[i][0], enum_reference_list[j][0])
- )
- (enum_hypothesis_list.pop(i)[1], enum_reference_list.pop(j)[1])
- break
- return word_match, enum_hypothesis_list, enum_reference_list
-
-
-def _enum_stem_match(
- enum_hypothesis_list, enum_reference_list, stemmer=PorterStemmer()
-):
- """
- Stems each word and matches them in hypothesis and reference
- and returns a word mapping between enum_hypothesis_list and
- enum_reference_list based on the enumerated word id. The function also
- returns a enumerated list of unmatched words for hypothesis and reference.
-
- :param enum_hypothesis_list:
- :type enum_hypothesis_list:
- :param enum_reference_list:
- :type enum_reference_list:
- :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
- :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
- :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
- enumerated unmatched reference tuples
- :rtype: list of 2D tuples, list of 2D tuples, list of 2D tuples
- """
- stemmed_enum_list1 = [
- (word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_hypothesis_list
- ]
-
- stemmed_enum_list2 = [
- (word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_reference_list
- ]
-
- word_match, enum_unmat_hypo_list, enum_unmat_ref_list = _match_enums(
- stemmed_enum_list1, stemmed_enum_list2
- )
-
- enum_unmat_hypo_list = (
- list(zip(*enum_unmat_hypo_list)) if len(enum_unmat_hypo_list) > 0 else []
- )
-
- enum_unmat_ref_list = (
- list(zip(*enum_unmat_ref_list)) if len(enum_unmat_ref_list) > 0 else []
- )
-
- enum_hypothesis_list = list(
- filter(lambda x: x[0] not in enum_unmat_hypo_list, enum_hypothesis_list)
- )
-
- enum_reference_list = list(
- filter(lambda x: x[0] not in enum_unmat_ref_list, enum_reference_list)
- )
-
- return word_match, enum_hypothesis_list, enum_reference_list
-
-
-def stem_match(hypothesis, reference, stemmer=PorterStemmer()):
- """
- Stems each word and matches them in hypothesis and reference
- and returns a word mapping between hypothesis and reference
-
- :param hypothesis:
- :type hypothesis:
- :param reference:
- :type reference:
- :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
- :type stemmer: nltk.stem.api.StemmerI or any class that
- implements a stem method
- :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
- enumerated unmatched reference tuples
- :rtype: list of 2D tuples, list of 2D tuples, list of 2D tuples
- """
- enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
- return _enum_stem_match(enum_hypothesis_list, enum_reference_list, stemmer=stemmer)
-
-
-def _enum_wordnetsyn_match(enum_hypothesis_list, enum_reference_list, wordnet=wordnet):
- """
- Matches each word in reference to a word in hypothesis
- if any synonym of a hypothesis word is the exact match
- to the reference word.
-
- :param enum_hypothesis_list: enumerated hypothesis list
- :param enum_reference_list: enumerated reference list
- :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
- :type wordnet: WordNetCorpusReader
- :return: list of matched tuples, unmatched hypothesis list, unmatched reference list
- :rtype: list of tuples, list of tuples, list of tuples
-
- """
- word_match = []
- for i in range(len(enum_hypothesis_list))[::-1]:
- hypothesis_syns = set(
- chain(
- *[
- [
- lemma.name()
- for lemma in synset.lemmas()
- if lemma.name().find("_") < 0
- ]
- for synset in wordnet.synsets(enum_hypothesis_list[i][1])
- ]
- )
- ).union({enum_hypothesis_list[i][1]})
- for j in range(len(enum_reference_list))[::-1]:
- if enum_reference_list[j][1] in hypothesis_syns:
- word_match.append(
- (enum_hypothesis_list[i][0], enum_reference_list[j][0])
- )
- enum_hypothesis_list.pop(i), enum_reference_list.pop(j)
- break
- return word_match, enum_hypothesis_list, enum_reference_list
-
-
-def wordnetsyn_match(hypothesis, reference, wordnet=wordnet):
- """
- Matches each word in reference to a word in hypothesis if any synonym
- of a hypothesis word is the exact match to the reference word.
-
- :param hypothesis: hypothesis string
- :param reference: reference string
- :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
- :type wordnet: WordNetCorpusReader
- :return: list of mapped tuples
- :rtype: list of tuples
- """
- enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
- return _enum_wordnetsyn_match(
- enum_hypothesis_list, enum_reference_list, wordnet=wordnet
- )
-
-
-def _enum_allign_words(
- enum_hypothesis_list, enum_reference_list, stemmer=PorterStemmer(), wordnet=wordnet
-):
- """
- Aligns/matches words in the hypothesis to reference by sequentially
- applying exact match, stemmed match and wordnet based synonym match.
- in case there are multiple matches the match which has the least number
- of crossing is chosen. Takes enumerated list as input instead of
- string input
-
- :param enum_hypothesis_list: enumerated hypothesis list
- :param enum_reference_list: enumerated reference list
- :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
- :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
- :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
- :type wordnet: WordNetCorpusReader
- :return: sorted list of matched tuples, unmatched hypothesis list,
- unmatched reference list
- :rtype: list of tuples, list of tuples, list of tuples
- """
- exact_matches, enum_hypothesis_list, enum_reference_list = _match_enums(
- enum_hypothesis_list, enum_reference_list
- )
-
- stem_matches, enum_hypothesis_list, enum_reference_list = _enum_stem_match(
- enum_hypothesis_list, enum_reference_list, stemmer=stemmer
- )
-
- wns_matches, enum_hypothesis_list, enum_reference_list = _enum_wordnetsyn_match(
- enum_hypothesis_list, enum_reference_list, wordnet=wordnet
- )
-
- return (
- sorted(
- exact_matches + stem_matches + wns_matches, key=lambda wordpair: wordpair[0]
- ),
- enum_hypothesis_list,
- enum_reference_list,
- )
-
-
-def allign_words(hypothesis, reference, stemmer=PorterStemmer(), wordnet=wordnet):
- """
- Aligns/matches words in the hypothesis to reference by sequentially
- applying exact match, stemmed match and wordnet based synonym match.
- In case there are multiple matches the match which has the least number
- of crossing is chosen.
-
- :param hypothesis: hypothesis string
- :param reference: reference string
- :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
- :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
- :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
- :type wordnet: WordNetCorpusReader
- :return: sorted list of matched tuples, unmatched hypothesis list, unmatched reference list
- :rtype: list of tuples, list of tuples, list of tuples
- """
- enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
- return _enum_allign_words(
- enum_hypothesis_list, enum_reference_list, stemmer=stemmer, wordnet=wordnet
- )
-
-
-def _count_chunks(matches):
- """
- Counts the fewest possible number of chunks such that matched unigrams
- of each chunk are adjacent to each other. This is used to caluclate the
- fragmentation part of the metric.
-
- :param matches: list containing a mapping of matched words (output of allign_words)
- :return: Number of chunks a sentence is divided into post allignment
- :rtype: int
- """
- i = 0
- chunks = 1
- while i < len(matches) - 1:
- if (matches[i + 1][0] == matches[i][0] + 1) and (
- matches[i + 1][1] == matches[i][1] + 1
- ):
- i += 1
- continue
- i += 1
- chunks += 1
- return chunks
-
-
-def single_meteor_score(
- reference,
- hypothesis,
- preprocess=str.lower,
- stemmer=PorterStemmer(),
- wordnet=wordnet,
- alpha=0.9,
- beta=3,
- gamma=0.5,
-):
- """
- Calculates METEOR score for single hypothesis and reference as per
- "Meteor: An Automatic Metric for MT Evaluation with HighLevels of
- Correlation with Human Judgments" by Alon Lavie and Abhaya Agarwal,
- in Proceedings of ACL.
- http://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf
-
-
- >>> hypothesis1 = 'It is a guide to action which ensures that the military always obeys the commands of the party'
-
- >>> reference1 = 'It is a guide to action that ensures that the military will forever heed Party commands'
-
-
- >>> round(single_meteor_score(reference1, hypothesis1),4)
- 0.7398
-
- If there is no words match during the alignment the method returns the
- score as 0. We can safely return a zero instead of raising a
- division by zero error as no match usually implies a bad translation.
-
- >>> round(meteor_score('this is a cat', 'non matching hypothesis'),4)
- 0.0
-
- :param references: reference sentences
- :type references: list(str)
- :param hypothesis: a hypothesis sentence
- :type hypothesis: str
- :param preprocess: preprocessing function (default str.lower)
- :type preprocess: method
- :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
- :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
- :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
- :type wordnet: WordNetCorpusReader
- :param alpha: parameter for controlling relative weights of precision and recall.
- :type alpha: float
- :param beta: parameter for controlling shape of penalty as a
- function of as a function of fragmentation.
- :type beta: float
- :param gamma: relative weight assigned to fragmentation penality.
- :type gamma: float
- :return: The sentence-level METEOR score.
- :rtype: float
- """
- enum_hypothesis, enum_reference = _generate_enums(
- hypothesis, reference, preprocess=preprocess
- )
- translation_length = len(enum_hypothesis)
- reference_length = len(enum_reference)
- matches, _, _ = _enum_allign_words(enum_hypothesis, enum_reference, stemmer=stemmer)
- matches_count = len(matches)
- try:
- precision = float(matches_count) / translation_length
- recall = float(matches_count) / reference_length
- fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
- chunk_count = float(_count_chunks(matches))
- frag_frac = chunk_count / matches_count
- except ZeroDivisionError:
- return 0.0
- penalty = gamma * frag_frac ** beta
- return (1 - penalty) * fmean
-
-
-def meteor_score(
- references,
- hypothesis,
- preprocess=str.lower,
- stemmer=PorterStemmer(),
- wordnet=wordnet,
- alpha=0.9,
- beta=3,
- gamma=0.5,
-):
- """
- Calculates METEOR score for hypothesis with multiple references as
- described in "Meteor: An Automatic Metric for MT Evaluation with
- HighLevels of Correlation with Human Judgments" by Alon Lavie and
- Abhaya Agarwal, in Proceedings of ACL.
- http://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf
-
-
- In case of multiple references the best score is chosen. This method
- iterates over single_meteor_score and picks the best pair among all
- the references for a given hypothesis
-
- >>> hypothesis1 = 'It is a guide to action which ensures that the military always obeys the commands of the party'
- >>> hypothesis2 = 'It is to insure the troops forever hearing the activity guidebook that party direct'
-
- >>> reference1 = 'It is a guide to action that ensures that the military will forever heed Party commands'
- >>> reference2 = 'It is the guiding principle which guarantees the military forces always being under the command of the Party'
- >>> reference3 = 'It is the practical guide for the army always to heed the directions of the party'
-
- >>> round(meteor_score([reference1, reference2, reference3], hypothesis1),4)
- 0.7398
-
- If there is no words match during the alignment the method returns the
- score as 0. We can safely return a zero instead of raising a
- division by zero error as no match usually implies a bad translation.
-
- >>> round(meteor_score(['this is a cat'], 'non matching hypothesis'),4)
- 0.0
-
- :param references: reference sentences
- :type references: list(str)
- :param hypothesis: a hypothesis sentence
- :type hypothesis: str
- :param preprocess: preprocessing function (default str.lower)
- :type preprocess: method
- :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
- :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
- :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
- :type wordnet: WordNetCorpusReader
- :param alpha: parameter for controlling relative weights of precision and recall.
- :type alpha: float
- :param beta: parameter for controlling shape of penalty as a function
- of as a function of fragmentation.
- :type beta: float
- :param gamma: relative weight assigned to fragmentation penality.
- :type gamma: float
- :return: The sentence-level METEOR score.
- :rtype: float
- """
- return max(
- [
- single_meteor_score(
- reference,
- hypothesis,
- stemmer=stemmer,
- wordnet=wordnet,
- alpha=alpha,
- beta=beta,
- gamma=gamma,
- )
- for reference in references
- ]
- )
# Natural Language Toolkit: Translation metrics
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Will Zhang <wilzzha@gmail.com>
# Guan Gui <ggui@student.unimelb.edu.au>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import division
def alignment_error_rate(reference, hypothesis, possible=None):
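# A usage sketch for the metric defined here. With possible left as None the
# possible set defaults to the sure reference links, so
# AER = 1 - (|A & S| + |A & P|) / (|A| + |S|); the toy alignments below are
# illustrative.
from nltk.translate import Alignment
from nltk.translate.metrics import alignment_error_rate

reference = Alignment([(0, 0), (1, 1), (2, 2)])   # sure links S
hypothesis = Alignment([(0, 0), (1, 2), (2, 1)])  # system output A

# Only (0, 0) is shared, so AER = 1 - (1 + 1) / (3 + 3) = 2/3.
print(alignment_error_rate(reference, hypothesis))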
# -*- coding: utf-8 -*-
# Natural Language Toolkit: NIST Score
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Authors:
# Contributors:
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""NIST score implementation."""
+from __future__ import division
import math
import fractions
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Phrase Extraction Algorithm
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Authors: Liling Tan, Fredrik Hedman, Petra Barancikova
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
:type f_start: int
:param f_start: Starting index of the possible foreign language phrases
:type f_end: int
- :param f_end: End index of the possible foreign language phrases
+ :param f_end: Starting index of the possible foreign language phrases
:type e_start: int
:param e_start: Starting index of the possible source language phrases
:type e_end: int
- :param e_end: End index of the possible source language phrases
+ :param e_end: Starting index of the possible source language phrases
:type srctext: list
:param srctext: The source language tokens, a list of string.
:type trgtext: list
trg_phrase = " ".join(trgtext[fs : fe + 1])
# Include more data for later ordering.
phrases.add(
- ((e_start, e_end + 1), (fs, fe + 1), src_phrase, trg_phrase)
+ ((e_start, e_end + 1), (f_start, f_end + 1), src_phrase, trg_phrase)
)
fe += 1
- if fe in f_aligned or fe >= trglen:
+ if fe in f_aligned or fe == trglen:
break
fs -= 1
if fs in f_aligned or fs < 0:
...
((0, 1), (0, 1), 'michael', 'michael')
((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus')
- ((0, 2), (0, 5), 'michael assumes', 'michael geht davon aus ,')
+ ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus ,')
((0, 3), (0, 6), 'michael assumes that', 'michael geht davon aus , dass')
((0, 4), (0, 7), 'michael assumes that he', 'michael geht davon aus , dass er')
((0, 9), (0, 10), 'michael assumes that he will stay in the house', 'michael geht davon aus , dass er im haus bleibt')
((1, 2), (1, 4), 'assumes', 'geht davon aus')
- ((1, 2), (1, 5), 'assumes', 'geht davon aus ,')
+ ((1, 2), (1, 4), 'assumes', 'geht davon aus ,')
((1, 3), (1, 6), 'assumes that', 'geht davon aus , dass')
((1, 4), (1, 7), 'assumes that he', 'geht davon aus , dass er')
((1, 9), (1, 10), 'assumes that he will stay in the house', 'geht davon aus , dass er im haus bleibt')
- ((2, 3), (4, 6), 'that', ', dass')
+ ((2, 3), (5, 6), 'that', ', dass')
((2, 3), (5, 6), 'that', 'dass')
- ((2, 4), (4, 7), 'that he', ', dass er')
+ ((2, 4), (5, 7), 'that he', ', dass er')
((2, 4), (5, 7), 'that he', 'dass er')
- ((2, 9), (4, 10), 'that he will stay in the house', ', dass er im haus bleibt')
+ ((2, 9), (5, 10), 'that he will stay in the house', ', dass er im haus bleibt')
((2, 9), (5, 10), 'that he will stay in the house', 'dass er im haus bleibt')
((3, 4), (6, 7), 'he', 'er')
((3, 9), (6, 10), 'he will stay in the house', 'er im haus bleibt')
:param srctext: The sentence string from the source language.
:type trgtext: str
:param trgtext: The sentence string from the target language.
- :type alignment: list(tuple)
+ :type alignment: str
:param alignment: The word alignment outputs as list of tuples, where
the first elements of tuples are the source words' indices and
second elements are the target words' indices. This is also the output
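# A minimal call matching the doctest above: srctext/trgtext are whitespace-
# tokenizable sentence strings and alignment is a list of
# (source_index, target_index) tuples; the toy sentence pair is illustrative.
from nltk.translate.phrase_based import phrase_extraction

srctext = "michael assumes"
trgtext = "michael geht davon aus"
alignment = [(0, 0), (1, 1), (1, 2), (1, 3)]

# Each extracted item is ((src_start, src_end), (trg_start, trg_end),
# src_phrase, trg_phrase), as in the doctest output above.
for phrase in sorted(phrase_extraction(srctext, trgtext, alignment)):
    print(phrase)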
# -*- coding: utf-8 -*-
# Natural Language Toolkit: RIBES Score
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Contributors: Katsuhito Sudoh, Liling Tan, Kasramvd, J.F.Sebastian
# Mark Byers, ekhumoro, P. Ortiz
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
""" RIBES score implementation """
-
+from __future__ import division
from itertools import islice
import math
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Stack decoder
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
if not stacks[sentence_length]:
warnings.warn(
- "Unable to translate all words. "
- "The source sentence contains words not in "
- "the phrase table"
+ 'Unable to translate all words. '
+ 'The source sentence contains words not in '
+ 'the phrase table'
)
# Instead of returning empty output, perhaps a partial
# translation could be returned
subsequence covering positions 2, 3, and 4.
:rtype: dict(int: (dict(int): float))
"""
- scores = defaultdict(lambda: defaultdict(lambda: float("-inf")))
+ scores = defaultdict(lambda: defaultdict(lambda: float('-inf')))
for seq_length in range(1, len(src_sentence) + 1):
for start in range(0, len(src_sentence) - seq_length + 1):
end = start + seq_length
self.items = []
if beam_threshold == 0.0:
- self.__log_beam_threshold = float("-inf")
+ self.__log_beam_threshold = float('-inf')
else:
self.__log_beam_threshold = log(beam_threshold)
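# An end-to-end sketch of the decoder. The PhraseTable.add(src, trg, log_prob)
# call and the duck-typed language model exposing probability() and
# probability_change() mirror the module's own doctest, but the toy
# probabilities and the stub class are assumptions, not fixtures of the API.
from math import log
from collections import defaultdict
from nltk.translate import PhraseTable, StackDecoder

phrase_table = PhraseTable()
phrase_table.add(('niemand',), ('nobody',), log(0.8))
phrase_table.add(('kam',), ('came',), log(0.7))

language_prob = defaultdict(lambda: -999.0)
language_prob[('nobody',)] = log(0.5)
language_prob[('came',)] = log(0.4)

class LanguageModelStub:
    # Minimal interface the decoder queries while expanding hypotheses.
    def probability_change(self, context, phrase):
        return language_prob[phrase]
    def probability(self, phrase):
        return language_prob[phrase]

decoder = StackDecoder(phrase_table, LanguageModelStub())
print(decoder.translate(['niemand', 'kam']))  # should come out as ['nobody', 'came']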
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Text Trees
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Peter Ljunglöf <peter.ljunglof@gu.se>
Class for representing hierarchical language structures, such as
syntax trees and morphological trees.
"""
+from __future__ import print_function, unicode_literals
import re
-import sys
from abc import ABCMeta, abstractmethod
+from six import string_types, add_metaclass
from nltk.grammar import Production, Nonterminal
from nltk.probability import ProbabilisticMixIn
from nltk.util import slice_bounds
+from nltk.compat import python_2_unicode_compatible, unicode_repr
from nltk.internals import raise_unorderable_types
# TODO: add LabelledTree (can be used for dependency trees)
######################################################################
-
+@python_2_unicode_compatible
class Tree(list):
"""
A Tree represents a hierarchical grouping of leaves and subtrees.
raise TypeError(
"%s: Expected a node value and child list " % type(self).__name__
)
- elif isinstance(children, str):
+ elif isinstance(children, string_types):
raise TypeError(
"%s() argument 2 should be a list, not a "
"string" % type(self).__name__
# ////////////////////////////////////////////////////////////
def __mul__(self, v):
- raise TypeError("Tree does not support multiplication")
+ raise TypeError('Tree does not support multiplication')
def __rmul__(self, v):
- raise TypeError("Tree does not support multiplication")
+ raise TypeError('Tree does not support multiplication')
def __add__(self, v):
- raise TypeError("Tree does not support addition")
+ raise TypeError('Tree does not support addition')
def __radd__(self, v):
- raise TypeError("Tree does not support addition")
+ raise TypeError('Tree does not support addition')
# ////////////////////////////////////////////////////////////
# Indexing (with support for tree positions)
return list.__setitem__(self, index, value)
elif isinstance(index, (list, tuple)):
if len(index) == 0:
- raise IndexError("The tree position () may not be " "assigned to.")
+ raise IndexError('The tree position () may not be ' 'assigned to.')
elif len(index) == 1:
self[index[0]] = value
else:
return list.__delitem__(self, index)
elif isinstance(index, (list, tuple)):
if len(index) == 0:
- raise IndexError("The tree position () may not be deleted.")
+ raise IndexError('The tree position () may not be deleted.')
elif len(index) == 1:
del self[index[0]]
else:
max_child_height = max(max_child_height, 1)
return 1 + max_child_height
- def treepositions(self, order="preorder"):
+ def treepositions(self, order='preorder'):
"""
>>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
>>> t.treepositions() # doctest: +ELLIPSIS
``leaves``.
"""
positions = []
- if order in ("preorder", "bothorder"):
+ if order in ('preorder', 'bothorder'):
positions.append(())
for i, child in enumerate(self):
if isinstance(child, Tree):
positions.extend((i,) + p for p in childpos)
else:
positions.append((i,))
- if order in ("postorder", "bothorder"):
+ if order in ('postorder', 'bothorder'):
positions.append(())
return positions
:rtype: list(Production)
"""
- if not isinstance(self._label, str):
+ if not isinstance(self._label, string_types):
raise TypeError(
- "Productions can only be generated from trees having node labels that are strings"
+ 'Productions can only be generated from trees having node labels that are strings'
)
prods = [Production(Nonterminal(self._label), _child_names(self))]
leaves, or if ``index<0``.
"""
if index < 0:
- raise IndexError("index must be non-negative")
+ raise IndexError('index must be non-negative')
stack = [(self, ())]
while stack:
for i in range(len(value) - 1, -1, -1):
stack.append((value[i], treepos + (i,)))
- raise IndexError("index must be less than or equal to len(self)")
+ raise IndexError('index must be less than or equal to len(self)')
def treeposition_spanning_leaves(self, start, end):
"""
:raise ValueError: if ``end <= start``
"""
if end <= start:
- raise ValueError("end must be greater than start")
+ raise ValueError('end must be greater than start')
# Find the tree positions of the start & end leaves, and
# take the longest common subsequence.
start_treepos = self.leaf_treeposition(start)
else:
return tree
- def __copy__(self):
- return self.copy()
-
- def __deepcopy__(self, memo):
- return self.copy(deep=True)
-
def copy(self, deep=False):
if not deep:
return type(self)(self._label, self)
newcopy = frozen_class.convert(self)
else:
newcopy = self.copy(deep=True)
- for pos in newcopy.treepositions("leaves"):
+ for pos in newcopy.treepositions('leaves'):
newcopy[pos] = leaf_freezer(newcopy[pos])
newcopy = frozen_class.convert(newcopy)
hash(newcopy) # Make sure the leaves are hashable.
def fromstring(
cls,
s,
- brackets="()",
+ brackets='()',
read_node=None,
read_leaf=None,
node_pattern=None,
then it will return a tree of that type.
:rtype: Tree
"""
- if not isinstance(brackets, str) or len(brackets) != 2:
- raise TypeError("brackets must be a length-2 string")
- if re.search("\s", brackets):
- raise TypeError("whitespace brackets not allowed")
+ if not isinstance(brackets, string_types) or len(brackets) != 2:
+ raise TypeError('brackets must be a length-2 string')
+ if re.search('\s', brackets):
+ raise TypeError('whitespace brackets not allowed')
# Construct a regexp that will tokenize the string.
open_b, close_b = brackets
open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b))
if node_pattern is None:
- node_pattern = "[^\s%s%s]+" % (open_pattern, close_pattern)
+ node_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
if leaf_pattern is None:
- leaf_pattern = "[^\s%s%s]+" % (open_pattern, close_pattern)
+ leaf_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
token_re = re.compile(
- "%s\s*(%s)?|%s|(%s)"
+ '%s\s*(%s)?|%s|(%s)'
% (open_pattern, node_pattern, close_pattern, leaf_pattern)
)
# Walk through each token, updating a stack of trees.
# Beginning of a tree/subtree
if token[0] == open_b:
if len(stack) == 1 and len(stack[0][1]) > 0:
- cls._parse_error(s, match, "end-of-string")
+ cls._parse_error(s, match, 'end-of-string')
label = token[1:].lstrip()
if read_node is not None:
label = read_node(label)
if len(stack[0][1]) == 0:
cls._parse_error(s, match, open_b)
else:
- cls._parse_error(s, match, "end-of-string")
+ cls._parse_error(s, match, 'end-of-string')
label, children = stack.pop()
stack[-1][1].append(cls(label, children))
# Leaf node
# check that we got exactly one complete tree.
if len(stack) > 1:
- cls._parse_error(s, "end-of-string", close_b)
+ cls._parse_error(s, 'end-of-string', close_b)
elif len(stack[0][1]) == 0:
- cls._parse_error(s, "end-of-string", open_b)
+ cls._parse_error(s, 'end-of-string', open_b)
else:
assert stack[0][0] is None
assert len(stack[0][1]) == 1
# If the tree has an extra level with node='', then get rid of
# it. E.g.: "((S (NP ...) (VP ...)))"
- if remove_empty_top_bracketing and tree._label == "" and len(tree) == 1:
+ if remove_empty_top_bracketing and tree._label == '' and len(tree) == 1:
tree = tree[0]
# return the tree.
return tree
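# The most common entry point for the parser above: Tree.fromstring() on a
# bracketed treebank-style string, then the usual accessors on the result.
from nltk.tree import Tree

t = Tree.fromstring("(S (NP (DT the) (NN dog)) (VP (VBD barked)))")
print(t.label())                   # 'S'
print(t.leaves())                  # ['the', 'dog', 'barked']
print(t[0])                        # the NP subtree
print(t.treepositions('leaves'))   # positions usable as t[pos] indices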
:param expecting: what we expected to see instead.
"""
# Construct a basic error message
- if match == "end-of-string":
- pos, token = len(s), "end-of-string"
+ if match == 'end-of-string':
+ pos, token = len(s), 'end-of-string'
else:
pos, token = match.start(), match.group()
- msg = "%s.read(): expected %r but got %r\n%sat index %d." % (
+ msg = '%s.read(): expected %r but got %r\n%sat index %d.' % (
cls.__name__,
expecting,
token,
- " " * 12,
+ ' ' * 12,
pos,
)
# Add a display showing the error token itself:
- s = s.replace("\n", " ").replace("\t", " ")
+ s = s.replace('\n', ' ').replace('\t', ' ')
offset = pos
if len(s) > pos + 10:
- s = s[: pos + 10] + "..."
+ s = s[: pos + 10] + '...'
if pos > 10:
- s = "..." + s[pos - 10 :]
+ s = '...' + s[pos - 10 :]
offset = 13
- msg += '\n%s"%s"\n%s^' % (" " * 16, s, " " * (17 + offset))
+ msg += '\n%s"%s"\n%s^' % (' ' * 16, s, ' ' * (17 + offset))
raise ValueError(msg)
# ////////////////////////////////////////////////////////////
print(TreePrettyPrinter(self, sentence, highlight).text(**kwargs), file=stream)
def __repr__(self):
- childstr = ", ".join(repr(c) for c in self)
- return "%s(%s, [%s])" % (
+ childstr = ", ".join(unicode_repr(c) for c in self)
+ return '%s(%s, [%s])' % (
type(self).__name__,
- repr(self._label),
+ unicode_repr(self._label),
childstr,
)
_canvas_frame.add_widget(widget)
x, y, w, h = widget.bbox()
# print_to_file uses scrollregion to set the width and height of the pdf.
- _canvas_frame.canvas()["scrollregion"] = (0, 0, w, h)
+ _canvas_frame.canvas()['scrollregion'] = (0, 0, w, h)
with tempfile.NamedTemporaryFile() as file:
- in_path = "{0:}.ps".format(file.name)
- out_path = "{0:}.png".format(file.name)
+ in_path = '{0:}.ps'.format(file.name)
+ out_path = '{0:}.png'.format(file.name)
_canvas_frame.print_to_file(in_path)
_canvas_frame.destroy_widget(widget)
- try:
- subprocess.call(
- [
- find_binary(
- "gs",
- binary_names=["gswin32c.exe", "gswin64c.exe"],
- env_vars=["PATH"],
- verbose=False,
- )
- ]
- + "-q -dEPSCrop -sDEVICE=png16m -r90 -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dSAFER -dBATCH -dNOPAUSE -sOutputFile={0:} {1:}".format(
- out_path, in_path
- ).split()
- )
- except LookupError:
- pre_error_message = str(
- "The Ghostscript executable isn't found.\n"
- "See http://web.mit.edu/ghostscript/www/Install.htm\n"
- "If you're using a Mac, you can try installing\n"
- "https://docs.brew.sh/Installation then `brew install ghostscript`"
- )
- print(pre_error_message, file=sys.stderr)
- raise LookupError
-
- with open(out_path, "rb") as sr:
+ subprocess.call(
+ [
+ find_binary(
+ 'gs',
+ binary_names=['gswin32c.exe', 'gswin64c.exe'],
+ env_vars=['PATH'],
+ verbose=False,
+ )
+ ]
+ + '-q -dEPSCrop -sDEVICE=png16m -r90 -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dSAFER -dBATCH -dNOPAUSE -sOutputFile={0:} {1:}'.format(
+ out_path, in_path
+ ).split()
+ )
+ with open(out_path, 'rb') as sr:
res = sr.read()
os.remove(in_path)
os.remove(out_path)
stream = None
print(self.pformat(**kwargs), file=stream)
- def pformat(self, margin=70, indent=0, nodesep="", parens="()", quotes=False):
+ def pformat(self, margin=70, indent=0, nodesep='', parens='()', quotes=False):
"""
:return: A pretty-printed string representation of this tree.
:rtype: str
return s
# If it doesn't fit on one line, then write it on multi-lines.
- if isinstance(self._label, str):
- s = "%s%s%s" % (parens[0], self._label, nodesep)
+ if isinstance(self._label, string_types):
+ s = '%s%s%s' % (parens[0], self._label, nodesep)
else:
- s = "%s%s%s" % (parens[0], repr(self._label), nodesep)
+ s = '%s%s%s' % (parens[0], unicode_repr(self._label), nodesep)
for child in self:
if isinstance(child, Tree):
s += (
- "\n"
- + " " * (indent + 2)
+ '\n'
+ + ' ' * (indent + 2)
+ child.pformat(margin, indent + 2, nodesep, parens, quotes)
)
elif isinstance(child, tuple):
- s += "\n" + " " * (indent + 2) + "/".join(child)
- elif isinstance(child, str) and not quotes:
- s += "\n" + " " * (indent + 2) + "%s" % child
+ s += '\n' + ' ' * (indent + 2) + "/".join(child)
+ elif isinstance(child, string_types) and not quotes:
+ s += '\n' + ' ' * (indent + 2) + '%s' % child
else:
- s += "\n" + " " * (indent + 2) + repr(child)
+ s += '\n' + ' ' * (indent + 2) + unicode_repr(child)
return s + parens[1]
def pformat_latex_qtree(self):
:return: A latex qtree representation of this tree.
:rtype: str
"""
- reserved_chars = re.compile("([#\$%&~_\{\}])")
+ reserved_chars = re.compile('([#\$%&~_\{\}])')
- pformat = self.pformat(indent=6, nodesep="", parens=("[.", " ]"))
- return r"\Tree " + re.sub(reserved_chars, r"\\\1", pformat)
+ pformat = self.pformat(indent=6, nodesep='', parens=('[.', ' ]'))
+ return r'\Tree ' + re.sub(reserved_chars, r'\\\1', pformat)
def _pformat_flat(self, nodesep, parens, quotes):
childstrs = []
childstrs.append(child._pformat_flat(nodesep, parens, quotes))
elif isinstance(child, tuple):
childstrs.append("/".join(child))
- elif isinstance(child, str) and not quotes:
- childstrs.append("%s" % child)
+ elif isinstance(child, string_types) and not quotes:
+ childstrs.append('%s' % child)
else:
- childstrs.append(repr(child))
- if isinstance(self._label, str):
- return "%s%s%s %s%s" % (
+ childstrs.append(unicode_repr(child))
+ if isinstance(self._label, string_types):
+ return '%s%s%s %s%s' % (
parens[0],
self._label,
nodesep,
parens[1],
)
else:
- return "%s%s%s %s%s" % (
+ return '%s%s%s %s%s' % (
parens[0],
- repr(self._label),
+ unicode_repr(self._label),
nodesep,
" ".join(childstrs),
parens[1],
)
def __setitem__(self, index, value):
- raise ValueError("%s may not be modified" % type(self).__name__)
+ raise ValueError('%s may not be modified' % type(self).__name__)
def __setslice__(self, i, j, value):
- raise ValueError("%s may not be modified" % type(self).__name__)
+ raise ValueError('%s may not be modified' % type(self).__name__)
def __delitem__(self, index):
- raise ValueError("%s may not be modified" % type(self).__name__)
+ raise ValueError('%s may not be modified' % type(self).__name__)
def __delslice__(self, i, j):
- raise ValueError("%s may not be modified" % type(self).__name__)
+ raise ValueError('%s may not be modified' % type(self).__name__)
def __iadd__(self, other):
- raise ValueError("%s may not be modified" % type(self).__name__)
+ raise ValueError('%s may not be modified' % type(self).__name__)
def __imul__(self, other):
- raise ValueError("%s may not be modified" % type(self).__name__)
+ raise ValueError('%s may not be modified' % type(self).__name__)
def append(self, v):
- raise ValueError("%s may not be modified" % type(self).__name__)
+ raise ValueError('%s may not be modified' % type(self).__name__)
def extend(self, v):
- raise ValueError("%s may not be modified" % type(self).__name__)
+ raise ValueError('%s may not be modified' % type(self).__name__)
def pop(self, v=None):
- raise ValueError("%s may not be modified" % type(self).__name__)
+ raise ValueError('%s may not be modified' % type(self).__name__)
def remove(self, v):
- raise ValueError("%s may not be modified" % type(self).__name__)
+ raise ValueError('%s may not be modified' % type(self).__name__)
def reverse(self):
- raise ValueError("%s may not be modified" % type(self).__name__)
+ raise ValueError('%s may not be modified' % type(self).__name__)
def sort(self):
- raise ValueError("%s may not be modified" % type(self).__name__)
+ raise ValueError('%s may not be modified' % type(self).__name__)
def __hash__(self):
return self._hash
Set the node label. This will only succeed the first time the
node label is set, which should occur in ImmutableTree.__init__().
"""
- if hasattr(self, "_label"):
- raise ValueError("%s may not be modified" % type(self).__name__)
+ if hasattr(self, '_label'):
+ raise ValueError('%s may not be modified' % type(self).__name__)
self._label = value
######################################################################
## Parented trees
######################################################################
-class AbstractParentedTree(Tree, metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class AbstractParentedTree(Tree):
"""
An abstract base class for a ``Tree`` that automatically maintains
pointers to parent nodes. These parent pointers are updated
if index < 0:
index += len(self)
if index < 0:
- raise IndexError("index out of range")
+ raise IndexError('index out of range')
# Clear the child's parent pointer.
if isinstance(self[index], Tree):
self._delparent(self[index], index)
elif isinstance(index, (list, tuple)):
# del ptree[()]
if len(index) == 0:
- raise IndexError("The tree position () may not be deleted.")
+ raise IndexError('The tree position () may not be deleted.')
# del ptree[(i,)]
elif len(index) == 1:
del self[index[0]]
if index < 0:
index += len(self)
if index < 0:
- raise IndexError("index out of range")
+ raise IndexError('index out of range')
# if the value is not changing, do nothing.
if value is self[index]:
return
elif isinstance(index, (list, tuple)):
# ptree[()] = value
if len(index) == 0:
- raise IndexError("The tree position () may not be assigned to.")
+ raise IndexError('The tree position () may not be assigned to.')
# ptree[(i,)] = value
elif len(index) == 1:
self[index[0]] = value
if index < 0:
index += len(self)
if index < 0:
- raise IndexError("index out of range")
+ raise IndexError('index out of range')
if isinstance(self[index], Tree):
self._delparent(self[index], index)
return super(AbstractParentedTree, self).pop(index)
# __getitem__ etc., but use max(0, start) and max(0, stop) because
# negative indices are already handled *before*
# __getslice__ is called; and we don't want to double-count them.
- if hasattr(list, "__getslice__"):
+ if hasattr(list, '__getslice__'):
def __getslice__(self, start, stop):
return self.__getitem__(slice(max(0, start), max(0, stop)))
for i, child in enumerate(self._parent):
if child is self:
return i
- assert False, "expected to find self in self._parent!"
+ assert False, 'expected to find self in self._parent!'
def left_sibling(self):
"""The left sibling of this tree, or None if it has none."""
# If the child's type is incorrect, then complain.
if not isinstance(child, ParentedTree):
raise TypeError(
- "Can not insert a non-ParentedTree " + "into a ParentedTree"
+ 'Can not insert a non-ParentedTree ' + 'into a ParentedTree'
)
# If child already has a parent, then complain.
if child._parent is not None:
- raise ValueError("Can not insert a subtree that already " "has a parent.")
+ raise ValueError('Can not insert a subtree that already ' 'has a parent.')
# Set child's parent pointer & index.
if not dry_run:
# If the child's type is incorrect, then complain.
if not isinstance(child, MultiParentedTree):
raise TypeError(
- "Can not insert a non-MultiParentedTree " + "into a MultiParentedTree"
+ 'Can not insert a non-MultiParentedTree ' + 'into a MultiParentedTree'
)
# Add self as a parent pointer if it's not already listed.
######################################################################
-
+@python_2_unicode_compatible
class ProbabilisticTree(Tree, ProbabilisticMixIn):
def __init__(self, node, children=None, **prob_kwargs):
Tree.__init__(self, node, children)
return ImmutableProbabilisticTree
def __repr__(self):
- return "%s (p=%r)" % (Tree.__repr__(self), self.prob())
+ return '%s (p=%r)' % (Tree.unicode_repr(self), self.prob())
def __str__(self):
- return "%s (p=%.6g)" % (self.pformat(margin=60), self.prob())
+ return '%s (p=%.6g)' % (self.pformat(margin=60), self.prob())
def copy(self, deep=False):
if not deep:
return self.__class__.__name__ < other.__class__.__name__
-
+@python_2_unicode_compatible
class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn):
def __init__(self, node, children=None, **prob_kwargs):
ImmutableTree.__init__(self, node, children)
return ImmutableProbabilisticTree
def __repr__(self):
- return "%s [%s]" % (Tree.__repr__(self), self.prob())
+ return '%s [%s]' % (Tree.unicode_repr(self), self.prob())
def __str__(self):
- return "%s [%s]" % (self.pformat(margin=60), self.prob())
+ return '%s [%s]' % (self.pformat(margin=60), self.prob())
def copy(self, deep=False):
if not deep:
:param s: The string to be converted
:type s: str
"""
- tokens = re.split(r"([()| ])", s)
+ tokens = re.split(r'([()| ])', s)
for i in range(len(tokens)):
- if tokens[i] == "(":
+ if tokens[i] == '(':
tokens[i - 1], tokens[i] = (
tokens[i],
tokens[i - 1],
) # pull nonterminal inside parens
- elif ":" in tokens[i]:
- fields = tokens[i].split(":")
+ elif ':' in tokens[i]:
+ fields = tokens[i].split(':')
if len(fields) == 2: # non-terminal
tokens[i] = fields[1]
else:
tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")"
- elif tokens[i] == "|":
- tokens[i] = ""
+ elif tokens[i] == '|':
+ tokens[i] = ''
treebank_string = " ".join(tokens)
return Tree.fromstring(treebank_string, remove_empty_top_bracketing=True)
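# Hedged editorial sketch (not part of the original module): sinica_parse()
# rewrites a Sinica Treebank bracketing into Penn-style brackets and then
# delegates to Tree.fromstring().  The helper name and the input string below
# are invented for illustration; real Sinica data follows the same
# "role:label:token" convention but uses Chinese tokens.
def _sinica_parse_example():
    from nltk.tree import sinica_parse

    t = sinica_parse('S(agent:NP(Head:Nhaa:X)|Head:VC2:X|goal:NP(Head:Nab:XX))')
    print(t)  # (S (NP (Nhaa X)) (VC2 X) (NP (Nab XX)))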
from nltk import Tree, ProbabilisticTree
# Demonstrate tree parsing.
- s = "(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))"
+ s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))'
t = Tree.fromstring(s)
print("Convert bracketed string into tree:")
print(t)
# Demonstrate tree modification.
the_cat = t[0]
- the_cat.insert(1, Tree.fromstring("(JJ big)"))
+ the_cat.insert(1, Tree.fromstring('(JJ big)'))
print("Tree modification:")
print(t)
- t[1, 1, 1] = Tree.fromstring("(NN cake)")
+ t[1, 1, 1] = Tree.fromstring('(NN cake)')
print(t)
print()
print()
# Demonstrate probabilistic trees.
- pt = ProbabilisticTree("x", ["y", "z"], prob=0.5)
+ pt = ProbabilisticTree('x', ['y', 'z'], prob=0.5)
print("Probabilistic Tree:")
print(pt)
print()
print()
# Demonstrate tree nodes containing objects other than strings
- t.set_label(("test", 3))
+ t.set_label(('test', 3))
print(t)
__all__ = [
- "ImmutableProbabilisticTree",
- "ImmutableTree",
- "ProbabilisticMixIn",
- "ProbabilisticTree",
- "Tree",
- "bracket_parse",
- "sinica_parse",
- "ParentedTree",
- "MultiParentedTree",
- "ImmutableParentedTree",
- "ImmutableMultiParentedTree",
+ 'ImmutableProbabilisticTree',
+ 'ImmutableTree',
+ 'ProbabilisticMixIn',
+ 'ProbabilisticTree',
+ 'Tree',
+ 'bracket_parse',
+ 'sinica_parse',
+ 'ParentedTree',
+ 'MultiParentedTree',
+ 'ImmutableParentedTree',
+ 'ImmutableMultiParentedTree',
]
# -*- coding: utf-8 -*-
# Natural Language Toolkit: ASCII visualization of NLTK trees
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Andreas van Cranenburgh <A.W.vanCranenburgh@uva.nl>
# Peter Ljunglöf <peter.ljunglof@gu.se>
# URL: <http://nltk.org/>
http://jgaa.info/accepted/2006/EschbachGuentherBecker2006.10.2.pdf
"""
+from __future__ import division, print_function, unicode_literals
+
import re
-try:
- from html import escape
-except ImportError:
- from cgi import escape
+from cgi import escape
from collections import defaultdict
from operator import itemgetter
from nltk.util import OrderedDict
+from nltk.compat import python_2_unicode_compatible
from nltk.tree import Tree
ANSICOLOR = {
- "black": 30,
- "red": 31,
- "green": 32,
- "yellow": 33,
- "blue": 34,
- "magenta": 35,
- "cyan": 36,
- "white": 37,
+ 'black': 30,
+ 'red': 31,
+ 'green': 32,
+ 'yellow': 33,
+ 'blue': 34,
+ 'magenta': 35,
+ 'cyan': 36,
+ 'white': 37,
}
+@python_2_unicode_compatible
class TreePrettyPrinter(object):
"""
Pretty-print a tree in text format, either as ASCII or Unicode.
if not isinstance(b, Tree):
a[n] = len(sentence)
if type(b) == tuple:
- b = "/".join(b)
- sentence.append("%s" % b)
+ b = '/'.join(b)
+ sentence.append('%s' % b)
self.nodes, self.coords, self.edges, self.highlight = self.nodecoords(
tree, sentence, highlight
)
return self.text()
def __repr__(self):
- return "<TreePrettyPrinter with %d nodes>" % len(self.nodes)
+ return '<TreePrettyPrinter with %d nodes>' % len(self.nodes)
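# Hedged usage note (editorial addition): a TreePrettyPrinter is normally
# built from a Tree (plus an optional token list for discontinuous trees)
# and rendered with .text() or .svg(), along the lines of
#
#     >>> from nltk.tree import Tree
#     >>> from nltk.treeprettyprinter import TreePrettyPrinter
#     >>> t = Tree.fromstring('(S (NP Mary) (VP walks))')
#     >>> print(TreePrettyPrinter(t).text())
#
# The exact layout depends on the keyword arguments of text(), e.g.
# unicodelines, nodedist and maxwidth.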
@staticmethod
def nodecoords(tree, sentence, highlight):
i += scale
j -= scale
raise ValueError(
- "could not find a free cell for:\n%s\n%s"
- "min=%d; max=%d" % (tree[m], minidx, maxidx, dumpmatrix())
+ 'could not find a free cell for:\n%s\n'
+ 'min=%d; max=%d\n%s' % (tree[m], minidx, maxidx, dumpmatrix())
)
def dumpmatrix():
"""Dump matrix contents for debugging purposes."""
- return "\n".join(
- "%2d: %s" % (n, " ".join(("%2r" % i)[:2] for i in row))
+ return '\n'.join(
+ '%2d: %s' % (n, ' '.join(('%2r' % i)[:2] for i in row))
for n, row in enumerate(matrix)
)
leaves = tree.leaves()
if not all(isinstance(n, int) for n in leaves):
- raise ValueError("All leaves must be integer indices.")
+ raise ValueError('All leaves must be integer indices.')
if len(leaves) != len(set(leaves)):
- raise ValueError("Indices must occur at most once.")
+ raise ValueError('Indices must occur at most once.')
if not all(0 <= n < len(sentence) for n in leaves):
raise ValueError(
- "All leaves must be in the interval 0..n "
- "with n=len(sentence)\ntokens: %d indices: "
- "%r\nsentence: %s" % (len(sentence), tree.leaves(), sentence)
+ 'All leaves must be in the interval 0..n '
+ 'with n=len(sentence)\ntokens: %d indices: '
+ '%r\nsentence: %s' % (len(sentence), tree.leaves(), sentence)
)
vertline, corner = -1, -2 # constants
tree = tree.copy(True)
matrix[0][i] = ids[m]
nodes[ids[m]] = sentence[tree[m]]
if nodes[ids[m]] is None:
- nodes[ids[m]] = "..."
+ nodes[ids[m]] = '...'
highlighted_nodes.discard(ids[m])
positions.remove(m)
childcols[m[:-1]].add((0, i))
unicodelines=False,
html=False,
ansi=False,
- nodecolor="blue",
- leafcolor="red",
- funccolor="green",
+ nodecolor='blue',
+ leafcolor='red',
+ funccolor='green',
abbreviate=None,
maxwidth=16,
):
if abbreviate == True:
abbreviate = 5
if unicodelines:
- horzline = "\u2500"
- leftcorner = "\u250c"
- rightcorner = "\u2510"
- vertline = " \u2502 "
- tee = horzline + "\u252C" + horzline
- bottom = horzline + "\u2534" + horzline
- cross = horzline + "\u253c" + horzline
- ellipsis = "\u2026"
+ horzline = '\u2500'
+ leftcorner = '\u250c'
+ rightcorner = '\u2510'
+ vertline = ' \u2502 '
+ tee = horzline + '\u252C' + horzline
+ bottom = horzline + '\u2534' + horzline
+ cross = horzline + '\u253c' + horzline
+ ellipsis = '\u2026'
else:
- horzline = "_"
- leftcorner = rightcorner = " "
- vertline = " | "
+ horzline = '_'
+ leftcorner = rightcorner = ' '
+ vertline = ' | '
tee = 3 * horzline
- cross = bottom = "_|_"
- ellipsis = "."
+ cross = bottom = '_|_'
+ ellipsis = '.'
def crosscell(cur, x=vertline):
"""Overwrite center of this cell with a vertical branch."""
splitl = len(cur) - len(cur) // 2 - len(x) // 2 - 1
lst = list(cur)
lst[splitl : splitl + len(x)] = list(x)
- return "".join(lst)
+ return ''.join(lst)
result = []
matrix = defaultdict(dict)
childcols = defaultdict(set)
labels = {}
wrapre = re.compile(
- "(.{%d,%d}\\b\\W*|.{%d})" % (maxwidth - 4, maxwidth, maxwidth)
+ '(.{%d,%d}\\b\\W*|.{%d})' % (maxwidth - 4, maxwidth, maxwidth)
)
# collect labels and coordinates
for a in self.nodes:
if abbreviate and len(label) > abbreviate:
label = label[:abbreviate] + ellipsis
if maxwidth and len(label) > maxwidth:
- label = wrapre.sub(r"\1\n", label).strip()
- label = label.split("\n")
+ label = wrapre.sub(r'\1\n', label).strip()
+ label = label.split('\n')
maxnodeheight[row] = max(maxnodeheight[row], len(label))
maxnodewith[column] = max(maxnodewith[column], max(map(len, label)))
labels[a] = label
# bottom up level order traversal
for row in sorted(matrix, reverse=True):
noderows = [
- ["".center(maxnodewith[col]) for col in range(maxcol + 1)]
+ [''.center(maxnodewith[col]) for col in range(maxcol + 1)]
for _ in range(maxnodeheight[row])
]
- branchrow = ["".center(maxnodewith[col]) for col in range(maxcol + 1)]
+ branchrow = [''.center(maxnodewith[col]) for col in range(maxcol + 1)]
for col in matrix[row]:
n = matrix[row][col]
node = self.nodes[n]
if n in minchildcol and minchildcol[n] < maxchildcol[n]:
i, j = minchildcol[n], maxchildcol[n]
a, b = (maxnodewith[i] + 1) // 2 - 1, maxnodewith[j] // 2
- branchrow[i] = ((" " * a) + leftcorner).ljust(
+ branchrow[i] = ((' ' * a) + leftcorner).ljust(
maxnodewith[i], horzline
)
- branchrow[j] = (rightcorner + (" " * b)).rjust(
+ branchrow[j] = (rightcorner + (' ' * b)).rjust(
maxnodewith[j], horzline
)
for i in range(minchildcol[n] + 1, maxchildcol[n]):
branchrow[col] = crosscell(branchrow[col])
text = [a.center(maxnodewith[col]) for a in text]
color = nodecolor if isinstance(node, Tree) else leafcolor
- if isinstance(node, Tree) and node.label().startswith("-"):
+ if isinstance(node, Tree) and node.label().startswith('-'):
color = funccolor
if html:
- text = [escape(a, quote=False) for a in text]
+ text = [escape(a) for a in text]
if n in self.highlight:
- text = ["<font color=%s>%s</font>" % (color, a) for a in text]
+ text = ['<font color=%s>%s</font>' % (color, a) for a in text]
elif ansi and n in self.highlight:
- text = ["\x1b[%d;1m%s\x1b[0m" % (ANSICOLOR[color], a) for a in text]
+ text = ['\x1b[%d;1m%s\x1b[0m' % (ANSICOLOR[color], a) for a in text]
for x in range(maxnodeheight[row]):
# draw vertical lines in partially filled multiline node
# labels, but only if it's not a frontier node.
noderows[x][col] = (
text[x]
if x < len(text)
- else (vertline if childcols[n] else " ").center(
- maxnodewith[col], " "
+ else (vertline if childcols[n] else ' ').center(
+ maxnodewith[col], ' '
)
)
# for each column, if there is a node below us which has a parent
for noderow in noderows:
noderow[col] = crosscell(noderow[col])
branchrow = [
- a + ((a[-1] if a[-1] != " " else b[0]) * nodedist)
- for a, b in zip(branchrow, branchrow[1:] + [" "])
+ a + ((a[-1] if a[-1] != ' ' else b[0]) * nodedist)
+ for a, b in zip(branchrow, branchrow[1:] + [' '])
]
- result.append("".join(branchrow))
+ result.append(''.join(branchrow))
result.extend(
- (" " * nodedist).join(noderow) for noderow in reversed(noderows)
+ (' ' * nodedist).join(noderow) for noderow in reversed(noderows)
)
- return "\n".join(reversed(result)) + "\n"
+ return '\n'.join(reversed(result)) + '\n'
- def svg(self, nodecolor="blue", leafcolor="red", funccolor="green"):
+ def svg(self, nodecolor='blue', leafcolor='red', funccolor='green'):
"""
:return: SVG representation of a tree.
"""
y = row * vscale + vstart
if n in self.highlight:
color = nodecolor if isinstance(node, Tree) else leafcolor
- if isinstance(node, Tree) and node.label().startswith("-"):
+ if isinstance(node, Tree) and node.label().startswith('-'):
color = funccolor
else:
- color = "black"
+ color = 'black'
result += [
'\t<text style="text-anchor: middle; fill: %s; '
'font-size: %dpx;" x="%g" y="%g">%s</text>'
fontsize,
x,
y,
- escape(node.label() if isinstance(node, Tree) else node, quote=False),
+ escape(node.label() if isinstance(node, Tree) else node),
)
]
- result += ["</svg>"]
- return "\n".join(result)
+ result += ['</svg>']
+ return '\n'.join(result)
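# Hedged editorial sketch (not part of the original module): rendering a tree
# to an SVG file with the svg() method above.  The helper name and output
# filename are illustrative only.
def _tree_svg_example():
    from nltk.tree import Tree
    from nltk.treeprettyprinter import TreePrettyPrinter

    t = Tree.fromstring('(S (NP Mary) (VP walks))')
    with open('tree.svg', 'w') as out:
        out.write(TreePrettyPrinter(t).svg(nodecolor='blue', leafcolor='red'))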
def test():
def print_tree(n, tree, sentence=None, ansi=True, **xargs):
print()
- print('{0}: "{1}"'.format(n, " ".join(sentence or tree.leaves())))
+ print('{0}: "{1}"'.format(n, ' '.join(sentence or tree.leaves())))
print(tree)
print()
drawtree = TreePrettyPrinter(tree, sentence)
tree = treebank.parsed_sents()[n]
print_tree(n, tree, nodedist=2, maxwidth=8)
print()
- print("ASCII version:")
+ print('ASCII version:')
print(TreePrettyPrinter(tree).text(nodedist=2))
tree = Tree.fromstring(
- "(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) "
- "(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) "
- "(vg 10) (inf (verb 11)))))) (punct 12))",
+ '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) '
+ '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) '
+ '(vg 10) (inf (verb 11)))))) (punct 12))',
read_leaf=int,
)
sentence = (
- "Ze had met haar moeder kunnen gaan winkelen ,"
- " zwemmen of terrassen .".split()
+ 'Ze had met haar moeder kunnen gaan winkelen ,'
+ ' zwemmen of terrassen .'.split()
)
- print_tree("Discontinuous tree", tree, sentence, nodedist=2)
+ print_tree('Discontinuous tree', tree, sentence, nodedist=2)
-__all__ = ["TreePrettyPrinter"]
+__all__ = ['TreePrettyPrinter']
-if __name__ == "__main__":
+if __name__ == '__main__':
test()
C D C D
"""
+from __future__ import print_function
from nltk.tree import Tree
draw_trees(t, collapsedTree, cnfTree, parentTree, original)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
__all__ = ["chomsky_normal_form", "un_chomsky_normal_form", "collapse_unary"]
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter API
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
import time as _time
from abc import ABCMeta, abstractmethod
-from datetime import tzinfo, timedelta, timezone, datetime
+from datetime import tzinfo, timedelta, datetime
+
+from six import add_metaclass
+
+from nltk.compat import UTC
class LocalTimezoneOffsetWithUTC(tzinfo):
LOCAL = LocalTimezoneOffsetWithUTC()
-class BasicTweetHandler(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class BasicTweetHandler(object):
"""
Minimal implementation of `TweetHandler`.
Validate date limits.
"""
if self.upper_date_limit or self.lower_date_limit:
- date_fmt = "%a %b %d %H:%M:%S +0000 %Y"
- tweet_date = datetime.strptime(data["created_at"], date_fmt).replace(
- tzinfo=timezone.utc
+ date_fmt = '%a %b %d %H:%M:%S +0000 %Y'
+ tweet_date = datetime.strptime(data['created_at'], date_fmt).replace(
+ tzinfo=UTC
)
if (self.upper_date_limit and tweet_date > self.upper_date_limit) or (
self.lower_date_limit and tweet_date < self.lower_date_limit
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter client
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
Utility functions for the :module:`twitterclient` module which do not require
the `twython` library to have been installed.
"""
+from __future__ import print_function
+
import csv
import gzip
import json
-from nltk.internals import deprecated
+from nltk import compat
HIER_SEPARATOR = "."
_add_field_to_out(tweet, field, out)
except TypeError:
raise RuntimeError(
- "Fatal error when extracting fields. Cannot find field ", field
+ 'Fatal error when extracting fields. Cannot find field ' + field
)
return out
# structure that contain other Twitter objects. See:
# https://dev.twitter.com/overview/api/entities-in-twitter-objects
- if key == "entities" or key == "extended_entities":
+ if key == 'entities' or key == 'extended_entities':
candidate = _get_entity_recursive(value, entity)
if candidate is not None:
return candidate
def json2csv(
- fp, outfile, fields, encoding="utf8", errors="replace", gzip_compress=False
+ fp, outfile, fields, encoding='utf8', errors='replace', gzip_compress=False
):
"""
Extract selected fields from a file of line-separated JSON tweets and
are 'id_str' for the tweetID and 'text' for the text of the tweet. See\
<https://dev.twitter.com/overview/api/tweets> for a full list of fields.\
e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']\
Additionally, it allows IDs from other Twitter objects, e. g.,\
['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count']
:param errors: Behaviour for encoding errors, see\
:param gzip_compress: if `True`, output files are compressed with gzip
"""
- (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
+ (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
# write the list of fields as header
writer.writerow(fields)
# process the file
outf.close()
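# Hedged editorial sketch (not part of the original module): extracting the
# tweet id and text from a file of line-delimited JSON tweets with json2csv().
# The input filename is a placeholder; 'id_str' and 'text' are standard tweet
# fields as described in the docstring above.
def _json2csv_example():
    from nltk.twitter.common import json2csv

    with open('tweets.json') as fp:
        json2csv(fp, 'tweets.csv', ['id_str', 'text'])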
-@deprecated("Use open() and csv.writer() directly instead.")
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
- """Get a CSV writer with optional compression."""
- return _outf_writer(outfile, encoding, errors, gzip_compress)
-
-
-def _outf_writer(outfile, encoding, errors, gzip_compress=False):
- if gzip_compress:
- outf = gzip.open(outfile, "wt", encoding=encoding, errors=errors)
+ """
+ Identify appropriate CSV writer given the Python version
+ """
+ if compat.PY3:
+ if gzip_compress:
+ outf = gzip.open(outfile, 'wt', encoding=encoding, errors=errors)
+ else:
+ outf = open(outfile, 'w', encoding=encoding, errors=errors)
+ writer = csv.writer(outf)
else:
- outf = open(outfile, "w", encoding=encoding, errors=errors)
- writer = csv.writer(outf)
+ if gzip_compress:
+ outf = gzip.open(outfile, 'wb')
+ else:
+ outf = open(outfile, 'wb')
+ writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
return (writer, outf)
main_fields,
entity_type,
entity_fields,
- encoding="utf8",
- errors="replace",
+ encoding='utf8',
+ errors='replace',
gzip_compress=False,
):
"""
:param gzip_compress: if `True`, output files are compressed with gzip
"""
- (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
+ (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
header = get_header_field_list(main_fields, entity_type, entity_fields)
writer.writerow(header)
for line in tweets_file:
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter client
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
For error codes see Twitter's
`Error Codes and Responses <https://dev.twitter.com/overview/api/response-codes>`
"""
+from __future__ import print_function
import datetime
from functools import wraps
import json
-from io import StringIO
+
+from nltk.compat import StringIO
from nltk.twitter import (
Query,
)
-SPACER = "###################################"
+SPACER = '###################################'
def verbose(func):
"""
global USERIDS, FIELDS
- USERIDS = ["759251", "612473", "15108702", "6017542", "2673523800"]
+ USERIDS = ['759251', '612473', '15108702', '6017542', '2673523800']
# UserIDs corresponding to\
# @CNN, @BBCNews, @ReutersLive, @BreakingNews, @AJELive
- FIELDS = ["id_str"]
+ FIELDS = ['id_str']
@verbose
"""
tw = Twitter()
print("Track from the public stream\n")
- tw.tweets(keywords="love, hate", limit=10) # public stream
+ tw.tweets(keywords='love, hate', limit=10) # public stream
print(SPACER)
print("Search past Tweets\n")
tw = Twitter()
- tw.tweets(keywords="love, hate", stream=False, limit=10) # search past tweets
+ tw.tweets(keywords='love, hate', stream=False, limit=10) # search past tweets
print(SPACER)
print(
"Follow two accounts in the public stream"
+ " -- be prepared to wait a few minutes\n"
)
tw = Twitter()
- tw.tweets(follow=["759251", "6017542"], stream=True, limit=5) # public stream
+ tw.tweets(follow=['759251', '6017542'], stream=True, limit=5) # public stream
@verbose
@verbose
-def search_demo(keywords="nltk"):
+def search_demo(keywords='nltk'):
"""
Use the REST API to search for past tweets containing a given keyword.
"""
oauth = credsfromfile()
client = Query(**oauth)
for tweet in client.search_tweets(keywords=keywords, limit=10):
- print(tweet["text"])
+ print(tweet['text'])
@verbose
-def tweets_by_user_demo(user="NLTK_org", count=200):
+def tweets_by_user_demo(user='NLTK_org', count=200):
"""
Use the REST API to search for past tweets by a given user.
"""
client = Query(**oauth)
user_info = client.user_info_from_id(USERIDS)
for info in user_info:
- name = info["screen_name"]
- followers = info["followers_count"]
- following = info["friends_count"]
+ name = info['screen_name']
+ followers = info['followers_count']
+ following = info['friends_count']
print("{0}, followers: {1}, following: {2}".format(name, followers, following))
print("Cutoff date: {}\n".format(dt_date))
for tweet in client.search_tweets(keywords=keywords):
- print("{} ".format(tweet["created_at"]), end="")
+ print("{} ".format(tweet['created_at']), end='')
client.handler.handle(tweet)
hydrated = client.expand_tweetids(ids_f)
for tweet in hydrated:
- id_str = tweet["id_str"]
- print("id: {}".format(id_str))
- text = tweet["text"]
- if text.startswith("@null"):
+ id_str = tweet['id_str']
+ print('id: {}'.format(id_str))
+ text = tweet['text']
+ if text.startswith('@null'):
text = "[Tweet not available]"
- print(text + "\n")
+ print(text + '\n')
ALL = [
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter client
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
"""
if self.do_continue:
if self.handler is not None:
- if "text" in data:
+ if 'text' in data:
self.handler.counter += 1
self.handler.handle(data)
self.do_continue = self.handler.do_continue()
print("Error (stream will continue): {0}".format(e))
continue
- def filter(self, track="", follow="", lang="en"):
+ def filter(self, track='', follow='', lang='en'):
"""
Wrapper for 'statuses / filter' API call
"""
# Stream in an endless loop until limit is reached
try:
- if track == "" and follow == "":
+ if track == '' and follow == '':
msg = "Please supply a value for 'track', 'follow'"
raise ValueError(msg)
self.statuses.filter(track=track, follow=follow, lang=lang)
return itertools.chain.from_iterable(chunked_tweets)
- def _search_tweets(self, keywords, limit=100, lang="en"):
+ def _search_tweets(self, keywords, limit=100, lang='en'):
"""
Assumes that the handler has been informed. Fetches Tweets from
search_tweets generator output and passes them to the handler
self,
keywords,
limit=100,
- lang="en",
+ lang='en',
max_id=None,
retries_after_twython_exception=0,
):
Call the REST API ``'search/tweets'`` endpoint with some plausible
defaults. See `the Twitter search documentation
<https://dev.twitter.com/rest/public/search>`_ for more information
about admissible search parameters.
:param str keywords: A list of query terms to search for, written as\
a comma-separated string
self.handler.max_id = max_id
else:
results = self.search(
- q=keywords, count=min(100, limit), lang=lang, result_type="recent"
+ q=keywords, count=min(100, limit), lang=lang, result_type='recent'
)
- count = len(results["statuses"])
+ count = len(results['statuses'])
if count == 0:
print("No Tweets available through REST API for those keywords")
return
count_from_query = count
- self.handler.max_id = results["statuses"][count - 1]["id"] - 1
+ self.handler.max_id = results['statuses'][count - 1]['id'] - 1
- for result in results["statuses"]:
+ for result in results['statuses']:
yield result
self.handler.counter += 1
if self.handler.do_continue() == False:
count=mcount,
lang=lang,
max_id=self.handler.max_id,
- result_type="recent",
+ result_type='recent',
)
except TwythonRateLimitError as e:
print("Waiting for 15 minutes -{0}".format(e))
raise e
retries += 1
- count = len(results["statuses"])
+ count = len(results['statuses'])
if count == 0:
print("No more Tweets available through rest api")
return
# results['search_metadata']['next_results'], but as part of a
# query and difficult to fetch. This is doing the equivalent
# (last tweet id minus one)
- self.handler.max_id = results["statuses"][count - 1]["id"] - 1
+ self.handler.max_id = results['statuses'][count - 1]['id'] - 1
- for result in results["statuses"]:
+ for result in results['statuses']:
yield result
self.handler.counter += 1
if self.handler.do_continue() == False:
"""
return [self.show_user(user_id=userid) for userid in userids]
- def user_tweets(self, screen_name, limit, include_rts="false"):
+ def user_tweets(self, screen_name, limit, include_rts='false'):
"""
Return a collection of the most recent Tweets posted by the user
def tweets(
self,
- keywords="",
- follow="",
+ keywords='',
+ follow='',
to_screen=True,
stream=True,
limit=100,
date_limit=None,
- lang="en",
+ lang='en',
repeat=False,
gzip_compress=False,
):
if stream:
self.streamer.register(handler)
- if keywords == "" and follow == "":
+ if keywords == '' and follow == '':
self.streamer.sample()
else:
self.streamer.filter(track=keywords, follow=follow, lang=lang)
else:
self.query.register(handler)
- if keywords == "":
+ if keywords == '':
raise ValueError("Please supply at least one keyword to search for.")
else:
self.query._search_tweets(keywords, limit=limit, lang=lang)
:rtype: bool
:param data: Tweet object returned by Twitter API
"""
- text = data["text"]
+ text = data['text']
print(text)
self.check_date_limit(data)
return
def on_finish(self):
- print("Written {0} Tweets".format(self.counter))
+ print('Written {0} Tweets'.format(self.counter))
class TweetWriter(TweetHandlerI):
limit=2000,
upper_date_limit=None,
lower_date_limit=None,
- fprefix="tweets",
- subdir="twitter-files",
+ fprefix='tweets',
+ subdir='twitter-files',
repeat=False,
gzip_compress=False,
):
os.mkdir(subdir)
fname = os.path.join(subdir, fprefix)
- fmt = "%Y%m%d-%H%M%S"
+ fmt = '%Y%m%d-%H%M%S'
timestamp = datetime.datetime.now().strftime(fmt)
if self.gzip_compress:
- suffix = ".gz"
+ suffix = '.gz'
else:
- suffix = ""
- outfile = "{0}.{1}.json{2}".format(fname, timestamp, suffix)
+ suffix = ''
+ outfile = '{0}.{1}.json{2}'.format(fname, timestamp, suffix)
return outfile
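# Hedged editorial note: with the defaults above (fprefix='tweets',
# subdir='twitter-files'), timestamped_file() produces paths of the form
#
#     twitter-files/tweets.20190704-093000.json       (uncompressed)
#     twitter-files/tweets.20190704-093000.json.gz    (gzip_compress=True)
#
# where the timestamp comes from datetime.datetime.now() at call time; the
# dates shown are illustrative.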
def handle(self, data):
"""
if self.startingup:
if self.gzip_compress:
- self.output = gzip.open(self.fname, "w")
+ self.output = gzip.open(self.fname, 'w')
else:
- self.output = open(self.fname, "w")
- print("Writing to {0}".format(self.fname))
+ self.output = open(self.fname, 'w')
+ print('Writing to {0}'.format(self.fname))
json_data = json.dumps(data)
if self.gzip_compress:
- self.output.write((json_data + "\n").encode("utf-8"))
+ self.output.write((json_data + "\n").encode('utf-8'))
else:
self.output.write(json_data + "\n")
self.startingup = False
def on_finish(self):
- print("Written {0} Tweets".format(self.counter))
+ print('Written {0} Tweets'.format(self.counter))
if self.output:
self.output.close()
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter client
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
Authentication utilities to accompany :module:`twitterclient`.
"""
+from __future__ import print_function
+
import os
import pprint
from twython import Twython
"""
def __init__(self):
- self.creds_file = "credentials.txt"
+ self.creds_file = 'credentials.txt'
self.creds_fullpath = None
self.oauth = {}
try:
- self.twitter_dir = os.environ["TWITTER"]
+ self.twitter_dir = os.environ['TWITTER']
self.creds_subdir = self.twitter_dir
except KeyError:
self.twitter_dir = None
)
if not os.path.isfile(self.creds_fullpath):
- raise OSError("Cannot find file {}".format(self.creds_fullpath))
+ raise OSError('Cannot find file {}'.format(self.creds_fullpath))
with open(self.creds_fullpath) as infile:
if verbose:
- print("Reading credentials file {}".format(self.creds_fullpath))
+ print('Reading credentials file {}'.format(self.creds_fullpath))
for line in infile:
- if "=" in line:
- name, value = line.split("=", 1)
+ if '=' in line:
+ name, value = line.split('=', 1)
self.oauth[name.strip()] = value.strip()
self._validate_creds_file(verbose=verbose)
def _validate_creds_file(self, verbose=False):
"""Check validity of a credentials file."""
oauth1 = False
- oauth1_keys = ["app_key", "app_secret", "oauth_token", "oauth_token_secret"]
+ oauth1_keys = ['app_key', 'app_secret', 'oauth_token', 'oauth_token_secret']
oauth2 = False
- oauth2_keys = ["app_key", "app_secret", "access_token"]
+ oauth2_keys = ['app_key', 'app_secret', 'access_token']
if all(k in self.oauth for k in oauth1_keys):
oauth1 = True
elif all(k in self.oauth for k in oauth2_keys):
oauth2 = True
if not (oauth1 or oauth2):
- msg = "Missing or incorrect entries in {}\n".format(self.creds_file)
+ msg = 'Missing or incorrect entries in {}\n'.format(self.creds_file)
msg += pprint.pformat(self.oauth)
raise ValueError(msg)
elif verbose:
"""
if creds_file is None:
path = os.path.dirname(__file__)
- creds_file = os.path.join(path, "credentials2.txt")
+ creds_file = os.path.join(path, 'credentials2.txt')
oauth2 = credsfromfile(creds_file=creds_file)
- app_key = oauth2["app_key"]
- app_secret = oauth2["app_secret"]
+ app_key = oauth2['app_key']
+ app_secret = oauth2['app_secret']
twitter = Twython(app_key, app_secret, oauth_version=2)
access_token = twitter.obtain_access_token()
- tok = "access_token={}\n".format(access_token)
- with open(creds_file, "a") as infile:
+ tok = 'access_token={}\n'.format(access_token)
+ with open(creds_file, 'a') as infile:
print(tok, file=infile)
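# Hedged editorial sketch (not part of the original module): the usual way to
# consume these credential helpers, mirroring the search_demo() pattern above.
# It assumes a valid credentials.txt reachable via the TWITTER environment
# variable and that twython is installed; 'nltk' is just an example query.
def _credsfromfile_example():
    from nltk.twitter import Query, credsfromfile

    oauth = credsfromfile()
    client = Query(**oauth)
    for tweet in client.search_tweets(keywords='nltk', limit=5):
        print(tweet['text'])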
# Natural Language Toolkit: Utility functions
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
import sys
import inspect
import bisect
import os
-from itertools import islice, chain, combinations, tee
+from itertools import islice, chain, combinations
from pprint import pprint
from collections import defaultdict, deque
from sys import version_info
-from urllib.request import (
+from six import class_types, string_types, text_type
+from six.moves.urllib.request import (
build_opener,
install_opener,
getproxies,
from nltk.internals import slice_bounds, raise_unorderable_types
from nltk.collections import *
+from nltk.compat import python_2_unicode_compatible
######################################################################
######################################################################
-def usage(obj, selfname="self"):
+def usage(obj, selfname='self'):
str(obj) # In case it's lazy, this will load it.
- if not isinstance(obj, type):
+ if not isinstance(obj, class_types):
obj = obj.__class__
- print("%s supports the following operations:" % obj.__name__)
+ print('%s supports the following operations:' % obj.__name__)
for (name, method) in sorted(pydoc.allmethods(obj).items()):
- if name.startswith("_"):
+ if name.startswith('_'):
continue
- if getattr(method, "__deprecated__", False):
+ if getattr(method, '__deprecated__', False):
continue
- getargspec = inspect.getfullargspec
+ if sys.version_info[0] >= 3:
+ getargspec = inspect.getfullargspec
+ else:
+ getargspec = inspect.getargspec
args, varargs, varkw, defaults = getargspec(method)[:4]
if (
args
- and args[0] == "self"
+ and args[0] == 'self'
and (defaults is None or len(args) > len(defaults))
):
args = args[1:]
- name = "%s.%s" % (selfname, name)
+ name = '%s.%s' % (selfname, name)
argspec = inspect.formatargspec(args, varargs, varkw, defaults)
print(
textwrap.fill(
- "%s%s" % (name, argspec),
- initial_indent=" - ",
- subsequent_indent=" " * (len(name) + 5),
+ '%s%s' % (name, argspec),
+ initial_indent=' - ',
+ subsequent_indent=' ' * (len(name) + 5),
)
)
"""
import sys
- return sys.stdin.__class__.__name__ in ("PyShell", "RPCProxy")
+ return sys.stdin.__class__.__name__ in ('PyShell', 'RPCProxy')
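# Hedged editorial sketch (not part of the original module): the usage()
# helper above simply introspects the public, non-deprecated methods of a
# class and prints their signatures, so any class can be passed in.
# nltk.text.Text is used here purely as an example target.
def _usage_example():
    from nltk.text import Text
    from nltk.util import usage

    usage(Text)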
##########################################################################
:param width: the display width
:type width: int
"""
- print("\n".join(textwrap.wrap(s, width=width)))
+ print('\n'.join(textwrap.wrap(s, width=width)))
def tokenwrap(tokens, separator=" ", width=70):
:param width: the display width (default=70)
:type width: int
"""
- return "\n".join(textwrap.wrap(separator.join(tokens), width=width))
+ return '\n'.join(textwrap.wrap(separator.join(tokens), width=width))
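# Hedged editorial sketch (not part of the original module): tokenwrap()
# joins the tokens with the separator and wraps the result to the requested
# width, returning a single newline-delimited string.
def _tokenwrap_example():
    from nltk.util import tokenwrap

    words = ['colorless', 'green', 'ideas', 'sleep', 'furiously'] * 3
    print(tokenwrap(words, separator=' ', width=40))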
##########################################################################
# recipe from David Mertz
def filestring(f):
- if hasattr(f, "read"):
+ if hasattr(f, 'read'):
return f.read()
- elif isinstance(f, str):
- with open(f, "r") as infile:
+ elif isinstance(f, string_types):
+ with open(f, 'r') as infile:
return infile.read()
else:
raise ValueError("Must be called with a filename or file-like object")
"""
successful_encoding = None
# we make 'utf-8' the first encoding
- encodings = ["utf-8"]
+ encodings = ['utf-8']
#
# next we add anything we can learn from the locale
try:
pass
#
# we try 'latin-1' last
- encodings.append("latin-1")
+ encodings.append('latin-1')
for enc in encodings:
# some of the locale calls
# may have returned None
if not enc:
continue
try:
- decoded = str(data, enc)
+ decoded = text_type(data, enc)
successful_encoding = enc
except (UnicodeError, LookupError):
break
if not successful_encoding:
raise UnicodeError(
- "Unable to decode input data. "
- "Tried the following encodings: %s."
- % ", ".join([repr(enc) for enc in encodings if enc])
+ 'Unable to decode input data. '
+ 'Tried the following encodings: %s.'
+ % ', '.join([repr(enc) for enc in encodings if enc])
)
else:
return (decoded, successful_encoding)
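# Hedged editorial sketch (not part of the original module): guess_encoding()
# takes a byte string and returns the decoded text together with the first
# encoding that succeeded (utf-8, then locale encodings, then latin-1).
def _guess_encoding_example():
    from nltk.util import guess_encoding

    raw = u'd\xe9j\xe0 vu'.encode('latin-1')  # bytes that are not valid UTF-8
    text, enc = guess_encoding(raw)
    print(enc, text)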
def invert_dict(d):
inverted_dict = defaultdict(list)
for key in d:
- if hasattr(d[key], "__iter__"):
+ if hasattr(d[key], '__iter__'):
for term in d[key]:
inverted_dict[term].append(key)
else:
"""
# Pads the sequence as desired by **kwargs.
- if "pad_left" in kwargs or "pad_right" in kwargs:
+ if 'pad_left' in kwargs or 'pad_right' in kwargs:
sequence = pad_sequence(sequence, n, **kwargs)
# Note when iterating through the ngrams, the pad_right here is not
:param key: the identifier we are searching for.
"""
- key = key + " "
+ key = key + ' '
keylen = len(key)
start = 0
currentDepth = 0
- if hasattr(file, "name"):
+ if hasattr(file, 'name'):
end = os.stat(file.name).st_size - 1
else:
file.seek(0, 2)
######################################################################
-def set_proxy(proxy, user=None, password=""):
+def set_proxy(proxy, user=None, password=''):
"""
Set the HTTP proxy for Python to download through.
authentication.
:param password: The password to authenticate with.
"""
+ from nltk import compat
+
if proxy is None:
# Try and find the system proxy settings
try:
- proxy = getproxies()["http"]
+ proxy = getproxies()['http']
except KeyError:
- raise ValueError("Could not detect default proxy settings")
+ raise ValueError('Could not detect default proxy settings')
# Set up the proxy handler
- proxy_handler = ProxyHandler({"https": proxy, "http": proxy})
+ proxy_handler = ProxyHandler({'https': proxy, 'http': proxy})
opener = build_opener(proxy_handler)
if user is not None:
return ntok // ktok
else:
return 0
-
-
-######################################################################
-# Iteration utilities
-######################################################################
-
-
-def pairwise(iterable):
- """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
- a, b = tee(iterable)
- next(b, None)
- return zip(a, b)
-
-######################################################################
-# Parallization.
-######################################################################
-
-
-def parallelize_preprocess(func, iterator, processes, progress_bar=False):
- from tqdm import tqdm
- from joblib import Parallel, delayed
-
- iterator = tqdm(iterator) if progress_bar else iterator
- if processes <= 1:
- return map(func, iterator)
- return Parallel(n_jobs=processes)(delayed(func)(line) for line in iterator)
# Authors: Liling Tan <alvations@gmail.com>,
# Dmitrijs Milajevs <dimazest@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT