--- /dev/null
+The files in this repository contain mappings from treebank-specific tagsets
+to a set of 12 universal part-of-speech tags. The 12 universal tags are:
+
+VERB - verbs (all tenses and modes)
+NOUN - nouns (common and proper)
+PRON - pronouns
+ADJ - adjectives
+ADV - adverbs
+ADP - adpositions (prepositions and postpositions)
+CONJ - conjunctions
+DET - determiners
+NUM - cardinal numbers
+PRT - particles or other function words
+X - other: foreign words, typos, abbreviations
+. - punctuation
+
+See "A Universal Part-of-Speech Tagset"
+by Slav Petrov, Dipanjan Das and Ryan McDonald
+for more details:
+http://arxiv.org/abs/1104.2086
--- /dev/null
+en-tweet is a mapping into Petrov et al.'s universal tagset from the tagset
+used in the corpus of English Twitter messages published as
+
+ Kevin Gimpel, Nathan Schneider, Brendan O’Connor, Dipanjan Das, Daniel Mills,
+ Jacob Eisenstein, Michael Heilman, Dani Yogatama, Jeffrey Flanigan, and
+ Noah A. Smith (2011). Part-of-Speech Tagging for Twitter: Annotation,
+ Features, and Experiments. Proc. of ACL.
+
+We summarize the tagset here for convenience:
+
+Nominal, Nominal + Verbal
+N common noun
+O pronoun (personal/WH; not possessive)
+S nominal + possessive
+^ proper noun
+Z proper noun + possessive
+L nominal + verbal
+M proper noun + verbal
+
+Other open-class words
+V verb incl. copula, auxiliaries
+A adjective
+R adverb
+! interjection
+
+Other closed-class words
+D determiner
+P pre- or postposition, or subordinating conjunction
+& coordinating conjunction
+T verb particle
+X existential there, predeterminers
+Y X + verbal
+
+Twitter/online-specific
+# hashtag (indicates topic/category for tweet)
+@ at-mention (indicates another user as a recipient of a tweet)
+~ discourse marker, indications of continuation of a message across multiple tweets
+U URL or email address
+E emoticon
+
+Miscellaneous
+$ numeral (CD)
+, punctuation
+G other abbreviations, foreign words, possessive endings, symbols, garbage
+
+
+See http://www.ark.cs.cmu.edu/TweetNLP/ for more information.
+
+- Nathan Schneider, 2011-05-06
--- /dev/null
+'''
+Interface for converting POS tags from various treebanks
+to the universal tagset of Petrov, Das, & McDonald.
+
+The tagset consists of the following 12 coarse tags:
+
+VERB - verbs (all tenses and modes)
+NOUN - nouns (common and proper)
+PRON - pronouns
+ADJ - adjectives
+ADV - adverbs
+ADP - adpositions (prepositions and postpositions)
+CONJ - conjunctions
+DET - determiners
+NUM - cardinal numbers
+PRT - particles or other function words
+X - other: foreign words, typos, abbreviations
+. - punctuation
+
+@see: http://arxiv.org/abs/1104.2086 and http://code.google.com/p/universal-pos-tags/
+
+@author: Nathan Schneider (nschneid)
+@since: 2011-05-06
+'''
+
+# Strive towards Python 3 compatibility
+from __future__ import print_function, unicode_literals, division
+from future_builtins import map, filter
+
+import re, glob
+from collections import defaultdict
+
+# Directory holding the README and the *.map files. It is a relative path,
+# so it is resolved against the current working directory.
+MAP_DIR = 'universal_pos_tags.1.01'
+
+# The 12 universal (coarse) tags; every coarse tag read from a mapping file
+# is checked against this tuple.
+COARSE_TAGS = ('VERB','NOUN','PRON','ADJ','ADV','ADP','CONJ','DET','NUM','PRT','X','.')
+
+# Cache of loaded mappings: fileid -> {original tag: universal tag}.
+# NOTE: as a defaultdict, merely indexing it with a new fileid creates an
+# (empty) entry for that fileid.
+_MAPS = defaultdict(dict)
+
+def readme():
+ with open(MAP_DIR+'/README') as f:
+ return f.read()
+
+def fileids(lang=''):
+ '''
+ Optionally given a two-letter ISO language code, returns names of files
+ containing mappings from a tagset from a treebank in that language to the
+ universal tagset.
+
+ >>> fileids('en')
+ [u'en-ptb']
+ >>> fileids('zh')
+ [u'zh-ctb6', u'zh-sinica']
+ '''
+ return [re.match(r'.*[/]([^/\\]+)[.]map', p).group(1) for p in glob.glob(MAP_DIR + '/{}-*.map'.format(lang.lower()))]
+
+def _read(fileid):
+ with open(MAP_DIR+'/'+fileid+'.map') as f:
+ for ln in f:
+ ln = ln.strip()
+ if ln=='': continue
+ fine, coarse = ln.split('\t')
+ assert coarse in COARSE_TAGS,'Unexpected coarse tag: {}'.format(coarse)
+ assert fine not in _MAPS[fileid],'Multiple entries for original tag: {}'.format(fine)
+ _MAPS[fileid][fine] = coarse
+
+def mapping(fileid):
+ '''
+ Retrieves the mapping from original tags to universal tags for the
+ treebank in question.
+
+ >>> mapping('ru-rnc')=={'!': '.', 'A': 'ADJ', 'AD': 'ADV', 'C': 'CONJ', 'COMP': 'CONJ', 'IJ': 'X', 'NC': 'NUM', 'NN': 'NOUN', 'P': 'PRON', 'PTCL': 'PRT', 'V': 'VERB', 'VG': 'VERB', 'VI': 'VERB', 'VP': 'VERB', 'YES_NO_SENT': 'X', 'Z': 'X'}
+ True
+ '''
+ if fileid not in _MAPS:
+ _read(fileid)
+ return _MAPS[fileid]
+
+def convert(fileid, originalTag):
+ '''
+ Produces the (coarse) universal tag given an original POS tag from the
+ treebank in question.
+
+ >>> convert('en-ptb', 'VBZ')
+ u'VERB'
+ >>> convert('en-ptb', 'VBP')
+ u'VERB'
+ >>> convert('en-ptb', '``')
+ u'.'
+ '''
+ return mapping(fileid)[originalTag]
+
+
+def test():
+ for fileid in fileids():
+ mapping(fileid)
+ import doctest
+ doctest.testmod()
+
+# When executed as a script (rather than imported), run the self-test.
+if __name__=='__main__':
+ test()