+++ /dev/null
-The files in this repository contain mappings from treebank-specific tagsets
-to a set of 12 universal part-of-speech tags. The 12 universal tags are:
-
-VERB - verbs (all tenses and modes)
-NOUN - nouns (common and proper)
-PRON - pronouns
-ADJ - adjectives
-ADV - adverbs
-ADP - adpositions (prepositions and postpositions)
-CONJ - conjunctions
-DET - determiners
-NUM - cardinal numbers
-PRT - particles or other function words
-X - other: foreign words, typos, abbreviations
-. - punctuation
-
-See "A Universal Part-of-Speech Tagset"
-by Slav Petrov, Dipanjan Das and Ryan McDonald
-for more details:
-http://arxiv.org/abs/1104.2086
+++ /dev/null
-en-tweet is a mapping into Petrov et al.'s universal tagset from the tagset
-used in the corpus of English Twitter messages published as
-
- Kevin Gimpel, Nathan Schneider, Brendan O’Connor, Dipanjan Das, Daniel Mills,
- Jacob Eisenstein, Michael Heilman, Dani Yogatama, Jeffrey Flanigan, and
- Noah A. Smith (2011). Part-of-Speech Tagging for Twitter: Annotation,
- Features, and Experiments. Proc. of ACL.
-
-We summarize the tagset here for convenience:
-
-Nominal, Nominal + Verbal
-N common noun
-O pronoun (personal/WH; not possessive)
-S nominal + possessive
-^ proper noun
-Z proper noun + possessive
-L nominal + verbal
-M proper noun + verbal
-
-Other open-class words
-V verb incl. copula, auxiliaries
-A adjective
-R adverb
-! interjection
-
-Other closed-class words
-D determiner
-P pre- or postposition, or subordinating conjunction
-& coordinating conjunction
-T verb particle
-X existential there, predeterminers
-Y X + verbal
-
-Twitter/online-specific
-# hashtag (indicates topic/category for tweet)
-@ at-mention (indicates another user as a recipient of a tweet)
-~ discourse marker, indications of continuation of a message across multiple tweets
-U URL or email address
-E emoticon
-
-Miscellaneous
-$ numeral (CD)
-, punctuation
-G other abbreviations, foreign words, possessive endings, symbols, garbage
-
-
-See http://www.ark.cs.cmu.edu/TweetNLP/ for more information.
-
-- Nathan Schneider, 2011-05-06
+++ /dev/null
-'''
-Interface for converting POS tags from various treebanks
-to the universal tagset of Petrov, Das, & McDonald.
-
-The tagset consists of the following 12 coarse tags:
-
-VERB - verbs (all tenses and modes)
-NOUN - nouns (common and proper)
-PRON - pronouns
-ADJ - adjectives
-ADV - adverbs
-ADP - adpositions (prepositions and postpositions)
-CONJ - conjunctions
-DET - determiners
-NUM - cardinal numbers
-PRT - particles or other function words
-X - other: foreign words, typos, abbreviations
-. - punctuation
-
-@see: http://arxiv.org/abs/1104.2086 and http://code.google.com/p/universal-pos-tags/
-
-@author: Nathan Schneider (nschneid)
-@since: 2011-05-06
-'''
-
-# Strive towards Python 3 compatibility
-from __future__ import print_function, unicode_literals, division
-from future_builtins import map, filter
-
-import re, glob
-from collections import defaultdict
-
-# Directory holding the tagset mapping ('.map') files and their README.
-MAP_DIR = 'universal_pos_tags.1.01'
-
-# The 12 coarse tags of the universal tagset.
-COARSE_TAGS = (
-    'VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP',
-    'CONJ', 'DET', 'NUM', 'PRT', 'X', '.',
-)
-
-# Cache of mappings read so far: fileid -> {original tag: coarse tag}.
-_MAPS = defaultdict(dict)
-
-def readme():
-    '''
-    Returns the full text of the README file stored in MAP_DIR.
-    '''
-    path = MAP_DIR+'/README'
-    with open(path) as handle:
-        contents = handle.read()
-    return contents
-
-def fileids(lang=''):
-    '''
-    Lists the names of the available tagset mapping files.
-
-    Optionally given a two-letter ISO language code, returns names of files
-    containing mappings from a tagset from a treebank in that language to the
-    universal tagset. With no language code, the mapping files for all
-    languages are returned. Each name is a '.map' filename from MAP_DIR with
-    the directory and extension removed; names are returned in sorted order.
-
-    >>> fileids('en')
-    [u'en-ptb']
-    >>> fileids('zh')
-    [u'zh-ctb6', u'zh-sinica']
-    '''
-    import os.path
-
-    # An empty code must become a wildcard: a pattern that starts with '-'
-    # would only match files whose names begin with a hyphen.
-    prefix = lang.lower() or '*'
-    paths = glob.glob(MAP_DIR + '/{}-*.map'.format(prefix))
-    # os.path copes with whichever separator glob returns on this platform,
-    # and sorting makes the result independent of directory listing order.
-    return sorted(os.path.splitext(os.path.basename(p))[0] for p in paths)
-
-def _read(fileid):
-    '''
-    Loads the mapping file for `fileid` from MAP_DIR into _MAPS[fileid].
-
-    Each non-blank line of the file must hold an original tag and a coarse
-    tag separated by a single tab.
-
-    Raises AssertionError if a coarse tag is not one of COARSE_TAGS or if an
-    original tag appears on more than one line.
-    '''
-    table = {}
-    with open(MAP_DIR+'/'+fileid+'.map') as f:
-        for ln in f:
-            ln = ln.strip()
-            if not ln:
-                continue
-            fine, coarse = ln.split('\t')
-            # Raised explicitly rather than via `assert` so that validation
-            # is not stripped when Python runs with -O.
-            if coarse not in COARSE_TAGS:
-                raise AssertionError('Unexpected coarse tag: {}'.format(coarse))
-            if fine in table:
-                raise AssertionError('Multiple entries for original tag: {}'.format(fine))
-            table[fine] = coarse
-    # Publish only once the whole file has been validated, so a failed load
-    # never leaves a partial mapping cached in _MAPS.
-    _MAPS[fileid] = table
-
-def mapping(fileid):
-    '''
-    Returns the dictionary from original tags to universal tags for the
-    given treebank, reading its mapping file the first time it is requested
-    and reusing the cached result in _MAPS afterwards.
-
-    >>> mapping('ru-rnc')=={'!': '.', 'A': 'ADJ', 'AD': 'ADV', 'C': 'CONJ', 'COMP': 'CONJ', 'IJ': 'X', 'NC': 'NUM', 'NN': 'NOUN', 'P': 'PRON', 'PTCL': 'PRT', 'V': 'VERB', 'VG': 'VERB', 'VI': 'VERB', 'VP': 'VERB', 'YES_NO_SENT': 'X', 'Z': 'X'}
-    True
-    '''
-    already_loaded = fileid in _MAPS
-    if not already_loaded:
-        _read(fileid)
-    table = _MAPS[fileid]
-    return table
-
-def convert(fileid, originalTag):
-    '''
-    Looks up the (coarse) universal tag corresponding to an original POS tag
-    from the given treebank.
-
-    >>> convert('en-ptb', 'VBZ')
-    u'VERB'
-    >>> convert('en-ptb', 'VBP')
-    u'VERB'
-    >>> convert('en-ptb', '``')
-    u'.'
-    '''
-    table = mapping(fileid)
-    return table[originalTag]
-
-
-def test():
-    '''
-    Loads every mapping file reported by fileids(), which runs the checks in
-    _read() on each of them, then runs this module's doctests.
-    '''
-    for name in fileids():
-        mapping(name)
-    import doctest
-    doctest.testmod()
-
-# Run the self-checks when this module is executed as a script.
-if __name__ == '__main__':
-    test()