add the nlp resource data -- langdetect 97/183997/3
author jingjin.geng <jingjin.geng@samsung.com>
Fri, 13 Jul 2018 19:32:15 +0000 (03:32 +0800)
committer jingjin.geng <jingjin.geng@samsung.com>
Mon, 16 Jul 2018 17:26:28 +0000 (01:26 +0800)
Change-Id: Ida3cb169df2768bed7c5742cc25070a64a36368c

25 files changed:
CMakeLists.txt
nlp_resource_data/CMakeLists.txt [new file with mode: 0755]
nlp_resource_data/langdetect/__init__.py [new file with mode: 0755]
nlp_resource_data/langdetect/__init__.pyc [new file with mode: 0755]
nlp_resource_data/langdetect/detector.py [new file with mode: 0755]
nlp_resource_data/langdetect/detector.pyc [new file with mode: 0755]
nlp_resource_data/langdetect/detector_factory.py [new file with mode: 0755]
nlp_resource_data/langdetect/detector_factory.pyc [new file with mode: 0755]
nlp_resource_data/langdetect/lang_detect_exception.py [new file with mode: 0755]
nlp_resource_data/langdetect/lang_detect_exception.pyc [new file with mode: 0755]
nlp_resource_data/langdetect/language.py [new file with mode: 0755]
nlp_resource_data/langdetect/language.pyc [new file with mode: 0755]
nlp_resource_data/langdetect/profiles/en [new file with mode: 0755]
nlp_resource_data/langdetect/utils/__init__.py [new file with mode: 0755]
nlp_resource_data/langdetect/utils/__init__.pyc [new file with mode: 0755]
nlp_resource_data/langdetect/utils/lang_profile.py [new file with mode: 0755]
nlp_resource_data/langdetect/utils/lang_profile.pyc [new file with mode: 0755]
nlp_resource_data/langdetect/utils/messages.properties [new file with mode: 0755]
nlp_resource_data/langdetect/utils/messages.py [new file with mode: 0755]
nlp_resource_data/langdetect/utils/messages.pyc [new file with mode: 0755]
nlp_resource_data/langdetect/utils/ngram.py [new file with mode: 0755]
nlp_resource_data/langdetect/utils/ngram.pyc [new file with mode: 0755]
nlp_resource_data/langdetect/utils/unicode_block.py [new file with mode: 0755]
nlp_resource_data/langdetect/utils/unicode_block.pyc [new file with mode: 0755]
packaging/nlp.spec

index 5fb9567..934d58b 100755 (executable)
@@ -1,5 +1,5 @@
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
 
 ADD_SUBDIRECTORY(client)
-
 ADD_SUBDIRECTORY(service)
+# Pull in nlp_resource_data/CMakeLists.txt, which installs the bundled langdetect package.
+ADD_SUBDIRECTORY(nlp_resource_data)
\ No newline at end of file
diff --git a/nlp_resource_data/CMakeLists.txt b/nlp_resource_data/CMakeLists.txt
new file mode 100755 (executable)
index 0000000..474de41
--- /dev/null
@@ -0,0 +1,14 @@
+# Install rules for the NLP resource data shipped with this package.
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
+SET(fw_name "nlp_resource_data")
+PROJECT(${fw_name} C)
+
+INCLUDE(FindPkgConfig)
+
+# Only the langdetect directory is installed; the nltk and nltk_data rules are commented out.
+#INSTALL(DIRECTORY nltk DESTINATION /usr/lib/python2.7/site-packages)
+# Copies the whole langdetect directory into the Python 2.7 site-packages directory.
+INSTALL(DIRECTORY langdetect DESTINATION /usr/lib/python2.7/site-packages)
+#INSTALL(DIRECTORY nltk_data DESTINATION /usr/local/lib/)
+
+
+
+
+
diff --git a/nlp_resource_data/langdetect/__init__.py b/nlp_resource_data/langdetect/__init__.py
new file mode 100755 (executable)
index 0000000..7992bfa
--- /dev/null
@@ -0,0 +1 @@
+from .detector_factory import DetectorFactory, PROFILES_DIRECTORY, detect, detect_langs
diff --git a/nlp_resource_data/langdetect/__init__.pyc b/nlp_resource_data/langdetect/__init__.pyc
new file mode 100755 (executable)
index 0000000..1ff21ed
Binary files /dev/null and b/nlp_resource_data/langdetect/__init__.pyc differ
diff --git a/nlp_resource_data/langdetect/detector.py b/nlp_resource_data/langdetect/detector.py
new file mode 100755 (executable)
index 0000000..cc831a0
--- /dev/null
@@ -0,0 +1,249 @@
+import random
+import re
+
+import six
+from six.moves import zip, xrange
+
+from .lang_detect_exception import ErrorCode, LangDetectException
+from .language import Language
+from .utils.ngram import NGram
+from .utils.unicode_block import unicode_block
+
+
+class Detector(object):
+    '''
+    Detector class is to detect language from specified text.
+    Its instance is able to be constructed via the factory class DetectorFactory.
+
+    After appending a target text to the Detector instance with .append(string),
+    the detector provides the language detection results for target text via .detect() or .get_probabilities().
+
+    .detect() method returns a single language name which has the highest probability.
+    .get_probabilities() method returns a list of multiple languages and their probabilities.
+
+    The detector has some parameters for language detection.
+    See .set_alpha(double), .set_max_text_length(int) and .set_prior_map(dict).
+
+    Example:
+
+        from langdetect.detector_factory import DetectorFactory
+        factory = DetectorFactory()
+        factory.load_profile('/path/to/profile/directory')
+
+        def detect(text):
+            detector = factory.create()
+            detector.append(text)
+            return detector.detect()
+
+        def detect_langs(text):
+            detector = factory.create()
+            detector.append(text)
+            return detector.get_probabilities()
+    '''
+
+    # Default smoothing value assigned to self.alpha in __init__.
+    ALPHA_DEFAULT = 0.5
+    # Scale of the Gaussian noise added to alpha for each trial in _detect_block.
+    ALPHA_WIDTH = 0.05
+
+    # Upper bound on the update-loop counter of a trial (tested on every fifth iteration).
+    ITERATION_LIMIT = 1000
+    # Languages whose probability is not above this value are dropped by _sort_probability.
+    PROB_THRESHOLD = 0.1
+    # A trial stops once the largest normalized probability exceeds this value.
+    CONV_THRESHOLD = 0.99999
+    # Divisor that turns alpha into the additive weight used by _update_lang_prob.
+    BASE_FREQ = 10000
+    # Value returned by detect() when no language passes PROB_THRESHOLD.
+    UNKNOWN_LANG = 'unknown'
+
+    # Patterns that append() replaces with a single space before storing the text.
+    URL_RE = re.compile(r'https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}')
+    MAIL_RE = re.compile(r'[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}')
+
+    def __init__(self, factory):
+        '''Create a detector that uses the profiles loaded into `factory`.
+
+        The n-gram probability map and the language list are taken from the
+        factory by reference (no copy is made here).
+        '''
+        self.word_lang_prob_map = factory.word_lang_prob_map
+        self.langlist = factory.langlist
+        self.seed = factory.seed
+        self.random = random.Random()
+        self.text = ''
+        # Filled by _detect_block() the first time get_probabilities() is called.
+        self.langprob = None
+
+        self.alpha = self.ALPHA_DEFAULT
+        # Number of randomized trials averaged in _detect_block().
+        self.n_trial = 7
+        self.max_text_length = 10000
+        self.prior_map = None
+        self.verbose = False
+
+    def set_verbose(self):
+        '''Enable printing of intermediate probabilities during detection.'''
+        self.verbose = True
+
+    def set_alpha(self, alpha):
+        '''Set the smoothing value that each trial perturbs and uses as alpha.'''
+        self.alpha = alpha
+
+    def set_prior_map(self, prior_map):
+        '''Set prior information about language probabilities.
+
+        `prior_map` maps language names to non-negative weights. Languages of
+        self.langlist that are missing from it get 0.0. The stored list is
+        normalized so that its entries sum to 1.
+
+        Raises LangDetectException(InitParamError) if a weight is negative or
+        if the weights of the known languages do not sum to a positive value.
+        '''
+        self.prior_map = [0.0] * len(self.langlist)
+        sump = 0.0
+        for i in xrange(len(self.prior_map)):
+            lang = self.langlist[i]
+            if lang in prior_map:
+                p = prior_map[lang]
+                if p < 0:
+                    raise LangDetectException(ErrorCode.InitParamError, 'Prior probability must be non-negative.')
+                self.prior_map[i] = p
+                sump += p
+        if sump <= 0.0:
+            raise LangDetectException(ErrorCode.InitParamError, 'More one of prior probability must be non-zero.')
+        for i in xrange(len(self.prior_map)):
+            self.prior_map[i] /= sump
+
+    def set_max_text_length(self, max_text_length):
+        '''Specify max size of target text to use for language detection.
+        The default value is 10000.
+        '''
+        self.max_text_length = max_text_length
+
+    def append(self, text):
+        '''Append the target text for language detection.
+
+        URLs and e-mail addresses are replaced by a space, the text is passed
+        through NGram.normalize_vi(), and runs of spaces are collapsed to one.
+
+        NOTE(review): only the first max_text_length characters of each
+        appended (already substituted) text are used; the limit is not
+        applied to the accumulated self.text.
+        '''
+        text = self.URL_RE.sub(' ', text)
+        text = self.MAIL_RE.sub(' ', text)
+        text = NGram.normalize_vi(text)
+        # 0 never equals ' ', so the first character of each call is always kept.
+        pre = 0
+        for i in xrange(min(len(text), self.max_text_length)):
+            ch = text[i]
+            # Skip a space that directly follows another space.
+            if ch != ' ' or pre != ' ':
+                self.text += ch
+            pre = ch
+
+    def cleaning_text(self):
+        '''Cleaning text to detect
+        (eliminate Latin characters if the text is mostly not written in Latin alphabet).
+
+        Characters in the range 'A'..'z' are removed from self.text when twice
+        their count is smaller than the count of non-Latin characters.
+        '''
+        latin_count, non_latin_count = 0, 0
+        for ch in self.text:
+            # The range 'A'..'z' also contains the ASCII punctuation between 'Z' and 'a'.
+            if 'A' <= ch <= 'z':
+                latin_count += 1
+            elif ch >= six.u('\u0300') and unicode_block(ch) != 'Latin Extended Additional':
+                non_latin_count += 1
+
+        if latin_count * 2 < non_latin_count:
+            text_without_latin = ''
+            for ch in self.text:
+                if ch < 'A' or 'z' < ch:
+                    text_without_latin += ch
+            self.text = text_without_latin
+
+    def detect(self):
+        '''Detect language of the target text and return the language name
+        which has the highest probability.
+
+        Returns UNKNOWN_LANG when no language exceeds PROB_THRESHOLD.
+        Raises LangDetectException(CantDetectError) if the text has no usable n-gram.
+        '''
+        probabilities = self.get_probabilities()
+        if probabilities:
+            return probabilities[0].lang
+        return self.UNKNOWN_LANG
+
+    def get_probabilities(self):
+        '''Return the Language entries above PROB_THRESHOLD, highest probability first.
+
+        The probabilities are computed on the first call and reused afterwards.
+        '''
+        if self.langprob is None:
+            self._detect_block()
+        return self._sort_probability(self.langprob)
+
+    def _detect_block(self):
+        '''Run the randomized trials and store the averaged result in self.langprob.
+
+        Raises LangDetectException(CantDetectError) if the cleaned text
+        yields no n-gram that is present in word_lang_prob_map.
+        '''
+        self.cleaning_text()
+        ngrams = self._extract_ngrams()
+        if not ngrams:
+            raise LangDetectException(ErrorCode.CantDetectError, 'No features in text.')
+
+        self.langprob = [0.0] * len(self.langlist)
+
+        # Seed the generator with the value taken from the factory before the trials start.
+        self.random.seed(self.seed)
+        for t in xrange(self.n_trial):
+            prob = self._init_probability()
+            # Each trial perturbs alpha with Gaussian noise scaled by ALPHA_WIDTH.
+            alpha = self.alpha + self.random.gauss(0.0, 1.0) * self.ALPHA_WIDTH
+
+            i = 0
+            while True:
+                self._update_lang_prob(prob, self.random.choice(ngrams), alpha)
+                # Normalize and test the stop conditions only on every fifth iteration.
+                if i % 5 == 0:
+                    if self._normalize_prob(prob) > self.CONV_THRESHOLD or i >= self.ITERATION_LIMIT:
+                        break
+                    if self.verbose:
+                        six.print_('>', self._sort_probability(prob))
+                i += 1
+            # Add this trial's share of the average over n_trial trials.
+            for j in xrange(len(self.langprob)):
+                self.langprob[j] += prob[j] / self.n_trial
+            if self.verbose:
+                six.print_('==>', self._sort_probability(prob))
+
+    def _init_probability(self):
+        '''Initialize the map of language probabilities.
+        If there is the specified prior map, use it as initial map.
+        Otherwise every language starts with the same probability.
+        '''
+        if self.prior_map is not None:
+            return list(self.prior_map)
+        else:
+            return [1.0 / len(self.langlist)] * len(self.langlist)
+
+    def _extract_ngrams(self):
+        '''Extract n-grams from target text.
+
+        Returns the list of n-grams (length 1 to NGram.N_GRAM) that are keys
+        of word_lang_prob_map, in the order they occur in self.text.
+        '''
+        RANGE = list(xrange(1, NGram.N_GRAM + 1))
+
+        result = []
+        ngram = NGram()
+        for ch in self.text:
+            ngram.add_char(ch)
+            # No n-gram is collected while NGram reports capitalword
+            # (presumably an all-capitals word; see utils/ngram.py).
+            if ngram.capitalword:
+                continue
+            for n in RANGE:
+                # optimized w = ngram.get(n)
+                if len(ngram.grams) < n:
+                    break
+                w = ngram.grams[-n:]
+                if w and w != ' ' and w in self.word_lang_prob_map:
+                    result.append(w)
+        return result
+
+    def _update_lang_prob(self, prob, word, alpha):
+        '''Update language probabilities with N-gram string(N=1,2,3).
+
+        Multiplies every entry of `prob` in place by the word's probability for
+        that language plus alpha / BASE_FREQ. Returns False without touching
+        `prob` if `word` is None or unknown, True otherwise.
+        '''
+        if word is None or word not in self.word_lang_prob_map:
+            return False
+
+        lang_prob_map = self.word_lang_prob_map[word]
+        if self.verbose:
+            six.print_('%s(%s): %s' % (word, self._unicode_encode(word), self._word_prob_to_string(lang_prob_map)))
+
+        weight = alpha / self.BASE_FREQ
+        for i in xrange(len(prob)):
+            prob[i] *= weight + lang_prob_map[i]
+        return True
+
+    def _word_prob_to_string(self, prob):
+        '''Format the entries of `prob` that are at least 0.00001 as ' lang:prob' pairs.'''
+        result = ''
+        for j in xrange(len(prob)):
+            p = prob[j]
+            if p >= 0.00001:
+                result += ' %s:%.5f' % (self.langlist[j], p)
+        return result
+
+    def _normalize_prob(self, prob):
+        '''Normalize probabilities and check convergence by the maximum probability.
+
+        `prob` is divided in place by its sum; the largest normalized value is returned.
+        NOTE(review): a non-empty `prob` that sums to zero raises ZeroDivisionError.
+        '''
+        maxp, sump = 0.0, sum(prob)
+        for i in xrange(len(prob)):
+            p = prob[i] / sump
+            if maxp < p:
+                maxp = p
+            prob[i] = p
+        return maxp
+
+    def _sort_probability(self, prob):
+        '''Return Language objects for entries above PROB_THRESHOLD, sorted in descending order.'''
+        result = [Language(lang, p) for (lang, p) in zip(self.langlist, prob) if p > self.PROB_THRESHOLD]
+        result.sort(reverse=True)
+        return result
+
+    def _unicode_encode(self, word):
+        '''Return `word` with every character at or above U+0080 written as a \\uXXXX escape.'''
+        buf = ''
+        for ch in word:
+            if ch >= six.u('\u0080'):
+                # Adding 0x10000 yields at least five hex digits, so st[1:5] below
+                # is always four digits and the padding loop does not execute.
+                st = hex(0x10000 + ord(ch))[2:]
+                while len(st) < 4:
+                    st = '0' + st
+                buf += r'\u' + st[1:5]
+            else:
+                buf += ch
+        return buf
diff --git a/nlp_resource_data/langdetect/detector.pyc b/nlp_resource_data/langdetect/detector.pyc
new file mode 100755 (executable)
index 0000000..1db3897
Binary files /dev/null and b/nlp_resource_data/langdetect/detector.pyc differ
diff --git a/nlp_resource_data/langdetect/detector_factory.py b/nlp_resource_data/langdetect/detector_factory.py
new file mode 100755 (executable)
index 0000000..e026161
--- /dev/null
@@ -0,0 +1,137 @@
+import os
+from os import path
+import sys
+
+try:
+    import simplejson as json
+except ImportError:
+    import json
+
+from .detector import Detector
+from .lang_detect_exception import ErrorCode, LangDetectException
+from .utils.lang_profile import LangProfile
+
+
+class DetectorFactory(object):
+    '''
+    Language Detector Factory Class.
+
+    This class manages an initialization and constructions of Detector.
+
+    Before using language detection library,
+    load profiles with DetectorFactory.load_profile(str)
+    and set initialization parameters.
+
+    To detect a language,
+    construct a Detector instance via DetectorFactory.create().
+    See also Detector's sample code.
+    '''
+    # Class-level default for the random seed; set_seed() stores a value on the instance.
+    # Detector copies it and passes it to random.seed().
+    seed = None
+
+    def __init__(self):
+        '''Create a factory with no profiles loaded.'''
+        # Maps an n-gram to a list of probabilities, one slot per language index.
+        self.word_lang_prob_map = {}
+        # Language names, in the order their profiles were added.
+        self.langlist = []
+
+    def load_profile(self, profile_directory):
+        '''Load every language profile file found in `profile_directory`.
+
+        Entries whose name starts with '.' and entries that are not regular
+        files are skipped. Each remaining file is parsed as JSON and its
+        top-level keys are passed as keyword arguments to LangProfile.
+
+        Raises LangDetectException with NeedLoadProfileError if the directory
+        is empty, FileLoadError on IOError, and FormatError on any other
+        exception raised while loading a file.
+        '''
+        list_files = os.listdir(profile_directory)
+        if not list_files:
+            raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Not found profile: ' + profile_directory)
+
+        # NOTE(review): langsize counts every directory entry, including the ones
+        # skipped below, so the per-word lists can be longer than langlist.
+        langsize, index = len(list_files), 0
+        for filename in list_files:
+            if filename.startswith('.'):
+                continue
+            filename = path.join(profile_directory, filename)
+            if not path.isfile(filename):
+                continue
+
+            f = None
+            try:
+                # The encoding argument is only passed when running on Python 3.
+                if sys.version_info[0] < 3:
+                    f = open(filename, 'r')
+                else:
+                    f = open(filename, 'r', encoding='utf-8')
+                json_data = json.load(f)
+                profile = LangProfile(**json_data)
+                self.add_profile(profile, index, langsize)
+                index += 1
+            except IOError:
+                raise LangDetectException(ErrorCode.FileLoadError, 'Cannot open "%s"' % filename)
+            # NOTE(review): this bare except also turns a LangDetectException raised by
+            # add_profile (DuplicateLangError) into FormatError.
+            except:
+                raise LangDetectException(ErrorCode.FormatError, 'Profile format error in "%s"' % filename)
+            finally:
+                if f:
+                    f.close()
+
+    def load_json_profile(self, json_profiles):
+        '''Load profiles from a sequence of JSON strings, one string per language.
+
+        Raises LangDetectException with NeedLoadProfileError if fewer than two
+        profiles are given, and FormatError on any exception raised while
+        parsing or adding a profile.
+        '''
+        langsize, index = len(json_profiles), 0
+        if langsize < 2:
+            raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Need more than 2 profiles.')
+
+        for json_profile in json_profiles:
+            try:
+                json_data = json.loads(json_profile)
+                profile = LangProfile(**json_data)
+                self.add_profile(profile, index, langsize)
+                index += 1
+            # NOTE(review): as in load_profile, a DuplicateLangError from add_profile
+            # is reported as FormatError by this bare except.
+            except:
+                raise LangDetectException(ErrorCode.FormatError, 'Profile format error.')
+
+    def add_profile(self, profile, index, langsize):
+        '''Register `profile` as language number `index` out of `langsize`.
+
+        For every word of profile.freq whose length is 1 to 3, the slot `index`
+        of its probability list is set to freq / n_words[length - 1]. Words of
+        other lengths still get an all-zero list if they were not known yet.
+
+        Raises LangDetectException(DuplicateLangError) if profile.name is
+        already in langlist.
+        '''
+        lang = profile.name
+        if lang in self.langlist:
+            raise LangDetectException(ErrorCode.DuplicateLangError, 'Duplicate the same language profile.')
+        self.langlist.append(lang)
+
+        for word in profile.freq:
+            if word not in self.word_lang_prob_map:
+                self.word_lang_prob_map[word] = [0.0] * langsize
+            length = len(word)
+            if 1 <= length <= 3:
+                # 1.0 * forces float division on Python 2.
+                prob = 1.0 * profile.freq.get(word) / profile.n_words[length - 1]
+                self.word_lang_prob_map[word][index] = prob
+
+    def clear(self):
+        '''Forget all loaded profiles by rebinding langlist and word_lang_prob_map to new empty objects.'''
+        self.langlist = []
+        self.word_lang_prob_map = {}
+
+    def create(self, alpha=None):
+        '''Construct Detector instance with smoothing parameter.
+
+        When `alpha` is None the detector keeps its default alpha.
+        Raises LangDetectException(NeedLoadProfileError) if no profile is loaded.
+        '''
+        detector = self._create_detector()
+        if alpha is not None:
+            detector.set_alpha(alpha)
+        return detector
+
+    def _create_detector(self):
+        '''Return a new Detector bound to this factory, or raise if no profile is loaded.'''
+        if not self.langlist:
+            raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Need to load profiles.')
+        return Detector(self)
+
+    def set_seed(self, seed):
+        '''Set the random seed handed to detectors created from this factory afterwards.'''
+        self.seed = seed
+
+    def get_lang_list(self):
+        '''Return a copy of the list of loaded language names.'''
+        return list(self.langlist)
+
+
+# Directory named 'profiles' next to this module; loaded by init_factory().
+PROFILES_DIRECTORY = path.join(path.dirname(__file__), 'profiles')
+# Module-level factory shared by detect() and detect_langs(); created on first use.
+_factory = None
+
+def init_factory():
+    '''Create the module-level factory and load PROFILES_DIRECTORY into it on first call.
+
+    NOTE(review): _factory is assigned before load_profile() runs, so if loading
+    raises, later calls find _factory set and do not retry the load.
+    '''
+    global _factory
+    if _factory is None:
+        _factory = DetectorFactory()
+        _factory.load_profile(PROFILES_DIRECTORY)
+
+def detect(text):
+    '''Return the result of Detector.detect() for `text`, using the module-level factory.'''
+    init_factory()
+    detector = _factory.create()
+    detector.append(text)
+    return detector.detect()
+
+
+def detect_langs(text):
+    '''Return the result of Detector.get_probabilities() for `text`, using the module-level factory.'''
+    init_factory()
+    detector = _factory.create()
+    detector.append(text)
+    return detector.get_probabilities()
diff --git a/nlp_resource_data/langdetect/detector_factory.pyc b/nlp_resource_data/langdetect/detector_factory.pyc
new file mode 100755 (executable)
index 0000000..50fd9da
Binary files /dev/null and b/nlp_resource_data/langdetect/detector_factory.pyc differ
diff --git a/nlp_resource_data/langdetect/lang_detect_exception.py b/nlp_resource_data/langdetect/lang_detect_exception.py
new file mode 100755 (executable)
index 0000000..b0c05c1
--- /dev/null
@@ -0,0 +1,22 @@
+# Name -> integer code table; each entry becomes a class attribute of ErrorCode below.
+_error_codes = {
+    'NoTextError': 0,
+    'FormatError': 1,
+    'FileLoadError': 2,
+    'DuplicateLangError': 3,
+    'NeedLoadProfileError': 4,
+    'CantDetectError': 5,
+    'CantOpenTrainData': 6,
+    'TrainDataFormatError': 7,
+    'InitParamError': 8,
+}
+
+# Build a plain class whose attributes are the codes, e.g. ErrorCode.FormatError == 1.
+ErrorCode = type('ErrorCode', (), _error_codes)
+
+
+class LangDetectException(Exception):
+    '''Exception that carries an ErrorCode value in addition to its message.'''
+
+    def __init__(self, code, message):
+        '''Store `code` on the instance and pass `message` to Exception.'''
+        super(LangDetectException, self).__init__(message)
+        self.code = code
+
+    def get_code(self):
+        '''Return the error code given at construction.'''
+        return self.code
diff --git a/nlp_resource_data/langdetect/lang_detect_exception.pyc b/nlp_resource_data/langdetect/lang_detect_exception.pyc
new file mode 100755 (executable)
index 0000000..c39b6be
Binary files /dev/null and b/nlp_resource_data/langdetect/lang_detect_exception.pyc differ
diff --git a/nlp_resource_data/langdetect/language.py b/nlp_resource_data/langdetect/language.py
new file mode 100755 (executable)
index 0000000..36756e1
--- /dev/null
@@ -0,0 +1,17 @@
+class Language(object):
+    '''
+    Language is to store the detected language.
+    Detector.get_probabilities() returns a list of Languages.
+    '''
+
+    def __init__(self, lang, prob):
+        '''Store the language name and its probability.'''
+        self.lang = lang
+        self.prob = prob
+
+    def __repr__(self):
+        '''Return 'lang:prob', or an empty string when lang is None.'''
+        if self.lang is None:
+            return ''
+        return '%s:%s' % (self.lang, self.prob)
+
+    def __lt__(self, other):
+        '''Order Language objects by probability only.'''
+        return self.prob < other.prob
diff --git a/nlp_resource_data/langdetect/language.pyc b/nlp_resource_data/langdetect/language.pyc
new file mode 100755 (executable)
index 0000000..8b6ae44
Binary files /dev/null and b/nlp_resource_data/langdetect/language.pyc differ
diff --git a/nlp_resource_data/langdetect/profiles/en b/nlp_resource_data/langdetect/profiles/en
new file mode 100755 (executable)
index 0000000..4807e95
--- /dev/null
@@ -0,0 +1 @@
+{"freq":{"D":662077,"E":559834,"F":608348,"G":582849,"A":1412556,"B":916671,"C":1498503,"L":649564,"M":1059392,"N":665299,"O":394009,"H":682378,"I":978047,"J":471262,"K":369000,"U":380950,"T":1402307,"W":505824,"V":256072,"Q":51938,"P":880443,"S":1630016,"R":708545,"Y":131400,"X":37224,"Z":73521,"f":5238032,"g":4381944,"d":8729953,"e":27848709,"b":3669334,"c":7841280,"a":23418136,"n":19713516,"o":18673929,"l":10669664,"m":6170962,"j":262547,"k":1633239,"h":10134148,"i":20570816,"w":3362380,"v":2275926,"u":6637499,"t":19408712,"s":16004058,"r":16873084,"q":170855,"p":4621926,"z":397471,"y":4124069,"x":440231,"é":58984," l":780297," m":1079276," n":608651," o":3436911," h":813084," i":3938466," j":78084," k":215197," d":941771," e":740834," f":1754540," g":440359," a":5405782," b":1659799," c":1744132," y":83514," u":322099," t":5058192," w":1900306," v":255788," q":29653," p":1560368," s":2415857," r":962057," J":460420," K":336817," H":654413," I":868613," N":607763," O":345142," L":596797," M":988580," B":847481," C":1361375," A":1263874," F":559603," G":550525," D":599768," E":493444," Z":67911," Y":122764," S":1468740," R":648863," Q":47311," P":804972," W":486015," V":212635," U":359652," T":1336813,"A ":159383,"Da":104182,"Cu":44169,"Cl":60734,"Co":426605,"Cr":65402,"Ce":60359,"Ch":258329,"Ci":62395,"Ed":38074,"Ea":55403,"Du":53228,"Do":65822,"Dr":31956,"De":174035,"Di":112559,"Fe":83008,"Fa":58998,"Eu":42215,"En":125985,"Em":27259,"El":44031,"Ge":134248,"Ga":73890,"I ":75735,"Fr":140633,"Fo":106181,"Fl":37525,"Fi":77847,"B ":28902,"C ":86756,"Au":138720,"Ar":132765,"At":36103,"As":94697,"D ":41383,"Ba":186506,"Af":42815,"Ac":49109,"Ad":33381,"Am":167256,"An":117416,"Ap":56752,"Ai":30417,"Al":127646,"Bu":74783,"Br":187559,"Ca":304473,"E ":36124,"Bi":58070,"Be":129264,"Bo":112086,"Bl":40686,"Ko":45338,"Le":127247,"Li":124649,"La":147235,"Lu":31872,"Lo":122679,"Me":130273,"Mi":151239,"O 
":29173,"Ma":405967,"Mu":71213,"Mo":145746,"Ni":43349,"Ne":178415,"Na":143656,"P ":44679,"No":180123,"Ol":47193,"On":40436,"Oc":53105,"Gi":29349,"Gr":130751,"Go":73286,"Gu":48214,"Ha":132380,"He":218817,"II":40680,"Hi":92944,"Ho":127334,"Hu":45621,"K ":32711,"In":253915,"Is":60970,"It":340082,"Ir":53269,"Ja":154594,"L ":34197,"Je":52293,"Jo":101484,"Ju":117086,"Ka":74254,"M ":37484,"Ki":74159,"Ke":51086,"Un":220729,"Tu":36890,"Tr":84116,"US":35610,"To":93419,"Th":850306,"Ti":41292,"Te":99569,"Ta":74444,"V ":32754,"Sw":37200,"Sy":30043,"St":291796,"Su":96654,"Wo":87820,"Wi":99188,"Wh":33410,"Wa":135830,"We":94221,"Vi":82206,"Va":46571,"Ve":37939,"Pu":36227,"Pr":165318,"S ":81629,"Pe":109002,"Pa":220340,"Pl":35710,"Po":107653,"Pi":44494,"Ph":43729,"Or":50633,"R ":32772,"Se":177109,"Sc":119654,"Si":88455,"Sh":112206,"Sp":78520,"So":160970,"Ru":53200,"Sa":155395,"Re":186160,"Ri":92732,"Ro":164209,"Qu":37827,"T ":30323,"Ra":101065,"b ":134105,"a ":2991353,"Yo":71760,"i ":355519,"gd":31577,"ge":721279,"ga":370090,"fl":84104,"ff":180681,"fi":526440,"fr":391564,"fu":82822,"ft":147685,"fo":920673,"gy":74018,"he":4842012,"ha":1007728,"gn":127025,"gl":188283,"gi":377563,"gh":367310,"gg":29821,"gu":248240,"gt":38716,"gs":80931,"gr":275281,"go":177983,"du":289181,"dv":26927,"dw":35481,"dy":74747,"g ":1213593,"ea":1274992,"eb":152577,"ec":841985,"ed":2289411,"de":1320778,"dd":64948,"dg":51752,"di":930651,"dm":40343,"dl":51794,"do":273396,"ds":196380,"dr":141993,"ew":250035,"ex":221189,"eu":76879,"ev":357914,"ey":221145,"fa":236961,"h ":1529402,"fe":315742,"eh":48317,"eg":294475,"ef":194847,"ee":528675,"el":1146347,"ek":64162,"ei":272994,"ep":332185,"eo":187165,"en":2427008,"em":679038,"et":814658,"es":2395636,"er":4179896,"eq":41573,"ca":1058365,"e ":8494237,"by":527627,"bs":58904,"br":217191,"bu":297420,"bo":472510,"bl":324534,"bi":213219,"bb":35633,"be":846369,"da":395082,"f 
":2316051,"cy":71900,"cu":238303,"ct":817847,"cs":93807,"cr":254963,"co":1252186,"ck":305854,"cl":251999,"ci":687756,"ch":1106571,"ce":1086909,"cc":111912,"c ":457702,"az":72598,"ay":450984,"ba":470554,"d ":4698126,"at":2664116,"as":2180049,"ar":2492347,"ax":35262,"aw":117859,"av":238618,"au":228752,"ak":197752,"al":2475728,"ai":590954,"aj":50310,"ap":330102,"am":913130,"an":4857931,"ac":715176,"ad":654223,"ab":325448,"ag":433896,"ah":83166,"ae":126319,"af":115896,"nu":217006,"nt":1825754,"ns":843426,"nr":30719,"no":643421,"nn":264870,"nz":30433,"ny":179007,"nv":69977,"oe":69809,"of":2379880,"oc":501750,"od":362363,"oa":174782,"ob":174628,"om":1179222,"on":3432632,"ok":132945,"ol":912097,"oi":135962,"og":247625,"oh":71587,"ot":549343,"os":498414,"ov":435331,"ou":1258409,"op":505111,"oo":421044,"or":2962572,"r ":3075136,"ox":44665,"ow":555778,"oy":75842,"pe":765233,"pa":591162,"pl":399253,"po":579386,"ph":211053,"pi":302192,"lo":761247,"lm":115044,"ll":1129598,"ls":295312,"lp":38509,"lw":35880,"lv":66473,"lu":281619,"lt":228186,"ly":725738,"o ":1535371,"ma":968442,"mb":416311,"me":1451345,"mi":591681,"mm":262444,"mp":468344,"mo":504581,"ms":133483,"mu":261893,"my":62036,"p ":307798,"na":1160193,"nb":27967,"nc":750937,"nd":2690580,"ne":1275364,"nf":87191,"ng":1746068,"nh":37213,"ni":1104016,"nk":109355,"nl":88734,"nm":67028,"ju":44053,"jo":70049,"ki":171410,"kh":26225,"ke":323852,"ka":109789,"m ":983735,"ky":30837,"ks":106786,"ko":46397,"kl":33919,"km":32936,"kn":143680,"li":1420180,"lk":42132,"le":1534709,"ld":351541,"lg":35629,"lf":59414,"la":1421955,"lb":105992,"n ":6374219,"hr":171058,"hw":45980,"ht":193546,"hu":172799,"hi":1052052,"hn":92044,"ho":773733,"hl":46194,"hm":30448,"id":496380,"ic":1849130,"ib":159645,"ia":1169835,"ig":507511,"if":219645,"ie":800933,"hy":82019,"k ":515132,"ir":647827,"is":3249081,"it":1893192,"iu":65009,"iv":578092,"ix":50259,"ik":78883,"il":1065515,"im":394260,"in":4877222,"io":1592954,"ip":270764,"je":50702,"iz":128734,"l 
":1934675,"ja":50690,"xi":69443,"xp":49241,"xt":57882,"z ":53637,"xa":40308,"xe":33587,"wh":393518,"wi":429474,"wn":279417,"wo":263879,"wr":100589,"ws":68714,"vy":28150,"y ":3097451,"wa":1012808,"we":415691,"vi":599499,"vo":108876,"ve":1210480,"va":267555,"x ":126181,"ui":189592,"uk":33957,"ul":459768,"ue":281220,"uf":33361,"ug":230153,"ur":968763,"us":936891,"ut":635083,"um":403450,"un":943875,"up":204022,"ty":567576,"tu":446554,"tt":322006,"tw":175696,"ub":264839,"ua":340676,"ud":207254,"uc":282987,"w ":304612,"to":1515648,"tm":51220,"tl":201040,"ts":524135,"tr":893262,"te":2648213,"ti":2353666,"th":4782590,"v ":29410,"tb":79573,"tc":66980,"ta":1180046,"su":341240,"ss":646227,"st":2324937,"sy":102577,"sw":27118,"sl":145841,"sk":108573,"sn":41499,"sm":113246,"sp":316999,"so":700411,"sc":280155,"se":1450470,"sh":709009,"si":1101025,"u ":116514,"sa":232414,"sb":34898,"rr":265014,"rs":761380,"rt":886247,"ru":271327,"rv":158410,"rw":40021,"ry":553810,"rp":87486,"ro":1594919,"rn":650417,"rm":435749,"rl":279627,"rk":261286,"ri":2008460,"rh":32041,"rg":276649,"rf":65494,"re":2611877,"rd":466156,"rc":300833,"rb":95933,"ra":1639206,"t ":3468815,"qu":157005,"s ":7219728,"pt":158295,"pu":265747,"pp":173102,"pr":683558,"ps":94491,"zi":58211,"ze":102677,"za":82368,"zo":27033,"ye":187250,"yc":42157,"yd":28943,"ya":95795,"yt":30002,"ys":153061,"yr":36606,"yp":60686,"yo":54105,"yn":59856,"ym":80995,"yl":75727,"yi":42920,"一":42790," Ga":73074," Ge":133297," Fo":105500," Fr":140138," Fi":76403," Fl":37253," Ha":131725," He":218212," Go":72637," Gr":129383," Gu":47617," Gi":28529," Hu":45416," Ho":126780," II":28541," Hi":92512," Je":52048," Ja":154158," Ir":53197," Is":59983," It":339895," In":252514," Ka":73450," Ke":49770," Ki":73234," Jo":100944," Ju":116938," La":145853," Le":125799," Li":123371," Ko":45184," Ma":403673," Mi":150289," Me":129337," Lo":122023," Lu":31698," Ne":176925," Na":142533," Ni":43042," Mo":144948," Mu":70352," A ":81995," Ap":56659," Am":166952," 
An":116734," Al":126650," Ai":30214," Af":42668," Ac":48855," Ad":33141," Ba":185398," Au":138482," At":35966," As":93159," Ar":131824," Be":128479," Bi":57574," Bl":40368," Bo":111176," Br":186728," Bu":74408," Ca":300905," Ce":60031," Ci":62055," Ch":257143," Cl":59706," Cr":64523," Co":423247," Cu":43407," Da":103400," Di":111836," De":173171," Dr":31720," Do":63383," Du":52940," Ea":55250," Ed":37865," El":43749," En":125075," Em":27148," Eu":42050," Fe":82594," Fa":58222," Wo":87042," Wi":98480," Wh":33053," We":93550," Wa":135146," Yo":71619," a ":1606658," Or":50217," Po":106745," Pl":35261," Pi":44206," Ph":42887," Pe":108520," Pa":218936," No":179642," Ol":47101," On":40029," Oc":53053," Ra":100251," Qu":37258," Ro":163370," Re":185467," Ri":92437," Pr":164608," Pu":36045," Sy":29848," Sw":37042," Su":96395," St":289114," Ta":73883," Th":847432," Ti":40916," Te":98615," Tr":83585," US":33813," To":92734," Ru":52977," Sa":154637," Sh":111397," Si":87885," Sc":118544," Se":176317," So":160018," Sp":77680," Va":46297," Ve":37650," Vi":81481," Tu":35914," Un":220424," im":47481," in":2124350," is":1535535," it":174191," ki":30820," jo":31616," ju":31891," ha":241917," he":193790," gi":28626," gr":121630," go":63783," gu":29424," hi":208416," ho":110982," hu":32878," ne":121038," na":174050," mu":147201," mo":231837," on":505803," of":2275616," nu":46047," no":232717," le":140717," li":204448," la":217455," kn":129366," km":28301," me":218768," mi":119142," ma":335287," lo":197009," af":58077," ag":45613," ab":70854," ac":136746," ad":69502," am":31088," an":1904322," ap":69597," ai":40857," al":285645," au":57146," ar":316462," at":262226," as":400242," ba":223899," bi":52056," be":361253," bo":296458," bl":27386," by":488337," bu":118496," br":85339," ca":237167," es":49439," en":130005," em":28882," el":86812," fe":73621," fa":169421," ev":56247," ex":119951," fu":49639," fr":341564," fo":746598," fl":49512," fi":320292," ge":92358," ga":85124," cl":107340," 
co":825577," cr":113302," ce":122706," ch":161778," ci":78624," da":76510," cu":74858," do":65310," dr":48010," de":365522," di":285811," ed":47217," ea":97123," du":78958," ye":60017," ru":58222," sa":63365," se":417230," sc":118543," si":220940," sh":125764," sn":27188," sm":42001," sp":191590," so":270927," qu":28948," ra":127133," re":580460," ri":81201," ro":102378," pu":110299," pr":488102," s ":285424," ot":53672," ou":36365," ov":43302," op":81547," or":335005," ow":28576," pe":132888," pa":238807," pl":194868," po":256546," pi":40433," ph":50321," wa":774642," we":177231," wr":78668," wo":147411," wi":321826," wh":387560," va":54528," ve":51196," vo":40129," vi":105281," ty":26790," tw":69959," us":113343," up":41302," un":149473," ta":60356," sy":54778," st":354336," su":191136," tr":168050," to":791933," th":3629714," ti":77241," te":209958,"Feb":38336,"Eur":34468,"Eng":96944,"Ger":62713,"Geo":33733,"Gen":26192,"Fra":61982,"Fre":54323,"For":54660,"II ":31692,"His":27636,"Hig":26895,"He ":118997,"Her":26981,"Har":34188,"Gre":62525,"Gra":36710,"Int":46918,"Ind":83336,"In ":64014,"Hou":31497,"Arm":26565,"Apr":40277,"Ass":43311,"Aus":73786,"Aug":44651,"Bar":42165,"Afr":28749,"Ame":138898,"Cal":50030,"Car":57990,"Cat":29319,"Can":84273,"Ber":26471,"Bra":33017,"Bro":31697,"Bri":93113,"Dec":45892,"Chr":35874,"Chi":73091,"Cit":38778,"Cen":43272,"Cha":89919,"Cor":34743,"Com":89536,"Col":68792,"Con":74902,"Cou":101975,"Eas":34019,"Dis":41703,"Nat":69269,"New":118080,"Nov":49796,"Nor":97601,"Oct":42904,"Oly":28739,"Pla":26686,"Per":28472,"Pen":27579,"Par":98820,"Pro":69398,"Pri":36739,"Pre":41911,"Pol":34263,"Ita":30913,"Isl":36514,"It 
":288844,"Jap":37112,"Jan":53046,"Joh":48960,"Jul":46542,"Jun":47295,"Kin":45793,"Lea":42060,"Lan":29809,"Lin":26493,"Lon":35381,"Man":49459,"Mar":140591,"May":50344,"Mon":37435,"Mic":30191,"Min":35884,"Mus":28824,"Wor":61067,"Wil":46697,"Wes":53739,"War":53697,"Wal":33670,"Yor":47584,"Str":31655,"Sta":170783,"Ste":27246,"She":47105,"Sha":26436,"Ser":32863,"Sep":45219,"Spa":35346,"Sou":76587,"Rus":29193,"Sco":37321,"Sch":55225,"San":42456,"Riv":36843,"Rep":36734,"Rom":34841,"Uni":206527,"The":741033,"Thi":55958,"Tra":32863,"bit":27600,"bil":34265,"bin":32338,"bly":27084,"ble":108674,"bli":153281,"boo":43455,"bor":211694,"bot":33326,"bou":74106,"be ":86424,"ban":103832,"bal":111539,"bac":27179,"bas":116354,"bee":53796,"bec":41647,"ber":372832,"bel":55314,"bes":39082,"bet":76601,"bia":36531,"ca ":71415,"car":71337,"cas":54417,"cat":224104,"can":218593,"cap":33144,"cad":26654,"cam":39022,"cal":268647,"ce ":489631,"bri":35233,"bro":46239,"bra":53964,"bre":38618,"bru":40872,"bur":52830,"bum":61942,"bui":38535,"but":82424,"bus":32037,"by ":517575,"am ":158006,"ake":79597,"aki":36034,"ajo":27606,"al ":1032287,"ail":109529,"ain":263108,"air":75684,"agu":57502,"ago":34773,"anu":71865,"any":116307,"ano":50033,"ann":83296,"ant":217929,"ans":132995,"ane":74984,"ang":153929,"ani":212965,"ank":54539,"ana":169898,"anc":203858,"and":1922995,"amm":33827,"amo":43526,"amp":91990,"ams":33255,"ami":135603,"ame":296400,"amb":32800,"ama":60146,"aly":26784,"alt":66067,"als":156873,"alo":44306,"all":466989,"ali":291050,"ale":104464,"ala":82861,"alb":63960,"an ":1345264,"aba":26164,"abe":27729,"abi":43800,"abl":95465,"abo":73249,"ae ":67161,"ad ":136134,"aft":64376,"aff":27842,"ai ":26417,"aga":52212,"age":211927,"ael":26991,"ado":32963,"adi":130468,"ade":130763,"ack":93180,"aci":48413,"ach":115898,"ace":139311,"acc":38863,"ada":60436,"act":176914,"azi":34919,"ays":38857,"aye":87163,"at 
":514237,"arg":65842,"are":299717,"ard":199526,"arc":122863,"ara":130356,"aro":62443,"arn":35663,"arm":40703,"arl":119272,"ark":89799,"ari":222546,"arr":82306,"ars":76523,"art":349130,"ary":226746,"asi":41320,"ash":45659,"ase":201692,"aso":57197,"ask":29635,"ar ":287761,"apa":53686,"ape":43062,"aph":44106,"app":80291,"as ":1288188,"ava":37165,"aut":60210,"avi":60667,"ave":94995,"ay ":243951,"awa":44906,"ata":58365,"ast":279617,"ass":158911,"atr":35326,"ato":75056,"ate":773247,"ati":841381,"ath":123599,"aw ":33098,"att":81527,"atu":81250,"aus":27235,"jec":34085,"jor":28420,"itl":30516,"ito":44622,"itu":81601,"itt":81137,"its":101906,"ity":314774,"ism":33565,"isl":34570,"iso":38810,"iss":83951,"ist":561559,"ita":125707,"ite":286513,"ith":287933,"iti":312557,"ium":39769,"iva":48200,"ix ":27697,"ivi":104219,"ive":414501,"is ":1834908,"ion":1320795,"ior":32367,"iou":46819,"ipa":66227,"ir ":124191,"irs":127746,"isi":109243,"ish":348637,"ise":68787,"isc":59122,"ire":165519,"ird":33893,"irc":26906,"it ":178657,"ize":56700,"iza":46289,"kin":80796,"ker":49025,"ket":62341,"key":30606,"ke ":83682,"ks ":77920,"kno":134011,"ka ":41749,"ha ":26541,"ham":91101,"han":137177,"hai":28881,"hal":42170,"hav":58023,"har":141850,"has":120534,"hat":219394,"had":38461,"he ":3774627,"hel":72126,"hei":75673,"hed":104387,"hea":95197,"hey":47349,"hes":81158,"her":373103,"heo":27238,"hen":82591,"hem":53106,"hig":49277,"hie":27426,"hic":185273,"hip":92853,"hin":140164,"hil":96538,"his":233818,"hit":44255,"hir":71717,"hn ":39410,"ho ":150031,"go ":38352,"gle":57486,"gli":56142,"gn ":28550,"gla":57467,"gne":39182,"gs ":62491,"gov":34838,"gro":78380,"gra":130928,"gre":50553,"gui":30022,"gua":43805,"gue":71236,"gy ":58853,"gus":48388,"iam":54702,"ial":191889,"ian":404233,"iat":85021,"ic ":400287,"ibl":26901,"ibu":29254,"id ":86767,"ibe":47065,"ia 
":367003,"iet":50527,"iel":55976,"ien":102314,"ier":62637,"ies":307456,"ied":65179,"ifo":38546,"iff":31829,"ife":37449,"ifi":68240,"ics":90231,"ict":157327,"icu":32368,"ico":35459,"ick":77376,"ici":172909,"ich":222330,"ice":143792,"ie ":67990,"ica":469571,"idi":26260,"ide":209151,"ida":78657,"il ":176048,"im ":36451,"iga":31696,"igh":217420,"igi":75927,"ign":93132,"imp":47812,"ime":121995,"imi":41657,"ip ":77179,"inc":218718,"ind":114260,"ina":201512,"inn":50515,"ino":47946,"int":230917,"ins":127823,"inf":38004,"ine":357110,"ing":1178957,"ini":149339,"inv":30106,"ike":31715,"ila":50860,"in ":2015240,"ilo":31568,"ill":253039,"ilm":70454,"ili":107918,"ild":57179,"ile":87652,"ima":76627,"io ":93766,"ilw":27998,"ily":104441,"ilt":33467,"how":39080,"hol":72031,"hom":45604,"hon":34484,"hos":48768,"hou":71727,"hoo":99559,"hor":109022,"hro":61086,"hre":44973,"hri":42093,"ht ":111042,"hy ":31885,"hum":60594,"hur":46747,"ffe":52374,"ffi":65666,"fes":54841,"fer":101678,"fea":31829,"fam":97344,"fac":45076,"ff ":28943,"fe ":33600,"ext":42471,"exa":34847,"ews":32501,"exp":39847,"exi":35172,"eta":52929,"ete":109151,"eti":91823,"eth":57374,"esp":36940,"est":416254,"ess":250577,"etr":52595,"ett":68152,"etw":81955,"ety":27383,"ew ":152274,"eve":202090,"evi":96051,"ex ":29419,"ey ":184413,"epe":34508,"er ":1640997,"epa":43995,"eor":48705,"eop":40243,"es ":1236398,"ept":72575,"epu":31828,"epr":49381,"erl":52014,"eri":370698,"erg":54150,"ere":274035,"erf":42571,"erc":40232,"era":262856,"erb":34102,"et ":183510,"equ":41108,"esi":112993,"esc":38776,"ese":189976,"ery":53481,"erv":128955,"err":75913,"ert":119791,"ers":454490,"ern":296552,"erm":132045,"ero":60052,"en 
":515700,"ela":94480,"eld":88206,"ele":239396,"eli":77371,"ell":173610,"elo":78428,"els":41004,"ely":58392,"emb":232607,"ema":57740,"eme":113005,"emo":49872,"emi":71126,"emp":43662,"ene":128335,"eng":56090,"ena":53361,"end":141343,"enc":194004,"eno":33355,"enn":56995,"eni":62631,"enu":41299,"ens":118987,"ent":917089,"ege":50969,"egi":112559,"ek ":35390,"eir":69644,"ein":59892,"eig":51339,"el ":184102,"em ":65326,"gis":39618,"gin":123942,"gio":71590,"gic":30033,"gia":32230,"ght":158712,"gen":144495,"ger":93467,"ges":68825,"gh ":114066,"ged":27606,"gdo":27360,"ge ":289041,"gas":27663,"gar":46583,"gat":28511,"gam":47831,"gal":29045,"gan":93331,"fte":73646,"ful":30197,"ft ":41960,"fre":30903,"fri":38796,"fro":298934,"fou":123314,"for":682161,"foo":52751,"fol":40754,"fic":131579,"fie":53477,"fil":79707,"fin":56926,"fir":119031,"da ":86635,"de ":228034,"dal":30100,"dae":42405,"dat":50729,"dar":41780,"dan":30719,"day":44277,"cul":69156,"ctu":58584,"cts":42767,"ctr":37744,"cto":137353,"cti":262008,"cte":91935,"cy ":49469,"cus":28281,"cur":73843,"cla":60174,"cle":56340,"clu":89562,"clo":27081,"co ":51753,"coa":30142,"con":327550,"col":81553,"com":369257,"cor":122655,"cov":34759,"cot":39747,"cou":101684,"cs ":92018,"ct ":169935,"cre":89454,"cra":42776,"cri":68130,"cro":41753,"cco":29437,"cce":37436,"cea":26454,"ch ":441284,"cer":71180,"ces":153825,"cen":136988,"cem":49322,"cel":32556,"ced":67120,"cha":166560,"chu":28019,"cia":202890,"ck ":147449,"cie":145817,"che":138582,"chi":112580,"cho":124613,"chn":32667,"cil":38009,"cis":28214,"cit":63842,"cin":40251,"cip":75068,"cke":66684,"ed ":1971122,"ebr":59143,"eac":57373,"eag":49669,"ead":94691,"ean":92098,"eal":72020,"eam":67571,"ear":238611,"eas":225475,"eat":190659,"ea ":97410,"efo":27942,"efe":67003,"ega":48293,"eek":39188,"een":176170,"eed":37937,"eer":50775,"eet":39266,"edi":120330,"ede":54090,"edu":36419,"ech":57443,"eci":121874,"ece":81981,"eca":34537,"ee ":109231,"ecu":34646,"ect":304913,"eco":150097,"dy 
":58601,"dur":62131,"don":63380,"dom":45245,"ds ":164811,"duc":125723,"dra":35355,"dre":38221,"dge":45767,"dic":54056,"dia":144125,"der":278249,"des":154773,"dev":51103,"dea":34588,"ded":141601,"def":26810,"del":51924,"den":154840,"dem":42344,"dep":50458,"dle":29935,"do ":34660,"div":36865,"din":170282,"dio":57931,"dir":47066,"dis":162456,"dit":74391,"die":65117,"dif":26218,"rga":62829,"ri ":41893,"rgi":33226,"rge":101036,"ret":62986,"res":357790,"rev":42186,"rfo":28887,"rds":61017,"rg ":35933,"rea":244476,"ree":171216,"ref":62077,"rec":149809,"red":206208,"reg":86193,"rem":55031,"ren":193932,"rel":146404,"rep":68957,"rdi":57659,"rde":80880,"re ":634483,"rch":164110,"rce":59020,"rd ":218199,"rap":56827,"rar":34987,"ras":43044,"rat":260279,"rai":83507,"rag":34608,"ran":265077,"ram":80300,"ral":264928,"rab":26255,"rad":103893,"rac":129420,"rpo":42976,"rs ":371607,"ros":58364,"rot":58999,"rom":337721,"ron":118148,"roo":26751,"rop":109260,"rou":191436,"rov":99028,"row":41881,"roa":55237,"rod":87645,"roc":66994,"rol":65046,"rof":62861,"rog":49055,"rnm":34849,"rna":118629,"rne":61259,"rni":62594,"ro ":47142,"rma":145311,"rme":117007,"rmi":32749,"rly":65353,"rli":51474,"rld":75350,"rle":27840,"rn ":330287,"rks":33396,"rke":46561,"rm ":73619,"rio":79616,"rit":248352,"ris":163763,"riv":53880,"rig":116494,"ril":65623,"rin":255678,"rim":52709,"ria":159780,"rib":60454,"ric":377068,"rid":60669,"rie":181610,"rk ":140894,"ruc":35769,"run":39647,"rum":30319,"rus":32542,"rva":27044,"rvi":62552,"rve":65133,"ry ":509562,"rsi":95891,"rso":35794,"rsh":28300,"rse":60300,"rta":48186,"rst":132558,"rtm":26912,"rte":66419,"rth":190521,"rti":127728,"rua":38894,"rts":59598,"rty":55015,"rt ":230530,"rro":33066,"rri":69478,"rre":97252,"rra":30166,"san":29214,"sha":36063,"sho":85008,"she":117943,"shi":166608,"sid":90354,"sic":111518,"sia":80711,"sit":152904,"sis":68592,"sin":187018,"sio":187210,"sim":26375,"sig":69827,"scr":41549,"se 
":323378,"sci":38993,"sch":76599,"sco":58271,"sev":36211,"ser":204330,"ses":72706,"set":56749,"sh ":261718,"sea":111064,"sed":251715,"sec":65010,"sen":110589,"sem":35416,"sel":52566,"spo":60807,"spe":154402,"spi":31807,"spa":33041,"sou":97501,"sol":40004,"som":46889,"son":213206,"sor":49003,"soc":65715,"st ":788491,"ss ":186657,"sla":91489,"sm ":27910,"sna":26235,"so ":118506,"sma":61408,"sys":38243,"sse":101933,"ssa":44414,"sso":82323,"ssi":178639,"ste":308026,"sta":321451,"sto":159447,"sti":202749,"stl":26896,"stu":53381,"str":353511,"sts":54520,"sub":56163,"suc":48817,"sul":26601,"sup":33543,"sus":37393,"sur":52388,"tai":93008,"tak":29757,"tal":158887,"tag":31026,"tab":60969,"tba":73096,"tat":294451,"tar":154358,"tan":148320,"tch":55824,"te ":378459,"ta ":92521,"pe ":62967,"par":214065,"pat":33109,"pac":31182,"pal":68434,"pai":39322,"pan":124559,"phe":35230,"pho":33690,"phi":46768,"pea":63524,"pec":127586,"ped":38131,"pen":76105,"peo":27260,"per":254210,"pet":53680,"pla":213772,"pli":36447,"ple":105992,"plo":26352,"phy":39410,"pic":63629,"pin":46550,"pio":41668,"pit":34080,"por":147134,"pop":69835,"pos":94042,"pon":41677,"pol":103325,"ps ":57068,"ppo":34542,"ppe":56579,"pub":101611,"pte":60939,"pti":34138,"pri":140547,"pre":169814,"pro":351896,"pur":26601,"put":38175,"pul":69919,"qua":54197,"que":60004,"qui":38739,"ra ":108610,"ngi":45147,"ngl":153748,"ngu":51932,"ngt":37246,"ngs":65275,"ni ":33062,"nge":128219,"nga":40074,"ngd":28313,"nel":35766,"nen":30704,"ner":137621,"net":62476,"nes":167423,"ng ":1115424,"nea":48684,"ned":116572,"nee":30138,"nfo":27425,"ney":30714,"new":36703,"nct":31942,"nco":45249,"nci":91672,"ncl":65296,"nce":360792,"nch":88213,"ne ":474805,"ndu":39629,"ndr":33212,"nds":71457,"ndo":66887,"ndi":147835,"nde":264119,"nda":82618,"ncy":32567,"nal":361592,"nam":140852,"nan":41261,"nar":46225,"nad":72347,"nag":33449,"nai":32626,"nd ":1932876,"nat":199905,"na ":142173,"ny 
":146188,"nve":33106,"num":41405,"nus":46108,"nua":61979,"nty":82414,"nto":84185,"ntu":58438,"nts":118194,"ntr":155494,"nti":190207,"nth":52495,"ntl":47080,"nta":166897,"nte":282517,"nsu":48967,"nst":113470,"nse":49755,"nsh":53144,"nsi":90541,"nt ":574346,"ns ":352877,"nol":27739,"nom":44734,"non":40510,"not":75828,"nor":109034,"now":162156,"nov":41537,"nne":95671,"nna":27532,"nni":61016,"nme":53495,"nly":54407,"no ":47600,"nic":132031,"nia":137970,"nk ":44974,"niz":43878,"niv":81629,"nis":146601,"nit":215552,"nio":55312,"nin":128702,"ogr":72732,"ogi":44598,"ohn":44056,"ogy":47192,"oin":47463,"ok ":58105,"ol ":109628,"oce":39082,"oci":84252,"ock":92930,"oca":158968,"occ":29313,"ode":77618,"of ":2204484,"odu":81510,"oft":39655,"off":67447,"ofe":46203,"oad":59707,"od ":90721,"obe":66424,"ows":30274,"own":272385,"owi":30580,"ow ":104160,"oti":37576,"oth":132550,"ote":72682,"ott":42106,"oto":39358,"ost":134462,"ota":47221,"otb":59122,"osi":37738,"ose":115458,"oss":51862,"owe":69263,"ovi":113638,"ove":271311,"oug":99629,"oul":30838,"oun":383251,"oup":67359,"ous":158478,"our":185741,"out":265981,"opo":44332,"opi":26584,"opl":40248,"ope":150764,"oph":34738,"os ":57513,"opu":63831,"ool":97648,"ook":83743,"ood":64494,"or ":897485,"oot":80611,"ork":137621,"orl":77957,"orm":213505,"orn":233282,"oro":42745,"orp":39756,"orr":27698,"orc":40804,"ord":187024,"ore":161271,"org":83794,"ori":179394,"ort":324283,"ors":65851,"ory":92826,"ot ":75528,"ora":99397,"ola":48347,"old":74258,"on ":1693252,"oli":168296,"oll":152650,"ole":57272,"olo":130918,"olu":51650,"om ":355568,"ona":295881,"ond":125458,"onc":37498,"onf":27843,"one":226758,"ong":180887,"oni":86395,"onl":50368,"onn":32769,"ono":57151,"ons":362489,"ont":151478,"ony":30391,"oma":89149,"ome":169344,"omb":33063,"omi":82283,"omm":164677,"omp":213665,"omo":39559,"op ":54660,"la ":85626,"le ":464484,"lf 
":36467,"lde":36942,"ldi":28865,"lab":36068,"lac":84352,"lag":63397,"lai":31387,"lan":374156,"lar":145329,"lat":207187,"las":100146,"law":29496,"lay":143764,"ld ":226258,"lbu":63795,"ls ":147230,"lon":87604,"lop":54959,"lor":52442,"loc":148176,"log":104122,"los":46581,"low":85754,"lth":30060,"lti":36200,"lud":61069,"lub":38036,"lue":31016,"lso":110778,"lt ":61158,"li ":29468,"ley":50488,"lev":69397,"les":177287,"let":59346,"ler":75366,"lem":45662,"len":64181,"leg":67495,"led":96172,"lec":118736,"lea":145235,"lls":32993,"llu":31038,"lly":155319,"lo ":32182,"lla":120635,"lle":228951,"lli":124524,"llo":71927,"lm ":62508,"ll ":345769,"lit":221318,"lis":254417,"lin":196823,"liz":27437,"liv":42501,"lic":149255,"lia":177899,"lig":44454,"lie":64153,"lif":67751,"ma ":56941,"mb ":40338,"mai":57333,"mad":33441,"mag":32391,"mar":102764,"mas":36713,"mal":90910,"man":305273,"mat":146584,"mbl":26607,"mbi":34922,"mbe":250295,"me ":304088,"med":139063,"mea":33106,"met":85213,"mes":108882,"mer":308315,"mem":71604,"men":353373,"lve":34044,"lum":34502,"lus":42702,"ly ":633235,"lwa":32178,"lym":34483,"mpi":86987,"mpe":67319,"mpo":59756,"mpl":65250,"mpu":32461,"ms ":108315,"mod":36169,"mon":129400,"mol":27569,"mov":30527,"mor":62243,"mos":67786,"mot":45357,"mou":50940,"mpa":78503,"my ":44622,"mus":79882,"mul":30159,"mun":115137,"min":162859,"mil":151172,"mis":47781,"mit":48071,"mic":78126,"mmu":68704,"mmi":37447,"mmo":49689,"mma":30077,"mme":66787,"zed":29181,"zat":36341,"yst":59562,"ysi":32556,"ys ":46066,"ype":26271,"yea":55238,"yed":50493,"yer":52194,"ymp":36974,"yin":35501,"wo ":61016,"wn ":223496,"ws ":47661,"wri":85490,"wor":132758,"wes":66741,"wer":121358,"wel":48031,"wed":28630,"wee":76113,"whe":65586,"whi":161339,"who":156434,"wit":254528,"win":75777,"way":79562,"war":108109,"was":721522,"via":29199,"vil":95978,"vin":109107,"vic":63884,"vid":74216,"vie":42322,"vis":102225,"vol":39542,"ver":416461,"ves":68869,"ven":139602,"vem":57889,"vel":128051,"ved":73656,"ve 
":294814,"val":55056,"van":50667,"var":47368,"vat":46841,"usi":127657,"use":190347,"ust":198208,"uss":43860,"uth":195687,"uti":76065,"ute":88005,"us ":269807,"ut ":151960,"ura":72791,"urc":49405,"ure":175159,"urg":39826,"uri":124329,"urn":72450,"uro":43771,"urr":69334,"urs":30553,"urt":42019,"ury":47606,"ur ":96286,"upp":28633,"umm":33986,"uma":31954,"umb":94738,"ume":46174,"uly":40527,"unt":202586,"uni":169488,"unc":71870,"und":253741,"ung":45523,"une":67239,"up ":99562,"um ":146010,"ult":82523,"ull":33469,"ule":32343,"ula":137681,"un ":32424,"uil":50329,"uis":33184,"uit":37221,"ul ":41234,"ugh":110128,"ugu":55721,"uct":69612,"ude":66964,"udi":73540,"uca":35169,"ue ":120838,"uce":56991,"uch":53880,"ues":42067,"uen":42045,"ub ":41295,"uat":42205,"uar":119614,"ual":88610,"ubl":124785,"uag":32210,"typ":31249,"ty ":506251,"tur":219279,"tut":36373,"tua":41791,"tud":63876,"two":81070,"twe":71315,"ts ":465295,"tre":115840,"tra":308513,"tri":221592,"tru":53183,"tro":123988,"try":66671,"tta":29674,"tte":122526,"tti":34349,"ttl":41524,"tme":35635,"to ":731436,"tly":63791,"tob":48945,"tow":72085,"tom":29936,"ton":150493,"tor":304450,"too":27044,"top":29122,"til":53505,"tie":69965,"tit":108778,"tis":139521,"tin":255051,"tim":81225,"tio":971575,"thu":55673,"tia":79923,"tic":276929,"tiv":181662,"tla":29767,"tle":95143,"tem":124758,"ten":154275,"tel":93554,"tee":36958,"tea":71475,"tec":49230,"ted":637757,"th ":648546,"tes":193644,"ter":809390,"ti ":33930,"tho":134583,"thr":87801,"the":3415279,"thi":110969,"tha":240340},"n_words":[260942223,308553243,224934017],"name":"en"}
\ No newline at end of file
diff --git a/nlp_resource_data/langdetect/utils/__init__.py b/nlp_resource_data/langdetect/utils/__init__.py
new file mode 100755 (executable)
index 0000000..e69de29
diff --git a/nlp_resource_data/langdetect/utils/__init__.pyc b/nlp_resource_data/langdetect/utils/__init__.pyc
new file mode 100755 (executable)
index 0000000..565c461
Binary files /dev/null and b/nlp_resource_data/langdetect/utils/__init__.pyc differ
diff --git a/nlp_resource_data/langdetect/utils/lang_profile.py b/nlp_resource_data/langdetect/utils/lang_profile.py
new file mode 100755 (executable)
index 0000000..118cff8
--- /dev/null
@@ -0,0 +1,106 @@
+from collections import defaultdict
+import re
+
+import six
+from six.moves import xrange
+
+from .ngram import NGram
+
+
+class LangProfile(object):
+    '''N-gram frequency table for one language.
+
+    Attributes:
+        name: profile name; while it is None, add() and omit_less_freq()
+            return without doing anything.
+        freq: defaultdict(int) mapping n-gram -> occurrence count.
+        n_words: list in which n_words[k - 1] totals the counted n-grams
+            of length k.
+    '''
+
+    # omit_less_freq() drops n-grams whose count is at most
+    # max(n_words[0] // LESS_FREQ_RATIO, MINIMUM_FREQ).
+    MINIMUM_FREQ = 2
+    LESS_FREQ_RATIO = 100000
+
+    # Recognise a key that is a single ASCII letter / that contains one.
+    ROMAN_CHAR_RE = re.compile(r'^[A-Za-z]$')
+    ROMAN_SUBSTR_RE = re.compile(r'.*[A-Za-z].*')
+
+    def __init__(self, name=None, freq=None, n_words=None):
+        '''Create a profile, optionally seeded with existing counts.
+
+        freq, when given, is copied into a fresh defaultdict.  n_words,
+        when given, is stored as passed (no copy); otherwise it starts as
+        NGram.N_GRAM zeros.
+        '''
+        counts = defaultdict(int)
+        if freq is not None:
+            counts.update(freq)
+        self.freq = counts
+
+        totals = [0] * NGram.N_GRAM if n_words is None else n_words
+        self.name = name
+        self.n_words = totals
+
+    def add(self, gram):
+        '''Add n-gram to profile.
+
+        Nothing is recorded when the profile has no name, when gram is
+        None, or when its length is outside 1..NGram.N_GRAM.
+        '''
+        if self.name is None or gram is None:  # Illegal
+            return
+        size = len(gram)
+        if not 1 <= size <= NGram.N_GRAM:  # Illegal
+            return
+        self.n_words[size - 1] += 1
+        self.freq[gram] += 1
+
+    def _drop(self, gram, hits):
+        '''Delete gram from freq and take its hits out of n_words.'''
+        self.n_words[len(gram) - 1] -= hits
+        del self.freq[gram]
+
+    def omit_less_freq(self):
+        '''Remove low-frequency n-grams and noisy Latin-letter n-grams.'''
+        if self.name is None:  # Illegal
+            return
+
+        cutoff = self.n_words[0] // self.LESS_FREQ_RATIO
+        if cutoff < self.MINIMUM_FREQ:
+            cutoff = self.MINIMUM_FREQ
+
+        # Pass 1: prune rare n-grams; among the survivors, total up the
+        # counts of single Latin letters.
+        roman_hits = 0
+        for gram, hits in list(six.iteritems(self.freq)):
+            if hits <= cutoff:
+                self._drop(gram, hits)
+                continue
+            if self.ROMAN_CHAR_RE.match(gram):
+                roman_hits += hits
+
+        # Pass 2: when Latin letters make up less than a third of the
+        # remaining 1-grams, discard every n-gram containing one.
+        if roman_hits >= self.n_words[0] // 3:
+            return
+        for gram, hits in list(six.iteritems(self.freq)):
+            if self.ROMAN_SUBSTR_RE.match(gram):
+                self._drop(gram, hits)
+
+    def update(self, text):
+        '''Feed a piece of (possibly fragmented) text into the profile.
+
+        The text goes through NGram.normalize_vi() and is then pushed one
+        character at a time into an NGram; after each character, the
+        n-grams of length 1..NGram.N_GRAM it returns are passed to add().
+        '''
+        if text is None:
+            return
+        normalized = NGram.normalize_vi(text)
+        window = NGram()
+        for ch in normalized:
+            window.add_char(ch)
+            for size in xrange(1, NGram.N_GRAM + 1):
+                self.add(window.get(size))
diff --git a/nlp_resource_data/langdetect/utils/lang_profile.pyc b/nlp_resource_data/langdetect/utils/lang_profile.pyc
new file mode 100755 (executable)
index 0000000..1344115
Binary files /dev/null and b/nlp_resource_data/langdetect/utils/lang_profile.pyc differ
diff --git a/nlp_resource_data/langdetect/utils/messages.properties b/nlp_resource_data/langdetect/utils/messages.properties
new file mode 100755 (executable)
index 0000000..1eb7a49
--- /dev/null
@@ -0,0 +1,135 @@
+NGram.CJK_KANJI_EXCLUDE=\u0020\uFF08\uFF09
+NGram.LATIN1_EXCLUDE=\u00A0\u00AB\u00B0\u00BB
+NGram.KANJI_1_0=\u4F7C\u6934
+NGram.KANJI_1_2=\u88CF\u95B2
+NGram.KANJI_1_4=\u7027\u7DCB
+NGram.KANJI_1_8=\u4E80\u4E9C\u4EEE\u5263\u5264\u5270\u52C5\u52E7\u52F2\u53B3\u5449\u58CA\u58CC\u5968\u59C9\u59EB\u5D8B\u5DE3\u5E30\u6075\u622F\u623B\u6255\u629C\u629E\u62DD\u62E1\u633F\u635C\u63FA\u6442\u6589\u658E\u6669\u66A6\u66FD\u6804\u685C\u6B69\u6B6F\u6BBB\u6C37\u6C5A\u6D44\u6E09\u6E0B\u6E13\u6EDD\u713C\u72A0\u731F\u7363\u7A32\u7A42\u7A93\u7ADC\u7C8B\u7C9B\u7DD1\u7E01\u7E04\u7E26\u7E4A\u7E4B\u7E70\u8074\u8107\u8133\u81D3\u820E\u8217\u8358\u83D3\u85AC\u8987\u899A\u8B21\u8B72\u8B83\u8CDB\u9045\u90F7\u91C8\u9271\u9283\u92AD\u9665\u967A\u96A0\u96A3\u96B7\u970A\u983C\u9854\u9855\u99C6\u9A12\u9ED9\u9F62
+NGram.KANJI_1_11=\u67D8\u831C
+NGram.KANJI_1_12=\u5742\u57FC\u5800
+NGram.KANJI_1_13=\u4E3C\u4E98\u4FE3\u4FF5\u5072\u51A8\u53A9\u5451\u546A\u5504\u5516\u55A9\u55B0\u5618\u5642\u565B\u567A\u56A2\u57F4\u5840\u5841\u58F1\u59F6\u5A2F\u5B22\u5B8D\u5DCC\u5EFB\u5F10\u60A9\u60E3\u61D0\u62F6\u63B4\u63BB\u63C3\u6681\u685F\u6955\u6962\u696F\u698A\u698E\u69FB\u6A2B\u6A7F\u6B53\u6BD8\u6D99\u6E07\u7460\u7473\u7560\u7573\u758E\u7690\u7815\u783A\u7962\u7A4F\u7A63\u7AEA\u7BED\u7CA7\u7D18\u7D3A\u7E4D\u8061\u8218\u8276\u82C5\u8597\u85AB\u86CD\u874B\u88FE\u8ACF\u8B90\u8D0B\u8FBF\u9013\u9061\u914E\u9154\u918D\u9190\u91A4\u91B8\u9262\u929A\u92ED\u92F3\u932C\u96EB\u96F0\u976D\u97EE\u981A\u99C4\u9A28\u9AC4\u9B8E\u9C10\u9D0E\u9D5C\u9D8F\u9E78\u9EB9\u9EBA\u9EBF
+NGram.KANJI_1_14=\u5F66\u7984\u7985
+NGram.KANJI_1_16=\u5861\u7B25\u844E\u9419\u9D07
+NGram.KANJI_1_18=\u5039\u514E\u51E7\u51EA\u5301\u5302\u5859\u58F7\u59AC\u5C2D\u5CA8\u5EFC\u6357\u64B9\u67CA\u6802\u6834\u68BC\u6900\u6919\u691B\u69D9\u6AE8\u6D9C\u6E8C\u6F09\u6F45\u701E\u7026\u7114\u72DB\u7577\u75E9\u783F\u7895\u7A50\u7AC3\u7B48\u7B86\u7BAA\u7C7E\u7C82\u7C8D\u7CCE\u7D2C\u7F6B\u7FEB\u8557\u85AE\u86CE\u877F\u8997\u8ACC\u8CB0\u8CCE\u8FE9\u9197\u920E\u9266\u927E\u92F2\u9306\u9453\u9784\u982C\u9834\u99C8\u9BF5\u9C2F\u9D2C
+NGram.KANJI_1_22=\u6762\u6A17\u887F
+NGram.KANJI_1_27=\u4E21\u4E57\u4ECF\u4F1D\u4FA1\u4FF3\u5024\u50CD\u5150\u5186\u51E6\u52B4\u52B9\u5358\u53CE\u55B6\u56E3\u56F2\u56F3\u570F\u5727\u5869\u5897\u58F2\u5909\u5B9F\u5BDB\u5BFE\u5C02\u5DFB\u5E2F\u5E81\u5E83\u5EC3\u5F3E\u5F93\u5FB3\u5FB4\u5FDC\u60AA\u6226\u6238\u6271\u62E0\u6319\u63B2\u6483\u64AE\u67A0\u67FB\u691C\u697D\u69D8\u6A29\u6B73\u6B74\u6BCE\u6C17\u6CA2\u6D5C\u6E08\u6E80\u702C\u7523\u767A\u770C\u7D4C\u7D75\u7D76\u7D99\u7D9A\u7DCF\u8535\u8846\u89A7\u89B3\u8A33\u8AAC\u8AAD\u8C4A\u8EE2\u8EFD\u8FBA\u8FBC\u9244\u9332\u95A2\u95D8\u96D1\u99C5\u9A13\u9ED2
+NGram.KANJI_1_29=\u4F0E\u4FFA\u5036\u53E1\u54B2\u5506\u583A\u5C3B\u5CAC\u5CE0\u5CEF\u6803\u68B6\u6A0B\u6A8E\u73C2\u7551\u7826\u7881\u79B0\u7B39\u8429\u8599\u8FBB\u9162\u95C7\u9688\u96BC\u9AEA\u9DF2
+NGram.KANJI_1_31=\u5553\u938C
+NGram.KANJI_1_35=\u51B4\u564C\u57DC\u5B2C\u6822\u685D\u690B\u6973\u6C93\u7511\u7887\u7A17\u83D6\u847A\u8494\u8526\u854E\u85C1\u86F8\u88B4\u93A7\u9B92\u9C39\u9C48\u9C52
+NGram.KANJI_2_0=\u4E2B\u4EC3\u4F09\u4F57\u4F6F\u4F70\u4FD1\u4FDA\u500C\u5043\u516E\u5189\u5241\u530D\u5310\u5412\u54AB\u54AF\u5514\u5556\u55B1\u561F\u573B\u586D\u587D\u58C5\u58D1\u5914\u5A62\u5A6A\u5AE6\u5B40\u5B5B\u5B70\u5BB8\u5CD2\u5D01\u5D34\u5E11\u5EA0\u5F0B\u5F2D\u5F87\u607F\u621B\u6221\u6289\u63A3\u6452\u646D\u64D8\u652B\u6600\u6631\u6641\u66F7\u6773\u67B8\u67DD\u67DE\u6829\u68FB\u69AD\u6A47\u6C10\u6C68\u6C74\u6C85\u6CD3\u6D31\u6D93\u6D94\u6DB8\u6DBF\u6DC5\u6E6E\u6EA7\u6EB4\u6EC2\u6F2A\u6F2F\u6FB9\u6FC2\u6FDB\u6FEE\u70AF\u70FD\u7166\u726F\u729B\u739F\u73DE\u740A\u746D\u749C\u749F\u74E0\u759D\u75A3\u75CD\u75DE\u7600\u7620\u7688\u7738\u7762\u776B\u777D\u77E3\u781D\u7837\u78A3\u7946\u7B60\u7F44\u7F54\u7F5F\u7FAF\u8026\u807F\u80C4\u80DB\u80ED\u81E7\u824B\u82B7\u82E3\u8392\u846D\u84D3\u8548\u85B9\u86DE\u873F\u8753\u8782\u87AB\u87B3\u87D1\u87E0\u87FE\u8821\u88D8\u88E8\u8913\u891A\u892B\u8983\u8C3F\u8C49\u8C82\u8D6D\u8DE4\u8E1D\u8E1E\u8E7C\u8FE5\u8FE8\u9005\u9035\u9050\u9082\u9083\u9095\u90E2\u911E\u91AE\u91B4\u93D6\u9621\u968D\u96B9\u96D2\u9711\u9713\u973E\u9AB0\u9AB7\u9AE6\u9B03\u9B23\u9EDC\u9EEF
+NGram.KANJI_2_1=\u4E82\u4F48\u4F54\u50F9\u5167\u528D\u52DE\u532F\u537B\u53C3\u5433\u555F\u55AE\u56B4\u570D\u5716\u58D3\u58DE\u5920\u5967\u5A1B\u5BEB\u5BEC\u5C08\u5C0D\u5C46\u5C6C\u5CFD\u5E36\u5E6B\u5EC8\u5EF3\u5F48\u5F91\u5F9E\u5FB5\u6046\u60E1\u61F7\u6232\u6236\u64C7\u64CA\u64D4\u64DA\u64F4\u651D\u6578\u65B7\u6649\u6A13\u6A23\u6A6B\u6A94\u6AA2\u6B0A\u6B50\u6B61\u6B72\u6B77\u6B78\u6C92\u6EAB\u6EFF\u6FD5\u6FDF\u71DF\u722D\u72C0\u734E\u737B\u746A\u7522\u773E\u78BC\u7A69\u7C3D\u7CB5\u7D55\u7D72\u7DA0\u7DAB\u7DE3\u7E5E\u7E6A\u7E7C\u7E8C\u8072\u807D\u8085\u812B\u8166\u8173\u81D8\u8209\u820A\u8332\u838A\u840A\u85E5\u860B\u8655\u865B\u88DD\u89BA\u89BD\u89C0\u8AAA\u8B6F\u8B7D\u8B8A\u8B93\u8C50\u8CF4\u8E64\u8F15\u8F49\u8FA6\u8FAD\u9109\u9130\u91AB\u91CB\u92B7\u9304\u9322\u95CA\u96A8\u96AA\u96B1\u96B8\u96D6\u96D9\u96DC\u9748\u975C\u986F\u9918\u99DB\u9A57\u9B25\u9EA5\u9EC3\u9EDE\u9F52
+NGram.KANJI_2_4=\u514C\u51AA\u5614\u56AE\u56C2\u582F\u58FA\u5B0C\u5D11\u5DD2\u5DD6\u5E40\u5E5F\u5EEC\u6137\u6417\u6488\u64F2\u652A\u6582\u6689\u689F\u68D7\u69D3\u6A97\u6AB8\u6ABB\u6AC3\u6ADA\u6B7F\u6BB2\u6EA5\u6EC4\u6EF2\u7009\u701D\u7028\u703E\u7165\u71BE\u721B\u7463\u7464\u7469\u7515\u7526\u75FA\u7621\u779E\u79B1\u7A1F\u7AC4\u7AC7\u7B8F\u7BE9\u7D2E\u7D68\u7D8F\u7DB8\u7DBA\u7E46\u7E79\u7F4C\u7F88\u8070\u8073\u8076\u81BE\u82BB\u83A2\u858A\u8591\u861A\u8778\u87EC\u8805\u880D\u893B\u8A1B\u8A25\u8A36\u8A85\u8AA6\u8B17\u8B28\u8CB6\u8CE4\u8D16\u8D1B\u8ECB\u9112\u9214\u9249\u93AC\u9594\u9598\u95BB\u95D5\u965E\u96B4\u97DC\u9821\u9824\u9921\u9952\u9A55\u9A5B\u9B1A\u9C13\u9D09\u9DAF\u9E1A\u9E75\u9F67
+NGram.KANJI_2_9=\u4E9F\u4F6C\u4FDE\u4FFE\u5029\u5140\u51A2\u5345\u539D\u53FB\u54C7\u5599\u560E\u561B\u563B\u566C\u5676\u5729\u574D\u57E4\u595A\u598D\u5A1F\u5A25\u5A77\u5AB2\u5AD6\u5BF0\u5C2C\u5CEA\u5E37\u5F08\u6059\u606A\u6096\u609A\u62A8\u6555\u6556\u66E6\u675E\u68E3\u69BB\u6BCB\u6BD3\u6C1F\u6C26\u6C81\u6DC4\u6DDE\u6E32\u6E44\u6E4D\u6F33\u6F7C\u6FA7\u701A\u701B\u715C\u741B\u7428\u7480\u74A8\u7504\u752C\u768B\u76CE\u78CA\u78FA\u79BA\u7C27\u8046\u81FB\u8331\u8393\u83C1\u8403\u8438\u843C\u8446\u85B0\u87D2\u8862\u8DC6\u9074\u9131\u9672\u96EF\u9704\u9706\u977C\u9ABC\u9E92\u9ECF
+NGram.KANJI_2_10=\u51BD\u5704\u7350\u73A5
+NGram.KANJI_2_11=\u4E15\u4EA2\u4F5A\u50D6\u5349\u53DF\u5484\u5958\u5B34\u5B5A\u5C91\u5E1B\u5F77\u61CB\u61FF\u620C\u620D\u622E\u6248\u6538\u660A\u664F\u678B\u67E9\u69B7\u69C3\u6CB1\u6CD7\u6D5A\u6DAA\u6DC7\u7099\u71EE\u7325\u7425\u7455\u747E\u749E\u75B5\u7678\u7693\u76C2\u77B0\u77BF\u78CB\u7957\u795A\u797A\u7A79\u7B08\u7B75\u7BB4\u7F9A\u7FB2\u7FDF\u80E5\u81BA\u8340\u837C\u8398\u8559\u85A8\u86DF\u8734\u8882\u88F4\u8936\u900D\u907D\u9642\u96C9\u9AFB\u9E9D\u9EBE
+NGram.KANJI_2_12=\u5F57\u7940
+NGram.KANJI_2_13=\u5191\u7791\u792C\u7D46
+NGram.KANJI_2_15=\u5713\u58FD\u5D17\u5D19\u5DBC\u5F4C\u6191\u64A5\u687F\u69AE\u6AFB\u6EEC\u6F3F\u6FE4\u6FF1\u6FFE\u700B\u74CA\u76E1\u76E7\u7926\u792B\u79AE\u7AA9\u7C43\u7C4C\u7C64\u7DBD\u81A0\u856D\u8594\u8606\u8A62\u8AF7\u8CC8\u8CE3\u8D99\u8F1B\u8F3B\u9059\u9127\u9264\u947D\u95A9\u97CB\u980C\u9838\u9846\u99AE\u9A19\u9B06\u9B91\u9F4A\u9F4B
+NGram.KANJI_2_16=\u4E69\u4EC4\u4EDF\u4EF3\u4F0B\u4F5E\u5000\u5028\u50E5\u513B\u5157\u51DC\u52D7\u530F\u5379\u53F5\u5471\u5477\u5555\u555C\u557B\u5594\u55B2\u55C9\u560D\u5616\u562E\u5630\u5653\u5657\u566F\u56A8\u56B6\u5820\u5880\u58CE\u58D9\u5950\u5969\u596D\u599E\u59B3\u59CD\u59D2\u5A40\u5AA7\u5ABC\u5AD7\u5AD8\u5B0B\u5B24\u5B38\u5B53\u5C5C\u5D06\u5D47\u5D94\u5D9D\u5E57\u5EC4\u5F46\u5FAC\u60BD\u60D8\u6123\u615D\u615F\u6175\u618A\u61AB\u61E3\u623E\u6308\u636B\u645F\u6519\u6595\u6698\u66B8\u67D9\u6840\u695D\u696E\u6979\u69C1\u69E8\u6AEC\u6AFA\u6B5F\u6CAC\u6CE0\u6CEF\u6D0C\u6D36\u6DD2\u6DD9\u6DE6\u6DEC\u6E5F\u6FA0\u6FEC\u7156\u71C4\u71DC\u71EC\u71FC\u720D\u7230\u7292\u7296\u72A2\u72CE\u7357\u737A\u7380\u7386\u73A8\u73EE\u743F\u74A6\u74CF\u74D4\u74DA\u755A\u75A5\u75B3\u75C2\u75E0\u75F1\u75FF\u7601\u7609\u7646\u7658\u769A\u76B0\u774F\u775C\u778B\u77BD\u77C7\u7843\u787F\u78F4\u79C8\u7A88\u7A95\u7AFD\u7B1E\u7B67\u7B9D\u7BCC\u7C0D\u7C11\u7C37\u7C40\u7C6E\u7CB3\u7CBD\u7D09\u7D31\u7D40\u7D5B\u7D70\u7D91\u7D9E\u7DB0\u7DD9\u7DF9\u7E08\u7E11\u7E1D\u7E35\u7E52\u7FB6\u7FBF\u7FEE\u8012\u801C\u8028\u8052\u8123\u8188\u81C3\u81DA\u81FE\u8210\u82BE\u83A0\u83D4\u8407\u8435\u8477\u849E\u84C6\u84CA\u85F9\u867A\u86B5\u86B6\u86C4\u8706\u8707\u870A\u8768\u87BB\u8831\u8839\u8879\u8921\u8938\u8964\u89A6\u89AC\u8A10\u8A3E\u8AC2\u8ADB\u8AF3\u8B2B\u8B41\u8B4E\u8B5F\u8B6B\u8B92\u8C55\u8C62\u8C73\u8C8A\u8C8D\u8CB2\u8CB3\u8CD2\u8CE1\u8CFB\u8D0D\u8E34\u8E7A\u8E8A\u8ED4\u8EFE\u8F0A\u8F1C\u8F1E\u8F26\u8FAE\u9088\u90C3\u90FE\u9134\u9148\u91D9\u91E9\u9238\u9239\u923D\u924D\u925A\u9296\u92AC\u92BB\u9315\u9319\u931A\u9321\u9370\u9394\u93A2\u93D8\u93E4\u943A\u9477\u9582\u958E\u95A1\u95C8\u95CC\u95D4\u9658\u966C\u970F\u973D\u9744\u975B\u9766\u97A3\u97A6\u97C1\u97C6\u980A\u9837\u9853\u9870\u98AF\u98B3\u98BA\u98E9\u98ED\u9912\u991B\u991E\u993D\u993F\u99D1\u99DF\u9A01\u9A3E\u9A43\u9A4D\u9ACF\u9AE1\u9B22\u9B58\u9C25\u9C3E\u9C54\u9C56\u9D15\u9D23\u9D89\u9DC2\u9DD3\u9E82\u9E8B\u9EA9\u9EE0\u9EF7\u9F07\u9F2F\u9F34\u
9F3E\u9F5F\u9F6C
+NGram.KANJI_2_18=\u5155\u520E\u55DF\u56C0\u56C1\u5793\u5FD6\u5FF8\u6029\u60FA\u613E\u6147\u615A\u62C8\u6384\u6883\u6894\u68F9\u6AA3\u6AAE\u6AC2\u6E63\u7032\u70A4\u7146\u71FB\u7228\u72F7\u7370\u7441\u74BF\u75B8\u75E3\u7622\u76CD\u7768\u79E3\u7A60\u7B6E\u7BC1\u7C5F\u7D06\u7E2F\u7E39\u8146\u81CF\u8703\u8729\u8737\u87EF\u88D2\u8A22\u8AC4\u8AF6\u8E59\u8F33\u8F42\u9169\u91B1\u9278\u93C3\u93DD\u9460\u946A\u9785\u9AD1\u9B4D\u9B4E\u9C31\u9D12\u9ECC
+NGram.KANJI_2_21=\u502A\u544E\u59AE\u59EC\u5D1B\u66A8\u6BD7\u6C76\u6E1D\u70EF\u742A\u7459\u7FE1\u82EF\u8343\u85C9\u8A79\u90DD
+NGram.KANJI_2_22=\u4EDE\u4F7B\u504C\u50EE\u52E3\u52F0\u536E\u54A9\u54BB\u54BF\u54C2\u54E6\u550F\u556A\u55E8\u564E\u5664\u5671\u568F\u56DD\u572F\u57A0\u5809\u5924\u59A3\u59A4\u59E3\u5A13\u5A23\u5B51\u5B73\u5C50\u5C8C\u6035\u60C6\u6106\u6215\u62CE\u62FD\u64ED\u6549\u6554\u655D\u659B\u65CE\u65D6\u6615\u6624\u665E\u6677\u669D\u66E9\u6772\u677C\u696B\u6A84\u6AA0\u6BFD\u6C16\u6C86\u6C94\u6CD6\u6D2E\u6D39\u6F78\u6FB6\u705E\u70CA\u7168\u723B\u7256\u7284\u73B3\u740D\u742F\u7498\u74A9\u752D\u75F3\u7634\u768E\u76B4\u76E5\u77A0\u77DC\u781F\u782D\u7AA0\u7BFE\u7FF1\u80AB\u8174\u81EC\u8202\u8222\u8228\u82DC\u8306\u83FD\u8469\u84FF\u859C\u8617\u86B1\u8722\u8C89\u8D67\u8DCE\u8E49\u8E76\u8E87\u8FE2\u8FE4\u8FF8\u9016\u905B\u9174\u982B\u98E7\u9955\u9B32
+NGram.KANJI_2_23=\u4F8F\u5055\u524C\u548E\u5583\u594E\u5CB7\u5ED6\u5F5D\u6021\u66B9\u66F0\u6C55\u6C7E\u6C82\u6E2D\u6EC7\u6ED5\u70B3\u71B9\u72C4\u73C0\u7426\u745C\u748B\u7696\u777F\u79A7\u79B9\u7F8C\u8153\u8339\u8386\u8725\u90B5\u9102\u962E\u9716\u97F6
+NGram.KANJI_2_28=\u5733\u57D4\u838E\u8FEA
+NGram.KANJI_2_29=\u50ED\u5F29\u62EE\u6A9C\u7BC6\u80F1\u8129\u8171\u822B\u8AEB
+NGram.KANJI_2_30=\u4EB3\u4F15\u4FB7\u5006\u509A\u50A2\u5102\u5109\u5115\u5137\u5138\u513C\u524B\u524E\u5277\u528A\u52E6\u52FB\u5331\u5436\u5443\u54FD\u5538\u555E\u55C6\u55C7\u5679\u5690\u5695\u56C9\u56D1\u56EA\u588A\u58E2\u5AFB\u5B2A\u5B43\u5B7F\u5BE2\u5C37\u5D27\u5D84\u5D87\u5DD4\u5EC1\u5EDD\u5F12\u5FA0\u60F1\u616B\u61F5\u61F6\u61FE\u62DA\u6371\u6399\u63C0\u6451\u647B\u6493\u64BB\u64BF\u64C4\u64F1\u64F7\u650F\u652C\u665D\u6684\u6688\u66EC\u672E\u68E7\u69A6\u69ED\u69F3\u6A01\u6AAF\u6AE5\u6BA4\u6BAE\u6BAF\u6BC6\u6C08\u6C2C\u6C59\u6D87\u6EBC\u6ECC\u6EF7\u6F6F\u6F80\u6F86\u6FD8\u6FF0\u6FFA\u7006\u7018\u7030\u7051\u7192\u71C9\u71D9\u71F4\u71FE\u7274\u7377\u74A3\u750C\u7613\u7627\u7661\u7662\u7665\u766E\u7671\u7672\u76BA\u775E\u776A\u778C\u78E7\u7955\u7A08\u7AC5\u7B4D\u7C2B\u7C6C\u7CF0\u7D02\u7D1C\u7D73\u7DA2\u7DB5\u7DDE\u7E09\u7E0A\u7E37\u7E43\u7E61\u7E7D\u7E93\u7F3D\u7FF9\u81A9\u8271\u83F8\u84C0\u8514\u85BA\u86A9\u86FB\u879E\u8814\u8836\u889E\u8932\u896A\u896F\u8993\u89B2\u8A15\u8A16\u8A1D\u8A5B\u8A6C\u8A6D\u8A7C\u8AA1\u8AA3\u8AA5\u8B0A\u8B4F\u8B59\u8B96\u8C48\u8C54\u8CBD\u8CFA\u8D13\u8E89\u8E8B\u8EAA\u8EC0\u8EDB\u8EFC\u8F12\u8F1F\u8F3E\u8F45\u8FFA\u9015\u9183\u919E\u91A3\u91D7\u91F5\u9209\u9215\u923E\u9240\u9251\u9257\u927B\u9293\u92A8\u92C5\u92C7\u92F0\u9333\u935A\u9382\u938A\u9398\u93B3\u93D7\u93DF\u93E2\u93FD\u942B\u942E\u9433\u9463\u9470\u9472\u947E\u95D0\u96CB\u97C3\u97CC\u981C\u9839\u986B\u98B6\u98EA\u9909\u991A\u9935\u993E\u9951\u99A5\u99B1\u99D9\u99DD\u99F1\u9A2B\u9A62\u9A65\u9AAF\u9AD2\u9AEF\u9B0D\u9B28\u9B77\u9BFD\u9C49\u9C5F\u9C78\u9D3F\u9D72\u9DD7\u9E1B\u9EB4\u9EF4\u9F66\u9F94
+NGram.KANJI_2_31=\u5DBD\u63C6\u6E3E\u7587\u8AF1\u8B5A\u9695
+NGram.KANJI_2_32=\u53A5\u589F\u5CD9\u7109\u7F79\u8006\u8654\u8944\u968B\u96CD
+NGram.KANJI_2_35=\u4F47\u4F91\u4FCE\u4FDF\u527D\u535E\u55DA\u56A5\u5879\u5A11\u5B7A\u5CAB\u5CF4\u5EBE\u5F7F\u5FA8\u601B\u606B\u60B8\u610D\u6134\u619A\u61FA\u6369\u6523\u65CC\u66C4\u6727\u6968\u6A05\u6A48\u6B59\u6BEC\u6D35\u6D38\u6E19\u701F\u7064\u711C\u716C\u71A8\u71E7\u7258\u743A\u746F\u75BD\u75D9\u75F2\u7669\u766C\u76DE\u7729\u77BC\u78EC\u792A\u7A37\u7A62\u7BE6\u7C2A\u7C50\u7D07\u7DD8\u7E5A\u7F8B\u7FD5\u7FF3\u8151\u81CD\u8317\u83F4\u85EA\u85FA\u8823\u895E\u89F4\u8A0C\u8A41\u8AA8\u8ACD\u8B10\u8CC1\u8D05\u8D73\u8E4A\u8E85\u8E91\u8EFB\u8F13\u9087\u914A\u91C9\u923F\u93B0\u9403\u95A8\u95AD\u9730\u9865\u9903\u9945\u9949\u99AD\u99E2\u9A6A\u9D26\u9E1E\u9EDD\u9F2C\u9F72
+NGram.KANJI_2_36=\u4E9E\u4F86\u5011\u50B3\u5152\u5169\u5340\u5718\u5B78\u5BE6\u5BF6\u5C07\u5EE3\u61C9\u6230\u6703\u689D\u6A02\u6C23\u7063\u7368\u756B\u7576\u767C\u7A31\u7D93\u7E23\u7E3D\u81FA\u8207\u842C\u85DD\u865F\u8B49\u8B80\u8CFD\u908A\u9435\u95DC\u965D\u9AD4\u9EE8
+NGram.KANJI_2_37=\u5480\u5580\u5C39\u67EF\u68B5\u6D85\u8521\u90B1
+NGram.KANJI_2_38=\u4E1F\u4F96\u4FE0\u50F1\u5118\u522A\u5291\u52C1\u52DB\u52F3\u52F5\u52F8\u53B2\u55CE\u562F\u580A\u5862\u58AE\u58D8\u58DF\u58E9\u58EF\u5925\u593E\u599D\u5ABD\u5C62\u5EC2\u5EDA\u5EE2\u5F4E\u5F65\u6085\u6158\u61FC\u6200\u62CB\u633E\u6416\u6436\u6490\u64CB\u64E0\u64FA\u6514\u651C\u6524\u6558\u6583\u66B1\u66C6\u66C9\u66E0\u6A11\u6A1E\u6A38\u6A62\u6AB3\u6B16\u6B98\u6BBC\u6C2B\u6DDA\u6DE8\u6DEA\u6DFA\u6EEF\u6EFE\u6F32\u6F51\u6F5B\u700F\u71D2\u7210\u7246\u7260\u72A7\u72F9\u7375\u7378\u758A\u760B\u76DC\u76EA\u77DA\u77FD\u78DA\u7919\u797F\u79AA\u7A05\u7A4C\u7ACA\u7C72\u7D81\u7DDD\u7E31\u7E69\u7E6B\u7E73\u7E96\u7E9C\u81BD\u81C9\u81DF\u8259\u8277\u8396\u83A7\u8523\u8525\u860A\u863F\u8667\u87A2\u87F2\u881F\u883B\u89F8\u8B20\u8B74\u8B9A\u8C4E\u8C6C\u8C93\u8CEC\u8D0A\u8D0F\u8D95\u8E10\u8F4E\u8FAF\u8FF4\u905E\u9072\u9081\u908F\u91AC\u91C0\u91C1\u91D0\u921E\u9223\u9245\u929C\u92B3\u92C1\u9336\u934A\u93C8\u9444\u9452\u947C\u947F\u9592\u95B1\u95C6\u95D6\u95E1\u95E2\u96DE\u9742\u978F\u984F\u9871\u98B1\u98C4\u99ED\u9A37\u9A45\u9A5F\u9AEE\u9B27\u9BCA\u9C77\u9D51\u9D5D\u9E79\u9E7C\u9E7D\u9EB5\u9EBC\u9F61\u9F63\u9F90\u9F9C
+NGram.KANJI_3_1=\u5283\u7562\u7DEC\u88E1\u8F2F
+NGram.KANJI_3_2=\u5009\u502B\u5049\u5075\u507D\u5091\u5098\u50B5\u50B7\u50BE\u5100\u5104\u511F\u518A\u525B\u5289\u5442\u5805\u589C\u58C7\u5922\u596A\u5A66\u5B6B\u5BE7\u5BE9\u5DBA\u5E63\u5E7E\u5FB9\u6163\u616E\u6176\u61B2\u61B6\u61F8\u639B\u63DA\u63EE\u640D\u64B2\u64C1\u64EC\u6557\u6575\u6607\u66AB\u68C4\u6A39\u6C96\u6CC1\u6E1B\u6E6F\u6E9D\u6EC5\u6F01\u6F64\u6FC3\u7058\u707D\u7344\u7642\u76E4\u7832\u790E\u7B46\u7D05\u7D0B\u7D14\u7D19\u7D1B\u7D39\u7D61\u7DB1\u7DCA\u7DD2\u7DE0\u7DE9\u7DEF\u7DF4\u7E2E\u7E3E\u8105\u8108\u81E8\u8266\u84CB\u84EE\u85A9\u885D\u88DC\u8972\u8A02\u8A0E\u8A13\u8A17\u8A2A\u8A34\u8A3A\u8A3C\u8A69\u8A73\u8A95\u8AA0\u8AA4\u8AB2\u8AC7\u8ACB\u8B00\u8B1B\u8B1D\u8B5C\u8C9D\u8C9E\u8CA2\u8CA8\u8CA9\u8CAB\u8CAC\u8CB7\u8CBF\u8CC0\u8CDE\u8CE2\u8CFC\u8D08\u8DE1\u8E8D\u8ECC\u8EDF\u8EF8\u8F14\u8F1D\u8F2A\u8F44\u9055\u9069\u9077\u907C\u90F5\u91DD\u9285\u92FC\u9326\u932F\u9375\u9396\u93AE\u93E1\u9451\u9589\u95A3\u9663\u9670\u9673\u96BB\u9801\u9802\u9803\u9806\u9808\u9810\u983B\u984D\u9858\u9867\u98EF\u98F2\u98FE\u990A\u99D0\u9A0E\u9A5A\u9B5A\u9CE5\u9DB4\u9E97\u9F8D
+NGram.KANJI_3_3=\u543E\u5BEE\u5F18\u6590\u725F\u83C5\u85E9\u9E93
+NGram.KANJI_3_4=\u5016\u53AD\u5606\u5629\u58BE\u5F14\u6065\u6144\u646F\u647A\u67F5\u6953\u6C3E\u6F2C\u6F97\u6FB1\u7169\u71E6\u71ED\u74BD\u79BF\u7A1C\u7A4E\u7AAF\u7CDE\u7D17\u7D43\u7E55\u7FA8\u807E\u8139\u8490\u8569\u856A\u87FB\u8A23\u8AB9\u8AE6\u8AFA\u8B2C\u8CD1\u91D8\u92F8\u9318\u96DB\u99B4\u9BC9\u9C2D\u9CF6\u9D61\u9DFA
+NGram.KANJI_3_5=\u4E26\u4F75\u4FC2\u500B\u5074\u5099\u512A\u5225\u5247\u5275\u5287\u52D5\u52D9\u52DD\u52E2\u5354\u54E1\u554F\u5712\u57F7\u5831\u5834\u5BAE\u5C0E\u5C64\u5CA1\u5CF6\u5E2B\u5E79\u5EAB\u5F35\u5F37\u5F8C\u5FA9\u611B\u614B\u63A1\u63DB\u6642\u66F8\u6771\u696D\u6975\u69CB\u6A19\u6A4B\u6A5F\u6BBA\u6C7A\u6E2C\u6E96\u6F22\u70BA\u7121\u71B1\u7372\u73FE\u74B0\u7570\u76E3\u78BA\u7A2E\u7A4D\u7AF6\u7BC0\u7BC4\u7BC9\u7C21\u7D00\u7D04\u7D0D\u7D1A\u7D30\u7D42\u7D44\u7D50\u7D66\u7D71\u7DAD\u7DDA\u7DE8\u7E54\u7F85\u7FA9\u7FD2\u8056\u805E\u8077\u8208\u83EF\u8449\u8853\u885B\u88FD\u8907\u898B\u898F\u8996\u89AA\u8A08\u8A18\u8A2D\u8A31\u8A55\u8A5E\u8A66\u8A71\u8A72\u8A8C\u8A8D\u8A9E\u8ABF\u8AD6\u8AF8\u8B58\u8B70\u8B77\u8CA0\u8CA1\u8CB4\u8CBB\u8CC7\u8CEA\u8ECA\u8ECD\u8F03\u8F09\u8F38\u8FB2\u9023\u9031\u9032\u904A\u904B\u904E\u9054\u9060\u9078\u907A\u9084\u9280\u9577\u9580\u958B\u9593\u9678\u967D\u968A\u968E\u969B\u96E2\u96E3\u96F2\u96FB\u97D3\u97FF\u9805\u9818\u982D\u984C\u985E\u98A8\u98DB\u9928\u99AC\u9BAE
+NGram.KANJI_3_8=\u5F6B\u6C4E\u7B87\u8A70
+NGram.KANJI_3_9=\u540B\u5B5C\u826E
+NGram.KANJI_3_11=\u4F83\u4FF8\u51CB\u52BE\u53F1\u548B\u558B\u5CB1\u5D69\u5F3C\u620E\u621F\u64E2\u67DA\u6854\u69CC\u6A35\u6C8C\u6E1A\u6F15\u6FE0\u717D\u7252\u7AFA\u82D3\u83DF\u8431\u9041\u9149\u9798
+NGram.KANJI_3_12=\u4ED5\u55E3\u572D\u57A3\u587E\u5983\u5A9B\u5C90\u5E61\u672D\u6960\u6F5F\u72D9\u72E9\u757F\u7949\u7950\u7E82\u7FCC\u82B8\u90B8\u91DC\u961C\u9B45
+NGram.KANJI_3_13=\u55AB\u6249\u643E\u6841\u68B1\u725D\u7B8B\u7C95\u7E1E\u7F36\u8A03\u8A6B\u8E74\u95A4
+NGram.KANJI_3_15=\u50AD\u50D1\u5132\u51F1\u55AC\u5617\u5687\u584A\u59EA\u5B30\u5BF5\u5C0B\u5C4D\u5EDF\u6182\u61A4\u64AB\u64FE\u66A2\u6897\u694A\u69CD\u6B3D\u6BC0\u6D29\u6F38\u7015\u7149\u71C8\u723A\u7336\u7345\u755D\u76C3\u78A9\u798D\u7AAE\u7DFB\u7E2B\u7F75\u7F77\u81E5\u834A\u852D\u85CD\u8755\u8A3B\u8A54\u8AE7\u8B02\u8B39\u8CAA\u8CE6\u8DA8\u8E5F\u8F5F\u905C\u912D\u919C\u92D2\u932B\u937E\u9418\u9583\u9812\u985B\u9905\u99B3\u99C1\u99D5\u9A30\u9CF3\u9D3B\u9D6C
+NGram.KANJI_3_16=\u6D6C\u72FD\u77A5\u8956\u9C0D
+NGram.KANJI_3_18=\u5919\u5F4A\u6063\u63AC\u649A\u6715\u6AD3\u71D0\u758B\u834F\u85F7\u88DF\u8F61\u93D1\u98F4\u9D60
+NGram.KANJI_3_19=\u4F50\u7DB2\u962A
+NGram.KANJI_3_22=\u5E96\u75D4\u91C6
+NGram.KANJI_3_23=\u5E9A\u6C40\u821C\u839E\u8FED\u9EDB
+NGram.KANJI_3_27=\u5F01\u66DC
+NGram.KANJI_3_29=\u5023\u5208\u531D\u536F\u53E9\u54C9\u598A\u59BE\u5A20\u5D6F\u5DF3\u66C7\u66D6\u66F3\u6775\u6A3D\u6ADB\u6B86\u6C72\u6E25\u73EA\u7435\u760D\u7656\u7825\u78D0\u7A14\u7A6B\u7B20\u7BE0\u7CF8\u7DAC\u7DBB\u7DBE\u80E4\u80F4\u837B\u8466\u8568\u867B\u8A63\u91E7\u9320\u935B\u9591\u965B\u98E2\u990C\u9913\u9BAB
+NGram.KANJI_3_30=\u60B6\u8AD2\u8CC2\u9237\u9328\u934D\u9397\u9830
+NGram.KANJI_3_31=\u4FB6\u50D5\u51CD\u559A\u55AA\u5674\u5857\u585A\u5875\u58B3\u596E\u59E6\u5A41\u5D50\u5E25\u5E33\u5F59\u61C7\u61F2\u6368\u6383\u65AC\u68DF\u68F2\u6A3A\u6B04\u6DBC\u6DF5\u6E26\u6E4A\u6E67\u6F54\u6F70\u6FC1\u6FEB\u7159\u727D\u7652\u77EF\u78EF\u798E\u7A40\u7AAA\u7BE4\u7C60\u7CE7\u7CFE\u7D21\u7D33\u7D5E\u7D79\u7DB4\u7DBF\u7E1B\u7E8F\u7F70\u814E\u816B\u8178\u819A\u84BC\u85A6\u865C\u8766\u8A1F\u8A50\u8A60\u8A6E\u8A87\u8A98\u8AB0\u8ADC\u8AED\u8AEE\u8B0E\u8B19\u8CA7\u8CAF\u8CB8\u8CBC\u8CC3\u8CC4\u8CCA\u8CDC\u8CE0\u8CED\u8ED2\u8F29\u8F3F\u91E3\u920D\u9234\u925B\u9298\u9310\u934B\u958F\u95A5\u9727\u97FB\u9811\u984E\u98FC\u98FD\u99D2\u99FF\u9B31\u9BE8\u9C57\u9CE9\u9CF4\u9D28\u9DF9
+NGram.KANJI_3_32=\u4E1E\u502D\u51A5\u5321\u58EC\u5A3C\u5BC5\u5CE8\u61A9\u620A\u65A1\u6714\u6853\u6893\u6C50\u6C5D\u7436\u745A\u745B\u773A\u7941\u7947\u8543\u865E\u8C5A\u914B\u99A8\u9AB8
+NGram.KANJI_3_35=\u4E99\u5BA5\u5DFD\u608C\u60C7\u60DA\u6190\u61A7\u6753\u6777\u6787\u6B4E\u6F23\u6FE1\u6FEF\u7337\u7827\u786F\u7893\u7ABA\u7B94\u7BB8\u7C3E\u7D62\u7E6D\u80B1\u81BF\u81C6\u821B\u82E7\u83F0\u84D1\u86ED\u8888\u8B01\u8B04\u8F4D\u9291\u92E4\u932E\u9354\u936C\u939A\u9957\u9AED\u9BAA\u9BAD\u9BD6\u9BDB\u9C3B\u9D1B
+NGram.KANJI_3_36=\u50C5\u53E2\u5EE0\u65BC\u70CF\u723E\u7D10\u7D9C\u806F\u8607\u862D\u8A0A\u8AFE\u8CD3\u9019\u9813\u9B6F
+NGram.KANJI_3_37=\u4EA8\u4F3D\u5384\u5EFF\u60DF\u66DD\u6E5B\u8087\u82D1\u8FE6\u9640\u9E9F
+NGram.KANJI_3_38=\u5147\u525D\u5678\u617E\u6372\u79A6\u8ABC\u92EA\u9438\u9817
+NGram.KANJI_4_0=\u6D3C\u718F\u74EE\u8712
+NGram.KANJI_4_9=\u4F84\u54C6\u5565\u68F1\u6D82\u83C7
+NGram.KANJI_4_10=\u4FE9\u4FED\u51FF\u523D\u5300\u5364\u538C\u5450\u5455\u545C\u54D1\u54D7\u5578\u56A3\u58F6\u592F\u5CE6\u5D2D\u5E90\u6073\u607C\u60EB\u61D2\u62E2\u62E3\u631A\u6320\u6323\u6361\u63B7\u63B8\u63BA\u6405\u65A9\u65F7\u6619\u6655\u67A3\u67E0\u6805\u6808\u6866\u6868\u6869\u6A71\u6BE1\u6C79\u6CA5\u6CDE\u6DA4\u6DA7\u6DA9\u6E85\u70DB\u70E6\u70EB\u7115\u724D\u7410\u759F\u75AE\u75EA\u75F9\u762B\u763E\u76B1\u77EB\u783E\u79C3\u7A8D\u7A9C\u7B5D\u7BF1\u7EC5\u7ED2\u7EDE\u7EE3\u7EF7\u7EF8\u7EFD\u7F00\u7F0E\u7F15\u7F1A\u7F20\u7F24\u7F28\u7FA1\u7FD8\u8038\u803B\u804B\u80AE\u817B\u82C7\u8327\u835E\u8367\u83BA\u8424\u864F\u8681\u8682\u8715\u8717\u8721\u8747\u874E\u8845\u886C\u889C\u88E4\u89C5\u8BB6\u8BB9\u8BC0\u8BC5\u8BE1\u8BEB\u8BEC\u8BF5\u8C0E\u8C1A\u8D2E\u8D31\u8D43\u8D4E\u8D58\u8F67\u8F7F\u9489\u9499\u949D\u94A0\u94A5\u94AE\u94BE\u94D0\u94DB\u94F2\u9508\u950C\u951A\u9525\u952D\u952F\u9530\u953B\u9540\u9550\u9570\u9576\u95F0\u960E\u9668\u96CF\u97E7\u9885\u988A\u98A4\u9965\u9975\u997A\u997F\u9985\u998D\u998F\u9A6E\u9A6F\u9A74\u9A79\u9A7C\u9A82\u9A87\u9CA4\u9CC4\u9CCD\u9CD6\u9E20\u9E25\u9E35\u9E3D\u9E45\u9E49\u9E4A\u9E66
+NGram.KANJI_4_16=\u576F\u579B\u6345\u78B4\u79EB\u79F8
+NGram.KANJI_4_17=\u4E13\u4E1A\u4E1C\u4E24\u4E25\u4E2A\u4E3E\u4E49\u4E50\u4E66\u4E9A\u4EA7\u4EBF\u4ECE\u4EEC\u4EF7\u4F17\u4F20\u5170\u5173\u519B\u51B3\u51E4\u51FB\u5219\u521B\u522B\u529E\u52A1\u52A8\u52BF\u534F\u5355\u536B\u5386\u53BF\u53D1\u53D8\u542F\u5458\u54CD\u56E2\u56ED\u56F4\u56FE\u573A\u5904\u590D\u5934\u5B81\u5B9E\u5BF9\u5BFC\u5C14\u5C9B\u5E26\u5E7F\u5E94\u5F00\u5F20\u5F3A\u603B\u6218\u65E0\u65F6\u663E\u672F\u6743\u6784\u6807\u6C14\u6C49\u707E\u70ED\u73AF\u73B0\u7535\u76D1\u786E\u79CD\u79EF\u7B80\u7C7B\u7EA2\u7EA6\u7EA7\u7EAA\u7EBF\u7EC4\u7EC7\u7ED3\u7EDF\u7EE7\u7EED\u7EF4\u7F16\u7F57\u804C\u8054\u817E\u8282\u82CF\u83B7\u8425\u89C1\u89C2\u89C4\u89C6\u8BA1\u8BA4\u8BAE\u8BAF\u8BB0\u8BB8\u8BBA\u8BBE\u8BC1\u8BC4\u8BD1\u8BDD\u8BE5\u8BED\u8BF4\u8C03\u8D22\u8D23\u8D28\u8D39\u8D44\u8D5B\u8F66\u8F6C\u8F83\u8FBE\u8FC7\u8FD0\u8FD8\u8FD9\u8FDB\u8FDE\u9009\u94C1\u957F\u95E8\u95EE\u95F4\u95FB\u961F\u9633\u9645\u9646\u96BE\u9879\u9884\u9886\u9898\u98CE\u9A6C\u9F99
+NGram.KANJI_4_18=\u51DB\u67B7
+NGram.KANJI_4_22=\u4FA5\u545B\u5499\u5520\u5570\u56F1\u5A76\u5C96\u60AF\u60ED\u618B\u61A8\u62A0\u62A1\u62E7\u6363\u6390\u63B0\u6400\u6402\u6512\u6748\u70C1\u732C\u765E\u7663\u76CF\u7741\u781A\u7980\u79C6\u79FD\u7AA5\u7B0B\u7B8D\u7BA9\u7BAB\u7BD3\u7CAA\u7EAB\u7ECA\u7EE2\u7F2D\u7F30\u8110\u8113\u81CA\u835A\u8360\u84D6\u852B\u87E5\u8869\u8A8A\u8BA5\u8BF2\u8C05\u8C12\u8D30\u8D4A\u8D61\u8DF7\u8E6D\u8E8F\u8F95\u8F99\u8FAB\u94B3\u94C6\u94E3\u9504\u954A\u9563\u95FA\u9893\u9981\u9992\u9AA1\u9CAB\u9E2F\u9E33\u9EB8
+NGram.KANJI_4_24=\u4E22\u4E8F\u4F1E\u4FA3\u5151\u517D\u51BB\u51D1\u5220\u529D\u52CB\u5367\u5389\u5395\u53E0\u53F9\u5413\u548F\u5524\u575E\u575F\u5784\u5792\u57A6\u57AB\u58F3\u5986\u5988\u5A04\u5A07\u5BA0\u5C18\u5C82\u5DE9\u5E10\u5E1C\u5F2F\u60E9\u6124\u629B\u6321\u6324\u635E\u63FD\u6401\u644A\u6491\u655B\u658B\u6635\u67AB\u67DC\u680B\u692D\u6984\u6A31\u6B7C\u6BD9\u6C22\u6CA6\u6CA7\u6CEA\u6CFB\u6CFC\u6D46\u6D47\u6D4A\u6D51\u6DA1\u6E0A\u6E83\u6EE4\u6EE5\u6F9C\u6FD2\u70C2\u7237\u727A\u730E\u7574\u75AF\u7792\u7816\u7845\u78B1\u7A77\u7A91\u7A9D\u7AD6\u7B3C\u7B5B\u7CAE\u7EA4\u7EB1\u7EBA\u7ECE\u7ED1\u7EF0\u7EF3\u7F14\u7F1D\u7F34\u7F62\u8042\u806A\u80A0\u80A4\u80BE\u80BF\u80C0\u810F\u8138\u8231\u8270\u829C\u82CD\u8350\u83B9\u841D\u8574\u8680\u8BB3\u8BBC\u8BBD\u8BC8\u8BF1\u8BFD\u8C0A\u8C0D\u8C1C\u8C24\u8C26\u8C2C\u8C2D\u8C34\u8D1E\u8D2C\u8D3C\u8D41\u8D42\u8D4C\u8D50\u8D5A\u8F69\u8F88\u8F90\u8FA9\u915D\u9171\u9493\u949E\u94A7\u94A9\u94BB\u94C3\u94C5\u94DD\u94F8\u9505\u9510\u9523\u9524\u95EF\u95F7\u95F9\u9600\u9610\u96F3\u97F5\u987D\u9882\u9888\u9896\u98D8\u9971\u9972\u9976\u997C\u9A84\u9A86\u9A8F\u9A97\u9A9A\u9AA4\u9CB8\u9CDE\u9E26\u9E43\u9E64\u9E70\u9F7F\u9F9F
+NGram.KANJI_4_28=\u534E\u62A5\u7ECF\u7F51
+NGram.KANJI_4_34=\u4E34\u4E3D\u4E4C\u4E54\u4E60\u4E61\u4E70\u4EB2\u4EC5\u4EEA\u4F18\u4F1F\u4F24\u4F26\u4FA7\u50A8\u513F\u5174\u517B\u518C\u519C\u51B5\u51CF\u5218\u521A\u5267\u52B3\u5356\u5382\u5385\u538B\u53A6\u5434\u5706\u5723\u5757\u575A\u575B\u575D\u5907\u591F\u593A\u5956\u5B59\u5BA1\u5BAB\u5BBD\u5BBE\u5BFB\u5C42\u5C81\u5E01\u5E08\u5E86\u5E93\u5F02\u5F39\u5F52\u5F55\u5F7B\u6000\u6001\u6076\u620F\u6237\u6267\u6269\u626C\u62A2\u62A4\u62DF\u62E5\u62E9\u6325\u635F\u6362\u6444\u6653\u6682\u6740\u6742\u6768\u6781\u6811\u6837\u6865\u68C0\u6B22\u6BC1\u6BD5\u6C47\u6C9F\u6CAA\u6CFD\u6D4B\u6DA8\u6E10\u6EE1\u6EE8\u706D\u7075\u70DF\u7231\u739B\u7597\u76D6\u76D8\u77FF\u7801\u7840\u79BB\u7A33\u7ADE\u7B14\u7B7E\u7CA4\u7D27\u7EB3\u7EBD\u7EC3\u7EC6\u7EC8\u7ECD\u7ED5\u7ED9\u7EDC\u7EDD\u7EE9\u7EFC\u7EFF\u7F13\u7F29\u8083\u80DC\u8111\u814A\u8230\u827A\u8363\u836F\u8428\u84DD\u867D\u8865\u88AD\u89C8\u8BA2\u8BA8\u8BA9\u8BAD\u8BB2\u8BBF\u8BC6\u8BCD\u8BD5\u8BEF\u8BF7\u8BF8\u8BFA\u8BFB\u8C08\u8D1D\u8D1F\u8D21\u8D25\u8D27\u8D2D\u8D2F\u8D35\u8D38\u8DC3\u8F6E\u8F6F\u8F7B\u8F7D\u8F86\u8F91\u8F93\u8F96\u8FB9\u8FBD\u8FC1\u8FDC\u8FDD\u9002\u9057\u90BB\u90D1\u91CA\u9488\u949F\u94A2\u94B1\u94F6\u9500\u9526\u9547\u9614\u9634\u9635\u9636\u9648\u9655\u9669\u9690\u97E9\u9875\u9876\u987A\u987B\u987E\u987F\u9891\u989D\u98DE\u9986\u9A7B\u9A8C\u9C81\u9C9C\u9F50
+NGram.KANJI_4_39=\u4E1B\u4E1D\u4E27\u4EA9\u4ED1\u4ED3\u4F2A\u4FA6\u4FA8\u503A\u503E\u507F\u5188\u51AF\u51C0\u51C9\u51ED\u51EF\u5242\u5251\u52B2\u5362\u53A2\u5415\u5417\u5428\u55B7\u5760\u5899\u5939\u594B\u5987\u5A31\u5A74\u5BAA\u5C1D\u5C7F\u5C97\u5CAD\u5E05\u5E2E\u5E99\u5E9E\u5E9F\u5F03\u5FC6\u5FE7\u60AC\u60CA\u60EF\u626B\u6270\u629A\u62E6\u62E8\u6446\u6447\u654C\u67AA\u680F\u6863\u68A6\u6C64\u6D01\u6D53\u6D9D\u6DA6\u6E14\u6E17\u6EDA\u6EE9\u707F\u70BC\u70E7\u7275\u72B9\u72EE\u72F1\u743C\u7545\u76D0\u7855\u7978\u7B79\u7BEE\u7EA0\u7EAC\u7EAF\u7EB2\u7EB5\u7EB7\u7EB8\u7EB9\u7ED8\u7EEA\u7EF5\u7F05\u7F06\u7F18\u7F5A\u80C1\u80F6\u8109\u8206\u8273\u82F9\u8346\u8361\u83B2\u8427\u8651\u867E\u8854\u89C9\u8BC9\u8BCA\u8BD7\u8BDA\u8BDE\u8BE2\u8BE6\u8BFE\u8C01\u8C0B\u8C10\u8C13\u8C22\u8C23\u8C28\u8C31\u8D24\u8D26\u8D29\u8D2A\u8D2B\u8D34\u8D37\u8D3A\u8D3E\u8D3F\u8D4B\u8D4F\u8D54\u8D56\u8D5E\u8D60\u8D62\u8D75\u8D76\u8D8B\u8F68\u8F70\u8F74\u8F85\u8F89\u8FC8\u8FDF\u900A\u9012\u903B\u9093\u90AE\u917F\u9274\u94A6\u94DC\u94ED\u94FA\u94FE\u9501\u950B\u9519\u9521\u952E\u955C\u95EA\u95ED\u95F2\u95F8\u95FD\u9601\u9605\u9647\u96B6\u96FE\u9877\u9881\u9887\u9897\u989C\u98A0\u996D\u996E\u9970\u9A70\u9A71\u9A73\u9A76\u9A7E\u9A91\u9C7C\u9E1F\u9E21\u9E23\u9E2D\u9E3F\u9E4F\u9F84
+NGram.KANJI_5_10=\u5239\u8EAF
+NGram.KANJI_5_11=\u51C4\u8471
+NGram.KANJI_5_12=\u6DC0\u7C98
+NGram.KANJI_5_13=\u5631\u5815\u8695
+NGram.KANJI_5_14=\u4E71\u4FA0\u5265\u52B1\u5374\u53A8\u53D9\u58EE\u5BDD\u5BFF\u5C3D\u5C4A\u5CE1\u5F25\u5F84\u604B\u60A6\u60E7\u60E8\u631F\u636E\u643A\u663C\u664B\u67A2\u6816\u697C\u6B8B\u6BB4\u6D45\u6E7F\u6EDE\u6F5C\u706F\u7089\u72ED\u732A\u732B\u76D7\u793C\u7977\u7A0E\u7A83\u80C6\u811A\u8131\u82A6\u830E\u848B\u865A\u866B\u86EE\u89E6\u8A89\u8DF5\u8E0A\u8E2A\u8F9E\u9065\u968F\u9759\u9EA6
+NGram.KANJI_5_18=\u601C\u75D2
+NGram.KANJI_5_26=\u4E07\u4E0E\u4E89\u4F1A\u4F53\u515A\u5185\u5199\u533A\u533B\u53C2\u53CC\u53F7\u58F0\u5965\u5B66\u5B9D\u5C06\u5C5E\u5F53\u62C5\u6570\u65AD\u65E7\u6761\u6765\u6A2A\u6B27\u6CA1\u6E29\u6E7E\u70B9\u72B6\u72EC\u732E\u753B\u79F0\u88C5\u9EC4
+NGram.KANJI_5_29=\u693F\u82EB
+NGram.KANJI_5_34=\u53F6\u6D9B\u83B1
+NGram.KANJI_5_39=\u5C61\u788D
+NGram.KANJI_6_0=\u4E10\u4E52\u4EC6\u4F88\u4FD0\u51F3\u533E\u53ED\u53EE\u5406\u541D\u5429\u5435\u5440\u5490\u5495\u54B1\u54C4\u54FC\u557C\u55D3\u5669\u56E4\u5777\u5992\u59E8\u5B7D\u5BDE\u5BE5\u5C79\u5C94\u5DCD\u5E18\u5E1A\u5E54\u5FF1\u604D\u6064\u60F6\u6127\u6177\u6233\u6252\u625B\u6273\u6296\u62C2\u62C7\u62F4\u638F\u6396\u63E3\u63EA\u6413\u6479\u64A9\u64C2\u659F\u667E\u6760\u6845\u6963\u6A90\u6B83\u6C13\u6C5E\u6D8E\u6D95\u6DCC\u6ED4\u6F13\u6F3E\u6FA1\u7076\u70D8\u710A\u71CE\u7239\u72E1\u73B7\u7599\u759A\u75A4\u75CA\u7629\u7682\u76C5\u76EF\u778E\u77AA\u787C\u7889\u788C\u78BE\u79E7\u7A96\u7A98\u7B77\u7C7D\u7CB1\u7D0A\u7D6E\u7F94\u7FCE\u8116\u814B\u814C\u819B\u828D\u82DF\u8301\u83E0\u85D5\u8611\u86A3\u8708\u8822\u8C4C\u8DB4\u8DEA\u8E42\u8E66\u8E72\u8EBA\u901B\u9157\u970E\u97ED
+NGram.KANJI_6_3=\u62FC\u88D4\u9B4F
+NGram.KANJI_6_9=\u4ED7\u4F63\u4FCF\u5018\u50BB\u50F5\u5154\u5201\u522E\u5254\u527F\u5306\u5462\u5492\u5496\u54A8\u54AA\u554A\u5561\u5564\u5566\u5885\u5938\u5AC2\u5AE9\u5CED\u5F64\u6084\u608D\u60A8\u60D5\u61C2\u61C8\u6254\u626F\u62AC\u6346\u634D\u640F\u6454\u6487\u6495\u64D2\u6746\u6789\u68B3\u68F5\u695E\u6986\u6995\u69A8\u6A44\u6AAC\u6B79\u6C28\u6C2E\u6CF5\u6DE4\u6E34\u6E3A\u6E89\u6F29\u70AB\u70AC\u7130\u715E\u7184\u71AC\u7238\u7281\u72E0\u74E3\u74F7\u7529\u7578\u761F\u7626\u76D4\u775B\u7779\u7784\u77BB\u780C\u780D\u7838\u7898\u78C5\u78F7\u7AED\u7B28\u7BE1\u7C07\u7CD5\u7CD9\u7CEF\u7F38\u800D\u8084\u809A\u8165\u816E\u832B\u8334\u840D\u8774\u886B\u888D\u88D9\u88F9\u8C41\u8D81\u8D9F\u8E22\u8E29\u8EB2\u8F9C\u9165\u918B\u9631\u964B\u964C\u9661\u9709\u9739\u9776\u9AD3\u9ED4
+NGram.KANJI_6_10=\u4E53\u5582\u5600\u6342\u7B06
+NGram.KANJI_6_11=\u5288\u543C\u5475\u5486\u54EE\u5598\u56BC\u5962\u5A36\u5A9A\u5B75\u5BA6\u5C38\u5C4E\u5F8A\u5F98\u627C\u62CC\u62D7\u63C9\u6930\u6954\u69D0\u6BEF\u6C90\u6CBD\u6CBE\u6F31\u6F88\u70D9\u7329\u75BC\u75F0\u7737\u77D7\u7B19\u7FB9\u803F\u80D6\u813E\u81C0\u8205\u8309\u83BD\u846B\u8517\u868C\u8759\u8815\u8859\u8B6C\u8E81\u8EAC\u90A2\u9698\u9B44
+NGram.KANJI_6_12=\u722C\u7FD4
+NGram.KANJI_6_16=\u5228\u5315\u542E\u54CE\u5509\u5527\u5543\u55B3\u55E1\u5636\u568E\u5FFF\u61E6\u6376\u642A\u6726\u74E4\u76F9\u7736\u7BD9\u8019\u80F0\u80F3\u812F\u818A\u8200\u8214\u8638\u869C\u86C0\u86C6\u86D4\u87C6\u88B1\u8902\u8C7A\u8E4B\u9119
+NGram.KANJI_6_18=\u67D2\u6ED3\u87C0\u87CB\u8DDB\u901E\u9163
+NGram.KANJI_6_20=\u4F5B\u52D2\u54C8\u62FF\u66FC\u6D59\u704C\u7586\u9ECE
+NGram.KANJI_6_21=\u4E48\u4EFF\u4F19\u4FF1\u5021\u5077\u5195\u5212\u5269\u5401\u541E\u5427\u54EA\u5587\u558A\u55BB\u566A\u573E\u574E\u5783\u57AE\u584C\u58E4\u5960\u5976\u59CA\u5A1C\u5DE2\u5F99\u600E\u6015\u6263\u626D\u6293\u62C6\u62D6\u62EF\u62F1\u6316\u632A\u6380\u6389\u63D2\u641E\u64C5\u64CE\u65F1\u6664\u6735\u6770\u67EC\u6846\u684C\u68AD\u6B47\u6B49\u6B67\u6C1B\u6C27\u6C2F\u6C5B\u6C89\u6DF9\u6EAF\u70AE\u70E4\u731C\u7334\u73BB\u7470\u76FC\u788E\u789F\u78B0\u78B3\u7A0D\u7A3B\u7A57\u7CB9\u7F69\u8335\u8354\u84BF\u8DCC\u8DD1\u904F\u90A8\u9189\u9677\u9738\u978B
+NGram.KANJI_6_22=\u5162\u53E8\u542D\u5501\u552C\u5639\u563F\u56B7\u6043\u60B4\u6194\u61CA\u634E\u63CD\u6414\u64AC\u6DAE\u6E43\u6F66\u7095\u7316\u733E\u7728\u7830\u78D5\u7ABF\u7FE9\u8018\u80EF\u8198\u8693\u86AA\u86AF\u874C\u8783\u879F\u8892\u8E6C
+NGram.KANJI_6_23=\u4FD8\u4FEF\u501A\u5085\u5180\u526A\u5323\u54ED\u5634\u56CA\u58A9\u58F9\u5955\u5978\u59DA\u5A49\u5B55\u5BC7\u5BE8\u5D4C\u5E62\u6467\u64BC\u6500\u655E\u6572\u658C\u6670\u68CD\u68D5\u68E0\u6912\u6A0A\u6BB7\u6C9B\u6D3D\u6DC6\u6E23\u6F8E\u7011\u7092\u714C\u73AB\u7405\u7624\u76D2\u7960\u79C9\u7A20\u7BF7\u7F50\u804A\u8086\u81C2\u8292\u82DE\u852C\u857E\u859B\u8760\u8C6B\u8DBE\u8E48\u8F9F\u96A7
+NGram.KANJI_6_25=\u4E8E\u5DF2\u5FB7\u7AD9
+NGram.KANJI_6_28=\u4E58\u4ECD\u4EFD\u4F30\u4F60\u4F69\u503C\u5047\u51B0\u51F0\u5361\u5377\u53E6\u54E5\u552E\u5708\u5740\u5761\u57C3\u5821\u589E\u5979\u59C6\u5B69\u5B83\u5E15\u5E76\u5F17\u5F88\u6208\u622A\u624E\u627E\u62D4\u62DC\u63ED\u641C\u6536\u6548\u65C1\u665A\u6668\u67E5\u6B65\u6BCF\u6C61\u6CDB\u6D4E\u6D89\u6DB5\u6E38\u6EAA\u6FB3\u70B8\u745F\u7538\u7A97\u7F3A\u7F55\u805A\u8258\u827E\u82AC\u8303\u83F2\u8482\u85CF\u8DDF\u903E\u9080\u970D\u9760\u9ED1\u9ED8
+NGram.KANJI_6_29=\u634F\u6518\u7B50\u809B
+NGram.KANJI_6_30=\u54A7\u57C2\u5AB3\u60CB\u6886\u8378\u85D0\u8671
+NGram.KANJI_6_32=\u5080\u5121\u51A4\u54AC\u55DC\u592D\u5DEB\u6292\u68D8\u69B4\u6A59\u6E24\u7FC5\u80DA\u8180\u86DB\u8700\u8DCB\u9761
+NGram.KANJI_6_34=\u4E30\u51E0\u542C\u613F
+NGram.KANJI_6_35=\u4E56\u547B\u55FD\u5C41\u606C\u6115\u6CAE\u7119\u795F\u7CDC\u86C9\u86F9\u8713\u873B\u8757\u8925\u892A\u96F9
+NGram.KANJI_6_37=\u51B2\u5308\u5398\u54B8\u59DC\u5C4F\u5D14\u5F6D\u60E0\u6241\u6350\u699C\u6BEB\u6C6A\u6CC4\u6DEE\u6F58\u6F6D\u7199\u77EE\u7ADF\u8058\u820D\u8212\u8389\u8587\u884D\u8881\u8FA8\u8FF9\u96D5
+NGram.KANJI_6_39=\u574F\u6251\u6302
+NGram.KANJI_7_0=\u52FA\u5544\u60F0\u6994\u86A4\u86E4
+NGram.KANJI_7_3=\u4E59\u4E7E\u4EAD\u4EF0\u4EF2\u4F0F\u4F10\u4FAF\u4FCA\u500D\u501F\u5076\u508D\u50E7\u5112\u5146\u5192\u51AC\u51DD\u51FD\u5200\u5237\u524A\u52A3\u52C3\u52C7\u52DF\u5351\u5352\u5353\u5378\u537F\u53E5\u5439\u54FA\u574A\u5782\u57CB\u5893\u58C1\u5915\u5937\u5949\u5951\u5974\u59B9\u5A18\u5A5A\u5ACC\u5B54\u5B5D\u5B64\u5B8F\u5BBF\u5BD2\u5C3A\u5C6F\u5CB3\u5D07\u5DE7\u5E84\u5E8A\u5F26\u5F69\u5F70\u5F90\u5FAA\u5FCD\u6012\u6016\u602A\u60A0\u60B2\u60BC\u6148\u6162\u6170\u6291\u6298\u62AB\u62BC\u62BD\u62D2\u62D3\u62D8\u62F3\u6311\u638C\u6398\u63E1\u642C\u6458\u64A4\u654F\u656C\u659C\u65E2\u65E8\u65EC\u6606\u6614\u6676\u6691\u6696\u66F9\u6749\u676F\u679A\u679D\u67CF\u67D4\u67F1\u67F3\u67F4\u6817\u6842\u6843\u6851\u68A8\u68CB\u68D2\u6B20\u6B32\u6BBF\u6C57\u6C88\u6CCA\u6D17\u6D1E\u6D69\u6D6E\u6D78\u6DE1\u6DFB\u6E58\u6EB6\u6F0F\u6F20\u7070\u708E\u70AD\u7126\u718A\u71C3\u7267\u72C2\u731B\u7384\u73A9\u73CD\u7434\u75AB\u75DB\u76C6\u76FE\u773C\u7891\u78C1\u795D\u7965\u79D2\u79DF\u79E6\u7A00\u7B11\u7B51\u7B54\u7C89\u7C92\u7CD6\u7D2B\u7F8A\u7FBD\u7FFC\u8010\u80A5\u80CE\u8150\u8179\u819C\u8247\u829D\u82B3\u82D7\u82E6\u8302\u8336\u8352\u83CA\u83CC\u83DC\u845B\u846C\u84B2\u84B8\u84C4\u8584\u864E\u86C7\u8861\u8863\u8870\u888B\u8896\u88D5\u8986\u8C46\u8DA3\u8E0F\u8F9B\u8FC5\u8FEB\u8FF7\u9003\u9006\u902E\u9042\u9063\u90ED\u963B\u9676\u96EA\u9756\u9B3C\u9B42\u9F3B
+NGram.KANJI_7_6=\u4E01\u4E03\u4E45\u4E5D\u4E88\u4E92\u4EA1\u4ECB\u4EE4\u4F01\u4F0A\u4F2F\u4F3C\u4F4E\u4F4F\u4F55\u4F8B\u4F9D\u4FBF\u4FEE\u505C\u50CF\u516B\u516D\u5175\u5177\u5178\u5207\u520A\u5224\u526F\u529F\u52A9\u5343\u5348\u535A\u5370\u53BB\u53CB\u53F3\u5409\u542B\u544A\u547C\u5584\u5747\u5802\u590F\u592B\u5931\u5947\u597D\u5A01\u5A92\u5B63\u5B8C\u5B97\u5BA2\u5BA3\u5BA4\u5BB3\u5BB9\u5BC6\u5BCC\u5BDF\u5C04\u5C1A\u5C45\u5C4B\u5CB8\u5DE6\u5E0C\u5E1D\u5E2D\u5E55\u5E8F\u5E95\u5E97\u5EA7\u5EB7\u5EF6\u5F8B\u5FAE\u5FC5\u5FD7\u5FF5\u601D\u6025\u606F\u60F3\u611F\u623F\u6253\u6279\u627F\u6295\u6297\u62EC\u6388\u6392\u63F4\u6545\u6551\u6574\u6599\u65C5\u65E9\u6613\u6620\u6625\u666E\u666F\u66B4\u66F4\u670D\u671B\u6728\u672B\u6751\u677E\u67B6\u6838\u6839\u6848\u68EE\u690D\u6982\u6A21\u6B4C\u6B62\u6B66\u6BB5\u6BCD\u6C0F\u6C38\u6C42\u6CBF\u6CE2\u6CE8\u6D0B\u6D3E\u6D88\u6DF1\u6E05\u6E56\u706B\u7167\u7206\u7236\u7247\u7387\u7530\u7537\u7559\u7565\u7591\u75C5\u767B\u767D\u767E\u7687\u76DB\u76DF\u771F\u7763\u77ED\u7834\u79FB\u7A81\u7AE0\u7AEF\u7B56\u7B97\u7C4D\u7CBE\u7D20\u7D22\u7F72\u7FA4\u8001\u8003\u81F4\u822A\u826F\u82B1\u8349\u843D\u878D\u8857\u89D2\u8B66\u8C37\u8D70\u8D85\u8D8A\u8DB3\u8FF0\u8FFD\u9001\u901F\u90A3\u90A6\u914D\u91CE\u9632\u963F\u9644\u964D\u9664\u96C4\u96E8\u9752\u9769\u98DF
+NGram.KANJI_7_7=\u4E09\u4E0A\u4E0B\u4E0D\u4E16\u4E3B\u4E8B\u4E8C\u4EE3\u4EE5\u4F4D\u4F5C\u4F7F\u5165\u5168\u516C\u5171\u51FA\u5206\u5229\u5236\u524D\u529B\u52A0\u5316\u5317\u5357\u539F\u53CA\u53F0\u5408\u540C\u540D\u548C\u5730\u57FA\u5916\u591A\u5929\u5B50\u5B9A\u5BB6\u5C0F\u5C71\u5DDE\u5DE5\u5E02\u5E73\u5EA6\u5EFA\u5F0F\u6027\u6210\u6240\u6307\u653F\u6587\u65B0\u65B9\u660E\u6700\u6709\u671F\u672C\u6B21\u6B63\u6C11\u6CBB\u6CD5\u6D77\u7269\u7279\u7406\u751F\u7528\u7531\u754C\u76EE\u76F8\u793E\u79D1\u7ACB\u7B2C\u7B49\u7CFB\u8005\u80FD\u81EA\u82F1\u884C\u8868\u897F\u8981\u901A\u9053\u90E8\u90FD\u91CD\u9AD8
+NGram.KANJI_7_9=\u4E4D\u4F36\u5319\u6A61\u6DCB\u7194
+NGram.KANJI_7_11=\u4E5E\u4F43\u5026\u50FB\u515C\u5243\u5420\u5446\u54B3\u54BD\u553E\u55A7\u5703\u5984\u5AC9\u5B09\u5C51\u5DFE\u5ED3\u5F1B\u6055\u618E\u62D9\u65A7\u6652\u6977\u6EBA\u707C\u75D8\u79E4\u7AFF\u7B4F\u7CA5\u808B\u8098\u80B4\u8235\u82DB\u849C\u8549\u868A\u86FE\u8718\u914C
+NGram.KANJI_7_12=\u4E08\u4E38\u4F8D\u50DA\u5203\u5256\u52C9\u52D8\u52FE\u5320\u533F\u5375\u53D4\u540F\u54E8\u56DA\u5806\u5996\u5999\u59A5\u59A8\u59FF\u5AE1\u5BB0\u5BF8\u5C09\u5C3F\u5C48\u5C65\u5D29\u5E06\u5E4C\u5EB5\u5EB6\u5EB8\u5F13\u5FCC\u5FD8\u6052\u606D\u609F\u60D1\u614E\u6247\u62B1\u6349\u64E6\u6577\u65ED\u6674\u6734\u67C4\u6850\u690E\u6A58\u6B3A\u6B89\u6C41\u6CBC\u6CCC\u6CF3\u6D74\u6DAF\u6DF3\u6ECB\u6F02\u6F84\u71E5\u7261\u7272\u72AC\u72FC\u733F\u7409\u755C\u76F2\u7720\u77AC\u77E2\u7802\u786B\u78E8\u7901\u7948\u79E9\u7A1A\u7A74\u7AE3\u7B4B\u7B52\u7BB1\u7C3F\u8015\u8096\u809D\u80A2\u80A9\u80AA\u80BA\u80F8\u8102\u810A\u8154\u8155\u8170\u817A\u81A8\u81ED\u820C\u8236\u82BD\u8305\u83E9\u83F1\u840C\u85FB\u8650\u8702\u8A93\u8E44\u8FB0\u9038\u9091\u90AA\u916C\u9175\u9177\u9685\u96C0\u96C7\u96CC\u97AD
+NGram.KANJI_7_13=\u63D6\u803D
+NGram.KANJI_7_16=\u602F\u7566
+NGram.KANJI_7_18=\u634C\u7C38
+NGram.KANJI_7_19=\u4E18\u4E73\u4E95\u4EAB\u4EC1\u4ED8\u4ED9\u4F11\u4F34\u4F38\u4F59\u4FB5\u4FC3\u4FD7\u5012\u5019\u5065\u50AC\u5144\u5145\u514D\u517C\u51A0\u51B7\u5211\u5238\u523A\u523B\u5272\u52E4\u5360\u5371\u539A\u541B\u5426\u5438\u5473\u54F2\u5510\u552F\u5531\u559C\u5609\u56F0\u56FA\u591C\u5948\u594F\u59BB\u59D3\u5B85\u5B87\u5B88\u5B99\u5B9C\u5BC4\u5BFA\u5C0A\u5C3E\u5CA9\u5D0E\u5DE1\u5DE8\u5DEE\u5DF1\u5E45\u5E78\u5E7B\u5E7C\u5EAD\u5EF7\u5F1F\u5F31\u5F79\u5F7C\u5F85\u5F92\u5FA1\u5FE0\u6050\u60A3\u6212\u62DB\u632F\u6355\u63A2\u63AA\u63CF\u642D\u6469\u64CD\u653B\u6563\u660C\u662D\u667A\u6697\u66FF\u6750\u675F\u677F\u6790\u67D3\u682A\u6885\u68B0\u6B8A\u6B96\u6BDB\u6C60\u6CB9\u6CC9\u6D25\u6D66\u6DB2\u6DF7\u6E21\u6ED1\u6F2B\u6F6E\u6FC0\u7235\u725B\u72AF\u7389\u7532\u7533\u756A\u75BE\u75C7\u76AE\u76CA\u7740\u786C\u7956\u7968\u796D\u7981\u79C0\u79C1\u79CB\u79D8\u7A3F\u7AE5\u7AF9\u7E41\u7F6A\u7FFB\u8089\u80CC\u80DE\u81E3\u821E\u8239\u82E5\u8328\u8377\u85E4\u8840\u88C1\u88C2\u8C6A\u8D64\u8DDD\u8FCE\u8FD4\u9000\u9014\u907F\u90CA\u90CE\u90E1\u9152\u9178\u9686\u9694\u969C\u9707\u9732\u9AA8\u9B54\u9E7F\u9EBB
+NGram.KANJI_7_20=\u4E39\u4E43\u4EAE\u4F73\u504F\u505A\u51C6\u51CC\u52AA\u5339\u5347\u53EB\u53EC\u5448\u5766\u57F9\u5854\u585E\u58A8\u5B8B\u5C01\u5CF0\u5E72\u5EC9\u5F80\u5F81\u5FBD\u5FEB\u6069\u6211\u624D\u628A\u62B5\u62CD\u6309\u63A7\u64AD\u6566\u6597\u65CB\u65D7\u6628\u6717\u6731\u674E\u675C\u683D\u6881\u6B3E\u6BD2\u6C7D\u6C99\u6CE5\u6CF0\u6D1B\u6D2A\u70C8\u719F\u724C\u7259\u73E0\u73ED\u745E\u74E6\u7518\u751A\u7686\u770B\u7B26\u8033\u80A1\u80E1\u821F\u83AB\u8499\u8D74\u8DE8\u900F\u9010\u9047\u904D\u906D\u9675\u96C5\u96F6\u96F7\u9700\u9F13
+NGram.KANJI_7_21=\u5764\u59D0\u5A03\u6062\u6108\u68C9\u7164\u79BE\u7BAD\u903C
+NGram.KANJI_7_23=\u4EA5\u50B2\u532A\u5366\u543B\u54E9\u5632\u59D1\u5BB5\u5DF7\u5F6A\u5F6C\u5FFD\u6070\u6168\u61BE\u63A0\u63A9\u6478\u65A4\u68A7\u6A1F\u6CAB\u70F9\u711A\u723D\u7262\u72F8\u751C\u754F\u75B9\u76C8\u7709\u7897\u7CCA\u7F9E\u8299\u82AD\u82B9\u82D4\u8304\u84C9\u84EC\u854A\u85AF\u86D9\u8FA3\u9187\u97A0
+NGram.KANJI_7_25=\u4E14\u4E5F\u4F46\u514B\u5176\u5230\u5373\u53EA\u540E\u5982\u5C3C\u5DF4\u6216\u62C9\u65AF\u66FE\u6B64\u6D32\u6D6A\u7BC7\u800C
+NGram.KANJI_7_28=\u4E4E\u4E9B\u4EA6\u4EC0\u4FC4\u5403\u5957\u5C24\u6089\u6258\u67D0\u758F\u7FF0\u8D6B
+NGram.KANJI_7_29=\u4FAE\u5944\u5A29\u6101\u62ED\u6328\u637B\u6666\u6687\u66AE\u673D\u6756\u67FF\u6813\u68A2\u699B\u7078\u708A\u7396\u7422\u7525\u75E2\u76BF\u7766\u77B3\u7A3C\u7A92\u819D\u81FC\u8237\u8338\u8511\u88F3\u8FC2
+NGram.KANJI_7_32=\u4E11\u4F3A\u4F51\u5197\u51B6\u51F9\u52FF\u541F\u5507\u5589\u5993\u5A7F\u5AC1\u5B9B\u5BC2\u5BE1\u5F04\u5F0A\u5F27\u6020\u6028\u6068\u6094\u6109\u611A\u614C\u621A\u62B9\u62D0\u62F7\u62FE\u632B\u633D\u6367\u660F\u6627\u6643\u66D9\u674F\u6795\u67AF\u67D1\u6876\u68DA\u68FA\u6905\u69FD\u6A80\u6B6A\u6CB8\u6CE3\u6DD1\u6DEB\u6E9C\u6EA2\u6EF4\u6F06\u714E\u716E\u722A\u7280\u74A7\u752B\u75B2\u75D5\u75F4\u77AD\u77E9\u785D\u79BD\u7A3D\u7A9F\u7B1B\u7B95\u7C9F\u7CDF\u80C3\u8106\u817F\u818F\u81B3\u828B\u82A5\u82AF\u840E\u851A\u853D\u8776\u87F9\u8877\u8910\u8912\u8C79\u8D66\u8FB1\u9017\u90C1\u916A\u9699\u96C1\u971C\u9774\u978D
+NGram.KANJI_7_33=\u4E4B\u4E86\u4E94\u4EA4\u4EAC\u4ECA\u4ED6\u4EF6\u4EFB\u4F9B\u4FDD\u4FE1\u5143\u5148\u5149\u518D\u5217\u521D\u5305\u5341\u534A\u53C8\u53CD\u53D6\u53D7\u53E3\u53E4\u53EF\u53F2\u53F8\u5404\u5411\u5468\u547D\u54C1\u5546\u5668\u56DB\u56DE\u56E0\u571F\u578B\u57CE\u57DF\u5883\u58EB\u592A\u592E\u5973\u59CB\u59D4\u5B57\u5B58\u5B89\u5B98\u5C11\u5C31\u5C40\u5C55\u5DDD\u5E03\u5E38\u5E9C\u5F15\u5F62\u5F71\u5F97\u5FC3\u60C5\u610F\u624B\u6280\u6301\u63A5\u63A8\u63D0\u652F\u6539\u653E\u6559\u65BD\u65CF\u661F\u66F2\u671D\u672A\u6797\u679C\u6821\u683C\u6B7B\u6BD4\u6C34\u6C5F\u6CB3\u6D3B\u6D41\u6E2F\u6E90\u6F14\u7136\u7248\u738B\u7403\u76F4\u7701\u77E5\u77F3\u7814\u793A\u795E\u798F\u7A0B\u7A76\u7A7A\u7BA1\u7C73\u7F6E\u7F8E\u80B2\u81F3\u822C\u8272\u8457\u88AB\u89E3\u8A00\u8C61\u8D77\u8DEF\u8EAB\u8FD1\u9020\u91CC\u91CF\u91D1\u9650\u9662\u96C6\u975E\u9762\u97F3\u9996\u9999
+NGram.KANJI_7_35=\u55C5\u57A2\u58D5\u59E5\u637A\u74E2\u7CE0\u895F
+NGram.KANJI_7_37=\u4E19\u4E32\u4E4F\u4E91\u4EC7\u4ED4\u4F0D\u5141\u51E1\u51F6\u51F8\u52AB\u535C\u53C9\u53DB\u540A\u5410\u54C0\u559D\u5750\u5751\u576A\u57E0\u5824\u582A\u5830\u5835\u5851\u5858\u586B\u5954\u59FB\u5A46\u5B5F\u5BB4\u5BD3\u5C16\u5C60\u5CFB\u5D16\u5E16\u5E3D\u5E7D\u5E87\u5ECA\u5FD9\u60DC\u60F9\u6155\u6167\u6234\u626E\u6276\u6284\u633A\u6377\u6492\u649E\u64B0\u6562\u6591\u65A5\u65E6\u65FA\u6602\u670B\u676D\u68AF\u695A\u6B23\u6BC5\u6C70\u6C83\u6CE1\u6D8C\u6DD8\u6E20\u71D5\u72D0\u72D7\u73B2\u73CA\u7433\u7483\u74DC\u74F6\u7554\u764C\u7761\u77DB\u78A7\u7A46\u7A7F\u7A84\u7C97\u7D2F\u7FC1\u7FE0\u8000\u8017\u808C\u80AF\u8404\u8461\u8463\u8475\u8513\u85AA\u8679\u86CB\u871C\u87BA\u88F8\u8C8C\u8DF3\u8FC4\u901D\u9022\u906E\u9075\u9192\u91C7\u966A\u971E\u9910\u9B41\u9F0E\u9F20
+TO_NORMALIZE_VI_CHARS=AEIOUYaeiouy\u00c2\u00ca\u00d4\u00e2\u00ea\u00f4\u0102\u0103\u01a0\u01a1\u01af\u01b0
+DMARK_CLASS=\u0300\u0301\u0303\u0309\u0323
+NORMALIZED_VI_CHARS_0300=\u00C0\u00C8\u00CC\u00D2\u00D9\u1EF2\u00E0\u00E8\u00EC\u00F2\u00F9\u1EF3\u1EA6\u1EC0\u1ED2\u1EA7\u1EC1\u1ED3\u1EB0\u1EB1\u1EDC\u1EDD\u1EEA\u1EEB
+NORMALIZED_VI_CHARS_0301=\u00C1\u00C9\u00CD\u00D3\u00DA\u00DD\u00E1\u00E9\u00ED\u00F3\u00FA\u00FD\u1EA4\u1EBE\u1ED0\u1EA5\u1EBF\u1ED1\u1EAE\u1EAF\u1EDA\u1EDB\u1EE8\u1EE9
+NORMALIZED_VI_CHARS_0303=\u00C3\u1EBC\u0128\u00D5\u0168\u1EF8\u00E3\u1EBD\u0129\u00F5\u0169\u1EF9\u1EAA\u1EC4\u1ED6\u1EAB\u1EC5\u1ED7\u1EB4\u1EB5\u1EE0\u1EE1\u1EEE\u1EEF
+NORMALIZED_VI_CHARS_0309=\u1EA2\u1EBA\u1EC8\u1ECE\u1EE6\u1EF6\u1EA3\u1EBB\u1EC9\u1ECF\u1EE7\u1EF7\u1EA8\u1EC2\u1ED4\u1EA9\u1EC3\u1ED5\u1EB2\u1EB3\u1EDE\u1EDF\u1EEC\u1EED
+NORMALIZED_VI_CHARS_0323=\u1EA0\u1EB8\u1ECA\u1ECC\u1EE4\u1EF4\u1EA1\u1EB9\u1ECB\u1ECD\u1EE5\u1EF5\u1EAC\u1EC6\u1ED8\u1EAD\u1EC7\u1ED9\u1EB6\u1EB7\u1EE2\u1EE3\u1EF0\u1EF1
diff --git a/nlp_resource_data/langdetect/utils/messages.py b/nlp_resource_data/langdetect/utils/messages.py
new file mode 100755 (executable)
index 0000000..6af3eb5
--- /dev/null
@@ -0,0 +1,23 @@
+from os import path
+
+
+class Messages(object):  # String table loaded from the messages.properties file that sits beside this module.
+    MESSAGES_FILENAME = path.join(path.dirname(__file__), 'messages.properties')  # path is built from this module's own directory
+
+    def __init__(self):  # Reads every line of MESSAGES_FILENAME into self.messages.
+        self.messages = {}  # key -> value with backslash escapes already decoded
+        with open(self.MESSAGES_FILENAME, 'r') as f:
+            for line in f:
+                key, _, value = line.strip().partition('=')  # split at the first '='; a line without '=' (including a blank one) gives value ''
+                self.messages[key] = value.encode().decode('unicode_escape')  # converts literal \uXXXX sequences in the file into the characters they name
+
+    def get_string(self, key):  # Returns the stored value for key, or the placeholder '!<key>!' when key is absent.
+        return self.messages.get(key, '!%s!' % key)  # NOTE(review): an unknown or misspelled key is not an error; the caller silently gets the placeholder
+
+
+_messages = None  # module-wide Messages instance; stays None until get_string() is first called
+def get_string(key):  # Module-level lookup that delegates to the shared Messages instance.
+    global _messages
+    if _messages is None:
+        _messages = Messages()  # first call only: constructing Messages reads messages.properties
+    return _messages.get_string(key)  # Messages.get_string returns '!<key>!' for a key that is not in the table
diff --git a/nlp_resource_data/langdetect/utils/messages.pyc b/nlp_resource_data/langdetect/utils/messages.pyc
new file mode 100755 (executable)
index 0000000..1601f17
Binary files /dev/null and b/nlp_resource_data/langdetect/utils/messages.pyc differ
diff --git a/nlp_resource_data/langdetect/utils/ngram.py b/nlp_resource_data/langdetect/utils/ngram.py
new file mode 100755 (executable)
index 0000000..ee82e38
--- /dev/null
@@ -0,0 +1,260 @@
+import re
+
+import six
+
+from . import messages
+from .unicode_block import (
+    unicode_block,
+    UNICODE_BASIC_LATIN,
+    UNICODE_LATIN_1_SUPPLEMENT,
+    UNICODE_LATIN_EXTENDED_B,
+    UNICODE_GENERAL_PUNCTUATION,
+    UNICODE_ARABIC,
+    UNICODE_LATIN_EXTENDED_ADDITIONAL,
+    UNICODE_HIRAGANA,
+    UNICODE_KATAKANA,
+    UNICODE_BOPOMOFO,
+    UNICODE_BOPOMOFO_EXTENDED,
+    UNICODE_CJK_UNIFIED_IDEOGRAPHS,
+    UNICODE_HANGUL_SYLLABLES,
+)
+
+
+class NGram(object):  # Rolling buffer of normalized characters from which n-grams of length 1..N_GRAM are read.
+    LATIN1_EXCLUDED = messages.get_string('NGram.LATIN1_EXCLUDE')  # characters that normalize() replaces with ' ' when they are in the Latin-1 Supplement block
+    N_GRAM = 3  # largest n accepted by get(); add_char() also keeps the buffer at most this long
+
+    def __init__(self):
+        self.grams = ' '  # buffer starts as a single space; a trailing ' ' makes add_char() restart the buffer
+        self.capitalword = False  # set once two uppercase characters arrive in a row; cleared by a non-uppercase character or a buffer restart
+
+    def add_char(self, ch):
+        '''Append a character into ngram buffer.'''
+        ch = self.normalize(ch)
+        last_char = self.grams[-1]
+        if last_char == ' ':  # previous character was a space: restart the buffer
+            self.grams = ' '
+            self.capitalword = False
+            if ch == ' ':  # a space following a space is not appended
+                return
+        elif len(self.grams) >= self.N_GRAM:
+            self.grams = self.grams[1:]  # drop the oldest character so the append below keeps the length at N_GRAM
+        self.grams += ch
+
+        if ch.isupper():
+            if last_char.isupper():  # current and previous characters are both uppercase
+                self.capitalword = True
+        else:
+            self.capitalword = False
+
+    def get(self, n):
+        '''Get n-gram.'''
+        if self.capitalword:  # returns None while capitalword is set
+            return
+        if n < 1 or n > self.N_GRAM or len(self.grams) < n:  # n out of range or buffer shorter than n: returns None
+            return
+        if n == 1:
+            ch = self.grams[-1]
+            if ch == ' ':  # a space alone is never returned as a 1-gram
+                return
+            return ch
+        else:
+            return self.grams[-n:]  # last n buffered characters; the result may contain a space
+
+    @classmethod
+    def normalize(cls, ch):  # Normalizes one character according to its Unicode block; blocks not listed below pass through unchanged.
+        block = unicode_block(ch)  # result is compared against the UNICODE_* ids imported from .unicode_block
+        if block == UNICODE_BASIC_LATIN:
+            if ch < 'A' or ('Z' < ch < 'a') or 'z' < ch:  # anything outside A-Z and a-z becomes a space
+                ch = ' '
+        elif block == UNICODE_LATIN_1_SUPPLEMENT:
+            if cls.LATIN1_EXCLUDED.find(ch) >= 0:  # character is listed in LATIN1_EXCLUDED
+                ch = ' '
+        elif block == UNICODE_LATIN_EXTENDED_B:
+            # normalization for Romanian
+            if ch == six.u('\u0219'):  # Small S with comma below => with cedilla
+                ch = six.u('\u015f')
+            if ch == six.u('\u021b'):  # Small T with comma below => with cedilla
+                ch = six.u('\u0163')
+        elif block == UNICODE_GENERAL_PUNCTUATION:
+            ch = ' '
+        elif block == UNICODE_ARABIC:
+            if ch == six.u('\u06cc'):
+                ch = six.u('\u064a')  # Farsi yeh => Arabic yeh
+        elif block == UNICODE_LATIN_EXTENDED_ADDITIONAL:
+            if ch >= six.u('\u1ea0'):  # U+1EA0 and above in this block all collapse to U+1EC3
+                ch = six.u('\u1ec3')
+        elif block == UNICODE_HIRAGANA:
+            ch = six.u('\u3042')  # every character of the block becomes U+3042
+        elif block == UNICODE_KATAKANA:
+            ch = six.u('\u30a2')  # every character of the block becomes U+30A2
+        elif block in (UNICODE_BOPOMOFO, UNICODE_BOPOMOFO_EXTENDED):
+            ch = six.u('\u3105')  # every character of either block becomes U+3105
+        elif block == UNICODE_CJK_UNIFIED_IDEOGRAPHS:
+            ch = cls.CJK_MAP.get(ch, ch)  # mapped ideographs become the first character of their group; unmapped ones are kept
+        elif block == UNICODE_HANGUL_SYLLABLES:
+            ch = six.u('\uac00')  # every character of the block becomes U+AC00
+        return ch
+
+    @classmethod
+    def normalize_vi(cls, text):
+        '''Normalizer for Vietnamese.
+        Normalize Alphabet + Diacritical Mark(U+03xx) into U+1Exx.
+        '''
+        def repl(m):
+            alphabet = cls.TO_NORMALIZE_VI_CHARS.find(m.group(1))  # position of the matched base letter
+            dmark = cls.DMARK_CLASS.find(m.group(2))  # Diacritical Mark
+            return cls.NORMALIZED_VI_CHARS[dmark][alphabet]  # row selected by the mark, column by the letter
+        return cls.ALPHABET_WITH_DMARK.sub(repl, text)
+
+    NORMALIZED_VI_CHARS = [  # one string per mark, listed in the same order as the characters of DMARK_CLASS
+        messages.get_string('NORMALIZED_VI_CHARS_0300'),
+        messages.get_string('NORMALIZED_VI_CHARS_0301'),
+        messages.get_string('NORMALIZED_VI_CHARS_0303'),
+        messages.get_string('NORMALIZED_VI_CHARS_0309'),
+        messages.get_string('NORMALIZED_VI_CHARS_0323')]
+    TO_NORMALIZE_VI_CHARS = messages.get_string('TO_NORMALIZE_VI_CHARS')  # base letters that may be followed by a mark
+    DMARK_CLASS = messages.get_string('DMARK_CLASS')  # the marks recognised after a base letter
+    ALPHABET_WITH_DMARK = re.compile(  # group 1 = one base letter, group 2 = the mark that follows it
+        '([' + TO_NORMALIZE_VI_CHARS + '])([' + DMARK_CLASS + '])',
+        re.UNICODE)
+
+    # CJK Kanji Normalization Mapping
+    CJK_CLASS = [  # each entry is one group of ideographs loaded from the message table
+        messages.get_string('NGram.KANJI_1_0'),
+        messages.get_string('NGram.KANJI_1_2'),
+        messages.get_string('NGram.KANJI_1_4'),
+        messages.get_string('NGram.KANJI_1_8'),
+        messages.get_string('NGram.KANJI_1_11'),
+        messages.get_string('NGram.KANJI_1_12'),
+        messages.get_string('NGram.KANJI_1_13'),
+        messages.get_string('NGram.KANJI_1_14'),
+        messages.get_string('NGram.KANJI_1_16'),
+        messages.get_string('NGram.KANJI_1_18'),
+        messages.get_string('NGram.KANJI_1_22'),
+        messages.get_string('NGram.KANJI_1_27'),
+        messages.get_string('NGram.KANJI_1_29'),
+        messages.get_string('NGram.KANJI_1_31'),
+        messages.get_string('NGram.KANJI_1_35'),
+        messages.get_string('NGram.KANJI_2_0'),
+        messages.get_string('NGram.KANJI_2_1'),
+        messages.get_string('NGram.KANJI_2_4'),
+        messages.get_string('NGram.KANJI_2_9'),
+        messages.get_string('NGram.KANJI_2_10'),
+        messages.get_string('NGram.KANJI_2_11'),
+        messages.get_string('NGram.KANJI_2_12'),
+        messages.get_string('NGram.KANJI_2_13'),
+        messages.get_string('NGram.KANJI_2_15'),
+        messages.get_string('NGram.KANJI_2_16'),
+        messages.get_string('NGram.KANJI_2_18'),
+        messages.get_string('NGram.KANJI_2_21'),
+        messages.get_string('NGram.KANJI_2_22'),
+        messages.get_string('NGram.KANJI_2_23'),
+        messages.get_string('NGram.KANJI_2_28'),
+        messages.get_string('NGram.KANJI_2_29'),
+        messages.get_string('NGram.KANJI_2_30'),
+        messages.get_string('NGram.KANJI_2_31'),
+        messages.get_string('NGram.KANJI_2_32'),
+        messages.get_string('NGram.KANJI_2_35'),
+        messages.get_string('NGram.KANJI_2_36'),
+        messages.get_string('NGram.KANJI_2_37'),
+        messages.get_string('NGram.KANJI_2_38'),
+        messages.get_string('NGram.KANJI_3_1'),
+        messages.get_string('NGram.KANJI_3_2'),
+        messages.get_string('NGram.KANJI_3_3'),
+        messages.get_string('NGram.KANJI_3_4'),
+        messages.get_string('NGram.KANJI_3_5'),
+        messages.get_string('NGram.KANJI_3_8'),
+        messages.get_string('NGram.KANJI_3_9'),
+        messages.get_string('NGram.KANJI_3_11'),
+        messages.get_string('NGram.KANJI_3_12'),
+        messages.get_string('NGram.KANJI_3_13'),
+        messages.get_string('NGram.KANJI_3_15'),
+        messages.get_string('NGram.KANJI_3_16'),
+        messages.get_string('NGram.KANJI_3_18'),
+        messages.get_string('NGram.KANJI_3_19'),
+        messages.get_string('NGram.KANJI_3_22'),
+        messages.get_string('NGram.KANJI_3_23'),
+        messages.get_string('NGram.KANJI_3_27'),
+        messages.get_string('NGram.KANJI_3_29'),
+        messages.get_string('NGram.KANJI_3_30'),
+        messages.get_string('NGram.KANJI_3_31'),
+        messages.get_string('NGram.KANJI_3_32'),
+        messages.get_string('NGram.KANJI_3_35'),
+        messages.get_string('NGram.KANJI_3_36'),
+        messages.get_string('NGram.KANJI_3_37'),
+        messages.get_string('NGram.KANJI_3_38'),
+        messages.get_string('NGram.KANJI_4_0'),
+        messages.get_string('NGram.KANJI_4_9'),
+        messages.get_string('NGram.KANJI_4_10'),
+        messages.get_string('NGram.KANJI_4_16'),
+        messages.get_string('NGram.KANJI_4_17'),
+        messages.get_string('NGram.KANJI_4_18'),
+        messages.get_string('NGram.KANJI_4_22'),
+        messages.get_string('NGram.KANJI_4_24'),
+        messages.get_string('NGram.KANJI_4_28'),
+        messages.get_string('NGram.KANJI_4_34'),
+        messages.get_string('NGram.KANJI_4_39'),
+        messages.get_string('NGram.KANJI_5_10'),
+        messages.get_string('NGram.KANJI_5_11'),
+        messages.get_string('NGram.KANJI_5_12'),
+        messages.get_string('NGram.KANJI_5_13'),
+        messages.get_string('NGram.KANJI_5_14'),
+        messages.get_string('NGram.KANJI_5_18'),
+        messages.get_string('NGram.KANJI_5_26'),
+        messages.get_string('NGram.KANJI_5_29'),
+        messages.get_string('NGram.KANJI_5_34'),
+        messages.get_string('NGram.KANJI_5_39'),
+        messages.get_string('NGram.KANJI_6_0'),
+        messages.get_string('NGram.KANJI_6_3'),
+        messages.get_string('NGram.KANJI_6_9'),
+        messages.get_string('NGram.KANJI_6_10'),
+        messages.get_string('NGram.KANJI_6_11'),
+        messages.get_string('NGram.KANJI_6_12'),
+        messages.get_string('NGram.KANJI_6_16'),
+        messages.get_string('NGram.KANJI_6_18'),
+        messages.get_string('NGram.KANJI_6_20'),
+        messages.get_string('NGram.KANJI_6_21'),
+        messages.get_string('NGram.KANJI_6_22'),
+        messages.get_string('NGram.KANJI_6_23'),
+        messages.get_string('NGram.KANJI_6_25'),
+        messages.get_string('NGram.KANJI_6_28'),
+        messages.get_string('NGram.KANJI_6_29'),
+        messages.get_string('NGram.KANJI_6_30'),
+        messages.get_string('NGram.KANJI_6_32'),
+        messages.get_string('NGram.KANJI_6_34'),
+        messages.get_string('NGram.KANJI_6_35'),
+        messages.get_string('NGram.KANJI_6_37'),
+        messages.get_string('NGram.KANJI_6_39'),
+        messages.get_string('NGram.KANJI_7_0'),
+        messages.get_string('NGram.KANJI_7_3'),
+        messages.get_string('NGram.KANJI_7_6'),
+        messages.get_string('NGram.KANJI_7_7'),
+        messages.get_string('NGram.KANJI_7_9'),
+        messages.get_string('NGram.KANJI_7_11'),
+        messages.get_string('NGram.KANJI_7_12'),
+        messages.get_string('NGram.KANJI_7_13'),
+        messages.get_string('NGram.KANJI_7_16'),
+        messages.get_string('NGram.KANJI_7_18'),
+        messages.get_string('NGram.KANJI_7_19'),
+        messages.get_string('NGram.KANJI_7_20'),
+        messages.get_string('NGram.KANJI_7_21'),
+        messages.get_string('NGram.KANJI_7_23'),
+        messages.get_string('NGram.KANJI_7_25'),
+        messages.get_string('NGram.KANJI_7_28'),
+        messages.get_string('NGram.KANJI_7_29'),
+        messages.get_string('NGram.KANJI_7_32'),
+        messages.get_string('NGram.KANJI_7_33'),
+        messages.get_string('NGram.KANJI_7_35'),
+        messages.get_string('NGram.KANJI_7_37')]
+
+    CJK_MAP = {}  # ideograph -> first character of its CJK_CLASS group; empty until _init_cjk_map() runs
+
+    @classmethod
+    def _init_cjk_map(cls):  # Fills CJK_MAP from the groups in CJK_CLASS.
+        for cjk_list in cls.CJK_CLASS:
+            representative = cjk_list[0]  # the first character of a group stands for every member
+            for ch in cjk_list:
+                cls.CJK_MAP[ch] = representative
+
+NGram._init_cjk_map()  # runs once when this module is imported, so CJK_MAP is populated before normalize() is used
diff --git a/nlp_resource_data/langdetect/utils/ngram.pyc b/nlp_resource_data/langdetect/utils/ngram.pyc
new file mode 100755 (executable)
index 0000000..ac43ed1
Binary files /dev/null and b/nlp_resource_data/langdetect/utils/ngram.pyc differ
diff --git a/nlp_resource_data/langdetect/utils/unicode_block.py b/nlp_resource_data/langdetect/utils/unicode_block.py
new file mode 100755 (executable)
index 0000000..f074189
--- /dev/null
@@ -0,0 +1,465 @@
+UNICODE_BASIC_LATIN = 1  # Integer ids, one per Unicode block (1..220); the _unicode_blocks table below pairs each id with two hexadecimal bounds.
+UNICODE_LATIN_1_SUPPLEMENT = 2
+UNICODE_LATIN_EXTENDED_A = 3
+UNICODE_LATIN_EXTENDED_B = 4
+UNICODE_IPA_EXTENSIONS = 5
+UNICODE_SPACING_MODIFIER_LETTERS = 6
+UNICODE_COMBINING_DIACRITICAL_MARKS = 7
+UNICODE_GREEK_AND_COPTIC = 8
+UNICODE_CYRILLIC = 9
+UNICODE_CYRILLIC_SUPPLEMENT = 10
+UNICODE_ARMENIAN = 11
+UNICODE_HEBREW = 12
+UNICODE_ARABIC = 13
+UNICODE_SYRIAC = 14
+UNICODE_ARABIC_SUPPLEMENT = 15
+UNICODE_THAANA = 16
+UNICODE_NKO = 17
+UNICODE_SAMARITAN = 18
+UNICODE_MANDAIC = 19
+UNICODE_ARABIC_EXTENDED_A = 20
+UNICODE_DEVANAGARI = 21
+UNICODE_BENGALI = 22
+UNICODE_GURMUKHI = 23
+UNICODE_GUJARATI = 24
+UNICODE_ORIYA = 25
+UNICODE_TAMIL = 26
+UNICODE_TELUGU = 27
+UNICODE_KANNADA = 28
+UNICODE_MALAYALAM = 29
+UNICODE_SINHALA = 30
+UNICODE_THAI = 31
+UNICODE_LAO = 32
+UNICODE_TIBETAN = 33
+UNICODE_MYANMAR = 34
+UNICODE_GEORGIAN = 35
+UNICODE_HANGUL_JAMO = 36
+UNICODE_ETHIOPIC = 37
+UNICODE_ETHIOPIC_SUPPLEMENT = 38
+UNICODE_CHEROKEE = 39
+UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 40
+UNICODE_OGHAM = 41
+UNICODE_RUNIC = 42
+UNICODE_TAGALOG = 43
+UNICODE_HANUNOO = 44
+UNICODE_BUHID = 45
+UNICODE_TAGBANWA = 46
+UNICODE_KHMER = 47
+UNICODE_MONGOLIAN = 48
+UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED = 49
+UNICODE_LIMBU = 50
+UNICODE_TAI_LE = 51
+UNICODE_NEW_TAI_LUE = 52
+UNICODE_KHMER_SYMBOLS = 53
+UNICODE_BUGINESE = 54
+UNICODE_TAI_THAM = 55
+UNICODE_BALINESE = 56
+UNICODE_SUNDANESE = 57
+UNICODE_BATAK = 58
+UNICODE_LEPCHA = 59
+UNICODE_OL_CHIKI = 60
+UNICODE_SUNDANESE_SUPPLEMENT = 61
+UNICODE_VEDIC_EXTENSIONS = 62
+UNICODE_PHONETIC_EXTENSIONS = 63
+UNICODE_PHONETIC_EXTENSIONS_SUPPLEMENT = 64
+UNICODE_COMBINING_DIACRITICAL_MARKS_SUPPLEMENT = 65
+UNICODE_LATIN_EXTENDED_ADDITIONAL = 66
+UNICODE_GREEK_EXTENDED = 67
+UNICODE_GENERAL_PUNCTUATION = 68
+UNICODE_SUPERSCRIPTS_AND_SUBSCRIPTS = 69
+UNICODE_CURRENCY_SYMBOLS = 70
+UNICODE_COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS = 71
+UNICODE_LETTERLIKE_SYMBOLS = 72
+UNICODE_NUMBER_FORMS = 73
+UNICODE_ARROWS = 74
+UNICODE_MATHEMATICAL_OPERATORS = 75
+UNICODE_MISCELLANEOUS_TECHNICAL = 76
+UNICODE_CONTROL_PICTURES = 77
+UNICODE_OPTICAL_CHARACTER_RECOGNITION = 78
+UNICODE_ENCLOSED_ALPHANUMERICS = 79
+UNICODE_BOX_DRAWING = 80
+UNICODE_BLOCK_ELEMENTS = 81
+UNICODE_GEOMETRIC_SHAPES = 82
+UNICODE_MISCELLANEOUS_SYMBOLS = 83
+UNICODE_DINGBATS = 84
+UNICODE_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A = 85
+UNICODE_SUPPLEMENTAL_ARROWS_A = 86
+UNICODE_BRAILLE_PATTERNS = 87
+UNICODE_SUPPLEMENTAL_ARROWS_B = 88
+UNICODE_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B = 89
+UNICODE_SUPPLEMENTAL_MATHEMATICAL_OPERATORS = 90
+UNICODE_MISCELLANEOUS_SYMBOLS_AND_ARROWS = 91
+UNICODE_GLAGOLITIC = 92
+UNICODE_LATIN_EXTENDED_C = 93
+UNICODE_COPTIC = 94
+UNICODE_GEORGIAN_SUPPLEMENT = 95
+UNICODE_TIFINAGH = 96
+UNICODE_ETHIOPIC_EXTENDED = 97
+UNICODE_CYRILLIC_EXTENDED_A = 98
+UNICODE_SUPPLEMENTAL_PUNCTUATION = 99
+UNICODE_CJK_RADICALS_SUPPLEMENT = 100
+UNICODE_KANGXI_RADICALS = 101
+UNICODE_IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 102
+UNICODE_CJK_SYMBOLS_AND_PUNCTUATION = 103
+UNICODE_HIRAGANA = 104
+UNICODE_KATAKANA = 105
+UNICODE_BOPOMOFO = 106
+UNICODE_HANGUL_COMPATIBILITY_JAMO = 107
+UNICODE_KANBUN = 108
+UNICODE_BOPOMOFO_EXTENDED = 109
+UNICODE_CJK_STROKES = 110
+UNICODE_KATAKANA_PHONETIC_EXTENSIONS = 111
+UNICODE_ENCLOSED_CJK_LETTERS_AND_MONTHS = 112
+UNICODE_CJK_COMPATIBILITY = 113
+UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 114
+UNICODE_YIJING_HEXAGRAM_SYMBOLS = 115
+UNICODE_CJK_UNIFIED_IDEOGRAPHS = 116
+UNICODE_YI_SYLLABLES = 117
+UNICODE_YI_RADICALS = 118
+UNICODE_LISU = 119
+UNICODE_VAI = 120
+UNICODE_CYRILLIC_EXTENDED_B = 121
+UNICODE_BAMUM = 122
+UNICODE_MODIFIER_TONE_LETTERS = 123
+UNICODE_LATIN_EXTENDED_D = 124
+UNICODE_SYLOTI_NAGRI = 125
+UNICODE_COMMON_INDIC_NUMBER_FORMS = 126
+UNICODE_PHAGS_PA = 127
+UNICODE_SAURASHTRA = 128
+UNICODE_DEVANAGARI_EXTENDED = 129
+UNICODE_KAYAH_LI = 130
+UNICODE_REJANG = 131
+UNICODE_HANGUL_JAMO_EXTENDED_A = 132
+UNICODE_JAVANESE = 133
+UNICODE_CHAM = 134
+UNICODE_MYANMAR_EXTENDED_A = 135
+UNICODE_TAI_VIET = 136
+UNICODE_MEETEI_MAYEK_EXTENSIONS = 137
+UNICODE_ETHIOPIC_EXTENDED_A = 138
+UNICODE_MEETEI_MAYEK = 139
+UNICODE_HANGUL_SYLLABLES = 140
+UNICODE_HANGUL_JAMO_EXTENDED_B = 141
+UNICODE_HIGH_SURROGATES = 142
+UNICODE_HIGH_PRIVATE_USE_SURROGATES = 143
+UNICODE_LOW_SURROGATES = 144
+UNICODE_PRIVATE_USE_AREA = 145
+UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS = 146
+UNICODE_ALPHABETIC_PRESENTATION_FORMS = 147
+UNICODE_ARABIC_PRESENTATION_FORMS_A = 148
+UNICODE_VARIATION_SELECTORS = 149
+UNICODE_VERTICAL_FORMS = 150
+UNICODE_COMBINING_HALF_MARKS = 151
+UNICODE_CJK_COMPATIBILITY_FORMS = 152
+UNICODE_SMALL_FORM_VARIANTS = 153
+UNICODE_ARABIC_PRESENTATION_FORMS_B = 154
+UNICODE_HALFWIDTH_AND_FULLWIDTH_FORMS = 155
+UNICODE_SPECIALS = 156
+UNICODE_LINEAR_B_SYLLABARY = 157
+UNICODE_LINEAR_B_IDEOGRAMS = 158
+UNICODE_AEGEAN_NUMBERS = 159
+UNICODE_ANCIENT_GREEK_NUMBERS = 160
+UNICODE_ANCIENT_SYMBOLS = 161
+UNICODE_PHAISTOS_DISC = 162
+UNICODE_LYCIAN = 163
+UNICODE_CARIAN = 164
+UNICODE_OLD_ITALIC = 165
+UNICODE_GOTHIC = 166
+UNICODE_UGARITIC = 167
+UNICODE_OLD_PERSIAN = 168
+UNICODE_DESERET = 169
+UNICODE_SHAVIAN = 170
+UNICODE_OSMANYA = 171
+UNICODE_CYPRIOT_SYLLABARY = 172
+UNICODE_IMPERIAL_ARAMAIC = 173
+UNICODE_PHOENICIAN = 174
+UNICODE_LYDIAN = 175
+UNICODE_MEROITIC_HIEROGLYPHS = 176
+UNICODE_MEROITIC_CURSIVE = 177
+UNICODE_KHAROSHTHI = 178
+UNICODE_OLD_SOUTH_ARABIAN = 179
+UNICODE_AVESTAN = 180
+UNICODE_INSCRIPTIONAL_PARTHIAN = 181
+UNICODE_INSCRIPTIONAL_PAHLAVI = 182
+UNICODE_OLD_TURKIC = 183
+UNICODE_RUMI_NUMERAL_SYMBOLS = 184
+UNICODE_BRAHMI = 185
+UNICODE_KAITHI = 186
+UNICODE_SORA_SOMPENG = 187
+UNICODE_CHAKMA = 188
+UNICODE_SHARADA = 189
+UNICODE_TAKRI = 190
+UNICODE_CUNEIFORM = 191
+UNICODE_CUNEIFORM_NUMBERS_AND_PUNCTUATION = 192
+UNICODE_EGYPTIAN_HIEROGLYPHS = 193
+UNICODE_BAMUM_SUPPLEMENT = 194
+UNICODE_MIAO = 195
+UNICODE_KANA_SUPPLEMENT = 196
+UNICODE_BYZANTINE_MUSICAL_SYMBOLS = 197
+UNICODE_MUSICAL_SYMBOLS = 198
+UNICODE_ANCIENT_GREEK_MUSICAL_NOTATION = 199
+UNICODE_TAI_XUAN_JING_SYMBOLS = 200
+UNICODE_COUNTING_ROD_NUMERALS = 201
+UNICODE_MATHEMATICAL_ALPHANUMERIC_SYMBOLS = 202
+UNICODE_ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS = 203
+UNICODE_MAHJONG_TILES = 204
+UNICODE_DOMINO_TILES = 205
+UNICODE_PLAYING_CARDS = 206
+UNICODE_ENCLOSED_ALPHANUMERIC_SUPPLEMENT = 207
+UNICODE_ENCLOSED_IDEOGRAPHIC_SUPPLEMENT = 208
+UNICODE_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS = 209
+UNICODE_EMOTICONS = 210
+UNICODE_TRANSPORT_AND_MAP_SYMBOLS = 211
+UNICODE_ALCHEMICAL_SYMBOLS = 212
+UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B = 213
+UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C = 214
+UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D = 215
+UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT = 216
+UNICODE_TAGS = 217
+UNICODE_VARIATION_SELECTORS_SUPPLEMENT = 218
+UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_A = 219
+UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_B = 220
+
+
+_unicode_blocks = [  # rows: (block id, first code point, last code point)
+    (UNICODE_BASIC_LATIN, 0x0000, 0x007F),  # both bounds are inclusive
+    (UNICODE_LATIN_1_SUPPLEMENT, 0x0080, 0x00FF),  # rows ascend by code point
+    (UNICODE_LATIN_EXTENDED_A, 0x0100, 0x017F),
+    (UNICODE_LATIN_EXTENDED_B, 0x0180, 0x024F),
+    (UNICODE_IPA_EXTENSIONS, 0x0250, 0x02AF),
+    (UNICODE_SPACING_MODIFIER_LETTERS, 0x02B0, 0x02FF),
+    (UNICODE_COMBINING_DIACRITICAL_MARKS, 0x0300, 0x036F),
+    (UNICODE_GREEK_AND_COPTIC, 0x0370, 0x03FF),
+    (UNICODE_CYRILLIC, 0x0400, 0x04FF),
+    (UNICODE_CYRILLIC_SUPPLEMENT, 0x0500, 0x052F),
+    (UNICODE_ARMENIAN, 0x0530, 0x058F),
+    (UNICODE_HEBREW, 0x0590, 0x05FF),
+    (UNICODE_ARABIC, 0x0600, 0x06FF),
+    (UNICODE_SYRIAC, 0x0700, 0x074F),
+    (UNICODE_ARABIC_SUPPLEMENT, 0x0750, 0x077F),
+    (UNICODE_THAANA, 0x0780, 0x07BF),
+    (UNICODE_NKO, 0x07C0, 0x07FF),
+    (UNICODE_SAMARITAN, 0x0800, 0x083F),
+    (UNICODE_MANDAIC, 0x0840, 0x085F),
+    (UNICODE_ARABIC_EXTENDED_A, 0x08A0, 0x08FF),  # 0x0860-0x089F is not listed: gaps exist
+    (UNICODE_DEVANAGARI, 0x0900, 0x097F),
+    (UNICODE_BENGALI, 0x0980, 0x09FF),
+    (UNICODE_GURMUKHI, 0x0A00, 0x0A7F),
+    (UNICODE_GUJARATI, 0x0A80, 0x0AFF),
+    (UNICODE_ORIYA, 0x0B00, 0x0B7F),
+    (UNICODE_TAMIL, 0x0B80, 0x0BFF),
+    (UNICODE_TELUGU, 0x0C00, 0x0C7F),
+    (UNICODE_KANNADA, 0x0C80, 0x0CFF),
+    (UNICODE_MALAYALAM, 0x0D00, 0x0D7F),
+    (UNICODE_SINHALA, 0x0D80, 0x0DFF),
+    (UNICODE_THAI, 0x0E00, 0x0E7F),
+    (UNICODE_LAO, 0x0E80, 0x0EFF),
+    (UNICODE_TIBETAN, 0x0F00, 0x0FFF),
+    (UNICODE_MYANMAR, 0x1000, 0x109F),
+    (UNICODE_GEORGIAN, 0x10A0, 0x10FF),
+    (UNICODE_HANGUL_JAMO, 0x1100, 0x11FF),
+    (UNICODE_ETHIOPIC, 0x1200, 0x137F),
+    (UNICODE_ETHIOPIC_SUPPLEMENT, 0x1380, 0x139F),
+    (UNICODE_CHEROKEE, 0x13A0, 0x13FF),
+    (UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, 0x1400, 0x167F),
+    (UNICODE_OGHAM, 0x1680, 0x169F),
+    (UNICODE_RUNIC, 0x16A0, 0x16FF),
+    (UNICODE_TAGALOG, 0x1700, 0x171F),
+    (UNICODE_HANUNOO, 0x1720, 0x173F),
+    (UNICODE_BUHID, 0x1740, 0x175F),
+    (UNICODE_TAGBANWA, 0x1760, 0x177F),
+    (UNICODE_KHMER, 0x1780, 0x17FF),
+    (UNICODE_MONGOLIAN, 0x1800, 0x18AF),
+    (UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED, 0x18B0, 0x18FF),
+    (UNICODE_LIMBU, 0x1900, 0x194F),
+    (UNICODE_TAI_LE, 0x1950, 0x197F),
+    (UNICODE_NEW_TAI_LUE, 0x1980, 0x19DF),
+    (UNICODE_KHMER_SYMBOLS, 0x19E0, 0x19FF),
+    (UNICODE_BUGINESE, 0x1A00, 0x1A1F),
+    (UNICODE_TAI_THAM, 0x1A20, 0x1AAF),
+    (UNICODE_BALINESE, 0x1B00, 0x1B7F),
+    (UNICODE_SUNDANESE, 0x1B80, 0x1BBF),
+    (UNICODE_BATAK, 0x1BC0, 0x1BFF),
+    (UNICODE_LEPCHA, 0x1C00, 0x1C4F),
+    (UNICODE_OL_CHIKI, 0x1C50, 0x1C7F),
+    (UNICODE_SUNDANESE_SUPPLEMENT, 0x1CC0, 0x1CCF),
+    (UNICODE_VEDIC_EXTENSIONS, 0x1CD0, 0x1CFF),
+    (UNICODE_PHONETIC_EXTENSIONS, 0x1D00, 0x1D7F),
+    (UNICODE_PHONETIC_EXTENSIONS_SUPPLEMENT, 0x1D80, 0x1DBF),
+    (UNICODE_COMBINING_DIACRITICAL_MARKS_SUPPLEMENT, 0x1DC0, 0x1DFF),
+    (UNICODE_LATIN_EXTENDED_ADDITIONAL, 0x1E00, 0x1EFF),
+    (UNICODE_GREEK_EXTENDED, 0x1F00, 0x1FFF),
+    (UNICODE_GENERAL_PUNCTUATION, 0x2000, 0x206F),
+    (UNICODE_SUPERSCRIPTS_AND_SUBSCRIPTS, 0x2070, 0x209F),
+    (UNICODE_CURRENCY_SYMBOLS, 0x20A0, 0x20CF),
+    (UNICODE_COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS, 0x20D0, 0x20FF),
+    (UNICODE_LETTERLIKE_SYMBOLS, 0x2100, 0x214F),
+    (UNICODE_NUMBER_FORMS, 0x2150, 0x218F),
+    (UNICODE_ARROWS, 0x2190, 0x21FF),
+    (UNICODE_MATHEMATICAL_OPERATORS, 0x2200, 0x22FF),
+    (UNICODE_MISCELLANEOUS_TECHNICAL, 0x2300, 0x23FF),
+    (UNICODE_CONTROL_PICTURES, 0x2400, 0x243F),
+    (UNICODE_OPTICAL_CHARACTER_RECOGNITION, 0x2440, 0x245F),
+    (UNICODE_ENCLOSED_ALPHANUMERICS, 0x2460, 0x24FF),
+    (UNICODE_BOX_DRAWING, 0x2500, 0x257F),
+    (UNICODE_BLOCK_ELEMENTS, 0x2580, 0x259F),
+    (UNICODE_GEOMETRIC_SHAPES, 0x25A0, 0x25FF),
+    (UNICODE_MISCELLANEOUS_SYMBOLS, 0x2600, 0x26FF),
+    (UNICODE_DINGBATS, 0x2700, 0x27BF),
+    (UNICODE_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A, 0x27C0, 0x27EF),
+    (UNICODE_SUPPLEMENTAL_ARROWS_A, 0x27F0, 0x27FF),
+    (UNICODE_BRAILLE_PATTERNS, 0x2800, 0x28FF),
+    (UNICODE_SUPPLEMENTAL_ARROWS_B, 0x2900, 0x297F),
+    (UNICODE_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B, 0x2980, 0x29FF),
+    (UNICODE_SUPPLEMENTAL_MATHEMATICAL_OPERATORS, 0x2A00, 0x2AFF),
+    (UNICODE_MISCELLANEOUS_SYMBOLS_AND_ARROWS, 0x2B00, 0x2BFF),
+    (UNICODE_GLAGOLITIC, 0x2C00, 0x2C5F),
+    (UNICODE_LATIN_EXTENDED_C, 0x2C60, 0x2C7F),
+    (UNICODE_COPTIC, 0x2C80, 0x2CFF),
+    (UNICODE_GEORGIAN_SUPPLEMENT, 0x2D00, 0x2D2F),
+    (UNICODE_TIFINAGH, 0x2D30, 0x2D7F),
+    (UNICODE_ETHIOPIC_EXTENDED, 0x2D80, 0x2DDF),
+    (UNICODE_CYRILLIC_EXTENDED_A, 0x2DE0, 0x2DFF),
+    (UNICODE_SUPPLEMENTAL_PUNCTUATION, 0x2E00, 0x2E7F),
+    (UNICODE_CJK_RADICALS_SUPPLEMENT, 0x2E80, 0x2EFF),
+    (UNICODE_KANGXI_RADICALS, 0x2F00, 0x2FDF),
+    (UNICODE_IDEOGRAPHIC_DESCRIPTION_CHARACTERS, 0x2FF0, 0x2FFF),
+    (UNICODE_CJK_SYMBOLS_AND_PUNCTUATION, 0x3000, 0x303F),
+    (UNICODE_HIRAGANA, 0x3040, 0x309F),
+    (UNICODE_KATAKANA, 0x30A0, 0x30FF),
+    (UNICODE_BOPOMOFO, 0x3100, 0x312F),
+    (UNICODE_HANGUL_COMPATIBILITY_JAMO, 0x3130, 0x318F),
+    (UNICODE_KANBUN, 0x3190, 0x319F),
+    (UNICODE_BOPOMOFO_EXTENDED, 0x31A0, 0x31BF),
+    (UNICODE_CJK_STROKES, 0x31C0, 0x31EF),
+    (UNICODE_KATAKANA_PHONETIC_EXTENSIONS, 0x31F0, 0x31FF),
+    (UNICODE_ENCLOSED_CJK_LETTERS_AND_MONTHS, 0x3200, 0x32FF),
+    (UNICODE_CJK_COMPATIBILITY, 0x3300, 0x33FF),
+    (UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, 0x3400, 0x4DBF),
+    (UNICODE_YIJING_HEXAGRAM_SYMBOLS, 0x4DC0, 0x4DFF),
+    (UNICODE_CJK_UNIFIED_IDEOGRAPHS, 0x4E00, 0x9FFF),
+    (UNICODE_YI_SYLLABLES, 0xA000, 0xA48F),
+    (UNICODE_YI_RADICALS, 0xA490, 0xA4CF),
+    (UNICODE_LISU, 0xA4D0, 0xA4FF),
+    (UNICODE_VAI, 0xA500, 0xA63F),
+    (UNICODE_CYRILLIC_EXTENDED_B, 0xA640, 0xA69F),
+    (UNICODE_BAMUM, 0xA6A0, 0xA6FF),
+    (UNICODE_MODIFIER_TONE_LETTERS, 0xA700, 0xA71F),
+    (UNICODE_LATIN_EXTENDED_D, 0xA720, 0xA7FF),
+    (UNICODE_SYLOTI_NAGRI, 0xA800, 0xA82F),
+    (UNICODE_COMMON_INDIC_NUMBER_FORMS, 0xA830, 0xA83F),
+    (UNICODE_PHAGS_PA, 0xA840, 0xA87F),
+    (UNICODE_SAURASHTRA, 0xA880, 0xA8DF),
+    (UNICODE_DEVANAGARI_EXTENDED, 0xA8E0, 0xA8FF),
+    (UNICODE_KAYAH_LI, 0xA900, 0xA92F),
+    (UNICODE_REJANG, 0xA930, 0xA95F),
+    (UNICODE_HANGUL_JAMO_EXTENDED_A, 0xA960, 0xA97F),
+    (UNICODE_JAVANESE, 0xA980, 0xA9DF),
+    (UNICODE_CHAM, 0xAA00, 0xAA5F),
+    (UNICODE_MYANMAR_EXTENDED_A, 0xAA60, 0xAA7F),
+    (UNICODE_TAI_VIET, 0xAA80, 0xAADF),
+    (UNICODE_MEETEI_MAYEK_EXTENSIONS, 0xAAE0, 0xAAFF),
+    (UNICODE_ETHIOPIC_EXTENDED_A, 0xAB00, 0xAB2F),
+    (UNICODE_MEETEI_MAYEK, 0xABC0, 0xABFF),
+    (UNICODE_HANGUL_SYLLABLES, 0xAC00, 0xD7AF),
+    (UNICODE_HANGUL_JAMO_EXTENDED_B, 0xD7B0, 0xD7FF),
+    (UNICODE_HIGH_SURROGATES, 0xD800, 0xDB7F),
+    (UNICODE_HIGH_PRIVATE_USE_SURROGATES, 0xDB80, 0xDBFF),
+    (UNICODE_LOW_SURROGATES, 0xDC00, 0xDFFF),
+    (UNICODE_PRIVATE_USE_AREA, 0xE000, 0xF8FF),
+    (UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS, 0xF900, 0xFAFF),
+    (UNICODE_ALPHABETIC_PRESENTATION_FORMS, 0xFB00, 0xFB4F),
+    (UNICODE_ARABIC_PRESENTATION_FORMS_A, 0xFB50, 0xFDFF),
+    (UNICODE_VARIATION_SELECTORS, 0xFE00, 0xFE0F),
+    (UNICODE_VERTICAL_FORMS, 0xFE10, 0xFE1F),
+    (UNICODE_COMBINING_HALF_MARKS, 0xFE20, 0xFE2F),
+    (UNICODE_CJK_COMPATIBILITY_FORMS, 0xFE30, 0xFE4F),
+    (UNICODE_SMALL_FORM_VARIANTS, 0xFE50, 0xFE6F),
+    (UNICODE_ARABIC_PRESENTATION_FORMS_B, 0xFE70, 0xFEFF),
+    (UNICODE_HALFWIDTH_AND_FULLWIDTH_FORMS, 0xFF00, 0xFFEF),
+    (UNICODE_SPECIALS, 0xFFF0, 0xFFFF),
+    (UNICODE_LINEAR_B_SYLLABARY, 0x10000, 0x1007F),
+    (UNICODE_LINEAR_B_IDEOGRAMS, 0x10080, 0x100FF),
+    (UNICODE_AEGEAN_NUMBERS, 0x10100, 0x1013F),
+    (UNICODE_ANCIENT_GREEK_NUMBERS, 0x10140, 0x1018F),
+    (UNICODE_ANCIENT_SYMBOLS, 0x10190, 0x101CF),
+    (UNICODE_PHAISTOS_DISC, 0x101D0, 0x101FF),
+    (UNICODE_LYCIAN, 0x10280, 0x1029F),
+    (UNICODE_CARIAN, 0x102A0, 0x102DF),
+    (UNICODE_OLD_ITALIC, 0x10300, 0x1032F),
+    (UNICODE_GOTHIC, 0x10330, 0x1034F),
+    (UNICODE_UGARITIC, 0x10380, 0x1039F),
+    (UNICODE_OLD_PERSIAN, 0x103A0, 0x103DF),
+    (UNICODE_DESERET, 0x10400, 0x1044F),
+    (UNICODE_SHAVIAN, 0x10450, 0x1047F),
+    (UNICODE_OSMANYA, 0x10480, 0x104AF),
+    (UNICODE_CYPRIOT_SYLLABARY, 0x10800, 0x1083F),
+    (UNICODE_IMPERIAL_ARAMAIC, 0x10840, 0x1085F),
+    (UNICODE_PHOENICIAN, 0x10900, 0x1091F),
+    (UNICODE_LYDIAN, 0x10920, 0x1093F),
+    (UNICODE_MEROITIC_HIEROGLYPHS, 0x10980, 0x1099F),
+    (UNICODE_MEROITIC_CURSIVE, 0x109A0, 0x109FF),
+    (UNICODE_KHAROSHTHI, 0x10A00, 0x10A5F),
+    (UNICODE_OLD_SOUTH_ARABIAN, 0x10A60, 0x10A7F),
+    (UNICODE_AVESTAN, 0x10B00, 0x10B3F),
+    (UNICODE_INSCRIPTIONAL_PARTHIAN, 0x10B40, 0x10B5F),
+    (UNICODE_INSCRIPTIONAL_PAHLAVI, 0x10B60, 0x10B7F),
+    (UNICODE_OLD_TURKIC, 0x10C00, 0x10C4F),
+    (UNICODE_RUMI_NUMERAL_SYMBOLS, 0x10E60, 0x10E7F),
+    (UNICODE_BRAHMI, 0x11000, 0x1107F),
+    (UNICODE_KAITHI, 0x11080, 0x110CF),
+    (UNICODE_SORA_SOMPENG, 0x110D0, 0x110FF),
+    (UNICODE_CHAKMA, 0x11100, 0x1114F),
+    (UNICODE_SHARADA, 0x11180, 0x111DF),
+    (UNICODE_TAKRI, 0x11680, 0x116CF),
+    (UNICODE_CUNEIFORM, 0x12000, 0x123FF),
+    (UNICODE_CUNEIFORM_NUMBERS_AND_PUNCTUATION, 0x12400, 0x1247F),
+    (UNICODE_EGYPTIAN_HIEROGLYPHS, 0x13000, 0x1342F),
+    (UNICODE_BAMUM_SUPPLEMENT, 0x16800, 0x16A3F),
+    (UNICODE_MIAO, 0x16F00, 0x16F9F),
+    (UNICODE_KANA_SUPPLEMENT, 0x1B000, 0x1B0FF),
+    (UNICODE_BYZANTINE_MUSICAL_SYMBOLS, 0x1D000, 0x1D0FF),
+    (UNICODE_MUSICAL_SYMBOLS, 0x1D100, 0x1D1FF),
+    (UNICODE_ANCIENT_GREEK_MUSICAL_NOTATION, 0x1D200, 0x1D24F),
+    (UNICODE_TAI_XUAN_JING_SYMBOLS, 0x1D300, 0x1D35F),
+    (UNICODE_COUNTING_ROD_NUMERALS, 0x1D360, 0x1D37F),
+    (UNICODE_MATHEMATICAL_ALPHANUMERIC_SYMBOLS, 0x1D400, 0x1D7FF),
+    (UNICODE_ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS, 0x1EE00, 0x1EEFF),
+    (UNICODE_MAHJONG_TILES, 0x1F000, 0x1F02F),
+    (UNICODE_DOMINO_TILES, 0x1F030, 0x1F09F),
+    (UNICODE_PLAYING_CARDS, 0x1F0A0, 0x1F0FF),
+    (UNICODE_ENCLOSED_ALPHANUMERIC_SUPPLEMENT, 0x1F100, 0x1F1FF),
+    (UNICODE_ENCLOSED_IDEOGRAPHIC_SUPPLEMENT, 0x1F200, 0x1F2FF),
+    (UNICODE_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS, 0x1F300, 0x1F5FF),
+    (UNICODE_EMOTICONS, 0x1F600, 0x1F64F),
+    (UNICODE_TRANSPORT_AND_MAP_SYMBOLS, 0x1F680, 0x1F6FF),
+    (UNICODE_ALCHEMICAL_SYMBOLS, 0x1F700, 0x1F77F),
+    (UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, 0x20000, 0x2A6DF),
+    (UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C, 0x2A700, 0x2B73F),
+    (UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D, 0x2B740, 0x2B81F),
+    (UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, 0x2F800, 0x2FA1F),
+    (UNICODE_TAGS, 0xE0000, 0xE007F),
+    (UNICODE_VARIATION_SELECTORS_SUPPLEMENT, 0xE0100, 0xE01EF),
+    (UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_A, 0xF0000, 0xFFFFF),
+    (UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_B, 0x100000, 0x10FFFF),
+]
+
+NUM_BLOCKS = len(_unicode_blocks)  # row count; bounds unicode_block()'s binary search
+
+
+def unicode_block(ch):
+    '''Return the Unicode block name for ch, or None if ch has no block.'''
+    cp = ord(ch)
+    # special case basic latin
+    if cp <= 0x7F:
+        return UNICODE_BASIC_LATIN
+    # binary search for the correct block
+    be, en = 0, NUM_BLOCKS - 1
+    while be <= en:
+        mid = (be+en) >> 1
+        name, start, end = _unicode_blocks[mid]
+        if start <= cp <= end:
+            return name
+        if cp < start:
+            en = mid-1
+        else:
+            be = mid+1
diff --git a/nlp_resource_data/langdetect/utils/unicode_block.pyc b/nlp_resource_data/langdetect/utils/unicode_block.pyc
new file mode 100755 (executable)
index 0000000..fca22dc
Binary files /dev/null and b/nlp_resource_data/langdetect/utils/unicode_block.pyc differ
index 5564c1f..bf08685 100755 (executable)
@@ -58,5 +58,5 @@ rm -rf %{buildroot}
 %{_includedir}/*.h
 %{_app_bin_dir}/*
 %{TZ_SYS_RO_PACKAGES}/org.tizen.nlp.service.xml
-
+%{_libdir}/python2.7/site-packages/langdetect/*
 %license LICENSE