From: Kenneth Reitz Date: Fri, 22 Mar 2013 19:20:49 +0000 (-0400) Subject: upgrade to charade v1.0.3 X-Git-Tag: v1.2.0~12 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=80a861cb84175cdf444ce3d0c677251635859e22;p=services%2Fpython-requests.git upgrade to charade v1.0.3 --- diff --git a/requests/packages/charade/__init__.py b/requests/packages/charade/__init__.py index 5d580b3..1aadf3e 100644 --- a/requests/packages/charade/__init__.py +++ b/requests/packages/charade/__init__.py @@ -15,10 +15,15 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -__version__ = "1.0.1" +__version__ = "1.0.3" +from sys import version_info def detect(aBuf): + if ((version_info < (3, 0) and isinstance(aBuf, unicode)) or + (version_info >= (3, 0) and not isinstance(aBuf, bytes))): + raise ValueError('Expected a bytes object, not a unicode object') + from . import universaldetector u = universaldetector.UniversalDetector() u.reset() diff --git a/requests/packages/charade/chardistribution.py b/requests/packages/charade/chardistribution.py index 981bd1a..dfd3355 100644 --- a/requests/packages/charade/chardistribution.py +++ b/requests/packages/charade/chardistribution.py @@ -40,6 +40,7 @@ from .compat import wrap_ord ENOUGH_DATA_THRESHOLD = 1024 SURE_YES = 0.99 SURE_NO = 0.01 +MINIMUM_DATA_THRESHOLD = 3 class CharDistributionAnalysis: @@ -82,7 +83,7 @@ class CharDistributionAnalysis: """return confidence based on existing data""" # if we didn't receive any character in our consideration range, # return negative answer - if self._mTotalChars <= 0: + if self._mTotalChars <= 0 or self._mFreqChars <= MINIMUM_DATA_THRESHOLD: return SURE_NO if self._mTotalChars != self._mFreqChars: diff --git a/requests/packages/charade/compat.py b/requests/packages/charade/compat.py index f86c46b..d9e30ad 100644 --- a/requests/packages/charade/compat.py +++ b/requests/packages/charade/compat.py @@ -18,9 +18,17 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +import sys + + +if sys.version_info < (3, 0): + base_str = (str, unicode) +else: + base_str = (bytes, str) + def wrap_ord(a): - if isinstance(a, str): + if sys.version_info < (3, 0) and isinstance(a, base_str): return ord(a) - elif isinstance(a, int): + else: return a diff --git a/requests/packages/charade/langcyrillicmodel.py b/requests/packages/charade/langcyrillicmodel.py index 4b69c82..15e338f 100644 --- a/requests/packages/charade/langcyrillicmodel.py +++ b/requests/packages/charade/langcyrillicmodel.py @@ -25,8 +25,6 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from . import constants - # KOI8-R language model # Character Mapping Table: KOI8R_CharToOrderMap = ( diff --git a/requests/packages/charade/langgreekmodel.py b/requests/packages/charade/langgreekmodel.py index 78e9ce6..93241ce 100644 --- a/requests/packages/charade/langgreekmodel.py +++ b/requests/packages/charade/langgreekmodel.py @@ -25,8 +25,6 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from . import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word diff --git a/requests/packages/charade/langhebrewmodel.py b/requests/packages/charade/langhebrewmodel.py index 4c6b3ce..d871324 100644 --- a/requests/packages/charade/langhebrewmodel.py +++ b/requests/packages/charade/langhebrewmodel.py @@ -27,8 +27,6 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from . import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word diff --git a/requests/packages/charade/langhungarianmodel.py b/requests/packages/charade/langhungarianmodel.py index bd7f505..6f59c61 100644 --- a/requests/packages/charade/langhungarianmodel.py +++ b/requests/packages/charade/langhungarianmodel.py @@ -25,8 +25,6 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from . import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word diff --git a/requests/packages/charade/mbcsgroupprober.py b/requests/packages/charade/mbcsgroupprober.py index ebe93d0..2f6f5e8 100644 --- a/requests/packages/charade/mbcsgroupprober.py +++ b/requests/packages/charade/mbcsgroupprober.py @@ -33,6 +33,7 @@ from .sjisprober import SJISProber from .eucjpprober import EUCJPProber from .gb2312prober import GB2312Prober from .euckrprober import EUCKRProber +from .cp949prober import CP949Prober from .big5prober import Big5Prober from .euctwprober import EUCTWProber @@ -46,6 +47,7 @@ class MBCSGroupProber(CharSetGroupProber): EUCJPProber(), GB2312Prober(), EUCKRProber(), + CP949Prober(), Big5Prober(), EUCTWProber() ] diff --git a/requests/packages/charade/mbcssm.py b/requests/packages/charade/mbcssm.py index 3a720c9..55c02f0 100644 --- a/requests/packages/charade/mbcssm.py +++ b/requests/packages/charade/mbcssm.py @@ -78,6 +78,46 @@ Big5SMModel = {'classTable': BIG5_cls, 'charLenTable': Big5CharLenTable, 'name': 'Big5'} +# CP949 + +CP949_cls = ( + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f + 1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f + 1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f + 4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f + 1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f + 5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f + 0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f + 6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f + 6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af + 7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf + 7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff +) + +CP949_st = ( +#cls= 0 1 2 3 4 5 6 7 8 9 # previous state = + eError,eStart, 3,eError,eStart,eStart, 4, 5,eError, 6, # eStart + eError,eError,eError,eError,eError,eError,eError,eError,eError,eError, # eError + eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe, # eItsMe + eError,eError,eStart,eStart,eError,eError,eError,eStart,eStart,eStart, # 3 + eError,eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 4 + eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 5 + eError,eStart,eStart,eStart,eStart,eError,eError,eStart,eStart,eStart, # 6 +) + +CP949CharLenTable = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2) + +CP949SMModel = {'classTable': CP949_cls, + 'classFactor': 10, + 'stateTable': CP949_st, + 'charLenTable': CP949CharLenTable, + 'name': 'CP949'} + # EUC-JP EUCJP_cls = ( diff --git a/requests/packages/charade/universaldetector.py b/requests/packages/charade/universaldetector.py index adaae72..6175bfb 100644 --- a/requests/packages/charade/universaldetector.py +++ b/requests/packages/charade/universaldetector.py @@ -28,6 +28,7 @@ from . import constants import sys +import codecs from .latin1prober import Latin1Prober # windows-1252 from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets from .sbcsgroupprober import SBCSGroupProber # single-byte character sets @@ -70,31 +71,31 @@ class UniversalDetector: if not self._mGotData: # If the data starts with BOM, we know it is UTF - if aBuf[:3] == '\xEF\xBB\xBF': + if aBuf[:3] == codecs.BOM: # EF BB BF UTF-8 with BOM self.result = {'encoding': "UTF-8", 'confidence': 1.0} - elif aBuf[:4] == '\xFF\xFE\x00\x00': + elif aBuf[:4] == codecs.BOM_UTF32_LE: # FF FE 00 00 UTF-32, little-endian BOM self.result = {'encoding': "UTF-32LE", 'confidence': 1.0} - elif aBuf[:4] == '\x00\x00\xFE\xFF': + elif aBuf[:4] == codecs.BOM_UTF32_BE: # 00 00 FE FF UTF-32, big-endian BOM self.result = {'encoding': "UTF-32BE", 'confidence': 1.0} - elif aBuf[:4] == '\xFE\xFF\x00\x00': + elif aBuf[:4] == b'\xFE\xFF\x00\x00': # FE FF 00 00 UCS-4, unusual octet order BOM (3412) self.result = { 'encoding': "X-ISO-10646-UCS-4-3412", 'confidence': 1.0 } - elif aBuf[:4] == '\x00\x00\xFF\xFE': + elif aBuf[:4] == b'\x00\x00\xFF\xFE': # 00 00 FF FE UCS-4, unusual octet order BOM (2143) self.result = { 'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0 } - elif aBuf[:2] == '\xFF\xFE': + elif aBuf[:2] == codecs.BOM_LE: # FF FE UTF-16, little endian BOM self.result = {'encoding': "UTF-16LE", 'confidence': 1.0} - elif aBuf[:2] == '\xFE\xFF': + elif aBuf[:2] == codecs.BOM_BE: # FE FF UTF-16, big endian BOM self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}