'charLenTable': Big5CharLenTable,\r
'name': 'Big5'}\r
\r
+# CP949\r
+#\r
+# Byte-class table: one entry per byte value 0x00-0xff, sixteen per row\r
+# (the covered range is noted at the end of each row). Each entry is the\r
+# class number (0-9) assigned to that byte.\r
+\r
+CP949_cls = (\r
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f\r
+ 1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f\r
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f\r
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f\r
+ 1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f\r
+ 4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f\r
+ 1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f\r
+ 5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f\r
+ 0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f\r
+ 6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f\r
+ 6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af\r
+ 7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf\r
+ 7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf\r
+ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df\r
+ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef\r
+ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff\r
+)\r
+\r
+# Transition table: one row per previous state (named at the end of each\r
+# row) and one column per byte class 0-9, so each row has ten entries,\r
+# matching 'classFactor' in the model dict below. Rows 3-6 are the\r
+# intermediate states that eStart can transition into.\r
+CP949_st = (\r
+#cls= 0 1 2 3 4 5 6 7 8 9 # previous state =\r
+ eError,eStart, 3,eError,eStart,eStart, 4, 5,eError, 6, # eStart\r
+ eError,eError,eError,eError,eError,eError,eError,eError,eError,eError, # eError\r
+ eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe, # eItsMe\r
+ eError,eError,eStart,eStart,eError,eError,eError,eStart,eStart,eStart, # 3\r
+ eError,eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 4\r
+ eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 5\r
+ eError,eStart,eStart,eStart,eStart,eError,eError,eStart,eStart,eStart, # 6\r
+)\r
+\r
+# One entry per byte class (ten entries, indexed by class number).\r
+# NOTE(review): presumably the character length implied by a first byte\r
+# of that class - confirm against the code that consumes 'charLenTable'.\r
+CP949CharLenTable = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)\r
+\r
+# Bundles the CP949 tables under the same 'charLenTable' / 'name' keys that\r
+# close the Big5 model dict just above this section.\r
+CP949SMModel = {'classTable': CP949_cls,\r
+ 'classFactor': 10,\r
+ 'stateTable': CP949_st,\r
+ 'charLenTable': CP949CharLenTable,\r
+ 'name': 'CP949'}\r
+\r
# EUC-JP\r
\r
EUCJP_cls = (\r
\r
from . import constants\r
import sys\r
+import codecs\r
from .latin1prober import Latin1Prober # windows-1252\r
from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets\r
from .sbcsgroupprober import SBCSGroupProber # single-byte character sets\r
\r
if not self._mGotData:\r
# If the data starts with BOM, we know it is UTF\r
- if aBuf[:3] == '\xEF\xBB\xBF':\r
+ # codecs.BOM is the 2-byte native-order UTF-16 mark and can never equal\r
+ # a 3-byte slice; the UTF-8 signature EF BB BF is codecs.BOM_UTF8.\r
+ if aBuf[:3] == codecs.BOM_UTF8:\r
# EF BB BF UTF-8 with BOM\r
self.result = {'encoding': "UTF-8", 'confidence': 1.0}\r
- elif aBuf[:4] == '\xFF\xFE\x00\x00':\r
+ elif aBuf[:4] == codecs.BOM_UTF32_LE:\r
# FF FE 00 00 UTF-32, little-endian BOM\r
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}\r
- elif aBuf[:4] == '\x00\x00\xFE\xFF':\r
+ elif aBuf[:4] == codecs.BOM_UTF32_BE:\r
# 00 00 FE FF UTF-32, big-endian BOM\r
self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}\r
- elif aBuf[:4] == '\xFE\xFF\x00\x00':\r
+ elif aBuf[:4] == b'\xFE\xFF\x00\x00':\r
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)\r
self.result = {\r
'encoding': "X-ISO-10646-UCS-4-3412",\r
'confidence': 1.0\r
}\r
- elif aBuf[:4] == '\x00\x00\xFF\xFE':\r
+ elif aBuf[:4] == b'\x00\x00\xFF\xFE':\r
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)\r
self.result = {\r
'encoding': "X-ISO-10646-UCS-4-2143",\r
'confidence': 1.0\r
}\r
- elif aBuf[:2] == '\xFF\xFE':\r
+ elif aBuf[:2] == codecs.BOM_LE:\r
# FF FE UTF-16, little endian BOM\r
self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}\r
- elif aBuf[:2] == '\xFE\xFF':\r
+ elif aBuf[:2] == codecs.BOM_BE:\r
# FE FF UTF-16, big endian BOM\r
self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}\r
\r