2 **********************************************************************
3 * Copyright (C) 2005-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_CONVERSION
// 24-bit mask: keeps only the low three bytes of a value. NGramParser::addByte
// applies it after shifting a new byte into the rolling n-gram.
17 #define N_GRAM_MASK 0xFFFFFF
// Element count of an array: total size divided by the size of element 0.
// Only meaningful for a real array, not for a pointer.
18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
// Constructs a parser for one n-gram table / character map pair.
// Visible effects: the rolling n-gram and the read position start at 0, the
// n-gram table pointer is stored, and both counters are cleared.
// NOTE(review): this extract is missing lines from the body; theCharMap is
// presumably stored into charMap (which parseCharacters indexes) - confirm
// against the full file.
22 NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
23 : ngram(0), byteIndex(0)
25 ngramList = theNgramList;
// No n-grams have been examined or matched yet.
28 ngramCount = hitCount = 0;
32 * Binary search for value in table, which must have exactly 64 entries.
// Looks for `value` in `table` using six fixed probes at strides
// 32, 16, 8, 4, 2 and 1 (32+16+8+4+2+1 = 63, the last index of a 64-entry
// table), followed by a final adjustment and an exact-match check.
// NOTE(review): the statements inside each `if` are not present in this
// extract; presumably each successful probe advances `index` by the probed
// stride - confirm against the full file.
35 int32_t NGramParser::search(const int32_t *table, int32_t value)
// Probe the upper half first, then successively smaller strides.
39 if (table[index + 32] <= value) {
43 if (table[index + 16] <= value) {
47 if (table[index + 8] <= value) {
51 if (table[index + 4] <= value) {
55 if (table[index + 2] <= value) {
59 if (table[index + 1] <= value) {
// The entry at the current position is larger than value (the adjustment
// made in that case is not visible here).
63 if (table[index] > value) {
// Miss: index fell below the table or the entry is not an exact match.
// lookup() treats a negative result from this function as "not found".
67 if (index < 0 || table[index] != value) {
// Tests one n-gram against this parser's n-gram table.
// NOTE(review): the counter updates are not visible in this extract;
// presumably ngramCount and hitCount (read by parse()) are updated here -
// confirm.
74 void NGramParser::lookup(int32_t thisNgram)
// A non-negative search() result means the n-gram is in the table.
78 if (search(ngramList, thisNgram) >= 0) {
// Pushes one byte into the rolling n-gram.
84 void NGramParser::addByte(int32_t b)
// Shift the accumulator left by one byte, add b, and keep only the low
// 24 bits (N_GRAM_MASK), so the oldest byte drops out.
86 ngram = ((ngram << 8) + b) & N_GRAM_MASK;
// Returns the input byte at byteIndex and advances byteIndex by one.
90 int32_t NGramParser::nextByte(InputText *det)
// End-of-input guard. NOTE(review): the guarded statement is not visible;
// callers loop while the result is >= 0, so it presumably returns a
// negative value - confirm.
92 if (byteIndex >= det->fInputLen) {
96 return det->fInputBytes[byteIndex++];
// Reads the input one byte at a time, maps each byte through charMap, and
// tracks runs of the mapped value 0x20 (treated as space).
// NOTE(review): the statement guarded by the inner `if` is not visible in
// this extract; presumably it feeds mb to addByte() - confirm.
99 void NGramParser::parseCharacters(InputText *det)
// True when the previously mapped byte was a space.
102 bool ignoreSpace = FALSE;
// nextByte() yields a negative value to end the loop.
104 while ((b = nextByte(det)) >= 0) {
105 uint8_t mb = charMap[b];
107 // TODO: 0x20 might not be a space in all character sets...
// Skip a space that directly follows another space; everything else passes.
109 if (!(mb == 0x20 && ignoreSpace)) {
113 ignoreSpace = (mb == 0x20);
// Runs the character pass over the input and converts the hit ratio into an
// integer confidence value.
// NOTE(review): lines are missing from this extract, including the body of
// the `rawPercent > 0.33` branch and whatever precedes the division. If
// ngramCount can still be 0 at the division, the ratio is 0/0 - confirm
// whether the full file guards against that.
118 int32_t NGramParser::parse(InputText *det)
120 parseCharacters(det);
122 // TODO: Is this OK? The buffer could have ended in the middle of a word...
// Fraction of examined n-grams that were found in the table.
125 double rawPercent = (double) hitCount / (double) ngramCount;
127 // if (rawPercent <= 2.0) {
131 // TODO - This is a bit of a hack to take care of a case
132 // where we were getting a confidence of 135...
// Ratios above 0.33 are handled separately (0.33 * 300 = 99).
133 if (rawPercent > 0.33) {
// Scale the ratio by 300 and truncate to an integer.
137 return (int32_t) (rawPercent * 300.0);
140 #if !UCONFIG_NO_NON_HTML5_CONVERSION
141 static const uint8_t unshapeMap_IBM420[] = {
142 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
143 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
144 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
145 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
146 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
147 /* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
148 /* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
149 /* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
150 /* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
151 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
152 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
153 /* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
154 /* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
155 /* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
156 /* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
157 /* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
158 /* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
// Constructs the IBM420 parser by forwarding both tables to the NGramParser
// base-class constructor.
// NOTE(review): the constructor body is not visible in this extract.
161 NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap):NGramParser(theNgramList, theCharMap)
// Classifies byte b against three pairs of byte values:
// 0xB2/0xB3, 0xB4/0xB5 and 0xB8/0xB9.
// NOTE(review): the return statements are not visible in this extract. The
// caller stores the result in `alef` and later maps it through charMap, so
// each pair presumably yields a distinct byte and other input yields 0 -
// confirm against the full file.
167 int32_t NGramParser_IBM420::isLamAlef(int32_t b)
169 if(b == 0xB2 || b == 0xB3){
171 }else if(b == 0xB4 || b == 0xB5){
173 }else if(b == 0xB8 || b == 0xB9){
180 * Arabic shaping needs to be done manually. Cannot call ArabicShaping class
181 * because CharsetDetector is dealing with bytes not Unicode code points. We could
182 * convert the bytes to Unicode code points but that would leave us dependent
183 * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
184 * of JDK can produce different results and therefore is also avoided.
// IBM420 variant of nextByte(): besides the end-of-input test it also stops
// on a zero byte, records the isLamAlef() result for the current byte, and
// translates the byte through unshapeMap_IBM420.
// NOTE(review): the guarded statement, the byteIndex advance and the return
// are not visible in this extract - confirm against the full file.
186 int32_t NGramParser_IBM420::nextByte(InputText *det)
// Stop at the end of the buffer or at the first 0x00 byte.
189 if (byteIndex >= det->fInputLen || det->fInputBytes[byteIndex] == 0) {
// Saved in a member so parseCharacters() can read it after this call.
194 alef = isLamAlef(det->fInputBytes[byteIndex]);
// Masking with 0xFF keeps both the table index and the result in 0..255.
198 next = unshapeMap_IBM420[det->fInputBytes[byteIndex]& 0xFF] & 0xFF;
// IBM420 variant of parseCharacters(): each byte from nextByte() goes through
// the charMap / space-run check, and then the `alef` value recorded by
// nextByte() goes through the same check a second time.
// NOTE(review): the statements guarded by the `if`s, and any condition around
// the `alef` stage, are not visible in this extract - confirm against the
// full file.
205 void NGramParser_IBM420::parseCharacters(InputText *det)
// True when the previously mapped byte was a space.
208 bool ignoreSpace = FALSE;
210 while ((b = nextByte(det)) >= 0) {
211 uint8_t mb = charMap[b];
213 // TODO: 0x20 might not be a space in all character sets...
// Skip a space that directly follows another space.
215 if (!(mb == 0x20 && ignoreSpace)) {
218 ignoreSpace = (mb == 0x20);
// Second stage: map the recorded alef value (low 8 bits) through charMap.
222 mb = charMap[alef & 0xFF];
224 // TODO: 0x20 might not be a space in all character sets...
226 if (!(mb == 0x20 && ignoreSpace)) {
230 ignoreSpace = (mb == 0x20);
// Default constructor; the only visible content is the comment below.
238 CharsetRecog_sbcs::CharsetRecog_sbcs()
240 // nothing else to do
// Destructor. NOTE(review): the body is not visible in this extract.
243 CharsetRecog_sbcs::~CharsetRecog_sbcs()
// Scores the input against one n-gram table / byte map pair using a
// local NGramParser.
// NOTE(review): the declaration of `result` and the return statement are not
// visible in this extract; presumably the parse() value is returned - confirm.
248 int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const
250 NGramParser parser(ngrams, byteMap);
// parse() walks the input and returns the integer confidence value.
253 result = parser.parse(det);
258 static const uint8_t charMap_8859_1[] = {
259 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
260 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
261 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
262 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
263 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
264 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
265 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
266 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
267 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
268 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
269 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
270 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
271 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
272 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
273 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
274 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
275 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
276 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
277 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
278 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
279 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
280 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
281 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
282 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
283 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
284 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
285 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
286 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
287 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
288 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
289 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
290 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
293 static const uint8_t charMap_8859_2[] = {
294 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
295 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
296 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
297 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
298 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
299 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
300 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
301 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
302 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
303 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
304 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
305 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
306 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
307 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
308 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
309 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
310 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
311 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
312 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
313 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
314 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20,
315 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
316 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7,
317 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
318 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
319 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
320 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
321 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
322 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
323 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
324 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
325 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
328 static const uint8_t charMap_8859_5[] = {
329 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
330 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
331 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
332 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
333 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
334 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
335 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
336 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
337 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
338 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
339 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
340 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
341 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
342 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
343 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
344 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
345 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
346 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
347 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
348 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
349 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
350 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
351 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
352 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
353 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
354 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
355 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
356 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
357 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
358 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
359 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
360 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
363 static const uint8_t charMap_8859_6[] = {
364 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
365 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
366 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
367 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
368 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
369 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
370 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
371 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
372 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
373 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
374 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
375 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
376 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
377 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
378 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
379 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
380 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
381 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
382 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
383 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
384 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
385 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
386 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
387 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
388 0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
389 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
390 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
391 0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
392 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
393 0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
394 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
395 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
398 static const uint8_t charMap_8859_7[] = {
399 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
400 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
401 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
402 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
403 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
404 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
405 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
406 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
407 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
408 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
409 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
410 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
411 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
412 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
413 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
414 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
415 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
416 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
417 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
418 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
419 0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20,
420 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
421 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20,
422 0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
423 0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
424 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
425 0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
426 0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
427 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
428 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
429 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
430 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
433 static const uint8_t charMap_8859_8[] = {
434 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
435 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
436 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
437 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
438 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
439 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
440 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
441 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
442 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
443 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
444 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
445 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
446 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
447 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
448 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
449 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
450 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
451 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
452 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
453 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
454 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
455 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
456 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
457 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
458 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
459 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
460 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
461 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
462 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
463 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
464 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
465 0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
468 static const uint8_t charMap_8859_9[] = {
469 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
470 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
471 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
472 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
473 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
474 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
475 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
476 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
477 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
478 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
479 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
480 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
481 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
482 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
483 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
484 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
485 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
486 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
487 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
488 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
489 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
490 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
491 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
492 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
493 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
494 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
495 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
496 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
497 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
498 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
499 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
500 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
503 static const int32_t ngrams_windows_1251[] = {
504 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
505 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
506 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
507 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
510 static const uint8_t charMap_windows_1251[] = {
511 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
512 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
513 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
514 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
515 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
516 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
517 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
518 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
519 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
520 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
521 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
522 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
523 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
524 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
525 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
526 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
527 0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
528 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
529 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
530 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
531 0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20,
532 0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
533 0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20,
534 0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
535 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
536 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
537 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
538 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
539 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
540 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
541 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
542 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
545 static const int32_t ngrams_windows_1256[] = {
546 0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
547 0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
548 0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
549 0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
552 static const uint8_t charMap_windows_1256[] = {
553 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
554 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
555 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
556 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
557 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
558 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
559 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
560 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
561 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
562 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
563 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
564 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
565 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
566 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
567 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
568 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
569 0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
570 0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
571 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
572 0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
573 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
574 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
575 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
576 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
577 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
578 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
579 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20,
580 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
581 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
582 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
583 0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20,
584 0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
587 static const int32_t ngrams_KOI8_R[] = {
588 0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
589 0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
590 0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
591 0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
594 static const uint8_t charMap_KOI8_R[] = {
595 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
596 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
597 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
598 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
599 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
600 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
601 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
602 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
603 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
604 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
605 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
606 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
607 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
608 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
609 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
610 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
611 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
612 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
613 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
614 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
615 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
616 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
617 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
618 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
619 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
620 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
621 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
622 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
623 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
624 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
625 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
626 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
629 #if !UCONFIG_NO_NON_HTML5_CONVERSION
630 static const int32_t ngrams_IBM424_he_rtl[] = {
631 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
632 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
633 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
634 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
637 static const int32_t ngrams_IBM424_he_ltr[] = {
638 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
639 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
640 0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
641 0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
644 static const uint8_t charMap_IBM424_he[] = {
645 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
646 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
647 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
648 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
649 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
650 /* 4- */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
651 /* 5- */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
652 /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
653 /* 7- */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
654 /* 8- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
655 /* 9- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
656 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
657 /* B- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
658 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
659 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
660 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
661 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
664 static const int32_t ngrams_IBM420_ar_rtl[] = {
665 0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
666 0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
667 0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
668 0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
671 static const int32_t ngrams_IBM420_ar_ltr[] = {
672 0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
673 0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
674 0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
675 0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
678 static const uint8_t charMap_IBM420_ar[]= {
679 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
680 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
681 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
682 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
683 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
684 /* 4- */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
685 /* 5- */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
686 /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
687 /* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
688 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
689 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
690 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
691 /* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
692 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
693 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
694 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
695 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
699 //ISO-8859-1,2,5,6,7,8,9 Ngrams
// Record type used by the per-language n-gram tables that follow.
// NOTE(review): only the first member is visible in this extract; the
// remaining members and the closing brace are missing - confirm against the
// full file.
701 struct NGramsPlusLang {
// Fixed 64-entry n-gram table.
702 const int32_t ngrams[64];
// Per-language trigram tables scanned by CharsetRecog_8859_1::match().
// Each value packs three bytes, oldest byte in the most significant position
// (the parser builds them as ((ngram << 8) + b) & 0xFFFFFF). Every group below
// holds 64 values in ascending order, as the fixed-size binary search requires.
// The initializer lines carrying each entry's language tag and braces are not
// visible in this listing, so the language notes below are inferred from the
// trigrams themselves and must be confirmed.
706 static const NGramsPlusLang ngrams_8859_1[] = {
// Entry 1 — presumably English (0x207468 " th", 0x746865 "the", 0x696E67 "ing"); confirm.
709 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
710 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
711 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
712 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
// Entry 2 — presumably Danish (0x206166 " af", 0x206F67 " og", 0x70E520 "p\xE5 "); confirm.
718 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
719 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
720 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
721 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
// Entry 3 — presumably German (0x756E64 "und", 0x736368 "sch", 0x696368 "ich"); confirm.
727 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
728 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
729 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
730 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
// Entry 4 — presumably Spanish (0x20656C " el", 0x717565 "que", 0x69F36E "i\xF3n"); confirm.
736 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
737 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
738 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
739 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
// Entry 5 — presumably French (0x206574 " et", 0x6C6573 "les", 0x20E020 " \xE0 "); confirm.
745 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
746 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
747 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
748 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
// Entry 6 — presumably Italian (0x206469 " di", 0x636865 "che", 0x7A696F "zio"); confirm.
754 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
755 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
756 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
757 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
// Entry 7 — presumably Dutch (0x686574 "het", 0x76616E "van", 0x65656E "een"); confirm.
763 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
764 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
765 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
766 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
// Entry 8 — presumably Norwegian (0x206176 " av", 0x696B6B "ikk", 0x206F67 " og"); confirm.
772 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
773 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
774 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
775 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
// Entry 9 — presumably Portuguese (0xE36F20 "\xE3o ", 0xE7E36F "\xE7\xE3o", 0x717565 "que"); confirm.
781 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
782 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
783 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
784 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
// Entry 10 — presumably Swedish (0x6F6368 "och", 0x20E472 " \xE4r"); confirm.
790 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
791 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
792 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
793 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
// Per-language trigram tables scanned by CharsetRecog_8859_2::match().
// Same layout as the other NGramsPlusLang tables: each group is 64 ascending
// three-byte values. The language-tag initializer lines are not visible in this
// listing; the language notes below are inferred from the data and must be confirmed.
800 static const NGramsPlusLang ngrams_8859_2[] = {
// Entry 1 — presumably Czech (0x2070F8 and 0x6EED20 use ISO-8859-2 bytes F8/ED); confirm.
803 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
804 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
805 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
806 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
// Entry 2 — presumably Hungarian (0x20737A " sz", 0x656779 "egy", 0x20E973 " \xE9s"); confirm.
812 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
813 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
814 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
815 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
// Entry 3 — presumably Polish (0x20637A " cz", 0x6E6965 "nie", 0x70727A "prz"); confirm.
821 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
822 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
823 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
824 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
// Entry 4 — presumably Romanian (0x20BA69 and 0x20EE6E use ISO-8859-2 bytes BA/EE); confirm.
830 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
831 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
832 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
833 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
// Trigram table that CharsetRecog_8859_5_ru::match() passes to match_sbcs()
// together with charMap_8859_5. 64 ascending values, three bytes per value.
839 static const int32_t ngrams_8859_5_ru[] = {
840 0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
841 0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
842 0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
843 0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
// Trigram table that CharsetRecog_8859_6_ar::match() passes to match_sbcs()
// together with charMap_8859_6. 64 ascending values, three bytes per value.
846 static const int32_t ngrams_8859_6_ar[] = {
847 0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
848 0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
849 0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
850 0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
// Trigram table that CharsetRecog_8859_7_el::match() passes to match_sbcs()
// together with charMap_8859_7. 64 ascending values, three bytes per value.
853 static const int32_t ngrams_8859_7_el[] = {
854 0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
855 0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
856 0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
857 0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
// Trigram table that CharsetRecog_8859_8_I_he::match() passes to match_sbcs()
// together with charMap_8859_8. 64 ascending values, three bytes per value.
860 static const int32_t ngrams_8859_8_I_he[] = {
861 0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
862 0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
863 0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
864 0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
// Trigram table that CharsetRecog_8859_8_he::match() passes to match_sbcs()
// together with charMap_8859_8. 64 ascending values, three bytes per value.
// NOTE(review): presumably the visual-order counterpart of ngrams_8859_8_I_he — confirm.
867 static const int32_t ngrams_8859_8_he[] = {
868 0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
869 0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
870 0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
871 0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
// Trigram table that CharsetRecog_8859_9_tr::match() passes to match_sbcs()
// together with charMap_8859_9. 64 ascending values, three bytes per value.
874 static const int32_t ngrams_8859_9_tr[] = {
875 0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
876 0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
877 0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
878 0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
// Destructor for CharsetRecog_8859_1; its body is not visible in this listing.
881 CharsetRecog_8859_1::~CharsetRecog_8859_1()
// Scores the input against every language entry in ngrams_8859_1 and records
// the best one in `results`.
//   textIn  - input being examined; also supplies the fC1Bytes flag.
//   results - receives confidence, charset name and language of the best entry.
// Returns true when the best confidence is greater than zero.
886 UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const {
// Report the Windows code page name instead of the ISO name when fC1Bytes is set
// (presumably meaning the input contains bytes in the C1 range — confirm in InputText).
887 const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1";
// Starts below any possible score so that the first entry whose confidence is
// at least 0 is recorded. (The declaration of loop index `i` is not visible in
// this listing.)
889 int32_t bestConfidenceSoFar = -1;
890 for (i=0; i < ARRAY_SIZE(ngrams_8859_1) ; i++) {
891 const int32_t *ngrams = ngrams_8859_1[i].ngrams;
892 const char *lang = ngrams_8859_1[i].lang;
893 int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1);
// Strict comparison: on a tie the earlier table entry is kept.
894 if (confidence > bestConfidenceSoFar) {
895 results->set(textIn, this, confidence, name, lang);
896 bestConfidenceSoFar = confidence;
899 return (bestConfidenceSoFar > 0);
// Name accessor for CharsetRecog_8859_1; the returned string is not visible in
// this listing. Note that match() passes its own name ("windows-1252" or
// "ISO-8859-1") to the results object directly.
902 const char *CharsetRecog_8859_1::getName() const
// Destructor for CharsetRecog_8859_2; its body is not visible in this listing.
908 CharsetRecog_8859_2::~CharsetRecog_8859_2()
// Scores the input against every language entry in ngrams_8859_2 and records
// the best one in `results`.
//   textIn  - input being examined; also supplies the fC1Bytes flag.
//   results - receives confidence, charset name and language of the best entry.
// Returns true when the best confidence is greater than zero.
913 UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const {
// Report the Windows code page name instead of the ISO name when fC1Bytes is set
// (presumably meaning the input contains bytes in the C1 range — confirm in InputText).
914 const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2";
// Starts below any possible score so that the first entry whose confidence is
// at least 0 is recorded. (The declaration of loop index `i` is not visible in
// this listing.)
916 int32_t bestConfidenceSoFar = -1;
917 for (i=0; i < ARRAY_SIZE(ngrams_8859_2) ; i++) {
918 const int32_t *ngrams = ngrams_8859_2[i].ngrams;
919 const char *lang = ngrams_8859_2[i].lang;
920 int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2);
// Strict comparison: on a tie the earlier table entry is kept.
921 if (confidence > bestConfidenceSoFar) {
922 results->set(textIn, this, confidence, name, lang);
923 bestConfidenceSoFar = confidence;
926 return (bestConfidenceSoFar > 0);
// Name accessor for CharsetRecog_8859_2; the returned string is not visible in
// this listing. Note that match() passes its own name ("windows-1250" or
// "ISO-8859-2") to the results object directly.
929 const char *CharsetRecog_8859_2::getName() const
// Destructor for CharsetRecog_8859_5; its body is not visible in this listing.
935 CharsetRecog_8859_5::~CharsetRecog_8859_5()
// Name accessor for CharsetRecog_8859_5; the returned string is not visible in
// this listing (presumably "ISO-8859-5" — confirm).
940 const char *CharsetRecog_8859_5::getName() const
// Destructor for CharsetRecog_8859_5_ru; its body is not visible in this listing.
945 CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
// Language accessor for CharsetRecog_8859_5_ru; the returned string is not
// visible in this listing (presumably "ru" — confirm).
950 const char *CharsetRecog_8859_5_ru::getLanguage() const
// Scores the input against ngrams_8859_5_ru using charMap_8859_5.
//   textIn  - input being examined.
//   results - receives the confidence; it is set unconditionally, even when
//             the confidence is not positive.
// Returns true when the confidence is greater than zero.
955 UBool CharsetRecog_8859_5_ru::match(InputText *textIn, CharsetMatch *results) const
957 int32_t confidence = match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5);
// Three-argument set(): no explicit name or language is passed here.
958 results->set(textIn, this, confidence);
959 return (confidence > 0);
// Destructor for CharsetRecog_8859_6; its body is not visible in this listing.
962 CharsetRecog_8859_6::~CharsetRecog_8859_6()
// Name accessor for CharsetRecog_8859_6; the returned string is not visible in
// this listing (presumably "ISO-8859-6" — confirm).
967 const char *CharsetRecog_8859_6::getName() const
// Destructor for CharsetRecog_8859_6_ar; its body is not visible in this listing.
972 CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
// Language accessor for CharsetRecog_8859_6_ar; the returned string is not
// visible in this listing (presumably "ar" — confirm).
977 const char *CharsetRecog_8859_6_ar::getLanguage() const
// Scores the input against ngrams_8859_6_ar using charMap_8859_6.
//   textIn  - input being examined.
//   results - receives the confidence; it is set unconditionally, even when
//             the confidence is not positive.
// Returns true when the confidence is greater than zero.
982 UBool CharsetRecog_8859_6_ar::match(InputText *textIn, CharsetMatch *results) const
984 int32_t confidence = match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6);
// Three-argument set(): no explicit name or language is passed here.
985 results->set(textIn, this, confidence);
986 return (confidence > 0);
// Destructor for CharsetRecog_8859_7; its body is not visible in this listing.
989 CharsetRecog_8859_7::~CharsetRecog_8859_7()
// Name accessor for CharsetRecog_8859_7; the returned string is not visible in
// this listing (presumably "ISO-8859-7" — confirm).
994 const char *CharsetRecog_8859_7::getName() const
// Destructor for CharsetRecog_8859_7_el; its body is not visible in this listing.
999 CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
// Language accessor for CharsetRecog_8859_7_el; the returned string is not
// visible in this listing. match() reports the language as "el".
1004 const char *CharsetRecog_8859_7_el::getLanguage() const
// Scores the input against ngrams_8859_7_el using charMap_8859_7.
//   textIn  - input being examined; also supplies the fC1Bytes flag.
//   results - receives confidence, charset name and the language "el"; it is
//             set unconditionally, even when the confidence is not positive.
// Returns true when the confidence is greater than zero.
1009 UBool CharsetRecog_8859_7_el::match(InputText *textIn, CharsetMatch *results) const
// Report the Windows code page name instead of the ISO name when fC1Bytes is set
// (presumably meaning the input contains bytes in the C1 range — confirm in InputText).
1011 const char *name = textIn->fC1Bytes? "windows-1253" : "ISO-8859-7";
1012 int32_t confidence = match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7);
1013 results->set(textIn, this, confidence, name, "el");
1014 return (confidence > 0);
// Destructor for CharsetRecog_8859_8; its body is not visible in this listing.
1017 CharsetRecog_8859_8::~CharsetRecog_8859_8()
// Returns the fixed charset name "ISO-8859-8" for this recognizer.
1022 const char *CharsetRecog_8859_8::getName() const
1024 return "ISO-8859-8";
// Destructor for CharsetRecog_8859_8_I_he; its body is not visible in this listing.
1027 CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
// Returns the fixed charset name "ISO-8859-8-I" for this recognizer. match()
// may instead report "windows-1255" to the results object.
1032 const char *CharsetRecog_8859_8_I_he::getName() const
1034 return "ISO-8859-8-I";
// Language accessor for CharsetRecog_8859_8_I_he; the returned string is not
// visible in this listing. match() reports the language as "he".
1037 const char *CharsetRecog_8859_8_I_he::getLanguage() const
// Scores the input against ngrams_8859_8_I_he using charMap_8859_8.
//   textIn  - input being examined; also supplies the fC1Bytes flag.
//   results - receives confidence, charset name and the language "he"; it is
//             set unconditionally, even when the confidence is not positive.
// Returns true when the confidence is greater than zero.
1042 UBool CharsetRecog_8859_8_I_he::match(InputText *textIn, CharsetMatch *results) const
// Report the Windows code page name instead of the ISO name when fC1Bytes is set
// (presumably meaning the input contains bytes in the C1 range — confirm in InputText).
1044 const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8-I";
1045 int32_t confidence = match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8);
1046 results->set(textIn, this, confidence, name, "he");
1047 return (confidence > 0);
// Destructor for CharsetRecog_8859_8_he; its body is not visible in this listing.
1050 CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
// Language accessor for CharsetRecog_8859_8_he; the returned string is not
// visible in this listing. match() reports the language as "he".
1055 const char *CharsetRecog_8859_8_he::getLanguage() const
// Scores the input against ngrams_8859_8_he using charMap_8859_8.
//   textIn  - input being examined; also supplies the fC1Bytes flag.
//   results - receives confidence, charset name and the language "he"; it is
//             set unconditionally, even when the confidence is not positive.
// Returns true when the confidence is greater than zero.
1060 UBool CharsetRecog_8859_8_he::match(InputText *textIn, CharsetMatch *results) const
// Report the Windows code page name instead of the ISO name when fC1Bytes is set
// (presumably meaning the input contains bytes in the C1 range — confirm in InputText).
1062 const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8";
1063 int32_t confidence = match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8);
1064 results->set(textIn, this, confidence, name, "he");
1065 return (confidence > 0);
// Destructor for CharsetRecog_8859_9; its body is not visible in this listing.
1068 CharsetRecog_8859_9::~CharsetRecog_8859_9()
// Returns the fixed charset name "ISO-8859-9" for this recognizer.
1073 const char *CharsetRecog_8859_9::getName() const
1075 return "ISO-8859-9";
// Destructor for CharsetRecog_8859_9_tr; its body is not visible in this listing.
1078 CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
// Language accessor for CharsetRecog_8859_9_tr; the returned string is not
// visible in this listing. match() reports the language as "tr".
1083 const char *CharsetRecog_8859_9_tr::getLanguage() const
// Scores the input against ngrams_8859_9_tr using charMap_8859_9.
//   textIn  - input being examined; also supplies the fC1Bytes flag.
//   results - receives confidence, charset name and the language "tr"; it is
//             set unconditionally, even when the confidence is not positive.
// Returns true when the confidence is greater than zero.
1088 UBool CharsetRecog_8859_9_tr::match(InputText *textIn, CharsetMatch *results) const
// Report the Windows code page name instead of the ISO name when fC1Bytes is set
// (presumably meaning the input contains bytes in the C1 range — confirm in InputText).
1090 const char *name = textIn->fC1Bytes? "windows-1254" : "ISO-8859-9";
1091 int32_t confidence = match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9);
1092 results->set(textIn, this, confidence, name, "tr");
1093 return (confidence > 0);
// Destructor for CharsetRecog_windows_1256; its body is not visible in this listing.
1096 CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
// Returns the fixed charset name "windows-1256" for this recognizer.
1101 const char *CharsetRecog_windows_1256::getName() const
1103 return "windows-1256";
// Language accessor for CharsetRecog_windows_1256; the returned string is not
// visible in this listing.
1106 const char *CharsetRecog_windows_1256::getLanguage() const
// Scores the input against ngrams_windows_1256 using charMap_windows_1256.
//   textIn  - input being examined.
//   results - receives the confidence; it is set unconditionally, even when
//             the confidence is not positive.
// Returns true when the confidence is greater than zero.
1111 UBool CharsetRecog_windows_1256::match(InputText *textIn, CharsetMatch *results) const
1113 int32_t confidence = match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256);
// Three-argument set(): no explicit name or language is passed here.
1114 results->set(textIn, this, confidence);
1115 return (confidence > 0);
// Destructor for CharsetRecog_windows_1251; its body is not visible in this listing.
1118 CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
// Returns the fixed charset name "windows-1251" for this recognizer.
1123 const char *CharsetRecog_windows_1251::getName() const
1125 return "windows-1251";
// Language accessor for CharsetRecog_windows_1251; the returned string is not
// visible in this listing.
1128 const char *CharsetRecog_windows_1251::getLanguage() const
// Scores the input against ngrams_windows_1251 using charMap_windows_1251.
//   textIn  - input being examined.
//   results - receives the confidence; it is set unconditionally, even when
//             the confidence is not positive.
// Returns true when the confidence is greater than zero.
1133 UBool CharsetRecog_windows_1251::match(InputText *textIn, CharsetMatch *results) const
1135 int32_t confidence = match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251);
// Three-argument set(): no explicit name or language is passed here.
1136 results->set(textIn, this, confidence);
1137 return (confidence > 0);
// Destructor for CharsetRecog_KOI8_R; its body is not visible in this listing.
1140 CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
// Name accessor for CharsetRecog_KOI8_R; the returned string is not visible in
// this listing (presumably "KOI8-R" — confirm).
1145 const char *CharsetRecog_KOI8_R::getName() const
// Language accessor for CharsetRecog_KOI8_R; the returned string is not
// visible in this listing.
1150 const char *CharsetRecog_KOI8_R::getLanguage() const
// Scores the input against ngrams_KOI8_R using charMap_KOI8_R.
//   textIn  - input being examined.
//   results - receives the confidence; it is set unconditionally, even when
//             the confidence is not positive.
// Returns true when the confidence is greater than zero.
1155 UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const
1157 int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R);
// Three-argument set(): no explicit name or language is passed here.
1158 results->set(textIn, this, confidence);
1159 return (confidence > 0);
1162 #if !UCONFIG_NO_NON_HTML5_CONVERSION
// Destructor for CharsetRecog_IBM424_he; its body is not visible in this
// listing. Compiled only when UCONFIG_NO_NON_HTML5_CONVERSION is 0.
1163 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
// Language accessor for CharsetRecog_IBM424_he; the returned string is not
// visible in this listing (presumably "he" — confirm).
1168 const char *CharsetRecog_IBM424_he::getLanguage() const
// Destructor for CharsetRecog_IBM424_he_rtl; its body is not visible in this listing.
1173 CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
// Returns the fixed charset name "IBM424_rtl" for this recognizer.
1178 const char *CharsetRecog_IBM424_he_rtl::getName() const
1180 return "IBM424_rtl";
// Scores the input against ngrams_IBM424_he_rtl using charMap_IBM424_he.
//   textIn  - input being examined.
//   results - receives the confidence; it is set unconditionally, even when
//             the confidence is not positive.
// Returns true when the confidence is greater than zero.
1183 UBool CharsetRecog_IBM424_he_rtl::match(InputText *textIn, CharsetMatch *results) const
1185 int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);
// Three-argument set(): no explicit name or language is passed here.
1186 results->set(textIn, this, confidence);
1187 return (confidence > 0);
// Destructor for CharsetRecog_IBM424_he_ltr; its body is not visible in this listing.
1190 CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
// Returns the fixed charset name "IBM424_ltr" for this recognizer.
1195 const char *CharsetRecog_IBM424_he_ltr::getName() const
1197 return "IBM424_ltr";
// Scores the input against ngrams_IBM424_he_ltr using charMap_IBM424_he.
//   textIn  - input being examined.
//   results - receives the confidence; it is set unconditionally, even when
//             the confidence is not positive.
// Returns true when the confidence is greater than zero.
1200 UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results) const
1202 int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);
// Three-argument set(): no explicit name or language is passed here.
1203 results->set(textIn, this, confidence);
1204 return (confidence > 0);
// Destructor for CharsetRecog_IBM420_ar; its body is not visible in this listing.
1207 CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
// Language accessor for CharsetRecog_IBM420_ar; the returned string is not
// visible in this listing (presumably "ar" — confirm).
1212 const char *CharsetRecog_IBM420_ar::getLanguage() const
// IBM420-specific scoring helper: runs an NGramParser_IBM420 (rather than the
// plain NGramParser) over the input.
//   det     - input being examined.
//   ngrams  - trigram table handed to the parser.
//   byteMap - byte mapping table handed to the parser.
// The declaration of `result` and the return statement are not visible in this
// listing; presumably the parser's result is returned as the confidence — confirm.
1218 int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const
1220 NGramParser_IBM420 parser(ngrams, byteMap);
1223 result = parser.parse(det);
// Destructor for CharsetRecog_IBM420_ar_rtl; its body is not visible in this listing.
1228 CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
// Returns the fixed charset name "IBM420_rtl" for this recognizer.
1233 const char *CharsetRecog_IBM420_ar_rtl::getName() const
1235 return "IBM420_rtl";
// Scores the input against ngrams_IBM420_ar_rtl using charMap_IBM420_ar.
//   textIn  - input being examined.
//   results - receives the confidence; it is set unconditionally, even when
//             the confidence is not positive.
// Returns true when the confidence is greater than zero.
1238 UBool CharsetRecog_IBM420_ar_rtl::match(InputText *textIn, CharsetMatch *results) const
1240 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);
// Three-argument set(): no explicit name or language is passed here.
1241 results->set(textIn, this, confidence);
1242 return (confidence > 0);
// Destructor for CharsetRecog_IBM420_ar_ltr; its body is not visible in this listing.
1245 CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
// Returns the fixed charset name "IBM420_ltr" for this recognizer.
1250 const char *CharsetRecog_IBM420_ar_ltr::getName() const
1252 return "IBM420_ltr";
// Scores the input against ngrams_IBM420_ar_ltr using charMap_IBM420_ar.
//   textIn  - input being examined.
//   results - receives the confidence; it is set unconditionally, even when
//             the confidence is not positive.
// Returns true when the confidence is greater than zero.
1255 UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results) const
1257 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);
// Three-argument set(): no explicit name or language is passed here.
1258 results->set(textIn, this, confidence);
1259 return (confidence > 0);