1 Index: source/common/ucnv2022.cpp
2 ===================================================================
3 --- source/common/ucnv2022.cpp (revision 259715)
4 +++ source/common/ucnv2022.cpp (working copy)
8 /* is the StateEnum charset value for a DBCS charset? */
9 +#if UCONFIG_NO_NON_HTML5_CONVERSION
10 +#define IS_JP_DBCS(cs) (JISX208==(cs))
12 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
15 #define CSM(cs) ((uint16_t)1<<(cs))
18 * all versions, not just JIS7 and JIS8.
19 * - ICU does not distinguish between different versions of JIS X 0208.
21 +#if UCONFIG_NO_NON_HTML5_CONVERSION
22 +enum { MAX_JA_VERSION=0 };
24 enum { MAX_JA_VERSION=4 };
26 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
28 + * TODO(jshin): The encoding spec has JISX212, but we don't support it.
29 + * See https://www.w3.org/Bugs/Public/show_bug.cgi?id=26885
31 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
32 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
33 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
34 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
35 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
36 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
42 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
46 /* Type def for refactoring changeState_2022 code*/
48 #ifdef U_ENABLE_GENERIC_ISO_2022
51 +#if UCONFIG_NO_NON_HTML5_CONVERSION
60 /*********** ISO 2022 Converter Protos ***********/
62 /* prevent indexing beyond jpCharsetMasks[] */
63 myConverterData->version = version = 0;
65 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
66 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
67 myConverterData->myConverterArray[ISO8859_7] =
68 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
71 myConverterData->myConverterArray[JISX208] =
72 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
73 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
74 if(jpCharsetMasks[version]&CSM(JISX212)) {
75 myConverterData->myConverterArray[JISX212] =
76 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
78 myConverterData->myConverterArray[KSC5601] =
79 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
83 /* set the function pointers to appropriate funtions */
84 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
86 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
87 myConverterData->name[len+1]='\0';
89 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
90 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
91 (myLocale[2]=='_' || myLocale[2]=='\0'))
94 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
97 +#endif /* #if !UCONFIG_NO_NON_HTML5_CONVERSION */
99 #ifdef U_ENABLE_GENERIC_ISO_2022
100 myConverterData->isFirstBuffer = TRUE;
102 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
105 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
106 /*************** to unicode *******************/
107 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
108 /* 0 1 2 3 4 5 6 7 8 9 */
110 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
111 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
116 static UCNV_TableStates_2022
120 /* case SS3_STATE: not used in ISO-2022-JP-x */
121 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
124 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
126 myData2022->toU2022State.cs[2]=(int8_t)tempState;
131 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
132 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
137 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
140 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
142 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
145 +#endif /* #if !UCONFIG_NO_NON_HTML5_CONVERSION */
148 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
149 @@ -1381,12 +1410,16 @@
150 static const StateEnum jpCharsetPref[]={
153 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
158 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
166 @@ -1756,6 +1789,7 @@
170 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
172 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
173 targetValue = (uint32_t)sourceChar - 0x80;
174 @@ -1764,6 +1798,7 @@
180 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
181 if(converterData->version==3) {
182 @@ -1825,6 +1860,7 @@
186 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
188 /* G0 SBCS forced to 7-bit output */
189 len2 = MBCS_SINGLE_FROM_UCHAR32(
190 @@ -1839,6 +1875,7 @@
197 len2 = MBCS_FROM_UCHAR32_ISO2022(
198 @@ -1846,6 +1883,7 @@
200 useFallback, MBCS_OUTPUT_2);
201 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
202 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
205 * Check for valid bytes for the encoding scheme.
206 @@ -1857,6 +1895,7 @@
214 @@ -2150,6 +2189,7 @@
215 targetUniChar = mySourceChar;
218 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
220 if(mySourceChar <= 0x7f) {
221 targetUniChar = mySourceChar + 0x80;
222 @@ -2168,6 +2208,7 @@
223 /* return from a single-shift state to the previous one */
224 pToU2022State->g=pToU2022State->prevG;
228 if(mySourceChar <= 0x7f) {
229 targetUniChar = jisx201ToU(mySourceChar);
230 @@ -2207,9 +2248,11 @@
232 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
233 mySourceChar = tmpSourceChar;
234 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
236 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
239 tempBuf[0] = (char)(tmpSourceChar >> 8);
240 tempBuf[1] = (char)(tmpSourceChar);
242 @@ -2271,6 +2314,7 @@
246 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
247 /***************************************************************
248 * Rules for ISO-2022-KR encoding
249 * i) The KSC5601 designator sequence should appear only once in a file,
250 @@ -3414,6 +3458,7 @@
251 args->target = myTarget;
252 args->source = mySource;
254 +#endif /* #if !UCONFIG_NO_NON_HTML5_CONVERSION */
257 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
258 @@ -3615,6 +3660,7 @@
259 /* include JIS X 0201 which is hardcoded */
260 sa->add(sa->set, 0xa5);
261 sa->add(sa->set, 0x203e);
262 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
263 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
264 /* include Latin-1 for some variants of JP */
265 sa->addRange(sa->set, 0, 0xff);
266 @@ -3622,6 +3668,10 @@
267 /* include ASCII for JP */
268 sa->addRange(sa->set, 0, 0x7f);
271 + /* include ASCII for JP */
272 + sa->addRange(sa->set, 0, 0x7f);
274 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
276 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
277 @@ -3640,6 +3690,7 @@
278 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
281 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
284 /* include ASCII for CN */
285 @@ -3651,6 +3702,7 @@
286 cnvData->currentConverter, sa, which, pErrorCode);
287 /* the loop over myConverterArray[] will simply not find another converter */
293 @@ -3671,10 +3723,16 @@
294 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
295 UConverterSetFilter filter;
296 if(cnvData->myConverterArray[i]!=NULL) {
297 - if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
298 - cnvData->version==0 && i==CNS_11643
300 + if(cnvData->locale[0]=='j' && i==JISX208) {
302 + * Only add code points that map to Shift-JIS codes
303 + * corresponding to JIS X 0208.
305 + filter=UCNV_SET_FILTER_SJIS;
306 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
307 + } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
308 + cnvData->version==0 && i==CNS_11643) {
310 * Version-specific for CN:
311 * CN version 0 does not map CNS planes 3..7 although
312 * they are all available in the CNS conversion table;
313 @@ -3682,18 +3740,13 @@
314 * The two versions create different Unicode sets.
316 filter=UCNV_SET_FILTER_2022_CN;
317 - } else if(cnvData->locale[0]=='j' && i==JISX208) {
319 - * Only add code points that map to Shift-JIS codes
320 - * corresponding to JIS X 0208.
322 - filter=UCNV_SET_FILTER_SJIS;
323 } else if(i==KSC5601) {
325 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
326 * are broader than GR94.
328 filter=UCNV_SET_FILTER_GR94DBCS;
331 filter=UCNV_SET_FILTER_NONE;
333 @@ -3831,6 +3884,7 @@
337 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
338 /************* KR ***************/
339 static const UConverterImpl _ISO2022KRImpl={
341 @@ -3947,5 +4001,6 @@
345 +#endif /* #if !UCONFIG_NO_NON_HTML5_CONVERSION */
347 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
348 Index: source/common/ucnvbocu.cpp
349 ===================================================================
350 --- source/common/ucnvbocu.cpp (revision 259715)
351 +++ source/common/ucnvbocu.cpp (working copy)
354 #include "unicode/utypes.h"
356 -#if !UCONFIG_NO_CONVERSION
357 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION
359 #include "unicode/ucnv.h"
360 #include "unicode/ucnv_cb.h"
361 Index: source/common/ucnvisci.c
362 ===================================================================
363 --- source/common/ucnvisci.c (revision 259715)
364 +++ source/common/ucnvisci.c (working copy)
367 #include "unicode/utypes.h"
369 -#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
370 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION
372 #include "unicode/ucnv.h"
373 #include "unicode/ucnv_cb.h"
374 Index: source/common/ucnvscsu.c
375 ===================================================================
376 --- source/common/ucnvscsu.c (revision 259715)
377 +++ source/common/ucnvscsu.c (working copy)
380 #include "unicode/utypes.h"
382 -#if !UCONFIG_NO_CONVERSION
383 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION
385 #include "unicode/ucnv.h"
386 #include "unicode/ucnv_cb.h"
387 Index: source/common/ucnv_u7.c
388 ===================================================================
389 --- source/common/ucnv_u7.c (revision 259715)
390 +++ source/common/ucnv_u7.c (working copy)
393 #include "unicode/utypes.h"
395 -#if !UCONFIG_NO_CONVERSION
396 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION
398 #include "unicode/ucnv.h"
399 #include "ucnv_bld.h"
400 Index: source/common/unicode/uconfig.h
401 ===================================================================
402 --- source/common/unicode/uconfig.h (revision 259715)
403 +++ source/common/unicode/uconfig.h (working copy)
408 + * This switch turns off all the converters NOT listed in
409 + * the Encoding Standard: https://encoding.spec.whatwg.org
411 +#ifndef UCONFIG_NO_NON_HTML5_CONVERSION
412 +#define UCONFIG_NO_NON_HTML5_CONVERSION 0
416 * \def UCONFIG_NO_LEGACY_CONVERSION
417 * This switch turns off all converters except for
418 * - Unicode charsets (UTF-7/8/16/32, CESU-8, SCSU, BOCU-1)
419 Index: source/common/ucnv_bld.cpp
420 ===================================================================
421 --- source/common/ucnv_bld.cpp (revision 259715)
422 +++ source/common/ucnv_bld.cpp (working copy)
425 #if UCONFIG_NO_LEGACY_CONVERSION
431 +#if UCONFIG_NO_LEGACY_CONVERSION || UCONFIG_NO_NON_HTML5_CONVERSION
432 NULL, NULL, NULL, NULL, NULL, NULL,
433 NULL, NULL, NULL, NULL, NULL, NULL,
437 &_LMBCSData1,&_LMBCSData2, &_LMBCSData3, &_LMBCSData4, &_LMBCSData5, &_LMBCSData6,
438 &_LMBCSData8,&_LMBCSData11,&_LMBCSData16,&_LMBCSData17,&_LMBCSData18,&_LMBCSData19,
442 +#if UCONFIG_NO_NON_HTML5_CONVERSION
448 -#if UCONFIG_NO_LEGACY_CONVERSION
450 +#if UCONFIG_NO_LEGACY_CONVERSION || UCONFIG_NO_NON_HTML5_CONVERSION
457 +#if UCONFIG_NO_NON_HTML5_CONVERSION
458 + NULL, NULL, &_UTF16Data, &_UTF32Data, NULL, NULL,
460 &_UTF7Data, &_Bocu1Data, &_UTF16Data, &_UTF32Data, &_CESU8Data, &_IMAPData,
463 -#if UCONFIG_NO_LEGACY_CONVERSION
464 +#if UCONFIG_NO_LEGACY_CONVERSION || UCONFIG_NO_NON_HTML5_CONVERSION
468 @@ -105,18 +118,24 @@
470 const UConverterType type;
471 } const cnvNameType[] = {
472 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
473 { "bocu1", UCNV_BOCU1 },
474 { "cesu8", UCNV_CESU8 },
475 -#if !UCONFIG_NO_LEGACY_CONVERSION
477 +#if !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION
480 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
481 { "imapmailboxname", UCNV_IMAP_MAILBOX },
483 +#if !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION
484 + { "iscii", UCNV_ISCII },
486 #if !UCONFIG_NO_LEGACY_CONVERSION
487 - { "iscii", UCNV_ISCII },
488 { "iso2022", UCNV_ISO_2022 },
490 { "iso88591", UCNV_LATIN_1 },
491 -#if !UCONFIG_NO_LEGACY_CONVERSION
492 +#if !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION
493 { "lmbcs1", UCNV_LMBCS_1 },
494 { "lmbcs11",UCNV_LMBCS_11 },
495 { "lmbcs16",UCNV_LMBCS_16 },
497 { "lmbcs6", UCNV_LMBCS_6 },
498 { "lmbcs8", UCNV_LMBCS_8 },
500 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
501 { "scsu", UCNV_SCSU },
503 { "usascii", UCNV_US_ASCII },
504 { "utf16", UCNV_UTF16 },
505 { "utf16be", UCNV_UTF16_BigEndian },
507 { "utf32oppositeendian", UCNV_UTF32_BigEndian },
508 { "utf32platformendian", UCNV_UTF32_LittleEndian },
510 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
511 { "utf7", UCNV_UTF7 },
513 { "utf8", UCNV_UTF8 },
514 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
515 { "x11compoundtext", UCNV_COMPOUND_TEXT}
520 Index: source/common/ucnv_u8.c
521 ===================================================================
522 --- source/common/ucnv_u8.c (revision 259715)
523 +++ source/common/ucnv_u8.c (working copy)
525 static const uint32_t
526 utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
528 +static UBool hasCESU8Data(const UConverter *cnv)
530 +#if UCONFIG_NO_NON_HTML5_CONVERSION
533 + return (UBool)(cnv->sharedData == &_CESU8Data);
537 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
541 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
542 const UChar *targetLimit = args->targetLimit;
543 unsigned char *toUBytes = cnv->toUBytes;
544 - UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
545 + UBool isCESU8 = hasCESU8Data(cnv);
546 uint32_t ch, ch2 = 0;
550 /* Restore size of current sequence */
551 if (cnv->toUnicodeStatus && myTarget < targetLimit)
554 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
555 const UChar *targetLimit = args->targetLimit;
556 unsigned char *toUBytes = cnv->toUBytes;
557 - UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
558 + UBool isCESU8 = hasCESU8Data(cnv);
559 uint32_t ch, ch2 = 0;
565 int32_t indexToWrite;
566 - UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
567 + UBool isNotCESU8 = !hasCESU8Data(cnv);
569 if (cnv->fromUChar32 && myTarget < targetLimit)
572 int32_t offsetNum, nextSourceIndex;
573 int32_t indexToWrite;
575 - UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
576 + UBool isNotCESU8 = !hasCESU8Data(cnv);
578 if (cnv->fromUChar32 && myTarget < targetLimit)
580 Index: source/common/unicode/urename.h
581 ===================================================================
582 --- source/common/unicode/urename.h (revision 259715)
583 +++ source/common/unicode/urename.h (working copy)
585 #define UDataMemory_setData U_ICU_ENTRY_POINT_RENAME(UDataMemory_setData)
586 #define UDatamemory_assign U_ICU_ENTRY_POINT_RENAME(UDatamemory_assign)
587 #define _ASCIIData U_ICU_ENTRY_POINT_RENAME(_ASCIIData)
588 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
589 #define _Bocu1Data U_ICU_ENTRY_POINT_RENAME(_Bocu1Data)
590 #define _CESU8Data U_ICU_ENTRY_POINT_RENAME(_CESU8Data)
591 #define _CompoundTextData U_ICU_ENTRY_POINT_RENAME(_CompoundTextData)
592 #define _HZData U_ICU_ENTRY_POINT_RENAME(_HZData)
593 #define _IMAPData U_ICU_ENTRY_POINT_RENAME(_IMAPData)
594 #define _ISCIIData U_ICU_ENTRY_POINT_RENAME(_ISCIIData)
596 #define _ISO2022Data U_ICU_ENTRY_POINT_RENAME(_ISO2022Data)
597 #define _LMBCSData1 U_ICU_ENTRY_POINT_RENAME(_LMBCSData1)
598 #define _LMBCSData11 U_ICU_ENTRY_POINT_RENAME(_LMBCSData11)
600 #define _LMBCSData8 U_ICU_ENTRY_POINT_RENAME(_LMBCSData8)
601 #define _Latin1Data U_ICU_ENTRY_POINT_RENAME(_Latin1Data)
602 #define _MBCSData U_ICU_ENTRY_POINT_RENAME(_MBCSData)
603 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
604 #define _SCSUData U_ICU_ENTRY_POINT_RENAME(_SCSUData)
606 #define _UTF16BEData U_ICU_ENTRY_POINT_RENAME(_UTF16BEData)
607 #define _UTF16Data U_ICU_ENTRY_POINT_RENAME(_UTF16Data)
608 #define _UTF16LEData U_ICU_ENTRY_POINT_RENAME(_UTF16LEData)
609 #define _UTF32BEData U_ICU_ENTRY_POINT_RENAME(_UTF32BEData)
610 #define _UTF32Data U_ICU_ENTRY_POINT_RENAME(_UTF32Data)
611 #define _UTF32LEData U_ICU_ENTRY_POINT_RENAME(_UTF32LEData)
612 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
613 #define _UTF7Data U_ICU_ENTRY_POINT_RENAME(_UTF7Data)
615 #define _UTF8Data U_ICU_ENTRY_POINT_RENAME(_UTF8Data)
616 #define cmemory_cleanup U_ICU_ENTRY_POINT_RENAME(cmemory_cleanup)
617 #define cmemory_inUse U_ICU_ENTRY_POINT_RENAME(cmemory_inUse)
618 Index: source/common/ucnv_cnv.h
619 ===================================================================
620 --- source/common/ucnv_cnv.h (revision 259715)
621 +++ source/common/ucnv_cnv.h (working copy)
622 @@ -256,11 +256,15 @@
623 extern const UConverterSharedData
624 _MBCSData, _Latin1Data,
625 _UTF8Data, _UTF16BEData, _UTF16LEData, _UTF32BEData, _UTF32LEData,
628 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
629 _LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6,
630 _LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19,
631 _HZData,_ISCIIData, _SCSUData, _ASCIIData,
632 _UTF7Data, _Bocu1Data, _UTF16Data, _UTF32Data, _CESU8Data, _IMAPData, _CompoundTextData;
634 + _ASCIIData, _UTF16Data, _UTF32Data;
639 Index: source/common/ucnv_lmb.c
640 ===================================================================
641 --- source/common/ucnv_lmb.c (revision 291619)
642 +++ source/common/ucnv_lmb.c (working copy)
645 #include "unicode/utypes.h"
647 -#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
648 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION
650 #include "unicode/ucnv_err.h"
651 #include "unicode/ucnv.h"
652 Index: source/common/ucnvhz.c
653 ===================================================================
654 --- source/common/ucnvhz.c (revision 291619)
655 +++ source/common/ucnvhz.c (working copy)
658 #include "unicode/utypes.h"
660 -#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
661 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION
664 #include "unicode/ucnv.h"
669 -#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
670 +#endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION */
671 Index: source/common/ucnv_ct.c
672 ===================================================================
673 --- source/common/ucnv_ct.c (revision 291619)
674 +++ source/common/ucnv_ct.c (working copy)
677 #include "unicode/utypes.h"
679 -#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
680 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_NON_HTML5_CONVERSION
682 #include "unicode/ucnv.h"
683 #include "unicode/uset.h"
684 Index: source/i18n/csrsbcs.h
685 ===================================================================
686 --- source/i18n/csrsbcs.h (revision 291619)
687 +++ source/i18n/csrsbcs.h (working copy)
692 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
693 class NGramParser_IBM420 : public NGramParser
698 NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap);
703 class CharsetRecog_sbcs : public CharsetRecognizer
705 virtual UBool match(InputText *det, CharsetMatch *results) const;
708 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
709 class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
714 virtual UBool match(InputText *det, CharsetMatch *results) const;
720 Index: source/i18n/csr2022.h
721 ===================================================================
722 --- source/i18n/csr2022.h (revision 291619)
723 +++ source/i18n/csr2022.h (working copy)
725 UBool match(InputText *textIn, CharsetMatch *results) const;
728 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
729 class CharsetRecog_2022KR :public CharsetRecog_2022 {
731 virtual ~CharsetRecog_2022KR();
734 UBool match(InputText *textIn, CharsetMatch *results) const;
740 Index: source/i18n/csr2022.cpp
741 ===================================================================
742 --- source/i18n/csr2022.cpp (revision 291619)
743 +++ source/i18n/csr2022.cpp (working copy)
745 {0x1b, 0x2e, 0x46, 0x00, 0x00} // ISO 8859-7
748 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
749 static const uint8_t escapeSequences_2022KR[][5] = {
750 {0x1b, 0x24, 0x29, 0x43, 0x00}
753 {0x1b, 0x4e, 0x00, 0x00, 0x00}, // SS2
754 {0x1b, 0x4f, 0x00, 0x00, 0x00}, // SS3
758 CharsetRecog_2022JP::~CharsetRecog_2022JP() {}
761 return (confidence > 0);
764 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
765 CharsetRecog_2022KR::~CharsetRecog_2022KR() {}
767 const char *CharsetRecog_2022KR::getName() const {
769 results->set(textIn, this, confidence);
770 return (confidence > 0);
774 CharsetRecog_2022::~CharsetRecog_2022() {
776 Index: source/i18n/csdetect.cpp
777 ===================================================================
778 --- source/i18n/csdetect.cpp (revision 291619)
779 +++ source/i18n/csdetect.cpp (working copy)
781 new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
783 new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
784 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
785 new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
786 new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
789 new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
790 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
791 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
794 int32_t rCount = ARRAY_SIZE(tempArray);
796 Index: source/i18n/csrsbcs.cpp
797 ===================================================================
798 --- source/i18n/csrsbcs.cpp (revision 291619)
799 +++ source/i18n/csrsbcs.cpp (working copy)
801 return (int32_t) (rawPercent * 300.0);
804 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
805 static const uint8_t unshapeMap_IBM420[] = {
806 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
807 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
814 CharsetRecog_sbcs::CharsetRecog_sbcs()
817 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
820 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
821 static const int32_t ngrams_IBM424_he_rtl[] = {
822 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
823 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
825 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
826 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
830 //ISO-8859-1,2,5,6,7,8,9 Ngrams
832 @@ -1155,6 +1159,7 @@
833 return (confidence > 0);
836 +#if !UCONFIG_NO_NON_HTML5_CONVERSION
837 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
840 @@ -1253,6 +1258,7 @@
841 results->set(textIn, this, confidence);
842 return (confidence > 0);