1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *****************************************************************************
6 * Copyright (C) 1998-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *****************************************************************************
12 * Implements error behaviour functions called by T_UConverter_{from,to}Unicode
17 * 06/29/2000 helena Major rewrite of the callback APIs.
20 #include "unicode/utypes.h"
22 #if !UCONFIG_NO_CONVERSION
24 #include "unicode/ucnv_err.h"
25 #include "unicode/ucnv_cb.h"
28 #include "unicode/ucnv.h"
31 #define VALUE_STRING_LENGTH 48
32 /* NOTE: the arithmetic originally given here, 32 = 4 (number of chars in value string) * 8 (max number of bytes per char for any converter), no longer matches VALUE_STRING_LENGTH, which is 48. */
33 #define UNICODE_PERCENT_SIGN_CODEPOINT 0x0025
34 #define UNICODE_U_CODEPOINT 0x0055
35 #define UNICODE_X_CODEPOINT 0x0058
36 #define UNICODE_RS_CODEPOINT 0x005C
37 #define UNICODE_U_LOW_CODEPOINT 0x0075
38 #define UNICODE_X_LOW_CODEPOINT 0x0078
39 #define UNICODE_AMP_CODEPOINT 0x0026
40 #define UNICODE_HASH_CODEPOINT 0x0023
41 #define UNICODE_SEMICOLON_CODEPOINT 0x003B
42 #define UNICODE_PLUS_CODEPOINT 0x002B
43 #define UNICODE_LEFT_CURLY_CODEPOINT 0x007B
44 #define UNICODE_RIGHT_CURLY_CODEPOINT 0x007D
45 #define UNICODE_SPACE_CODEPOINT 0x0020
46 #define UCNV_PRV_ESCAPE_ICU 0
47 #define UCNV_PRV_ESCAPE_C 'C'
48 #define UCNV_PRV_ESCAPE_XML_DEC 'D'
49 #define UCNV_PRV_ESCAPE_XML_HEX 'X'
50 #define UCNV_PRV_ESCAPE_JAVA 'J'
51 #define UCNV_PRV_ESCAPE_UNICODE 'U'
52 #define UCNV_PRV_ESCAPE_CSS2 'S'
53 #define UCNV_PRV_STOP_ON_ILLEGAL 'i'
56 * IS_DEFAULT_IGNORABLE_CODE_POINT
57 * This checks whether a code point has the Unicode default-ignorable property.
58 * As such, this list needs to be updated if the ignorable code point list ever
60 * To avoid dependency on other code, this list is hard coded here.
61 * When an ignorable code point is found and is unmappable, the default callbacks
63 * For a list of the default ignorable code points, use this link: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g=
65 * This list should be kept in sync with the one in CharsetCallback.java
67 #define IS_DEFAULT_IGNORABLE_CODE_POINT(c) (\
73 (0x17B4 <= c && c <= 0x17B5) || \
74 (0x180B <= c && c <= 0x180E) || \
75 (0x200B <= c && c <= 0x200F) || \
76 (0x202A <= c && c <= 0x202E) || \
78 (0x2066 <= c && c <= 0x2069) || \
79 (0x2061 <= c && c <= 0x2064) || \
80 (0x206A <= c && c <= 0x206F) || \
82 (0x0FE00 <= c && c <= 0x0FE0F) || \
85 (0x01BCA0 <= c && c <= 0x01BCA3) || \
86 (0x01D173 <= c && c <= 0x01D17A) || \
88 (0x0E0020 <= c && c <= 0x0E007F) || \
89 (0x0E0100 <= c && c <= 0x0E01EF) || \
91 (0x0FFF0 <= c && c <= 0x0FFF8) || \
93 (0x0E0002 <= c && c <= 0x0E001F) || \
94 (0x0E0080 <= c && c <= 0x0E00FF) || \
95 (0x0E01F0 <= c && c <= 0x0E0FFF) \
99 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
100 U_CAPI void U_EXPORT2
101 UCNV_FROM_U_CALLBACK_STOP (
103 UConverterFromUnicodeArgs *fromUArgs,
104 const UChar* codeUnits,
107 UConverterCallbackReason reason,
110 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
113 * Skip if the code point has the Unicode default-ignorable property.
117 /* the caller must have set the error code accordingly */
122 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
123 U_CAPI void U_EXPORT2
124 UCNV_TO_U_CALLBACK_STOP (
126 UConverterToUnicodeArgs *toUArgs,
127 const char* codePoints,
129 UConverterCallbackReason reason,
132 /* the caller must have set the error code accordingly */
136 U_CAPI void U_EXPORT2
137 UCNV_FROM_U_CALLBACK_SKIP (
139 UConverterFromUnicodeArgs *fromUArgs,
140 const UChar* codeUnits,
143 UConverterCallbackReason reason,
146 if (reason <= UCNV_IRREGULAR)
148 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
151 * Skip if the code point has the Unicode default-ignorable property.
155 else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
159 /* else the caller must have set the error code accordingly. */
161 /* else ignore the reset, close and clone calls. */
164 U_CAPI void U_EXPORT2
165 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
167 UConverterFromUnicodeArgs *fromArgs,
168 const UChar* codeUnits,
171 UConverterCallbackReason reason,
174 if (reason <= UCNV_IRREGULAR)
176 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
179 * Skip if the code point has the Unicode default-ignorable property.
183 else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
186 ucnv_cbFromUWriteSub(fromArgs, 0, err);
188 /* else the caller must have set the error code accordingly. */
190 /* else ignore the reset, close and clone calls. */
193 /*uses uprv_itou to build a Unicode escape sequence for the offending sequence,
194 *uses a clean (reset) copy of the converter to convert that Unicode
195 *escape sequence to the target codepage (if the conversion fails then
196 *we fall back to substituting with the subchar)
198 U_CAPI void U_EXPORT2
199 UCNV_FROM_U_CALLBACK_ESCAPE (
201 UConverterFromUnicodeArgs *fromArgs,
202 const UChar *codeUnits,
205 UConverterCallbackReason reason,
209 UChar valueString[VALUE_STRING_LENGTH];
210 int32_t valueStringLength = 0;
213 const UChar *myValueSource = NULL;
214 UErrorCode err2 = U_ZERO_ERROR;
215 UConverterFromUCallback original = NULL;
216 const void *originalContext;
218 UConverterFromUCallback ignoredCallback = NULL;
219 const void *ignoredContext;
221 if (reason > UCNV_IRREGULAR)
225 else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
228 * Skip if the code point has the Unicode default-ignorable property.
234 ucnv_setFromUCallBack (fromArgs->converter,
235 (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
241 if (U_FAILURE (err2))
250 valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
251 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
252 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
257 switch(*((char*)context))
259 case UCNV_PRV_ESCAPE_JAVA:
262 valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
263 valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
264 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
268 case UCNV_PRV_ESCAPE_C:
269 valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
272 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
273 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
277 valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
278 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
282 case UCNV_PRV_ESCAPE_XML_DEC:
284 valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
285 valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
287 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
290 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
292 valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
295 case UCNV_PRV_ESCAPE_XML_HEX:
297 valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
298 valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
299 valueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
301 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
304 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
306 valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
309 case UCNV_PRV_ESCAPE_UNICODE:
310 valueString[valueStringLength++] = (UChar) UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */
311 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
312 valueString[valueStringLength++] = (UChar) UNICODE_PLUS_CODEPOINT; /* adding + */
314 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
316 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
318 valueString[valueStringLength++] = (UChar) UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */
321 case UCNV_PRV_ESCAPE_CSS2:
322 valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
323 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
324 /* Always add space character, because the next character might be whitespace,
325 which would erroneously be considered the termination of the escape sequence. */
326 valueString[valueStringLength++] = (UChar) UNICODE_SPACE_CODEPOINT;
332 valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
333 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
334 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
338 myValueSource = valueString;
340 /* reset the error */
343 ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
345 ucnv_setFromUCallBack (fromArgs->converter,
351 if (U_FAILURE (err2))
362 U_CAPI void U_EXPORT2
363 UCNV_TO_U_CALLBACK_SKIP (
365 UConverterToUnicodeArgs *toArgs,
366 const char* codeUnits,
368 UConverterCallbackReason reason,
371 if (reason <= UCNV_IRREGULAR)
373 if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
377 /* else the caller must have set the error code accordingly. */
379 /* else ignore the reset, close and clone calls. */
382 U_CAPI void U_EXPORT2
383 UCNV_TO_U_CALLBACK_SUBSTITUTE (
385 UConverterToUnicodeArgs *toArgs,
386 const char* codeUnits,
388 UConverterCallbackReason reason,
391 if (reason <= UCNV_IRREGULAR)
393 if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
396 ucnv_cbToUWriteSub(toArgs,0,err);
398 /* else the caller must have set the error code accordingly. */
400 /* else ignore the reset, close and clone calls. */
403 /*uses uprv_itou to build a Unicode escape sequence for the offending sequence,
404 *and uses that as the substitution sequence
406 U_CAPI void U_EXPORT2
407 UCNV_TO_U_CALLBACK_ESCAPE (
409 UConverterToUnicodeArgs *toArgs,
410 const char* codeUnits,
412 UConverterCallbackReason reason,
415 UChar uniValueString[VALUE_STRING_LENGTH];
416 int32_t valueStringLength = 0;
419 if (reason > UCNV_IRREGULAR)
428 uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
429 uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT; /* adding X */
430 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
435 switch(*((char*)context))
437 case UCNV_PRV_ESCAPE_XML_DEC:
440 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
441 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
442 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
443 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
447 case UCNV_PRV_ESCAPE_XML_HEX:
450 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
451 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
452 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
453 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
454 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
457 case UCNV_PRV_ESCAPE_C:
460 uniValueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
461 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
462 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
468 uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
469 uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT; /* adding X */
470 uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
471 valueStringLength += 2;
475 /* reset the error */
478 ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);