2 *******************************************************************************
4 * Copyright (C) 1997-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: loclikely.cpp
10 * tab size: 8 (not used)
13 * created on: 2010feb25
14 * created by: Markus W. Scherer
16 * Code for likely and minimized locale subtags, separated out from other .cpp files
17 * that then do not depend on resource bundle code and likely-subtags data.
20 #include "unicode/utypes.h"
21 #include "unicode/putil.h"
22 #include "unicode/uloc.h"
23 #include "unicode/ures.h"
30 * This function looks for the localeID in the likelySubtags resource.
32 * @param localeID The tag to find.
33 * @param buffer A buffer to hold the matching entry
34 * @param bufferLength The length of the output buffer
35 * @return A pointer to "buffer" if found, or a null pointer if not.
37 static const char* U_CALLCONV
38 findLikelySubtags(const char* localeID,
42 const char* result = NULL;
44 if (!U_FAILURE(*err)) {
46 const UChar* s = NULL;
47 UErrorCode tmpErr = U_ZERO_ERROR;
48 UResourceBundle* subtags = ures_openDirect(NULL, "likelySubtags", &tmpErr);
49 if (U_SUCCESS(tmpErr)) {
50 s = ures_getStringByKey(subtags, localeID, &resLen, &tmpErr);
52 if (U_FAILURE(tmpErr)) {
54 * If a resource is missing, it's not really an error, it's
55 * just that we don't have any data for that particular locale ID.
57 if (tmpErr != U_MISSING_RESOURCE_ERROR) {
61 else if (resLen >= bufferLength) {
62 /* The buffer should never overflow. */
63 *err = U_INTERNAL_PROGRAM_ERROR;
66 u_UCharsToChars(s, buffer, resLen + 1);
80 * Append a tag to a buffer, adding the separator if necessary. The buffer
81 * must be large enough to contain the resulting tag plus any separator
82 * necessary. The tag must not be a zero-length string.
84 * @param tag The tag to add.
85 * @param tagLength The length of the tag.
86 * @param buffer The output buffer.
87 * @param bufferLength The length of the output buffer. This is an input/output parameter.
89 static void U_CALLCONV
94 int32_t* bufferLength) {
96 if (*bufferLength > 0) {
97 buffer[*bufferLength] = '_';
102 &buffer[*bufferLength],
106 *bufferLength += tagLength;
110 * These are the canonical strings for unknown languages, scripts and regions.
112 static const char* const unknownLanguage = "und";
113 static const char* const unknownScript = "Zzzz";
114 static const char* const unknownRegion = "ZZ";
117 * Create a tag string from the supplied parameters. The lang, script and region
118 * parameters may be NULL pointers. If they are, their corresponding length parameters
119 * must be less than or equal to 0.
121 * If any of the language, script or region parameters are empty, and the alternateTags
122 * parameter is not NULL, it will be parsed for potential language, script and region tags
123 * to be used when constructing the new tag. If the alternateTags parameter is NULL, or
124 * it contains no language tag, the default tag for the unknown language is used.
126 * If the length of the new string exceeds the capacity of the output buffer,
127 * the function copies as many bytes to the output buffer as it can, and returns
128 * the error U_BUFFER_OVERFLOW_ERROR.
130 * If an illegal argument is provided, the function returns the error
131 * U_ILLEGAL_ARGUMENT_ERROR.
133 * Note that this function can return the warning U_STRING_NOT_TERMINATED_WARNING if
134 * the tag string fits in the output buffer, but the null terminator doesn't.
136 * @param lang The language tag to use.
137 * @param langLength The length of the language tag.
138 * @param script The script tag to use.
139 * @param scriptLength The length of the script tag.
140 * @param region The region tag to use.
141 * @param regionLength The length of the region tag.
142 * @param trailing Any trailing data to append to the new tag.
143 * @param trailingLength The length of the trailing data.
144 * @param alternateTags A string containing any alternate tags.
145 * @param tag The output buffer.
146 * @param tagCapacity The capacity of the output buffer.
147 * @param err A pointer to a UErrorCode for error reporting.
148 * @return The length of the tag string, which may be greater than tagCapacity, or -1 on error.
150 static int32_t U_CALLCONV
151 createTagStringWithAlternates(
155 int32_t scriptLength,
157 int32_t regionLength,
158 const char* trailing,
159 int32_t trailingLength,
160 const char* alternateTags,
165 if (U_FAILURE(*err)) {
168 else if (tag == NULL ||
170 langLength >= ULOC_LANG_CAPACITY ||
171 scriptLength >= ULOC_SCRIPT_CAPACITY ||
172 regionLength >= ULOC_COUNTRY_CAPACITY) {
177 * ULOC_FULLNAME_CAPACITY will provide enough capacity
178 * that we can build a string that contains the language,
179 * script and region code without worrying about overrunning
180 * the user-supplied buffer.
182 char tagBuffer[ULOC_FULLNAME_CAPACITY];
183 int32_t tagLength = 0;
184 int32_t capacityRemaining = tagCapacity;
185 UBool regionAppended = FALSE;
187 if (langLength > 0) {
194 else if (alternateTags == NULL) {
196 * Append the value for an unknown language, if
197 * we found no language.
201 (int32_t)uprv_strlen(unknownLanguage),
207 * Parse the alternateTags string for the language.
209 char alternateLang[ULOC_LANG_CAPACITY];
210 int32_t alternateLangLength = sizeof(alternateLang);
212 alternateLangLength =
218 if(U_FAILURE(*err) ||
219 alternateLangLength >= ULOC_LANG_CAPACITY) {
222 else if (alternateLangLength == 0) {
224 * Append the value for an unknown language, if
225 * we found no language.
229 (int32_t)uprv_strlen(unknownLanguage),
242 if (scriptLength > 0) {
249 else if (alternateTags != NULL) {
251 * Parse the alternateTags string for the script.
253 char alternateScript[ULOC_SCRIPT_CAPACITY];
255 const int32_t alternateScriptLength =
259 sizeof(alternateScript),
262 if (U_FAILURE(*err) ||
263 alternateScriptLength >= ULOC_SCRIPT_CAPACITY) {
266 else if (alternateScriptLength > 0) {
269 alternateScriptLength,
275 if (regionLength > 0) {
282 regionAppended = TRUE;
284 else if (alternateTags != NULL) {
286 * Parse the alternateTags string for the region.
288 char alternateRegion[ULOC_COUNTRY_CAPACITY];
290 const int32_t alternateRegionLength =
294 sizeof(alternateRegion),
296 if (U_FAILURE(*err) ||
297 alternateRegionLength >= ULOC_COUNTRY_CAPACITY) {
300 else if (alternateRegionLength > 0) {
303 alternateRegionLength,
307 regionAppended = TRUE;
312 const int32_t toCopy =
313 tagLength >= tagCapacity ? tagCapacity : tagLength;
316 * Copy the partial tag from our internal buffer to the supplied
324 capacityRemaining -= toCopy;
327 if (trailingLength > 0) {
328 if (capacityRemaining > 0 && !regionAppended) {
329 tag[tagLength++] = '_';
333 if (capacityRemaining > 0) {
335 * Copy the trailing data into the supplied buffer. Use uprv_memmove, since we
336 * don't know if the user-supplied buffers overlap.
338 const int32_t toCopy =
339 trailingLength >= capacityRemaining ? capacityRemaining : trailingLength;
348 tagLength += trailingLength;
350 return u_terminateChars(
360 * An overflow indicates the locale ID passed in
361 * is ill-formed. If we got here, and there was
362 * no previous error, it's an implicit overflow.
364 if (*err == U_BUFFER_OVERFLOW_ERROR ||
366 *err = U_ILLEGAL_ARGUMENT_ERROR;
373 * Create a tag string from the supplied parameters. The lang, script and region
374 * parameters may be NULL pointers. If they are, their corresponding length parameters
375 * must be less than or equal to 0. If the lang parameter is an empty string, the
376 * default value for an unknown language is written to the output buffer.
378 * If the length of the new string exceeds the capacity of the output buffer,
379 * the function copies as many bytes to the output buffer as it can, and returns
380 * the error U_BUFFER_OVERFLOW_ERROR.
382 * If an illegal argument is provided, the function returns the error
383 * U_ILLEGAL_ARGUMENT_ERROR.
385 * @param lang The language tag to use.
386 * @param langLength The length of the language tag.
387 * @param script The script tag to use.
388 * @param scriptLength The length of the script tag.
389 * @param region The region tag to use.
390 * @param regionLength The length of the region tag.
391 * @param trailing Any trailing data to append to the new tag.
392 * @param trailingLength The length of the trailing data.
393 * @param tag The output buffer.
394 * @param tagCapacity The capacity of the output buffer.
395 * @param err A pointer to a UErrorCode for error reporting.
396 * @return The length of the tag string, which may be greater than tagCapacity.
398 static int32_t U_CALLCONV
403 int32_t scriptLength,
405 int32_t regionLength,
406 const char* trailing,
407 int32_t trailingLength,
412 return createTagStringWithAlternates(
428 * Parse the language, script, and region subtags from a tag string, and copy the
429 * results into the corresponding output parameters. The buffers are null-terminated,
430 * unless overflow occurs.
432 * The langLength, scriptLength, and regionLength parameters are input/output
433 * parameters, and must contain the capacity of their corresponding buffers on
434 * input. On output, they will contain the actual length of the buffers, not
435 * including the null terminator.
437 * If the length of any of the output subtags exceeds the capacity of the corresponding
438 * buffer, the function copies as many bytes to the output buffer as it can, and returns
439 * the error U_BUFFER_OVERFLOW_ERROR. It will not parse any more subtags once overflow
442 * If an illegal argument is provided, the function returns the error
443 * U_ILLEGAL_ARGUMENT_ERROR.
445 * @param localeID The locale ID to parse.
446 * @param lang The language tag buffer.
447 * @param langLength The length of the language tag.
448 * @param script The script tag buffer.
449 * @param scriptLength The length of the script tag.
450 * @param region The region tag buffer.
451 * @param regionLength The length of the region tag.
452 * @param err A pointer to a UErrorCode for error reporting.
453 * @return The number of chars of the localeID parameter consumed.
455 static int32_t U_CALLCONV
457 const char* localeID,
461 int32_t* scriptLength,
463 int32_t* regionLength,
466 const char* position = localeID;
467 int32_t subtagLength = 0;
469 if(U_FAILURE(*err) ||
472 langLength == NULL ||
474 scriptLength == NULL ||
476 regionLength == NULL) {
480 subtagLength = ulocimp_getLanguage(position, lang, *langLength, &position);
481 u_terminateChars(lang, *langLength, subtagLength, err);
484 * Note that we explicitly consider U_STRING_NOT_TERMINATED_WARNING
485 * to be an error, because it indicates the user-supplied tag is
488 if(U_FAILURE(*err)) {
492 *langLength = subtagLength;
495 * If no language was present, use the value of unknownLanguage
496 * instead. Otherwise, move past any separator.
498 if (*langLength == 0) {
502 *langLength = (int32_t)uprv_strlen(lang);
504 else if (_isIDSeparator(*position)) {
508 subtagLength = ulocimp_getScript(position, script, *scriptLength, &position);
509 u_terminateChars(script, *scriptLength, subtagLength, err);
511 if(U_FAILURE(*err)) {
515 *scriptLength = subtagLength;
517 if (*scriptLength > 0) {
518 if (uprv_strnicmp(script, unknownScript, *scriptLength) == 0) {
520 * If the script part is the "unknown" script, then don't return it.
526 * Move past any separator.
528 if (_isIDSeparator(*position)) {
533 subtagLength = ulocimp_getCountry(position, region, *regionLength, &position);
534 u_terminateChars(region, *regionLength, subtagLength, err);
536 if(U_FAILURE(*err)) {
540 *regionLength = subtagLength;
542 if (*regionLength > 0) {
543 if (uprv_strnicmp(region, unknownRegion, *regionLength) == 0) {
545 * If the region part is the "unknown" region, then don't return it.
553 return (int32_t)(position - localeID);
558 * If we get here, we have no explicit error, it's the result of an
561 if (!U_FAILURE(*err)) {
562 *err = U_ILLEGAL_ARGUMENT_ERROR;
/*
 * Build the "likely subtags" form of a tag from its language, script and
 * region parts, trying progressively less specific combinations:
 *
 *   1. language + script + region (only when scriptLength > 0 and
 *      regionLength > 0)
 *   2. language + script          (only when scriptLength > 0)
 *   3. language + region          (only when regionLength > 0)
 *   4. language only
 *
 * In each step, when the likelySubtags pointer for that step is non-NULL,
 * the function returns the value of createTagStringWithAlternates(); per
 * the inline comments, the language from the maximal string is preferred
 * because it may be more specific than the one supplied by the caller.
 * If no step returns, the function ends by returning the result of
 * u_terminateChars().
 *
 * The function does nothing useful when *err already indicates a failure
 * on entry, and *err is re-checked after each intermediate step.
 *
 * NOTE(review): several lines of this function are not visible in this
 * excerpt (part of the parameter list, the arguments of every call, and
 * the statement that assigns likelySubtags -- presumably a call to
 * findLikelySubtags()). Confirm those details against the complete file.
 * The trailing "if (!U_FAILURE(*err))" block appears to be a shared error
 * exit that reports U_ILLEGAL_ARGUMENT_ERROR when no more specific error
 * was recorded -- TODO confirm.
 */
568 static int32_t U_CALLCONV
569 createLikelySubtagsString(
573 int32_t scriptLength,
575 int32_t regionLength,
576 const char* variants,
577 int32_t variantsLength,
583 * ULOC_FULLNAME_CAPACITY will provide enough capacity
584 * that we can build a string that contains the language,
585 * script and region code without worrying about overrunning
586 * the user-supplied buffer.
588 char tagBuffer[ULOC_FULLNAME_CAPACITY];
589 char likelySubtagsBuffer[ULOC_FULLNAME_CAPACITY];
590 int32_t tagBufferLength = 0;
592 if(U_FAILURE(*err)) {
597 * Try the language with the script and region first.
599 if (scriptLength > 0 && regionLength > 0) {
601 const char* likelySubtags = NULL;
603 tagBufferLength = createTagString(
615 if(U_FAILURE(*err)) {
623 sizeof(likelySubtagsBuffer),
625 if(U_FAILURE(*err)) {
629 if (likelySubtags != NULL) {
630 /* Always use the language tag from the
631 maximal string, since it may be more
632 specific than the one provided. */
633 return createTagStringWithAlternates(
650 * Try the language with just the script.
652 if (scriptLength > 0) {
654 const char* likelySubtags = NULL;
656 tagBufferLength = createTagString(
668 if(U_FAILURE(*err)) {
676 sizeof(likelySubtagsBuffer),
678 if(U_FAILURE(*err)) {
682 if (likelySubtags != NULL) {
683 /* Always use the language tag from the
684 maximal string, since it may be more
685 specific than the one provided. */
686 return createTagStringWithAlternates(
703 * Try the language with just the region.
705 if (regionLength > 0) {
707 const char* likelySubtags = NULL;
721 if(U_FAILURE(*err)) {
729 sizeof(likelySubtagsBuffer),
731 if(U_FAILURE(*err)) {
735 if (likelySubtags != NULL) {
736 /* Always use the language tag from the
737 maximal string, since it may be more
738 specific than the one provided. */
739 return createTagStringWithAlternates(
756 * Finally, try just the language.
759 const char* likelySubtags = NULL;
773 if(U_FAILURE(*err)) {
781 sizeof(likelySubtagsBuffer),
783 if(U_FAILURE(*err)) {
787 if (likelySubtags != NULL) {
788 /* Always use the language tag from the
789 maximal string, since it may be more
790 specific than the one provided. */
791 return createTagStringWithAlternates(
807 return u_terminateChars(
815 if (!U_FAILURE(*err)) {
816 *err = U_ILLEGAL_ARGUMENT_ERROR;
/*
 * Statement macro that walks the first trailingLength characters of
 * `trailing` and keeps a local counter `count` while doing so. Each
 * character is classified as a subtag separator ('-' or '_'), as '@',
 * or as anything else, in which case `count` is compared against 8.
 *
 * NOTE(review): the bodies of the three branches, the declaration of `i`
 * and the end of the macro are not visible in this excerpt. Presumably
 * `count` tracks the length of the current variant subtag and a value
 * above 8 is rejected as an ill-formed locale ID -- confirm against the
 * complete file. Both arguments are expanded more than once, so callers
 * should pass side-effect-free expressions.
 */
822 #define CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength) \
823 { int32_t count = 0; \
825 for (i = 0; i < trailingLength; i++) { \
826 if (trailing[i] == '-' || trailing[i] == '_') { \
831 } else if (trailing[i] == '@') { \
833 } else if (count > 8) { \
842 _uloc_addLikelySubtags(const char* localeID,
843 char* maximizedLocaleID,
844 int32_t maximizedLocaleIDCapacity,
847 char lang[ULOC_LANG_CAPACITY];
848 int32_t langLength = sizeof(lang);
849 char script[ULOC_SCRIPT_CAPACITY];
850 int32_t scriptLength = sizeof(script);
851 char region[ULOC_COUNTRY_CAPACITY];
852 int32_t regionLength = sizeof(region);
853 const char* trailing = "";
854 int32_t trailingLength = 0;
855 int32_t trailingIndex = 0;
856 int32_t resultLength = 0;
858 if(U_FAILURE(*err)) {
861 else if (localeID == NULL ||
862 maximizedLocaleID == NULL ||
863 maximizedLocaleIDCapacity <= 0) {
867 trailingIndex = parseTagString(
876 if(U_FAILURE(*err)) {
877 /* Overflow indicates an illegal argument error */
878 if (*err == U_BUFFER_OVERFLOW_ERROR) {
879 *err = U_ILLEGAL_ARGUMENT_ERROR;
885 /* Find the length of the trailing portion. */
886 trailing = &localeID[trailingIndex];
887 trailingLength = (int32_t)uprv_strlen(trailing);
889 CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
892 createLikelySubtagsString(
902 maximizedLocaleIDCapacity,
905 if (resultLength == 0) {
906 const int32_t localIDLength = (int32_t)uprv_strlen(localeID);
909 * If we get here, we need to return localeID.
914 localIDLength <= maximizedLocaleIDCapacity ?
915 localIDLength : maximizedLocaleIDCapacity);
920 maximizedLocaleIDCapacity,
929 if (!U_FAILURE(*err)) {
930 *err = U_ILLEGAL_ARGUMENT_ERROR;
937 _uloc_minimizeSubtags(const char* localeID,
938 char* minimizedLocaleID,
939 int32_t minimizedLocaleIDCapacity,
943 * ULOC_FULLNAME_CAPACITY will provide enough capacity
944 * that we can build a string that contains the language,
945 * script and region code without worrying about overrunning
946 * the user-supplied buffer.
948 char maximizedTagBuffer[ULOC_FULLNAME_CAPACITY];
949 int32_t maximizedTagBufferLength = sizeof(maximizedTagBuffer);
951 char lang[ULOC_LANG_CAPACITY];
952 int32_t langLength = sizeof(lang);
953 char script[ULOC_SCRIPT_CAPACITY];
954 int32_t scriptLength = sizeof(script);
955 char region[ULOC_COUNTRY_CAPACITY];
956 int32_t regionLength = sizeof(region);
957 const char* trailing = "";
958 int32_t trailingLength = 0;
959 int32_t trailingIndex = 0;
961 if(U_FAILURE(*err)) {
964 else if (localeID == NULL ||
965 minimizedLocaleID == NULL ||
966 minimizedLocaleIDCapacity <= 0) {
980 if(U_FAILURE(*err)) {
982 /* Overflow indicates an illegal argument error */
983 if (*err == U_BUFFER_OVERFLOW_ERROR) {
984 *err = U_ILLEGAL_ARGUMENT_ERROR;
990 /* Find the spot where the variants begin, if any. */
991 trailing = &localeID[trailingIndex];
992 trailingLength = (int32_t)uprv_strlen(trailing);
994 CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
1006 maximizedTagBufferLength,
1008 if(U_FAILURE(*err)) {
1013 * First, we need to get the maximization
1014 * from AddLikelySubtags.
1016 maximizedTagBufferLength =
1017 uloc_addLikelySubtags(
1020 maximizedTagBufferLength,
1023 if(U_FAILURE(*err)) {
1028 * Start first with just the language.
1031 char tagBuffer[ULOC_FULLNAME_CAPACITY];
1033 const int32_t tagBufferLength =
1034 createLikelySubtagsString(
1047 if(U_FAILURE(*err)) {
1050 else if (uprv_strnicmp(
1053 tagBufferLength) == 0) {
1055 return createTagString(
1065 minimizedLocaleIDCapacity,
1071 * Next, try the language and region.
1073 if (regionLength > 0) {
1075 char tagBuffer[ULOC_FULLNAME_CAPACITY];
1077 const int32_t tagBufferLength =
1078 createLikelySubtagsString(
1091 if(U_FAILURE(*err)) {
1094 else if (uprv_strnicmp(
1097 tagBufferLength) == 0) {
1099 return createTagString(
1109 minimizedLocaleIDCapacity,
1115 * Finally, try the language and script. This is our last chance,
1116 * since trying with all three subtags would only yield the
1117 * maximal version that we already have.
1119 if (scriptLength > 0 && regionLength > 0) {
1120 char tagBuffer[ULOC_FULLNAME_CAPACITY];
1122 const int32_t tagBufferLength =
1123 createLikelySubtagsString(
1136 if(U_FAILURE(*err)) {
1139 else if (uprv_strnicmp(
1142 tagBufferLength) == 0) {
1144 return createTagString(
1154 minimizedLocaleIDCapacity,
1161 * If we got here, return the locale ID parameter.
1163 const int32_t localeIDLength = (int32_t)uprv_strlen(localeID);
1168 localeIDLength <= minimizedLocaleIDCapacity ?
1169 localeIDLength : minimizedLocaleIDCapacity);
1171 return u_terminateChars(
1173 minimizedLocaleIDCapacity,
1180 if (!U_FAILURE(*err)) {
1181 *err = U_ILLEGAL_ARGUMENT_ERROR;
1190 do_canonicalize(const char* localeID,
1192 int32_t bufferCapacity,
1201 if (*err == U_STRING_NOT_TERMINATED_WARNING ||
1202 *err == U_BUFFER_OVERFLOW_ERROR) {
1203 *err = U_ILLEGAL_ARGUMENT_ERROR;
1207 else if (U_FAILURE(*err)) {
1216 U_DRAFT int32_t U_EXPORT2
1217 uloc_addLikelySubtags(const char* localeID,
1218 char* maximizedLocaleID,
1219 int32_t maximizedLocaleIDCapacity,
1222 char localeBuffer[ULOC_FULLNAME_CAPACITY];
1224 if (!do_canonicalize(
1227 sizeof(localeBuffer),
1232 return _uloc_addLikelySubtags(
1235 maximizedLocaleIDCapacity,
1240 U_DRAFT int32_t U_EXPORT2
1241 uloc_minimizeSubtags(const char* localeID,
1242 char* minimizedLocaleID,
1243 int32_t minimizedLocaleIDCapacity,
1246 char localeBuffer[ULOC_FULLNAME_CAPACITY];
1248 if (!do_canonicalize(
1251 sizeof(localeBuffer),
1256 return _uloc_minimizeSubtags(
1259 minimizedLocaleIDCapacity,