src/third_party/icu/source/common/uloc.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 1997-2010, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *
   7 * File ULOC.CPP
   8 *
   9 * Modification History:
  10 *
  11 *   Date        Name        Description
  12 *   04/01/97    aliu        Creation.
  13 *   08/21/98    stephen     JDK 1.2 sync
  14 *   12/08/98    rtg         New Locale implementation and C API
  15 *   03/15/99    damiba      overhaul.
  16 *   04/06/99    stephen     changed setDefault() to realloc and copy
  17 *   06/14/99    stephen     Changed calls to ures_open for new params
  18 *   07/21/99    stephen     Modified setDefault() to propagate to C++
  19 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
  20 *                           brought canonicalization code into line with spec
  21 *****************************************************************************/
  22
  23 /*
  24    POSIX's locale format, from putil.c: [no spaces]
  25
  26      ll [ _CC ] [ . MM ] [ @ VV]
  27
  28      l = lang, C = ctry, M = charmap, V = variant
  29 */
  30
  31 #include "unicode/utypes.h"
  32 #include "unicode/ustring.h"
  33 #include "unicode/uloc.h"
  34
  35 #include "putilimp.h"
  36 #include "ustr_imp.h"
  37 #include "ulocimp.h"
  38 #include "umutex.h"
  39 #include "cstring.h"
  40 #include "cmemory.h"
  41 #include "ucln_cmn.h"
  42 #include "locmap.h"
  43 #include "uarrsort.h"
  44 #include "uenumimp.h"
  45 #include "uassert.h"
  46
  47 #include <stdio.h> /* for sprintf */
  48
  49 /* ### Declarations **************************************************/
  50
  51 /* Locale stuff from locid.cpp */
  52 U_CFUNC void locale_set_default(const char *id);
  53 U_CFUNC const char *locale_get_default(void);
  54 U_CFUNC int32_t
  55 locale_getKeywords(const char *localeID,
  56             char prev,
  57             char *keywords, int32_t keywordCapacity,
  58             char *values, int32_t valuesCapacity, int32_t *valLen,
  59             UBool valuesToo,
  60             UErrorCode *status);
  61
  62 /* ### Data tables **************************************************/
  63
  64 /**
  65  * Table of language codes, both 2- and 3-letter, with preference
  66  * given to 2-letter codes where possible.  Includes 3-letter codes
  67  * that lack a 2-letter equivalent.
  68  *
  69  * This list must be in sorted order.  This list is returned directly
  70  * to the user by some API.
  71  *
  72  * This list must be kept in sync with LANGUAGES_3, with corresponding
  73  * entries matched.
  74  *
  75  * This table should be terminated with a NULL entry, followed by a
  76  * second list, and another NULL entry.  The first list is visible to
  77  * user code when this array is returned by API.  The second list
  78  * contains codes we support, but do not expose through user API.
  79  *
  80  * Notes
  81  *
  82  * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
  83  * include the revisions up to 2001/7/27 *CWB*
  84  *
  85  * The 3 character codes are the terminology codes like RFC 3066.  This
  86  * is compatible with prior ICU codes
  87  *
  88  * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
  89  * table but now at the end of the table because 3 character codes are
  90  * duplicates.  This avoids bad searches going from 3 to 2 character
  91  * codes.
  92  *
  93  * The range qaa-qtz is reserved for local use
  94  */
  95 static const char * const LANGUAGES[] = {
  96     "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",  "afa",
  97     "afh", "ain", "ak",  "akk", "ale", "alg", "alt", "am",  "an",
  98     "ang", "anp", "apa",
  99     "ar",  "arc", "arn", "arp", "art", "arw", "as",  "ast",
 100     "ath", "aus", "av",  "awa", "ay",  "az",  "ba",  "bad",
 101     "bai", "bal", "ban", "bas", "bat", "be",  "bej",
 102     "bem", "ber", "bg",  "bh",  "bho", "bi",  "bik", "bin",
 103     "bla", "bm",  "bn",  "bnt", "bo",  "br",  "bra", "bs",
 104     "btk", "bua", "bug", "byn", "ca",  "cad", "cai", "car", "cau",
 105     "cch", "ce",  "ceb", "cel", "ch",  "chb", "chg", "chk", "chm",
 106     "chn", "cho", "chp", "chr", "chy", "cmc", "co",  "cop",
 107     "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",  "csb", "cu",  "cus",
 108     "cv",  "cy",  "da",  "dak", "dar", "day", "de",  "del", "den",
 109     "dgr", "din", "doi", "dra", "dsb", "dua", "dum", "dv",  "dyu",
 110     "dz",  "ee",  "efi", "egy", "eka", "el",  "elx", "en",
 111     "enm", "eo",  "es",  "et",  "eu",  "ewo", "fa",
 112     "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",  "fo",  "fon",
 113     "fr",  "frm", "fro", "frr", "frs", "fur", "fy",
 114     "ga",  "gaa", "gay", "gba", "gd",  "gem", "gez", "gil",
 115     "gl",  "gmh", "gn",  "goh", "gon", "gor", "got", "grb",
 116     "grc", "gsw", "gu",  "gv", "gwi",
 117     "ha",  "hai", "haw", "he",  "hi",  "hil", "him",
 118     "hit", "hmn", "ho",  "hr",  "hsb", "ht",  "hu",  "hup", "hy",  "hz",
 119     "ia",  "iba", "id",  "ie",  "ig",  "ii",  "ijo", "ik",
 120     "ilo", "inc", "ine", "inh", "io",  "ira", "iro", "is",  "it",
 121     "iu",  "ja",  "jbo", "jpr", "jrb", "jv",  "ka",  "kaa", "kab",
 122     "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kfo", "kg",  "kha", "khi",
 123     "kho", "ki",  "kj",  "kk",  "kl",  "km",  "kmb", "kn",
 124     "ko",  "kok", "kos", "kpe", "kr",  "krc", "krl", "kro", "kru", "ks",
 125     "ku",  "kum", "kut", "kv",  "kw",  "ky",  "la",  "lad",
 126     "lah", "lam", "lb",  "lez", "lg",  "li",  "ln",  "lo",  "lol",
 127     "loz", "lt",  "lu",  "lua", "lui", "lun", "luo", "lus",
 128     "lv",  "mad", "mag", "mai", "mak", "man", "map", "mas",
 129     "mdf", "mdr", "men", "mfe", "mg",  "mga", "mh",  "mi",  "mic", "min",
 130     "mis", "mk",  "mkh", "ml",  "mn",  "mnc", "mni", "mno",
 131     "mo",  "moh", "mos", "mr",  "ms",  "mt",  "mul", "mun",
 132     "mus", "mwl", "mwr", "my",  "myn", "myv", "na",  "nah", "nai", "nap",
 133     "nb",  "nd",  "nds", "ne",  "new", "ng",  "nia", "nic",
 134     "niu", "nl",  "nn",  "no",  "nog", "non", "nqo", "nr",  "nso", "nub",
 135     "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi", "oc",  "oj",
 136     "om",  "or",  "os",  "osa", "ota", "oto", "pa",  "paa",
 137     "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",
 138     "pi",  "pl",  "pon", "pra", "pro", "ps",  "pt",  "qu",
 139     "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rom",
 140     "ru",  "rup", "rw",  "sa",  "sad", "sah", "sai", "sal", "sam",
 141     "sas", "sat", "sc",  "scn", "sco", "sd",  "se",  "sel", "sem",
 142     "sg",  "sga", "sgn", "shn", "si",  "sid", "sio", "sit",
 143     "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj", "smn",
 144     "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",
 145     "srn", "srr", "ss",  "ssa", "st",  "su",  "suk", "sus", "sux",
 146     "sv",  "sw",  "syc", "syr", "ta",  "tai", "te",  "tem", "ter",
 147     "tet", "tg",  "th",  "ti",  "tig", "tiv", "tk",  "tkl",
 148     "tl",  "tlh", "tli", "tmh", "tn",  "to",  "tog", "tpi", "tr", "trv",
 149     "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl", "tw",
 150     "ty",  "tyv", "udm", "ug",  "uga", "uk",  "umb", "und", "ur",
 151     "uz",  "vai", "ve",  "vi",  "vo",  "vot", "wa",  "wak",
 152     "wal", "war", "was", "wen", "wo",  "xal", "xh",  "yao", "yap",
 153     "yi",  "yo",  "ypk", "za",  "zap", "zbl", "zen", "zh",  "znd",
 154     "zu",  "zun", "zxx", "zza",
 155 NULL,
 156     "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
 157 NULL
 158 };
 159 static const char* const DEPRECATED_LANGUAGES[]={
 160     "in", "iw", "ji", "jw", NULL, NULL
 161 };
 162 static const char* const REPLACEMENT_LANGUAGES[]={
 163     "id", "he", "yi", "jv", NULL, NULL
 164 };
 165
 166 /**
 167  * Table of 3-letter language codes.
 168  *
 169  * This is a lookup table used to convert 3-letter language codes to
 170  * their 2-letter equivalent, where possible.  It must be kept in sync
 171  * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
 172  * same language as LANGUAGES_3[i].  The commented-out lines are
 173  * copied from LANGUAGES to make eyeballing this baby easier.
 174  *
 175  * Where a 3-letter language code has no 2-letter equivalent, the
 176  * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
 177  *
 178  * This table should be terminated with a NULL entry, followed by a
 179  * second list, and another NULL entry.  The two lists correspond to
 180  * the two lists in LANGUAGES.
 181  */
 182 static const char * const LANGUAGES_3[] = {
 183 /*  "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",  "afa",    */
 184     "aar", "abk", "ace", "ach", "ada", "ady", "ave", "afr", "afa",
 185 /*  "afh", "ain", "ak",  "akk", "ale", "alg", "alt", "am",  "an",  "ang", "anp", "apa",    */
 186     "afh", "ain", "aka", "akk", "ale", "alg", "alt", "amh", "arg", "ang", "anp", "apa",
 187 /*  "ar",  "arc", "arn", "arp", "art", "arw", "as",  "ast",    */
 188     "ara", "arc", "arn", "arp", "art", "arw", "asm", "ast",
 189 /*  "ath", "aus", "av",  "awa", "ay",  "az",  "ba",  "bad",    */
 190     "ath", "aus", "ava", "awa", "aym", "aze", "bak", "bad",
 191 /*  "bai", "bal", "ban", "bas", "bat", "be",  "bej",    */
 192     "bai", "bal", "ban", "bas", "bat", "bel", "bej",
 193 /*  "bem", "ber", "bg",  "bh",  "bho", "bi",  "bik", "bin",    */
 194     "bem", "ber", "bul", "bih", "bho", "bis", "bik", "bin",
 195 /*  "bla", "bm",  "bn",  "bnt", "bo",  "br",  "bra", "bs",     */
 196     "bla", "bam", "ben", "bnt", "bod", "bre", "bra", "bos",
 197 /*  "btk", "bua", "bug", "byn", "ca",  "cad", "cai", "car", "cau",    */
 198     "btk", "bua", "bug", "byn", "cat", "cad", "cai", "car", "cau",
 199 /*  "cch", "ce",  "ceb", "cel", "ch",  "chb", "chg", "chk", "chm",    */
 200     "cch", "che", "ceb", "cel", "cha", "chb", "chg", "chk", "chm",
 201 /*  "chn", "cho", "chp", "chr", "chy", "cmc", "co",  "cop",    */
 202     "chn", "cho", "chp", "chr", "chy", "cmc", "cos", "cop",
 203 /*  "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",  "csb", "cu",  "cus",    */
 204     "cpe", "cpf", "cpp", "cre", "crh", "crp", "ces", "csb", "chu", "cus",
 205 /*  "cv",  "cy",  "da",  "dak", "dar", "day", "de",  "del", "den",    */
 206     "chv", "cym", "dan", "dak", "dar", "day", "deu", "del", "den",
 207 /*  "dgr", "din", "doi", "dra", "dsb", "dua", "dum", "dv",  "dyu",    */
 208     "dgr", "din", "doi", "dra", "dsb", "dua", "dum", "div", "dyu",
 209 /*  "dz",  "ee",  "efi", "egy", "eka", "el",  "elx", "en",     */
 210     "dzo", "ewe", "efi", "egy", "eka", "ell", "elx", "eng",
 211 /*  "enm", "eo",  "es",  "et",  "eu",  "ewo", "fa",     */
 212     "enm", "epo", "spa", "est", "eus", "ewo", "fas",
 213 /*  "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",  "fo",  "fon",    */
 214     "fan", "fat", "ful", "fin", "fil", "fiu", "fij", "fao", "fon",
 215 /*  "fr",  "frm", "fro", "frr", "frs", "fur", "fy",  "ga",  "gaa", "gay",    */
 216     "fra", "frm", "fro", "frr", "frs", "fur", "fry", "gle", "gaa", "gay",
 217 /*  "gba", "gd",  "gem", "gez", "gil", "gl",  "gmh", "gn",     */
 218     "gba", "gla", "gem", "gez", "gil", "glg", "gmh", "grn",
 219 /*  "goh", "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "gv",     */
 220     "goh", "gon", "gor", "got", "grb", "grc", "gsw", "guj", "glv",
 221 /*  "gwi", "ha",  "hai", "haw", "he",  "hi",  "hil", "him",    */
 222     "gwi", "hau", "hai", "haw", "heb", "hin", "hil", "him",
 223 /*  "hit", "hmn", "ho",  "hr",  "hsb", "ht",  "hu",  "hup", "hy",  "hz",     */
 224     "hit", "hmn", "hmo", "hrv", "hsb", "hat", "hun", "hup", "hye", "her",
 225 /*  "ia",  "iba", "id",  "ie",  "ig",  "ii",  "ijo", "ik",     */
 226     "ina", "iba", "ind", "ile", "ibo", "iii", "ijo", "ipk",
 227 /*  "ilo", "inc", "ine", "inh", "io",  "ira", "iro", "is",  "it",      */
 228     "ilo", "inc", "ine", "inh", "ido", "ira", "iro", "isl", "ita",
 229 /*  "iu",  "ja",  "jbo", "jpr", "jrb", "jv",  "ka",  "kaa", "kab",   */
 230     "iku", "jpn", "jbo", "jpr", "jrb", "jav", "kat", "kaa", "kab",
 231 /*  "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kfo", "kg",  "kha", "khi",*/
 232     "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kfo", "kg",  "kha", "khi",
 233 /*  "kho", "ki",  "kj",  "kk",  "kl",  "km",  "kmb", "kn",     */
 234     "kho", "kik", "kua", "kaz", "kal", "khm", "kmb", "kan",
 235 /*  "ko",  "kok", "kos", "kpe", "kr",  "krc", "krl", "kro", "kru", "ks",     */
 236     "kor", "kok", "kos", "kpe", "kau", "krc", "krl", "kro", "kru", "kas",
 237 /*  "ku",  "kum", "kut", "kv",  "kw",  "ky",  "la",  "lad",    */
 238     "kur", "kum", "kut", "kom", "cor", "kir", "lat", "lad",
 239 /*  "lah", "lam", "lb",  "lez", "lg",  "li",  "ln",  "lo",  "lol",    */
 240     "lah", "lam", "ltz", "lez", "lug", "lim", "lin", "lao", "lol",
 241 /*  "loz", "lt",  "lu",  "lua", "lui", "lun", "luo", "lus",    */
 242     "loz", "lit", "lub", "lua", "lui", "lun", "luo", "lus",
 243 /*  "lv",  "mad", "mag", "mai", "mak", "man", "map", "mas",    */
 244     "lav", "mad", "mag", "mai", "mak", "man", "map", "mas",
 245 /*  "mdf", "mdr", "men", "mfe", "mg",  "mga", "mh",  "mi",  "mic", "min",    */
 246     "mdf", "mdr", "men", "mfe", "mlg", "mga", "mah", "mri", "mic", "min",
 247 /*  "mis", "mk",  "mkh", "ml",  "mn",  "mnc", "mni", "mno",    */
 248     "mis", "mkd", "mkh", "mal", "mon", "mnc", "mni", "mno",
 249 /*  "mo",  "moh", "mos", "mr",  "ms",  "mt",  "mul", "mun",    */
 250     "mol", "moh", "mos", "mar", "msa", "mlt", "mul", "mun",
 251 /*  "mus", "mwl", "mwr", "my",  "myn", "myv", "na",  "nah", "nai", "nap",    */
 252     "mus", "mwl", "mwr", "mya", "myn", "myv", "nau", "nah", "nai", "nap",
 253 /*  "nb",  "nd",  "nds", "ne",  "new", "ng",  "nia", "nic",    */
 254     "nob", "nde", "nds", "nep", "new", "ndo", "nia", "nic",
 255 /*  "niu", "nl",  "nn",  "no",  "nog", "non", "nqo", "nr",  "nso", "nub",    */
 256     "niu", "nld", "nno", "nor", "nog", "non", "nqo", "nbl", "nso", "nub",
 257 /*  "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi", "oc",  "oj",     */
 258     "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi", "oci", "oji",
 259 /*  "om",  "or",  "os",  "osa", "ota", "oto", "pa",  "paa",    */
 260     "orm", "ori", "oss", "osa", "ota", "oto", "pan", "paa",
 261 /*  "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",    */
 262     "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",
 263 /*  "pi",  "pl",  "pon", "pra", "pro", "ps",  "pt",  "qu",     */
 264     "pli", "pol", "pon", "pra", "pro", "pus", "por", "que",
 265 /*  "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rom",    */
 266     "raj", "rap", "rar", "roh", "run", "ron", "roa", "rom",
 267 /*  "ru",  "rup", "rw",  "sa",  "sad", "sah", "sai", "sal", "sam",    */
 268     "rus", "rup", "kin", "san", "sad", "sah", "sai", "sal", "sam",
 269 /*  "sas", "sat", "sc",  "scn", "sco", "sd",  "se",  "sel", "sem",    */
 270     "sas", "sat", "srd", "scn", "sco", "snd", "sme", "sel", "sem",
 271 /*  "sg",  "sga", "sgn", "shn", "si",  "sid", "sio", "sit",    */
 272     "sag", "sga", "sgn", "shn", "sin", "sid", "sio", "sit",
 273 /*  "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj", "smn",    */
 274     "slk", "slv", "sla", "smo", "sma", "smi", "smj", "smn",
 275 /*  "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",     */
 276     "sms", "sna", "snk", "som", "sog", "son", "sqi", "srp",
 277 /*  "srn", "srr", "ss",  "ssa", "st",  "su",  "suk", "sus", "sux",    */
 278     "srn", "srr", "ssw", "ssa", "sot", "sun", "suk", "sus", "sux",
 279 /*  "sv",  "sw",  "syc", "syr", "ta",  "tai", "te",  "tem", "ter",    */
 280     "swe", "swa", "syc", "syr", "tam", "tai", "tel", "tem", "ter",
 281 /*  "tet", "tg",  "th",  "ti",  "tig", "tiv", "tk",  "tkl",    */
 282     "tet", "tgk", "tha", "tir", "tig", "tiv", "tuk", "tkl",
 283 /*  "tl",  "tlh", "tli", "tmh", "tn",  "to",  "tog", "tpi", "tr", "trv",    */
 284     "tgl", "tlh", "tli", "tmh", "tsn", "ton", "tog", "tpi", "tur", "trv",
 285 /*  "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl", "tw",     */
 286     "tso", "tsi", "tat", "tum", "tup", "tut", "tvl", "twi",
 287 /*  "ty",  "tyv", "udm", "ug",  "uga", "uk",  "umb", "und", "ur",     */
 288     "tah", "tyv", "udm", "uig", "uga", "ukr", "umb", "und", "urd",
 289 /*  "uz",  "vai", "ve",  "vi",  "vo",  "vot", "wa",  "wak",    */
 290     "uzb", "vai", "ven", "vie", "vol", "vot", "wln", "wak",
 291 /*  "wal", "war", "was", "wen", "wo",  "xal", "xh",  "yao", "yap",    */
 292     "wal", "war", "was", "wen", "wol", "xal", "xho", "yao", "yap",
 293 /*  "yi",  "yo",  "ypk", "za",  "zap", "zbl", "zen", "zh",  "znd",    */
 294     "yid", "yor", "ypk", "zha", "zap", "zbl", "zen", "zho", "znd",
 295 /*  "zu",  "zun", "zxx", "zza",                                         */
 296     "zul", "zun", "zxx", "zza",
 297 NULL,
 298 /*  "in",  "iw",  "ji",  "jw",  "sh",                          */
 299     "ind", "heb", "yid", "jaw", "srp",
 300 NULL
 301 };
 302
 303 /**
 304  * Table of 2-letter country codes.
 305  *
 306  * This list must be in sorted order.  This list is returned directly
 307  * to the user by some API.
 308  *
 309  * This list must be kept in sync with COUNTRIES_3, with corresponding
 310  * entries matched.
 311  *
 312  * This table should be terminated with a NULL entry, followed by a
 313  * second list, and another NULL entry.  The first list is visible to
 314  * user code when this array is returned by API.  The second list
 315  * contains codes we support, but do not expose through user API.
 316  *
 317  * Notes:
 318  *
 319  * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
 320  * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
 321  * new codes keeping the old ones for compatibility updated to include
 322  * 1999/12/03 revisions *CWB*
 323  *
 324  * RO(ROM) is now RO(ROU) according to
 325  * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
 326  */
 327 static const char * const COUNTRIES[] = {
 328     "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",  "AN",
 329     "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
 330     "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
 331     "BJ",  "BL",  "BM",  "BN",  "BO",  "BR",  "BS",  "BT",  "BV",
 332     "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
 333     "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
 334     "CU",  "CV",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
 335     "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
 336     "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
 337     "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
 338     "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
 339     "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
 340     "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
 341     "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
 342     "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
 343     "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
 344     "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
 345     "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
 346     "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
 347     "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
 348     "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
 349     "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
 350     "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
 351     "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
 352     "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "ST",  "SV",
 353     "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
 354     "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
 355     "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
 356     "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
 357     "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
 358 NULL,
 359     "FX",  "CS",  "RO",  "TP",  "YU",  "ZR",   /* obsolete country codes */
 360 NULL
 361 };
 362
 363 static const char* const DEPRECATED_COUNTRIES[] ={
 364     "BU", "CS", "DY", "FX", "HV", "NH", "RH", "TP", "YU", "ZR", NULL, NULL /* deprecated country list */
 365 };
 366 static const char* const REPLACEMENT_COUNTRIES[] = {
 367 /*  "BU", "CS", "DY", "FX", "HV", "NH", "RH", "TP", "YU", "ZR" */
 368     "MM", "RS", "BJ", "FR", "BF", "VU", "ZW", "TL", "RS", "CD", NULL, NULL  /* replacement country codes */
 369 };
 370
 371 /**
 372  * Table of 3-letter country codes.
 373  *
 374  * This is a lookup table used to convert 3-letter country codes to
 375  * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
 376  * For all valid i, COUNTRIES[i] must refer to the same country as
 377  * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
 378  * to make eyeballing this baby easier.
 379  *
 380  * This table should be terminated with a NULL entry, followed by a
 381  * second list, and another NULL entry.  The two lists correspond to
 382  * the two lists in COUNTRIES.
 383  */
 384 static const char * const COUNTRIES_3[] = {
 385 /*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",  "AN",     */
 386     "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM", "ANT",
 387 /*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
 388     "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
 389 /*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
 390     "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
 391 /*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BR",  "BS",  "BT",  "BV",     */
 392     "BEN", "BLM", "BMU", "BRN", "BOL", "BRA", "BHS", "BTN", "BVT",
 393 /*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
 394     "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
 395 /*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
 396     "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
 397 /*  "CU",  "CV",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
 398     "CUB", "CPV", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
 399 /*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
 400     "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
 401 /*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
 402     "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
 403 /*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
 404     "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
 405 /*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
 406     "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
 407 /*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
 408     "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
 409 /*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
 410     "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
 411 /*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
 412     "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
 413 /*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
 414     "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
 415 /*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
 416     "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
 417 /*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
 418     "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
 419 /*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
 420     "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
 421 /*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
 422     "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
 423 /*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
 424     "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
 425 /*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
 426     "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
 427 /*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
 428     "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
 429 /*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
 430     "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
 431 /*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
 432     "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
 433 /*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "ST",  "SV",     */
 434     "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "STP", "SLV",
 435 /*  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
 436     "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
 437 /*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
 438     "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
 439 /*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
 440     "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
 441 /*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
 442     "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
 443 /*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
 444     "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
 445 NULL,
 446 /*  "FX",  "CS",  "RO",  "TP",  "YU",  "ZR",   */
 447     "FXX", "SCG", "ROM", "TMP", "YUG", "ZAR",
 448 NULL
 449 };
 450
 451 typedef struct CanonicalizationMap {
 452     const char *id;          /* input ID */
 453     const char *canonicalID; /* canonicalized output ID */
 454     const char *keyword;     /* keyword, or NULL if none */
 455     const char *value;       /* keyword value, or NULL if kw==NULL */
 456 } CanonicalizationMap;
 457
 458 /**
 459  * A map to canonicalize locale IDs.  This handles a variety of
 460  * different semantic kinds of transformations.
 461  */
 462 static const CanonicalizationMap CANONICALIZE_MAP[] = {
 463     { "",               "en_US_POSIX", NULL, NULL }, /* .NET name */
 464     { "c",              "en_US_POSIX", NULL, NULL }, /* POSIX name */
 465     { "posix",          "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
 466     { "art_LOJBAN",     "jbo", NULL, NULL }, /* registered name */
 467     { "az_AZ_CYRL",     "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
 468     { "az_AZ_LATN",     "az_Latn_AZ", NULL, NULL }, /* .NET name */
 469     { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" },
 470     { "cel_GAULISH",    "cel__GAULISH", NULL, NULL }, /* registered name */
 471     { "de_1901",        "de__1901", NULL, NULL }, /* registered name */
 472     { "de_1906",        "de__1906", NULL, NULL }, /* registered name */
 473     { "de__PHONEBOOK",  "de", "collation", "phonebook" }, /* Old ICU name */
 474     { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
 475     { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
 476     { "de_LU_PREEURO",  "de_LU", "currency", "LUF" },
 477     { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
 478     { "en_BOONT",       "en__BOONT", NULL, NULL }, /* registered name */
 479     { "en_SCOUSE",      "en__SCOUSE", NULL, NULL }, /* registered name */
 480     { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
 481     { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
 482     { "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
 483     { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
 484     { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
 485     { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
 486     { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
 487     { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
 488     { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
 489     { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
 490     { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
 491     { "hi__DIRECT",     "hi", "collation", "direct" }, /* Old ICU name */
 492     { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
 493     { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
 494     { "nb_NO_NY",       "nn_NO", NULL, NULL },  /* "markus said this was ok" :-) */
 495     { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
 496     { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
 497     { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
 498     { "sl_ROZAJ",       "sl__ROZAJ", NULL, NULL }, /* registered name */
 499     { "sr_SP_CYRL",     "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
 500     { "sr_SP_LATN",     "sr_Latn_RS", NULL, NULL }, /* .NET name */
 501     { "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
 502     { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
 503     { "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
 504     { "uz_UZ_CYRL",     "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
 505     { "uz_UZ_LATN",     "uz_Latn_UZ", NULL, NULL }, /* .NET name */
 506     { "zh_CHS",         "zh_Hans", NULL, NULL }, /* .NET name */
 507     { "zh_CHT",         "zh_Hant", NULL, NULL }, /* .NET name */
 508     { "zh_GAN",         "zh__GAN", NULL, NULL }, /* registered name */
 509     { "zh_GUOYU",       "zh", NULL, NULL }, /* registered name */
 510     { "zh_HAKKA",       "zh__HAKKA", NULL, NULL }, /* registered name */
 511     { "zh_MIN",         "zh__MIN", NULL, NULL }, /* registered name */
 512     { "zh_MIN_NAN",     "zh__MINNAN", NULL, NULL }, /* registered name */
 513     { "zh_WUU",         "zh__WUU", NULL, NULL }, /* registered name */
 514     { "zh_XIANG",       "zh__XIANG", NULL, NULL }, /* registered name */
 515     { "zh_YUE",         "zh__YUE", NULL, NULL }, /* registered name */
 516 };
 517
 518 typedef struct VariantMap {
 519     const char *variant;          /* input ID */
 520     const char *keyword;     /* keyword, or NULL if none */
 521     const char *value;       /* keyword value, or NULL if kw==NULL */
 522 } VariantMap;
 523
 524 static const VariantMap VARIANT_MAP[] = {
 525     { "EURO",   "currency", "EUR" },
 526     { "PINYIN", "collation", "pinyin" }, /* Solaris variant */
 527     { "STROKE", "collation", "stroke" }  /* Solaris variant */
 528 };
 529
 530 /* ### BCP47 Conversion *******************************************/
 531 /* Test if the locale id has BCP47 u extension and does not have '@' */
 532 #define _hasBCP47Extension(id) (id && uprv_strstr(id, "@") == NULL && getShortestSubtagLength(localeID) == 1)
 533 /* Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails */
 534 #define _ConvertBCP47(finalID, id, buffer, length,err) \
 535         if (uloc_forLanguageTag(id, buffer, length, NULL, err) <= 0 || U_FAILURE(*err)) { \
 536             finalID=id; \
 537         } else { \
 538             finalID=buffer; \
 539         }
 540 /* Gets the size of the shortest subtag in the given localeID. */
 541 static int32_t getShortestSubtagLength(const char *localeID) {
 542     int32_t localeIDLength = uprv_strlen(localeID);
 543     int32_t length = localeIDLength;
 544     int32_t tmpLength = 0;
 545     int32_t i;
 546     UBool reset = TRUE;
 547
 548     for (i = 0; i < localeIDLength; i++) {
 549         if (localeID[i] != '_' && localeID[i] != '-') {
 550             if (reset) {
 551                 tmpLength = 0;
 552                 reset = FALSE;
 553             }
 554             tmpLength++;
 555         } else {
 556             if (tmpLength != 0 && tmpLength < length) {
 557                 length = tmpLength;
 558             }
 559             reset = TRUE;
 560         }
 561     }
 562
 563     return length;
 564 }
 565
 566 /* ### Keywords **************************************************/
 567
/* Size of the char buffers that hold one canonicalized keyword name, terminating NUL included. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Number of entries in the keyword list that _getKeywords() builds for one locale ID. */
#define ULOC_MAX_NO_KEYWORDS 25
 570
 571 U_CAPI const char * U_EXPORT2
 572 locale_getKeywordsStart(const char *localeID) {
 573     const char *result = NULL;
 574     if((result = uprv_strchr(localeID, '@')) != NULL) {
 575         return result;
 576     }
 577 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
 578     else {
 579         /* We do this because the @ sign is variant, and the @ sign used on one
 580         EBCDIC machine won't be compiled the same way on other EBCDIC based
 581         machines. */
 582         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
 583         const uint8_t *charToFind = ebcdicSigns;
 584         while(*charToFind) {
 585             if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
 586                 return result;
 587             }
 588             charToFind++;
 589         }
 590     }
 591 #endif
 592     return NULL;
 593 }
 594
 595 /**
 596  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
 597  * @param keywordName incoming name to be canonicalized
 598  * @param status return status (keyword too long)
 599  * @return length of the keyword name
 600  */
 601 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
 602 {
 603   int32_t i;
 604   int32_t keywordNameLen = (int32_t)uprv_strlen(keywordName);
 605
 606   if(keywordNameLen >= ULOC_KEYWORD_BUFFER_LEN) {
 607     /* keyword name too long for internal buffer */
 608     *status = U_INTERNAL_PROGRAM_ERROR;
 609           return 0;
 610   }
 611
 612   /* normalize the keyword name */
 613   for(i = 0; i < keywordNameLen; i++) {
 614     buf[i] = uprv_tolower(keywordName[i]);
 615   }
 616   buf[i] = 0;
 617
 618   return keywordNameLen;
 619 }
 620
/* One keyword/value pair collected by _getKeywords(). */
typedef struct {
    char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* NUL-terminated keyword name */
    int32_t keywordLen; /* length of keyword, terminator not counted */
    const char *valueStart; /* first character of the value; points into the source string, not a copy */
    int32_t valueLen; /* number of characters of the value, starting at valueStart */
} KeywordStruct;
 627
 628 static int32_t U_CALLCONV
 629 compareKeywordStructs(const void *context, const void *left, const void *right) {
 630     const char* leftString = ((const KeywordStruct *)left)->keyword;
 631     const char* rightString = ((const KeywordStruct *)right)->keyword;
 632     return uprv_strcmp(leftString, rightString);
 633 }
 634
 635 /**
 636  * Both addKeyword and addValue must already be in canonical form.
 637  * Either both addKeyword and addValue are NULL, or neither is NULL.
 638  * If they are not NULL they must be zero terminated.
 639  * If addKeyword is not NULL is must have length small enough to fit in KeywordStruct.keyword.
 640  */
 641 static int32_t
 642 _getKeywords(const char *localeID,
 643              char prev,
 644              char *keywords, int32_t keywordCapacity,
 645              char *values, int32_t valuesCapacity, int32_t *valLen,
 646              UBool valuesToo,
 647              const char* addKeyword,
 648              const char* addValue,
 649              UErrorCode *status)
 650 {
 651     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
 652
 653     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
 654     int32_t numKeywords = 0;
 655     const char* pos = localeID;
 656     const char* equalSign = NULL;
 657     const char* semicolon = NULL;
 658     int32_t i = 0, j, n;
 659     int32_t keywordsLen = 0;
 660     int32_t valuesLen = 0;
 661
 662     if(prev == '@') { /* start of keyword definition */
 663         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
 664         do {
 665             UBool duplicate = FALSE;
 666             /* skip leading spaces */
 667             while(*pos == ' ') {
 668                 pos++;
 669             }
 670             if (!*pos) { /* handle trailing "; " */
 671                 break;
 672             }
 673             if(numKeywords == maxKeywords) {
 674                 *status = U_INTERNAL_PROGRAM_ERROR;
 675                 return 0;
 676             }
 677             equalSign = uprv_strchr(pos, '=');
 678             semicolon = uprv_strchr(pos, ';');
 679             /* lack of '=' [foo@currency] is illegal */
 680             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
 681             if(!equalSign || (semicolon && semicolon<equalSign)) {
 682                 *status = U_INVALID_FORMAT_ERROR;
 683                 return 0;
 684             }
 685             /* need to normalize both keyword and keyword name */
 686             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
 687                 /* keyword name too long for internal buffer */
 688                 *status = U_INTERNAL_PROGRAM_ERROR;
 689                 return 0;
 690             }
 691             for(i = 0, n = 0; i < equalSign - pos; ++i) {
 692                 if (pos[i] != ' ') {
 693                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
 694                 }
 695             }
 696
 697             /* zero-length keyword is an error. */
 698             if (n == 0) {
 699                 *status = U_INVALID_FORMAT_ERROR;
 700                 return 0;
 701             }
 702
 703             keywordList[numKeywords].keyword[n] = 0;
 704             keywordList[numKeywords].keywordLen = n;
 705             /* now grab the value part. First we skip the '=' */
 706             equalSign++;
 707             /* then we leading spaces */
 708             while(*equalSign == ' ') {
 709                 equalSign++;
 710             }
 711
 712             /* Premature end or zero-length value */
 713             if (!equalSign || equalSign == semicolon) {
 714                 *status = U_INVALID_FORMAT_ERROR;
 715                 return 0;
 716             }
 717
 718             keywordList[numKeywords].valueStart = equalSign;
 719
 720             pos = semicolon;
 721             i = 0;
 722             if(pos) {
 723                 while(*(pos - i - 1) == ' ') {
 724                     i++;
 725                 }
 726                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
 727                 pos++;
 728             } else {
 729                 i = (int32_t)uprv_strlen(equalSign);
 730                 while(i && equalSign[i-1] == ' ') {
 731                     i--;
 732                 }
 733                 keywordList[numKeywords].valueLen = i;
 734             }
 735             /* If this is a duplicate keyword, then ignore it */
 736             for (j=0; j<numKeywords; ++j) {
 737                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
 738                     duplicate = TRUE;
 739                     break;
 740                 }
 741             }
 742             if (!duplicate) {
 743                 ++numKeywords;
 744             }
 745         } while(pos);
 746
 747         /* Handle addKeyword/addValue. */
 748         if (addKeyword != NULL) {
 749             UBool duplicate = FALSE;
 750             U_ASSERT(addValue != NULL);
 751             /* Search for duplicate; if found, do nothing. Explicit keyword
 752                overrides addKeyword. */
 753             for (j=0; j<numKeywords; ++j) {
 754                 if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
 755                     duplicate = TRUE;
 756                     break;
 757                 }
 758             }
 759             if (!duplicate) {
 760                 if (numKeywords == maxKeywords) {
 761                     *status = U_INTERNAL_PROGRAM_ERROR;
 762                     return 0;
 763                 }
 764                 uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
 765                 keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
 766                 keywordList[numKeywords].valueStart = addValue;
 767                 keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
 768                 ++numKeywords;
 769             }
 770         } else {
 771             U_ASSERT(addValue == NULL);
 772         }
 773
 774         /* now we have a list of keywords */
 775         /* we need to sort it */
 776         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
 777
 778         /* Now construct the keyword part */
 779         for(i = 0; i < numKeywords; i++) {
 780             if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
 781                 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
 782                 if(valuesToo) {
 783                     keywords[keywordsLen + keywordList[i].keywordLen] = '=';
 784                 } else {
 785                     keywords[keywordsLen + keywordList[i].keywordLen] = 0;
 786                 }
 787             }
 788             keywordsLen += keywordList[i].keywordLen + 1;
 789             if(valuesToo) {
 790                 if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
 791                     uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
 792                 }
 793                 keywordsLen += keywordList[i].valueLen;
 794
 795                 if(i < numKeywords - 1) {
 796                     if(keywordsLen < keywordCapacity) {
 797                         keywords[keywordsLen] = ';';
 798                     }
 799                     keywordsLen++;
 800                 }
 801             }
 802             if(values) {
 803                 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
 804                     uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
 805                     values[valuesLen + keywordList[i].valueLen] = 0;
 806                 }
 807                 valuesLen += keywordList[i].valueLen + 1;
 808             }
 809         }
 810         if(values) {
 811             values[valuesLen] = 0;
 812             if(valLen) {
 813                 *valLen = valuesLen;
 814             }
 815         }
 816         return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
 817     } else {
 818         return 0;
 819     }
 820 }
 821
 822 U_CFUNC int32_t
 823 locale_getKeywords(const char *localeID,
 824                    char prev,
 825                    char *keywords, int32_t keywordCapacity,
 826                    char *values, int32_t valuesCapacity, int32_t *valLen,
 827                    UBool valuesToo,
 828                    UErrorCode *status) {
 829     return _getKeywords(localeID, prev, keywords, keywordCapacity,
 830                         values, valuesCapacity, valLen, valuesToo,
 831                         NULL, NULL, status);
 832 }
 833
 834 U_CAPI int32_t U_EXPORT2
 835 uloc_getKeywordValue(const char* localeID,
 836                      const char* keywordName,
 837                      char* buffer, int32_t bufferCapacity,
 838                      UErrorCode* status)
 839 {
 840     const char* startSearchHere = NULL;
 841     const char* nextSeparator = NULL;
 842     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 843     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 844     int32_t i = 0;
 845     int32_t result = 0;
 846
 847     if(status && U_SUCCESS(*status) && localeID) {
 848       char tempBuffer[ULOC_FULLNAME_CAPACITY];
 849       const char* tmpLocaleID;
 850
 851       if (_hasBCP47Extension(localeID)) {
 852           _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
 853       } else {
 854           tmpLocaleID=localeID;
 855       }
 856
 857       startSearchHere = uprv_strchr(tmpLocaleID, '@'); /* TODO: REVISIT: shouldn't this be locale_getKeywordsStart ? */
 858       if(startSearchHere == NULL) {
 859           /* no keywords, return at once */
 860           return 0;
 861       }
 862
 863       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
 864       if(U_FAILURE(*status)) {
 865         return 0;
 866       }
 867
 868       /* find the first keyword */
 869       while(startSearchHere) {
 870           startSearchHere++;
 871           /* skip leading spaces (allowed?) */
 872           while(*startSearchHere == ' ') {
 873               startSearchHere++;
 874           }
 875           nextSeparator = uprv_strchr(startSearchHere, '=');
 876           /* need to normalize both keyword and keyword name */
 877           if(!nextSeparator) {
 878               break;
 879           }
 880           if(nextSeparator - startSearchHere >= ULOC_KEYWORD_BUFFER_LEN) {
 881               /* keyword name too long for internal buffer */
 882               *status = U_INTERNAL_PROGRAM_ERROR;
 883               return 0;
 884           }
 885           for(i = 0; i < nextSeparator - startSearchHere; i++) {
 886               localeKeywordNameBuffer[i] = uprv_tolower(startSearchHere[i]);
 887           }
 888           /* trim trailing spaces */
 889           while(startSearchHere[i-1] == ' ') {
 890               i--;
 891           }
 892           localeKeywordNameBuffer[i] = 0;
 893
 894           startSearchHere = uprv_strchr(nextSeparator, ';');
 895
 896           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
 897               nextSeparator++;
 898               while(*nextSeparator == ' ') {
 899                   nextSeparator++;
 900               }
 901               /* we actually found the keyword. Copy the value */
 902               if(startSearchHere && startSearchHere - nextSeparator < bufferCapacity) {
 903                   while(*(startSearchHere-1) == ' ') {
 904                       startSearchHere--;
 905                   }
 906                   uprv_strncpy(buffer, nextSeparator, startSearchHere - nextSeparator);
 907                   result = u_terminateChars(buffer, bufferCapacity, (int32_t)(startSearchHere - nextSeparator), status);
 908               } else if(!startSearchHere && (int32_t)uprv_strlen(nextSeparator) < bufferCapacity) { /* last item in string */
 909                   i = (int32_t)uprv_strlen(nextSeparator);
 910                   while(nextSeparator[i - 1] == ' ') {
 911                       i--;
 912                   }
 913                   uprv_strncpy(buffer, nextSeparator, i);
 914                   result = u_terminateChars(buffer, bufferCapacity, i, status);
 915               } else {
 916                   /* give a bigger buffer, please */
 917                   *status = U_BUFFER_OVERFLOW_ERROR;
 918                   if(startSearchHere) {
 919                       result = (int32_t)(startSearchHere - nextSeparator);
 920                   } else {
 921                       result = (int32_t)uprv_strlen(nextSeparator);
 922                   }
 923               }
 924               return result;
 925           }
 926       }
 927     }
 928     return 0;
 929 }
 930
 931 U_CAPI int32_t U_EXPORT2
 932 uloc_setKeywordValue(const char* keywordName,
 933                      const char* keywordValue,
 934                      char* buffer, int32_t bufferCapacity,
 935                      UErrorCode* status)
 936 {
 937     /* TODO: sorting. removal. */
 938     int32_t keywordNameLen;
 939     int32_t keywordValueLen;
 940     int32_t bufLen;
 941     int32_t needLen = 0;
 942     int32_t foundValueLen;
 943     int32_t keywordAtEnd = 0; /* is the keyword at the end of the string? */
 944     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 945     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 946     int32_t i = 0;
 947     int32_t rc;
 948     char* nextSeparator = NULL;
 949     char* nextEqualsign = NULL;
 950     char* startSearchHere = NULL;
 951     char* keywordStart = NULL;
 952     char *insertHere = NULL;
 953     if(U_FAILURE(*status)) {
 954         return -1;
 955     }
 956     if(bufferCapacity>1) {
 957         bufLen = (int32_t)uprv_strlen(buffer);
 958     } else {
 959         *status = U_ILLEGAL_ARGUMENT_ERROR;
 960         return 0;
 961     }
 962     if(bufferCapacity<bufLen) {
 963         /* The capacity is less than the length?! Is this NULL terminated? */
 964         *status = U_ILLEGAL_ARGUMENT_ERROR;
 965         return 0;
 966     }
 967     if(keywordValue && !*keywordValue) {
 968         keywordValue = NULL;
 969     }
 970     if(keywordValue) {
 971         keywordValueLen = (int32_t)uprv_strlen(keywordValue);
 972     } else {
 973         keywordValueLen = 0;
 974     }
 975     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
 976     if(U_FAILURE(*status)) {
 977         return 0;
 978     }
 979     startSearchHere = (char*)locale_getKeywordsStart(buffer);
 980     if(startSearchHere == NULL || (startSearchHere[1]==0)) {
 981         if(!keywordValue) { /* no keywords = nothing to remove */
 982             return bufLen;
 983         }
 984
 985         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
 986         if(startSearchHere) { /* had a single @ */
 987             needLen--; /* already had the @ */
 988             /* startSearchHere points at the @ */
 989         } else {
 990             startSearchHere=buffer+bufLen;
 991         }
 992         if(needLen >= bufferCapacity) {
 993             *status = U_BUFFER_OVERFLOW_ERROR;
 994             return needLen; /* no change */
 995         }
 996         *startSearchHere = '@';
 997         startSearchHere++;
 998         uprv_strcpy(startSearchHere, keywordNameBuffer);
 999         startSearchHere += keywordNameLen;
1000         *startSearchHere = '=';
1001         startSearchHere++;
1002         uprv_strcpy(startSearchHere, keywordValue);
1003         startSearchHere+=keywordValueLen;
1004         return needLen;
1005     } /* end shortcut - no @ */
1006
1007     keywordStart = startSearchHere;
1008     /* search for keyword */
1009     while(keywordStart) {
1010         keywordStart++;
1011         /* skip leading spaces (allowed?) */
1012         while(*keywordStart == ' ') {
1013             keywordStart++;
1014         }
1015         nextEqualsign = uprv_strchr(keywordStart, '=');
1016         /* need to normalize both keyword and keyword name */
1017         if(!nextEqualsign) {
1018             break;
1019         }
1020         if(nextEqualsign - keywordStart >= ULOC_KEYWORD_BUFFER_LEN) {
1021             /* keyword name too long for internal buffer */
1022             *status = U_INTERNAL_PROGRAM_ERROR;
1023             return 0;
1024         }
1025         for(i = 0; i < nextEqualsign - keywordStart; i++) {
1026             localeKeywordNameBuffer[i] = uprv_tolower(keywordStart[i]);
1027         }
1028         /* trim trailing spaces */
1029         while(keywordStart[i-1] == ' ') {
1030             i--;
1031         }
1032         localeKeywordNameBuffer[i] = 0;
1033
1034         nextSeparator = uprv_strchr(nextEqualsign, ';');
1035         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1036         if(rc == 0) {
1037             nextEqualsign++;
1038             while(*nextEqualsign == ' ') {
1039                 nextEqualsign++;
1040             }
1041             /* we actually found the keyword. Change the value */
1042             if (nextSeparator) {
1043                 keywordAtEnd = 0;
1044                 foundValueLen = (int32_t)(nextSeparator - nextEqualsign);
1045             } else {
1046                 keywordAtEnd = 1;
1047                 foundValueLen = (int32_t)uprv_strlen(nextEqualsign);
1048             }
1049             if(keywordValue) { /* adding a value - not removing */
1050               if(foundValueLen == keywordValueLen) {
1051                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1052                 return bufLen; /* no change in size */
1053               } else if(foundValueLen > keywordValueLen) {
1054                 int32_t delta = foundValueLen - keywordValueLen;
1055                 if(nextSeparator) { /* RH side */
1056                   uprv_memmove(nextSeparator - delta, nextSeparator, bufLen-(nextSeparator-buffer));
1057                 }
1058                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1059                 bufLen -= delta;
1060                 buffer[bufLen]=0;
1061                 return bufLen;
1062               } else { /* FVL < KVL */
1063                 int32_t delta = keywordValueLen - foundValueLen;
1064                 if((bufLen+delta) >= bufferCapacity) {
1065                   *status = U_BUFFER_OVERFLOW_ERROR;
1066                   return bufLen+delta;
1067                 }
1068                 if(nextSeparator) { /* RH side */
1069                   uprv_memmove(nextSeparator+delta,nextSeparator, bufLen-(nextSeparator-buffer));
1070                 }
1071                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1072                 bufLen += delta;
1073                 buffer[bufLen]=0;
1074                 return bufLen;
1075               }
1076             } else { /* removing a keyword */
1077               if(keywordAtEnd) {
1078                 /* zero out the ';' or '@' just before startSearchhere */
1079                 keywordStart[-1] = 0;
1080                 return (int32_t)((keywordStart-buffer)-1); /* (string length without keyword) minus separator */
1081               } else {
1082                 uprv_memmove(keywordStart, nextSeparator+1, bufLen-((nextSeparator+1)-buffer));
1083                 keywordStart[bufLen-((nextSeparator+1)-buffer)]=0;
1084                 return (int32_t)(bufLen-((nextSeparator+1)-keywordStart));
1085               }
1086             }
1087         } else if(rc<0){ /* end match keyword */
1088           /* could insert at this location. */
1089           insertHere = keywordStart;
1090         }
1091         keywordStart = nextSeparator;
1092     } /* end loop searching */
1093
1094     if(!keywordValue) {
1095       return bufLen; /* removal of non-extant keyword - no change */
1096     }
1097
1098     /* we know there is at least one keyword. */
1099     needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
1100     if(needLen >= bufferCapacity) {
1101         *status = U_BUFFER_OVERFLOW_ERROR;
1102         return needLen; /* no change */
1103     }
1104
1105     if(insertHere) {
1106       uprv_memmove(insertHere+(1+keywordNameLen+1+keywordValueLen), insertHere, bufLen-(insertHere-buffer));
1107       keywordStart = insertHere;
1108     } else {
1109       keywordStart = buffer+bufLen;
1110       *keywordStart = ';';
1111       keywordStart++;
1112     }
1113     uprv_strncpy(keywordStart, keywordNameBuffer, keywordNameLen);
1114     keywordStart += keywordNameLen;
1115     *keywordStart = '=';
1116     keywordStart++;
1117     uprv_strncpy(keywordStart, keywordValue, keywordValueLen); /* terminates. */
1118     keywordStart+=keywordValueLen;
1119     if(insertHere) {
1120       *keywordStart = ';';
1121       keywordStart++;
1122     }
1123     buffer[needLen]=0;
1124     return needLen;
1125 }
1126
1127 /* ### ID parsing implementation **************************************************/
1128
/* TRUE if a is one of the letters that can start a special prefix: 'x', 'X', 'i' or 'I'.
   The argument is parenthesized in the expansion but evaluated several times. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 * TRUE for NUL, '.' and '@'. The argument is parenthesized in the expansion
 * but evaluated several times.
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1139
1140 static char* _strnchr(const char* str, int32_t len, char c) {
1141     U_ASSERT(str != 0 && len >= 0);
1142     while (len-- != 0) {
1143         char d = *str;
1144         if (d == c) {
1145             return (char*) str;
1146         } else if (d == 0) {
1147             break;
1148         }
1149         ++str;
1150     }
1151     return NULL;
1152 }
1153
1154 /**
1155  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1156  * a NULL entry, followed by more entries, and a second NULL entry.
1157  *
1158  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1159  * COUNTRIES_3.
1160  */
1161 static int16_t _findIndex(const char* const* list, const char* key)
1162 {
1163     const char* const* anchor = list;
1164     int32_t pass = 0;
1165
1166     /* Make two passes through two NULL-terminated arrays at 'list' */
1167     while (pass++ < 2) {
1168         while (*list) {
1169             if (uprv_strcmp(key, *list) == 0) {
1170                 return (int16_t)(list - anchor);
1171             }
1172             list++;
1173         }
1174         ++list;     /* skip final NULL *CWB*/
1175     }
1176     return -1;
1177 }
1178
1179 /* count the length of src while copying it to dest; return strlen(src) */
1180 static U_INLINE int32_t
1181 _copyCount(char *dest, int32_t destCapacity, const char *src) {
1182     const char *anchor;
1183     char c;
1184
1185     anchor=src;
1186     for(;;) {
1187         if((c=*src)==0) {
1188             return (int32_t)(src-anchor);
1189         }
1190         if(destCapacity<=0) {
1191             return (int32_t)((src-anchor)+uprv_strlen(src));
1192         }
1193         ++src;
1194         *dest++=c;
1195         --destCapacity;
1196     }
1197 }
1198
1199 U_CFUNC const char*
1200 uloc_getCurrentCountryID(const char* oldID){
1201     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1202     if (offset >= 0) {
1203         return REPLACEMENT_COUNTRIES[offset];
1204     }
1205     return oldID;
1206 }
1207 U_CFUNC const char*
1208 uloc_getCurrentLanguageID(const char* oldID){
1209     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1210     if (offset >= 0) {
1211         return REPLACEMENT_LANGUAGES[offset];
1212     }
1213     return oldID;
1214 }
1215 /*
1216  * the internal functions _getLanguage(), _getCountry(), _getVariant()
1217  * avoid duplicating code to handle the earlier locale ID pieces
1218  * in the functions for the later ones by
1219  * setting the *pEnd pointer to where they stopped parsing
1220  *
1221  * TODO try to use this in Locale
1222  */
1223 U_CFUNC int32_t
1224 ulocimp_getLanguage(const char *localeID,
1225                     char *language, int32_t languageCapacity,
1226                     const char **pEnd) {
1227     int32_t i=0;
1228     int32_t offset;
1229     char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1230
1231     /* if it starts with i- or x- then copy that prefix */
1232     if(_isIDPrefix(localeID)) {
1233         if(i<languageCapacity) {
1234             language[i]=(char)uprv_tolower(*localeID);
1235         }
1236         if(i<languageCapacity) {
1237             language[i+1]='-';
1238         }
1239         i+=2;
1240         localeID+=2;
1241     }
1242
1243     /* copy the language as far as possible and count its length */
1244     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1245         if(i<languageCapacity) {
1246             language[i]=(char)uprv_tolower(*localeID);
1247         }
1248         if(i<3) {
1249             lang[i]=(char)uprv_tolower(*localeID);
1250         }
1251         i++;
1252         localeID++;
1253     }
1254
1255     if(i==3) {
1256         /* convert 3 character code to 2 character code if possible *CWB*/
1257         offset=_findIndex(LANGUAGES_3, lang);
1258         if(offset>=0) {
1259             i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1260         }
1261     }
1262
1263     if(pEnd!=NULL) {
1264         *pEnd=localeID;
1265     }
1266     return i;
1267 }
1268
1269 U_CFUNC int32_t
1270 ulocimp_getScript(const char *localeID,
1271                   char *script, int32_t scriptCapacity,
1272                   const char **pEnd)
1273 {
1274     int32_t idLen = 0;
1275
1276     if (pEnd != NULL) {
1277         *pEnd = localeID;
1278     }
1279
1280     /* copy the second item as far as possible and count its length */
1281     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1282         idLen++;
1283     }
1284
1285     /* If it's exactly 4 characters long, then it's a script and not a country. */
1286     if (idLen == 4) {
1287         int32_t i;
1288         if (pEnd != NULL) {
1289             *pEnd = localeID+idLen;
1290         }
1291         if(idLen > scriptCapacity) {
1292             idLen = scriptCapacity;
1293         }
1294         if (idLen >= 1) {
1295             script[0]=(char)uprv_toupper(*(localeID++));
1296         }
1297         for (i = 1; i < idLen; i++) {
1298             script[i]=(char)uprv_tolower(*(localeID++));
1299         }
1300     }
1301     else {
1302         idLen = 0;
1303     }
1304     return idLen;
1305 }
1306
1307 U_CFUNC int32_t
1308 ulocimp_getCountry(const char *localeID,
1309                    char *country, int32_t countryCapacity,
1310                    const char **pEnd)
1311 {
1312     int32_t idLen=0;
1313     char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1314     int32_t offset;
1315
1316     /* copy the country as far as possible and count its length */
1317     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1318         if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
1319             cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1320         }
1321         idLen++;
1322     }
1323
1324     /* the country should be either length 2 or 3 */
1325     if (idLen == 2 || idLen == 3) {
1326         UBool gotCountry = FALSE;
1327         /* convert 3 character code to 2 character code if possible *CWB*/
1328         if(idLen==3) {
1329             offset=_findIndex(COUNTRIES_3, cnty);
1330             if(offset>=0) {
1331                 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1332                 gotCountry = TRUE;
1333             }
1334         }
1335         if (!gotCountry) {
1336             int32_t i = 0;
1337             for (i = 0; i < idLen; i++) {
1338                 if (i < countryCapacity) {
1339                     country[i]=(char)uprv_toupper(localeID[i]);
1340                 }
1341             }
1342         }
1343         localeID+=idLen;
1344     } else {
1345         idLen = 0;
1346     }
1347
1348     if(pEnd!=NULL) {
1349         *pEnd=localeID;
1350     }
1351
1352     return idLen;
1353 }
1354
1355 /**
1356  * @param needSeparator if true, then add leading '_' if any variants
1357  * are added to 'variant'
1358  */
1359 static int32_t
1360 _getVariantEx(const char *localeID,
1361               char prev,
1362               char *variant, int32_t variantCapacity,
1363               UBool needSeparator) {
1364     int32_t i=0;
1365
1366     /* get one or more variant tags and separate them with '_' */
1367     if(_isIDSeparator(prev)) {
1368         /* get a variant string after a '-' or '_' */
1369         while(!_isTerminator(*localeID)) {
1370             if (needSeparator) {
1371                 if (i<variantCapacity) {
1372                     variant[i] = '_';
1373                 }
1374                 ++i;
1375                 needSeparator = FALSE;
1376             }
1377             if(i<variantCapacity) {
1378                 variant[i]=(char)uprv_toupper(*localeID);
1379                 if(variant[i]=='-') {
1380                     variant[i]='_';
1381                 }
1382             }
1383             i++;
1384             localeID++;
1385         }
1386     }
1387
1388     /* if there is no variant tag after a '-' or '_' then look for '@' */
1389     if(i==0) {
1390         if(prev=='@') {
1391             /* keep localeID */
1392         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1393             ++localeID; /* point after the '@' */
1394         } else {
1395             return 0;
1396         }
1397         while(!_isTerminator(*localeID)) {
1398             if (needSeparator) {
1399                 if (i<variantCapacity) {
1400                     variant[i] = '_';
1401                 }
1402                 ++i;
1403                 needSeparator = FALSE;
1404             }
1405             if(i<variantCapacity) {
1406                 variant[i]=(char)uprv_toupper(*localeID);
1407                 if(variant[i]=='-' || variant[i]==',') {
1408                     variant[i]='_';
1409                 }
1410             }
1411             i++;
1412             localeID++;
1413         }
1414     }
1415
1416     return i;
1417 }
1418
1419 static int32_t
1420 _getVariant(const char *localeID,
1421             char prev,
1422             char *variant, int32_t variantCapacity) {
1423     return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1424 }
1425
1426 /**
1427  * Delete ALL instances of a variant from the given list of one or
1428  * more variants.  Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
1429  * @param variants the source string of one or more variants,
1430  * separated by '_'.  This will be MODIFIED IN PLACE.  Not zero
1431  * terminated; if it is, trailing zero will NOT be maintained.
1432  * @param variantsLen length of variants
1433  * @param toDelete variant to delete, without separators, e.g.  "EURO"
1434  * or "PREEURO"; not zero terminated
1435  * @param toDeleteLen length of toDelete
1436  * @return number of characters deleted from variants
1437  */
1438 static int32_t
1439 _deleteVariant(char* variants, int32_t variantsLen,
1440                const char* toDelete, int32_t toDeleteLen)
1441 {
1442     int32_t delta = 0; /* number of chars deleted */
1443     for (;;) {
1444         UBool flag = FALSE;
1445         if (variantsLen < toDeleteLen) {
1446             return delta;
1447         }
1448         if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
1449             (variantsLen == toDeleteLen ||
1450              (flag=(variants[toDeleteLen] == '_'))))
1451         {
1452             int32_t d = toDeleteLen + (flag?1:0);
1453             variantsLen -= d;
1454             delta += d;
1455             if (variantsLen > 0) {
1456                 uprv_memmove(variants, variants+d, variantsLen);
1457             }
1458         } else {
1459             char* p = _strnchr(variants, variantsLen, '_');
1460             if (p == NULL) {
1461                 return delta;
1462             }
1463             ++p;
1464             variantsLen -= (int32_t)(p - variants);
1465             variants = p;
1466         }
1467     }
1468 }
1469
1470 /* Keyword enumeration */
1471
/*
 * Per-enumeration state stored in UEnumeration.context by
 * uloc_openKeywordList().
 */
typedef struct UKeywordsContext {
    /* Start of the heap copy of the keyword list: consecutive NUL-terminated
       strings, ended by an empty string.  Released by the close callback. */
    char *keywords;
    /* Cursor inside 'keywords': the next keyword to hand out. */
    char *current;
} UKeywordsContext;
1476
1477 static void U_CALLCONV
1478 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1479     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1480     uprv_free(enumerator->context);
1481     uprv_free(enumerator);
1482 }
1483
1484 static int32_t U_CALLCONV
1485 uloc_kw_countKeywords(UEnumeration *en, UErrorCode *status) {
1486     char *kw = ((UKeywordsContext *)en->context)->keywords;
1487     int32_t result = 0;
1488     while(*kw) {
1489         result++;
1490         kw += uprv_strlen(kw)+1;
1491     }
1492     return result;
1493 }
1494
1495 static const char* U_CALLCONV
1496 uloc_kw_nextKeyword(UEnumeration* en,
1497                     int32_t* resultLength,
1498                     UErrorCode* status) {
1499     const char* result = ((UKeywordsContext *)en->context)->current;
1500     int32_t len = 0;
1501     if(*result) {
1502         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1503         ((UKeywordsContext *)en->context)->current += len+1;
1504     } else {
1505         result = NULL;
1506     }
1507     if (resultLength) {
1508         *resultLength = len;
1509     }
1510     return result;
1511 }
1512
1513 static void U_CALLCONV
1514 uloc_kw_resetKeywords(UEnumeration* en,
1515                       UErrorCode* status) {
1516     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1517 }
1518
1519 static const UEnumeration gKeywordsEnum = {
1520     NULL,
1521     NULL,
1522     uloc_kw_closeKeywords,
1523     uloc_kw_countKeywords,
1524     uenum_unextDefault,
1525     uloc_kw_nextKeyword,
1526     uloc_kw_resetKeywords
1527 };
1528
1529 U_CAPI UEnumeration* U_EXPORT2
1530 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1531 {
1532     UKeywordsContext *myContext = NULL;
1533     UEnumeration *result = NULL;
1534
1535     if(U_FAILURE(*status)) {
1536         return NULL;
1537     }
1538     result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1539     /* Null pointer test */
1540     if (result == NULL) {
1541         *status = U_MEMORY_ALLOCATION_ERROR;
1542         return NULL;
1543     }
1544     uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1545     myContext = uprv_malloc(sizeof(UKeywordsContext));
1546     if (myContext == NULL) {
1547         *status = U_MEMORY_ALLOCATION_ERROR;
1548         uprv_free(result);
1549         return NULL;
1550     }
1551     myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1552     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1553     myContext->keywords[keywordListSize] = 0;
1554     myContext->current = myContext->keywords;
1555     result->context = myContext;
1556     return result;
1557 }
1558
1559 U_CAPI UEnumeration* U_EXPORT2
1560 uloc_openKeywords(const char* localeID,
1561                         UErrorCode* status)
1562 {
1563     int32_t i=0;
1564     char keywords[256];
1565     int32_t keywordsCapacity = 256;
1566     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1567     const char* tmpLocaleID;
1568
1569     if(status==NULL || U_FAILURE(*status)) {
1570         return 0;
1571     }
1572
1573     if (_hasBCP47Extension(localeID)) {
1574         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1575     } else {
1576         if (localeID==NULL) {
1577            localeID=uloc_getDefault();
1578         }
1579         tmpLocaleID=localeID;
1580     }
1581
1582     /* Skip the language */
1583     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1584     if(_isIDSeparator(*tmpLocaleID)) {
1585         const char *scriptID;
1586         /* Skip the script if available */
1587         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1588         if(scriptID != tmpLocaleID+1) {
1589             /* Found optional script */
1590             tmpLocaleID = scriptID;
1591         }
1592         /* Skip the Country */
1593         if (_isIDSeparator(*tmpLocaleID)) {
1594             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1595             if(_isIDSeparator(*tmpLocaleID)) {
1596                 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1597             }
1598         }
1599     }
1600
1601     /* keywords are located after '@' */
1602     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1603         i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1604     }
1605
1606     if(i) {
1607         return uloc_openKeywordList(keywords, i, status);
1608     } else {
1609         return NULL;
1610     }
1611 }
1612
1613
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* Non-zero if any bit of 'mask' is set in 'options'.  Both arguments are
   parenthesized so that compound expressions (e.g. A | B) test as a unit. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The "i-default" tag as a plain character array; there is deliberately no
   terminating NUL, so always use it together with I_DEFAULT_LENGTH. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH (sizeof i_default / sizeof i_default[0])
1622
1623 /**
1624  * Canonicalize the given localeID, to level 1 or to level 2,
1625  * depending on the options.  To specify level 1, pass in options=0.
1626  * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1627  *
1628  * This is the code underlying uloc_getName and uloc_canonicalize.
1629  */
1630 static int32_t
1631 _canonicalize(const char* localeID,
1632               char* result,
1633               int32_t resultCapacity,
1634               uint32_t options,
1635               UErrorCode* err) {
1636     int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1637     char localeBuffer[ULOC_FULLNAME_CAPACITY];
1638     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1639     const char* origLocaleID;
1640     const char* tmpLocaleID;
1641     const char* keywordAssign = NULL;
1642     const char* separatorIndicator = NULL;
1643     const char* addKeyword = NULL;
1644     const char* addValue = NULL;
1645     char* name;
1646     char* variant = NULL; /* pointer into name, or NULL */
1647
1648     if (U_FAILURE(*err)) {
1649         return 0;
1650     }
1651
1652     if (_hasBCP47Extension(localeID)) {
1653         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1654     } else {
1655         if (localeID==NULL) {
1656            localeID=uloc_getDefault();
1657         }
1658         tmpLocaleID=localeID;
1659     }
1660
1661     origLocaleID=tmpLocaleID;
1662
1663     /* if we are doing a full canonicalization, then put results in
1664        localeBuffer, if necessary; otherwise send them to result. */
1665     if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1666         (result == NULL || resultCapacity <  sizeof(localeBuffer))) {
1667         name = localeBuffer;
1668         nameCapacity = sizeof(localeBuffer);
1669     } else {
1670         name = result;
1671         nameCapacity = resultCapacity;
1672     }
1673
1674     /* get all pieces, one after another, and separate with '_' */
1675     len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1676
1677     if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1678         const char *d = uloc_getDefault();
1679
1680         len = (int32_t)uprv_strlen(d);
1681
1682         if (name != NULL) {
1683             uprv_strncpy(name, d, len);
1684         }
1685     } else if(_isIDSeparator(*tmpLocaleID)) {
1686         const char *scriptID;
1687
1688         ++fieldCount;
1689         if(len<nameCapacity) {
1690             name[len]='_';
1691         }
1692         ++len;
1693
1694         scriptSize=ulocimp_getScript(tmpLocaleID+1, name+len, nameCapacity-len, &scriptID);
1695         if(scriptSize > 0) {
1696             /* Found optional script */
1697             tmpLocaleID = scriptID;
1698             ++fieldCount;
1699             len+=scriptSize;
1700             if (_isIDSeparator(*tmpLocaleID)) {
1701                 /* If there is something else, then we add the _ */
1702                 if(len<nameCapacity) {
1703                     name[len]='_';
1704                 }
1705                 ++len;
1706             }
1707         }
1708
1709         if (_isIDSeparator(*tmpLocaleID)) {
1710             const char *cntryID;
1711             int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1, name+len, nameCapacity-len, &cntryID);
1712             if (cntrySize > 0) {
1713                 /* Found optional country */
1714                 tmpLocaleID = cntryID;
1715                 len+=cntrySize;
1716             }
1717             if(_isIDSeparator(*tmpLocaleID)) {
1718                 /* If there is something else, then we add the _  if we found country before.*/
1719                 if (cntrySize > 0) {
1720                     ++fieldCount;
1721                     if(len<nameCapacity) {
1722                         name[len]='_';
1723                     }
1724                     ++len;
1725                 }
1726
1727                 variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID, name+len, nameCapacity-len);
1728                 if (variantSize > 0) {
1729                     variant = name+len;
1730                     len += variantSize;
1731                     tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1732                 }
1733             }
1734         }
1735     }
1736
1737     /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1738     if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1739         UBool done = FALSE;
1740         do {
1741             char c = *tmpLocaleID;
1742             switch (c) {
1743             case 0:
1744             case '@':
1745                 done = TRUE;
1746                 break;
1747             default:
1748                 if (len<nameCapacity) {
1749                     name[len] = c;
1750                 }
1751                 ++len;
1752                 ++tmpLocaleID;
1753                 break;
1754             }
1755         } while (!done);
1756     }
1757
1758     /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1759        After this, tmpLocaleID either points to '@' or is NULL */
1760     if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1761         keywordAssign = uprv_strchr(tmpLocaleID, '=');
1762         separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1763     }
1764
1765     /* Copy POSIX-style variant, if any [mr@FOO] */
1766     if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1767         tmpLocaleID != NULL && keywordAssign == NULL) {
1768         for (;;) {
1769             char c = *tmpLocaleID;
1770             if (c == 0) {
1771                 break;
1772             }
1773             if (len<nameCapacity) {
1774                 name[len] = c;
1775             }
1776             ++len;
1777             ++tmpLocaleID;
1778         }
1779     }
1780
1781     if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1782         /* Handle @FOO variant if @ is present and not followed by = */
1783         if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1784             int32_t posixVariantSize;
1785             /* Add missing '_' if needed */
1786             if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1787                 do {
1788                     if(len<nameCapacity) {
1789                         name[len]='_';
1790                     }
1791                     ++len;
1792                     ++fieldCount;
1793                 } while(fieldCount<2);
1794             }
1795             posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1796                                              (UBool)(variantSize > 0));
1797             if (posixVariantSize > 0) {
1798                 if (variant == NULL) {
1799                     variant = name+len;
1800                 }
1801                 len += posixVariantSize;
1802                 variantSize += posixVariantSize;
1803             }
1804         }
1805
1806         /* Handle generic variants first */
1807         if (variant) {
1808             for (j=0; j<(int32_t)(sizeof(VARIANT_MAP)/sizeof(VARIANT_MAP[0])); j++) {
1809                 const char* variantToCompare = VARIANT_MAP[j].variant;
1810                 int32_t n = (int32_t)uprv_strlen(variantToCompare);
1811                 int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
1812                 len -= variantLen;
1813                 if (variantLen > 0) {
1814                     if (len > 0 && name[len-1] == '_') { /* delete trailing '_' */
1815                         --len;
1816                     }
1817                     addKeyword = VARIANT_MAP[j].keyword;
1818                     addValue = VARIANT_MAP[j].value;
1819                     break;
1820                 }
1821             }
1822             if (len > 0 && len <= nameCapacity && name[len-1] == '_') { /* delete trailing '_' */
1823                 --len;
1824             }
1825         }
1826
1827         /* Look up the ID in the canonicalization map */
1828         for (j=0; j<(int32_t)(sizeof(CANONICALIZE_MAP)/sizeof(CANONICALIZE_MAP[0])); j++) {
1829             const char* id = CANONICALIZE_MAP[j].id;
1830             int32_t n = (int32_t)uprv_strlen(id);
1831             if (len == n && uprv_strncmp(name, id, n) == 0) {
1832                 if (n == 0 && tmpLocaleID != NULL) {
1833                     break; /* Don't remap "" if keywords present */
1834                 }
1835                 len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1836                 if (CANONICALIZE_MAP[j].keyword) {
1837                     addKeyword = CANONICALIZE_MAP[j].keyword;
1838                     addValue = CANONICALIZE_MAP[j].value;
1839                 }
1840                 break;
1841             }
1842         }
1843     }
1844
1845     if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1846         if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1847             (!separatorIndicator || separatorIndicator > keywordAssign)) {
1848             if(len<nameCapacity) {
1849                 name[len]='@';
1850             }
1851             ++len;
1852             ++fieldCount;
1853             len += _getKeywords(tmpLocaleID+1, '@', name+len, nameCapacity-len, NULL, 0, NULL, TRUE,
1854                                 addKeyword, addValue, err);
1855         } else if (addKeyword != NULL) {
1856             U_ASSERT(addValue != NULL);
1857             /* inelegant but works -- later make _getKeywords do this? */
1858             len += _copyCount(name+len, nameCapacity-len, "@");
1859             len += _copyCount(name+len, nameCapacity-len, addKeyword);
1860             len += _copyCount(name+len, nameCapacity-len, "=");
1861             len += _copyCount(name+len, nameCapacity-len, addValue);
1862         }
1863     }
1864
1865     if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1866         uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1867     }
1868
1869     return u_terminateChars(result, resultCapacity, len, err);
1870 }
1871
1872 /* ### ID parsing API **************************************************/
1873
1874 U_CAPI int32_t  U_EXPORT2
1875 uloc_getParent(const char*    localeID,
1876                char* parent,
1877                int32_t parentCapacity,
1878                UErrorCode* err)
1879 {
1880     const char *lastUnderscore;
1881     int32_t i;
1882
1883     if (U_FAILURE(*err))
1884         return 0;
1885
1886     if (localeID == NULL)
1887         localeID = uloc_getDefault();
1888
1889     lastUnderscore=uprv_strrchr(localeID, '_');
1890     if(lastUnderscore!=NULL) {
1891         i=(int32_t)(lastUnderscore-localeID);
1892     } else {
1893         i=0;
1894     }
1895
1896     if(i>0 && parent != localeID) {
1897         uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1898     }
1899     return u_terminateChars(parent, parentCapacity, i, err);
1900 }
1901
1902 U_CAPI int32_t U_EXPORT2
1903 uloc_getLanguage(const char*    localeID,
1904          char* language,
1905          int32_t languageCapacity,
1906          UErrorCode* err)
1907 {
1908     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1909     int32_t i=0;
1910
1911     if (err==NULL || U_FAILURE(*err)) {
1912         return 0;
1913     }
1914
1915     if(localeID==NULL) {
1916         localeID=uloc_getDefault();
1917     }
1918
1919     i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1920     return u_terminateChars(language, languageCapacity, i, err);
1921 }
1922
1923 U_CAPI int32_t U_EXPORT2
1924 uloc_getScript(const char*    localeID,
1925          char* script,
1926          int32_t scriptCapacity,
1927          UErrorCode* err)
1928 {
1929     int32_t i=0;
1930
1931     if(err==NULL || U_FAILURE(*err)) {
1932         return 0;
1933     }
1934
1935     if(localeID==NULL) {
1936         localeID=uloc_getDefault();
1937     }
1938
1939     /* skip the language */
1940     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1941     if(_isIDSeparator(*localeID)) {
1942         i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1943     }
1944     return u_terminateChars(script, scriptCapacity, i, err);
1945 }
1946
1947 U_CAPI int32_t  U_EXPORT2
1948 uloc_getCountry(const char* localeID,
1949             char* country,
1950             int32_t countryCapacity,
1951             UErrorCode* err)
1952 {
1953     int32_t i=0;
1954
1955     if(err==NULL || U_FAILURE(*err)) {
1956         return 0;
1957     }
1958
1959     if(localeID==NULL) {
1960         localeID=uloc_getDefault();
1961     }
1962
1963     /* Skip the language */
1964     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1965     if(_isIDSeparator(*localeID)) {
1966         const char *scriptID;
1967         /* Skip the script if available */
1968         ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1969         if(scriptID != localeID+1) {
1970             /* Found optional script */
1971             localeID = scriptID;
1972         }
1973         if(_isIDSeparator(*localeID)) {
1974             i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1975         }
1976     }
1977     return u_terminateChars(country, countryCapacity, i, err);
1978 }
1979
1980 U_CAPI int32_t  U_EXPORT2
1981 uloc_getVariant(const char* localeID,
1982                 char* variant,
1983                 int32_t variantCapacity,
1984                 UErrorCode* err)
1985 {
1986     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1987     const char* tmpLocaleID;
1988     int32_t i=0;
1989
1990     if(err==NULL || U_FAILURE(*err)) {
1991         return 0;
1992     }
1993
1994     if (_hasBCP47Extension(localeID)) {
1995         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1996     } else {
1997         if (localeID==NULL) {
1998            localeID=uloc_getDefault();
1999         }
2000         tmpLocaleID=localeID;
2001     }
2002
2003     /* Skip the language */
2004     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
2005     if(_isIDSeparator(*tmpLocaleID)) {
2006         const char *scriptID;
2007         /* Skip the script if available */
2008         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
2009         if(scriptID != tmpLocaleID+1) {
2010             /* Found optional script */
2011             tmpLocaleID = scriptID;
2012         }
2013         /* Skip the Country */
2014         if (_isIDSeparator(*tmpLocaleID)) {
2015             const char *cntryID;
2016             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2017             if (cntryID != tmpLocaleID+1) {
2018                 /* Found optional country */
2019                 tmpLocaleID = cntryID;
2020             }
2021             if(_isIDSeparator(*tmpLocaleID)) {
2022                 /* If there was no country ID, skip a possible extra IDSeparator */
2023                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2024                     tmpLocaleID++;
2025                 }
2026                 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2027             }
2028         }
2029     }
2030
2031     /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2032     /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2033 /*
2034     if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2035         i=_getVariant(localeID+1, '@', variant, variantCapacity);
2036     }
2037 */
2038     return u_terminateChars(variant, variantCapacity, i, err);
2039 }
2040
2041 U_CAPI int32_t  U_EXPORT2
2042 uloc_getName(const char* localeID,
2043              char* name,
2044              int32_t nameCapacity,
2045              UErrorCode* err)
2046 {
2047     return _canonicalize(localeID, name, nameCapacity, 0, err);
2048 }
2049
2050 U_CAPI int32_t  U_EXPORT2
2051 uloc_getBaseName(const char* localeID,
2052                  char* name,
2053                  int32_t nameCapacity,
2054                  UErrorCode* err)
2055 {
2056     return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
2057 }
2058
2059 U_CAPI int32_t  U_EXPORT2
2060 uloc_canonicalize(const char* localeID,
2061                   char* name,
2062                   int32_t nameCapacity,
2063                   UErrorCode* err)
2064 {
2065     return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
2066 }
2067
2068 U_CAPI const char*  U_EXPORT2
2069 uloc_getISO3Language(const char* localeID)
2070 {
2071     int16_t offset;
2072     char lang[ULOC_LANG_CAPACITY];
2073     UErrorCode err = U_ZERO_ERROR;
2074
2075     if (localeID == NULL)
2076     {
2077         localeID = uloc_getDefault();
2078     }
2079     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2080     if (U_FAILURE(err))
2081         return "";
2082     offset = _findIndex(LANGUAGES, lang);
2083     if (offset < 0)
2084         return "";
2085     return LANGUAGES_3[offset];
2086 }
2087
2088 U_CAPI const char*  U_EXPORT2
2089 uloc_getISO3Country(const char* localeID)
2090 {
2091     int16_t offset;
2092     char cntry[ULOC_LANG_CAPACITY];
2093     UErrorCode err = U_ZERO_ERROR;
2094
2095     if (localeID == NULL)
2096     {
2097         localeID = uloc_getDefault();
2098     }
2099     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2100     if (U_FAILURE(err))
2101         return "";
2102     offset = _findIndex(COUNTRIES, cntry);
2103     if (offset < 0)
2104         return "";
2105
2106     return COUNTRIES_3[offset];
2107 }
2108
2109 U_CAPI uint32_t  U_EXPORT2
2110 uloc_getLCID(const char* localeID)
2111 {
2112     UErrorCode status = U_ZERO_ERROR;
2113     char       langID[ULOC_FULLNAME_CAPACITY];
2114
2115     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2116     if (U_FAILURE(status)) {
2117         return 0;
2118     }
2119
2120     return uprv_convertToLCID(langID, localeID, &status);
2121 }
2122
2123 U_CAPI int32_t U_EXPORT2
2124 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2125                 UErrorCode *status)
2126 {
2127     int32_t length;
2128     const char *posix = uprv_convertToPosix(hostid, status);
2129     if (U_FAILURE(*status) || posix == NULL) {
2130         return 0;
2131     }
2132     length = (int32_t)uprv_strlen(posix);
2133     if (length+1 > localeCapacity) {
2134         *status = U_BUFFER_OVERFLOW_ERROR;
2135     }
2136     else {
2137         uprv_strcpy(locale, posix);
2138     }
2139     return length;
2140 }
2141
2142 /* ### Default locale **************************************************/
2143
2144 U_CAPI const char*  U_EXPORT2
2145 uloc_getDefault()
2146 {
2147     return locale_get_default();
2148 }
2149
2150 U_CAPI void  U_EXPORT2
2151 uloc_setDefault(const char*   newDefaultLocale,
2152              UErrorCode* err)
2153 {
2154     if (U_FAILURE(*err))
2155         return;
2156     /* the error code isn't currently used for anything by this function*/
2157
2158     /* propagate change to C++ */
2159     locale_set_default(newDefaultLocale);
2160 }
2161
2162 /**
2163  * Returns a list of all language codes defined in ISO 639.  This is a pointer
2164  * to an array of pointers to arrays of char.  All of these pointers are owned
2165  * by ICU-- do not delete them, and do not write through them.  The array is
2166  * terminated with a null pointer.
2167  */
2168 U_CAPI const char* const*  U_EXPORT2
2169 uloc_getISOLanguages()
2170 {
2171     return LANGUAGES;
2172 }
2173
2174 /**
2175  * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2176  * pointer to an array of pointers to arrays of char.  All of these pointers are
2177  * owned by ICU-- do not delete them, and do not write through them.  The array is
2178  * terminated with a null pointer.
2179  */
2180 U_CAPI const char* const*  U_EXPORT2
2181 uloc_getISOCountries()
2182 {
2183     return COUNTRIES;
2184 }
2185
2186
/* this function to be moved into cstring.c later */

/* Decimal separator that sprintf() currently produces; 0 until first use. */
static char gDecimal = 0;

/*
 * strtod() variant for input that always uses '.' as the decimal point.
 *
 * If the C runtime formats numbers with '.', this is just uprv_strtod().
 * Otherwise the first 29 characters of 'start' are copied, the first '.'
 * in the copy is replaced by the runtime's separator, and the copy is
 * parsed; '*end' is then mapped back to the same offset in 'start'.
 */
static /* U_CAPI */
double
/* U_EXPORT2 */
_uloc_strtod(const char *start, char **end) {
    char scratch[30];
    char *dot;
    char *scratchEnd;
    double value;

    if (gDecimal == 0) {
        char rep[5];
        /* For machines that decide to change the decimal on you,
        and try to be too smart with localization.
        This normally should be just a '.'. */
        sprintf(rep, "%+1.1f", 1.0);
        gDecimal = rep[2];
    }

    if (gDecimal == '.') {
        return uprv_strtod(start, end); /* fall through to OS */
    }

    uprv_strncpy(scratch, start, 29);
    scratch[29] = 0;
    dot = uprv_strchr(scratch, '.');
    if (dot == NULL) {
        return uprv_strtod(start, end); /* no decimal point */
    }
    *dot = gDecimal;

    value = uprv_strtod(scratch, &scratchEnd);
    if (end != NULL) {
        *end = (char*)(start+(scratchEnd-scratch)); /* cast away const (to follow uprv_strtod API.) */
    }
    return value;
}
2225
/* One entry of an Accept-Language list: a locale name and its q-value. */
typedef struct {
    float q;        /* quality value of the entry */
    int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
    char *locale;   /* locale name of the entry */
} _acceptLangItem;
2231
2232 static int32_t U_CALLCONV
2233 uloc_acceptLanguageCompare(const void *context, const void *a, const void *b)
2234 {
2235     const _acceptLangItem *aa = (const _acceptLangItem*)a;
2236     const _acceptLangItem *bb = (const _acceptLangItem*)b;
2237
2238     int32_t rc = 0;
2239     if(bb->q < aa->q) {
2240         rc = -1;  /* A > B */
2241     } else if(bb->q > aa->q) {
2242         rc = 1;   /* A < B */
2243     } else {
2244         rc = 0;   /* A = B */
2245     }
2246
2247     if(rc==0) {
2248         rc = uprv_stricmp(aa->locale, bb->locale);
2249     }
2250
2251 #if defined(ULOC_DEBUG)
2252     /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2253     aa->locale, aa->q,
2254     bb->locale, bb->q,
2255     rc);*/
2256 #endif
2257
2258     return rc;
2259 }
2260
2261 /*
2262 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2263 */
2264
2265 U_CAPI int32_t U_EXPORT2
2266 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2267                             const char *httpAcceptLanguage,
2268                             UEnumeration* availableLocales,
2269                             UErrorCode *status)
2270 {
2271     _acceptLangItem *j;
2272     _acceptLangItem smallBuffer[30];
2273     char **strs;
2274     char tmp[ULOC_FULLNAME_CAPACITY +1];
2275     int32_t n = 0;
2276     const char *itemEnd;
2277     const char *paramEnd;
2278     const char *s;
2279     const char *t;
2280     int32_t res;
2281     int32_t i;
2282     int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2283     int32_t jSize;
2284     char *tempstr; /* Use for null pointer check */
2285
2286     j = smallBuffer;
2287     jSize = sizeof(smallBuffer)/sizeof(smallBuffer[0]);
2288     if(U_FAILURE(*status)) {
2289         return -1;
2290     }
2291
2292     for(s=httpAcceptLanguage;s&&*s;) {
2293         while(isspace(*s)) /* eat space at the beginning */
2294             s++;
2295         itemEnd=uprv_strchr(s,',');
2296         paramEnd=uprv_strchr(s,';');
2297         if(!itemEnd) {
2298             itemEnd = httpAcceptLanguage+l; /* end of string */
2299         }
2300         if(paramEnd && paramEnd<itemEnd) {
2301             /* semicolon (;) is closer than end (,) */
2302             t = paramEnd+1;
2303             if(*t=='q') {
2304                 t++;
2305             }
2306             while(isspace(*t)) {
2307                 t++;
2308             }
2309             if(*t=='=') {
2310                 t++;
2311             }
2312             while(isspace(*t)) {
2313                 t++;
2314             }
2315             j[n].q = (float)_uloc_strtod(t,NULL);
2316         } else {
2317             /* no semicolon - it's 1.0 */
2318             j[n].q = 1.0f;
2319             paramEnd = itemEnd;
2320         }
2321         j[n].dummy=0;
2322         /* eat spaces prior to semi */
2323         for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2324             ;
2325         /* Check for null pointer from uprv_strndup */
2326         tempstr = uprv_strndup(s,(int32_t)((t+1)-s));
2327         if (tempstr == NULL) {
2328             *status = U_MEMORY_ALLOCATION_ERROR;
2329             return -1;
2330         }
2331         j[n].locale = tempstr;
2332         uloc_canonicalize(j[n].locale,tmp,sizeof(tmp)/sizeof(tmp[0]),status);
2333         if(strcmp(j[n].locale,tmp)) {
2334             uprv_free(j[n].locale);
2335             j[n].locale=uprv_strdup(tmp);
2336         }
2337 #if defined(ULOC_DEBUG)
2338         /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2339 #endif
2340         n++;
2341         s = itemEnd;
2342         while(*s==',') { /* eat duplicate commas */
2343             s++;
2344         }
2345         if(n>=jSize) {
2346             if(j==smallBuffer) {  /* overflowed the small buffer. */
2347                 j = uprv_malloc(sizeof(j[0])*(jSize*2));
2348                 if(j!=NULL) {
2349                     uprv_memcpy(j,smallBuffer,sizeof(j[0])*jSize);
2350                 }
2351 #if defined(ULOC_DEBUG)
2352                 fprintf(stderr,"malloced at size %d\n", jSize);
2353 #endif
2354             } else {
2355                 j = uprv_realloc(j, sizeof(j[0])*jSize*2);
2356 #if defined(ULOC_DEBUG)
2357                 fprintf(stderr,"re-alloced at size %d\n", jSize);
2358 #endif
2359             }
2360             jSize *= 2;
2361             if(j==NULL) {
2362                 *status = U_MEMORY_ALLOCATION_ERROR;
2363                 return -1;
2364             }
2365         }
2366     }
2367     uprv_sortArray(j, n, sizeof(j[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2368     if(U_FAILURE(*status)) {
2369         if(j != smallBuffer) {
2370 #if defined(ULOC_DEBUG)
2371             fprintf(stderr,"freeing j %p\n", j);
2372 #endif
2373             uprv_free(j);
2374         }
2375         return -1;
2376     }
2377     strs = uprv_malloc((size_t)(sizeof(strs[0])*n));
2378     /* Check for null pointer */
2379     if (strs == NULL) {
2380         uprv_free(j); /* Free to avoid memory leak */
2381         *status = U_MEMORY_ALLOCATION_ERROR;
2382         return -1;
2383     }
2384     for(i=0;i<n;i++) {
2385 #if defined(ULOC_DEBUG)
2386         /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2387 #endif
2388         strs[i]=j[i].locale;
2389     }
2390     res =  uloc_acceptLanguage(result, resultAvailable, outResult,
2391         (const char**)strs, n, availableLocales, status);
2392     for(i=0;i<n;i++) {
2393         uprv_free(strs[i]);
2394     }
2395     uprv_free(strs);
2396     if(j != smallBuffer) {
2397 #if defined(ULOC_DEBUG)
2398         fprintf(stderr,"freeing j %p\n", j);
2399 #endif
2400         uprv_free(j);
2401     }
2402     return res;
2403 }
2404
2405
2406 U_CAPI int32_t U_EXPORT2
2407 uloc_acceptLanguage(char *result, int32_t resultAvailable,
2408                     UAcceptResult *outResult, const char **acceptList,
2409                     int32_t acceptListCount,
2410                     UEnumeration* availableLocales,
2411                     UErrorCode *status)
2412 {
2413     int32_t i,j;
2414     int32_t len;
2415     int32_t maxLen=0;
2416     char tmp[ULOC_FULLNAME_CAPACITY+1];
2417     const char *l;
2418     char **fallbackList;
2419     if(U_FAILURE(*status)) {
2420         return -1;
2421     }
2422     fallbackList = uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount));
2423     if(fallbackList==NULL) {
2424         *status = U_MEMORY_ALLOCATION_ERROR;
2425         return -1;
2426     }
2427     for(i=0;i<acceptListCount;i++) {
2428 #if defined(ULOC_DEBUG)
2429         fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2430 #endif
2431         while((l=uenum_next(availableLocales, NULL, status))) {
2432 #if defined(ULOC_DEBUG)
2433             fprintf(stderr,"  %s\n", l);
2434 #endif
2435             len = (int32_t)uprv_strlen(l);
2436             if(!uprv_strcmp(acceptList[i], l)) {
2437                 if(outResult) {
2438                     *outResult = ULOC_ACCEPT_VALID;
2439                 }
2440 #if defined(ULOC_DEBUG)
2441                 fprintf(stderr, "MATCH! %s\n", l);
2442 #endif
2443                 if(len>0) {
2444                     uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2445                 }
2446                 for(j=0;j<i;j++) {
2447                     uprv_free(fallbackList[j]);
2448                 }
2449                 uprv_free(fallbackList);
2450                 return u_terminateChars(result, resultAvailable, len, status);
2451             }
2452             if(len>maxLen) {
2453                 maxLen = len;
2454             }
2455         }
2456         uenum_reset(availableLocales, status);
2457         /* save off parent info */
2458         if(uloc_getParent(acceptList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2459             fallbackList[i] = uprv_strdup(tmp);
2460         } else {
2461             fallbackList[i]=0;
2462         }
2463     }
2464
2465     for(maxLen--;maxLen>0;maxLen--) {
2466         for(i=0;i<acceptListCount;i++) {
2467             if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2468 #if defined(ULOC_DEBUG)
2469                 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2470 #endif
2471                 while((l=uenum_next(availableLocales, NULL, status))) {
2472 #if defined(ULOC_DEBUG)
2473                     fprintf(stderr,"  %s\n", l);
2474 #endif
2475                     len = (int32_t)uprv_strlen(l);
2476                     if(!uprv_strcmp(fallbackList[i], l)) {
2477                         if(outResult) {
2478                             *outResult = ULOC_ACCEPT_FALLBACK;
2479                         }
2480 #if defined(ULOC_DEBUG)
2481                         fprintf(stderr, "fallback MATCH! %s\n", l);
2482 #endif
2483                         if(len>0) {
2484                             uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2485                         }
2486                         for(j=0;j<acceptListCount;j++) {
2487                             uprv_free(fallbackList[j]);
2488                         }
2489                         uprv_free(fallbackList);
2490                         return u_terminateChars(result, resultAvailable, len, status);
2491                     }
2492                 }
2493                 uenum_reset(availableLocales, status);
2494
2495                 if(uloc_getParent(fallbackList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2496                     uprv_free(fallbackList[i]);
2497                     fallbackList[i] = uprv_strdup(tmp);
2498                 } else {
2499                     uprv_free(fallbackList[i]);
2500                     fallbackList[i]=0;
2501                 }
2502             }
2503         }
2504         if(outResult) {
2505             *outResult = ULOC_ACCEPT_FAILED;
2506         }
2507     }
2508     for(i=0;i<acceptListCount;i++) {
2509         uprv_free(fallbackList[i]);
2510     }
2511     uprv_free(fallbackList);
2512     return -1;
2513 }
2514
2515 /*eof*/