src/third_party/icu/source/data/mappings/convrtrs.txt

   1 # ******************************************************************************
   2 # *
   3 # *   Copyright (C) 1995-2014, International Business Machines
   4 # *   Corporation and others.  All Rights Reserved.
   5 # *
   6 # ******************************************************************************
   7
   8 # If this converter alias table looks very confusing, a much easier to
   9 # understand view can be found at this demo:
  10 # http://demo.icu-project.org/icu-bin/convexp
  11
  12 # IMPORTANT NOTE
  13 #
  14 # This file is not read directly by ICU. If you change it, you need to
  15 # run gencnval, and eventually run pkgdata to update the representation that
  16 # ICU uses for aliases. The gencnval tool will normally compile this file into
  17 # cnvalias.icu. The gencnval -v verbose option will help you when you edit
  18 # this file.
  19
  20 # Please be friendly to the rest of us that edit this table by
  21 # keeping this table free of tabs.
  22
  23 # This is an alias file used by the character set converter.
  24 # A lot of converter information can be found in unicode/ucnv.h, but here
  25 # is more information about this file.
  26 #
  27 # If you are adding a new converter to this list and want to include it in the
  28 # icu data library, please be sure to add an entry to the appropriate ucm*.mk file
  29 # (see ucmfiles.mk for more information).
  30 #
  31 # Here is the file format using BNF-like syntax:
  32 #
  33 # converterTable ::= tags { converterLine* }
  34 # converterLine ::= converterName [ tags ] { taggedAlias* }'\n'
  35 # taggedAlias ::= alias [ tags ]
  36 # tags ::= '{' { tag+ } '}'
  37 # tag ::= standard['*']
  38 # converterName ::= [0-9a-zA-Z:_'-']+
  39 # alias ::= converterName
  40 #
  41 # Except for the converter name, aliases are case insensitive.
  42 # Names are separated by whitespace.
  43 # Line continuation and comment syntax are similar to the GNU make syntax.
  44 # Any lines beginning with whitespace (e.g. U+0020 SPACE or U+0009 HORIZONTAL
  45 # TABULATION) are presumed to be a continuation of the previous line.
  46 # The # symbol starts a comment and the comment continues till the end of
  47 # the line.
  48 #
  49 # The converter
  50 #
  51 # All names can be tagged by including a space-separated list of tags in
  52 # curly braces, as in ISO_8859-1:1987{IANA*} iso-8859-1 { MIME* } or
  53 # some-charset{MIME* IANA*}. The order of tags does not matter, and
  54 # whitespace is allowed between the tagged name and the tags list.
  55 #
  56 # The tags can be used to get standard names using ucnv_getStandardName().
  57 #
  58 # The complete list of recognized tags used in this file is defined in
  59 # the affinity list near the beginning of the file.
  60 #
  61 # The * after the standard tag denotes that the previous alias is the
  62 # preferred (default) charset name for that standard. There can only
  63 # be one of these default charset names per converter.
  64
  65
  66
  67 # The world is getting more complicated...
  68 # Supporting XML parsers, HTML, MIME, and similar applications
  69 # that mark encodings with a charset name can be difficult.
  70 # Many of these applications and operating systems will update
  71 # their codepages over time.
  72
  73 # It means that a new codepage, one that differs from an
  74 # old one by changing a code point, e.g., to the Euro sign,
  75 # must not get an old alias, because it would mean that
  76 # old files with this alias would be interpreted differently.
  77
  78 # If a codepage gets updated by assigning characters to previously
  79 # unassigned code points, then a new name is not necessary.
  80 # Also, some codepages map unassigned codepage byte values
  81 # to the same numbers in Unicode for roundtripping. It may be
  82 # industry practice to keep the encoding name in such a case, too
  83 # (example: Windows codepages).
  84
  85 # The aliases listed in the list of character sets
  86 # that is maintained by the IANA (http://www.iana.org/) must
  87 # not be changed to mean encodings different from what this
  88 # list shows. Currently, the IANA list is at
  89 # http://www.iana.org/assignments/character-sets
  90 # It should also be mentioned that the exact mapping table used for each
  91 # IANA name usually isn't specified. This means that some other applications
  92 # and operating systems are left to interpret the exact mappings for the
  93 # underspecified aliases. For instance, Shift-JIS on a Solaris platform
  94 # may be different from Shift-JIS on a Windows platform. This is why
  95 # some of the aliases can be tagged to differentiate different mapping
  96 # tables with the same alias. If an alias is given to more than one converter,
  97 # it is considered to be an ambiguous alias, and the affinity list will
  98 # choose the converter to use when a standard isn't specified with the alias.
  99
 100 # Name matching is case-insensitive. Also, dashes '-', underscores '_'
 101 # and spaces ' ' are ignored in names (thus cs-iso_latin-1, csisolatin1
 102 # and "cs iso latin 1" are the same).
 103 # However, the names in the left column are directly file names
 104 # or names of algorithmic converters, and their case must not
 105 # be changed - or else code and/or file names must also be changed.
 106 # For example, the converter ibm-921 is expected to be the file ibm-921.cnv.
 107
 108
 109
 110 # The immediately following list is the affinity list of supported standard tags.
 111 # When multiple converters have the same alias under different standards,
 112 # the standard nearest to the top of this list with that alias will
 113 # be the first converter that will be opened. The ordering of the aliases
 114 # after this affinity list does not affect the preferred alias, but it may
 115 # affect the order of the returned list of aliases for a given converter.
 116 #
 117 # The general ordering is from specific and frequently used to more general
 118 # or rarely used at the bottom.
 119 {
 120     UTR22           # Name format specified by http://www.unicode.org/unicode/reports/tr22/
 121     HTML            # WHATWG's encoding spec; https://encoding.spec.whatwg.org
 122     IANA            # Source: http://www.iana.org/assignments/character-sets
 123     MIME            # Source: http://www.iana.org/assignments/character-sets
 124     }
 125
 126 UTF-8 { MIME* HTML* }
 127     unicode-1-1-utf-8
 128     utf8
 129
 130 utf-16be { MIME* HTML* }
 131
 132 utf-16le { MIME* HTML* }
 133     utf-16
 134
 135 # Keep UTF-32 entries for now until we sort out Blink's behavior when
 136 # UTF-32 is dropped.
 137 UTF-32 { IANA* MIME* }          ISO-10646-UCS-4 { IANA }
 138                                 csUCS4
 139                                 ucs-4
 140 UTF-32BE { IANA* }              UTF32_BigEndian
 141 UTF-32LE { IANA* }              UTF32_LittleEndian
 142
 143 ibm866-html
 144     IBM866 { MIME* HTML* }
 145     866
 146     cp866
 147     csibm866
 148
 149 iso-8859-2-html
 150     ISO-8859-2 { MIME* HTML* }
 151     csisolatin2
 152     iso-ir-101
 153     iso8859-2
 154     iso88592
 155     iso_8859-2
 156     iso_8859-2:1987
 157     l2
 158     latin2
 159
 160 iso-8859-3-html
 161     ISO-8859-3 { MIME* HTML* }
 162     csisolatin3
 163     iso-ir-109
 164     iso8859-3
 165     iso88593
 166     iso_8859-3
 167     iso_8859-3:1988
 168     l3
 169     latin3
 170
 171 iso-8859-4-html
 172     ISO-8859-4 { MIME* HTML* }
 173     csisolatin4
 174     iso-ir-110
 175     iso8859-4
 176     iso88594
 177     iso_8859-4
 178     iso_8859-4:1988
 179     l4
 180     latin4
 181
 182 iso-8859-5-html
 183     ISO-8859-5 { MIME* HTML* }
 184     csisolatincyrillic
 185     cyrillic
 186     iso-ir-144
 187     iso8859-5
 188     iso88595
 189     iso_8859-5
 190     iso_8859-5:1988
 191
 192 iso-8859-6-html
 193     ISO-8859-6 { MIME* HTML* }
 194     arabic
 195     asmo-708
 196     csiso88596e
 197     csiso88596i
 198     csisolatinarabic
 199     ecma-114
 200     iso-8859-6-e
 201     iso-8859-6-i
 202     iso-ir-127
 203     iso8859-6
 204     iso88596
 205     iso_8859-6
 206     iso_8859-6:1987
 207
 208 iso-8859-7-html
 209     ISO-8859-7 { MIME* HTML* }
 210     csisolatingreek
 211     ecma-118
 212     elot_928
 213     greek
 214     greek8
 215     iso-ir-126
 216     iso8859-7
 217     iso88597
 218     iso_8859-7
 219     iso_8859-7:1987
 220     sun_eu_greek
 221
 222 iso-8859-8-html
 223     ISO-8859-8 { MIME* HTML* }
 224     csiso88598e { MIME }
 225     csisolatinhebrew
 226     hebrew
 227     ISO-8859-8-E
 228     ISO-8859-8-I
 229     iso-ir-138
 230     iso8859-8
 231     iso88598
 232     iso_8859-8
 233     iso_8859-8:1988
 234     # adding this one leads to a failure in encoding-labels.html
 235 #   csiso88598i
 236
 237
 238 # This alias has to be dealt with by TextCodecICU unless
 239 # multiple encodings can share a single mapping table.
 240 #ISO-8859-8-I { MIME* HTML* }
 241 #   csiso88598i
 242 #   logical
 243
 244 iso-8859-10-html
 245     ISO-8859-10 { MIME* HTML* }
 246     csisolatin6
 247     iso-ir-157
 248     iso8859-10
 249     iso885910
 250     l6
 251     latin6
 252
 253 iso-8859-13-html
 254     ISO-8859-13 { MIME* HTML* }
 255     iso8859-13
 256     iso885913
 257
 258 iso-8859-14-html
 259     ISO-8859-14 { MIME* HTML* }
 260     iso8859-14
 261     iso885914
 262
 263 iso-8859-15-html
 264     ISO-8859-15 { MIME* HTML* }
 265     csisolatin9
 266     iso8859-15
 267     iso885915
 268     iso_8859-15
 269     l9
 270
 271 iso-8859-16-html
 272     ISO-8859-16 { MIME* HTML* }
 273
 274 koi8-r-html
 275     KOI8-R { MIME* HTML* }
 276     cskoi8r
 277     koi
 278     koi8
 279     koi8_r
 280
 281 koi8-u-html
 282     KOI8-U { MIME* HTML* }
 283
 284 macintosh-html
 285     macintosh { MIME* HTML* }
 286     csmacintosh
 287     mac
 288     x-mac-roman
 289
 290 windows-874-html
 291     windows-874 { MIME* HTML* }
 292     dos-874
 293     iso-8859-11
 294     iso8859-11
 295     iso885911
 296     tis-620
 297
 298 windows-1250-html
 299     windows-1250 { MIME* HTML* }
 300     cp1250
 301     x-cp1250
 302
 303 windows-1251-html
 304     windows-1251 { MIME* HTML* }
 305     cp1251
 306     x-cp1251
 307
 308 windows-1252-html
 309     windows-1252 { MIME* HTML* }
 310     ansi_x3.4-1968
 311     ascii
 312     cp1252
 313     cp819
 314     csisolatin1
 315     ibm819
 316     iso-8859-1
 317     iso-ir-100
 318     iso8859-1
 319     iso88591
 320     iso_8859-1
 321     iso_8859-1:1987
 322     l1
 323     latin1
 324     us-ascii
 325     x-cp1252
 326
 327 windows-1253-html
 328     windows-1253 { MIME* HTML* }
 329     cp1253
 330     x-cp1253
 331
 332 windows-1254-html
 333     windows-1254 { MIME* HTML* }
 334     cp1254
 335     csisolatin5
 336     iso-8859-9
 337     iso-ir-148
 338     iso8859-9
 339     iso88599
 340     iso_8859-9
 341     iso_8859-9:1989
 342     l5
 343     latin5
 344     x-cp1254
 345
 346 windows-1255-html
 347     windows-1255 { MIME* HTML* }
 348     cp1255
 349     x-cp1255
 350
 351 windows-1256-html
 352     windows-1256 { MIME* HTML* }
 353     cp1256
 354     x-cp1256
 355
 356 windows-1257-html
 357     windows-1257 { MIME* HTML* }
 358     cp1257
 359     x-cp1257
 360
 361 windows-1258-html
 362     windows-1258 { MIME* HTML* }
 363     cp1258
 364     x-cp1258
 365
 366 x-mac-cyrillic-html
 367     x-mac-cyrillic { MIME* HTML* }
 368     x-mac-ukrainian
 369
 370 # Chrome: Added 4 GB2312 aliases and EUC-CN to Windows-936 to reflect the
 371 #         reality of the web (GB2312 is treated synonymously with its
 372 #         superset, Windows-936/GBK)
 373 #         HTML5 makes GBK an alias for GB18030
 374 #         TODO(jshin): Decide if Chrome should follow spec. crbug.com/339862
 375 windows-936-2000
 376                         GB2312 { IANA MIME }
 377                         GBK { IANA* MIME* }
 378                         CP936 { IANA }
 379                         MS936 { IANA }
 380                         windows-936 { IANA }
 381                         chinese { IANA }
 382                         iso-ir-58 { IANA }
 383                         gb2312-1980
 384                         EUC-CN
 385                         csGB2312 { IANA }
 386                         GB_2312-80 { IANA }
 387                         x-gbk
 388
 389 # GB 18030 is partly algorithmic, using the MBCS converter
 390 gb18030 { IANA* }      gb18030 { MIME* }  ibm-1392 windows-54936
 391
 392 windows-950-2000
 393     Big5 { MIME* HTML* }
 394     cn-big5
 395     csbig5
 396     x-x-big5
 397
 398 # Chrome: WHATWG encoding spec has big5-hkscs as an alias for big5
 399 #   TODO(jshin): Decide if Chrome should follow spec. crbug.com/277040
 400 ibm-1375_P100-2007 { UTR22* }   # Big5-HKSCS-2004 with Unicode 3.1 mappings. This uses supplementary characters.
 401                         ibm-1375
 402                         Big5-HKSCS { MIME* IANA* }
 403                         big5hk
 404                         HKSCS-BIG5  # From http://www.openi18n.org/localenameguide/
 405
 406
 407 euc-jp-html
 408     EUC-JP { MIME* HTML* }
 409     cseucpkdfmtjapanese
 410     x-euc-jp
 411
 412 ISO_2022,locale=ja,version=0
 413     ISO-2022-JP { MIME* HTML* }
 414     csiso2022jp
 415
 416 shift_jis-html
 417     Shift_JIS { MIME* HTML* }
 418     csshiftjis
 419     ms_kanji
 420     shift-jis
 421     sjis
 422     windows-31j
 423     x-sjis
 424
 425 windows-949-2000
 426     EUC-KR { MIME* HTML* }
 427     cseuckr
 428     csksc56011987
 429     iso-ir-149
 430     korean
 431     ks_c_5601-1987
 432     ks_c_5601-1989
 433     ksc5601
 434     ksc_5601
 435     windows-949
 436
 437 # We need to keep these aliases so that documents labelled with them
 438 # are converted to a single U+FFFD instead of being rendered as gibberish.
 439 ISO-2022-KR { HTML* MIME* } csISO2022KR { IANA }
 440 ISO-2022-CN { IANA* HTML* } csISO2022CN  x-ISO-2022-CN-GB
 441 ISO-2022-CN-EXT { IANA* HTML* }
 442 HZ-GB-2312 { HTML* IANA* } HZ
 443