ukengine/charset.cpp

   1 // -*- coding:unix; mode:c++; tab-width:4; c-basic-offset:4; indent-tabs-mode:nil -*-
   2 /*------------------------------------------------------------------------------
   3 VnConv: Vietnamese Encoding Converter Library
   4 UniKey Project: http://unikey.sourceforge.net
   5 Copyleft (C) 1998-2002 Pham Kim Long
   6 Contact: longp@cslab.felk.cvut.cz
   7
   8 This program is free software; you can redistribute it and/or
   9 modify it under the terms of the GNU General Public License
  10 as published by the Free Software Foundation; either version 2
  11 of the License, or (at your option) any later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; if not, write to the Free Software
  20 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  21 --------------------------------------------------------------------------------*/
  22
  23 #include <stddef.h>
  24 #include <search.h>
  25 #include <memory.h>
  26 #include <ctype.h>
  27 #include <stdlib.h>
  28
  29 #include "charset.h"
  30 #include "data.h"
  31
  32 int LoVowel['z'-'a'+1];
  33 int HiVowel['Z'-'A'+1];
  34
  35 #define IS_VOWEL(x) ((x >= 'a' && x <= 'z' && LoVowel[x-'a']) || (x >= 'A' && x <= 'Z' && HiVowel[x-'A']))
  36
// Tables of charset objects, sized by the single-byte and double-byte charset counts.
// NOTE(review): nothing in the visible part of this file fills these in - confirm where they are populated.
SingleByteCharset *SgCharsets[CONV_TOTAL_SINGLE_CHARSETS];
DoubleByteCharset *DbCharsets[CONV_TOTAL_DOUBLE_CHARSETS];

// Global library object; the VIQR routines in this file read m_options and the
// escape-pattern members through it.
DllExport CVnCharsetLib VnCharsetLibObj;
  41
  42 //////////////////////////////////////////////////////
  43 // Generic VnCharset class
  44 //////////////////////////////////////////////////////
  45 int VnCharset::elementSize()
  46 {
  47     return 1;
  48 }
  49
  50 //-------------------------------------------
  51 int VnInternalCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
  52 {
  53     if (!is.getNextDW(stdChar)) {
  54         bytesRead = 0;
  55         return 0;
  56     }
  57     bytesRead = sizeof(UKDWORD);
  58     return 1;
  59 }
  60
  61 //-------------------------------------------
  62 int VnInternalCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
  63 {
  64   outLen = sizeof(StdVnChar);
  65   UKWORD *pWord = (UKWORD *)&stdChar;
  66   os.putW(*pWord);
  67   pWord++;
  68   return os.putW(*pWord);
  69 }
  70
  71 //-------------------------------------------
  72 int VnInternalCharset::elementSize()
  73 {
  74     return 4;
  75 }
  76
  77 //-------------------------------------------
  78 SingleByteCharset::SingleByteCharset(unsigned char * vnChars)
  79 {
  80         int i;
  81         m_vnChars = vnChars;
  82         memset(m_stdMap, 0, 256*sizeof(UKWORD));
  83         for (i=0; i<TOTAL_VNCHARS; i++) {
  84                 if (vnChars[i] != 0 && (i==TOTAL_VNCHARS-1 || vnChars[i] != vnChars[i+1]))
  85                         m_stdMap[vnChars[i]] = i + 1;
  86         }
  87 }
  88
  89 //-------------------------------------------
  90 int SingleByteCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
  91 {
  92         unsigned char ch;
  93         if (!is.getNext(ch)) {
  94                 bytesRead = 0;
  95                 return 0;
  96         }
  97
  98         stdChar = (m_stdMap[ch])? (VnStdCharOffset + m_stdMap[ch] - 1) : ch;
  99         bytesRead = 1;
 100         return 1;
 101 }
 102
 103
 104 //-------------------------------------------
 105 int SingleByteCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
 106 {
 107         int ret;
 108         unsigned char ch;
 109         if (stdChar >= VnStdCharOffset) {
 110                 outLen = 1;
 111                 ch = m_vnChars[stdChar - VnStdCharOffset];
 112                 if (ch == 0)
 113                         ch = (stdChar == StdStartQuote)? PadStartQuote :
 114                           ((stdChar == StdEndQuote)? PadEndQuote :
 115                                    ((stdChar == StdEllipsis)? PadEllipsis: PadChar) );
 116                 ret = os.putB(ch);
 117         }
 118         else {
 119                 if (stdChar > 255 || m_stdMap[stdChar]) {
 120                         //this character is missing in the charset
 121                         // output padding character
 122                         outLen = 1;
 123                         ret = os.putB(PadChar);
 124                 }
 125                 else {
 126                         outLen = 1;
 127                         ret = os.putB((UKBYTE)stdChar);
 128                 }
 129         }
 130         return ret;
 131 }
 132
 133 //-------------------------------------------
 134 int wideCharCompare(const void *ele1, const void *ele2)
 135 {
 136         UKWORD ch1 = LOWORD(*((UKDWORD *)ele1));
 137         UKWORD ch2 = LOWORD(*((UKDWORD *)ele2));
 138         return (ch1 == ch2)? 0 : ((ch1 > ch2)? 1 : -1);
 139 }
 140
 141 //-------------------------------------------
 142 UnicodeCharset::UnicodeCharset(UnicodeChar *vnChars)
 143 {
 144         UKDWORD i;
 145         m_toUnicode = vnChars;
 146         for (i=0; i<TOTAL_VNCHARS; i++)
 147                 m_vnChars[i] = (i << 16) + vnChars[i]; // high word is used for index
 148         qsort(m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
 149 }
 150
 151 //-------------------------------------------
 152 int UnicodeCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
 153 {
 154         UnicodeChar uniCh;
 155         if (!is.getNextW(uniCh)) {
 156                 bytesRead = 0;
 157                 return 0;
 158         }
 159         bytesRead = sizeof(UnicodeChar);
 160         UKDWORD key = uniCh;
 161         UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
 162         if (pChar)
 163                 stdChar = VnStdCharOffset + HIWORD(*pChar);
 164         else
 165                 stdChar = uniCh;
 166         return 1;
 167 }
 168
 169 //-------------------------------------------
 170 int UnicodeCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
 171 {
 172         outLen = sizeof(UnicodeChar);
 173         return os.putW((stdChar >= VnStdCharOffset)?
 174                                m_toUnicode[stdChar-VnStdCharOffset] : (UnicodeChar)stdChar);
 175 }
 176
 177 //-------------------------------------------
 178 int UnicodeCharset::elementSize()
 179 {
 180     return 2;
 181 }
 182
 183 ////////////////////////////////////////
 184 // Unicode decomposed
 185 ////////////////////////////////////////
 186 //-------------------------------------------
 187 int uniCompInfoCompare(const void *ele1, const void *ele2)
 188 {
 189         UKDWORD ch1 = ((UniCompCharInfo *)ele1)->compChar;
 190         UKDWORD ch2 = ((UniCompCharInfo *)ele2)->compChar;
 191         return (ch1 == ch2)? 0 : ((ch1 > ch2)? 1 : -1);
 192 }
 193
 194 UnicodeCompCharset::UnicodeCompCharset(UnicodeChar *uniChars, UKDWORD *uniCompChars)
 195 {
 196   int i,k;
 197         m_uniCompChars = uniCompChars;
 198         m_totalChars = 0;
 199         for (i=0; i<TOTAL_VNCHARS; i++) {
 200                 m_info[i].compChar = uniCompChars[i];
 201                 m_info[i].stdIndex = i;
 202                 m_totalChars++;
 203         }
 204
 205         for (k=0, i=TOTAL_VNCHARS; k<TOTAL_VNCHARS; k++)
 206                 if (uniChars[k] != uniCompChars[k]) {
 207                         m_info[i].compChar = uniChars[k];
 208                         m_info[i].stdIndex = k;
 209                         m_totalChars++;
 210                         i++;
 211                 }
 212
 213         qsort(m_info, m_totalChars, sizeof(UniCompCharInfo), uniCompInfoCompare);
 214 }
 215
 216 //---------------------------------------------
 217 int UnicodeCompCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
 218 {
 219         // read first char
 220
 221         UniCompCharInfo key;
 222         UKWORD w;
 223         if (!is.getNextW(w)) {
 224                 bytesRead = 0;
 225                 return 0;
 226         }
 227         key.compChar = w;
 228         bytesRead = 2;
 229
 230         UniCompCharInfo *pInfo = (UniCompCharInfo *)bsearch(&key, m_info, m_totalChars,
 231                                                                 sizeof(UniCompCharInfo), uniCompInfoCompare);
 232         if (!pInfo)
 233                 stdChar = key.compChar;
 234         else {
 235                 stdChar = pInfo->stdIndex + VnStdCharOffset;
 236                 if (is.peekNextW(w)) {
 237                         UKDWORD hi = w;
 238                         if (hi > 0) {
 239                                 key.compChar += hi << 16;
 240                                 pInfo = (UniCompCharInfo *)bsearch(&key, m_info, m_totalChars,
 241                                                        sizeof(UniCompCharInfo), uniCompInfoCompare);
 242                                 if (pInfo) {
 243                                         stdChar = pInfo->stdIndex + VnStdCharOffset;
 244                                         bytesRead += 2;
 245                                         is.getNextW(w);
 246                                 }
 247                         }
 248                 }
 249         }
 250         return 1;
 251 }
 252
 253 //---------------------------------------------
 254 int UnicodeCompCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
 255 {
 256         int ret;
 257         if (stdChar     >= VnStdCharOffset) {
 258                 UKDWORD uniCompCh = m_uniCompChars[stdChar-VnStdCharOffset];
 259                 UKWORD lo = LOWORD(uniCompCh);
 260                 UKWORD hi = HIWORD(uniCompCh);
 261                 outLen = 2;
 262                 ret = os.putW(lo);
 263                 if (hi > 0) {
 264                         outLen += 2;
 265                         ret = os.putW(hi);
 266                 }
 267         }
 268         else {
 269                 outLen = 2;
 270                 ret = os.putW((UKWORD)stdChar);
 271         }
 272         return ret;
 273 }
 274
 275 //-------------------------------------------
 276 int UnicodeCompCharset::elementSize()
 277 {
 278     return 2;
 279 }
 280
 281 ////////////////////////////////
 282 // Unicode UTF-8              //
 283 ////////////////////////////////
 284 int UnicodeUTF8Charset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
 285 {
 286         UKWORD w1, w2, w3;
 287         UKBYTE first, second, third;
 288         UnicodeChar uniCh;
 289
 290         bytesRead = 0;
 291         if (!is.getNext(first))
 292                 return 0;
 293         bytesRead = 1;
 294
 295         if (first < 0x80)
 296                 uniCh = first; // 1-byte sequence
 297         else if ((first & 0xE0) == 0xC0) {
 298                 //2-byte sequence
 299                 if (!is.peekNext(second))
 300                         return 0;
 301                 if ((second & 0xC0) != 0x80) {
 302                         stdChar = INVALID_STD_CHAR;
 303                         return 1;
 304                 }
 305                 is.getNext(second);
 306                 bytesRead = 2;
 307                 w1 = first;
 308                 w2 = second;
 309                 uniCh = ((w1 & 0x001F) << 6) | (w2 & 0x3F);
 310         }
 311         else if ((first & 0xF0) == 0xE0) {
 312                 //3-byte sequence
 313                 if (!is.peekNext(second))
 314                         return 0;
 315                 if ((second & 0xC0) != 0x80) {
 316                         stdChar = INVALID_STD_CHAR;
 317                         return 1;
 318                 }
 319                 is.getNext(second);
 320                 bytesRead = 2;
 321                 if (!is.peekNext(third))
 322                         return 0;
 323                 if ((third & 0xC0) != 0x80) {
 324                         stdChar = INVALID_STD_CHAR;
 325                         return 1;
 326                 }
 327                 is.getNext(third);
 328                 bytesRead = 3;
 329                 w1 = first;
 330                 w2 = second;
 331                 w3 = third;
 332                 uniCh = ((w1 & 0x000F) << 12) | ((w2 & 0x003F) << 6) | (w3 & 0x003F);
 333         }
 334         else {
 335                 stdChar = INVALID_STD_CHAR;
 336                 return 1;
 337         }
 338
 339         // translate to StdVnChar
 340         UKDWORD key = uniCh;
 341         UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
 342         if (pChar)
 343                 stdChar = VnStdCharOffset + HIWORD(*pChar);
 344         else stdChar = uniCh;
 345         return 1;
 346 }
 347
 348 //-------------------------------------------
 349 int UnicodeUTF8Charset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
 350 {
 351         UnicodeChar uChar = (stdChar < VnStdCharOffset)?
 352                                 (UnicodeChar)stdChar : m_toUnicode[stdChar-VnStdCharOffset];
 353         int ret;
 354         if (uChar < 0x0080) {
 355                 outLen = 1;
 356                 ret = os.putB((UKBYTE)uChar);
 357         } else if (uChar < 0x0800) {
 358                 outLen = 2;
 359                 os.putB(0xC0 | (UKBYTE)(uChar >> 6));
 360                 ret = os.putB(0x80 | (UKBYTE)(uChar & 0x003F));
 361         } else {
 362                 outLen = 3;
 363                 os.putB(0xE0 | (UKBYTE)(uChar >> 12));
 364                 os.putB(0x80 | (UKBYTE)((uChar >> 6) & 0x003F));
 365                 ret = os.putB(0x80 | (UKBYTE)(uChar & 0x003F));
 366         }
 367         return ret;
 368 }
 369
 370 ////////////////////////////////////////
 371 // Unicode character reference &#D;   //
 372 ////////////////////////////////////////
 373 int hexDigitValue(unsigned char digit)
 374 {
 375         if (digit >= 'a' && digit <= 'f')
 376                 return digit-'a'+10;
 377         if (digit >= 'A' && digit <= 'F')
 378                 return digit-'A'+10;
 379         if (digit >= '0' && digit <= '9')
 380                 return digit-'0';
 381         return 0;
 382 }
 383
 384
 385 //--------------------------------------
 386 int UnicodeRefCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
 387 {
 388         unsigned char ch;
 389         UnicodeChar uniCh;
 390         bytesRead = 0;
 391         if (!is.getNext(ch))
 392                 return 0;
 393         bytesRead = 1;
 394         uniCh = ch;
 395         if (ch == '&') {
 396                 if (is.peekNext(ch) && ch == '#') {
 397                         is.getNext(ch);
 398                         bytesRead++;
 399                         if (!is.eos()) {
 400                                 is.peekNext(ch);
 401                                 if (ch != 'x' && ch != 'X') {
 402                                         UKWORD code = 0;
 403                                         int digits = 0;
 404                                         while (is.peekNext(ch) && isdigit(ch) && digits < 5) {
 405                                                 is.getNext(ch);
 406                                                 bytesRead++;
 407                                                 code = code*10 + (ch - '0');
 408                                                 digits++;
 409                                         }
 410                                         if (is.peekNext(ch) && ch == ';') {
 411                                                 is.getNext(ch);
 412                                                 bytesRead++;
 413                                                 uniCh = code;
 414                                         }
 415                                 }
 416                                 else {
 417                                         is.getNext(ch);
 418                                         bytesRead++;
 419                                         UKWORD code = 0;
 420                                         int digits = 0;
 421                                         while (is.peekNext(ch) && isxdigit(ch) && digits < 4) {
 422                                                 is.getNext(ch);
 423                                                 bytesRead++;
 424                                                 code = (code << 4) + hexDigitValue(ch);
 425                                                 digits++;
 426                                         }
 427                                         if (is.peekNext(ch) && ch == ';') {
 428                                                 is.getNext(ch);
 429                                                 bytesRead++;
 430                                                 uniCh = code;
 431                                         }
 432                                 } // hex digits
 433                         }
 434                 }
 435         }
 436
 437         // translate to StdVnChar
 438         UKDWORD key = uniCh;
 439         UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
 440         if (pChar)
 441                 stdChar = VnStdCharOffset + HIWORD(*pChar);
 442         else stdChar = uniCh;
 443         return 1;
 444 }
 445
 446
 447 //--------------------------------
 448 int UnicodeRefCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
 449 {
 450         UnicodeChar uChar = (stdChar < VnStdCharOffset)?
 451                                 (UnicodeChar)stdChar : m_toUnicode[stdChar-VnStdCharOffset];
 452         int ret;
 453         if (uChar < 128) {
 454                 outLen = 1;
 455                 ret = os.putB((UKBYTE)uChar);
 456         }
 457         else {
 458                 outLen = 2;
 459                 os.putB((UKBYTE)'&');
 460                 os.putB((UKBYTE)'#');
 461
 462                 int i, digit, prev, base;
 463                 prev = 0;
 464                 base = 10000;
 465                 for (i=0; i < 5; i++) {
 466                         digit = uChar / base;
 467                         if (digit || prev) {
 468                                 prev = 1;
 469                                 outLen++;
 470                                 os.putB('0' + (unsigned char)digit);
 471                         }
 472                         uChar %= base;
 473                         base /= 10;
 474                 }
 475                 ret = os.putB((UKBYTE)';');
 476                 outLen++;
 477         }
 478         return ret;
 479 }
 480
 481 #define HEX_DIGIT(x) ((x < 10)? ('0'+x) : ('A'+x-10))
 482
 483 //--------------------------------
 484 int UnicodeHexCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
 485 {
 486         UnicodeChar uChar = (stdChar < VnStdCharOffset)?
 487                                 (UnicodeChar)stdChar : m_toUnicode[stdChar-VnStdCharOffset];
 488         int ret;
 489         if (uChar < 256) {
 490                 outLen = 1;
 491                 ret = os.putB((UKBYTE)uChar);
 492         }
 493         else {
 494                 outLen = 3;
 495                 os.putB('&');
 496                 os.putB('#');
 497                 os.putB('x');
 498
 499                 int i, digit;
 500                 int prev = 0;
 501                 int shifts = 12;
 502
 503                 for (i=0; i < 4; i++) {
 504                         digit = ((uChar >> shifts) & 0x000F);
 505                         if (digit > 0 || prev) {
 506                                 prev = 1;
 507                                 outLen++;
 508                                 os.putB((UKBYTE)HEX_DIGIT(digit));
 509                         }
 510                         shifts -= 4;
 511                 }
 512                 ret = os.putB(';');
 513                 outLen++;
 514         }
 515         return ret;
 516 }
 517
 518
 519 /////////////////////////////////
 520 // Class UnicodeCStringCharset  /
 521 /////////////////////////////////
 522 void UnicodeCStringCharset::startInput()
 523 {
 524         m_prevIsHex = 0;
 525 }
 526
 527 //----------------------------------------
 528 int UnicodeCStringCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
 529 {
 530         unsigned char ch;
 531         UnicodeChar uniCh;
 532         bytesRead = 0;
 533         if (!is.getNext(ch))
 534                 return 0;
 535         bytesRead = 1;
 536         uniCh = ch;
 537         if (ch == '\\') {
 538                 if (is.peekNext(ch) && (ch=='x' || ch=='X')) {
 539                         is.getNext(ch);
 540                         bytesRead++;
 541                         UKWORD code = 0;
 542                         int digits = 0;
 543                         while (is.peekNext(ch) && isxdigit(ch) && digits < 4) {
 544                                 is.getNext(ch);
 545                                 bytesRead++;
 546                                 code = (code << 4) + hexDigitValue(ch);
 547                                 digits++;
 548                         }
 549                         uniCh = code;
 550                 }
 551         }
 552
 553         // translate to StdVnChar
 554         UKDWORD key = uniCh;
 555         UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
 556         if (pChar)
 557                 stdChar = VnStdCharOffset + HIWORD(*pChar);
 558         else stdChar = uniCh;
 559         return 1;
 560 }
 561
 562 //------------------------------------
 563 int UnicodeCStringCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
 564 {
 565         UnicodeChar uChar = (stdChar < VnStdCharOffset)?
 566                                 (UnicodeChar)stdChar : m_toUnicode[stdChar-VnStdCharOffset];
 567         int ret;
 568         if (uChar < 128 && !isxdigit(uChar) && uChar != 'x' && uChar != 'X') {
 569                 outLen = 1;
 570                 ret = os.putB((UKBYTE)uChar);
 571         }
 572         else {
 573                 outLen = 2;
 574                 os.putB('\\');
 575                 os.putB('x');
 576
 577                 int i, digit;
 578                 int prev = 0;
 579                 int shifts = 12;
 580
 581                 for (i=0; i < 4; i++) {
 582                         digit = ((uChar >> shifts) & 0x000F);
 583                         if (digit > 0 || prev) {
 584                                 prev = 1;
 585                                 outLen++;
 586                                 os.putB((UKBYTE)HEX_DIGIT(digit));
 587                         }
 588                         shifts -= 4;
 589                 }
 590                 ret = os.isOK();
 591                 m_prevIsHex = 1;
 592         }
 593         return ret;
 594 }
 595
 596 /////////////////////////////////
 597 // Double-byte charsets        //
 598 /////////////////////////////////
 599 DoubleByteCharset::DoubleByteCharset(UKWORD *vnChars)
 600 {
 601         m_toDoubleChar = vnChars;
 602         memset(m_stdMap, 0, 256*sizeof(UKWORD));
 603         for (int i=0; i<TOTAL_VNCHARS; i++) {
 604                 if (vnChars[i] >> 8) // a 2-byte character
 605                         m_stdMap[vnChars[i] >> 8] = 0xFFFF; //INVALID_STD_CHAR;
 606                 else if (m_stdMap[vnChars[i]] == 0)
 607                         m_stdMap[vnChars[i]] = i+1;
 608                 m_vnChars[i] = (i << 16) + vnChars[i]; // high word is used for StdChar index
 609         }
 610         qsort(m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
 611 }
 612
 613 //---------------------------------------------
 614 int DoubleByteCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
 615 {
 616         unsigned char ch;
 617
 618         // read first byte
 619         bytesRead = 0;
 620         if (!is.getNext(ch))
 621                 return 0;
 622         bytesRead = 1;
 623         stdChar = m_stdMap[ch];
 624         if (stdChar == 0)
 625                 stdChar = ch;
 626         else if (stdChar == 0xFFFF)
 627                 stdChar = INVALID_STD_CHAR;
 628         else {
 629                 stdChar += VnStdCharOffset - 1;
 630                 UKBYTE hi;
 631                 if (is.peekNext(hi) && hi > 0) {
 632                         //test if a double-byte character is encountered
 633                         UKDWORD key = MAKEWORD(ch,hi);
 634                         UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
 635                         if (pChar) {
 636                                 stdChar = VnStdCharOffset + HIWORD(*pChar);
 637                                 bytesRead = 2;
 638                                 is.getNext(hi);
 639                         }
 640                 }
 641         }
 642         return 1;
 643 }
 644
 645 //---------------------------------------------
 646 int DoubleByteCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
 647 {
 648         int ret;
 649         if (stdChar     >= VnStdCharOffset) {
 650                 UKWORD wCh = m_toDoubleChar[stdChar-VnStdCharOffset];
 651
 652                 if (wCh & 0xFF00) {
 653                         outLen = 2;
 654                         os.putB((UKBYTE)(wCh & 0x00FF));
 655                         ret = os.putB((UKBYTE)(wCh >> 8));
 656                 }
 657                 else {
 658                         unsigned char b = (unsigned char)wCh;
 659                         if (m_stdMap[b] == 0xFFFF)
 660                                 b = PadChar;
 661                         outLen = 1;
 662                         ret = os.putB(b);
 663                 }
 664 /*
 665                 outLen = 1;
 666                 ret = os.putB((UKBYTE)(wCh & 0x00FF));
 667                 if (wCh & 0xFF00) {
 668                         outLen = 2;
 669                         ret = os.putB((UKBYTE)(wCh >> 8));
 670                 }
 671 */
 672         }
 673         else {
 674                 if (stdChar > 255 || m_stdMap[stdChar]) {
 675                         outLen = 1;
 676                         ret = os.putB((UKBYTE)PadChar);
 677                 }
 678                 else {
 679                         outLen = 1;
 680                         ret = os.putB((UKBYTE)stdChar);
 681                 }
 682         }
 683         return ret;
 684 }
 685
 686 /////////////////////////////////////////////
 687 // Class: VIQRCharset                      //
 688 /////////////////////////////////////////////
 689
 690 unsigned char VIQRTones[] = {'\'','`','?','~','.'};
 691
 692 const char *VIQREscapes[] = {
 693         "://",
 694         "/",
 695         "@",
 696         "mailto:",
 697         "email:",
 698         "news:",
 699         "www",
 700         "ftp"
 701 };
 702
 703 const int VIQREscCount = sizeof(VIQREscapes) / sizeof(char*);
 704
 705 VIQRCharset::VIQRCharset(UKDWORD *vnChars)
 706 {
 707         memset(m_stdMap, 0, 256*sizeof(UKWORD));
 708         int i;
 709         UKDWORD dw;
 710         m_vnChars = vnChars;
 711         for (i=0; i<TOTAL_VNCHARS; i++) {
 712                 dw = m_vnChars[i];
 713                 if (!(dw & 0xffffff00)) { //single byte
 714                         //ch = (unsigned char)(dw & 0xff);
 715                         m_stdMap[dw] = i+256;
 716                 }
 717         }
 718
 719         // set offset from base characters according to tone marks
 720         m_stdMap[(unsigned char)'\''] = 2;
 721         m_stdMap[(unsigned char)'`'] = 4;
 722         m_stdMap[(unsigned char)'?'] = 6;
 723         m_stdMap[(unsigned char)'~'] = 8;
 724         m_stdMap[(unsigned char)'.'] = 10;
 725         m_stdMap[(unsigned char)'^'] = 12;
 726
 727         m_stdMap[(unsigned char)'('] = 24;
 728         m_stdMap[(unsigned char)'+'] = 26;
 729         m_stdMap[(unsigned char)'*'] = 26;
 730 }
 731
 732 //---------------------------------------------------
 733 void VIQRCharset::startInput()
 734 {
 735         m_suspicious = 0;
 736         m_atWordBeginning = 1;
 737         m_gotTone = 0;
 738         m_escAll = 0;
 739         if (VnCharsetLibObj.m_options.viqrEsc)
 740                 VnCharsetLibObj.m_VIQREscPatterns.reset();
 741 }
 742
 743 //---------------------------------------------------
 744 int VIQRCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
 745 {
 746         unsigned char ch1;
 747         bytesRead = 0;
 748
 749         if (!is.getNext(ch1))
 750                 return 0;
 751         bytesRead = 1;
 752         stdChar = m_stdMap[ch1];
 753
 754         if (VnCharsetLibObj.m_options.viqrEsc) {
 755                 if (VnCharsetLibObj.m_VIQREscPatterns.foundAtNextChar(ch1)!=-1) {
 756                         m_escAll = 1;
 757                 }
 758         }
 759
 760         if (m_escAll && (ch1==' ' || ch1=='\t' || ch1=='\r' || ch1=='\n'))
 761                 m_escAll = 0;
 762
 763         if (ch1 == '\\') {
 764                 // ecape character , try to read next
 765                 if (!is.getNext(ch1)) {
 766                         bytesRead++;
 767                         stdChar = m_stdMap[ch1];
 768                 }
 769         }
 770
 771         if (stdChar < 256) {
 772                 stdChar = ch1;
 773         }
 774         else if (!m_escAll && !is.eos()) {
 775                 // try to read the next byte
 776                 unsigned char ch2;
 777                 is.peekNext(ch2);
 778                 unsigned char upper = toupper(ch1);
 779         if ((!VnCharsetLibObj.m_options.smartViqr || m_atWordBeginning) &&
 780              upper == 'D' && (ch2 == 'd' || ch2 == 'D'))
 781         {
 782                         is.getNext(ch2);
 783                         bytesRead++;
 784                         stdChar += 2; // dd is 2 positions after d.
 785                 }
 786                 else {
 787                         StdVnChar index = m_stdMap[ch2];
 788
 789                         int cond;
 790                         if (m_suspicious) {
 791                                 cond = IS_VOWEL(ch1) &&
 792                              ( index == 2 || index == 4 || index == 8 || //not accepting ? . in suspicious mode
 793                                    (index == 12 &&  (upper == 'A' || upper == 'E' || upper == 'O')) ||
 794                                    (m_stdMap[ch2] == 24 && upper== 'A') ||
 795                                    (m_stdMap[ch2] == 26 && (upper == 'O' || upper == 'U')) );
 796                                 if (cond)
 797                                         m_suspicious = 0;
 798                         }
 799                         else
 800                                 cond = IS_VOWEL(ch1) &&
 801                                   ((index <= 10  && index > 0 && (!m_gotTone || (index!=6 && index!=10)) ) ||
 802                                    (index == 12 &&  (upper == 'A' || upper == 'E' || upper == 'O')) ||
 803                                    (m_stdMap[ch2] == 24 && upper== 'A') ||
 804                                    (m_stdMap[ch2] == 26 && (upper == 'O' || upper == 'U')) );
 805
 806                         if (cond) {
 807                                 if (index > 0)
 808                                         m_gotTone = 1; //we have a tone/breve/hook in the current word
 809
 810                                 // ok, take this byte
 811                                 is.getNext(ch2);
 812                                 bytesRead++;
 813                                 int offset = m_stdMap[ch2];
 814                                 if (offset == 26) offset = 24;
 815                                 if (offset == 24 && (ch1 == 'u' || ch1 == 'U'))
 816                                         offset = 12;
 817                                 stdChar += offset;
 818                                 // check next byte
 819                                 if (is.peekNext(ch2)) {
 820                                         if (index > 10 && m_stdMap[ch2] > 0 && m_stdMap[ch2] <= 10) {
 821                                                 // ok, take one more byte
 822                                                 is.getNext(ch2);
 823                                                 bytesRead++;
 824                                                 stdChar += m_stdMap[ch2];
 825                                         }
 826                                 }
 827                         }
 828                 }
 829         }
 830         m_atWordBeginning = (stdChar < 256);
 831         if (stdChar < 256) {
 832                 m_gotTone = 0; //reset this flag because we are at the beginning of a new word
 833         }
 834
 835         // adjust stdChar
 836         if (stdChar >= 256)
 837                 stdChar += VnStdCharOffset - 256;
 838         return 1;
 839 }
 840
 841 //---------------------------------------------------
 842 void VIQRCharset::startOutput()
 843 {
 844         m_escapeBowl = 0;
 845         m_escapeRoof = 0;
 846         m_escapeHook = 0;
 847         m_escapeTone = 0;
 848         m_noOutEsc = 0;
 849         VnCharsetLibObj.m_VIQROutEscPatterns.reset();
 850 }
 851
 852 //---------------------------------------------------
 853 int VIQRCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
 854 {
 855         int ret;
 856         UKBYTE b;
 857         if (stdChar >= VnStdCharOffset) {
 858                 outLen = 1;
 859                 UKDWORD dw = m_vnChars[stdChar-VnStdCharOffset];
 860
 861                 unsigned char first = (unsigned char)dw;
 862                 unsigned char firstUpper = toupper(first);
 863
 864                 b = (UKBYTE)dw;
 865                 ret = os.putB(b);
 866                 if (VnCharsetLibObj.m_VIQROutEscPatterns.foundAtNextChar(b) != -1)
 867                   m_noOutEsc = 1;
 868
 869                 if (m_noOutEsc && (b==' ' || b=='\t' || b=='\r' || b=='\n'))
 870                   m_noOutEsc = 0;
 871
 872                 if (dw & 0x0000FF00) {
 873                         // second byte is present
 874                         unsigned char second = (UKBYTE)(dw >> 8);
 875                         outLen++;
 876                         ret = os.putB(second);
 877
 878                         if (dw & 0x00FF0000) {
 879                                 //third byte is present
 880                                 outLen++;
 881                                 ret = os.putB((UKBYTE)(dw >> 16));
 882                                 m_escapeTone = 0;
 883                         }
 884                         else {
 885                                 UKWORD index = m_stdMap[second];
 886                                 m_escapeTone = (index == 12 || index == 24 || index == 26);
 887                         }
 888
 889                         VnCharsetLibObj.m_VIQROutEscPatterns.reset();
 890
 891                         m_escapeBowl = 0;
 892                         m_escapeHook = 0;
 893                         m_escapeRoof = 0;
 894                 }
 895                 else {
 896                         m_escapeTone = IS_VOWEL(first);
 897                         m_escapeBowl = (firstUpper == 'A');
 898                         m_escapeHook = (firstUpper == 'U' || firstUpper == 'O');
 899                         m_escapeRoof = (firstUpper == 'A' || firstUpper == 'E' || firstUpper == 'O');
 900                 }
 901         }
 902         else {
 903                 if (stdChar > 255) {
 904                         outLen = 1;
 905                         ret = os.putB((UKBYTE)PadChar);
 906                         if (VnCharsetLibObj.m_VIQROutEscPatterns.foundAtNextChar((UKBYTE)PadChar) != -1)
 907                           m_noOutEsc = 1;
 908                 }
 909                 else {
 910                         outLen = 1;
 911                         UKWORD index = m_stdMap[stdChar];
 912                         if (!VnCharsetLibObj.m_options.viqrMixed && !m_noOutEsc &&
 913                                    (stdChar=='\\' ||
 914                                         (index > 0 && index <= 10 && m_escapeTone) ||
 915                                         (index == 12 && m_escapeRoof) ||
 916                                         (index == 24 && m_escapeBowl) ||
 917                                         (index == 26 && m_escapeHook))) {
 918                                 //(m_stdMap[stdChar] > 0 && m_stdMap[stdChar] <= 26)) {
 919                                 // tone mark, needs an escape character
 920                                 outLen++;
 921                                 ret = os.putB('\\');
 922                                 if (VnCharsetLibObj.m_VIQROutEscPatterns.foundAtNextChar('\\') != -1)
 923                                   m_noOutEsc = 1;
 924                         }
 925                         b = (UKBYTE)stdChar;
 926                         ret = os.putB(b);
 927                         if (VnCharsetLibObj.m_VIQROutEscPatterns.foundAtNextChar(b) != -1)
 928                           m_noOutEsc = 1;
 929                         if (m_noOutEsc && (b==' ' || b=='\t' || b=='\r' || b=='\n'))
 930                           m_noOutEsc = 0;
 931                 }
 932                 // reset escape marks
 933                 m_escapeBowl = 0;
 934                 m_escapeRoof = 0;
 935                 m_escapeHook = 0;
 936                 m_escapeTone = 0;
 937         }
 938         return ret;
 939 }
 940
 941 /////////////////////////////////////////////
 942 // Class: UTF8VIQRCharset                  //
 943 /////////////////////////////////////////////
 944
 945 //-----------------------------------------
// Builds a charset that combines an existing UTF-8 charset and an existing
// VIQR charset. Only the two pointers are stored here; nothing is copied.
// NOTE(review): the pointed-to objects appear to be owned by CVnCharsetLib,
// which deletes them itself - confirm this class never deletes them.
UTF8VIQRCharset::UTF8VIQRCharset(UnicodeUTF8Charset *pUtf, VIQRCharset *pViqr)
{
  m_pUtf = pUtf;
  m_pViqr = pViqr;
}
 951
 952 //-----------------------------------------
// Forwards the start-of-input notification to both wrapped charsets,
// UTF-8 first, then VIQR.
void UTF8VIQRCharset::startInput()
{
  m_pUtf->startInput();
  m_pViqr->startInput();
}
 958
 959 //-----------------------------------------
// Forwards the start-of-output notification to both wrapped charsets,
// UTF-8 first, then VIQR.
void UTF8VIQRCharset::startOutput()
{
  m_pUtf->startOutput();
  m_pViqr->startOutput();
}
 965
 966 //-----------------------------------------
 967 int UTF8VIQRCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
 968 {
 969         UKBYTE ch;
 970
 971         if (!is.peekNext(ch))
 972                 return 0;
 973
 974         if (ch > 0xBF && ch < 0xFE) {
 975                 m_pViqr->startInput(); // just to reset the VIQR object state
 976                 m_pViqr->m_suspicious = 1;
 977                 return m_pUtf->nextInput(is, stdChar, bytesRead);
 978         }
 979
 980         return m_pViqr->nextInput(is, stdChar, bytesRead);
 981 }
 982
 983 //-----------------------------------------
// Output is delegated entirely to the VIQR charset; the UTF-8 charset is
// not involved in writing. Arguments and return value are passed through
// unchanged.
int UTF8VIQRCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
{
  return m_pViqr->putChar(os, stdChar, outLen);
}
 988
 989
 990 //-----------------------------------------
 991 CVnCharsetLib::CVnCharsetLib()
 992 {
 993         unsigned char ch;
 994         for (ch = 'a'; ch < 'z'; ch++)
 995                 LoVowel[ch-'a'] = 0;
 996         LoVowel['a'-'a'] = 1;
 997         LoVowel['e'-'a'] = 1;
 998         LoVowel['i'-'a'] = 1;
 999         LoVowel['o'-'a'] = 1;
1000         LoVowel['u'-'a'] = 1;
1001         LoVowel['y'-'a'] = 1;
1002
1003         for (ch = 'A'; ch < 'Z'; ch++)
1004                 HiVowel[ch-'A'] = 0;
1005         HiVowel['A'-'A'] = 1;
1006         HiVowel['E'-'A'] = 1;
1007         HiVowel['I'-'A'] = 1;
1008         HiVowel['O'-'A'] = 1;
1009         HiVowel['U'-'A'] = 1;
1010         HiVowel['Y'-'A'] = 1;
1011
1012         m_pUniCharset = NULL;
1013         m_pUniCompCharset = NULL;
1014         m_pUniUTF8 = NULL;
1015         m_pUniRef = NULL;
1016         m_pUniHex = NULL;
1017         m_pVIQRCharObj = NULL;
1018         m_pUVIQRCharObj = NULL;
1019         m_pWinCP1258 = NULL;
1020         m_pVnIntCharset = NULL;
1021
1022         int i;
1023         for (i = 0; i < CONV_TOTAL_SINGLE_CHARSETS; i++)
1024                 m_sgCharsets[i] = NULL;
1025
1026         for (i = 0; i < CONV_TOTAL_DOUBLE_CHARSETS; i++)
1027                 m_dbCharsets[i] = NULL;
1028
1029         VnConvResetOptions(&m_options);
1030         m_VIQREscPatterns.init((char**)VIQREscapes, VIQREscCount);
1031         m_VIQROutEscPatterns.init((char**)VIQREscapes, VIQREscCount);
1032 }
1033
1034
1035 //-----------------------------------------
1036 CVnCharsetLib::~CVnCharsetLib()
1037 {
1038         if (m_pUniCharset)
1039                 delete m_pUniCharset;
1040         if (m_pUniUTF8)
1041                 delete m_pUniUTF8;
1042         if (m_pUniRef)
1043                 delete m_pUniRef;
1044         if (m_pUniHex)
1045                 delete m_pUniHex;
1046         if (m_pVIQRCharObj)
1047                 delete m_pVIQRCharObj;
1048         if (m_pUVIQRCharObj)
1049                 delete m_pUVIQRCharObj;
1050         if (m_pWinCP1258)
1051                 delete m_pWinCP1258;
1052         if (m_pUniCString)
1053                 delete m_pUniCString;
1054         if (m_pVnIntCharset)
1055                 delete m_pVnIntCharset;
1056
1057         int i;
1058         for (i = 0; i < CONV_TOTAL_SINGLE_CHARSETS; i++)
1059                 if (m_sgCharsets[i]) delete m_sgCharsets[i];
1060
1061         for (i = 0; i < CONV_TOTAL_DOUBLE_CHARSETS; i++)
1062                 if (m_dbCharsets[i]) delete m_dbCharsets[i];
1063
1064 }
1065
1066 //-----------------------------------------
1067 VnCharset * CVnCharsetLib::getVnCharset(int charsetIdx)
1068 {
1069         switch (charsetIdx) {
1070
1071         case CONV_CHARSET_UNICODE:
1072                 if (m_pUniCharset == NULL)
1073                         m_pUniCharset = new UnicodeCharset(UnicodeTable);
1074                 return m_pUniCharset;
1075         case CONV_CHARSET_UNIDECOMPOSED:
1076                 if (m_pUniCompCharset == NULL)
1077                         m_pUniCompCharset = new UnicodeCompCharset(UnicodeTable, UnicodeComposite);
1078                 return m_pUniCompCharset;
1079         case CONV_CHARSET_UNIUTF8:
1080   case CONV_CHARSET_XUTF8:
1081                 if (m_pUniUTF8 == NULL)
1082                         m_pUniUTF8 = new UnicodeUTF8Charset(UnicodeTable);
1083                 return m_pUniUTF8;
1084
1085         case CONV_CHARSET_UNIREF:
1086                 if (m_pUniRef == NULL)
1087                         m_pUniRef = new UnicodeRefCharset(UnicodeTable);
1088                 return m_pUniRef;
1089
1090         case CONV_CHARSET_UNIREF_HEX:
1091                 if (m_pUniHex == NULL)
1092                         m_pUniHex = new UnicodeHexCharset(UnicodeTable);
1093                 return m_pUniHex;
1094
1095         case CONV_CHARSET_UNI_CSTRING:
1096                 if (m_pUniCString == NULL)
1097                         m_pUniCString = new UnicodeCStringCharset(UnicodeTable);
1098                 return m_pUniCString;
1099
1100         case CONV_CHARSET_WINCP1258:
1101                 if (m_pWinCP1258 == NULL)
1102                         m_pWinCP1258 = new WinCP1258Charset(WinCP1258, WinCP1258Pre);
1103                 return m_pWinCP1258;
1104
1105         case CONV_CHARSET_VIQR:
1106                 if (m_pVIQRCharObj == NULL)
1107                         m_pVIQRCharObj = new VIQRCharset(VIQRTable);
1108                 return m_pVIQRCharObj;
1109
1110         case CONV_CHARSET_VNSTANDARD:
1111                 if (m_pVnIntCharset == NULL)
1112                         m_pVnIntCharset = new VnInternalCharset();
1113                 return m_pVnIntCharset;
1114
1115         case CONV_CHARSET_UTF8VIQR:
1116           if (m_pUVIQRCharObj == NULL) {
1117             if (m_pVIQRCharObj == NULL)
1118               m_pVIQRCharObj = new VIQRCharset(VIQRTable);
1119
1120             if (m_pUniUTF8 == NULL)
1121               m_pUniUTF8 = new UnicodeUTF8Charset(UnicodeTable);
1122             m_pUVIQRCharObj = new UTF8VIQRCharset(m_pUniUTF8, m_pVIQRCharObj);
1123           }
1124           return m_pUVIQRCharObj;
1125
1126         default:
1127                 if (IS_SINGLE_BYTE_CHARSET(charsetIdx)) {
1128                         int i = charsetIdx - CONV_CHARSET_TCVN3;
1129                         if (m_sgCharsets[i] == NULL)
1130                                 m_sgCharsets[i] = new SingleByteCharset(SingleByteTables[i]);
1131                         return m_sgCharsets[i];
1132                 }
1133                 else if (IS_DOUBLE_BYTE_CHARSET(charsetIdx)) {
1134                         int i = charsetIdx - CONV_CHARSET_VNIWIN;
1135                         if (m_dbCharsets[i] == NULL)
1136                                 m_dbCharsets[i] = new DoubleByteCharset(DoubleByteTables[i]);
1137                         return m_dbCharsets[i];
1138                 }
1139         }
1140         return NULL;
1141 }
1142
1143
1144 //-------------------------------------------------
// Replaces the options held by the global charset library object with a
// copy of *pOptions. pOptions is dereferenced unconditionally, so it must
// not be NULL.
DllExport void VnConvSetOptions(VnConvOptions *pOptions)
{
        VnCharsetLibObj.m_options = *pOptions;
}
1149
1150 //-------------------------------------------------
// Copies the options currently held by the global charset library object
// into *pOptions. pOptions is dereferenced unconditionally, so it must
// not be NULL.
DllExport void VnConvGetOptions(VnConvOptions *pOptions)
{
        *pOptions = VnCharsetLibObj.m_options;
}
1155
1156 //-------------------------------------------------
1157 DllExport void VnConvResetOptions(VnConvOptions *pOptions)
1158 {
1159         pOptions->viqrEsc = 1;
1160         pOptions->viqrMixed = 0;
1161         pOptions->toUpper = 0;
1162         pOptions->toLower = 0;
1163         pOptions->removeTone = 0;
1164     pOptions->smartViqr = 1;
1165 }
1166
1167
1168 /////////////////////////////////////////////
1169 // Class WinCP1258Charset
1170 /////////////////////////////////////////////
1171 WinCP1258Charset::WinCP1258Charset(UKWORD *compositeChars, UKWORD *precomposedChars)
1172 {
1173   int i,k;
1174         m_toDoubleChar = compositeChars;
1175         memset(m_stdMap, 0, 256*sizeof(UKWORD));
1176
1177         // encode composite chars
1178         for (i=0; i<TOTAL_VNCHARS; i++) {
1179                 if (compositeChars[i] >> 8) // a 2-byte character
1180                         m_stdMap[compositeChars[i] >> 8] = 0xFFFF; //INVALID_STD_CHAR;
1181                 else if (m_stdMap[compositeChars[i]] == 0)
1182                         m_stdMap[compositeChars[i]] = i+1;
1183
1184                 m_vnChars[i] = (i << 16) + compositeChars[i]; // high word is used for StdChar index
1185         }
1186
1187         m_totalChars = TOTAL_VNCHARS;
1188
1189         //add precomposed chars to the table
1190         for (k=0, i=TOTAL_VNCHARS; k<TOTAL_VNCHARS; k++)
1191                 if (precomposedChars[k] != compositeChars[k]) {
1192                         if (precomposedChars[k] >> 8) // a 2-byte character
1193                                 m_stdMap[precomposedChars[k] >> 8] = 0xFFFF; //INVALID_STD_CHAR;
1194                         else if (m_stdMap[precomposedChars[k]] == 0)
1195                                 m_stdMap[precomposedChars[k]] = k+1;
1196
1197                         m_vnChars[i] = (k << 16) + precomposedChars[k];
1198                         m_totalChars++;
1199                         i++;
1200                 }
1201
1202         qsort(m_vnChars, m_totalChars, sizeof(UKDWORD), wideCharCompare);
1203 }
1204
1205
1206 //---------------------------------------------------------------------
// This function is basically the same as that of DoubleByteCharset,
// except that m_totalChars is used instead of the constant TOTAL_VNCHARS
1209 //---------------------------------------------------------------------
1210 int WinCP1258Charset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
1211 {
1212         unsigned char ch;
1213
1214         // read first byte
1215         bytesRead = 0;
1216         if (!is.getNext(ch))
1217                 return 0;
1218         bytesRead = 1;
1219         stdChar = m_stdMap[ch];
1220         if (stdChar == 0)
1221                 stdChar = ch;
1222         else if (stdChar == 0xFFFF)
1223                 stdChar = INVALID_STD_CHAR;
1224         else {
1225                 stdChar += VnStdCharOffset - 1;
1226                 UKBYTE hi;
1227                 if (is.peekNext(hi) && hi > 0) {
1228                         //test if a double-byte character is encountered
1229                         UKDWORD key = MAKEWORD(ch,hi);
1230                         UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, m_totalChars, sizeof(UKDWORD), wideCharCompare);
1231                         if (pChar) {
1232                                 stdChar = VnStdCharOffset + HIWORD(*pChar);
1233                                 bytesRead = 2;
1234                                 is.getNext(hi);
1235                         }
1236                 }
1237         }
1238         return 1;
1239 }
1240
1241 //---------------------------------------------------------------------
// This function is exactly the same as that of DoubleByteCharset
1243 //---------------------------------------------------------------------
1244 int WinCP1258Charset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
1245 {
1246         int ret;
1247         if (stdChar     >= VnStdCharOffset) {
1248                 UKWORD wCh = m_toDoubleChar[stdChar-VnStdCharOffset];
1249
1250                 if (wCh & 0xFF00) {
1251                         outLen = 2;
1252                         os.putB((UKBYTE)(wCh & 0x00FF));
1253                         ret = os.putB((UKBYTE)(wCh >> 8));
1254                 }
1255                 else {
1256                         unsigned char b = (unsigned char)wCh;
1257                         if (m_stdMap[b] == 0xFFFF)
1258                                 b = PadChar;
1259                         outLen = 1;
1260                         ret = os.putB(b);
1261                 }
1262         }
1263         else {
1264                 if (stdChar > 255 || m_stdMap[stdChar]) {
1265                         outLen = 1;
1266                         ret = os.putB((UKBYTE)PadChar);
1267                 }
1268                 else {
1269                         outLen = 1;
1270                         ret = os.putB((UKBYTE)stdChar);
1271                 }
1272         }
1273         return ret;
1274 }
1275
// Parity tests for integer expressions. The argument is parenthesized so
// that compound expressions such as IS_ODD(a + b) test the parity of the
// whole expression rather than only its last operand.
#define IS_ODD(x) ((x) & 1)
#define IS_EVEN(x) (!((x) & 1))
1278
1279 StdVnChar StdVnToUpper(StdVnChar ch)
1280 {
1281         if (ch >= VnStdCharOffset &&
1282                 ch<(VnStdCharOffset + TOTAL_ALPHA_VNCHARS) &&
1283                 IS_ODD(ch))
1284                 ch -= 1;
1285         return ch;
1286 }
1287
1288 //----------------------------------------
1289 StdVnChar StdVnToLower(StdVnChar ch)
1290 {
1291         if (ch >= VnStdCharOffset &&
1292                 ch<(VnStdCharOffset + TOTAL_ALPHA_VNCHARS) &&
1293                 IS_EVEN(ch))
1294                 ch += 1;
1295         return ch;
1296 }
1297
1298 //----------------------------------------
1299 StdVnChar StdVnGetRoot(StdVnChar ch)
1300 {
1301         if (ch >= VnStdCharOffset && ch<VnStdCharOffset+TOTAL_VNCHARS)
1302                 ch = VnStdCharOffset + StdVnRootChar[ch-VnStdCharOffset];
1303         return ch;
1304 }