1 // -*- coding:unix; mode:c++; tab-width:4; c-basic-offset:4; indent-tabs-mode:nil -*-
2 /*------------------------------------------------------------------------------
3 VnConv: Vietnamese Encoding Converter Library
4 UniKey Project: http://unikey.sourceforge.net
5 Copyleft (C) 1998-2002 Pham Kim Long
6 Contact: longp@cslab.felk.cvut.cz
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU General Public License
10 as published by the Free Software Foundation; either version 2
11 of the License, or (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 --------------------------------------------------------------------------------*/
// Vowel lookup tables, indexed by letter offset ('a'..'z' -> 0..25 and
// 'A'..'Z' -> 0..25). A non-zero entry marks that letter as a vowel.
// The entries are written by the CVnCharsetLib constructor.
int LoVowel['z'-'a'+1];
int HiVowel['Z'-'A'+1];

// IS_VOWEL(x): true when x is an ASCII letter whose table entry is non-zero.
// Every use of the argument is parenthesized so that an expression argument
// such as IS_VOWEL(c | 0x20) is evaluated as a unit instead of being split
// up by operator precedence (which would also produce a bogus table index).
// The argument is still evaluated several times: do not pass expressions
// with side effects.
#define IS_VOWEL(x) (((x) >= 'a' && (x) <= 'z' && LoVowel[(x)-'a']) || ((x) >= 'A' && (x) <= 'Z' && HiVowel[(x)-'A']))
// File-scope tables of charset object pointers, one slot per single-byte
// and per double-byte charset.
// NOTE(review): neither array is referenced in the visible part of this
// file (CVnCharsetLib uses its own m_sgCharsets / m_dbCharsets members) -
// confirm whether they are still needed.
37 SingleByteCharset *SgCharsets[CONV_TOTAL_SINGLE_CHARSETS];
38 DoubleByteCharset *DbCharsets[CONV_TOTAL_DOUBLE_CHARSETS];
// The library-wide CVnCharsetLib instance (declared with DllExport). The VIQR
// charset code and the VnConv*Options functions in this file access its
// m_options and escape-pattern members directly.
40 DllExport CVnCharsetLib VnCharsetLibObj;
42 //////////////////////////////////////////////////////
43 // Generic VnCharset class
44 //////////////////////////////////////////////////////
45 int VnCharset::elementSize()
50 //-------------------------------------------
51 int VnInternalCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
53 if (!is.getNextDW(stdChar)) {
57 bytesRead = sizeof(UKDWORD);
61 //-------------------------------------------
62 int VnInternalCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
64 outLen = sizeof(StdVnChar);
65 UKWORD *pWord = (UKWORD *)&stdChar;
68 return os.putW(*pWord);
71 //-------------------------------------------
72 int VnInternalCharset::elementSize()
77 //-------------------------------------------
78 SingleByteCharset::SingleByteCharset(unsigned char * vnChars)
82 memset(m_stdMap, 0, 256*sizeof(UKWORD));
83 for (i=0; i<TOTAL_VNCHARS; i++) {
84 if (vnChars[i] != 0 && (i==TOTAL_VNCHARS-1 || vnChars[i] != vnChars[i+1]))
85 m_stdMap[vnChars[i]] = i + 1;
89 //-------------------------------------------
90 int SingleByteCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
93 if (!is.getNext(ch)) {
98 stdChar = (m_stdMap[ch])? (VnStdCharOffset + m_stdMap[ch] - 1) : ch;
104 //-------------------------------------------
105 int SingleByteCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
109 if (stdChar >= VnStdCharOffset) {
111 ch = m_vnChars[stdChar - VnStdCharOffset];
113 ch = (stdChar == StdStartQuote)? PadStartQuote :
114 ((stdChar == StdEndQuote)? PadEndQuote :
115 ((stdChar == StdEllipsis)? PadEllipsis: PadChar) );
119 if (stdChar > 255 || m_stdMap[stdChar]) {
120 //this character is missing in the charset
121 // output padding character
123 ret = os.putB(PadChar);
127 ret = os.putB((UKBYTE)stdChar);
133 //-------------------------------------------
134 int wideCharCompare(const void *ele1, const void *ele2)
136 UKWORD ch1 = LOWORD(*((UKDWORD *)ele1));
137 UKWORD ch2 = LOWORD(*((UKDWORD *)ele2));
138 return (ch1 == ch2)? 0 : ((ch1 > ch2)? 1 : -1);
141 //-------------------------------------------
142 UnicodeCharset::UnicodeCharset(UnicodeChar *vnChars)
145 m_toUnicode = vnChars;
146 for (i=0; i<TOTAL_VNCHARS; i++)
147 m_vnChars[i] = (i << 16) + vnChars[i]; // high word is used for index
148 qsort(m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
151 //-------------------------------------------
152 int UnicodeCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
155 if (!is.getNextW(uniCh)) {
159 bytesRead = sizeof(UnicodeChar);
161 UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
163 stdChar = VnStdCharOffset + HIWORD(*pChar);
169 //-------------------------------------------
170 int UnicodeCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
172 outLen = sizeof(UnicodeChar);
173 return os.putW((stdChar >= VnStdCharOffset)?
174 m_toUnicode[stdChar-VnStdCharOffset] : (UnicodeChar)stdChar);
177 //-------------------------------------------
178 int UnicodeCharset::elementSize()
183 ////////////////////////////////////////
184 // Unicode decomposed
185 ////////////////////////////////////////
186 //-------------------------------------------
187 int uniCompInfoCompare(const void *ele1, const void *ele2)
189 UKDWORD ch1 = ((UniCompCharInfo *)ele1)->compChar;
190 UKDWORD ch2 = ((UniCompCharInfo *)ele2)->compChar;
191 return (ch1 == ch2)? 0 : ((ch1 > ch2)? 1 : -1);
194 UnicodeCompCharset::UnicodeCompCharset(UnicodeChar *uniChars, UKDWORD *uniCompChars)
197 m_uniCompChars = uniCompChars;
199 for (i=0; i<TOTAL_VNCHARS; i++) {
200 m_info[i].compChar = uniCompChars[i];
201 m_info[i].stdIndex = i;
205 for (k=0, i=TOTAL_VNCHARS; k<TOTAL_VNCHARS; k++)
206 if (uniChars[k] != uniCompChars[k]) {
207 m_info[i].compChar = uniChars[k];
208 m_info[i].stdIndex = k;
213 qsort(m_info, m_totalChars, sizeof(UniCompCharInfo), uniCompInfoCompare);
216 //---------------------------------------------
217 int UnicodeCompCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
223 if (!is.getNextW(w)) {
230 UniCompCharInfo *pInfo = (UniCompCharInfo *)bsearch(&key, m_info, m_totalChars,
231 sizeof(UniCompCharInfo), uniCompInfoCompare);
233 stdChar = key.compChar;
235 stdChar = pInfo->stdIndex + VnStdCharOffset;
236 if (is.peekNextW(w)) {
239 key.compChar += hi << 16;
240 pInfo = (UniCompCharInfo *)bsearch(&key, m_info, m_totalChars,
241 sizeof(UniCompCharInfo), uniCompInfoCompare);
243 stdChar = pInfo->stdIndex + VnStdCharOffset;
253 //---------------------------------------------
254 int UnicodeCompCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
257 if (stdChar >= VnStdCharOffset) {
258 UKDWORD uniCompCh = m_uniCompChars[stdChar-VnStdCharOffset];
259 UKWORD lo = LOWORD(uniCompCh);
260 UKWORD hi = HIWORD(uniCompCh);
270 ret = os.putW((UKWORD)stdChar);
275 //-------------------------------------------
276 int UnicodeCompCharset::elementSize()
281 ////////////////////////////////
283 ////////////////////////////////
284 int UnicodeUTF8Charset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
287 UKBYTE first, second, third;
291 if (!is.getNext(first))
296 uniCh = first; // 1-byte sequence
297 else if ((first & 0xE0) == 0xC0) {
299 if (!is.peekNext(second))
301 if ((second & 0xC0) != 0x80) {
302 stdChar = INVALID_STD_CHAR;
309 uniCh = ((w1 & 0x001F) << 6) | (w2 & 0x3F);
311 else if ((first & 0xF0) == 0xE0) {
313 if (!is.peekNext(second))
315 if ((second & 0xC0) != 0x80) {
316 stdChar = INVALID_STD_CHAR;
321 if (!is.peekNext(third))
323 if ((third & 0xC0) != 0x80) {
324 stdChar = INVALID_STD_CHAR;
332 uniCh = ((w1 & 0x000F) << 12) | ((w2 & 0x003F) << 6) | (w3 & 0x003F);
335 stdChar = INVALID_STD_CHAR;
339 // translate to StdVnChar
341 UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
343 stdChar = VnStdCharOffset + HIWORD(*pChar);
344 else stdChar = uniCh;
348 //-------------------------------------------
349 int UnicodeUTF8Charset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
351 UnicodeChar uChar = (stdChar < VnStdCharOffset)?
352 (UnicodeChar)stdChar : m_toUnicode[stdChar-VnStdCharOffset];
354 if (uChar < 0x0080) {
356 ret = os.putB((UKBYTE)uChar);
357 } else if (uChar < 0x0800) {
359 os.putB(0xC0 | (UKBYTE)(uChar >> 6));
360 ret = os.putB(0x80 | (UKBYTE)(uChar & 0x003F));
363 os.putB(0xE0 | (UKBYTE)(uChar >> 12));
364 os.putB(0x80 | (UKBYTE)((uChar >> 6) & 0x003F));
365 ret = os.putB(0x80 | (UKBYTE)(uChar & 0x003F));
370 ////////////////////////////////////////
371 // Unicode character reference &#D; //
372 ////////////////////////////////////////
373 int hexDigitValue(unsigned char digit)
375 if (digit >= 'a' && digit <= 'f')
377 if (digit >= 'A' && digit <= 'F')
379 if (digit >= '0' && digit <= '9')
385 //--------------------------------------
386 int UnicodeRefCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
396 if (is.peekNext(ch) && ch == '#') {
401 if (ch != 'x' && ch != 'X') {
404 while (is.peekNext(ch) && isdigit(ch) && digits < 5) {
407 code = code*10 + (ch - '0');
410 if (is.peekNext(ch) && ch == ';') {
421 while (is.peekNext(ch) && isxdigit(ch) && digits < 4) {
424 code = (code << 4) + hexDigitValue(ch);
427 if (is.peekNext(ch) && ch == ';') {
437 // translate to StdVnChar
439 UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
441 stdChar = VnStdCharOffset + HIWORD(*pChar);
442 else stdChar = uniCh;
447 //--------------------------------
448 int UnicodeRefCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
450 UnicodeChar uChar = (stdChar < VnStdCharOffset)?
451 (UnicodeChar)stdChar : m_toUnicode[stdChar-VnStdCharOffset];
455 ret = os.putB((UKBYTE)uChar);
459 os.putB((UKBYTE)'&');
460 os.putB((UKBYTE)'#');
462 int i, digit, prev, base;
465 for (i=0; i < 5; i++) {
466 digit = uChar / base;
470 os.putB('0' + (unsigned char)digit);
475 ret = os.putB((UKBYTE)';');
// HEX_DIGIT(x): character code of the uppercase hexadecimal digit for a
// value x in 0..15 ('0'-'9' for 0..9, 'A'-'F' for 10..15).
// The argument is parenthesized at every use so that expression arguments
// such as HEX_DIGIT(v & 0xF) are evaluated as a unit. It is evaluated more
// than once, so do not pass expressions with side effects.
#define HEX_DIGIT(x) (((x) < 10)? ('0'+(x)) : ('A'+(x)-10))
483 //--------------------------------
484 int UnicodeHexCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
486 UnicodeChar uChar = (stdChar < VnStdCharOffset)?
487 (UnicodeChar)stdChar : m_toUnicode[stdChar-VnStdCharOffset];
491 ret = os.putB((UKBYTE)uChar);
503 for (i=0; i < 4; i++) {
504 digit = ((uChar >> shifts) & 0x000F);
505 if (digit > 0 || prev) {
508 os.putB((UKBYTE)HEX_DIGIT(digit));
519 /////////////////////////////////
520 // Class UnicodeCStringCharset /
521 /////////////////////////////////
522 void UnicodeCStringCharset::startInput()
527 //----------------------------------------
528 int UnicodeCStringCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
538 if (is.peekNext(ch) && (ch=='x' || ch=='X')) {
543 while (is.peekNext(ch) && isxdigit(ch) && digits < 4) {
546 code = (code << 4) + hexDigitValue(ch);
553 // translate to StdVnChar
555 UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
557 stdChar = VnStdCharOffset + HIWORD(*pChar);
558 else stdChar = uniCh;
562 //------------------------------------
563 int UnicodeCStringCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
565 UnicodeChar uChar = (stdChar < VnStdCharOffset)?
566 (UnicodeChar)stdChar : m_toUnicode[stdChar-VnStdCharOffset];
568 if (uChar < 128 && !isxdigit(uChar) && uChar != 'x' && uChar != 'X') {
570 ret = os.putB((UKBYTE)uChar);
581 for (i=0; i < 4; i++) {
582 digit = ((uChar >> shifts) & 0x000F);
583 if (digit > 0 || prev) {
586 os.putB((UKBYTE)HEX_DIGIT(digit));
596 /////////////////////////////////
597 // Double-byte charsets //
598 /////////////////////////////////
599 DoubleByteCharset::DoubleByteCharset(UKWORD *vnChars)
601 m_toDoubleChar = vnChars;
602 memset(m_stdMap, 0, 256*sizeof(UKWORD));
603 for (int i=0; i<TOTAL_VNCHARS; i++) {
604 if (vnChars[i] >> 8) // a 2-byte character
605 m_stdMap[vnChars[i] >> 8] = 0xFFFF; //INVALID_STD_CHAR;
606 else if (m_stdMap[vnChars[i]] == 0)
607 m_stdMap[vnChars[i]] = i+1;
608 m_vnChars[i] = (i << 16) + vnChars[i]; // high word is used for StdChar index
610 qsort(m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
613 //---------------------------------------------
614 int DoubleByteCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
623 stdChar = m_stdMap[ch];
626 else if (stdChar == 0xFFFF)
627 stdChar = INVALID_STD_CHAR;
629 stdChar += VnStdCharOffset - 1;
631 if (is.peekNext(hi) && hi > 0) {
632 //test if a double-byte character is encountered
633 UKDWORD key = MAKEWORD(ch,hi);
634 UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
636 stdChar = VnStdCharOffset + HIWORD(*pChar);
645 //---------------------------------------------
646 int DoubleByteCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
649 if (stdChar >= VnStdCharOffset) {
650 UKWORD wCh = m_toDoubleChar[stdChar-VnStdCharOffset];
654 os.putB((UKBYTE)(wCh & 0x00FF));
655 ret = os.putB((UKBYTE)(wCh >> 8));
658 unsigned char b = (unsigned char)wCh;
659 if (m_stdMap[b] == 0xFFFF)
666 ret = os.putB((UKBYTE)(wCh & 0x00FF));
669 ret = os.putB((UKBYTE)(wCh >> 8));
674 if (stdChar > 255 || m_stdMap[stdChar]) {
676 ret = os.putB((UKBYTE)PadChar);
680 ret = os.putB((UKBYTE)stdChar);
686 /////////////////////////////////////////////
687 // Class: VIQRCharset //
688 /////////////////////////////////////////////
690 unsigned char VIQRTones[] = {'\'','`','?','~','.'};
692 const char *VIQREscapes[] = {
703 const int VIQREscCount = sizeof(VIQREscapes) / sizeof(char*);
705 VIQRCharset::VIQRCharset(UKDWORD *vnChars)
707 memset(m_stdMap, 0, 256*sizeof(UKWORD));
711 for (i=0; i<TOTAL_VNCHARS; i++) {
713 if (!(dw & 0xffffff00)) { //single byte
714 //ch = (unsigned char)(dw & 0xff);
715 m_stdMap[dw] = i+256;
719 // set offset from base characters according to tone marks
720 m_stdMap[(unsigned char)'\''] = 2;
721 m_stdMap[(unsigned char)'`'] = 4;
722 m_stdMap[(unsigned char)'?'] = 6;
723 m_stdMap[(unsigned char)'~'] = 8;
724 m_stdMap[(unsigned char)'.'] = 10;
725 m_stdMap[(unsigned char)'^'] = 12;
727 m_stdMap[(unsigned char)'('] = 24;
728 m_stdMap[(unsigned char)'+'] = 26;
729 m_stdMap[(unsigned char)'*'] = 26;
732 //---------------------------------------------------
733 void VIQRCharset::startInput()
736 m_atWordBeginning = 1;
739 if (VnCharsetLibObj.m_options.viqrEsc)
740 VnCharsetLibObj.m_VIQREscPatterns.reset();
743 //---------------------------------------------------
744 int VIQRCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
749 if (!is.getNext(ch1))
752 stdChar = m_stdMap[ch1];
754 if (VnCharsetLibObj.m_options.viqrEsc) {
755 if (VnCharsetLibObj.m_VIQREscPatterns.foundAtNextChar(ch1)!=-1) {
760 if (m_escAll && (ch1==' ' || ch1=='\t' || ch1=='\r' || ch1=='\n'))
764 // escape character, try to read next
765 if (!is.getNext(ch1)) {
767 stdChar = m_stdMap[ch1];
774 else if (!m_escAll && !is.eos()) {
775 // try to read the next byte
778 unsigned char upper = toupper(ch1);
779 if ((!VnCharsetLibObj.m_options.smartViqr || m_atWordBeginning) &&
780 upper == 'D' && (ch2 == 'd' || ch2 == 'D'))
784 stdChar += 2; // dd is 2 positions after d.
787 StdVnChar index = m_stdMap[ch2];
791 cond = IS_VOWEL(ch1) &&
792 ( index == 2 || index == 4 || index == 8 || //not accepting ? . in suspicious mode
793 (index == 12 && (upper == 'A' || upper == 'E' || upper == 'O')) ||
794 (m_stdMap[ch2] == 24 && upper== 'A') ||
795 (m_stdMap[ch2] == 26 && (upper == 'O' || upper == 'U')) );
800 cond = IS_VOWEL(ch1) &&
801 ((index <= 10 && index > 0 && (!m_gotTone || (index!=6 && index!=10)) ) ||
802 (index == 12 && (upper == 'A' || upper == 'E' || upper == 'O')) ||
803 (m_stdMap[ch2] == 24 && upper== 'A') ||
804 (m_stdMap[ch2] == 26 && (upper == 'O' || upper == 'U')) );
808 m_gotTone = 1; //we have a tone/breve/hook in the current word
810 // ok, take this byte
813 int offset = m_stdMap[ch2];
814 if (offset == 26) offset = 24;
815 if (offset == 24 && (ch1 == 'u' || ch1 == 'U'))
819 if (is.peekNext(ch2)) {
820 if (index > 10 && m_stdMap[ch2] > 0 && m_stdMap[ch2] <= 10) {
821 // ok, take one more byte
824 stdChar += m_stdMap[ch2];
830 m_atWordBeginning = (stdChar < 256);
832 m_gotTone = 0; //reset this flag because we are at the beginning of a new word
837 stdChar += VnStdCharOffset - 256;
841 //---------------------------------------------------
842 void VIQRCharset::startOutput()
849 VnCharsetLibObj.m_VIQROutEscPatterns.reset();
852 //---------------------------------------------------
853 int VIQRCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
857 if (stdChar >= VnStdCharOffset) {
859 UKDWORD dw = m_vnChars[stdChar-VnStdCharOffset];
861 unsigned char first = (unsigned char)dw;
862 unsigned char firstUpper = toupper(first);
866 if (VnCharsetLibObj.m_VIQROutEscPatterns.foundAtNextChar(b) != -1)
869 if (m_noOutEsc && (b==' ' || b=='\t' || b=='\r' || b=='\n'))
872 if (dw & 0x0000FF00) {
873 // second byte is present
874 unsigned char second = (UKBYTE)(dw >> 8);
876 ret = os.putB(second);
878 if (dw & 0x00FF0000) {
879 //third byte is present
881 ret = os.putB((UKBYTE)(dw >> 16));
885 UKWORD index = m_stdMap[second];
886 m_escapeTone = (index == 12 || index == 24 || index == 26);
889 VnCharsetLibObj.m_VIQROutEscPatterns.reset();
896 m_escapeTone = IS_VOWEL(first);
897 m_escapeBowl = (firstUpper == 'A');
898 m_escapeHook = (firstUpper == 'U' || firstUpper == 'O');
899 m_escapeRoof = (firstUpper == 'A' || firstUpper == 'E' || firstUpper == 'O');
905 ret = os.putB((UKBYTE)PadChar);
906 if (VnCharsetLibObj.m_VIQROutEscPatterns.foundAtNextChar((UKBYTE)PadChar) != -1)
911 UKWORD index = m_stdMap[stdChar];
912 if (!VnCharsetLibObj.m_options.viqrMixed && !m_noOutEsc &&
914 (index > 0 && index <= 10 && m_escapeTone) ||
915 (index == 12 && m_escapeRoof) ||
916 (index == 24 && m_escapeBowl) ||
917 (index == 26 && m_escapeHook))) {
918 //(m_stdMap[stdChar] > 0 && m_stdMap[stdChar] <= 26)) {
919 // tone mark, needs an escape character
922 if (VnCharsetLibObj.m_VIQROutEscPatterns.foundAtNextChar('\\') != -1)
927 if (VnCharsetLibObj.m_VIQROutEscPatterns.foundAtNextChar(b) != -1)
929 if (m_noOutEsc && (b==' ' || b=='\t' || b=='\r' || b=='\n'))
932 // reset escape marks
941 /////////////////////////////////////////////
942 // Class: UTF8VIQRCharset //
943 /////////////////////////////////////////////
945 //-----------------------------------------
946 UTF8VIQRCharset::UTF8VIQRCharset(UnicodeUTF8Charset *pUtf, VIQRCharset *pViqr)
952 //-----------------------------------------
953 void UTF8VIQRCharset::startInput()
955 m_pUtf->startInput();
956 m_pViqr->startInput();
959 //-----------------------------------------
960 void UTF8VIQRCharset::startOutput()
962 m_pUtf->startOutput();
963 m_pViqr->startOutput();
966 //-----------------------------------------
// Reads the next character from input that may mix UTF-8 and VIQR text.
// The upcoming byte is inspected with is.peekNext to choose the decoder:
// a value in 0xC0..0xFD is handed to the UTF-8 charset, anything else to
// the VIQR charset. stdChar and bytesRead are filled in by whichever
// decoder is called, and its return value is passed back unchanged.
967 int UTF8VIQRCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
971 if (!is.peekNext(ch))
974 if (ch > 0xBF && ch < 0xFE) {
975 m_pViqr->startInput(); // just to reset the VIQR object state
// NOTE(review): presumably makes the VIQR decoder stricter about tone marks
// once UTF-8 text has been seen - confirm against VIQRCharset::nextInput.
976 m_pViqr->m_suspicious = 1;
977 return m_pUtf->nextInput(is, stdChar, bytesRead);
980 return m_pViqr->nextInput(is, stdChar, bytesRead);
983 //-----------------------------------------
984 int UTF8VIQRCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
986 return m_pViqr->putChar(os, stdChar, outLen);
990 //-----------------------------------------
// Sets up the library object: fills the LoVowel/HiVowel tables, clears the
// cached charset pointers, resets m_options to the defaults and initializes
// both VIQR escape pattern matchers from VIQREscapes.
991 CVnCharsetLib::CVnCharsetLib()
// NOTE(review): the bound `ch < 'z'` stops at 'y', so this loop never visits
// 'z'; the uppercase loop below likewise never writes HiVowel['Z'-'A'].
// Both arrays are defined at file scope and therefore start out zeroed, so
// the effect is probably nil - confirm whether `<=` was intended.
994 for (ch = 'a'; ch < 'z'; ch++)
// Mark the lowercase vowels (y is treated as a vowel).
996 LoVowel['a'-'a'] = 1;
997 LoVowel['e'-'a'] = 1;
998 LoVowel['i'-'a'] = 1;
999 LoVowel['o'-'a'] = 1;
1000 LoVowel['u'-'a'] = 1;
1001 LoVowel['y'-'a'] = 1;
// Same for the uppercase table: clear, then mark the vowels.
1003 for (ch = 'A'; ch < 'Z'; ch++)
1004 HiVowel[ch-'A'] = 0;
1005 HiVowel['A'-'A'] = 1;
1006 HiVowel['E'-'A'] = 1;
1007 HiVowel['I'-'A'] = 1;
1008 HiVowel['O'-'A'] = 1;
1009 HiVowel['U'-'A'] = 1;
1010 HiVowel['Y'-'A'] = 1;
// Charset objects start as NULL; getVnCharset creates each one on first request.
1012 m_pUniCharset = NULL;
1013 m_pUniCompCharset = NULL;
1017 m_pVIQRCharObj = NULL;
1018 m_pUVIQRCharObj = NULL;
1019 m_pWinCP1258 = NULL;
1020 m_pVnIntCharset = NULL;
1023 for (i = 0; i < CONV_TOTAL_SINGLE_CHARSETS; i++)
1024 m_sgCharsets[i] = NULL;
1026 for (i = 0; i < CONV_TOTAL_DOUBLE_CHARSETS; i++)
1027 m_dbCharsets[i] = NULL;
// Default options, then one escape-pattern matcher for input and one for
// output, both built from the same VIQREscapes list.
1029 VnConvResetOptions(&m_options);
1030 m_VIQREscPatterns.init((char**)VIQREscapes, VIQREscCount);
1031 m_VIQROutEscPatterns.init((char**)VIQREscapes, VIQREscCount);
1035 //-----------------------------------------
1036 CVnCharsetLib::~CVnCharsetLib()
1039 delete m_pUniCharset;
1047 delete m_pVIQRCharObj;
1048 if (m_pUVIQRCharObj)
1049 delete m_pUVIQRCharObj;
1051 delete m_pWinCP1258;
1053 delete m_pUniCString;
1054 if (m_pVnIntCharset)
1055 delete m_pVnIntCharset;
1058 for (i = 0; i < CONV_TOTAL_SINGLE_CHARSETS; i++)
1059 if (m_sgCharsets[i]) delete m_sgCharsets[i];
1061 for (i = 0; i < CONV_TOTAL_DOUBLE_CHARSETS; i++)
1062 if (m_dbCharsets[i]) delete m_dbCharsets[i];
1066 //-----------------------------------------
// Returns the charset object for charsetIdx. Each object is created with
// `new` the first time its index is requested and kept in a member of this
// library object, so later requests find the cached pointer instead of
// creating a new object.
1067 VnCharset * CVnCharsetLib::getVnCharset(int charsetIdx)
1069 switch (charsetIdx) {
1071 case CONV_CHARSET_UNICODE:
1072 if (m_pUniCharset == NULL)
1073 m_pUniCharset = new UnicodeCharset(UnicodeTable);
1074 return m_pUniCharset;
1075 case CONV_CHARSET_UNIDECOMPOSED:
1076 if (m_pUniCompCharset == NULL)
1077 m_pUniCompCharset = new UnicodeCompCharset(UnicodeTable, UnicodeComposite);
1078 return m_pUniCompCharset;
// Both UTF-8 indices share the single m_pUniUTF8 object.
1079 case CONV_CHARSET_UNIUTF8:
1080 case CONV_CHARSET_XUTF8:
1081 if (m_pUniUTF8 == NULL)
1082 m_pUniUTF8 = new UnicodeUTF8Charset(UnicodeTable);
1085 case CONV_CHARSET_UNIREF:
1086 if (m_pUniRef == NULL)
1087 m_pUniRef = new UnicodeRefCharset(UnicodeTable);
1090 case CONV_CHARSET_UNIREF_HEX:
1091 if (m_pUniHex == NULL)
1092 m_pUniHex = new UnicodeHexCharset(UnicodeTable);
1095 case CONV_CHARSET_UNI_CSTRING:
1096 if (m_pUniCString == NULL)
1097 m_pUniCString = new UnicodeCStringCharset(UnicodeTable);
1098 return m_pUniCString;
1100 case CONV_CHARSET_WINCP1258:
1101 if (m_pWinCP1258 == NULL)
1102 m_pWinCP1258 = new WinCP1258Charset(WinCP1258, WinCP1258Pre);
1103 return m_pWinCP1258;
1105 case CONV_CHARSET_VIQR:
1106 if (m_pVIQRCharObj == NULL)
1107 m_pVIQRCharObj = new VIQRCharset(VIQRTable);
1108 return m_pVIQRCharObj;
1110 case CONV_CHARSET_VNSTANDARD:
1111 if (m_pVnIntCharset == NULL)
1112 m_pVnIntCharset = new VnInternalCharset();
1113 return m_pVnIntCharset;
// The combined UTF-8/VIQR object wraps the same VIQR and UTF-8 objects that
// the plain VIQR and UTF-8 indices return, creating either one if needed.
1115 case CONV_CHARSET_UTF8VIQR:
1116 if (m_pUVIQRCharObj == NULL) {
1117 if (m_pVIQRCharObj == NULL)
1118 m_pVIQRCharObj = new VIQRCharset(VIQRTable);
1120 if (m_pUniUTF8 == NULL)
1121 m_pUniUTF8 = new UnicodeUTF8Charset(UnicodeTable);
1122 m_pUVIQRCharObj = new UTF8VIQRCharset(m_pUniUTF8, m_pVIQRCharObj);
1124 return m_pUVIQRCharObj;
// Table-driven charsets: the slot is the index relative to the first
// single-byte (CONV_CHARSET_TCVN3) or double-byte (CONV_CHARSET_VNIWIN) id.
1127 if (IS_SINGLE_BYTE_CHARSET(charsetIdx)) {
1128 int i = charsetIdx - CONV_CHARSET_TCVN3;
1129 if (m_sgCharsets[i] == NULL)
1130 m_sgCharsets[i] = new SingleByteCharset(SingleByteTables[i]);
1131 return m_sgCharsets[i];
1133 else if (IS_DOUBLE_BYTE_CHARSET(charsetIdx)) {
1134 int i = charsetIdx - CONV_CHARSET_VNIWIN;
1135 if (m_dbCharsets[i] == NULL)
1136 m_dbCharsets[i] = new DoubleByteCharset(DoubleByteTables[i]);
1137 return m_dbCharsets[i];
1144 //-------------------------------------------------
1145 DllExport void VnConvSetOptions(VnConvOptions *pOptions)
1147 VnCharsetLibObj.m_options = *pOptions;
1150 //-------------------------------------------------
1151 DllExport void VnConvGetOptions(VnConvOptions *pOptions)
1153 *pOptions = VnCharsetLibObj.m_options;
1156 //-------------------------------------------------
1157 DllExport void VnConvResetOptions(VnConvOptions *pOptions)
1159 pOptions->viqrEsc = 1;
1160 pOptions->viqrMixed = 0;
1161 pOptions->toUpper = 0;
1162 pOptions->toLower = 0;
1163 pOptions->removeTone = 0;
1164 pOptions->smartViqr = 1;
1168 /////////////////////////////////////////////
1169 // Class WinCP1258Charset
1170 /////////////////////////////////////////////
1171 WinCP1258Charset::WinCP1258Charset(UKWORD *compositeChars, UKWORD *precomposedChars)
1174 m_toDoubleChar = compositeChars;
1175 memset(m_stdMap, 0, 256*sizeof(UKWORD));
1177 // encode composite chars
1178 for (i=0; i<TOTAL_VNCHARS; i++) {
1179 if (compositeChars[i] >> 8) // a 2-byte character
1180 m_stdMap[compositeChars[i] >> 8] = 0xFFFF; //INVALID_STD_CHAR;
1181 else if (m_stdMap[compositeChars[i]] == 0)
1182 m_stdMap[compositeChars[i]] = i+1;
1184 m_vnChars[i] = (i << 16) + compositeChars[i]; // high word is used for StdChar index
1187 m_totalChars = TOTAL_VNCHARS;
1189 //add precomposed chars to the table
1190 for (k=0, i=TOTAL_VNCHARS; k<TOTAL_VNCHARS; k++)
1191 if (precomposedChars[k] != compositeChars[k]) {
1192 if (precomposedChars[k] >> 8) // a 2-byte character
1193 m_stdMap[precomposedChars[k] >> 8] = 0xFFFF; //INVALID_STD_CHAR;
1194 else if (m_stdMap[precomposedChars[k]] == 0)
1195 m_stdMap[precomposedChars[k]] = k+1;
1197 m_vnChars[i] = (k << 16) + precomposedChars[k];
1202 qsort(m_vnChars, m_totalChars, sizeof(UKDWORD), wideCharCompare);
1206 //---------------------------------------------------------------------
1207 // This function is basically the same as that of DoubleByteCharset,
1208 // with m_totalChars used instead of the constant TOTAL_VNCHARS
1209 //---------------------------------------------------------------------
1210 int WinCP1258Charset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
1216 if (!is.getNext(ch))
1219 stdChar = m_stdMap[ch];
1222 else if (stdChar == 0xFFFF)
1223 stdChar = INVALID_STD_CHAR;
1225 stdChar += VnStdCharOffset - 1;
1227 if (is.peekNext(hi) && hi > 0) {
1228 //test if a double-byte character is encountered
1229 UKDWORD key = MAKEWORD(ch,hi);
1230 UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, m_totalChars, sizeof(UKDWORD), wideCharCompare);
1232 stdChar = VnStdCharOffset + HIWORD(*pChar);
1241 //---------------------------------------------------------------------
1242 // This function is exactly the same as that of DoubleByteCharset
1243 //---------------------------------------------------------------------
1244 int WinCP1258Charset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
1247 if (stdChar >= VnStdCharOffset) {
1248 UKWORD wCh = m_toDoubleChar[stdChar-VnStdCharOffset];
1252 os.putB((UKBYTE)(wCh & 0x00FF));
1253 ret = os.putB((UKBYTE)(wCh >> 8));
1256 unsigned char b = (unsigned char)wCh;
1257 if (m_stdMap[b] == 0xFFFF)
1264 if (stdChar > 255 || m_stdMap[stdChar]) {
1266 ret = os.putB((UKBYTE)PadChar);
1270 ret = os.putB((UKBYTE)stdChar);
// IS_ODD(x) / IS_EVEN(x): test the lowest bit of an integer value.
// The argument is parenthesized so that expression arguments are evaluated
// as a unit: without the parentheses IS_ODD(2 | 4) would expand to
// (2 | 4 & 1), i.e. 2 | (4 & 1), and wrongly report an odd value.
#define IS_ODD(x) ((x) & 1)
#define IS_EVEN(x) (!((x) & 1))
1279 StdVnChar StdVnToUpper(StdVnChar ch)
1281 if (ch >= VnStdCharOffset &&
1282 ch<(VnStdCharOffset + TOTAL_ALPHA_VNCHARS) &&
1288 //----------------------------------------
1289 StdVnChar StdVnToLower(StdVnChar ch)
1291 if (ch >= VnStdCharOffset &&
1292 ch<(VnStdCharOffset + TOTAL_ALPHA_VNCHARS) &&
1298 //----------------------------------------
1299 StdVnChar StdVnGetRoot(StdVnChar ch)
1301 if (ch >= VnStdCharOffset && ch<VnStdCharOffset+TOTAL_VNCHARS)
1302 ch = VnStdCharOffset + StdVnRootChar[ch-VnStdCharOffset];