Imported Upstream version 1.57.0
[platform/upstream/boost.git] / libs / locale / src / encoding / wconv_codepage.ipp
1 //
2 //  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 //
4 //  Distributed under the Boost Software License, Version 1.0. (See
5 //  accompanying file LICENSE_1_0.txt or copy at
6 //  http://www.boost.org/LICENSE_1_0.txt)
7 //
8
9 #ifndef BOOST_LOCALE_IMPL_WCONV_CODEPAGE_HPP
10 #define BOOST_LOCALE_IMPL_WCONV_CODEPAGE_HPP
11
12
#include <boost/locale/encoding.hpp>
#include <algorithm>
#include <cstring>
#include <limits>
#include <string>
#include <vector>
#include "conv.hpp"

#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
24
25
26 namespace boost {
27 namespace locale {
28 namespace conv {
29 namespace impl {
30     
31     struct windows_encoding {
32         char const *name;
33         unsigned codepage;
34         unsigned was_tested;
35     };
36
37     bool operator<(windows_encoding const &l,windows_encoding const &r)
38     {
39         return strcmp(l.name,r.name) < 0;
40     }
41
42     windows_encoding all_windows_encodings[] = {
43         { "big5",       950, 0 },
44         { "cp1250",     1250, 0 },
45         { "cp1251",     1251, 0 },
46         { "cp1252",     1252, 0 },
47         { "cp1253",     1253, 0 },
48         { "cp1254",     1254, 0 },
49         { "cp1255",     1255, 0 },
50         { "cp1256",     1256, 0 },
51         { "cp1257",     1257, 0 },
52         { "cp874",      874, 0 },
53         { "cp932",      932, 0 },
54         { "cp936",      936, 0 },
55         { "eucjp",      20932, 0 },
56         { "euckr",      51949, 0 },
57         { "gb18030",    54936, 0 },
58         { "gb2312",     20936, 0 },
59         { "gbk",        936, 0 },
60         { "iso2022jp",  50220, 0 },
61         { "iso2022kr",  50225, 0 },
62         { "iso88591",   28591, 0 },
63         { "iso885913",  28603, 0 },
64         { "iso885915",  28605, 0 },
65         { "iso88592",   28592, 0 },
66         { "iso88593",   28593, 0 },
67         { "iso88594",   28594, 0 },
68         { "iso88595",   28595, 0 },
69         { "iso88596",   28596, 0 },
70         { "iso88597",   28597, 0 },
71         { "iso88598",   28598, 0 },
72         { "iso88599",   28599, 0 },
73         { "koi8r",      20866, 0 },
74         { "koi8u",      21866, 0 },
75         { "ms936",      936, 0 },
76         { "shiftjis",   932, 0 },
77         { "sjis",       932, 0 },
78         { "usascii",    20127, 0 },
79         { "utf8",       65001, 0 },
80         { "windows1250",        1250, 0 },
81         { "windows1251",        1251, 0 },
82         { "windows1252",        1252, 0 },
83         { "windows1253",        1253, 0 },
84         { "windows1254",        1254, 0 },
85         { "windows1255",        1255, 0 },
86         { "windows1256",        1256, 0 },
87         { "windows1257",        1257, 0 },
88         { "windows874",         874, 0 },
89         { "windows932",         932, 0 },
90         { "windows936",         936, 0 },
91     };
92
93     size_t remove_substitutions(std::vector<char> &v)
94     {
95         if(std::find(v.begin(),v.end(),0) == v.end()) {
96             return v.size();
97         }
98         std::vector<char> v2;
99         v2.reserve(v.size());
100         for(unsigned i=0;i<v.size();i++) {
101             if(v[i]!=0)
102                 v2.push_back(v[i]);
103         }
104         v.swap(v2);
105         return v.size();
106     }
107
108     void multibyte_to_wide_one_by_one(int codepage,char const *begin,char const *end,std::vector<wchar_t> &buf)
109     {
110         buf.reserve(end-begin);
111         while(begin!=end) {
112             wchar_t wide_buf[4];
113             int n = 0;
114             int len = IsDBCSLeadByteEx(codepage,*begin) ? 2 : 1;
115             if(len == 2 && begin+1==end)
116                 return;
117             n = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,len,wide_buf,4);
118             for(int i=0;i<n;i++) 
119                 buf.push_back(wide_buf[i]);
120             begin+=len;
121         }
122     }
123
124     
125     void multibyte_to_wide(int codepage,char const *begin,char const *end,bool do_skip,std::vector<wchar_t> &buf)
126     {
127         if(begin==end)
128             return;
129         int n = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,0,0);
130         if(n == 0) {
131             if(do_skip) {
132                 multibyte_to_wide_one_by_one(codepage,begin,end,buf);
133                 return;
134             }
135             throw conversion_error();
136         }
137
138         buf.resize(n,0);
139         if(MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,&buf.front(),buf.size())==0)
140             throw conversion_error();
141     }
142
143     void wide_to_multibyte_non_zero(int codepage,wchar_t const *begin,wchar_t const *end,bool do_skip,std::vector<char> &buf)
144     {
145         if(begin==end)
146             return;
147         BOOL substitute = FALSE;
148         BOOL *substitute_ptr = codepage == 65001 || codepage == 65000 ? 0 : &substitute;
149         char subst_char = 0;
150         char *subst_char_ptr = codepage == 65001 || codepage == 65000 ? 0 : &subst_char;
151         
152         int n = WideCharToMultiByte(codepage,0,begin,end-begin,0,0,subst_char_ptr,substitute_ptr);
153         buf.resize(n);
154         
155         if(WideCharToMultiByte(codepage,0,begin,end-begin,&buf[0],n,subst_char_ptr,substitute_ptr)==0)
156             throw conversion_error();
157         if(substitute) {
158             if(do_skip) 
159                 remove_substitutions(buf);
160             else 
161                 throw conversion_error();
162         }
163     }
164     
165     void wide_to_multibyte(int codepage,wchar_t const *begin,wchar_t const *end,bool do_skip,std::vector<char> &buf)
166     {
167         if(begin==end)
168             return;
169         buf.reserve(end-begin);
170         wchar_t const *e = std::find(begin,end,L'\0');
171         wchar_t const *b = begin;
172         for(;;) {
173             std::vector<char> tmp;
174             wide_to_multibyte_non_zero(codepage,b,e,do_skip,tmp);
175             size_t osize = buf.size();
176             buf.resize(osize+tmp.size());
177             std::copy(tmp.begin(),tmp.end(),buf.begin()+osize);
178             if(e!=end) {
179                 buf.push_back('\0');
180                 b=e+1;
181                 e=std::find(b,end,L'0');
182             }
183             else 
184                 break;
185         }
186     }
187
188     
189     int encoding_to_windows_codepage(char const *ccharset)
190     {
191         std::string charset = normalize_encoding(ccharset);
192         windows_encoding ref;
193         ref.name = charset.c_str();
194         size_t n = sizeof(all_windows_encodings)/sizeof(all_windows_encodings[0]);
195         windows_encoding *begin = all_windows_encodings;
196         windows_encoding *end = all_windows_encodings + n;
197         windows_encoding *ptr = std::lower_bound(begin,end,ref);
198         if(ptr!=end && strcmp(ptr->name,charset.c_str())==0) {
199             if(ptr->was_tested) {
200                 return ptr->codepage;
201             }
202             else if(IsValidCodePage(ptr->codepage)) {
203                 // the thread safety is not an issue, maximum
204                 // it would be checked more then once
205                 ptr->was_tested=1;
206                 return ptr->codepage;
207             }
208             else {
209                 return -1;
210             }
211         }
212         return -1;
213         
214     }
215
216     template<typename CharType>
217     bool validate_utf16(CharType const *str,unsigned len)
218     {
219         CharType const *begin = str;
220         CharType const *end = str+len;
221         while(begin!=end) {
222             utf::code_point c = utf::utf_traits<CharType,2>::template decode<CharType const *>(begin,end);
223             if(c==utf::illegal || c==utf::incomplete)
224                 return false;
225         }
226         return true;
227     }
228
229     template<typename CharType,typename OutChar>
230     void clean_invalid_utf16(CharType const *str,unsigned len,std::vector<OutChar> &out)
231     {
232         out.reserve(len);
233         for(unsigned i=0;i<len;i++) {
234             uint16_t c = static_cast<uint16_t>(str[i]);
235
236             if(0xD800 <= c && c<= 0xDBFF) {
237                 i++;
238                 if(i>=len)
239                     return;
240                 uint16_t c2=static_cast<uint16_t>(str[i]);
241                 if(0xDC00 <= c2 && c2 <= 0xDFFF) {
242                     out.push_back(static_cast<OutChar>(c));
243                     out.push_back(static_cast<OutChar>(c2));
244                 }
245             }
246             else if(0xDC00 <= c && c <=0xDFFF)
247                 continue;
248             else
249                 out.push_back(static_cast<OutChar>(c));
250         }
251     }
252
253
254     class wconv_between : public converter_between {
255     public:
256         wconv_between() : 
257             how_(skip),
258             to_code_page_ (-1),
259             from_code_page_ ( -1)
260         {
261         }
262         bool open(char const *to_charset,char const *from_charset,method_type how)
263         {
264             how_ = how;
265             to_code_page_ = encoding_to_windows_codepage(to_charset);
266             from_code_page_ = encoding_to_windows_codepage(from_charset);
267             if(to_code_page_ == -1 || from_code_page_ == -1)
268                 return false;
269             return true;
270         }
271         virtual std::string convert(char const *begin,char const *end)
272         {
273             if(to_code_page_ == 65001 && from_code_page_ == 65001)
274                 return utf_to_utf<char>(begin,end,how_);
275
276             std::string res;
277             
278             std::vector<wchar_t> tmp;   // buffer for mb2w
279             std::wstring tmps;          // buffer for utf_to_utf
280             wchar_t const *wbegin=0;
281             wchar_t const *wend=0;
282             
283             if(from_code_page_ == 65001) {
284                 tmps = utf_to_utf<wchar_t>(begin,end,how_);
285                 if(tmps.empty())
286                     return res;
287                 wbegin = tmps.c_str();
288                 wend = wbegin + tmps.size();
289             }
290             else {
291                 multibyte_to_wide(from_code_page_,begin,end,how_ == skip,tmp);
292                 if(tmp.empty())
293                     return res;
294                 wbegin = &tmp[0];
295                 wend = wbegin + tmp.size();
296             }
297             
298             if(to_code_page_ == 65001) {
299                 return utf_to_utf<char>(wbegin,wend,how_);
300             }
301
302             std::vector<char> ctmp;
303             wide_to_multibyte(to_code_page_,wbegin,wend,how_ == skip,ctmp);
304             if(ctmp.empty())
305                 return res;
306             res.assign(&ctmp.front(),ctmp.size());
307             return res;
308         }
309     private:
310         method_type how_;
311         int to_code_page_;
312         int from_code_page_;
313     };
314     
315     template<typename CharType,int size = sizeof(CharType) >
316     class wconv_to_utf;
317
318     template<typename CharType,int size = sizeof(CharType) >
319     class wconv_from_utf;
320
321     template<>
322     class wconv_to_utf<char,1> : public  converter_to_utf<char> , public wconv_between {
323     public:
324         virtual bool open(char const *cs,method_type how) 
325         {
326             return wconv_between::open("UTF-8",cs,how);
327         }
328         virtual std::string convert(char const *begin,char const *end)
329         {
330             return wconv_between::convert(begin,end);
331         }
332     };
333     
334     template<>
335     class wconv_from_utf<char,1> : public  converter_from_utf<char> , public wconv_between {
336     public:
337         virtual bool open(char const *cs,method_type how) 
338         {
339             return wconv_between::open(cs,"UTF-8",how);
340         }
341         virtual std::string convert(char const *begin,char const *end)
342         {
343             return wconv_between::convert(begin,end);
344         }
345     };
346     
347     template<typename CharType>
348     class wconv_to_utf<CharType,2> : public converter_to_utf<CharType> {
349     public:
350         typedef CharType char_type;
351
352         typedef std::basic_string<char_type> string_type;
353
354         wconv_to_utf() : 
355             how_(skip),
356             code_page_(-1)
357         {
358         }
359
360         virtual bool open(char const *charset,method_type how)
361         {
362             how_ = how;
363             code_page_ = encoding_to_windows_codepage(charset);
364             return code_page_ != -1;
365         }
366
367         virtual string_type convert(char const *begin,char const *end) 
368         {
369             if(code_page_ == 65001) {
370                 return utf_to_utf<char_type>(begin,end,how_);
371             }
372             std::vector<wchar_t> tmp;
373             multibyte_to_wide(code_page_,begin,end,how_ == skip,tmp);
374             string_type res;
375             if(!tmp.empty())
376                 res.assign(reinterpret_cast<char_type *>(&tmp.front()),tmp.size());
377             return res;
378         }
379
380     private:
381         method_type how_;
382         int code_page_;
383     };
384   
385     template<typename CharType>
386     class wconv_from_utf<CharType,2> : public converter_from_utf<CharType> {
387     public:
388         typedef CharType char_type;
389
390         typedef std::basic_string<char_type> string_type;
391
392         wconv_from_utf() : 
393             how_(skip),
394             code_page_(-1)
395         {
396         }
397
398         virtual bool open(char const *charset,method_type how)
399         {
400             how_ = how;
401             code_page_ = encoding_to_windows_codepage(charset);
402             return code_page_ != -1;
403         }
404
405         virtual std::string convert(CharType const *begin,CharType const *end) 
406         {
407             if(code_page_ == 65001) {
408                 return utf_to_utf<char>(begin,end,how_);
409             }
410             wchar_t const *wbegin = 0;
411             wchar_t const *wend = 0;
412             std::vector<wchar_t> buffer; // if needed
413             if(begin==end)
414                 return std::string();
415             if(validate_utf16(begin,end-begin)) {
416                 wbegin =  reinterpret_cast<wchar_t const *>(begin);
417                 wend = reinterpret_cast<wchar_t const *>(end);
418             }
419             else {
420                 if(how_ == stop) {
421                         throw conversion_error();
422                 }
423                 else {
424                     clean_invalid_utf16(begin,end-begin,buffer);
425                     if(!buffer.empty()) {
426                         wbegin = &buffer[0];
427                         wend = wbegin + buffer.size();
428                     }
429                 }
430             }
431             std::string res;
432             if(wbegin==wend)
433                 return res;
434             std::vector<char> ctmp;
435             wide_to_multibyte(code_page_,wbegin,wend,how_ == skip,ctmp);
436             if(ctmp.empty())
437                 return res;
438             res.assign(&ctmp.front(),ctmp.size());
439             return res;
440         }
441
442     private:
443         method_type how_;
444         int code_page_;
445     };
446
447
448
449     template<typename CharType>
450     class wconv_to_utf<CharType,4> : public converter_to_utf<CharType> {
451     public:
452         typedef CharType char_type;
453
454         typedef std::basic_string<char_type> string_type;
455
456         wconv_to_utf() : 
457             how_(skip),
458             code_page_(-1)
459         {
460         }
461
462         virtual bool open(char const *charset,method_type how)
463         {
464             how_ = how;
465             code_page_ = encoding_to_windows_codepage(charset);
466             return code_page_ != -1;
467         }
468
469         virtual string_type convert(char const *begin,char const *end) 
470         {
471             if(code_page_ == 65001) {
472                 return utf_to_utf<char_type>(begin,end,how_);
473             }
474             std::vector<wchar_t> buf;
475             multibyte_to_wide(code_page_,begin,end,how_ == skip,buf);
476
477             if(buf.empty())
478                 return string_type();
479
480             return utf_to_utf<CharType>(&buf[0],&buf[0]+buf.size(),how_);
481         }
482     private:
483         method_type how_;
484         int code_page_;
485     };
486   
487     template<typename CharType>
488     class wconv_from_utf<CharType,4> : public converter_from_utf<CharType> {
489     public:
490         typedef CharType char_type;
491
492         typedef std::basic_string<char_type> string_type;
493
494         wconv_from_utf() : 
495             how_(skip),
496             code_page_(-1)
497         {
498         }
499
500         virtual bool open(char const *charset,method_type how)
501         {
502             how_ = how;
503             code_page_ = encoding_to_windows_codepage(charset);
504             return code_page_ != -1;
505         }
506
507         virtual std::string convert(CharType const *begin,CharType const *end) 
508         {
509             if(code_page_ == 65001) {
510                 return utf_to_utf<char>(begin,end,how_);
511             }
512             std::wstring tmp = utf_to_utf<wchar_t>(begin,end,how_);
513
514             std::vector<char> ctmp;
515             wide_to_multibyte(code_page_,tmp.c_str(),tmp.c_str()+tmp.size(),how_ == skip,ctmp);
516             std::string res;
517             if(ctmp.empty())
518                 return res;
519             res.assign(&ctmp.front(),ctmp.size());
520             return res;
521
522         }
523
524     private:
525         method_type how_;
526         int code_page_;
527     };
528
529
530
531
532
533 } // impl
534 } // conv
535 } // locale 
536 } // boost
537
538 #endif
539 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4