2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
9 #ifndef BOOST_LOCALE_IMPL_WCONV_CODEPAGE_HPP
10 #define BOOST_LOCALE_IMPL_WCONV_CODEPAGE_HPP
13 #include <boost/locale/encoding.hpp>
// One row of the encoding lookup table. Elsewhere in this file its members
// `name`, `codepage` and `was_tested` are accessed, and rows are
// brace-initialized as { "<name>", <number>, 0 }.
31 struct windows_encoding {
// Orders two table rows by their `name` strings using strcmp (byte-wise
// order). This is the comparison std::lower_bound uses when
// encoding_to_windows_codepage searches all_windows_encodings.
37 bool operator<(windows_encoding const &l,windows_encoding const &r)
39 return strcmp(l.name,r.name) < 0;
// Lookup table from encoding name to Windows code page number.
//
// encoding_to_windows_codepage() searches this array with std::lower_bound
// and the strcmp-based operator< above, so rows MUST stay sorted by name in
// strcmp order (e.g. "iso88591" < "iso885913" < "iso885915" < "iso88592").
// Lookup keys are first passed through normalize_encoding(), so names here
// are expected in that normalized form.
// NOTE(review): the trailing 0 in each row is presumably the initial value of
// the `was_tested` flag — confirm against the struct definition.
42 windows_encoding all_windows_encodings[] = {
44 { "cp1250", 1250, 0 },
45 { "cp1251", 1251, 0 },
46 { "cp1252", 1252, 0 },
47 { "cp1253", 1253, 0 },
48 { "cp1254", 1254, 0 },
49 { "cp1255", 1255, 0 },
50 { "cp1256", 1256, 0 },
51 { "cp1257", 1257, 0 },
55 { "eucjp", 20932, 0 },
56 { "euckr", 51949, 0 },
57 { "gb18030", 54936, 0 },
58 { "gb2312", 20936, 0 },
60 { "iso2022jp", 50220, 0 },
61 { "iso2022kr", 50225, 0 },
62 { "iso88591", 28591, 0 },
63 { "iso885913", 28603, 0 },
64 { "iso885915", 28605, 0 },
65 { "iso88592", 28592, 0 },
66 { "iso88593", 28593, 0 },
67 { "iso88594", 28594, 0 },
68 { "iso88595", 28595, 0 },
69 { "iso88596", 28596, 0 },
70 { "iso88597", 28597, 0 },
71 { "iso88598", 28598, 0 },
72 { "iso88599", 28599, 0 },
73 { "koi8r", 20866, 0 },
74 { "koi8u", 21866, 0 },
76 { "shiftjis", 932, 0 },
78 { "usascii", 20127, 0 },
80 { "windows1250", 1250, 0 },
81 { "windows1251", 1251, 0 },
82 { "windows1252", 1252, 0 },
83 { "windows1253", 1253, 0 },
84 { "windows1254", 1254, 0 },
85 { "windows1255", 1255, 0 },
86 { "windows1256", 1256, 0 },
87 { "windows1257", 1257, 0 },
88 { "windows874", 874, 0 },
89 { "windows932", 932, 0 },
90 { "windows936", 936, 0 },
// Post-processes a converted multibyte buffer in place; called from
// wide_to_multibyte_non_zero() on its output buffer.
// NOTE(review): presumably zero bytes mark characters that were substituted
// during conversion and are stripped here, with the resulting size returned —
// confirm against the full body.
93 size_t remove_substitutions(std::vector<char> &v)
// Check whether the buffer contains any zero byte at all.
95 if(std::find(v.begin(),v.end(),0) == v.end()) {
// Otherwise walk every element of the buffer.
100 for(unsigned i=0;i<v.size();i++) {
// Converts [begin,end) from `codepage` to wide characters one input character
// at a time, appending the results to `buf`. Called by multibyte_to_wide().
// NOTE(review): how a failed single-character conversion is handled is not
// visible here — presumably the character is dropped; confirm.
108 void multibyte_to_wide_one_by_one(int codepage,char const *begin,char const *end,std::vector<wchar_t> &buf)
// Reserve one output slot per input byte up front.
110 buf.reserve(end-begin);
// A DBCS lead byte starts a two-byte character; anything else is one byte.
114 int len = IsDBCSLeadByteEx(codepage,*begin) ? 2 : 1;
// A lead byte that is the last byte of the input has no trail byte.
115 if(len == 2 && begin+1==end)
// Convert only this character; MB_ERR_INVALID_CHARS makes the call fail on
// invalid input instead of silently substituting.
117 n = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,len,wide_buf,4);
// Append the wide characters produced for this input character.
119 buf.push_back(wide_buf[i]);
// Converts the bytes in [begin,end) from `codepage` into wide characters
// stored in `buf`. Throws conversion_error when conversion fails.
// Callers in this file pass `how_ == skip` as `do_skip`.
125 void multibyte_to_wide(int codepage,char const *begin,char const *end,bool do_skip,std::vector<wchar_t> &buf)
// Sizing pass: a null output buffer of size 0 asks the API only for the
// number of wide characters required.
129 int n = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,0,0);
// Per-character fallback.
// NOTE(review): the guarding condition is not visible here — presumably taken
// when the sizing pass failed and do_skip is set; confirm.
132 multibyte_to_wide_one_by_one(codepage,begin,end,buf);
135 throw conversion_error();
// Conversion pass into buf; a zero return value signals failure.
139 if(MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,&buf.front(),buf.size())==0)
140 throw conversion_error();
// Converts the wide characters in [begin,end) to `codepage`, writing the
// bytes to `buf`. wide_to_multibyte() calls this once per segment between
// embedded NUL characters. Throws conversion_error when conversion fails.
143 void wide_to_multibyte_non_zero(int codepage,wchar_t const *begin,wchar_t const *end,bool do_skip,std::vector<char> &buf)
// Set by the API when at least one character had to be replaced.
147 BOOL substitute = FALSE;
// 65001 / 65000 are the UTF-8 / UTF-7 code pages; for those,
// WideCharToMultiByte requires both "default char" arguments to be null.
148 BOOL *substitute_ptr = codepage == 65001 || codepage == 65000 ? 0 : &substitute;
150 char *subst_char_ptr = codepage == 65001 || codepage == 65000 ? 0 : &subst_char;
// Sizing pass: null output buffer of size 0 returns the required byte count.
152 int n = WideCharToMultiByte(codepage,0,begin,end-begin,0,0,subst_char_ptr,substitute_ptr);
// Conversion pass; a zero return value signals failure.
155 if(WideCharToMultiByte(codepage,0,begin,end-begin,&buf[0],n,subst_char_ptr,substitute_ptr)==0)
156 throw conversion_error();
// NOTE(review): the conditions guarding the next two statements are not
// visible here — presumably substitutions are removed when do_skip is set and
// reported as an error otherwise; confirm.
159 remove_substitutions(buf);
161 throw conversion_error();
165 void wide_to_multibyte(int codepage,wchar_t const *begin,wchar_t const *end,bool do_skip,std::vector<char> &buf)
169 buf.reserve(end-begin);
170 wchar_t const *e = std::find(begin,end,L'\0');
171 wchar_t const *b = begin;
173 std::vector<char> tmp;
174 wide_to_multibyte_non_zero(codepage,b,e,do_skip,tmp);
175 size_t osize = buf.size();
176 buf.resize(osize+tmp.size());
177 std::copy(tmp.begin(),tmp.end(),buf.begin()+osize);
181 e=std::find(b,end,L'0');
// Maps an encoding name to a Windows code page number by looking it up in
// all_windows_encodings.
// NOTE(review): the failure return value is not visible here; callers in this
// file treat -1 as "unsupported encoding".
189 int encoding_to_windows_codepage(char const *ccharset)
// Normalize the requested name before matching it against the table.
191 std::string charset = normalize_encoding(ccharset);
// Build a search key that compares by name only.
192 windows_encoding ref;
193 ref.name = charset.c_str();
194 size_t n = sizeof(all_windows_encodings)/sizeof(all_windows_encodings[0]);
195 windows_encoding *begin = all_windows_encodings;
196 windows_encoding *end = all_windows_encodings + n;
// Binary search; requires the table to be sorted by name in strcmp order.
197 windows_encoding *ptr = std::lower_bound(begin,end,ref);
// lower_bound returns the first row not less than the key, so an exact-match
// check is still needed.
198 if(ptr!=end && strcmp(ptr->name,charset.c_str())==0) {
// This row was already validated: return its code page directly.
199 if(ptr->was_tested) {
200 return ptr->codepage;
// First use of this row: ask the system whether the code page is valid.
202 else if(IsValidCodePage(ptr->codepage)) {
203 // the thread safety is not an issue, maximum
204 // it would be checked more than once
206 return ptr->codepage;
// Checks `len` UTF-16 code units starting at `str` by decoding them with
// utf::utf_traits<CharType,2>.
// NOTE(review): the return statements are not visible here — presumably
// returns false on the first illegal or incomplete sequence and true
// otherwise; confirm.
216 template<typename CharType>
217 bool validate_utf16(CharType const *str,unsigned len)
219 CharType const *begin = str;
220 CharType const *end = str+len;
// Decode one code point, advancing `begin` past the units it consumed.
222 utf::code_point c = utf::utf_traits<CharType,2>::template decode<CharType const *>(begin,end);
// The decoder reports malformed or truncated input through these two values.
223 if(c==utf::illegal || c==utf::incomplete)
// Copies `len` UTF-16 code units from `str` into `out`, filtering surrogates:
// a high surrogate is emitted only together with the low surrogate that
// follows it, and ordinary (non-surrogate) units are copied unchanged.
229 template<typename CharType,typename OutChar>
230 void clean_invalid_utf16(CharType const *str,unsigned len,std::vector<OutChar> &out)
233 for(unsigned i=0;i<len;i++) {
234 uint16_t c = static_cast<uint16_t>(str[i]);
// High (leading) surrogate range.
236 if(0xD800 <= c && c<= 0xDBFF) {
// NOTE(review): `i` is presumably advanced (and bounds-checked) on lines not
// visible here before the next unit is read — confirm.
240 uint16_t c2=static_cast<uint16_t>(str[i]);
// Emit the pair only when the following unit is a low (trailing) surrogate.
241 if(0xDC00 <= c2 && c2 <= 0xDFFF) {
242 out.push_back(static_cast<OutChar>(c));
243 out.push_back(static_cast<OutChar>(c2));
// Low surrogate with no preceding high surrogate.
246 else if(0xDC00 <= c && c <=0xDFFF)
// Any other code unit is copied through as is.
249 out.push_back(static_cast<OutChar>(c));
// Converter between two narrow (char-based) encodings, implemented on top of
// the code-page helpers in this file. Text is converted from the source code
// page to wide characters and from there to the target code page.
254 class wconv_between : public converter_between {
// -1 is the "no code page" value until open() succeeds.
259 from_code_page_ ( -1)
// Resolves both charset names to Windows code pages.
// NOTE(review): presumably returns false when either name is unknown — the
// return statements are not visible here; confirm.
262 bool open(char const *to_charset,char const *from_charset,method_type how)
265 to_code_page_ = encoding_to_windows_codepage(to_charset);
266 from_code_page_ = encoding_to_windows_codepage(from_charset);
267 if(to_code_page_ == -1 || from_code_page_ == -1)
// Converts the bytes in [begin,end) from the source to the target encoding.
271 virtual std::string convert(char const *begin,char const *end)
// UTF-8 (code page 65001) to UTF-8 is handled by utf_to_utf without going
// through the Windows API.
273 if(to_code_page_ == 65001 && from_code_page_ == 65001)
274 return utf_to_utf<char>(begin,end,how_);
278 std::vector<wchar_t> tmp; // buffer for mb2w
279 std::wstring tmps; // buffer for utf_to_utf
// [wbegin,wend) will designate the intermediate wide-character text, which
// lives in either `tmps` or `tmp`.
280 wchar_t const *wbegin=0;
281 wchar_t const *wend=0;
// Step 1: source encoding -> wide characters.
283 if(from_code_page_ == 65001) {
284 tmps = utf_to_utf<wchar_t>(begin,end,how_);
287 wbegin = tmps.c_str();
288 wend = wbegin + tmps.size();
291 multibyte_to_wide(from_code_page_,begin,end,how_ == skip,tmp);
295 wend = wbegin + tmp.size();
// Step 2: wide characters -> target encoding.
298 if(to_code_page_ == 65001) {
299 return utf_to_utf<char>(wbegin,wend,how_);
302 std::vector<char> ctmp;
303 wide_to_multibyte(to_code_page_,wbegin,wend,how_ == skip,ctmp);
// Copy the converted bytes into the result string.
306 res.assign(&ctmp.front(),ctmp.size());
// Primary templates for the converters; the second parameter defaults to
// sizeof(CharType) so that the specializations below are selected by
// character width (1, 2 or 4 bytes).
315 template<typename CharType,int size = sizeof(CharType) >
318 template<typename CharType,int size = sizeof(CharType) >
319 class wconv_from_utf;
// Narrow-char "to UTF" converter: converts text in a given charset to UTF-8
// by delegating to wconv_between.
322 class wconv_to_utf<char,1> : public converter_to_utf<char> , public wconv_between {
// Opens a conversion from charset `cs` to "UTF-8".
324 virtual bool open(char const *cs,method_type how)
326 return wconv_between::open("UTF-8",cs,how);
// Converts [begin,end) using the code pages selected by open().
328 virtual std::string convert(char const *begin,char const *end)
330 return wconv_between::convert(begin,end);
// Narrow-char "from UTF" converter: converts UTF-8 text to a given charset
// by delegating to wconv_between.
335 class wconv_from_utf<char,1> : public converter_from_utf<char> , public wconv_between {
// Opens a conversion from "UTF-8" to charset `cs`.
337 virtual bool open(char const *cs,method_type how)
339 return wconv_between::open(cs,"UTF-8",how);
// Converts [begin,end) using the code pages selected by open().
341 virtual std::string convert(char const *begin,char const *end)
343 return wconv_between::convert(begin,end);
// "To UTF" converter for 2-byte character types: converts text in a given
// charset to a string of CharType.
347 template<typename CharType>
348 class wconv_to_utf<CharType,2> : public converter_to_utf<CharType> {
350 typedef CharType char_type;
352 typedef std::basic_string<char_type> string_type;
// Resolves `charset` to a code page; reports success as "code page != -1".
360 virtual bool open(char const *charset,method_type how)
363 code_page_ = encoding_to_windows_codepage(charset);
364 return code_page_ != -1;
// Converts the bytes in [begin,end) from the opened charset.
367 virtual string_type convert(char const *begin,char const *end)
// UTF-8 input (code page 65001) is handled by utf_to_utf directly.
369 if(code_page_ == 65001) {
370 return utf_to_utf<char_type>(begin,end,how_);
372 std::vector<wchar_t> tmp;
373 multibyte_to_wide(code_page_,begin,end,how_ == skip,tmp);
// The wchar_t buffer is reinterpreted as char_type.
// NOTE(review): this relies on wchar_t and char_type both being 2 bytes wide,
// and on `tmp` being non-empty (any guard is not visible here) — confirm.
376 res.assign(reinterpret_cast<char_type *>(&tmp.front()),tmp.size());
// "From UTF" converter for 2-byte character types: converts a range of
// CharType to a narrow string in a given charset.
385 template<typename CharType>
386 class wconv_from_utf<CharType,2> : public converter_from_utf<CharType> {
388 typedef CharType char_type;
390 typedef std::basic_string<char_type> string_type;
// Resolves `charset` to a code page; reports success as "code page != -1".
398 virtual bool open(char const *charset,method_type how)
401 code_page_ = encoding_to_windows_codepage(charset);
402 return code_page_ != -1;
// Converts [begin,end) to the opened charset.
405 virtual std::string convert(CharType const *begin,CharType const *end)
// UTF-8 output (code page 65001) is handled by utf_to_utf directly.
407 if(code_page_ == 65001) {
408 return utf_to_utf<char>(begin,end,how_);
// [wbegin,wend) will designate the wide text handed to the Windows API:
// either the caller's input itself or the cleaned copy in `buffer`.
410 wchar_t const *wbegin = 0;
411 wchar_t const *wend = 0;
412 std::vector<wchar_t> buffer; // if needed
414 return std::string();
// Valid input is used in place, reinterpreted as wchar_t.
415 if(validate_utf16(begin,end-begin)) {
416 wbegin = reinterpret_cast<wchar_t const *>(begin);
417 wend = reinterpret_cast<wchar_t const *>(end);
// NOTE(review): the branch conditions for invalid input are not visible here
// — presumably it throws unless the method is `skip`, in which case a cleaned
// copy is built; confirm.
421 throw conversion_error();
424 clean_invalid_utf16(begin,end-begin,buffer);
425 if(!buffer.empty()) {
427 wend = wbegin + buffer.size();
434 std::vector<char> ctmp;
435 wide_to_multibyte(code_page_,wbegin,wend,how_ == skip,ctmp);
// Copy the converted bytes into the result string.
438 res.assign(&ctmp.front(),ctmp.size());
// "To UTF" converter for 4-byte character types: converts text in a given
// charset to a string of CharType, using wide characters as an intermediate.
449 template<typename CharType>
450 class wconv_to_utf<CharType,4> : public converter_to_utf<CharType> {
452 typedef CharType char_type;
454 typedef std::basic_string<char_type> string_type;
// Resolves `charset` to a code page; reports success as "code page != -1".
462 virtual bool open(char const *charset,method_type how)
465 code_page_ = encoding_to_windows_codepage(charset);
466 return code_page_ != -1;
// Converts the bytes in [begin,end) from the opened charset.
469 virtual string_type convert(char const *begin,char const *end)
// UTF-8 input (code page 65001) is handled by utf_to_utf directly.
471 if(code_page_ == 65001) {
472 return utf_to_utf<char_type>(begin,end,how_);
// Step 1: source charset -> wide characters.
474 std::vector<wchar_t> buf;
475 multibyte_to_wide(code_page_,begin,end,how_ == skip,buf);
478 return string_type();
// Step 2: wide characters -> CharType.
480 return utf_to_utf<CharType>(&buf[0],&buf[0]+buf.size(),how_);
// "From UTF" converter for 4-byte character types: converts a range of
// CharType to a narrow string in a given charset, using wide characters as
// an intermediate.
487 template<typename CharType>
488 class wconv_from_utf<CharType,4> : public converter_from_utf<CharType> {
490 typedef CharType char_type;
492 typedef std::basic_string<char_type> string_type;
// Resolves `charset` to a code page; reports success as "code page != -1".
500 virtual bool open(char const *charset,method_type how)
503 code_page_ = encoding_to_windows_codepage(charset);
504 return code_page_ != -1;
// Converts [begin,end) to the opened charset.
507 virtual std::string convert(CharType const *begin,CharType const *end)
// UTF-8 output (code page 65001) is handled by utf_to_utf directly.
509 if(code_page_ == 65001) {
510 return utf_to_utf<char>(begin,end,how_);
// Step 1: CharType -> wide characters.
512 std::wstring tmp = utf_to_utf<wchar_t>(begin,end,how_);
// Step 2: wide characters -> target code page.
514 std::vector<char> ctmp;
515 wide_to_multibyte(code_page_,tmp.c_str(),tmp.c_str()+tmp.size(),how_ == skip,ctmp);
// Copy the converted bytes into the result string.
519 res.assign(&ctmp.front(),ctmp.size());
539 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4