1 // This file is part of The New Aspell
2 // Copyright (C) 2001 by Kevin Atkinson under the GNU LGPL license
3 // version 2.0 or 2.1. You should have received a copy of the LGPL
4 // license along with this library if you did not you can find
5 // it at http://www.gnu.org/.
11 #include "asc_ctype.hpp"
12 #include "convert.hpp"
13 #include "fstream.hpp"
14 #include "getdata.hpp"
17 #include "stack_ptr.hpp"
18 #include "cache-t.hpp"
19 #include "file_util.hpp"
20 #include "file_data_util.hpp"
21 #include "vararray.hpp"
23 #include "iostream.hpp"
// Short aliases for the unsigned integer types used throughout this file.
// NOTE(review): the names Uni16/Uni32 suggest 16- and 32-bit widths, but
// `unsigned short` / `unsigned int` only have minimum sizes guaranteed by the
// language -- confirm for the supported platforms.
29 typedef unsigned char byte;
30 typedef unsigned char Uni8;
31 typedef unsigned short Uni16;
32 typedef unsigned int Uni32;
35 //////////////////////////////////////////////////////////////////////
36 //////////////////////////////////////////////////////////////////////
40 //////////////////////////////////////////////////////////////////////
41 //////////////////////////////////////////////////////////////////////
43 //////////////////////////////////////////////////////////////////////
51 static const Uni32 npos = (Uni32)(-1);
// Return the value stored for `key`. The key is converted to unsigned char
// before indexing, so a negative `char` never yields a negative index.
54 Uni32 operator[] (char key) const {return data[(unsigned char)key];}
// True when the slot for `key` holds something other than npos.
55 bool have(char key) const {return data[(unsigned char)key] != npos;}
// Store `value` in the slot for `key`; defined out of line.
56 bool insert(char key, Uni32 value);
59 void ToUniLookup::reset()
61 for (int i = 0; i != 256; ++i)
65 bool ToUniLookup::insert(char key, Uni32 value)
67 if (data[(unsigned char)key] != npos)
69 data[(unsigned char)key] = value;
73 //////////////////////////////////////////////////////////////////////
78 // Assumes that the maximum number of items in the table is 256
79 // Also assumes (unsigned char)i == i % 256
81 // Based on the iso-8859-* character sets it is very fast, almost all
82 // lookups involving no more than 2 comparisons.
83 // No lookup involved more than 3 comparisons.
84 // Also, no division (or modulus) is done whatsoever.
95 static const Uni32 npos = (Uni32)(-1);
96 UniItem * overflow_end;
100 UniItem overflow[256]; // you can never be too careful;
105 inline char operator() (Uni32 key, char unknown = '?') const;
106 bool insert(Uni32 key, char value);
109 void FromUniLookup::reset()
111 for (unsigned i = 0; i != 256*4; ++i)
113 overflow_end = overflow;
// Map the key `k` to its 8-bit character, returning `unknown` when the key
// is not found in the table.
116 inline char FromUniLookup::operator() (Uni32 k, char unknown) const
// The low byte of the key selects a group of 4 consecutive slots in `data`.
118 const UniItem * i = data + (unsigned char)k * 4;
// The same comparison is made once per slot of the group.
// NOTE(review): the statements that advance `i` between the comparisons are
// not visible in this excerpt.
120 if (i->key == k) return i->value;
122 if (i->key == k) return i->value;
124 if (i->key == k) return i->value;
126 if (i->key == k) return i->value;
// A slot whose key is npos ends the search: the key is reported as unknown.
128 if (i->key == npos) return unknown;
// Otherwise fall back to a linear scan of the overflow area.
130 for(i = overflow; i != overflow_end; ++i)
131 if (i->key == k) return i->value;
136 bool FromUniLookup::insert(Uni32 k, char v)
138 UniItem * i = data + (unsigned char)k * 4;
140 while (i != e && i->key != npos) {
146 for(i = overflow; i != overflow_end; ++i)
147 if (i->key == k) return false;
154 //////////////////////////////////////////////////////////////////////
165 char operator[] (char key) const {return data[(unsigned char)key];}
166 bool insert(char key, char value);
169 void CharLookup::reset() {
170 for (int i = 0; i != 256; ++i)
174 bool CharLookup::insert(char key, char value)
176 if (data[(unsigned char)key] != -1)
178 data[(unsigned char)key] = value;
182 //////////////////////////////////////////////////////////////////////
190 static const unsigned struct_size;
196 T data[1]; // hack for data[]
200 const unsigned NormTable<T>::struct_size = sizeof(NormTable<T>) - 1;
202 template <class T, class From>
205 const typename T::To * to;
207 NormLookupRet(const typename T::To * t, From * l)
211 template <class T, class From>
212 static inline NormLookupRet<T,From> norm_lookup(const NormTable<T> * d,
213 From * s, From * stop,
214 const typename T::To * def,
219 const T * i = d->data + (static_cast<typename T::From>(*s) & d->mask);
221 if (i->from == static_cast<typename T::From>(*s)) {
223 // really tail recursion
224 if (i->to[1] != T::to_non_char) {def = i->to; prev = s;}
225 d = (const NormTable<T> *)(i->sub_table);
229 return NormLookupRet<T,From>(i->to, s);
233 if (i >= d->end) break;
237 return NormLookupRet<T,From>(def, prev);
241 void free_norm_table(NormTable<T> * d)
243 for (T * cur = d->data; cur != d->end; ++cur) {
245 free_norm_table<T>(static_cast<NormTable<T> *>(cur->sub_table));
250 struct FromUniNormEntry
256 static const From from_non_char = (From)(-1);
257 static const To to_non_char = 0x10;
258 static const unsigned max_to = 4;
262 __attribute__ ((aligned (16)))
266 struct ToUniNormEntry
272 static const From from_non_char = 0x10;
273 static const To to_non_char = 0x10;
274 static const unsigned max_to = 3;
278 __attribute__ ((aligned (16)))
282 //////////////////////////////////////////////////////////////////////
287 PosibErr<void> read_in_char_data (const Config & config,
290 FromUniLookup & from)
295 String dir1,dir2,file_name;
296 fill_data_dir(&config, dir1, dir2);
297 find_file(file_name,dir1,dir2,encoding,".cset");
300 PosibErrBase err = data.open(file_name, "r");
303 snprintf(mesg, 300, _("This could also mean that the file \"%s\" could not be opened for reading or does not exist."),
305 return make_err(unknown_encoding, encoding, mesg);
312 p = get_nb_line(data, line);
314 for (chr = 0; chr != 256; ++chr) {
315 p = get_nb_line(data, line);
// get_nb_line() is used as a loop condition elsewhere in this file, so it can
// yield a null pointer when the input runs out. Treat a file that ends before
// all 256 entries were read as malformed instead of passing null to strtoul().
316 if (!p || strtoul(p, 0, 16) != chr)
317 return make_err(bad_file_format, file_name);
318 uni = strtoul(p + 3, 0, 16);
320 from.insert(uni, chr);
326 //////////////////////////////////////////////////////////////////////
337 Tally(int s, int * d) : size(s), mask(s - 1), max(0), data(d) {
338 memset(data, 0, sizeof(int)*size);
340 void add(Uni32 chr) {
341 Uni32 p = chr & mask;
343 if (data[p] > max) max = data[p];
// sanity(check): when `check` evaluates to false, return from the enclosing
// function with the error produced by sanity_fail(). The expansion refers to
// a name FUNC, so the enclosing function must have one in scope.
347 # define sanity(check) \
348 if (!(check)) return sanity_fail(__FILE__, FUNC, __LINE__, #check)
350 static PosibErrBase sanity_fail(const char * file, const char * func,
351 unsigned line, const char * check_str)
354 snprintf(mesg, 500, "%s:%d: %s: Assertion \"%s\" failed.",
355 file, line, func, check_str);
356 return make_err(bad_input_error, mesg);
// CREATE_NORM_TABLE(T, in, buf, res): call create_norm_table<T>(in, buf).
// If the call reports an error, return that error from the enclosing
// function; otherwise assign the resulting table pointer to `res`.
// The do { ... } while(false) wrapper lets the expansion be used as a single
// statement.
358 # define CREATE_NORM_TABLE(T, in, buf, res) \
359 do { PosibErr<NormTable<T> *> pe( create_norm_table<T>(in,buf) );\
360 if (pe.has_err()) return PosibErrBase(pe); \
361 res = pe.data; } while(false)
364 static PosibErr< NormTable<T> * > create_norm_table(IStream & in, String & buf)
366 const char FUNC[] = "create_norm_table";
367 const char * p = get_nb_line(in, buf);
370 int size = strtoul(p, (char **)&p, 10);
371 VARARRAY(T, d, size);
372 memset(d, 0, sizeof(T) * size);
373 int sz = 1 << (unsigned)floor(log(size <= 1 ? 1.0 : size - 1)/log(2.0));
374 VARARRAY(int, tally0_d, sz); Tally tally0(sz, tally0_d);
375 VARARRAY(int, tally1_d, sz*2); Tally tally1(sz*2, tally1_d);
376 VARARRAY(int, tally2_d, sz*4); Tally tally2(sz*4, tally2_d);
378 while (p = get_nb_line(in, buf), *p != '.') {
379 Uni32 f = strtoul(p, (char **)&p, 16);
380 cur->from = static_cast<typename T::From>(f);
381 sanity(f == cur->from);
394 Uni32 t = strtoul(p, (char **)&p, 16);
396 sanity(i < d->max_to);
397 cur->to[i] = static_cast<typename T::To>(t);
398 sanity(t == static_cast<Uni32>(cur->to[i]));
402 cur->to[1] = T::to_non_char;
405 if (*p == '/') CREATE_NORM_TABLE(T, in, buf, cur->sub_table);
408 sanity(cur - d == size);
409 Tally * which = &tally0;
410 if (which->max > tally1.max) which = &tally1;
411 if (which->max > tally2.max) which = &tally2;
// Allocate the header plus which->size * which->max entries in one chunk.
// calloc() already returns zero-filled storage, so no separate memset() of
// the same region is needed afterwards.
412 NormTable<T> * final = (NormTable<T> *)calloc(1, NormTable<T>::struct_size +
413 sizeof(T) * which->size * which->max);
415 final->mask = which->size - 1;
416 final->height = which->size;
417 final->width = which->max;
418 final->end = final->data + which->size * which->max;
420 for (cur = d; cur != d + size; ++cur) {
421 T * dest = final->data + (cur->from & final->mask);
422 while (dest->from != 0) dest += final->height;
424 if (dest->from == 0) dest->from = T::from_non_char;
426 for (T * dest = final->data; dest < final->end; dest += final->height) {
427 if (dest->from == 0 || (dest->from == T::from_non_char && dest->to[0] == 0)) {
428 dest->from = T::from_non_char;
429 dest->to[0] = T::to_non_char;
435 static PosibErr<void> init_norm_tables(FStream & in, NormTables * d)
437 const char FUNC[] = "init_norm_tables";
441 sanity (l == "INTERNAL");
445 CREATE_NORM_TABLE(FromUniNormEntry, in, l, d->internal);
448 sanity (l == "STRICT");
449 char * p = get_nb_line(in, l);
452 CREATE_NORM_TABLE(FromUniNormEntry, in, l, d->strict_d);
453 d->strict = d->strict_d;
457 sanity(strcmp(p, "INTERNAL") == 0);
458 d->strict = d->internal;
460 while (get_nb_line(in, l)) {
462 d->to_uni.push_back(NormTables::ToUniTable());
463 NormTables::ToUniTable & e = d->to_uni.back();
464 e.name.resize(l.size());
465 for (unsigned i = 0; i != l.size(); ++i)
466 e.name[i] = asc_tolower(l[i]);
467 char * p = get_nb_line(in, l);
470 CREATE_NORM_TABLE(ToUniNormEntry, in, l, e.data);
475 for (char * q = p; *q; ++q) *q = asc_tolower(*q);
476 Vector<NormTables::ToUniTable>::iterator i = d->to_uni.begin();
// Test for the end of the vector before dereferencing the iterator, so a
// name that matches no table stops at to_uni.end() without reading through it.
477 while (i != d->to_uni.end() && i->name != p) ++i;
478 sanity(i != d->to_uni.end());
486 PosibErr<NormTables *> NormTables::get_new(const String & encoding,
487 const Config * config)
489 String dir1,dir2,file_name;
490 fill_data_dir(config, dir1, dir2);
491 find_file(file_name,dir1,dir2,encoding,".cmap");
494 PosibErrBase err = in.open(file_name, "r");
497 snprintf(mesg, 300, _("This could also mean that the file \"%s\" could not be opened for reading or does not exist."),
499 return make_err(unknown_encoding, encoding, mesg); // FIXME
502 NormTables * d = new NormTables;
504 err = init_norm_tables(in, d);
506 return make_err(bad_file_format, file_name, err.get_err()->mesg);
513 NormTables::~NormTables()
515 free_norm_table<FromUniNormEntry>(internal);
517 free_norm_table<FromUniNormEntry>(strict_d);
518 for (unsigned i = 0; i != to_uni.size(); ++i) {
520 free_norm_table<ToUniNormEntry>(to_uni[i].data);
524 //////////////////////////////////////////////////////////////////////
525 //////////////////////////////////////////////////////////////////////
529 //////////////////////////////////////////////////////////////////////
530 //////////////////////////////////////////////////////////////////////
533 bool operator== (const Convert & rhs, const Convert & lhs)
535 return strcmp(rhs.in_code(), lhs.in_code()) == 0
536 && strcmp(rhs.out_code(), lhs.out_code()) == 0;
539 //////////////////////////////////////////////////////////////////////
541 // Trivial Conversion
544 const char * unsupported_null_term_wide_string_msg =
545 "Null-terminated wide-character strings unsupported when used this way.";
547 template <typename Chr>
548 struct DecodeDirect : public Decode
550 DecodeDirect() {type_width = sizeof(Chr);}
551 void decode(const char * in0, int size, FilterCharVector & out) const {
552 const Chr * in = reinterpret_cast<const Chr *>(in0);
553 if (size == -sizeof(Chr)) {
555 out.append(*in, sizeof(Chr));
556 } else if (size <= -1) {
557 fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg);
560 const Chr * stop = reinterpret_cast<const Chr *>(in0) + size/sizeof(Chr);
561 for (;in != stop; ++in)
562 out.append(*in, sizeof(Chr));
565 PosibErr<void> decode_ec(const char * in0, int size,
566 FilterCharVector & out, ParmStr) const {
567 DecodeDirect::decode(in0, size, out);
572 template <typename Chr>
573 struct EncodeDirect : public Encode
575 EncodeDirect() {type_width = sizeof(Chr);}
576 void encode(const FilterChar * in, const FilterChar * stop,
577 CharVector & out) const {
578 for (; in != stop; ++in) {
580 if (c != in->chr) c = '?';
581 out.append(&c, sizeof(Chr));
584 PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop,
585 CharVector & out, ParmStr orig) const {
586 for (; in != stop; ++in) {
590 snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr);
591 return make_err(invalid_string, orig, m);
594 out.append(&c, sizeof(Chr));
598 bool encode(FilterChar * &, FilterChar * &, FilterCharVector &) const {
603 template <typename Chr>
604 struct ConvDirect : public DirectConv
606 ConvDirect() {type_width = sizeof(Chr);}
607 void convert(const char * in0, int size, CharVector & out) const {
608 if (size == -sizeof(Chr)) {
609 const Chr * in = reinterpret_cast<const Chr *>(in0);
610 for (;*in != 0; ++in)
611 out.append(in, sizeof(Chr));
612 } else if (size <= -1) {
613 fprintf(stderr, "%s\n", unsupported_null_term_wide_string_msg);
616 out.append(in0, size);
619 PosibErr<void> convert_ec(const char * in0, int size,
620 CharVector & out, ParmStr) const {
621 ConvDirect::convert(in0, size, out);
626 //////////////////////////////////////////////////////////////////////
631 struct DecodeLookup : public Decode
634 PosibErr<void> init(ParmStr code, const Config & c) {
635 FromUniLookup unused;
636 return read_in_char_data(c, code, lookup, unused);
638 void decode(const char * in, int size, FilterCharVector & out) const {
641 out.append(lookup[*in]);
643 const char * stop = in + size;
644 for (;in != stop; ++in)
645 out.append(lookup[*in]);
648 PosibErr<void> decode_ec(const char * in, int size,
649 FilterCharVector & out, ParmStr) const {
650 DecodeLookup::decode(in, size, out);
655 struct DecodeNormLookup : public Decode
657 typedef ToUniNormEntry E;
659 DecodeNormLookup(NormTable<E> * d) : data(d) {}
660 // must be null terminated
661 // FIXME: Why must it be null terminated?
662 void decode(const char * in, int size, FilterCharVector & out) const {
663 const char * stop = in + size; // will work even if size -1
666 if (size == -1) break;
670 NormLookupRet<E,const char> ret = norm_lookup<E>(data, in, stop, 0, in);
671 for (unsigned i = 0; ret.to[i] && i < E::max_to; ++i)
672 out.append(ret.to[i]);
677 PosibErr<void> decode_ec(const char * in, int size,
678 FilterCharVector & out, ParmStr) const {
679 DecodeNormLookup::decode(in, size, out);
684 struct EncodeLookup : public Encode
686 FromUniLookup lookup;
687 PosibErr<void> init(ParmStr code, const Config & c)
689 return read_in_char_data(c, code, unused, lookup);}
690 void encode(const FilterChar * in, const FilterChar * stop,
691 CharVector & out) const {
692 for (; in != stop; ++in) {
693 out.append(lookup(*in));
696 PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop,
697 CharVector & out, ParmStr orig) const {
698 for (; in != stop; ++in) {
699 char c = lookup(*in, '\0');
700 if (c == '\0' && in->chr != 0) {
702 snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr);
703 return make_err(invalid_string, orig, m);
709 bool encode(FilterChar * & in0, FilterChar * & stop,
710 FilterCharVector & out) const {
711 FilterChar * in = in0;
712 for (; in != stop; ++in)
718 struct EncodeNormLookup : public Encode
720 typedef FromUniNormEntry E;
722 EncodeNormLookup(NormTable<E> * d) : data(d) {}
723 // *stop must equal 0
724 void encode(const FilterChar * in, const FilterChar * stop,
725 CharVector & out) const {
731 NormLookupRet<E,const FilterChar> ret = norm_lookup<E>(data, in, stop, (const byte *)"?", in);
732 for (unsigned i = 0; i < E::max_to && ret.to[i]; ++i)
733 out.append(ret.to[i]);
738 PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop,
739 CharVector & out, ParmStr orig) const {
745 NormLookupRet<E,const FilterChar> ret = norm_lookup<E>(data, in, stop, 0, in);
748 snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr);
749 return make_err(invalid_string, orig, m);
751 for (unsigned i = 0; i < E::max_to && ret.to[i]; ++i)
752 out.append(ret.to[i]);
758 bool encode(FilterChar * & in, FilterChar * & stop,
759 FilterCharVector & buf) const {
763 buf.append(FilterChar(0));
766 NormLookupRet<E,FilterChar> ret = norm_lookup<E>(data, in, stop, (const byte *)"?", in);
767 const FilterChar * end = ret.last + 1;
769 for (; in != end; ++in) width += in->width;
770 buf.append(FilterChar(ret.to[0], width));
771 for (unsigned i = 1; i < E::max_to && ret.to[i]; ++i) {
772 buf.append(FilterChar(ret.to[i],0));
783 //////////////////////////////////////////////////////////////////////
788 #define get_check_next \
789 if (in == stop) goto error; \
791 if ((c & 0xC0/*1100 0000*/) != 0x80/*10xx xxxx*/) goto error;\
794 u |= c & 0x3F/*0011 1111*/; \
797 static inline FilterChar from_utf8 (const char * & in, const char * stop = 0,
798 Uni32 err_char = '?')
800 Uni32 u = (Uni32)(-1);
801 FilterChar::Width w = 1;
803 // the first char is guaranteed not to be off the end
807 if ((c & 0x80/*1000 0000*/) == 0x00/*0xxx xxxx*/) {
809 } else if ((c & 0xE0/*1110 0000*/) == 0xC0/*110x xxxx*/) { // 2-byte wide
810 u = c & 0x1F/*0001 1111*/;
812 } else if ((c & 0xF0/*1111 0000*/) == 0xE0/*1110 xxxx*/) { // 3-byte wide
813 u = c & 0x0F/*0000 1111*/;
816 } else if ((c & 0xF8/*1111 1000*/) == 0xF0/*1111 0xxx*/) { // 4-byte wide
817 u = c & 0x07/*0000 0111*/;
825 return FilterChar(u, w);
827 return FilterChar(err_char, w);
// Append the UTF-8 byte sequence for the character value held in `in` to
// `out`.
// NOTE(review): the branch preceding the first `else if` below is not visible
// in this excerpt, and no branch for values >= 0x200000 is visible either.
830 static inline void to_utf8 (FilterChar in, CharVector & out)
832 FilterChar::Chr c = in;
// Below 0x800: two bytes, 110xxxxx 10xxxxxx.
837 else if (c < 0x800) {
838 out.append(0xC0 | (c>>6));
839 out.append(0x80 | (c & 0x3F));
// Below 0x10000: three bytes, 1110xxxx 10xxxxxx 10xxxxxx.
841 else if (c < 0x10000) {
842 out.append(0xE0 | (c>>12));
843 out.append(0x80 | (c>>6 & 0x3F));
844 out.append(0x80 | (c & 0x3F));
// Below 0x200000: four bytes, 11110xxx followed by three 10xxxxxx bytes.
846 else if (c < 0x200000) {
847 out.append(0xF0 | (c>>18));
848 out.append(0x80 | (c>>12 & 0x3F));
849 out.append(0x80 | (c>>6 & 0x3F));
850 out.append(0x80 | (c & 0x3F));
854 struct DecodeUtf8 : public Decode
857 void decode(const char * in, int size, FilterCharVector & out) const {
860 out.append(from_utf8(in));
862 const char * stop = in + size;
864 out.append(from_utf8(in, stop));
867 PosibErr<void> decode_ec(const char * in, int size,
868 FilterCharVector & out, ParmStr orig) const {
869 const char * begin = in;
872 FilterChar c = from_utf8(in, 0, (Uni32)-1);
873 if (c == (Uni32)-1) goto error;
877 const char * stop = in + size;
879 FilterChar c = from_utf8(in, stop, (Uni32)-1);
880 if (c == (Uni32)-1) goto error;
887 snprintf(m, 70, _("Invalid UTF-8 sequence at position %ld."), (long)(in - begin));
888 return make_err(invalid_string, orig, m);
892 struct EncodeUtf8 : public Encode
894 FromUniLookup lookup;
895 void encode(const FilterChar * in, const FilterChar * stop,
896 CharVector & out) const {
897 for (; in != stop; ++in) {
901 PosibErr<void> encode_ec(const FilterChar * in, const FilterChar * stop,
902 CharVector & out, ParmStr) const {
903 for (; in != stop; ++in) {
910 //////////////////////////////////////////////////////////////////////
// File-local caches for Decode, Encode and NormTables objects. Each cache is
// constructed with a name string, and their addresses are handed to setup()
// by the Convert::init* functions in this file.
915 static GlobalCache<Decode> decode_cache("decode");
916 static GlobalCache<Encode> encode_cache("encode");
917 static GlobalCache<NormTables> norm_tables_cache("norm_tables");
919 //////////////////////////////////////////////////////////////////////
921 // new_aspell_convert
924 void Convert::generic_convert(const char * in, int size, CharVector & out)
927 decode_->decode(in, size, buf_);
928 FilterChar * start = buf_.pbegin();
929 FilterChar * stop = buf_.pend();
931 filter.process(start, stop);
932 encode_->encode(start, stop, out);
935 const char * fix_encoding_str(ParmStr enc, String & buf)
938 buf.reserve(enc.size() + 1);
939 for (size_t i = 0; i != enc.size(); ++i)
940 buf.push_back(asc_tolower(enc[i]));
942 if (strncmp(buf.c_str(), "iso8859", 7) == 0)
943 buf.insert(buf.begin() + 3, '-'); // For backwards compatibility
945 if (buf == "ascii" || buf == "ansi_x3.4-1968")
947 else if (buf == "machine unsigned 16" || buf == "utf-16")
949 else if (buf == "machine unsigned 32" || buf == "utf-32")
// Decide whether the encoding name `enc0` should be treated as plain ASCII.
// Returns true for an empty name and for the ASCII aliases tested below.
// Returns false for utf-8, ucs-2 and ucs-4, and false when a "<enc>.cset"
// file exists in either data directory; otherwise returns true.
955 bool ascii_encoding(const Config & c, ParmStr enc0)
957 if (enc0.empty()) return true;
958 if (enc0 == "ANSI_X3.4-1968"
959 || enc0 == "ASCII" || enc0 == "ascii") return true;
// Normalize the name before the remaining comparisons.
961 const char * enc = fix_encoding_str(enc0, buf);
962 if (strcmp(enc, "utf-8") == 0
963 || strcmp(enc, "ucs-2") == 0
964 || strcmp(enc, "ucs-4") == 0) return false;
// Look for <dir1><enc>.cset first.
965 String dir1,dir2,file_name;
966 fill_data_dir(&c, dir1, dir2);
967 file_name << dir1 << enc << ".cset";
968 if (file_exists(file_name)) return false;
// Both directories are the same, so there is nothing further to check.
969 if (dir1 == dir2) return true;
// NOTE(review): file_name is appended to again here; the line in between
// (not visible in this excerpt) presumably clears it first -- confirm.
971 file_name << dir2 << enc << ".cset";
972 return !file_exists(file_name);
975 PosibErr<Convert *> internal_new_convert(const Config & c,
982 in.val = fix_encoding_str(in.val, in_s);
985 out.val = fix_encoding_str(out.val, out_s);
987 if (if_needed && in.val == out.val) return 0;
989 StackPtr<Convert> conv(new Convert);
992 RET_ON_ERR(conv->init(c, in, out)); break;
994 RET_ON_ERR(conv->init_norm_from(c, in, out)); break;
996 RET_ON_ERR(conv->init_norm_to(c, in, out)); break;
998 return conv.release();
1001 PosibErr<Decode *> Decode::get_new(const ConvKey & k, const Config * c)
1003 StackPtr<Decode> ptr;
1004 if (k.val == "iso-8859-1") {
1005 ptr.reset(new DecodeDirect<Uni8>);
1006 } else if (k.val == "ucs-2") {
1008 ptr.reset(new DecodeDirect<Uni16>);
1010 return make_err(encoding_not_supported, k.val);
1011 } else if (k.val == "ucs-4") {
1013 ptr.reset(new DecodeDirect<Uni32>);
1015 return make_err(encoding_not_supported, k.val);
1016 } else if (k.val == "utf-8") {
1017 ptr.reset(new DecodeUtf8);
1019 ptr.reset(new DecodeLookup);
1021 RET_ON_ERR(ptr->init(k.val, *c));
1023 return ptr.release();
1026 PosibErr<Encode *> Encode::get_new(const ConvKey & k, const Config * c)
1028 StackPtr<Encode> ptr;
1029 if (k.val == "iso-8859-1") {
1030 ptr.reset(new EncodeDirect<Uni8>);
1031 } else if (k.val == "ucs-2" && k.allow_ucs) {
1033 ptr.reset(new EncodeDirect<Uni16>);
1035 return make_err(encoding_not_supported, k.val);
1036 } else if (k.val == "ucs-4" && k.allow_ucs) {
1038 ptr.reset(new EncodeDirect<Uni32>);
1040 return make_err(encoding_not_supported, k.val);
1041 } else if (k.val == "utf-8") {
1042 ptr.reset(new EncodeUtf8);
1044 ptr.reset(new EncodeLookup);
1046 RET_ON_ERR(ptr->init(k.val, *c));
1048 return ptr.release();
1051 Convert::~Convert() {}
1053 PosibErr<void> Convert::init(const Config & c, const ConvKey & in, const ConvKey & out)
1055 RET_ON_ERR(setup(decode_c, &decode_cache, &c, in));
1056 decode_ = decode_c.get();
1057 RET_ON_ERR(setup(encode_c, &encode_cache, &c, out));
1058 encode_ = encode_c.get();
1061 if (in.val == out.val) {
1062 if (in.val == "ucs-2") {
1064 conv_ = new ConvDirect<Uni16>;
1066 return make_err(encoding_not_supported, in.val);
1068 } else if (in.val == "ucs-4") {
1070 conv_ = new ConvDirect<Uni32>;
1072 return make_err(encoding_not_supported, in.val);
1075 conv_ = new ConvDirect<char>;
1080 RET_ON_ERR(conv_->init(decode_, encode_, c));
1086 PosibErr<void> Convert::init_norm_from(const Config & c, const ConvKey & in, const ConvKey & out)
1088 if (!c.retrieve_bool("normalize") && !c.retrieve_bool("norm-required"))
1089 return init(c,in,out);
1091 RET_ON_ERR(setup(norm_tables_, &norm_tables_cache, &c, out.val));
1093 RET_ON_ERR(setup(decode_c, &decode_cache, &c, in));
1094 decode_ = decode_c.get();
1096 if (c.retrieve_bool("norm-strict")) {
1097 encode_s = new EncodeNormLookup(norm_tables_->strict);
1099 encode_->key = out.val;
1100 encode_->key += ":strict";
1102 encode_s = new EncodeNormLookup(norm_tables_->internal);
1104 encode_->key = out.val;
1105 encode_->key += ":internal";
1112 PosibErr<void> Convert::init_norm_to(const Config & c, const ConvKey & in, const ConvKey & out)
1114 String norm_form = c.retrieve("norm-form");
1115 if ((!c.retrieve_bool("normalize") || norm_form == "none")
1116 && !c.retrieve_bool("norm-required"))
1117 return init(c,in,out);
1118 if (norm_form == "none" && c.retrieve_bool("norm-required"))
1121 RET_ON_ERR(setup(norm_tables_, &norm_tables_cache, &c, in.val));
1123 RET_ON_ERR(setup(encode_c, &encode_cache, &c, out));
1124 encode_ = encode_c.get();
1126 NormTables::ToUni::const_iterator i = norm_tables_->to_uni.begin();
1127 for (; i != norm_tables_->to_uni.end() && i->name != norm_form; ++i);
1128 if (i == norm_tables_->to_uni.end())
1129 return make_err(aerror_bad_value, "norm-form", norm_form, "one of none, nfd, nfc, or comp");
1131 decode_s = new DecodeNormLookup(i->ptr);
1133 decode_->key = in.val;
1134 decode_->key += ':';
1135 decode_->key += i->name;
1142 PosibErr<void> MBLen::setup(const Config &, ParmStr enc0)
1145 const char * enc = fix_encoding_str(enc0,buf);
1146 if (strcmp(enc, "utf-8") == 0) encoding = UTF8;
1147 else if (strcmp(enc, "ucs-2") == 0) encoding = UCS2;
1148 else if (strcmp(enc, "ucs-4") == 0) encoding = UCS4;
1149 else encoding = Other;
// Measure the range [str, stop).
// NOTE(review): the statements that choose between the cases below are not
// visible in this excerpt; presumably they dispatch on `encoding` as set by
// setup() -- confirm.
1153 unsigned MBLen::operator()(const char * str, const char * stop)
// Byte-wise scan: count every byte of the form 0xxxxxxx or 11xxxxxx, i.e.
// every byte that is not of the form 10xxxxxx.
1160 for (; str != stop; ++str) {
1161 if ((*str & 0x80) == 0 || (*str & 0xC0) == 0xC0) ++size;
// Length in bytes divided by 2.
1165 return (stop - str)/2;
// Length in bytes divided by 4.
1167 return (stop - str)/4;
// Build an other_error carrying unsupported_null_term_wide_string_msg. On the
// first call only, the message is also printed through CERR, prefixed with
// "ERROR: " and the caller-supplied `func` name.
1172 PosibErr<void> unsupported_null_term_wide_string_err_(const char * func) {
// Remembers across calls whether the message has already been printed.
1173 static bool reported_to_stderr = false;
1174 PosibErr<void> err = make_err(other_error, unsupported_null_term_wide_string_msg);
1175 if (!reported_to_stderr) {
1176 CERR.printf("ERROR: %s: %s\n", func, unsupported_null_term_wide_string_msg);
1177 reported_to_stderr = true;
1182 void unsupported_null_term_wide_string_abort_(const char * func) {
1183 CERR.printf("%s: %s\n", func, unsupported_null_term_wide_string_msg);