1 /* Copyright 2008,2009 Alain Knaff.
2 * This file is part of mtools.
4 * Mtools is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
9 * Mtools is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with Mtools. If not, see <http://www.gnu.org/licenses/>.
17 * Various character set conversions used by mtools
19 #include "sysincludes.h"
26 #include "file_name.h"
/* Name of the iconv codeset selected for wchar_t. Starts out NULL;
 * getWcharCp() stores the accepted wcharTries entry here. */
37 static char *wcharCp=NULL;
/* Candidate codeset names that getWcharCp() probes in array order. */
39 static char* wcharTries[] = {
41 "UTF-32BE", "UTF-32LE",
42 "UTF-16BE", "UTF-16LE",
/* Two-character wide sample that try() pushes through iconv and expects to
 * come back as the ASCII bytes "ab". */
49 static wchar_t *testString = L"ab";
/*
 * Probe whether iconv can read wchar_t data as codeset testCp.
 *
 * Opens a descriptor converting from testCp to "ASCII", feeds it the raw
 * bytes of testString (two wchar_t units) with room for exactly two output
 * bytes, and then checks the conversion result, the remaining input and
 * output counts, and that the output bytes equal "ab".
 * getWcharCp() treats a nonzero return value as "this codeset works".
 */
51 static int try(char *testCp) {
53 char *inbuf = (char *)testString;
54 size_t inbufLen = 2*sizeof(wchar_t);
56 char *outbufP = outbuf;
57 size_t outbufLen = 2*sizeof(char);
58 iconv_t test = iconv_open("ASCII", testCp);
60 if(test == (iconv_t) -1)
64 &outbufP, &outbufLen);
/* Anything but a clean conversion that used up all of the input and all of
 * the output space is tested for here. */
65 if(res != 0 || outbufLen != 0 || inbufLen != 0)
67 if(memcmp(outbuf, "ab", 2))
69 /* fprintf(stderr, "%s ok\n", testCp); */
74 /*fprintf(stderr, "%s fail\n", testCp);*/
/*
 * Determine the iconv codeset name that matches this platform's wchar_t.
 *
 * Walks wcharTries in order; the first entry accepted by try() is stored in
 * wcharCp and returned. The "No codepage found for wchar_t" diagnostic is
 * written to stderr after the loop. Callers compare the result against NULL.
 */
78 static const char *getWcharCp() {
82 for(i=0; i< sizeof(wcharTries) / sizeof(wcharTries[0]); i++) {
83 if(try(wcharTries[i]))
84 return (wcharCp=wcharTries[i]);
86 fprintf(stderr, "No codepage found for wchar_t\n");
/*
 * Open the iconv descriptors for a DOS codepage (iconv-based implementation).
 *
 * codepage is checked against the range 0..9999 and the iconv name
 * "CP<codepage>" is built from it. Two descriptors are opened against the
 * wchar_t codeset resolved by getWcharCp():
 *   from - converts CP<codepage> to wchar_t
 *   to   - converts wchar_t to CP<codepage>; tried with "//TRANSLIT"
 *          appended first, then without the suffix if that open fails
 * The failure paths shown print a diagnostic to stderr.
 */
91 doscp_t *cp_open(int codepage)
99 codepage = mtools_default_codepage;
100 if(codepage < 0 || codepage > 9999) {
101 fprintf(stderr, "Bad codepage %d\n", codepage);
/* wcharCp has to be resolved before it is handed to iconv_open() below. */
105 if(getWcharCp() == NULL)
108 sprintf(dosCp, "CP%d", codepage);
/* iconv_open(tocode, fromcode): DOS codepage in, wchar_t out. */
109 from = iconv_open(wcharCp, dosCp);
110 if(from == (iconv_t)-1) {
/* NOTE(review): this message has the same "converting to" wording as the
 * one for the "to" descriptor further down, although this is the
 * from-codepage direction - confirm whether that is intended. */
111 fprintf(stderr, "Error converting to codepage %d %s\n",
112 codepage, strerror(errno));
/* Prefer transliteration for the wchar_t -> DOS direction. */
116 sprintf(dosCp, "CP%d//TRANSLIT", codepage);
117 to = iconv_open(dosCp, wcharCp);
118 if(to == (iconv_t)-1) {
119 /* Transliteration not supported? */
120 sprintf(dosCp, "CP%d", codepage);
121 to = iconv_open(dosCp, wcharCp);
123 if(to == (iconv_t)-1) {
125 fprintf(stderr, "Error converting to codepage %d %s\n",
126 codepage, strerror(errno));
/*
 * Close a codepage handle (iconv-based implementation): closes the cp->from
 * iconv descriptor.
 * NOTE(review): confirm that cp->to and cp itself are released as well.
 */
138 void cp_close(doscp_t *cp)
141 iconv_close(cp->from);
/*
 * Convert DOS-codepage bytes in dos to wide characters in wchar, using the
 * cp->from iconv descriptor.
 */
145 int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len)
/* Output budget in bytes: room for len wide characters. */
149 size_t out_len=len*sizeof(wchar_t);
151 r=iconv(cp->from, &dos, &in_len, (char **)&dptr, &out_len);
159 * Converts len wide characters from wchar into dest. It is the caller's
160 * responsibility to ensure that dest is large enough (up to len*4 bytes).
161 * *mangled is set if an untranslatable character was encountered.
163 static int safe_iconv(iconv_t conv, const wchar_t *wchar, char *dest,
164 size_t len, int *mangled)
/* Input size in bytes. */
168 size_t in_len=len*sizeof(wchar_t);
/* Output budget: four bytes per input wide character. */
169 size_t out_len=len*4;
173 r=iconv(conv, (char**)&wchar, &in_len, &dptr, &out_len);
/* NOTE(review): iconv() returns size_t and signals failure with (size_t)-1;
 * the r >= 0 test only distinguishes that if r has a signed type - confirm
 * its declaration. */
174 if(r >= 0 || errno != EILSEQ) {
175 /* everything transformed, or error that is _not_ a bad
189 len = dptr-dest; /* how many dest characters have there been
192 /* eliminate question marks which might have been formed by
193 untransliterable characters */
194 for(i=0; i<len; i++) {
/*
 * Convert len wide characters from wchar into DOS-codepage bytes in dos
 * (iconv-based implementation). Delegates to safe_iconv() with the cp->to
 * descriptor and passes mangled straight through; safe_iconv()'s return
 * value is not used.
 */
203 void wchar_to_dos(doscp_t *cp,
204 wchar_t *wchar, char *dos, size_t len, int *mangled)
206 safe_iconv(cp->to, wchar, dos, len, mangled);
211 #include "codepage.h"
/* DOS-to-native table, indexed with the low seven bits of a character;
 * cp_open() points it at the tounix table of the matching codepages entry. */
214 unsigned char *from_dos;
/* Native-to-DOS table filled in by cp_open(): indexed by (native & 0x7f),
 * each stored value is 0x80 | <index into from_dos>. */
215 unsigned char to_dos[0x80];
/*
 * Open a DOS codepage using the compiled-in tables (table-driven
 * implementation).
 *
 * Scans the codepages array, which ends at an entry whose nr is 0, for the
 * requested number and takes that entry's tounix table as ret->from_dos.
 * If from_dos is NULL afterwards, "Bad codepage" is reported on stderr.
 * The final loop fills ret->to_dos by inverting from_dos.
 */
218 doscp_t *cp_open(int codepage)
231 for(cp=codepages; cp->nr ; cp++)
232 if(cp->nr == codepage) {
233 ret->from_dos = cp->tounix;
237 if(ret->from_dos == NULL) {
238 fprintf(stderr, "Bad codepage %d\n", codepage);
243 for(i=0; i<0x80; i++) {
244 char native = ret->from_dos[i];
/* Tests for native bytes without bit 7 set; presumably those are skipped
 * because to_dos only covers the upper half - confirm. */
245 if(! (native & 0x80))
/* Reverse entry: native byte (low seven bits) -> DOS byte 0x80 + i. */
247 ret->to_dos[native & 0x7f] = 0x80 | i;
/* Close a codepage handle (table-driven implementation).
 * NOTE(review): confirm which resources of cp this is expected to release. */
252 void cp_close(doscp_t *cp)
/*
 * Table-driven conversion of a DOS string to wide characters.
 *
 * Handles at most len input bytes and stops early at a NUL byte. The
 * current character c is first tested against the printable ASCII range
 * ' '..'~'; the table path stores cp->from_dos[c & 0x7f] into wchar[i].
 */
257 int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len)
261 for(i=0; i<len && dos[i]; i++) {
263 if(c >= ' ' && c <= '~')
/* Only the low seven bits of c select the table entry. */
266 wchar[i] = cp->from_dos[c & 0x7f];
/*
 * Table-driven conversion of wide characters to a DOS string.
 *
 * Handles at most len wide characters and stops early at a zero wide
 * character. The current character c is first tested against the printable
 * ASCII range ' '..'~'; the table path stores cp->to_dos[c & 0x7f] into
 * dos[i].
 */
274 void wchar_to_dos(doscp_t *cp,
275 wchar_t *wchar, char *dos, size_t len, int *mangled)
278 for(i=0; i<len && wchar[i]; i++) {
280 if(c >= ' ' && c <= '~')
/* Only the low seven bits of c select the table entry. */
283 dos[i] = cp->to_dos[c & 0x7f];
/* Local definition of mbstate_t as a plain int, used by the two stand-in
 * functions declared next.
 * NOTE(review): presumably only compiled when the platform does not supply
 * these itself - confirm against the enclosing preprocessor conditionals. */
297 typedef int mbstate_t;
/* Stand-in with the same name and parameter list as the standard wcrtomb(). */
299 static inline size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps)
/* Stand-in with the same name and parameter list as the standard mbrtowc(). */
305 static inline size_t mbrtowc(wchar_t *pwc, const char *s,
306 size_t n, mbstate_t *ps)
316 #include <langinfo.h>
/* iconv descriptor converting wchar_t to the native codeset; NULL until
 * initialize_to_native() has set it up. */
318 static iconv_t to_native = NULL;
/*
 * Set up to_native for use by wchar_to_native().
 *
 * The to_native != NULL test at the top guards against repeating the setup.
 * The target codeset name is taken from nl_langinfo(CODESET); a copy with
 * "//TRANSLIT" appended is tried first, and the plain name is tried if that
 * open fails. When both fail, a diagnostic is written to stderr.
 */
320 static void initialize_to_native(void)
324 if(to_native != NULL)
326 li = nl_langinfo(CODESET);
/* Room for li, the 10-character "//TRANSLIT" suffix and the final NUL. */
327 len = strlen(li) + 11;
328 if(getWcharCp() == NULL)
330 cp = safe_malloc(len);
332 strcat(cp, "//TRANSLIT");
333 to_native = iconv_open(cp, wcharCp);
/* Fall back to the codeset name without the transliteration suffix. */
334 if(to_native == (iconv_t) -1)
335 to_native = iconv_open(li, wcharCp);
336 if(to_native == (iconv_t) -1)
337 fprintf(stderr, "Could not allocate iconv for %s\n", cp);
339 if(to_native == (iconv_t) -1)
349 * Convert a wchar string to the native encoding, converting at most len
350 * wide characters. Returns the number of native characters generated.
352 int wchar_to_native(const wchar_t *wchar, char *native, size_t len)
/* Two conversion paths appear below: one through safe_iconv() on to_native,
 * one character by character through wcrtomb().
 * NOTE(review): presumably selected by preprocessor conditionals - confirm. */
357 initialize_to_native();
/* Clamp len to the string's actual length so conversion stops at the
 * terminating zero. */
358 len = wcsnlen(wchar,len);
359 r=safe_iconv(to_native, wchar, native, len, &mangled);
/* Start wcrtomb() from the initial conversion state. */
366 memset(&ps, 0, sizeof(ps));
367 for(i=0; i<len && wchar[i] != 0; i++) {
368 int r = wcrtomb(dptr, wchar[i], &ps);
/* The standard wcrtomb() reports failure as (size_t)-1, which is expected
 * to show up as a negative value once stored in the int r. */
369 if(r < 0 && errno == EILSEQ) {
383 * Convert a native string to a wchar string, producing at most len wide
384 * characters. If end is supplied, stop converting once the source pointer
385 * reaches end. Returns the number of wide characters converted.
387 int native_to_wchar(const char *native, wchar_t *wchar, size_t len,
388 const char *end, int *mangled)
/* Start mbrtowc() from the initial conversion state. */
392 memset(&ps, 0, sizeof(ps));
/* Runs while fewer than len wide characters have been produced and, when
 * end is non-NULL, native is still below end. */
394 for(i=0; i<len && (native < end || !end); i++) {
/* NOTE(review): the third argument of the standard mbrtowc() is a byte
 * limit on the input at native, while len is the wide-character capacity
 * of the output - confirm that passing len here is intended. */
395 int r = mbrtowc(wchar+i, native, len, &ps);
397 /* Unconvertible character. Just pretend it's Latin1
398 encoded (if valid Latin1 character) or substitute
399 with an underscore if not
402 if(c >= '\xa0' && c < '\xff')
406 memset(&ps, 0, sizeof(ps));
413 if(mangled && end && native < end)