1 /* Copyright 2008,2009 Alain Knaff.
2 * This file is part of mtools.
4 * Mtools is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
9 * Mtools is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with Mtools. If not, see <http://www.gnu.org/licenses/>.
17 * Various character set conversions used by mtools
19 #include "sysincludes.h"
26 #include "file_name.h"
/* Name of the iconv codeset chosen for wchar_t. Starts out NULL and is
 * assigned by getWcharCp() from one of the wcharTries entries. */
37 static const char *wcharCp=NULL;
/* Candidate iconv codeset names for wchar_t. getWcharCp() walks this list
 * in order and keeps the first entry that try() accepts. */
39 static const char* wcharTries[] = {
41 "UTF-32BE", "UTF-32LE",
42 "UTF-16BE", "UTF-16LE",
/* Destination codeset names that try() passes to iconv_open() when it
 * probes a wchar_t candidate; the probe output is compared against the
 * plain byte string "ab". */
49 static const char *asciiTries[] = {
50 "ASCII", "ASCII-GR", "ISO8859-1"
/* Wide-character probe input for try(); its conversion is expected to
 * yield the two bytes "ab". */
53 static const wchar_t *testString = L"ab";
/*
 * Probe whether the iconv codeset named testCp matches the in-memory
 * representation of wchar_t.
 *
 * The bytes of testString (L"ab") are fed to a converter opened from testCp
 * to one of the asciiTries codesets. The probe is checked three ways: the
 * conversion result must be 0, all input and all output space must be used
 * up, and the output must compare equal to "ab".
 *
 * getWcharCp() treats a nonzero return value as "testCp matches".
 */
55 static int try(const char *testCp) {
57 char *inbuf = (char *)testString;
58 size_t inbufLen = 2*sizeof(wchar_t); /* the two wide characters of testString */
60 char *outbufP = outbuf;
61 size_t outbufLen = 2*sizeof(char); /* room for exactly two output bytes */
65 for(i=0; i < sizeof(asciiTries) / sizeof(asciiTries[0]); i++) {
66 test = iconv_open(asciiTries[i], testCp);
67 if(test != (iconv_t) -1)
70 if(test == (iconv_t) -1)
74 &outbufP, &outbufLen);
75 if(res != 0 || outbufLen != 0 || inbufLen != 0)
77 if(memcmp(outbuf, "ab", 2))
79 /* fprintf(stderr, "%s ok\n", testCp); */
84 /*fprintf(stderr, "%s fail\n", testCp);*/
/*
 * Determine the iconv codeset name to use for wchar_t.
 *
 * Tries each wcharTries entry in order with try(); the first accepted entry
 * is stored in wcharCp and returned. If none is accepted, the message
 * "No codepage found for wchar_t" is written to stderr.
 *
 * Callers in this file compare the result against NULL to detect failure.
 */
88 static const char *getWcharCp(void) {
92 for(i=0; i< sizeof(wcharTries) / sizeof(wcharTries[0]); i++) {
93 if(try(wcharTries[i]))
94 return (wcharCp=wcharTries[i]);
96 fprintf(stderr, "No codepage found for wchar_t\n");
/*
 * iconv-backed cp_open(): set up the conversion descriptors for a DOS
 * codepage.
 *
 * codepage can be replaced by mtools_default_codepage (the guarding
 * condition is outside this view). Values outside 0..9999 are reported with
 * "Bad codepage". getWcharCp() must succeed so that wcharCp names the
 * wchar_t codeset.
 *
 * Two descriptors are opened:
 *   from: "CP<n>" -> wcharCp
 *   to:   wcharCp -> "CP<n>//TRANSLIT", retried as plain "CP<n>" when the
 *         transliterating variant cannot be opened
 * Both failure paths print the same "Error converting to codepage" message
 * together with strerror(errno).
 *
 * NOTE(review): the declaration of dosCp is not visible here. The longest
 * name written by sprintf() is "CP9999//TRANSLIT" (16 characters plus the
 * terminating NUL) - confirm the buffer is at least 17 bytes.
 */
101 doscp_t *cp_open(int codepage)
109 codepage = mtools_default_codepage;
110 if(codepage < 0 || codepage > 9999) {
111 fprintf(stderr, "Bad codepage %d\n", codepage);
115 if(getWcharCp() == NULL)
118 sprintf(dosCp, "CP%d", codepage);
119 from = iconv_open(wcharCp, dosCp);
120 if(from == (iconv_t)-1) {
121 fprintf(stderr, "Error converting to codepage %d %s\n",
122 codepage, strerror(errno));
126 sprintf(dosCp, "CP%d//TRANSLIT", codepage);
127 to = iconv_open(dosCp, wcharCp);
128 if(to == (iconv_t)-1) {
129 /* Transliteration not supported? */
130 sprintf(dosCp, "CP%d", codepage);
131 to = iconv_open(dosCp, wcharCp);
133 if(to == (iconv_t)-1) {
135 fprintf(stderr, "Error converting to codepage %d %s\n",
136 codepage, strerror(errno));
/*
 * iconv-backed cp_close(): closes the cp->from descriptor with
 * iconv_close().
 * NOTE(review): the rest of the cleanup is not visible here - confirm that
 * cp->to and cp itself are released as well.
 */
148 void cp_close(doscp_t *cp)
151 iconv_close(cp->from);
/*
 * iconv-backed dos_to_wchar(): converts the DOS-codepage bytes at dos into
 * wide characters using the cp->from descriptor.
 *
 * The output budget handed to iconv() is len * sizeof(wchar_t) bytes, i.e.
 * room for len wide characters.
 */
155 int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len)
159 size_t out_len=len*sizeof(wchar_t);
161 r=iconv(cp->from, &dos, &in_len, (char **)&dptr, &out_len);
169 * Converts len wide characters to destination. Caller's responsibility to
170 * ensure that dest is large enough.
171 * mangled will be set if there has been an untranslatable character.
/*
 * Implementation notes for safe_iconv():
 *  - iconv() is given len * sizeof(wchar_t) input bytes and an output budget
 *    of len * 4 bytes, so dest must hold at least 4 bytes per wide
 *    character.
 *  - The iconv() result is tested together with errno: an EILSEQ failure
 *    (input that cannot be converted) is handled differently from success
 *    or any other error.
 *  - After conversion, len is reused for the number of bytes written
 *    (dptr - dest) and the output is scanned once more over that length.
 */
173 static int safe_iconv(iconv_t conv, const wchar_t *wchar, char *dest,
174 size_t len, int *mangled)
178 size_t in_len=len*sizeof(wchar_t); /* input size in bytes */
179 size_t out_len=len*4; /* output budget: 4 bytes per wide character */
183 r=iconv(conv, (char**)&wchar, &in_len, &dptr, &out_len);
184 if(r >= 0 || errno != EILSEQ) {
185 /* everything transformed, or error that is _not_ a bad
199 len = dptr-dest; /* how many dest characters have there been
202 /* eliminate question marks which might have been formed by
203 untransliterable characters */
204 for(i=0; i<len; i++) {
/*
 * iconv-backed wchar_to_dos(): converts len wide characters from wchar into
 * DOS-codepage bytes at dos by calling safe_iconv() with the cp->to
 * descriptor. mangled is passed straight through to safe_iconv().
 */
213 void wchar_to_dos(doscp_t *cp,
214 wchar_t *wchar, char *dos, size_t len, int *mangled)
216 safe_iconv(cp->to, wchar, dos, len, mangled);
221 #include "codepage.h"
/* DOS-to-native table. cp_open() points it at the tounix array of the
 * matching codepages entry; lookups use the low 7 bits of the DOS byte. */
224 unsigned char *from_dos;
/* Reverse table built by cp_open(): indexed by the low 7 bits of a native
 * byte, holds the corresponding DOS byte (0x80 | index into from_dos). */
225 unsigned char to_dos[0x80];
/*
 * Table-based cp_open(): builds the conversion tables for a DOS codepage
 * without iconv.
 *
 * The codepages array is scanned until an entry with nr == 0; the entry
 * whose nr equals codepage supplies from_dos (its tounix table). If no
 * entry matched, "Bad codepage" is written to stderr.
 *
 * to_dos is then derived from from_dos: for each i in 0..0x7f, the native
 * byte from_dos[i] is tested for its high bit, and to_dos[native & 0x7f] is
 * set to 0x80 | i so that wchar_to_dos() can map it back.
 */
228 doscp_t *cp_open(int codepage)
241 for(cp=codepages; cp->nr ; cp++)
242 if(cp->nr == codepage) {
243 ret->from_dos = cp->tounix;
247 if(ret->from_dos == NULL) {
248 fprintf(stderr, "Bad codepage %d\n", codepage);
253 for(i=0; i<0x80; i++) {
254 char native = ret->from_dos[i];
255 if(! (native & 0x80))
257 ret->to_dos[native & 0x7f] = 0x80 | i;
/*
 * Table-based cp_close(): takes the descriptor type that cp_open() returns.
 * NOTE(review): only the signature is visible here; presumably it releases
 * what cp_open() allocated - confirm against the body.
 */
262 void cp_close(doscp_t *cp)
/*
 * Table-based dos_to_wchar(): converts up to len DOS bytes, stopping early
 * at a NUL byte in dos.
 *
 * Each byte is first tested against the printable ASCII range ' '..'~';
 * the table lookup stores cp->from_dos[c & 0x7f] into wchar[i].
 */
267 int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len)
271 for(i=0; i<len && dos[i]; i++) {
273 if(c >= ' ' && c <= '~')
276 wchar[i] = cp->from_dos[c & 0x7f];
/*
 * Table-based wchar_to_dos(): converts up to len wide characters, stopping
 * early at a zero wide character in wchar.
 *
 * Each character is first tested against the printable ASCII range
 * ' '..'~'; the table lookup stores cp->to_dos[c & 0x7f] into dos[i].
 */
284 void wchar_to_dos(doscp_t *cp,
285 wchar_t *wchar, char *dos, size_t len, int *mangled)
288 for(i=0; i<len && wchar[i]; i++) {
290 if(c >= ' ' && c <= '~')
293 dos[i] = cp->to_dos[c & 0x7f];
/* Local definition of mbstate_t as a plain int, used by the wcrtomb() and
 * mbrtowc() stand-ins declared after it.
 * NOTE(review): presumably only compiled when the platform lacks these -
 * confirm the surrounding preprocessor condition. */
307 typedef int mbstate_t;
/* File-local stand-in carrying the name and parameter list of the C
 * library's wcrtomb(); wchar_to_native() calls wcrtomb() once per wide
 * character.
 * NOTE(review): body not visible here - confirm its return convention
 * against that caller, which tests for a negative result. */
309 static inline size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps)
/* File-local stand-in carrying the name and parameter list of the C
 * library's mbrtowc(); native_to_wchar() calls mbrtowc() once per output
 * wide character.
 * NOTE(review): body not visible here - confirm how it treats n and ps. */
315 static inline size_t mbrtowc(wchar_t *pwc, const char *s,
316 size_t n, mbstate_t *ps)
326 #include <langinfo.h>
/* Cached iconv descriptor converting from wcharCp to the locale's codeset.
 * NULL until initialize_to_native() has run; that function also compares it
 * against (iconv_t)-1 after its iconv_open() attempts. */
328 static iconv_t to_native = NULL;
/*
 * Open the to_native iconv descriptor (wchar_t -> locale codeset).
 *
 * Checks first whether to_native is already non-NULL. The target codeset is
 * the one reported by nl_langinfo(CODESET); getWcharCp() must succeed so
 * that wcharCp names the source codeset.
 *
 * A name of the form "<codeset>//TRANSLIT" is built in a safe_malloc()ed
 * buffer and tried first. If that cannot be opened, the bare codeset name
 * is tried. If both fail, "Could not allocate iconv for ..." is printed.
 */
330 static void initialize_to_native(void)
334 if(to_native != NULL)
336 li = nl_langinfo(CODESET);
337 len = strlen(li) + 11; /* 10 bytes for "//TRANSLIT" plus the terminating NUL */
338 if(getWcharCp() == NULL)
340 cp = safe_malloc(len);
342 strcat(cp, "//TRANSLIT");
343 to_native = iconv_open(cp, wcharCp);
344 if(to_native == (iconv_t) -1)
345 to_native = iconv_open(li, wcharCp);
346 if(to_native == (iconv_t) -1)
347 fprintf(stderr, "Could not allocate iconv for %s\n", cp);
349 if(to_native == (iconv_t) -1)
359 * Convert wchar string to native, converting at most len wchar characters
360 * Returns number of generated native characters
/*
 * Implementation notes for wchar_to_native(). Two conversion paths are
 * visible (NOTE(review): presumably selected by a preprocessor condition
 * that is outside this view):
 *  - iconv path: initialize_to_native() is called, len is clamped with
 *    wcsnlen(), and safe_iconv() converts through to_native.
 *  - wcrtomb path: the shift state is zeroed, then each wide character up
 *    to len or the first zero is converted with wcrtomb(); a negative
 *    result with errno == EILSEQ is handled specially.
 */
362 int wchar_to_native(const wchar_t *wchar, char *native, size_t len)
367 initialize_to_native();
368 len = wcsnlen(wchar,len);
369 r=safe_iconv(to_native, wchar, native, len, &mangled);
376 memset(&ps, 0, sizeof(ps));
377 for(i=0; i<len && wchar[i] != 0; i++) {
378 int r = wcrtomb(dptr, wchar[i], &ps);
379 if(r < 0 && errno == EILSEQ) {
393 * Convert native string to wchar string, generating at most len wchar
394 * characters. If end is supplied, stop conversion when source pointer
395 * exceeds end. Returns number of generated wchars
/*
 * Implementation notes for native_to_wchar():
 *  - The loop produces at most len wide characters and, when end is
 *    non-NULL, stops once native reaches end.
 *  - Each step calls mbrtowc() with a shift state that is zeroed before the
 *    loop and zeroed again in the unconvertible-character path.
 *  - The final test on mangled distinguishes two cases: with end, input is
 *    left over if native < end; without end, input is left over if the
 *    output filled up (i == len) while *native is still nonzero.
 *
 * NOTE(review): mbrtowc() receives len - the output capacity in wide
 * characters - as its byte limit. When end is supplied, fewer bytes may
 * remain before end; confirm this cannot read past end.
 */
397 int native_to_wchar(const char *native, wchar_t *wchar, size_t len,
398 const char *end, int *mangled)
402 memset(&ps, 0, sizeof(ps));
404 for(i=0; i<len && (native < end || !end); i++) {
405 int r = mbrtowc(wchar+i, native, len, &ps);
407 /* Unconvertible character. Just pretend it's Latin1
408 encoded (if valid Latin1 character) or substitute
409 with an underscore if not
412 if(c >= '\xa0' && c < '\xff')
416 memset(&ps, 0, sizeof(ps));
423 if(mangled && ((end && native < end) || (!end && *native && i == len)))