Initial import package mtools: Programs for accessing MS-DOS disks without mounting...
[profile/ivi/mtools.git] / charsetConv.c
1 /*  Copyright 2008,2009 Alain Knaff.
2  *  This file is part of mtools.
3  *                              
4  *  Mtools is free software: you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation, either version 3 of the License, or   
7  *  (at your option) any later version.                                 
8  *                                                                      
9  *  Mtools is distributed in the hope that it will be useful,           
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of      
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with Mtools.  If not, see <http://www.gnu.org/licenses/>.
16  *
17  * Various character set conversions used by mtools
18  */
19 #include "sysincludes.h"
20 #include "msdos.h"
21 #include "mtools.h"
22
23 #include <stdio.h>
24 #include <errno.h>
25 #include <stdlib.h>
26 #include "file_name.h"
27
28
29 #ifdef HAVE_ICONV_H
30 #include <iconv.h>
31
/*
 * iconv-backed codepage handle: one conversion descriptor per direction.
 */
struct doscp_t {
        iconv_t from;   /* opened by cp_open() as iconv_open(wchar charset, DOS codepage) */
        iconv_t to;     /* opened by cp_open() as iconv_open(DOS codepage, wchar charset) */
};
36
/* iconv charset name matching wchar_t; cached by getWcharCp() once found */
static char *wcharCp = NULL;

/* Candidate charset names for wchar_t, probed by getWcharCp() in this order */
static char *wcharTries[] = {
        "WCHAR_T",
        "UTF-32BE",
        "UTF-32LE",
        "UTF-16BE",
        "UTF-16LE",
        "UTF-32",
        "UTF-16",
        "UCS-4BE",
        "UCS-4LE",
        "UCS-2BE",
        "UCS-2LE",
        "UCS-4",
        "UCS-2"
};

/* Probe input: try() expects a matching charset to convert it to ASCII "ab" */
static wchar_t *testString = L"ab";
50
51 static int try(char *testCp) {
52         size_t res;
53         char *inbuf = (char *)testString;
54         size_t inbufLen = 2*sizeof(wchar_t);
55         char outbuf[3];
56         char *outbufP = outbuf;
57         size_t outbufLen = 2*sizeof(char);
58         iconv_t test = iconv_open("ASCII", testCp);
59
60         if(test == (iconv_t) -1)
61                 goto fail0;
62         res = iconv(test,
63                     &inbuf, &inbufLen,
64                     &outbufP, &outbufLen);
65         if(res != 0 || outbufLen != 0 || inbufLen != 0)
66                 goto fail;
67         if(memcmp(outbuf, "ab", 2))
68                 goto fail;
69         /* fprintf(stderr, "%s ok\n", testCp); */
70         return 1;
71  fail:
72         iconv_close(test);
73  fail0:
74         /*fprintf(stderr, "%s fail\n", testCp);*/
75         return 0;
76 }
77
78 static const char *getWcharCp() {
79         int i;
80         if(wcharCp != NULL)
81                 return wcharCp; 
82         for(i=0; i< sizeof(wcharTries) / sizeof(wcharTries[0]); i++) {
83                 if(try(wcharTries[i]))
84                         return (wcharCp=wcharTries[i]);
85         }
86         fprintf(stderr, "No codepage found for wchar_t\n");
87         return NULL;
88 }
89
90
91 doscp_t *cp_open(int codepage)
92 {
93         char dosCp[17];
94         doscp_t *ret;
95         iconv_t *from;
96         iconv_t *to;
97
98         if(codepage == 0)
99                 codepage = mtools_default_codepage;
100         if(codepage < 0 || codepage > 9999) {
101                 fprintf(stderr, "Bad codepage %d\n", codepage);
102                 return NULL;
103         }
104
105         if(getWcharCp() == NULL)
106                 return NULL;
107
108         sprintf(dosCp, "CP%d", codepage);
109         from = iconv_open(wcharCp, dosCp);
110         if(from == (iconv_t)-1) {
111                 fprintf(stderr, "Error converting to codepage %d %s\n",
112                         codepage, strerror(errno));
113                 return NULL;
114         }
115
116         sprintf(dosCp, "CP%d//TRANSLIT", codepage);
117         to   =  iconv_open(dosCp, wcharCp);
118         if(to == (iconv_t)-1) {
119                 /* Transliteration not supported? */
120                 sprintf(dosCp, "CP%d", codepage);
121                 to   =  iconv_open(dosCp, wcharCp);
122         }
123         if(to == (iconv_t)-1) {
124                 iconv_close(from);
125                 fprintf(stderr, "Error converting to codepage %d %s\n",
126                         codepage, strerror(errno));
127                 return NULL;
128         }
129
130         ret = New(doscp_t);
131         if(ret == NULL)
132                 return ret;
133         ret->from = from;
134         ret->to   = to;
135         return ret;
136 }
137
138 void cp_close(doscp_t *cp)
139 {
140         iconv_close(cp->to);
141         iconv_close(cp->from);
142         free(cp);
143 }
144
145 int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len)
146 {
147         int r;
148         size_t in_len=len;
149         size_t out_len=len*sizeof(wchar_t);
150         wchar_t *dptr=wchar;
151         r=iconv(cp->from, &dos, &in_len, (char **)&dptr, &out_len);
152         if(r < 0)
153                 return r;
154         *dptr = L'\0';
155         return dptr-wchar;
156 }
157
/**
 * Converts len wide characters from wchar into dest using descriptor conv.
 *
 * The conversion is given len*4 bytes of output space, so dest must be at
 * least that large (caller's responsibility). No terminator is written.
 *
 * A wide character that iconv rejects with EILSEQ is replaced by '_' and
 * conversion continues with the next one. Any '?' in the output (as may be
 * produced for untransliterable characters) is also replaced by '_'. In both
 * cases bit 0 of *mangled is set; *mangled is only ever OR-ed into, so the
 * caller must initialise it.
 *
 * Any other iconv error silently ends the conversion.
 *
 * Returns the number of bytes written to dest.
 */
static int safe_iconv(iconv_t conv, const wchar_t *wchar, char *dest,
                      size_t len, int *mangled)
{
        size_t in_len = len * sizeof(wchar_t);  /* remaining input, in bytes */
        size_t out_len = len * 4;               /* remaining output, in bytes */
        char *src = (char *) wchar;             /* iconv wants a byte pointer */
        char *dptr = dest;
        size_t produced;
        size_t i;

        while(in_len > 0) {
                size_t r = iconv(conv, &src, &in_len, &dptr, &out_len);
                if(r != (size_t) -1 || errno != EILSEQ) {
                        /* everything transformed, or error that is _not_ a
                         * bad character */
                        break;
                }
                *mangled |= 1;

                if(out_len == 0)
                        break;  /* no room left for the replacement */
                *dptr++ = '_';
                out_len--;

                /* Skip the offending wide character. The byte count must
                 * shrink by a whole wchar_t, in step with the pointer;
                 * decrementing it by one made iconv read past the input. */
                src += sizeof(wchar_t);
                in_len -= sizeof(wchar_t);
        }

        produced = (size_t) (dptr - dest);

        /* eliminate question marks which might have been formed by
           untransliterable characters */
        for(i = 0; i < produced; i++) {
                if(dest[i] == '?') {
                        dest[i] = '_';
                        *mangled |= 1;
                }
        }
        return (int) produced;
}
202
203 void wchar_to_dos(doscp_t *cp,
204                   wchar_t *wchar, char *dos, size_t len, int *mangled)
205 {
206         safe_iconv(cp->to, wchar, dos, len, mangled);
207 }
208
209 #else
210
211 #include "codepage.h"
212
/*
 * Table-based codepage handle, used when iconv is not available.
 */
struct doscp_t {
        /* set by cp_open() to the tounix table of the selected codepage */
        unsigned char *from_dos;
        /* reverse table filled by cp_open(): indexed by (native & 0x7f),
         * holds the matching DOS byte */
        unsigned char to_dos[0x80];
};
217
218 doscp_t *cp_open(int codepage)
219 {
220         doscp_t *ret;
221         int i;
222         Codepage_t *cp;
223
224         if(codepage == 0)
225                 codepage = 850;
226
227         ret = New(doscp_t);
228         if(ret == NULL)
229                 return ret;
230
231         for(cp=codepages; cp->nr ; cp++)
232                 if(cp->nr == codepage) {
233                         ret->from_dos = cp->tounix;
234                         break;
235                 }
236
237         if(ret->from_dos == NULL) {
238                 fprintf(stderr, "Bad codepage %d\n", codepage);
239                 free(ret);
240                 return NULL;
241         }
242
243         for(i=0; i<0x80; i++) {
244                 char native = ret->from_dos[i];
245                 if(! (native & 0x80))
246                         continue;
247                 ret->to_dos[native & 0x7f] = 0x80 | i;
248         }
249         return ret;
250 }
251
252 void cp_close(doscp_t *cp)
253 {
254         free(cp);
255 }
256
257 int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len)
258 {
259         int i;
260
261         for(i=0; i<len && dos[i]; i++) {
262                 char c = dos[i];
263                 if(c >= ' ' && c <= '~')
264                         wchar[i] = c;
265                 else {
266                         wchar[i] = cp->from_dos[c & 0x7f];
267                 }
268         }
269         wchar[i] = '\0';
270         return i;
271 }
272
273
274 void wchar_to_dos(doscp_t *cp,
275                   wchar_t *wchar, char *dos, size_t len, int *mangled)
276 {
277         int i;
278         for(i=0; i<len && wchar[i]; i++) {
279                 char c = wchar[i];
280                 if(c >= ' ' && c <= '~')
281                         dos[i] = c;
282                 else {
283                         dos[i] = cp->to_dos[c & 0x7f];
284                         if(dos[i] == '\0') {
285                                 dos[i]='_';
286                                 *mangled=1;
287                         }
288                 }
289         }
290 }
291
292 #endif
293
294
295 #ifndef HAVE_WCHAR_H
296
/* Placeholder conversion state for systems without <wchar.h>; never used */
typedef int mbstate_t;

/*
 * Minimal stand-in for wcrtomb(): stores wc as a single byte in *s.
 *
 * Returns 1 on success. A wide character that does not survive the trip
 * through a single byte is not stored; errno is set to EILSEQ and
 * (size_t)-1 is returned, which wchar_to_native() turns into '_'.
 * Previously such characters were silently truncated.
 */
static inline size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps)
{
        (void) ps;
        /* round-trip test works whatever the width/signedness of wchar_t */
        if((wchar_t) (unsigned char) wc != wc) {
                errno = EILSEQ;
                return (size_t) -1;
        }
        *s = (char) wc;
        return 1;
}

/*
 * Minimal stand-in for mbrtowc(): every byte is one character.
 *
 * Stores the byte at *s (as an unsigned value, so high bytes do not become
 * negative) in *pwc. Returns 0 when that byte is the NUL terminator and 1
 * otherwise; native_to_wchar() relies on the 0 to stop at the end of an
 * unbounded string. n is ignored.
 */
static inline size_t mbrtowc(wchar_t *pwc, const char *s, 
                             size_t n, mbstate_t *ps)
{
        (void) n;
        (void) ps;
        *pwc = (unsigned char) *s;
        return *s != '\0' ? 1 : 0;
}
311
312 #endif
313
314 #ifdef HAVE_ICONV_H
315
316 #include <langinfo.h>
317
318 static iconv_t to_native = NULL;
319
320 static void initialize_to_native(void)
321 {
322         char *li, *cp;
323         int len;
324         if(to_native != NULL)
325                 return;
326         li = nl_langinfo(CODESET);
327         len = strlen(li) + 11;
328         if(getWcharCp() == NULL)
329                 exit(1);
330         cp = safe_malloc(len);
331         strcpy(cp, li);
332         strcat(cp, "//TRANSLIT");
333         to_native = iconv_open(cp, wcharCp);
334         if(to_native == (iconv_t) -1)
335                 to_native = iconv_open(li, wcharCp);
336         if(to_native == (iconv_t) -1)
337                 fprintf(stderr, "Could not allocate iconv for %s\n", cp);
338         free(cp);
339         if(to_native == (iconv_t) -1)
340                 exit(1);
341 }
342
343
344
345 #endif
346
347
/**
 * Convert wchar string to native, converting at most len wchar characters
 * and stopping early at a 0 wide character. The result is NUL-terminated,
 * so native must have room for the converted bytes plus one (with iconv the
 * conversion itself may use up to 4 bytes per wide character).
 *
 * Wide characters that cannot be represented are replaced by '_'.
 *
 * Returns number of generated native characters (bytes), not counting the
 * terminator, or -1 if wcrtomb() fails for a reason other than EILSEQ
 * (non-iconv build only).
 */
int wchar_to_native(const wchar_t *wchar, char *native, size_t len)
{
#ifdef HAVE_ICONV_H
        /* safe_iconv() only ORs into this, so it must start out defined */
        int mangled = 0;
        int r;
        initialize_to_native();
        len = wcsnlen(wchar,len);
        r=safe_iconv(to_native, wchar, native, len, &mangled);
        native[r]='\0';
        return r;
#else
        size_t i;
        char *dptr = native;
        mbstate_t ps;
        memset(&ps, 0, sizeof(ps));
        for(i=0; i<len && wchar[i] != 0; i++) {
                /* wcrtomb() reports failure as (size_t)-1 */
                size_t r = wcrtomb(dptr, wchar[i], &ps);
                if(r == (size_t) -1) {
                        if(errno != EILSEQ)
                                return -1;
                        /* unrepresentable character: substitute it and
                         * start over from a clean shift state */
                        *dptr='_';
                        r=1;
                        memset(&ps, 0, sizeof(ps));
                }
                dptr+=r;
        }
        *dptr='\0';
        return (int) (dptr-native);
#endif
}
381
/**
 * Convert native string to wchar string, converting at most len wchar
 * characters. If end is supplied, stop conversion when the source pointer
 * reaches end; otherwise stop at the NUL terminator. The result is
 * terminated with a 0 wide character, so wchar must have room for len + 1
 * wide characters.
 *
 * A byte sequence that cannot be converted consumes one byte: it is taken
 * as Latin1 if the byte is in 0xa0..0xfe and replaced by '_' otherwise.
 *
 * If mangled and end are both supplied and input is left over before end
 * when conversion stops, *mangled is OR-ed with 3.
 *
 * Returns number of converted wchars.
 */
int native_to_wchar(const char *native, wchar_t *wchar, size_t len,
                    const char *end, int *mangled)
{
        mbstate_t ps;
        size_t i;
        memset(&ps, 0, sizeof(ps));

        /* test end for NULL first so it is never used in a relational
         * comparison when absent */
        for(i=0; i<len && (!end || native < end); i++) {
                /* Byte budget for this character: what is left before end,
                 * or one full multibyte character when the input is
                 * NUL-terminated. (len counts wide characters and is not a
                 * valid byte limit.) */
                size_t avail = end ? (size_t) (end - native)
                                   : (size_t) MB_CUR_MAX;
                size_t r = mbrtowc(wchar+i, native, avail, &ps);
                if(r == (size_t) -1 || r == (size_t) -2) {
                        /* Unconvertible character. Just pretend it's Latin1
                           encoded (if valid Latin1 character) or substitute
                           with an underscore if not
                        */
                        unsigned char c = (unsigned char) *native;
                        if(c >= 0xa0 && c < 0xff)
                                wchar[i] = c;
                        else
                                wchar[i] = '_';
                        memset(&ps, 0, sizeof(ps));
                        r=1;
                }
                if(r == 0)
                        break;
                native += r;
        }
        if(mangled && end && native < end)
                *mangled |= 3;
        wchar[i]='\0';
        return (int) i;
}
418