Imported Upstream version 4.0.18
[platform/upstream/mtools.git] / charsetConv.c
1 /*  Copyright 2008,2009 Alain Knaff.
2  *  This file is part of mtools.
3  *                              
4  *  Mtools is free software: you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation, either version 3 of the License, or   
7  *  (at your option) any later version.                                 
8  *                                                                      
9  *  Mtools is distributed in the hope that it will be useful,           
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of      
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with Mtools.  If not, see <http://www.gnu.org/licenses/>.
16  *
17  * Various character set conversions used by mtools
18  */
19 #include "sysincludes.h"
20 #include "msdos.h"
21 #include "mtools.h"
22
23 #include <stdio.h>
24 #include <errno.h>
25 #include <stdlib.h>
26 #include "file_name.h"
27
28
29 #ifdef HAVE_ICONV_H
30 #include <iconv.h>
31
/* Conversion state for one DOS codepage (iconv build): one iconv
 * descriptor per direction, both opened by cp_open() and released by
 * cp_close(). */
struct doscp_t {
        iconv_t from;   /* DOS codepage -> wchar_t encoding */
        iconv_t to;     /* wchar_t encoding -> DOS codepage */
};
36
/* iconv name of the encoding that matches wchar_t on this platform.
 * NULL until getWcharCp() has found a working candidate. */
static const char *wcharCp=NULL;

/* Candidate iconv names for the wchar_t encoding, probed in this order
 * by getWcharCp(). */
static const char* wcharTries[] = {
        "WCHAR_T",
        "UTF-32BE", "UTF-32LE",
        "UTF-16BE", "UTF-16LE",
        "UTF-32", "UTF-16",
        "UCS-4BE", "UCS-4LE",
        "UCS-2BE", "UCS-2LE",
        "UCS-4", "UCS-2"
};

/* Candidate iconv names for a plain ASCII target; try() uses the first
 * one that iconv_open() accepts as the destination of its probe. */
static const char *asciiTries[] = {
        "ASCII", "ASCII-GR", "ISO8859-1"
};

/* Wide-character probe text that try() converts and compares to "ab". */
static const wchar_t *testString = L"ab";
54
55 static int try(const char *testCp) {
56         size_t res;
57         char *inbuf = (char *)testString;
58         size_t inbufLen = 2*sizeof(wchar_t);
59         char outbuf[3];
60         char *outbufP = outbuf;
61         size_t outbufLen = 2*sizeof(char);
62         iconv_t test;
63         int i;
64         
65         for(i=0; i < sizeof(asciiTries) / sizeof(asciiTries[0]); i++) {
66                 test = iconv_open(asciiTries[i], testCp);
67                 if(test != (iconv_t) -1)
68                         break;
69         }
70         if(test == (iconv_t) -1)
71                 goto fail0;
72         res = iconv(test,
73                     &inbuf, &inbufLen,
74                     &outbufP, &outbufLen);
75         if(res != 0 || outbufLen != 0 || inbufLen != 0)
76                 goto fail;
77         if(memcmp(outbuf, "ab", 2))
78                 goto fail;
79         /* fprintf(stderr, "%s ok\n", testCp); */
80         return 1;
81  fail:
82         iconv_close(test);
83  fail0:
84         /*fprintf(stderr, "%s fail\n", testCp);*/
85         return 0;
86 }
87
88 static const char *getWcharCp(void) {
89         unsigned int i;
90         if(wcharCp != NULL)
91                 return wcharCp; 
92         for(i=0; i< sizeof(wcharTries) / sizeof(wcharTries[0]); i++) {
93                 if(try(wcharTries[i]))
94                         return (wcharCp=wcharTries[i]);
95         }
96         fprintf(stderr, "No codepage found for wchar_t\n");
97         return NULL;
98 }
99
100
101 doscp_t *cp_open(int codepage)
102 {
103         char dosCp[17];
104         doscp_t *ret;
105         iconv_t *from;
106         iconv_t *to;
107
108         if(codepage == 0)
109                 codepage = mtools_default_codepage;
110         if(codepage < 0 || codepage > 9999) {
111                 fprintf(stderr, "Bad codepage %d\n", codepage);
112                 return NULL;
113         }
114
115         if(getWcharCp() == NULL)
116                 return NULL;
117
118         sprintf(dosCp, "CP%d", codepage);
119         from = iconv_open(wcharCp, dosCp);
120         if(from == (iconv_t)-1) {
121                 fprintf(stderr, "Error converting to codepage %d %s\n",
122                         codepage, strerror(errno));
123                 return NULL;
124         }
125
126         sprintf(dosCp, "CP%d//TRANSLIT", codepage);
127         to   =  iconv_open(dosCp, wcharCp);
128         if(to == (iconv_t)-1) {
129                 /* Transliteration not supported? */
130                 sprintf(dosCp, "CP%d", codepage);
131                 to   =  iconv_open(dosCp, wcharCp);
132         }
133         if(to == (iconv_t)-1) {
134                 iconv_close(from);
135                 fprintf(stderr, "Error converting to codepage %d %s\n",
136                         codepage, strerror(errno));
137                 return NULL;
138         }
139
140         ret = New(doscp_t);
141         if(ret == NULL)
142                 return ret;
143         ret->from = from;
144         ret->to   = to;
145         return ret;
146 }
147
148 void cp_close(doscp_t *cp)
149 {
150         iconv_close(cp->to);
151         iconv_close(cp->from);
152         free(cp);
153 }
154
155 int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len)
156 {
157         int r;
158         size_t in_len=len;
159         size_t out_len=len*sizeof(wchar_t);
160         wchar_t *dptr=wchar;
161         r=iconv(cp->from, &dos, &in_len, (char **)&dptr, &out_len);
162         if(r < 0)
163                 return r;
164         *dptr = L'\0';
165         return dptr-wchar;
166 }
167
/**
 * Converts len wide characters to dest using descriptor conv, replacing
 * every character that iconv rejects with EILSEQ by an underscore.
 *
 * Caller's responsibility to ensure that dest is large enough: up to
 * 4*len bytes may be written, and no terminator is appended.
 *
 * If mangled is non-NULL, bit 0 of *mangled is set whenever a character
 * had to be replaced, either because iconv rejected it or because the
 * conversion produced a '?' (which is rewritten to '_').
 *
 * Returns the number of bytes stored in dest. Conversion stops early,
 * without an error indication, on any iconv error other than EILSEQ.
 */
static int safe_iconv(iconv_t conv, const wchar_t *wchar, char *dest,
                      size_t len, int *mangled)
{
        size_t r;
        size_t i;
        size_t in_len=len*sizeof(wchar_t);      /* input size in BYTES */
        size_t out_len=len*4;
        char *in = (char *)wchar;
        char *dptr = dest;

        while(in_len > 0) {
                r=iconv(conv, &in, &in_len, &dptr, &out_len);
                if(r != (size_t)-1 || errno != EILSEQ) {
                        /* everything transformed, or error that is _not_ a bad
                         * character */
                        break;
                }
                if(mangled)
                        *mangled |= 1;

                /* No room left for the replacement, or no complete wide
                 * character left to skip */
                if(out_len == 0 || in_len < sizeof(wchar_t))
                        break;

                *dptr++ = '_';
                out_len--;

                /* Skip the offending wide character. in_len counts bytes,
                 * so it must shrink by a whole wchar_t, not by one. */
                in += sizeof(wchar_t);
                in_len -= sizeof(wchar_t);
        }

        len = dptr-dest; /* how many dest characters have there been
                            generated */

        /* eliminate question marks which might have been formed by
           untransliterable characters */
        for(i=0; i<len; i++) {
                if(dest[i] == '?') {
                        dest[i] = '_';
                        if(mangled)
                                *mangled |= 1;
                }
        }
        return (int)len;
}
212
213 void wchar_to_dos(doscp_t *cp,
214                   wchar_t *wchar, char *dos, size_t len, int *mangled)
215 {
216         safe_iconv(cp->to, wchar, dos, len, mangled);
217 }
218
219 #else
220
221 #include "codepage.h"
222
/* Conversion state for one DOS codepage (table build, no iconv). */
struct doscp_t {
        /* Table taken from the matching codepages[] entry (its tounix
         * member); indexed with the low 7 bits of a DOS byte. */
        unsigned char *from_dos;
        /* Reverse table built by cp_open(); indexed with the low 7 bits
         * of a native byte that has its high bit set. */
        unsigned char to_dos[0x80];
};
227
228 doscp_t *cp_open(int codepage)
229 {
230         doscp_t *ret;
231         int i;
232         Codepage_t *cp;
233
234         if(codepage == 0)
235                 codepage = 850;
236
237         ret = New(doscp_t);
238         if(ret == NULL)
239                 return ret;
240
241         for(cp=codepages; cp->nr ; cp++)
242                 if(cp->nr == codepage) {
243                         ret->from_dos = cp->tounix;
244                         break;
245                 }
246
247         if(ret->from_dos == NULL) {
248                 fprintf(stderr, "Bad codepage %d\n", codepage);
249                 free(ret);
250                 return NULL;
251         }
252
253         for(i=0; i<0x80; i++) {
254                 char native = ret->from_dos[i];
255                 if(! (native & 0x80))
256                         continue;
257                 ret->to_dos[native & 0x7f] = 0x80 | i;
258         }
259         return ret;
260 }
261
/* Releases a doscp_t obtained from cp_open(). In this build the
 * structure holds no other owned resources: from_dos points into the
 * codepages[] list and is not freed here. */
void cp_close(doscp_t *cp)
{
        free(cp);
}
266
267 int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len)
268 {
269         int i;
270
271         for(i=0; i<len && dos[i]; i++) {
272                 char c = dos[i];
273                 if(c >= ' ' && c <= '~')
274                         wchar[i] = c;
275                 else {
276                         wchar[i] = cp->from_dos[c & 0x7f];
277                 }
278         }
279         wchar[i] = '\0';
280         return i;
281 }
282
283
284 void wchar_to_dos(doscp_t *cp,
285                   wchar_t *wchar, char *dos, size_t len, int *mangled)
286 {
287         int i;
288         for(i=0; i<len && wchar[i]; i++) {
289                 char c = wchar[i];
290                 if(c >= ' ' && c <= '~')
291                         dos[i] = c;
292                 else {
293                         dos[i] = cp->to_dos[c & 0x7f];
294                         if(dos[i] == '\0') {
295                                 dos[i]='_';
296                                 *mangled=1;
297                         }
298                 }
299         }
300 }
301
302 #endif
303
304
305 #ifndef HAVE_WCHAR_H
306
/* Minimal single-byte stand-ins for the <wchar.h> conversion API, used
 * when that header is unavailable. There is no shift state, so
 * mbstate_t is a dummy. */
typedef int mbstate_t;

/* Stores wc as one byte at s (truncating it to char); always reports
 * one byte written. ps is ignored. */
static inline size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps)
{
        (void) ps;
        *s = wc;
        return 1;
}

/* Widens the single byte at s into *pwc. Returns 0 when that byte is
 * the terminating NUL and 1 otherwise, as the standard mbrtowc does;
 * native_to_wchar() depends on the 0 to stop at the end of a string
 * when it has no explicit end pointer. n and ps are ignored. */
static inline size_t mbrtowc(wchar_t *pwc, const char *s, 
                             size_t n, mbstate_t *ps)
{
        (void) n;
        (void) ps;
        *pwc = *s;
        return (*s == '\0') ? 0 : 1;
}
321
322 #endif
323
324 #ifdef HAVE_ICONV_H
325
326 #include <langinfo.h>
327
328 static iconv_t to_native = NULL;
329
330 static void initialize_to_native(void)
331 {
332         char *li, *cp;
333         int len;
334         if(to_native != NULL)
335                 return;
336         li = nl_langinfo(CODESET);
337         len = strlen(li) + 11;
338         if(getWcharCp() == NULL)
339                 exit(1);
340         cp = safe_malloc(len);
341         strcpy(cp, li);
342         strcat(cp, "//TRANSLIT");
343         to_native = iconv_open(cp, wcharCp);
344         if(to_native == (iconv_t) -1)
345                 to_native = iconv_open(li, wcharCp);
346         if(to_native == (iconv_t) -1)
347                 fprintf(stderr, "Could not allocate iconv for %s\n", cp);
348         free(cp);
349         if(to_native == (iconv_t) -1)
350                 exit(1);
351 }
352
353
354
355 #endif
356
357
/**
 * Convert wchar string to native, converting at most len wchar characters
 * and stopping early at a zero wide character. The result is always
 * NUL-terminated; native must be large enough for the converted bytes
 * plus the terminator (the iconv build may produce up to 4 bytes per
 * wide character).
 *
 * Characters that cannot be represented are replaced by '_'.
 *
 * Returns number of generated native characters (terminator excluded),
 * or -1 in the non-iconv build if wcrtomb fails for a reason other than
 * EILSEQ.
 */
int wchar_to_native(const wchar_t *wchar, char *native, size_t len)
{
#ifdef HAVE_ICONV_H
        /* safe_iconv ORs into this flag, so it must start out defined
         * even though the result is not used here */
        int mangled = 0;
        int r;
        initialize_to_native();
        len = wcsnlen(wchar,len);
        r=safe_iconv(to_native, wchar, native, len, &mangled);
        native[r]='\0';
        return r;
#else
        size_t i;
        char *dptr = native;
        mbstate_t ps;
        memset(&ps, 0, sizeof(ps));
        for(i=0; i<len && wchar[i] != 0; i++) {
                /* wcrtomb returns size_t; failure is (size_t)-1 */
                size_t r = wcrtomb(dptr, wchar[i], &ps);
                if(r == (size_t)-1) {
                        if(errno != EILSEQ)
                                return -1;
                        /* Unrepresentable character: substitute and start
                         * over with a clean conversion state */
                        *dptr='_';
                        r=1;
                        memset(&ps, 0, sizeof(ps));
                }
                dptr+=r;
        }
        *dptr='\0';
        return dptr-native;
#endif
}
391
/**
 * Convert native string to wchar string, generating at most len wchar
 * characters. If end is supplied, stop conversion when source pointer
 * reaches end; otherwise stop at the terminating NUL. A terminating
 * zero is always stored after the generated characters, so wchar needs
 * room for len+1 elements.
 *
 * Bytes that cannot be decoded are taken as Latin1 if in the range
 * 0xa0..0xfe and replaced by '_' otherwise.
 *
 * If mangled is non-NULL and input was left unconverted (end not
 * reached, or without end: more text remains after len characters),
 * *mangled gets bits 0 and 1 set.
 *
 * Returns number of generated wchars
 */
int native_to_wchar(const char *native, wchar_t *wchar, size_t len,
                    const char *end, int *mangled)
{
        mbstate_t ps;
        unsigned int i;
        memset(&ps, 0, sizeof(ps));

        /* Test end for NULL first: relational comparison against a null
         * pointer is not meaningful */
        for(i=0; i<len && (!end || native < end); i++) {
                /* Bytes mbrtowc may inspect: never past end when a limit
                 * was given; otherwise one full multibyte character (the
                 * terminating NUL stops it earlier). This is unrelated to
                 * len, which counts OUTPUT characters. */
                size_t avail = end ? (size_t)(end - native)
                                   : (size_t)MB_CUR_MAX;
                size_t r = mbrtowc(wchar+i, native, avail, &ps);
                if(r == (size_t)-1 || r == (size_t)-2) {
                        /* Unconvertible character. Just pretend it's Latin1
                           encoded (if valid Latin1 character) or substitute
                           with an underscore if not
                        */
                        char c = *native;
                        if(c >= '\xa0' && c < '\xff')
                                wchar[i] = c & 0xff;
                        else
                                wchar[i] = '_';
                        memset(&ps, 0, sizeof(ps));
                        r=1;
                }
                if(r == 0)
                        break;  /* reached the terminating NUL */
                native += r;
        }
        if(mangled && ((end && native < end) || (!end && *native &&  i == len)))
                *mangled |= 3;
        wchar[i]='\0';
        return i;
}
428