48afd54ff3f8add686600b0c5370d26a4c9ebed1
[platform/upstream/libxml2.git] / os400 / iconv / bldcsndfa / bldcsndfa.c
1 /**
2 ***     Build a deterministic finite automaton to associate CCSIDs with
3 ***             character set names.
4 ***
5 ***     Compile on OS/400 with options SYSIFCOPT(*IFSIO).
6 ***
7 ***     See Copyright for the status of this software.
8 ***
9 ***     Author: Patrick Monnerat <pm@datasphere.ch>, DATASPHERE S.A.
10 **/
11
12 #include <stdio.h>
13 #include <errno.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <fcntl.h>
17 #include <ctype.h>
18
19 #include <iconv.h>
20
21
22 #ifdef OLDXML
23 #include "xml.h"
24 #else
25 #include <libxml/hash.h>
26 #include <libxml/parser.h>
27 #include <libxml/xpath.h>
28 #include <libxml/xpathInternals.h>
29 #endif
30
31
32 #ifdef __OS400__
33 #define iconv_open_error(cd)            ((cd).return_value == -1)
34 #define set_iconv_open_error(cd)        ((cd).return_value = -1)
35 #else
36 #define iconv_open_error(cd)            ((cd) == (iconv_t) -1)
37 #define set_iconv_open_error(cd)        ((cd) = (iconv_t) -1)
38 #endif
39
40
41 #define C_SOURCE_CCSID          500
42 #define C_UTF8_CCSID            1208
43
44
45 #define UTF8_SPACE      0x20
46 #define UTF8_HT         0x09
47 #define UTF8_0          0x30
48 #define UTF8_9          0x39
49 #define UTF8_A          0x41
50 #define UTF8_Z          0x5A
51 #define UTF8_a          0x61
52 #define UTF8_z          0x7A
53
54
55 #define GRANULE         128             /* Memory allocation granule. */
56
57 #define EPSILON         0x100           /* Token for empty transition. */
58
59
60 #ifndef OFFSETOF
61 #define OFFSETOF(t, f)  ((unsigned int) ((char *) &((t *) 0)->f - (char *) 0))
62 #endif
63
64 #ifndef OFFSETBY
65 #define OFFSETBY(t, p, o)       ((t *) ((char *) (p) + (unsigned int) (o)))
66 #endif
67
68
69 typedef struct t_transition     t_transition;   /* NFA/DFA transition. */
70 typedef struct t_state          t_state;        /* NFA/DFA state node. */
71 typedef struct t_symlist        t_symlist;      /* Symbol (i.e.: name) list. */
72 typedef struct t_chset          t_chset;        /* Character set. */
73 typedef struct t_stategroup     t_stategroup;   /* Optimization group. */
74 typedef unsigned char           utf8char;       /* UTF-8 character byte. */
75 typedef unsigned char           byte;           /* Untyped data byte. */
76
77
78 typedef struct {                        /* Set of pointers. */
79         unsigned int    p_size;         /* Current allocated size. */
80         unsigned int    p_card;         /* Current element count. */
81         void *          p_set[1];       /* Element array. */
82 }               t_powerset;
83
84
85 struct t_transition {
86         t_transition *  t_forwprev;     /* Head of forward transition list. */
87         t_transition *  t_forwnext;     /* Tail of forward transition list. */
88         t_transition *  t_backprev;     /* Head of backward transition list. */
89         t_transition *  t_backnext;     /* Tail of backward transition list. */
90         t_state *       t_from;         /* Incoming state. */
91         t_state *       t_to;           /* Destination state. */
92         unsigned short  t_token;        /* Transition token. */
93         unsigned int    t_index;        /* Transition array index. */
94 };
95
96
97 struct t_state {
98         t_state *       s_next;         /* Next state (for DFA construction). */
99         t_state *       s_stack;        /* Unprocessed DFA states stack. */
100         t_transition *  s_forward;      /* Forward transitions. */
101         t_transition *  s_backward;     /* Backward transitions. */
102         t_chset *       s_final;        /* Recognized character set. */
103         t_powerset *    s_nfastates;    /* Corresponding NFA states. */
104         unsigned int    s_index;        /* State index. */
105 };
106
107
108 struct t_symlist {
109         t_symlist *     l_next;         /* Next name in list. */
110         utf8char        l_symbol[1];    /* Name bytes. */
111 };
112
113
114 struct t_chset {
115         t_chset *       c_next;         /* Next character set. */
116         t_symlist *     c_names;        /* Character set name list. */
117         iconv_t         c_fromUTF8;     /* Conversion from UTF-8. */
118         unsigned int    c_ccsid;        /* IBM character set code. */
119         unsigned int    c_mibenum;      /* IANA character code. */
120 };
121
122
123 struct t_stategroup {
124         t_stategroup *  g_next;         /* Next group. */
125         t_state *       g_member;       /* Group member (s_stack) list. */
126         unsigned int    g_id;           /* Group ident. */
127 };
128
129
130
131 t_chset *       chset_list;             /* Character set list. */
132 t_state *       initial_state;          /* Initial NFA state. */
133 iconv_t         job2utf8;               /* Job CCSID to UTF-8 conversion. */
134 iconv_t         utf82job;               /* UTF-8 to job CCSID conversion. */
135 t_state *       dfa_states;             /* List of DFA states. */
136 unsigned int    groupid;                /* Group ident counter. */
137
138
139 /**
140 ***     UTF-8 strings.
141 **/
142
143 #pragma convert(819)
144
145 static const utf8char   utf8_MIBenum[] = "MIBenum";
146 static const utf8char   utf8_mibenum[] = "mibenum";
147 static const utf8char   utf8_ibm_[] = "ibm-";
148 static const utf8char   utf8_IBMCCSID[] = "IBMCCSID";
149 static const utf8char   utf8_iana_[] = "iana-";
150 static const utf8char   utf8_Name[] = "Name";
151 static const utf8char   utf8_Pref_MIME_Name[] = "Preferred MIME Name";
152 static const utf8char   utf8_Aliases[] = "Aliases";
153 static const utf8char   utf8_html[] = "html";
154 static const utf8char   utf8_htmluri[] = "http://www.w3.org/1999/xhtml";
155 static const utf8char   utf8_A[] = "A";
156 static const utf8char   utf8_C[] = "C";
157 static const utf8char   utf8_M[] = "M";
158 static const utf8char   utf8_N[] = "N";
159 static const utf8char   utf8_P[] = "P";
160 static const utf8char   utf8_T[] = "T";
161 static const utf8char   utf8_ccsid[] = "ccsid";
162 static const utf8char   utf8_EBCDIC[] = "EBCDIC";
163 static const utf8char   utf8_ASCII[] = "ASCII";
164 static const utf8char   utf8_assocnodes[] = "/ccsid_mibenum/assoc[@ccsid]";
165 static const utf8char   utf8_aliastext[] =
166                                 "/ccsid_mibenum/assoc[@ccsid=$C]/alias/text()";
167 #ifdef OLDXML
168 static const utf8char   utf8_tablerows[] =
169                         "//table[@id='table-character-sets-1']/*/tr";
170 static const utf8char   utf8_headerpos[] =
171                 "count(th[text()=$T]/preceding-sibling::th)+1";
172 static const utf8char   utf8_getmibenum[] = "number(td[$M])";
173 static const utf8char   utf8_getprefname[] = "string(td[$P])";
174 static const utf8char   utf8_getname[] = "string(td[$N])";
175 static const utf8char   utf8_getaliases[] = "td[$A]/text()";
176 #else
177 static const utf8char   utf8_tablerows[] =
178                         "//html:table[@id='table-character-sets-1']/*/html:tr";
179 static const utf8char   utf8_headerpos[] =
180                 "count(html:th[text()=$T]/preceding-sibling::html:th)+1";
181 static const utf8char   utf8_getmibenum[] = "number(html:td[$M])";
182 static const utf8char   utf8_getprefname[] = "string(html:td[$P])";
183 static const utf8char   utf8_getname[] = "string(html:td[$N])";
184 static const utf8char   utf8_getaliases[] = "html:td[$A]/text()";
185 #endif
186
187 #pragma convert(0)
188
189
190 /**
191 ***     UTF-8 character length table.
192 ***
193 ***     Index is first character byte, value is the character byte count.
194 **/
195
196 static signed char      utf8_chlen[] = {
197 /* 00-07 */     1,      1,      1,      1,      1,      1,      1,      1,
198 /* 08-0F */     1,      1,      1,      1,      1,      1,      1,      1,
199 /* 10-17 */     1,      1,      1,      1,      1,      1,      1,      1,
200 /* 18-1F */     1,      1,      1,      1,      1,      1,      1,      1,
201 /* 20-27 */     1,      1,      1,      1,      1,      1,      1,      1,
202 /* 28-2F */     1,      1,      1,      1,      1,      1,      1,      1,
203 /* 30-37 */     1,      1,      1,      1,      1,      1,      1,      1,
204 /* 38-3F */     1,      1,      1,      1,      1,      1,      1,      1,
205 /* 40-47 */     1,      1,      1,      1,      1,      1,      1,      1,
206 /* 48-4F */     1,      1,      1,      1,      1,      1,      1,      1,
207 /* 50-57 */     1,      1,      1,      1,      1,      1,      1,      1,
208 /* 58-5F */     1,      1,      1,      1,      1,      1,      1,      1,
209 /* 60-67 */     1,      1,      1,      1,      1,      1,      1,      1,
210 /* 68-6F */     1,      1,      1,      1,      1,      1,      1,      1,
211 /* 70-77 */     1,      1,      1,      1,      1,      1,      1,      1,
212 /* 78-7F */     1,      1,      1,      1,      1,      1,      1,      1,
213 /* 80-87 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
214 /* 88-8F */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
215 /* 90-97 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
216 /* 98-9F */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
217 /* A0-A7 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
218 /* A8-AF */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
219 /* B0-B7 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
220 /* B8-BF */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
221 /* C0-C7 */     2,      2,      2,      2,      2,      2,      2,      2,
222 /* C8-CF */     2,      2,      2,      2,      2,      2,      2,      2,
223 /* D0-D7 */     2,      2,      2,      2,      2,      2,      2,      2,
224 /* D8-DF */     2,      2,      2,      2,      2,      2,      2,      2,
225 /* E0-E7 */     3,      3,      3,      3,      3,      3,      3,      3,
226 /* E8-EF */     3,      3,      3,      3,      3,      3,      3,      3,
227 /* F0-F7 */     4,      4,      4,      4,      4,      4,      4,      4,
228 /* F8-FF */     5,      5,      5,      5,      6,      6,      -1,     -1
229 };
230
231
232
233 void
234 chknull(void * p)
235
236 {
237         if (p)
238                 return;
239
240         fprintf(stderr, "Not enough memory\n");
241         exit(1);
242 }
243
244
245 void
246 makecode(char * buf, unsigned int ccsid)
247
248 {
249         ccsid &= 0xFFFF;
250         memset(buf, 0, 32);
251         sprintf(buf, "IBMCCSID%05u0000000", ccsid);
252 }
253
254
255 iconv_t
256 iconv_open_ccsid(unsigned int ccsidout,
257                                 unsigned int ccsidin, unsigned int nullflag)
258
259 {
260         char fromcode[33];
261         char tocode[33];
262
263         makecode(fromcode, ccsidin);
264         makecode(tocode, ccsidout);
265         memset(tocode + 13, 0, sizeof tocode - 13);
266
267         if (nullflag)
268                 fromcode[18] = '1';
269
270         return iconv_open(tocode, fromcode);
271 }
272
273
274 unsigned int
275 getnum(char * * cpp)
276
277 {
278         unsigned int n;
279         char * cp;
280
281         cp = *cpp;
282         n = 0;
283
284         while (isdigit(*cp))
285                 n = 10 * n + *cp++ - '0';
286
287         *cpp = cp;
288         return n;
289 }
290
291
292 const utf8char *
293 hashBinaryKey(const byte * bytes, unsigned int len)
294
295 {
296         const byte * bp;
297         utf8char * key;
298         utf8char * cp;
299         unsigned int n;
300         unsigned int n4;
301         unsigned int i;
302
303         /**
304         ***     Encode binary data in character form to be used as hash
305         ***             table key.
306         **/
307
308         n = (4 * len + 2) / 3;
309         key = (utf8char *) malloc(n + 1);
310         chknull(key);
311         bp = bytes;
312         cp = key;
313
314         for (n4 = n >> 2; n4; n4--) {
315                 i = (bp[0] << 16) | (bp[1] << 8) | bp[2];
316                 *cp++ = 0x21 + ((i >> 18) & 0x3F);
317                 *cp++ = 0x21 + ((i >> 12) & 0x3F);
318                 *cp++ = 0x21 + ((i >> 6) & 0x3F);
319                 *cp++ = 0x21 + (i & 0x3F);
320                 bp += 3;
321                 }
322
323         switch (n & 0x3) {
324
325         case 2:
326                 *cp++ = 0x21 + ((*bp >> 2) & 0x3F);
327                 *cp++ = 0x21 + ((*bp << 4) & 0x3F);
328                 break;
329
330         case 3:
331                 i = (bp[0] << 8) | bp[1];
332                 *cp++ = 0x21 + ((i >> 10) & 0x3F);
333                 *cp++ = 0x21 + ((i >> 4) & 0x3F);
334                 *cp++ = 0x21 + ((i << 2) & 0x3F);
335                 break;
336                 }
337
338         *cp = '\0';
339         return key;
340 }
341
342
343 void *
344 hash_get(xmlHashTablePtr h, const void * binkey, unsigned int len)
345
346 {
347         const utf8char * key;
348         void * result;
349
350         key = hashBinaryKey((const byte *) binkey, len);
351         result = xmlHashLookup(h, key);
352         free((char *) key);
353         return result;
354 }
355
356
357 int
358 hash_add(xmlHashTablePtr h, const void * binkey, unsigned int len, void * data)
359
360 {
361         const utf8char * key;
362         int result;
363
364         key = hashBinaryKey((const byte *) binkey, len);
365         result = xmlHashAddEntry(h, key, data);
366         free((char *) key);
367         return result;
368 }
369
370
371 xmlDocPtr
372 loadXMLFile(const char * filename)
373
374 {
375         struct stat sbuf;
376         byte * databuf;
377         int fd;
378         int i;
379         xmlDocPtr doc;
380
381         if (stat(filename, &sbuf))
382                 return (xmlDocPtr) NULL;
383
384         databuf = malloc(sbuf.st_size + 4);
385
386         if (!databuf)
387                 return (xmlDocPtr) NULL;
388
389         fd = open(filename, O_RDONLY
390 #ifdef O_BINARY
391                                          | O_BINARY
392 #endif
393                                                         );
394
395         if (fd < 0) {
396                 free((char *) databuf);
397                 return (xmlDocPtr) NULL;
398                 }
399
400         i = read(fd, (char *) databuf, sbuf.st_size);
401         close(fd);
402
403         if (i != sbuf.st_size) {
404                 free((char *) databuf);
405                 return (xmlDocPtr) NULL;
406                 }
407
408         databuf[i] = databuf[i + 1] = databuf[i + 2] = databuf[i + 3] = 0;
409         doc = xmlParseMemory((xmlChar *) databuf, i);
410         free((char *) databuf);
411         return doc;
412 }
413
414
415 int
416 match(char * * cpp, char * s)
417
418 {
419         char * cp;
420         int c1;
421         int c2;
422
423         cp = *cpp;
424
425         for (cp = *cpp; c2 = *s++; cp++) {
426                 c1 = *cp;
427
428                 if (c1 != c2) {
429                         if (isupper(c1))
430                                 c1 = tolower(c1);
431
432                         if (isupper(c2))
433                                 c2 = tolower(c2);
434                         }
435
436                 if (c1 != c2)
437                         return 0;
438                 }
439
440         c1 = *cp;
441
442         while (c1 == ' ' || c1 == '\t')
443                 c1 = *++cp;
444
445         *cpp = cp;
446         return 1;
447 }
448
449
450 t_state *
451 newstate(void)
452
453 {
454         t_state * s;
455
456         s = (t_state *) malloc(sizeof *s);
457         chknull(s);
458         memset((char *) s, 0, sizeof *s);
459         return s;
460 }
461
462
463 void
464 unlink_transition(t_transition * t)
465
466 {
467         if (t->t_backnext)
468                 t->t_backnext->t_backprev = t->t_backprev;
469
470         if (t->t_backprev)
471                 t->t_backprev->t_backnext = t->t_backnext;
472         else if (t->t_to)
473                 t->t_to->s_backward = t->t_backnext;
474
475         if (t->t_forwnext)
476                 t->t_forwnext->t_forwprev = t->t_forwprev;
477
478         if (t->t_forwprev)
479                 t->t_forwprev->t_forwnext = t->t_forwnext;
480         else if (t->t_from)
481                 t->t_from->s_forward = t->t_forwnext;
482
483         t->t_backprev = (t_transition *) NULL;
484         t->t_backnext = (t_transition *) NULL;
485         t->t_forwprev = (t_transition *) NULL;
486         t->t_forwnext = (t_transition *) NULL;
487         t->t_from = (t_state *) NULL;
488         t->t_to = (t_state *) NULL;
489 }
490
491
492 void
493 link_transition(t_transition * t, t_state * from, t_state * to)
494
495 {
496         if (!from)
497                 from = t->t_from;
498
499         if (!to)
500                 to = t->t_to;
501
502         unlink_transition(t);
503
504         if ((t->t_from = from)) {
505                 if ((t->t_forwnext = from->s_forward))
506                         t->t_forwnext->t_forwprev = t;
507
508                 from->s_forward = t;
509                 }
510
511         if ((t->t_to = to)) {
512                 if ((t->t_backnext = to->s_backward))
513                         t->t_backnext->t_backprev = t;
514
515                 to->s_backward = t;
516                 }
517 }
518
519
520 t_transition *
521 newtransition(unsigned int token, t_state * from, t_state * to)
522
523 {
524         t_transition * t;
525
526         t = (t_transition *) malloc(sizeof *t);
527         chknull(t);
528         memset((char *) t, 0, sizeof *t);
529         t->t_token = token;
530         link_transition(t, from, to);
531         return t;
532 }
533
534
535 t_transition *
536 uniquetransition(unsigned int token, t_state * from, t_state * to)
537
538 {
539         t_transition * t;
540
541         for (t = from->s_forward; t; t = t->t_forwnext)
542                 if (t->t_token == token && (t->t_to == to || !to))
543                         return t;
544
545         return to? newtransition(token, from, to): (t_transition *) NULL;
546 }
547
548
549 int
550 set_position(t_powerset * s, void * e)
551
552 {
553         unsigned int l;
554         unsigned int h;
555         unsigned int m;
556         int i;
557
558         l = 0;
559         h = s->p_card;
560
561         while (l < h) {
562                 m = (l + h) >> 1;
563
564                 /**
565                 ***     If both pointers belong to different allocation arenas,
566                 ***             native comparison may find them neither
567                 ***             equal, nor greater, nor smaller.
568                 ***     We thus compare using memcmp() to get an orthogonal
569                 ***             result.
570                 **/
571
572                 i = memcmp(&e, s->p_set + m, sizeof e);
573
574                 if (i < 0)
575                         h = m;
576                 else if (!i)
577                         return m;
578                 else
579                         l = m + 1;
580                 }
581
582         return l;
583 }
584
585
586 t_powerset *
587 set_include(t_powerset * s, void * e)
588
589 {
590         unsigned int pos;
591         unsigned int n;
592
593         if (!s) {
594                 s = (t_powerset *) malloc(sizeof *s +
595                     GRANULE * sizeof s->p_set);
596                 chknull(s);
597                 s->p_size = GRANULE;
598                 s->p_set[GRANULE] = (t_state *) NULL;
599                 s->p_set[0] = e;
600                 s->p_card = 1;
601                 return s;
602                 }
603
604         pos = set_position(s, e);
605
606         if (pos < s->p_card && s->p_set[pos] == e)
607                 return s;
608
609         if (s->p_card >= s->p_size) {
610                 s->p_size += GRANULE;
611                 s = (t_powerset *) realloc(s,
612                     sizeof *s + s->p_size * sizeof s->p_set);
613                 chknull(s);
614                 s->p_set[s->p_size] = (t_state *) NULL;
615                 }
616
617         n = s->p_card - pos;
618
619         if (n)
620                 memmove((char *) (s->p_set + pos + 1),
621                     (char *) (s->p_set + pos), n * sizeof s->p_set[0]);
622
623         s->p_set[pos] = e;
624         s->p_card++;
625         return s;
626 }
627
628
629 t_state *
630 nfatransition(t_state * to, byte token)
631
632 {
633         t_state * from;
634
635         from = newstate();
636         newtransition(token, from, to);
637         return from;
638 }
639
640
641 static t_state *        nfadevelop(t_state * from, t_state * final, iconv_t icc,
642                                 const utf8char * name, unsigned int len);
643
644
645 void
646 nfaslice(t_state * * from, t_state * * to, iconv_t icc,
647                 const utf8char * chr, unsigned int chlen,
648                 const utf8char * name, unsigned int len, t_state * final)
649
650 {
651         char * srcp;
652         char * dstp;
653         size_t srcc;
654         size_t dstc;
655         unsigned int cnt;
656         t_state * f;
657         t_state * t;
658         t_transition * tp;
659         byte bytebuf[8];
660
661         srcp = (char *) chr;
662         srcc = chlen;
663         dstp = (char *) bytebuf;
664         dstc = sizeof bytebuf;
665         iconv(icc, &srcp, &srcc, &dstp, &dstc);
666         dstp = (char *) bytebuf;
667         cnt = sizeof bytebuf - dstc;
668         t = *to;
669         f = *from;
670
671         /**
672         ***     Check for end of string.
673         **/
674
675         if (!len)
676                 if (t && t != final)
677                         uniquetransition(EPSILON, t, final);
678                 else
679                         t = final;
680
681         if (f)
682                 while (cnt) {
683                         tp = uniquetransition(*dstp, f, (t_state *) NULL);
684
685                         if (!tp)
686                                 break;
687
688                         f = tp->t_to;
689                         dstp++;
690                         cnt--;
691                         }
692
693         if (!cnt) {
694                 if (!t)
695                         t = nfadevelop(f, final, icc, name, len);
696
697                 *to = t;
698                 return;
699                 }
700
701         if (!t) {
702                 t = nfadevelop((t_state *) NULL, final, icc, name, len);
703                 *to = t;
704                 }
705
706         if (!f)
707                 *from = f = newstate();
708
709         while (cnt > 1)
710                 t = nfatransition(t, dstp[--cnt]);
711
712         newtransition(*dstp, f, t);
713 }
714
715
716 t_state *
717 nfadevelop(t_state * from, t_state * final, iconv_t icc,
718                                         const utf8char * name, unsigned int len)
719
720 {
721         int chlen;
722         int i;
723         t_state * to;
724         int uccnt;
725         int lccnt;
726         utf8char chr;
727
728         chlen = utf8_chlen[*name];
729
730         for (i = 1; i < chlen; i++)
731                 if ((name[i] & 0xC0) != 0x80)
732                         break;
733
734         if (i != chlen) {
735                 fprintf(stderr,
736                     "Invalid UTF8 character in character set name\n");
737                 return (t_state *) NULL;
738                 }
739
740         to = (t_state *) NULL;
741         nfaslice(&from, &to,
742             icc, name, chlen, name + chlen, len - chlen, final);
743
744         if (*name >= UTF8_a && *name <= UTF8_z)
745                 chr = *name - UTF8_a + UTF8_A;
746         else if (*name >= UTF8_A && *name <= UTF8_Z)
747                 chr = *name - UTF8_A + UTF8_a;
748         else
749                 return from;
750
751         nfaslice(&from, &to, icc, &chr, 1, name + chlen, len - chlen, final);
752         return from;
753 }
754
755
756
757 void
758 nfaenter(const utf8char * name, int len, t_chset * charset)
759
760 {
761         t_chset * s;
762         t_state * final;
763         t_state * sp;
764         t_symlist * lp;
765
766         /**
767         ***     Enter case-insensitive `name' in NFA in all known
768         ***             character codes.
769         ***     Redundant shift state changes as well as shift state
770         ***             differences between uppercase and lowercase are
771         ***             not handled.
772         **/
773
774         if (len < 0)
775                 len = strlen(name) + 1;
776
777         for (lp = charset->c_names; lp; lp = lp->l_next)
778                 if (!memcmp(name, lp->l_symbol, len))
779                         return;         /* Already entered. */
780
781         lp = (t_symlist *) malloc(sizeof *lp + len);
782         chknull(lp);
783         memcpy(lp->l_symbol, name, len);
784         lp->l_symbol[len] = '\0';
785         lp->l_next = charset->c_names;
786         charset->c_names = lp;
787         final = newstate();
788         final->s_final = charset;
789
790         for (s = chset_list; s; s = s->c_next)
791                 if (!iconv_open_error(s->c_fromUTF8))
792                         sp = nfadevelop(initial_state, final,
793                             s->c_fromUTF8, name, len);
794 }
795
796
797 unsigned int
798 utf8_utostr(utf8char * s, unsigned int v)
799
800 {
801         unsigned int d;
802         unsigned int i;
803
804         d = v / 10;
805         v -= d * 10;
806         i = d? utf8_utostr(s, d): 0;
807         s[i++] = v + UTF8_0;
808         s[i] = '\0';
809         return i;
810 }
811
812
813 unsigned int
814 utf8_utostrpad(utf8char * s, unsigned int v, int digits)
815
816 {
817         unsigned int i = utf8_utostr(s, v);
818         utf8char pad = UTF8_SPACE;
819
820         if (digits < 0) {
821                 pad = UTF8_0;
822                 digits = -digits;
823                 }
824
825         if (i >= digits)
826                 return i;
827
828         memmove(s + digits - i, s, i + 1);
829         memset(s, pad, digits - i);
830         return digits;
831 }
832
833
834 unsigned int
835 utf8_strtou(const utf8char * s)
836
837 {
838         unsigned int v;
839
840         while (*s == UTF8_SPACE || *s == UTF8_HT)
841                 s++;
842
843         for (v = 0; *s >= UTF8_0 && *s <= UTF8_9;)
844                 v = 10 * v + *s++ - UTF8_0;
845
846         return v;
847 }
848
849
850 unsigned int
851 getNumAttr(xmlNodePtr node, const xmlChar * name)
852
853 {
854         const xmlChar * s;
855         unsigned int val;
856
857         s = xmlGetProp(node, name);
858
859         if (!s)
860                 return 0;
861
862         val = utf8_strtou(s);
863         xmlFree((xmlChar *) s);
864         return val;
865 }
866
867
868 void
869 read_assocs(const char * filename)
870
871 {
872         xmlDocPtr doc;
873         xmlXPathContextPtr ctxt;
874         xmlXPathObjectPtr obj;
875         xmlNodePtr node;
876         t_chset * sp;
877         int i;
878         unsigned int ccsid;
879         unsigned int mibenum;
880         utf8char symbuf[32];
881
882         doc = loadXMLFile(filename);
883
884         if (!doc) {
885                 fprintf(stderr, "Cannot load file %s\n", filename);
886                 exit(1);
887                 }
888
889         ctxt = xmlXPathNewContext(doc);
890         obj = xmlXPathEval(utf8_assocnodes, ctxt);
891
892         if (!obj || obj->type != XPATH_NODESET || !obj->nodesetval ||
893             !obj->nodesetval->nodeTab || !obj->nodesetval->nodeNr) {
894                 fprintf(stderr, "No association found in %s\n", filename);
895                 exit(1);
896                 }
897
898         for (i = 0; i < obj->nodesetval->nodeNr; i++) {
899                 node = obj->nodesetval->nodeTab[i];
900                 ccsid = getNumAttr(node, utf8_ccsid);
901                 mibenum = getNumAttr(node, utf8_mibenum);
902
903                 /**
904                 ***     Check for duplicate.
905                 **/
906
907                 for (sp = chset_list; sp; sp = sp->c_next)
908                         if (ccsid && ccsid == sp->c_ccsid ||
909                             mibenum && mibenum == sp->c_mibenum) {
910                                 fprintf(stderr, "Duplicate character set: ");
911                                 fprintf(stderr, "CCSID = %u/%u, ",
912                                     ccsid, sp->c_ccsid);
913                                 fprintf(stderr, "MIBenum = %u/%u\n",
914                                     mibenum, sp->c_mibenum);
915                                 break;
916                                 }
917
918                 if (sp)
919                         continue;
920
921                 /**
922                 ***     Allocate the new character set.
923                 **/
924
925                 sp = (t_chset *) malloc(sizeof *sp);
926                 chknull(sp);
927                 memset(sp, 0, sizeof *sp);
928
929                 if (!ccsid)     /* Do not attempt with current job CCSID. */
930                         set_iconv_open_error(sp->c_fromUTF8);
931                 else {
932                         sp->c_fromUTF8 =
933                             iconv_open_ccsid(ccsid, C_UTF8_CCSID, 0);
934
935                         if (iconv_open_error(sp->c_fromUTF8) == -1)
936                                 fprintf(stderr,
937                                     "Cannot convert into CCSID %u: ignored\n",
938                                     ccsid);
939                         }
940
941                 sp->c_ccsid = ccsid;
942                 sp->c_mibenum = mibenum;
943                 sp->c_next = chset_list;
944                 chset_list = sp;
945                 }
946
947         xmlXPathFreeObject(obj);
948
949         /**
950         ***     Enter aliases.
951         **/
952
953         for (sp = chset_list; sp; sp = sp->c_next) {
954                 strcpy(symbuf, utf8_ibm_);
955                 utf8_utostr(symbuf + 4, sp->c_ccsid);
956                 nfaenter(symbuf, -1, sp);
957                 strcpy(symbuf, utf8_IBMCCSID);
958                 utf8_utostrpad(symbuf + 8, sp->c_ccsid, -5);
959                 nfaenter(symbuf, 13, sp);       /* Not null-terminated. */
960
961                 if (sp->c_mibenum) {
962                         strcpy(symbuf, utf8_iana_);
963                         utf8_utostr(symbuf + 5, sp->c_mibenum);
964                         nfaenter(symbuf, -1, sp);
965                         }
966
967                 xmlXPathRegisterVariable(ctxt, utf8_C,
968                     xmlXPathNewFloat((double) sp->c_ccsid));
969                 obj = xmlXPathEval(utf8_aliastext, ctxt);
970
971                 if (!obj || obj->type != XPATH_NODESET) {
972                         fprintf(stderr, "getAlias failed in %s\n", filename);
973                         exit(1);
974                         }
975
976                 if (obj->nodesetval &&
977                     obj->nodesetval->nodeTab && obj->nodesetval->nodeNr) {
978                         for (i = 0; i < obj->nodesetval->nodeNr; i++) {
979                                 node = obj->nodesetval->nodeTab[i];
980                                 nfaenter(node->content, -1, sp);
981                                 }
982                         }
983
984                 xmlXPathFreeObject(obj);
985                 }
986
987         xmlXPathFreeContext(ctxt);
988         xmlFreeDoc(doc);
989 }
990
991
992 unsigned int
993 columnPosition(xmlXPathContextPtr ctxt, const xmlChar * header)
994
995 {
996         xmlXPathObjectPtr obj;
997         unsigned int res = 0;
998
999         xmlXPathRegisterVariable(ctxt, utf8_T, xmlXPathNewString(header));
1000         obj = xmlXPathEval(utf8_headerpos, ctxt);
1001
1002         if (obj) {
1003                 if (obj->type == XPATH_NUMBER)
1004                         res = (unsigned int) obj->floatval;
1005
1006                 xmlXPathFreeObject(obj);
1007                 }
1008
1009         return res;
1010 }
1011
1012
1013 void
1014 read_iana(const char * filename)
1015
1016 {
1017         xmlDocPtr doc;
1018         xmlXPathContextPtr ctxt;
1019         xmlXPathObjectPtr obj1;
1020         xmlXPathObjectPtr obj2;
1021         xmlNodePtr node;
1022         int prefnamecol;
1023         int namecol;
1024         int mibenumcol;
1025         int aliascol;
1026         int mibenum;
1027         t_chset * sp;
1028         int n;
1029         int i;
1030
1031         doc = loadXMLFile(filename);
1032
1033         if (!doc) {
1034                 fprintf(stderr, "Cannot load file %s\n", filename);
1035                 exit(1);
1036                 }
1037
1038         ctxt = xmlXPathNewContext(doc);
1039
1040 #ifndef OLDXML
1041         xmlXPathRegisterNs(ctxt, utf8_html, utf8_htmluri);
1042 #endif
1043
1044         obj1 = xmlXPathEval(utf8_tablerows, ctxt);
1045
1046         if (!obj1 || obj1->type != XPATH_NODESET || !obj1->nodesetval ||
1047             !obj1->nodesetval->nodeTab || obj1->nodesetval->nodeNr <= 1) {
1048                 fprintf(stderr, "No data in %s\n", filename);
1049                 exit(1);
1050                 }
1051
1052         /**
1053         ***     Identify columns.
1054         **/
1055
1056         xmlXPathSetContextNode(obj1->nodesetval->nodeTab[0], ctxt);
1057         prefnamecol = columnPosition(ctxt, utf8_Pref_MIME_Name);
1058         namecol = columnPosition(ctxt, utf8_Name);
1059         mibenumcol = columnPosition(ctxt, utf8_MIBenum);
1060         aliascol = columnPosition(ctxt, utf8_Aliases);
1061
1062         if (!prefnamecol || !namecol || !mibenumcol || !aliascol) {
1063                 fprintf(stderr, "Key column(s) missing in %s\n", filename);
1064                 exit(1);
1065                 }
1066
1067         xmlXPathRegisterVariable(ctxt, utf8_P,
1068             xmlXPathNewFloat((double) prefnamecol));
1069         xmlXPathRegisterVariable(ctxt, utf8_N,
1070             xmlXPathNewFloat((double) namecol));
1071         xmlXPathRegisterVariable(ctxt, utf8_M,
1072             xmlXPathNewFloat((double) mibenumcol));
1073         xmlXPathRegisterVariable(ctxt, utf8_A,
1074             xmlXPathNewFloat((double) aliascol));
1075
1076         /**
1077         ***     Process each row.
1078         **/
1079
1080         for (n = 1; n < obj1->nodesetval->nodeNr; n++) {
1081                 xmlXPathSetContextNode(obj1->nodesetval->nodeTab[n], ctxt);
1082
1083                 /**
1084                 ***     Get the MIBenum from current row.
1085                 */
1086
1087                 obj2 = xmlXPathEval(utf8_getmibenum, ctxt);
1088
1089                 if (!obj2 || obj2->type != XPATH_NUMBER) {
1090                         fprintf(stderr, "get MIBenum failed at row %u\n", n);
1091                         exit(1);
1092                         }
1093
1094                 if (xmlXPathIsNaN(obj2->floatval) ||
1095                     obj2->floatval < 1.0 || obj2->floatval > 65535.0 ||
1096                     ((unsigned int) obj2->floatval) != obj2->floatval) {
1097                         fprintf(stderr, "invalid MIBenum at row %u\n", n);
1098                         xmlXPathFreeObject(obj2);
1099                         continue;
1100                         }
1101
1102                 mibenum = obj2->floatval;
1103                 xmlXPathFreeObject(obj2);
1104
1105                 /**
1106                 ***     Search the associations for a corresponding CCSID.
1107                 **/
1108
1109                 for (sp = chset_list; sp; sp = sp->c_next)
1110                         if (sp->c_mibenum == mibenum)
1111                                 break;
1112
1113                 if (!sp)
1114                         continue;       /* No CCSID for this MIBenum. */
1115
1116                 /**
1117                 ***     Process preferred MIME name.
1118                 **/
1119
1120                 obj2 = xmlXPathEval(utf8_getprefname, ctxt);
1121
1122                 if (!obj2 || obj2->type != XPATH_STRING) {
1123                         fprintf(stderr,
1124                             "get Preferred_MIME_Name failed at row %u\n", n);
1125                         exit(1);
1126                         }
1127
1128                 if (obj2->stringval && obj2->stringval[0])
1129                         nfaenter(obj2->stringval, -1, sp);
1130
1131                 xmlXPathFreeObject(obj2);
1132
1133                 /**
1134                 ***     Process name.
1135                 **/
1136
1137                 obj2 = xmlXPathEval(utf8_getname, ctxt);
1138
1139                 if (!obj2 || obj2->type != XPATH_STRING) {
1140                         fprintf(stderr, "get name failed at row %u\n", n);
1141                         exit(1);
1142                         }
1143
1144                 if (obj2->stringval && obj2->stringval[0])
1145                         nfaenter(obj2->stringval, -1, sp);
1146
1147                 xmlXPathFreeObject(obj2);
1148
1149                 /**
1150                 ***     Process aliases.
1151                 **/
1152
1153                 obj2 = xmlXPathEval(utf8_getaliases, ctxt);
1154
1155                 if (!obj2 || obj2->type != XPATH_NODESET) {
1156                         fprintf(stderr, "get aliases failed at row %u\n", n);
1157                         exit(1);
1158                         }
1159
1160                 if (obj2->nodesetval && obj2->nodesetval->nodeTab)
1161                         for (i = 0; i < obj2->nodesetval->nodeNr; i++) {
1162                                 node = obj2->nodesetval->nodeTab[i];
1163
1164                                 if (node && node->content && node->content[0])
1165                                         nfaenter(node->content, -1, sp);
1166                                 }
1167
1168                 xmlXPathFreeObject(obj2);
1169                 }
1170
1171         xmlXPathFreeObject(obj1);
1172         xmlXPathFreeContext(ctxt);
1173         xmlFreeDoc(doc);
1174 }
1175
1176
1177 t_powerset *    closureset(t_powerset * dst, t_powerset * src);
1178
1179
1180 t_powerset *
1181 closure(t_powerset * dst, t_state * src)
1182
1183 {
1184         t_transition * t;
1185         unsigned int oldcard;
1186
1187         if (src->s_nfastates) {
1188                 /**
1189                 ***     Is a DFA state: return closure of set of equivalent
1190                 ***             NFA states.
1191                 **/
1192
1193                 return closureset(dst, src->s_nfastates);
1194                 }
1195
1196         /**
1197         ***     Compute closure of NFA state.
1198         **/
1199
1200         dst = set_include(dst, src);
1201
1202         for (t = src->s_forward; t; t = t->t_forwnext)
1203                 if (t->t_token == EPSILON) {
1204                         oldcard = dst->p_card;
1205                         dst = set_include(dst, t->t_to);
1206
1207                         if (oldcard != dst->p_card)
1208                                 dst = closure(dst, t->t_to);
1209                         }
1210
1211         return dst;
1212 }
1213
1214
1215 t_powerset *
1216 closureset(t_powerset * dst, t_powerset * src)
1217
1218 {
1219         unsigned int i;
1220
1221         for (i = 0; i < src->p_card; i++)
1222                 dst = closure(dst, (t_state *) src->p_set[i]);
1223
1224         return dst;
1225 }
1226
1227
1228 t_state *
1229 get_dfa_state(t_state * * stack,
1230                         t_powerset * nfastates, xmlHashTablePtr sethash)
1231
1232 {
1233         t_state * s;
1234
1235         if (s = hash_get(sethash, nfastates->p_set,
1236             nfastates->p_card * sizeof nfastates->p_set[0])) {
1237                 /**
1238                 ***     DFA state already present.
1239                 ***     Release the NFA state set and return
1240                 ***             the address of the old DFA state.
1241                 **/
1242
1243                 free((char *) nfastates);
1244                 return s;
1245                 }
1246
1247         /**
1248         ***     Build the new state.
1249         **/
1250
1251         s = newstate();
1252         s->s_nfastates = nfastates;
1253         s->s_next = dfa_states;
1254         dfa_states = s;
1255         s->s_stack = *stack;
1256         *stack = s;
1257
1258         /**
1259         ***     Enter it in hash.
1260         **/
1261
1262         if (hash_add(sethash, nfastates->p_set,
1263             nfastates->p_card * sizeof nfastates->p_set[0], s))
1264                 chknull(NULL);          /* Memory allocation error. */
1265
1266         return s;
1267 }
1268
1269
1270 int
1271 transcmp(const void * p1, const void * p2)
1272
1273 {
1274         t_transition * t1;
1275         t_transition * t2;
1276
1277         t1 = *(t_transition * *) p1;
1278         t2 = *(t_transition * *) p2;
1279         return ((int) t1->t_token) - ((int) t2->t_token);
1280 }
1281
1282
1283 void
1284 builddfa(void)
1285
1286 {
1287         t_powerset * transset;
1288         t_powerset * stateset;
1289         t_state * s;
1290         t_state * s2;
1291         unsigned int n;
1292         unsigned int i;
1293         unsigned int token;
1294         t_transition * t;
1295         t_state * stack;
1296         xmlHashTablePtr sethash;
1297         unsigned int nst;
1298
1299         transset = set_include(NULL, NULL);
1300         chknull(transset);
1301         stateset = set_include(NULL, NULL);
1302         chknull(stateset);
1303         sethash = xmlHashCreate(1);
1304         chknull(sethash);
1305         dfa_states = (t_state *) NULL;
1306         stack = (t_state *) NULL;
1307         nst = 0;
1308
1309         /**
1310         ***     Build the DFA initial state.
1311         **/
1312
1313         get_dfa_state(&stack, closure(NULL, initial_state), sethash);
1314
1315         /**
1316         ***     Build the other DFA states by looking at each
1317         ***             possible transition from stacked DFA states.
1318         **/
1319
1320         do {
1321                 if (!(++nst % 100))
1322                         fprintf(stderr, "%u DFA states\n", nst);
1323
1324                 s = stack;
1325                 stack = s->s_stack;
1326                 s->s_stack = (t_state *) NULL;
1327
1328                 /**
1329                 ***     Build a set of all non-epsilon transitions from this
1330                 ***             state.
1331                 **/
1332
1333                 transset->p_card = 0;
1334
1335                 for (n = 0; n < s->s_nfastates->p_card; n++) {
1336                         s2 = s->s_nfastates->p_set[n];
1337
1338                         for (t = s2->s_forward; t; t = t->t_forwnext)
1339                                 if (t->t_token != EPSILON) {
1340                                         transset = set_include(transset, t);
1341                                         chknull(transset);
1342                                         }
1343                         }
1344
1345                 /**
1346                 ***     Sort transitions by token.
1347                 **/
1348
1349                 qsort(transset->p_set, transset->p_card,
1350                     sizeof transset->p_set[0], transcmp);
1351
1352                 /**
1353                 ***     Process all transitions, grouping them by token.
1354                 **/
1355
1356                 stateset->p_card = 0;
1357                 token = EPSILON;
1358
1359                 for (i = 0; i < transset->p_card; i++) {
1360                         t = transset->p_set[i];
1361
1362                         if (token != t->t_token) {
1363                                 if (stateset->p_card) {
1364                                         /**
1365                                         ***     Get the equivalent DFA state
1366                                         ***             and create transition.
1367                                         **/
1368
1369                                         newtransition(token, s,
1370                                             get_dfa_state(&stack,
1371                                             closureset(NULL, stateset),
1372                                             sethash));
1373                                         stateset->p_card = 0;
1374                                         }
1375
1376                                 token = t->t_token;
1377                                 }
1378
1379                         stateset = set_include(stateset, t->t_to);
1380                         }
1381
1382                 if (stateset->p_card)
1383                         newtransition(token, s, get_dfa_state(&stack,
1384                             closureset(NULL, stateset), sethash));
1385         } while (stack);
1386
1387         free((char *) transset);
1388         free((char *) stateset);
1389         xmlHashFree(sethash, NULL);
1390
1391         /**
1392         ***     Reverse the state list to get the initial state first,
1393         ***             check for ambiguous prefixes, determine final states,
1394         ***             destroy NFA state sets.
1395         **/
1396
1397         while (s = dfa_states) {
1398                 dfa_states = s->s_next;
1399                 s->s_next = stack;
1400                 stack = s;
1401                 stateset = s->s_nfastates;
1402                 s->s_nfastates = (t_powerset *) NULL;
1403
1404                 for (n = 0; n < stateset->p_card; n++) {
1405                         s2 = (t_state *) stateset->p_set[n];
1406
1407                         if (s2->s_final) {
1408                                 if (s->s_final && s->s_final != s2->s_final)
1409                                         fprintf(stderr,
1410                                             "Ambiguous name for CCSIDs %u/%u\n",
1411                                             s->s_final->c_ccsid,
1412                                             s2->s_final->c_ccsid);
1413
1414                                 s->s_final = s2->s_final;
1415                                 }
1416                         }
1417
1418                 free((char *) stateset);
1419                 }
1420
1421         dfa_states = stack;
1422 }
1423
1424
1425 void
1426 deletenfa(void)
1427
1428 {
1429         t_transition * t;
1430         t_state * s;
1431         t_state * u;
1432         t_state * stack;
1433
1434         stack = initial_state;
1435         stack->s_stack = (t_state *) NULL;
1436
1437         while ((s = stack)) {
1438                 stack = s->s_stack;
1439
1440                 while ((t = s->s_forward)) {
1441                         u = t->t_to;
1442                         unlink_transition(t);
1443                         free((char *) t);
1444
1445                         if (!u->s_backward) {
1446                                 u->s_stack = stack;
1447                                 stack = u;
1448                                 }
1449                         }
1450
1451                 free((char *) s);
1452                 }
1453 }
1454
1455
1456 t_stategroup *
1457 newgroup(void)
1458
1459 {
1460         t_stategroup * g;
1461
1462         g = (t_stategroup *) malloc(sizeof *g);
1463         chknull(g);
1464         memset((char *) g, 0, sizeof *g);
1465         g->g_id = groupid++;
1466         return g;
1467 }
1468
1469
1470 void
1471 optimizedfa(void)
1472
1473 {
1474         unsigned int i;
1475         xmlHashTablePtr h;
1476         t_state * s1;
1477         t_state * s2;
1478         t_state * finstates;
1479         t_state * * sp;
1480         t_stategroup * g1;
1481         t_stategroup * g2;
1482         t_stategroup * ghead;
1483         t_transition * t1;
1484         t_transition * t2;
1485         unsigned int done;
1486         unsigned int startgroup;
1487         unsigned int gtrans[1 << (8 * sizeof(unsigned char))];
1488
1489         /**
1490         ***     Reduce DFA state count.
1491         **/
1492
1493         groupid = 0;
1494         ghead = (t_stategroup *) NULL;
1495
1496         /**
1497         ***     First split: non-final and each distinct final states.
1498         **/
1499
1500         h = xmlHashCreate(4);
1501         chknull(h);
1502
1503         for (s1 = dfa_states; s1; s1 = s1->s_next) {
1504                 if (!(g1 = hash_get(h, &s1->s_final, sizeof s1->s_final))) {
1505                         g1 = newgroup();
1506                         g1->g_next = ghead;
1507                         ghead = g1;
1508
1509                         if (hash_add(h, &s1->s_final, sizeof s1->s_final, g1))
1510                                 chknull(NULL);  /* Memory allocation error. */
1511                         }
1512
1513                 s1->s_index = g1->g_id;
1514                 s1->s_stack = g1->g_member;
1515                 g1->g_member = s1;
1516                 }
1517
1518         xmlHashFree(h, NULL);
1519
1520         /**
1521         ***     Subsequent splits: states that have the same forward
1522         ***             transition tokens to states in the same group.
1523         **/
1524
1525         do {
1526                 done = 1;
1527
1528                 for (g2 = ghead; g2; g2 = g2->g_next) {
1529                         s1 = g2->g_member;
1530
1531                         if (!s1->s_stack)
1532                                 continue;
1533
1534                         h = xmlHashCreate(1);
1535                         chknull(h);
1536
1537                         /**
1538                         ***     Build the group transition map.
1539                         **/
1540
1541                         memset((char *) gtrans, ~0, sizeof gtrans);
1542
1543                         for (t1 = s1->s_forward; t1; t1 = t1->t_forwnext)
1544                                 gtrans[t1->t_token] = t1->t_to->s_index;
1545
1546                         if (hash_add(h, gtrans, sizeof gtrans, g2))
1547                                 chknull(NULL);
1548
1549                         /**
1550                         ***     Process other states in group.
1551                         **/
1552
1553                         sp = &s1->s_stack;
1554                         s1 = *sp;
1555
1556                         do {
1557                                 *sp = s1->s_stack;
1558
1559                                 /**
1560                                 ***     Build the transition map.
1561                                 **/
1562
1563                                 memset((char *) gtrans, ~0, sizeof gtrans);
1564
1565                                 for (t1 = s1->s_forward;
1566                                     t1; t1 = t1->t_forwnext)
1567                                         gtrans[t1->t_token] = t1->t_to->s_index;
1568
1569                                 g1 = hash_get(h, gtrans, sizeof gtrans);
1570
1571                                 if (g1 == g2) {
1572                                         *sp = s1;
1573                                         sp = &s1->s_stack;
1574                                         }
1575                                 else {
1576                                         if (!g1) {
1577                                                 g1 = newgroup();
1578                                                 g1->g_next = ghead;
1579                                                 ghead = g1;
1580
1581                                                 if (hash_add(h, gtrans,
1582                                                     sizeof gtrans, g1))
1583                                                         chknull(NULL);
1584                                                 }
1585
1586                                         s1->s_index = g1->g_id;
1587                                         s1->s_stack = g1->g_member;
1588                                         g1->g_member = s1;
1589                                         done = 0;
1590                                         }
1591                         } while (s1 = *sp);
1592
1593                         xmlHashFree(h, NULL);
1594                         }
1595         } while (!done);
1596
1597         /**
1598         ***     Establish group leaders and remap transitions.
1599         **/
1600
1601         startgroup = dfa_states->s_index;
1602
1603         for (g1 = ghead; g1; g1 = g1->g_next)
1604                 for (s1 = g1->g_member->s_stack; s1; s1 = s1->s_stack)
1605                         for (t1 = s1->s_backward; t1; t1 = t2) {
1606                                 t2 = t1->t_backnext;
1607                                 link_transition(t1, NULL, g1->g_member);
1608                                 }
1609
1610         /**
1611         ***     Remove redundant states and transitions.
1612         **/
1613
1614         for (g1 = ghead; g1; g1 = g1->g_next) {
1615                 g1->g_member->s_next = (t_state *) NULL;
1616
1617                 while ((s1 = g1->g_member->s_stack)) {
1618                         g1->g_member->s_stack = s1->s_stack;
1619
1620                         for (t1 = s1->s_forward; t1; t1 = t2) {
1621                                 t2 = t1->t_forwnext;
1622                                 unlink_transition(t1);
1623                                 free((char *) t1);
1624                                 }
1625
1626                         free((char *) s1);
1627                         }
1628                 }
1629
1630         /**
1631         ***     Remove group support and relink DFA states.
1632         **/
1633
1634         dfa_states = (t_state *) NULL;
1635         s2 = (t_state *) NULL;
1636         finstates = (t_state *) NULL;
1637
1638         while (g1 = ghead) {
1639                 ghead = g1->g_next;
1640                 s1 = g1->g_member;
1641
1642                 if (g1->g_id == startgroup)
1643                         dfa_states = s1;        /* Keep start state first. */
1644                 else if (s1->s_final) {         /* Then final states. */
1645                         s1->s_next = finstates;
1646                         finstates = s1;
1647                         }
1648                 else {                  /* Finish with non-final states. */
1649                         s1->s_next = s2;
1650                         s2 = s1;
1651                         }
1652
1653                 free((char *) g1);
1654                 }
1655
1656         for (dfa_states->s_next = finstates; finstates->s_next;)
1657                 finstates = finstates->s_next;
1658
1659         finstates->s_next = s2;
1660 }
1661
1662
1663 const char *
1664 inttype(unsigned long max)
1665
1666 {
1667         int i;
1668
1669         for (i = 0; max; i++)
1670                 max >>= 1;
1671
1672         if (i > 8 * sizeof(unsigned int))
1673                 return "unsigned long";
1674
1675         if (i > 8 * sizeof(unsigned short))
1676                 return "unsigned int";
1677
1678         if (i > 8 * sizeof(unsigned char))
1679                 return "unsigned short";
1680
1681         return "unsigned char";
1682 }
1683
1684
1685 listids(FILE * fp)
1686
1687 {
1688         unsigned int pos;
1689         t_chset * cp;
1690         t_symlist * lp;
1691         char * srcp;
1692         char * dstp;
1693         size_t srcc;
1694         size_t dstc;
1695         char buf[80];
1696
1697         fprintf(fp, "/**\n***     CCSID   For arg   Recognized name.\n");
1698         pos = 0;
1699
1700         for (cp = chset_list; cp; cp = cp->c_next) {
1701                 if (pos) {
1702                         fprintf(fp, "\n");
1703                         pos = 0;
1704                         }
1705
1706                 if (!cp->c_names)
1707                         continue;
1708
1709                 pos = fprintf(fp, "***     %5u      %c     ", cp->c_ccsid,
1710                     iconv_open_error(cp->c_fromUTF8)? ' ': 'X');
1711
1712                 for (lp = cp->c_names; lp; lp = lp->l_next) {
1713                         srcp = (char *) lp->l_symbol;
1714                         srcc = strlen(srcp);
1715                         dstp = buf;
1716                         dstc = sizeof buf;
1717                         iconv(utf82job, &srcp, &srcc, &dstp, &dstc);
1718                         srcc = dstp - buf;
1719
1720                         if (pos + srcc > 79) {
1721                                 fprintf(fp, "\n***%22c", ' ');
1722                                 pos = 25;
1723                                 }
1724
1725                         pos += fprintf(fp, " %.*s", srcc, buf);
1726                         }
1727                 }
1728
1729         if (pos)
1730                 fprintf(fp, "\n");
1731
1732         fprintf(fp, "**/\n\n");
1733 }
1734
1735
1736 void
1737 generate(FILE * fp)
1738
1739 {
1740         unsigned int nstates;
1741         unsigned int ntrans;
1742         unsigned int maxfinal;
1743         t_state * s;
1744         t_transition * t;
1745         unsigned int i;
1746         unsigned int pos;
1747         char * ns;
1748
1749         /**
1750         ***     Assign indexes to states and transitions.
1751         **/
1752
1753         nstates = 0;
1754         ntrans = 0;
1755         maxfinal = 0;
1756
1757         for (s = dfa_states; s; s = s->s_next) {
1758                 s->s_index = nstates++;
1759
1760                 if (s->s_final)
1761                         maxfinal = nstates;
1762
1763                 for (t = s->s_forward; t; t = t->t_forwnext)
1764                         t->t_index = ntrans++;
1765                 }
1766
1767         fprintf(fp,
1768             "/**\n***     %u states, %u finals, %u transitions.\n**/\n\n",
1769             nstates, maxfinal, ntrans);
1770         fprintf(stderr, "%u states, %u finals, %u transitions.\n",
1771             nstates, maxfinal, ntrans);
1772
1773         /**
1774         ***     Generate types.
1775         **/
1776
1777         fprintf(fp, "typedef unsigned short          t_ccsid;\n");
1778         fprintf(fp, "typedef %-23s t_staterange;\n", inttype(nstates));
1779         fprintf(fp, "typedef %-23s t_transrange;\n\n", inttype(ntrans));
1780
1781         /**
1782         ***     Generate first transition index for each state.
1783         **/
1784
1785         fprintf(fp, "static t_transrange     trans_array[] = {\n");
1786         pos = 0;
1787         ntrans = 0;
1788
1789         for (s = dfa_states; s; s = s->s_next) {
1790                 pos += fprintf(fp, " %u,", ntrans);
1791
1792                 if (pos > 72) {
1793                         fprintf(fp, "\n");
1794                         pos = 0;
1795                         }
1796
1797                 for (t = s->s_forward; t; t = t->t_forwnext)
1798                         ntrans++;
1799                 }
1800
1801         fprintf(fp, " %u\n};\n\n", ntrans);
1802
1803         /**
1804         ***     Generate final state info.
1805         **/
1806
1807         fprintf(fp, "static t_ccsid          final_array[] = {\n");
1808         pos = 0;
1809         ns ="";
1810         i = 0;
1811
1812         for (s = dfa_states; s && i++ < maxfinal; s = s->s_next) {
1813                 pos += fprintf(fp, "%s", ns);
1814                 ns = ",";
1815
1816                 if (pos > 72) {
1817                         fprintf(fp, "\n");
1818                         pos = 0;
1819                         }
1820
1821                 pos += fprintf(fp, " %u",
1822                     s->s_final? s->s_final->c_ccsid + 1: 0);
1823                 }
1824
1825         fprintf(fp, "\n};\n\n");
1826
1827         /**
1828         ***     Generate goto table.
1829         **/
1830
1831         fprintf(fp, "static t_staterange     goto_array[] = {\n");
1832         pos = 0;
1833
1834         for (s = dfa_states; s; s = s->s_next)
1835                 for (t = s->s_forward; t; t = t->t_forwnext) {
1836                         pos += fprintf(fp, " %u,", t->t_to->s_index);
1837
1838                         if (pos > 72) {
1839                                 fprintf(fp, "\n");
1840                                 pos = 0;
1841                                 }
1842                         }
1843
1844         fprintf(fp, " %u\n};\n\n", nstates);
1845
1846         /**
1847         ***     Generate transition label table.
1848         **/
1849
1850         fprintf(fp, "static unsigned char    label_array[] = {\n");
1851         pos = 0;
1852         ns ="";
1853
1854         for (s = dfa_states; s; s = s->s_next)
1855                 for (t = s->s_forward; t; t = t->t_forwnext) {
1856                         pos += fprintf(fp, "%s", ns);
1857                         ns = ",";
1858
1859                         if (pos > 72) {
1860                                 fprintf(fp, "\n");
1861                                 pos = 0;
1862                                 }
1863
1864                         pos += fprintf(fp, " 0x%02X", t->t_token);
1865                         }
1866
1867         fprintf(fp, "\n};\n", nstates);
1868 }
1869
1870
1871 main(argc, argv)
1872 int argc;
1873 char * * argv;
1874
1875 {
1876         FILE * fp;
1877         t_chset * csp;
1878         char symbuf[20];
1879
1880         chset_list = (t_chset *) NULL;
1881         initial_state = newstate();
1882         job2utf8 = iconv_open_ccsid(C_UTF8_CCSID, C_SOURCE_CCSID, 0);
1883         utf82job = iconv_open_ccsid(C_SOURCE_CCSID, C_UTF8_CCSID, 0);
1884
1885         if (argc != 4) {
1886                 fprintf(stderr, "Usage: %s <ccsid-mibenum file> ", *argv);
1887                 fprintf(stderr, "<iana-character-set file> <output file>\n");
1888                 exit(1);
1889                 }
1890
1891         /**
1892         ***     Read CCSID/MIBenum associations. Define special names.
1893         **/
1894
1895         read_assocs(argv[1]);
1896
1897         /**
1898         ***     Read character set names and establish the case-independent
1899         ***             name DFA in all possible CCSIDs.
1900         **/
1901
1902         read_iana(argv[2]);
1903
1904         /**
1905         ***     Build DFA from NFA.
1906         **/
1907
1908         builddfa();
1909
1910         /**
1911         ***     Delete NFA.
1912         **/
1913
1914         deletenfa();
1915
1916         /**
1917         ***     Minimize the DFA state count.
1918         **/
1919
1920         optimizedfa();
1921
1922         /**
1923         ***     Generate the table.
1924         **/
1925
1926         fp = fopen(argv[3], "w+");
1927
1928         if (!fp) {
1929                 perror(argv[3]);
1930                 exit(1);
1931                 }
1932
1933         fprintf(fp, "/**\n");
1934         fprintf(fp, "***     Character set names table.\n");
1935         fprintf(fp, "***     Generated by program BLDCSNDFA from");
1936         fprintf(fp, " IANA character set assignment file\n");
1937         fprintf(fp, "***          and CCSID/MIBenum equivalence file.\n");
1938         fprintf(fp, "***     *** Do not edit by hand ***\n");
1939         fprintf(fp, "**/\n\n");
1940         listids(fp);
1941         generate(fp);
1942
1943         if (ferror(fp)) {
1944                 perror(argv[3]);
1945                 fclose(fp);
1946                 exit(1);
1947                 }
1948
1949         fclose(fp);
1950         iconv_close(job2utf8);
1951         iconv_close(utf82job);
1952         exit(0);
1953 }