2 *** Build a deterministic finite automaton to associate CCSIDs with
3 *** character set names.
5 *** Compile on OS/400 with options SYSIFCOPT(*IFSIO).
7 *** See Copyright for the status of this software.
9 *** Author: Patrick Monnerat <pm@datasphere.ch>, DATASPHERE S.A.
25 #include <libxml/hash.h>
26 #include <libxml/parser.h>
27 #include <libxml/xpath.h>
28 #include <libxml/xpathInternals.h>
33 #define iconv_open_error(cd) ((cd).return_value == -1)
34 #define set_iconv_open_error(cd) ((cd).return_value = -1)
36 #define iconv_open_error(cd) ((cd) == (iconv_t) -1)
37 #define set_iconv_open_error(cd) ((cd) = (iconv_t) -1)
41 #define C_SOURCE_CCSID 500
42 #define C_UTF8_CCSID 1208
45 #define UTF8_SPACE 0x20
55 #define GRANULE 128 /* Memory allocation granule. */
57 #define EPSILON 0x100 /* Token for empty transition. */
61 #define OFFSETOF(t, f) ((unsigned int) ((char *) &((t *) 0)->f - (char *) 0))
65 #define OFFSETBY(t, p, o) ((t *) ((char *) (p) + (unsigned int) (o)))
69 typedef struct t_transition t_transition; /* NFA/DFA transition. */
70 typedef struct t_state t_state; /* NFA/DFA state node. */
71 typedef struct t_symlist t_symlist; /* Symbol (i.e.: name) list. */
72 typedef struct t_chset t_chset; /* Character set. */
73 typedef struct t_stategroup t_stategroup; /* Optimization group. */
74 typedef unsigned char utf8char; /* UTF-8 character byte. */
75 typedef unsigned char byte; /* Untyped data byte. */
78 typedef struct { /* Set of pointers. */
79 unsigned int p_size; /* Current allocated size. */
80 unsigned int p_card; /* Current element count. */
81 void * p_set[1]; /* Element array. */
86 t_transition * t_forwprev; /* Previous transition in the source state's forward list. */
87 t_transition * t_forwnext; /* Next transition in the source state's forward list. */
88 t_transition * t_backprev; /* Previous transition in the target state's backward list. */
89 t_transition * t_backnext; /* Next transition in the target state's backward list. */
90 t_state * t_from; /* Source state (its s_forward heads the forward list). */
91 t_state * t_to; /* Destination state (its s_backward heads the backward list). */
92 unsigned short t_token; /* Transition token: a byte value or EPSILON. */
93 unsigned int t_index; /* Transition array index. */
98 t_state * s_next; /* Next state (for DFA construction). */
99 t_state * s_stack; /* Unprocessed DFA states stack. */
100 t_transition * s_forward; /* Forward transitions. */
101 t_transition * s_backward; /* Backward transitions. */
102 t_chset * s_final; /* Recognized character set. */
103 t_powerset * s_nfastates; /* Corresponding NFA states. */
104 unsigned int s_index; /* State index. */
109 t_symlist * l_next; /* Next name in list. */
110 utf8char l_symbol[1]; /* Name bytes. */
115 t_chset * c_next; /* Next character set. */
116 t_symlist * c_names; /* Character set name list. */
117 iconv_t c_fromUTF8; /* Conversion from UTF-8. */
118 unsigned int c_ccsid; /* IBM character set code. */
119 unsigned int c_mibenum; /* IANA character code. */
123 struct t_stategroup {
124 t_stategroup * g_next; /* Next group. */
125 t_state * g_member; /* Group member (s_stack) list. */
126 unsigned int g_id; /* Group ident. */
131 t_chset * chset_list; /* Character set list. */
132 t_state * initial_state; /* Initial NFA state. */
133 iconv_t job2utf8; /* Job CCSID to UTF-8 conversion. */
134 iconv_t utf82job; /* UTF-8 to job CCSID conversion. */
135 t_state * dfa_states; /* List of DFA states. */
136 unsigned int groupid; /* Group ident counter. */
145 static const utf8char utf8_MIBenum[] = "MIBenum";
146 static const utf8char utf8_mibenum[] = "mibenum";
147 static const utf8char utf8_ibm_[] = "ibm-";
148 static const utf8char utf8_IBMCCSID[] = "IBMCCSID";
149 static const utf8char utf8_iana_[] = "iana-";
150 static const utf8char utf8_Name[] = "Name";
151 static const utf8char utf8_Pref_MIME_Name[] = "Preferred MIME Name";
152 static const utf8char utf8_Aliases[] = "Aliases";
153 static const utf8char utf8_html[] = "html";
154 static const utf8char utf8_htmluri[] = "http://www.w3.org/1999/xhtml";
155 static const utf8char utf8_A[] = "A";
156 static const utf8char utf8_C[] = "C";
157 static const utf8char utf8_M[] = "M";
158 static const utf8char utf8_N[] = "N";
159 static const utf8char utf8_P[] = "P";
160 static const utf8char utf8_T[] = "T";
161 static const utf8char utf8_ccsid[] = "ccsid";
162 static const utf8char utf8_EBCDIC[] = "EBCDIC";
163 static const utf8char utf8_ASCII[] = "ASCII";
164 static const utf8char utf8_assocnodes[] = "/ccsid_mibenum/assoc[@ccsid]";
165 static const utf8char utf8_aliastext[] =
166 "/ccsid_mibenum/assoc[@ccsid=$C]/alias/text()";
168 static const utf8char utf8_tablerows[] =
169 "//table[@id='table-character-sets-1']/*/tr";
170 static const utf8char utf8_headerpos[] =
171 "count(th[text()=$T]/preceding-sibling::th)+1";
172 static const utf8char utf8_getmibenum[] = "number(td[$M])";
173 static const utf8char utf8_getprefname[] = "string(td[$P])";
174 static const utf8char utf8_getname[] = "string(td[$N])";
175 static const utf8char utf8_getaliases[] = "td[$A]/text()";
177 static const utf8char utf8_tablerows[] =
178 "//html:table[@id='table-character-sets-1']/*/html:tr";
179 static const utf8char utf8_headerpos[] =
180 "count(html:th[text()=$T]/preceding-sibling::html:th)+1";
181 static const utf8char utf8_getmibenum[] = "number(html:td[$M])";
182 static const utf8char utf8_getprefname[] = "string(html:td[$P])";
183 static const utf8char utf8_getname[] = "string(html:td[$N])";
184 static const utf8char utf8_getaliases[] = "html:td[$A]/text()";
191 *** UTF-8 character length table.
193 *** Index is first character byte, value is the character byte count.
196 static signed char utf8_chlen[] = {
197 /* 00-07 */ 1, 1, 1, 1, 1, 1, 1, 1,
198 /* 08-0F */ 1, 1, 1, 1, 1, 1, 1, 1,
199 /* 10-17 */ 1, 1, 1, 1, 1, 1, 1, 1,
200 /* 18-1F */ 1, 1, 1, 1, 1, 1, 1, 1,
201 /* 20-27 */ 1, 1, 1, 1, 1, 1, 1, 1,
202 /* 28-2F */ 1, 1, 1, 1, 1, 1, 1, 1,
203 /* 30-37 */ 1, 1, 1, 1, 1, 1, 1, 1,
204 /* 38-3F */ 1, 1, 1, 1, 1, 1, 1, 1,
205 /* 40-47 */ 1, 1, 1, 1, 1, 1, 1, 1,
206 /* 48-4F */ 1, 1, 1, 1, 1, 1, 1, 1,
207 /* 50-57 */ 1, 1, 1, 1, 1, 1, 1, 1,
208 /* 58-5F */ 1, 1, 1, 1, 1, 1, 1, 1,
209 /* 60-67 */ 1, 1, 1, 1, 1, 1, 1, 1,
210 /* 68-6F */ 1, 1, 1, 1, 1, 1, 1, 1,
211 /* 70-77 */ 1, 1, 1, 1, 1, 1, 1, 1,
212 /* 78-7F */ 1, 1, 1, 1, 1, 1, 1, 1,
213 /* 80-87 */ -1, -1, -1, -1, -1, -1, -1, -1,
214 /* 88-8F */ -1, -1, -1, -1, -1, -1, -1, -1,
215 /* 90-97 */ -1, -1, -1, -1, -1, -1, -1, -1,
216 /* 98-9F */ -1, -1, -1, -1, -1, -1, -1, -1,
217 /* A0-A7 */ -1, -1, -1, -1, -1, -1, -1, -1,
218 /* A8-AF */ -1, -1, -1, -1, -1, -1, -1, -1,
219 /* B0-B7 */ -1, -1, -1, -1, -1, -1, -1, -1,
220 /* B8-BF */ -1, -1, -1, -1, -1, -1, -1, -1,
221 /* C0-C7 */ 2, 2, 2, 2, 2, 2, 2, 2,
222 /* C8-CF */ 2, 2, 2, 2, 2, 2, 2, 2,
223 /* D0-D7 */ 2, 2, 2, 2, 2, 2, 2, 2,
224 /* D8-DF */ 2, 2, 2, 2, 2, 2, 2, 2,
225 /* E0-E7 */ 3, 3, 3, 3, 3, 3, 3, 3,
226 /* E8-EF */ 3, 3, 3, 3, 3, 3, 3, 3,
227 /* F0-F7 */ 4, 4, 4, 4, 4, 4, 4, 4,
228 /* F8-FF */ 5, 5, 5, 5, 6, 6, -1, -1
240 fprintf(stderr, "Not enough memory\n");
246 makecode(char * buf, unsigned int ccsid)
251 sprintf(buf, "IBMCCSID%05u0000000", ccsid);
256 iconv_open_ccsid(unsigned int ccsidout,
257 unsigned int ccsidin, unsigned int nullflag)
263 makecode(fromcode, ccsidin);
264 makecode(tocode, ccsidout);
265 memset(tocode + 13, 0, sizeof tocode - 13);
270 return iconv_open(tocode, fromcode);
285 n = 10 * n + *cp++ - '0';
293 hashBinaryKey(const byte * bytes, unsigned int len)
304 *** Encode binary data in character form to be used as hash
308 n = (4 * len + 2) / 3;
309 key = (utf8char *) malloc(n + 1);
314 for (n4 = n >> 2; n4; n4--) {
315 i = (bp[0] << 16) | (bp[1] << 8) | bp[2];
316 *cp++ = 0x21 + ((i >> 18) & 0x3F);
317 *cp++ = 0x21 + ((i >> 12) & 0x3F);
318 *cp++ = 0x21 + ((i >> 6) & 0x3F);
319 *cp++ = 0x21 + (i & 0x3F);
326 *cp++ = 0x21 + ((*bp >> 2) & 0x3F);
327 *cp++ = 0x21 + ((*bp << 4) & 0x3F);
331 i = (bp[0] << 8) | bp[1];
332 *cp++ = 0x21 + ((i >> 10) & 0x3F);
333 *cp++ = 0x21 + ((i >> 4) & 0x3F);
334 *cp++ = 0x21 + ((i << 2) & 0x3F);
344 hash_get(xmlHashTablePtr h, const void * binkey, unsigned int len)
347 const utf8char * key;
350 key = hashBinaryKey((const byte *) binkey, len);
351 result = xmlHashLookup(h, key);
358 hash_add(xmlHashTablePtr h, const void * binkey, unsigned int len, void * data)
361 const utf8char * key;
364 key = hashBinaryKey((const byte *) binkey, len);
365 result = xmlHashAddEntry(h, key, data);
372 loadXMLFile(const char * filename)
381 if (stat(filename, &sbuf))
382 return (xmlDocPtr) NULL;
384 databuf = malloc(sbuf.st_size + 4);
387 return (xmlDocPtr) NULL;
389 fd = open(filename, O_RDONLY
396 free((char *) databuf);
397 return (xmlDocPtr) NULL;
400 i = read(fd, (char *) databuf, sbuf.st_size);
403 if (i != sbuf.st_size) {
404 free((char *) databuf);
405 return (xmlDocPtr) NULL;
408 databuf[i] = databuf[i + 1] = databuf[i + 2] = databuf[i + 3] = 0;
409 doc = xmlParseMemory((xmlChar *) databuf, i);
410 free((char *) databuf);
416 match(char * * cpp, char * s)
425 for (cp = *cpp; c2 = *s++; cp++) {
442 while (c1 == ' ' || c1 == '\t')
456 s = (t_state *) malloc(sizeof *s);
458 memset((char *) s, 0, sizeof *s);
464 unlink_transition(t_transition * t)
468 t->t_backnext->t_backprev = t->t_backprev;
471 t->t_backprev->t_backnext = t->t_backnext;
473 t->t_to->s_backward = t->t_backnext;
476 t->t_forwnext->t_forwprev = t->t_forwprev;
479 t->t_forwprev->t_forwnext = t->t_forwnext;
481 t->t_from->s_forward = t->t_forwnext;
483 t->t_backprev = (t_transition *) NULL;
484 t->t_backnext = (t_transition *) NULL;
485 t->t_forwprev = (t_transition *) NULL;
486 t->t_forwnext = (t_transition *) NULL;
487 t->t_from = (t_state *) NULL;
488 t->t_to = (t_state *) NULL;
493 link_transition(t_transition * t, t_state * from, t_state * to)
502 unlink_transition(t);
504 if ((t->t_from = from)) {
505 if ((t->t_forwnext = from->s_forward))
506 t->t_forwnext->t_forwprev = t;
511 if ((t->t_to = to)) {
512 if ((t->t_backnext = to->s_backward))
513 t->t_backnext->t_backprev = t;
521 newtransition(unsigned int token, t_state * from, t_state * to)
526 t = (t_transition *) malloc(sizeof *t);
528 memset((char *) t, 0, sizeof *t);
530 link_transition(t, from, to);
536 uniquetransition(unsigned int token, t_state * from, t_state * to)
541 for (t = from->s_forward; t; t = t->t_forwnext)
542 if (t->t_token == token && (t->t_to == to || !to))
545 return to? newtransition(token, from, to): (t_transition *) NULL;
550 set_position(t_powerset * s, void * e)
565 *** If both pointers belong to different allocation arenas,
566 *** native comparison may find them neither
567 *** equal, nor greater, nor smaller.
568 *** We thus compare using memcmp() to get an orthogonal
572 i = memcmp(&e, s->p_set + m, sizeof e);
587 set_include(t_powerset * s, void * e)
594 s = (t_powerset *) malloc(sizeof *s +
595 GRANULE * sizeof s->p_set);
598 s->p_set[GRANULE] = (t_state *) NULL;
604 pos = set_position(s, e);
606 if (pos < s->p_card && s->p_set[pos] == e)
609 if (s->p_card >= s->p_size) {
610 s->p_size += GRANULE;
611 s = (t_powerset *) realloc(s,
612 sizeof *s + s->p_size * sizeof s->p_set);
614 s->p_set[s->p_size] = (t_state *) NULL;
620 memmove((char *) (s->p_set + pos + 1),
621 (char *) (s->p_set + pos), n * sizeof s->p_set[0]);
630 nfatransition(t_state * to, byte token)
636 newtransition(token, from, to);
641 static t_state * nfadevelop(t_state * from, t_state * final, iconv_t icc,
642 const utf8char * name, unsigned int len);
646 nfaslice(t_state * * from, t_state * * to, iconv_t icc,
647 const utf8char * chr, unsigned int chlen,
648 const utf8char * name, unsigned int len, t_state * final)
663 dstp = (char *) bytebuf;
664 dstc = sizeof bytebuf;
665 iconv(icc, &srcp, &srcc, &dstp, &dstc);
666 dstp = (char *) bytebuf;
667 cnt = sizeof bytebuf - dstc;
672 *** Check for end of string.
677 uniquetransition(EPSILON, t, final);
683 tp = uniquetransition(*dstp, f, (t_state *) NULL);
695 t = nfadevelop(f, final, icc, name, len);
702 t = nfadevelop((t_state *) NULL, final, icc, name, len);
707 *from = f = newstate();
710 t = nfatransition(t, dstp[--cnt]);
712 newtransition(*dstp, f, t);
717 nfadevelop(t_state * from, t_state * final, iconv_t icc,
718 const utf8char * name, unsigned int len)
728 chlen = utf8_chlen[*name];
730 for (i = 1; i < chlen; i++)
731 if ((name[i] & 0xC0) != 0x80)
736 "Invalid UTF8 character in character set name\n");
737 return (t_state *) NULL;
740 to = (t_state *) NULL;
742 icc, name, chlen, name + chlen, len - chlen, final);
744 if (*name >= UTF8_a && *name <= UTF8_z)
745 chr = *name - UTF8_a + UTF8_A;
746 else if (*name >= UTF8_A && *name <= UTF8_Z)
747 chr = *name - UTF8_A + UTF8_a;
751 nfaslice(&from, &to, icc, &chr, 1, name + chlen, len - chlen, final);
758 nfaenter(const utf8char * name, int len, t_chset * charset)
767 *** Enter case-insensitive `name' in NFA in all known
769 *** Redundant shift state changes as well as shift state
770 *** differences between uppercase and lowercase are
775 len = strlen(name) + 1;
777 for (lp = charset->c_names; lp; lp = lp->l_next)
778 if (!memcmp(name, lp->l_symbol, len))
779 return; /* Already entered. */
781 lp = (t_symlist *) malloc(sizeof *lp + len);
783 memcpy(lp->l_symbol, name, len);
784 lp->l_symbol[len] = '\0';
785 lp->l_next = charset->c_names;
786 charset->c_names = lp;
788 final->s_final = charset;
790 for (s = chset_list; s; s = s->c_next)
791 if (!iconv_open_error(s->c_fromUTF8))
792 sp = nfadevelop(initial_state, final,
793 s->c_fromUTF8, name, len);
798 utf8_utostr(utf8char * s, unsigned int v)
806 i = d? utf8_utostr(s, d): 0;
814 utf8_utostrpad(utf8char * s, unsigned int v, int digits)
817 unsigned int i = utf8_utostr(s, v);
818 utf8char pad = UTF8_SPACE;
828 memmove(s + digits - i, s, i + 1);
829 memset(s, pad, digits - i);
835 utf8_strtou(const utf8char * s)
840 while (*s == UTF8_SPACE || *s == UTF8_HT)
843 for (v = 0; *s >= UTF8_0 && *s <= UTF8_9;)
844 v = 10 * v + *s++ - UTF8_0;
851 getNumAttr(xmlNodePtr node, const xmlChar * name)
857 s = xmlGetProp(node, name);
862 val = utf8_strtou(s);
863 xmlFree((xmlChar *) s);
869 read_assocs(const char * filename)
873 xmlXPathContextPtr ctxt;
874 xmlXPathObjectPtr obj;
879 unsigned int mibenum;
882 doc = loadXMLFile(filename);
885 fprintf(stderr, "Cannot load file %s\n", filename);
889 ctxt = xmlXPathNewContext(doc);
890 obj = xmlXPathEval(utf8_assocnodes, ctxt);
892 if (!obj || obj->type != XPATH_NODESET || !obj->nodesetval ||
893 !obj->nodesetval->nodeTab || !obj->nodesetval->nodeNr) {
894 fprintf(stderr, "No association found in %s\n", filename);
898 for (i = 0; i < obj->nodesetval->nodeNr; i++) {
899 node = obj->nodesetval->nodeTab[i];
900 ccsid = getNumAttr(node, utf8_ccsid);
901 mibenum = getNumAttr(node, utf8_mibenum);
904 *** Check for duplicate.
907 for (sp = chset_list; sp; sp = sp->c_next)
908 if (ccsid && ccsid == sp->c_ccsid ||
909 mibenum && mibenum == sp->c_mibenum) {
910 fprintf(stderr, "Duplicate character set: ");
911 fprintf(stderr, "CCSID = %u/%u, ",
913 fprintf(stderr, "MIBenum = %u/%u\n",
914 mibenum, sp->c_mibenum);
922 *** Allocate the new character set.
925 sp = (t_chset *) malloc(sizeof *sp);
927 memset(sp, 0, sizeof *sp);
929 if (!ccsid) /* Do not attempt with current job CCSID. */
930 set_iconv_open_error(sp->c_fromUTF8);
933 iconv_open_ccsid(ccsid, C_UTF8_CCSID, 0);
935 if (iconv_open_error(sp->c_fromUTF8)) /* Macro already yields a boolean; comparing it to -1 never matched. */
937 "Cannot convert into CCSID %u: ignored\n",
942 sp->c_mibenum = mibenum;
943 sp->c_next = chset_list;
947 xmlXPathFreeObject(obj);
953 for (sp = chset_list; sp; sp = sp->c_next) {
954 strcpy(symbuf, utf8_ibm_);
955 utf8_utostr(symbuf + 4, sp->c_ccsid);
956 nfaenter(symbuf, -1, sp);
957 strcpy(symbuf, utf8_IBMCCSID);
958 utf8_utostrpad(symbuf + 8, sp->c_ccsid, -5);
959 nfaenter(symbuf, 13, sp); /* Not null-terminated. */
962 strcpy(symbuf, utf8_iana_);
963 utf8_utostr(symbuf + 5, sp->c_mibenum);
964 nfaenter(symbuf, -1, sp);
967 xmlXPathRegisterVariable(ctxt, utf8_C,
968 xmlXPathNewFloat((double) sp->c_ccsid));
969 obj = xmlXPathEval(utf8_aliastext, ctxt);
971 if (!obj || obj->type != XPATH_NODESET) {
972 fprintf(stderr, "getAlias failed in %s\n", filename);
976 if (obj->nodesetval &&
977 obj->nodesetval->nodeTab && obj->nodesetval->nodeNr) {
978 for (i = 0; i < obj->nodesetval->nodeNr; i++) {
979 node = obj->nodesetval->nodeTab[i];
980 nfaenter(node->content, -1, sp);
984 xmlXPathFreeObject(obj);
987 xmlXPathFreeContext(ctxt);
993 columnPosition(xmlXPathContextPtr ctxt, const xmlChar * header)
996 xmlXPathObjectPtr obj;
997 unsigned int res = 0;
999 xmlXPathRegisterVariable(ctxt, utf8_T, xmlXPathNewString(header));
1000 obj = xmlXPathEval(utf8_headerpos, ctxt);
1003 if (obj->type == XPATH_NUMBER)
1004 res = (unsigned int) obj->floatval;
1006 xmlXPathFreeObject(obj);
1014 read_iana(const char * filename)
1018 xmlXPathContextPtr ctxt;
1019 xmlXPathObjectPtr obj1;
1020 xmlXPathObjectPtr obj2;
1031 doc = loadXMLFile(filename);
1034 fprintf(stderr, "Cannot load file %s\n", filename);
1038 ctxt = xmlXPathNewContext(doc);
1041 xmlXPathRegisterNs(ctxt, utf8_html, utf8_htmluri);
1044 obj1 = xmlXPathEval(utf8_tablerows, ctxt);
1046 if (!obj1 || obj1->type != XPATH_NODESET || !obj1->nodesetval ||
1047 !obj1->nodesetval->nodeTab || obj1->nodesetval->nodeNr <= 1) {
1048 fprintf(stderr, "No data in %s\n", filename);
1053 *** Identify columns.
1056 xmlXPathSetContextNode(obj1->nodesetval->nodeTab[0], ctxt);
1057 prefnamecol = columnPosition(ctxt, utf8_Pref_MIME_Name);
1058 namecol = columnPosition(ctxt, utf8_Name);
1059 mibenumcol = columnPosition(ctxt, utf8_MIBenum);
1060 aliascol = columnPosition(ctxt, utf8_Aliases);
1062 if (!prefnamecol || !namecol || !mibenumcol || !aliascol) {
1063 fprintf(stderr, "Key column(s) missing in %s\n", filename);
1067 xmlXPathRegisterVariable(ctxt, utf8_P,
1068 xmlXPathNewFloat((double) prefnamecol));
1069 xmlXPathRegisterVariable(ctxt, utf8_N,
1070 xmlXPathNewFloat((double) namecol));
1071 xmlXPathRegisterVariable(ctxt, utf8_M,
1072 xmlXPathNewFloat((double) mibenumcol));
1073 xmlXPathRegisterVariable(ctxt, utf8_A,
1074 xmlXPathNewFloat((double) aliascol));
1077 *** Process each row.
1080 for (n = 1; n < obj1->nodesetval->nodeNr; n++) {
1081 xmlXPathSetContextNode(obj1->nodesetval->nodeTab[n], ctxt);
1084 *** Get the MIBenum from current row.
1087 obj2 = xmlXPathEval(utf8_getmibenum, ctxt);
1089 if (!obj2 || obj2->type != XPATH_NUMBER) {
1090 fprintf(stderr, "get MIBenum failed at row %u\n", n);
1094 if (xmlXPathIsNaN(obj2->floatval) ||
1095 obj2->floatval < 1.0 || obj2->floatval > 65535.0 ||
1096 ((unsigned int) obj2->floatval) != obj2->floatval) {
1097 fprintf(stderr, "invalid MIBenum at row %u\n", n);
1098 xmlXPathFreeObject(obj2);
1102 mibenum = obj2->floatval;
1103 xmlXPathFreeObject(obj2);
1106 *** Search the associations for a corresponding CCSID.
1109 for (sp = chset_list; sp; sp = sp->c_next)
1110 if (sp->c_mibenum == mibenum)
1114 continue; /* No CCSID for this MIBenum. */
1117 *** Process preferred MIME name.
1120 obj2 = xmlXPathEval(utf8_getprefname, ctxt);
1122 if (!obj2 || obj2->type != XPATH_STRING) {
1124 "get Preferred_MIME_Name failed at row %u\n", n);
1128 if (obj2->stringval && obj2->stringval[0])
1129 nfaenter(obj2->stringval, -1, sp);
1131 xmlXPathFreeObject(obj2);
1137 obj2 = xmlXPathEval(utf8_getname, ctxt);
1139 if (!obj2 || obj2->type != XPATH_STRING) {
1140 fprintf(stderr, "get name failed at row %u\n", n);
1144 if (obj2->stringval && obj2->stringval[0])
1145 nfaenter(obj2->stringval, -1, sp);
1147 xmlXPathFreeObject(obj2);
1150 *** Process aliases.
1153 obj2 = xmlXPathEval(utf8_getaliases, ctxt);
1155 if (!obj2 || obj2->type != XPATH_NODESET) {
1156 fprintf(stderr, "get aliases failed at row %u\n", n);
1160 if (obj2->nodesetval && obj2->nodesetval->nodeTab)
1161 for (i = 0; i < obj2->nodesetval->nodeNr; i++) {
1162 node = obj2->nodesetval->nodeTab[i];
1164 if (node && node->content && node->content[0])
1165 nfaenter(node->content, -1, sp);
1168 xmlXPathFreeObject(obj2);
1171 xmlXPathFreeObject(obj1);
1172 xmlXPathFreeContext(ctxt);
1177 t_powerset * closureset(t_powerset * dst, t_powerset * src);
1181 closure(t_powerset * dst, t_state * src)
1185 unsigned int oldcard;
1187 if (src->s_nfastates) {
1189 *** Is a DFA state: return closure of set of equivalent
1193 return closureset(dst, src->s_nfastates);
1197 *** Compute closure of NFA state.
1200 dst = set_include(dst, src);
1202 for (t = src->s_forward; t; t = t->t_forwnext)
1203 if (t->t_token == EPSILON) {
1204 oldcard = dst->p_card;
1205 dst = set_include(dst, t->t_to);
1207 if (oldcard != dst->p_card)
1208 dst = closure(dst, t->t_to);
1216 closureset(t_powerset * dst, t_powerset * src)
1221 for (i = 0; i < src->p_card; i++)
1222 dst = closure(dst, (t_state *) src->p_set[i]);
1229 get_dfa_state(t_state * * stack,
1230 t_powerset * nfastates, xmlHashTablePtr sethash)
1235 if (s = hash_get(sethash, nfastates->p_set,
1236 nfastates->p_card * sizeof nfastates->p_set[0])) {
1238 *** DFA state already present.
1239 *** Release the NFA state set and return
1240 *** the address of the old DFA state.
1243 free((char *) nfastates);
1248 *** Build the new state.
1252 s->s_nfastates = nfastates;
1253 s->s_next = dfa_states;
1255 s->s_stack = *stack;
1259 *** Enter it in hash.
1262 if (hash_add(sethash, nfastates->p_set,
1263 nfastates->p_card * sizeof nfastates->p_set[0], s))
1264 chknull(NULL); /* Memory allocation error. */
1271 transcmp(const void * p1, const void * p2)
1277 t1 = *(t_transition * *) p1;
1278 t2 = *(t_transition * *) p2;
1279 return ((int) t1->t_token) - ((int) t2->t_token);
1287 t_powerset * transset;
1288 t_powerset * stateset;
1296 xmlHashTablePtr sethash;
1299 transset = set_include(NULL, NULL);
1301 stateset = set_include(NULL, NULL);
1303 sethash = xmlHashCreate(1);
1305 dfa_states = (t_state *) NULL;
1306 stack = (t_state *) NULL;
1310 *** Build the DFA initial state.
1313 get_dfa_state(&stack, closure(NULL, initial_state), sethash);
1316 *** Build the other DFA states by looking at each
1317 *** possible transition from stacked DFA states.
1322 fprintf(stderr, "%u DFA states\n", nst);
1326 s->s_stack = (t_state *) NULL;
1329 *** Build a set of all non-epsilon transitions from this
1333 transset->p_card = 0;
1335 for (n = 0; n < s->s_nfastates->p_card; n++) {
1336 s2 = s->s_nfastates->p_set[n];
1338 for (t = s2->s_forward; t; t = t->t_forwnext)
1339 if (t->t_token != EPSILON) {
1340 transset = set_include(transset, t);
1346 *** Sort transitions by token.
1349 qsort(transset->p_set, transset->p_card,
1350 sizeof transset->p_set[0], transcmp);
1353 *** Process all transitions, grouping them by token.
1356 stateset->p_card = 0;
1359 for (i = 0; i < transset->p_card; i++) {
1360 t = transset->p_set[i];
1362 if (token != t->t_token) {
1363 if (stateset->p_card) {
1365 *** Get the equivalent DFA state
1366 *** and create transition.
1369 newtransition(token, s,
1370 get_dfa_state(&stack,
1371 closureset(NULL, stateset),
1373 stateset->p_card = 0;
1379 stateset = set_include(stateset, t->t_to);
1382 if (stateset->p_card)
1383 newtransition(token, s, get_dfa_state(&stack,
1384 closureset(NULL, stateset), sethash));
1387 free((char *) transset);
1388 free((char *) stateset);
1389 xmlHashFree(sethash, NULL);
1392 *** Reverse the state list to get the initial state first,
1393 *** check for ambiguous prefixes, determine final states,
1394 *** destroy NFA state sets.
1397 while (s = dfa_states) {
1398 dfa_states = s->s_next;
1401 stateset = s->s_nfastates;
1402 s->s_nfastates = (t_powerset *) NULL;
1404 for (n = 0; n < stateset->p_card; n++) {
1405 s2 = (t_state *) stateset->p_set[n];
1408 if (s->s_final && s->s_final != s2->s_final)
1410 "Ambiguous name for CCSIDs %u/%u\n",
1411 s->s_final->c_ccsid,
1412 s2->s_final->c_ccsid);
1414 s->s_final = s2->s_final;
1418 free((char *) stateset);
1434 stack = initial_state;
1435 stack->s_stack = (t_state *) NULL;
1437 while ((s = stack)) {
1440 while ((t = s->s_forward)) {
1442 unlink_transition(t);
1445 if (!u->s_backward) {
1462 g = (t_stategroup *) malloc(sizeof *g);
1464 memset((char *) g, 0, sizeof *g);
1465 g->g_id = groupid++;
1478 t_state * finstates;
1482 t_stategroup * ghead;
1486 unsigned int startgroup;
1487 unsigned int gtrans[1 << (8 * sizeof(unsigned char))];
1490 *** Reduce DFA state count.
1494 ghead = (t_stategroup *) NULL;
1497 *** First split: non-final and each distinct final states.
1500 h = xmlHashCreate(4);
1503 for (s1 = dfa_states; s1; s1 = s1->s_next) {
1504 if (!(g1 = hash_get(h, &s1->s_final, sizeof s1->s_final))) {
1509 if (hash_add(h, &s1->s_final, sizeof s1->s_final, g1))
1510 chknull(NULL); /* Memory allocation error. */
1513 s1->s_index = g1->g_id;
1514 s1->s_stack = g1->g_member;
1518 xmlHashFree(h, NULL);
1521 *** Subsequent splits: states that have the same forward
1522 *** transition tokens to states in the same group.
1528 for (g2 = ghead; g2; g2 = g2->g_next) {
1534 h = xmlHashCreate(1);
1538 *** Build the group transition map.
1541 memset((char *) gtrans, ~0, sizeof gtrans);
1543 for (t1 = s1->s_forward; t1; t1 = t1->t_forwnext)
1544 gtrans[t1->t_token] = t1->t_to->s_index;
1546 if (hash_add(h, gtrans, sizeof gtrans, g2))
1550 *** Process other states in group.
1560 *** Build the transition map.
1563 memset((char *) gtrans, ~0, sizeof gtrans);
1565 for (t1 = s1->s_forward;
1566 t1; t1 = t1->t_forwnext)
1567 gtrans[t1->t_token] = t1->t_to->s_index;
1569 g1 = hash_get(h, gtrans, sizeof gtrans);
1581 if (hash_add(h, gtrans,
1586 s1->s_index = g1->g_id;
1587 s1->s_stack = g1->g_member;
1593 xmlHashFree(h, NULL);
1598 *** Establish group leaders and remap transitions.
1601 startgroup = dfa_states->s_index;
1603 for (g1 = ghead; g1; g1 = g1->g_next)
1604 for (s1 = g1->g_member->s_stack; s1; s1 = s1->s_stack)
1605 for (t1 = s1->s_backward; t1; t1 = t2) {
1606 t2 = t1->t_backnext;
1607 link_transition(t1, NULL, g1->g_member);
1611 *** Remove redundant states and transitions.
1614 for (g1 = ghead; g1; g1 = g1->g_next) {
1615 g1->g_member->s_next = (t_state *) NULL;
1617 while ((s1 = g1->g_member->s_stack)) {
1618 g1->g_member->s_stack = s1->s_stack;
1620 for (t1 = s1->s_forward; t1; t1 = t2) {
1621 t2 = t1->t_forwnext;
1622 unlink_transition(t1);
1631 *** Remove group support and relink DFA states.
1634 dfa_states = (t_state *) NULL;
1635 s2 = (t_state *) NULL;
1636 finstates = (t_state *) NULL;
1638 while (g1 = ghead) {
1642 if (g1->g_id == startgroup)
1643 dfa_states = s1; /* Keep start state first. */
1644 else if (s1->s_final) { /* Then final states. */
1645 s1->s_next = finstates;
1648 else { /* Finish with non-final states. */
1656 for (dfa_states->s_next = finstates; finstates->s_next;)
1657 finstates = finstates->s_next;
1659 finstates->s_next = s2;
1664 inttype(unsigned long max)
1669 for (i = 0; max; i++)
1672 if (i > 8 * sizeof(unsigned int))
1673 return "unsigned long";
1675 if (i > 8 * sizeof(unsigned short))
1676 return "unsigned int";
1678 if (i > 8 * sizeof(unsigned char))
1679 return "unsigned short";
1681 return "unsigned char";
1697 fprintf(fp, "/**\n*** CCSID For arg Recognized name.\n");
1700 for (cp = chset_list; cp; cp = cp->c_next) {
1709 pos = fprintf(fp, "*** %5u %c ", cp->c_ccsid,
1710 iconv_open_error(cp->c_fromUTF8)? ' ': 'X');
1712 for (lp = cp->c_names; lp; lp = lp->l_next) {
1713 srcp = (char *) lp->l_symbol;
1714 srcc = strlen(srcp);
1717 iconv(utf82job, &srcp, &srcc, &dstp, &dstc);
1720 if (pos + srcc > 79) {
1721 fprintf(fp, "\n***%22c", ' ');
1725 pos += fprintf(fp, " %.*s", (int) srcc, buf); /* `*' precision must be an int; srcc is the size_t handed to iconv(). */
1732 fprintf(fp, "**/\n\n");
1740 unsigned int nstates;
1741 unsigned int ntrans;
1742 unsigned int maxfinal;
1750 *** Assign indexes to states and transitions.
1757 for (s = dfa_states; s; s = s->s_next) {
1758 s->s_index = nstates++;
1763 for (t = s->s_forward; t; t = t->t_forwnext)
1764 t->t_index = ntrans++;
1768 "/**\n*** %u states, %u finals, %u transitions.\n**/\n\n",
1769 nstates, maxfinal, ntrans);
1770 fprintf(stderr, "%u states, %u finals, %u transitions.\n",
1771 nstates, maxfinal, ntrans);
1777 fprintf(fp, "typedef unsigned short t_ccsid;\n");
1778 fprintf(fp, "typedef %-23s t_staterange;\n", inttype(nstates));
1779 fprintf(fp, "typedef %-23s t_transrange;\n\n", inttype(ntrans));
1782 *** Generate first transition index for each state.
1785 fprintf(fp, "static t_transrange trans_array[] = {\n");
1789 for (s = dfa_states; s; s = s->s_next) {
1790 pos += fprintf(fp, " %u,", ntrans);
1797 for (t = s->s_forward; t; t = t->t_forwnext)
1801 fprintf(fp, " %u\n};\n\n", ntrans);
1804 *** Generate final state info.
1807 fprintf(fp, "static t_ccsid final_array[] = {\n");
1812 for (s = dfa_states; s && i++ < maxfinal; s = s->s_next) {
1813 pos += fprintf(fp, "%s", ns);
1821 pos += fprintf(fp, " %u",
1822 s->s_final? s->s_final->c_ccsid + 1: 0);
1825 fprintf(fp, "\n};\n\n");
1828 *** Generate goto table.
1831 fprintf(fp, "static t_staterange goto_array[] = {\n");
1834 for (s = dfa_states; s; s = s->s_next)
1835 for (t = s->s_forward; t; t = t->t_forwnext) {
1836 pos += fprintf(fp, " %u,", t->t_to->s_index);
1844 fprintf(fp, " %u\n};\n\n", nstates);
1847 *** Generate transition label table.
1850 fprintf(fp, "static unsigned char label_array[] = {\n");
1854 for (s = dfa_states; s; s = s->s_next)
1855 for (t = s->s_forward; t; t = t->t_forwnext) {
1856 pos += fprintf(fp, "%s", ns);
1864 pos += fprintf(fp, " 0x%02X", t->t_token);
1867 fprintf(fp, "\n};\n"); /* Format has no conversion, so no argument is passed. */
1880 chset_list = (t_chset *) NULL;
1881 initial_state = newstate();
1882 job2utf8 = iconv_open_ccsid(C_UTF8_CCSID, C_SOURCE_CCSID, 0);
1883 utf82job = iconv_open_ccsid(C_SOURCE_CCSID, C_UTF8_CCSID, 0);
1886 fprintf(stderr, "Usage: %s <ccsid-mibenum file> ", *argv);
1887 fprintf(stderr, "<iana-character-set file> <output file>\n");
1892 *** Read CCSID/MIBenum associations. Define special names.
1895 read_assocs(argv[1]);
1898 *** Read character set names and establish the case-independent
1899 *** name DFA in all possible CCSIDs.
1905 *** Build DFA from NFA.
1917 *** Minimize the DFA state count.
1923 *** Generate the table.
1926 fp = fopen(argv[3], "w+");
1933 fprintf(fp, "/**\n");
1934 fprintf(fp, "*** Character set names table.\n");
1935 fprintf(fp, "*** Generated by program BLDCSNDFA from");
1936 fprintf(fp, " IANA character set assignment file\n");
1937 fprintf(fp, "*** and CCSID/MIBenum equivalence file.\n");
1938 fprintf(fp, "*** *** Do not edit by hand ***\n");
1939 fprintf(fp, "**/\n\n");
1950 iconv_close(job2utf8);
1951 iconv_close(utf82job);