1 /* Copyright (C) 1995, 1996 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If
17 not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
29 #include "localeinfo.h"
31 #include "locfile-token.h"
32 #include "stringtrans.h"
34 /* Uncomment the following line in the production version. */
39 void *xmalloc (size_t __n);
40 void *xcalloc (size_t __n, size_t __s);
41 void *xrealloc (void *__ptr, size_t __n);
44 /* The bit used for representing a special class. */
45 #define BITPOS(class) ((class) - tok_upper)
46 #define BIT(class) (1 << BITPOS (class))
48 #define ELEM(ctype, collection, idx, value) \
49 *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx, \
50 &ctype->collection##_act idx, value)
53 (((w) << 24) | (((w) & 0xff00) << 8) | (((w) >> 8) & 0xff00) | ((w) >> 24))
56 ((((w) >> 8) & 0xff) | (((w) & 0xff) << 8))
59 /* To be compatible with former implementations we for now restrict
60 the number of bits for character classes to 16. When compatibility
61 is not necessary anymore increase the number to 32. */
62 #define char_class_t u_int16_t
63 #define CHAR_CLASS_TRANS SWAPU16
64 #define char_class32_t u_int32_t
65 #define CHAR_CLASS32_TRANS SWAPU32
68 /* The real definition of the struct for the LC_CTYPE locale. */
71 unsigned int *charnames;
75 /* We will allow up to 8 * sizeof(u_int32_t) - 1 character classes. */
76 #define MAX_NR_CHARCLASS (8 * sizeof (u_int32_t) - 1)
78 const char *classnames[MAX_NR_CHARCLASS];
79 unsigned long int current_class_mask;
80 unsigned int last_class_char;
81 u_int32_t *class_collection;
82 size_t class_collection_max;
83 size_t class_collection_act;
84 unsigned long int class_done;
86 /* If the following number ever turns out to be too small simply
87 increase it. But I doubt it will. --drepper@gnu */
88 #define MAX_NR_CHARMAP 16
89 const char *mapnames[MAX_NR_CHARMAP];
90 u_int32_t *map_collection[MAX_NR_CHARMAP];
91 u_int32_t map_collection_max[MAX_NR_CHARMAP];
92 u_int32_t map_collection_act[MAX_NR_CHARMAP];
93 size_t map_collection_nr;
95 unsigned int from_map_char;
99 /* The arrays for the binary representation. */
100 u_int32_t plane_size;
102 char_class_t *ctype_b;
103 char_class32_t *ctype32_b;
108 u_int32_t *class_name_ptr;
109 u_int32_t *map_name_ptr;
110 unsigned char *width;
111 u_int32_t mb_cur_max;
112 const char *codeset_name;
116 /* Prototypes for local functions. */
117 static void ctype_class_newP (struct linereader *lr,
118 struct locale_ctype_t *ctype, const char *name);
119 static void ctype_map_newP (struct linereader *lr,
120 struct locale_ctype_t *ctype,
121 const char *name, struct charset_t *charset);
122 static u_int32_t *find_idx (struct locale_ctype_t *ctype, u_int32_t **table,
123 size_t *max, size_t *act, unsigned int idx);
124 static void set_class_defaults (struct locale_ctype_t *ctype,
125 struct charset_t *charset);
126 static void allocate_arrays (struct locale_ctype_t *ctype,
127 struct charset_t *charset);
131 ctype_startup (struct linereader *lr, struct localedef_t *locale,
132 struct charset_t *charset)
135 struct locale_ctype_t *ctype;
137 /* It is important that we always use UCS1 encoding for strings now. */
138 encoding_method = ENC_UCS1;
140 /* Allocate the needed room. */
141 locale->categories[LC_CTYPE].ctype = ctype =
142 (struct locale_ctype_t *) xmalloc (sizeof (struct locale_ctype_t));
144 /* We have no names seen yet. */
145 ctype->charnames_max = charset->mb_cur_max == 1 ? 256 : 512;
147 (unsigned int *) xmalloc (ctype->charnames_max * sizeof (unsigned int));
148 for (cnt = 0; cnt < 256; ++cnt)
149 ctype->charnames[cnt] = cnt;
150 ctype->charnames_act = 256;
152 /* Fill character class information. */
153 ctype->nr_charclass = 0;
154 ctype->current_class_mask = 0;
155 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
156 /* The order of the following instructions determines the bit
position assigned to each class. */
158 ctype_class_newP (lr, ctype, "upper");
159 ctype_class_newP (lr, ctype, "lower");
160 ctype_class_newP (lr, ctype, "alpha");
161 ctype_class_newP (lr, ctype, "digit");
162 ctype_class_newP (lr, ctype, "xdigit");
163 ctype_class_newP (lr, ctype, "space");
164 ctype_class_newP (lr, ctype, "print");
165 ctype_class_newP (lr, ctype, "graph");
166 ctype_class_newP (lr, ctype, "blank");
167 ctype_class_newP (lr, ctype, "cntrl");
168 ctype_class_newP (lr, ctype, "punct");
169 ctype_class_newP (lr, ctype, "alnum");
171 ctype->class_collection_max = charset->mb_cur_max == 1 ? 256 : 512;
172 ctype->class_collection
173 = (u_int32_t *) xmalloc (sizeof (unsigned long int)
174 * ctype->class_collection_max);
175 memset (ctype->class_collection, '\0',
176 sizeof (unsigned long int) * ctype->class_collection_max);
177 ctype->class_collection_act = 256;
179 /* Fill character map information. */
180 ctype->map_collection_nr = 0;
181 ctype->last_map_idx = MAX_NR_CHARMAP;
182 ctype->from_map_char = ILLEGAL_CHAR_VALUE;
183 ctype_map_newP (lr, ctype, "toupper", charset);
184 ctype_map_newP (lr, ctype, "tolower", charset);
186 /* Fill first 256 entries in `toupper' and `tolower' arrays. */
187 for (cnt = 0; cnt < 256; ++cnt)
189 ctype->map_collection[0][cnt] = cnt;
190 ctype->map_collection[1][cnt] = cnt;
196 ctype_finish (struct localedef_t *locale, struct charset_t *charset)
198 /* See POSIX.2, table 2-6 for the meaning of the following table. */
203 const char allow[NCLASS];
205 valid_table[NCLASS] =
207 /* The order is important. See token.h for more information.
208 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
209 { "upper", "--MX-XDDXXX-" },
210 { "lower", "--MX-XDDXXX-" },
211 { "alpha", "---X-XDDXXX-" },
212 { "digit", "XXX--XDDXXX-" },
213 { "xdigit", "-----XDDXXX-" },
214 { "space", "XXXXX------X" },
215 { "print", "---------X--" },
216 { "graph", "---------X--" },
217 { "blank", "XXXXXM-----X" },
218 { "cntrl", "XXXXX-XX--XX" },
219 { "punct", "XXXXX-DD-X-X" },
220 { "alnum", "-----XDDXXX-" }
224 unsigned int space_value;
225 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
227 /* Set default value for classes not specified. */
228 set_class_defaults (ctype, charset);
230 /* Check according to table. */
231 for (cnt = 0; cnt < ctype->class_collection_max; ++cnt)
233 unsigned long int tmp;
235 tmp = ctype->class_collection[cnt];
239 for (cls1 = 0; cls1 < NCLASS; ++cls1)
240 if ((tmp & (1 << cls1)) != 0)
241 for (cls2 = 0; cls2 < NCLASS; ++cls2)
242 if (valid_table[cls1].allow[cls2] != '-')
244 int eq = (tmp & (1 << cls2)) != 0;
245 switch (valid_table[cls1].allow[cls2])
254 value = ctype->charnames[cnt];
256 if ((value & 0xff000000) != 0)
257 cp += sprintf (cp, "\\%o", (value >> 24) & 0xff);
258 if ((value & 0xffff0000) != 0)
259 cp += sprintf (cp, "\\%o", (value >> 16) & 0xff);
260 if ((value & 0xffffff00) != 0)
261 cp += sprintf (cp, "\\%o", (value >> 8) & 0xff);
262 sprintf (cp, "\\%o", value & 0xff);
265 character %s'%s' in class `%s' must be in class `%s'"), value > 256 ? "L" : "",
266 cp, valid_table[cls1].name,
267 valid_table[cls2].name);
278 value = ctype->charnames[cnt];
280 if ((value & 0xff000000) != 0)
281 cp += sprintf (cp, "\\%o", value >> 24);
282 if ((value & 0xffff0000) != 0)
283 cp += sprintf (cp, "\\%o", (value >> 16) & 0xff);
284 if ((value & 0xffffff00) != 0)
285 cp += sprintf (cp, "\\%o", (value >> 8) & 0xff);
286 sprintf (cp, "\\%o", value & 0xff);
289 character %s'%s' in class `%s' must not be in class `%s'"),
290 value > 256 ? "L" : "", cp,
291 valid_table[cls1].name, valid_table[cls2].name);
296 ctype->class_collection[cnt] |= 1 << cls2;
300 error (5, 0, _("internal error in %s, line %u"),
301 __FUNCTION__, __LINE__);
306 /* ... and now test <SP> as a special case. */
307 space_value = charset_find_value (charset, "SP", 2);
308 if (space_value == ILLEGAL_CHAR_VALUE)
309 error (0, 0, _("character <SP> not defined in character map"));
310 else if ((cnt = BITPOS (tok_space),
311 (ELEM (ctype, class_collection, , space_value)
312 & BIT (tok_space)) == 0)
313 || (cnt = BITPOS (tok_blank),
314 (ELEM (ctype, class_collection, , space_value)
315 & BIT (tok_blank)) == 0))
316 error (0, 0, _("<SP> character not in class `%s'"),
317 valid_table[cnt].name);
318 else if ((cnt = BITPOS (tok_punct),
319 (ELEM (ctype, class_collection, , space_value)
320 & BIT (tok_punct)) != 0)
321 || (cnt = BITPOS (tok_graph),
322 (ELEM (ctype, class_collection, , space_value)
325 error (0, 0, _("<SP> character must not be in class `%s'"),
326 valid_table[cnt].name);
328 ELEM (ctype, class_collection, , space_value) |= BIT (tok_print);
330 /* Now that the tests are done make sure the name array contains all
331 characters which are handled in the WIDTH section of the
332 character set definition file. */
333 if (charset->width_rules != NULL)
334 for (cnt = 0; cnt < charset->nwidth_rules; ++cnt)
337 for (inner = charset->width_rules[cnt].from;
338 inner <= charset->width_rules[cnt].to; ++inner)
339 (void) find_idx (ctype, NULL, NULL, NULL, inner);
345 ctype_output (struct localedef_t *locale, struct charset_t *charset,
346 const char *output_path)
348 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
349 const size_t nelems = (_NL_ITEM_INDEX (_NL_NUM_LC_CTYPE)
350 + 2 * (ctype->map_collection_nr - 2));
351 struct iovec iov[2 + nelems + ctype->nr_charclass
352 + ctype->map_collection_nr];
353 struct locale_file data;
354 u_int32_t idx[nelems];
355 size_t elem, cnt, offset, total;
358 if ((locale->binary & (1 << LC_CTYPE)) != 0)
360 iov[0].iov_base = ctype;
361 iov[0].iov_len = locale->len[LC_CTYPE];
363 write_locale_data (output_path, "LC_CTYPE", 1, iov);
369 /* Now prepare the output: Find the sizes of the table we can use. */
370 allocate_arrays (ctype, charset);
372 data.magic = LIMAGIC (LC_CTYPE);
374 iov[0].iov_base = (void *) &data;
375 iov[0].iov_len = sizeof (data);
377 iov[1].iov_base = (void *) idx;
378 iov[1].iov_len = sizeof (idx);
380 idx[0] = iov[0].iov_len + iov[1].iov_len;
383 for (elem = 0; elem < nelems; ++elem)
385 if (elem < _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE))
388 #define CTYPE_DATA(name, base, len) \
389 case _NL_ITEM_INDEX (name): \
390 iov[2 + elem + offset].iov_base = (base); \
391 iov[2 + elem + offset].iov_len = (len); \
392 if (elem + 1 < nelems) \
393 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len; \
396 CTYPE_DATA (_NL_CTYPE_CLASS,
398 (256 + 128) * sizeof (char_class_t));
400 CTYPE_DATA (_NL_CTYPE_TOUPPER_EB,
402 (ctype->plane_size * ctype->plane_cnt + 128)
403 * sizeof (u_int32_t));
404 CTYPE_DATA (_NL_CTYPE_TOLOWER_EB,
406 (ctype->plane_size * ctype->plane_cnt + 128)
407 * sizeof (u_int32_t));
409 CTYPE_DATA (_NL_CTYPE_TOUPPER_EL,
411 (ctype->plane_size * ctype->plane_cnt + 128)
412 * sizeof (u_int32_t));
413 CTYPE_DATA (_NL_CTYPE_TOLOWER_EL,
415 (ctype->plane_size * ctype->plane_cnt + 128)
416 * sizeof (u_int32_t));
418 CTYPE_DATA (_NL_CTYPE_CLASS32,
420 (ctype->plane_size * ctype->plane_cnt
421 * sizeof (char_class32_t)));
423 CTYPE_DATA (_NL_CTYPE_NAMES_EB,
424 ctype->names_eb, (ctype->plane_size * ctype->plane_cnt
425 * sizeof (u_int32_t)));
426 CTYPE_DATA (_NL_CTYPE_NAMES_EL,
427 ctype->names_el, (ctype->plane_size * ctype->plane_cnt
428 * sizeof (u_int32_t)));
430 CTYPE_DATA (_NL_CTYPE_HASH_SIZE,
431 &ctype->plane_size, sizeof (u_int32_t));
432 CTYPE_DATA (_NL_CTYPE_HASH_LAYERS,
433 &ctype->plane_cnt, sizeof (u_int32_t));
435 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES):
436 /* The class name array. */
438 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt, ++offset)
440 iov[2 + elem + offset].iov_base
441 = (void *) ctype->classnames[cnt];
442 iov[2 + elem + offset].iov_len
443 = strlen (ctype->classnames[cnt]) + 1;
444 total += iov[2 + elem + offset].iov_len;
446 iov[2 + elem + offset].iov_base = (void *) "\0\0\0";
447 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
448 total += 1 + (4 - ((total + 1) % 4));
450 if (elem + 1 < nelems)
451 idx[elem + 1] = idx[elem] + total;
454 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES):
455 /* The map name array. */
457 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt, ++offset)
459 iov[2 + elem + offset].iov_base
460 = (void *) ctype->mapnames[cnt];
461 iov[2 + elem + offset].iov_len
462 = strlen (ctype->mapnames[cnt]) + 1;
463 total += iov[2 + elem + offset].iov_len;
465 iov[2 + elem + offset].iov_base = (void *) "\0\0\0";
466 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
467 total += 1 + (4 - ((total + 1) % 4));
469 if (elem + 1 < nelems)
470 idx[elem + 1] = idx[elem] + total;
473 CTYPE_DATA (_NL_CTYPE_WIDTH,
474 ctype->width, ctype->plane_size * ctype->plane_cnt);
476 CTYPE_DATA (_NL_CTYPE_MB_CUR_MAX,
477 &ctype->mb_cur_max, sizeof (u_int32_t));
479 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME):
480 total = strlen (ctype->codeset_name) + 1;
482 iov[2 + elem + offset].iov_base = (char *) ctype->codeset_name;
485 iov[2 + elem + offset].iov_base = alloca ((total + 3) & ~3);
486 memcpy (iov[2 + elem + offset].iov_base, ctype->codeset_name,
488 total = (total + 3) & ~3;
490 iov[2 + elem + offset].iov_len = total;
491 if (elem + 1 < nelems)
492 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
496 assert (! "unknown CTYPE element");
500 /* Handle extra maps. */
501 size_t nr = (elem - _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE)) >> 1;
503 if (((elem - _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE)) & 1) == 0)
504 iov[2 + elem + offset].iov_base = ctype->map_eb[nr];
506 iov[2 + elem + offset].iov_base = ctype->map_el[nr];
508 iov[2 + elem + offset].iov_len = ((ctype->plane_size
509 * ctype->plane_cnt + 128)
510 * sizeof (u_int32_t));
512 if (elem + 1 < nelems)
513 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
517 assert (2 + elem + offset == (nelems + ctype->nr_charclass
518 + ctype->map_collection_nr + 2));
520 write_locale_data (output_path, "LC_CTYPE", 2 + elem + offset, iov);
524 /* Character class handling. */
526 ctype_class_new (struct linereader *lr, struct localedef_t *locale,
527 enum token_t tok, struct token *code,
528 struct charset_t *charset)
530 ctype_class_newP (lr, locale->categories[LC_CTYPE].ctype,
531 code->val.str.start);
536 ctype_is_charclass (struct linereader *lr, struct localedef_t *locale,
541 for (cnt = 0; cnt < locale->categories[LC_CTYPE].ctype->nr_charclass; ++cnt)
542 if (strcmp (name, locale->categories[LC_CTYPE].ctype->classnames[cnt])
551 ctype_class_start (struct linereader *lr, struct localedef_t *locale,
552 enum token_t tok, const char *str,
553 struct charset_t *charset)
555 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
599 assert (! "illegal token as class name: should not happen");
602 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
603 if (strcmp (str, ctype->classnames[cnt]) == 0)
606 if (cnt >= ctype->nr_charclass)
607 assert (! "unknown class in class definition: should not happen");
609 ctype->class_done |= BIT (tok);
611 ctype->current_class_mask = 1 << cnt;
612 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
617 ctype_class_from (struct linereader *lr, struct localedef_t *locale,
618 struct token *code, struct charset_t *charset)
620 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
623 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
625 ctype->last_class_char = value;
627 if (value == ILLEGAL_CHAR_VALUE)
628 /* In the LC_CTYPE category it is no error when a character is
629 not found. This has to be ignored silently. */
632 *find_idx (ctype, &ctype->class_collection, &ctype->class_collection_max,
633 &ctype->class_collection_act, value)
634 |= ctype->current_class_mask;
639 ctype_class_to (struct linereader *lr, struct localedef_t *locale,
640 struct token *code, struct charset_t *charset)
642 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
643 unsigned int value, cnt;
645 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
647 assert (value >= ctype->last_class_char);
649 for (cnt = ctype->last_class_char + 1; cnt <= value; ++cnt)
650 *find_idx (ctype, &ctype->class_collection, &ctype->class_collection_max,
651 &ctype->class_collection_act, cnt)
652 |= ctype->current_class_mask;
654 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
659 ctype_class_end (struct linereader *lr, struct localedef_t *locale)
661 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
663 /* We have no special actions to perform here. */
664 ctype->current_class_mask = 0;
665 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
669 /* Character map handling. */
671 ctype_map_new (struct linereader *lr, struct localedef_t *locale,
672 enum token_t tok, struct token *code,
673 struct charset_t *charset)
675 ctype_map_newP (lr, locale->categories[LC_CTYPE].ctype,
676 code->val.str.start, charset);
681 ctype_is_charconv (struct linereader *lr, struct localedef_t *locale,
684 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
687 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
688 if (strcmp (name, ctype->mapnames[cnt]) == 0)
696 ctype_map_start (struct linereader *lr, struct localedef_t *locale,
697 enum token_t tok, const char *name, struct charset_t *charset)
699 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
705 ctype->toupper_done = 1;
709 ctype->tolower_done = 1;
715 assert (! "unknown token in category `LC_CTYPE' should not happen");
718 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
719 if (strcmp (name, ctype->mapnames[cnt]) == 0)
722 if (cnt == ctype->map_collection_nr)
723 assert (! "unknown token in category `LC_CTYPE' should not happen");
725 ctype->last_map_idx = cnt;
726 ctype->from_map_char = ILLEGAL_CHAR_VALUE;
731 ctype_map_from (struct linereader *lr, struct localedef_t *locale,
732 struct token *code, struct charset_t *charset)
734 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
737 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
739 if (value == ILLEGAL_CHAR_VALUE)
740 /* In the LC_CTYPE category it is no error when a character is
741 not found. This has to be ignored silently. */
744 assert (ctype->last_map_idx < ctype->map_collection_nr);
746 ctype->from_map_char = value;
751 ctype_map_to (struct linereader *lr, struct localedef_t *locale,
752 struct token *code, struct charset_t *charset)
754 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
757 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
759 if (ctype->from_map_char == ILLEGAL_CHAR_VALUE
760 || value == ILLEGAL_CHAR_VALUE)
762 /* In the LC_CTYPE category it is no error when a character is
763 not found. This has to be ignored silently. */
764 ctype->from_map_char = ILLEGAL_CHAR_VALUE;
768 *find_idx (ctype, &ctype->map_collection[ctype->last_map_idx],
769 &ctype->map_collection_max[ctype->last_map_idx],
770 &ctype->map_collection_act[ctype->last_map_idx],
771 ctype->from_map_char) = value;
773 ctype->from_map_char = ILLEGAL_CHAR_VALUE;
778 ctype_map_end (struct linereader *lr, struct localedef_t *locale)
780 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
782 ctype->last_map_idx = MAX_NR_CHARMAP;
783 ctype->from_map_char = ILLEGAL_CHAR_VALUE;
787 /* Local functions. */
789 ctype_class_newP (struct linereader *lr, struct locale_ctype_t *ctype,
794 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
795 if (strcmp (ctype->classnames[cnt], name) == 0)
798 if (cnt < ctype->nr_charclass)
800 lr_error (lr, _("character class `%s' already defined"));
804 if (ctype->nr_charclass == MAX_NR_CHARCLASS)
805 /* Exit code 2 is prescribed in P1003.2b. */
807 implementation limit: no more than %d character classes allowed"),
810 ctype->classnames[ctype->nr_charclass++] = name;
815 ctype_map_newP (struct linereader *lr, struct locale_ctype_t *ctype,
816 const char *name, struct charset_t *charset)
818 size_t max_chars = 0;
821 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
823 if (strcmp (ctype->mapnames[cnt], name) == 0)
826 if (max_chars < ctype->map_collection_max[cnt])
827 max_chars = ctype->map_collection_max[cnt];
830 if (cnt < ctype->map_collection_nr)
832 lr_error (lr, _("character map `%s' already defined"));
836 if (ctype->map_collection_nr == MAX_NR_CHARMAP)
837 /* Exit code 2 is prescribed in P1003.2b. */
839 implementation limit: no more than %d character maps allowed"),
842 ctype->mapnames[cnt] = name;
845 ctype->map_collection_max[cnt] = charset->mb_cur_max == 1 ? 256 : 512;
847 ctype->map_collection_max[cnt] = max_chars;
849 ctype->map_collection[cnt] = (u_int32_t *)
850 xmalloc (sizeof (u_int32_t) * ctype->map_collection_max[cnt]);
851 memset (ctype->map_collection[cnt], '\0',
852 sizeof (u_int32_t) * ctype->map_collection_max[cnt]);
853 ctype->map_collection_act[cnt] = 256;
855 ++ctype->map_collection_nr;
859 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
860 is possible if we only want to extend the name array. */
862 find_idx (struct locale_ctype_t *ctype, u_int32_t **table, size_t *max,
863 size_t *act, unsigned int idx)
868 return table == NULL ? NULL : &(*table)[idx];
870 for (cnt = 256; cnt < ctype->charnames_act; ++cnt)
871 if (ctype->charnames[cnt] == idx)
874 /* We have to distinguish two cases: the name is found or not. */
875 if (cnt == ctype->charnames_act)
877 /* Extend the name array. */
878 if (ctype->charnames_act == ctype->charnames_max)
880 ctype->charnames_max *= 2;
881 ctype->charnames = (unsigned int *)
882 xrealloc (ctype->charnames,
883 sizeof (unsigned int) * ctype->charnames_max);
885 ctype->charnames[ctype->charnames_act++] = idx;
889 /* We have done everything we are asked to do. */
896 size_t old_max = *max;
902 (u_int32_t *) xrealloc (*table, *max * sizeof (unsigned long int));
903 memset (&(*table)[old_max], '\0',
904 (*max - old_max) * sizeof (u_int32_t));
911 return &(*table)[cnt];
916 set_class_defaults (struct locale_ctype_t *ctype, struct charset_t *charset)
918 /* This function defines the default values for the classes and conversions
919 according to POSIX.2 2.5.2.1.
920 It may seem that the order of these if-blocks is arbitrary but it is NOT.
921 Don't move them unless you know what you are doing! */
923 void set_default (int bit, int from, int to)
930 for (ch = from; ch <= to; ++ch)
935 value = charset_find_value (charset, tmp, 1);
936 if (value == ILLEGAL_CHAR_VALUE)
939 character `%s' not defined while needed as default value"),
944 ELEM (ctype, class_collection, , value) |= bit;
948 /* Set default values if keyword was not present. */
949 if ((ctype->class_done & BIT (tok_upper)) == 0)
950 /* "If this keyword [upper] is not specified, the uppercase letters
951 `A' through `Z', ..., shall automatically belong to this class,
952 with implementation defined character values." [P1003.2, 2.5.2.1] */
953 set_default (BIT (tok_upper), 'A', 'Z');
955 if ((ctype->class_done & BIT (tok_lower)) == 0)
956 /* "If this keyword [lower] is not specified, the lowercase letters
957 `a' through `z', ..., shall automatically belong to this class,
958 with implementation defined character values." [P1003.2, 2.5.2.1] */
959 set_default (BIT (tok_lower), 'a', 'z');
961 if ((ctype->class_done & BIT (tok_alpha)) == 0)
963 /* Table 2-6 in P1003.2 says that characters in class `upper' or
964 class `lower' *must* be in class `alpha'. */
965 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower);
968 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
969 if ((ctype->class_collection[cnt] & mask) != 0)
970 ctype->class_collection[cnt] |= BIT (tok_alpha);
973 if ((ctype->class_done & BIT (tok_digit)) == 0)
974 /* "If this keyword [digit] is not specified, the digits `0' through
975 `9', ..., shall automatically belong to this class, with
976 implementation-defined character values." [P1003.2, 2.5.2.1] */
977 set_default (BIT (tok_digit), '0', '9');
979 /* "Only characters specified for the `alpha' and `digit' keyword
980 shall be specified. Characters specified for the keyword `alpha'
981 and `digit' are automatically included in this class. */
983 unsigned long int mask = BIT (tok_alpha) | BIT (tok_digit);
986 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
987 if ((ctype->class_collection[cnt] & mask) != 0)
988 ctype->class_collection[cnt] |= BIT (tok_alnum);
991 if ((ctype->class_done & BIT (tok_space)) == 0)
992 /* "If this keyword [space] is not specified, the characters <space>,
993 <form-feed>, <newline>, <carriage-return>, <tab>, and
994 <vertical-tab>, ..., shall automatically belong to this class,
995 with implementation-defined character values." [P1003.2, 2.5.2.1] */
999 value = charset_find_value (charset, "space", 5);
1000 if (value == ILLEGAL_CHAR_VALUE)
1002 character `%s' not defined while needed as default value"),
1005 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
1007 value = charset_find_value (charset, "form-feed", 9);
1008 if (value == ILLEGAL_CHAR_VALUE)
1010 character `%s' not defined while needed as default value"),
1013 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
1015 value = charset_find_value (charset, "newline", 7);
1016 if (value == ILLEGAL_CHAR_VALUE)
1018 character `%s' not defined while needed as default value"),
1021 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
1023 value = charset_find_value (charset, "carriage-return", 15);
1024 if (value == ILLEGAL_CHAR_VALUE)
1026 character `%s' not defined while needed as default value"),
1027 "<carriage-return>");
1029 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
1031 value = charset_find_value (charset, "tab", 3);
1032 if (value == ILLEGAL_CHAR_VALUE)
1034 character `%s' not defined while needed as default value"),
1037 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
1039 value = charset_find_value (charset, "vertical-tab", 12);
1040 if (value == ILLEGAL_CHAR_VALUE)
1042 character `%s' not defined while needed as default value"),
1045 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
1048 if ((ctype->class_done & BIT (tok_xdigit)) == 0)
1049 /* "If this keyword is not specified, the digits `0' to `9', the
1050 uppercase letters `A' through `F', and the lowercase letters `a'
1051 through `f', ..., shall automatically belong to this class, with
1052 implementation defined character values." [P1003.2, 2.5.2.1] */
1054 set_default (BIT (tok_xdigit), '0', '9');
1055 set_default (BIT (tok_xdigit), 'A', 'F');
1056 set_default (BIT (tok_xdigit), 'a', 'f');
1059 if ((ctype->class_done & BIT (tok_blank)) == 0)
1060 /* "If this keyword [blank] is unspecified, the characters <space> and
1061 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
1065 value = charset_find_value (charset, "space", 5);
1066 if (value == ILLEGAL_CHAR_VALUE)
1068 character `%s' not defined while needed as default value"),
1071 ELEM (ctype, class_collection, , value) |= BIT (tok_blank);
1073 value = charset_find_value (charset, "tab", 3);
1074 if (value == ILLEGAL_CHAR_VALUE)
1076 character `%s' not defined while needed as default value"),
1079 ELEM (ctype, class_collection, , value) |= BIT (tok_blank);
1082 if ((ctype->class_done & BIT (tok_graph)) == 0)
1083 /* "If this keyword [graph] is not specified, characters specified for
1084 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
1085 shall belong to this character class." [P1003.2, 2.5.2.1] */
1087 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
1088 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
1091 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
1092 if ((ctype->class_collection[cnt] & mask) != 0)
1093 ctype->class_collection[cnt] |= BIT (tok_graph);
1096 if ((ctype->class_done & BIT (tok_print)) == 0)
1097 /* "If this keyword [print] is not provided, characters specified for
1098 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
1099 and the <space> character shall belong to this character class."
1100 [P1003.2, 2.5.2.1] */
1102 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
1103 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
1107 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
1108 if ((ctype->class_collection[cnt] & mask) != 0)
1109 ctype->class_collection[cnt] |= BIT (tok_print);
1111 space = charset_find_value (charset, "space", 5);
1112 if (space == ILLEGAL_CHAR_VALUE)
1114 character `%s' not defined while needed as default value"),
1117 ELEM (ctype, class_collection, , space) |= BIT (tok_print);
1120 if (ctype->toupper_done == 0)
1121 /* "If this keyword [toupper] is not specified, the lowercase letters
1122 `a' through `z', and their corresponding uppercase letters `A' to
1123 `Z', ..., shall automatically be included, with implementation-
1124 defined character values." [P1003.2, 2.5.2.1] */
1129 strcpy (tmp, "<?>");
1131 for (ch = 'a'; ch <= 'z'; ++ch)
1133 unsigned int value_from, value_to;
1137 value_from = charset_find_value (charset, &tmp[1], 1);
1138 if (value_from == ILLEGAL_CHAR_VALUE)
1141 character `%c' not defined while needed as default value"),
1146 /* This conversion is implementation defined. */
1147 tmp[1] = (char) (ch + ('A' - 'a'));
1148 value_to = charset_find_value (charset, &tmp[1], 1);
1152 character `%s' not defined while needed as default value"),
1157 /* The index [0] is determined by the order of the
1158 `ctype_map_newP' calls in `ctype_startup'. */
1159 ELEM (ctype, map_collection, [0], value_from) = value_to;
1163 if (ctype->tolower_done == 0)
1164 /* "If this keyword [tolower] is not specified, the mapping shall be
1165 the reverse mapping of the one specified to `toupper'." [P1003.2] */
1169 for (cnt = 0; cnt < ctype->map_collection_act[0]; ++cnt)
1170 if (ctype->map_collection[0][cnt] != 0)
1171 ELEM (ctype, map_collection, [1],
1172 ctype->map_collection[0][cnt])
1173 = ctype->charnames[cnt];
1179 allocate_arrays (struct locale_ctype_t *ctype, struct charset_t *charset)
1183 /* First we have to decide how we organize the arrays. It is easy for
1184 a one-byte character set. But multi-byte character sets cannot be
1185 stored flat because they might be sparsely used. So we determine an
1186 optimal hashing function for the used characters.
1188 We use a very trivial hashing function to store the sparse table.
1189 CH % TABSIZE is used as an index. To solve multiple hits we have
1190 N planes. This guarantees a fixed search time for a character [N
1191 / 2]. In the following code we determine the minimum value for
1192 TABSIZE * N, where TABSIZE >= 256. */
1193 size_t min_total = UINT_MAX;
1194 size_t act_size = 256;
1197 Computing table size for character classes might take a while..."),
1200 while (act_size < min_total)
1202 size_t cnt[act_size];
1203 size_t act_planes = 1;
1205 memset (cnt, '\0', sizeof cnt);
1207 for (idx = 0; idx < 256; ++idx)
1210 for (idx = 0; idx < ctype->charnames_act; ++idx)
1211 if (ctype->charnames[idx] >= 256)
1213 size_t nr = ctype->charnames[idx] % act_size;
1215 if (++cnt[nr] > act_planes)
1217 act_planes = cnt[nr];
1218 if (act_size * act_planes >= min_total)
1223 if (act_size * act_planes < min_total)
1225 min_total = act_size * act_planes;
1226 ctype->plane_size = act_size;
1227 ctype->plane_cnt = act_planes;
1233 fprintf (stderr, _(" done\n"));
1236 #if __BYTE_ORDER == __LITTLE_ENDIAN
1237 # define NAMES_B1 ctype->names_el
1238 # define NAMES_B2 ctype->names_eb
1240 # define NAMES_B1 ctype->names_eb
1241 # define NAMES_B2 ctype->names_el
1244 ctype->names_eb = (u_int32_t *) xcalloc (ctype->plane_size
1246 sizeof (u_int32_t));
1247 ctype->names_el = (u_int32_t *) xcalloc (ctype->plane_size
1249 sizeof (u_int32_t));
1251 for (idx = 1; idx < 256; ++idx)
1252 NAMES_B1[idx] = idx;
1254 /* Trick: change the 0th entry's name to 1 to mark the cell occupied. */
1257 for (idx = 256; idx < ctype->charnames_act; ++idx)
1259 size_t nr = (ctype->charnames[idx] % ctype->plane_size);
1262 while (NAMES_B1[nr + depth * ctype->plane_size])
1264 assert (depth < ctype->plane_cnt);
1266 NAMES_B1[nr + depth * ctype->plane_size] = ctype->charnames[idx];
1268 /* Now for faster access remember the index in the NAMES_B array. */
1269 ctype->charnames[idx] = nr + depth * ctype->plane_size;
1273 for (idx = 0; idx < ctype->plane_size * ctype->plane_cnt; ++idx)
1274 NAMES_B2[idx] = SWAPU32 (NAMES_B1[idx]);
1277 /* You wonder about this amount of memory? This is only because some
1278 users do not manage to address the array with unsigned values or
1279 data types with range >= 256. '\200' would result in the array
1280 index -128. To help these poor people we duplicate the entries for
1281 128 up to 255 below the entry for \0. */
1282 ctype->ctype_b = (char_class_t *) xcalloc (256 + 128,
1283 sizeof (char_class_t));
1284 ctype->ctype32_b = (char_class32_t *) xcalloc (ctype->plane_size
1286 sizeof (char_class32_t));
1288 /* Fill in the character class information. */
1289 #if __BYTE_ORDER == __LITTLE_ENDIAN
1290 # define TRANS(w) CHAR_CLASS_TRANS (w)
1291 # define TRANS32(w) CHAR_CLASS32_TRANS (w)
1293 # define TRANS(w) (w)
1294 # define TRANS32(w) (w)
1297 for (idx = 0; idx < ctype->class_collection_act; ++idx)
1298 if (ctype->charnames[idx] < 256)
1299 ctype->ctype_b[128 + ctype->charnames[idx]]
1300 = TRANS (ctype->class_collection[idx]);
1302 /* Mirror first 127 entries. We must take care that entry -1 is not
1303 mirrored because EOF == -1. */
1304 for (idx = 0; idx < 127; ++idx)
1305 ctype->ctype_b[idx] = ctype->ctype_b[256 + idx];
1307 /* The 32 bit array contains all characters. */
1308 for (idx = 0; idx < ctype->class_collection_act; ++idx)
1309 ctype->ctype32_b[ctype->charnames[idx]]
1310 = TRANS32 (ctype->class_collection[idx]);
1312 /* Room for table of mappings. */
1313 ctype->map_eb = (u_int32_t **) xmalloc (ctype->map_collection_nr
1314 * sizeof (u_int32_t *));
1315 ctype->map_el = (u_int32_t **) xmalloc (ctype->map_collection_nr
1316 * sizeof (u_int32_t *));
1318 /* Fill in all mappings. */
1319 for (idx = 0; idx < ctype->map_collection_nr; ++idx)
1323 /* Allocate table. */
1324 ctype->map_eb[idx] = (u_int32_t *) xmalloc ((ctype->plane_size
1325 * ctype->plane_cnt + 128)
1326 * sizeof (u_int32_t));
1327 ctype->map_el[idx] = (u_int32_t *) xmalloc ((ctype->plane_size
1328 * ctype->plane_cnt + 128)
1329 * sizeof (u_int32_t));
1331 #if __BYTE_ORDER == __LITTLE_ENDIAN
1332 # define MAP_B1 ctype->map_el
1333 # define MAP_B2 ctype->map_eb
1335 # define MAP_B1 ctype->map_eb
1336 # define MAP_B2 ctype->map_el
1339 /* Copy default value (identity mapping). */
1340 memcpy (&MAP_B1[idx][128], NAMES_B1,
1341 ctype->plane_size * ctype->plane_cnt * sizeof (u_int32_t));
1343 /* Copy values from collection. */
1344 for (idx2 = 0; idx2 < ctype->map_collection_act[idx]; ++idx2)
1345 if (ctype->map_collection[idx][idx2] != 0)
1346 MAP_B1[idx][128 + ctype->charnames[idx2]] =
1347 ctype->map_collection[idx][idx2];
1349 /* Mirror first 127 entries. We must take care not to map entry
1350 -1 because EOF == -1. */
1351 for (idx2 = 0; idx2 < 127; ++idx2)
1352 MAP_B1[idx][idx2] = MAP_B1[idx][256 + idx2];
1354 /* EOF must map to EOF. */
1355 MAP_B1[idx][127] = EOF;
1357 /* And now the other byte order. */
1358 for (idx2 = 0; idx2 < ctype->plane_size * ctype->plane_cnt + 128; ++idx2)
1359 MAP_B2[idx][idx2] = SWAPU32 (MAP_B1[idx][idx2]);
1362 /* Extra array for class and map names. */
1363 ctype->class_name_ptr = (u_int32_t *) xmalloc (ctype->nr_charclass
1364 * sizeof (u_int32_t));
1365 ctype->map_name_ptr = (u_int32_t *) xmalloc (ctype->map_collection_nr
1366 * sizeof (u_int32_t));
1368 /* Array for width information. Because the expected widths are very
1369 small we use only a single byte. This saves space and we need
1370 not provide the information twice, once for each endianness. */
1371 ctype->width = (unsigned char *) xmalloc (ctype->plane_size
1372 * ctype->plane_cnt);
1373 /* Initialize with default width value. */
1374 memset (ctype->width, charset->width_default,
1375 ctype->plane_size * ctype->plane_cnt);
1376 if (charset->width_rules != NULL)
1380 for (cnt = 0; cnt < charset->nwidth_rules; ++cnt)
1381 if (charset->width_rules[cnt].width != charset->width_default)
1382 for (idx = charset->width_rules[cnt].from;
1383 idx <= charset->width_rules[cnt].to; ++idx)
1385 size_t nr = idx % ctype->plane_size;
1388 while (NAMES_B1[nr + depth * ctype->plane_size] != nr)
1390 assert (depth < ctype->plane_cnt);
1392 ctype->width[nr + depth * ctype->plane_size]
1393 = charset->width_rules[cnt].width;
1397 /* Compute MB_CUR_MAX. Please note the value mb_cur_max in the
1398 character set definition gives the number of bytes in the wide
1399 character representation. We compute the number of bytes used
1400 for the UTF-8 encoded form. */
1401 ctype->mb_cur_max = ((int []) { 2, 3, 5, 6 }) [charset->mb_cur_max - 1];
1403 /* We need the name of the currently used 8-bit character set to
1404 make correct conversion between this 8-bit representation and the
1405 ISO 10646 character set used internally for wide characters. */
1406 ctype->codeset_name = charset->code_set_name;