1 /* $XTermId: charclass.c,v 1.22 2009/11/05 23:46:15 tom Exp $ */
4 * Compact and efficient reimplementation of the
5 * xterm character class mechanism for large character sets
7 * Markus Kuhn -- mkuhn@acm.org -- 2000-07-03
9 * Xterm allows users to select entire words with a double-click on the left
10 * mouse button. Opinions might differ on what type of characters are part of
11 * separate words, therefore xterm allows users to configure a class code for
12 * each 8-bit character. Words are maximum length sequences of neighboring
13 * characters with identical class code. Extending this mechanism to Unicode
14 * naively would require a class code table with at least 2^16 entries (128 kB).
17 * Instead, we transform the character class table into a list of intervals,
18 * that will be accessed via a linear search. Changes made to the table by the
19 * user will be appended. A special class code IDENT (default) marks
20 * characters that use their own code number as the class code.
22 * We could alternatively use a sorted table of non-overlapping intervals that
23 * can be accessed via binary search, but merging in new intervals is
24 * significantly more hassle and not worth the effort here.
28 #include <charclass.h>
/*
 * One entry of the character class interval table.  The code in this file
 * reads and writes three members of each entry: first, last and cclass.
 * NOTE(review): the member declarations are not visible in this view --
 * confirm their types against the full file.
 */
32 static struct classentry {
39 * Special convention for classtab[0]:
40 * - classtab[0].cclass is the allocated number of entries in classtab
41 * - classtab[0].first = 1 (first used entry in classtab)
42 * - classtab[0].last is the last used entry in classtab
/*
 * Record that the characters low..high belong to class "value" by storing a
 * new interval in classtab.  Existing intervals are neither merged nor
 * sorted; the table is enlarged first when it is nearly full.
 *
 * Returns -1 on the "nothing to do" path below.
 * NOTE(review): the condition guarding that return, the success return value
 * and the update of classtab[0].last are not visible in this view -- confirm
 * against the full file.
 */
46 SetCharacterClassRange(int low, int high, int value)
49 return -1; /* nothing to do */
51 /* make sure we have at least one free entry left at table end */
52 if (classtab[0].last > classtab[0].cclass - 2) {
/* raise the recorded capacity by 5 entries plus a quarter of its current value */
53 classtab[0].cclass += 5 + classtab[0].cclass / 4;
/*
 * NOTE(review): the new size is read through classtab and the result is
 * stored straight back into classtab; no failure check is visible here.
 */
54 classtab = TypeRealloc(struct classentry,
55 (unsigned) classtab[0].cclass, classtab);
60 /* simply append new interval to end of interval array */
/* classtab[0].last is used as the index of the slot being filled */
62 classtab[classtab[0].last].first = low;
63 classtab[classtab[0].last].last = high;
64 classtab[classtab[0].last].cclass = value;
/*
 * Allocate the interval table and load the built-in default classes.
 * NOTE(review): the enclosing function header, the declaration of "size",
 * the initial value of classtab[0].last and any allocation-failure check are
 * not visible in this view.
 */
81 classtab = TypeMallocN(struct classentry, (unsigned) size);
/* entry 0 is bookkeeping: capacity in cclass, index of the first real entry in first */
84 classtab[0].cclass = size;
85 classtab[0].first = 1;
/*
 * Order matters from here on: the lookup loop in CharacterClass() keeps the
 * class of the last matching entry, so a later range overrides an earlier
 * one that covers the same characters (e.g. '\t' is carved out of the
 * 1..31 CNTRL range, and 215 / 247 out of the 192..255 ALNUM range).
 */
88 /* old xterm default classes */
89 SetCharacterClassRange(0, 0, BLANK);
90 SetCharacterClassRange(1, 31, CNTRL);
91 SetCharacterClassRange('\t', '\t', BLANK);
92 SetCharacterClassRange('0', '9', ALNUM);
93 SetCharacterClassRange('A', 'Z', ALNUM);
94 SetCharacterClassRange('_', '_', ALNUM);
95 SetCharacterClassRange('a', 'z', ALNUM);
96 SetCharacterClassRange(127, 159, CNTRL);
97 SetCharacterClassRange(160, 191, IDENT);
98 SetCharacterClassRange(192, 255, ALNUM);
99 SetCharacterClassRange(215, 215, IDENT);
100 SetCharacterClassRange(247, 247, IDENT);
102 /* added Unicode classes */
/* blanket default for 0x0100..0xffdf; the entries that follow override sub-ranges of it */
103 SetCharacterClassRange(0x0100, 0xffdf, ALNUM); /* mostly characters */
104 SetCharacterClassRange(0x037e, 0x037e, IDENT); /* Greek question mark */
105 SetCharacterClassRange(0x0387, 0x0387, IDENT); /* Greek ano teleia */
106 SetCharacterClassRange(0x055a, 0x055f, IDENT); /* Armenian punctuation */
107 SetCharacterClassRange(0x0589, 0x0589, IDENT); /* Armenian full stop */
108 SetCharacterClassRange(0x0700, 0x070d, IDENT); /* Syriac punctuation */
109 SetCharacterClassRange(0x104a, 0x104f, IDENT); /* Myanmar punctuation */
110 SetCharacterClassRange(0x10fb, 0x10fb, IDENT); /* Georgian punctuation */
111 SetCharacterClassRange(0x1361, 0x1368, IDENT); /* Ethiopic punctuation */
112 SetCharacterClassRange(0x166d, 0x166e, IDENT); /* Canadian Syl. punctuation */
113 SetCharacterClassRange(0x17d4, 0x17dc, IDENT); /* Khmer punctuation */
114 SetCharacterClassRange(0x1800, 0x180a, IDENT); /* Mongolian punctuation */
115 SetCharacterClassRange(0x2000, 0x200a, BLANK); /* spaces */
116 SetCharacterClassRange(0x200b, 0x27ff, IDENT); /* punctuation and symbols */
/*
 * A numeric third argument gives every character in the range one shared
 * class code (here the first code point of the range); these two entries
 * override part of the 0x200b..0x27ff IDENT range stored just above.
 */
117 SetCharacterClassRange(0x2070, 0x207f, 0x2070); /* superscript */
118 SetCharacterClassRange(0x2080, 0x208f, 0x2080); /* subscript */
119 SetCharacterClassRange(0x3000, 0x3000, BLANK); /* ideographic space */
120 SetCharacterClassRange(0x3001, 0x3020, IDENT); /* ideographic punctuation */
121 SetCharacterClassRange(0x3040, 0x309f, 0x3040); /* Hiragana */
122 SetCharacterClassRange(0x30a0, 0x30ff, 0x30a0); /* Katakana */
/* both CJK ranges (this one and 0xf900..0xfaff below) share the class code 0x4e00 */
123 SetCharacterClassRange(0x3300, 0x9fff, 0x4e00); /* CJK Ideographs */
124 SetCharacterClassRange(0xac00, 0xd7a3, 0xac00); /* Hangul Syllables */
125 SetCharacterClassRange(0xf900, 0xfaff, 0x4e00); /* CJK Ideographs */
126 SetCharacterClassRange(0xfe30, 0xfe6b, IDENT); /* punctuation forms */
127 SetCharacterClassRange(0xff00, 0xff0f, IDENT); /* half/fullwidth ASCII */
128 SetCharacterClassRange(0xff1a, 0xff20, IDENT); /* half/fullwidth ASCII */
129 SetCharacterClassRange(0xff3b, 0xff40, IDENT); /* half/fullwidth ASCII */
130 SetCharacterClassRange(0xff5b, 0xff64, IDENT); /* half/fullwidth ASCII */
/*
 * Look up the class code for character c.
 *
 * Every used entry (classtab[0].first .. classtab[0].last) is examined and
 * the loop does not stop at the first hit, so when several intervals contain
 * c the one stored last determines the result.  With no hit, cclass keeps
 * its initial value IDENT.
 * NOTE(review): the statements after the loop, including the return, are not
 * visible in this view; per the file header, IDENT is meant to map a
 * character to its own code number -- confirm how that is applied.
 */
136 CharacterClass(int c)
138 int i, cclass = IDENT;
140 for (i = classtab[0].first; i <= classtab[0].last; i++)
141 if (classtab[i].first <= c && classtab[i].last >= c)
/* no break: a later matching interval overwrites an earlier one */
142 cclass = classtab[i].cclass;
152 noleaks_CharacterClass(void)
161 #endif /* OPT_WIDE_CHARS */