1 // Copyright (c) 1998, 1999 Matthias Clasen
2 // See the file copying.txt for copying permission.
24 #ifdef DSSSL_NAMESPACE
25 namespace DSSSL_NAMESPACE {
31 static char *strdup(const char *s)
33 size_t l = strlen(s) + 1;
34 return (char *) memcpy ((char *) malloc (sizeof (char) * l), s, l);
36 # endif /* __GLIBC__ */
// Allocates a malloc()ed char buffer of s.size() + 1 elements and walks
// the characters of s.
// NOTE(review): the loop body, the terminator and the return statement are
// not visible here; presumably each Char is narrowed to char and the
// caller frees the result -- confirm.
38 static char *stringify(const StringC &s)
40 char *r = (char *) malloc (sizeof(char)*(s.size() + 1));
41 for (size_t i = 0; i < s.size(); i++)
47 // FIXME this is unneeded if SP_WCHAR_IS_USHORT
// Allocates a malloc()ed wchar_t buffer of s.size() + 1 elements and walks
// the characters of s.
// NOTE(review): the loop body, the terminator and the return statement are
// not visible here; presumably each Char is widened to wchar_t and the
// caller frees the result -- confirm.
48 static wchar_t *wchartify(const StringC &s)
50 wchar_t *r = (wchar_t *) malloc (sizeof(wchar_t)*(s.size() + 1));
51 for (size_t i = 0; i < s.size(); i++)
// Builds a locale name from a language and a country code: the language
// is lower-cased and the country upper-cased into one malloc()ed buffer.
// The buffer holds lang.size() + country.size() + 2 elements (presumably
// room for a separator and the terminating NUL -- the lines writing them
// are not visible here).
// NOTE(review): tolower()/toupper() are given char(...) values; where char
// is signed, a code >= 0x80 becomes a negative argument, which the
// <ctype.h> functions do not accept.
57 char *RefLangObj::localeName(const StringC &lang, const StringC &country)
61 p = (char *) malloc (sizeof(char)*(lang.size() + country.size() + 2));
// Language part, lower-cased.
63 for (i = 0; i < lang.size(); i++)
64 p[i] = tolower(char(lang[i]));
// Country part, upper-cased; i keeps advancing through the output buffer.
66 for (size_t j = 0; j < country.size(); j++, i++)
67 p[i] = toupper(char(country[j]));
// Reports whether setlocale() accepts the locale named by lang/country.
// The current LC_ALL setting is copied first and restored after the probe.
// NOTE(review): p and old are heap allocations; the lines releasing them
// and returning res are not visible here -- confirm both are freed.
72 bool RefLangObj::supportedLanguage(const StringC &lang, const StringC &country)
74 char *p = RefLangObj::localeName(lang, country);
// setlocale(LC_ALL, 0) only queries the current locale.
75 char *old = strdup(setlocale(LC_ALL, 0));
76 bool res = (setlocale(LC_ALL, p) != 0);
77 setlocale(LC_ALL, old);
// Records the locale currently in effect (oldLocale_) and the locale
// selected for lang/country (newLocale_), both as strdup() copies of the
// strings returned by setlocale().
// NOTE(review): setlocale(LC_ALL, p) returns a null pointer when the locale
// cannot be installed, and strdup() of a null pointer is undefined;
// presumably callers check supportedLanguage() first -- confirm.
// NOTE(review): the remaining lines of the constructor are not visible here.
82 RefLangObj::RefLangObj(const StringC &lang, const StringC &country)
84 char *p = localeName(lang, country);
85 oldLocale_ = strdup(setlocale(LC_ALL, 0));
86 newLocale_ = strdup(setlocale(LC_ALL, p));
// Destructor.
// NOTE(review): the body is not visible here; presumably it frees
// oldLocale_ and newLocale_, which the constructor obtains from strdup()
// -- confirm.
90 RefLangObj::~RefLangObj()
// Conversion accessor yielding a LanguageObj pointer.
// NOTE(review): the body is not visible here; presumably `return this;`
// -- confirm.
96 LanguageObj *RefLangObj::asLanguage()
// Upper-cases c with towupper() under newLocale_: LC_ALL is switched to
// newLocale_ for the call and switched back to oldLocale_ afterwards.
// NOTE(review): the return statement is not visible here; presumably uc
// is returned -- confirm.
101 Char RefLangObj::toUpper(const Char c) const
103 setlocale(LC_ALL, newLocale_);
104 Char uc = towupper(c);
105 setlocale(LC_ALL, oldLocale_);
// Lower-cases c with towlower() under newLocale_: LC_ALL is switched to
// newLocale_ for the call and switched back to oldLocale_ afterwards.
// NOTE(review): the return statement is not visible here; presumably lc
// is returned -- confirm.
109 Char RefLangObj::toLower(const Char c) const
111 setlocale(LC_ALL, newLocale_);
112 Char lc = towlower(c);
113 setlocale(LC_ALL, oldLocale_);
// Compares the wcsxfrm() transforms of r and s element by element while
// newLocale_ is in effect.  The result is false at the first differing
// element, and true once the end of the transform is reached or k equals
// the counter l.
// NOTE(review): the rest of the signature, the lines filling rx/sx and the
// lines maintaining l are not visible here.
// NOTE(review): wcsxfrm(0, str, 0) returns the transformed length without
// the terminating null, yet rx/sx are allocated with exactly rn/sn
// elements -- check that the terminator fits.
117 bool RefLangObj::areEquivalent(const StringC &r, const StringC &s,
120 setlocale(LC_ALL, newLocale_);
121 wchar_t *rr = wchartify(r);
// A null destination with size 0 only asks for the required length.
122 unsigned rn = wcsxfrm (0, rr, 0);
123 wchar_t *rx = (wchar_t *) malloc (sizeof(wchar_t)*rn);
125 wchar_t *ss = wchartify(s);
126 unsigned sn = wcsxfrm (0, ss, 0);
127 wchar_t *sx = (wchar_t *) malloc (sizeof(wchar_t)*sn);
// The loop has no bound of its own; it ends through one of the breaks.
131 for (unsigned i = 0; ; i++) {
132 if (rx[i] != sx[i]) { res = 0; break; }
134 if (k == l || rx[i] == 0) { res = 1; break; }
// Release the temporaries and restore the previous locale.
136 free (rr); free (ss); free (rx); free (sx);
137 setlocale(LC_ALL, oldLocale_);
// Collates r and s with wcscoll() while newLocale_ is in effect, then
// frees the temporary wide strings and restores oldLocale_.
// NOTE(review): the return statement is not visible here; presumably
// `res < 0` -- confirm.
141 bool RefLangObj::isLess(const StringC &r, const StringC &s) const
143 setlocale(LC_ALL, newLocale_);
144 wchar_t *rr = wchartify(r);
145 wchar_t *ss = wchartify(s);
146 int res = wcscoll(rr, ss);
147 free (rr); free (ss);
148 setlocale(LC_ALL, oldLocale_);
// Collates r and s with wcscoll() while newLocale_ is in effect, then
// frees the temporary wide strings and restores oldLocale_.
// NOTE(review): the return statement is not visible here; presumably
// `res <= 0` -- confirm.
152 bool RefLangObj::isLessOrEqual(const StringC &r, const StringC &s) const
154 setlocale(LC_ALL, newLocale_);
155 wchar_t *rr = wchartify(r);
156 wchar_t *ss = wchartify(s);
157 int res = wcscoll(rr, ss);
158 free (rr); free (ss);
159 setlocale(LC_ALL, oldLocale_);
162 #endif /* SP_HAVE_WCHAR */
163 #endif /* SP_HAVE_LOCALE */
// Tables filled while a language definition is being built.
// NOTE(review): only part of the class is visible here.
165 class LangBuildData {
// Building starts at collating position 0.
167 LangBuildData() : currentpos(0) {};
// order: position -> symbol; three-element keys are also stored here by
// LangObj::addLevelWeight().
168 HashTable<StringC, StringC> order;
// ce: multi-collating-element symbol -> the string it stands for.
170 HashTable<StringC, StringC> ce;
// syms: collating symbol -> position (charMax until a position is assigned).
171 HashTable<StringC, Char> syms;
// Members reached through data_ by the LangObj methods.
// NOTE(review): the class header and several lines are not visible here.
// Per-level sort settings; the array has a fixed capacity of 20 entries.
177 LangObj::LevelSort level[20]; // FIXME
// weights: filled by LangObj::compile() and read by LangObj::atLevel().
179 HashTable<StringC, StringC> weights;
// collate: collating element -> position.
180 HashTable<StringC, Char> collate;
// Case-conversion maps; charMax marks "no mapping".
181 CharMap<Char> toupper;
182 CharMap<Char> tolower;
// Every character starts out without a case mapping.
187 toupper.setAll(charMax);
188 tolower.setAll(charMax);
// Registers str in the build-time ce table under the symbol sym.
// NOTE(review): the rest of the signature is not visible here.
192 void LangObj::addMultiCollatingElement(const StringC &sym,
195 buildData_->ce.insert(sym, str);
198 void LangObj::addCollatingSymbol(const StringC &sym)
200 buildData_->syms.insert(sym, charMax);
203 void LangObj::addLevel(const LevelSort &sort)
205 data_->level[data_->levels++] = sort;
// Adds a collating position for `empty`.
// NOTE(review): the declaration of `empty` is not visible here; presumably
// an empty StringC standing for the default entry -- confirm.
208 void LangObj::addDefaultPos()
211 addCollatingPos(empty);
// Places sym at the next collating position.
// A sym known neither as a collating element (ce) nor as a collating
// symbol (syms) is registered in ce as standing for itself.
// NOTE(review): the lines between the test and the insert, and the return
// statements, are not visible here -- confirm when each path applies.
214 bool LangObj::addCollatingPos(const StringC &sym)
216 if (!buildData_->ce.lookup(sym) && !buildData_->syms.lookup(sym))
218 buildData_->ce.insert(sym, sym);
// Record position -> symbol under a one-character key, then advance.
221 buildData_->order.insert(StringC(&buildData_->currentpos, 1), sym);
222 buildData_->currentpos++;
// Adds the weight w for the most recently added collating position
// (currentpos - 1).
// A w known neither as a collating element nor as a collating symbol is
// registered in ce as standing for itself.
// NOTE(review): the lines setting up key (including the use of l) and the
// return statements are not visible here.
226 bool LangObj::addLevelWeight(const Char l, const StringC &w)
228 if (!buildData_->ce.lookup(w) && !buildData_->syms.lookup(w))
230 buildData_->ce.insert(w, w);
235 key[0] = buildData_->currentpos - 1;
// Advance key[2] to the first weight number not yet present in order.
237 for (key[2] = 0; buildData_->order.lookup(key); key[2]++) ;
238 buildData_->order.insert(key, w);
242 void LangObj::addToupper(const Char lc, const Char uc)
244 data_->toupper.setChar(lc, uc);
247 void LangObj::addTolower(const Char uc, const Char lc)
249 data_->tolower.setChar(uc, lc);
// Allocate the build-time tables and the compiled-data holder.
// NOTE(review): the enclosing function header is not visible here.
254 buildData_ = new LangBuildData;
255 data_ = new LangData;
// Release both holders.  The null tests are redundant, since delete on a
// null pointer does nothing.
// NOTE(review): the enclosing function header is not visible here.
260 if (buildData_) delete buildData_;
261 if (data_) delete data_;
// Turns the build-time tables into the lookup tables used for comparison.
// Phase 1 walks the positions 0 .. currentpos-1 and fills syms / collate
// from order; phase 2 walks position x level x weight-number and fills
// data_->weights.
// NOTE(review): several lines (branch conditions, construction of key/val,
// return statements) are not visible here.
264 bool LangObj::compile()
267 StringC key, val, data;
269 const StringC *match, *match2;
// The empty key is given currentpos, one past the last defined position.
271 data_->collate.insert(empty, buildData_->currentpos);
// Phase 1: one pass over all collating positions.
273 for (key[0] = 0; key[0] < buildData_->currentpos; key[0]++) {
274 match = buildData_->order.lookup(key);
277 match2 = buildData_->ce.lookup(*match);
279 buildData_->syms.insert(*match, key[0]);
281 data_->collate.insert(*match2, key[0]);
// Phase 2: for each position and level, visit the stored weights in order.
285 for (data[0] = 0; data[0] < buildData_->currentpos; data[0]++) {
287 for (data[1] = 0; data[1] < levels(); data[1]++) {
290 for (data[2] = 0; buildData_->order.lookup(data); data[2]++) {
291 match = buildData_->order.lookup(data);
294 match2 = buildData_->ce.lookup(*match);
296 col = buildData_->syms.lookup(*match);
298 col = data_->collate.lookup(*match2);
303 data_->weights.insert(key, val);
// Conversion accessor yielding a LanguageObj pointer.
// NOTE(review): the body is not visible here; presumably `return this;`
// -- confirm.
311 LanguageObj *LangObj::asLanguage()
// True when compare() finds no difference between r and s for the level
// count k.
// NOTE(review): the rest of the signature is not visible here.
316 bool LangObj::areEquivalent(const StringC &r, const StringC &s,
319 return (compare(r, s, k) == 0);
// Three-way comparison of rr and ss over at most k levels (and never more
// than levels()).  Both strings are first converted to collating
// elements; then, level by level, their weight strings are compared
// lexicographically, with a string that ends first sorting first.
// Returns -1 or 1 at the first difference.
// NOTE(review): the rest of the signature and the final return are not
// visible here; presumably 0 is returned when no level differs -- confirm.
322 int LangObj::compare(const StringC &rr, const StringC &ss,
325 StringC rc = asCollatingElts(rr);
326 StringC sc = asCollatingElts(ss);
328 for (Char l = 0; (l < k) && (l < levels()); l++) {
// Weight strings of both inputs at this level.
329 StringC r = atLevel(rc, l);
330 StringC s = atLevel(sc, l);
331 for (size_t i = 0; (i < r.size()) || (i < s.size()); i++) {
// Running out of weights first means sorting first.
332 if (i == r.size()) return -1;
333 if (i == s.size()) return 1;
334 if (r[i] < s[i]) return -1;
335 if (r[i] > s[i]) return 1;
// Converts s into a string of collating values by looking up keys in
// data_->collate while scanning s.
// NOTE(review): how key is grown, how matches are appended to res and how
// def_val is applied is not visible here.
341 StringC LangObj::asCollatingElts(const StringC &s) const
343 StringC res, key, empty;
// Default value: the entry stored under the empty key, or charMax if none.
349 def = data_->collate.lookup(empty);
350 def_val = (def == 0) ? charMax : *def;
// The outer loop resumes at j, the index reached by the inner scan.
351 for (i = 0; i < s.size(); i = j) {
354 for (j = i; j < s.size(); j++) {
356 c = data_->collate.lookup(key);
362 // if we get here, s[j] is a single `unknown' char and
363 // we better not reconsider it to avoid an infinite loop.
// Produces the weight string of s for level l, using data_->weights.
// When the level is flagged `backward`, the input and each weight entry
// are traversed from the end; the `position` flag selects additional
// handling inside the weight loops.
// NOTE(review): the statements inside the loops are not visible here.
// NOTE(review): `int j` is compared with w->size(); if size() is unsigned
// this is a signed/unsigned comparison -- confirm it is harmless.
370 StringC LangObj::atLevel(const StringC &s, const Char l) const
372 StringC cols, res, key;
375 if (data_->level[l].backward)
376 for (int i = s.size() - 1; i >= 0; i--)
382 for (size_t i = 0; i < cols.size(); i++) {
384 w = data_->weights.lookup(key);
387 if (data_->level[l].backward)
388 for (int j = w->size() - 1; j >= 0; j--) {
389 if (data_->level[l].position)
394 for (int j = 0; j < w->size(); j++) {
395 if (data_->level[l].position)
403 Char LangObj::toUpper(const Char c) const
405 Char uc = data_->toupper[c];
406 return (uc == charMax) ? c : uc;
409 Char LangObj::toLower(const Char c) const
411 Char lc = data_->tolower[c];
412 return (lc == charMax) ? c : lc;
415 unsigned LangObj::levels() const
417 return data_->levels;
420 bool LangObj::isLess(const StringC &r, const StringC &s) const
422 return (compare(r, s, levels()) < 0);
425 bool LangObj::isLessOrEqual(const StringC &r, const StringC &s) const
427 return (compare(r, s, levels()) <= 0);
430 #ifdef DSSSL_NAMESPACE
436 Notes on the LangObj class:
438 All data is in the classes LangData and
439 LangBuildData. The LangBuildData is only
440 needed until we call LangObj::compile() and
443 Once a LangObj is built, we use the functions isLess(),
444 isLessOrEqual(), toUpper(), toLower() and areEquivalent()
445 to implement the language-dependent functions of the
448 The LangBuildData stores a map symbols --> strings
449 for multi-collating-elements (ce) and a second map
450 positions --> symbols (order). These are incrementally
451 filled up while parsing a define-language expression.
452 order also stores a map
454 positions x levels x weight-numbers --> weights
456 (there may be more than one weight for a given position
457 and level). The empty string is used as key for the
458 default entry in syms.
460 Compilation is done in two phases: In the first phase,
461 we build the syms and collate maps (syms maps
462 symbols --> positions and collate maps
463 multi-collating-elements --> positions). This is achieved
464 by inverting the corresponding part of order.
466 In the next phase, the weights map is built. It maps
467 positions x levels x weight-numbers --> positions.
469 Comparing two strings works in three phases: First,
470 the strings are converted to positions (done by
471 asCollatingElts()), then we are comparing one level
472 at a time. For this, the strings of positions are
473 converted into strings of weights. How this is done
474 depends on the level.
475 Finally the strings of weights are lexicographically
478 Notes on the RefLangObj class:
480 It uses the underlying POSIX locale system and wchar_t. At least
481 with GNU libc, wchar_t is always encoded as ISO-10646 UCS4, thus
482 the Char and wchar_t codes of a character have the same value. But
483 since sizeof(wchar_t) = 4 != 2 = sizeof(Char), we have to do ugly
484 conversions (at least on i386-linux-glibc2) to be able to use the
485 libc wide character routines wcscoll() and wcsxfrm().
487 To create a language by reference to a locale, use the external
488 procedure with pubid UNREGISTERED::OpenJade//Procedure::language
490 (language lang country)
492 where lang and country are symbols or strings like the ones used
493 for the language: and country: characteristics of the paragraph FO.
494 If the locale is not supported, language returns #f. If Jade is
495 compiled without locale support (!SP_HAVE_LOCALE) language *always*
498 The implementation of string-equiv? depends on the fact that
499 wcsxfrm() returns a 0-terminated string of integers where the
500 substrings for each level are separated by 1. This is true for
501 glibc, but I don't know if it is universally true.