lib/localeinfo.c

   1 /* locale information
   2
   3    Copyright 2016 Free Software Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
  18    02110-1301, USA.  */
  19
  20 /* Written by Paul Eggert.  */
  21
  22 #include <config.h>
  23
  24 #include <localeinfo.h>
  25
  26 #include <verify.h>
  27
  28 #include <limits.h>
  29 #include <locale.h>
  30 #include <stdlib.h>
  31 #include <string.h>
  32 #include <wctype.h>
  33
  34 /* The sbclen implementation relies on this.  */
  35 verify (MB_LEN_MAX <= SCHAR_MAX);
  36
  37 /* Return true if the locale uses UTF-8.  */
  38
  39 static bool
  40 is_using_utf8 (void)
  41 {
  42   wchar_t wc;
  43   mbstate_t mbs = {0};
  44   return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
  45 }
  46
  47 /* Initialize *LOCALEINFO from the current locale.  */
  48
  49 void
  50 init_localeinfo (struct localeinfo *localeinfo)
  51 {
  52   int i;
  53
  54   localeinfo->multibyte = MB_CUR_MAX > 1;
  55   localeinfo->using_utf8 = is_using_utf8 ();
  56
  57   for (i = CHAR_MIN; i <= CHAR_MAX; i++)
  58     {
  59       char c = i;
  60       unsigned char uc = i;
  61       mbstate_t s = {0};
  62       wchar_t wc;
  63       size_t len = mbrtowc (&wc, &c, 1, &s);
  64       localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
  65       localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
  66     }
  67 }
  68
  69 /* The set of wchar_t values C such that there's a useful locale
  70    somewhere where C != towupper (C) && C != towlower (towupper (C)).
  71    For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
  72    towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
  73    towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
  74 static short const lonesome_lower[] =
  75   {
  76     0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
  77     0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
  78
  79     /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
  80        counterpart in locales predating Unicode 4.0.0 (April 2003).  */
  81     0x03F2,
  82
  83     0x03F5, 0x1E9B, 0x1FBE,
  84   };
  85
  86 /* Verify that the worst case fits.  This is 1 for towupper, 1 for
  87    towlower, and 1 for each entry in LONESOME_LOWER.  */
  88 verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
  89         <= CASE_FOLDED_BUFSIZE);
  90
  91 /* Find the characters equal to C after case-folding, other than C
  92    itself, and store them into FOLDED.  Return the number of characters
  93    stored.  */
  94
  95 int
  96 case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
  97 {
  98   int i;
  99   int n = 0;
 100   wint_t uc = towupper (c);
 101   wint_t lc = towlower (uc);
 102   if (uc != c)
 103     folded[n++] = uc;
 104   if (lc != uc && lc != c && towupper (lc) == uc)
 105     folded[n++] = lc;
 106   for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
 107     {
 108       wint_t li = lonesome_lower[i];
 109       if (li != lc && li != uc && li != c && towupper (li) == uc)
 110         folded[n++] = li;
 111     }
 112   return n;
 113 }