guniprop.c

   1 /* guniprop.c - Unicode character properties.
   2  *
   3  * Copyright (C) 1999 Tom Tromey
   4  * Copyright (C) 2000 Red Hat, Inc.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include "glib.h"
  23 #include "gunichartables.h"
  24
  25 #include <config.h>
  26
  27 #include <stddef.h>
  28
  29 #define asize(x)  ((sizeof (x)) / sizeof (x[0]))
  30
  31 #define ATTTABLE(Page, Char) \
  32   ((attr_table[Page] == 0) ? 0 : (attr_table[Page][Char]))
  33
  34 /* We cheat a bit and cast type values to (char *).  We detect these
  35    using the &0xff trick.  */
  36 #define TTYPE(Page, Char) \
  37   (((((int) type_table[Page]) & 0xff) == ((int) type_table[Page])) \
  38    ? ((int) (type_table[Page])) \
  39    : (type_table[Page][Char]))
  40
  41 #define TYPE(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_UNASSIGNED : TTYPE ((Char) >> 8, (Char) & 0xff))
  42
  43 #define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER \
  44                        || (Type) == G_UNICODE_LETTER_NUMBER \
  45                        || (Type) == G_UNICODE_OTHER_NUMBER)
  46
  47 #define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER \
  48                        || (Type) == G_UNICODE_UPPERCASE_LETTER \
  49                        || (Type) == G_UNICODE_TITLECASE_LETTER \
  50                        || (Type) == G_UNICODE_MODIFIER_LETTER \
  51                        || (Type) == G_UNICODE_OTHER_LETTER)
  52
  53 gboolean
  54 g_unichar_isalnum (gunichar c)
  55 {
  56   int t = TYPE (c);
  57   return ISDIGIT (t) || ISALPHA (t);
  58 }
  59
  60 gboolean
  61 g_unichar_isalpha (gunichar c)
  62 {
  63   int t = TYPE (c);
  64   return ISALPHA (t);
  65 }
  66
  67 gboolean
  68 g_unichar_iscntrl (gunichar c)
  69 {
  70   return TYPE (c) == G_UNICODE_CONTROL;
  71 }
  72
  73 gboolean
  74 g_unichar_isdigit (gunichar c)
  75 {
  76   return TYPE (c) == G_UNICODE_DECIMAL_NUMBER;
  77 }
  78
  79 gboolean
  80 g_unichar_isgraph (gunichar c)
  81 {
  82   int t = TYPE (c);
  83   return (t != G_UNICODE_CONTROL
  84           && t != G_UNICODE_FORMAT
  85           && t != G_UNICODE_UNASSIGNED
  86           && t != G_UNICODE_PRIVATE_USE
  87           && t != G_UNICODE_SURROGATE
  88           && t != G_UNICODE_SPACE_SEPARATOR);
  89 }
  90
  91 gboolean
  92 g_unichar_islower (gunichar c)
  93 {
  94   return TYPE (c) == G_UNICODE_LOWERCASE_LETTER;
  95 }
  96
  97 gboolean
  98 g_unichar_isprint (gunichar c)
  99 {
 100   int t = TYPE (c);
 101   return (t != G_UNICODE_CONTROL
 102           && t != G_UNICODE_FORMAT
 103           && t != G_UNICODE_UNASSIGNED
 104           && t != G_UNICODE_PRIVATE_USE
 105           && t != G_UNICODE_SURROGATE);
 106 }
 107
 108 gboolean
 109 g_unichar_ispunct (gunichar c)
 110 {
 111   int t = TYPE (c);
 112   return (t == G_UNICODE_CONNECT_PUNCTUATION || t == G_UNICODE_DASH_PUNCTUATION
 113           || t == G_UNICODE_CLOSE_PUNCTUATION || t == G_UNICODE_FINAL_PUNCTUATION
 114           || t == G_UNICODE_INITIAL_PUNCTUATION || t == G_UNICODE_OTHER_PUNCTUATION
 115           || t == G_UNICODE_OPEN_PUNCTUATION);
 116 }
 117
 118 gboolean
 119 g_unichar_isspace (gunichar c)
 120 {
 121   switch (c)
 122     {
 123       /* special-case these since Unicode thinks they are not spaces */
 124     case '\t':
 125     case '\n':
 126     case '\r':
 127     case '\f':
 128     case '\v': /* vertical tab - as if anyone has ever used this... */
 129       return TRUE;
 130       break;
 131
 132     default:
 133       {
 134         int t = TYPE (c);
 135         return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR
 136                 || t == G_UNICODE_PARAGRAPH_SEPARATOR);
 137       }
 138       break;
 139     }
 140 }
 141
 142 /**
 143  * g_unichar_isupper:
 144  * @c: a unicode character
 145  *
 146  * Determines if a character is uppercase.
 147  *
 148  * Return value:
 149  **/
 150 gboolean
 151 g_unichar_isupper (gunichar c)
 152 {
 153   return TYPE (c) == G_UNICODE_UPPERCASE_LETTER;
 154 }
 155
 156 /**
 157  * g_unichar_istitle:
 158  * @c: a unicode character
 159  *
 160  * Determines if a character is titlecase. Some characters in
 161  * Unicode which are composites, such as the DZ digraph
 162  * have three case variants instead of just two. The titlecase
 163  * form is used at the beginning of a word where only the
 164  * first letter is capitalized. The titlecase form of the DZ
 165  * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z
 166  *
 167  * Return value: %TRUE if the character is titlecase.
 168  **/
 169 gboolean
 170 g_unichar_istitle (gunichar c)
 171 {
 172   unsigned int i;
 173   for (i = 0; i < asize (title_table); ++i)
 174     if (title_table[i][0] == c)
 175       return 1;
 176   return 0;
 177 }
 178
 179 /**
 180  * g_unichar_isxdigit:
 181  * @c: a unicode character.
 182  *
 183  * Determines if a characters is a hexidecimal digit
 184  *
 185  * Return value: %TRUE if the character is a hexidecimal digit.
 186  **/
 187 gboolean
 188 g_unichar_isxdigit (gunichar c)
 189 {
 190   int t = TYPE (c);
 191   return ((c >= 'a' && c <= 'f')
 192           || (c >= 'A' && c <= 'F')
 193           || ISDIGIT (t));
 194 }
 195
 196 /**
 197  * g_unichar_isdefined:
 198  * @c: a unicode character
 199  *
 200  * Determines if a given character is assigned in the Unicode
 201  * standard
 202  *
 203  * Return value: %TRUE if the character has an assigned value.
 204  **/
 205 gboolean
 206 g_unichar_isdefined (gunichar c)
 207 {
 208   int t = TYPE (c);
 209   return t != G_UNICODE_UNASSIGNED;
 210 }
 211
 212 /**
 213  * g_unichar_iswide:
 214  * @c: a unicode character
 215  *
 216  * Determines if a character is typically rendered in a double-width
 217  * cell.
 218  *
 219  * Return value: %TRUE if the character is wide.
 220  **/
 221 /* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>.  */
 222 gboolean
 223 g_unichar_iswide (gunichar c)
 224 {
 225   if (c < 0x1100)
 226     return 0;
 227
 228   return ((c >= 0x1100 && c <= 0x115f)     /* Hangul Jamo */
 229           || (c >= 0x2e80 && c <= 0xa4cf && (c & ~0x0011) != 0x300a &&
 230               c != 0x303f)                 /* CJK ... Yi */
 231           || (c >= 0xac00 && c <= 0xd7a3)  /* Hangul Syllables */
 232           || (c >= 0xf900 && c <= 0xfaff)  /* CJK Compatibility Ideographs */
 233           || (c >= 0xfe30 && c <= 0xfe6f)  /* CJK Compatibility Forms */
 234           || (c >= 0xff00 && c <= 0xff5f)  /* Fullwidth Forms */
 235           || (c >= 0xffe0 && c <= 0xffe6));
 236 }
 237
 238 /**
 239  * g_unichar_toupper:
 240  * @c: a unicode character
 241  *
 242  * Convert a character to uppercase.
 243  *
 244  * Return value: the result of converting @c to uppercase.
 245  *               If @c is not an lowercase or titlecase character,
 246  *               @c is returned unchanged.
 247  **/
 248 gunichar
 249 g_unichar_toupper (gunichar c)
 250 {
 251   int t = TYPE (c);
 252   if (t == G_UNICODE_LOWERCASE_LETTER)
 253     return ATTTABLE (c >> 8, c & 0xff);
 254   else if (t == G_UNICODE_TITLECASE_LETTER)
 255     {
 256       unsigned int i;
 257       for (i = 0; i < asize (title_table); ++i)
 258         {
 259           if (title_table[i][0] == c)
 260             return title_table[i][1];
 261         }
 262     }
 263   return c;
 264 }
 265
 266 /**
 267  * g_unichar_tolower:
 268  * @c: a unicode character.
 269  *
 270  * Convert a character to lower case
 271  *
 272  * Return value: the result of converting @c to lower case.
 273  *               If @c is not an upperlower or titlecase character,
 274  *               @c is returned unchanged.
 275  **/
 276 gunichar
 277 g_unichar_tolower (gunichar c)
 278 {
 279   int t = TYPE (c);
 280   if (t == G_UNICODE_UPPERCASE_LETTER)
 281     return ATTTABLE (c >> 8, c & 0xff);
 282   else if (t == G_UNICODE_TITLECASE_LETTER)
 283     {
 284       unsigned int i;
 285       for (i = 0; i < asize (title_table); ++i)
 286         {
 287           if (title_table[i][0] == c)
 288             return title_table[i][2];
 289         }
 290     }
 291   return c;
 292 }
 293
 294 /**
 295  * g_unichar_totitle:
 296  * @c: a unicode character
 297  *
 298  * Convert a character to the titlecase
 299  *
 300  * Return value: the result of converting @c to titlecase.
 301  *               If @c is not an uppercase or lowercase character,
 302  *               @c is returned unchanged.
 303  **/
 304 gunichar
 305 g_unichar_totitle (gunichar c)
 306 {
 307   unsigned int i;
 308   for (i = 0; i < asize (title_table); ++i)
 309     {
 310       if (title_table[i][0] == c || title_table[i][1] == c
 311           || title_table[i][2] == c)
 312         return title_table[i][0];
 313     }
 314   return (TYPE (c) == G_UNICODE_LOWERCASE_LETTER
 315           ? ATTTABLE (c >> 8, c & 0xff)
 316           : c);
 317 }
 318
 319 /**
 320  * g_unichar_xdigit_value:
 321  * @c: a unicode character
 322  *
 323  * Determines the numeric value of a character as a decimal
 324  * degital.
 325  *
 326  * Return value: If @c is a decimal digit (according to
 327  * `g_unichar_isdigit'), its numeric value. Otherwise, -1.
 328  **/
 329 int
 330 g_unichar_digit_value (gunichar c)
 331 {
 332   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 333     return ATTTABLE (c >> 8, c & 0xff);
 334   return -1;
 335 }
 336
 337 /**
 338  * g_unichar_xdigit_value:
 339  * @c: a unicode character
 340  *
 341  * Determines the numeric value of a character as a hexidecimal
 342  * degital.
 343  *
 344  * Return value: If @c is a hex digit (according to
 345  * `g_unichar_isxdigit'), its numeric value. Otherwise, -1.
 346  **/
 347 int
 348 g_unichar_xdigit_value (gunichar c)
 349 {
 350   if (c >= 'A' && c <= 'F')
 351     return c - 'A' + 1;
 352   if (c >= 'a' && c <= 'f')
 353     return c - 'a' + 1;
 354   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 355     return ATTTABLE (c >> 8, c & 0xff);
 356   return -1;
 357 }
 358
 359 /**
 360  * g_unichar_type:
 361  * @c: a unicode character
 362  *
 363  * Classifies a unicode character by type.
 364  *
 365  * Return value: the typ of the character.
 366  **/
 367 GUnicodeType
 368 g_unichar_type (gunichar c)
 369 {
 370   return TYPE (c);
 371 }