glib/guniprop.c

   1 /* guniprop.c - Unicode character properties.
   2  *
   3  * Copyright (C) 1999 Tom Tromey
   4  * Copyright (C) 2000 Red Hat, Inc.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include "glib.h"
  23 #include "gunichartables.h"
  24
  25 #include <config.h>
  26
  27 #include <stddef.h>
  28
  29 #define asize(x)  ((sizeof (x)) / sizeof (x[0]))
  30
  31 #define ATTTABLE(Page, Char) \
  32   ((attr_table[Page] == 0) ? 0 : (attr_table[Page][Char]))
  33
  34 /* We cheat a bit and cast type values to (char *).  We detect these
  35    using the &0xff trick.  */
  36 #define TTYPE(Page, Char) \
  37   (((GPOINTER_TO_INT(type_table[Page]) & 0xff) == GPOINTER_TO_INT(type_table[Page])) \
  38    ? GPOINTER_TO_INT(type_table[Page]) \
  39    : (type_table[Page][Char]))
  40
  41 #define TYPE(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_UNASSIGNED : TTYPE ((Char) >> 8, (Char) & 0xff))
  42
  43 #define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER \
  44                        || (Type) == G_UNICODE_LETTER_NUMBER \
  45                        || (Type) == G_UNICODE_OTHER_NUMBER)
  46
  47 #define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER \
  48                        || (Type) == G_UNICODE_UPPERCASE_LETTER \
  49                        || (Type) == G_UNICODE_TITLECASE_LETTER \
  50                        || (Type) == G_UNICODE_MODIFIER_LETTER \
  51                        || (Type) == G_UNICODE_OTHER_LETTER)
  52
  53 /**
  54  * g_unichar_isalnum:
  55  * @c: a Unicode character
  56  *
  57  * Determines whether a character is alphanumeric.
  58  * Given some UTF-8 text, obtain a character value
  59  * with g_utf8_get_char().
  60  *
  61  * Return value: %TRUE if @c is an alphanumeric character
  62  **/
  63 gboolean
  64 g_unichar_isalnum (gunichar c)
  65 {
  66   int t = TYPE (c);
  67   return ISDIGIT (t) || ISALPHA (t);
  68 }
  69
  70 /**
  71  * g_unichar_isalpha:
  72  * @c: a Unicode character
  73  *
  74  * Determines whether a character is alphabetic (i.e. a letter).
  75  * Given some UTF-8 text, obtain a character value with
  76  * g_utf8_get_char().
  77  *
  78  * Return value: %TRUE if @c is an alphabetic character
  79  **/
  80 gboolean
  81 g_unichar_isalpha (gunichar c)
  82 {
  83   int t = TYPE (c);
  84   return ISALPHA (t);
  85 }
  86
  87
  88 /**
  89  * g_unichar_iscntrl:
  90  * @c: a Unicode character
  91  *
  92  * Determines whether a character is a control character.
  93  * Given some UTF-8 text, obtain a character value with
  94  * g_utf8_get_char().
  95  *
  96  * Return value: %TRUE if @c is a control character
  97  **/
  98 gboolean
  99 g_unichar_iscntrl (gunichar c)
 100 {
 101   return TYPE (c) == G_UNICODE_CONTROL;
 102 }
 103
 104 /**
 105  * g_unichar_isdigit:
 106  * @c: a Unicode character
 107  *
 108  * Determines whether a character is numeric (i.e. a digit).  This
 109  * covers ASCII 0-9 and also digits in other languages/scripts.  Given
 110  * some UTF-8 text, obtain a character value with g_utf8_get_char().
 111  *
 112  * Return value: %TRUE if @c is a digit
 113  **/
 114 gboolean
 115 g_unichar_isdigit (gunichar c)
 116 {
 117   return TYPE (c) == G_UNICODE_DECIMAL_NUMBER;
 118 }
 119
 120
 121 /**
 122  * g_unichar_isgraph:
 123  * @c: a Unicode character
 124  *
 125  * Determines whether a character is printable and not a space
 126  * (returns %FALSE for control characters, format characters, and
 127  * spaces). g_unichar_isprint() is similar, but returns %TRUE for
 128  * spaces. Given some UTF-8 text, obtain a character value with
 129  * g_utf8_get_char().
 130  *
 131  * Return value: %TRUE if @c is printable unless it's a space
 132  **/
 133 gboolean
 134 g_unichar_isgraph (gunichar c)
 135 {
 136   int t = TYPE (c);
 137   return (t != G_UNICODE_CONTROL
 138           && t != G_UNICODE_FORMAT
 139           && t != G_UNICODE_UNASSIGNED
 140           && t != G_UNICODE_PRIVATE_USE
 141           && t != G_UNICODE_SURROGATE
 142           && t != G_UNICODE_SPACE_SEPARATOR);
 143 }
 144
 145 /**
 146  * g_unichar_islower:
 147  * @c: a Unicode character
 148  *
 149  * Determines whether a character is a lowercase letter.
 150  * Given some UTF-8 text, obtain a character value with
 151  * g_utf8_get_char().
 152  *
 153  * Return value: %TRUE if @c is a lowercase letter
 154  **/
 155 gboolean
 156 g_unichar_islower (gunichar c)
 157 {
 158   return TYPE (c) == G_UNICODE_LOWERCASE_LETTER;
 159 }
 160
 161
 162 /**
 163  * g_unichar_isprint:
 164  * @c: a Unicode character
 165  *
 166  * Determines whether a character is printable.
 167  * Unlike g_unichar_isgraph(), returns %TRUE for spaces.
 168  * Given some UTF-8 text, obtain a character value with
 169  * g_utf8_get_char().
 170  *
 171  * Return value: %TRUE if @c is printable
 172  **/
 173 gboolean
 174 g_unichar_isprint (gunichar c)
 175 {
 176   int t = TYPE (c);
 177   return (t != G_UNICODE_CONTROL
 178           && t != G_UNICODE_FORMAT
 179           && t != G_UNICODE_UNASSIGNED
 180           && t != G_UNICODE_PRIVATE_USE
 181           && t != G_UNICODE_SURROGATE);
 182 }
 183
 184 /**
 185  * g_unichar_ispunct:
 186  * @c: a Unicode character
 187  *
 188  * Determines whether a character is punctuation.
 189  * Given some UTF-8 text, obtain a character value with
 190  * g_utf8_get_char().
 191  *
 192  * Return value: %TRUE if @c is a punctuation character
 193  **/
 194 gboolean
 195 g_unichar_ispunct (gunichar c)
 196 {
 197   int t = TYPE (c);
 198   return (t == G_UNICODE_CONNECT_PUNCTUATION || t == G_UNICODE_DASH_PUNCTUATION
 199           || t == G_UNICODE_CLOSE_PUNCTUATION || t == G_UNICODE_FINAL_PUNCTUATION
 200           || t == G_UNICODE_INITIAL_PUNCTUATION || t == G_UNICODE_OTHER_PUNCTUATION
 201           || t == G_UNICODE_OPEN_PUNCTUATION);
 202 }
 203
 204 /**
 205  * g_unichar_isspace:
 206  * @c: a Unicode character
 207  *
 208  * Determines whether a character is a space, tab, or line separator
 209  * (newline, carriage return, etc.).  Given some UTF-8 text, obtain a
 210  * character value with g_utf8_get_char().
 211  *
 212  * (Note: don't use this to do word breaking; you have to use
 213  * Pango or equivalent to get word breaking right, the algorithm
 214  * is fairly complex.)
 215  *
 216  * Return value: %TRUE if @c is a punctuation character
 217  **/
 218 gboolean
 219 g_unichar_isspace (gunichar c)
 220 {
 221   switch (c)
 222     {
 223       /* special-case these since Unicode thinks they are not spaces */
 224     case '\t':
 225     case '\n':
 226     case '\r':
 227     case '\f':
 228     case '\v': /* vertical tab - as if anyone has ever used this... */
 229       return TRUE;
 230       break;
 231
 232     default:
 233       {
 234         int t = TYPE (c);
 235         return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR
 236                 || t == G_UNICODE_PARAGRAPH_SEPARATOR);
 237       }
 238       break;
 239     }
 240 }
 241
 242 /**
 243  * g_unichar_isupper:
 244  * @c: a unicode character
 245  *
 246  * Determines if a character is uppercase.
 247  *
 248  * Return value: %TRUE if @c is an uppercase character.
 249  **/
 250 gboolean
 251 g_unichar_isupper (gunichar c)
 252 {
 253   return TYPE (c) == G_UNICODE_UPPERCASE_LETTER;
 254 }
 255
 256 /**
 257  * g_unichar_istitle:
 258  * @c: a unicode character
 259  *
 260  * Determines if a character is titlecase. Some characters in
 261  * Unicode which are composites, such as the DZ digraph
 262  * have three case variants instead of just two. The titlecase
 263  * form is used at the beginning of a word where only the
 264  * first letter is capitalized. The titlecase form of the DZ
 265  * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z
 266  *
 267  * Return value: %TRUE if the character is titlecase.
 268  **/
 269 gboolean
 270 g_unichar_istitle (gunichar c)
 271 {
 272   unsigned int i;
 273   for (i = 0; i < asize (title_table); ++i)
 274     if (title_table[i][0] == c)
 275       return 1;
 276   return 0;
 277 }
 278
 279 /**
 280  * g_unichar_isxdigit:
 281  * @c: a unicode character.
 282  *
 283  * Determines if a characters is a hexidecimal digit
 284  *
 285  * Return value: %TRUE if the character is a hexadecimal digit.
 286  **/
 287 gboolean
 288 g_unichar_isxdigit (gunichar c)
 289 {
 290   int t = TYPE (c);
 291   return ((c >= 'a' && c <= 'f')
 292           || (c >= 'A' && c <= 'F')
 293           || ISDIGIT (t));
 294 }
 295
 296 /**
 297  * g_unichar_isdefined:
 298  * @c: a unicode character
 299  *
 300  * Determines if a given character is assigned in the Unicode
 301  * standard.
 302  *
 303  * Return value: %TRUE if the character has an assigned value.
 304  **/
 305 gboolean
 306 g_unichar_isdefined (gunichar c)
 307 {
 308   int t = TYPE (c);
 309   return t != G_UNICODE_UNASSIGNED;
 310 }
 311
 312 /**
 313  * g_unichar_iswide:
 314  * @c: a unicode character
 315  *
 316  * Determines if a character is typically rendered in a double-width
 317  * cell.
 318  *
 319  * Return value: %TRUE if the character is wide.
 320  **/
 321 /* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>.  */
 322 gboolean
 323 g_unichar_iswide (gunichar c)
 324 {
 325   if (c < 0x1100)
 326     return 0;
 327
 328   return ((c >= 0x1100 && c <= 0x115f)     /* Hangul Jamo */
 329           || (c >= 0x2e80 && c <= 0xa4cf && (c & ~0x0011) != 0x300a &&
 330               c != 0x303f)                 /* CJK ... Yi */
 331           || (c >= 0xac00 && c <= 0xd7a3)  /* Hangul Syllables */
 332           || (c >= 0xf900 && c <= 0xfaff)  /* CJK Compatibility Ideographs */
 333           || (c >= 0xfe30 && c <= 0xfe6f)  /* CJK Compatibility Forms */
 334           || (c >= 0xff00 && c <= 0xff5f)  /* Fullwidth Forms */
 335           || (c >= 0xffe0 && c <= 0xffe6));
 336 }
 337
 338 /**
 339  * g_unichar_toupper:
 340  * @c: a unicode character
 341  *
 342  * Convert a character to uppercase.
 343  *
 344  * Return value: the result of converting @c to uppercase.
 345  *               If @c is not an lowercase or titlecase character,
 346  *               @c is returned unchanged.
 347  **/
 348 gunichar
 349 g_unichar_toupper (gunichar c)
 350 {
 351   int t = TYPE (c);
 352   if (t == G_UNICODE_LOWERCASE_LETTER)
 353     return ATTTABLE (c >> 8, c & 0xff);
 354   else if (t == G_UNICODE_TITLECASE_LETTER)
 355     {
 356       unsigned int i;
 357       for (i = 0; i < asize (title_table); ++i)
 358         {
 359           if (title_table[i][0] == c)
 360             return title_table[i][1];
 361         }
 362     }
 363   return c;
 364 }
 365
 366 /**
 367  * g_unichar_tolower:
 368  * @c: a unicode character.
 369  *
 370  * Convert a character to lower case
 371  *
 372  * Return value: the result of converting @c to lower case.
 373  *               If @c is not an upperlower or titlecase character,
 374  *               @c is returned unchanged.
 375  **/
 376 gunichar
 377 g_unichar_tolower (gunichar c)
 378 {
 379   int t = TYPE (c);
 380   if (t == G_UNICODE_UPPERCASE_LETTER)
 381     return ATTTABLE (c >> 8, c & 0xff);
 382   else if (t == G_UNICODE_TITLECASE_LETTER)
 383     {
 384       unsigned int i;
 385       for (i = 0; i < asize (title_table); ++i)
 386         {
 387           if (title_table[i][0] == c)
 388             return title_table[i][2];
 389         }
 390     }
 391   return c;
 392 }
 393
 394 /**
 395  * g_unichar_totitle:
 396  * @c: a unicode character
 397  *
 398  * Convert a character to the titlecase
 399  *
 400  * Return value: the result of converting @c to titlecase.
 401  *               If @c is not an uppercase or lowercase character,
 402  *               @c is returned unchanged.
 403  **/
 404 gunichar
 405 g_unichar_totitle (gunichar c)
 406 {
 407   unsigned int i;
 408   for (i = 0; i < asize (title_table); ++i)
 409     {
 410       if (title_table[i][0] == c || title_table[i][1] == c
 411           || title_table[i][2] == c)
 412         return title_table[i][0];
 413     }
 414   return (TYPE (c) == G_UNICODE_LOWERCASE_LETTER
 415           ? ATTTABLE (c >> 8, c & 0xff)
 416           : c);
 417 }
 418
 419 /**
 420  * g_unichar_digit_value:
 421  * @c: a unicode character
 422  *
 423  * Determines the numeric value of a character as a decimal
 424  * degital.
 425  *
 426  * Return value: If @c is a decimal digit (according to
 427  * `g_unichar_isdigit'), its numeric value. Otherwise, -1.
 428  **/
 429 int
 430 g_unichar_digit_value (gunichar c)
 431 {
 432   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 433     return ATTTABLE (c >> 8, c & 0xff);
 434   return -1;
 435 }
 436
 437 /**
 438  * g_unichar_xdigit_value:
 439  * @c: a unicode character
 440  *
 441  * Determines the numeric value of a character as a hexidecimal
 442  * degital.
 443  *
 444  * Return value: If @c is a hex digit (according to
 445  * `g_unichar_isxdigit'), its numeric value. Otherwise, -1.
 446  **/
 447 int
 448 g_unichar_xdigit_value (gunichar c)
 449 {
 450   if (c >= 'A' && c <= 'F')
 451     return c - 'A' + 1;
 452   if (c >= 'a' && c <= 'f')
 453     return c - 'a' + 1;
 454   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 455     return ATTTABLE (c >> 8, c & 0xff);
 456   return -1;
 457 }
 458
 459 /**
 460  * g_unichar_type:
 461  * @c: a unicode character
 462  *
 463  * Classifies a unicode character by type.
 464  *
 465  * Return value: the type of the character.
 466  **/
 467 GUnicodeType
 468 g_unichar_type (gunichar c)
 469 {
 470   return TYPE (c);
 471 }