1 // Character.java - Character class.
3 /* Copyright (C) 1998, 1999, 2000 Free Software Foundation
5 This file is part of libgcj.
7 This software is copyrighted work licensed under the terms of the
8 Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
13 import java.io.Serializable;
16 * @author Tom Tromey <tromey@cygnus.com>
17 * @date September 10, 1998
20 /* Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3
21 * "The Java Language Specification", ISBN 0-201-63451-1,
22 * online API docs for JDK 1.2 beta from http://www.javasoft.com,
23 * and The Unicode Standard Version 2.0.
24 * Status: Believed complete and correct for JDK 1.1; 1.2 methods
28 public final class Character implements Serializable, Comparable
// Bounds of the char type: the smallest and largest char values.
30 public static final char MIN_VALUE = '\u0000';
31 public static final char MAX_VALUE = '\uffff';
// Smallest and largest radix accepted by digit() and forDigit();
// both methods test their radix argument against these limits.
33 public static final int MIN_RADIX = 2;
34 public static final int MAX_RADIX = 36;
// The Class object for the primitive type char.
36 // This initialization is seemingly circular, but it is accepted
37 // by javac, and is handled specially by gcc.
38 public static final Class TYPE = char.class;
// Character category codes.  Methods in this class compare the
// result of getType() against these constants.  The values are not
// contiguous: none of the constants below is assigned 17.

// Separator categories.
41 public static final byte SPACE_SEPARATOR = 12;
42 public static final byte LINE_SEPARATOR = 13;
43 public static final byte PARAGRAPH_SEPARATOR = 14;
// Letter categories (the five tested by isLetter()).
46 public static final byte UPPERCASE_LETTER = 1;
47 public static final byte LOWERCASE_LETTER = 2;
48 public static final byte TITLECASE_LETTER = 3;
49 public static final byte MODIFIER_LETTER = 4;
50 public static final byte OTHER_LETTER = 5;
// Number categories.
53 public static final byte DECIMAL_DIGIT_NUMBER = 9;
54 public static final byte LETTER_NUMBER = 10;
55 public static final byte OTHER_NUMBER = 11;
// Mark categories.
58 public static final byte NON_SPACING_MARK = 6;
59 public static final byte ENCLOSING_MARK = 7;
60 public static final byte COMBINING_SPACING_MARK = 8;
// Punctuation categories.
63 public static final byte DASH_PUNCTUATION = 20;
64 public static final byte START_PUNCTUATION = 21;
65 public static final byte END_PUNCTUATION = 22;
66 public static final byte CONNECTOR_PUNCTUATION = 23;
67 public static final byte OTHER_PUNCTUATION = 24;
// Symbol categories.
70 public static final byte MATH_SYMBOL = 25;
71 public static final byte CURRENCY_SYMBOL = 26;
72 public static final byte MODIFIER_SYMBOL = 27;
73 public static final byte OTHER_SYMBOL = 28;
// Control and format categories.
76 public static final byte CONTROL = 15;
77 // Note: The JCL book says that both FORMAT and PRIVATE_USE are 18.
78 // However, FORMAT is actually 16.
79 public static final byte FORMAT = 16;
// Remaining categories.  UNASSIGNED (0) is the value isDefined()
// tests for.
82 public static final byte UNASSIGNED = 0;
83 public static final byte PRIVATE_USE = 18;
84 public static final byte SURROGATE = 19;
// Serialization version identifier; this class implements
// Serializable.  NOTE(review): presumably chosen to match the
// reference implementation's value -- confirm before changing.
86 private static final long serialVersionUID = 3786198910865385080L;
88 public Character (char ch)
93 public char charValue ()
// Native helper shared by digit() and isDigit().
98 // See if a character is a digit. If so, return the corresponding
99 // value. Otherwise return -1.
100 private static native int digit_value (char ch);
// Return the numeric value of CH when interpreted in RADIX.
// The radix is checked against MIN_RADIX and MAX_RADIX, the value
// is looked up with digit_value(), and the ASCII ranges 'A'-'Z' and
// 'a'-'z' are tested separately.  A value that is not smaller than
// RADIX yields -1.
// NOTE(review): several statements of this method (the result of a
// failed radix check, the condition guarding the letter tests and
// the letter arithmetic) are not visible in this excerpt; confirm
// against the full source before relying on the details.
102 public static int digit (char ch, int radix)
104 if (radix < MIN_RADIX || radix > MAX_RADIX)
107 int d = digit_value (ch);
110 if (ch >= 'A' && ch <= 'Z')
112 else if (ch >= 'a' && ch <= 'z')
// Digits that exist but do not fit the requested radix are rejected.
117 return d >= radix ? -1 : d;
// Compare this object with OBJ.  When OBJ is a Character the result
// is whether both wrap the same value.
// NOTE(review): the result for any other argument is not visible in
// this excerpt.
120 public boolean equals (Object obj)
122 // Don't need to compare OBJ to null as instanceof will do this.
123 if (obj instanceof Character)
124 return value == ((Character) obj).value;
// Return the character that represents digit D in radix RDX.
// The arguments are checked first: D must satisfy 0 <= D < RDX and
// RDX must lie between MIN_RADIX and MAX_RADIX.  Valid digits map
// either onto '0' + D or onto 'a' + D - 10.
// NOTE(review): the value produced when the check fails, and the
// condition that selects between the two results (presumably
// D < 10), are not visible in this excerpt -- confirm.
128 public static char forDigit (int d, int rdx)
130 if (d < 0 || d >= rdx || rdx < MIN_RADIX || rdx > MAX_RADIX)
133 return (char) ('0' + d);
134 // The Java Language Spec says to use lowercase, while the JCL
135 // says to use uppercase. We go with the former.
136 return (char) ('a' + d - 10);
// Native lookups; their behavior is defined by the native
// implementation, which is not part of this file.
// getType()'s result is what the predicates in this class compare
// against the category constants (UNASSIGNED, UPPERCASE_LETTER, ...).
139 public static native int getNumericValue (char ch);
140 public static native int getType (char ch);
142 public int hashCode ()
// Return true when getType() reports any category other than
// UNASSIGNED for CH.
147 public static boolean isDefined (char ch)
149 return getType (ch) != UNASSIGNED;
// Return true when digit_value() finds a digit value for CH, i.e.
// when its result is not the -1 "not a digit" marker.
152 public static boolean isDigit (char ch)
154 return digit_value (ch) != -1;
// Return true when CH falls in one of the hard-coded ranges of
// characters that may be ignored inside an identifier.
// NOTE(review): the end of the boolean expression is not visible in
// this excerpt, so further alternatives may follow the last range
// shown here.
157 // The JCL book says that the argument here is a Character. That is
159 public static boolean isIdentifierIgnorable (char ch)
161 // This information comes from the Unicode Standard. It isn't
162 // auto-generated as it doesn't appear in the unidata table.
163 return ((ch >= '\u0000' && ch <= '\u0008')
164 || (ch >= '\u000e' && ch <= '\u001b')
165 // JDK 1.2 docs say that these are ignorable. The Unicode
166 // Standard is somewhat ambiguous on this issue.
167 || (ch >= '\u007f' && ch <= '\u009f')
168 || (ch >= '\u200c' && ch <= '\u200f')
169 // JCL says 200a through 200e, but that is a typo. The
170 // Unicode standard says the bidi controls are 202a
172 || (ch >= '\u202a' && ch <= '\u202e')
173 || (ch >= '\u206a' && ch <= '\u206f')
// Return true when C lies in one of the two control ranges:
// U+0000 through U+001F, or U+007F through U+009F.
177 public static boolean isISOControl (char c)
179 return ((c >= '\u0000' && c <= '\u001f')
180 || (c >= '\u007f' && c <= '\u009f'));
// Decide whether CH may appear in a Java identifier after the first
// character.  Ignorable characters and digits are tested first.
// NOTE(review): the statement executed when that test succeeds is
// not visible in this excerpt (presumably it returns true -- confirm).
// Otherwise the result is true when getType() reports a combining
// spacing mark, non-spacing mark, currency symbol, connector
// punctuation, one of the five letter categories, or a letter number.
183 public static boolean isJavaIdentifierPart (char ch)
185 if (isIdentifierIgnorable (ch) || isDigit (ch))
187 int type = getType (ch);
188 return (type == COMBINING_SPACING_MARK || type == NON_SPACING_MARK
189 || type == CURRENCY_SYMBOL || type == CONNECTOR_PUNCTUATION
190 || type == UPPERCASE_LETTER || type == LOWERCASE_LETTER
191 || type == TITLECASE_LETTER || type == MODIFIER_LETTER
192 || type == OTHER_LETTER || type == LETTER_NUMBER);
// Decide whether CH may begin a Java identifier: true when
// getType() reports a currency symbol, connector punctuation, or
// one of the five letter categories.  Unlike isJavaIdentifierPart(),
// LETTER_NUMBER is not accepted here.
195 public static boolean isJavaIdentifierStart (char ch)
197 int type = getType (ch);
198 return (type == CURRENCY_SYMBOL || type == CONNECTOR_PUNCTUATION
199 || type == UPPERCASE_LETTER || type == LOWERCASE_LETTER
200 || type == TITLECASE_LETTER || type == MODIFIER_LETTER
201 || type == OTHER_LETTER);
// Return true for '$', '_' and any character accepted by isLetter().
204 // Deprecated in 1.2.
205 public static boolean isJavaLetter (char ch)
207 return ch == '$' || ch == '_' || isLetter (ch);
// Return true for '$', '_' and any character accepted by
// isLetterOrDigit().
210 // Deprecated in 1.2.
211 public static boolean isJavaLetterOrDigit (char ch)
213 return ch == '$' || ch == '_' || isLetterOrDigit (ch);
// Return true when getType() reports one of the five letter
// categories: uppercase, lowercase, titlecase, modifier or other.
216 public static boolean isLetter (char ch)
218 int type = getType (ch);
219 return (type == UPPERCASE_LETTER || type == LOWERCASE_LETTER
220 || type == TITLECASE_LETTER || type == MODIFIER_LETTER
221 || type == OTHER_LETTER);
// Return true when CH satisfies isDigit() or isLetter(); the digit
// test is evaluated first.
224 public static boolean isLetterOrDigit (char ch)
226 return isDigit (ch) || isLetter (ch);
// Native predicate; its behavior is defined by the native
// implementation, which is not part of this file.
229 public static native boolean isLowerCase (char ch);
// Return true only for the five ASCII characters newline, tab,
// form feed, carriage return and space.
231 // Deprecated in JCL.
232 public static boolean isSpace (char ch)
234 return ch == '\n' || ch == '\t' || ch == '\f' || ch == '\r' || ch == ' ';
// Native predicates; their behavior is defined by the native
// implementation, which is not part of this file.  isSpaceChar() is
// also used by isWhitespace().
237 public static native boolean isSpaceChar (char ch);
238 public static native boolean isTitleCase (char ch);
// Decide whether CH may appear in a Unicode identifier after the
// first character.  Ignorable characters and digits are tested first.
// NOTE(review): the statement executed when that test succeeds is
// not visible in this excerpt (presumably it returns true -- confirm).
// Otherwise the result is true when getType() reports connector
// punctuation, a letter number, a combining spacing mark, a
// non-spacing mark, or one of the five letter categories.  In
// contrast to isJavaIdentifierPart(), CURRENCY_SYMBOL is not accepted.
240 public static boolean isUnicodeIdentifierPart (char ch)
242 if (isIdentifierIgnorable (ch) || isDigit (ch))
244 int type = getType (ch);
245 return (type == CONNECTOR_PUNCTUATION || type == LETTER_NUMBER
246 || type == COMBINING_SPACING_MARK || type == NON_SPACING_MARK
247 || type == UPPERCASE_LETTER || type == LOWERCASE_LETTER
248 || type == TITLECASE_LETTER || type == MODIFIER_LETTER
249 || type == OTHER_LETTER);
// A character may begin a Unicode identifier exactly when
// isLetter() accepts it.
252 public static boolean isUnicodeIdentifierStart (char ch)
254 return isLetter (ch);
// Native predicate; its behavior is defined by the native
// implementation, which is not part of this file.
257 public static native boolean isUpperCase (char ch);
// Return true when CH is whitespace: U+0009 through '\r' (U+000D),
// U+001C through U+001F, or any character accepted by isSpaceChar()
// other than U+00A0 and U+FEFF, which are explicitly excluded.
259 public static boolean isWhitespace (char ch)
261 return ((ch >= '\u0009' && ch <= '\r')
262 || (ch >= '\u001c' && ch <= '\u001f')
263 || (ch != '\u00a0' && ch != '\ufeff' && isSpaceChar (ch)));
// Native case conversions; the mappings are defined by the native
// implementation, which is not part of this file.
266 public static native char toLowerCase (char ch);
267 public static native char toTitleCase (char ch);
268 public static native char toUpperCase (char ch);
// Return the wrapped value converted with String.valueOf().
270 public String toString ()
272 return String.valueOf(value);
// Order two Characters by their wrapped values.  The result is the
// difference of the two values: negative when this value is the
// smaller one, zero when both are equal, positive otherwise.
// ANOTHERCHARACTER is dereferenced without a null check.
275 public int compareTo (Character anotherCharacter)
277 return value - anotherCharacter.value;
// Comparable entry point: casts O to Character and delegates to
// compareTo(Character).  The cast fails with ClassCastException
// when O is an instance of some other class.
280 public int compareTo (Object o)
282 return compareTo ((Character) o);
288 public static class Subset
290 protected Subset (String name)
295 public final boolean equals (Object obj)
300 public final int hashCode ()
302 return super.hashCode ();
305 public final String toString ()
310 // Name of this subset.
314 public static final class UnicodeBlock extends Subset
316 private UnicodeBlock (String name, char start, char end)
323 public static UnicodeBlock of (char c)
325 // A special case we need.
329 // Do a binary search to find the correct subset.
330 int hi = blocks.length;
334 int mid = (hi + lo) / 2;
335 UnicodeBlock ub = blocks[mid];
347 // Start and end characters.
348 private char start, end;
350 // Everything from here to the end of UnicodeBlock is
351 // automatically generated by the blocks.pl script.
352 public static final UnicodeBlock BASIC_LATIN = new UnicodeBlock ("Basic Latin", '\u0000', '\u007F');
353 public static final UnicodeBlock LATIN_1_SUPPLEMENT = new UnicodeBlock ("Latin-1 Supplement", '\u0080', '\u00FF');
354 public static final UnicodeBlock LATIN_EXTENDED_A = new UnicodeBlock ("Latin Extended-A", '\u0100', '\u017F');
355 public static final UnicodeBlock LATIN_EXTENDED_B = new UnicodeBlock ("Latin Extended-B", '\u0180', '\u024F');
356 public static final UnicodeBlock IPA_EXTENSIONS = new UnicodeBlock ("IPA Extensions", '\u0250', '\u02AF');
357 public static final UnicodeBlock SPACING_MODIFIER_LETTERS = new UnicodeBlock ("Spacing Modifier Letters", '\u02B0', '\u02FF');
358 public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS = new UnicodeBlock ("Combining Diacritical Marks", '\u0300', '\u036F');
359 public static final UnicodeBlock GREEK = new UnicodeBlock ("Greek", '\u0370', '\u03FF');
360 public static final UnicodeBlock CYRILLIC = new UnicodeBlock ("Cyrillic", '\u0400', '\u04FF');
361 public static final UnicodeBlock ARMENIAN = new UnicodeBlock ("Armenian", '\u0530', '\u058F');
362 public static final UnicodeBlock HEBREW = new UnicodeBlock ("Hebrew", '\u0590', '\u05FF');
363 public static final UnicodeBlock ARABIC = new UnicodeBlock ("Arabic", '\u0600', '\u06FF');
364 public static final UnicodeBlock DEVANAGARI = new UnicodeBlock ("Devanagari", '\u0900', '\u097F');
365 public static final UnicodeBlock BENGALI = new UnicodeBlock ("Bengali", '\u0980', '\u09FF');
366 public static final UnicodeBlock GURMUKHI = new UnicodeBlock ("Gurmukhi", '\u0A00', '\u0A7F');
367 public static final UnicodeBlock GUJARATI = new UnicodeBlock ("Gujarati", '\u0A80', '\u0AFF');
368 public static final UnicodeBlock ORIYA = new UnicodeBlock ("Oriya", '\u0B00', '\u0B7F');
369 public static final UnicodeBlock TAMIL = new UnicodeBlock ("Tamil", '\u0B80', '\u0BFF');
370 public static final UnicodeBlock TELUGU = new UnicodeBlock ("Telugu", '\u0C00', '\u0C7F');
371 public static final UnicodeBlock KANNADA = new UnicodeBlock ("Kannada", '\u0C80', '\u0CFF');
372 public static final UnicodeBlock MALAYALAM = new UnicodeBlock ("Malayalam", '\u0D00', '\u0D7F');
373 public static final UnicodeBlock THAI = new UnicodeBlock ("Thai", '\u0E00', '\u0E7F');
374 public static final UnicodeBlock LAO = new UnicodeBlock ("Lao", '\u0E80', '\u0EFF');
375 public static final UnicodeBlock TIBETAN = new UnicodeBlock ("Tibetan", '\u0F00', '\u0FBF');
376 public static final UnicodeBlock GEORGIAN = new UnicodeBlock ("Georgian", '\u10A0', '\u10FF');
377 public static final UnicodeBlock HANGUL_JAMO = new UnicodeBlock ("Hangul Jamo", '\u1100', '\u11FF');
378 public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL = new UnicodeBlock ("Latin Extended Additional", '\u1E00', '\u1EFF');
379 public static final UnicodeBlock GREEK_EXTENDED = new UnicodeBlock ("Greek Extended", '\u1F00', '\u1FFF');
380 public static final UnicodeBlock GENERAL_PUNCTUATION = new UnicodeBlock ("General Punctuation", '\u2000', '\u206F');
381 public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS = new UnicodeBlock ("Superscripts and Subscripts", '\u2070', '\u209F');
382 public static final UnicodeBlock CURRENCY_SYMBOLS = new UnicodeBlock ("Currency Symbols", '\u20A0', '\u20CF');
383 public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS = new UnicodeBlock ("Combining Marks for Symbols", '\u20D0', '\u20FF');
384 public static final UnicodeBlock LETTERLIKE_SYMBOLS = new UnicodeBlock ("Letterlike Symbols", '\u2100', '\u214F');
385 public static final UnicodeBlock NUMBER_FORMS = new UnicodeBlock ("Number Forms", '\u2150', '\u218F');
386 public static final UnicodeBlock ARROWS = new UnicodeBlock ("Arrows", '\u2190', '\u21FF');
387 public static final UnicodeBlock MATHEMATICAL_OPERATORS = new UnicodeBlock ("Mathematical Operators", '\u2200', '\u22FF');
388 public static final UnicodeBlock MISCELLANEOUS_TECHNICAL = new UnicodeBlock ("Miscellaneous Technical", '\u2300', '\u23FF');
389 public static final UnicodeBlock CONTROL_PICTURES = new UnicodeBlock ("Control Pictures", '\u2400', '\u243F');
390 public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION = new UnicodeBlock ("Optical Character Recognition", '\u2440', '\u245F');
391 public static final UnicodeBlock ENCLOSED_ALPHANUMERICS = new UnicodeBlock ("Enclosed Alphanumerics", '\u2460', '\u24FF');
392 public static final UnicodeBlock BOX_DRAWING = new UnicodeBlock ("Box Drawing", '\u2500', '\u257F');
393 public static final UnicodeBlock BLOCK_ELEMENTS = new UnicodeBlock ("Block Elements", '\u2580', '\u259F');
394 public static final UnicodeBlock GEOMETRIC_SHAPES = new UnicodeBlock ("Geometric Shapes", '\u25A0', '\u25FF');
395 public static final UnicodeBlock MISCELLANEOUS_SYMBOLS = new UnicodeBlock ("Miscellaneous Symbols", '\u2600', '\u26FF');
396 public static final UnicodeBlock DINGBATS = new UnicodeBlock ("Dingbats", '\u2700', '\u27BF');
397 public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION = new UnicodeBlock ("CJK Symbols and Punctuation", '\u3000', '\u303F');
398 public static final UnicodeBlock HIRAGANA = new UnicodeBlock ("Hiragana", '\u3040', '\u309F');
399 public static final UnicodeBlock KATAKANA = new UnicodeBlock ("Katakana", '\u30A0', '\u30FF');
400 public static final UnicodeBlock BOPOMOFO = new UnicodeBlock ("Bopomofo", '\u3100', '\u312F');
401 public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO = new UnicodeBlock ("Hangul Compatibility Jamo", '\u3130', '\u318F');
402 public static final UnicodeBlock KANBUN = new UnicodeBlock ("Kanbun", '\u3190', '\u319F');
403 public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS = new UnicodeBlock ("Enclosed CJK Letters and Months", '\u3200', '\u32FF');
404 public static final UnicodeBlock CJK_COMPATIBILITY = new UnicodeBlock ("CJK Compatibility", '\u3300', '\u33FF');
405 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS = new UnicodeBlock ("CJK Unified Ideographs", '\u4E00', '\u9FFF');
406 public static final UnicodeBlock HANGUL_SYLLABLES = new UnicodeBlock ("Hangul Syllables", '\uAC00', '\uD7A3');
407 public static final UnicodeBlock HIGH_SURROGATES = new UnicodeBlock ("High Surrogates", '\uD800', '\uDB7F');
408 public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES = new UnicodeBlock ("High Private Use Surrogates", '\uDB80', '\uDBFF');
409 public static final UnicodeBlock LOW_SURROGATES = new UnicodeBlock ("Low Surrogates", '\uDC00', '\uDFFF');
410 public static final UnicodeBlock PRIVATE_USE = new UnicodeBlock ("Private Use", '\uE000', '\uF8FF');
411 public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS = new UnicodeBlock ("CJK Compatibility Ideographs", '\uF900', '\uFAFF');
412 public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS = new UnicodeBlock ("Alphabetic Presentation Forms", '\uFB00', '\uFB4F');
413 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A = new UnicodeBlock ("Arabic Presentation Forms-A", '\uFB50', '\uFDFF');
414 public static final UnicodeBlock COMBINING_HALF_MARKS = new UnicodeBlock ("Combining Half Marks", '\uFE20', '\uFE2F');
415 public static final UnicodeBlock CJK_COMPATIBILITY_FORMS = new UnicodeBlock ("CJK Compatibility Forms", '\uFE30', '\uFE4F');
416 public static final UnicodeBlock SMALL_FORM_VARIANTS = new UnicodeBlock ("Small Form Variants", '\uFE50', '\uFE6F');
417 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B = new UnicodeBlock ("Arabic Presentation Forms-B", '\uFE70', '\uFEFE');
418 public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS = new UnicodeBlock ("Halfwidth and Fullwidth Forms", '\uFF00', '\uFFEF');
419 public static final UnicodeBlock SPECIALS = new UnicodeBlock ("Specials", '\uFFF0', '\uFFFD');
420 private static final UnicodeBlock[] blocks = {
426 SPACING_MODIFIER_LETTERS,
427 COMBINING_DIACRITICAL_MARKS,
447 LATIN_EXTENDED_ADDITIONAL,
450 SUPERSCRIPTS_AND_SUBSCRIPTS,
452 COMBINING_MARKS_FOR_SYMBOLS,
456 MATHEMATICAL_OPERATORS,
457 MISCELLANEOUS_TECHNICAL,
459 OPTICAL_CHARACTER_RECOGNITION,
460 ENCLOSED_ALPHANUMERICS,
464 MISCELLANEOUS_SYMBOLS,
466 CJK_SYMBOLS_AND_PUNCTUATION,
470 HANGUL_COMPATIBILITY_JAMO,
472 ENCLOSED_CJK_LETTERS_AND_MONTHS,
474 CJK_UNIFIED_IDEOGRAPHS,
477 HIGH_PRIVATE_USE_SURROGATES,
480 CJK_COMPATIBILITY_IDEOGRAPHS,
481 ALPHABETIC_PRESENTATION_FORMS,
482 ARABIC_PRESENTATION_FORMS_A,
483 COMBINING_HALF_MARKS,
484 CJK_COMPATIBILITY_FORMS,
486 ARABIC_PRESENTATION_FORMS_B,
487 HALFWIDTH_AND_FULLWIDTH_FORMS,