Add Unicode script support

[platform/upstream/glib.git] / docs / reference / glib / tmpl / unicode.sgml
diff --git a/docs/reference/glib/tmpl/unicode.sgml b/docs/reference/glib/tmpl/unicode.sgml

index db8cc03..7e1cff9 100644 (file)
--- a/docs/reference/glib/tmpl/unicode.sgml
+++ b/docs/reference/glib/tmpl/unicode.sgml
@@ -2,38 +2,67 @@
  Unicode Manipulation
  
  <!-- ##### SECTION Short_Description ##### -->
-
+functions operating on Unicode characters and UTF-8 strings.
  
  <!-- ##### SECTION Long_Description ##### -->
  <para>
-
+This section describes a number of functions for dealing with
+Unicode characters and strings.  There are analogues of the
+traditional <filename>ctype.h</filename> character classification
+and case conversion functions, UTF-8 analogues of some string utility 
+functions, functions to perform normalization, case conversion and
+collation on UTF-8 strings and finally functions to convert between
+the UTF-8, UTF-16 and UCS-4 encodings of Unicode.
  </para>
  
+<para>
+The implementations of the Unicode functions in GLib are based
+on the Unicode Character Data tables, which are available from
+<ulink url="http://www.unicode.org">www.unicode.org</ulink>.
+GLib 2.8 supports Unicode 4.0, GLib 2.10 supports Unicode 4.1,
+GLib 2.12 supports Unicode 5.0.
+</para>
  
  <!-- ##### SECTION See_Also ##### -->
  <para>
+<variablelist>
+
+<varlistentry>
+<term>g_locale_to_utf8(), g_locale_from_utf8()</term>
+<listitem><para>
+Convenience functions for converting between UTF-8 and the locale encoding. 
+</para></listitem>
+</varlistentry>
  
+</variablelist>
  </para>
  
+<!-- ##### SECTION Stability_Level ##### -->
+
  
  <!-- ##### TYPEDEF gunichar ##### -->
  <para>
-
+A type which can hold any UCS-4 character code. 
  </para>
  
  
  <!-- ##### TYPEDEF gunichar2 ##### -->
  <para>
-
+A type which can hold any UTF-16 code 
+point<footnote id="utf16_surrogate_pairs">UTF-16 also has so called 
+<firstterm>surrogate pairs</firstterm> to encode characters beyond the 
+BMP as pairs of 16bit numbers. Surrogate pairs cannot be stored in a 
+single gunichar2 field, but all GLib functions accepting gunichar2 arrays 
+will correctly interpret surrogate pairs.</footnote>.
  </para>
  
  
-<!-- ##### FUNCTION g_get_charset ##### -->
+<!-- ##### FUNCTION g_unichar_validate ##### -->
  <para>
  
  </para>
  
-@charset: 
+@ch: 
  @Returns: 
  
  
@@ -163,6 +192,15 @@ Unicode Manipulation
  @Returns: 
  
  
+<!-- ##### FUNCTION g_unichar_iswide_cjk ##### -->
+<para>
+
+</para>
+
+@c: 
+@Returns: 
+
+
  <!-- ##### FUNCTION g_unichar_toupper ##### -->
  <para>
  
@@ -210,7 +248,9 @@ Unicode Manipulation
  
  <!-- ##### ENUM GUnicodeType ##### -->
  <para>
-
+These are the possible character classifications.
+See <ulink url="http://www.unicode.org/Public/UNIDATA/UnicodeData.html"
+>http://www.unicode.org/Public/UNIDATA/UnicodeData.html</ulink>.
  </para>
  
  @G_UNICODE_CONTROL: 
@@ -253,6 +293,63 @@ Unicode Manipulation
  @Returns: 
  
  
+<!-- ##### ENUM GUnicodeBreakType ##### -->
+<para>
+These are the possible line break classifications.
+The five Hangul types were added in Unicode 4.1, so, has been
+introduced in GLib 2.10.  Note that new types may be added in the future.
+Applications should be ready to handle unknown values.
+They may be regarded as %G_UNICODE_BREAK_UNKNOWN.
+See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
+>http://www.unicode.org/unicode/reports/tr14/</ulink>.
+</para>
+
+@G_UNICODE_BREAK_MANDATORY: 
+@G_UNICODE_BREAK_CARRIAGE_RETURN: 
+@G_UNICODE_BREAK_LINE_FEED: 
+@G_UNICODE_BREAK_COMBINING_MARK: 
+@G_UNICODE_BREAK_SURROGATE: 
+@G_UNICODE_BREAK_ZERO_WIDTH_SPACE: 
+@G_UNICODE_BREAK_INSEPARABLE: 
+@G_UNICODE_BREAK_NON_BREAKING_GLUE: 
+@G_UNICODE_BREAK_CONTINGENT: 
+@G_UNICODE_BREAK_SPACE: 
+@G_UNICODE_BREAK_AFTER: 
+@G_UNICODE_BREAK_BEFORE: 
+@G_UNICODE_BREAK_BEFORE_AND_AFTER: 
+@G_UNICODE_BREAK_HYPHEN: 
+@G_UNICODE_BREAK_NON_STARTER: 
+@G_UNICODE_BREAK_OPEN_PUNCTUATION: 
+@G_UNICODE_BREAK_CLOSE_PUNCTUATION: 
+@G_UNICODE_BREAK_QUOTATION: 
+@G_UNICODE_BREAK_EXCLAMATION: 
+@G_UNICODE_BREAK_IDEOGRAPHIC: 
+@G_UNICODE_BREAK_NUMERIC: 
+@G_UNICODE_BREAK_INFIX_SEPARATOR: 
+@G_UNICODE_BREAK_SYMBOL: 
+@G_UNICODE_BREAK_ALPHABETIC: 
+@G_UNICODE_BREAK_PREFIX: 
+@G_UNICODE_BREAK_POSTFIX: 
+@G_UNICODE_BREAK_COMPLEX_CONTEXT: 
+@G_UNICODE_BREAK_AMBIGUOUS: 
+@G_UNICODE_BREAK_UNKNOWN: 
+@G_UNICODE_BREAK_NEXT_LINE: 
+@G_UNICODE_BREAK_WORD_JOINER: 
+@G_UNICODE_BREAK_HANGUL_L_JAMO: 
+@G_UNICODE_BREAK_HANGUL_V_JAMO: 
+@G_UNICODE_BREAK_HANGUL_T_JAMO: 
+@G_UNICODE_BREAK_HANGUL_LV_SYLLABLE: 
+@G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE: 
+
+<!-- ##### FUNCTION g_unichar_break_type ##### -->
+<para>
+
+</para>
+
+@c: 
+@Returns: 
+
+
  <!-- ##### FUNCTION g_unicode_canonical_ordering ##### -->
  <para>
  
@@ -272,20 +369,138 @@ Unicode Manipulation
  @Returns: 
  
  
+<!-- ##### FUNCTION g_unichar_get_mirror_char ##### -->
+<para>
+
+</para>
+
+@ch: 
+@mirrored_ch: 
+@Returns: 
+
+
+<!-- ##### ENUM GUnicodeScript ##### -->
+<para>
+The #GUnicodeScript enumeration identifies different writing
+systems. The values correspond to the names as defined in the
+Unicode standard. The enumeration has been added in GLib 2.14. 
+Note that new types may be added in the future. Applications 
+should be ready to handle unknown values.
+See <ulink
+url="http://www.unicode.org/reports/tr24/">Unicode Standard Annex
+#24: Script names</ulink>.
+</para>
+
+@G_UNICODE_SCRIPT_INVALID_CODE: a value never returned from g_unichar_get_script()
+@G_UNICODE_SCRIPT_COMMON:     a character used by multiple different scripts
+@G_UNICODE_SCRIPT_INHERITED:  a mark glyph that takes its script from the
+                              base glyph to which it is attached
+@G_UNICODE_SCRIPT_ARABIC:     Arabic
+@G_UNICODE_SCRIPT_ARMENIAN:   Armenian
+@G_UNICODE_SCRIPT_BENGALI:    Bengali
+@G_UNICODE_SCRIPT_BOPOMOFO:   Bopomofo
+@G_UNICODE_SCRIPT_CHEROKEE:   Cherokee
+@G_UNICODE_SCRIPT_COPTIC:     Coptic
+@G_UNICODE_SCRIPT_CYRILLIC:   Cyrillic
+@G_UNICODE_SCRIPT_DESERET:    Deseret
+@G_UNICODE_SCRIPT_DEVANAGARI: Devanagari
+@G_UNICODE_SCRIPT_ETHIOPIC:   Ethiopic
+@G_UNICODE_SCRIPT_GEORGIAN:   Georgian
+@G_UNICODE_SCRIPT_GOTHIC:     Gothic
+@G_UNICODE_SCRIPT_GREEK:      Greek
+@G_UNICODE_SCRIPT_GUJARATI:   Gujarati
+@G_UNICODE_SCRIPT_GURMUKHI:   Gurmukhi
+@G_UNICODE_SCRIPT_HAN:        Han
+@G_UNICODE_SCRIPT_HANGUL:     Hangul
+@G_UNICODE_SCRIPT_HEBREW:     Hebrew
+@G_UNICODE_SCRIPT_HIRAGANA:   Hiragana
+@G_UNICODE_SCRIPT_KANNADA:    Kannada
+@G_UNICODE_SCRIPT_KATAKANA:   Katakana
+@G_UNICODE_SCRIPT_KHMER:      Khmer
+@G_UNICODE_SCRIPT_LAO:        Lao
+@G_UNICODE_SCRIPT_LATIN:      Latin
+@G_UNICODE_SCRIPT_MALAYALAM:  Malayalam
+@G_UNICODE_SCRIPT_MONGOLIAN:  Mongolian
+@G_UNICODE_SCRIPT_MYANMAR:    Myanmar
+@G_UNICODE_SCRIPT_OGHAM:      Ogham
+@G_UNICODE_SCRIPT_OLD_ITALIC: Old Italic
+@G_UNICODE_SCRIPT_ORIYA:      Oriya
+@G_UNICODE_SCRIPT_RUNIC:      Runic
+@G_UNICODE_SCRIPT_SINHALA:    Sinhala
+@G_UNICODE_SCRIPT_SYRIAC:     Syriac
+@G_UNICODE_SCRIPT_TAMIL:      Tamil
+@G_UNICODE_SCRIPT_TELUGU:     Telugu
+@G_UNICODE_SCRIPT_THAANA:     Thaana
+@G_UNICODE_SCRIPT_THAI:       Thai
+@G_UNICODE_SCRIPT_TIBETAN:    Tibetan
+@G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL: 
+                              Canadian Aboriginal
+@G_UNICODE_SCRIPT_YI:         Yi
+@G_UNICODE_SCRIPT_TAGALOG:    Tagalog
+@G_UNICODE_SCRIPT_HANUNOO:    Hanunoo
+@G_UNICODE_SCRIPT_BUHID:      Buhid
+@G_UNICODE_SCRIPT_TAGBANWA:   Tagbanwa
+@G_UNICODE_SCRIPT_BRAILLE:    Braille
+@G_UNICODE_SCRIPT_CYPRIOT:    Cypriot
+@G_UNICODE_SCRIPT_LIMBU:      Limbu
+@G_UNICODE_SCRIPT_OSMANYA:    Osmanya
+@G_UNICODE_SCRIPT_SHAVIAN:    Shavian
+@G_UNICODE_SCRIPT_LINEAR_B:   Linear B
+@G_UNICODE_SCRIPT_TAI_LE:     Tai Le
+@G_UNICODE_SCRIPT_UGARITIC:   Ugaritic
+@G_UNICODE_SCRIPT_NEW_TAI_LUE: New Tai Lue
+@G_UNICODE_SCRIPT_BUGINESE:   Buginese
+@G_UNICODE_SCRIPT_GLAGOLITIC: Glagolitic
+@G_UNICODE_SCRIPT_TIFINAGH:   Tifinagh
+@G_UNICODE_SCRIPT_SYLOTI_NAGRI: Syloti Nagri
+@G_UNICODE_SCRIPT_OLD_PERSIAN: Old Persian
+@G_UNICODE_SCRIPT_KHAROSHTHI: Kharoshthi
+@G_UNICODE_SCRIPT_UNKNOWN:    an unassigned code point
+@G_UNICODE_SCRIPT_BALINESE:   Balinese
+@G_UNICODE_SCRIPT_CUNEIFORM:  Cuneiform
+@G_UNICODE_SCRIPT_PHOENICIAN: Phoenician
+@G_UNICODE_SCRIPT_PHAGS_PA:   Phags-pa
+@G_UNICODE_SCRIPT_NKO:        N'Ko
+
+
+<!-- ##### FUNCTION g_unichar_get_script ##### -->
+<para>
+
+</para>
+
+@ch: 
+@Returns: 
+
+
  <!-- ##### MACRO g_utf8_next_char ##### -->
  <para>
+Skips to the next character in a UTF-8 string. The string must be
+valid; this macro is as fast as possible, and has no error-checking.
+You would use this macro to iterate over a string character by
+character. The macro returns the start of the next UTF-8 character.
+Before using this macro, use g_utf8_validate() to validate strings
+that may contain invalid UTF-8.
+</para>
+
+@p: Pointer to the start of a valid UTF-8 character.
+
+
+<!-- ##### FUNCTION g_utf8_get_char ##### -->
+<para>
  
  </para>
  
  @p: 
+@Returns: 
  
  
-<!-- ##### FUNCTION g_utf8_get_char ##### -->
+<!-- ##### FUNCTION g_utf8_get_char_validated ##### -->
  <para>
  
  </para>
  
  @p: 
+@max_len: 
  @Returns: 
  
  
@@ -365,6 +580,7 @@ Unicode Manipulation
  </para>
  
  @p: 
+@len: 
  @c: 
  @Returns: 
  
@@ -375,10 +591,127 @@ Unicode Manipulation
  </para>
  
  @p: 
+@len: 
  @c: 
  @Returns: 
  
  
+<!-- ##### FUNCTION g_utf8_strreverse ##### -->
+<para>
+
+</para>
+
+@str: 
+@len: 
+@Returns: 
+
+
+<!-- ##### FUNCTION g_utf8_validate ##### -->
+<para>
+
+</para>
+
+@str: 
+@max_len: 
+@end: 
+@Returns: 
+
+
+<!-- ##### FUNCTION g_utf8_strup ##### -->
+<para>
+
+</para>
+
+@str: 
+@len: 
+@Returns: 
+
+
+<!-- ##### FUNCTION g_utf8_strdown ##### -->
+<para>
+
+</para>
+
+@str: 
+@len: 
+@Returns: 
+
+
+<!-- ##### FUNCTION g_utf8_casefold ##### -->
+<para>
+
+</para>
+
+@str: 
+@len: 
+@Returns: 
+
+
+<!-- ##### FUNCTION g_utf8_normalize ##### -->
+<para>
+
+</para>
+
+@str: 
+@len: 
+@mode: 
+@Returns: 
+
+
+<!-- ##### ENUM GNormalizeMode ##### -->
+<para>
+Defines how a Unicode string is transformed in a canonical 
+form, standardizing such issues as whether a character with an accent is 
+represented as a base character and combining accent or as a single precomposed
+character. Unicode strings should generally be normalized before comparing them.
+</para>
+
+@G_NORMALIZE_DEFAULT: standardize differences that do not affect the
+  text content, such as the above-mentioned accent representation.
+@G_NORMALIZE_NFD: another name for %G_NORMALIZE_DEFAULT.
+@G_NORMALIZE_DEFAULT_COMPOSE: like %G_NORMALIZE_DEFAULT, but with composed
+  forms rather than a maximally decomposed form.
+@G_NORMALIZE_NFC: another name for %G_NORMALIZE_DEFAULT_COMPOSE.
+@G_NORMALIZE_ALL: beyond %G_NORMALIZE_DEFAULT also standardize the 
+  "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to the 
+  standard forms (in this case DIGIT THREE). Formatting information may be 
+  lost but for most text operations such characters should be considered the 
+  same.
+@G_NORMALIZE_NFKD: another name for %G_NORMALIZE_ALL.
+@G_NORMALIZE_ALL_COMPOSE: like %G_NORMALIZE_ALL, but with composed
+  forms rather than a maximally decomposed form.
+@G_NORMALIZE_NFKC: another name for %G_NORMALIZE_ALL_COMPOSE.
+
+<!-- ##### FUNCTION g_utf8_collate ##### -->
+<para>
+
+</para>
+
+@str1: 
+@str2: 
+@Returns: 
+
+
+<!-- ##### FUNCTION g_utf8_collate_key ##### -->
+<para>
+
+</para>
+
+@str: 
+@len: 
+@Returns: 
+
+
+<!-- ##### FUNCTION g_utf8_collate_key_for_filename ##### -->
+<para>
+
+</para>
+
+@str: 
+@len: 
+@Returns: 
+
+
  <!-- ##### FUNCTION g_utf8_to_utf16 ##### -->
  <para>
  
@@ -386,6 +719,9 @@ Unicode Manipulation
  
  @str: 
  @len: 
+@items_read: 
+@items_written: 
+@error: 
  @Returns: 
  
  
@@ -396,6 +732,20 @@ Unicode Manipulation
  
  @str: 
  @len: 
+@items_read: 
+@items_written: 
+@error: 
+@Returns: 
+
+
+<!-- ##### FUNCTION g_utf8_to_ucs4_fast ##### -->
+<para>
+
+</para>
+
+@str: 
+@len: 
+@items_written: 
  @Returns: 
  
  
@@ -406,6 +756,9 @@ Unicode Manipulation
  
  @str: 
  @len: 
+@items_read: 
+@items_written: 
+@error: 
  @Returns: 
  
  
@@ -416,6 +769,9 @@ Unicode Manipulation
  
  @str: 
  @len: 
+@items_read: 
+@items_written: 
+@error: 
  @Returns: 
  
  
@@ -426,6 +782,9 @@ Unicode Manipulation
  
  @str: 
  @len: 
+@items_read: 
+@items_written: 
+@error: 
  @Returns: 
  
  
@@ -436,6 +795,9 @@ Unicode Manipulation
  
  @str: 
  @len: 
+@items_read: 
+@items_written: 
+@error: 
  @Returns: