4 * Copyright (C) 1999 Red Hat Software
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
16 * You should have received a copy of the GNU Library General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
24 #include "pango-break.h"
25 #include "pango-modules.h"
26 #include "pango-script-private.h"
27 #include "pango-impl-utils.h"
30 #define PARAGRAPH_SEPARATOR 0x2029
31 #define PARAGRAPH_SEPARATOR_STRING "\xE2\x80\xA9"
33 /* See http://www.unicode.org/unicode/reports/tr14/ if you hope
34 * to understand the line breaking code.
39 BREAK_ALREADY_HANDLED, /* didn't use the table */
40 BREAK_PROHIBITED, /* no break, even if spaces intervene */
41 BREAK_IF_SPACES, /* "indirect break" (only if there are spaces) */
42 BREAK_ALLOWED /* "direct break" (can always break here) */
43 /* TR 14 has one more break-opportunity class,
44 * "indirect break opportunity for combining marks following a space"
45 * but we handle that inline in the code.
52 INDEX_OPEN_PUNCTUATION,
53 INDEX_CLOSE_PUNCTUATION,
55 INDEX_NON_BREAKING_GLUE,
59 INDEX_INFIX_SEPARATOR,
69 INDEX_BEFORE_AND_AFTER,
70 INDEX_ZERO_WIDTH_SPACE,
74 /* End of the table */
78 /* The following are not in the tables */
80 INDEX_CARRIAGE_RETURN,
85 INDEX_COMPLEX_CONTEXT,
92 INDEX_HANGUL_LV_SYLLABLE,
93 INDEX_HANGUL_LVT_SYLLABLE,
96 static const BreakOpportunity row_OPEN_PUNCTUATION[INDEX_END_OF_TABLE] = {
97 BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
98 BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
99 BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
100 BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
101 BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
105 static const BreakOpportunity row_CLOSE_PUNCTUATION[INDEX_END_OF_TABLE] = {
106 BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
107 BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
108 BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED,
109 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
110 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
114 static const BreakOpportunity row_QUOTATION[INDEX_END_OF_TABLE] = {
115 BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
116 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
117 BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
118 BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
119 BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED,
123 static const BreakOpportunity row_NON_BREAKING_GLUE[INDEX_END_OF_TABLE] = {
124 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
125 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
126 BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
127 BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
128 BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED,
132 static const BreakOpportunity row_NON_STARTER[INDEX_END_OF_TABLE] = {
133 BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
134 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
135 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
136 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
137 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
141 static const BreakOpportunity row_EXCLAMATION[INDEX_END_OF_TABLE] = {
142 BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
143 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
144 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
145 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
146 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
150 static const BreakOpportunity row_SYMBOL[INDEX_END_OF_TABLE] = {
151 BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
152 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
153 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED,
154 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
155 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
159 static const BreakOpportunity row_INFIX_SEPARATOR[INDEX_END_OF_TABLE] = {
160 BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
161 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
162 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
163 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
164 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
168 static const BreakOpportunity row_PREFIX[INDEX_END_OF_TABLE] = {
169 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
170 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
171 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
172 BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
173 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
177 static const BreakOpportunity row_POSTFIX[INDEX_END_OF_TABLE] = {
178 BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
179 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
180 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
181 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
182 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
186 static const BreakOpportunity row_NUMERIC[INDEX_END_OF_TABLE] = {
187 BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
188 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
189 BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
190 BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
191 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
195 static const BreakOpportunity row_ALPHABETIC[INDEX_END_OF_TABLE] = {
196 BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
197 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
198 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
199 BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
200 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
204 static const BreakOpportunity row_IDEOGRAPHIC[INDEX_END_OF_TABLE] = {
205 BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
206 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
207 BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED,
208 BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
209 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
213 static const BreakOpportunity row_INSEPARABLE[INDEX_END_OF_TABLE] = {
214 BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
215 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
216 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
217 BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
218 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
222 static const BreakOpportunity row_HYPHEN[INDEX_END_OF_TABLE] = {
223 BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
224 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
225 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED,
226 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
227 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
231 static const BreakOpportunity row_AFTER[INDEX_END_OF_TABLE] = {
232 BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
233 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
234 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
235 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
236 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
240 static const BreakOpportunity row_BEFORE[INDEX_END_OF_TABLE] = {
241 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
242 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
243 BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
244 BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
245 BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED,
249 static const BreakOpportunity row_BEFORE_AND_AFTER[INDEX_END_OF_TABLE] = {
250 BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
251 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
252 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
253 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
254 BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
258 static const BreakOpportunity row_ZERO_WIDTH_SPACE[INDEX_END_OF_TABLE] = {
259 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
260 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
261 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
262 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
263 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
267 static const BreakOpportunity row_COMBINING_MARK[INDEX_END_OF_TABLE] = {
268 BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
269 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
270 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
271 BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
272 BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
276 static const BreakOpportunity row_WORD_JOINER[INDEX_END_OF_TABLE] = {
277 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
278 BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
279 BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
280 BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
281 BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED,
285 static const BreakOpportunity *const line_break_rows[INDEX_END_OF_TABLE] = {
286 row_OPEN_PUNCTUATION, /* INDEX_OPEN_PUNCTUATION */
287 row_CLOSE_PUNCTUATION, /* INDEX_CLOSE_PUNCTUATION */
288 row_QUOTATION, /* INDEX_QUOTATION */
289 row_NON_BREAKING_GLUE, /* INDEX_NON_BREAKING_GLUE */
290 row_NON_STARTER, /* INDEX_NON_STARTER */
291 row_EXCLAMATION, /* INDEX_EXCLAMATION */
292 row_SYMBOL, /* INDEX_SYMBOL */
293 row_INFIX_SEPARATOR, /* INDEX_INFIX_SEPARATOR */
294 row_PREFIX, /* INDEX_PREFIX */
295 row_POSTFIX, /* INDEX_POSTFIX */
296 row_NUMERIC, /* INDEX_NUMERIC */
297 row_ALPHABETIC, /* INDEX_ALPHABETIC */
298 row_IDEOGRAPHIC, /* INDEX_IDEOGRAPHIC */
299 row_INSEPARABLE, /* INDEX_INSEPARABLE */
300 row_HYPHEN, /* INDEX_HYPHEN */
301 row_AFTER, /* INDEX_AFTER */
302 row_BEFORE, /* INDEX_BEFORE */
303 row_BEFORE_AND_AFTER, /* INDEX_BEFORE_AND_AFTER */
304 row_ZERO_WIDTH_SPACE, /* INDEX_ZERO_WIDTH_SPACE */
305 row_COMBINING_MARK, /* INDEX_COMBINING_MARK */
306 row_WORD_JOINER /* INDEX_WORD_JOINER */
309 /* Map GUnicodeBreakType to table indexes */
310 static const int line_break_indexes[] = {
312 INDEX_CARRIAGE_RETURN,
314 INDEX_COMBINING_MARK,
316 INDEX_ZERO_WIDTH_SPACE,
318 INDEX_NON_BREAKING_GLUE,
323 INDEX_BEFORE_AND_AFTER,
326 INDEX_OPEN_PUNCTUATION,
327 INDEX_CLOSE_PUNCTUATION,
332 INDEX_INFIX_SEPARATOR,
337 INDEX_COMPLEX_CONTEXT,
345 INDEX_HANGUL_LV_SYLLABLE,
346 INDEX_HANGUL_LVT_SYLLABLE
/* Helper macros for the pair table lookup.
 *
 * BREAK_TYPE_SAFE: yield btype unchanged when it is a valid index into
 * line_break_indexes, otherwise substitute G_UNICODE_BREAK_UNKNOWN.
 */
349 #define BREAK_TYPE_SAFE(btype) \
350 ((btype) < G_N_ELEMENTS(line_break_indexes) ? (btype) : G_UNICODE_BREAK_UNKNOWN)
/* BREAK_INDEX: plain lookup in line_break_indexes; performs no bounds check
 * of its own.
 */
351 #define BREAK_INDEX(btype) \
352 (line_break_indexes[(btype)])
/* BREAK_ROW: the line_break_rows entry selected by the index of the type on
 * the "before" side. line_break_rows has INDEX_END_OF_TABLE entries, so the
 * type must satisfy IN_BREAK_TABLE.
 */
353 #define BREAK_ROW(before_type) \
354 (line_break_rows[BREAK_INDEX (before_type)])
/* BREAK_OP: the table cell for a (before, after) pair: before_type picks the
 * row, after_type picks the position within that row.
 */
355 #define BREAK_OP(before_type, after_type) \
356 (BREAK_ROW (before_type)[BREAK_INDEX (after_type)])
/* IN_BREAK_TABLE: true when btype is a valid index into line_break_indexes
 * and the index it maps to lies below INDEX_END_OF_TABLE.
 */
357 #define IN_BREAK_TABLE(btype) \
358 ((btype) < G_N_ELEMENTS(line_break_indexes) && BREAK_INDEX((btype)) < INDEX_END_OF_TABLE)
363 * Hangul Conjoining Jamo handling.
365 * The way we implement it is just a bit different from TR14,
366 * but produces the same results.
367 * The same algorithm is also used in TR29 for cluster boundaries.
372 /* An enum that works as the states of the Hangul syllables system.
376 JAMO_L, /* G_UNICODE_BREAK_HANGUL_L_JAMO */
377 JAMO_V, /* G_UNICODE_BREAK_HANGUL_V_JAMO */
378 JAMO_T, /* G_UNICODE_BREAK_HANGUL_T_JAMO */
379 JAMO_LV, /* G_UNICODE_BREAK_HANGUL_LV_SYLLABLE */
380 JAMO_LVT, /* G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE */
384 /* There are Hangul syllables encoded as characters, that act like a
385 * sequence of Jamos. For each character we define a JamoType
386 * that the character starts with, and one that it ends with. This
387 * decomposes JAMO_LV and JAMO_LVT into other, simple JAMOs. So for
388 * example, a character with LineBreak type
389 * G_UNICODE_BREAK_HANGUL_LV_SYLLABLE has start=JAMO_L and end=JAMO_V.
391 typedef struct _CharJamoProps
396 /* Map from JamoType to CharJamoProps that hold only simple
397 * JamoTypes (no LV or LVT) or none.
399 static const CharJamoProps HangulJamoProps[] = {
400 {JAMO_L, JAMO_L}, /* JAMO_L */
401 {JAMO_V, JAMO_V}, /* JAMO_V */
402 {JAMO_T, JAMO_T}, /* JAMO_T */
403 {JAMO_L, JAMO_V}, /* JAMO_LV */
404 {JAMO_L, JAMO_T}, /* JAMO_LVT */
405 {NO_JAMO, NO_JAMO} /* NO_JAMO */
408 /* A character forms a syllable with the previous character if and only if:
409 * JamoType(this) is not NO_JAMO and:
411 * HangulJamoProps[JamoType(prev)].end and
412 * HangulJamoProps[JamoType(this)].start are equal,
413 * or the former is one less than the latter.
/* IS_JAMO: true when btype lies in the range
 * G_UNICODE_BREAK_HANGUL_L_JAMO .. G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE.
 * The parameter is parenthesized at every use so that an expression
 * argument expands with the intended precedence; note that it is still
 * evaluated more than once.
 */
416 #define IS_JAMO(btype) \
417 (((btype) >= G_UNICODE_BREAK_HANGUL_L_JAMO) && \
418 ((btype) <= G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
/* JAMO_TYPE: for a Hangul break type, its offset from
 * G_UNICODE_BREAK_HANGUL_L_JAMO (used as the JamoType value); NO_JAMO for
 * every other break type.
 */
419 #define JAMO_TYPE(btype) \
420 (IS_JAMO(btype) ? ((btype) - G_UNICODE_BREAK_HANGUL_L_JAMO) : NO_JAMO)
422 /* Types of Japanese characters */
/* Code point range tests. Each macro evaluates wc more than once, so the
 * argument should be free of side effects.
 */
/* U+2F00..U+30FF: one span that covers the KANJI, HIRAGANA and KATAKANA
 * ranges below, together with the code points lying between them.
 */
423 #define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF)
/* U+2F00..U+2FDF.
 * NOTE(review): this looks like the Kangxi Radicals block rather than the
 * main CJK ideograph range - confirm that this is intended.
 */
424 #define KANJI(wc) ((wc) >= 0x2F00 && (wc) <= 0x2FDF)
/* U+3040..U+309F */
425 #define HIRAGANA(wc) ((wc) >= 0x3040 && (wc) <= 0x309F)
/* U+30A0..U+30FF */
426 #define KATAKANA(wc) ((wc) >= 0x30A0 && (wc) <= 0x30FF)
/* U+0020..U+02AF or U+1E00..U+1EFF; the first range starts at the ASCII
 * space, so it also matches ASCII digits and punctuation.
 */
428 #define LATIN(wc) (((wc) >= 0x0020 && (wc) <= 0x02AF) || ((wc) >= 0x1E00 && (wc) <= 0x1EFF))
/* U+0400..U+052F */
429 #define CYRILLIC(wc) (((wc) >= 0x0400 && (wc) <= 0x052F))
/* U+0370..U+03FF or U+1F00..U+1FFF */
430 #define GREEK(wc) (((wc) >= 0x0370 && (wc) <= 0x3FF) || ((wc) >= 0x1F00 && (wc) <= 0x1FFF))
/* U+3040..U+30FF: the HIRAGANA and KATAKANA ranges combined */
431 #define KANA(wc) ((wc) >= 0x3040 && (wc) <= 0x30FF)
/* U+AC00..U+D7A3 */
432 #define HANGUL(wc) ((wc) >= 0xAC00 && (wc) <= 0xD7A3)
/* True when wc falls in none of the LATIN, CYRILLIC, GREEK, KANA and HANGUL
 * ranges. pango_default_break() applies it to the base character at a
 * grapheme boundary to set the backspace_deletes_character attribute.
 */
433 #define BACKSPACE_DELETES_CHARACTER(wc) (!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA(wc) && !HANGUL(wc))
435 /* p. 132-133 of Unicode spec table 5-6 will help understand this */
438 STATE_SENTENCE_OUTSIDE,
441 STATE_SENTENCE_POST_TERM_CLOSE,
442 STATE_SENTENCE_POST_TERM_SPACE,
443 STATE_SENTENCE_POST_TERM_SEP,
445 STATE_SENTENCE_POST_DOT_CLOSE,
446 STATE_SENTENCE_POST_DOT_SPACE,
447 STATE_SENTENCE_POST_DOT_OPEN,
448 /* never include line/para separators in a sentence for now */
449 /* This isn't in the spec, but I can't figure out why they'd include
450 * one line/para separator in lines ending with Term but not with
451 * period-terminated lines, so I'm doing it for the dot lines also
453 STATE_SENTENCE_POST_DOT_SEP
456 /* We call "123" and "foobar" words, but "123foo" is two words;
457 * the Unicode spec just calls "123" a non-word
468 * pango_default_break:
469 * @text: text to break
470 * @length: length of text in bytes (may be -1 if @text is nul-terminated)
471 * @analysis: a #PangoAnalysis for the @text
472 * @attrs: logical attributes to fill in
473 * @attrs_len: size of the array passed as @attrs
475 * This is the default break algorithm, used if no language
476 * engine overrides it. Normally you should use pango_break()
477 * instead. Unlike pango_break(),
478 * @analysis can be %NULL, but only do that if you know what
479 * you're doing. If you need an analysis to pass to pango_break(),
480 * you need to pango_itemize(). In most cases however you should
481 * simply use pango_get_log_attrs().
484 pango_default_break (const gchar *text,
486 PangoAnalysis *analysis G_GNUC_UNUSED,
488 int attrs_len G_GNUC_UNUSED)
490 /* The rationale for all this is in section 5.15 of the Unicode 3.0 book,
491 * the line breaking stuff is also in TR14 on unicode.org
494 /* This is a default break implementation that should work for nearly all
495 * languages. Language engines can override it optionally.
498 /* FIXME one cheesy optimization here would be to memset attrs to 0
499 * before we start, and then never assign %FALSE to anything
510 GUnicodeBreakType next_break_type;
511 GUnicodeType prev_type;
512 GUnicodeBreakType prev_break_type; /* skips spaces */
513 gboolean prev_was_break_space;
515 /* See Grapheme_Cluster_Break Property Values table of UAX#29 */
523 GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */
525 GraphemeBreakType prev_GB_type = GB_Other;
527 /* See Word_Break Property Values table of UAX#29 */
541 WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other;
544 WordType current_word_type = WordNone;
545 gunichar last_word_letter = 0;
546 gunichar base_character = 0;
548 SentenceState sentence_state = STATE_SENTENCE_OUTSIDE;
549 /* Tracks what will be the end of the sentence if a period is
550 * determined to actually be a sentence-ending period.
552 gint possible_sentence_end = -1;
553 /* possible sentence break before Open* after a period-ended sentence */
554 gint possible_sentence_boundary = -1;
555 gboolean almost_done = FALSE;
556 gboolean done = FALSE;
558 g_return_if_fail (length == 0 || text != NULL);
559 g_return_if_fail (attrs != NULL);
563 prev_type = G_UNICODE_PARAGRAPH_SEPARATOR;
564 prev_break_type = G_UNICODE_BREAK_UNKNOWN;
565 prev_was_break_space = FALSE;
569 if (length == 0 || *text == '\0')
571 next_wc = PARAGRAPH_SEPARATOR;
575 next_wc = g_utf8_get_char (next);
577 next_break_type = g_unichar_break_type (next_wc);
578 next_break_type = BREAK_TYPE_SAFE (next_break_type);
580 for (i = 0; !done ; i++)
584 GUnicodeBreakType break_type;
585 BreakOpportunity break_op;
587 gboolean makes_hangul_syllable;
589 /* UAX#29 boundaries */
590 gboolean is_grapheme_boundary;
591 gboolean is_word_boundary;
595 break_type = next_break_type;
600 * If we have already reached the end of @text g_utf8_next_char()
601 * may not increment next
604 next_break_type = G_UNICODE_BREAK_UNKNOWN;
609 next = g_utf8_next_char (next);
611 if ((length >= 0 && next >= text + length) || *next == '\0')
613 /* This is how we fill in the last element (end position) of the
614 * attr array - assume there's a paragraph separator off the end
617 next_wc = PARAGRAPH_SEPARATOR;
621 next_wc = g_utf8_get_char (next);
623 next_break_type = g_unichar_break_type (next_wc);
624 next_break_type = BREAK_TYPE_SAFE (next_break_type);
627 type = g_unichar_type (wc);
628 jamo = JAMO_TYPE (break_type);
630 /* Determine whether this forms a Hangul syllable with prev. */
632 makes_hangul_syllable = FALSE;
635 JamoType prev_end = HangulJamoProps[prev_jamo].end ;
636 JamoType this_start = HangulJamoProps[ jamo].start;
638 /* See comments before IS_JAMO */
639 makes_hangul_syllable = (prev_end == this_start) || (prev_end + 1 == this_start);
642 /* Can't just use the type here since isspace() doesn't
643 * correspond to a Unicode character type
645 attrs[i].is_white = g_unichar_isspace (wc);
647 /* Just a few spaces have variable width. So explicitly mark them.
649 attrs[i].is_expandable_space = (0x0020 == wc || 0x00A0 == wc);
651 /* ---- UAX#29 Grapheme Boundaries ---- */
653 GraphemeBreakType GB_type;
654 /* Find the GraphemeBreakType of wc */
658 case G_UNICODE_FORMAT:
659 if (wc == 0x200C && wc == 0x200D)
661 GB_type = GB_Extend; /* U+200C and U+200D are Other_Grapheme_Extend */
665 case G_UNICODE_CONTROL:
666 case G_UNICODE_LINE_SEPARATOR:
667 case G_UNICODE_PARAGRAPH_SEPARATOR:
668 GB_type = GB_ControlCRLF;
671 case G_UNICODE_OTHER_LETTER:
672 if (makes_hangul_syllable)
673 GB_type = GB_InHangulSyllable;
674 else if ((wc & 0x0E00) == 0x0E00)
676 /* Thai and Lao stuff hardcoded in UAX#29 */
677 if ((wc >= 0x0E40 && wc <= 0x0E44) || (wc >= 0x0EC0 && wc <= 0x0EC4))
678 GB_type = GB_Prepend; /* Prepend */
679 else if (wc == 0x0E30 || wc == 0x0E32 || wc == 0x0E33 || wc == 0x0E45 ||
680 wc == 0x0EB0 || wc == 0x0EB2 || wc == 0x0EB3)
681 GB_type = GB_Extend; /* Exceptions in the Extend definition */
685 case G_UNICODE_MODIFIER_LETTER:
686 if (wc >= 0xFF9E && wc <= 0xFF9F)
687 GB_type = GB_Extend; /* Other_Grapheme_Extend */
690 case G_UNICODE_COMBINING_MARK:
691 GB_type = GB_SpacingMark; /* SpacingMark */
694 if (wc == 0x09BE || wc == 0x09D7 ||
695 wc == 0x0B3E || wc == 0x0B57 || wc == 0x0BBE || wc == 0x0BD7 ||
696 wc == 0x0CC2 || wc == 0x0CD5 || wc == 0x0CD6 ||
697 wc == 0x0D3E || wc == 0x0D57 || wc == 0x0DCF || wc == 0x0DDF ||
698 wc == 0x1D165 || (wc >= 0x1D16E && wc <= 0x1D172))
699 GB_type = GB_Extend; /* Other_Grapheme_Extend */
703 case G_UNICODE_ENCLOSING_MARK:
704 case G_UNICODE_NON_SPACING_MARK:
705 GB_type = GB_Extend; /* Grapheme_Extend */
709 /* Grapheme Cluster Boundary Rules */
710 /* We apply Rules GB1 and GB2 at the end of the function */
711 if (wc == '\n' && prev_wc == '\r')
712 is_grapheme_boundary = FALSE; /* Rule GB3 */
713 else if (prev_GB_type == GB_ControlCRLF || GB_type == GB_ControlCRLF)
714 is_grapheme_boundary = TRUE; /* Rules GB4 and GB5 */
715 else if (GB_type == GB_InHangulSyllable)
716 is_grapheme_boundary = FALSE; /* Rules GB6, GB7, GB8 */
717 else if (GB_type == GB_Extend)
718 is_grapheme_boundary = FALSE; /* Rule GB9 */
719 else if (GB_type == GB_SpacingMark)
720 is_grapheme_boundary = FALSE; /* Rule GB9a */
721 else if (prev_GB_type == GB_Prepend)
722 is_grapheme_boundary = FALSE; /* Rule GB9b */
724 is_grapheme_boundary = TRUE; /* Rule GB10 */
726 prev_GB_type = GB_type;
728 attrs[i].is_cursor_position = is_grapheme_boundary;
729 /* If this is a grapheme boundary, we have to decide if backspace
730 * deletes a character or the whole grapheme cluster */
731 if (is_grapheme_boundary)
732 attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character);
734 attrs[i].backspace_deletes_character = FALSE;
737 /* ---- UAX#29 Word Boundaries ---- */
739 is_word_boundary = FALSE;
740 if (is_grapheme_boundary) /* Rules WB3 and WB4 */
743 WordBreakType WB_type;
745 script = pango_script_for_unichar (wc);
747 /* Find the WordBreakType of wc */
750 if (script == PANGO_SCRIPT_KATAKANA)
751 WB_type = WB_Katakana;
753 if (WB_type == WB_Other)
757 if (wc == 0x3031 || wc == 0x3032 || wc == 0x3033 || wc == 0x3034 || wc == 0x3035 ||
758 wc == 0x309b || wc == 0x309c || wc == 0x30a0 || wc == 0x30fc)
759 WB_type = WB_Katakana; /* Katakana exceptions */
763 WB_type = WB_Katakana; /* Katakana exceptions */
764 else if (wc >= 0xFF9E || wc <= 0xFF9F)
765 WB_type = WB_ExtendFormat; /* Other_Grapheme_Extend */
769 WB_type = WB_ALetter; /* ALetter exceptions */
773 if (WB_type == WB_Other)
774 switch ((int) break_type)
776 case G_UNICODE_BREAK_NUMERIC:
778 WB_type = WB_Numeric; /* Numeric */
780 case G_UNICODE_BREAK_INFIX_SEPARATOR:
781 if (wc != 0x003A && wc != 0xFE13 && wc != 0x002E)
782 WB_type = WB_MidNum; /* MidNum */
786 if (WB_type == WB_Other)
789 case G_UNICODE_CONTROL:
790 if (wc != 0x000D && wc != 0x000A && wc != 0x000B && wc != 0x000C && wc != 0x0085)
793 case G_UNICODE_LINE_SEPARATOR:
794 case G_UNICODE_PARAGRAPH_SEPARATOR:
795 WB_type = WB_NewlineCRLF; /* CR, LF, Newline */
798 case G_UNICODE_FORMAT:
799 case G_UNICODE_COMBINING_MARK:
800 case G_UNICODE_ENCLOSING_MARK:
801 case G_UNICODE_NON_SPACING_MARK:
802 WB_type = WB_ExtendFormat; /* Extend, Format */
805 case G_UNICODE_CONNECT_PUNCTUATION:
806 WB_type = WB_ExtendNumLet; /* ExtendNumLet */
809 case G_UNICODE_INITIAL_PUNCTUATION:
810 case G_UNICODE_FINAL_PUNCTUATION:
811 if (wc == 0x2018 || wc == 0x2019)
812 WB_type = WB_MidNumLet; /* MidNumLet */
814 case G_UNICODE_OTHER_PUNCTUATION:
815 if (wc == 0x0027 || wc == 0x002e || wc == 0x2024 ||
816 wc == 0xfe52 || wc == 0xff07 || wc == 0xff0e)
817 WB_type = WB_MidNumLet; /* MidNumLet */
818 else if (wc == 0x00b7 || wc == 0x05f4 || wc == 0x2027 || wc == 0x003a || wc == 0x0387 ||
819 wc == 0xfe13 || wc == 0xfe55 || wc == 0xff1a)
820 WB_type = WB_MidLetter; /* WB_MidLetter */
821 else if (wc == 0x066c ||
822 wc == 0xfe50 || wc == 0xfe54 || wc == 0xff0c || wc == 0xff1b)
823 WB_type = WB_MidNum; /* MidNum */
826 case G_UNICODE_OTHER_SYMBOL:
827 if (wc >= 0x24B6 && wc <= 0x24E9) /* Other_Alphabetic */
831 case G_UNICODE_OTHER_LETTER:
832 case G_UNICODE_LETTER_NUMBER:
833 if (wc == 0x3006 || wc == 0x3007 ||
834 (wc >= 0x3021 && wc <= 0x3029) ||
835 (wc >= 0x3038 && wc <= 0x303A) ||
836 (wc >= 0x3400 && wc <= 0x4DB5) ||
837 (wc >= 0x4E00 && wc <= 0x9FC3) ||
838 (wc >= 0xF900 && wc <= 0xFA2D) ||
839 (wc >= 0xFA30 && wc <= 0xFA6A) ||
840 (wc >= 0xFA70 && wc <= 0xFAD9) ||
841 (wc >= 0x20000 && wc <= 0x2A6D6) ||
842 (wc >= 0x2F800 && wc <= 0x2FA1D))
843 break; /* ALetter exceptions: Ideographic */
846 case G_UNICODE_LOWERCASE_LETTER:
847 case G_UNICODE_MODIFIER_LETTER:
848 case G_UNICODE_TITLECASE_LETTER:
849 case G_UNICODE_UPPERCASE_LETTER:
851 if (break_type != G_UNICODE_BREAK_COMPLEX_CONTEXT && script != PANGO_SCRIPT_HIRAGANA)
852 WB_type = WB_ALetter; /* ALetter */
856 /* Word Boundary Rules */
858 /* We apply Rules WB1 and WB2 at the end of the function */
860 if (prev_wc == 0x3031 && wc == 0x41)
861 g_debug ("Y %d %d", prev_WB_type, WB_type);
862 if (prev_WB_type == WB_NewlineCRLF && prev_WB_i + 1 == i)
864 /* The extra check for prev_WB_i is to correctly handle sequences like
865 * Newline ÷ Extend × Extend
866 * since we have not skipped ExtendFormat yet.
868 is_word_boundary = TRUE; /* Rule WB3a */
870 else if (WB_type == WB_NewlineCRLF)
871 is_word_boundary = TRUE; /* Rule WB3b */
872 else if (WB_type == WB_ExtendFormat)
873 is_word_boundary = FALSE; /* Rules WB4? */
874 else if ((prev_WB_type == WB_ALetter ||
875 prev_WB_type == WB_Numeric ||
876 prev_WB_type == WB_ExtendNumLet) &&
877 ( WB_type == WB_ALetter ||
878 WB_type == WB_Numeric ||
879 WB_type == WB_ExtendNumLet))
880 is_word_boundary = FALSE; /* Rules WB5, WB8, WB9, WB10, WB13a, WB13b */
881 else if ((prev_WB_type == WB_Katakana ||
882 prev_WB_type == WB_ExtendNumLet) &&
883 ( WB_type == WB_Katakana ||
884 WB_type == WB_ExtendNumLet))
885 is_word_boundary = FALSE; /* Rules WB13, WB13a, WB13b */
886 else if ((prev_prev_WB_type == WB_ALetter && WB_type == WB_ALetter) &&
887 (prev_WB_type == WB_MidLetter || prev_WB_type == WB_MidNumLet))
889 attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB6 */
890 is_word_boundary = FALSE; /* Rule WB7 */
892 else if ((prev_prev_WB_type == WB_Numeric && WB_type == WB_Numeric) &&
893 (prev_WB_type == WB_MidNum || prev_WB_type == WB_MidNumLet))
895 is_word_boundary = FALSE; /* Rule WB11 */
896 attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB12 */
899 is_word_boundary = TRUE; /* Rule WB14 */
901 if (WB_type != WB_ExtendFormat)
903 prev_prev_WB_type = prev_WB_type;
904 prev_WB_type = WB_type;
909 attrs[i].is_word_boundary = is_word_boundary;
913 /* ---- Line breaking ---- */
915 break_op = BREAK_ALREADY_HANDLED;
917 g_assert (prev_break_type != G_UNICODE_BREAK_SPACE);
919 attrs[i].is_line_break = FALSE;
920 attrs[i].is_mandatory_break = FALSE;
922 if (attrs[i].is_cursor_position) /* If it's not a grapheme boundary,
923 * it's not a line break either
926 /* space followed by a combining mark is handled
927 * specially; (rule 7a from TR 14)
929 if (break_type == G_UNICODE_BREAK_SPACE &&
930 next_break_type == G_UNICODE_BREAK_COMBINING_MARK)
931 break_type = G_UNICODE_BREAK_IDEOGRAPHIC;
933 /* Unicode doesn't specify char wrap; we wrap around all chars
934 * except where a line break is prohibited, which means we
935 * effectively break everywhere except inside runs of spaces.
937 attrs[i].is_char_break = TRUE;
939 /* Make any necessary replacements first */
940 switch ((int) prev_break_type)
942 case G_UNICODE_BREAK_HANGUL_L_JAMO:
943 case G_UNICODE_BREAK_HANGUL_V_JAMO:
944 case G_UNICODE_BREAK_HANGUL_T_JAMO:
945 case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
946 case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
947 /* treat Jamo as IDEOGRAPHIC from now
949 prev_break_type = G_UNICODE_BREAK_IDEOGRAPHIC;
952 case G_UNICODE_BREAK_AMBIGUOUS:
954 * we need to resolve the East Asian width
955 * to decide what to do here
957 case G_UNICODE_BREAK_COMPLEX_CONTEXT:
959 * language engines should handle this case...
961 case G_UNICODE_BREAK_UNKNOWN:
962 /* convert unknown, complex, ambiguous to ALPHABETIC
964 prev_break_type = G_UNICODE_BREAK_ALPHABETIC;
971 switch ((int) prev_break_type)
973 case G_UNICODE_BREAK_MANDATORY:
974 case G_UNICODE_BREAK_LINE_FEED:
975 case G_UNICODE_BREAK_NEXT_LINE:
976 attrs[i].is_line_break = TRUE;
977 attrs[i].is_mandatory_break = TRUE;
980 case G_UNICODE_BREAK_CARRIAGE_RETURN:
983 attrs[i].is_line_break = TRUE;
984 attrs[i].is_mandatory_break = TRUE;
988 case G_UNICODE_BREAK_CONTINGENT:
989 /* can break after 0xFFFC by default, though we might want
990 * to eventually have a PangoLayout setting or
991 * PangoAttribute that disables this, if for some
992 * application breaking after objects is not desired.
994 break_op = BREAK_ALLOWED;
997 case G_UNICODE_BREAK_SURROGATE:
998 g_assert_not_reached ();
1002 g_assert (IN_BREAK_TABLE (prev_break_type));
1004 /* Note that our table assumes that combining marks
1005 * are only applied to alphabetic characters;
1006 * tech report 14 explains how to remove this assumption
1007 * from the code, if anyone ever cares, but it shouldn't
1008 * be a problem. Also this issue sort of goes
1009 * away since we only look for breaks on grapheme
1013 switch ((int) break_type)
1015 case G_UNICODE_BREAK_MANDATORY:
1016 case G_UNICODE_BREAK_LINE_FEED:
1017 case G_UNICODE_BREAK_CARRIAGE_RETURN:
1018 case G_UNICODE_BREAK_NEXT_LINE:
1019 case G_UNICODE_BREAK_SPACE:
1020 /* These types all "pile up" at the end of lines and
1023 break_op = BREAK_PROHIBITED;
1026 case G_UNICODE_BREAK_CONTINGENT:
1027 /* break before 0xFFFC by default, eventually
1028 * make this configurable?
1030 break_op = BREAK_ALLOWED;
1033 case G_UNICODE_BREAK_SURROGATE:
1034 g_assert_not_reached ();
1037 /* Hangul additions are from Unicode 4.1 UAX#14 */
1038 case G_UNICODE_BREAK_HANGUL_L_JAMO:
1039 case G_UNICODE_BREAK_HANGUL_V_JAMO:
1040 case G_UNICODE_BREAK_HANGUL_T_JAMO:
1041 case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
1042 case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
1043 /* treat Jamo as IDEOGRAPHIC from now
1045 break_type = G_UNICODE_BREAK_IDEOGRAPHIC;
1047 if (makes_hangul_syllable)
1048 break_op = BREAK_IF_SPACES;
1050 break_op = BREAK_ALLOWED;
1053 case G_UNICODE_BREAK_AMBIGUOUS:
1055 * we need to resolve the East Asian width
1056 * to decide what to do here
1058 case G_UNICODE_BREAK_COMPLEX_CONTEXT:
1060 * language engines should handle this case...
1062 case G_UNICODE_BREAK_UNKNOWN:
1063 /* treat unknown, complex, and ambiguous like ALPHABETIC
1066 break_op = BREAK_OP (prev_break_type, G_UNICODE_BREAK_ALPHABETIC);
1071 g_assert (IN_BREAK_TABLE (break_type));
1072 break_op = BREAK_OP (prev_break_type, break_type);
1080 case BREAK_PROHIBITED:
1081 /* can't break here */
1082 attrs[i].is_char_break = FALSE;
1085 case BREAK_IF_SPACES:
1086 /* break if prev char was space */
1087 if (prev_was_break_space)
1088 attrs[i].is_line_break = TRUE;
1092 attrs[i].is_line_break = TRUE;
1095 case BREAK_ALREADY_HANDLED:
1099 g_assert_not_reached ();
1104 if (break_type != G_UNICODE_BREAK_SPACE)
1106 prev_break_type = break_type;
1107 prev_was_break_space = FALSE;
1111 prev_was_break_space = TRUE;
1113 /* ---- Word breaks ---- */
1115 /* default to not a word start/end */
1116 attrs[i].is_word_start = FALSE;
1117 attrs[i].is_word_end = FALSE;
1119 if (current_word_type != WordNone)
1121 /* Check for a word end */
1124 case G_UNICODE_COMBINING_MARK:
1125 case G_UNICODE_ENCLOSING_MARK:
1126 case G_UNICODE_NON_SPACING_MARK:
1127 case G_UNICODE_FORMAT:
1128 /* nothing, we just eat these up as part of the word */
1131 case G_UNICODE_LOWERCASE_LETTER:
1132 case G_UNICODE_MODIFIER_LETTER:
1133 case G_UNICODE_OTHER_LETTER:
1134 case G_UNICODE_TITLECASE_LETTER:
1135 case G_UNICODE_UPPERCASE_LETTER:
1136 if (current_word_type == WordLetters)
1138 /* Japanese special cases for ending the word */
1139 if (JAPANESE (last_word_letter) ||
1142 if ((HIRAGANA (last_word_letter) &&
1144 (KATAKANA (last_word_letter) &&
1145 !(KATAKANA (wc) || HIRAGANA (wc))) ||
1146 (KANJI (last_word_letter) &&
1147 !(HIRAGANA (wc) || KANJI (wc))) ||
1148 (JAPANESE (last_word_letter) &&
1150 (!JAPANESE (last_word_letter) &&
1152 attrs[i].is_word_end = TRUE;
1157 /* end the number word, start the letter word */
1158 attrs[i].is_word_end = TRUE;
1159 attrs[i].is_word_start = TRUE;
1160 current_word_type = WordLetters;
1163 last_word_letter = wc;
1166 case G_UNICODE_DECIMAL_NUMBER:
1167 case G_UNICODE_LETTER_NUMBER:
1168 case G_UNICODE_OTHER_NUMBER:
1169 if (current_word_type != WordNumbers)
1171 attrs[i].is_word_end = TRUE;
1172 attrs[i].is_word_start = TRUE;
1173 current_word_type = WordNumbers;
1176 last_word_letter = wc;
1180 /* Punctuation, control/format chars, etc. all end a word. */
1181 attrs[i].is_word_end = TRUE;
1182 current_word_type = WordNone;
1188 /* Check for a word start */
1191 case G_UNICODE_LOWERCASE_LETTER:
1192 case G_UNICODE_MODIFIER_LETTER:
1193 case G_UNICODE_OTHER_LETTER:
1194 case G_UNICODE_TITLECASE_LETTER:
1195 case G_UNICODE_UPPERCASE_LETTER:
1196 current_word_type = WordLetters;
1197 last_word_letter = wc;
1198 attrs[i].is_word_start = TRUE;
1201 case G_UNICODE_DECIMAL_NUMBER:
1202 case G_UNICODE_LETTER_NUMBER:
1203 case G_UNICODE_OTHER_NUMBER:
1204 current_word_type = WordNumbers;
1205 last_word_letter = wc;
1206 attrs[i].is_word_start = TRUE;
1215 /* ---- Sentence breaks ---- */
1217 /* The Unicode spec specifies sentence breakpoints, so that a piece of
1218 * text would be partitioned into sentences, and all characters would
1219 * be inside some sentence. This code implements that for is_sentence_boundary,
1220 * but tries to keep leading/trailing whitespace out of sentences for
1221 * the start/end flags
1224 /* The Unicode spec seems to say that one trailing line/para
1225 * separator can be tacked on to a sentence ending in ! or ?,
1226 * but not a sentence ending in period; I think they're on crack
1227 * so am allowing one to be tacked onto a sentence ending in period.
1230 #define MAYBE_START_NEW_SENTENCE \
1231 switch ((int) type) \
1233 case G_UNICODE_LINE_SEPARATOR: \
1234 case G_UNICODE_PARAGRAPH_SEPARATOR: \
1235 case G_UNICODE_CONTROL: \
1236 case G_UNICODE_FORMAT: \
1237 case G_UNICODE_SPACE_SEPARATOR: \
1238 sentence_state = STATE_SENTENCE_OUTSIDE; \
1242 sentence_state = STATE_SENTENCE_BODY; \
1243 attrs[i].is_sentence_start = TRUE; \
1247 /* No sentence break at the start of the text */
1249 /* default to not a sentence breakpoint */
1250 attrs[i].is_sentence_boundary = FALSE;
1251 attrs[i].is_sentence_start = FALSE;
1252 attrs[i].is_sentence_end = FALSE;
1254 /* FIXME the Unicode spec lumps control/format chars with
1255 * line/para separators in descriptive text, but not in the
1256 * character class specs, in table 5-6, so who knows whether you
1257 * are actually supposed to break on control/format
1258 * characters. Seems semi-broken to break on tabs...
1261 /* Break after line/para separators except carriage return
1262 * followed by newline
1264 switch ((int) prev_type)
1266 case G_UNICODE_LINE_SEPARATOR:
1267 case G_UNICODE_PARAGRAPH_SEPARATOR:
1268 case G_UNICODE_CONTROL:
1269 case G_UNICODE_FORMAT:
1272 if (next_wc != '\n')
1273 attrs[i].is_sentence_boundary = TRUE;
1276 attrs[i].is_sentence_boundary = TRUE;
1283 /* break before para/line separators except newline following
1288 case G_UNICODE_LINE_SEPARATOR:
1289 case G_UNICODE_PARAGRAPH_SEPARATOR:
1290 case G_UNICODE_CONTROL:
1291 case G_UNICODE_FORMAT:
1294 if (prev_wc != '\r')
1295 attrs[i].is_sentence_boundary = TRUE;
1298 attrs[i].is_sentence_boundary = TRUE;
1305 switch (sentence_state)
1307 case STATE_SENTENCE_OUTSIDE:
1308 /* Start sentence if we have non-whitespace/format/control */
1311 case G_UNICODE_LINE_SEPARATOR:
1312 case G_UNICODE_PARAGRAPH_SEPARATOR:
1313 case G_UNICODE_CONTROL:
1314 case G_UNICODE_FORMAT:
1315 case G_UNICODE_SPACE_SEPARATOR:
1319 attrs[i].is_sentence_start = TRUE;
1320 sentence_state = STATE_SENTENCE_BODY;
1325 case STATE_SENTENCE_BODY:
1326 /* If we already broke here due to separators, end the sentence. */
1327 if (attrs[i].is_sentence_boundary)
1329 attrs[i].is_sentence_end = TRUE;
1331 MAYBE_START_NEW_SENTENCE;
1336 sentence_state = STATE_SENTENCE_DOT;
1337 else if (wc == '?' || wc == '!')
1338 sentence_state = STATE_SENTENCE_TERM;
1342 case STATE_SENTENCE_TERM:
1343 /* End sentence on anything but close punctuation and some
1344 * loosely-specified OTHER_PUNCTUATION such as period,
1345 * comma, etc.; follow Unicode rules for breaks
1349 case G_UNICODE_OTHER_PUNCTUATION:
1350 case G_UNICODE_CLOSE_PUNCTUATION:
1351 if (type == G_UNICODE_CLOSE_PUNCTUATION ||
1356 sentence_state = STATE_SENTENCE_POST_TERM_CLOSE;
1359 attrs[i].is_sentence_end = TRUE;
1360 attrs[i].is_sentence_boundary = TRUE;
1362 MAYBE_START_NEW_SENTENCE;
1366 case G_UNICODE_SPACE_SEPARATOR:
1367 attrs[i].is_sentence_end = TRUE;
1368 sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
1371 case G_UNICODE_LINE_SEPARATOR:
1372 case G_UNICODE_PARAGRAPH_SEPARATOR:
1373 attrs[i].is_sentence_end = TRUE;
1374 sentence_state = STATE_SENTENCE_POST_TERM_SEP;
1378 attrs[i].is_sentence_end = TRUE;
1379 attrs[i].is_sentence_boundary = TRUE;
1381 MAYBE_START_NEW_SENTENCE;
1387 case STATE_SENTENCE_POST_TERM_CLOSE:
1388 /* End sentence on anything besides more punctuation; follow
1393 case G_UNICODE_OTHER_PUNCTUATION:
1394 case G_UNICODE_CLOSE_PUNCTUATION:
1395 if (type == G_UNICODE_CLOSE_PUNCTUATION ||
1400 /* continue in this state */
1404 attrs[i].is_sentence_end = TRUE;
1405 attrs[i].is_sentence_boundary = TRUE;
1407 MAYBE_START_NEW_SENTENCE;
1411 case G_UNICODE_SPACE_SEPARATOR:
1412 attrs[i].is_sentence_end = TRUE;
1413 sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
1416 case G_UNICODE_LINE_SEPARATOR:
1417 case G_UNICODE_PARAGRAPH_SEPARATOR:
1418 attrs[i].is_sentence_end = TRUE;
1419 /* undo the unconditional break-at-all-line/para-separators
1420 * from above; I'm not sure this is what the Unicode spec
1421 * intends, but it seems right - we get to include
1422 * a single line/para separator in the sentence according
1425 attrs[i].is_sentence_boundary = FALSE;
1426 sentence_state = STATE_SENTENCE_POST_TERM_SEP;
1430 attrs[i].is_sentence_end = TRUE;
1431 attrs[i].is_sentence_boundary = TRUE;
1433 MAYBE_START_NEW_SENTENCE;
1439 case STATE_SENTENCE_POST_TERM_SPACE:
1441 /* Sentence is definitely already ended; to enter this state
1442 * we had to see a space, which ends the sentence.
1447 case G_UNICODE_SPACE_SEPARATOR:
1448 /* continue in this state */
1451 case G_UNICODE_LINE_SEPARATOR:
1452 case G_UNICODE_PARAGRAPH_SEPARATOR:
1453 /* undo the unconditional break-at-all-line/para-separators
1454 * from above; I'm not sure this is what the Unicode spec
1455 * intends, but it seems right
1457 attrs[i].is_sentence_boundary = FALSE;
1458 sentence_state = STATE_SENTENCE_POST_TERM_SEP;
1462 attrs[i].is_sentence_boundary = TRUE;
1464 MAYBE_START_NEW_SENTENCE;
1470 case STATE_SENTENCE_POST_TERM_SEP:
1471 /* Break is forced at this point, unless we're a newline
1472 * after a CR, then we will break after the newline on the
1473 * next iteration. Only a single Sep can be in the
1476 if (!(prev_wc == '\r' && wc == '\n'))
1477 attrs[i].is_sentence_boundary = TRUE;
1479 MAYBE_START_NEW_SENTENCE;
1483 case STATE_SENTENCE_DOT:
1486 case G_UNICODE_CLOSE_PUNCTUATION:
1487 sentence_state = STATE_SENTENCE_POST_DOT_CLOSE;
1490 case G_UNICODE_SPACE_SEPARATOR:
1491 possible_sentence_end = i;
1492 sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
1496 /* If we broke on a control/format char, end the
1497 * sentence; else this was not a sentence end, since
1498 * we didn't enter the POST_DOT_SPACE state.
1500 if (attrs[i].is_sentence_boundary)
1502 attrs[i].is_sentence_end = TRUE;
1504 MAYBE_START_NEW_SENTENCE;
1507 sentence_state = STATE_SENTENCE_BODY;
1512 case STATE_SENTENCE_POST_DOT_CLOSE:
1515 case G_UNICODE_SPACE_SEPARATOR:
1516 possible_sentence_end = i;
1517 sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
1521 /* If we broke on a control/format char, end the
1522 * sentence; else this was not a sentence end, since
1523 * we didn't enter the POST_DOT_SPACE state.
1525 if (attrs[i].is_sentence_boundary)
1527 attrs[i].is_sentence_end = TRUE;
1529 MAYBE_START_NEW_SENTENCE;
1532 sentence_state = STATE_SENTENCE_BODY;
1537 case STATE_SENTENCE_POST_DOT_SPACE:
1539 possible_sentence_boundary = i;
1543 case G_UNICODE_SPACE_SEPARATOR:
1544 /* remain in current state */
1547 case G_UNICODE_OPEN_PUNCTUATION:
1548 sentence_state = STATE_SENTENCE_POST_DOT_OPEN;
1551 case G_UNICODE_LOWERCASE_LETTER:
1552 /* wasn't a sentence-ending period; so re-enter the sentence
1555 sentence_state = STATE_SENTENCE_BODY;
1559 /* End the sentence, break, maybe start a new one */
1561 g_assert (possible_sentence_end >= 0);
1562 g_assert (possible_sentence_boundary >= 0);
1564 attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
1565 attrs[possible_sentence_end].is_sentence_end = TRUE;
1567 possible_sentence_end = -1;
1568 possible_sentence_boundary = -1;
1570 MAYBE_START_NEW_SENTENCE;
1576 case STATE_SENTENCE_POST_DOT_OPEN:
1579 case G_UNICODE_OPEN_PUNCTUATION:
1580 /* continue in current state */
1583 case G_UNICODE_LOWERCASE_LETTER:
1584 /* wasn't a sentence-ending period; so re-enter the sentence
1587 sentence_state = STATE_SENTENCE_BODY;
1591 /* End the sentence, break, maybe start a new one */
1593 g_assert (possible_sentence_end >= 0);
1594 g_assert (possible_sentence_boundary >= 0);
1596 attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
1597 attrs[possible_sentence_end].is_sentence_end = TRUE;
1599 possible_sentence_end = -1;
1600 possible_sentence_boundary = -1;
1602 MAYBE_START_NEW_SENTENCE;
1608 case STATE_SENTENCE_POST_DOT_SEP:
1609 /* Break is forced at this point, unless we're a newline
1610 * after a CR, then we will break after the newline on the
1611 * next iteration. Only a single Sep can be in the
1614 if (!(prev_wc == '\r' && wc == '\n'))
1615 attrs[i].is_sentence_boundary = TRUE;
1617 g_assert (possible_sentence_end >= 0);
1618 g_assert (possible_sentence_boundary >= 0);
1620 attrs[possible_sentence_end].is_sentence_end = TRUE;
1622 possible_sentence_end = -1;
1623 possible_sentence_boundary = -1;
1625 MAYBE_START_NEW_SENTENCE;
1630 g_assert_not_reached ();
1637 /* wc might not be a valid Unicode base character, but really all we
1638 * need to know is the last non-combining character */
1639 if (type != G_UNICODE_COMBINING_MARK &&
1640 type != G_UNICODE_ENCLOSING_MARK &&
1641 type != G_UNICODE_NON_SPACING_MARK)
1642 base_character = wc;
1646 attrs[i].is_cursor_position = TRUE; /* Rule GB2 */
1647 attrs[0].is_cursor_position = TRUE; /* Rule GB1 */
1649 attrs[i].is_word_boundary = TRUE; /* Rule WB2 */
1650 attrs[0].is_word_boundary = TRUE; /* Rule WB1 */
1652 attrs[i].is_line_break = TRUE; /* Rule LB3 */
1653 attrs[0].is_line_break = FALSE; /* Rule LB2 */
1658 tailor_break (const gchar *text,
1660 PangoAnalysis *analysis,
1661 PangoLogAttr *attrs,
1664 if (analysis->lang_engine && PANGO_ENGINE_LANG_GET_CLASS (analysis->lang_engine)->script_break)
1667 length = strlen (text);
1668 else if (text == NULL)
1671 PANGO_ENGINE_LANG_GET_CLASS (analysis->lang_engine)->script_break (analysis->lang_engine, text, length, analysis, attrs, attrs_len);
1679 * @text: the text to process
1680 * @length: length of @text in bytes (may be -1 if @text is nul-terminated)
1681 * @analysis: #PangoAnalysis structure from pango_itemize()
1682 * @attrs: an array to store character information in
1683 * @attrs_len: size of the array passed as @attrs
1685 * Determines possible line, word, and character breaks
1686 * for a string of Unicode text with a single analysis. For most
1687 * purposes you may want to use pango_get_log_attrs().
1690 pango_break (const gchar *text,
1692 PangoAnalysis *analysis,
1693 PangoLogAttr *attrs,
1696 g_return_if_fail (analysis != NULL);
1697 g_return_if_fail (attrs != NULL);
1699 pango_default_break (text, length, analysis, attrs, attrs_len);
1700 tailor_break (text, length, analysis, attrs, attrs_len);
1704 * pango_find_paragraph_boundary:
1706 * @length: length of @text in bytes, or -1 if nul-terminated
1707 * @paragraph_delimiter_index: return location for index of delimiter
1708 * @next_paragraph_start: return location for start of next paragraph
1710 * Locates a paragraph boundary in @text. A boundary is caused by
1711 * delimiter characters, such as a newline, carriage return, carriage
1712 * return-newline pair, or Unicode paragraph separator character. The
1713 * index of the run of delimiters is returned in
1714 * @paragraph_delimiter_index. The index of the start of the paragraph
1715 * (index after all delimiters) is stored in @next_paragraph_start.
1717 * If no delimiters are found, both @paragraph_delimiter_index and
1718 * @next_paragraph_start are filled with the length of @text (an index one
1722 pango_find_paragraph_boundary (const gchar *text,
1724 gint *paragraph_delimiter_index,
1725 gint *next_paragraph_start)
1727 const gchar *p = text;
1729 const gchar *start = NULL;
1730 const gchar *delimiter = NULL;
1732 /* Only one character has type G_UNICODE_PARAGRAPH_SEPARATOR in
1733 * Unicode 5.0; update the following code if that changes.
1736 /* prev_sep is the first byte of the previous separator. Since
1737 * the valid separators are \r, \n, and PARAGRAPH_SEPARATOR, the
1738 * first byte is enough to identify it.
1744 length = strlen (text);
1746 end = text + length;
1748 if (paragraph_delimiter_index)
1749 *paragraph_delimiter_index = length;
1751 if (next_paragraph_start)
1752 *next_paragraph_start = length;
1761 if (prev_sep == '\n' ||
1762 prev_sep == PARAGRAPH_SEPARATOR_STRING[0])
1764 g_assert (delimiter);
1768 else if (prev_sep == '\r')
1770 /* don't break between \r and \n */
1773 g_assert (delimiter);
1781 !strncmp(p, PARAGRAPH_SEPARATOR_STRING,
1782 strlen(PARAGRAPH_SEPARATOR_STRING)))
1784 if (delimiter == NULL)
1791 p = g_utf8_next_char (p);
1794 if (delimiter && paragraph_delimiter_index)
1795 *paragraph_delimiter_index = delimiter - text;
1797 if (start && next_paragraph_start)
1798 *next_paragraph_start = start - text;
1802 tailor_segment (const char *range_start,
1803 const char *range_end,
1804 PangoEngineLang *range_engine,
1806 PangoAnalysis *analysis,
1807 PangoLogAttr *log_attrs)
1810 PangoLogAttr attr_before = log_attrs[0];
1812 analysis->lang_engine = range_engine;
1813 chars_in_range = pango_utf8_strlen (range_start, range_end - range_start);
1816 if (tailor_break (range_start,
1817 range_end - range_start,
1819 log_attrs + chars_broken,
1820 chars_in_range + 1))
1822 /* if tailored, we enforce some of the attrs from before tailoring at
1826 log_attrs[0].backspace_deletes_character = attr_before.backspace_deletes_character;
1828 log_attrs[0].is_line_break |= attr_before.is_line_break;
1829 log_attrs[0].is_mandatory_break |= attr_before.is_mandatory_break;
1830 log_attrs[0].is_cursor_position |= attr_before.is_cursor_position;
1833 return chars_in_range;
1837 * pango_get_log_attrs:
1838 * @text: text to process
1839 * @length: length in bytes of @text
1840 * @level: embedding level, or -1 if unknown
1841 * @language: language tag
1842 * @log_attrs: array with one #PangoLogAttr per character in @text, plus one extra, to be filled in
1843 * @attrs_len: length of @log_attrs array
1845 * Computes a #PangoLogAttr for each character in @text. The @log_attrs
1846 * array must have one #PangoLogAttr for each position in @text; if
1847 * @text contains N characters, it has N+1 positions, including the
1848 * last position at the end of the text. @text should be an entire
1849 * paragraph; logical attributes can't be computed without context
1850 * (for example you need to see spaces on either side of a word to know
1851 * the word is a word).
1854 pango_get_log_attrs (const char *text,
1857 PangoLanguage *language,
1858 PangoLogAttr *log_attrs,
1863 const char *range_start, *range_end;
1865 PangoEngineLang *range_engine;
1866 static guint engine_type_id = 0;
1867 static guint render_type_id = 0;
1868 PangoAnalysis analysis = { NULL };
1869 PangoScriptIter iter;
1871 g_return_if_fail (length == 0 || text != NULL);
1872 g_return_if_fail (log_attrs != NULL);
1874 analysis.level = level;
1876 pango_default_break (text, length, &analysis, log_attrs, attrs_len);
1878 if (engine_type_id == 0)
1880 engine_type_id = g_quark_from_static_string (PANGO_ENGINE_TYPE_LANG);
1881 render_type_id = g_quark_from_static_string (PANGO_RENDER_TYPE_NONE);
1884 lang_map = pango_find_map (language, engine_type_id, render_type_id);
1888 _pango_script_iter_init (&iter, text, length);
1889 pango_script_iter_get_range (&iter, &range_start, &range_end, &script);
1890 range_engine = (PangoEngineLang*) pango_map_get_engine (lang_map, script);
1891 g_assert (range_start == text);
1893 while (pango_script_iter_next (&iter))
1895 const char *run_start, *run_end;
1896 PangoEngineLang* run_engine;
1898 pango_script_iter_get_range (&iter, &run_start, &run_end, &script);
1899 run_engine = (PangoEngineLang*) pango_map_get_engine (lang_map, script);
1900 g_assert (range_end == run_start);
1902 if (range_engine != run_engine)
1904 /* Engine has changed; do the tailoring for the current range,
1905 * then start a new range.
1907 chars_broken += tailor_segment (range_start, range_end, range_engine, chars_broken, &analysis, log_attrs);
1909 range_start = run_start;
1910 range_engine = run_engine;
1912 range_end = run_end;
1914 _pango_script_iter_fini (&iter);
1916 g_assert (length < 0 || range_end == text + length);
1918 chars_broken += tailor_segment (range_start, range_end, range_engine, chars_broken, &analysis, log_attrs);
1920 if (chars_broken + 1 > attrs_len)
1921 g_warning ("pango_get_log_attrs: attrs_len should have been at least %d, but was %d. Expect corrupted memory.",