Git init
[external/pango1.0.git] / pango / break.c
1 /* Pango
2  * break.c:
3  *
4  * Copyright (C) 1999 Red Hat Software
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Library General Public
8  * License as published by the Free Software Foundation; either
9  * version 2 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Library General Public License for more details.
15  *
16  * You should have received a copy of the GNU Library General Public
17  * License along with this library; if not, write to the
18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19  * Boston, MA 02111-1307, USA.
20  */
21
22 #include "config.h"
23
24 #include "pango-break.h"
25 #include "pango-modules.h"
26 #include "pango-script-private.h"
27 #include "pango-impl-utils.h"
28 #include <string.h>
29
/* U+2029 PARAGRAPH SEPARATOR: first as a code point, then as the three bytes
 * of its UTF-8 encoding.
 */
#define PARAGRAPH_SEPARATOR 0x2029
#define PARAGRAPH_SEPARATOR_STRING "\xE2\x80\xA9"
32
33 /* See http://www.unicode.org/unicode/reports/tr14/ if you hope
34  * to understand the line breaking code.
35  */
36
/* Outcome of a lookup in the line-break pair table. */
typedef enum
{
  /* The pair table was not used for this position. */
  BREAK_ALREADY_HANDLED = 0,

  /* No break, even if spaces intervene. */
  BREAK_PROHIBITED = 1,

  /* "Indirect break": a break only if there are spaces. */
  BREAK_IF_SPACES = 2,

  /* "Direct break": a break is always possible here. */
  BREAK_ALLOWED = 3

  /* TR 14 defines one more break-opportunity class, "indirect break
   * opportunity for combining marks following a space"; it has no
   * enumerator because that case is handled inline in the code.
   */
} BreakOpportunity;
48
49
/* Indexes of the line-break classes.  Values below INDEX_END_OF_TABLE select
 * a row/column of the pair table; the values after it identify classes that
 * have no row or column.  The two-letter tags are the UAX #14 class names.
 */
enum
{
  INDEX_OPEN_PUNCTUATION = 0,   /* OP */
  INDEX_CLOSE_PUNCTUATION,      /* CL */
  INDEX_QUOTATION,              /* QU */
  INDEX_NON_BREAKING_GLUE,      /* GL */
  INDEX_NON_STARTER,            /* NS */
  INDEX_EXCLAMATION,            /* EX */
  INDEX_SYMBOL,                 /* SY */
  INDEX_INFIX_SEPARATOR,        /* IS */
  INDEX_PREFIX,                 /* PR */
  INDEX_POSTFIX,                /* PO */
  INDEX_NUMERIC,                /* NU */
  INDEX_ALPHABETIC,             /* AL */
  INDEX_IDEOGRAPHIC,            /* ID */
  INDEX_INSEPARABLE,            /* IN */
  INDEX_HYPHEN,                 /* HY */
  INDEX_AFTER,                  /* BA */
  INDEX_BEFORE,                 /* BB */
  INDEX_BEFORE_AND_AFTER,       /* B2 */
  INDEX_ZERO_WIDTH_SPACE,       /* ZW */
  INDEX_COMBINING_MARK,         /* CM */
  INDEX_WORD_JOINER,            /* WJ */

  /* Number of rows and columns in the pair table. */
  INDEX_END_OF_TABLE,

  /* Classes that are not in the tables. */
  INDEX_MANDATORY,              /* BK */
  INDEX_CARRIAGE_RETURN,        /* CR */
  INDEX_LINE_FEED,              /* LF */
  INDEX_SURROGATE,              /* SG */
  INDEX_CONTINGENT,             /* CB */
  INDEX_SPACE,                  /* SP */
  INDEX_COMPLEX_CONTEXT,        /* SA */
  INDEX_AMBIGUOUS,              /* AI */
  INDEX_UNKNOWN,                /* XX */
  INDEX_NEXT_LINE,              /* NL */
  INDEX_HANGUL_L_JAMO,          /* JL */
  INDEX_HANGUL_V_JAMO,          /* JV */
  INDEX_HANGUL_T_JAMO,          /* JT */
  INDEX_HANGUL_LV_SYLLABLE,     /* H2 */
  INDEX_HANGUL_LVT_SYLLABLE     /* H3 */
};
95
/* Pair-table row used when the class BEFORE the candidate break is
 * OPEN_PUNCTUATION (line_break_rows[INDEX_OPEN_PUNCTUATION]).  Each entry is
 * the BreakOpportunity for one class AFTER the break, in INDEX_* order from
 * INDEX_OPEN_PUNCTUATION to INDEX_WORD_JOINER.  Every entry in this row is
 * BREAK_PROHIBITED.
 */
static const BreakOpportunity row_OPEN_PUNCTUATION[INDEX_END_OF_TABLE] = {
  BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* PREFIX .. ALPHABETIC */
  BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* IDEOGRAPHIC .. AFTER */
  BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
104
/* Pair-table row used when the class BEFORE the candidate break is
 * CLOSE_PUNCTUATION (line_break_rows[INDEX_CLOSE_PUNCTUATION]).  Each entry
 * is the BreakOpportunity for one class AFTER the break, in INDEX_* order
 * from INDEX_OPEN_PUNCTUATION to INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_CLOSE_PUNCTUATION[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED, /* PREFIX .. ALPHABETIC */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
113
/* Pair-table row used when the class BEFORE the candidate break is QUOTATION
 * (line_break_rows[INDEX_QUOTATION]).  Each entry is the BreakOpportunity for
 * one class AFTER the break, in INDEX_* order from INDEX_OPEN_PUNCTUATION to
 * INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_QUOTATION[INDEX_END_OF_TABLE] = {
  BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* PREFIX .. ALPHABETIC */
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
122
/* Pair-table row used when the class BEFORE the candidate break is
 * NON_BREAKING_GLUE (line_break_rows[INDEX_NON_BREAKING_GLUE]).  Each entry
 * is the BreakOpportunity for one class AFTER the break, in INDEX_* order
 * from INDEX_OPEN_PUNCTUATION to INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_NON_BREAKING_GLUE[INDEX_END_OF_TABLE] = {
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* PREFIX .. ALPHABETIC */
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
131
/* Pair-table row used when the class BEFORE the candidate break is
 * NON_STARTER (line_break_rows[INDEX_NON_STARTER]).  Each entry is the
 * BreakOpportunity for one class AFTER the break, in INDEX_* order from
 * INDEX_OPEN_PUNCTUATION to INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_NON_STARTER[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* PREFIX .. ALPHABETIC */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
140
/* Pair-table row used when the class BEFORE the candidate break is
 * EXCLAMATION (line_break_rows[INDEX_EXCLAMATION]).  Each entry is the
 * BreakOpportunity for one class AFTER the break, in INDEX_* order from
 * INDEX_OPEN_PUNCTUATION to INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_EXCLAMATION[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* PREFIX .. ALPHABETIC */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
149
/* Pair-table row used when the class BEFORE the candidate break is SYMBOL
 * (line_break_rows[INDEX_SYMBOL]).  Each entry is the BreakOpportunity for
 * one class AFTER the break, in INDEX_* order from INDEX_OPEN_PUNCTUATION to
 * INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_SYMBOL[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, /* PREFIX .. ALPHABETIC */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
158
/* Pair-table row used when the class BEFORE the candidate break is
 * INFIX_SEPARATOR (line_break_rows[INDEX_INFIX_SEPARATOR]).  Each entry is
 * the BreakOpportunity for one class AFTER the break, in INDEX_* order from
 * INDEX_OPEN_PUNCTUATION to INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_INFIX_SEPARATOR[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* PREFIX .. ALPHABETIC */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
167
/* Pair-table row used when the class BEFORE the candidate break is PREFIX
 * (line_break_rows[INDEX_PREFIX]).  Each entry is the BreakOpportunity for
 * one class AFTER the break, in INDEX_* order from INDEX_OPEN_PUNCTUATION to
 * INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_PREFIX[INDEX_END_OF_TABLE] = {
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* PREFIX .. ALPHABETIC */
  BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
176
/* Pair-table row used when the class BEFORE the candidate break is POSTFIX
 * (line_break_rows[INDEX_POSTFIX]).  Each entry is the BreakOpportunity for
 * one class AFTER the break, in INDEX_* order from INDEX_OPEN_PUNCTUATION to
 * INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_POSTFIX[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* PREFIX .. ALPHABETIC */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
185
/* Pair-table row used when the class BEFORE the candidate break is NUMERIC
 * (line_break_rows[INDEX_NUMERIC]).  Each entry is the BreakOpportunity for
 * one class AFTER the break, in INDEX_* order from INDEX_OPEN_PUNCTUATION to
 * INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_NUMERIC[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* PREFIX .. ALPHABETIC */
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
194
/* Pair-table row used when the class BEFORE the candidate break is
 * ALPHABETIC (line_break_rows[INDEX_ALPHABETIC]).  Each entry is the
 * BreakOpportunity for one class AFTER the break, in INDEX_* order from
 * INDEX_OPEN_PUNCTUATION to INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_ALPHABETIC[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* PREFIX .. ALPHABETIC */
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
203
/* Pair-table row used when the class BEFORE the candidate break is
 * IDEOGRAPHIC (line_break_rows[INDEX_IDEOGRAPHIC]).  Each entry is the
 * BreakOpportunity for one class AFTER the break, in INDEX_* order from
 * INDEX_OPEN_PUNCTUATION to INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_IDEOGRAPHIC[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED, /* PREFIX .. ALPHABETIC */
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
212
/* Pair-table row used when the class BEFORE the candidate break is
 * INSEPARABLE (line_break_rows[INDEX_INSEPARABLE]).  Each entry is the
 * BreakOpportunity for one class AFTER the break, in INDEX_* order from
 * INDEX_OPEN_PUNCTUATION to INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_INSEPARABLE[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* PREFIX .. ALPHABETIC */
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
221
/* Pair-table row used when the class BEFORE the candidate break is HYPHEN
 * (line_break_rows[INDEX_HYPHEN]).  Each entry is the BreakOpportunity for
 * one class AFTER the break, in INDEX_* order from INDEX_OPEN_PUNCTUATION to
 * INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_HYPHEN[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, /* PREFIX .. ALPHABETIC */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
230
/* Pair-table row used when the class BEFORE the candidate break is AFTER
 * (line_break_rows[INDEX_AFTER]).  Each entry is the BreakOpportunity for one
 * class AFTER the break, in INDEX_* order from INDEX_OPEN_PUNCTUATION to
 * INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_AFTER[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* PREFIX .. ALPHABETIC */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
239
/* Pair-table row used when the class BEFORE the candidate break is BEFORE
 * (line_break_rows[INDEX_BEFORE]).  Each entry is the BreakOpportunity for
 * one class AFTER the break, in INDEX_* order from INDEX_OPEN_PUNCTUATION to
 * INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_BEFORE[INDEX_END_OF_TABLE] = {
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* PREFIX .. ALPHABETIC */
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
248
/* Pair-table row used when the class BEFORE the candidate break is
 * BEFORE_AND_AFTER (line_break_rows[INDEX_BEFORE_AND_AFTER]).  Each entry is
 * the BreakOpportunity for one class AFTER the break, in INDEX_* order from
 * INDEX_OPEN_PUNCTUATION to INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_BEFORE_AND_AFTER[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* PREFIX .. ALPHABETIC */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
257
/* Pair-table row used when the class BEFORE the candidate break is
 * ZERO_WIDTH_SPACE (line_break_rows[INDEX_ZERO_WIDTH_SPACE]).  Each entry is
 * the BreakOpportunity for one class AFTER the break, in INDEX_* order from
 * INDEX_OPEN_PUNCTUATION to INDEX_WORD_JOINER.  Only the ZERO_WIDTH_SPACE and
 * COMBINING_MARK columns are BREAK_PROHIBITED; every other column is
 * BREAK_ALLOWED.
 */
static const BreakOpportunity row_ZERO_WIDTH_SPACE[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* PREFIX .. ALPHABETIC */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_ALLOWED /* WORD_JOINER */
};
266
/* Pair-table row used when the class BEFORE the candidate break is
 * COMBINING_MARK (line_break_rows[INDEX_COMBINING_MARK]).  Each entry is the
 * BreakOpportunity for one class AFTER the break, in INDEX_* order from
 * INDEX_OPEN_PUNCTUATION to INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_COMBINING_MARK[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* PREFIX .. ALPHABETIC */
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
275
/* Pair-table row used when the class BEFORE the candidate break is
 * WORD_JOINER (line_break_rows[INDEX_WORD_JOINER]).  Each entry is the
 * BreakOpportunity for one class AFTER the break, in INDEX_* order from
 * INDEX_OPEN_PUNCTUATION to INDEX_WORD_JOINER.
 */
static const BreakOpportunity row_WORD_JOINER[INDEX_END_OF_TABLE] = {
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* OPEN_PUNCTUATION .. NON_BREAKING_GLUE */
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* NON_STARTER .. INFIX_SEPARATOR */
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* PREFIX .. ALPHABETIC */
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* IDEOGRAPHIC .. AFTER */
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, /* BEFORE .. COMBINING_MARK */
  BREAK_PROHIBITED /* WORD_JOINER */
};
284
/* The pair table itself: one row pointer per in-table class, stored at the
 * position given by that class's INDEX_* value.  BREAK_ROW() indexes this
 * array with the INDEX_* of the class before the candidate break, so the
 * order here must stay in step with the INDEX_* enum.
 */
static const BreakOpportunity *const line_break_rows[INDEX_END_OF_TABLE] = {
  row_OPEN_PUNCTUATION, /* INDEX_OPEN_PUNCTUATION */
  row_CLOSE_PUNCTUATION, /* INDEX_CLOSE_PUNCTUATION */
  row_QUOTATION, /* INDEX_QUOTATION */
  row_NON_BREAKING_GLUE, /* INDEX_NON_BREAKING_GLUE */
  row_NON_STARTER, /* INDEX_NON_STARTER */
  row_EXCLAMATION, /* INDEX_EXCLAMATION */
  row_SYMBOL, /* INDEX_SYMBOL */
  row_INFIX_SEPARATOR, /* INDEX_INFIX_SEPARATOR */
  row_PREFIX, /* INDEX_PREFIX */
  row_POSTFIX, /* INDEX_POSTFIX */
  row_NUMERIC, /* INDEX_NUMERIC */
  row_ALPHABETIC, /* INDEX_ALPHABETIC */
  row_IDEOGRAPHIC, /* INDEX_IDEOGRAPHIC */
  row_INSEPARABLE, /* INDEX_INSEPARABLE */
  row_HYPHEN, /* INDEX_HYPHEN */
  row_AFTER, /* INDEX_AFTER */
  row_BEFORE, /* INDEX_BEFORE */
  row_BEFORE_AND_AFTER, /* INDEX_BEFORE_AND_AFTER */
  row_ZERO_WIDTH_SPACE, /* INDEX_ZERO_WIDTH_SPACE */
  row_COMBINING_MARK, /* INDEX_COMBINING_MARK */
  row_WORD_JOINER /* INDEX_WORD_JOINER */
};
308
/* Map GUnicodeBreakType to table indexes.
 *
 * The array is indexed by the numeric GUnicodeBreakType value (see
 * BREAK_INDEX()), and its element count is the bound that BREAK_TYPE_SAFE()
 * and IN_BREAK_TABLE() check a break type against.
 *
 * NOTE(review): the entries are presumably listed in GLib's declaration order
 * for GUnicodeBreakType (position 0 = first enumerator) -- confirm against
 * the GLib headers before adding or reordering entries.
 */
static const int line_break_indexes[] = {
  INDEX_MANDATORY,
  INDEX_CARRIAGE_RETURN,
  INDEX_LINE_FEED,
  INDEX_COMBINING_MARK,
  INDEX_SURROGATE,
  INDEX_ZERO_WIDTH_SPACE,
  INDEX_INSEPARABLE,
  INDEX_NON_BREAKING_GLUE,
  INDEX_CONTINGENT,
  INDEX_SPACE,
  INDEX_AFTER,
  INDEX_BEFORE,
  INDEX_BEFORE_AND_AFTER,
  INDEX_HYPHEN,
  INDEX_NON_STARTER,
  INDEX_OPEN_PUNCTUATION,
  INDEX_CLOSE_PUNCTUATION,
  INDEX_QUOTATION,
  INDEX_EXCLAMATION,
  INDEX_IDEOGRAPHIC,
  INDEX_NUMERIC,
  INDEX_INFIX_SEPARATOR,
  INDEX_SYMBOL,
  INDEX_ALPHABETIC,
  INDEX_PREFIX,
  INDEX_POSTFIX,
  INDEX_COMPLEX_CONTEXT,
  INDEX_AMBIGUOUS,
  INDEX_UNKNOWN,
  INDEX_NEXT_LINE,
  INDEX_WORD_JOINER,
  INDEX_HANGUL_L_JAMO,
  INDEX_HANGUL_V_JAMO,
  INDEX_HANGUL_T_JAMO,
  INDEX_HANGUL_LV_SYLLABLE,
  INDEX_HANGUL_LVT_SYLLABLE
};
348
/* Pair-table lookup helpers.  All of them take GUnicodeBreakType values and
 * evaluate their arguments more than once, so arguments must be free of side
 * effects.
 *
 * BREAK_TYPE_SAFE: btype itself when it is a valid index into
 *                  line_break_indexes, otherwise G_UNICODE_BREAK_UNKNOWN.
 * BREAK_INDEX:     the INDEX_* value recorded for btype (no bounds check).
 * BREAK_ROW:       the pair-table row for the class before the break.
 * BREAK_OP:        the BreakOpportunity between before_type and after_type.
 * IN_BREAK_TABLE:  true when btype is a valid index and its INDEX_* value is
 *                  below INDEX_END_OF_TABLE, i.e. the class has a row/column.
 */
#define BREAK_TYPE_SAFE(btype) \
  ((btype) < G_N_ELEMENTS (line_break_indexes) ? (btype) : G_UNICODE_BREAK_UNKNOWN)
#define BREAK_INDEX(btype) (line_break_indexes[(btype)])
#define BREAK_ROW(before_type) (line_break_rows[BREAK_INDEX (before_type)])
#define BREAK_OP(before_type, after_type) (BREAK_ROW (before_type)[BREAK_INDEX (after_type)])
#define IN_BREAK_TABLE(btype) \
  ((btype) < G_N_ELEMENTS (line_break_indexes) && BREAK_INDEX ((btype)) < INDEX_END_OF_TABLE)
359
360
361
362 /*
363  * Hangul Conjoining Jamo handling.
364  *
365  * The way we implement it is just a bit different from TR14,
366  * but produces the same results.
367  * The same algorithm is also used in TR29 for cluster boundaries.
368  *
369  */
370
371
/* States of the Hangul syllable machine.  The first five enumerators follow
 * the order of the Hangul GUnicodeBreakType values named beside them, which
 * is what allows JAMO_TYPE() to compute a JamoType by subtraction.
 **/
typedef enum
{
  JAMO_L   = 0, /* G_UNICODE_BREAK_HANGUL_L_JAMO */
  JAMO_V   = 1, /* G_UNICODE_BREAK_HANGUL_V_JAMO */
  JAMO_T   = 2, /* G_UNICODE_BREAK_HANGUL_T_JAMO */
  JAMO_LV  = 3, /* G_UNICODE_BREAK_HANGUL_LV_SYLLABLE */
  JAMO_LVT = 4, /* G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE */
  NO_JAMO  = 5  /* Other */
} JamoType;
383
384 /* There are Hangul syllables encoded as characters, that act like a
385  * sequence of Jamos. For each character we define a JamoType
386  * that the character starts with, and one that it ends with.  This
387  * decomposes JAMO_LV and JAMO_LVT to simple other JAMOs.  So for
388  * example, a character with LineBreak type
389  * G_UNICODE_BREAK_HANGUL_LV_SYLLABLE has start=JAMO_L and end=JAMO_V.
390  */
391 typedef struct _CharJamoProps
392 {
393   JamoType start, end;
394 } CharJamoProps;
395
396 /* Map from JamoType to CharJamoProps that hold only simple
397  * JamoTypes (no LV or LVT) or none.
398  */
399 static const CharJamoProps HangulJamoProps[] = {
400   {JAMO_L, JAMO_L},     /* JAMO_L */
401   {JAMO_V, JAMO_V},     /* JAMO_V */
402   {JAMO_T, JAMO_T},     /* JAMO_T */
403   {JAMO_L, JAMO_V},     /* JAMO_LV */
404   {JAMO_L, JAMO_T},     /* JAMO_LVT */
405   {NO_JAMO, NO_JAMO}    /* NO_JAMO */
406 };
407
408 /* A character forms a syllable with the previous character if and only if:
409  * JamoType(this) is not NO_JAMO and:
410  *
411  * HangulJamoProps[JamoType(prev)].end and
412  * HangulJamoProps[JamoType(this)].start are equal,
413  * or the former is one less than the latter.
414  */
415
/* IS_JAMO(btype): true when btype lies in the range of Hangul break types,
 * G_UNICODE_BREAK_HANGUL_L_JAMO .. G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE.
 *
 * JAMO_TYPE(btype): the JamoType for btype -- its offset from
 * G_UNICODE_BREAK_HANGUL_L_JAMO when IS_JAMO() holds, NO_JAMO otherwise.
 *
 * The argument is parenthesized at every use so that an expression argument
 * (for example "a | b") is compared and subtracted as a whole.  It is still
 * evaluated more than once, so it must not have side effects.
 */
#define IS_JAMO(btype)              \
        (((btype) >= G_UNICODE_BREAK_HANGUL_L_JAMO) && \
         ((btype) <= G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
#define JAMO_TYPE(btype)      \
        (IS_JAMO(btype) ? ((btype) - G_UNICODE_BREAK_HANGUL_L_JAMO) : NO_JAMO)
421
/* Code-point range tests for Japanese text. */
#define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF)
#define KANJI(wc)    ((wc) >= 0x2F00 && (wc) <= 0x2FDF)
#define HIRAGANA(wc) ((wc) >= 0x3040 && (wc) <= 0x309F)
#define KATAKANA(wc) ((wc) >= 0x30A0 && (wc) <= 0x30FF)

/* Code-point range tests for the scripts in which backspace removes a whole
 * grapheme cluster rather than a single character.
 */
#define LATIN(wc)    (((wc) >= 0x0020 && (wc) <= 0x02AF) || ((wc) >= 0x1E00 && (wc) <= 0x1EFF))
#define CYRILLIC(wc) ((wc) >= 0x0400 && (wc) <= 0x052F)
#define GREEK(wc)    (((wc) >= 0x0370 && (wc) <= 0x03FF) || ((wc) >= 0x1F00 && (wc) <= 0x1FFF))
#define KANA(wc)     ((wc) >= 0x3040 && (wc) <= 0x30FF)
#define HANGUL(wc)   ((wc) >= 0xAC00 && (wc) <= 0xD7A3)

/* True when wc falls in none of the five ranges directly above. */
#define BACKSPACE_DELETES_CHARACTER(wc) \
  (!(LATIN (wc) || CYRILLIC (wc) || GREEK (wc) || KANA (wc) || HANGUL (wc)))
434
/* States of the sentence-boundary machine; p. 132-133 of the Unicode spec
 * (table 5-6) will help understand them.
 */
typedef enum
{
  STATE_SENTENCE_OUTSIDE = 0,
  STATE_SENTENCE_BODY,

  STATE_SENTENCE_TERM,
  STATE_SENTENCE_POST_TERM_CLOSE,
  STATE_SENTENCE_POST_TERM_SPACE,
  STATE_SENTENCE_POST_TERM_SEP,

  STATE_SENTENCE_DOT,
  STATE_SENTENCE_POST_DOT_CLOSE,
  STATE_SENTENCE_POST_DOT_SPACE,
  STATE_SENTENCE_POST_DOT_OPEN,

  /* For now, line/para separators are never included in a sentence.
   *
   * The next state is not in the spec.  The spec includes one line/para
   * separator in lines that end with Term but not in period-terminated
   * lines; no reason for the difference could be found, so the same is done
   * for the dot lines too.
   */
  STATE_SENTENCE_POST_DOT_SEP
} SentenceState;
455
/* Kind of word currently being scanned.  Both "123" and "foobar" count as
 * words here, while "123foo" counts as two; the Unicode spec simply calls
 * "123" a non-word.
 */
typedef enum
{
  WordNone    = 0,
  WordLetters = 1, /* a word such as "foobar" */
  WordNumbers = 2  /* a word such as "123" */
} WordType;
465
466
467 /**
468  * pango_default_break:
469  * @text: text to break
470  * @length: length of text in bytes (may be -1 if @text is nul-terminated)
471  * @analysis: a #PangoAnalysis for the @text
472  * @attrs: logical attributes to fill in
473  * @attrs_len: size of the array passed as @attrs
474  *
475  * This is the default break algorithm, used if no language
476  * engine overrides it. Normally you should use pango_break()
477  * instead. Unlike pango_break(),
478  * @analysis can be %NULL, but only do that if you know what
479  * you're doing. If you need an analysis to pass to pango_break(),
480  * you need to pango_itemize().  In most cases however you should
481  * simply use pango_get_log_attrs().
482  **/
483 void
484 pango_default_break (const gchar   *text,
485                      gint           length,
486                      PangoAnalysis *analysis G_GNUC_UNUSED,
487                      PangoLogAttr  *attrs,
488                      int            attrs_len G_GNUC_UNUSED)
489 {
490   /* The rationale for all this is in section 5.15 of the Unicode 3.0 book,
491    * the line breaking stuff is also in TR14 on unicode.org
492    */
493
494   /* This is a default break implementation that should work for nearly all
495    * languages. Language engines can override it optionally.
496    */
497
498   /* FIXME one cheesy optimization here would be to memset attrs to 0
499    * before we start, and then never assign %FALSE to anything
500    */
501
502   const gchar *next;
503   gint i;
504
505   gunichar prev_wc;
506   gunichar next_wc;
507
508   JamoType prev_jamo;
509
510   GUnicodeBreakType next_break_type;
511   GUnicodeType prev_type;
512   GUnicodeBreakType prev_break_type; /* skips spaces */
513   gboolean prev_was_break_space;
514
515   /* See Grapheme_Cluster_Break Property Values table of UAX#29 */
516   typedef enum
517   {
518     GB_Other,
519     GB_ControlCRLF,
520     GB_Extend,
521     GB_Prepend,
522     GB_SpacingMark,
523     GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */
524   } GraphemeBreakType;
525   GraphemeBreakType prev_GB_type = GB_Other;
526
527   /* See Word_Break Property Values table of UAX#29 */
528   typedef enum
529   {
530     WB_Other,
531     WB_NewlineCRLF,
532     WB_ExtendFormat,
533     WB_Katakana,
534     WB_ALetter,
535     WB_MidNumLet,
536     WB_MidLetter,
537     WB_MidNum,
538     WB_Numeric,
539     WB_ExtendNumLet,
540   } WordBreakType;
541   WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other;
542   gint prev_WB_i = -1;
543
544   WordType current_word_type = WordNone;
545   gunichar last_word_letter = 0;
546   gunichar base_character = 0;
547
548   SentenceState sentence_state = STATE_SENTENCE_OUTSIDE;
549   /* Tracks what will be the end of the sentence if a period is
550    * determined to actually be a sentence-ending period.
551    */
552   gint possible_sentence_end = -1;
553   /* possible sentence break before Open* after a period-ended sentence */
554   gint possible_sentence_boundary = -1;
555   gboolean almost_done = FALSE;
556   gboolean done = FALSE;
557
558   g_return_if_fail (length == 0 || text != NULL);
559   g_return_if_fail (attrs != NULL);
560
561   next = text;
562
563   prev_type = G_UNICODE_PARAGRAPH_SEPARATOR;
564   prev_break_type = G_UNICODE_BREAK_UNKNOWN;
565   prev_was_break_space = FALSE;
566   prev_wc = 0;
567   prev_jamo = NO_JAMO;
568
569   if (length == 0 || *text == '\0')
570     {
571       next_wc = PARAGRAPH_SEPARATOR;
572       almost_done = TRUE;
573     }
574   else
575     next_wc = g_utf8_get_char (next);
576
577   next_break_type = g_unichar_break_type (next_wc);
578   next_break_type = BREAK_TYPE_SAFE (next_break_type);
579
580   for (i = 0; !done ; i++)
581     {
582       GUnicodeType type;
583       gunichar wc;
584       GUnicodeBreakType break_type;
585       BreakOpportunity break_op;
586       JamoType jamo;
587       gboolean makes_hangul_syllable;
588
589       /* UAX#29 boundaries */
590       gboolean is_grapheme_boundary;
591       gboolean is_word_boundary;
592
593
594       wc = next_wc;
595       break_type = next_break_type;
596
597       if (almost_done)
598         {
599           /*
600            * If we have already reached the end of @text g_utf8_next_char()
601            * may not increment next
602            */
603           next_wc = 0;
604           next_break_type = G_UNICODE_BREAK_UNKNOWN;
605           done = TRUE;
606         }
607       else
608         {
609           next = g_utf8_next_char (next);
610
611           if ((length >= 0 && next >= text + length) || *next == '\0')
612             {
613               /* This is how we fill in the last element (end position) of the
614                * attr array - assume there's a paragraph separator off the end
615                * of @text.
616                */
617               next_wc = PARAGRAPH_SEPARATOR;
618               almost_done = TRUE;
619             }
620           else
621             next_wc = g_utf8_get_char (next);
622
623           next_break_type = g_unichar_break_type (next_wc);
624           next_break_type = BREAK_TYPE_SAFE (next_break_type);
625         }
626
627       type = g_unichar_type (wc);
628       jamo = JAMO_TYPE (break_type);
629
630       /* Determine whether this forms a Hangul syllable with prev. */
631       if (jamo == NO_JAMO)
632         makes_hangul_syllable = FALSE;
633       else
634         {
635           JamoType prev_end   = HangulJamoProps[prev_jamo].end  ;
636           JamoType this_start = HangulJamoProps[     jamo].start;
637
638           /* See comments before IS_JAMO */
639           makes_hangul_syllable = (prev_end == this_start) || (prev_end + 1 == this_start);
640         }
641
642       /* Can't just use the type here since isspace() doesn't
643        * correspond to a Unicode character type
644        */
645       attrs[i].is_white = g_unichar_isspace (wc);
646
647       /* Just few spaces have variable width. So explicitly mark them.
648        */
649       attrs[i].is_expandable_space = (0x0020 == wc || 0x00A0 == wc);
650
651       /* ---- UAX#29 Grapheme Boundaries ---- */
652       {
653         GraphemeBreakType GB_type;
654         /* Find the GraphemeBreakType of wc */
655         GB_type = GB_Other;
656         switch ((int) type)
657           {
658           case G_UNICODE_FORMAT:
659             if (wc == 0x200C && wc == 0x200D)
660               {
661                 GB_type = GB_Extend; /* U+200C and U+200D are Other_Grapheme_Extend */
662                 break;
663               }
664             /* fall through */
665           case G_UNICODE_CONTROL:
666           case G_UNICODE_LINE_SEPARATOR:
667           case G_UNICODE_PARAGRAPH_SEPARATOR:
668             GB_type = GB_ControlCRLF;
669             break;
670
671           case G_UNICODE_OTHER_LETTER:
672             if (makes_hangul_syllable)
673               GB_type = GB_InHangulSyllable;
674             else if ((wc & 0x0E00) == 0x0E00)
675               {
676                 /* Thai and Lao stuff hardcoded in UAX#29 */
677                 if ((wc >= 0x0E40 && wc <= 0x0E44) || (wc >= 0x0EC0 && wc <= 0x0EC4))
678                   GB_type = GB_Prepend; /* Prepend */
679                 else if (wc == 0x0E30 || wc == 0x0E32 || wc == 0x0E33 || wc == 0x0E45 ||
680                          wc == 0x0EB0 || wc == 0x0EB2 || wc == 0x0EB3)
681                   GB_type = GB_Extend; /* Exceptions in the Extend definition */
682               }
683             break;
684
685           case G_UNICODE_MODIFIER_LETTER:
686             if (wc >= 0xFF9E && wc <= 0xFF9F)
687               GB_type = GB_Extend; /* Other_Grapheme_Extend */
688             break;
689
690           case G_UNICODE_COMBINING_MARK:
691             GB_type = GB_SpacingMark; /* SpacingMark */
692             if (wc >= 0x0900)
693               {
694                 if (wc == 0x09BE || wc == 0x09D7 ||
695                     wc == 0x0B3E || wc == 0x0B57 || wc == 0x0BBE || wc == 0x0BD7 ||
696                     wc == 0x0CC2 || wc == 0x0CD5 || wc == 0x0CD6 ||
697                     wc == 0x0D3E || wc == 0x0D57 || wc == 0x0DCF || wc == 0x0DDF ||
698                     wc == 0x1D165 || (wc >= 0x1D16E && wc <= 0x1D172))
699                   GB_type = GB_Extend; /* Other_Grapheme_Extend */
700               }
701             break;
702
703           case G_UNICODE_ENCLOSING_MARK:
704           case G_UNICODE_NON_SPACING_MARK:
705             GB_type = GB_Extend; /* Grapheme_Extend */
706             break;
707           }
708
709         /* Grapheme Cluster Boundary Rules */
710         /* We apply Rules GB1 and GB2 at the end of the function */
711         if (wc == '\n' && prev_wc == '\r')
712           is_grapheme_boundary = FALSE; /* Rule GB3 */
713         else if (prev_GB_type == GB_ControlCRLF || GB_type == GB_ControlCRLF)
714           is_grapheme_boundary = TRUE; /* Rules GB4 and GB5 */
715         else if (GB_type == GB_InHangulSyllable)
716           is_grapheme_boundary = FALSE; /* Rules GB6, GB7, GB8 */
717         else if (GB_type == GB_Extend)
718           is_grapheme_boundary = FALSE; /* Rule GB9 */
719         else if (GB_type == GB_SpacingMark)
720           is_grapheme_boundary = FALSE; /* Rule GB9a */
721         else if (prev_GB_type == GB_Prepend)
722           is_grapheme_boundary = FALSE; /* Rule GB9b */
723         else
724           is_grapheme_boundary = TRUE;  /* Rule GB10 */
725
726         prev_GB_type = GB_type;
727
728         attrs[i].is_cursor_position = is_grapheme_boundary;
729         /* If this is a grapheme boundary, we have to decide if backspace
730          * deletes a character or the whole grapheme cluster */
731         if (is_grapheme_boundary)
732           attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character);
733         else
734           attrs[i].backspace_deletes_character = FALSE;
735       }
736
737       /* ---- UAX#29 Word Boundaries ---- */
738       {
739         is_word_boundary = FALSE;
740         if (is_grapheme_boundary) /* Rules WB3 and WB4 */
741           {
742             PangoScript script;
743             WordBreakType WB_type;
744
745             script = pango_script_for_unichar (wc);
746
747             /* Find the WordBreakType of wc */
748             WB_type = WB_Other;
749
750             if (script == PANGO_SCRIPT_KATAKANA)
751               WB_type = WB_Katakana;
752
753             if (WB_type == WB_Other)
754               switch (wc >> 8)
755                 {
756                 case 0x30:
757                   if (wc == 0x3031 || wc == 0x3032 || wc == 0x3033 || wc == 0x3034 || wc == 0x3035 ||
758                       wc == 0x309b || wc == 0x309c || wc == 0x30a0 || wc == 0x30fc)
759                     WB_type = WB_Katakana; /* Katakana exceptions */
760                   break;
761                 case 0xFF:
762                   if (wc == 0xFF70)
763                     WB_type = WB_Katakana; /* Katakana exceptions */
764                   else if (wc >= 0xFF9E || wc <= 0xFF9F)
765                     WB_type = WB_ExtendFormat; /* Other_Grapheme_Extend */
766                   break;
767                 case 0x05:
768                   if (wc == 0x05F3)
769                     WB_type = WB_ALetter; /* ALetter exceptions */
770                   break;
771                 }
772
773             if (WB_type == WB_Other)
774               switch ((int) break_type)
775                 {
776                 case G_UNICODE_BREAK_NUMERIC:
777                   if (wc != 0x066C)
778                     WB_type = WB_Numeric; /* Numeric */
779                   break;
780                 case G_UNICODE_BREAK_INFIX_SEPARATOR:
781                   if (wc != 0x003A && wc != 0xFE13 && wc != 0x002E)
782                     WB_type = WB_MidNum; /* MidNum */
783                   break;
784                 }
785
786             if (WB_type == WB_Other)
787               switch ((int) type)
788                 {
789                 case G_UNICODE_CONTROL:
790                   if (wc != 0x000D && wc != 0x000A && wc != 0x000B && wc != 0x000C && wc != 0x0085)
791                     break;
792                   /* fall through */
793                 case G_UNICODE_LINE_SEPARATOR:
794                 case G_UNICODE_PARAGRAPH_SEPARATOR:
795                   WB_type = WB_NewlineCRLF; /* CR, LF, Newline */
796                   break;
797
798                 case G_UNICODE_FORMAT:
799                 case G_UNICODE_COMBINING_MARK:
800                 case G_UNICODE_ENCLOSING_MARK:
801                 case G_UNICODE_NON_SPACING_MARK:
802                   WB_type = WB_ExtendFormat; /* Extend, Format */
803                   break;
804
805                 case G_UNICODE_CONNECT_PUNCTUATION:
806                   WB_type = WB_ExtendNumLet; /* ExtendNumLet */
807                   break;
808
809                 case G_UNICODE_INITIAL_PUNCTUATION:
810                 case G_UNICODE_FINAL_PUNCTUATION:
811                   if (wc == 0x2018 || wc == 0x2019)
812                     WB_type = WB_MidNumLet; /* MidNumLet */
813                   break;
814                 case G_UNICODE_OTHER_PUNCTUATION:
815                   if (wc == 0x0027 || wc == 0x002e || wc == 0x2024 ||
816                       wc == 0xfe52 || wc == 0xff07 || wc == 0xff0e)
817                     WB_type = WB_MidNumLet; /* MidNumLet */
818                   else if (wc == 0x00b7 || wc == 0x05f4 || wc == 0x2027 || wc == 0x003a || wc == 0x0387 ||
819                            wc == 0xfe13 || wc == 0xfe55 || wc == 0xff1a)
820                     WB_type = WB_MidLetter; /* WB_MidLetter */
821                   else if (wc == 0x066c ||
822                            wc == 0xfe50 || wc == 0xfe54 || wc == 0xff0c || wc == 0xff1b)
823                     WB_type = WB_MidNum; /* MidNum */
824                   break;
825
826                 case G_UNICODE_OTHER_SYMBOL:
827                   if (wc >= 0x24B6 && wc <= 0x24E9) /* Other_Alphabetic */
828                     goto Alphabetic;
829                   break;
830
831                 case G_UNICODE_OTHER_LETTER:
832                 case G_UNICODE_LETTER_NUMBER:
833                   if (wc == 0x3006 || wc == 0x3007 ||
834                       (wc >= 0x3021 && wc <= 0x3029) ||
835                       (wc >= 0x3038 && wc <= 0x303A) ||
836                       (wc >= 0x3400 && wc <= 0x4DB5) ||
837                       (wc >= 0x4E00 && wc <= 0x9FC3) ||
838                       (wc >= 0xF900 && wc <= 0xFA2D) ||
839                       (wc >= 0xFA30 && wc <= 0xFA6A) ||
840                       (wc >= 0xFA70 && wc <= 0xFAD9) ||
841                       (wc >= 0x20000 && wc <= 0x2A6D6) ||
842                       (wc >= 0x2F800 && wc <= 0x2FA1D))
843                     break; /* ALetter exceptions: Ideographic */
844                   goto Alphabetic;
845
846                 case G_UNICODE_LOWERCASE_LETTER:
847                 case G_UNICODE_MODIFIER_LETTER:
848                 case G_UNICODE_TITLECASE_LETTER:
849                 case G_UNICODE_UPPERCASE_LETTER:
850                 Alphabetic:
851                   if (break_type != G_UNICODE_BREAK_COMPLEX_CONTEXT && script != PANGO_SCRIPT_HIRAGANA)
852                     WB_type = WB_ALetter; /* ALetter */
853                   break;
854                 }
855
856             /* Grapheme Cluster Boundary Rules */
857
858             /* We apply Rules WB1 and WB2 at the end of the function */
859
860             if (prev_wc == 0x3031 && wc == 0x41)
861               g_debug ("Y %d %d", prev_WB_type, WB_type);
862             if (prev_WB_type == WB_NewlineCRLF && prev_WB_i + 1 == i)
863               {
864                 /* The extra check for prev_WB_i is to correctly handle sequences like
865                  * Newline ÷ Extend × Extend
866                  * since we have not skipped ExtendFormat yet.
867                  */
868                 is_word_boundary = TRUE; /* Rule WB3a */
869               }
870             else if (WB_type == WB_NewlineCRLF)
871               is_word_boundary = TRUE; /* Rule WB3b */
872             else if (WB_type == WB_ExtendFormat)
873               is_word_boundary = FALSE; /* Rules WB4? */
874             else if ((prev_WB_type == WB_ALetter  ||
875                       prev_WB_type == WB_Numeric  ||
876                       prev_WB_type == WB_ExtendNumLet) &&
877                      (     WB_type == WB_ALetter  ||
878                            WB_type == WB_Numeric  ||
879                            WB_type == WB_ExtendNumLet))
880               is_word_boundary = FALSE; /* Rules WB5, WB8, WB9, WB10, WB13a, WB13b */
881             else if ((prev_WB_type == WB_Katakana ||
882                       prev_WB_type == WB_ExtendNumLet) &&
883                      (     WB_type == WB_Katakana ||
884                            WB_type == WB_ExtendNumLet))
885               is_word_boundary = FALSE; /* Rules WB13, WB13a, WB13b */
886             else if ((prev_prev_WB_type == WB_ALetter && WB_type == WB_ALetter) &&
887                      (prev_WB_type == WB_MidLetter || prev_WB_type == WB_MidNumLet))
888               {
889                 attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB6 */
890                 is_word_boundary = FALSE; /* Rule WB7 */
891               }
892             else if ((prev_prev_WB_type == WB_Numeric && WB_type == WB_Numeric) &&
893                      (prev_WB_type == WB_MidNum || prev_WB_type == WB_MidNumLet))
894               {
895                 is_word_boundary = FALSE; /* Rule WB11 */
896                 attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB12 */
897               }
898             else
899               is_word_boundary = TRUE; /* Rule WB14 */
900
901             if (WB_type != WB_ExtendFormat)
902               {
903                 prev_prev_WB_type = prev_WB_type;
904                 prev_WB_type = WB_type;
905                 prev_WB_i = i;
906               }
907           }
908
909         attrs[i].is_word_boundary = is_word_boundary;
910       }
911
912
913       /* ---- Line breaking ---- */
914
915       break_op = BREAK_ALREADY_HANDLED;
916
917       g_assert (prev_break_type != G_UNICODE_BREAK_SPACE);
918
919       attrs[i].is_line_break = FALSE;
920       attrs[i].is_mandatory_break = FALSE;
921
922       if (attrs[i].is_cursor_position) /* If it's not a grapheme boundary,
923                                         * it's not a line break either
924                                         */
925         {
926           /* space followed by a combining mark is handled
927            * specially; (rule 7a from TR 14)
928            */
929           if (break_type == G_UNICODE_BREAK_SPACE &&
930               next_break_type == G_UNICODE_BREAK_COMBINING_MARK)
931             break_type = G_UNICODE_BREAK_IDEOGRAPHIC;
932
933           /* Unicode doesn't specify char wrap; we wrap around all chars
934            * except where a line break is prohibited, which means we
935            * effectively break everywhere except inside runs of spaces.
936            */
937           attrs[i].is_char_break = TRUE;
938
939           /* Make any necessary replacements first */
940           switch ((int) prev_break_type)
941             {
942             case G_UNICODE_BREAK_HANGUL_L_JAMO:
943             case G_UNICODE_BREAK_HANGUL_V_JAMO:
944             case G_UNICODE_BREAK_HANGUL_T_JAMO:
945             case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
946             case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
947               /* treat Jamo as IDEOGRAPHIC from now
948                */
949               prev_break_type = G_UNICODE_BREAK_IDEOGRAPHIC;
950               break;
951
952             case G_UNICODE_BREAK_AMBIGUOUS:
953               /* FIXME
954                * we need to resolve the East Asian width
955                * to decide what to do here
956                */
957             case G_UNICODE_BREAK_COMPLEX_CONTEXT:
958               /* FIXME
959                * language engines should handle this case...
960                */
961             case G_UNICODE_BREAK_UNKNOWN:
962               /* convert unknown, complex, ambiguous to ALPHABETIC
963                */
964               prev_break_type = G_UNICODE_BREAK_ALPHABETIC;
965               break;
966
967             default:
968               ;
969             }
970
971           switch ((int) prev_break_type)
972             {
973             case G_UNICODE_BREAK_MANDATORY:
974             case G_UNICODE_BREAK_LINE_FEED:
975             case G_UNICODE_BREAK_NEXT_LINE:
976               attrs[i].is_line_break = TRUE;
977               attrs[i].is_mandatory_break = TRUE;
978               break;
979
980             case G_UNICODE_BREAK_CARRIAGE_RETURN:
981               if (wc != '\n')
982                 {
983                   attrs[i].is_line_break = TRUE;
984                   attrs[i].is_mandatory_break = TRUE;
985                 }
986               break;
987
988             case G_UNICODE_BREAK_CONTINGENT:
989               /* can break after 0xFFFC by default, though we might want
990                * to eventually have a PangoLayout setting or
991                * PangoAttribute that disables this, if for some
992                * application breaking after objects is not desired.
993                */
994               break_op = BREAK_ALLOWED;
995               break;
996
997             case G_UNICODE_BREAK_SURROGATE:
998               g_assert_not_reached ();
999               break;
1000
1001             default:
1002               g_assert (IN_BREAK_TABLE (prev_break_type));
1003
1004               /* Note that our table assumes that combining marks
1005                * are only applied to alphabetic characters;
1006                * tech report 14 explains how to remove this assumption
1007                * from the code, if anyone ever cares, but it shouldn't
1008                * be a problem. Also this issue sort of goes
1009                * away since we only look for breaks on grapheme
1010                * boundaries.
1011                */
1012
1013               switch ((int) break_type)
1014                 {
1015                 case G_UNICODE_BREAK_MANDATORY:
1016                 case G_UNICODE_BREAK_LINE_FEED:
1017                 case G_UNICODE_BREAK_CARRIAGE_RETURN:
1018                 case G_UNICODE_BREAK_NEXT_LINE:
1019                 case G_UNICODE_BREAK_SPACE:
1020                   /* These types all "pile up" at the end of lines and
1021                    * get elided.
1022                    */
1023                   break_op = BREAK_PROHIBITED;
1024                   break;
1025
1026                 case G_UNICODE_BREAK_CONTINGENT:
1027                   /* break before 0xFFFC by default, eventually
1028                    * make this configurable?
1029                    */
1030                   break_op = BREAK_ALLOWED;
1031                   break;
1032
1033                 case G_UNICODE_BREAK_SURROGATE:
1034                   g_assert_not_reached ();
1035                   break;
1036
1037                 /* Hangul additions are from Unicode 4.1 UAX#14 */
1038                 case G_UNICODE_BREAK_HANGUL_L_JAMO:
1039                 case G_UNICODE_BREAK_HANGUL_V_JAMO:
1040                 case G_UNICODE_BREAK_HANGUL_T_JAMO:
1041                 case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
1042                 case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
1043                   /* treat Jamo as IDEOGRAPHIC from now
1044                    */
1045                   break_type = G_UNICODE_BREAK_IDEOGRAPHIC;
1046
1047                   if (makes_hangul_syllable)
1048                     break_op = BREAK_IF_SPACES;
1049                   else
1050                     break_op = BREAK_ALLOWED;
1051                   break;
1052
1053                 case G_UNICODE_BREAK_AMBIGUOUS:
1054                   /* FIXME:
1055                    * we need to resolve the East Asian width
1056                    * to decide what to do here
1057                    */
1058                 case G_UNICODE_BREAK_COMPLEX_CONTEXT:
1059                   /* FIXME:
1060                    * language engines should handle this case...
1061                    */
1062                 case G_UNICODE_BREAK_UNKNOWN:
1063                   /* treat unknown, complex, and ambiguous like ALPHABETIC
1064                    * for now
1065                    */
1066                   break_op = BREAK_OP (prev_break_type, G_UNICODE_BREAK_ALPHABETIC);
1067                   break;
1068
1069                 default:
1070
1071                   g_assert (IN_BREAK_TABLE (break_type));
1072                   break_op = BREAK_OP (prev_break_type, break_type);
1073                   break;
1074                 }
1075               break;
1076             }
1077
1078           switch (break_op)
1079             {
1080             case BREAK_PROHIBITED:
1081               /* can't break here */
1082               attrs[i].is_char_break = FALSE;
1083               break;
1084
1085             case BREAK_IF_SPACES:
1086               /* break if prev char was space */
1087               if (prev_was_break_space)
1088                 attrs[i].is_line_break = TRUE;
1089               break;
1090
1091             case BREAK_ALLOWED:
1092               attrs[i].is_line_break = TRUE;
1093               break;
1094
1095             case BREAK_ALREADY_HANDLED:
1096               break;
1097
1098             default:
1099               g_assert_not_reached ();
1100               break;
1101             }
1102         }
1103
1104       if (break_type != G_UNICODE_BREAK_SPACE)
1105         {
1106           prev_break_type = break_type;
1107           prev_was_break_space = FALSE;
1108           prev_jamo = jamo;
1109         }
1110       else
1111         prev_was_break_space = TRUE;
1112
1113       /* ---- Word breaks ---- */
1114
1115       /* default to not a word start/end */
1116       attrs[i].is_word_start = FALSE;
1117       attrs[i].is_word_end = FALSE;
1118
1119       if (current_word_type != WordNone)
1120         {
1121           /* Check for a word end */
1122           switch ((int) type)
1123             {
1124             case G_UNICODE_COMBINING_MARK:
1125             case G_UNICODE_ENCLOSING_MARK:
1126             case G_UNICODE_NON_SPACING_MARK:
1127             case G_UNICODE_FORMAT:
1128               /* nothing, we just eat these up as part of the word */
1129               break;
1130
1131             case G_UNICODE_LOWERCASE_LETTER:
1132             case G_UNICODE_MODIFIER_LETTER:
1133             case G_UNICODE_OTHER_LETTER:
1134             case G_UNICODE_TITLECASE_LETTER:
1135             case G_UNICODE_UPPERCASE_LETTER:
1136               if (current_word_type == WordLetters)
1137                 {
1138                   /* Japanese special cases for ending the word */
1139                   if (JAPANESE (last_word_letter) ||
1140                       JAPANESE (wc))
1141                     {
1142                       if ((HIRAGANA (last_word_letter) &&
1143                            !HIRAGANA (wc)) ||
1144                           (KATAKANA (last_word_letter) &&
1145                            !(KATAKANA (wc) || HIRAGANA (wc))) ||
1146                           (KANJI (last_word_letter) &&
1147                            !(HIRAGANA (wc) || KANJI (wc))) ||
1148                           (JAPANESE (last_word_letter) &&
1149                            !JAPANESE (wc)) ||
1150                           (!JAPANESE (last_word_letter) &&
1151                            JAPANESE (wc)))
1152                         attrs[i].is_word_end = TRUE;
1153                     }
1154                 }
1155               else
1156                 {
1157                   /* end the number word, start the letter word */
1158                   attrs[i].is_word_end = TRUE;
1159                   attrs[i].is_word_start = TRUE;
1160                   current_word_type = WordLetters;
1161                 }
1162
1163               last_word_letter = wc;
1164               break;
1165
1166             case G_UNICODE_DECIMAL_NUMBER:
1167             case G_UNICODE_LETTER_NUMBER:
1168             case G_UNICODE_OTHER_NUMBER:
1169               if (current_word_type != WordNumbers)
1170                 {
1171                   attrs[i].is_word_end = TRUE;
1172                   attrs[i].is_word_start = TRUE;
1173                   current_word_type = WordNumbers;
1174                 }
1175
1176               last_word_letter = wc;
1177               break;
1178
1179             default:
1180               /* Punctuation, control/format chars, etc. all end a word. */
1181               attrs[i].is_word_end = TRUE;
1182               current_word_type = WordNone;
1183               break;
1184             }
1185         }
1186       else
1187         {
1188           /* Check for a word start */
1189           switch ((int) type)
1190             {
1191             case G_UNICODE_LOWERCASE_LETTER:
1192             case G_UNICODE_MODIFIER_LETTER:
1193             case G_UNICODE_OTHER_LETTER:
1194             case G_UNICODE_TITLECASE_LETTER:
1195             case G_UNICODE_UPPERCASE_LETTER:
1196               current_word_type = WordLetters;
1197               last_word_letter = wc;
1198               attrs[i].is_word_start = TRUE;
1199               break;
1200
1201             case G_UNICODE_DECIMAL_NUMBER:
1202             case G_UNICODE_LETTER_NUMBER:
1203             case G_UNICODE_OTHER_NUMBER:
1204               current_word_type = WordNumbers;
1205               last_word_letter = wc;
1206               attrs[i].is_word_start = TRUE;
1207               break;
1208
1209             default:
1210               /* No word here */
1211               break;
1212             }
1213         }
1214
1215       /* ---- Sentence breaks ---- */
1216
1217       /* The Unicode spec specifies sentence breakpoints, so that a piece of
1218        * text would be partitioned into sentences, and all characters would
1219        * be inside some sentence. This code implements that for is_sentence_boundary,
1220        * but tries to keep leading/trailing whitespace out of sentences for
1221        * the start/end flags
1222        */
1223
1224       /* The Unicode spec seems to say that one trailing line/para
1225        * separator can be tacked on to a sentence ending in ! or ?,
1226        * but not a sentence ending in period; I think they're on crack
1227        * so am allowing one to be tacked onto a sentence ending in period.
1228        */
1229
1230 #define MAYBE_START_NEW_SENTENCE                                \
1231               switch ((int) type)                               \
1232                 {                                               \
1233                 case G_UNICODE_LINE_SEPARATOR:                  \
1234                 case G_UNICODE_PARAGRAPH_SEPARATOR:             \
1235                 case G_UNICODE_CONTROL:                         \
1236                 case G_UNICODE_FORMAT:                          \
1237                 case G_UNICODE_SPACE_SEPARATOR:                 \
1238                   sentence_state = STATE_SENTENCE_OUTSIDE;      \
1239                   break;                                        \
1240                                                                 \
1241                 default:                                        \
1242                   sentence_state = STATE_SENTENCE_BODY;         \
1243                   attrs[i].is_sentence_start = TRUE;            \
1244                   break;                                        \
1245                 }
1246
1247       /* No sentence break at the start of the text */
1248
1249       /* default to not a sentence breakpoint */
1250       attrs[i].is_sentence_boundary = FALSE;
1251       attrs[i].is_sentence_start = FALSE;
1252       attrs[i].is_sentence_end = FALSE;
1253
1254       /* FIXME the Unicode spec lumps control/format chars with
1255        * line/para separators in descriptive text, but not in the
1256        * character class specs, in table 5-6, so who knows whether you
1257        * are actually supposed to break on control/format
1258        * characters. Seems semi-broken to break on tabs...
1259        */
1260
1261       /* Break after line/para separators except carriage return
1262        * followed by newline
1263        */
1264       switch ((int) prev_type)
1265         {
1266         case G_UNICODE_LINE_SEPARATOR:
1267         case G_UNICODE_PARAGRAPH_SEPARATOR:
1268         case G_UNICODE_CONTROL:
1269         case G_UNICODE_FORMAT:
1270           if (wc == '\r')
1271             {
1272               if (next_wc != '\n')
1273                 attrs[i].is_sentence_boundary = TRUE;
1274             }
1275           else
1276             attrs[i].is_sentence_boundary = TRUE;
1277           break;
1278
1279         default:
1280           break;
1281         }
1282
1283       /* break before para/line separators except newline following
1284        * carriage return
1285        */
1286       switch ((int) type)
1287         {
1288         case G_UNICODE_LINE_SEPARATOR:
1289         case G_UNICODE_PARAGRAPH_SEPARATOR:
1290         case G_UNICODE_CONTROL:
1291         case G_UNICODE_FORMAT:
1292           if (wc == '\n')
1293             {
1294               if (prev_wc != '\r')
1295                 attrs[i].is_sentence_boundary = TRUE;
1296             }
1297           else
1298             attrs[i].is_sentence_boundary = TRUE;
1299           break;
1300
1301         default:
1302           break;
1303         }
1304
1305       switch (sentence_state)
1306         {
1307         case STATE_SENTENCE_OUTSIDE:
1308           /* Start sentence if we have non-whitespace/format/control */
1309           switch ((int) type)
1310             {
1311             case G_UNICODE_LINE_SEPARATOR:
1312             case G_UNICODE_PARAGRAPH_SEPARATOR:
1313             case G_UNICODE_CONTROL:
1314             case G_UNICODE_FORMAT:
1315             case G_UNICODE_SPACE_SEPARATOR:
1316               break;
1317
1318             default:
1319               attrs[i].is_sentence_start = TRUE;
1320               sentence_state = STATE_SENTENCE_BODY;
1321               break;
1322             }
1323           break;
1324
1325         case STATE_SENTENCE_BODY:
1326           /* If we already broke here due to separators, end the sentence. */
1327           if (attrs[i].is_sentence_boundary)
1328             {
1329               attrs[i].is_sentence_end = TRUE;
1330
1331               MAYBE_START_NEW_SENTENCE;
1332             }
1333           else
1334             {
1335               if (wc == '.')
1336                 sentence_state = STATE_SENTENCE_DOT;
1337               else if (wc == '?' || wc == '!')
1338                 sentence_state = STATE_SENTENCE_TERM;
1339             }
1340           break;
1341
1342         case STATE_SENTENCE_TERM:
1343           /* End sentence on anything but close punctuation and some
1344            * loosely-specified OTHER_PUNCTUATION such as period,
1345            * comma, etc.; follow Unicode rules for breaks
1346            */
1347           switch ((int) type)
1348             {
1349             case G_UNICODE_OTHER_PUNCTUATION:
1350             case G_UNICODE_CLOSE_PUNCTUATION:
1351               if (type == G_UNICODE_CLOSE_PUNCTUATION ||
1352                   wc == '.' ||
1353                   wc == ',' ||
1354                   wc == '?' ||
1355                   wc == '!')
1356                 sentence_state = STATE_SENTENCE_POST_TERM_CLOSE;
1357               else
1358                 {
1359                   attrs[i].is_sentence_end = TRUE;
1360                   attrs[i].is_sentence_boundary = TRUE;
1361
1362                   MAYBE_START_NEW_SENTENCE;
1363                 }
1364               break;
1365
1366             case G_UNICODE_SPACE_SEPARATOR:
1367               attrs[i].is_sentence_end = TRUE;
1368               sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
1369               break;
1370
1371             case G_UNICODE_LINE_SEPARATOR:
1372             case G_UNICODE_PARAGRAPH_SEPARATOR:
1373               attrs[i].is_sentence_end = TRUE;
1374               sentence_state = STATE_SENTENCE_POST_TERM_SEP;
1375               break;
1376
1377             default:
1378               attrs[i].is_sentence_end = TRUE;
1379               attrs[i].is_sentence_boundary = TRUE;
1380
1381               MAYBE_START_NEW_SENTENCE;
1382
1383               break;
1384             }
1385           break;
1386
1387         case STATE_SENTENCE_POST_TERM_CLOSE:
1388           /* End sentence on anything besides more punctuation; follow
1389            * rules for breaks
1390            */
1391           switch ((int) type)
1392             {
1393             case G_UNICODE_OTHER_PUNCTUATION:
1394             case G_UNICODE_CLOSE_PUNCTUATION:
1395               if (type == G_UNICODE_CLOSE_PUNCTUATION ||
1396                   wc == '.' ||
1397                   wc == ',' ||
1398                   wc == '?' ||
1399                   wc == '!')
1400                 /* continue in this state */
1401                 ;
1402               else
1403                 {
1404                   attrs[i].is_sentence_end = TRUE;
1405                   attrs[i].is_sentence_boundary = TRUE;
1406
1407                   MAYBE_START_NEW_SENTENCE;
1408                 }
1409               break;
1410
1411             case G_UNICODE_SPACE_SEPARATOR:
1412               attrs[i].is_sentence_end = TRUE;
1413               sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
1414               break;
1415
1416             case G_UNICODE_LINE_SEPARATOR:
1417             case G_UNICODE_PARAGRAPH_SEPARATOR:
1418               attrs[i].is_sentence_end = TRUE;
1419               /* undo the unconditional break-at-all-line/para-separators
1420                * from above; I'm not sure this is what the Unicode spec
1421                * intends, but it seems right - we get to include
1422                * a single line/para separator in the sentence according
1423                * to their rules
1424                */
1425               attrs[i].is_sentence_boundary = FALSE;
1426               sentence_state = STATE_SENTENCE_POST_TERM_SEP;
1427               break;
1428
1429             default:
1430               attrs[i].is_sentence_end = TRUE;
1431               attrs[i].is_sentence_boundary = TRUE;
1432
1433               MAYBE_START_NEW_SENTENCE;
1434
1435               break;
1436             }
1437           break;
1438
1439         case STATE_SENTENCE_POST_TERM_SPACE:
1440
1441           /* Sentence is definitely already ended; to enter this state
1442            * we had to see a space, which ends the sentence.
1443            */
1444
1445           switch ((int) type)
1446             {
1447             case G_UNICODE_SPACE_SEPARATOR:
1448               /* continue in this state */
1449               break;
1450
1451             case G_UNICODE_LINE_SEPARATOR:
1452             case G_UNICODE_PARAGRAPH_SEPARATOR:
1453               /* undo the unconditional break-at-all-line/para-separators
1454                * from above; I'm not sure this is what the Unicode spec
1455                * intends, but it seems right
1456                */
1457               attrs[i].is_sentence_boundary = FALSE;
1458               sentence_state = STATE_SENTENCE_POST_TERM_SEP;
1459               break;
1460
1461             default:
1462               attrs[i].is_sentence_boundary = TRUE;
1463
1464               MAYBE_START_NEW_SENTENCE;
1465
1466               break;
1467             }
1468           break;
1469
1470         case STATE_SENTENCE_POST_TERM_SEP:
1471           /* Break is forced at this point, unless we're a newline
1472            * after a CR, then we will break after the newline on the
1473            * next iteration. Only a single Sep can be in the
1474            * sentence.
1475            */
1476           if (!(prev_wc == '\r' && wc == '\n'))
1477             attrs[i].is_sentence_boundary = TRUE;
1478
1479           MAYBE_START_NEW_SENTENCE;
1480
1481           break;
1482
1483         case STATE_SENTENCE_DOT:
1484           switch ((int) type)
1485             {
1486             case G_UNICODE_CLOSE_PUNCTUATION:
1487               sentence_state = STATE_SENTENCE_POST_DOT_CLOSE;
1488               break;
1489
1490             case G_UNICODE_SPACE_SEPARATOR:
1491               possible_sentence_end = i;
1492               sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
1493               break;
1494
1495             default:
1496               /* If we broke on a control/format char, end the
1497                * sentence; else this was not a sentence end, since
1498                * we didn't enter the POST_DOT_SPACE state.
1499                */
1500               if (attrs[i].is_sentence_boundary)
1501                 {
1502                   attrs[i].is_sentence_end = TRUE;
1503
1504                   MAYBE_START_NEW_SENTENCE;
1505                 }
1506               else
1507                 sentence_state = STATE_SENTENCE_BODY;
1508               break;
1509             }
1510           break;
1511
1512         case STATE_SENTENCE_POST_DOT_CLOSE:
1513           switch ((int) type)
1514             {
1515             case G_UNICODE_SPACE_SEPARATOR:
1516               possible_sentence_end = i;
1517               sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
1518               break;
1519
1520             default:
1521               /* If we broke on a control/format char, end the
1522                * sentence; else this was not a sentence end, since
1523                * we didn't enter the POST_DOT_SPACE state.
1524                */
1525               if (attrs[i].is_sentence_boundary)
1526                 {
1527                   attrs[i].is_sentence_end = TRUE;
1528
1529                   MAYBE_START_NEW_SENTENCE;
1530                 }
1531               else
1532                 sentence_state = STATE_SENTENCE_BODY;
1533               break;
1534             }
1535           break;
1536
1537         case STATE_SENTENCE_POST_DOT_SPACE:
1538
1539           possible_sentence_boundary = i;
1540
1541           switch ((int) type)
1542             {
1543             case G_UNICODE_SPACE_SEPARATOR:
1544               /* remain in current state */
1545               break;
1546
1547             case G_UNICODE_OPEN_PUNCTUATION:
1548               sentence_state = STATE_SENTENCE_POST_DOT_OPEN;
1549               break;
1550
1551             case G_UNICODE_LOWERCASE_LETTER:
1552               /* wasn't a sentence-ending period; so re-enter the sentence
1553                * body
1554                */
1555               sentence_state = STATE_SENTENCE_BODY;
1556               break;
1557
1558             default:
1559               /* End the sentence, break, maybe start a new one */
1560
1561               g_assert (possible_sentence_end >= 0);
1562               g_assert (possible_sentence_boundary >= 0);
1563
1564               attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
1565               attrs[possible_sentence_end].is_sentence_end = TRUE;
1566
1567               possible_sentence_end = -1;
1568               possible_sentence_boundary = -1;
1569
1570               MAYBE_START_NEW_SENTENCE;
1571
1572               break;
1573             }
1574           break;
1575
1576         case STATE_SENTENCE_POST_DOT_OPEN:
1577           switch ((int) type)
1578             {
1579             case G_UNICODE_OPEN_PUNCTUATION:
1580               /* continue in current state */
1581               break;
1582
1583             case G_UNICODE_LOWERCASE_LETTER:
1584               /* wasn't a sentence-ending period; so re-enter the sentence
1585                * body
1586                */
1587               sentence_state = STATE_SENTENCE_BODY;
1588               break;
1589
1590             default:
1591               /* End the sentence, break, maybe start a new one */
1592
1593               g_assert (possible_sentence_end >= 0);
1594               g_assert (possible_sentence_boundary >= 0);
1595
1596               attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
1597               attrs[possible_sentence_end].is_sentence_end = TRUE;
1598
1599               possible_sentence_end = -1;
1600               possible_sentence_boundary = -1;
1601
1602               MAYBE_START_NEW_SENTENCE;
1603
1604               break;
1605             }
1606           break;
1607
1608         case STATE_SENTENCE_POST_DOT_SEP:
1609           /* Break is forced at this point, unless we're a newline
1610            * after a CR, then we will break after the newline on the
1611            * next iteration. Only a single Sep can be in the
1612            * sentence.
1613            */
1614           if (!(prev_wc == '\r' && wc == '\n'))
1615             attrs[i].is_sentence_boundary = TRUE;
1616
1617           g_assert (possible_sentence_end >= 0);
1618           g_assert (possible_sentence_boundary >= 0);
1619
1620           attrs[possible_sentence_end].is_sentence_end = TRUE;
1621
1622           possible_sentence_end = -1;
1623           possible_sentence_boundary = -1;
1624
1625           MAYBE_START_NEW_SENTENCE;
1626
1627           break;
1628
1629         default:
1630           g_assert_not_reached ();
1631           break;
1632         }
1633
1634       prev_type = type;
1635       prev_wc = wc;
1636
1637       /* wc might not be a valid Unicode base character, but really all we
1638        * need to know is the last non-combining character */
1639       if (type != G_UNICODE_COMBINING_MARK &&
1640           type != G_UNICODE_ENCLOSING_MARK &&
1641           type != G_UNICODE_NON_SPACING_MARK)
1642         base_character = wc;
1643     }
1644   i--;
1645
1646   attrs[i].is_cursor_position = TRUE;  /* Rule GB2 */
1647   attrs[0].is_cursor_position = TRUE;  /* Rule GB1 */
1648
1649   attrs[i].is_word_boundary = TRUE;  /* Rule WB2 */
1650   attrs[0].is_word_boundary = TRUE;  /* Rule WB1 */
1651
1652   attrs[i].is_line_break = TRUE;  /* Rule LB3 */
1653   attrs[0].is_line_break = FALSE; /* Rule LB2 */
1654
1655 }
1656
1657 static gboolean
1658 tailor_break (const gchar   *text,
1659              gint           length,
1660              PangoAnalysis *analysis,
1661              PangoLogAttr  *attrs,
1662              int            attrs_len)
1663 {
1664   if (analysis->lang_engine && PANGO_ENGINE_LANG_GET_CLASS (analysis->lang_engine)->script_break)
1665     {
1666       if (length < 0)
1667         length = strlen (text);
1668       else if (text == NULL)
1669         text = "";
1670
1671       PANGO_ENGINE_LANG_GET_CLASS (analysis->lang_engine)->script_break (analysis->lang_engine, text, length, analysis, attrs, attrs_len);
1672       return TRUE;
1673     }
1674   return FALSE;
1675 }
1676
1677 /**
1678  * pango_break:
1679  * @text:      the text to process
1680  * @length:    length of @text in bytes (may be -1 if @text is nul-terminated)
1681  * @analysis:  #PangoAnalysis structure from pango_itemize()
1682  * @attrs:     an array to store character information in
1683  * @attrs_len: size of the array passed as @attrs
1684  *
1685  * Determines possible line, word, and character breaks
1686  * for a string of Unicode text with a single analysis.  For most
1687  * purposes you may want to use pango_get_log_attrs().
1688  */
1689 void
1690 pango_break (const gchar   *text,
1691              gint           length,
1692              PangoAnalysis *analysis,
1693              PangoLogAttr  *attrs,
1694              int            attrs_len)
1695 {
1696   g_return_if_fail (analysis != NULL);
1697   g_return_if_fail (attrs != NULL);
1698
1699   pango_default_break (text, length, analysis, attrs, attrs_len);
1700   tailor_break        (text, length, analysis, attrs, attrs_len);
1701 }
1702
1703 /**
1704  * pango_find_paragraph_boundary:
1705  * @text: UTF-8 text
1706  * @length: length of @text in bytes, or -1 if nul-terminated
1707  * @paragraph_delimiter_index: return location for index of delimiter
1708  * @next_paragraph_start: return location for start of next paragraph
1709  *
1710  * Locates a paragraph boundary in @text. A boundary is caused by
1711  * delimiter characters, such as a newline, carriage return, carriage
1712  * return-newline pair, or Unicode paragraph separator character.  The
1713  * index of the run of delimiters is returned in
1714  * @paragraph_delimiter_index. The index of the start of the paragraph
1715  * (index after all delimiters) is stored in @next_paragraph_start.
1716  *
1717  * If no delimiters are found, both @paragraph_delimiter_index and
1718  * @next_paragraph_start are filled with the length of @text (an index one
1719  * off the end).
1720  **/
1721 void
1722 pango_find_paragraph_boundary (const gchar *text,
1723                                gint         length,
1724                                gint        *paragraph_delimiter_index,
1725                                gint        *next_paragraph_start)
1726 {
1727   const gchar *p = text;
1728   const gchar *end;
1729   const gchar *start = NULL;
1730   const gchar *delimiter = NULL;
1731
1732   /* Only one character has type G_UNICODE_PARAGRAPH_SEPARATOR in
1733    * Unicode 5.0; update the following code if that changes.
1734    */
1735
1736   /* prev_sep is the first byte of the previous separator.  Since
1737    * the valid separators are \r, \n, and PARAGRAPH_SEPARATOR, the
1738    * first byte is enough to identify it.
1739    */
1740   gchar prev_sep;
1741
1742
1743   if (length < 0)
1744     length = strlen (text);
1745
1746   end = text + length;
1747
1748   if (paragraph_delimiter_index)
1749     *paragraph_delimiter_index = length;
1750
1751   if (next_paragraph_start)
1752     *next_paragraph_start = length;
1753
1754   if (length == 0)
1755     return;
1756
1757   prev_sep = 0;
1758
1759   while (p != end)
1760     {
1761       if (prev_sep == '\n' ||
1762           prev_sep == PARAGRAPH_SEPARATOR_STRING[0])
1763         {
1764           g_assert (delimiter);
1765           start = p;
1766           break;
1767         }
1768       else if (prev_sep == '\r')
1769         {
1770           /* don't break between \r and \n */
1771           if (*p != '\n')
1772             {
1773               g_assert (delimiter);
1774               start = p;
1775               break;
1776             }
1777         }
1778
1779       if (*p == '\n' ||
1780            *p == '\r' ||
1781            !strncmp(p, PARAGRAPH_SEPARATOR_STRING,
1782                     strlen(PARAGRAPH_SEPARATOR_STRING)))
1783         {
1784           if (delimiter == NULL)
1785             delimiter = p;
1786           prev_sep = *p;
1787         }
1788       else
1789         prev_sep = 0;
1790
1791       p = g_utf8_next_char (p);
1792     }
1793
1794   if (delimiter && paragraph_delimiter_index)
1795     *paragraph_delimiter_index = delimiter - text;
1796
1797   if (start && next_paragraph_start)
1798     *next_paragraph_start = start - text;
1799 }
1800
1801 static int
1802 tailor_segment (const char      *range_start,
1803                 const char      *range_end,
1804                 PangoEngineLang *range_engine,
1805                 int              chars_broken,
1806                 PangoAnalysis   *analysis,
1807                 PangoLogAttr    *log_attrs)
1808 {
1809   int chars_in_range;
1810   PangoLogAttr attr_before = log_attrs[0];
1811
1812   analysis->lang_engine = range_engine;
1813   chars_in_range = pango_utf8_strlen (range_start, range_end - range_start);
1814
1815
1816   if (tailor_break (range_start,
1817                     range_end - range_start,
1818                     analysis,
1819                     log_attrs + chars_broken,
1820                     chars_in_range + 1))
1821     {
1822       /* if tailored, we enforce some of the attrs from before tailoring at
1823        * the boundary
1824        */
1825
1826      log_attrs[0].backspace_deletes_character  = attr_before.backspace_deletes_character;
1827
1828      log_attrs[0].is_line_break      |= attr_before.is_line_break;
1829      log_attrs[0].is_mandatory_break |= attr_before.is_mandatory_break;
1830      log_attrs[0].is_cursor_position |= attr_before.is_cursor_position;
1831     }
1832
1833   return chars_in_range;
1834 }
1835
1836 /**
1837  * pango_get_log_attrs:
1838  * @text: text to process
1839  * @length: length in bytes of @text
1840  * @level: embedding level, or -1 if unknown
1841  * @language: language tag
1842  * @log_attrs: array with one #PangoLogAttr per character in @text, plus one extra, to be filled in
1843  * @attrs_len: length of @log_attrs array
1844  *
1845  * Computes a #PangoLogAttr for each character in @text. The @log_attrs
1846  * array must have one #PangoLogAttr for each position in @text; if
1847  * @text contains N characters, it has N+1 positions, including the
1848  * last position at the end of the text. @text should be an entire
1849  * paragraph; logical attributes can't be computed without context
1850  * (for example you need to see spaces on either side of a word to know
1851  * the word is a word).
1852  */
1853 void
1854 pango_get_log_attrs (const char    *text,
1855                      int            length,
1856                      int            level,
1857                      PangoLanguage *language,
1858                      PangoLogAttr  *log_attrs,
1859                      int            attrs_len)
1860 {
1861   PangoMap *lang_map;
1862   int chars_broken;
1863   const char *range_start, *range_end;
1864   PangoScript script;
1865   PangoEngineLang *range_engine;
1866   static guint engine_type_id = 0;
1867   static guint render_type_id = 0;
1868   PangoAnalysis analysis = { NULL };
1869   PangoScriptIter iter;
1870
1871   g_return_if_fail (length == 0 || text != NULL);
1872   g_return_if_fail (log_attrs != NULL);
1873
1874   analysis.level = level;
1875
1876   pango_default_break (text, length, &analysis, log_attrs, attrs_len);
1877
1878   if (engine_type_id == 0)
1879     {
1880       engine_type_id = g_quark_from_static_string (PANGO_ENGINE_TYPE_LANG);
1881       render_type_id = g_quark_from_static_string (PANGO_RENDER_TYPE_NONE);
1882     }
1883
1884   lang_map = pango_find_map (language, engine_type_id, render_type_id);
1885
1886   chars_broken = 0;
1887
1888   _pango_script_iter_init (&iter, text, length);
1889   pango_script_iter_get_range (&iter, &range_start, &range_end, &script);
1890   range_engine = (PangoEngineLang*) pango_map_get_engine (lang_map, script);
1891   g_assert (range_start == text);
1892
1893   while (pango_script_iter_next (&iter))
1894     {
1895       const char *run_start, *run_end;
1896       PangoEngineLang* run_engine;
1897
1898       pango_script_iter_get_range (&iter, &run_start, &run_end, &script);
1899       run_engine = (PangoEngineLang*) pango_map_get_engine (lang_map, script);
1900       g_assert (range_end == run_start);
1901
1902       if (range_engine != run_engine)
1903         {
1904           /* Engine has changed; do the tailoring for the current range,
1905            * then start a new range.
1906            */
1907           chars_broken += tailor_segment (range_start, range_end, range_engine, chars_broken, &analysis, log_attrs);
1908
1909           range_start = run_start;
1910           range_engine = run_engine;
1911         }
1912       range_end = run_end;
1913     }
1914   _pango_script_iter_fini (&iter);
1915
1916   g_assert (length < 0 || range_end == text + length);
1917
1918   chars_broken += tailor_segment (range_start, range_end, range_engine, chars_broken, &analysis, log_attrs);
1919
1920   if (chars_broken + 1 > attrs_len)
1921     g_warning ("pango_get_log_attrs: attrs_len should have been at least %d, but was %d.  Expect corrupted memory.",
1922                chars_broken + 1,
1923                attrs_len);
1924 }