pango/break.c

   1 /* Pango
   2  * break.c:
   3  *
   4  * Copyright (C) 1999 Red Hat Software
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Library General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Library General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Library General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include "config.h"
  23
  24 #include "pango-break.h"
  25 #include "pango-modules.h"
  26 #include "pango-script-private.h"
  27 #include "pango-impl-utils.h"
  28 #include <string.h>
  29
  30 #define PARAGRAPH_SEPARATOR 0x2029
  31 #define PARAGRAPH_SEPARATOR_STRING "\xE2\x80\xA9"
  32
  33 /* See http://www.unicode.org/unicode/reports/tr14/ if you hope
  34  * to understand the line breaking code.
  35  */
  36
  37 typedef enum /* Line-break decision between two adjacent classes; element type of the row_* pair-table arrays */
  38 {
  39   BREAK_ALREADY_HANDLED,   /* didn't use the table (first enumerator, so its value is 0) */
  40   BREAK_PROHIBITED, /* no break, even if spaces intervene */
  41   BREAK_IF_SPACES,  /* "indirect break" (only if there are spaces) */
  42   BREAK_ALLOWED     /* "direct break" (can always break here) */
  43   /* TR 14 has one more break-opportunity class,
  44    * "indirect break opportunity for combining marks following a space",
  45    * but we handle that inline in the code, so it has no enumerator here.
  46    */
  47 } BreakOpportunity;
  48
  49
  50 enum /* Positions of the line-break classes; values below INDEX_END_OF_TABLE are rows/columns of the pair table */
  51 {
  52   INDEX_OPEN_PUNCTUATION, /* first enumerator: row/column 0 */
  53   INDEX_CLOSE_PUNCTUATION,
  54   INDEX_QUOTATION,
  55   INDEX_NON_BREAKING_GLUE,
  56   INDEX_NON_STARTER,
  57   INDEX_EXCLAMATION,
  58   INDEX_SYMBOL,
  59   INDEX_INFIX_SEPARATOR,
  60   INDEX_PREFIX,
  61   INDEX_POSTFIX,
  62   INDEX_NUMERIC,
  63   INDEX_ALPHABETIC,
  64   INDEX_IDEOGRAPHIC,
  65   INDEX_INSEPARABLE,
  66   INDEX_HYPHEN,
  67   INDEX_AFTER,
  68   INDEX_BEFORE,
  69   INDEX_BEFORE_AND_AFTER,
  70   INDEX_ZERO_WIDTH_SPACE,
  71   INDEX_COMBINING_MARK,
  72   INDEX_WORD_JOINER, /* last class that has a row and a column */
  73
  74   /* End of the table: every enumerator above is a row and a column of the pair table */
  75
  76   INDEX_END_OF_TABLE, /* number of in-table classes; length of each row_* array and of line_break_rows */
  77
  78   /* The following are not in the tables; IN_BREAK_TABLE() is false for break types that map to them */
  79   INDEX_MANDATORY,
  80   INDEX_CARRIAGE_RETURN,
  81   INDEX_LINE_FEED,
  82   INDEX_SURROGATE,
  83   INDEX_CONTINGENT,
  84   INDEX_SPACE,
  85   INDEX_COMPLEX_CONTEXT,
  86   INDEX_AMBIGUOUS,
  87   INDEX_UNKNOWN,
  88   INDEX_NEXT_LINE,
  89   INDEX_HANGUL_L_JAMO,
  90   INDEX_HANGUL_V_JAMO,
  91   INDEX_HANGUL_T_JAMO,
  92   INDEX_HANGUL_LV_SYLLABLE,
  93   INDEX_HANGUL_LVT_SYLLABLE,
  94 };
  95
  96 static const BreakOpportunity row_OPEN_PUNCTUATION[INDEX_END_OF_TABLE] = { /* row for a preceding OPEN_PUNCTUATION class; every entry is BREAK_PROHIBITED */
  97   BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
  98   BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
  99   BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 100   BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 101   BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 102   BREAK_PROHIBITED /* after: WORD_JOINER */
 103 };
 104
 105 static const BreakOpportunity row_CLOSE_PUNCTUATION[INDEX_END_OF_TABLE] = { /* row for a preceding CLOSE_PUNCTUATION class; columns follow the INDEX_* order */
 106   BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 107   BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 108   BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 109   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 110   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 111   BREAK_PROHIBITED /* after: WORD_JOINER */
 112 };
 113
 114 static const BreakOpportunity row_QUOTATION[INDEX_END_OF_TABLE] = { /* row for a preceding QUOTATION class; it contains no BREAK_ALLOWED entry */
 115   BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 116   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 117   BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 118   BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 119   BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 120   BREAK_PROHIBITED /* after: WORD_JOINER */
 121 };
 122
 123 static const BreakOpportunity row_NON_BREAKING_GLUE[INDEX_END_OF_TABLE] = { /* row for a preceding NON_BREAKING_GLUE class; it contains no BREAK_ALLOWED entry */
 124   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 125   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 126   BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 127   BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 128   BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 129   BREAK_PROHIBITED /* after: WORD_JOINER */
 130 };
 131
 132 static const BreakOpportunity row_NON_STARTER[INDEX_END_OF_TABLE] = { /* row for a preceding NON_STARTER class; columns follow the INDEX_* order */
 133   BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 134   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 135   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 136   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 137   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 138   BREAK_PROHIBITED /* after: WORD_JOINER */
 139 };
 140
 141 static const BreakOpportunity row_EXCLAMATION[INDEX_END_OF_TABLE] = { /* row for a preceding EXCLAMATION class; same values as row_NON_STARTER */
 142   BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 143   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 144   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 145   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 146   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 147   BREAK_PROHIBITED /* after: WORD_JOINER */
 148 };
 149
 150 static const BreakOpportunity row_SYMBOL[INDEX_END_OF_TABLE] = { /* row for a preceding SYMBOL class; columns follow the INDEX_* order */
 151   BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 152   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 153   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 154   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 155   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 156   BREAK_PROHIBITED /* after: WORD_JOINER */
 157 };
 158
 159 static const BreakOpportunity row_INFIX_SEPARATOR[INDEX_END_OF_TABLE] = { /* row for a preceding INFIX_SEPARATOR class; columns follow the INDEX_* order */
 160   BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 161   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 162   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 163   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 164   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 165   BREAK_PROHIBITED /* after: WORD_JOINER */
 166 };
 167
 168 static const BreakOpportunity row_PREFIX[INDEX_END_OF_TABLE] = { /* row for a preceding PREFIX class; columns follow the INDEX_* order */
 169   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 170   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 171   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 172   BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 173   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 174   BREAK_PROHIBITED /* after: WORD_JOINER */
 175 };
 176
 177 static const BreakOpportunity row_POSTFIX[INDEX_END_OF_TABLE] = { /* row for a preceding POSTFIX class; same values as row_NON_STARTER */
 178   BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 179   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 180   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 181   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 182   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 183   BREAK_PROHIBITED /* after: WORD_JOINER */
 184 };
 185
 186 static const BreakOpportunity row_NUMERIC[INDEX_END_OF_TABLE] = { /* row for a preceding NUMERIC class; columns follow the INDEX_* order */
 187   BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 188   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 189   BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 190   BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 191   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 192   BREAK_PROHIBITED /* after: WORD_JOINER */
 193 };
 194
 195 static const BreakOpportunity row_ALPHABETIC[INDEX_END_OF_TABLE] = { /* row for a preceding ALPHABETIC class; columns follow the INDEX_* order */
 196   BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 197   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 198   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 199   BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 200   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 201   BREAK_PROHIBITED /* after: WORD_JOINER */
 202 };
 203
 204 static const BreakOpportunity row_IDEOGRAPHIC[INDEX_END_OF_TABLE] = { /* row for a preceding IDEOGRAPHIC class; columns follow the INDEX_* order */
 205   BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 206   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 207   BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 208   BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 209   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 210   BREAK_PROHIBITED /* after: WORD_JOINER */
 211 };
 212
 213 static const BreakOpportunity row_INSEPARABLE[INDEX_END_OF_TABLE] = { /* row for a preceding INSEPARABLE class; columns follow the INDEX_* order */
 214   BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 215   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 216   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 217   BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 218   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 219   BREAK_PROHIBITED /* after: WORD_JOINER */
 220 };
 221
 222 static const BreakOpportunity row_HYPHEN[INDEX_END_OF_TABLE] = { /* row for a preceding HYPHEN class; same values as row_SYMBOL */
 223   BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 224   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 225   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 226   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 227   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 228   BREAK_PROHIBITED /* after: WORD_JOINER */
 229 };
 230
 231 static const BreakOpportunity row_AFTER[INDEX_END_OF_TABLE] = { /* row for a preceding AFTER class; same values as row_NON_STARTER */
 232   BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 233   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 234   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 235   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 236   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 237   BREAK_PROHIBITED /* after: WORD_JOINER */
 238 };
 239
 240 static const BreakOpportunity row_BEFORE[INDEX_END_OF_TABLE] = { /* row for a preceding BEFORE class; it contains no BREAK_ALLOWED entry */
 241   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 242   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 243   BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 244   BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 245   BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 246   BREAK_PROHIBITED /* after: WORD_JOINER */
 247 };
 248
 249 static const BreakOpportunity row_BEFORE_AND_AFTER[INDEX_END_OF_TABLE] = { /* row for a preceding BEFORE_AND_AFTER class; note the PROHIBITED entry for a following BEFORE_AND_AFTER */
 250   BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 251   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 252   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 253   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 254   BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 255   BREAK_PROHIBITED /* after: WORD_JOINER */
 256 };
 257
 258 static const BreakOpportunity row_ZERO_WIDTH_SPACE[INDEX_END_OF_TABLE] = { /* row for a preceding ZERO_WIDTH_SPACE class; BREAK_ALLOWED everywhere except before ZERO_WIDTH_SPACE and COMBINING_MARK */
 259   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 260   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 261   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 262   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 263   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 264   BREAK_ALLOWED /* after: WORD_JOINER */
 265 };
 266
 267 static const BreakOpportunity row_COMBINING_MARK[INDEX_END_OF_TABLE] = { /* row for a preceding COMBINING_MARK class; same values as row_ALPHABETIC */
 268   BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 269   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 270   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 271   BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 272   BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 273   BREAK_PROHIBITED /* after: WORD_JOINER */
 274 };
 275
 276 static const BreakOpportunity row_WORD_JOINER[INDEX_END_OF_TABLE] = { /* row for a preceding WORD_JOINER class; same values as row_BEFORE */
 277   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: OPEN_PUNCTUATION, CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE */
 278   BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: NON_STARTER, EXCLAMATION, SYMBOL, INFIX_SEPARATOR */
 279   BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: PREFIX, POSTFIX, NUMERIC, ALPHABETIC */
 280   BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, /* after: IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER */
 281   BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, /* after: BEFORE, BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK */
 282   BREAK_PROHIBITED /* after: WORD_JOINER */
 283 };
 284
 285 static const BreakOpportunity *const line_break_rows[INDEX_END_OF_TABLE] = { /* the pair table: one row per in-table class, selected through BREAK_ROW() by the class before the break position */
 286   row_OPEN_PUNCTUATION, /* INDEX_OPEN_PUNCTUATION */
 287   row_CLOSE_PUNCTUATION, /* INDEX_CLOSE_PUNCTUATION */
 288   row_QUOTATION, /* INDEX_QUOTATION */
 289   row_NON_BREAKING_GLUE, /* INDEX_NON_BREAKING_GLUE */
 290   row_NON_STARTER, /* INDEX_NON_STARTER */
 291   row_EXCLAMATION, /* INDEX_EXCLAMATION */
 292   row_SYMBOL, /* INDEX_SYMBOL */
 293   row_INFIX_SEPARATOR, /* INDEX_INFIX_SEPARATOR */
 294   row_PREFIX, /* INDEX_PREFIX */
 295   row_POSTFIX, /* INDEX_POSTFIX */
 296   row_NUMERIC, /* INDEX_NUMERIC */
 297   row_ALPHABETIC, /* INDEX_ALPHABETIC */
 298   row_IDEOGRAPHIC, /* INDEX_IDEOGRAPHIC */
 299   row_INSEPARABLE, /* INDEX_INSEPARABLE */
 300   row_HYPHEN, /* INDEX_HYPHEN */
 301   row_AFTER, /* INDEX_AFTER */
 302   row_BEFORE, /* INDEX_BEFORE */
 303   row_BEFORE_AND_AFTER, /* INDEX_BEFORE_AND_AFTER */
 304   row_ZERO_WIDTH_SPACE, /* INDEX_ZERO_WIDTH_SPACE */
 305   row_COMBINING_MARK, /* INDEX_COMBINING_MARK */
 306   row_WORD_JOINER /* INDEX_WORD_JOINER; entries must stay in INDEX_* order because the array is indexed by those values */
 307 };
 308
 309 /* Map GUnicodeBreakType to table indexes: entry i is the INDEX_* value for the break type whose numeric value is i (see BREAK_INDEX) */
 310 static const int line_break_indexes[] = { /* NOTE(review): the order presumably mirrors GLib's GUnicodeBreakType declaration, which is not visible here -- confirm before editing */
 311   INDEX_MANDATORY,
 312   INDEX_CARRIAGE_RETURN,
 313   INDEX_LINE_FEED,
 314   INDEX_COMBINING_MARK,
 315   INDEX_SURROGATE,
 316   INDEX_ZERO_WIDTH_SPACE,
 317   INDEX_INSEPARABLE,
 318   INDEX_NON_BREAKING_GLUE,
 319   INDEX_CONTINGENT,
 320   INDEX_SPACE,
 321   INDEX_AFTER,
 322   INDEX_BEFORE,
 323   INDEX_BEFORE_AND_AFTER,
 324   INDEX_HYPHEN,
 325   INDEX_NON_STARTER,
 326   INDEX_OPEN_PUNCTUATION,
 327   INDEX_CLOSE_PUNCTUATION,
 328   INDEX_QUOTATION,
 329   INDEX_EXCLAMATION,
 330   INDEX_IDEOGRAPHIC,
 331   INDEX_NUMERIC,
 332   INDEX_INFIX_SEPARATOR,
 333   INDEX_SYMBOL,
 334   INDEX_ALPHABETIC,
 335   INDEX_PREFIX,
 336   INDEX_POSTFIX,
 337   INDEX_COMPLEX_CONTEXT,
 338   INDEX_AMBIGUOUS,
 339   INDEX_UNKNOWN,
 340   INDEX_NEXT_LINE,
 341   INDEX_WORD_JOINER,
 342   INDEX_HANGUL_L_JAMO,
 343   INDEX_HANGUL_V_JAMO,
 344   INDEX_HANGUL_T_JAMO,
 345   INDEX_HANGUL_LV_SYLLABLE,
 346   INDEX_HANGUL_LVT_SYLLABLE /* break types numerically beyond this entry are replaced by BREAK_TYPE_SAFE() before lookup */
 347 };
 348
 349 #define BREAK_TYPE_SAFE(btype)            \
 350          ((btype) < G_N_ELEMENTS(line_break_indexes) ? (btype) : G_UNICODE_BREAK_UNKNOWN) /* clamp: a break type past the end of line_break_indexes becomes G_UNICODE_BREAK_UNKNOWN; evaluates btype twice */
 351 #define BREAK_INDEX(btype)                \
 352          (line_break_indexes[(btype)]) /* INDEX_* value for a break type; no bounds check, so btype must already be within line_break_indexes */
 353 #define BREAK_ROW(before_type)            \
 354          (line_break_rows[BREAK_INDEX (before_type)]) /* pair-table row for the class before the break; in bounds only when IN_BREAK_TABLE (before_type) holds */
 355 #define BREAK_OP(before_type, after_type) \
 356          (BREAK_ROW (before_type)[BREAK_INDEX (after_type)]) /* BreakOpportunity for the pair; both types must satisfy IN_BREAK_TABLE() or the lookup indexes past the table */
 357 #define IN_BREAK_TABLE(btype)             \
 358          ((btype) < G_N_ELEMENTS(line_break_indexes) && BREAK_INDEX((btype)) < INDEX_END_OF_TABLE) /* true when btype maps to a row/column of the pair table; evaluates btype twice */
 359
 360
 361
 362 /*
 363  * Hangul Conjoining Jamo handling.
 364  *
 365  * The way we implement it is just a bit different from TR14,
 366  * but produces the same results.
 367  * The same algorithm is also used in TR29 for cluster boundaries.
 368  *
 369  */
 370
 371
 372 /* An enum that works as the states of the Hangul syllables system; its values also index HangulJamoProps[].
 373  **/
 374 typedef enum /* order matters: JAMO_TYPE() yields (btype - G_UNICODE_BREAK_HANGUL_L_JAMO) for the first five values */
 375 {
 376   JAMO_L,       /* G_UNICODE_BREAK_HANGUL_L_JAMO */
 377   JAMO_V,       /* G_UNICODE_BREAK_HANGUL_V_JAMO */
 378   JAMO_T,       /* G_UNICODE_BREAK_HANGUL_T_JAMO */
 379   JAMO_LV,      /* G_UNICODE_BREAK_HANGUL_LV_SYLLABLE */
 380   JAMO_LVT,     /* G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE */
 381   NO_JAMO       /* Other: what JAMO_TYPE() returns when IS_JAMO() is false */
 382 } JamoType;
 383
 384 /* There are Hangul syllables encoded as single characters that act like a
 385  * sequence of Jamos. For each character we define a JamoType
 386  * that the character starts with, and one that it ends with.  This
 387  * decomposes JAMO_LV and JAMO_LVT into other, simple Jamos.  So for
 388  * example, a character with LineBreak type
 389  * G_UNICODE_BREAK_HANGUL_LV_SYLLABLE has start=JAMO_L and end=JAMO_V.
 390  */
 391 typedef struct _CharJamoProps
 392 {
 393   JamoType start, end; /* simple Jamo the character begins with / ends with (NO_JAMO for non-Hangul) */
 394 } CharJamoProps;
 395
 396 /* Map from JamoType to CharJamoProps that hold only simple
 397  * JamoTypes (no LV or LVT) or none.
 398  */
 399 static const CharJamoProps HangulJamoProps[] = { /* indexed by JamoType, so entries must stay in JamoType declaration order */
 400   {JAMO_L, JAMO_L},     /* JAMO_L */
 401   {JAMO_V, JAMO_V},     /* JAMO_V */
 402   {JAMO_T, JAMO_T},     /* JAMO_T */
 403   {JAMO_L, JAMO_V},     /* JAMO_LV: starts as L, ends as V */
 404   {JAMO_L, JAMO_T},     /* JAMO_LVT: starts as L, ends as T */
 405   {NO_JAMO, NO_JAMO}    /* NO_JAMO */
 406 };
 407
 408 /* A character forms a syllable with the previous character if and only if:
 409  * JamoType(this) is not NO_JAMO and:
 410  *
 411  * HangulJamoProps[JamoType(prev)].end and
 412  * HangulJamoProps[JamoType(this)].start are equal,
 413  * or the former is one less than the latter.
 414  */
 415
 416 #define IS_JAMO(btype)              \
 417         (((btype) >= G_UNICODE_BREAK_HANGUL_L_JAMO) && \
 418          ((btype) <= G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE)) /* true for the five Hangul break types L, V, T, LV, LVT; btype is parenthesized so any expression is safe, but it is evaluated twice */
 419 #define JAMO_TYPE(btype)      \
 420         (IS_JAMO(btype) ? ((btype) - G_UNICODE_BREAK_HANGUL_L_JAMO) : NO_JAMO) /* JamoType for a break type: offset from G_UNICODE_BREAK_HANGUL_L_JAMO (JAMO_L..JAMO_LVT), or NO_JAMO for anything else */
 421
 422 /* Types of Japanese characters; every macro in this group evaluates wc more than once, so pass side-effect-free expressions */
 423 #define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF) /* whole span from the start of KANJI() to the end of KATAKANA() */
 424 #define KANJI(wc)    ((wc) >= 0x2F00 && (wc) <= 0x2FDF)
 425 #define HIRAGANA(wc) ((wc) >= 0x3040 && (wc) <= 0x309F)
 426 #define KATAKANA(wc) ((wc) >= 0x30A0 && (wc) <= 0x30FF)
 427
 428 #define LATIN(wc) (((wc) >= 0x0020 && (wc) <= 0x02AF) || ((wc) >= 0x1E00 && (wc) <= 0x1EFF))
 429 #define CYRILLIC(wc) (((wc) >= 0x0400 && (wc) <= 0x052F))
 430 #define GREEK(wc) (((wc) >= 0x0370 && (wc) <= 0x3FF) || ((wc) >= 0x1F00 && (wc) <= 0x1FFF)) /* 0x3FF is 0x03FF, written without the leading zero */
 431 #define KANA(wc) ((wc) >= 0x3040 && (wc) <= 0x30FF) /* the HIRAGANA() and KATAKANA() ranges combined */
 432 #define HANGUL(wc) ((wc) >= 0xAC00 && (wc) <= 0xD7A3)
 433 #define BACKSPACE_DELETES_CHARACTER(wc) (!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA(wc) && !HANGUL(wc)) /* true when wc is outside all five ranges tested here; pango_default_break() applies it to base_character at grapheme boundaries */
 434
 435 /* p. 132-133 of Unicode spec table 5-6 will help understand this */
 436 typedef enum /* state of the sentence-boundary scan; pango_default_break() starts in STATE_SENTENCE_OUTSIDE */
 437 {
 438   STATE_SENTENCE_OUTSIDE, /* initial value of sentence_state */
 439   STATE_SENTENCE_BODY,
 440   STATE_SENTENCE_TERM,
 441   STATE_SENTENCE_POST_TERM_CLOSE,
 442   STATE_SENTENCE_POST_TERM_SPACE,
 443   STATE_SENTENCE_POST_TERM_SEP,
 444   STATE_SENTENCE_DOT, /* NOTE(review): the DOT states presumably track a period, as opposed to the TERM states -- confirm against the transition code */
 445   STATE_SENTENCE_POST_DOT_CLOSE,
 446   STATE_SENTENCE_POST_DOT_SPACE,
 447   STATE_SENTENCE_POST_DOT_OPEN,
 448   /* never include line/para separators in a sentence for now */
 449   /* This isn't in the spec, but I can't figure out why they'd include
 450    * one line/para separator in lines ending with Term but not with
 451    * period-terminated lines, so I'm doing it for the dot lines also
 452    */
 453   STATE_SENTENCE_POST_DOT_SEP
 454 } SentenceState;
 455
 456 /* We call "123" and "foobar" words, but "123foo" is two words;
 457  * the Unicode spec just calls "123" a non-word
 458  */
 459 typedef enum /* kind of word being scanned; pango_default_break() initialises current_word_type to WordNone */
 460 {
 461   WordNone,    /* initial value of current_word_type */
 462   WordLetters, /* a word in the sense of "foobar" in the comment before this enum */
 463   WordNumbers  /* a word in the sense of "123" in the comment before this enum */
 464 } WordType;
 465
 466
 467 /**
 468  * pango_default_break:
 469  * @text: text to break
 470  * @length: length of text in bytes (may be -1 if @text is nul-terminated)
 471  * @analysis: a #PangoAnalysis for the @text
 472  * @attrs: logical attributes to fill in
 473  * @attrs_len: size of the array passed as @attrs
 474  *
 475  * This is the default break algorithm, used if no language
 476  * engine overrides it. Normally you should use pango_break()
 477  * instead. Unlike pango_break(),
 478  * @analysis can be %NULL, but only do that if you know what
 479  * you're doing. If you need an analysis to pass to pango_break(),
 480  * you need to call pango_itemize().  In most cases, however, you should
 481  * simply use pango_get_log_attrs().
 482  **/
 483 void
 484 pango_default_break (const gchar   *text,
 485                      gint           length,
 486                      PangoAnalysis *analysis G_GNUC_UNUSED,
 487                      PangoLogAttr  *attrs,
 488                      int            attrs_len G_GNUC_UNUSED)
 489 {
 490   /* The rationale for all this is in section 5.15 of the Unicode 3.0 book,
 491    * the line breaking stuff is also in TR14 on unicode.org
 492    */
 493
 494   /* This is a default break implementation that should work for nearly all
 495    * languages. Language engines can override it optionally.
 496    */
 497
 498   /* FIXME one cheesy optimization here would be to memset attrs to 0
 499    * before we start, and then never assign %FALSE to anything
 500    */
 501
 502   const gchar *next;
 503   gint i;
 504
 505   gunichar prev_wc;
 506   gunichar next_wc;
 507
 508   JamoType prev_jamo;
 509
 510   GUnicodeBreakType next_break_type;
 511   GUnicodeType prev_type;
 512   GUnicodeBreakType prev_break_type; /* skips spaces */
 513   gboolean prev_was_break_space;
 514
 515   /* See Grapheme_Cluster_Break Property Values table of UAX#29 */
 516   typedef enum
 517   {
 518     GB_Other,
 519     GB_ControlCRLF,
 520     GB_Extend,
 521     GB_Prepend,
 522     GB_SpacingMark,
 523     GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */
 524   } GraphemeBreakType;
 525   GraphemeBreakType prev_GB_type = GB_Other;
 526
 527   /* See Word_Break Property Values table of UAX#29 */
 528   typedef enum
 529   {
 530     WB_Other,
 531     WB_NewlineCRLF,
 532     WB_ExtendFormat,
 533     WB_Katakana,
 534     WB_ALetter,
 535     WB_MidNumLet,
 536     WB_MidLetter,
 537     WB_MidNum,
 538     WB_Numeric,
 539     WB_ExtendNumLet,
 540   } WordBreakType;
 541   WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other;
 542   gint prev_WB_i = -1;
 543
 544   WordType current_word_type = WordNone;
 545   gunichar last_word_letter = 0;
 546   gunichar base_character = 0;
 547
 548   SentenceState sentence_state = STATE_SENTENCE_OUTSIDE;
 549   /* Tracks what will be the end of the sentence if a period is
 550    * determined to actually be a sentence-ending period.
 551    */
 552   gint possible_sentence_end = -1;
 553   /* possible sentence break before Open* after a period-ended sentence */
 554   gint possible_sentence_boundary = -1;
 555   gboolean almost_done = FALSE;
 556   gboolean done = FALSE;
 557
 558   g_return_if_fail (length == 0 || text != NULL);
 559   g_return_if_fail (attrs != NULL);
 560
 561   next = text;
 562
 563   prev_type = G_UNICODE_PARAGRAPH_SEPARATOR;
 564   prev_break_type = G_UNICODE_BREAK_UNKNOWN;
 565   prev_was_break_space = FALSE;
 566   prev_wc = 0;
 567   prev_jamo = NO_JAMO;
 568
 569   if (length == 0 || *text == '\0')
 570     {
 571       next_wc = PARAGRAPH_SEPARATOR;
 572       almost_done = TRUE;
 573     }
 574   else
 575     next_wc = g_utf8_get_char (next);
 576
 577   next_break_type = g_unichar_break_type (next_wc);
 578   next_break_type = BREAK_TYPE_SAFE (next_break_type);
 579
 580   for (i = 0; !done ; i++)
 581     {
 582       GUnicodeType type;
 583       gunichar wc;
 584       GUnicodeBreakType break_type;
 585       BreakOpportunity break_op;
 586       JamoType jamo;
 587       gboolean makes_hangul_syllable;
 588
 589       /* UAX#29 boundaries */
 590       gboolean is_grapheme_boundary;
 591       gboolean is_word_boundary;
 592
 593
 594       wc = next_wc;
 595       break_type = next_break_type;
 596
 597       if (almost_done)
 598         {
 599           /*
 600            * If we have already reached the end of @text g_utf8_next_char()
 601            * may not increment next
 602            */
 603           next_wc = 0;
 604           next_break_type = G_UNICODE_BREAK_UNKNOWN;
 605           done = TRUE;
 606         }
 607       else
 608         {
 609           next = g_utf8_next_char (next);
 610
 611           if ((length >= 0 && next >= text + length) || *next == '\0')
 612             {
 613               /* This is how we fill in the last element (end position) of the
 614                * attr array - assume there's a paragraph separators off the end
 615                * of @text.
 616                */
 617               next_wc = PARAGRAPH_SEPARATOR;
 618               almost_done = TRUE;
 619             }
 620           else
 621             next_wc = g_utf8_get_char (next);
 622
 623           next_break_type = g_unichar_break_type (next_wc);
 624           next_break_type = BREAK_TYPE_SAFE (next_break_type);
 625         }
 626
 627       type = g_unichar_type (wc);
 628       jamo = JAMO_TYPE (break_type);
 629
 630       /* Determine wheter this forms a Hangul syllable with prev. */
 631       if (jamo == NO_JAMO)
 632         makes_hangul_syllable = FALSE;
 633       else
 634         {
 635           JamoType prev_end   = HangulJamoProps[prev_jamo].end  ;
 636           JamoType this_start = HangulJamoProps[     jamo].start;
 637
 638           /* See comments before IS_JAMO */
 639           makes_hangul_syllable = (prev_end == this_start) || (prev_end + 1 == this_start);
 640         }
 641
 642       /* Can't just use the type here since isspace() doesn't
 643        * correspond to a Unicode character type
 644        */
 645       attrs[i].is_white = g_unichar_isspace (wc);
 646
 647       /* Just few spaces have variable width. So explicitly mark them.
 648        */
 649       attrs[i].is_expandable_space = (0x0020 == wc || 0x00A0 == wc);
 650
 651       /* ---- UAX#29 Grapheme Boundaries ---- */
 652       {
 653         GraphemeBreakType GB_type;
 654         /* Find the GraphemeBreakType of wc */
 655         GB_type = GB_Other;
 656         switch ((int) type)
 657           {
 658           case G_UNICODE_FORMAT:
 659             if (wc == 0x200C && wc == 0x200D)
 660               {
 661                 GB_type = GB_Extend; /* U+200C and U+200D are Other_Grapheme_Extend */
 662                 break;
 663               }
 664             /* fall through */
 665           case G_UNICODE_CONTROL:
 666           case G_UNICODE_LINE_SEPARATOR:
 667           case G_UNICODE_PARAGRAPH_SEPARATOR:
 668             GB_type = GB_ControlCRLF;
 669             break;
 670
 671           case G_UNICODE_OTHER_LETTER:
 672             if (makes_hangul_syllable)
 673               GB_type = GB_InHangulSyllable;
 674             else if ((wc & 0x0E00) == 0x0E00)
 675               {
 676                 /* Thai and Lao stuff hardcoded in UAX#29 */
 677                 if ((wc >= 0x0E40 && wc <= 0x0E44) || (wc >= 0x0EC0 && wc <= 0x0EC4))
 678                   GB_type = GB_Prepend; /* Prepend */
 679                 else if (wc == 0x0E30 || wc == 0x0E32 || wc == 0x0E33 || wc == 0x0E45 ||
 680                          wc == 0x0EB0 || wc == 0x0EB2 || wc == 0x0EB3)
 681                   GB_type = GB_Extend; /* Exceptions in the Extend definition */
 682               }
 683             break;
 684
 685           case G_UNICODE_MODIFIER_LETTER:
 686             if (wc >= 0xFF9E && wc <= 0xFF9F)
 687               GB_type = GB_Extend; /* Other_Grapheme_Extend */
 688             break;
 689
 690           case G_UNICODE_COMBINING_MARK:
 691             GB_type = GB_SpacingMark; /* SpacingMark */
 692             if (wc >= 0x0900)
 693               {
 694                 if (wc == 0x09BE || wc == 0x09D7 ||
 695                     wc == 0x0B3E || wc == 0x0B57 || wc == 0x0BBE || wc == 0x0BD7 ||
 696                     wc == 0x0CC2 || wc == 0x0CD5 || wc == 0x0CD6 ||
 697                     wc == 0x0D3E || wc == 0x0D57 || wc == 0x0DCF || wc == 0x0DDF ||
 698                     wc == 0x1D165 || (wc >= 0x1D16E && wc <= 0x1D172))
 699                   GB_type = GB_Extend; /* Other_Grapheme_Extend */
 700               }
 701             break;
 702
 703           case G_UNICODE_ENCLOSING_MARK:
 704           case G_UNICODE_NON_SPACING_MARK:
 705             GB_type = GB_Extend; /* Grapheme_Extend */
 706             break;
 707           }
 708
 709         /* Grapheme Cluster Boundary Rules */
 710         /* We apply Rules GB1 and GB2 at the end of the function */
 711         if (wc == '\n' && prev_wc == '\r')
 712           is_grapheme_boundary = FALSE; /* Rule GB3 */
 713         else if (prev_GB_type == GB_ControlCRLF || GB_type == GB_ControlCRLF)
 714           is_grapheme_boundary = TRUE; /* Rules GB4 and GB5 */
 715         else if (GB_type == GB_InHangulSyllable)
 716           is_grapheme_boundary = FALSE; /* Rules GB6, GB7, GB8 */
 717         else if (GB_type == GB_Extend)
 718           is_grapheme_boundary = FALSE; /* Rule GB9 */
 719         else if (GB_type == GB_SpacingMark)
 720           is_grapheme_boundary = FALSE; /* Rule GB9a */
 721         else if (prev_GB_type == GB_Prepend)
 722           is_grapheme_boundary = FALSE; /* Rule GB9b */
 723         else
 724           is_grapheme_boundary = TRUE;  /* Rule GB10 */
 725
 726         prev_GB_type = GB_type;
 727
 728         attrs[i].is_cursor_position = is_grapheme_boundary;
 729         /* If this is a grapheme boundary, we have to decide if backspace
 730          * deletes a character or the whole grapheme cluster */
 731         if (is_grapheme_boundary)
 732           attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character);
 733         else
 734           attrs[i].backspace_deletes_character = FALSE;
 735       }
 736
 737       /* ---- UAX#29 Word Boundaries ---- */
 738       {
 739         is_word_boundary = FALSE;
 740         if (is_grapheme_boundary) /* Rules WB3 and WB4 */
 741           {
 742             PangoScript script;
 743             WordBreakType WB_type;
 744
 745             script = pango_script_for_unichar (wc);
 746
 747             /* Find the WordBreakType of wc */
 748             WB_type = WB_Other;
 749
 750             if (script == PANGO_SCRIPT_KATAKANA)
 751               WB_type = WB_Katakana;
 752
 753             if (WB_type == WB_Other)
 754               switch (wc >> 8)
 755                 {
 756                 case 0x30:
 757                   if (wc == 0x3031 || wc == 0x3032 || wc == 0x3033 || wc == 0x3034 || wc == 0x3035 ||
 758                       wc == 0x309b || wc == 0x309c || wc == 0x30a0 || wc == 0x30fc)
 759                     WB_type = WB_Katakana; /* Katakana exceptions */
 760                   break;
 761                 case 0xFF:
 762                   if (wc == 0xFF70)
 763                     WB_type = WB_Katakana; /* Katakana exceptions */
 764                   else if (wc >= 0xFF9E || wc <= 0xFF9F)
 765                     WB_type = WB_ExtendFormat; /* Other_Grapheme_Extend */
 766                   break;
 767                 case 0x05:
 768                   if (wc == 0x05F3)
 769                     WB_type = WB_ALetter; /* ALetter exceptions */
 770                   break;
 771                 }
 772
 773             if (WB_type == WB_Other)
 774               switch ((int) break_type)
 775                 {
 776                 case G_UNICODE_BREAK_NUMERIC:
 777                   if (wc != 0x066C)
 778                     WB_type = WB_Numeric; /* Numeric */
 779                   break;
 780                 case G_UNICODE_BREAK_INFIX_SEPARATOR:
 781                   if (wc != 0x003A && wc != 0xFE13 && wc != 0x002E)
 782                     WB_type = WB_MidNum; /* MidNum */
 783                   break;
 784                 }
 785
 786             if (WB_type == WB_Other)
 787               switch ((int) type)
 788                 {
 789                 case G_UNICODE_CONTROL:
 790                   if (wc != 0x000D && wc != 0x000A && wc != 0x000B && wc != 0x000C && wc != 0x0085)
 791                     break;
 792                   /* fall through */
 793                 case G_UNICODE_LINE_SEPARATOR:
 794                 case G_UNICODE_PARAGRAPH_SEPARATOR:
 795                   WB_type = WB_NewlineCRLF; /* CR, LF, Newline */
 796                   break;
 797
 798                 case G_UNICODE_FORMAT:
 799                 case G_UNICODE_COMBINING_MARK:
 800                 case G_UNICODE_ENCLOSING_MARK:
 801                 case G_UNICODE_NON_SPACING_MARK:
 802                   WB_type = WB_ExtendFormat; /* Extend, Format */
 803                   break;
 804
 805                 case G_UNICODE_CONNECT_PUNCTUATION:
 806                   WB_type = WB_ExtendNumLet; /* ExtendNumLet */
 807                   break;
 808
 809                 case G_UNICODE_INITIAL_PUNCTUATION:
 810                 case G_UNICODE_FINAL_PUNCTUATION:
 811                   if (wc == 0x2018 || wc == 0x2019)
 812                     WB_type = WB_MidNumLet; /* MidNumLet */
 813                   break;
 814                 case G_UNICODE_OTHER_PUNCTUATION:
 815                   if (wc == 0x0027 || wc == 0x002e || wc == 0x2024 ||
 816                       wc == 0xfe52 || wc == 0xff07 || wc == 0xff0e)
 817                     WB_type = WB_MidNumLet; /* MidNumLet */
 818                   else if (wc == 0x00b7 || wc == 0x05f4 || wc == 0x2027 || wc == 0x003a || wc == 0x0387 ||
 819                            wc == 0xfe13 || wc == 0xfe55 || wc == 0xff1a)
 820                     WB_type = WB_MidLetter; /* WB_MidLetter */
 821                   else if (wc == 0x066c ||
 822                            wc == 0xfe50 || wc == 0xfe54 || wc == 0xff0c || wc == 0xff1b)
 823                     WB_type = WB_MidNum; /* MidNum */
 824                   break;
 825
 826                 case G_UNICODE_OTHER_SYMBOL:
 827                   if (wc >= 0x24B6 && wc <= 0x24E9) /* Other_Alphabetic */
 828                     goto Alphabetic;
 829                   break;
 830
 831                 case G_UNICODE_OTHER_LETTER:
 832                 case G_UNICODE_LETTER_NUMBER:
 833                   if (wc == 0x3006 || wc == 0x3007 ||
 834                       (wc >= 0x3021 && wc <= 0x3029) ||
 835                       (wc >= 0x3038 && wc <= 0x303A) ||
 836                       (wc >= 0x3400 && wc <= 0x4DB5) ||
 837                       (wc >= 0x4E00 && wc <= 0x9FC3) ||
 838                       (wc >= 0xF900 && wc <= 0xFA2D) ||
 839                       (wc >= 0xFA30 && wc <= 0xFA6A) ||
 840                       (wc >= 0xFA70 && wc <= 0xFAD9) ||
 841                       (wc >= 0x20000 && wc <= 0x2A6D6) ||
 842                       (wc >= 0x2F800 && wc <= 0x2FA1D))
 843                     break; /* ALetter exceptions: Ideographic */
 844                   goto Alphabetic;
 845
 846                 case G_UNICODE_LOWERCASE_LETTER:
 847                 case G_UNICODE_MODIFIER_LETTER:
 848                 case G_UNICODE_TITLECASE_LETTER:
 849                 case G_UNICODE_UPPERCASE_LETTER:
 850                 Alphabetic:
 851                   if (break_type != G_UNICODE_BREAK_COMPLEX_CONTEXT && script != PANGO_SCRIPT_HIRAGANA)
 852                     WB_type = WB_ALetter; /* ALetter */
 853                   break;
 854                 }
 855
 856             /* Word Boundary Rules */
 857
 858             /* We apply Rules WB1 and WB2 at the end of the function */
 859
 860             if (prev_wc == 0x3031 && wc == 0x41)
 861               g_debug ("Y %d %d", prev_WB_type, WB_type);
 862             if (prev_WB_type == WB_NewlineCRLF && prev_WB_i + 1 == i)
 863               {
 864                 /* The extra check for prev_WB_i is to correctly handle sequences like
 865                  * Newline ÷ Extend × Extend
 866                  * since we have not skipped ExtendFormat yet.
 867                  */
 868                 is_word_boundary = TRUE; /* Rule WB3a */
 869               }
 870             else if (WB_type == WB_NewlineCRLF)
 871               is_word_boundary = TRUE; /* Rule WB3b */
 872             else if (WB_type == WB_ExtendFormat)
 873               is_word_boundary = FALSE; /* Rules WB4? */
 874             else if ((prev_WB_type == WB_ALetter  ||
 875                       prev_WB_type == WB_Numeric  ||
 876                       prev_WB_type == WB_ExtendNumLet) &&
 877                      (     WB_type == WB_ALetter  ||
 878                            WB_type == WB_Numeric  ||
 879                            WB_type == WB_ExtendNumLet))
 880               is_word_boundary = FALSE; /* Rules WB5, WB8, WB9, WB10, WB13a, WB13b */
 881             else if ((prev_WB_type == WB_Katakana ||
 882                       prev_WB_type == WB_ExtendNumLet) &&
 883                      (     WB_type == WB_Katakana ||
 884                            WB_type == WB_ExtendNumLet))
 885               is_word_boundary = FALSE; /* Rules WB13, WB13a, WB13b */
 886             else if ((prev_prev_WB_type == WB_ALetter && WB_type == WB_ALetter) &&
 887                      (prev_WB_type == WB_MidLetter || prev_WB_type == WB_MidNumLet))
 888               {
 889                 attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB6 */
 890                 is_word_boundary = FALSE; /* Rule WB7 */
 891               }
 892             else if ((prev_prev_WB_type == WB_Numeric && WB_type == WB_Numeric) &&
 893                      (prev_WB_type == WB_MidNum || prev_WB_type == WB_MidNumLet))
 894               {
 895                 is_word_boundary = FALSE; /* Rule WB11 */
 896                 attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB12 */
 897               }
 898             else
 899               is_word_boundary = TRUE; /* Rule WB14 */
 900
 901             if (WB_type != WB_ExtendFormat)
 902               {
 903                 prev_prev_WB_type = prev_WB_type;
 904                 prev_WB_type = WB_type;
 905                 prev_WB_i = i;
 906               }
 907           }
 908
 909         attrs[i].is_word_boundary = is_word_boundary;
 910       }
 911
 912
 913       /* ---- Line breaking ---- */
 914
 915       break_op = BREAK_ALREADY_HANDLED;
 916
 917       g_assert (prev_break_type != G_UNICODE_BREAK_SPACE);
 918
 919       attrs[i].is_line_break = FALSE;
 920       attrs[i].is_mandatory_break = FALSE;
 921
 922       if (attrs[i].is_cursor_position) /* If it's not a grapheme boundary,
 923                                         * it's not a line break either
 924                                         */
 925         {
 926           /* space followed by a combining mark is handled
 927            * specially; (rule 7a from TR 14)
 928            */
 929           if (break_type == G_UNICODE_BREAK_SPACE &&
 930               next_break_type == G_UNICODE_BREAK_COMBINING_MARK)
 931             break_type = G_UNICODE_BREAK_IDEOGRAPHIC;
 932
 933           /* Unicode doesn't specify char wrap; we wrap around all chars
 934            * except where a line break is prohibited, which means we
 935            * effectively break everywhere except inside runs of spaces.
 936            */
 937           attrs[i].is_char_break = TRUE;
 938
 939           /* Make any necessary replacements first */
 940           switch ((int) prev_break_type)
 941             {
 942             case G_UNICODE_BREAK_HANGUL_L_JAMO:
 943             case G_UNICODE_BREAK_HANGUL_V_JAMO:
 944             case G_UNICODE_BREAK_HANGUL_T_JAMO:
 945             case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
 946             case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
 947               /* treat Jamo as IDEOGRAPHIC from now
 948                */
 949               prev_break_type = G_UNICODE_BREAK_IDEOGRAPHIC;
 950               break;
 951
 952             case G_UNICODE_BREAK_AMBIGUOUS:
 953               /* FIXME
 954                * we need to resolve the East Asian width
 955                * to decide what to do here
 956                */
 957             case G_UNICODE_BREAK_COMPLEX_CONTEXT:
 958               /* FIXME
 959                * language engines should handle this case...
 960                */
 961             case G_UNICODE_BREAK_UNKNOWN:
 962               /* convert unknown, complex, ambiguous to ALPHABETIC
 963                */
 964               prev_break_type = G_UNICODE_BREAK_ALPHABETIC;
 965               break;
 966
 967             default:
 968               ;
 969             }
 970
 971           switch ((int) prev_break_type)
 972             {
 973             case G_UNICODE_BREAK_MANDATORY:
 974             case G_UNICODE_BREAK_LINE_FEED:
 975             case G_UNICODE_BREAK_NEXT_LINE:
 976               attrs[i].is_line_break = TRUE;
 977               attrs[i].is_mandatory_break = TRUE;
 978               break;
 979
 980             case G_UNICODE_BREAK_CARRIAGE_RETURN:
 981               if (wc != '\n')
 982                 {
 983                   attrs[i].is_line_break = TRUE;
 984                   attrs[i].is_mandatory_break = TRUE;
 985                 }
 986               break;
 987
 988             case G_UNICODE_BREAK_CONTINGENT:
 989               /* can break after 0xFFFC by default, though we might want
 990                * to eventually have a PangoLayout setting or
 991                * PangoAttribute that disables this, if for some
 992                * application breaking after objects is not desired.
 993                */
 994               break_op = BREAK_ALLOWED;
 995               break;
 996
 997             case G_UNICODE_BREAK_SURROGATE:
 998               g_assert_not_reached ();
 999               break;
1000
1001             default:
1002               g_assert (IN_BREAK_TABLE (prev_break_type));
1003
1004               /* Note that our table assumes that combining marks
1005                * are only applied to alphabetic characters;
1006                * tech report 14 explains how to remove this assumption
1007                * from the code, if anyone ever cares, but it shouldn't
1008                * be a problem. Also this issue sort of goes
1009                * away since we only look for breaks on grapheme
1010                * boundaries.
1011                */
1012
1013               switch ((int) break_type)
1014                 {
1015                 case G_UNICODE_BREAK_MANDATORY:
1016                 case G_UNICODE_BREAK_LINE_FEED:
1017                 case G_UNICODE_BREAK_CARRIAGE_RETURN:
1018                 case G_UNICODE_BREAK_NEXT_LINE:
1019                 case G_UNICODE_BREAK_SPACE:
1020                   /* These types all "pile up" at the end of lines and
1021                    * get elided.
1022                    */
1023                   break_op = BREAK_PROHIBITED;
1024                   break;
1025
1026                 case G_UNICODE_BREAK_CONTINGENT:
1027                   /* break before 0xFFFC by default, eventually
1028                    * make this configurable?
1029                    */
1030                   break_op = BREAK_ALLOWED;
1031                   break;
1032
1033                 case G_UNICODE_BREAK_SURROGATE:
1034                   g_assert_not_reached ();
1035                   break;
1036
1037                 /* Hangul additions are from Unicode 4.1 UAX#14 */
1038                 case G_UNICODE_BREAK_HANGUL_L_JAMO:
1039                 case G_UNICODE_BREAK_HANGUL_V_JAMO:
1040                 case G_UNICODE_BREAK_HANGUL_T_JAMO:
1041                 case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
1042                 case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
1043                   /* treat Jamo as IDEOGRAPHIC from now
1044                    */
1045                   break_type = G_UNICODE_BREAK_IDEOGRAPHIC;
1046
1047                   if (makes_hangul_syllable)
1048                     break_op = BREAK_IF_SPACES;
1049                   else
1050                     break_op = BREAK_ALLOWED;
1051                   break;
1052
1053                 case G_UNICODE_BREAK_AMBIGUOUS:
1054                   /* FIXME:
1055                    * we need to resolve the East Asian width
1056                    * to decide what to do here
1057                    */
1058                 case G_UNICODE_BREAK_COMPLEX_CONTEXT:
1059                   /* FIXME:
1060                    * language engines should handle this case...
1061                    */
1062                 case G_UNICODE_BREAK_UNKNOWN:
1063                   /* treat unknown, complex, and ambiguous like ALPHABETIC
1064                    * for now
1065                    */
1066                   break_op = BREAK_OP (prev_break_type, G_UNICODE_BREAK_ALPHABETIC);
1067                   break;
1068
1069                 default:
1070
1071                   g_assert (IN_BREAK_TABLE (break_type));
1072                   break_op = BREAK_OP (prev_break_type, break_type);
1073                   break;
1074                 }
1075               break;
1076             }
1077
1078           switch (break_op)
1079             {
1080             case BREAK_PROHIBITED:
1081               /* can't break here */
1082               attrs[i].is_char_break = FALSE;
1083               break;
1084
1085             case BREAK_IF_SPACES:
1086               /* break if prev char was space */
1087               if (prev_was_break_space)
1088                 attrs[i].is_line_break = TRUE;
1089               break;
1090
1091             case BREAK_ALLOWED:
1092               attrs[i].is_line_break = TRUE;
1093               break;
1094
1095             case BREAK_ALREADY_HANDLED:
1096               break;
1097
1098             default:
1099               g_assert_not_reached ();
1100               break;
1101             }
1102         }
1103
1104       if (break_type != G_UNICODE_BREAK_SPACE)
1105         {
1106           prev_break_type = break_type;
1107           prev_was_break_space = FALSE;
1108           prev_jamo = jamo;
1109         }
1110       else
1111         prev_was_break_space = TRUE;
1112
1113       /* ---- Word breaks ---- */
1114
1115       /* default to not a word start/end */
1116       attrs[i].is_word_start = FALSE;
1117       attrs[i].is_word_end = FALSE;
1118
1119       if (current_word_type != WordNone)
1120         {
1121           /* Check for a word end */
1122           switch ((int) type)
1123             {
1124             case G_UNICODE_COMBINING_MARK:
1125             case G_UNICODE_ENCLOSING_MARK:
1126             case G_UNICODE_NON_SPACING_MARK:
1127             case G_UNICODE_FORMAT:
1128               /* nothing, we just eat these up as part of the word */
1129               break;
1130
1131             case G_UNICODE_LOWERCASE_LETTER:
1132             case G_UNICODE_MODIFIER_LETTER:
1133             case G_UNICODE_OTHER_LETTER:
1134             case G_UNICODE_TITLECASE_LETTER:
1135             case G_UNICODE_UPPERCASE_LETTER:
1136               if (current_word_type == WordLetters)
1137                 {
1138                   /* Japanese special cases for ending the word */
1139                   if (JAPANESE (last_word_letter) ||
1140                       JAPANESE (wc))
1141                     {
1142                       if ((HIRAGANA (last_word_letter) &&
1143                            !HIRAGANA (wc)) ||
1144                           (KATAKANA (last_word_letter) &&
1145                            !(KATAKANA (wc) || HIRAGANA (wc))) ||
1146                           (KANJI (last_word_letter) &&
1147                            !(HIRAGANA (wc) || KANJI (wc))) ||
1148                           (JAPANESE (last_word_letter) &&
1149                            !JAPANESE (wc)) ||
1150                           (!JAPANESE (last_word_letter) &&
1151                            JAPANESE (wc)))
1152                         attrs[i].is_word_end = TRUE;
1153                     }
1154                 }
1155               else
1156                 {
1157                   /* end the number word, start the letter word */
1158                   attrs[i].is_word_end = TRUE;
1159                   attrs[i].is_word_start = TRUE;
1160                   current_word_type = WordLetters;
1161                 }
1162
1163               last_word_letter = wc;
1164               break;
1165
1166             case G_UNICODE_DECIMAL_NUMBER:
1167             case G_UNICODE_LETTER_NUMBER:
1168             case G_UNICODE_OTHER_NUMBER:
1169               if (current_word_type != WordNumbers)
1170                 {
1171                   attrs[i].is_word_end = TRUE;
1172                   attrs[i].is_word_start = TRUE;
1173                   current_word_type = WordNumbers;
1174                 }
1175
1176               last_word_letter = wc;
1177               break;
1178
1179             default:
1180               /* Punctuation, control chars, etc. all end a word (marks and format chars were absorbed above). */
1181               attrs[i].is_word_end = TRUE;
1182               current_word_type = WordNone;
1183               break;
1184             }
1185         }
1186       else
1187         {
1188           /* Check for a word start */
1189           switch ((int) type)
1190             {
1191             case G_UNICODE_LOWERCASE_LETTER:
1192             case G_UNICODE_MODIFIER_LETTER:
1193             case G_UNICODE_OTHER_LETTER:
1194             case G_UNICODE_TITLECASE_LETTER:
1195             case G_UNICODE_UPPERCASE_LETTER:
1196               current_word_type = WordLetters;
1197               last_word_letter = wc;
1198               attrs[i].is_word_start = TRUE;
1199               break;
1200
1201             case G_UNICODE_DECIMAL_NUMBER:
1202             case G_UNICODE_LETTER_NUMBER:
1203             case G_UNICODE_OTHER_NUMBER:
1204               current_word_type = WordNumbers;
1205               last_word_letter = wc;
1206               attrs[i].is_word_start = TRUE;
1207               break;
1208
1209             default:
1210               /* No word here */
1211               break;
1212             }
1213         }
1214
1215       /* ---- Sentence breaks ---- */
1216
1217       /* The Unicode spec specifies sentence breakpoints, so that a piece of
1218        * text would be partitioned into sentences, and all characters would
1219        * be inside some sentence. This code implements that for is_sentence_boundary,
1220        * but tries to keep leading/trailing whitespace out of sentences for
1221        * the start/end flags
1222        */
1223
1224       /* The Unicode spec seems to say that one trailing line/para
1225        * separator can be tacked on to a sentence ending in ! or ?,
1226        * but not a sentence ending in period; I think they're on crack
1227        * so am allowing one to be tacked onto a sentence ending in period.
1228        */
1229
1230 #define MAYBE_START_NEW_SENTENCE                               /* expands to a bare switch; reads type and writes sentence_state / attrs[i] at the expansion site */ \
1231               switch ((int) type)                               \
1232                 {                                               \
1233                 case G_UNICODE_LINE_SEPARATOR:                  \
1234                 case G_UNICODE_PARAGRAPH_SEPARATOR:             \
1235                 case G_UNICODE_CONTROL:                         \
1236                 case G_UNICODE_FORMAT:                          \
1237                 case G_UNICODE_SPACE_SEPARATOR:                 \
1238                   sentence_state = STATE_SENTENCE_OUTSIDE;     /* separator, control, format or space: no sentence is started here */ \
1239                   break;                                        \
1240                                                                 \
1241                 default:                                        \
1242                   sentence_state = STATE_SENTENCE_BODY;         \
1243                   attrs[i].is_sentence_start = TRUE;           /* any other character type is flagged as a sentence start at index i */ \
1244                   break;                                        \
1245                 }
1246
1247       /* No sentence break at the start of the text */
1248
1249       /* default to not a sentence breakpoint */
1250       attrs[i].is_sentence_boundary = FALSE;
1251       attrs[i].is_sentence_start = FALSE;
1252       attrs[i].is_sentence_end = FALSE;
1253
1254       /* FIXME the Unicode spec lumps control/format chars with
1255        * line/para separators in descriptive text, but not in the
1256        * character class specs, in table 5-6, so who knows whether you
1257        * are actually supposed to break on control/format
1258        * characters. Seems semi-broken to break on tabs...
1259        */
1260
1261       /* Break after line/para separators except carriage return
1262        * followed by newline
1263        */
1264       switch ((int) prev_type)
1265         {
1266         case G_UNICODE_LINE_SEPARATOR:
1267         case G_UNICODE_PARAGRAPH_SEPARATOR:
1268         case G_UNICODE_CONTROL:
1269         case G_UNICODE_FORMAT:
1270           if (wc == '\r')
1271             {
1272               if (next_wc != '\n')
1273                 attrs[i].is_sentence_boundary = TRUE;
1274             }
1275           else
1276             attrs[i].is_sentence_boundary = TRUE;
1277           break;
1278
1279         default:
1280           break;
1281         }
1282
1283       /* break before para/line separators except newline following
1284        * carriage return
1285        */
1286       switch ((int) type)
1287         {
1288         case G_UNICODE_LINE_SEPARATOR:
1289         case G_UNICODE_PARAGRAPH_SEPARATOR:
1290         case G_UNICODE_CONTROL:
1291         case G_UNICODE_FORMAT:
1292           if (wc == '\n')
1293             {
1294               if (prev_wc != '\r')
1295                 attrs[i].is_sentence_boundary = TRUE;
1296             }
1297           else
1298             attrs[i].is_sentence_boundary = TRUE;
1299           break;
1300
1301         default:
1302           break;
1303         }
1304
1305       switch (sentence_state)
1306         {
1307         case STATE_SENTENCE_OUTSIDE:
1308           /* Start sentence if we have non-whitespace/format/control */
1309           switch ((int) type)
1310             {
1311             case G_UNICODE_LINE_SEPARATOR:
1312             case G_UNICODE_PARAGRAPH_SEPARATOR:
1313             case G_UNICODE_CONTROL:
1314             case G_UNICODE_FORMAT:
1315             case G_UNICODE_SPACE_SEPARATOR:
1316               break;
1317
1318             default:
1319               attrs[i].is_sentence_start = TRUE;
1320               sentence_state = STATE_SENTENCE_BODY;
1321               break;
1322             }
1323           break;
1324
1325         case STATE_SENTENCE_BODY:
1326           /* If we already broke here due to separators, end the sentence. */
1327           if (attrs[i].is_sentence_boundary)
1328             {
1329               attrs[i].is_sentence_end = TRUE;
1330
1331               MAYBE_START_NEW_SENTENCE;
1332             }
1333           else
1334             {
1335               if (wc == '.')
1336                 sentence_state = STATE_SENTENCE_DOT;
1337               else if (wc == '?' || wc == '!')
1338                 sentence_state = STATE_SENTENCE_TERM;
1339             }
1340           break;
1341
1342         case STATE_SENTENCE_TERM:
1343           /* End sentence on anything but close punctuation and some
1344            * loosely-specified OTHER_PUNCTUATION such as period,
1345            * comma, etc.; follow Unicode rules for breaks
1346            */
1347           switch ((int) type)
1348             {
1349             case G_UNICODE_OTHER_PUNCTUATION:
1350             case G_UNICODE_CLOSE_PUNCTUATION:
1351               if (type == G_UNICODE_CLOSE_PUNCTUATION ||
1352                   wc == '.' ||
1353                   wc == ',' ||
1354                   wc == '?' ||
1355                   wc == '!')
1356                 sentence_state = STATE_SENTENCE_POST_TERM_CLOSE;
1357               else
1358                 {
1359                   attrs[i].is_sentence_end = TRUE;
1360                   attrs[i].is_sentence_boundary = TRUE;
1361
1362                   MAYBE_START_NEW_SENTENCE;
1363                 }
1364               break;
1365
1366             case G_UNICODE_SPACE_SEPARATOR:
1367               attrs[i].is_sentence_end = TRUE;
1368               sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
1369               break;
1370
1371             case G_UNICODE_LINE_SEPARATOR:
1372             case G_UNICODE_PARAGRAPH_SEPARATOR:
1373               attrs[i].is_sentence_end = TRUE;
1374               sentence_state = STATE_SENTENCE_POST_TERM_SEP;
1375               break;
1376
1377             default:
1378               attrs[i].is_sentence_end = TRUE;
1379               attrs[i].is_sentence_boundary = TRUE;
1380
1381               MAYBE_START_NEW_SENTENCE;
1382
1383               break;
1384             }
1385           break;
1386
1387         case STATE_SENTENCE_POST_TERM_CLOSE:
1388           /* End sentence on anything besides more punctuation; follow
1389            * rules for breaks
1390            */
1391           switch ((int) type)
1392             {
1393             case G_UNICODE_OTHER_PUNCTUATION:
1394             case G_UNICODE_CLOSE_PUNCTUATION:
1395               if (type == G_UNICODE_CLOSE_PUNCTUATION ||
1396                   wc == '.' ||
1397                   wc == ',' ||
1398                   wc == '?' ||
1399                   wc == '!')
1400                 /* continue in this state */
1401                 ;
1402               else
1403                 {
1404                   attrs[i].is_sentence_end = TRUE;
1405                   attrs[i].is_sentence_boundary = TRUE;
1406
1407                   MAYBE_START_NEW_SENTENCE;
1408                 }
1409               break;
1410
1411             case G_UNICODE_SPACE_SEPARATOR:
1412               attrs[i].is_sentence_end = TRUE;
1413               sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
1414               break;
1415
1416             case G_UNICODE_LINE_SEPARATOR:
1417             case G_UNICODE_PARAGRAPH_SEPARATOR:
1418               attrs[i].is_sentence_end = TRUE;
1419               /* undo the unconditional break-at-all-line/para-separators
1420                * from above; I'm not sure this is what the Unicode spec
1421                * intends, but it seems right - we get to include
1422                * a single line/para separator in the sentence according
1423                * to their rules
1424                */
1425               attrs[i].is_sentence_boundary = FALSE;
1426               sentence_state = STATE_SENTENCE_POST_TERM_SEP;
1427               break;
1428
1429             default:
1430               attrs[i].is_sentence_end = TRUE;
1431               attrs[i].is_sentence_boundary = TRUE;
1432
1433               MAYBE_START_NEW_SENTENCE;
1434
1435               break;
1436             }
1437           break;
1438
1439         case STATE_SENTENCE_POST_TERM_SPACE:
1440
1441           /* Sentence is definitely already ended; to enter this state
1442            * we had to see a space, which ends the sentence.
1443            */
1444
1445           switch ((int) type)
1446             {
1447             case G_UNICODE_SPACE_SEPARATOR:
1448               /* continue in this state */
1449               break;
1450
1451             case G_UNICODE_LINE_SEPARATOR:
1452             case G_UNICODE_PARAGRAPH_SEPARATOR:
1453               /* undo the unconditional break-at-all-line/para-separators
1454                * from above; I'm not sure this is what the Unicode spec
1455                * intends, but it seems right
1456                */
1457               attrs[i].is_sentence_boundary = FALSE;
1458               sentence_state = STATE_SENTENCE_POST_TERM_SEP;
1459               break;
1460
1461             default:
1462               attrs[i].is_sentence_boundary = TRUE;
1463
1464               MAYBE_START_NEW_SENTENCE;
1465
1466               break;
1467             }
1468           break;
1469
1470         case STATE_SENTENCE_POST_TERM_SEP:
1471           /* Break is forced at this point, unless we're a newline
1472            * after a CR, then we will break after the newline on the
1473            * next iteration. Only a single Sep can be in the
1474            * sentence.
1475            */
1476           if (!(prev_wc == '\r' && wc == '\n'))
1477             attrs[i].is_sentence_boundary = TRUE;
1478
1479           MAYBE_START_NEW_SENTENCE;
1480
1481           break;
1482
1483         case STATE_SENTENCE_DOT:
1484           switch ((int) type)
1485             {
1486             case G_UNICODE_CLOSE_PUNCTUATION:
1487               sentence_state = STATE_SENTENCE_POST_DOT_CLOSE;
1488               break;
1489
1490             case G_UNICODE_SPACE_SEPARATOR:
1491               possible_sentence_end = i;
1492               sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
1493               break;
1494
1495             default:
1496               /* If we broke on a control/format char, end the
1497                * sentence; else this was not a sentence end, since
1498                * we didn't enter the POST_DOT_SPACE state.
1499                */
1500               if (attrs[i].is_sentence_boundary)
1501                 {
1502                   attrs[i].is_sentence_end = TRUE;
1503
1504                   MAYBE_START_NEW_SENTENCE;
1505                 }
1506               else
1507                 sentence_state = STATE_SENTENCE_BODY;
1508               break;
1509             }
1510           break;
1511
1512         case STATE_SENTENCE_POST_DOT_CLOSE:
1513           switch ((int) type)
1514             {
1515             case G_UNICODE_SPACE_SEPARATOR:
1516               possible_sentence_end = i;
1517               sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
1518               break;
1519
1520             default:
1521               /* If we broke on a control/format char, end the
1522                * sentence; else this was not a sentence end, since
1523                * we didn't enter the POST_DOT_SPACE state.
1524                */
1525               if (attrs[i].is_sentence_boundary)
1526                 {
1527                   attrs[i].is_sentence_end = TRUE;
1528
1529                   MAYBE_START_NEW_SENTENCE;
1530                 }
1531               else
1532                 sentence_state = STATE_SENTENCE_BODY;
1533               break;
1534             }
1535           break;
1536
1537         case STATE_SENTENCE_POST_DOT_SPACE:
1538
1539           possible_sentence_boundary = i;
1540
1541           switch ((int) type)
1542             {
1543             case G_UNICODE_SPACE_SEPARATOR:
1544               /* remain in current state */
1545               break;
1546
1547             case G_UNICODE_OPEN_PUNCTUATION:
1548               sentence_state = STATE_SENTENCE_POST_DOT_OPEN;
1549               break;
1550
1551             case G_UNICODE_LOWERCASE_LETTER:
1552               /* wasn't a sentence-ending period; so re-enter the sentence
1553                * body
1554                */
1555               sentence_state = STATE_SENTENCE_BODY;
1556               break;
1557
1558             default:
1559               /* End the sentence, break, maybe start a new one */
1560
1561               g_assert (possible_sentence_end >= 0);
1562               g_assert (possible_sentence_boundary >= 0);
1563
1564               attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
1565               attrs[possible_sentence_end].is_sentence_end = TRUE;
1566
1567               possible_sentence_end = -1;
1568               possible_sentence_boundary = -1;
1569
1570               MAYBE_START_NEW_SENTENCE;
1571
1572               break;
1573             }
1574           break;
1575
1576         case STATE_SENTENCE_POST_DOT_OPEN:
1577           switch ((int) type)
1578             {
1579             case G_UNICODE_OPEN_PUNCTUATION:
1580               /* continue in current state */
1581               break;
1582
1583             case G_UNICODE_LOWERCASE_LETTER:
1584               /* wasn't a sentence-ending period; so re-enter the sentence
1585                * body
1586                */
1587               sentence_state = STATE_SENTENCE_BODY;
1588               break;
1589
1590             default:
1591               /* End the sentence, break, maybe start a new one */
1592
1593               g_assert (possible_sentence_end >= 0);
1594               g_assert (possible_sentence_boundary >= 0);
1595
1596               attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
1597               attrs[possible_sentence_end].is_sentence_end = TRUE;
1598
1599               possible_sentence_end = -1;
1600               possible_sentence_boundary = -1;
1601
1602               MAYBE_START_NEW_SENTENCE;
1603
1604               break;
1605             }
1606           break;
1607
1608         case STATE_SENTENCE_POST_DOT_SEP:
1609           /* Break is forced at this point, unless we're a newline
1610            * after a CR, then we will break after the newline on the
1611            * next iteration. Only a single Sep can be in the
1612            * sentence.
1613            */
1614           if (!(prev_wc == '\r' && wc == '\n'))
1615             attrs[i].is_sentence_boundary = TRUE;
1616
1617           g_assert (possible_sentence_end >= 0);
1618           g_assert (possible_sentence_boundary >= 0);
1619
1620           attrs[possible_sentence_end].is_sentence_end = TRUE;
1621
1622           possible_sentence_end = -1;
1623           possible_sentence_boundary = -1;
1624
1625           MAYBE_START_NEW_SENTENCE;
1626
1627           break;
1628
1629         default:
1630           g_assert_not_reached ();
1631           break;
1632         }
1633
1634       prev_type = type;
1635       prev_wc = wc;
1636
1637       /* wc might not be a valid Unicode base character, but really all we
1638        * need to know is the last non-combining character */
1639       if (type != G_UNICODE_COMBINING_MARK &&
1640           type != G_UNICODE_ENCLOSING_MARK &&
1641           type != G_UNICODE_NON_SPACING_MARK)
1642         base_character = wc;
1643     }
1644   i--;
1645
1646   attrs[i].is_cursor_position = TRUE;  /* Rule GB2 */
1647   attrs[0].is_cursor_position = TRUE;  /* Rule GB1 */
1648
1649   attrs[i].is_word_boundary = TRUE;  /* Rule WB2 */
1650   attrs[0].is_word_boundary = TRUE;  /* Rule WB1 */
1651
1652   attrs[i].is_line_break = TRUE;  /* Rule LB3 */
1653   attrs[0].is_line_break = FALSE; /* Rule LB2 */
1654
1655 }
1656
1657 static gboolean
1658 tailor_break (const gchar   *text,
1659              gint           length,
1660              PangoAnalysis *analysis,
1661              PangoLogAttr  *attrs,
1662              int            attrs_len)
1663 {
1664   if (analysis->lang_engine && PANGO_ENGINE_LANG_GET_CLASS (analysis->lang_engine)->script_break)
1665     {
1666       if (length < 0)
1667         length = strlen (text);
1668       else if (text == NULL)
1669         text = "";
1670
1671       PANGO_ENGINE_LANG_GET_CLASS (analysis->lang_engine)->script_break (analysis->lang_engine, text, length, analysis, attrs, attrs_len);
1672       return TRUE;
1673     }
1674   return FALSE;
1675 }
1676
1677 /**
1678  * pango_break:
1679  * @text:      the text to process
1680  * @length:    length of @text in bytes (may be -1 if @text is nul-terminated)
1681  * @analysis:  #PangoAnalysis structure from pango_itemize()
1682  * @attrs:     an array to store character information in
1683  * @attrs_len: size of the array passed as @attrs
1684  *
1685  * Determines possible line, word, and character breaks
1686  * for a string of Unicode text with a single analysis.  For most
1687  * purposes you may want to use pango_get_log_attrs().
1688  */
1689 void
1690 pango_break (const gchar   *text,
1691              gint           length,
1692              PangoAnalysis *analysis,
1693              PangoLogAttr  *attrs,
1694              int            attrs_len)
1695 {
1696   g_return_if_fail (analysis != NULL);
1697   g_return_if_fail (attrs != NULL);
1698
1699   pango_default_break (text, length, analysis, attrs, attrs_len);
1700   tailor_break        (text, length, analysis, attrs, attrs_len);
1701 }
1702
1703 /**
1704  * pango_find_paragraph_boundary:
1705  * @text: UTF-8 text
1706  * @length: length of @text in bytes, or -1 if nul-terminated
1707  * @paragraph_delimiter_index: return location for index of delimiter
1708  * @next_paragraph_start: return location for start of next paragraph
1709  *
1710  * Locates a paragraph boundary in @text. A boundary is caused by
1711  * delimiter characters, such as a newline, carriage return, carriage
1712  * return-newline pair, or Unicode paragraph separator character.  The
1713  * index of the run of delimiters is returned in
1714  * @paragraph_delimiter_index. The index of the start of the paragraph
1715  * (index after all delimiters) is stored in @next_paragraph_start.
1716  *
1717  * If no delimiters are found, both @paragraph_delimiter_index and
1718  * @next_paragraph_start are filled with the length of @text (an index one
1719  * off the end).
1720  **/
1721 void
1722 pango_find_paragraph_boundary (const gchar *text,
1723                                gint         length,
1724                                gint        *paragraph_delimiter_index,
1725                                gint        *next_paragraph_start)
1726 {
1727   const gchar *p = text;
1728   const gchar *end;
1729   const gchar *start = NULL;
1730   const gchar *delimiter = NULL;
1731
1732   /* Only one character has type G_UNICODE_PARAGRAPH_SEPARATOR in
1733    * Unicode 5.0; update the following code if that changes.
1734    */
1735
1736   /* prev_sep is the first byte of the previous separator.  Since
1737    * the valid separators are \r, \n, and PARAGRAPH_SEPARATOR, the
1738    * first byte is enough to identify it.
1739    */
1740   gchar prev_sep;
1741
1742
1743   if (length < 0)
1744     length = strlen (text);
1745
1746   end = text + length;
1747
1748   if (paragraph_delimiter_index)
1749     *paragraph_delimiter_index = length;
1750
1751   if (next_paragraph_start)
1752     *next_paragraph_start = length;
1753
1754   if (length == 0)
1755     return;
1756
1757   prev_sep = 0;
1758
1759   while (p != end)
1760     {
1761       if (prev_sep == '\n' ||
1762           prev_sep == PARAGRAPH_SEPARATOR_STRING[0])
1763         {
1764           g_assert (delimiter);
1765           start = p;
1766           break;
1767         }
1768       else if (prev_sep == '\r')
1769         {
1770           /* don't break between \r and \n */
1771           if (*p != '\n')
1772             {
1773               g_assert (delimiter);
1774               start = p;
1775               break;
1776             }
1777         }
1778
1779       if (*p == '\n' ||
1780            *p == '\r' ||
1781            !strncmp(p, PARAGRAPH_SEPARATOR_STRING,
1782                     strlen(PARAGRAPH_SEPARATOR_STRING)))
1783         {
1784           if (delimiter == NULL)
1785             delimiter = p;
1786           prev_sep = *p;
1787         }
1788       else
1789         prev_sep = 0;
1790
1791       p = g_utf8_next_char (p);
1792     }
1793
1794   if (delimiter && paragraph_delimiter_index)
1795     *paragraph_delimiter_index = delimiter - text;
1796
1797   if (start && next_paragraph_start)
1798     *next_paragraph_start = start - text;
1799 }
1800
1801 static int
1802 tailor_segment (const char      *range_start,
1803                 const char      *range_end,
1804                 PangoEngineLang *range_engine,
1805                 int              chars_broken,
1806                 PangoAnalysis   *analysis,
1807                 PangoLogAttr    *log_attrs)
1808 {
1809   int chars_in_range;
1810   PangoLogAttr attr_before = log_attrs[0];
1811
1812   analysis->lang_engine = range_engine;
1813   chars_in_range = pango_utf8_strlen (range_start, range_end - range_start);
1814
1815
1816   if (tailor_break (range_start,
1817                     range_end - range_start,
1818                     analysis,
1819                     log_attrs + chars_broken,
1820                     chars_in_range + 1))
1821     {
1822       /* if tailored, we enforce some of the attrs from before tailoring at
1823        * the boundary
1824        */
1825
1826      log_attrs[0].backspace_deletes_character  = attr_before.backspace_deletes_character;
1827
1828      log_attrs[0].is_line_break      |= attr_before.is_line_break;
1829      log_attrs[0].is_mandatory_break |= attr_before.is_mandatory_break;
1830      log_attrs[0].is_cursor_position |= attr_before.is_cursor_position;
1831     }
1832
1833   return chars_in_range;
1834 }
1835
1836 /**
1837  * pango_get_log_attrs:
1838  * @text: text to process
1839  * @length: length in bytes of @text
1840  * @level: embedding level, or -1 if unknown
1841  * @language: language tag
1842  * @log_attrs: array with one #PangoLogAttr per character in @text, plus one extra, to be filled in
1843  * @attrs_len: length of @log_attrs array
1844  *
1845  * Computes a #PangoLogAttr for each character in @text. The @log_attrs
1846  * array must have one #PangoLogAttr for each position in @text; if
1847  * @text contains N characters, it has N+1 positions, including the
1848  * last position at the end of the text. @text should be an entire
1849  * paragraph; logical attributes can't be computed without context
1850  * (for example you need to see spaces on either side of a word to know
1851  * the word is a word).
1852  */
1853 void
1854 pango_get_log_attrs (const char    *text,
1855                      int            length,
1856                      int            level,
1857                      PangoLanguage *language,
1858                      PangoLogAttr  *log_attrs,
1859                      int            attrs_len)
1860 {
1861   PangoMap *lang_map;
1862   int chars_broken;
1863   const char *range_start, *range_end;
1864   PangoScript script;
1865   PangoEngineLang *range_engine;
1866   static guint engine_type_id = 0;
1867   static guint render_type_id = 0;
1868   PangoAnalysis analysis = { NULL };
1869   PangoScriptIter iter;
1870
1871   g_return_if_fail (length == 0 || text != NULL);
1872   g_return_if_fail (log_attrs != NULL);
1873
1874   analysis.level = level;
1875
1876   pango_default_break (text, length, &analysis, log_attrs, attrs_len);
1877
1878   if (engine_type_id == 0)
1879     {
1880       engine_type_id = g_quark_from_static_string (PANGO_ENGINE_TYPE_LANG);
1881       render_type_id = g_quark_from_static_string (PANGO_RENDER_TYPE_NONE);
1882     }
1883
1884   lang_map = pango_find_map (language, engine_type_id, render_type_id);
1885
1886   chars_broken = 0;
1887
1888   _pango_script_iter_init (&iter, text, length);
1889   pango_script_iter_get_range (&iter, &range_start, &range_end, &script);
1890   range_engine = (PangoEngineLang*) pango_map_get_engine (lang_map, script);
1891   g_assert (range_start == text);
1892
1893   while (pango_script_iter_next (&iter))
1894     {
1895       const char *run_start, *run_end;
1896       PangoEngineLang* run_engine;
1897
1898       pango_script_iter_get_range (&iter, &run_start, &run_end, &script);
1899       run_engine = (PangoEngineLang*) pango_map_get_engine (lang_map, script);
1900       g_assert (range_end == run_start);
1901
1902       if (range_engine != run_engine)
1903         {
1904           /* Engine has changed; do the tailoring for the current range,
1905            * then start a new range.
1906            */
1907           chars_broken += tailor_segment (range_start, range_end, range_engine, chars_broken, &analysis, log_attrs);
1908
1909           range_start = run_start;
1910           range_engine = run_engine;
1911         }
1912       range_end = run_end;
1913     }
1914   _pango_script_iter_fini (&iter);
1915
1916   g_assert (length < 0 || range_end == text + length);
1917
1918   chars_broken += tailor_segment (range_start, range_end, range_engine, chars_broken, &analysis, log_attrs);
1919
1920   if (chars_broken + 1 > attrs_len)
1921     g_warning ("pango_get_log_attrs: attrs_len should have been at least %d, but was %d.  Expect corrupted memory.",
1922                chars_broken + 1,
1923                attrs_len);
1924 }