* Line breaking in a Unicode sequence. Designed to be used in a
* generic text renderer.
*
- * Copyright (C) 2008-2013 Wu Yongwei <wuyongwei at gmail dot com>
+ * Copyright (C) 2008-2015 Wu Yongwei <wuyongwei at gmail dot com>
* Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
*
* This software is provided 'as-is', without any express or implied
* Unicode 5.0.0:
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
*
- * This library has been updated according to Revision 30, for
- * Unicode 6.2.0:
- * <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
+ * This library has been updated according to Revision 33, for
+ * Unicode 7.0.0:
+ * <URL:http://www.unicode.org/reports/tr14/tr14-33.html>
*
* The Unicode Terms of Use are available at
* <URL:http://www.unicode.org/copyright.html>
* Implementation of the line breaking algorithm as described in Unicode
* Standard Annex 14.
*
- * @version 2.5, 2013/11/14
+ * @version 2.7, 2015/04/18
* @author Wu Yongwei
* @author Petr Filipsky
*/
#define LINEBREAK_INDEX_SIZE 40
/**
- * Version number of the library.
- */
-const int linebreak_version = LINEBREAK_VERSION;
-
-/**
* Enumeration of break actions. They are used in the break action
* pair table below.
*/
* @post \a lbpCtx->lbcCur has the updated line break class
*/
static void treat_first_char(
- struct LineBreakContext* lbpCtx)
+ struct LineBreakContext *lbpCtx)
{
switch (lbpCtx->lbcCur)
{
case LBP_SP:
lbpCtx->lbcCur = LBP_WJ; /* Leading space treated as WJ */
break;
+ case LBP_HL:
+ lbpCtx->fLb21aHebrew = 1; /* Rule LB21a */ /* fall through: default only breaks */
default:
break;
}
* table lookup is needed
*/
static int get_lb_result_simple(
- struct LineBreakContext* lbpCtx)
+ struct LineBreakContext *lbpCtx)
{
if (lbpCtx->lbcCur == LBP_BK
|| (lbpCtx->lbcCur == LBP_CR && lbpCtx->lbcNew != LBP_LF))
* #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
*/
static int get_lb_result_lookup(
- struct LineBreakContext* lbpCtx)
+ struct LineBreakContext *lbpCtx)
{
- /* TODO: Rule LB21a, as introduced by Revision 28 of UAX#14, is not
- * yet implemented below. */
int brk = LINEBREAK_UNDEFINED;
+
assert((lbpCtx->lbcCur > 0) && (lbpCtx->lbcCur <= LBP_RI));
assert((lbpCtx->lbcNew > 0) && (lbpCtx->lbcNew <= LBP_RI));
- switch (baTable[lbpCtx->lbcCur - 1][lbpCtx->lbcNew - 1])
+
+ /* Fix for Hangul word wrap: the Hangul classes (H2, H3, JL, JV, JT)
+ * are looked up in baTable as LBP_AL. Only the local copies below are
+ * remapped; the classes stored in the context are left untouched. */
+ enum LineBreakClass lbcCur, lbcNew; /* NOTE(review): declared after statements; needs C99 or later */
+
+ switch (lbpCtx->lbcCur)
+ {
+ case LBP_H2: /**< Hangul LV */
+ case LBP_H3: /**< Hangul LVT */
+ case LBP_JL: /**< Hangul L Jamo */
+ case LBP_JV: /**< Hangul V Jamo */
+ case LBP_JT: /**< Hangul T Jamo */
+ lbcCur = LBP_AL;
+ break;
+ default:
+ lbcCur = lbpCtx->lbcCur;
+ break;
+ }
+
+ switch (lbpCtx->lbcNew)
+ {
+ case LBP_H2: /**< Hangul LV */
+ case LBP_H3: /**< Hangul LVT */
+ case LBP_JL: /**< Hangul L Jamo */
+ case LBP_JV: /**< Hangul V Jamo */
+ case LBP_JT: /**< Hangul T Jamo */
+ lbcNew = LBP_AL;
+ break;
+ default:
+ lbcNew = lbpCtx->lbcNew;
+ break;
+ }
+
+ switch (baTable[lbcCur - 1][lbcNew - 1])
+ /* END of Hangul word wrap fix */
{
case DIR_BRK:
brk = LINEBREAK_ALLOWBREAK;
brk = LINEBREAK_NOBREAK;
break;
}
+
+ /* Special processing due to rule LB21a: when fLb21aHebrew is set and
+ * the current class is HY or BA, the table result is overridden with
+ * LINEBREAK_NOBREAK and the flag is cleared. Otherwise, unless the new
+ * class is HY or BA (in which case the flag is kept as is), the flag
+ * records whether the new class is HL. This check uses the original,
+ * unmapped classes from the context. */
+ if (lbpCtx->fLb21aHebrew &&
+ (lbpCtx->lbcCur == LBP_HY || lbpCtx->lbcCur == LBP_BA))
+ {
+ brk = LINEBREAK_NOBREAK;
+ lbpCtx->fLb21aHebrew = 0;
+ }
+ else if (!(lbpCtx->lbcNew == LBP_HY || lbpCtx->lbcNew == LBP_BA))
+ {
+ lbpCtx->fLb21aHebrew = (lbpCtx->lbcNew == LBP_HL);
+ }
+
lbpCtx->lbcCur = lbpCtx->lbcNew;
return brk;
}
* @post the line breaking context is initialized
*/
void lb_init_break_context(
- struct LineBreakContext* lbpCtx,
+ struct LineBreakContext *lbpCtx,
utf32_t ch,
- const char* lang)
+ const char *lang)
{
lbpCtx->lang = lang;
lbpCtx->lbpLang = get_lb_prop_lang(lang);
lbpCtx->lbcCur = resolve_lb_class(
get_char_lb_class_lang(ch, lbpCtx->lbpLang),
lbpCtx->lang);
+ lbpCtx->fLb21aHebrew = 0; /* clear the LB21a flag before treat_first_char(), which sets it when the first class is LBP_HL */
treat_first_char(lbpCtx);
}
* @post the line breaking context is updated
*/
int lb_process_next_char(
- struct LineBreakContext* lbpCtx,
+ struct LineBreakContext *lbpCtx,
utf32_t ch )
{
int brk;
}
/**
- * Gets the next Unicode character in a UTF-8 sequence. The index will
- * be advanced to the next complete character, unless the end of string
- * is reached in the middle of a UTF-8 sequence.
- *
- * @param[in] s input UTF-8 string
- * @param[in] len length of the string in bytes
- * @param[in,out] ip pointer to the index
- * @return the Unicode character beginning at the index; or
- * #EOS if end of input is encountered
- */
-utf32_t lb_get_next_char_utf8(
- const utf8_t *s,
- size_t len,
- size_t *ip)
-{
- utf8_t ch;
- utf32_t res;
-
- assert(*ip <= len);
- if (*ip == len)
- return EOS;
- ch = s[*ip];
-
- if (ch < 0xC2 || ch > 0xF4)
- { /* One-byte sequence, tail (should not occur), or invalid */
- *ip += 1;
- return ch;
- }
- else if (ch < 0xE0)
- { /* Two-byte sequence */
- if (*ip + 2 > len)
- return EOS;
- res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
- *ip += 2;
- return res;
- }
- else if (ch < 0xF0)
- { /* Three-byte sequence */
- if (*ip + 3 > len)
- return EOS;
- res = ((ch & 0x0F) << 12) +
- ((s[*ip + 1] & 0x3F) << 6) +
- ((s[*ip + 2] & 0x3F));
- *ip += 3;
- return res;
- }
- else
- { /* Four-byte sequence */
- if (*ip + 4 > len)
- return EOS;
- res = ((ch & 0x07) << 18) +
- ((s[*ip + 1] & 0x3F) << 12) +
- ((s[*ip + 2] & 0x3F) << 6) +
- ((s[*ip + 3] & 0x3F));
- *ip += 4;
- return res;
- }
-}
-
-/**
- * Gets the next Unicode character in a UTF-16 sequence. The index will
- * be advanced to the next complete character, unless the end of string
- * is reached in the middle of a UTF-16 surrogate pair.
- *
- * @param[in] s input UTF-16 string
- * @param[in] len length of the string in words
- * @param[in,out] ip pointer to the index
- * @return the Unicode character beginning at the index; or
- * #EOS if end of input is encountered
- */
-utf32_t lb_get_next_char_utf16(
- const utf16_t *s,
- size_t len,
- size_t *ip)
-{
- utf16_t ch;
-
- assert(*ip <= len);
- if (*ip == len)
- return EOS;
- ch = s[(*ip)++];
-
- if (ch < 0xD800 || ch > 0xDBFF)
- { /* If the character is not a high surrogate */
- return ch;
- }
- if (*ip == len)
- { /* If the input ends here (an error) */
- --(*ip);
- return EOS;
- }
- if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
- { /* If the next character is not the low surrogate (an error) */
- return ch;
- }
- /* Return the constructed character and advance the index again */
- return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
-}
-
-/**
- * Gets the next Unicode character in a UTF-32 sequence. The index will
- * be advanced to the next character.
- *
- * @param[in] s input UTF-32 string
- * @param[in] len length of the string in dwords
- * @param[in,out] ip pointer to the index
- * @return the Unicode character beginning at the index; or
- * #EOS if end of input is encountered
- */
-utf32_t lb_get_next_char_utf32(
- const utf32_t *s,
- size_t len,
- size_t *ip)
-{
- assert(*ip <= len);
- if (*ip == len)
- return EOS;
- return s[(*ip)++];
-}
-
-/**
* Sets the line breaking information for a generic input string.
*
* @param[in] s input string
char *brks)
{
set_linebreaks(s, len, lang, brks,
- (get_next_char_t)lb_get_next_char_utf8);
+ (get_next_char_t)ub_get_next_char_utf8);
}
/**
char *brks)
{
set_linebreaks(s, len, lang, brks,
- (get_next_char_t)lb_get_next_char_utf16);
+ (get_next_char_t)ub_get_next_char_utf16);
}
/**
char *brks)
{
set_linebreaks(s, len, lang, brks,
- (get_next_char_t)lb_get_next_char_utf32);
+ (get_next_char_t)ub_get_next_char_utf32);
}
/**
int is_line_breakable(
utf32_t char1,
utf32_t char2,
- const char* lang)
+ const char *lang)
{
utf32_t s[2];
char brks[2];