From c6d52122583816fcea9233ca1e99b3567a6a8fe4 Mon Sep 17 00:00:00 2001 From: "minho.sun" Date: Thu, 11 Jan 2018 15:28:23 +0900 Subject: [PATCH] [4.0] Update libunibreak to fix word wrap mode issue. Update libunibreak to fix word wrap mode issue. Change-Id: Iaadfbb29e5afeb0dc32eb5503ea750542119564a Signed-off-by: minho.sun --- text/dali/internal/libunibreak/LICENCE | 5 +- text/dali/internal/libunibreak/file.list | 5 +- text/dali/internal/libunibreak/linebreak.c | 210 ++++++++----------------- text/dali/internal/libunibreak/linebreak.h | 29 ++-- text/dali/internal/libunibreak/linebreakdata.c | 1 - text/dali/internal/libunibreak/linebreakdef.c | 8 +- text/dali/internal/libunibreak/linebreakdef.h | 32 ++-- text/dali/internal/libunibreak/unibreakbase.c | 41 +++++ text/dali/internal/libunibreak/unibreakbase.h | 73 +++++++++ text/dali/internal/libunibreak/unibreakdef.c | 159 +++++++++++++++++++ text/dali/internal/libunibreak/unibreakdef.h | 80 ++++++++++ text/dali/internal/libunibreak/wordbreak.c | 75 +++++++-- text/dali/internal/libunibreak/wordbreak.h | 12 +- text/dali/internal/libunibreak/wordbreakdata.c | 201 ++++++++++++++++++----- text/dali/internal/libunibreak/wordbreakdef.h | 21 ++- 15 files changed, 686 insertions(+), 266 deletions(-) create mode 100644 text/dali/internal/libunibreak/unibreakbase.c create mode 100644 text/dali/internal/libunibreak/unibreakbase.h create mode 100644 text/dali/internal/libunibreak/unibreakdef.c create mode 100644 text/dali/internal/libunibreak/unibreakdef.h diff --git a/text/dali/internal/libunibreak/LICENCE b/text/dali/internal/libunibreak/LICENCE index ceec155..3eda8d5 100644 --- a/text/dali/internal/libunibreak/LICENCE +++ b/text/dali/internal/libunibreak/LICENCE @@ -1,5 +1,6 @@ -Copyright (C) 2008-2012 Wu Yongwei -Copyright (C) 2012 Tom Hacohen +Copyright (C) 2008-2015 Wu Yongwei +Copyright (C) 2012-2015 Tom Hacohen +Copyright (C) 2013 Petr Filipsky This software is provided 'as-is', without any express or implied warranty. 
In no event will the author be held liable for any damages diff --git a/text/dali/internal/libunibreak/file.list b/text/dali/internal/libunibreak/file.list index d6b28ad..da3381d 100644 --- a/text/dali/internal/libunibreak/file.list +++ b/text/dali/internal/libunibreak/file.list @@ -4,4 +4,7 @@ static_libraries_libunibreak_src_files = \ $(static_libraries_libunibreak_src_dir)/linebreak.c \ $(static_libraries_libunibreak_src_dir)/linebreakdata.c \ $(static_libraries_libunibreak_src_dir)/linebreakdef.c \ - $(static_libraries_libunibreak_src_dir)/wordbreak.c + $(static_libraries_libunibreak_src_dir)/wordbreak.c \ + $(static_libraries_libunibreak_src_dir)/unibreakbase.c \ + $(static_libraries_libunibreak_src_dir)/unibreakdef.c + diff --git a/text/dali/internal/libunibreak/linebreak.c b/text/dali/internal/libunibreak/linebreak.c index 62a8104..2b28f05 100644 --- a/text/dali/internal/libunibreak/linebreak.c +++ b/text/dali/internal/libunibreak/linebreak.c @@ -4,7 +4,7 @@ * Line breaking in a Unicode sequence. Designed to be used in a * generic text renderer. * - * Copyright (C) 2008-2013 Wu Yongwei + * Copyright (C) 2008-2015 Wu Yongwei * Copyright (C) 2013 Petr Filipsky * * This software is provided 'as-is', without any express or implied @@ -31,9 +31,9 @@ * Unicode 5.0.0: * * - * This library has been updated according to Revision 30, for - * Unicode 6.2.0: - * + * This library has been updated according to Revision 33, for + * Unicode 7.0.0: + * * * The Unicode Terms of Use are available at * @@ -45,7 +45,7 @@ * Implementation of the line breaking algorithm as described in Unicode * Standard Annex 14. * - * @version 2.5, 2013/11/14 + * @version 2.7, 2015/04/18 * @author Wu Yongwei * @author Petr Filipsky */ @@ -67,11 +67,6 @@ #define LINEBREAK_INDEX_SIZE 40 /** - * Version number of the library. - */ -const int linebreak_version = LINEBREAK_VERSION; - -/** * Enumeration of break actions. They are used in the break action * pair table below. 
*/ @@ -451,7 +446,7 @@ static enum LineBreakClass resolve_lb_class( * @post \a lbpCtx->lbcCur has the updated line break class */ static void treat_first_char( - struct LineBreakContext* lbpCtx) + struct LineBreakContext *lbpCtx) { switch (lbpCtx->lbcCur) { @@ -465,6 +460,8 @@ static void treat_first_char( case LBP_SP: lbpCtx->lbcCur = LBP_WJ; /* Leading space treated as WJ */ break; + case LBP_HL: + lbpCtx->fLb21aHebrew = 1; /* Rule LB21a */ default: break; } @@ -485,7 +482,7 @@ static void treat_first_char( * table lookup is needed */ static int get_lb_result_simple( - struct LineBreakContext* lbpCtx) + struct LineBreakContext *lbpCtx) { if (lbpCtx->lbcCur == LBP_BK || (lbpCtx->lbcCur == LBP_CR && lbpCtx->lbcNew != LBP_LF)) @@ -528,14 +525,46 @@ static int get_lb_result_simple( * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK */ static int get_lb_result_lookup( - struct LineBreakContext* lbpCtx) + struct LineBreakContext *lbpCtx) { - /* TODO: Rule LB21a, as introduced by Revision 28 of UAX#14, is not - * yet implemented below. 
*/ int brk = LINEBREAK_UNDEFINED; + assert((lbpCtx->lbcCur > 0) && (lbpCtx->lbcCur <= LBP_RI)); assert((lbpCtx->lbcNew > 0) && (lbpCtx->lbcNew <= LBP_RI)); - switch (baTable[lbpCtx->lbcCur - 1][lbpCtx->lbcNew - 1]) + + /* Fix for Hangul word wrap */ + enum LineBreakClass lbcCur, lbcNew; + + switch (lbpCtx->lbcCur) + { + case LBP_H2: /**< Hangul LV */ + case LBP_H3: /**< Hangul LVT */ + case LBP_JL: /**< Hangul L Jamo */ + case LBP_JV: /**< Hangul V Jamo */ + case LBP_JT: /**< Hangul T Jamo */ + lbcCur = LBP_AL; + break; + default: + lbcCur = lbpCtx->lbcCur; + break; + } + + switch (lbpCtx->lbcNew) + { + case LBP_H2: /**< Hangul LV */ + case LBP_H3: /**< Hangul LVT */ + case LBP_JL: /**< Hangul L Jamo */ + case LBP_JV: /**< Hangul V Jamo */ + case LBP_JT: /**< Hangul T Jamo */ + lbcNew = LBP_AL; + break; + default: + lbcNew = lbpCtx->lbcNew; + break; + } + + switch (baTable[lbcCur - 1][lbcNew - 1]) + /* END */ { case DIR_BRK: brk = LINEBREAK_ALLOWBREAK; @@ -555,6 +584,19 @@ static int get_lb_result_lookup( brk = LINEBREAK_NOBREAK; break; } + + /* Special processing due to rule LB21a */ + if (lbpCtx->fLb21aHebrew && + (lbpCtx->lbcCur == LBP_HY || lbpCtx->lbcCur == LBP_BA)) + { + brk = LINEBREAK_NOBREAK; + lbpCtx->fLb21aHebrew = 0; + } + else if (!(lbpCtx->lbcNew == LBP_HY || lbpCtx->lbcNew == LBP_BA)) + { + lbpCtx->fLb21aHebrew = (lbpCtx->lbcNew == LBP_HL); + } + lbpCtx->lbcCur = lbpCtx->lbcNew; return brk; } @@ -568,9 +610,9 @@ static int get_lb_result_lookup( * @post the line breaking context is initialized */ void lb_init_break_context( - struct LineBreakContext* lbpCtx, + struct LineBreakContext *lbpCtx, utf32_t ch, - const char* lang) + const char *lang) { lbpCtx->lang = lang; lbpCtx->lbpLang = get_lb_prop_lang(lang); @@ -579,6 +621,7 @@ void lb_init_break_context( lbpCtx->lbcCur = resolve_lb_class( get_char_lb_class_lang(ch, lbpCtx->lbpLang), lbpCtx->lang); + lbpCtx->fLb21aHebrew = 0; treat_first_char(lbpCtx); } @@ -593,7 +636,7 @@ void lb_init_break_context( * 
@post the line breaking context is updated */ int lb_process_next_char( - struct LineBreakContext* lbpCtx, + struct LineBreakContext *lbpCtx, utf32_t ch ) { int brk; @@ -618,127 +661,6 @@ int lb_process_next_char( } /** - * Gets the next Unicode character in a UTF-8 sequence. The index will - * be advanced to the next complete character, unless the end of string - * is reached in the middle of a UTF-8 sequence. - * - * @param[in] s input UTF-8 string - * @param[in] len length of the string in bytes - * @param[in,out] ip pointer to the index - * @return the Unicode character beginning at the index; or - * #EOS if end of input is encountered - */ -utf32_t lb_get_next_char_utf8( - const utf8_t *s, - size_t len, - size_t *ip) -{ - utf8_t ch; - utf32_t res; - - assert(*ip <= len); - if (*ip == len) - return EOS; - ch = s[*ip]; - - if (ch < 0xC2 || ch > 0xF4) - { /* One-byte sequence, tail (should not occur), or invalid */ - *ip += 1; - return ch; - } - else if (ch < 0xE0) - { /* Two-byte sequence */ - if (*ip + 2 > len) - return EOS; - res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F); - *ip += 2; - return res; - } - else if (ch < 0xF0) - { /* Three-byte sequence */ - if (*ip + 3 > len) - return EOS; - res = ((ch & 0x0F) << 12) + - ((s[*ip + 1] & 0x3F) << 6) + - ((s[*ip + 2] & 0x3F)); - *ip += 3; - return res; - } - else - { /* Four-byte sequence */ - if (*ip + 4 > len) - return EOS; - res = ((ch & 0x07) << 18) + - ((s[*ip + 1] & 0x3F) << 12) + - ((s[*ip + 2] & 0x3F) << 6) + - ((s[*ip + 3] & 0x3F)); - *ip += 4; - return res; - } -} - -/** - * Gets the next Unicode character in a UTF-16 sequence. The index will - * be advanced to the next complete character, unless the end of string - * is reached in the middle of a UTF-16 surrogate pair. 
- * - * @param[in] s input UTF-16 string - * @param[in] len length of the string in words - * @param[in,out] ip pointer to the index - * @return the Unicode character beginning at the index; or - * #EOS if end of input is encountered - */ -utf32_t lb_get_next_char_utf16( - const utf16_t *s, - size_t len, - size_t *ip) -{ - utf16_t ch; - - assert(*ip <= len); - if (*ip == len) - return EOS; - ch = s[(*ip)++]; - - if (ch < 0xD800 || ch > 0xDBFF) - { /* If the character is not a high surrogate */ - return ch; - } - if (*ip == len) - { /* If the input ends here (an error) */ - --(*ip); - return EOS; - } - if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF) - { /* If the next character is not the low surrogate (an error) */ - return ch; - } - /* Return the constructed character and advance the index again */ - return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000; -} - -/** - * Gets the next Unicode character in a UTF-32 sequence. The index will - * be advanced to the next character. - * - * @param[in] s input UTF-32 string - * @param[in] len length of the string in dwords - * @param[in,out] ip pointer to the index - * @return the Unicode character beginning at the index; or - * #EOS if end of input is encountered - */ -utf32_t lb_get_next_char_utf32( - const utf32_t *s, - size_t len, - size_t *ip) -{ - assert(*ip <= len); - if (*ip == len) - return EOS; - return s[(*ip)++]; -} - -/** * Sets the line breaking information for a generic input string. 
* * @param[in] s input string @@ -809,7 +731,7 @@ void set_linebreaks_utf8( char *brks) { set_linebreaks(s, len, lang, brks, - (get_next_char_t)lb_get_next_char_utf8); + (get_next_char_t)ub_get_next_char_utf8); } /** @@ -829,7 +751,7 @@ void set_linebreaks_utf16( char *brks) { set_linebreaks(s, len, lang, brks, - (get_next_char_t)lb_get_next_char_utf16); + (get_next_char_t)ub_get_next_char_utf16); } /** @@ -849,7 +771,7 @@ void set_linebreaks_utf32( char *brks) { set_linebreaks(s, len, lang, brks, - (get_next_char_t)lb_get_next_char_utf32); + (get_next_char_t)ub_get_next_char_utf32); } /** @@ -868,7 +790,7 @@ void set_linebreaks_utf32( int is_line_breakable( utf32_t char1, utf32_t char2, - const char* lang) + const char *lang) { utf32_t s[2]; char brks[2]; diff --git a/text/dali/internal/libunibreak/linebreak.h b/text/dali/internal/libunibreak/linebreak.h index 94fbca0..68c8e41 100644 --- a/text/dali/internal/libunibreak/linebreak.h +++ b/text/dali/internal/libunibreak/linebreak.h @@ -4,7 +4,7 @@ * Line breaking in a Unicode sequence. Designed to be used in a * generic text renderer. * - * Copyright (C) 2008-2012 Wu Yongwei + * Copyright (C) 2008-2015 Wu Yongwei * * This software is provided 'as-is', without any express or implied * warranty. In no event will the author be held liable for any damages @@ -30,9 +30,9 @@ * Unicode 5.0.0: * * - * This library has been updated according to Revision 30, for - * Unicode 6.2.0: - * + * This library has been updated according to Revision 33, for + * Unicode 7.0.0: + * * * The Unicode Terms of Use are available at * @@ -43,7 +43,7 @@ * * Header file for the line breaking algorithm. 
* - * @version 2.2, 2012/10/06 + * @version 2.4, 2015/04/18 * @author Wu Yongwei */ @@ -51,21 +51,12 @@ #define LINEBREAK_H #include +#include "unibreakbase.h" #ifdef __cplusplus extern "C" { #endif -#define LINEBREAK_VERSION 0x0202 /**< Version of the library linebreak */ -extern const int linebreak_version; - -#ifndef LINEBREAK_UTF_TYPES_DEFINED -#define LINEBREAK_UTF_TYPES_DEFINED -typedef unsigned char utf8_t; /**< Type for UTF-8 data points */ -typedef unsigned short utf16_t; /**< Type for UTF-16 data points */ -typedef unsigned int utf32_t; /**< Type for UTF-32 data points */ -#endif - #define LINEBREAK_MUSTBREAK 0 /**< Break is mandatory */ #define LINEBREAK_ALLOWBREAK 1 /**< Break is allowed */ #define LINEBREAK_NOBREAK 2 /**< No break is possible */ @@ -73,12 +64,12 @@ typedef unsigned int utf32_t; /**< Type for UTF-32 data points */ void init_linebreak(void); void set_linebreaks_utf8( - const utf8_t *s, size_t len, const char* lang, char *brks); + const utf8_t *s, size_t len, const char *lang, char *brks); void set_linebreaks_utf16( - const utf16_t *s, size_t len, const char* lang, char *brks); + const utf16_t *s, size_t len, const char *lang, char *brks); void set_linebreaks_utf32( - const utf32_t *s, size_t len, const char* lang, char *brks); -int is_line_breakable(utf32_t char1, utf32_t char2, const char* lang); + const utf32_t *s, size_t len, const char *lang, char *brks); +int is_line_breakable(utf32_t char1, utf32_t char2, const char *lang); #ifdef __cplusplus } diff --git a/text/dali/internal/libunibreak/linebreakdata.c b/text/dali/internal/libunibreak/linebreakdata.c index 1038a14..d4bc8d8 100644 --- a/text/dali/internal/libunibreak/linebreakdata.c +++ b/text/dali/internal/libunibreak/linebreakdata.c @@ -3,7 +3,6 @@ # Date: 2014-02-28, 23:15:00 GMT [KW, LI] */ -#include "linebreak.h" #include "linebreakdef.h" /** Default line breaking properties as from the Unicode Web site. 
*/ diff --git a/text/dali/internal/libunibreak/linebreakdef.c b/text/dali/internal/libunibreak/linebreakdef.c index 3455afd..41a7296 100644 --- a/text/dali/internal/libunibreak/linebreakdef.c +++ b/text/dali/internal/libunibreak/linebreakdef.c @@ -4,7 +4,7 @@ * Line breaking in a Unicode sequence. Designed to be used in a * generic text renderer. * - * Copyright (C) 2008-2012 Wu Yongwei + * Copyright (C) 2008-2015 Wu Yongwei * * This software is provided 'as-is', without any express or implied * warranty. In no event will the author be held liable for any damages @@ -30,9 +30,9 @@ * Unicode 5.0.0: * * - * This library has been updated according to Revision 30, for - * Unicode 6.2.0: - * + * This library has been updated according to Revision 33, for + * Unicode 7.0.0: + * * * The Unicode Terms of Use are available at * diff --git a/text/dali/internal/libunibreak/linebreakdef.h b/text/dali/internal/libunibreak/linebreakdef.h index d557aba..7600d0a 100644 --- a/text/dali/internal/libunibreak/linebreakdef.h +++ b/text/dali/internal/libunibreak/linebreakdef.h @@ -4,7 +4,7 @@ * Line breaking in a Unicode sequence. Designed to be used in a * generic text renderer. * - * Copyright (C) 2008-2013 Wu Yongwei + * Copyright (C) 2008-2015 Wu Yongwei * Copyright (C) 2013 Petr Filipsky * * This software is provided 'as-is', without any express or implied @@ -31,9 +31,9 @@ * Unicode 5.0.0: * * - * This library has been updated according to Revision 30, for - * Unicode 6.2.0: - * + * This library has been updated according to Revision 33, for + * Unicode 7.0.0: + * * * The Unicode Terms of Use are available at * @@ -45,16 +45,12 @@ * Definitions of internal data structures, declarations of global * variables, and function prototypes for the line breaking algorithm. * - * @version 2.4, 2013/11/10 + * @version 2.6, 2015/04/18 * @author Wu Yongwei * @author Petr Filipsky */ -/** - * Constant value to mark the end of string. It is not a valid Unicode - * character. 
- */ -#define EOS 0xFFFFFFFF +#include "unibreakdef.h" /** * Line break classes. This is a direct mapping of Table 1 of Unicode @@ -143,28 +139,20 @@ struct LineBreakContext enum LineBreakClass lbcCur; /**< Breaking class of current codepoint */ enum LineBreakClass lbcNew; /**< Breaking class of next codepoint */ enum LineBreakClass lbcLast; /**< Breaking class of last codepoint */ + int fLb21aHebrew; /**< Flag for Hebrew letters (LB21a) */ }; -/** - * Abstract function interface for #lb_get_next_char_utf8, - * #lb_get_next_char_utf16, and #lb_get_next_char_utf32. - */ -typedef utf32_t (*get_next_char_t)(const void *, size_t, size_t *); - /* Declarations */ extern struct LineBreakProperties lb_prop_default[]; extern struct LineBreakPropertiesLang lb_prop_lang_map[]; /* Function Prototype */ -utf32_t lb_get_next_char_utf8(const utf8_t *s, size_t len, size_t *ip); -utf32_t lb_get_next_char_utf16(const utf16_t *s, size_t len, size_t *ip); -utf32_t lb_get_next_char_utf32(const utf32_t *s, size_t len, size_t *ip); void lb_init_break_context( - struct LineBreakContext* lbpCtx, + struct LineBreakContext *lbpCtx, utf32_t ch, - const char* lang); + const char *lang); int lb_process_next_char( - struct LineBreakContext* lbpCtx, + struct LineBreakContext *lbpCtx, utf32_t ch); void set_linebreaks( const void *s, diff --git a/text/dali/internal/libunibreak/unibreakbase.c b/text/dali/internal/libunibreak/unibreakbase.c new file mode 100644 index 0000000..dbe3a38 --- /dev/null +++ b/text/dali/internal/libunibreak/unibreakbase.c @@ -0,0 +1,41 @@ +/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */ + +/* + * Break processing in a Unicode sequence. Designed to be used in a + * generic text renderer. + * + * Copyright (C) 2015 Wu Yongwei + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. 
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute + * it freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must + * not claim that you wrote the original software. If you use this + * software in a product, an acknowledgement in the product + * documentation would be appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must + * not be misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source + * distribution. + */ + +/** + * @file unibreakbase.c + * + * Definition of basic libunibreak information. + * + * @version 1.0, 2015/04/18 + * @author Wu Yongwei + */ + +#include "unibreakbase.h" + +/** + * Version number of the library. + */ +const int unibreak_version = UNIBREAK_VERSION; diff --git a/text/dali/internal/libunibreak/unibreakbase.h b/text/dali/internal/libunibreak/unibreakbase.h new file mode 100644 index 0000000..76b35e6 --- /dev/null +++ b/text/dali/internal/libunibreak/unibreakbase.h @@ -0,0 +1,73 @@ +/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */ + +/* + * Break processing in a Unicode sequence. Designed to be used in a + * generic text renderer. + * + * Copyright (C) 2015 Wu Yongwei + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute + * it freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must + * not claim that you wrote the original software. 
If you use this + * software in a product, an acknowledgement in the product + * documentation would be appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must + * not be misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source + * distribution. + * + * The main reference is Unicode Standard Annex 14 (UAX #14): + * + * + * When this library was designed, this annex was at Revision 19, for + * Unicode 5.0.0: + * + * + * This library has been updated according to Revision 33, for + * Unicode 7.0.0: + * + * + * The Unicode Terms of Use are available at + * + */ + +/** + * @file unibreakbase.h + * + * Header file for common definitions in the libunibreak library. + * + * @version 1.0, 2015/04/18 + * @author Wu Yongwei + */ + +#ifndef UNIBREAKBASE_H +#define UNIBREAKBASE_H + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define UNIBREAK_VERSION 0x0300 /**< Version of the library linebreak */ +extern const int unibreak_version; + +#ifndef UNIBREAK_UTF_TYPES_DEFINED +#define UNIBREAK_UTF_TYPES_DEFINED +typedef unsigned char utf8_t; /**< Type for UTF-8 data points */ +typedef unsigned short utf16_t; /**< Type for UTF-16 data points */ +typedef unsigned int utf32_t; /**< Type for UTF-32 data points */ +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* UNIBREAKBASE_H */ diff --git a/text/dali/internal/libunibreak/unibreakdef.c b/text/dali/internal/libunibreak/unibreakdef.c new file mode 100644 index 0000000..2647b61 --- /dev/null +++ b/text/dali/internal/libunibreak/unibreakdef.c @@ -0,0 +1,159 @@ +/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */ + +/* + * Break processing in a Unicode sequence. Designed to be used in a + * generic text renderer. + * + * Copyright (C) 2015 Wu Yongwei + * + * This software is provided 'as-is', without any express or implied + * warranty. 
In no event will the author be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute + * it freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must + * not claim that you wrote the original software. If you use this + * software in a product, an acknowledgement in the product + * documentation would be appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must + * not be misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source + * distribution. + */ + +/** + * @file unibreakdef.c + * + * Definition of utility functions used by the libunibreak library. + * + * @version 1.0, 2015/04/18 + * @author Wu Yongwei + */ + +#include <assert.h> +#include <stddef.h> +#include "unibreakdef.h" + +/** + * Gets the next Unicode character in a UTF-8 sequence. The index will + * be advanced to the next complete character, unless the end of string + * is reached in the middle of a UTF-8 sequence. 
+ * + * @param[in] s input UTF-8 string + * @param[in] len length of the string in bytes + * @param[in,out] ip pointer to the index + * @return the Unicode character beginning at the index; or + * #EOS if end of input is encountered + */ +utf32_t ub_get_next_char_utf8( + const utf8_t *s, + size_t len, + size_t *ip) +{ + utf8_t ch; + utf32_t res; + + assert(*ip <= len); + if (*ip == len) + return EOS; + ch = s[*ip]; + + if (ch < 0xC2 || ch > 0xF4) + { /* One-byte sequence, tail (should not occur), or invalid */ + *ip += 1; + return ch; + } + else if (ch < 0xE0) + { /* Two-byte sequence */ + if (*ip + 2 > len) + return EOS; + res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F); + *ip += 2; + return res; + } + else if (ch < 0xF0) + { /* Three-byte sequence */ + if (*ip + 3 > len) + return EOS; + res = ((ch & 0x0F) << 12) + + ((s[*ip + 1] & 0x3F) << 6) + + ((s[*ip + 2] & 0x3F)); + *ip += 3; + return res; + } + else + { /* Four-byte sequence */ + if (*ip + 4 > len) + return EOS; + res = ((ch & 0x07) << 18) + + ((s[*ip + 1] & 0x3F) << 12) + + ((s[*ip + 2] & 0x3F) << 6) + + ((s[*ip + 3] & 0x3F)); + *ip += 4; + return res; + } +} + +/** + * Gets the next Unicode character in a UTF-16 sequence. The index will + * be advanced to the next complete character, unless the end of string + * is reached in the middle of a UTF-16 surrogate pair. 
+ * + * @param[in] s input UTF-16 string + * @param[in] len length of the string in words + * @param[in,out] ip pointer to the index + * @return the Unicode character beginning at the index; or + * #EOS if end of input is encountered + */ +utf32_t ub_get_next_char_utf16( + const utf16_t *s, + size_t len, + size_t *ip) +{ + utf16_t ch; + + assert(*ip <= len); + if (*ip == len) + return EOS; + ch = s[(*ip)++]; + + if (ch < 0xD800 || ch > 0xDBFF) + { /* If the character is not a high surrogate */ + return ch; + } + if (*ip == len) + { /* If the input ends here (an error) */ + --(*ip); + return EOS; + } + if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF) + { /* If the next character is not the low surrogate (an error) */ + return ch; + } + /* Return the constructed character and advance the index again */ + return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000; +} + +/** + * Gets the next Unicode character in a UTF-32 sequence. The index will + * be advanced to the next character. + * + * @param[in] s input UTF-32 string + * @param[in] len length of the string in dwords + * @param[in,out] ip pointer to the index + * @return the Unicode character beginning at the index; or + * #EOS if end of input is encountered + */ +utf32_t ub_get_next_char_utf32( + const utf32_t *s, + size_t len, + size_t *ip) +{ + assert(*ip <= len); + if (*ip == len) + return EOS; + return s[(*ip)++]; +} diff --git a/text/dali/internal/libunibreak/unibreakdef.h b/text/dali/internal/libunibreak/unibreakdef.h new file mode 100644 index 0000000..b823e50 --- /dev/null +++ b/text/dali/internal/libunibreak/unibreakdef.h @@ -0,0 +1,80 @@ +/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */ + +/* + * Break processing in a Unicode sequence. Designed to be used in a + * generic text renderer. + * + * Copyright (C) 2015 Wu Yongwei + * + * This software is provided 'as-is', without any express or implied + * warranty. 
In no event will the author be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute + * it freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must + * not claim that you wrote the original software. If you use this + * software in a product, an acknowledgement in the product + * documentation would be appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must + * not be misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source + * distribution. + * + * The main reference is Unicode Standard Annex 14 (UAX #14): + * + * + * When this library was designed, this annex was at Revision 19, for + * Unicode 5.0.0: + * + * + * This library has been updated according to Revision 33, for + * Unicode 7.0.0: + * + * + * The Unicode Terms of Use are available at + * + */ + +/** + * @file unibreakdef.h + * + * Header file for private definitions in the libunibreak library. + * + * @version 1.1, 2015/04/19 + * @author Wu Yongwei + */ + +#ifndef UNIBREAKDEF_H +#define UNIBREAKDEF_H + +#include "unibreakbase.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Constant value to mark the end of string. It is not a valid Unicode + * character. + */ +#define EOS 0xFFFFFFFF + +/** + * Abstract function interface for #ub_get_next_char_utf8, + * #ub_get_next_char_utf16, and #ub_get_next_char_utf32. 
+ */ +typedef utf32_t (*get_next_char_t)(const void *, size_t, size_t *); + +/* Function Prototype */ +utf32_t ub_get_next_char_utf8(const utf8_t *s, size_t len, size_t *ip); +utf32_t ub_get_next_char_utf16(const utf16_t *s, size_t len, size_t *ip); +utf32_t ub_get_next_char_utf32(const utf32_t *s, size_t len, size_t *ip); + +#ifdef __cplusplus +} +#endif + +#endif /* UNIBREAKDEF_H */ diff --git a/text/dali/internal/libunibreak/wordbreak.c b/text/dali/internal/libunibreak/wordbreak.c index e67a1f8..d7d5a42 100644 --- a/text/dali/internal/libunibreak/wordbreak.c +++ b/text/dali/internal/libunibreak/wordbreak.c @@ -4,7 +4,7 @@ * Word breaking in a Unicode sequence. Designed to be used in a * generic text renderer. * - * Copyright (C) 2013 Tom Hacohen + * Copyright (C) 2013-2015 Tom Hacohen * * This software is provided 'as-is', without any express or implied * warranty. In no event will the author be held liable for any damages @@ -30,9 +30,9 @@ * Unicode 6.0.0: * * - * This library has been updated according to Revision 21, for - * Unicode 6.2.0: - * + * This library has been updated according to Revision 25, for + * Unicode 7.0.0: + * * * The Unicode Terms of Use are available at * @@ -44,16 +44,14 @@ * Implementation of the word breaking algorithm as described in Unicode * Standard Annex 29. 
* - * @version 2.4, 2013/09/28 + * @version 2.6, 2015/04/18 * @author Tom Hacohen */ #include #include #include -#include "linebreak.h" -#include "linebreakdef.h" - +#include "unibreakdef.h" #include "wordbreak.h" #include "wordbreakdata.c" @@ -256,8 +254,24 @@ static void set_wordbreaks( posLast = posCur; break; + case WBP_Hebrew_Letter: case WBP_ALetter: - if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */ + if ((wbcSeqStart == WBP_Hebrew_Letter) && + (wbcLast == WBP_Double_Quote)) /* WB7b,c */ + { + if (wbcCur == WBP_Hebrew_Letter) + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_NOBREAK, get_next_char); + } + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + } + } + else if (((wbcSeqStart == WBP_ALetter) || + (wbcSeqStart == WBP_Hebrew_Letter)) || /* WB5,6,7 */ (wbcLast == WBP_Numeric) || /* WB10 */ (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ { @@ -274,8 +288,18 @@ static void set_wordbreaks( posLast = posCur; break; + case WBP_Single_Quote: + if (wbcLast == WBP_Hebrew_Letter) /* WB7a */ + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_NOBREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + } + /* No break on purpose */ case WBP_MidNumLet: - if ((wbcLast == WBP_ALetter) || /* WB6,7 */ + if (((wbcLast == WBP_ALetter) || + (wbcLast == WBP_Hebrew_Letter)) || /* WB6,7 */ (wbcLast == WBP_Numeric)) /* WB11,12 */ { /* Go on */ @@ -290,7 +314,8 @@ static void set_wordbreaks( break; case WBP_MidLetter: - if (wbcLast == WBP_ALetter) /* WB6,7 */ + if ((wbcLast == WBP_ALetter) || + (wbcLast == WBP_Hebrew_Letter)) /* WB6,7 */ { /* Go on */ } @@ -319,7 +344,8 @@ static void set_wordbreaks( case WBP_Numeric: if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */ - (wbcLast == WBP_ALetter) || /* WB9 */ + ((wbcLast == WBP_ALetter) || + (wbcLast == WBP_Hebrew_Letter)) || /* WB9 */ (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ { set_brks_to(s, brks, posLast, posCur, len, @@ -339,6 +365,7 @@ static 
void set_wordbreaks( /* WB13a,13b */ if ((wbcSeqStart == wbcLast) && ((wbcLast == WBP_ALetter) || + (wbcLast == WBP_Hebrew_Letter) || (wbcLast == WBP_Numeric) || (wbcLast == WBP_Katakana) || (wbcLast == WBP_ExtendNumLet))) @@ -356,9 +383,9 @@ static void set_wordbreaks( posLast = posCur; break; - case WBP_Regional: + case WBP_Regional_Indicator: /* WB13c */ - if (wbcSeqStart == WBP_Regional) + if (wbcSeqStart == WBP_Regional_Indicator) { set_brks_to(s, brks, posLast, posCur, len, WORDBREAK_NOBREAK, get_next_char); @@ -367,6 +394,20 @@ static void set_wordbreaks( posLast = posCur; break; + case WBP_Double_Quote: + if (wbcLast == WBP_Hebrew_Letter) /* WB7b,c */ + { + /* Go on */ + } + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + } + break; + case WBP_Any: /* Allow breaks and reset */ set_brks_to(s, brks, posLast, posCur, len, @@ -408,7 +449,7 @@ void set_wordbreaks_utf8( char *brks) { set_wordbreaks(s, len, lang, brks, - (get_next_char_t)lb_get_next_char_utf8); + (get_next_char_t)ub_get_next_char_utf8); } /** @@ -428,7 +469,7 @@ void set_wordbreaks_utf16( char *brks) { set_wordbreaks(s, len, lang, brks, - (get_next_char_t)lb_get_next_char_utf16); + (get_next_char_t)ub_get_next_char_utf16); } /** @@ -448,5 +489,5 @@ void set_wordbreaks_utf32( char *brks) { set_wordbreaks(s, len, lang, brks, - (get_next_char_t)lb_get_next_char_utf32); + (get_next_char_t)ub_get_next_char_utf32); } diff --git a/text/dali/internal/libunibreak/wordbreak.h b/text/dali/internal/libunibreak/wordbreak.h index cd2bf2c..360953f 100644 --- a/text/dali/internal/libunibreak/wordbreak.h +++ b/text/dali/internal/libunibreak/wordbreak.h @@ -4,7 +4,7 @@ * Word breaking in a Unicode sequence. Designed to be used in a * generic text renderer. * - * Copyright (C) 2013 Tom Hacohen + * Copyright (C) 2013-2015 Tom Hacohen * * This software is provided 'as-is', without any express or implied * warranty. 
In no event will the author be held liable for any damages @@ -30,9 +30,9 @@ * Unicode 6.0.0: * * - * This library has been updated according to Revision 21, for - * Unicode 6.2.0: - * + * This library has been updated according to Revision 25, for + * Unicode 7.0.0: + * * * The Unicode Terms of Use are available at * @@ -43,7 +43,7 @@ * * Header file for the word breaking (segmentation) algorithm. * - * @version 2.3, 2013/09/28 + * @version 2.5, 2015/04/18 * @author Tom Hacohen */ @@ -51,7 +51,7 @@ #define WORDBREAK_H #include -#include "linebreak.h" +#include "unibreakbase.h" #ifdef __cplusplus extern "C" { diff --git a/text/dali/internal/libunibreak/wordbreakdata.c b/text/dali/internal/libunibreak/wordbreakdata.c index fe5afe3..c6d5694 100644 --- a/text/dali/internal/libunibreak/wordbreakdata.c +++ b/text/dali/internal/libunibreak/wordbreakdata.c @@ -1,16 +1,16 @@ /* The content of this file is generated from: -# WordBreakProperty-6.2.0.txt -# Date: 2012-08-13, 19:12:09 GMT [MD] +# WordBreakProperty-7.0.0.txt +# Date: 2014-02-19, 15:51:39 GMT [MD] */ -#include "linebreak.h" #include "wordbreakdef.h" static struct WordBreakProperties wb_prop_default[] = { {0x000A, 0x000A, WBP_LF}, {0x000B, 0x000C, WBP_Newline}, {0x000D, 0x000D, WBP_CR}, - {0x0027, 0x0027, WBP_MidNumLet}, + {0x0022, 0x0022, WBP_Double_Quote}, + {0x0027, 0x0027, WBP_Single_Quote}, {0x002C, 0x002C, WBP_MidNum}, {0x002E, 0x002E, WBP_MidNumLet}, {0x0030, 0x0039, WBP_Numeric}, @@ -36,6 +36,7 @@ static struct WordBreakProperties wb_prop_default[] = { {0x0295, 0x02AF, WBP_ALetter}, {0x02B0, 0x02C1, WBP_ALetter}, {0x02C6, 0x02D1, WBP_ALetter}, + {0x02D7, 0x02D7, WBP_MidLetter}, {0x02E0, 0x02E4, WBP_ALetter}, {0x02EC, 0x02EC, WBP_ALetter}, {0x02EE, 0x02EE, WBP_ALetter}, @@ -46,6 +47,7 @@ static struct WordBreakProperties wb_prop_default[] = { {0x037A, 0x037A, WBP_ALetter}, {0x037B, 0x037D, WBP_ALetter}, {0x037E, 0x037E, WBP_MidNum}, + {0x037F, 0x037F, WBP_ALetter}, {0x0386, 0x0386, WBP_ALetter}, {0x0387, 
0x0387, WBP_MidLetter}, {0x0388, 0x038A, WBP_ALetter}, @@ -55,7 +57,7 @@ static struct WordBreakProperties wb_prop_default[] = { {0x03F7, 0x0481, WBP_ALetter}, {0x0483, 0x0487, WBP_Extend}, {0x0488, 0x0489, WBP_Extend}, - {0x048A, 0x0527, WBP_ALetter}, + {0x048A, 0x052F, WBP_ALetter}, {0x0531, 0x0556, WBP_ALetter}, {0x0559, 0x0559, WBP_ALetter}, {0x0561, 0x0587, WBP_ALetter}, @@ -65,13 +67,14 @@ static struct WordBreakProperties wb_prop_default[] = { {0x05C1, 0x05C2, WBP_Extend}, {0x05C4, 0x05C5, WBP_Extend}, {0x05C7, 0x05C7, WBP_Extend}, - {0x05D0, 0x05EA, WBP_ALetter}, - {0x05F0, 0x05F2, WBP_ALetter}, + {0x05D0, 0x05EA, WBP_Hebrew_Letter}, + {0x05F0, 0x05F2, WBP_Hebrew_Letter}, {0x05F3, 0x05F3, WBP_ALetter}, {0x05F4, 0x05F4, WBP_MidLetter}, - {0x0600, 0x0604, WBP_Format}, + {0x0600, 0x0605, WBP_Format}, {0x060C, 0x060D, WBP_MidNum}, {0x0610, 0x061A, WBP_Extend}, + {0x061C, 0x061C, WBP_Format}, {0x0620, 0x063F, WBP_ALetter}, {0x0640, 0x0640, WBP_ALetter}, {0x0641, 0x064A, WBP_ALetter}, @@ -117,10 +120,8 @@ static struct WordBreakProperties wb_prop_default[] = { {0x0829, 0x082D, WBP_Extend}, {0x0840, 0x0858, WBP_ALetter}, {0x0859, 0x085B, WBP_Extend}, - {0x08A0, 0x08A0, WBP_ALetter}, - {0x08A2, 0x08AC, WBP_ALetter}, - {0x08E4, 0x08FE, WBP_Extend}, - {0x0900, 0x0902, WBP_Extend}, + {0x08A0, 0x08B2, WBP_ALetter}, + {0x08E4, 0x0902, WBP_Extend}, {0x0903, 0x0903, WBP_Extend}, {0x0904, 0x0939, WBP_ALetter}, {0x093A, 0x093A, WBP_Extend}, @@ -138,8 +139,7 @@ static struct WordBreakProperties wb_prop_default[] = { {0x0962, 0x0963, WBP_Extend}, {0x0966, 0x096F, WBP_Numeric}, {0x0971, 0x0971, WBP_ALetter}, - {0x0972, 0x0977, WBP_ALetter}, - {0x0979, 0x097F, WBP_ALetter}, + {0x0972, 0x0980, WBP_ALetter}, {0x0981, 0x0981, WBP_Extend}, {0x0982, 0x0983, WBP_Extend}, {0x0985, 0x098C, WBP_ALetter}, @@ -247,12 +247,12 @@ static struct WordBreakProperties wb_prop_default[] = { {0x0BD0, 0x0BD0, WBP_ALetter}, {0x0BD7, 0x0BD7, WBP_Extend}, {0x0BE6, 0x0BEF, WBP_Numeric}, + {0x0C00, 
0x0C00, WBP_Extend}, {0x0C01, 0x0C03, WBP_Extend}, {0x0C05, 0x0C0C, WBP_ALetter}, {0x0C0E, 0x0C10, WBP_ALetter}, {0x0C12, 0x0C28, WBP_ALetter}, - {0x0C2A, 0x0C33, WBP_ALetter}, - {0x0C35, 0x0C39, WBP_ALetter}, + {0x0C2A, 0x0C39, WBP_ALetter}, {0x0C3D, 0x0C3D, WBP_ALetter}, {0x0C3E, 0x0C40, WBP_Extend}, {0x0C41, 0x0C44, WBP_Extend}, @@ -263,6 +263,7 @@ static struct WordBreakProperties wb_prop_default[] = { {0x0C60, 0x0C61, WBP_ALetter}, {0x0C62, 0x0C63, WBP_Extend}, {0x0C66, 0x0C6F, WBP_Numeric}, + {0x0C81, 0x0C81, WBP_Extend}, {0x0C82, 0x0C83, WBP_Extend}, {0x0C85, 0x0C8C, WBP_ALetter}, {0x0C8E, 0x0C90, WBP_ALetter}, @@ -284,6 +285,7 @@ static struct WordBreakProperties wb_prop_default[] = { {0x0CE2, 0x0CE3, WBP_Extend}, {0x0CE6, 0x0CEF, WBP_Numeric}, {0x0CF1, 0x0CF2, WBP_ALetter}, + {0x0D01, 0x0D01, WBP_Extend}, {0x0D02, 0x0D03, WBP_Extend}, {0x0D05, 0x0D0C, WBP_ALetter}, {0x0D0E, 0x0D10, WBP_ALetter}, @@ -311,6 +313,7 @@ static struct WordBreakProperties wb_prop_default[] = { {0x0DD2, 0x0DD4, WBP_Extend}, {0x0DD6, 0x0DD6, WBP_Extend}, {0x0DD8, 0x0DDF, WBP_Extend}, + {0x0DE6, 0x0DEF, WBP_Numeric}, {0x0DF2, 0x0DF3, WBP_Extend}, {0x0E31, 0x0E31, WBP_Extend}, {0x0E34, 0x0E3A, WBP_Extend}, @@ -391,6 +394,7 @@ static struct WordBreakProperties wb_prop_default[] = { {0x1681, 0x169A, WBP_ALetter}, {0x16A0, 0x16EA, WBP_ALetter}, {0x16EE, 0x16F0, WBP_ALetter}, + {0x16F1, 0x16F8, WBP_ALetter}, {0x1700, 0x170C, WBP_ALetter}, {0x170E, 0x1711, WBP_ALetter}, {0x1712, 0x1714, WBP_Extend}, @@ -411,6 +415,7 @@ static struct WordBreakProperties wb_prop_default[] = { {0x17DD, 0x17DD, WBP_Extend}, {0x17E0, 0x17E9, WBP_Numeric}, {0x180B, 0x180D, WBP_Extend}, + {0x180E, 0x180E, WBP_Format}, {0x1810, 0x1819, WBP_Numeric}, {0x1820, 0x1842, WBP_ALetter}, {0x1843, 0x1843, WBP_ALetter}, @@ -419,7 +424,7 @@ static struct WordBreakProperties wb_prop_default[] = { {0x18A9, 0x18A9, WBP_Extend}, {0x18AA, 0x18AA, WBP_ALetter}, {0x18B0, 0x18F5, WBP_ALetter}, - {0x1900, 0x191C, WBP_ALetter}, + 
{0x1900, 0x191E, WBP_ALetter}, {0x1920, 0x1922, WBP_Extend}, {0x1923, 0x1926, WBP_Extend}, {0x1927, 0x1928, WBP_Extend}, @@ -434,7 +439,8 @@ static struct WordBreakProperties wb_prop_default[] = { {0x19D0, 0x19D9, WBP_Numeric}, {0x1A00, 0x1A16, WBP_ALetter}, {0x1A17, 0x1A18, WBP_Extend}, - {0x1A19, 0x1A1B, WBP_Extend}, + {0x1A19, 0x1A1A, WBP_Extend}, + {0x1A1B, 0x1A1B, WBP_Extend}, {0x1A55, 0x1A55, WBP_Extend}, {0x1A56, 0x1A56, WBP_Extend}, {0x1A57, 0x1A57, WBP_Extend}, @@ -449,6 +455,8 @@ static struct WordBreakProperties wb_prop_default[] = { {0x1A7F, 0x1A7F, WBP_Extend}, {0x1A80, 0x1A89, WBP_Numeric}, {0x1A90, 0x1A99, WBP_Numeric}, + {0x1AB0, 0x1ABD, WBP_Extend}, + {0x1ABE, 0x1ABE, WBP_Extend}, {0x1B00, 0x1B03, WBP_Extend}, {0x1B04, 0x1B04, WBP_Extend}, {0x1B05, 0x1B33, WBP_ALetter}, @@ -471,8 +479,7 @@ static struct WordBreakProperties wb_prop_default[] = { {0x1BA6, 0x1BA7, WBP_Extend}, {0x1BA8, 0x1BA9, WBP_Extend}, {0x1BAA, 0x1BAA, WBP_Extend}, - {0x1BAB, 0x1BAB, WBP_Extend}, - {0x1BAC, 0x1BAD, WBP_Extend}, + {0x1BAB, 0x1BAD, WBP_Extend}, {0x1BAE, 0x1BAF, WBP_ALetter}, {0x1BB0, 0x1BB9, WBP_Numeric}, {0x1BBA, 0x1BE5, WBP_ALetter}, @@ -504,13 +511,14 @@ static struct WordBreakProperties wb_prop_default[] = { {0x1CF2, 0x1CF3, WBP_Extend}, {0x1CF4, 0x1CF4, WBP_Extend}, {0x1CF5, 0x1CF6, WBP_ALetter}, + {0x1CF8, 0x1CF9, WBP_Extend}, {0x1D00, 0x1D2B, WBP_ALetter}, {0x1D2C, 0x1D6A, WBP_ALetter}, {0x1D6B, 0x1D77, WBP_ALetter}, {0x1D78, 0x1D78, WBP_ALetter}, {0x1D79, 0x1D9A, WBP_ALetter}, {0x1D9B, 0x1DBF, WBP_ALetter}, - {0x1DC0, 0x1DE6, WBP_Extend}, + {0x1DC0, 0x1DF5, WBP_Extend}, {0x1DFC, 0x1DFF, WBP_Extend}, {0x1E00, 0x1F15, WBP_ALetter}, {0x1F18, 0x1F1D, WBP_ALetter}, @@ -544,7 +552,7 @@ static struct WordBreakProperties wb_prop_default[] = { {0x2044, 0x2044, WBP_MidNum}, {0x2054, 0x2054, WBP_ExtendNumLet}, {0x2060, 0x2064, WBP_Format}, - {0x206A, 0x206F, WBP_Format}, + {0x2066, 0x206F, WBP_Format}, {0x2071, 0x2071, WBP_ALetter}, {0x207F, 0x207F, WBP_ALetter}, 
{0x2090, 0x209C, WBP_ALetter}, @@ -631,7 +639,8 @@ static struct WordBreakProperties wb_prop_default[] = { {0xA670, 0xA672, WBP_Extend}, {0xA674, 0xA67D, WBP_Extend}, {0xA67F, 0xA67F, WBP_ALetter}, - {0xA680, 0xA697, WBP_ALetter}, + {0xA680, 0xA69B, WBP_ALetter}, + {0xA69C, 0xA69D, WBP_ALetter}, {0xA69F, 0xA69F, WBP_Extend}, {0xA6A0, 0xA6E5, WBP_ALetter}, {0xA6E6, 0xA6EF, WBP_ALetter}, @@ -642,8 +651,9 @@ static struct WordBreakProperties wb_prop_default[] = { {0xA771, 0xA787, WBP_ALetter}, {0xA788, 0xA788, WBP_ALetter}, {0xA78B, 0xA78E, WBP_ALetter}, - {0xA790, 0xA793, WBP_ALetter}, - {0xA7A0, 0xA7AA, WBP_ALetter}, + {0xA790, 0xA7AD, WBP_ALetter}, + {0xA7B0, 0xA7B1, WBP_ALetter}, + {0xA7F7, 0xA7F7, WBP_ALetter}, {0xA7F8, 0xA7F9, WBP_ALetter}, {0xA7FA, 0xA7FA, WBP_ALetter}, {0xA7FB, 0xA801, WBP_ALetter}, @@ -683,6 +693,8 @@ static struct WordBreakProperties wb_prop_default[] = { {0xA9BD, 0xA9C0, WBP_Extend}, {0xA9CF, 0xA9CF, WBP_ALetter}, {0xA9D0, 0xA9D9, WBP_Numeric}, + {0xA9E5, 0xA9E5, WBP_Extend}, + {0xA9F0, 0xA9F9, WBP_Numeric}, {0xAA00, 0xAA28, WBP_ALetter}, {0xAA29, 0xAA2E, WBP_Extend}, {0xAA2F, 0xAA30, WBP_Extend}, @@ -696,6 +708,8 @@ static struct WordBreakProperties wb_prop_default[] = { {0xAA4D, 0xAA4D, WBP_Extend}, {0xAA50, 0xAA59, WBP_Numeric}, {0xAA7B, 0xAA7B, WBP_Extend}, + {0xAA7C, 0xAA7C, WBP_Extend}, + {0xAA7D, 0xAA7D, WBP_Extend}, {0xAAB0, 0xAAB0, WBP_Extend}, {0xAAB2, 0xAAB4, WBP_Extend}, {0xAAB7, 0xAAB8, WBP_Extend}, @@ -714,6 +728,9 @@ static struct WordBreakProperties wb_prop_default[] = { {0xAB11, 0xAB16, WBP_ALetter}, {0xAB20, 0xAB26, WBP_ALetter}, {0xAB28, 0xAB2E, WBP_ALetter}, + {0xAB30, 0xAB5A, WBP_ALetter}, + {0xAB5C, 0xAB5F, WBP_ALetter}, + {0xAB64, 0xAB65, WBP_ALetter}, {0xABC0, 0xABE2, WBP_ALetter}, {0xABE3, 0xABE4, WBP_Extend}, {0xABE5, 0xABE5, WBP_Extend}, @@ -728,15 +745,16 @@ static struct WordBreakProperties wb_prop_default[] = { {0xD7CB, 0xD7FB, WBP_ALetter}, {0xFB00, 0xFB06, WBP_ALetter}, {0xFB13, 0xFB17, WBP_ALetter}, - 
{0xFB1D, 0xFB1D, WBP_ALetter}, + {0xFB1D, 0xFB1D, WBP_Hebrew_Letter}, {0xFB1E, 0xFB1E, WBP_Extend}, - {0xFB1F, 0xFB28, WBP_ALetter}, - {0xFB2A, 0xFB36, WBP_ALetter}, - {0xFB38, 0xFB3C, WBP_ALetter}, - {0xFB3E, 0xFB3E, WBP_ALetter}, - {0xFB40, 0xFB41, WBP_ALetter}, - {0xFB43, 0xFB44, WBP_ALetter}, - {0xFB46, 0xFBB1, WBP_ALetter}, + {0xFB1F, 0xFB28, WBP_Hebrew_Letter}, + {0xFB2A, 0xFB36, WBP_Hebrew_Letter}, + {0xFB38, 0xFB3C, WBP_Hebrew_Letter}, + {0xFB3E, 0xFB3E, WBP_Hebrew_Letter}, + {0xFB40, 0xFB41, WBP_Hebrew_Letter}, + {0xFB43, 0xFB44, WBP_Hebrew_Letter}, + {0xFB46, 0xFB4F, WBP_Hebrew_Letter}, + {0xFB50, 0xFBB1, WBP_ALetter}, {0xFBD3, 0xFD3D, WBP_ALetter}, {0xFD50, 0xFD8F, WBP_ALetter}, {0xFD92, 0xFDC7, WBP_ALetter}, @@ -745,7 +763,7 @@ static struct WordBreakProperties wb_prop_default[] = { {0xFE10, 0xFE10, WBP_MidNum}, {0xFE13, 0xFE13, WBP_MidLetter}, {0xFE14, 0xFE14, WBP_MidNum}, - {0xFE20, 0xFE26, WBP_Extend}, + {0xFE20, 0xFE2D, WBP_Extend}, {0xFE33, 0xFE34, WBP_ExtendNumLet}, {0xFE4D, 0xFE4F, WBP_ExtendNumLet}, {0xFE50, 0xFE50, WBP_MidNum}, @@ -784,11 +802,14 @@ static struct WordBreakProperties wb_prop_default[] = { {0x101FD, 0x101FD, WBP_Extend}, {0x10280, 0x1029C, WBP_ALetter}, {0x102A0, 0x102D0, WBP_ALetter}, - {0x10300, 0x1031E, WBP_ALetter}, + {0x102E0, 0x102E0, WBP_Extend}, + {0x10300, 0x1031F, WBP_ALetter}, {0x10330, 0x10340, WBP_ALetter}, {0x10341, 0x10341, WBP_ALetter}, {0x10342, 0x10349, WBP_ALetter}, {0x1034A, 0x1034A, WBP_ALetter}, + {0x10350, 0x10375, WBP_ALetter}, + {0x10376, 0x1037A, WBP_Extend}, {0x10380, 0x1039D, WBP_ALetter}, {0x103A0, 0x103C3, WBP_ALetter}, {0x103C8, 0x103CF, WBP_ALetter}, @@ -796,12 +817,19 @@ static struct WordBreakProperties wb_prop_default[] = { {0x10400, 0x1044F, WBP_ALetter}, {0x10450, 0x1049D, WBP_ALetter}, {0x104A0, 0x104A9, WBP_Numeric}, + {0x10500, 0x10527, WBP_ALetter}, + {0x10530, 0x10563, WBP_ALetter}, + {0x10600, 0x10736, WBP_ALetter}, + {0x10740, 0x10755, WBP_ALetter}, + {0x10760, 0x10767, WBP_ALetter}, 
{0x10800, 0x10805, WBP_ALetter}, {0x10808, 0x10808, WBP_ALetter}, {0x1080A, 0x10835, WBP_ALetter}, {0x10837, 0x10838, WBP_ALetter}, {0x1083C, 0x1083C, WBP_ALetter}, {0x1083F, 0x10855, WBP_ALetter}, + {0x10860, 0x10876, WBP_ALetter}, + {0x10880, 0x1089E, WBP_ALetter}, {0x10900, 0x10915, WBP_ALetter}, {0x10920, 0x10939, WBP_ALetter}, {0x10980, 0x109B7, WBP_ALetter}, @@ -816,9 +844,14 @@ static struct WordBreakProperties wb_prop_default[] = { {0x10A38, 0x10A3A, WBP_Extend}, {0x10A3F, 0x10A3F, WBP_Extend}, {0x10A60, 0x10A7C, WBP_ALetter}, + {0x10A80, 0x10A9C, WBP_ALetter}, + {0x10AC0, 0x10AC7, WBP_ALetter}, + {0x10AC9, 0x10AE4, WBP_ALetter}, + {0x10AE5, 0x10AE6, WBP_Extend}, {0x10B00, 0x10B35, WBP_ALetter}, {0x10B40, 0x10B55, WBP_ALetter}, {0x10B60, 0x10B72, WBP_ALetter}, + {0x10B80, 0x10B91, WBP_ALetter}, {0x10C00, 0x10C48, WBP_ALetter}, {0x11000, 0x11000, WBP_Extend}, {0x11001, 0x11001, WBP_Extend}, @@ -826,7 +859,7 @@ static struct WordBreakProperties wb_prop_default[] = { {0x11003, 0x11037, WBP_ALetter}, {0x11038, 0x11046, WBP_Extend}, {0x11066, 0x1106F, WBP_Numeric}, - {0x11080, 0x11081, WBP_Extend}, + {0x1107F, 0x11081, WBP_Extend}, {0x11082, 0x11082, WBP_Extend}, {0x11083, 0x110AF, WBP_ALetter}, {0x110B0, 0x110B2, WBP_Extend}, @@ -842,6 +875,9 @@ static struct WordBreakProperties wb_prop_default[] = { {0x1112C, 0x1112C, WBP_Extend}, {0x1112D, 0x11134, WBP_Extend}, {0x11136, 0x1113F, WBP_Numeric}, + {0x11150, 0x11172, WBP_ALetter}, + {0x11173, 0x11173, WBP_Extend}, + {0x11176, 0x11176, WBP_ALetter}, {0x11180, 0x11181, WBP_Extend}, {0x11182, 0x11182, WBP_Extend}, {0x11183, 0x111B2, WBP_ALetter}, @@ -850,6 +886,68 @@ static struct WordBreakProperties wb_prop_default[] = { {0x111BF, 0x111C0, WBP_Extend}, {0x111C1, 0x111C4, WBP_ALetter}, {0x111D0, 0x111D9, WBP_Numeric}, + {0x111DA, 0x111DA, WBP_ALetter}, + {0x11200, 0x11211, WBP_ALetter}, + {0x11213, 0x1122B, WBP_ALetter}, + {0x1122C, 0x1122E, WBP_Extend}, + {0x1122F, 0x11231, WBP_Extend}, + {0x11232, 0x11233, 
WBP_Extend}, + {0x11234, 0x11234, WBP_Extend}, + {0x11235, 0x11235, WBP_Extend}, + {0x11236, 0x11237, WBP_Extend}, + {0x112B0, 0x112DE, WBP_ALetter}, + {0x112DF, 0x112DF, WBP_Extend}, + {0x112E0, 0x112E2, WBP_Extend}, + {0x112E3, 0x112EA, WBP_Extend}, + {0x112F0, 0x112F9, WBP_Numeric}, + {0x11301, 0x11301, WBP_Extend}, + {0x11302, 0x11303, WBP_Extend}, + {0x11305, 0x1130C, WBP_ALetter}, + {0x1130F, 0x11310, WBP_ALetter}, + {0x11313, 0x11328, WBP_ALetter}, + {0x1132A, 0x11330, WBP_ALetter}, + {0x11332, 0x11333, WBP_ALetter}, + {0x11335, 0x11339, WBP_ALetter}, + {0x1133C, 0x1133C, WBP_Extend}, + {0x1133D, 0x1133D, WBP_ALetter}, + {0x1133E, 0x1133F, WBP_Extend}, + {0x11340, 0x11340, WBP_Extend}, + {0x11341, 0x11344, WBP_Extend}, + {0x11347, 0x11348, WBP_Extend}, + {0x1134B, 0x1134D, WBP_Extend}, + {0x11357, 0x11357, WBP_Extend}, + {0x1135D, 0x11361, WBP_ALetter}, + {0x11362, 0x11363, WBP_Extend}, + {0x11366, 0x1136C, WBP_Extend}, + {0x11370, 0x11374, WBP_Extend}, + {0x11480, 0x114AF, WBP_ALetter}, + {0x114B0, 0x114B2, WBP_Extend}, + {0x114B3, 0x114B8, WBP_Extend}, + {0x114B9, 0x114B9, WBP_Extend}, + {0x114BA, 0x114BA, WBP_Extend}, + {0x114BB, 0x114BE, WBP_Extend}, + {0x114BF, 0x114C0, WBP_Extend}, + {0x114C1, 0x114C1, WBP_Extend}, + {0x114C2, 0x114C3, WBP_Extend}, + {0x114C4, 0x114C5, WBP_ALetter}, + {0x114C7, 0x114C7, WBP_ALetter}, + {0x114D0, 0x114D9, WBP_Numeric}, + {0x11580, 0x115AE, WBP_ALetter}, + {0x115AF, 0x115B1, WBP_Extend}, + {0x115B2, 0x115B5, WBP_Extend}, + {0x115B8, 0x115BB, WBP_Extend}, + {0x115BC, 0x115BD, WBP_Extend}, + {0x115BE, 0x115BE, WBP_Extend}, + {0x115BF, 0x115C0, WBP_Extend}, + {0x11600, 0x1162F, WBP_ALetter}, + {0x11630, 0x11632, WBP_Extend}, + {0x11633, 0x1163A, WBP_Extend}, + {0x1163B, 0x1163C, WBP_Extend}, + {0x1163D, 0x1163D, WBP_Extend}, + {0x1163E, 0x1163E, WBP_Extend}, + {0x1163F, 0x11640, WBP_Extend}, + {0x11644, 0x11644, WBP_ALetter}, + {0x11650, 0x11659, WBP_Numeric}, {0x11680, 0x116AA, WBP_ALetter}, {0x116AB, 0x116AB, WBP_Extend}, 
{0x116AC, 0x116AC, WBP_Extend}, @@ -859,16 +957,36 @@ static struct WordBreakProperties wb_prop_default[] = { {0x116B6, 0x116B6, WBP_Extend}, {0x116B7, 0x116B7, WBP_Extend}, {0x116C0, 0x116C9, WBP_Numeric}, - {0x12000, 0x1236E, WBP_ALetter}, - {0x12400, 0x12462, WBP_ALetter}, + {0x118A0, 0x118DF, WBP_ALetter}, + {0x118E0, 0x118E9, WBP_Numeric}, + {0x118FF, 0x118FF, WBP_ALetter}, + {0x11AC0, 0x11AF8, WBP_ALetter}, + {0x12000, 0x12398, WBP_ALetter}, + {0x12400, 0x1246E, WBP_ALetter}, {0x13000, 0x1342E, WBP_ALetter}, {0x16800, 0x16A38, WBP_ALetter}, + {0x16A40, 0x16A5E, WBP_ALetter}, + {0x16A60, 0x16A69, WBP_Numeric}, + {0x16AD0, 0x16AED, WBP_ALetter}, + {0x16AF0, 0x16AF4, WBP_Extend}, + {0x16B00, 0x16B2F, WBP_ALetter}, + {0x16B30, 0x16B36, WBP_Extend}, + {0x16B40, 0x16B43, WBP_ALetter}, + {0x16B50, 0x16B59, WBP_Numeric}, + {0x16B63, 0x16B77, WBP_ALetter}, + {0x16B7D, 0x16B8F, WBP_ALetter}, {0x16F00, 0x16F44, WBP_ALetter}, {0x16F50, 0x16F50, WBP_ALetter}, {0x16F51, 0x16F7E, WBP_Extend}, {0x16F8F, 0x16F92, WBP_Extend}, {0x16F93, 0x16F9F, WBP_ALetter}, {0x1B000, 0x1B000, WBP_Katakana}, + {0x1BC00, 0x1BC6A, WBP_ALetter}, + {0x1BC70, 0x1BC7C, WBP_ALetter}, + {0x1BC80, 0x1BC88, WBP_ALetter}, + {0x1BC90, 0x1BC99, WBP_ALetter}, + {0x1BC9D, 0x1BC9E, WBP_Extend}, + {0x1BCA0, 0x1BCA3, WBP_Format}, {0x1D165, 0x1D166, WBP_Extend}, {0x1D167, 0x1D169, WBP_Extend}, {0x1D16D, 0x1D172, WBP_Extend}, @@ -908,6 +1026,8 @@ static struct WordBreakProperties wb_prop_default[] = { {0x1D7AA, 0x1D7C2, WBP_ALetter}, {0x1D7C4, 0x1D7CB, WBP_ALetter}, {0x1D7CE, 0x1D7FF, WBP_Numeric}, + {0x1E800, 0x1E8C4, WBP_ALetter}, + {0x1E8D0, 0x1E8D6, WBP_Extend}, {0x1EE00, 0x1EE03, WBP_ALetter}, {0x1EE05, 0x1EE1F, WBP_ALetter}, {0x1EE21, 0x1EE22, WBP_ALetter}, @@ -941,7 +1061,10 @@ static struct WordBreakProperties wb_prop_default[] = { {0x1EEA1, 0x1EEA3, WBP_ALetter}, {0x1EEA5, 0x1EEA9, WBP_ALetter}, {0x1EEAB, 0x1EEBB, WBP_ALetter}, - {0x1F1E6, 0x1F1FF, WBP_Regional}, + {0x1F130, 0x1F149, WBP_ALetter}, + 
{0x1F150, 0x1F169, WBP_ALetter}, + {0x1F170, 0x1F189, WBP_ALetter}, + {0x1F1E6, 0x1F1FF, WBP_Regional_Indicator}, {0xE0001, 0xE0001, WBP_Format}, {0xE0020, 0xE007F, WBP_Format}, {0xE0100, 0xE01EF, WBP_Extend}, diff --git a/text/dali/internal/libunibreak/wordbreakdef.h b/text/dali/internal/libunibreak/wordbreakdef.h index 72816f9..7130a13 100644 --- a/text/dali/internal/libunibreak/wordbreakdef.h +++ b/text/dali/internal/libunibreak/wordbreakdef.h @@ -4,8 +4,7 @@ * Word breaking in a Unicode sequence. Designed to be used in a * generic text renderer. * - * Copyright (C) 2013 Tom Hacohen - * Copyright (C) 2013 Petr Filipsky + * Copyright (C) 2013-15 Tom Hacohen * * This software is provided 'as-is', without any express or implied * warranty. In no event will the author be held liable for any damages @@ -31,9 +30,8 @@ * Unicode 6.0.0: * * - * This library has been updated according to Revision 21, for - * Unicode 6.2.0: - * + * This library has been updated according to Revision 25, for + * Unicode 7.0.0: * * The Unicode Terms of Use are available at * @@ -45,11 +43,12 @@ * Definitions of internal data structures, declarations of global * variables, and function prototypes for the word breaking algorithm. * - * @version 2.4, 2013/11/10 + * @version 2.6, 2015/04/19 * @author Tom Hacohen - * @author Petr Filipsky */ +#include "unibreakdef.h" + /** * Word break classes. This is a direct mapping of Table 3 of Unicode * Standard Annex 29, Revision 23. @@ -61,18 +60,18 @@ enum WordBreakClass WBP_LF, WBP_Newline, WBP_Extend, + WBP_Regional_Indicator, WBP_Format, WBP_Katakana, + WBP_Hebrew_Letter, WBP_ALetter, + WBP_Single_Quote, + WBP_Double_Quote, WBP_MidNumLet, WBP_MidLetter, WBP_MidNum, WBP_Numeric, WBP_ExtendNumLet, - WBP_Regional, - WBP_Hebrew, - WBP_Single, - WBP_Double, WBP_Any }; -- 2.7.4