From: tasn Date: Mon, 12 Dec 2011 15:25:39 +0000 (+0000) Subject: Evas liblinebreak: Added the wordbreak support. X-Git-Tag: accepted/2.0/20130306.225542~155^2~97 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f9c3db4f7935d853202105f41719ec480eab63eb;p=profile%2Fivi%2Fevas.git Evas liblinebreak: Added the wordbreak support. Will send it upstream soon. My tests worked, but they are far from complete. Probably needs more complete testing. git-svn-id: svn+ssh://svn.enlightenment.org/var/svn/e/trunk/evas@66118 7cbeb6ba-43b4-40fd-8cce-4c39aea84d33 --- diff --git a/src/static_deps/liblinebreak/AUTHORS b/src/static_deps/liblinebreak/AUTHORS index 523106f..22786d4 100644 --- a/src/static_deps/liblinebreak/AUTHORS +++ b/src/static_deps/liblinebreak/AUTHORS @@ -4,3 +4,5 @@ Nikolay Pultsin. Put forward the original requirements on liblinebreak, performed tests, and made a lot of suggestions on the initial versions. Thomas Klausner. Autoconfiscated and libtoolized liblinebreak. + +Tom Hacohen. Added word boundaries support. diff --git a/src/static_deps/liblinebreak/Makefile.am b/src/static_deps/liblinebreak/Makefile.am index f386455..50328e9 100644 --- a/src/static_deps/liblinebreak/Makefile.am +++ b/src/static_deps/liblinebreak/Makefile.am @@ -7,7 +7,9 @@ noinst_LTLIBRARIES = liblinebreak.la liblinebreak_la_SOURCES = \ linebreak.c \ linebreakdata.c \ - linebreakdef.c + linebreakdef.c \ + wordbreak.c \ + wordbreakdata.x EXTRA_DIST = \ LineBreak1.sed \ diff --git a/src/static_deps/liblinebreak/gen_wordbreak_data.sh b/src/static_deps/liblinebreak/gen_wordbreak_data.sh new file mode 100755 index 0000000..7c84de2 --- /dev/null +++ b/src/static_deps/liblinebreak/gen_wordbreak_data.sh @@ -0,0 +1,25 @@ +#!/bin/sh +FNAME="WordBreakProperty.txt"; +if [ ! 
-f ${FNAME} ]; then + wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt +fi + +# +sed -n 's/\(^[0-9A-F.]\+\)/\1/p' ${FNAME} > tmp.txt +sed -i 's/^\([0-9A-F]\+\)\s\+/\1..\1/' tmp.txt +sort --numeric-sort tmp.txt > tmp2.txt +./sort_numeric_hex.py tmp2.txt > tmp.txt +rm tmp2.txt +sed -i -n 's/^\([0-9A-F]\+\)..\([0-9A-F]\+\)\s*;\s*\([A-Za-z]\+\).*$/\t{0x\1, 0x\2, WBP_\3},/p' tmp.txt + +echo "/* The content of this file is generated from:" > wordbreakdata.x +head -2 ${FNAME} >> wordbreakdata.x +echo "*/" >> wordbreakdata.x +echo '#include "linebreak.h"' >> wordbreakdata.x +echo '#include "wordbreakdef.h"' >> wordbreakdata.x +echo "static struct WordBreakProperties wb_prop_default[] = {" >> wordbreakdata.x +cat tmp.txt >> wordbreakdata.x +echo " {0xFFFFFFFF, 0xFFFFFFFF, WBP_Undefined}" >> wordbreakdata.x +echo "};" >> wordbreakdata.x +rm tmp.txt + diff --git a/src/static_deps/liblinebreak/sort_numeric_hex.py b/src/static_deps/liblinebreak/sort_numeric_hex.py new file mode 100755 index 0000000..a16d0f3 --- /dev/null +++ b/src/static_deps/liblinebreak/sort_numeric_hex.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python2 +import sys + +lines = open(sys.argv[1]).readlines() +lines_out = sorted(lines, key=lambda line: int(line.split("..")[0], 16)) +map(sys.stdout.write, lines_out) diff --git a/src/static_deps/liblinebreak/wordbreak.c b/src/static_deps/liblinebreak/wordbreak.c new file mode 100644 index 0000000..13fd2c7 --- /dev/null +++ b/src/static_deps/liblinebreak/wordbreak.c @@ -0,0 +1,435 @@ +/* vim: set tabstop=4 shiftwidth=4: */ + +/* + * Word breaking in a Unicode sequence. Designed to be used in a + * generic text renderer. + * + * Copyright (C) 2011-2011 Tom Hacohen + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. 
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute + * it freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must + * not claim that you wrote the original software. If you use this + * software in a product, an acknowledgement in the product + * documentation would be appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must + * not be misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source + * distribution. + * + * The main reference is Unicode Standard Annex 29 (UAX #29): + * + * + * When this library was designed, this annex was at Revision 17, for + * Unicode 6.0.0: + * + * + * The Unicode Terms of Use are available at + * + */ + +/** + * @file wordbreak.c + * + * Implementation of the word breaking algorithm as described in Unicode + * Standard Annex 29. + * + * @version 2.0, 2011/12/12 + * @author Tom Hacohen + */ + + +#include +#include +#include +#include "linebreak.h" +#include "linebreakdef.h" + +#include "wordbreak.h" +#include "wordbreakdata.x" + +#define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0])) + +/* Init the wordbreak internals. */ +void init_wordbreak(void) +{ + /* Currently does nothing, may be needed in the future. */ + return; +} + +/** + * Gets the word breaking class of a character. + * + * @param ch character to check + * @param wbp pointer to the wbp breaking properties array + * @param len the size of the wbp array in number of items. 
+ * @return the word breaking class if found; \c WBP_Any otherwise + */ +static enum WordBreakClass get_char_wb_class( + utf32_t ch, + struct WordBreakProperties *wbp, + size_t len) +{ + size_t min = 0; + size_t max = len - 1; + size_t mid; + + do + { + mid = (min + max) / 2; + + if (ch < wbp[mid].start) + max = mid - 1; + else if (ch > wbp[mid].end) + min = mid + 1; + else + return wbp[mid].prop; + } + while (min <= max); + + return WBP_Any; +} + +/** + * Sets the break types in brks starting from posLast up to posStop. + * + * It sets the inside chars to #WORDBREAK_INSIDECHAR and the rest to brkType. + * Assumes brks is initialized - all the cells with #WORDBREAK_NOBREAK are + * cells that we really don't want to break after. + * + * @param s the string + * @param brks[out] the breaks array to fill. + * @param posStart the start position + * @param posEnd the end position + * @param len the length of the string + * @param brkType the breaks type to use + * @param get_next_char function to get the next UTF-32 character + */ +static void set_brks_to(const void *s, + char *brks, + size_t posStart, + size_t posEnd, + size_t len, + char brkType, + get_next_char_t get_next_char) +{ + size_t posCur = posStart; + while (posCur < posEnd) + { + get_next_char(s, len, &posCur); + for ( ; posStart < posCur - 1; ++posStart) + { + brks[posStart] = WORDBREAK_INSIDECHAR; + } + assert(posStart == posCur - 1); + + /* Only set it if we haven't set it not to break before. */ + if (brks[posStart] != WORDBREAK_NOBREAK) + brks[posStart] = brkType; + posStart = posCur; + } +} + +/* Checks to see if newline, cr, or lf. for WB3a and b */ +#define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \ + (cls == WBP_LF)) + +/** + * Sets the word breaking information for a generic input string. 
+ * + * @param[in] s input string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDEACHAR + * @param[in] get_next_char function to get the next UTF-32 character + */ +static void set_wordbreaks( + const void *s, + size_t len, + const char *lang, + char *brks, + get_next_char_t get_next_char) +{ + /* Previous class */ + enum WordBreakClass p_cls = WBP_Undefined; + /* Strong previous class. */ + enum WordBreakClass sp_cls = WBP_Undefined; + utf32_t ch; + size_t posCur = 0; + size_t posCurSt = 0; + size_t posLast = 0; + + /* FIXME: unused atm. */ + (void) lang; + + + /* Init brks */ + memset(brks, WORDBREAK_BREAK, len); + + ch = get_next_char(s, len, &posCur); + + /* WB3a, WB3b are implied. */ + for ( ; ch != EOS ; ) + { + /* Current class */ + enum WordBreakClass c_cls; + c_cls = get_char_wb_class(ch, wb_prop_default, + ARRAY_LEN(wb_prop_default)); + + switch (c_cls) + { + case WBP_CR: + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + sp_cls = c_cls; + posLast = posCurSt; + break; + + case WBP_LF: + if (sp_cls == WBP_CR) /* WB3 */ + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, + get_next_char); + sp_cls = c_cls; + posLast = posCurSt; + } + sp_cls = c_cls; + posLast = posCurSt; + break; + + case WBP_Newline: + /* WB3a, WB3b */ + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + sp_cls = c_cls; + posLast = posCurSt; + break; + + case WBP_Extend: + case WBP_Format: + /* WB4 - If not the first char/after a newline (W3ab), + * skip this class, set it to be the same as the prev, and mark + * brks not to break before them. 
*/ + if ((sp_cls == WBP_Undefined) || IS_WB3ab(sp_cls)) + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + sp_cls = c_cls; + } + else + { + /* It's surely not the first */ + brks[posCurSt - 1] = WORDBREAK_NOBREAK; + /* "inherit" the previous class. */ + c_cls = p_cls; + } + break; + + case WBP_Katakana: + if ((sp_cls == WBP_Katakana) || /* WB13 */ + (sp_cls == WBP_ExtendNumLet)) /* WB13b */ + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, + get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + } + sp_cls = c_cls; + posLast = posCurSt; + break; + + case WBP_ALetter: + if ((sp_cls == WBP_ALetter) || /* WB5,6,7 */ + ((sp_cls == WBP_Numeric) && (p_cls == WBP_Numeric)) || /* WB10 */ + (sp_cls == WBP_ExtendNumLet)) /* WB13b */ + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, + get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + } + sp_cls = c_cls; + posLast = posCurSt; + break; + + case WBP_MidNumLet: + if ((p_cls == WBP_ALetter) || /* WBP6,7 */ + (p_cls == WBP_Numeric)) /* WBP11,12 */ + { + /* Go on */ + } + else + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + sp_cls = c_cls; + posLast = posCurSt; + } + break; + + case WBP_MidLetter: + if (p_cls == WBP_ALetter) /* WBP6,7 */ + { + /* Go on */ + } + else + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + sp_cls = c_cls; + posLast = posCurSt; + } + break; + + case WBP_MidNum: + if (p_cls == WBP_Numeric) /* WBP11,12 */ + { + /* Go on */ + } + else + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + sp_cls = c_cls; + posLast = posCurSt; + } + break; + + case WBP_Numeric: + if ((sp_cls == WBP_Numeric) || /* WB8,11,12 */ + ((sp_cls == WBP_ALetter) && (p_cls == 
WBP_ALetter)) || /* WB9 */ + (sp_cls == WBP_ExtendNumLet)) /* WB13b */ + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, + get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + } + sp_cls = c_cls; + posLast = posCurSt; + break; + + case WBP_ExtendNumLet: + /* WB13a,13b */ + if ((sp_cls == p_cls) && + ((p_cls == WBP_ALetter) || + (p_cls == WBP_Numeric) || + (p_cls == WBP_Katakana) || + (p_cls == WBP_ExtendNumLet))) + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK, + get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + } + sp_cls = c_cls; + posLast = posCurSt; + break; + + case WBP_Any: + /* Allow breaks and reset */ + set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK, + get_next_char); + sp_cls = c_cls; + posLast = posCurSt; + break; + + default: + /* Error, should never get here! */ + assert(0); + break; + } + + p_cls = c_cls; + posCurSt = posCur; + ch = get_next_char(s, len, &posCur); + } + + /* WB2 */ + set_brks_to(s, brks, posLast, posCur, len, WORDBREAK_BREAK, + get_next_char); +} + +/** + * Sets the word breaking information for a UTF-8 input string. + * + * @param[in] s input UTF-8 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDEACHAR + */ +void set_wordbreaks_utf8( + const utf8_t *s, + size_t len, + const char *lang, + char *brks) +{ + set_wordbreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf8); +} + +/** + * Sets the word breaking information for a UTF-16 input string. 
+ * + * @param[in] s input UTF-16 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDECHAR + */ +void set_wordbreaks_utf16( + const utf16_t *s, + size_t len, + const char *lang, + char *brks) +{ + set_wordbreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf16); +} + +/** + * Sets the word breaking information for a UTF-32 input string. + * + * @param[in] s input UTF-32 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDECHAR + */ +void set_wordbreaks_utf32( + const utf32_t *s, + size_t len, + const char *lang, + char *brks) +{ + set_wordbreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf32); +} diff --git a/src/static_deps/liblinebreak/wordbreak.h b/src/static_deps/liblinebreak/wordbreak.h new file mode 100644 index 0000000..7b7bea7 --- /dev/null +++ b/src/static_deps/liblinebreak/wordbreak.h @@ -0,0 +1,72 @@ +/* vim: set tabstop=4 shiftwidth=4: */ + +/* + * Word breaking in a Unicode sequence. Designed to be used in a + * generic text renderer. + * + * Copyright (C) 2011-2011 Tom Hacohen + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute + * it freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must + * not claim that you wrote the original software. 
If you use this + * software in a product, an acknowledgement in the product + * documentation would be appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must + * not be misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source + * distribution. + * + * The main reference is Unicode Standard Annex 29 (UAX #29): + * + * + * When this library was designed, this annex was at Revision 17, for + * Unicode 6.0.0: + * + * + * The Unicode Terms of Use are available at + * + */ + +/** + * @file wordbreak.h + * + * Header file for the word breaking (segmentation) algorithm. + * + * @version 2.0, 2011/12/12 + * @author Tom Hacohen + */ + +#ifndef WORDBREAK_H +#define WORDBREAK_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define WORDBREAK_BREAK 0 /* Break found */ +#define WORDBREAK_NOBREAK 1 /**< Break not found */ +#define WORDBREAK_INSIDECHAR 2 /**< A UTF-8/16 sequence is unfinished */ + +void init_wordbreak(void); +void set_wordbreaks_utf8( + const utf8_t *s, size_t len, const char* lang, char *brks); +void set_wordbreaks_utf16( + const utf16_t *s, size_t len, const char* lang, char *brks); +void set_wordbreaks_utf32( + const utf32_t *s, size_t len, const char* lang, char *brks); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/static_deps/liblinebreak/wordbreakdata.x b/src/static_deps/liblinebreak/wordbreakdata.x new file mode 100644 index 0000000..c7278ef --- /dev/null +++ b/src/static_deps/liblinebreak/wordbreakdata.x @@ -0,0 +1,858 @@ +/* The content of this file is generated from: +# WordBreakProperty-6.0.0.txt +# Date: 2010-08-19, 00:48:48 GMT [MD] +*/ +#include "linebreak.h" +#include "wordbreakdef.h" +static struct WordBreakProperties wb_prop_default[] = { + {0x000A, 0x000A, WBP_LF}, + {0x000B, 0x000C, WBP_Newline}, + {0x000D, 0x000D, WBP_CR}, + {0x0027, 0x0027, WBP_MidNumLet}, + {0x002C, 0x002C, WBP_MidNum}, + {0x002E, 
0x002E, WBP_MidNumLet}, + {0x0030, 0x0039, WBP_Numeric}, + {0x003A, 0x003A, WBP_MidLetter}, + {0x003B, 0x003B, WBP_MidNum}, + {0x0041, 0x005A, WBP_ALetter}, + {0x005F, 0x005F, WBP_ExtendNumLet}, + {0x0061, 0x007A, WBP_ALetter}, + {0x0085, 0x0085, WBP_Newline}, + {0x00AA, 0x00AA, WBP_ALetter}, + {0x00AD, 0x00AD, WBP_Format}, + {0x00B5, 0x00B5, WBP_ALetter}, + {0x00B7, 0x00B7, WBP_MidLetter}, + {0x00BA, 0x00BA, WBP_ALetter}, + {0x00C0, 0x00D6, WBP_ALetter}, + {0x00D8, 0x00F6, WBP_ALetter}, + {0x00F8, 0x01BA, WBP_ALetter}, + {0x01BB, 0x01BB, WBP_ALetter}, + {0x01BC, 0x01BF, WBP_ALetter}, + {0x01C0, 0x01C3, WBP_ALetter}, + {0x01C4, 0x0293, WBP_ALetter}, + {0x0294, 0x0294, WBP_ALetter}, + {0x0295, 0x02AF, WBP_ALetter}, + {0x02B0, 0x02C1, WBP_ALetter}, + {0x02C6, 0x02D1, WBP_ALetter}, + {0x02E0, 0x02E4, WBP_ALetter}, + {0x02EC, 0x02EC, WBP_ALetter}, + {0x02EE, 0x02EE, WBP_ALetter}, + {0x0300, 0x036F, WBP_Extend}, + {0x0370, 0x0373, WBP_ALetter}, + {0x0374, 0x0374, WBP_ALetter}, + {0x0376, 0x0377, WBP_ALetter}, + {0x037A, 0x037A, WBP_ALetter}, + {0x037B, 0x037D, WBP_ALetter}, + {0x037E, 0x037E, WBP_MidNum}, + {0x0386, 0x0386, WBP_ALetter}, + {0x0387, 0x0387, WBP_MidLetter}, + {0x0388, 0x038A, WBP_ALetter}, + {0x038C, 0x038C, WBP_ALetter}, + {0x038E, 0x03A1, WBP_ALetter}, + {0x03A3, 0x03F5, WBP_ALetter}, + {0x03F7, 0x0481, WBP_ALetter}, + {0x0483, 0x0487, WBP_Extend}, + {0x0488, 0x0489, WBP_Extend}, + {0x048A, 0x0527, WBP_ALetter}, + {0x0531, 0x0556, WBP_ALetter}, + {0x0559, 0x0559, WBP_ALetter}, + {0x0561, 0x0587, WBP_ALetter}, + {0x0589, 0x0589, WBP_MidNum}, + {0x0591, 0x05BD, WBP_Extend}, + {0x05BF, 0x05BF, WBP_Extend}, + {0x05C1, 0x05C2, WBP_Extend}, + {0x05C4, 0x05C5, WBP_Extend}, + {0x05C7, 0x05C7, WBP_Extend}, + {0x05D0, 0x05EA, WBP_ALetter}, + {0x05F0, 0x05F2, WBP_ALetter}, + {0x05F3, 0x05F3, WBP_ALetter}, + {0x05F4, 0x05F4, WBP_MidLetter}, + {0x0600, 0x0603, WBP_Format}, + {0x060C, 0x060D, WBP_MidNum}, + {0x0610, 0x061A, WBP_Extend}, + {0x0620, 0x063F, 
WBP_ALetter}, + {0x0640, 0x0640, WBP_ALetter}, + {0x0641, 0x064A, WBP_ALetter}, + {0x064B, 0x065F, WBP_Extend}, + {0x0660, 0x0669, WBP_Numeric}, + {0x066B, 0x066B, WBP_Numeric}, + {0x066C, 0x066C, WBP_MidNum}, + {0x066E, 0x066F, WBP_ALetter}, + {0x0670, 0x0670, WBP_Extend}, + {0x0671, 0x06D3, WBP_ALetter}, + {0x06D5, 0x06D5, WBP_ALetter}, + {0x06D6, 0x06DC, WBP_Extend}, + {0x06DD, 0x06DD, WBP_Format}, + {0x06DF, 0x06E4, WBP_Extend}, + {0x06E5, 0x06E6, WBP_ALetter}, + {0x06E7, 0x06E8, WBP_Extend}, + {0x06EA, 0x06ED, WBP_Extend}, + {0x06EE, 0x06EF, WBP_ALetter}, + {0x06F0, 0x06F9, WBP_Numeric}, + {0x06FA, 0x06FC, WBP_ALetter}, + {0x06FF, 0x06FF, WBP_ALetter}, + {0x070F, 0x070F, WBP_Format}, + {0x0710, 0x0710, WBP_ALetter}, + {0x0711, 0x0711, WBP_Extend}, + {0x0712, 0x072F, WBP_ALetter}, + {0x0730, 0x074A, WBP_Extend}, + {0x074D, 0x07A5, WBP_ALetter}, + {0x07A6, 0x07B0, WBP_Extend}, + {0x07B1, 0x07B1, WBP_ALetter}, + {0x07C0, 0x07C9, WBP_Numeric}, + {0x07CA, 0x07EA, WBP_ALetter}, + {0x07EB, 0x07F3, WBP_Extend}, + {0x07F4, 0x07F5, WBP_ALetter}, + {0x07F8, 0x07F8, WBP_MidNum}, + {0x07FA, 0x07FA, WBP_ALetter}, + {0x0800, 0x0815, WBP_ALetter}, + {0x0816, 0x0819, WBP_Extend}, + {0x081A, 0x081A, WBP_ALetter}, + {0x081B, 0x0823, WBP_Extend}, + {0x0824, 0x0824, WBP_ALetter}, + {0x0825, 0x0827, WBP_Extend}, + {0x0828, 0x0828, WBP_ALetter}, + {0x0829, 0x082D, WBP_Extend}, + {0x0840, 0x0858, WBP_ALetter}, + {0x0859, 0x085B, WBP_Extend}, + {0x0900, 0x0902, WBP_Extend}, + {0x0903, 0x0903, WBP_Extend}, + {0x0904, 0x0939, WBP_ALetter}, + {0x093A, 0x093A, WBP_Extend}, + {0x093B, 0x093B, WBP_Extend}, + {0x093C, 0x093C, WBP_Extend}, + {0x093D, 0x093D, WBP_ALetter}, + {0x093E, 0x0940, WBP_Extend}, + {0x0941, 0x0948, WBP_Extend}, + {0x0949, 0x094C, WBP_Extend}, + {0x094D, 0x094D, WBP_Extend}, + {0x094E, 0x094F, WBP_Extend}, + {0x0950, 0x0950, WBP_ALetter}, + {0x0951, 0x0957, WBP_Extend}, + {0x0958, 0x0961, WBP_ALetter}, + {0x0962, 0x0963, WBP_Extend}, + {0x0966, 0x096F, WBP_Numeric}, + 
{0x0971, 0x0971, WBP_ALetter}, + {0x0972, 0x0977, WBP_ALetter}, + {0x0979, 0x097F, WBP_ALetter}, + {0x0981, 0x0981, WBP_Extend}, + {0x0982, 0x0983, WBP_Extend}, + {0x0985, 0x098C, WBP_ALetter}, + {0x098F, 0x0990, WBP_ALetter}, + {0x0993, 0x09A8, WBP_ALetter}, + {0x09AA, 0x09B0, WBP_ALetter}, + {0x09B2, 0x09B2, WBP_ALetter}, + {0x09B6, 0x09B9, WBP_ALetter}, + {0x09BC, 0x09BC, WBP_Extend}, + {0x09BD, 0x09BD, WBP_ALetter}, + {0x09BE, 0x09C0, WBP_Extend}, + {0x09C1, 0x09C4, WBP_Extend}, + {0x09C7, 0x09C8, WBP_Extend}, + {0x09CB, 0x09CC, WBP_Extend}, + {0x09CD, 0x09CD, WBP_Extend}, + {0x09CE, 0x09CE, WBP_ALetter}, + {0x09D7, 0x09D7, WBP_Extend}, + {0x09DC, 0x09DD, WBP_ALetter}, + {0x09DF, 0x09E1, WBP_ALetter}, + {0x09E2, 0x09E3, WBP_Extend}, + {0x09E6, 0x09EF, WBP_Numeric}, + {0x09F0, 0x09F1, WBP_ALetter}, + {0x0A01, 0x0A02, WBP_Extend}, + {0x0A03, 0x0A03, WBP_Extend}, + {0x0A05, 0x0A0A, WBP_ALetter}, + {0x0A0F, 0x0A10, WBP_ALetter}, + {0x0A13, 0x0A28, WBP_ALetter}, + {0x0A2A, 0x0A30, WBP_ALetter}, + {0x0A32, 0x0A33, WBP_ALetter}, + {0x0A35, 0x0A36, WBP_ALetter}, + {0x0A38, 0x0A39, WBP_ALetter}, + {0x0A3C, 0x0A3C, WBP_Extend}, + {0x0A3E, 0x0A40, WBP_Extend}, + {0x0A41, 0x0A42, WBP_Extend}, + {0x0A47, 0x0A48, WBP_Extend}, + {0x0A4B, 0x0A4D, WBP_Extend}, + {0x0A51, 0x0A51, WBP_Extend}, + {0x0A59, 0x0A5C, WBP_ALetter}, + {0x0A5E, 0x0A5E, WBP_ALetter}, + {0x0A66, 0x0A6F, WBP_Numeric}, + {0x0A70, 0x0A71, WBP_Extend}, + {0x0A72, 0x0A74, WBP_ALetter}, + {0x0A75, 0x0A75, WBP_Extend}, + {0x0A81, 0x0A82, WBP_Extend}, + {0x0A83, 0x0A83, WBP_Extend}, + {0x0A85, 0x0A8D, WBP_ALetter}, + {0x0A8F, 0x0A91, WBP_ALetter}, + {0x0A93, 0x0AA8, WBP_ALetter}, + {0x0AAA, 0x0AB0, WBP_ALetter}, + {0x0AB2, 0x0AB3, WBP_ALetter}, + {0x0AB5, 0x0AB9, WBP_ALetter}, + {0x0ABC, 0x0ABC, WBP_Extend}, + {0x0ABD, 0x0ABD, WBP_ALetter}, + {0x0ABE, 0x0AC0, WBP_Extend}, + {0x0AC1, 0x0AC5, WBP_Extend}, + {0x0AC7, 0x0AC8, WBP_Extend}, + {0x0AC9, 0x0AC9, WBP_Extend}, + {0x0ACB, 0x0ACC, WBP_Extend}, + {0x0ACD, 
0x0ACD, WBP_Extend}, + {0x0AD0, 0x0AD0, WBP_ALetter}, + {0x0AE0, 0x0AE1, WBP_ALetter}, + {0x0AE2, 0x0AE3, WBP_Extend}, + {0x0AE6, 0x0AEF, WBP_Numeric}, + {0x0B01, 0x0B01, WBP_Extend}, + {0x0B02, 0x0B03, WBP_Extend}, + {0x0B05, 0x0B0C, WBP_ALetter}, + {0x0B0F, 0x0B10, WBP_ALetter}, + {0x0B13, 0x0B28, WBP_ALetter}, + {0x0B2A, 0x0B30, WBP_ALetter}, + {0x0B32, 0x0B33, WBP_ALetter}, + {0x0B35, 0x0B39, WBP_ALetter}, + {0x0B3C, 0x0B3C, WBP_Extend}, + {0x0B3D, 0x0B3D, WBP_ALetter}, + {0x0B3E, 0x0B3E, WBP_Extend}, + {0x0B3F, 0x0B3F, WBP_Extend}, + {0x0B40, 0x0B40, WBP_Extend}, + {0x0B41, 0x0B44, WBP_Extend}, + {0x0B47, 0x0B48, WBP_Extend}, + {0x0B4B, 0x0B4C, WBP_Extend}, + {0x0B4D, 0x0B4D, WBP_Extend}, + {0x0B56, 0x0B56, WBP_Extend}, + {0x0B57, 0x0B57, WBP_Extend}, + {0x0B5C, 0x0B5D, WBP_ALetter}, + {0x0B5F, 0x0B61, WBP_ALetter}, + {0x0B62, 0x0B63, WBP_Extend}, + {0x0B66, 0x0B6F, WBP_Numeric}, + {0x0B71, 0x0B71, WBP_ALetter}, + {0x0B82, 0x0B82, WBP_Extend}, + {0x0B83, 0x0B83, WBP_ALetter}, + {0x0B85, 0x0B8A, WBP_ALetter}, + {0x0B8E, 0x0B90, WBP_ALetter}, + {0x0B92, 0x0B95, WBP_ALetter}, + {0x0B99, 0x0B9A, WBP_ALetter}, + {0x0B9C, 0x0B9C, WBP_ALetter}, + {0x0B9E, 0x0B9F, WBP_ALetter}, + {0x0BA3, 0x0BA4, WBP_ALetter}, + {0x0BA8, 0x0BAA, WBP_ALetter}, + {0x0BAE, 0x0BB9, WBP_ALetter}, + {0x0BBE, 0x0BBF, WBP_Extend}, + {0x0BC0, 0x0BC0, WBP_Extend}, + {0x0BC1, 0x0BC2, WBP_Extend}, + {0x0BC6, 0x0BC8, WBP_Extend}, + {0x0BCA, 0x0BCC, WBP_Extend}, + {0x0BCD, 0x0BCD, WBP_Extend}, + {0x0BD0, 0x0BD0, WBP_ALetter}, + {0x0BD7, 0x0BD7, WBP_Extend}, + {0x0BE6, 0x0BEF, WBP_Numeric}, + {0x0C01, 0x0C03, WBP_Extend}, + {0x0C05, 0x0C0C, WBP_ALetter}, + {0x0C0E, 0x0C10, WBP_ALetter}, + {0x0C12, 0x0C28, WBP_ALetter}, + {0x0C2A, 0x0C33, WBP_ALetter}, + {0x0C35, 0x0C39, WBP_ALetter}, + {0x0C3D, 0x0C3D, WBP_ALetter}, + {0x0C3E, 0x0C40, WBP_Extend}, + {0x0C41, 0x0C44, WBP_Extend}, + {0x0C46, 0x0C48, WBP_Extend}, + {0x0C4A, 0x0C4D, WBP_Extend}, + {0x0C55, 0x0C56, WBP_Extend}, + {0x0C58, 0x0C59, 
WBP_ALetter}, + {0x0C60, 0x0C61, WBP_ALetter}, + {0x0C62, 0x0C63, WBP_Extend}, + {0x0C66, 0x0C6F, WBP_Numeric}, + {0x0C82, 0x0C83, WBP_Extend}, + {0x0C85, 0x0C8C, WBP_ALetter}, + {0x0C8E, 0x0C90, WBP_ALetter}, + {0x0C92, 0x0CA8, WBP_ALetter}, + {0x0CAA, 0x0CB3, WBP_ALetter}, + {0x0CB5, 0x0CB9, WBP_ALetter}, + {0x0CBC, 0x0CBC, WBP_Extend}, + {0x0CBD, 0x0CBD, WBP_ALetter}, + {0x0CBE, 0x0CBE, WBP_Extend}, + {0x0CBF, 0x0CBF, WBP_Extend}, + {0x0CC0, 0x0CC4, WBP_Extend}, + {0x0CC6, 0x0CC6, WBP_Extend}, + {0x0CC7, 0x0CC8, WBP_Extend}, + {0x0CCA, 0x0CCB, WBP_Extend}, + {0x0CCC, 0x0CCD, WBP_Extend}, + {0x0CD5, 0x0CD6, WBP_Extend}, + {0x0CDE, 0x0CDE, WBP_ALetter}, + {0x0CE0, 0x0CE1, WBP_ALetter}, + {0x0CE2, 0x0CE3, WBP_Extend}, + {0x0CE6, 0x0CEF, WBP_Numeric}, + {0x0CF1, 0x0CF2, WBP_ALetter}, + {0x0D02, 0x0D03, WBP_Extend}, + {0x0D05, 0x0D0C, WBP_ALetter}, + {0x0D0E, 0x0D10, WBP_ALetter}, + {0x0D12, 0x0D3A, WBP_ALetter}, + {0x0D3D, 0x0D3D, WBP_ALetter}, + {0x0D3E, 0x0D40, WBP_Extend}, + {0x0D41, 0x0D44, WBP_Extend}, + {0x0D46, 0x0D48, WBP_Extend}, + {0x0D4A, 0x0D4C, WBP_Extend}, + {0x0D4D, 0x0D4D, WBP_Extend}, + {0x0D4E, 0x0D4E, WBP_ALetter}, + {0x0D57, 0x0D57, WBP_Extend}, + {0x0D60, 0x0D61, WBP_ALetter}, + {0x0D62, 0x0D63, WBP_Extend}, + {0x0D66, 0x0D6F, WBP_Numeric}, + {0x0D7A, 0x0D7F, WBP_ALetter}, + {0x0D82, 0x0D83, WBP_Extend}, + {0x0D85, 0x0D96, WBP_ALetter}, + {0x0D9A, 0x0DB1, WBP_ALetter}, + {0x0DB3, 0x0DBB, WBP_ALetter}, + {0x0DBD, 0x0DBD, WBP_ALetter}, + {0x0DC0, 0x0DC6, WBP_ALetter}, + {0x0DCA, 0x0DCA, WBP_Extend}, + {0x0DCF, 0x0DD1, WBP_Extend}, + {0x0DD2, 0x0DD4, WBP_Extend}, + {0x0DD6, 0x0DD6, WBP_Extend}, + {0x0DD8, 0x0DDF, WBP_Extend}, + {0x0DF2, 0x0DF3, WBP_Extend}, + {0x0E31, 0x0E31, WBP_Extend}, + {0x0E34, 0x0E3A, WBP_Extend}, + {0x0E47, 0x0E4E, WBP_Extend}, + {0x0E50, 0x0E59, WBP_Numeric}, + {0x0EB1, 0x0EB1, WBP_Extend}, + {0x0EB4, 0x0EB9, WBP_Extend}, + {0x0EBB, 0x0EBC, WBP_Extend}, + {0x0EC8, 0x0ECD, WBP_Extend}, + {0x0ED0, 0x0ED9, WBP_Numeric}, + 
{0x0F00, 0x0F00, WBP_ALetter}, + {0x0F18, 0x0F19, WBP_Extend}, + {0x0F20, 0x0F29, WBP_Numeric}, + {0x0F35, 0x0F35, WBP_Extend}, + {0x0F37, 0x0F37, WBP_Extend}, + {0x0F39, 0x0F39, WBP_Extend}, + {0x0F3E, 0x0F3F, WBP_Extend}, + {0x0F40, 0x0F47, WBP_ALetter}, + {0x0F49, 0x0F6C, WBP_ALetter}, + {0x0F71, 0x0F7E, WBP_Extend}, + {0x0F7F, 0x0F7F, WBP_Extend}, + {0x0F80, 0x0F84, WBP_Extend}, + {0x0F86, 0x0F87, WBP_Extend}, + {0x0F88, 0x0F8C, WBP_ALetter}, + {0x0F8D, 0x0F97, WBP_Extend}, + {0x0F99, 0x0FBC, WBP_Extend}, + {0x0FC6, 0x0FC6, WBP_Extend}, + {0x102B, 0x102C, WBP_Extend}, + {0x102D, 0x1030, WBP_Extend}, + {0x1031, 0x1031, WBP_Extend}, + {0x1032, 0x1037, WBP_Extend}, + {0x1038, 0x1038, WBP_Extend}, + {0x1039, 0x103A, WBP_Extend}, + {0x103B, 0x103C, WBP_Extend}, + {0x103D, 0x103E, WBP_Extend}, + {0x1040, 0x1049, WBP_Numeric}, + {0x1056, 0x1057, WBP_Extend}, + {0x1058, 0x1059, WBP_Extend}, + {0x105E, 0x1060, WBP_Extend}, + {0x1062, 0x1064, WBP_Extend}, + {0x1067, 0x106D, WBP_Extend}, + {0x1071, 0x1074, WBP_Extend}, + {0x1082, 0x1082, WBP_Extend}, + {0x1083, 0x1084, WBP_Extend}, + {0x1085, 0x1086, WBP_Extend}, + {0x1087, 0x108C, WBP_Extend}, + {0x108D, 0x108D, WBP_Extend}, + {0x108F, 0x108F, WBP_Extend}, + {0x1090, 0x1099, WBP_Numeric}, + {0x109A, 0x109C, WBP_Extend}, + {0x109D, 0x109D, WBP_Extend}, + {0x10A0, 0x10C5, WBP_ALetter}, + {0x10D0, 0x10FA, WBP_ALetter}, + {0x10FC, 0x10FC, WBP_ALetter}, + {0x1100, 0x1248, WBP_ALetter}, + {0x124A, 0x124D, WBP_ALetter}, + {0x1250, 0x1256, WBP_ALetter}, + {0x1258, 0x1258, WBP_ALetter}, + {0x125A, 0x125D, WBP_ALetter}, + {0x1260, 0x1288, WBP_ALetter}, + {0x128A, 0x128D, WBP_ALetter}, + {0x1290, 0x12B0, WBP_ALetter}, + {0x12B2, 0x12B5, WBP_ALetter}, + {0x12B8, 0x12BE, WBP_ALetter}, + {0x12C0, 0x12C0, WBP_ALetter}, + {0x12C2, 0x12C5, WBP_ALetter}, + {0x12C8, 0x12D6, WBP_ALetter}, + {0x12D8, 0x1310, WBP_ALetter}, + {0x1312, 0x1315, WBP_ALetter}, + {0x1318, 0x135A, WBP_ALetter}, + {0x135D, 0x135F, WBP_Extend}, + {0x1380, 0x138F, 
WBP_ALetter}, + {0x13A0, 0x13F4, WBP_ALetter}, + {0x1401, 0x166C, WBP_ALetter}, + {0x166F, 0x167F, WBP_ALetter}, + {0x1681, 0x169A, WBP_ALetter}, + {0x16A0, 0x16EA, WBP_ALetter}, + {0x16EE, 0x16F0, WBP_ALetter}, + {0x1700, 0x170C, WBP_ALetter}, + {0x170E, 0x1711, WBP_ALetter}, + {0x1712, 0x1714, WBP_Extend}, + {0x1720, 0x1731, WBP_ALetter}, + {0x1732, 0x1734, WBP_Extend}, + {0x1740, 0x1751, WBP_ALetter}, + {0x1752, 0x1753, WBP_Extend}, + {0x1760, 0x176C, WBP_ALetter}, + {0x176E, 0x1770, WBP_ALetter}, + {0x1772, 0x1773, WBP_Extend}, + {0x17B4, 0x17B5, WBP_Format}, + {0x17B6, 0x17B6, WBP_Extend}, + {0x17B7, 0x17BD, WBP_Extend}, + {0x17BE, 0x17C5, WBP_Extend}, + {0x17C6, 0x17C6, WBP_Extend}, + {0x17C7, 0x17C8, WBP_Extend}, + {0x17C9, 0x17D3, WBP_Extend}, + {0x17DD, 0x17DD, WBP_Extend}, + {0x17E0, 0x17E9, WBP_Numeric}, + {0x180B, 0x180D, WBP_Extend}, + {0x1810, 0x1819, WBP_Numeric}, + {0x1820, 0x1842, WBP_ALetter}, + {0x1843, 0x1843, WBP_ALetter}, + {0x1844, 0x1877, WBP_ALetter}, + {0x1880, 0x18A8, WBP_ALetter}, + {0x18A9, 0x18A9, WBP_Extend}, + {0x18AA, 0x18AA, WBP_ALetter}, + {0x18B0, 0x18F5, WBP_ALetter}, + {0x1900, 0x191C, WBP_ALetter}, + {0x1920, 0x1922, WBP_Extend}, + {0x1923, 0x1926, WBP_Extend}, + {0x1927, 0x1928, WBP_Extend}, + {0x1929, 0x192B, WBP_Extend}, + {0x1930, 0x1931, WBP_Extend}, + {0x1932, 0x1932, WBP_Extend}, + {0x1933, 0x1938, WBP_Extend}, + {0x1939, 0x193B, WBP_Extend}, + {0x1946, 0x194F, WBP_Numeric}, + {0x19B0, 0x19C0, WBP_Extend}, + {0x19C8, 0x19C9, WBP_Extend}, + {0x19D0, 0x19D9, WBP_Numeric}, + {0x1A00, 0x1A16, WBP_ALetter}, + {0x1A17, 0x1A18, WBP_Extend}, + {0x1A19, 0x1A1B, WBP_Extend}, + {0x1A55, 0x1A55, WBP_Extend}, + {0x1A56, 0x1A56, WBP_Extend}, + {0x1A57, 0x1A57, WBP_Extend}, + {0x1A58, 0x1A5E, WBP_Extend}, + {0x1A60, 0x1A60, WBP_Extend}, + {0x1A61, 0x1A61, WBP_Extend}, + {0x1A62, 0x1A62, WBP_Extend}, + {0x1A63, 0x1A64, WBP_Extend}, + {0x1A65, 0x1A6C, WBP_Extend}, + {0x1A6D, 0x1A72, WBP_Extend}, + {0x1A73, 0x1A7C, WBP_Extend}, + 
{0x1A7F, 0x1A7F, WBP_Extend}, + {0x1A80, 0x1A89, WBP_Numeric}, + {0x1A90, 0x1A99, WBP_Numeric}, + {0x1B00, 0x1B03, WBP_Extend}, + {0x1B04, 0x1B04, WBP_Extend}, + {0x1B05, 0x1B33, WBP_ALetter}, + {0x1B34, 0x1B34, WBP_Extend}, + {0x1B35, 0x1B35, WBP_Extend}, + {0x1B36, 0x1B3A, WBP_Extend}, + {0x1B3B, 0x1B3B, WBP_Extend}, + {0x1B3C, 0x1B3C, WBP_Extend}, + {0x1B3D, 0x1B41, WBP_Extend}, + {0x1B42, 0x1B42, WBP_Extend}, + {0x1B43, 0x1B44, WBP_Extend}, + {0x1B45, 0x1B4B, WBP_ALetter}, + {0x1B50, 0x1B59, WBP_Numeric}, + {0x1B6B, 0x1B73, WBP_Extend}, + {0x1B80, 0x1B81, WBP_Extend}, + {0x1B82, 0x1B82, WBP_Extend}, + {0x1B83, 0x1BA0, WBP_ALetter}, + {0x1BA1, 0x1BA1, WBP_Extend}, + {0x1BA2, 0x1BA5, WBP_Extend}, + {0x1BA6, 0x1BA7, WBP_Extend}, + {0x1BA8, 0x1BA9, WBP_Extend}, + {0x1BAA, 0x1BAA, WBP_Extend}, + {0x1BAE, 0x1BAF, WBP_ALetter}, + {0x1BB0, 0x1BB9, WBP_Numeric}, + {0x1BC0, 0x1BE5, WBP_ALetter}, + {0x1BE6, 0x1BE6, WBP_Extend}, + {0x1BE7, 0x1BE7, WBP_Extend}, + {0x1BE8, 0x1BE9, WBP_Extend}, + {0x1BEA, 0x1BEC, WBP_Extend}, + {0x1BED, 0x1BED, WBP_Extend}, + {0x1BEE, 0x1BEE, WBP_Extend}, + {0x1BEF, 0x1BF1, WBP_Extend}, + {0x1BF2, 0x1BF3, WBP_Extend}, + {0x1C00, 0x1C23, WBP_ALetter}, + {0x1C24, 0x1C2B, WBP_Extend}, + {0x1C2C, 0x1C33, WBP_Extend}, + {0x1C34, 0x1C35, WBP_Extend}, + {0x1C36, 0x1C37, WBP_Extend}, + {0x1C40, 0x1C49, WBP_Numeric}, + {0x1C4D, 0x1C4F, WBP_ALetter}, + {0x1C50, 0x1C59, WBP_Numeric}, + {0x1C5A, 0x1C77, WBP_ALetter}, + {0x1C78, 0x1C7D, WBP_ALetter}, + {0x1CD0, 0x1CD2, WBP_Extend}, + {0x1CD4, 0x1CE0, WBP_Extend}, + {0x1CE1, 0x1CE1, WBP_Extend}, + {0x1CE2, 0x1CE8, WBP_Extend}, + {0x1CE9, 0x1CEC, WBP_ALetter}, + {0x1CED, 0x1CED, WBP_Extend}, + {0x1CEE, 0x1CF1, WBP_ALetter}, + {0x1CF2, 0x1CF2, WBP_Extend}, + {0x1D00, 0x1D2B, WBP_ALetter}, + {0x1D2C, 0x1D61, WBP_ALetter}, + {0x1D62, 0x1D77, WBP_ALetter}, + {0x1D78, 0x1D78, WBP_ALetter}, + {0x1D79, 0x1D9A, WBP_ALetter}, + {0x1D9B, 0x1DBF, WBP_ALetter}, + {0x1DC0, 0x1DE6, WBP_Extend}, + {0x1DFC, 0x1DFF, 
WBP_Extend}, + {0x1E00, 0x1F15, WBP_ALetter}, + {0x1F18, 0x1F1D, WBP_ALetter}, + {0x1F20, 0x1F45, WBP_ALetter}, + {0x1F48, 0x1F4D, WBP_ALetter}, + {0x1F50, 0x1F57, WBP_ALetter}, + {0x1F59, 0x1F59, WBP_ALetter}, + {0x1F5B, 0x1F5B, WBP_ALetter}, + {0x1F5D, 0x1F5D, WBP_ALetter}, + {0x1F5F, 0x1F7D, WBP_ALetter}, + {0x1F80, 0x1FB4, WBP_ALetter}, + {0x1FB6, 0x1FBC, WBP_ALetter}, + {0x1FBE, 0x1FBE, WBP_ALetter}, + {0x1FC2, 0x1FC4, WBP_ALetter}, + {0x1FC6, 0x1FCC, WBP_ALetter}, + {0x1FD0, 0x1FD3, WBP_ALetter}, + {0x1FD6, 0x1FDB, WBP_ALetter}, + {0x1FE0, 0x1FEC, WBP_ALetter}, + {0x1FF2, 0x1FF4, WBP_ALetter}, + {0x1FF6, 0x1FFC, WBP_ALetter}, + {0x200C, 0x200D, WBP_Extend}, + {0x200E, 0x200F, WBP_Format}, + {0x2018, 0x2018, WBP_MidNumLet}, + {0x2019, 0x2019, WBP_MidNumLet}, + {0x2024, 0x2024, WBP_MidNumLet}, + {0x2027, 0x2027, WBP_MidLetter}, + {0x2028, 0x2028, WBP_Newline}, + {0x2029, 0x2029, WBP_Newline}, + {0x202A, 0x202E, WBP_Format}, + {0x203F, 0x2040, WBP_ExtendNumLet}, + {0x2044, 0x2044, WBP_MidNum}, + {0x2054, 0x2054, WBP_ExtendNumLet}, + {0x2060, 0x2064, WBP_Format}, + {0x206A, 0x206F, WBP_Format}, + {0x2071, 0x2071, WBP_ALetter}, + {0x207F, 0x207F, WBP_ALetter}, + {0x2090, 0x209C, WBP_ALetter}, + {0x20D0, 0x20DC, WBP_Extend}, + {0x20DD, 0x20E0, WBP_Extend}, + {0x20E1, 0x20E1, WBP_Extend}, + {0x20E2, 0x20E4, WBP_Extend}, + {0x20E5, 0x20F0, WBP_Extend}, + {0x2102, 0x2102, WBP_ALetter}, + {0x2107, 0x2107, WBP_ALetter}, + {0x210A, 0x2113, WBP_ALetter}, + {0x2115, 0x2115, WBP_ALetter}, + {0x2119, 0x211D, WBP_ALetter}, + {0x2124, 0x2124, WBP_ALetter}, + {0x2126, 0x2126, WBP_ALetter}, + {0x2128, 0x2128, WBP_ALetter}, + {0x212A, 0x212D, WBP_ALetter}, + {0x212F, 0x2134, WBP_ALetter}, + {0x2135, 0x2138, WBP_ALetter}, + {0x2139, 0x2139, WBP_ALetter}, + {0x213C, 0x213F, WBP_ALetter}, + {0x2145, 0x2149, WBP_ALetter}, + {0x214E, 0x214E, WBP_ALetter}, + {0x2160, 0x2182, WBP_ALetter}, + {0x2183, 0x2184, WBP_ALetter}, + {0x2185, 0x2188, WBP_ALetter}, + {0x24B6, 0x24E9, WBP_ALetter}, 
+ {0x2C00, 0x2C2E, WBP_ALetter}, + {0x2C30, 0x2C5E, WBP_ALetter}, + {0x2C60, 0x2C7C, WBP_ALetter}, + {0x2C7D, 0x2C7D, WBP_ALetter}, + {0x2C7E, 0x2CE4, WBP_ALetter}, + {0x2CEB, 0x2CEE, WBP_ALetter}, + {0x2CEF, 0x2CF1, WBP_Extend}, + {0x2D00, 0x2D25, WBP_ALetter}, + {0x2D30, 0x2D65, WBP_ALetter}, + {0x2D6F, 0x2D6F, WBP_ALetter}, + {0x2D7F, 0x2D7F, WBP_Extend}, + {0x2D80, 0x2D96, WBP_ALetter}, + {0x2DA0, 0x2DA6, WBP_ALetter}, + {0x2DA8, 0x2DAE, WBP_ALetter}, + {0x2DB0, 0x2DB6, WBP_ALetter}, + {0x2DB8, 0x2DBE, WBP_ALetter}, + {0x2DC0, 0x2DC6, WBP_ALetter}, + {0x2DC8, 0x2DCE, WBP_ALetter}, + {0x2DD0, 0x2DD6, WBP_ALetter}, + {0x2DD8, 0x2DDE, WBP_ALetter}, + {0x2DE0, 0x2DFF, WBP_Extend}, + {0x2E2F, 0x2E2F, WBP_ALetter}, + {0x3005, 0x3005, WBP_ALetter}, + {0x302A, 0x302F, WBP_Extend}, + {0x3031, 0x3035, WBP_Katakana}, + {0x303B, 0x303B, WBP_ALetter}, + {0x303C, 0x303C, WBP_ALetter}, + {0x3099, 0x309A, WBP_Extend}, + {0x309B, 0x309C, WBP_Katakana}, + {0x30A0, 0x30A0, WBP_Katakana}, + {0x30A1, 0x30FA, WBP_Katakana}, + {0x30FC, 0x30FE, WBP_Katakana}, + {0x30FF, 0x30FF, WBP_Katakana}, + {0x3105, 0x312D, WBP_ALetter}, + {0x3131, 0x318E, WBP_ALetter}, + {0x31A0, 0x31BA, WBP_ALetter}, + {0x31F0, 0x31FF, WBP_Katakana}, + {0x32D0, 0x32FE, WBP_Katakana}, + {0x3300, 0x3357, WBP_Katakana}, + {0xA000, 0xA014, WBP_ALetter}, + {0xA015, 0xA015, WBP_ALetter}, + {0xA016, 0xA48C, WBP_ALetter}, + {0xA4D0, 0xA4F7, WBP_ALetter}, + {0xA4F8, 0xA4FD, WBP_ALetter}, + {0xA500, 0xA60B, WBP_ALetter}, + {0xA60C, 0xA60C, WBP_ALetter}, + {0xA610, 0xA61F, WBP_ALetter}, + {0xA620, 0xA629, WBP_Numeric}, + {0xA62A, 0xA62B, WBP_ALetter}, + {0xA640, 0xA66D, WBP_ALetter}, + {0xA66E, 0xA66E, WBP_ALetter}, + {0xA66F, 0xA66F, WBP_Extend}, + {0xA670, 0xA672, WBP_Extend}, + {0xA67C, 0xA67D, WBP_Extend}, + {0xA67F, 0xA67F, WBP_ALetter}, + {0xA680, 0xA697, WBP_ALetter}, + {0xA6A0, 0xA6E5, WBP_ALetter}, + {0xA6E6, 0xA6EF, WBP_ALetter}, + {0xA6F0, 0xA6F1, WBP_Extend}, + {0xA717, 0xA71F, WBP_ALetter}, + {0xA722, 0xA76F, 
WBP_ALetter}, + {0xA770, 0xA770, WBP_ALetter}, + {0xA771, 0xA787, WBP_ALetter}, + {0xA788, 0xA788, WBP_ALetter}, + {0xA78B, 0xA78E, WBP_ALetter}, + {0xA790, 0xA791, WBP_ALetter}, + {0xA7A0, 0xA7A9, WBP_ALetter}, + {0xA7FA, 0xA7FA, WBP_ALetter}, + {0xA7FB, 0xA801, WBP_ALetter}, + {0xA802, 0xA802, WBP_Extend}, + {0xA803, 0xA805, WBP_ALetter}, + {0xA806, 0xA806, WBP_Extend}, + {0xA807, 0xA80A, WBP_ALetter}, + {0xA80B, 0xA80B, WBP_Extend}, + {0xA80C, 0xA822, WBP_ALetter}, + {0xA823, 0xA824, WBP_Extend}, + {0xA825, 0xA826, WBP_Extend}, + {0xA827, 0xA827, WBP_Extend}, + {0xA840, 0xA873, WBP_ALetter}, + {0xA880, 0xA881, WBP_Extend}, + {0xA882, 0xA8B3, WBP_ALetter}, + {0xA8B4, 0xA8C3, WBP_Extend}, + {0xA8C4, 0xA8C4, WBP_Extend}, + {0xA8D0, 0xA8D9, WBP_Numeric}, + {0xA8E0, 0xA8F1, WBP_Extend}, + {0xA8F2, 0xA8F7, WBP_ALetter}, + {0xA8FB, 0xA8FB, WBP_ALetter}, + {0xA900, 0xA909, WBP_Numeric}, + {0xA90A, 0xA925, WBP_ALetter}, + {0xA926, 0xA92D, WBP_Extend}, + {0xA930, 0xA946, WBP_ALetter}, + {0xA947, 0xA951, WBP_Extend}, + {0xA952, 0xA953, WBP_Extend}, + {0xA960, 0xA97C, WBP_ALetter}, + {0xA980, 0xA982, WBP_Extend}, + {0xA983, 0xA983, WBP_Extend}, + {0xA984, 0xA9B2, WBP_ALetter}, + {0xA9B3, 0xA9B3, WBP_Extend}, + {0xA9B4, 0xA9B5, WBP_Extend}, + {0xA9B6, 0xA9B9, WBP_Extend}, + {0xA9BA, 0xA9BB, WBP_Extend}, + {0xA9BC, 0xA9BC, WBP_Extend}, + {0xA9BD, 0xA9C0, WBP_Extend}, + {0xA9CF, 0xA9CF, WBP_ALetter}, + {0xA9D0, 0xA9D9, WBP_Numeric}, + {0xAA00, 0xAA28, WBP_ALetter}, + {0xAA29, 0xAA2E, WBP_Extend}, + {0xAA2F, 0xAA30, WBP_Extend}, + {0xAA31, 0xAA32, WBP_Extend}, + {0xAA33, 0xAA34, WBP_Extend}, + {0xAA35, 0xAA36, WBP_Extend}, + {0xAA40, 0xAA42, WBP_ALetter}, + {0xAA43, 0xAA43, WBP_Extend}, + {0xAA44, 0xAA4B, WBP_ALetter}, + {0xAA4C, 0xAA4C, WBP_Extend}, + {0xAA4D, 0xAA4D, WBP_Extend}, + {0xAA50, 0xAA59, WBP_Numeric}, + {0xAA7B, 0xAA7B, WBP_Extend}, + {0xAAB0, 0xAAB0, WBP_Extend}, + {0xAAB2, 0xAAB4, WBP_Extend}, + {0xAAB7, 0xAAB8, WBP_Extend}, + {0xAABE, 0xAABF, WBP_Extend}, + 
{0xAAC1, 0xAAC1, WBP_Extend}, + {0xAB01, 0xAB06, WBP_ALetter}, + {0xAB09, 0xAB0E, WBP_ALetter}, + {0xAB11, 0xAB16, WBP_ALetter}, + {0xAB20, 0xAB26, WBP_ALetter}, + {0xAB28, 0xAB2E, WBP_ALetter}, + {0xABC0, 0xABE2, WBP_ALetter}, + {0xABE3, 0xABE4, WBP_Extend}, + {0xABE5, 0xABE5, WBP_Extend}, + {0xABE6, 0xABE7, WBP_Extend}, + {0xABE8, 0xABE8, WBP_Extend}, + {0xABE9, 0xABEA, WBP_Extend}, + {0xABEC, 0xABEC, WBP_Extend}, + {0xABED, 0xABED, WBP_Extend}, + {0xABF0, 0xABF9, WBP_Numeric}, + {0xAC00, 0xD7A3, WBP_ALetter}, + {0xD7B0, 0xD7C6, WBP_ALetter}, + {0xD7CB, 0xD7FB, WBP_ALetter}, + {0xFB00, 0xFB06, WBP_ALetter}, + {0xFB13, 0xFB17, WBP_ALetter}, + {0xFB1D, 0xFB1D, WBP_ALetter}, + {0xFB1E, 0xFB1E, WBP_Extend}, + {0xFB1F, 0xFB28, WBP_ALetter}, + {0xFB2A, 0xFB36, WBP_ALetter}, + {0xFB38, 0xFB3C, WBP_ALetter}, + {0xFB3E, 0xFB3E, WBP_ALetter}, + {0xFB40, 0xFB41, WBP_ALetter}, + {0xFB43, 0xFB44, WBP_ALetter}, + {0xFB46, 0xFBB1, WBP_ALetter}, + {0xFBD3, 0xFD3D, WBP_ALetter}, + {0xFD50, 0xFD8F, WBP_ALetter}, + {0xFD92, 0xFDC7, WBP_ALetter}, + {0xFDF0, 0xFDFB, WBP_ALetter}, + {0xFE00, 0xFE0F, WBP_Extend}, + {0xFE10, 0xFE10, WBP_MidNum}, + {0xFE13, 0xFE13, WBP_MidLetter}, + {0xFE14, 0xFE14, WBP_MidNum}, + {0xFE20, 0xFE26, WBP_Extend}, + {0xFE33, 0xFE34, WBP_ExtendNumLet}, + {0xFE4D, 0xFE4F, WBP_ExtendNumLet}, + {0xFE50, 0xFE50, WBP_MidNum}, + {0xFE52, 0xFE52, WBP_MidNumLet}, + {0xFE54, 0xFE54, WBP_MidNum}, + {0xFE55, 0xFE55, WBP_MidLetter}, + {0xFE70, 0xFE74, WBP_ALetter}, + {0xFE76, 0xFEFC, WBP_ALetter}, + {0xFEFF, 0xFEFF, WBP_Format}, + {0xFF07, 0xFF07, WBP_MidNumLet}, + {0xFF0C, 0xFF0C, WBP_MidNum}, + {0xFF0E, 0xFF0E, WBP_MidNumLet}, + {0xFF1A, 0xFF1A, WBP_MidLetter}, + {0xFF1B, 0xFF1B, WBP_MidNum}, + {0xFF21, 0xFF3A, WBP_ALetter}, + {0xFF3F, 0xFF3F, WBP_ExtendNumLet}, + {0xFF41, 0xFF5A, WBP_ALetter}, + {0xFF66, 0xFF6F, WBP_Katakana}, + {0xFF70, 0xFF70, WBP_Katakana}, + {0xFF71, 0xFF9D, WBP_Katakana}, + {0xFF9E, 0xFF9F, WBP_Extend}, + {0xFFA0, 0xFFBE, WBP_ALetter}, + {0xFFC2, 
0xFFC7, WBP_ALetter}, + {0xFFCA, 0xFFCF, WBP_ALetter}, + {0xFFD2, 0xFFD7, WBP_ALetter}, + {0xFFDA, 0xFFDC, WBP_ALetter}, + {0xFFF9, 0xFFFB, WBP_Format}, + {0x10000, 0x1000B, WBP_ALetter}, + {0x1000D, 0x10026, WBP_ALetter}, + {0x10028, 0x1003A, WBP_ALetter}, + {0x1003C, 0x1003D, WBP_ALetter}, + {0x1003F, 0x1004D, WBP_ALetter}, + {0x10050, 0x1005D, WBP_ALetter}, + {0x10080, 0x100FA, WBP_ALetter}, + {0x10140, 0x10174, WBP_ALetter}, + {0x101FD, 0x101FD, WBP_Extend}, + {0x10280, 0x1029C, WBP_ALetter}, + {0x102A0, 0x102D0, WBP_ALetter}, + {0x10300, 0x1031E, WBP_ALetter}, + {0x10330, 0x10340, WBP_ALetter}, + {0x10341, 0x10341, WBP_ALetter}, + {0x10342, 0x10349, WBP_ALetter}, + {0x1034A, 0x1034A, WBP_ALetter}, + {0x10380, 0x1039D, WBP_ALetter}, + {0x103A0, 0x103C3, WBP_ALetter}, + {0x103C8, 0x103CF, WBP_ALetter}, + {0x103D1, 0x103D5, WBP_ALetter}, + {0x10400, 0x1044F, WBP_ALetter}, + {0x10450, 0x1049D, WBP_ALetter}, + {0x104A0, 0x104A9, WBP_Numeric}, + {0x10800, 0x10805, WBP_ALetter}, + {0x10808, 0x10808, WBP_ALetter}, + {0x1080A, 0x10835, WBP_ALetter}, + {0x10837, 0x10838, WBP_ALetter}, + {0x1083C, 0x1083C, WBP_ALetter}, + {0x1083F, 0x10855, WBP_ALetter}, + {0x10900, 0x10915, WBP_ALetter}, + {0x10920, 0x10939, WBP_ALetter}, + {0x10A00, 0x10A00, WBP_ALetter}, + {0x10A01, 0x10A03, WBP_Extend}, + {0x10A05, 0x10A06, WBP_Extend}, + {0x10A0C, 0x10A0F, WBP_Extend}, + {0x10A10, 0x10A13, WBP_ALetter}, + {0x10A15, 0x10A17, WBP_ALetter}, + {0x10A19, 0x10A33, WBP_ALetter}, + {0x10A38, 0x10A3A, WBP_Extend}, + {0x10A3F, 0x10A3F, WBP_Extend}, + {0x10A60, 0x10A7C, WBP_ALetter}, + {0x10B00, 0x10B35, WBP_ALetter}, + {0x10B40, 0x10B55, WBP_ALetter}, + {0x10B60, 0x10B72, WBP_ALetter}, + {0x10C00, 0x10C48, WBP_ALetter}, + {0x11000, 0x11000, WBP_Extend}, + {0x11001, 0x11001, WBP_Extend}, + {0x11002, 0x11002, WBP_Extend}, + {0x11003, 0x11037, WBP_ALetter}, + {0x11038, 0x11046, WBP_Extend}, + {0x11066, 0x1106F, WBP_Numeric}, + {0x11080, 0x11081, WBP_Extend}, + {0x11082, 0x11082, WBP_Extend}, + 
{0x11083, 0x110AF, WBP_ALetter}, + {0x110B0, 0x110B2, WBP_Extend}, + {0x110B3, 0x110B6, WBP_Extend}, + {0x110B7, 0x110B8, WBP_Extend}, + {0x110B9, 0x110BA, WBP_Extend}, + {0x110BD, 0x110BD, WBP_Format}, + {0x12000, 0x1236E, WBP_ALetter}, + {0x12400, 0x12462, WBP_ALetter}, + {0x13000, 0x1342E, WBP_ALetter}, + {0x16800, 0x16A38, WBP_ALetter}, + {0x1B000, 0x1B000, WBP_Katakana}, + {0x1D165, 0x1D166, WBP_Extend}, + {0x1D167, 0x1D169, WBP_Extend}, + {0x1D16D, 0x1D172, WBP_Extend}, + {0x1D173, 0x1D17A, WBP_Format}, + {0x1D17B, 0x1D182, WBP_Extend}, + {0x1D185, 0x1D18B, WBP_Extend}, + {0x1D1AA, 0x1D1AD, WBP_Extend}, + {0x1D242, 0x1D244, WBP_Extend}, + {0x1D400, 0x1D454, WBP_ALetter}, + {0x1D456, 0x1D49C, WBP_ALetter}, + {0x1D49E, 0x1D49F, WBP_ALetter}, + {0x1D4A2, 0x1D4A2, WBP_ALetter}, + {0x1D4A5, 0x1D4A6, WBP_ALetter}, + {0x1D4A9, 0x1D4AC, WBP_ALetter}, + {0x1D4AE, 0x1D4B9, WBP_ALetter}, + {0x1D4BB, 0x1D4BB, WBP_ALetter}, + {0x1D4BD, 0x1D4C3, WBP_ALetter}, + {0x1D4C5, 0x1D505, WBP_ALetter}, + {0x1D507, 0x1D50A, WBP_ALetter}, + {0x1D50D, 0x1D514, WBP_ALetter}, + {0x1D516, 0x1D51C, WBP_ALetter}, + {0x1D51E, 0x1D539, WBP_ALetter}, + {0x1D53B, 0x1D53E, WBP_ALetter}, + {0x1D540, 0x1D544, WBP_ALetter}, + {0x1D546, 0x1D546, WBP_ALetter}, + {0x1D54A, 0x1D550, WBP_ALetter}, + {0x1D552, 0x1D6A5, WBP_ALetter}, + {0x1D6A8, 0x1D6C0, WBP_ALetter}, + {0x1D6C2, 0x1D6DA, WBP_ALetter}, + {0x1D6DC, 0x1D6FA, WBP_ALetter}, + {0x1D6FC, 0x1D714, WBP_ALetter}, + {0x1D716, 0x1D734, WBP_ALetter}, + {0x1D736, 0x1D74E, WBP_ALetter}, + {0x1D750, 0x1D76E, WBP_ALetter}, + {0x1D770, 0x1D788, WBP_ALetter}, + {0x1D78A, 0x1D7A8, WBP_ALetter}, + {0x1D7AA, 0x1D7C2, WBP_ALetter}, + {0x1D7C4, 0x1D7CB, WBP_ALetter}, + {0x1D7CE, 0x1D7FF, WBP_Numeric}, + {0xE0001, 0xE0001, WBP_Format}, + {0xE0020, 0xE007F, WBP_Format}, + {0xE0100, 0xE01EF, WBP_Extend}, + {0xFFFFFFFF, 0xFFFFFFFF, WBP_Undefined} +}; diff --git a/src/static_deps/liblinebreak/wordbreakdef.h b/src/static_deps/liblinebreak/wordbreakdef.h new file 
mode 100644 index 0000000..331cd01 --- /dev/null +++ b/src/static_deps/liblinebreak/wordbreakdef.h @@ -0,0 +1,80 @@ +/* vim: set tabstop=4 shiftwidth=4: */ + +/* + * Word breaking in a Unicode sequence. Designed to be used in a + * generic text renderer. + * + * Copyright (C) 2011-2011 Tom Hacohen + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute + * it freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must + * not claim that you wrote the original software. If you use this + * software in a product, an acknowledgement in the product + * documentation would be appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must + * not be misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source + * distribution. + * + * The main reference is Unicode Standard Annex 29 (UAX #29): + * http://www.unicode.org/reports/tr29/ + * + * When this library was designed, this annex was at Revision 17, for + * Unicode 6.0.0: + * http://www.unicode.org/reports/tr29/tr29-17.html + * + * The Unicode Terms of Use are available at + * http://www.unicode.org/copyright.html + */ + +/** + * @file wordbreakdef.h + * + * Definitions of the internal data structures (word break classes and + * property table entries) used by the word breaking algorithm. + * + * @version 2.0, 2011/12/12 + * @author Tom Hacohen + */ + +/** + * Word break classes. This is a direct mapping of Table 3 of Unicode + * Standard Annex 29, Revision 17. + */ +enum WordBreakClass +{ + WBP_Undefined, /**< Also the class of the table's terminating 0xFFFFFFFF entry */ + + /* The following break classes are treated in the pair table.
*/ + WBP_CR, + WBP_LF, + WBP_Newline, + WBP_Extend, + WBP_Format, + WBP_Katakana, + WBP_ALetter, + WBP_MidNumLet, + WBP_MidLetter, + WBP_MidNum, + WBP_Numeric, + WBP_ExtendNumLet, + WBP_Any +}; + +/** + * Entry of the word break property table: an inclusive code point range + * and its class. The array of entries \e must be sorted (ascending). + */ +struct WordBreakProperties +{ + utf32_t start; /**< First code point of the range */ + utf32_t end; /**< Last code point of the range (inclusive) */ + enum WordBreakClass prop; /**< Word break class of the range */ +};