+2013-01-11 Tom Hacohen (TAsn)
+
+ * Static libs: Updated liblinebreak to libunibreak's latest version.
+
2013-01-11 Cedric Bail
* Fix not up to date clip cache for Evas_Object_Text.
* use Eina_File in webp, gif, tiff, png and eet loader
* Eina.h includes eina_alloca.h/alloca.h to define alloca()
* Improved eina share del speed.
+ * Upgrade liblinebreak to latest version of libunibreak.
Fixes:
* Fix PPC (big endian) image codec bug.
rm -rf modules/emotion/xine/*.gcno
rm -rf modules/emotion/gstreamer/*.gcno
rm -rf modules/emotion/generic/*.gcno
- rm -rf static_libs/liblinebreak/*.gcno
+ rm -rf static_libs/libunibreak/*.gcno
rm -rf static_libs/lz4/*.gcno
# Linebreak
noinst_HEADERS += \
-static_libs/liblinebreak/linebreak.h \
-static_libs/liblinebreak/linebreakdef.h \
-static_libs/liblinebreak/wordbreakdef.h \
-static_libs/liblinebreak/wordbreak.h
+static_libs/libunibreak/linebreak.h \
+static_libs/libunibreak/linebreakdef.h \
+static_libs/libunibreak/wordbreakdef.h \
+static_libs/libunibreak/wordbreak.h
# Linebreak
lib_evas_libevas_la_SOURCES = \
-static_libs/liblinebreak/linebreak.c \
-static_libs/liblinebreak/linebreakdata.c \
-static_libs/liblinebreak/linebreakdef.c \
-static_libs/liblinebreak/wordbreak.c \
-static_libs/liblinebreak/wordbreakdata.x
+static_libs/libunibreak/linebreak.c \
+static_libs/libunibreak/linebreakdata.c \
+static_libs/libunibreak/linebreakdef.c \
+static_libs/libunibreak/wordbreak.c \
+static_libs/libunibreak/wordbreakdata.c
# Main
lib_evas_libevas_la_SOURCES += \
-I$(top_srcdir)/src/lib/evas/include \
-I$(top_srcdir)/src/lib/evas/cserve2 \
-I$(top_srcdir)/src/lib/evas/file \
--I$(top_srcdir)/src/static_libs/liblinebreak \
+-I$(top_srcdir)/src/static_libs/libunibreak \
-I$(top_srcdir)/src/lib/evas/common \
-I$(top_srcdir)/src/lib/eina \
-I$(top_builddir)/src/lib/eina \
# Linebreak
EXTRA_DIST += \
-static_libs/liblinebreak/LICENCE \
-static_libs/liblinebreak/AUTHORS \
-static_libs/liblinebreak/NEWS \
-static_libs/liblinebreak/README \
-static_libs/liblinebreak/ChangeLog
+static_libs/libunibreak/LICENCE \
+static_libs/libunibreak/AUTHORS \
+static_libs/libunibreak/NEWS \
+static_libs/libunibreak/README \
+static_libs/libunibreak/ChangeLog
# Engines
+2012-10-06 Wu Yongwei <wuyongwei@gmail.com>
+
+ Update files according to UAX #14-30, for Unicode 6.2.0.
+ * README: Update the reference to UAX #14-30.
+ * src/linebreak.c (baTable): Update for the new class `RI'.
+ * src/linebreak.h (LINEBREAK_VERSION): Set to 0x0202.
+ * src/linebreakdef.h (LBP_RI): New enumerator for the new class `RI'
+ as defined in UAX #14-30.
+ * src/linebreakdata.c: Regenerate from LineBreak-6.2.0.txt.
+
+2012-10-06 Wu Yongwei <wuyongwei@gmail.com>
+
+ * src/linebreak.c (baTable): Correct the issue that one column was
+ missing in the table.
+
+2012-10-06 Wu Yongwei <wuyongwei@gmail.com>
+
+ * README: Update to reflect the recent changes.
+
+2012-10-06 Wu Yongwei <wuyongwei@gmail.com>
+
+ Make `make linebreakdata' and `make wordbreakdata' work again.
+ * src/Makefile.am (EXTRA_DIST): Add missing `filter_dup.c'.
+ (linebreakdata): New make target.
+ (wordbreakdata): New make target.
+
+2012-10-06 Wu Yongwei <wuyongwei@gmail.com>
+
+ Make `make dist' work again after the directory adjustment.
+ * Doxyfile (INPUT): Change to `src'.
+ (FILE_PATTERNS): Set to `*.c *.h'.
+ * Makefile.am (EXTRA_DIST): Move content from src/Makefile.am.
+ (doc): Move target from src/Makefile.am.
+ * src/Makefile.am (EXTRA_DIST): Move partial content to Makefile.am.
+ (doc): Move target to Makefile.am.
+
+2012-09-16 Wu Yongwei <wuyongwei@gmail.com>
+
+ Update files according to UAX #14-28, for Unicode 6.1.0.
+ * README: Update the reference to UAX #14-28.
+ * src/linebreak.c (baTable): Update for the new class `HL'.
+ (resolve_lb_class): Resolve the new class `CJ' to `ID' (simplified).
+ * src/linebreakdef.h (LBP_HL): New enumerator for the new class `HL'
+ as defined in UAX #14-28.
+ (LBP_CJ): New enumerator for the new class `CJ' as defined in
+ UAX #14-28.
+ * src/linebreakdata.c: Regenerate from LineBreak-6.1.0.txt.
+
+2012-08-13 Tom Hacohen <tom@stosb.com>
+
+ Move source files to under src.
+ * Makefile.am: Split from original Makefile.am.
+ (SUBDIRS): Add `src'.
+ * configure.ac (AC_CONFIG_SRCDIR): Add `src/' before `linebreak.c'.
+ (AC_CONFIG_FILES): Add `src/Makefile'.
+ * src/LineBreak1.sed: Move from LineBreak1.sed.
+ * src/LineBreak2.sed: Move from LineBreak2.sed.
+ * src/Makefile.am: Split from Makefile.am
+ * src/Makefile.gcc: Move from Makefile.gcc.
+ * src/Makefile.msvc: Move from Makefile.msvc.
+ * src/filter_dup.c: Move from filter_dup.c.
+ * src/linebreak.c: Move from linebreak.c.
+ * src/linebreak.h: Move from linebreak.h.
+ * src/linebreakdata.c: Move from linebreakdata.c.
+ * src/linebreakdata1.tmpl: Move from linebreakdata1.tmpl.
+ * src/linebreakdata2.tmpl: Move from linebreakdata2.tmpl.
+ * src/linebreakdata3.tmpl: Move from linebreakdata3.tmpl.
+ * src/linebreakdef.c: Move from linebreakdef.c.
+ * src/linebreakdef.h: Move from linebreakdef.h.
+ * src/sort_numeric_hex.py: Move from sort_numeric_hex.py.
+ * src/wordbreak.c: Move from wordbreak.c.
+ * src/wordbreak.h: Move from wordbreak.h.
+ * src/wordbreakdata.c: Move from wordbreakdata.c.
+ * src/wordbreakdata1.tmpl: Move from wordbreakdata1.tmpl.
+ * src/wordbreakdata2.tmpl: Move from wordbreakdata2.tmpl.
+ * src/wordbreakdef.h: Move from wordbreakdef.h.
+
+2012-08-12 Wu Yongwei <wuyongwei@gmail.com>
+
+ * README: Change the home URL to github; remove $Id$; eliminate
+ non-ASCII characters.
+
+2012-08-11 Wu Yongwei <wuyongwei@gmail.com>
+
+ * configure.ac (AC_INIT): Change the library name and version to
+ `libunibreak' and `1.0'.
+ (AC_PROG_LN_S): New macro.
+ (AC_OUTPUT): Change to `libunibreak.pc'.
+ * Doxyfile: (PROJECT_NAME): Change to `libunibreak'.
+ (PROJECT_NUMBER): Change to `1.0'.
+ * LICENCE: Add copyright information about Tom Hacohen.
+ * Makefile.am (lib_LTLIBRARIES): Change to `libunibreak.la'.
+ (pkgconfig_DATA): Change to `libunibreak.pc'.
+ (libunibreak_la_LDFLAGS): Reset the version to `1:0'.
+ (install-exec-hook): Replace the static library liblinebreak.a with
+ a symlink to libunibreak.a.
+ * Makefile.msvc: Change the library name to `libunibreak', and the
+ output library to `unibreak.lib'.
+ * NEWS: Add information about libunibreak 1.0.
+ * README: Change the library name, and add information about word
+ break.
+
+2012-02-04 Wu Yongwei <wuyongwei@gmail.com>
+
+ * wordbreak.h (WORDBREAK_INSIDEACHAR): Change from
+ WORDBREAK_INSIDECHAR.
+ * wordbreak.c (set_brks_to): Change `WORDBREAK_INSIDECHAR' to
+ `WORDBREAK_INSIDEACHAR'.
+
+2012-01-19 Wu Yongwei <wuyongwei@gmail.com>
+
+ * wordbreak.h: Change angle brackets to quotation marks (which
+ caused build errors).
+
+2012-01-19 Wu Yongwei <wuyongwei@gmail.com>
+
+ * Makefile.gcc (CFILES): Add wordbreak.c.
+ (WordBreakProperty.txt): New target.
+ (wordbreakdata): New target.
+
+2012-01-19 Wu Yongwei <wuyongwei@gmail.com>
+
+ * Makefile.am (liblinebreak_la_SOURCES): Remove wordbreakdata.c.
+ (EXTRA_DIST): Add wordbreakdata.c, wordbreakdata1.tmpl, and
+ wordbreakdata2.tmpl.
+
+2012-01-19 Wu Yongwei <wuyongwei@gmail.com>
+
+ * Makefile.msvc: Add wordbreak files.
+
+2012-01-18 Tom Hacohen <tom@stosb.com>
+
+ Add word breaking support.
+ * AUTHORS: Add `Tom Hacohen'.
+ * Makefile.am (include_HEADERS): Add header files for word breaking.
+ (liblinebreak_la_SOURCES): Add source files for word breaking.
+ (sort_numeric_hex.py): Add `sort_numeric_hex.py'.
+ (distclean-local): Clean also `WordBreakData.txt'.
+ (WordBreakProperty.txt): New target.
+ (wordbreakdata): New target.
+ * sort_numeric_hex.py: New file.
+ * wordbreak.c: New file.
+ * wordbreak.h: New file.
+ * wordbreakdef.h: New file.
+ * wordbreakdata.c: New file.
+ * wordbreakdata1.tmpl: New file.
+ * wordbreakdata2.tmpl: New file.
+
+2011-05-17 Wu Yongwei <wuyongwei@gmail.com>
+
+ Add support for pkg-config (thanks to Tom Hacohen).
+ * liblinebreak.pc.in: New file.
+ * configure.ac (AC_OUTPUT): Add `liblinebreak.pc'.
+ * Makefile.am (pkgconfig_DATA): Set to `liblinebreak.pc'.
+ (pkgconfigdir): Set to `$(libdir)/pkgconfig'.
+
+2011-05-07 Wu Yongwei <wuyongwei@gmail.com>
+
+ * README: Update the reference to UAX #14-26, for Unicode 6.0.0.
+
+2011-05-07 Wu Yongwei <wuyongwei@gmail.com>
+
+ * configure.ac (AC_INIT): Increase the version to 2.1.
+ * Makefile.am (liblinebreak_la_LDFLAGS): Set the version-info to
+ `2:1'.
+
+2011-05-07 Wu Yongwei <wuyongwei@gmail.com>
+
+ * LICENCE: Update the copyright year.
+
+2011-05-07 Wu Yongwei <wuyongwei@gmail.com>
+
+ Update for the 2.1 release.
+ * Doxyfile (PROJECT_NUMBER): Set to `2.1'.
+ * NEWS: Add information about the 2.1 release.
+ * linebreak.h (LINEBREAK_VERSION): Set to `0x0201'.
+ * linebreak.h: Update comments.
+ * linebreak.c: Ditto.
+ * linebreakdef.h: Ditto.
+ * linebreakdef.c: Ditto.
+
+2011-05-07 Wu Yongwei <wuyongwei@gmail.com>
+
+ * linebreakdata.c: Regenerate from LineBreak-6.0.0.txt.
+
+2011-05-07 Wu Yongwei <wuyongwei@gmail.com>
+
+ * linebreak.c (set_linebreaks): Fix the assertion failure when
+ U+FFFC (OBJECT REPLACEMENT CHARACTER) appears at the beginning of a
+ line (thanks to Tom Hacohen).
+
2010-01-03 Wu Yongwei <wuyongwei@gmail.com>
* LICENCE: Update the copyright year.
-Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
+Copyright (C) 2008-2012 Wu Yongwei <wuyongwei at gmail dot com>
+Copyright (C) 2012 Tom Hacohen <tom dot hacohen at samsung dot com>
This software is provided 'as-is', without any express or implied
warranty. In no event will the author be held liable for any damages
-New in 2.0
+New in libunibreak 1.0
+
+- Add word breaking support
+- Change the library name to "libunibreak", while keeping maximum compatibility
+- Add pkg-config support
+
+New in liblinebreak 2.1
+
+- Update the data according to LineBreak-6.0.0.txt
+- Fix the bug that an assertion in code can fail if U+FFFC is
+ encountered at the beginning of a line
+
+New in liblinebreak 2.0
- Update the algorithm and data according to UAX #14-24 and
LineBreak-5.2.0.txt
- Rename some functions to reduce namespace pollution
- Make Doxygen documentation better
-New in 1.2
+New in liblinebreak 1.2
- Fix the bug that an assertion in code can fail if an invalid UTF-8 or
UTF-16 sequence is encountered near the end of input
used as apostrophe
- Make Doxygen documentation better
-New in 1.1
+New in liblinebreak 1.1
- Make get_lb_prop_lang static and not an exported symbol
- Define is_line_breakable to alias to is_breakable
linebreakdef.h
- Add the function documentation comments to the header files
-New in 1.0
+New in liblinebreak 1.0
- Update the line breaking data according to UAX #14-22 and
LineBreak-5.1.0.txt
-NOTICE: This is the original version, that was adapted a bit (mostly
- build related) in order to work nicely with Evas.
-
-
- L I B L I N E B R E A K
- =======================
+ L I B U N I B R E A K
+ =====================
Overview
--------
-This is the README file for liblinebreak, an implementation of the line
-breaking algorithm as described in Unicode 5.2.0 Standard Annex 14,
-Revision 24, available at
- <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
+This is the README file for libunibreak, an implementation of the line
+breaking and word breaking algorithms as described in Unicode
+Standard Annex 14 and Unicode Standard Annex 29, available at
+ <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
+ <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
Check this URL for up-to-date information:
- <URL:http://vimgadgets.sourceforge.net/liblinebreak/>
+ <URL:https://github.com/adah1972/libunibreak>
Licence
- type `make doc' to generate the doxygen documentation; or
- type `make linebreakdata' to regenerate linebreakdata.c from
LineBreak.txt.
+ - type `make wordbreakdata' to regenerate wordbreakdata.c from
+ WordBreakProperty.txt.
2) On systems where GCC and Binutils are supported, one can type
+ cd src
cp -p Makefile.gcc Makefile
make
- type `make doc' to generate the doxygen documentation; or
- type `make linebreakdata' to regenerate linebreakdata.c from
LineBreak.txt.
+ - type `make wordbreakdata' to regenerate wordbreakdata.c from
+ WordBreakProperty.txt.
3) On Windows, apart from using method 1 (Cygwin/MSYS) and method 2
(MinGW), MSVC can also be used. Type
+ cd src
nmake -f Makefile.msvc
to build the static library. By default the debug release is built.
To build the release version
- nmake -f Makefile.msvc CFG="linebreak - Win32 Release"
+ nmake -f Makefile.msvc CFG="libunibreak - Win32 Release"
Documentation
-------------
-Check the generated document doc/html/linebreak_8h.html for the public
+Check the generated document doc/html/linebreak_8h.html and
+doc/html/wordbreak_8h.html in the downloaded file for the public
interfaces exposed to applications.
-$Id: README,v 1.6 2009/11/29 08:09:13 adah Exp $
-
vim:autoindent:expandtab:formatoptions=tcqlmn:textwidth=72:
* Line breaking in a Unicode sequence. Designed to be used in a
* generic text renderer.
*
- * Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
+ * Copyright (C) 2008-2012 Wu Yongwei <wuyongwei at gmail dot com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* Unicode 5.0.0:
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
*
- * This library has been updated according to Revision 24, for
- * Unicode 5.2.0:
- * <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
+ * This library has been updated according to Revision 30, for
+ * Unicode 6.2.0:
+ * <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
*
* The Unicode Terms of Use are available at
* <URL:http://www.unicode.org/copyright.html>
* Implementation of the line breaking algorithm as described in Unicode
* Standard Annex 14.
*
- * @version 2.0, 2010/01/03
+ * @version 2.3, 2012/10/06
* @author Wu Yongwei
*/
/**
* Break action pair table. This is a direct mapping of Table 2 of
- * Unicode Standard Annex 14, Revision 24.
+ * Unicode Standard Annex 14, Revision 30.
*/
-static enum BreakAction baTable[LBP_JT][LBP_JT] = {
+static enum BreakAction baTable[LBP_RI][LBP_RI] = {
{ /* OP */
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, CMP_BRK,
- PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK },
+ PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
+ CMP_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
+ PRH_BRK },
{ /* CL */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* CP */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* QU */
PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
- IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
+ IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ IND_BRK },
{ /* GL */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
- IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
+ IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ IND_BRK },
{ /* NS */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* EX */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* SY */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* IS */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* PR */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
+ IND_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK },
{ /* PO */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* NU */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* AL */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* HL */
+ IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* ID */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* IN */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* HY */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* BA */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* BB */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
- IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
+ IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ IND_BRK },
{ /* B2 */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* ZW */
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK, DIR_BRK,
- DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* CM */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
{ /* WJ */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
- IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
+ IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ IND_BRK },
{ /* H2 */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
+ DIR_BRK },
{ /* H3 */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK },
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
+ DIR_BRK },
{ /* JL */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
+ DIR_BRK },
{ /* JV */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
+ DIR_BRK },
{ /* JT */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK }
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
+ DIR_BRK },
+ { /* RI */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ IND_BRK },
};
/**
{
return LBP_ID;
}
- /* Fall through */
+ else
+ {
+ return LBP_AL;
+ }
+ case LBP_CJ:
+ /* Simplified for `normal' line breaking. See
+ * <url:http://www.unicode.org/reports/tr14/tr14-28.html#CJ>
+ * for details. */
+ return LBP_ID;
case LBP_SA:
case LBP_SG:
case LBP_XX:
lbcNew = resolve_lb_class(lbcNew, lang);
+ /* TODO: LB21a, as introduced by Revision 28 of UAX#14, is not
+ * yet implemented below. */
+
assert(lbcCur <= LBP_JT);
assert(lbcNew <= LBP_JT);
switch (baTable[lbcCur - 1][lbcNew - 1])
* Line breaking in a Unicode sequence. Designed to be used in a
* generic text renderer.
*
- * Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
+ * Copyright (C) 2008-2012 Wu Yongwei <wuyongwei at gmail dot com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* Unicode 5.0.0:
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
*
- * This library has been updated according to Revision 24, for
- * Unicode 5.2.0:
- * <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
+ * This library has been updated according to Revision 30, for
+ * Unicode 6.2.0:
+ * <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
*
* The Unicode Terms of Use are available at
* <URL:http://www.unicode.org/copyright.html>
*
* Header file for the line breaking algorithm.
*
- * @version 2.0, 2010/01/03
+ * @version 2.2, 2012/10/06
* @author Wu Yongwei
*/
extern "C" {
#endif
-#define LINEBREAK_VERSION 0x0200 /**< Version of the library linebreak */
+#define LINEBREAK_VERSION 0x0202 /**< Version of the library linebreak */
extern const int linebreak_version;
#ifndef LINEBREAK_UTF_TYPES_DEFINED
/* The content of this file is generated from:
-# LineBreak-6.0.0.txt
-# Date: 2010-08-18, 17:25:00 PDT [KW]
+# LineBreak-6.2.0.txt
+# Date: 2012-08-08, 19:26:00 GMT [KW]
*/
#include "linebreak.h"
{ 0x048A, 0x0587, LBP_AL },
{ 0x0589, 0x0589, LBP_IS },
{ 0x058A, 0x058A, LBP_BA },
+ { 0x058F, 0x058F, LBP_PR },
{ 0x0591, 0x05BD, LBP_CM },
{ 0x05BE, 0x05BE, LBP_BA },
{ 0x05BF, 0x05BF, LBP_CM },
{ 0x05C4, 0x05C5, LBP_CM },
{ 0x05C6, 0x05C6, LBP_EX },
{ 0x05C7, 0x05C7, LBP_CM },
- { 0x05D0, 0x0608, LBP_AL },
+ { 0x05D0, 0x05F2, LBP_HL },
+ { 0x05F3, 0x0608, LBP_AL },
{ 0x0609, 0x060B, LBP_PO },
{ 0x060C, 0x060D, LBP_IS },
{ 0x060E, 0x060F, LBP_AL },
{ 0x0829, 0x082D, LBP_CM },
{ 0x0830, 0x0858, LBP_AL },
{ 0x0859, 0x085B, LBP_CM },
- { 0x085E, 0x085E, LBP_AL },
- { 0x0900, 0x0903, LBP_CM },
+ { 0x085E, 0x08AC, LBP_AL },
+ { 0x08E4, 0x0903, LBP_CM },
{ 0x0904, 0x0939, LBP_AL },
{ 0x093A, 0x093C, LBP_CM },
{ 0x093D, 0x093D, LBP_AL },
{ 0x0AD0, 0x0AE1, LBP_AL },
{ 0x0AE2, 0x0AE3, LBP_CM },
{ 0x0AE6, 0x0AEF, LBP_NU },
+ { 0x0AF0, 0x0AF0, LBP_AL },
{ 0x0AF1, 0x0AF1, LBP_PR },
{ 0x0B01, 0x0B03, LBP_CM },
{ 0x0B05, 0x0B39, LBP_AL },
{ 0x0E5A, 0x0E5B, LBP_BA },
{ 0x0E81, 0x0ECD, LBP_SA },
{ 0x0ED0, 0x0ED9, LBP_NU },
- { 0x0EDC, 0x0EDD, LBP_SA },
+ { 0x0EDC, 0x0EDF, LBP_SA },
{ 0x0F00, 0x0F00, LBP_AL },
{ 0x0F01, 0x0F04, LBP_BB },
{ 0x0F05, 0x0F05, LBP_AL },
{ 0x1050, 0x108F, LBP_SA },
{ 0x1090, 0x1099, LBP_NU },
{ 0x109A, 0x109F, LBP_SA },
- { 0x10A0, 0x10FC, LBP_AL },
+ { 0x10A0, 0x10FF, LBP_AL },
{ 0x1100, 0x115F, LBP_JL },
{ 0x1160, 0x11A7, LBP_JV },
{ 0x11A8, 0x11FF, LBP_JT },
{ 0x1B74, 0x1B7C, LBP_AL },
{ 0x1B80, 0x1B82, LBP_CM },
{ 0x1B83, 0x1BA0, LBP_AL },
- { 0x1BA1, 0x1BAA, LBP_CM },
+ { 0x1BA1, 0x1BAD, LBP_CM },
{ 0x1BAE, 0x1BAF, LBP_AL },
{ 0x1BB0, 0x1BB9, LBP_NU },
- { 0x1BC0, 0x1BE5, LBP_AL },
+ { 0x1BBA, 0x1BE5, LBP_AL },
{ 0x1BE6, 0x1BF3, LBP_CM },
{ 0x1BFC, 0x1C23, LBP_AL },
{ 0x1C24, 0x1C37, LBP_CM },
{ 0x1C50, 0x1C59, LBP_NU },
{ 0x1C5A, 0x1C7D, LBP_AL },
{ 0x1C7E, 0x1C7F, LBP_BA },
+ { 0x1CC0, 0x1CC7, LBP_AL },
{ 0x1CD0, 0x1CD2, LBP_CM },
{ 0x1CD3, 0x1CD3, LBP_AL },
{ 0x1CD4, 0x1CE8, LBP_CM },
{ 0x1CE9, 0x1CEC, LBP_AL },
{ 0x1CED, 0x1CED, LBP_CM },
{ 0x1CEE, 0x1CF1, LBP_AL },
- { 0x1CF2, 0x1CF2, LBP_CM },
- { 0x1D00, 0x1DBF, LBP_AL },
+ { 0x1CF2, 0x1CF4, LBP_CM },
+ { 0x1CF5, 0x1DBF, LBP_AL },
{ 0x1DC0, 0x1DFF, LBP_CM },
{ 0x1E00, 0x1FFC, LBP_AL },
{ 0x1FFD, 0x1FFD, LBP_BB },
{ 0x20A7, 0x20A7, LBP_PO },
{ 0x20A8, 0x20B5, LBP_PR },
{ 0x20B6, 0x20B6, LBP_PO },
- { 0x20B7, 0x20B9, LBP_PR },
+ { 0x20B7, 0x20BA, LBP_PR },
{ 0x20D0, 0x20F0, LBP_CM },
{ 0x2100, 0x2102, LBP_AL },
{ 0x2103, 0x2103, LBP_PO },
{ 0x22BF, 0x22BF, LBP_AI },
{ 0x22C0, 0x2311, LBP_AL },
{ 0x2312, 0x2312, LBP_AI },
- { 0x2313, 0x2328, LBP_AL },
+ { 0x2313, 0x2319, LBP_AL },
+ { 0x231A, 0x231B, LBP_ID },
+ { 0x231C, 0x2328, LBP_AL },
{ 0x2329, 0x2329, LBP_OP },
{ 0x232A, 0x232A, LBP_CL },
- { 0x232B, 0x244A, LBP_AL },
+ { 0x232B, 0x23EF, LBP_AL },
+ { 0x23F0, 0x23F3, LBP_ID },
+ { 0x2400, 0x244A, LBP_AL },
{ 0x2460, 0x24FE, LBP_AI },
{ 0x24FF, 0x24FF, LBP_AL },
{ 0x2500, 0x254B, LBP_AI },
{ 0x25E2, 0x25E5, LBP_AI },
{ 0x25E6, 0x25EE, LBP_AL },
{ 0x25EF, 0x25EF, LBP_AI },
- { 0x25F0, 0x2604, LBP_AL },
+ { 0x25F0, 0x25FF, LBP_AL },
+ { 0x2600, 0x2603, LBP_ID },
+ { 0x2604, 0x2604, LBP_AL },
{ 0x2605, 0x2606, LBP_AI },
{ 0x2607, 0x2608, LBP_AL },
{ 0x2609, 0x2609, LBP_AI },
{ 0x260A, 0x260D, LBP_AL },
{ 0x260E, 0x260F, LBP_AI },
{ 0x2610, 0x2613, LBP_AL },
- { 0x2614, 0x2617, LBP_AI },
- { 0x2618, 0x261B, LBP_AL },
- { 0x261C, 0x261C, LBP_AI },
- { 0x261D, 0x261D, LBP_AL },
- { 0x261E, 0x261E, LBP_AI },
- { 0x261F, 0x263F, LBP_AL },
+ { 0x2614, 0x2615, LBP_ID },
+ { 0x2616, 0x2617, LBP_AI },
+ { 0x2618, 0x2618, LBP_ID },
+ { 0x2619, 0x2619, LBP_AL },
+ { 0x261A, 0x261F, LBP_ID },
+ { 0x2620, 0x2638, LBP_AL },
+ { 0x2639, 0x263B, LBP_ID },
+ { 0x263C, 0x263F, LBP_AL },
{ 0x2640, 0x2640, LBP_AI },
{ 0x2641, 0x2641, LBP_AL },
{ 0x2642, 0x2642, LBP_AI },
{ 0x2662, 0x2662, LBP_AL },
{ 0x2663, 0x2665, LBP_AI },
{ 0x2666, 0x2666, LBP_AL },
- { 0x2667, 0x266A, LBP_AI },
+ { 0x2667, 0x2667, LBP_AI },
+ { 0x2668, 0x2668, LBP_ID },
+ { 0x2669, 0x266A, LBP_AI },
{ 0x266B, 0x266B, LBP_AL },
{ 0x266C, 0x266D, LBP_AI },
{ 0x266E, 0x266E, LBP_AL },
{ 0x266F, 0x266F, LBP_AI },
- { 0x2670, 0x269D, LBP_AL },
+ { 0x2670, 0x267E, LBP_AL },
+ { 0x267F, 0x267F, LBP_ID },
+ { 0x2680, 0x269D, LBP_AL },
{ 0x269E, 0x269F, LBP_AI },
- { 0x26A0, 0x26BD, LBP_AL },
- { 0x26BE, 0x26BF, LBP_AI },
- { 0x26C0, 0x26C3, LBP_AL },
- { 0x26C4, 0x26CD, LBP_AI },
+ { 0x26A0, 0x26BC, LBP_AL },
+ { 0x26BD, 0x26C8, LBP_ID },
+ { 0x26C9, 0x26CC, LBP_AI },
+ { 0x26CD, 0x26CD, LBP_ID },
{ 0x26CE, 0x26CE, LBP_AL },
- { 0x26CF, 0x26E1, LBP_AI },
+ { 0x26CF, 0x26D1, LBP_ID },
+ { 0x26D2, 0x26D2, LBP_AI },
+ { 0x26D3, 0x26D4, LBP_ID },
+ { 0x26D5, 0x26D7, LBP_AI },
+ { 0x26D8, 0x26D9, LBP_ID },
+ { 0x26DA, 0x26DB, LBP_AI },
+ { 0x26DC, 0x26DC, LBP_ID },
+ { 0x26DD, 0x26DE, LBP_AI },
+ { 0x26DF, 0x26E1, LBP_ID },
{ 0x26E2, 0x26E2, LBP_AL },
{ 0x26E3, 0x26E3, LBP_AI },
{ 0x26E4, 0x26E7, LBP_AL },
- { 0x26E8, 0x26FF, LBP_AI },
- { 0x2701, 0x2756, LBP_AL },
+ { 0x26E8, 0x26E9, LBP_AI },
+ { 0x26EA, 0x26EA, LBP_ID },
+ { 0x26EB, 0x26F0, LBP_AI },
+ { 0x26F1, 0x26F5, LBP_ID },
+ { 0x26F6, 0x26F6, LBP_AI },
+ { 0x26F7, 0x26FA, LBP_ID },
+ { 0x26FB, 0x26FC, LBP_AI },
+ { 0x26FD, 0x2704, LBP_ID },
+ { 0x2705, 0x2707, LBP_AL },
+ { 0x2708, 0x270D, LBP_ID },
+ { 0x270E, 0x2756, LBP_AL },
{ 0x2757, 0x2757, LBP_AI },
{ 0x2758, 0x275A, LBP_AL },
{ 0x275B, 0x275E, LBP_QU },
{ 0x2B55, 0x2B59, LBP_AI },
{ 0x2C00, 0x2CEE, LBP_AL },
{ 0x2CEF, 0x2CF1, LBP_CM },
+ { 0x2CF2, 0x2CF3, LBP_AL },
{ 0x2CF9, 0x2CF9, LBP_EX },
{ 0x2CFA, 0x2CFC, LBP_BA },
{ 0x2CFD, 0x2CFD, LBP_AL },
{ 0x2E2E, 0x2E2E, LBP_EX },
{ 0x2E2F, 0x2E2F, LBP_AL },
{ 0x2E30, 0x2E31, LBP_BA },
+ { 0x2E32, 0x2E32, LBP_AL },
+ { 0x2E33, 0x2E34, LBP_BA },
+ { 0x2E35, 0x2E39, LBP_AL },
+ { 0x2E3A, 0x2E3B, LBP_B2 },
{ 0x2E80, 0x3000, LBP_ID },
{ 0x3001, 0x3002, LBP_CL },
{ 0x3003, 0x3004, LBP_ID },
{ 0x3030, 0x303A, LBP_ID },
{ 0x303B, 0x303C, LBP_NS },
{ 0x303D, 0x303F, LBP_ID },
- { 0x3041, 0x3041, LBP_NS },
+ { 0x3041, 0x3041, LBP_CJ },
{ 0x3042, 0x3042, LBP_ID },
- { 0x3043, 0x3043, LBP_NS },
+ { 0x3043, 0x3043, LBP_CJ },
{ 0x3044, 0x3044, LBP_ID },
- { 0x3045, 0x3045, LBP_NS },
+ { 0x3045, 0x3045, LBP_CJ },
{ 0x3046, 0x3046, LBP_ID },
- { 0x3047, 0x3047, LBP_NS },
+ { 0x3047, 0x3047, LBP_CJ },
{ 0x3048, 0x3048, LBP_ID },
- { 0x3049, 0x3049, LBP_NS },
+ { 0x3049, 0x3049, LBP_CJ },
{ 0x304A, 0x3062, LBP_ID },
- { 0x3063, 0x3063, LBP_NS },
+ { 0x3063, 0x3063, LBP_CJ },
{ 0x3064, 0x3082, LBP_ID },
- { 0x3083, 0x3083, LBP_NS },
+ { 0x3083, 0x3083, LBP_CJ },
{ 0x3084, 0x3084, LBP_ID },
- { 0x3085, 0x3085, LBP_NS },
+ { 0x3085, 0x3085, LBP_CJ },
{ 0x3086, 0x3086, LBP_ID },
- { 0x3087, 0x3087, LBP_NS },
+ { 0x3087, 0x3087, LBP_CJ },
{ 0x3088, 0x308D, LBP_ID },
- { 0x308E, 0x308E, LBP_NS },
+ { 0x308E, 0x308E, LBP_CJ },
{ 0x308F, 0x3094, LBP_ID },
- { 0x3095, 0x3096, LBP_NS },
+ { 0x3095, 0x3096, LBP_CJ },
{ 0x3099, 0x309A, LBP_CM },
{ 0x309B, 0x309E, LBP_NS },
{ 0x309F, 0x309F, LBP_ID },
- { 0x30A0, 0x30A1, LBP_NS },
+ { 0x30A0, 0x30A0, LBP_NS },
+ { 0x30A1, 0x30A1, LBP_CJ },
{ 0x30A2, 0x30A2, LBP_ID },
- { 0x30A3, 0x30A3, LBP_NS },
+ { 0x30A3, 0x30A3, LBP_CJ },
{ 0x30A4, 0x30A4, LBP_ID },
- { 0x30A5, 0x30A5, LBP_NS },
+ { 0x30A5, 0x30A5, LBP_CJ },
{ 0x30A6, 0x30A6, LBP_ID },
- { 0x30A7, 0x30A7, LBP_NS },
+ { 0x30A7, 0x30A7, LBP_CJ },
{ 0x30A8, 0x30A8, LBP_ID },
- { 0x30A9, 0x30A9, LBP_NS },
+ { 0x30A9, 0x30A9, LBP_CJ },
{ 0x30AA, 0x30C2, LBP_ID },
- { 0x30C3, 0x30C3, LBP_NS },
+ { 0x30C3, 0x30C3, LBP_CJ },
{ 0x30C4, 0x30E2, LBP_ID },
- { 0x30E3, 0x30E3, LBP_NS },
+ { 0x30E3, 0x30E3, LBP_CJ },
{ 0x30E4, 0x30E4, LBP_ID },
- { 0x30E5, 0x30E5, LBP_NS },
+ { 0x30E5, 0x30E5, LBP_CJ },
{ 0x30E6, 0x30E6, LBP_ID },
- { 0x30E7, 0x30E7, LBP_NS },
+ { 0x30E7, 0x30E7, LBP_CJ },
{ 0x30E8, 0x30ED, LBP_ID },
- { 0x30EE, 0x30EE, LBP_NS },
+ { 0x30EE, 0x30EE, LBP_CJ },
{ 0x30EF, 0x30F4, LBP_ID },
- { 0x30F5, 0x30F6, LBP_NS },
+ { 0x30F5, 0x30F6, LBP_CJ },
{ 0x30F7, 0x30FA, LBP_ID },
- { 0x30FB, 0x30FE, LBP_NS },
+ { 0x30FB, 0x30FB, LBP_NS },
+ { 0x30FC, 0x30FC, LBP_CJ },
+ { 0x30FD, 0x30FE, LBP_NS },
{ 0x30FF, 0x31E3, LBP_ID },
- { 0x31F0, 0x31FF, LBP_NS },
+ { 0x31F0, 0x31FF, LBP_CJ },
{ 0x3200, 0x3247, LBP_ID },
{ 0x3248, 0x324F, LBP_AI },
{ 0x3250, 0x4DBF, LBP_ID },
{ 0xA62A, 0xA66E, LBP_AL },
{ 0xA66F, 0xA672, LBP_CM },
{ 0xA673, 0xA673, LBP_AL },
- { 0xA67C, 0xA67D, LBP_CM },
- { 0xA67E, 0xA6EF, LBP_AL },
+ { 0xA674, 0xA67D, LBP_CM },
+ { 0xA67E, 0xA697, LBP_AL },
+ { 0xA69F, 0xA69F, LBP_CM },
+ { 0xA6A0, 0xA6EF, LBP_AL },
{ 0xA6F0, 0xA6F1, LBP_CM },
{ 0xA6F2, 0xA6F2, LBP_AL },
{ 0xA6F3, 0xA6F7, LBP_BA },
{ 0xAA5C, 0xAA5C, LBP_AL },
{ 0xAA5D, 0xAA5F, LBP_BA },
{ 0xAA60, 0xAADF, LBP_SA },
+ { 0xAAE0, 0xAAEA, LBP_AL },
+ { 0xAAEB, 0xAAEF, LBP_CM },
+ { 0xAAF0, 0xAAF1, LBP_BA },
+ { 0xAAF2, 0xAAF4, LBP_AL },
+ { 0xAAF5, 0xAAF6, LBP_CM },
{ 0xAB01, 0xABE2, LBP_AL },
{ 0xABE3, 0xABEA, LBP_CM },
{ 0xABEB, 0xABEB, LBP_BA },
{ 0xD800, 0xDFFF, LBP_SG },
{ 0xE000, 0xF8FF, LBP_XX },
{ 0xF900, 0xFAFF, LBP_ID },
- { 0xFB00, 0xFB1D, LBP_AL },
+ { 0xFB00, 0xFB17, LBP_AL },
+ { 0xFB1D, 0xFB1D, LBP_HL },
{ 0xFB1E, 0xFB1E, LBP_CM },
- { 0xFB1F, 0xFD3D, LBP_AL },
+ { 0xFB1F, 0xFB28, LBP_HL },
+ { 0xFB29, 0xFB29, LBP_AL },
+ { 0xFB2A, 0xFB4F, LBP_HL },
+ { 0xFB50, 0xFD3D, LBP_AL },
{ 0xFD3E, 0xFD3E, LBP_OP },
{ 0xFD3F, 0xFD3F, LBP_CL },
{ 0xFD50, 0xFDFB, LBP_AL },
{ 0xFF63, 0xFF64, LBP_CL },
{ 0xFF65, 0xFF65, LBP_NS },
{ 0xFF66, 0xFF66, LBP_AL },
- { 0xFF67, 0xFF70, LBP_NS },
+ { 0xFF67, 0xFF70, LBP_CJ },
{ 0xFF71, 0xFF9D, LBP_AL },
{ 0xFF9E, 0xFF9F, LBP_NS },
{ 0xFFA0, 0xFFDC, LBP_AL },
{ 0x110B0, 0x110BA, LBP_CM },
{ 0x110BB, 0x110BD, LBP_AL },
{ 0x110BE, 0x110C1, LBP_BA },
+ { 0x110D0, 0x110E8, LBP_AL },
+ { 0x110F0, 0x110F9, LBP_NU },
+ { 0x11100, 0x11102, LBP_CM },
+ { 0x11103, 0x11126, LBP_AL },
+ { 0x11127, 0x11134, LBP_CM },
+ { 0x11136, 0x1113F, LBP_NU },
+ { 0x11140, 0x11143, LBP_BA },
+ { 0x11180, 0x11182, LBP_CM },
+ { 0x11183, 0x111B2, LBP_AL },
+ { 0x111B3, 0x111C0, LBP_CM },
+ { 0x111C1, 0x111C4, LBP_AL },
+ { 0x111C5, 0x111C6, LBP_BA },
+ { 0x111C7, 0x111C7, LBP_AL },
+ { 0x111C8, 0x111C8, LBP_BA },
+ { 0x111D0, 0x111D9, LBP_NU },
+ { 0x11680, 0x116AA, LBP_AL },
+ { 0x116AB, 0x116B7, LBP_CM },
+ { 0x116C0, 0x116C9, LBP_NU },
{ 0x12000, 0x12462, LBP_AL },
{ 0x12470, 0x12473, LBP_BA },
{ 0x13000, 0x13257, LBP_AL },
{ 0x1328A, 0x13378, LBP_AL },
{ 0x13379, 0x13379, LBP_OP },
{ 0x1337A, 0x1337B, LBP_CL },
- { 0x1337C, 0x16A38, LBP_AL },
+ { 0x1337C, 0x16F50, LBP_AL },
+ { 0x16F51, 0x16F92, LBP_CM },
+ { 0x16F93, 0x16F9F, LBP_AL },
{ 0x1B000, 0x1B001, LBP_ID },
{ 0x1D000, 0x1D164, LBP_AL },
{ 0x1D165, 0x1D169, LBP_CM },
{ 0x1D242, 0x1D244, LBP_CM },
{ 0x1D245, 0x1D7CB, LBP_AL },
{ 0x1D7CE, 0x1D7FF, LBP_NU },
- { 0x1F000, 0x1F0DF, LBP_AL },
+ { 0x1EE00, 0x1EEF1, LBP_AL },
+ { 0x1F000, 0x1F0DF, LBP_ID },
{ 0x1F100, 0x1F12D, LBP_AI },
{ 0x1F12E, 0x1F12E, LBP_AL },
- { 0x1F130, 0x1F19A, LBP_AI },
- { 0x1F1E6, 0x1F1FF, LBP_AL },
- { 0x1F200, 0x1F251, LBP_ID },
- { 0x1F300, 0x1F773, LBP_AL },
+ { 0x1F130, 0x1F169, LBP_AI },
+ { 0x1F16A, 0x1F16B, LBP_AL },
+ { 0x1F170, 0x1F19A, LBP_AI },
+ { 0x1F1E6, 0x1F1FF, LBP_RI },
+ { 0x1F200, 0x1F3B4, LBP_ID },
+ { 0x1F3B5, 0x1F3B6, LBP_AL },
+ { 0x1F3B7, 0x1F3BB, LBP_ID },
+ { 0x1F3BC, 0x1F3BC, LBP_AL },
+ { 0x1F3BD, 0x1F49F, LBP_ID },
+ { 0x1F4A0, 0x1F4A0, LBP_AL },
+ { 0x1F4A1, 0x1F4A1, LBP_ID },
+ { 0x1F4A2, 0x1F4A2, LBP_AL },
+ { 0x1F4A3, 0x1F4A3, LBP_ID },
+ { 0x1F4A4, 0x1F4A4, LBP_AL },
+ { 0x1F4A5, 0x1F4AE, LBP_ID },
+ { 0x1F4AF, 0x1F4AF, LBP_AL },
+ { 0x1F4B0, 0x1F4B0, LBP_ID },
+ { 0x1F4B1, 0x1F4B2, LBP_AL },
+ { 0x1F4B3, 0x1F4FC, LBP_ID },
+ { 0x1F500, 0x1F506, LBP_AL },
+ { 0x1F507, 0x1F516, LBP_ID },
+ { 0x1F517, 0x1F524, LBP_AL },
+ { 0x1F525, 0x1F531, LBP_ID },
+ { 0x1F532, 0x1F543, LBP_AL },
+ { 0x1F550, 0x1F6C5, LBP_ID },
+ { 0x1F700, 0x1F773, LBP_AL },
{ 0x20000, 0x3FFFD, LBP_ID },
{ 0xE0001, 0xE01EF, LBP_CM },
{ 0xF0000, 0x10FFFD, LBP_XX },
* Line breaking in a Unicode sequence. Designed to be used in a
* generic text renderer.
*
- * Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
+ * Copyright (C) 2008-2012 Wu Yongwei <wuyongwei at gmail dot com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* Unicode 5.0.0:
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
*
- * This library has been updated according to Revision 24, for
- * Unicode 5.2.0:
- * <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
+ * This library has been updated according to Revision 30, for
+ * Unicode 6.2.0:
+ * <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
*
* The Unicode Terms of Use are available at
* <URL:http://www.unicode.org/copyright.html>
*
* Definition of language-specific data.
*
- * @version 2.0, 2010/01/03
+ * @version 2.2, 2012/10/06
* @author Wu Yongwei
*/
* Line breaking in a Unicode sequence. Designed to be used in a
* generic text renderer.
*
- * Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
+ * Copyright (C) 2008-2012 Wu Yongwei <wuyongwei at gmail dot com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* Unicode 5.0.0:
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
*
- * This library has been updated according to Revision 24, for
- * Unicode 5.2.0:
- * <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
+ * This library has been updated according to Revision 30, for
+ * Unicode 6.2.0:
+ * <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
*
* The Unicode Terms of Use are available at
* <URL:http://www.unicode.org/copyright.html>
* Definitions of internal data structures, declarations of global
* variables, and function prototypes for the line breaking algorithm.
*
- * @version 2.0, 2010/01/03
+ * @version 2.3, 2012/10/06
* @author Wu Yongwei
*/
/**
* Line break classes. This is a direct mapping of Table 1 of Unicode
- * Standard Annex 14, Revision 19.
+ * Standard Annex 14, Revision 30.
*/
enum LineBreakClass
{
LBP_PO, /**< Postfix */
LBP_NU, /**< Numeric */
LBP_AL, /**< Alphabetic */
+ LBP_HL, /**< Hebrew letter */
LBP_ID, /**< Ideographic */
LBP_IN, /**< Inseparable characters */
LBP_HY, /**< Hyphen */
LBP_JL, /**< Hangul L Jamo */
LBP_JV, /**< Hangul V Jamo */
LBP_JT, /**< Hangul T Jamo */
+ LBP_RI, /**< Regional indicator */
/* The following break classes are not treated in the pair table */
LBP_AI, /**< Ambiguous (alphabetic or ideograph) */
LBP_BK, /**< Break (mandatory) */
LBP_CB, /**< Contingent break */
+ LBP_CJ, /**< Conditional Japanese starter */
LBP_CR, /**< Carriage return */
LBP_LF, /**< Line feed */
LBP_NL, /**< Next line */
* Word breaking in a Unicode sequence. Designed to be used in a
* generic text renderer.
*
- * Copyright (C) 2011-2011 Tom Hacohen <tom@stosb.com>
+ * Copyright (C) 2012 Tom Hacohen <tom@stosb.com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* Implementation of the word breaking algorithm as described in Unicode
* Standard Annex 29.
*
- * @version 2.0, 2011/12/12
+ * @version 2.2, 2012/02/04
* @author Tom Hacohen
*/
-
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include "linebreakdef.h"
#include "wordbreak.h"
-#include "wordbreakdata.x"
+#include "wordbreakdata.c"
#define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0]))
-/* Init the wordbreak internals. */
+/**
+ * Initializes the wordbreak internals. It currently does nothing, but
+ * it may in the future.
+ */
void init_wordbreak(void)
{
- /* Currently does nothing, may be needed in the future. */
- return;
}
/**
*
* @param ch character to check
* @param wbp pointer to the wbp breaking properties array
- * @param len the size of the wbp array in number of items.
+ * @param len size of the wbp array in number of items
* @return the word breaking class if found; \c WBP_Any otherwise
*/
static enum WordBreakClass get_char_wb_class(
}
/**
- * Sets the break types in brks starting from posLast up to posStop.
+ * Sets the word break types to a specific value in a range.
*
- * It sets the inside chars to #WORDBREAK_INSIDECHAR and the rest to brkType.
- * Assumes brks is initialized - all the cells with #WORDBREAK_NOBREAK are
+ * For every character in the range, it sets the non-final code units to
+ * #WORDBREAK_INSIDEACHAR and the final code unit to \a brkType.
+ * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
* cells that we really don't want to break after.
*
- * @param s the string
- * @param brks[out] the breaks array to fill.
- * @param posStart the start position
- * @param posEnd the end position
- * @param len the length of the string
- * @param brkType the breaks type to use
- * @param get_next_char function to get the next UTF-32 character
+ * @param[in] s input string
+ * @param[out] brks breaks array to fill
+ * @param[in] posStart start position
+ * @param[in] posEnd end position (exclusive)
+ * @param[in] len length of the string
+ * @param[in] brkType breaks type to use
+ * @param[in] get_next_char function to get the next UTF-32 character
*/
-static void set_brks_to(const void *s,
+static void set_brks_to(
+ const void *s,
char *brks,
size_t posStart,
size_t posEnd,
char brkType,
get_next_char_t get_next_char)
{
- size_t posCur = posStart;
- while (posCur < posEnd)
+ size_t posNext = posStart;
+ while (posNext < posEnd)
{
- get_next_char(s, len, &posCur);
- for ( ; posStart < posCur - 1; ++posStart)
- {
- brks[posStart] = WORDBREAK_INSIDECHAR;
- }
- assert(posStart == posCur - 1);
+ utf32_t ch;
+ ch = get_next_char(s, len, &posNext);
+ assert(ch != EOS);
+ for (; posStart < posNext - 1; ++posStart)
+ brks[posStart] = WORDBREAK_INSIDEACHAR;
+ assert(posStart == posNext - 1);
/* Only set it if we haven't set it not to break before. */
if (brks[posStart] != WORDBREAK_NOBREAK)
brks[posStart] = brkType;
- posStart = posCur;
+ posStart = posNext;
}
}
-/* Checks to see if newline, cr, or lf. for WB3a and b */
+/* Checks to see if the class is newline, CR, or LF (rules WB3a and b). */
#define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \
- (cls == WBP_LF))
+ (cls == WBP_LF))
/**
* Sets the word breaking information for a generic input string.
char *brks,
get_next_char_t get_next_char)
{
- /* Previous class */
- enum WordBreakClass p_cls = WBP_Undefined;
- /* Strong previous class. */
- enum WordBreakClass sp_cls = WBP_Undefined;
+ enum WordBreakClass wbcLast = WBP_Undefined;
+ /* wbcSeqStart is the class that started the current sequence.
+ * WBP_Undefined is a special case that means "sot" (start of text).
+ * This value is the class that is at the start of the current rule
+ * matching sequence. For example, in case of Numeric+MidNum+Numeric
+ * it'll be Numeric all the way.
+ */
+ enum WordBreakClass wbcSeqStart = WBP_Undefined;
utf32_t ch;
+ size_t posNext = 0;
size_t posCur = 0;
- size_t posCurSt = 0;
size_t posLast = 0;
- /* FIXME: unused atm. */
+ /* TODO: Language-specific specialization. */
(void) lang;
-
- /* Init brks */
+ /* Init brks. */
memset(brks, WORDBREAK_BREAK, len);
- ch = get_next_char(s, len, &posCur);
+ ch = get_next_char(s, len, &posNext);
- /* WB3a, WB3b are implied. */
- for ( ; ch != EOS ; )
+ while (ch != EOS)
{
- /* Current class */
- enum WordBreakClass c_cls;
- c_cls = get_char_wb_class(ch, wb_prop_default,
- ARRAY_LEN(wb_prop_default));
+ enum WordBreakClass wbcCur;
+ wbcCur = get_char_wb_class(ch, wb_prop_default,
+ ARRAY_LEN(wb_prop_default));
- switch (c_cls)
+ switch (wbcCur)
{
case WBP_CR:
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
- get_next_char);
- sp_cls = c_cls;
- posLast = posCurSt;
+ /* WB3b */
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_BREAK, get_next_char);
+ wbcSeqStart = wbcCur;
+ posLast = posCur;
break;
case WBP_LF:
- if (sp_cls == WBP_CR) /* WB3 */
+ if (wbcSeqStart == WBP_CR) /* WB3 */
{
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
- get_next_char);
- sp_cls = c_cls;
- posLast = posCurSt;
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_NOBREAK, get_next_char);
+ wbcSeqStart = wbcCur;
+ posLast = posCur;
+ break;
}
- sp_cls = c_cls;
- posLast = posCurSt;
- break;
+ /* Fall through */
case WBP_Newline:
- /* WB3a, WB3b */
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
- get_next_char);
- sp_cls = c_cls;
- posLast = posCurSt;
+ /* WB3a,3b */
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_BREAK, get_next_char);
+ wbcSeqStart = wbcCur;
+ posLast = posCur;
break;
case WBP_Extend:
case WBP_Format:
- /* WB4 - If not the first char/after a newline (W3ab),
- * skip this class, set it to be the same as the prev, and mark
+ /* WB4 - If not the first char/after a newline (WB3a,3b), skip
+ * this class, set it to be the same as the prev, and mark
* brks not to break before them. */
- if ((sp_cls == WBP_Undefined) || IS_WB3ab(sp_cls))
+ if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
{
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
- get_next_char);
- sp_cls = c_cls;
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_BREAK, get_next_char);
+ wbcSeqStart = wbcCur;
}
else
{
/* It's surely not the first */
- brks[posCurSt - 1] = WORDBREAK_NOBREAK;
+ brks[posCur - 1] = WORDBREAK_NOBREAK;
/* "inherit" the previous class. */
- c_cls = p_cls;
+ wbcCur = wbcLast;
}
break;
case WBP_Katakana:
- if ((sp_cls == WBP_Katakana) || /* WB13 */
- (sp_cls == WBP_ExtendNumLet)) /* WB13b */
+ if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
+ (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
{
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
- get_next_char);
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_NOBREAK, get_next_char);
}
/* No rule found, reset */
else
{
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
- get_next_char);
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_BREAK, get_next_char);
}
- sp_cls = c_cls;
- posLast = posCurSt;
+ wbcSeqStart = wbcCur;
+ posLast = posCur;
break;
case WBP_ALetter:
- if ((sp_cls == WBP_ALetter) || /* WB5,6,7 */
- ((sp_cls == WBP_Numeric) && (p_cls == WBP_Numeric)) || /* WB10 */
- (sp_cls == WBP_ExtendNumLet)) /* WB13b */
+ if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */
+ (wbcLast == WBP_Numeric) || /* WB10 */
+ (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
{
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
- get_next_char);
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_NOBREAK, get_next_char);
}
/* No rule found, reset */
else
{
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
- get_next_char);
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_BREAK, get_next_char);
}
- sp_cls = c_cls;
- posLast = posCurSt;
+ wbcSeqStart = wbcCur;
+ posLast = posCur;
break;
case WBP_MidNumLet:
- if ((p_cls == WBP_ALetter) || /* WBP6,7 */
- (p_cls == WBP_Numeric)) /* WBP11,12 */
+ if ((wbcLast == WBP_ALetter) || /* WB6,7 */
+ (wbcLast == WBP_Numeric)) /* WB11,12 */
{
/* Go on */
}
else
{
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
- get_next_char);
- sp_cls = c_cls;
- posLast = posCurSt;
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_BREAK, get_next_char);
+ wbcSeqStart = wbcCur;
+ posLast = posCur;
}
break;
case WBP_MidLetter:
- if (p_cls == WBP_ALetter) /* WBP6,7 */
+ if (wbcLast == WBP_ALetter) /* WB6,7 */
{
/* Go on */
}
else
{
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
- get_next_char);
- sp_cls = c_cls;
- posLast = posCurSt;
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_BREAK, get_next_char);
+ wbcSeqStart = wbcCur;
+ posLast = posCur;
}
break;
case WBP_MidNum:
- if (p_cls == WBP_Numeric) /* WBP11,12 */
+ if (wbcLast == WBP_Numeric) /* WB11,12 */
{
/* Go on */
}
else
{
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
- get_next_char);
- sp_cls = c_cls;
- posLast = posCurSt;
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_BREAK, get_next_char);
+ wbcSeqStart = wbcCur;
+ posLast = posCur;
}
break;
case WBP_Numeric:
- if ((sp_cls == WBP_Numeric) || /* WB8,11,12 */
- ((sp_cls == WBP_ALetter) && (p_cls == WBP_ALetter)) || /* WB9 */
- (sp_cls == WBP_ExtendNumLet)) /* WB13b */
+ if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
+ (wbcLast == WBP_ALetter) || /* WB9 */
+ (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
{
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
- get_next_char);
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_NOBREAK, get_next_char);
}
/* No rule found, reset */
else
{
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
- get_next_char);
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_BREAK, get_next_char);
}
- sp_cls = c_cls;
- posLast = posCurSt;
+ wbcSeqStart = wbcCur;
+ posLast = posCur;
break;
case WBP_ExtendNumLet:
/* WB13a,13b */
- if ((sp_cls == p_cls) &&
- ((p_cls == WBP_ALetter) ||
- (p_cls == WBP_Numeric) ||
- (p_cls == WBP_Katakana) ||
- (p_cls == WBP_ExtendNumLet)))
+ if ((wbcSeqStart == wbcLast) &&
+ ((wbcLast == WBP_ALetter) ||
+ (wbcLast == WBP_Numeric) ||
+ (wbcLast == WBP_Katakana) ||
+ (wbcLast == WBP_ExtendNumLet)))
{
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
- get_next_char);
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_NOBREAK, get_next_char);
}
/* No rule found, reset */
else
{
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
- get_next_char);
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_BREAK, get_next_char);
}
- sp_cls = c_cls;
- posLast = posCurSt;
+ wbcSeqStart = wbcCur;
+ posLast = posCur;
break;
case WBP_Any:
/* Allow breaks and reset */
- set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
- get_next_char);
- sp_cls = c_cls;
- posLast = posCurSt;
+ set_brks_to(s, brks, posLast, posCur, len,
+ WORDBREAK_BREAK, get_next_char);
+ wbcSeqStart = wbcCur;
+ posLast = posCur;
break;
default:
break;
}
- p_cls = c_cls;
- posCurSt = posCur;
- ch = get_next_char(s, len, &posCur);
+ wbcLast = wbcCur;
+ posCur = posNext;
+ ch = get_next_char(s, len, &posNext);
}
/* WB2 */
- set_brks_to(s, brks, posLast, posCur, len, WORDBREAK_BREAK,
- get_next_char);
+ set_brks_to(s, brks, posLast, posNext, len,
+ WORDBREAK_BREAK, get_next_char);
}
/**
* Word breaking in a Unicode sequence. Designed to be used in a
* generic text renderer.
*
- * Copyright (C) 2011-2011 Tom Hacohen <tom@stosb.com>
+ * Copyright (C) 2012 Tom Hacohen <tom@stosb.com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
*
* Header file for the word breaking (segmentation) algorithm.
*
- * @version 2.0, 2011/12/12
+ * @version 2.2, 2012/02/04
* @author Tom Hacohen
*/
#define WORDBREAK_H
#include <stddef.h>
-#include <linebreak.h>
+#include "linebreak.h"
#ifdef __cplusplus
extern "C" {
#endif
-#define WORDBREAK_BREAK 0 /* Break found */
-#define WORDBREAK_NOBREAK 1 /**< Break not found */
-#define WORDBREAK_INSIDECHAR 2 /**< A UTF-8/16 sequence is unfinished */
+#define WORDBREAK_BREAK 0 /**< Break is allowed */
+#define WORDBREAK_NOBREAK 1 /**< No break is allowed */
+#define WORDBREAK_INSIDEACHAR 2 /**< A UTF-8/16 sequence is unfinished */
void init_wordbreak(void);
void set_wordbreaks_utf8(
# WordBreakProperty-6.0.0.txt
# Date: 2010-08-19, 00:48:48 GMT [MD]
*/
+
#include "linebreak.h"
#include "wordbreakdef.h"
+
static struct WordBreakProperties wb_prop_default[] = {
{0x000A, 0x000A, WBP_LF},
{0x000B, 0x000C, WBP_Newline},
* Word breaking in a Unicode sequence. Designed to be used in a
* generic text renderer.
*
- * Copyright (C) 2011-2011 Tom Hacohen <tom@stosb.com>
+ * Copyright (C) 2012 Tom Hacohen <tom@stosb.com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* Definitions of internal data structures, declarations of global
* variables, and function prototypes for the word breaking algorithm.
*
- * @version 2.0, 2011/12/12
+ * @version 2.1, 2012/01/18
* @author Tom Hacohen
*/
enum WordBreakClass
{
WBP_Undefined,
-
- /* The following break classes are treated in the pair table. */
WBP_CR,
WBP_LF,
WBP_Newline,