-/* vim: set tabstop=4 shiftwidth=4: */
+/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
/*
* Line breaking in a Unicode sequence. Designed to be used in a
* generic text renderer.
*
- * Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
+ * Copyright (C) 2008-2013 Wu Yongwei <wuyongwei at gmail dot com>
+ * Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* distribution.
*
* The main reference is Unicode Standard Annex 14 (UAX #14):
- * <URL:http://www.unicode.org/reports/tr14/>
+ * <URL:http://www.unicode.org/reports/tr14/>
*
* When this library was designed, this annex was at Revision 19, for
* Unicode 5.0.0:
- * <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
+ * <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
*
- * This library has been updated according to Revision 24, for
- * Unicode 5.2.0:
- * <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
+ * This library has been updated according to Revision 30, for
+ * Unicode 6.2.0:
+ * <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
*
* The Unicode Terms of Use are available at
- * <URL:http://www.unicode.org/copyright.html>
+ * <URL:http://www.unicode.org/copyright.html>
*/
/**
- * @file linebreak.c
+ * @file linebreak.c
*
* Implementation of the line breaking algorithm as described in Unicode
* Standard Annex 14.
*
- * @version 2.0, 2010/01/03
- * @author Wu Yongwei
+ * @version 2.5, 2013/11/14
+ * @author Wu Yongwei
+ * @author Petr Filipsky
*/
#include <assert.h>
#include "linebreakdef.h"
/**
+ * Special value used internally to indicate an undefined break result.
+ */
+#define LINEBREAK_UNDEFINED -1
+
+/**
* Size of the second-level index to the line breaking properties.
*/
#define LINEBREAK_INDEX_SIZE 40
*/
enum BreakAction
{
- DIR_BRK, /**< Direct break opportunity */
- IND_BRK, /**< Indirect break opportunity */
- CMI_BRK, /**< Indirect break opportunity for combining marks */
- CMP_BRK, /**< Prohibited break for combining marks */
- PRH_BRK /**< Prohibited break */
+ DIR_BRK, /**< Direct break opportunity */
+ IND_BRK, /**< Indirect break opportunity */
+ CMI_BRK, /**< Indirect break opportunity for combining marks */
+ CMP_BRK, /**< Prohibited break for combining marks */
+ PRH_BRK /**< Prohibited break */
};
/**
* Break action pair table. This is a direct mapping of Table 2 of
- * Unicode Standard Annex 14, Revision 24.
+ * Unicode Standard Annex 14, Revision 30.
*/
-static enum BreakAction baTable[LBP_JT][LBP_JT] = {
- { /* OP */
- PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, CMP_BRK,
- PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK },
- { /* CL */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* CP */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* QU */
- PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
- IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
- { /* GL */
- IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
- IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
- { /* NS */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* EX */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* SY */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* IS */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* PR */
- IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
- { /* PO */
- IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* NU */
- IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* AL */
- IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* ID */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* IN */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* HY */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* BA */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* BB */
- IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
- IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
- { /* B2 */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* ZW */
- DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK, DIR_BRK,
- DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* CM */
- IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
- { /* WJ */
- IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
- IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
- { /* H2 */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
- { /* H3 */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK },
- { /* JL */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },
- { /* JV */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
- { /* JT */
- DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
- PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
- IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
- PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK }
+static enum BreakAction baTable[LBP_RI][LBP_RI] = {
+ { /* OP */
+ PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
+ CMP_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
+ PRH_BRK },
+ { /* CL */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* CP */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* QU */
+ PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ IND_BRK },
+ { /* GL */
+ IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ IND_BRK },
+ { /* NS */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* EX */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* SY */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* IS */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* PR */
+ IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
+ IND_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK },
+ { /* PO */
+ IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* NU */
+ IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* AL */
+ IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* HL */
+ IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* ID */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* IN */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* HY */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* BA */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* BB */
+ IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ IND_BRK },
+ { /* B2 */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* ZW */
+ DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* CM */
+ IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* WJ */
+ IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
+ IND_BRK },
+ { /* H2 */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
+ DIR_BRK },
+ { /* H3 */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
+ DIR_BRK },
+ { /* JL */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
+ DIR_BRK },
+ { /* JV */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
+ DIR_BRK },
+ { /* JT */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
+ DIR_BRK },
+ { /* RI */
+ DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
+ PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
+ CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
+ IND_BRK },
};
/**
*/
struct LineBreakPropertiesIndex
{
- utf32_t end; /**< End coding point */
- struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */
+ utf32_t end; /**< End coding point */
+ struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */
};
/**
*/
static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
{
- { 0xFFFFFFFF, lb_prop_default }
+ { 0xFFFFFFFF, lb_prop_default }
};
/**
*/
void init_linebreak(void)
{
- size_t i;
- size_t iPropDefault;
- size_t len;
- size_t step;
-
- len = 0;
- while (lb_prop_default[len].prop != LBP_Undefined)
- ++len;
- step = len / LINEBREAK_INDEX_SIZE;
- iPropDefault = 0;
- for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
- {
- lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
- iPropDefault += step;
- lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
- }
- lb_prop_index[--i].end = 0xFFFFFFFF;
+ size_t i;
+ size_t iPropDefault;
+ size_t len;
+ size_t step;
+
+ len = 0;
+ while (lb_prop_default[len].prop != LBP_Undefined)
+ ++len;
+ step = len / LINEBREAK_INDEX_SIZE;
+ iPropDefault = 0;
+ for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
+ {
+ lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
+ iPropDefault += step;
+ lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
+ }
+ lb_prop_index[--i].end = 0xFFFFFFFF;
}
/**
* Gets the language-specific line breaking properties.
*
- * @param lang language of the text
- * @return pointer to the language-specific line breaking
- * properties array if found; \c NULL otherwise
+ * @param lang language of the text
+ * @return pointer to the language-specific line breaking
+ * properties array if found; \c NULL otherwise
*/
static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
{
- struct LineBreakPropertiesLang *lbplIter;
- if (lang != NULL)
- {
- for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
- {
- if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
- {
- return lbplIter->lbp;
- }
- }
- }
- return NULL;
+ struct LineBreakPropertiesLang *lbplIter;
+ if (lang != NULL)
+ {
+ for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
+ {
+ if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
+ {
+ return lbplIter->lbp;
+ }
+ }
+ }
+ return NULL;
}
/**
* Gets the line breaking class of a character from a line breaking
* properties array.
*
- * @param ch character to check
- * @param lbp pointer to the line breaking properties array
- * @return the line breaking class if found; \c LBP_XX otherwise
+ * @param ch character to check
+ * @param lbp pointer to the line breaking properties array
+ * @return the line breaking class if found; \c LBP_XX otherwise
*/
static enum LineBreakClass get_char_lb_class(
- utf32_t ch,
- struct LineBreakProperties *lbp)
+ utf32_t ch,
+ struct LineBreakProperties *lbp)
{
- while (lbp->prop != LBP_Undefined && ch >= lbp->start)
- {
- if (ch <= lbp->end)
- return lbp->prop;
- ++lbp;
- }
- return LBP_XX;
+ while (lbp->prop != LBP_Undefined && ch >= lbp->start)
+ {
+ if (ch <= lbp->end)
+ return lbp->prop;
+ ++lbp;
+ }
+ return LBP_XX;
}
/**
* Gets the line breaking class of a character from the default line
* breaking properties array.
*
- * @param ch character to check
- * @return the line breaking class if found; \c LBP_XX otherwise
+ * @param ch character to check
+ * @return the line breaking class if found; \c LBP_XX otherwise
*/
static enum LineBreakClass get_char_lb_class_default(
- utf32_t ch)
+ utf32_t ch)
{
- size_t i = 0;
- while (ch > lb_prop_index[i].end)
- ++i;
- assert(i < LINEBREAK_INDEX_SIZE);
- return get_char_lb_class(ch, lb_prop_index[i].lbp);
+ size_t i = 0;
+ while (ch > lb_prop_index[i].end)
+ ++i;
+ assert(i < LINEBREAK_INDEX_SIZE);
+ return get_char_lb_class(ch, lb_prop_index[i].lbp);
}
/**
* and then the default data if there is no language-specific property
* available for the character.
*
- * @param ch character to check
- * @param lbpLang pointer to the language-specific line breaking
- * properties array
- * @return the line breaking class if found; \c LBP_XX
- * otherwise
+ * @param ch character to check
+ * @param lbpLang pointer to the language-specific line breaking
+ * properties array
+ * @return the line breaking class if found; \c LBP_XX
+ * otherwise
*/
static enum LineBreakClass get_char_lb_class_lang(
- utf32_t ch,
- struct LineBreakProperties *lbpLang)
+ utf32_t ch,
+ struct LineBreakProperties *lbpLang)
{
- enum LineBreakClass lbcResult;
-
- /* Find the language-specific line breaking class for a character */
- if (lbpLang)
- {
- lbcResult = get_char_lb_class(ch, lbpLang);
- if (lbcResult != LBP_XX)
- return lbcResult;
- }
-
- /* Find the generic language-specific line breaking class, if no
- * language context is provided, or language-specific data are not
- * available for the specific character in the specified language */
- return get_char_lb_class_default(ch);
+ enum LineBreakClass lbcResult;
+
+ /* Find the language-specific line breaking class for a character */
+ if (lbpLang)
+ {
+ lbcResult = get_char_lb_class(ch, lbpLang);
+ if (lbcResult != LBP_XX)
+ return lbcResult;
+ }
+
+ /* Find the generic language-specific line breaking class, if no
+ * language context is provided, or language-specific data are not
+ * available for the specific character in the specified language */
+ return get_char_lb_class_default(ch);
}
/**
* characters. They are treated in a simplistic way in this
* implementation.
*
- * @param lbc line breaking class to resolve
- * @param lang language of the text
- * @return the resolved line breaking class
+ * @param lbc line breaking class to resolve
+ * @param lang language of the text
+ * @return the resolved line breaking class
*/
static enum LineBreakClass resolve_lb_class(
- enum LineBreakClass lbc,
- const char *lang)
+ enum LineBreakClass lbc,
+ const char *lang)
+{
+ switch (lbc)
+ {
+ case LBP_AI:
+ if (lang != NULL &&
+ (strncmp(lang, "zh", 2) == 0 || /* Chinese */
+ strncmp(lang, "ja", 2) == 0 || /* Japanese */
+ strncmp(lang, "ko", 2) == 0)) /* Korean */
+ {
+ return LBP_ID;
+ }
+ else
+ {
+ return LBP_AL;
+ }
+ case LBP_CJ:
+ /* Simplified for `normal' line breaking. See
+ * <url:http://www.unicode.org/reports/tr14/tr14-30.html#CJ>
+ * for details. */
+ return LBP_ID;
+ case LBP_SA:
+ case LBP_SG:
+ case LBP_XX:
+ return LBP_AL;
+ default:
+ return lbc;
+ }
+}
+
+/**
+ * Treats specially for the first character in a line.
+ *
+ * @param[in,out] lbpCtx pointer to the line breaking context
+ * @pre \a lbpCtx->lbcCur has a valid line break class
+ * @post \a lbpCtx->lbcCur has the updated line break class
+ */
+static void treat_first_char(
+ struct LineBreakContext* lbpCtx)
+{
+ switch (lbpCtx->lbcCur)
+ {
+ case LBP_LF:
+ case LBP_NL:
+ lbpCtx->lbcCur = LBP_BK; /* Rule LB5 */
+ break;
+ case LBP_CB:
+ lbpCtx->lbcCur = LBP_BA; /* Rule LB20 */
+ break;
+ case LBP_SP:
+ lbpCtx->lbcCur = LBP_WJ; /* Leading space treated as WJ */
+ break;
+ default:
+ break;
+ }
+}
+
+/**
+ * Tries telling the line break opportunity by simple rules.
+ *
+ * @param[in,out] lbpCtx pointer to the line breaking context
+ * @pre \a lbpCtx->lbcCur has the current line break
+ * class; and \a lbpCtx->lbcNew has the line
+ * break class for the next character
+ * @post \a lbpCtx->lbcCur has the updated line break
+ * class
+ * @return break result, one of #LINEBREAK_MUSTBREAK,
+ * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
+ * if identified; or #LINEBREAK_UNDEFINED if
+ * table lookup is needed
+ */
+static int get_lb_result_simple(
+ struct LineBreakContext* lbpCtx)
+{
+ if (lbpCtx->lbcCur == LBP_BK
+ || (lbpCtx->lbcCur == LBP_CR && lbpCtx->lbcNew != LBP_LF))
+ {
+ return LINEBREAK_MUSTBREAK; /* Rules LB4 and LB5 */
+ }
+
+ switch (lbpCtx->lbcNew)
+ {
+ case LBP_SP:
+ return LINEBREAK_NOBREAK; /* Rule LB7; no change to lbcCur */
+ case LBP_BK:
+ case LBP_LF:
+ case LBP_NL:
+ lbpCtx->lbcCur = LBP_BK; /* Mandatory break after */
+ return LINEBREAK_NOBREAK; /* Rule LB6 */
+ case LBP_CR:
+ lbpCtx->lbcCur = LBP_CR;
+ return LINEBREAK_NOBREAK; /* Rule LB6 */
+ case LBP_CB:
+ lbpCtx->lbcCur = LBP_BA;
+ return LINEBREAK_ALLOWBREAK; /* Rule LB20 */
+ default:
+ return LINEBREAK_UNDEFINED; /* Table lookup is needed */
+ }
+}
+
+/**
+ * Tells the line break opportunity by table lookup.
+ *
+ * @param[in,out] lbpCtx pointer to the line breaking context
+ * @pre \a lbpCtx->lbcCur has the current line break
+ * class; \a lbpCtx->lbcLast has the line break
+ * class for the last character; and \a
+ * lbcCur->lbcNew has the line break class for
+ * the next character
+ * @post \a lbpCtx->lbcCur has the updated line break
+ * class
+ * @return break result, one of #LINEBREAK_MUSTBREAK,
+ * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
+ */
+static int get_lb_result_lookup(
+ struct LineBreakContext* lbpCtx)
+{
+ /* TODO: Rule LB21a, as introduced by Revision 28 of UAX#14, is not
+ * yet implemented below. */
+ int brk = LINEBREAK_UNDEFINED;
+ assert((lbpCtx->lbcCur > 0) && (lbpCtx->lbcCur <= LBP_RI));
+ assert((lbpCtx->lbcNew > 0) && (lbpCtx->lbcNew <= LBP_RI));
+
+ enum LineBreakClass lbcCur, lbcNew;
+
+ switch (lbpCtx->lbcCur)
+ {
+ case LBP_H2: /**< Hangul LV */
+ case LBP_H3: /**< Hangul LVT */
+ case LBP_JL: /**< Hangul L Jamo */
+ case LBP_JV: /**< Hangul V Jamo */
+ case LBP_JT: /**< Hangul T Jamo */
+ lbcCur = LBP_AL;
+ break;
+ default:
+ lbcCur = lbpCtx->lbcCur;
+ break;
+ }
+
+ switch (lbpCtx->lbcNew)
+ {
+ case LBP_H2: /**< Hangul LV */
+ case LBP_H3: /**< Hangul LVT */
+ case LBP_JL: /**< Hangul L Jamo */
+ case LBP_JV: /**< Hangul V Jamo */
+ case LBP_JT: /**< Hangul T Jamo */
+ lbcNew = LBP_AL;
+ break;
+ default:
+ lbcNew = lbpCtx->lbcNew;
+ break;
+ }
+
+ switch (baTable[lbcCur - 1][lbcNew - 1])
+ {
+ case DIR_BRK:
+ brk = LINEBREAK_ALLOWBREAK;
+ break;
+ case CMI_BRK:
+ case IND_BRK:
+ brk = (lbpCtx->lbcLast == LBP_SP)
+ ? LINEBREAK_ALLOWBREAK
+ : LINEBREAK_NOBREAK;
+ break;
+ case CMP_BRK:
+ brk = LINEBREAK_NOBREAK;
+ if (lbpCtx->lbcLast != LBP_SP)
+ return brk; /* Do not update lbcCur */
+ break;
+ case PRH_BRK:
+ brk = LINEBREAK_NOBREAK;
+ break;
+ }
+
+ lbpCtx->lbcCur = lbpCtx->lbcNew;
+ return brk;
+}
+
+/**
+ * Initializes line breaking context for a given language.
+ *
+ * @param[in,out] lbpCtx pointer to the line breaking context
+ * @param[in] ch the first character to process
+ * @param[in] lang language of the input
+ * @post the line breaking context is initialized
+ */
+void lb_init_break_context(
+ struct LineBreakContext* lbpCtx,
+ utf32_t ch,
+ const char* lang)
+{
+ lbpCtx->lang = lang;
+ lbpCtx->lbpLang = get_lb_prop_lang(lang);
+ lbpCtx->lbcLast = LBP_Undefined;
+ lbpCtx->lbcNew = LBP_Undefined;
+ lbpCtx->lbcCur = resolve_lb_class(
+ get_char_lb_class_lang(ch, lbpCtx->lbpLang),
+ lbpCtx->lang);
+ treat_first_char(lbpCtx);
+}
+
+/**
+ * Updates LineBreakingContext for the next code point and returns
+ * the detected break.
+ *
+ * @param[in,out] lbpCtx pointer to the line breaking context
+ * @param[in] ch Unicode code point
+ * @return break result, one of #LINEBREAK_MUSTBREAK,
+ * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
+ * @post the line breaking context is updated
+ */
+int lb_process_next_char(
+ struct LineBreakContext* lbpCtx,
+ utf32_t ch )
{
- switch (lbc)
- {
- case LBP_AI:
- if (lang != NULL &&
- (strncmp(lang, "zh", 2) == 0 || /* Chinese */
- strncmp(lang, "ja", 2) == 0 || /* Japanese */
- strncmp(lang, "ko", 2) == 0)) /* Korean */
- {
- return LBP_ID;
- }
- /* Fall through */
- case LBP_SA:
- case LBP_SG:
- case LBP_XX:
- return LBP_AL;
- default:
- return lbc;
- }
+ int brk;
+
+ lbpCtx->lbcLast = lbpCtx->lbcNew;
+ lbpCtx->lbcNew = get_char_lb_class_lang(ch, lbpCtx->lbpLang);
+ brk = get_lb_result_simple(lbpCtx);
+ switch (brk)
+ {
+ case LINEBREAK_MUSTBREAK:
+ lbpCtx->lbcCur = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
+ treat_first_char(lbpCtx);
+ break;
+ case LINEBREAK_UNDEFINED:
+ lbpCtx->lbcNew = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
+ brk = get_lb_result_lookup(lbpCtx);
+ break;
+ default:
+ break;
+ }
+ return brk;
}
/**
* be advanced to the next complete character, unless the end of string
* is reached in the middle of a UTF-8 sequence.
*
- * @param[in] s input UTF-8 string
- * @param[in] len length of the string in bytes
- * @param[in,out] ip pointer to the index
- * @return the Unicode character beginning at the index; or
- * #EOS if end of input is encountered
+ * @param[in] s input UTF-8 string
+ * @param[in] len length of the string in bytes
+ * @param[in,out] ip pointer to the index
+ * @return the Unicode character beginning at the index; or
+ * #EOS if end of input is encountered
*/
utf32_t lb_get_next_char_utf8(
- const utf8_t *s,
- size_t len,
- size_t *ip)
+ const utf8_t *s,
+ size_t len,
+ size_t *ip)
{
- utf8_t ch;
- utf32_t res;
-
- assert(*ip <= len);
- if (*ip == len)
- return EOS;
- ch = s[*ip];
-
- if (ch < 0xC2 || ch > 0xF4)
- { /* One-byte sequence, tail (should not occur), or invalid */
- *ip += 1;
- return ch;
- }
- else if (ch < 0xE0)
- { /* Two-byte sequence */
- if (*ip + 2 > len)
- return EOS;
- res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
- *ip += 2;
- return res;
- }
- else if (ch < 0xF0)
- { /* Three-byte sequence */
- if (*ip + 3 > len)
- return EOS;
- res = ((ch & 0x0F) << 12) +
- ((s[*ip + 1] & 0x3F) << 6) +
- ((s[*ip + 2] & 0x3F));
- *ip += 3;
- return res;
- }
- else
- { /* Four-byte sequence */
- if (*ip + 4 > len)
- return EOS;
- res = ((ch & 0x07) << 18) +
- ((s[*ip + 1] & 0x3F) << 12) +
- ((s[*ip + 2] & 0x3F) << 6) +
- ((s[*ip + 3] & 0x3F));
- *ip += 4;
- return res;
- }
+ utf8_t ch;
+ utf32_t res;
+
+ assert(*ip <= len);
+ if (*ip == len)
+ return EOS;
+ ch = s[*ip];
+
+ if (ch < 0xC2 || ch > 0xF4)
+ { /* One-byte sequence, tail (should not occur), or invalid */
+ *ip += 1;
+ return ch;
+ }
+ else if (ch < 0xE0)
+ { /* Two-byte sequence */
+ if (*ip + 2 > len)
+ return EOS;
+ res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
+ *ip += 2;
+ return res;
+ }
+ else if (ch < 0xF0)
+ { /* Three-byte sequence */
+ if (*ip + 3 > len)
+ return EOS;
+ res = ((ch & 0x0F) << 12) +
+ ((s[*ip + 1] & 0x3F) << 6) +
+ ((s[*ip + 2] & 0x3F));
+ *ip += 3;
+ return res;
+ }
+ else
+ { /* Four-byte sequence */
+ if (*ip + 4 > len)
+ return EOS;
+ res = ((ch & 0x07) << 18) +
+ ((s[*ip + 1] & 0x3F) << 12) +
+ ((s[*ip + 2] & 0x3F) << 6) +
+ ((s[*ip + 3] & 0x3F));
+ *ip += 4;
+ return res;
+ }
}
/**
* be advanced to the next complete character, unless the end of string
* is reached in the middle of a UTF-16 surrogate pair.
*
- * @param[in] s input UTF-16 string
- * @param[in] len length of the string in words
- * @param[in,out] ip pointer to the index
- * @return the Unicode character beginning at the index; or
- * #EOS if end of input is encountered
+ * @param[in] s input UTF-16 string
+ * @param[in] len length of the string in words
+ * @param[in,out] ip pointer to the index
+ * @return the Unicode character beginning at the index; or
+ * #EOS if end of input is encountered
*/
utf32_t lb_get_next_char_utf16(
- const utf16_t *s,
- size_t len,
- size_t *ip)
+ const utf16_t *s,
+ size_t len,
+ size_t *ip)
{
- utf16_t ch;
-
- assert(*ip <= len);
- if (*ip == len)
- return EOS;
- ch = s[(*ip)++];
-
- if (ch < 0xD800 || ch > 0xDBFF)
- { /* If the character is not a high surrogate */
- return ch;
- }
- if (*ip == len)
- { /* If the input ends here (an error) */
- --(*ip);
- return EOS;
- }
- if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
- { /* If the next character is not the low surrogate (an error) */
- return ch;
- }
- /* Return the constructed character and advance the index again */
- return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
+ utf16_t ch;
+
+ assert(*ip <= len);
+ if (*ip == len)
+ return EOS;
+ ch = s[(*ip)++];
+
+ if (ch < 0xD800 || ch > 0xDBFF)
+ { /* If the character is not a high surrogate */
+ return ch;
+ }
+ if (*ip == len)
+ { /* If the input ends here (an error) */
+ --(*ip);
+ return EOS;
+ }
+ if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
+ { /* If the next character is not the low surrogate (an error) */
+ return ch;
+ }
+ /* Return the constructed character and advance the index again */
+ return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
}
/**
* Gets the next Unicode character in a UTF-32 sequence. The index will
* be advanced to the next character.
*
- * @param[in] s input UTF-32 string
- * @param[in] len length of the string in dwords
- * @param[in,out] ip pointer to the index
- * @return the Unicode character beginning at the index; or
- * #EOS if end of input is encountered
+ * @param[in] s input UTF-32 string
+ * @param[in] len length of the string in dwords
+ * @param[in,out] ip pointer to the index
+ * @return the Unicode character beginning at the index; or
+ * #EOS if end of input is encountered
*/
utf32_t lb_get_next_char_utf32(
- const utf32_t *s,
- size_t len,
- size_t *ip)
+ const utf32_t *s,
+ size_t len,
+ size_t *ip)
{
- assert(*ip <= len);
- if (*ip == len)
- return EOS;
- return s[(*ip)++];
+ assert(*ip <= len);
+ if (*ip == len)
+ return EOS;
+ return s[(*ip)++];
}
/**
* Sets the line breaking information for a generic input string.
*
- * @param[in] s input string
- * @param[in] len length of the input
- * @param[in] lang language of the input
- * @param[out] brks pointer to the output breaking data,
- * containing #LINEBREAK_MUSTBREAK,
- * #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
- * or #LINEBREAK_INSIDEACHAR
- * @param[in] get_next_char function to get the next UTF-32 character
+ * @param[in] s input string
+ * @param[in] len length of the input
+ * @param[in] lang language of the input
+ * @param[out] brks pointer to the output breaking data,
+ * containing #LINEBREAK_MUSTBREAK,
+ * #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
+ * or #LINEBREAK_INSIDEACHAR
+ * @param[in] get_next_char function to get the next UTF-32 character
*/
void set_linebreaks(
- const void *s,
- size_t len,
- const char *lang,
- char *brks,
- get_next_char_t get_next_char)
+ const void *s,
+ size_t len,
+ const char *lang,
+ char *brks,
+ get_next_char_t get_next_char)
{
- utf32_t ch;
- enum LineBreakClass lbcCur;
- enum LineBreakClass lbcNew;
- enum LineBreakClass lbcLast;
- struct LineBreakProperties *lbpLang;
- size_t posCur = 0;
- size_t posLast = 0;
- // TIZEN ONLY : (2013.08.19) for special processing at Zero-width space character
- int zw_flag = 0;
- //
-
- --posLast; /* To be ++'d later */
- ch = get_next_char(s, len, &posCur);
- if (ch == EOS)
- return;
- lbpLang = get_lb_prop_lang(lang);
- lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang);
- lbcNew = LBP_Undefined;
-
-nextline:
-
- /* Special treatment for the first character */
- switch (lbcCur)
- {
- case LBP_LF:
- case LBP_NL:
- lbcCur = LBP_BK;
- break;
- case LBP_CB:
- lbcCur = LBP_BA;
- break;
- case LBP_SP:
- lbcCur = LBP_WJ;
- break;
- default:
- break;
- }
-
- /* Process a line till an explicit break or end of string */
- for (;;)
- {
- for (++posLast; posLast < posCur - 1; ++posLast)
- {
- brks[posLast] = LINEBREAK_INSIDEACHAR;
- }
- assert(posLast == posCur - 1);
- lbcLast = lbcNew;
- ch = get_next_char(s, len, &posCur);
- if (ch == EOS)
- break;
- lbcNew = get_char_lb_class_lang(ch, lbpLang);
- if (lbcCur == LBP_BK || (lbcCur == LBP_CR && lbcNew != LBP_LF))
- {
- brks[posLast] = LINEBREAK_MUSTBREAK;
- lbcCur = resolve_lb_class(lbcNew, lang);
- goto nextline;
- }
-
- // TIZEN ONLY : (2013.08.19) for special processing at Zero-width space character
- /*
- switch (lbcNew)
- {
- case LBP_SP:
- brks[posLast] = LINEBREAK_NOBREAK;
- continue;
- case LBP_BK:
- case LBP_LF:
- case LBP_NL:
- brks[posLast] = LINEBREAK_NOBREAK;
- lbcCur = LBP_BK;
- continue;
- case LBP_CR:
- brks[posLast] = LINEBREAK_NOBREAK;
- lbcCur = LBP_CR;
- continue;
- case LBP_CB:
- brks[posLast] = LINEBREAK_ALLOWBREAK;
- lbcCur = LBP_BA;
- continue;
- default:
- break;
- }
-
- lbcNew = resolve_lb_class(lbcNew, lang);
-
- assert(lbcCur <= LBP_JT);
- assert(lbcNew <= LBP_JT);
- switch (baTable[lbcCur - 1][lbcNew - 1])
- {
- case DIR_BRK:
- brks[posLast] = LINEBREAK_ALLOWBREAK;
- break;
- case CMI_BRK:
- case IND_BRK:
- if (lbcLast == LBP_SP)
- {
- brks[posLast] = LINEBREAK_ALLOWBREAK;
- }
- else
- {
- brks[posLast] = LINEBREAK_NOBREAK;
- }
- break;
- case CMP_BRK:
- brks[posLast] = LINEBREAK_NOBREAK;
- if (lbcLast != LBP_SP)
- continue;
- break;
- case PRH_BRK:
- brks[posLast] = LINEBREAK_NOBREAK;
- break;
- }
-
- lbcCur = lbcNew;
- */
-
- // TIZEN ONLY - START
- if (lbcCur == LBP_ZW && !zw_flag)
- {
- zw_flag = 1;
- posLast = -1;
- posCur = 0;
- ch = get_next_char(s, len, &posCur);
- lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang);
- lbcNew = LBP_Undefined;
- goto nextline;
- }
- else if (zw_flag)
- {
- if (lbcCur == LBP_ZW)
- brks[posLast] = LINEBREAK_ALLOWBREAK;
- else
- brks[posLast] = LINEBREAK_NOBREAK;
- lbcCur = lbcNew;
- }
- else
- {
- // TIZEN ONLY(20131106): For Hangul word wrap
- switch (lbcCur)
- {
- case LBP_H2: /**< Hangul LV */
- case LBP_H3: /**< Hangul LVT */
- case LBP_JL: /**< Hangul L Jamo */
- case LBP_JV: /**< Hangul V Jamo */
- case LBP_JT: /**< Hangul T Jamo */
- lbcCur = LBP_AL;
- break;
- default:
- break;
- }
-
- switch (lbcNew)
- {
- case LBP_H2: /**< Hangul LV */
- case LBP_H3: /**< Hangul LVT */
- case LBP_JL: /**< Hangul L Jamo */
- case LBP_JV: /**< Hangul V Jamo */
- case LBP_JT: /**< Hangul T Jamo */
- lbcNew = LBP_AL;
- break;
- default:
- break;
- }
- //
-
- switch (lbcNew)
- {
- case LBP_SP:
- brks[posLast] = LINEBREAK_NOBREAK;
- continue;
- case LBP_BK:
- case LBP_LF:
- case LBP_NL:
- brks[posLast] = LINEBREAK_NOBREAK;
- lbcCur = LBP_BK;
- continue;
- case LBP_CR:
- brks[posLast] = LINEBREAK_NOBREAK;
- lbcCur = LBP_CR;
- continue;
- case LBP_CB:
- brks[posLast] = LINEBREAK_ALLOWBREAK;
- lbcCur = LBP_BA;
- continue;
- default:
- break;
- }
-
- lbcNew = resolve_lb_class(lbcNew, lang);
-
- assert(lbcCur <= LBP_JT);
- assert(lbcNew <= LBP_JT);
- switch (baTable[lbcCur - 1][lbcNew - 1])
- {
- case DIR_BRK:
- brks[posLast] = LINEBREAK_ALLOWBREAK;
- break;
- case CMI_BRK:
- case IND_BRK:
- if (lbcLast == LBP_SP)
- {
- brks[posLast] = LINEBREAK_ALLOWBREAK;
- }
- else
- {
- brks[posLast] = LINEBREAK_NOBREAK;
- }
- break;
- case CMP_BRK:
- brks[posLast] = LINEBREAK_NOBREAK;
- if (lbcLast != LBP_SP)
- continue;
- break;
- case PRH_BRK:
- brks[posLast] = LINEBREAK_NOBREAK;
- break;
- }
- lbcCur = lbcNew;
- }
- // TIZEN ONLY - END
- }
-
- assert(posLast == posCur - 1 && posCur <= len);
- /* Break after the last character */
- brks[posLast] = LINEBREAK_MUSTBREAK;
- /* When the input contains incomplete sequences */
- while (posCur < len)
- {
- brks[posCur++] = LINEBREAK_INSIDEACHAR;
- }
+ utf32_t ch;
+ struct LineBreakContext lbCtx;
+ size_t posCur = 0;
+ size_t posLast = 0;
+
+ --posLast; /* To be ++'d later */
+ ch = get_next_char(s, len, &posCur);
+ if (ch == EOS)
+ return;
+ lb_init_break_context(&lbCtx, ch, lang);
+
+ /* Process a line till an explicit break or end of string */
+ for (;;)
+ {
+ for (++posLast; posLast < posCur - 1; ++posLast)
+ {
+ brks[posLast] = LINEBREAK_INSIDEACHAR;
+ }
+ assert(posLast == posCur - 1);
+ ch = get_next_char(s, len, &posCur);
+ if (ch == EOS)
+ break;
+ brks[posLast] = lb_process_next_char(&lbCtx, ch);
+ }
+
+ assert(posLast == posCur - 1 && posCur <= len);
+ /* Break after the last character */
+ brks[posLast] = LINEBREAK_MUSTBREAK;
+ /* When the input contains incomplete sequences */
+ while (posCur < len)
+ {
+ brks[posCur++] = LINEBREAK_INSIDEACHAR;
+ }
}
/**
* Sets the line breaking information for a UTF-8 input string.
*
- * @param[in] s input UTF-8 string
- * @param[in] len length of the input
- * @param[in] lang language of the input
- * @param[out] brks pointer to the output breaking data, containing
- * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
- * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
+ * @param[in] s input UTF-8 string
+ * @param[in] len length of the input
+ * @param[in] lang language of the input
+ * @param[out] brks pointer to the output breaking data, containing
+ * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
+ * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
*/
void set_linebreaks_utf8(
- const utf8_t *s,
- size_t len,
- const char *lang,
- char *brks)
+ const utf8_t *s,
+ size_t len,
+ const char *lang,
+ char *brks)
{
- set_linebreaks(s, len, lang, brks,
- (get_next_char_t)lb_get_next_char_utf8);
+ set_linebreaks(s, len, lang, brks,
+ (get_next_char_t)lb_get_next_char_utf8);
}
/**
* Sets the line breaking information for a UTF-16 input string.
*
- * @param[in] s input UTF-16 string
- * @param[in] len length of the input
- * @param[in] lang language of the input
- * @param[out] brks pointer to the output breaking data, containing
- * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
- * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
+ * @param[in] s input UTF-16 string
+ * @param[in] len length of the input
+ * @param[in] lang language of the input
+ * @param[out] brks pointer to the output breaking data, containing
+ * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
+ * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
*/
void set_linebreaks_utf16(
- const utf16_t *s,
- size_t len,
- const char *lang,
- char *brks)
+ const utf16_t *s,
+ size_t len,
+ const char *lang,
+ char *brks)
{
- set_linebreaks(s, len, lang, brks,
- (get_next_char_t)lb_get_next_char_utf16);
+ set_linebreaks(s, len, lang, brks,
+ (get_next_char_t)lb_get_next_char_utf16);
}
/**
* Sets the line breaking information for a UTF-32 input string.
*
- * @param[in] s input UTF-32 string
- * @param[in] len length of the input
- * @param[in] lang language of the input
- * @param[out] brks pointer to the output breaking data, containing
- * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
- * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
+ * @param[in] s input UTF-32 string
+ * @param[in] len length of the input
+ * @param[in] lang language of the input
+ * @param[out] brks pointer to the output breaking data, containing
+ * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
+ * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
*/
void set_linebreaks_utf32(
- const utf32_t *s,
- size_t len,
- const char *lang,
- char *brks)
+ const utf32_t *s,
+ size_t len,
+ const char *lang,
+ char *brks)
{
- set_linebreaks(s, len, lang, brks,
- (get_next_char_t)lb_get_next_char_utf32);
+ set_linebreaks(s, len, lang, brks,
+ (get_next_char_t)lb_get_next_char_utf32);
}
/**
* complicated cases involving combining marks, spaces, etc. cannot be
* correctly processed.
*
- * @param char1 the first Unicode character
- * @param char2 the second Unicode character
- * @param lang language of the input
- * @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
- * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
+ * @param char1 the first Unicode character
+ * @param char2 the second Unicode character
+ * @param lang language of the input
+ * @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
+ * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
*/
int is_line_breakable(
- utf32_t char1,
- utf32_t char2,
- const char* lang)
+ utf32_t char1,
+ utf32_t char2,
+ const char* lang)
{
- utf32_t s[2];
- char brks[2];
- s[0] = char1;
- s[1] = char2;
- set_linebreaks_utf32(s, 2, lang, brks);
- return brks[0];
+ utf32_t s[2];
+ char brks[2];
+ s[0] = char1;
+ s[1] = char2;
+ set_linebreaks_utf32(s, 2, lang, brks);
+ return brks[0];
}