From: Konstantin Ritt Date: Tue, 25 Sep 2012 20:55:54 +0000 (+0300) Subject: QCharAttributes: add wordStart/wordEnd flags X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a798b956b9786240a06142de078f56c28962a535;p=profile%2Fivi%2Fqtbase.git QCharAttributes: add wordStart/wordEnd flags A simple heuristic is used to detect the word beginning and ending by looking at the word break property value of surrounding characters. This behaves better than the white-spaces based implementation used before and makes it possible to tailor the default algorithm for complex scripts. BIG FAT WARNING: The QCharAttributes buffer now has to have a length of string length + 1 for the flags at end of text. Task-Id: QTBUG-6498 Change-Id: I5589b191ffde6a50d2af0c14a00430d3852c67b4 Reviewed-by: Konstantin Ritt --- diff --git a/src/3rdparty/harfbuzz/src/harfbuzz-shaper.h b/src/3rdparty/harfbuzz/src/harfbuzz-shaper.h index 2ff52ea..5a3329d 100644 --- a/src/3rdparty/harfbuzz/src/harfbuzz-shaper.h +++ b/src/3rdparty/harfbuzz/src/harfbuzz-shaper.h @@ -133,7 +133,9 @@ typedef struct { hb_bitfield sentenceBoundary : 1; hb_bitfield lineBreak : 1; hb_bitfield whiteSpace : 1; /* A unicode whitespace character */ - hb_bitfield unused : 3; + hb_bitfield wordStart : 1; + hb_bitfield wordEnd : 1; + hb_bitfield unused : 1; } HB_CharAttributes; void HB_GetTailoredCharAttributes(const HB_UChar16 *string, hb_uint32 stringLength, diff --git a/src/3rdparty/harfbuzz/src/harfbuzz-thai.c b/src/3rdparty/harfbuzz/src/harfbuzz-thai.c index 70c1d57..f62a1b7 100644 --- a/src/3rdparty/harfbuzz/src/harfbuzz-thai.c +++ b/src/3rdparty/harfbuzz/src/harfbuzz-thai.c @@ -395,8 +395,10 @@ static void HB_ThaiAssignAttributes(const HB_UChar16 *string, hb_uint32 len, HB_ to_tis620(string, len, cstr); for (i = 0; i < len; ++i) { - attributes[i].lineBreak = FALSE; attributes[i].wordBreak = FALSE; + attributes[i].wordStart = FALSE; + attributes[i].wordEnd = FALSE; + attributes[i].lineBreak = FALSE; } if (len > 128) { @@ -411,11 +413,17 @@ static void HB_ThaiAssignAttributes(const HB_UChar16 *string, hb_uint32 len, HB_ if (break_positions) { attributes[0].wordBreak = TRUE; + attributes[0].wordStart = TRUE; + attributes[0].wordEnd = FALSE; numbreaks = th_brk((const unsigned char *)cstr, break_positions, brp_size); for (i = 0; i < numbreaks; ++i) { attributes[break_positions[i]].wordBreak = TRUE; + attributes[break_positions[i]].wordStart = TRUE; + attributes[break_positions[i]].wordEnd = TRUE; attributes[break_positions[i]].lineBreak = TRUE; } + if (numbreaks > 0) + attributes[break_positions[numbreaks - 1]].wordStart = FALSE; if (break_positions != brp) free(break_positions); diff --git a/src/corelib/tools/qtextboundaryfinder.cpp b/src/corelib/tools/qtextboundaryfinder.cpp index 318e3b2..6656569 100644 --- a/src/corelib/tools/qtextboundaryfinder.cpp +++ b/src/corelib/tools/qtextboundaryfinder.cpp @@ -89,7 +89,7 @@ static void init(QTextBoundaryFinder::BoundaryType type, const QChar *chars, int scriptItems.append(item); } - QUnicodeTools::CharAttributeOptions options = QUnicodeTools::WhiteSpaces; + QUnicodeTools::CharAttributeOptions options = 0; switch (type) { case QTextBoundaryFinder::Grapheme: options |= QUnicodeTools::GraphemeBreaks; break; case QTextBoundaryFinder::Word: options |= QUnicodeTools::WordBreaks; break; @@ -189,9 +189,9 @@ QTextBoundaryFinder::QTextBoundaryFinder(const QTextBoundaryFinder &other) , pos(other.pos) , freePrivate(true) { - d = (QTextBoundaryFinderPrivate *) malloc(length*sizeof(QCharAttributes)); + d = (QTextBoundaryFinderPrivate *) malloc((length + 1) * sizeof(QCharAttributes)); Q_CHECK_PTR(d); - memcpy(d, other.d, length*sizeof(QCharAttributes)); + memcpy(d, other.d, (length + 1) * sizeof(QCharAttributes)); } /*! @@ -209,11 +209,11 @@ QTextBoundaryFinder &QTextBoundaryFinder::operator=(const QTextBoundaryFinder &o pos = other.pos; QTextBoundaryFinderPrivate *newD = (QTextBoundaryFinderPrivate *) - realloc(freePrivate ? d : 0, length*sizeof(QCharAttributes)); + realloc(freePrivate ? d : 0, (length + 1) * sizeof(QCharAttributes)); Q_CHECK_PTR(newD); freePrivate = true; d = newD; - memcpy(d, other.d, length*sizeof(QCharAttributes)); + memcpy(d, other.d, (length + 1) * sizeof(QCharAttributes)); return *this; } @@ -238,7 +238,7 @@ QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QString &strin , pos(0) , freePrivate(true) { - d = (QTextBoundaryFinderPrivate *) malloc(length*sizeof(QCharAttributes)); + d = (QTextBoundaryFinderPrivate *) malloc((length + 1) * sizeof(QCharAttributes)); Q_CHECK_PTR(d); init(t, chars, length, d->attributes); } @@ -249,7 +249,8 @@ QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QString &strin \a buffer is an optional working buffer of size \a bufferSize you can pass to the QTextBoundaryFinder. If the buffer is large enough to hold the working - data required, it will use this instead of allocating its own buffer. + data required (bufferSize >= length + 1), it will use this + instead of allocating its own buffer. \warning QTextBoundaryFinder does not create a copy of \a chars. It is the application programmer's responsibility to ensure the array is allocated for @@ -262,11 +263,11 @@ QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QChar *chars, , length(length) , pos(0) { - if (buffer && (uint)bufferSize >= length*sizeof(QCharAttributes)) { + if (buffer && (uint)bufferSize >= (length + 1) * sizeof(QCharAttributes)) { d = (QTextBoundaryFinderPrivate *)buffer; freePrivate = false; } else { - d = (QTextBoundaryFinderPrivate *) malloc(length*sizeof(QCharAttributes)); + d = (QTextBoundaryFinderPrivate *) malloc((length + 1) * sizeof(QCharAttributes)); Q_CHECK_PTR(d); freePrivate = true; } @@ -455,38 +456,30 @@ bool QTextBoundaryFinder::isAtBoundary() const */ QTextBoundaryFinder::BoundaryReasons QTextBoundaryFinder::boundaryReasons() const { - if (!d) - return NotAtBoundary; - if (! isAtBoundary()) - return NotAtBoundary; - if (pos == 0) { - if (d->attributes[pos].whiteSpace) - return NotAtBoundary; - return StartWord; - } - if (pos == length) { - if (d->attributes[length-1].whiteSpace) - return NotAtBoundary; - return EndWord; - } - - if (t == Line && chars[pos - 1].unicode() == QChar::SoftHyphen) - return SoftHyphen; + BoundaryReasons reasons = NotAtBoundary; + if (!d || !isAtBoundary()) + return reasons; - if (t != Word) - return BoundaryReasons(StartWord | EndWord); - - const bool nextIsSpace = d->attributes[pos].whiteSpace; - const bool prevIsSpace = d->attributes[pos - 1].whiteSpace; + switch (t) { + case Word: + if (d->attributes[pos].wordStart) + reasons |= StartWord; + if (d->attributes[pos].wordEnd) + reasons |= EndWord; + break; + case Line: + if (pos > 0 && chars[pos - 1].unicode() == QChar::SoftHyphen) + reasons |= SoftHyphen; + // fall through + case Grapheme: + case Sentence: + reasons |= StartWord | EndWord; + break; + default: + break; + } - if (prevIsSpace && !nextIsSpace) - return StartWord; - else if (!prevIsSpace && nextIsSpace) - return EndWord; - else if (!prevIsSpace && !nextIsSpace) - return BoundaryReasons(StartWord | EndWord); - else - return NotAtBoundary; + return reasons; } QT_END_NAMESPACE diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp index f8daec5..0b492ab 100644 --- a/src/corelib/tools/qunicodetools.cpp +++ b/src/corelib/tools/qunicodetools.cpp @@ -102,6 +102,8 @@ static void getGraphemeBreaks(const ushort *string, quint32 len, QCharAttributes lcls = cls; } + + attributes[len].graphemeBoundary = true; // GB2 } @@ -133,6 +135,10 @@ static const uchar breakTable[QUnicodeTables::WordBreak_ExtendNumLet + 1][QUnico static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *attributes) { + enum WordType { + WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana + } currentWordType = WordTypeNone; + QUnicodeTables::WordBreakClass cls = QUnicodeTables::WordBreak_LF; // to meet WB1 for (quint32 i = 0; i != len; ++i) { quint32 pos = i; @@ -178,9 +184,30 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at continue; } cls = ncls; - if (action == WB::Break) + if (action == WB::Break) { attributes[pos].wordBreak = true; + if (currentWordType != WordTypeNone) + attributes[pos].wordEnd = true; + switch (cls) { + case QUnicodeTables::WordBreak_Katakana: + currentWordType = WordTypeHiraganaKatakana; + attributes[pos].wordStart = true; + break; + case QUnicodeTables::WordBreak_ALetter: + case QUnicodeTables::WordBreak_Numeric: + currentWordType = WordTypeAlphaNumeric; + attributes[pos].wordStart = true; + break; + default: + currentWordType = WordTypeNone; + break; + } + } } + + if (currentWordType != WordTypeNone) + attributes[len].wordEnd = true; + attributes[len].wordBreak = true; // WB2 } @@ -277,6 +304,8 @@ static void getSentenceBreaks(const ushort *string, quint32 len, QCharAttributes state = SB::breakTable[SB::Initial][ncls]; } } + + attributes[len].sentenceBoundary = true; // SB2 } @@ -514,6 +543,7 @@ static void getLineBreaks(const ushort *string, quint32 len, QCharAttributes *at } attributes[0].lineBreak = false; // LB2 + attributes[len].lineBreak = true; // LB3 } @@ -543,7 +573,7 @@ Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length, return; if (!(options & DontClearAttributes)) - ::memset(attributes, 0, length * sizeof(QCharAttributes)); + ::memset(attributes, 0, (length + 1) * sizeof(QCharAttributes)); if (options & GraphemeBreaks) getGraphemeBreaks(string, length, attributes); diff --git a/src/corelib/tools/qunicodetools_p.h b/src/corelib/tools/qunicodetools_p.h index 91028b6..b1e9127 100644 --- a/src/corelib/tools/qunicodetools_p.h +++ b/src/corelib/tools/qunicodetools_p.h @@ -64,7 +64,9 @@ struct Q_PACKED QCharAttributes uchar sentenceBoundary : 1; uchar lineBreak : 1; uchar whiteSpace : 1; - uchar unused : 3; + uchar wordStart : 1; + uchar wordEnd : 1; + uchar unused : 1; }; Q_DECLARE_TYPEINFO(QCharAttributes, Q_PRIMITIVE_TYPE); @@ -89,6 +91,7 @@ enum CharAttributeOption { }; Q_DECLARE_FLAGS(CharAttributeOptions, CharAttributeOption) +// attributes buffer has to have a length of string length + 1 Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length, const ScriptItem *items, int numItems, QCharAttributes *attributes, CharAttributeOptions options = DefaultOptionsCompat); diff --git a/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp b/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp index 26de0f2..56409447 100644 --- a/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp +++ b/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp @@ -72,6 +72,7 @@ private slots: void lineBoundaries_manual(); void fastConstructor(); + void wordBoundaries_qtbug6498(); void isAtSoftHyphen_data(); void isAtSoftHyphen(); void thaiLineBreak(); @@ -544,6 +545,67 @@ void tst_QTextBoundaryFinder::fastConstructor() QCOMPARE(finder.boundaryReasons(), QTextBoundaryFinder::NotAtBoundary); } +void tst_QTextBoundaryFinder::wordBoundaries_qtbug6498() +{ + // text with trailing space + QString text("Please test me. Finish "); + QTextBoundaryFinder finder(QTextBoundaryFinder::Word, text); + + QCOMPARE(finder.position(), 0); + QVERIFY(finder.isAtBoundary()); + QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::StartWord); + + QCOMPARE(finder.toNextBoundary(), 6); + QCOMPARE(finder.position(), 6); + QVERIFY(finder.isAtBoundary()); + QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::EndWord); + + QCOMPARE(finder.toNextBoundary(), 7); + QCOMPARE(finder.position(), 7); + QVERIFY(finder.isAtBoundary()); + QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::StartWord); + + QCOMPARE(finder.toNextBoundary(), 11); + QCOMPARE(finder.position(), 11); + QVERIFY(finder.isAtBoundary()); + QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::EndWord); + + QCOMPARE(finder.toNextBoundary(), 12); + QCOMPARE(finder.position(), 12); + QVERIFY(finder.isAtBoundary()); + QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::StartWord); + + QCOMPARE(finder.toNextBoundary(), 14); + QCOMPARE(finder.position(), 14); + QVERIFY(finder.isAtBoundary()); + QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::EndWord); + + QCOMPARE(finder.toNextBoundary(), 15); + QCOMPARE(finder.position(), 15); + QVERIFY(finder.isAtBoundary()); + QVERIFY(finder.boundaryReasons() == QTextBoundaryFinder::NotAtBoundary); + + QCOMPARE(finder.toNextBoundary(), 16); + QCOMPARE(finder.position(), 16); + QVERIFY(finder.isAtBoundary()); + QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::StartWord); + + QCOMPARE(finder.toNextBoundary(), 22); + QCOMPARE(finder.position(), 22); + QVERIFY(finder.isAtBoundary()); + QVERIFY(finder.boundaryReasons() & QTextBoundaryFinder::EndWord); + + QCOMPARE(finder.toNextBoundary(), 23); + QCOMPARE(finder.position(), 23); + QVERIFY(finder.isAtBoundary()); + QVERIFY(finder.boundaryReasons() == QTextBoundaryFinder::NotAtBoundary); + + QCOMPARE(finder.toNextBoundary(), -1); + QCOMPARE(finder.position(), -1); + QVERIFY(!finder.isAtBoundary()); + QVERIFY(finder.boundaryReasons() == QTextBoundaryFinder::NotAtBoundary); +} + void tst_QTextBoundaryFinder::isAtSoftHyphen_data() { QTest::addColumn("testString"); @@ -568,7 +630,7 @@ void tst_QTextBoundaryFinder::isAtSoftHyphen() QVERIFY(expectedBreakPositions.contains(i + 1)); boundaryFinder.setPosition(i + 1); QVERIFY(boundaryFinder.isAtBoundary()); - QVERIFY(boundaryFinder.boundaryReasons() == QTextBoundaryFinder::SoftHyphen); + QVERIFY(boundaryFinder.boundaryReasons() & QTextBoundaryFinder::SoftHyphen); } }