2 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3 * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
23 #include "platform/text/TextBreakIterator.h"
25 #include "platform/text/TextBreakIteratorInternalICU.h"
26 #include "wtf/Assertions.h"
27 #include "wtf/HashMap.h"
28 #include "wtf/PassOwnPtr.h"
29 #include "wtf/ThreadSpecific.h"
30 #include "wtf/ThreadingPrimitives.h"
31 #include "wtf/text/AtomicString.h"
32 #include "wtf/text/CString.h"
33 #include "wtf/text/WTFString.h"
34 #include <unicode/rbbi.h>
35 #include <unicode/ubrk.h>
// Pool of ICU line-break iterators keyed by locale. The shared instance lives
// in a WTF::ThreadSpecific (see sharedPool()). take() vends an iterator and
// records it in m_vendedIterators; put() hands it back to the pool, which is
// bounded by |capacity|.
42 class LineBreakIteratorPool {
43 WTF_MAKE_NONCOPYABLE(LineBreakIteratorPool);
// Accessor for the shared pool. The WTF::ThreadSpecific wrapper is heap-allocated
// once and kept in a function-local static.
45 static LineBreakIteratorPool& sharedPool()
47 static WTF::ThreadSpecific<LineBreakIteratorPool>* pool = new WTF::ThreadSpecific<LineBreakIteratorPool>;
// Factory returning a newly allocated pool wrapped in a PassOwnPtr.
51 static PassOwnPtr<LineBreakIteratorPool> create() { return adoptPtr(new LineBreakIteratorPool); }
// Vends a line-break iterator for |locale|: first looks for a pooled entry whose
// locale equals |locale|, otherwise asks ICU to create a line instance.
53 icu::BreakIterator* take(const AtomicString& locale)
55 icu::BreakIterator* iterator = 0;
// Linear scan; the pool holds at most |capacity| entries.
56 for (size_t i = 0; i < m_pool.size(); ++i) {
57 if (m_pool[i].first == locale) {
58 iterator = m_pool[i].second;
65 UErrorCode openStatus = U_ZERO_ERROR;
// An empty locale selects the locale reported by currentTextBreakLocaleID().
66 bool localeIsEmpty = locale.isEmpty();
67 iterator = icu::BreakIterator::createLineInstance(localeIsEmpty ? icu::Locale(currentTextBreakLocaleID()) : icu::Locale(locale.utf8().data()), openStatus);
68 // locale comes from a web page and it can be invalid, leading ICU
69 // to fail, in which case we fall back to the default locale.
70 if (!localeIsEmpty && U_FAILURE(openStatus)) {
71 openStatus = U_ZERO_ERROR;
72 iterator = icu::BreakIterator::createLineInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
// Creation failed even after the fallback attempt: log the ICU status.
75 if (U_FAILURE(openStatus)) {
76 WTF_LOG_ERROR("icu::BreakIterator construction failed with status %d", openStatus);
// Remember which locale this iterator was vended for so put() can re-pool it.
81 ASSERT(!m_vendedIterators.contains(iterator));
82 m_vendedIterators.set(iterator, locale);
// Returns a previously vended |iterator| to the pool. |iterator| must have come
// from take() (asserted).
86 void put(icu::BreakIterator* iterator)
88 ASSERT_ARG(iterator, m_vendedIterators.contains(iterator));
// Pool is full: destroy the iterator stored at index 0 to make room.
90 if (m_pool.size() == capacity) {
91 delete(m_pool[0].second);
// Move the (locale, iterator) pair from the vended map to the end of the pool.
95 m_pool.append(Entry(m_vendedIterators.take(iterator), iterator));
99 LineBreakIteratorPool() { }
// Maximum number of idle iterators kept in the pool.
101 static const size_t capacity = 4;
// A pooled iterator together with the locale it was created for.
103 typedef pair<AtomicString, icu::BreakIterator*> Entry;
104 typedef Vector<Entry, capacity> Pool;
// Iterators currently handed out by take(), mapped to their locale.
106 HashMap<icu::BreakIterator*, AtomicString> m_vendedIterators;
// Grants WTF::ThreadSpecific's conversion operator access to this class.
108 friend WTF::ThreadSpecific<LineBreakIteratorPool>::operator LineBreakIteratorPool*();
// Which text a UText chunk currently refers to: nothing yet, the prior
// context, or the primary text (see the *GetCurrentContext helpers below).
111 enum TextContext { NoContext, PriorContext, PrimaryContext };
// Number of UChar slots in the scratch buffer declared below.
113 const int textBufferCapacity = 16;
// Scratch storage of textBufferCapacity UChars.
// NOTE(review): the enclosing type declaration is not visible in this excerpt;
// callers refer to this member as UTextWithBuffer::buffer.
117 UChar buffer[textBufferCapacity];
// Takes |index| by reference together with an upper bound |limit| and returns
// an int64_t. The visible branch handles |index| being greater than |limit|.
// NOTE(review): the remaining statements are not visible in this excerpt;
// presumably |index| is clamped into range and returned - confirm.
120 static inline int64_t textPinIndex(int64_t& index, int64_t limit)
124 else if (index > limit)
129 static inline int64_t textNativeLength(UText* text)
131 return text->a + text->b;
134 // Relocate pointer from source into destination as required.
135 static void textFixPointer(const UText* source, UText* destination, const void*& pointer)
137 if (pointer >= source->pExtra && pointer < static_cast<char*>(source->pExtra) + source->extraSize) {
138 // Pointer references source extra buffer.
139 pointer = static_cast<char*>(destination->pExtra) + (static_cast<const char*>(pointer) - static_cast<const char*>(source->pExtra));
140 } else if (pointer >= source && pointer < reinterpret_cast<const char*>(source) + source->sizeOfStruct) {
141 // Pointer references source text structure, but not source extra buffer.
142 pointer = reinterpret_cast<char*>(destination) + (static_cast<const char*>(pointer) - reinterpret_cast<const char*>(source));
// Clones |source| into |destination|. Only shallow clones are supported
// (|deep| is asserted to be false). The copy keeps the pExtra buffer and flags
// that utext_setup() assigned to |destination|, duplicates the extra buffer
// contents, and rebases every pointer that referred into |source|.
146 static UText* textClone(UText* destination, const UText* source, UBool deep, UErrorCode* status)
148 ASSERT_UNUSED(deep, !deep);
// Nothing is done when the caller already carries a failure status.
149 if (U_FAILURE(*status))
151 int32_t extraSize = source->extraSize;
// Set up |destination| with an extra buffer as large as the source's.
152 destination = utext_setup(destination, extraSize, status);
153 if (U_FAILURE(*status))
// Save the fields utext_setup() just initialised; the struct memcpy below
// overwrites them with |source|'s values and they are restored afterwards.
155 void* extraNew = destination->pExtra;
156 int32_t flags = destination->flags;
// Copy no more bytes than the smaller of the two struct sizes.
157 int sizeToCopy = min(source->sizeOfStruct, destination->sizeOfStruct);
158 memcpy(destination, source, sizeToCopy);
159 destination->pExtra = extraNew;
160 destination->flags = flags;
// Duplicate the contents of the extra buffer.
161 memcpy(destination->pExtra, source->pExtra, extraSize);
// Rebase pointers that pointed into |source| (struct or extra buffer).
162 textFixPointer(source, destination, destination->context);
163 textFixPointer(source, destination, destination->p);
164 textFixPointer(source, destination, destination->q);
// Field |r| is expected to be unused, so it needs no fix-up.
165 ASSERT(!destination->r);
// chunkContents is a const UChar*; round-trip through const void* so that
// textFixPointer() can be reused for it.
166 const void * chunkContents = static_cast<const void*>(destination->chunkContents);
167 textFixPointer(source, destination, chunkContents);
168 destination->chunkContents = static_cast<const UChar*>(chunkContents);
// Extract operation for this text provider. It is intentionally unsupported:
// reaching it trips ASSERT_NOT_REACHED() and reports U_UNSUPPORTED_ERROR
// through |errorCode|. All other parameters are ignored.
172 static int32_t textExtract(UText*, int64_t, int64_t, UChar*, int32_t, UErrorCode* errorCode)
174 // In the present context, this text provider is used only with ICU functions
175 // that do not perform an extract operation.
176 ASSERT_NOT_REACHED();
177 *errorCode = U_UNSUPPORTED_ERROR;
// Close operation for this text provider; takes the UText being closed.
// NOTE(review): the body is not visible in this excerpt.
181 static void textClose(UText* text)
// Decides which context |nativeIndex| falls in, given the iteration direction.
// |text->b| is the prior-context length (set in textInit()).
186 static inline TextContext textGetContext(const UText* text, int64_t nativeIndex, UBool forward)
// No prior context at all, or the index lies past it: primary text.
188 if (!text->b || nativeIndex > text->b)
189 return PrimaryContext;
// Exactly on the boundary: moving forward enters the primary text, moving
// backward enters the prior context.
190 if (nativeIndex == text->b)
191 return forward ? PrimaryContext : PriorContext;
// Reports which context the current chunk of a Latin-1 UText belongs to.
// A chunk stored in the pExtra scratch buffer is the primary text; any other
// non-null chunk is treated as the prior context.
195 static inline TextContext textLatin1GetCurrentContext(const UText* text)
// No chunk has been set up yet.
197 if (!text->chunkContents)
199 return text->chunkContents == text->pExtra ? PrimaryContext : PriorContext;
// Repositions the chunk of a Latin-1 UText inside the primary text so that it
// covers |nativeIndex|, then refills the scratch buffer with the matching
// Latin-1 characters widened to UChar. The chunk is at most
// extraSize / sizeof(UChar) characters long.
202 static void textLatin1MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
// The chunk must already live in the scratch buffer.
204 ASSERT(text->chunkContents == text->pExtra);
// First case: the chunk starts at |nativeIndex| and extends towards the end,
// clipped to |nativeLength|.
// NOTE(review): presumably the forward branch - the enclosing condition is not
// visible in this excerpt.
206 ASSERT(nativeIndex >= text->b && nativeIndex < nativeLength);
207 text->chunkNativeStart = nativeIndex;
208 text->chunkNativeLimit = nativeIndex + text->extraSize / sizeof(UChar);
209 if (text->chunkNativeLimit > nativeLength)
210 text->chunkNativeLimit = nativeLength;
// Second case: the chunk ends at |nativeIndex| and extends towards the start,
// clipped to the beginning of the primary text (|text->b|).
212 ASSERT(nativeIndex > text->b && nativeIndex <= nativeLength);
213 text->chunkNativeLimit = nativeIndex;
214 text->chunkNativeStart = nativeIndex - text->extraSize / sizeof(UChar);
215 if (text->chunkNativeStart < text->b)
216 text->chunkNativeStart = text->b;
218 int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
219 // Ensure chunk length is well defined if computed length exceeds int32_t range.
220 ASSERT(length <= numeric_limits<int32_t>::max());
221 text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
222 text->nativeIndexingLimit = text->chunkLength;
// Forward iteration starts at the beginning of the chunk, backward at its end.
223 text->chunkOffset = forward ? 0 : text->chunkLength;
// Native indices include the prior context, so subtract |text->b| to index
// into the Latin-1 source |text->p|.
224 StringImpl::copyChars(const_cast<UChar*>(text->chunkContents), static_cast<const LChar*>(text->p) + (text->chunkNativeStart - text->b), static_cast<unsigned>(text->chunkLength));
227 static void textLatin1SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
229 ASSERT(!text->chunkContents || text->chunkContents == text->q);
230 text->chunkContents = static_cast<const UChar*>(text->pExtra);
231 textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
234 static void textLatin1MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
236 ASSERT(text->chunkContents == text->q);
237 ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
238 ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
239 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
240 text->chunkNativeStart = 0;
241 text->chunkNativeLimit = text->b;
242 text->chunkLength = text->b;
243 text->nativeIndexingLimit = text->chunkLength;
244 int64_t offset = nativeIndex - text->chunkNativeStart;
245 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
246 ASSERT(offset <= numeric_limits<int32_t>::max());
247 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
250 static void textLatin1SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
252 ASSERT(!text->chunkContents || text->chunkContents == text->pExtra);
253 text->chunkContents = static_cast<const UChar*>(text->q);
254 textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
// Shared fast path of the access functions: handles the cases where
// |nativeIndex| is already inside the current chunk (only chunkOffset needs
// updating) or lies outside the text altogether (|isAccessible| is set to
// FALSE and the offset is pinned to the chunk edge).
// NOTE(review): the branch on |forward| and the return statements are not
// visible in this excerpt; the two groups below are presumably the forward and
// backward cases respectively - confirm.
257 static inline bool textInChunkOrOutOfRange(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward, UBool& isAccessible)
// Index within [chunkNativeStart, chunkNativeLimit): already in the chunk.
260 if (nativeIndex >= text->chunkNativeStart && nativeIndex < text->chunkNativeLimit) {
261 int64_t offset = nativeIndex - text->chunkNativeStart;
262 // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
263 ASSERT(offset <= numeric_limits<int32_t>::max());
264 text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
// Index at or past the end while the chunk already reaches the end of the text.
268 if (nativeIndex >= nativeLength && text->chunkNativeLimit == nativeLength) {
269 text->chunkOffset = text->chunkLength;
270 isAccessible = FALSE;
// Index within (chunkNativeStart, chunkNativeLimit]: already in the chunk.
274 if (nativeIndex > text->chunkNativeStart && nativeIndex <= text->chunkNativeLimit) {
275 int64_t offset = nativeIndex - text->chunkNativeStart;
276 // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
277 ASSERT(offset <= numeric_limits<int32_t>::max());
278 text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
// Index at or before the start while the chunk already begins at native index 0.
282 if (nativeIndex <= 0 && !text->chunkNativeStart) {
283 text->chunkOffset = 0;
284 isAccessible = FALSE;
// Access operation of the Latin-1 text provider: makes the chunk cover
// |nativeIndex| for iteration in the given direction. The fast path is
// delegated to textInChunkOrOutOfRange(); otherwise the index is pinned to the
// text and the chunk is moved within, or switched between, the prior context
// and the primary text.
291 static UBool textLatin1Access(UText* text, int64_t nativeIndex, UBool forward)
295 int64_t nativeLength = textNativeLength(text);
297 if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
// Pin the index against an upper bound of nativeLength - 1.
299 nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
300 TextContext currentContext = textLatin1GetCurrentContext(text);
301 TextContext newContext = textGetContext(text, nativeIndex, forward);
302 ASSERT(newContext != NoContext);
// Same context as before: just move the chunk within it.
303 if (newContext == currentContext) {
304 if (currentContext == PrimaryContext) {
305 textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
307 textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
// Different context: switch the chunk storage first.
309 } else if (newContext == PrimaryContext) {
310 textLatin1SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
312 ASSERT(newContext == PriorContext);
313 textLatin1SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
// UTextFuncs table for the Latin-1 text provider; textOpenLatin1() passes it
// to textInit(). The initializer entries are not visible in this excerpt.
318 static const struct UTextFuncs textLatin1Funcs = {
// Fills in the provider-specific fields of an already set-up |text|:
// the function table, the stable-chunks provider property, the source string
// as context, and the prior context pointer (|q|) and length (|b|).
// NOTE(review): the statements that consume |length| are not visible in this
// excerpt.
330 static void textInit(UText* text, const UTextFuncs* funcs, const void* string, unsigned length, const UChar* priorContext, int priorContextLength)
332 text->pFuncs = funcs;
333 text->providerProperties = 1 << UTEXT_PROVIDER_STABLE_CHUNKS;
334 text->context = string;
337 text->q = priorContext;
338 text->b = priorContextLength;
// Opens a UText over the Latin-1 |string| of |length| characters, with an
// optional UTF-16 prior context. The UText embedded in |utWithBuffer| is used,
// with its buffer member as extra storage. A null |string| or a |length| above
// the int32_t maximum is rejected with U_ILLEGAL_ARGUMENT_ERROR.
341 static UText* textOpenLatin1(UTextWithBuffer* utWithBuffer, const LChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
// Nothing is done when the caller already carries a failure status.
343 if (U_FAILURE(*status))
346 if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) {
347 *status = U_ILLEGAL_ARGUMENT_ERROR;
// Request extra storage of sizeof(buffer) bytes.
350 UText* text = utext_setup(&utWithBuffer->text, sizeof(utWithBuffer->buffer), status);
351 if (U_FAILURE(*status)) {
355 textInit(text, &textLatin1Funcs, string, length, priorContext, priorContextLength);
// Reports which context the current chunk of a UTF-16 UText belongs to.
// A chunk that points at the source string (|text->p|) is the primary text;
// any other non-null chunk is treated as the prior context.
359 static inline TextContext textUTF16GetCurrentContext(const UText* text)
// No chunk has been set up yet.
361 if (!text->chunkContents)
363 return text->chunkContents == text->p ? PrimaryContext : PriorContext;
366 static void textUTF16MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
368 ASSERT(text->chunkContents == text->p);
369 ASSERT_UNUSED(forward, forward ? nativeIndex >= text->b : nativeIndex > text->b);
370 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
371 text->chunkNativeStart = text->b;
372 text->chunkNativeLimit = nativeLength;
373 int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
374 // Ensure chunk length is well defined if computed length exceeds int32_t range.
375 ASSERT(length <= numeric_limits<int32_t>::max());
376 text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
377 text->nativeIndexingLimit = text->chunkLength;
378 int64_t offset = nativeIndex - text->chunkNativeStart;
379 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
380 ASSERT(offset <= numeric_limits<int32_t>::max());
381 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
384 static void textUTF16SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
386 ASSERT(!text->chunkContents || text->chunkContents == text->q);
387 text->chunkContents = static_cast<const UChar*>(text->p);
388 textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
391 static void textUTF16MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
393 ASSERT(text->chunkContents == text->q);
394 ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
395 ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
396 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
397 text->chunkNativeStart = 0;
398 text->chunkNativeLimit = text->b;
399 text->chunkLength = text->b;
400 text->nativeIndexingLimit = text->chunkLength;
401 int64_t offset = nativeIndex - text->chunkNativeStart;
402 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
403 ASSERT(offset <= numeric_limits<int32_t>::max());
404 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
407 static void textUTF16SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
409 ASSERT(!text->chunkContents || text->chunkContents == text->p);
410 text->chunkContents = static_cast<const UChar*>(text->q);
411 textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
// Access operation of the UTF-16 text provider: makes the chunk cover
// |nativeIndex| for iteration in the given direction. The fast path is
// delegated to textInChunkOrOutOfRange(); otherwise the index is pinned to the
// text and the chunk is moved within, or switched between, the prior context
// and the primary text.
414 static UBool textUTF16Access(UText* text, int64_t nativeIndex, UBool forward)
418 int64_t nativeLength = textNativeLength(text);
420 if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
// Pin the index against an upper bound of nativeLength - 1.
422 nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
423 TextContext currentContext = textUTF16GetCurrentContext(text);
424 TextContext newContext = textGetContext(text, nativeIndex, forward);
425 ASSERT(newContext != NoContext);
// Same context as before: just move the chunk within it.
426 if (newContext == currentContext) {
427 if (currentContext == PrimaryContext) {
428 textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
430 textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
// Different context: switch the chunk storage first.
432 } else if (newContext == PrimaryContext) {
433 textUTF16SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
435 ASSERT(newContext == PriorContext);
436 textUTF16SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
// UTextFuncs table for the UTF-16 text provider; textOpenUTF16() passes it to
// textInit(). The initializer entries are not visible in this excerpt.
441 static const struct UTextFuncs textUTF16Funcs = {
// Opens |text| as a UText over the UTF-16 |string| of |length| code units,
// with an optional UTF-16 prior context. No extra storage is requested.
// A null |string| or a |length| above the int32_t maximum is rejected with
// U_ILLEGAL_ARGUMENT_ERROR.
453 static UText* textOpenUTF16(UText* text, const UChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
// Nothing is done when the caller already carries a failure status.
455 if (U_FAILURE(*status))
458 if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) {
459 *status = U_ILLEGAL_ARGUMENT_ERROR;
463 text = utext_setup(text, 0, status);
464 if (U_FAILURE(*status)) {
468 textInit(text, &textUTF16Funcs, string, length, priorContext, priorContextLength);
// Pristine UText value (UTEXT_INITIALIZER) that callers copy into a
// UTextWithBuffer before opening a Latin-1 text on it.
472 static UText emptyText = UTEXT_INITIALIZER;
// Returns a word-break iterator set to the Latin-1 text |string| of |length|
// characters. A single ICU word BreakIterator for the current text-break
// locale is kept in a function-local static and re-targeted on every call.
474 static TextBreakIterator* wordBreakIterator(const LChar* string, int length)
476 UErrorCode errorCode = U_ZERO_ERROR;
477 static TextBreakIterator* breakIter = 0;
479 breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
480 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
// Stack UText whose extra storage is the scratch buffer embedded next to it.
485 UTextWithBuffer textLocal;
486 textLocal.text = emptyText;
487 textLocal.text.extraSize = sizeof(textLocal.buffer);
488 textLocal.text.pExtra = textLocal.buffer;
// Word iteration does not use a prior context, hence the 0, 0 arguments.
490 UErrorCode openStatus = U_ZERO_ERROR;
491 UText* text = textOpenLatin1(&textLocal, string, length, 0, 0, &openStatus);
492 if (U_FAILURE(openStatus)) {
493 WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
// A setText failure is only logged here.
497 UErrorCode setTextStatus = U_ZERO_ERROR;
498 breakIter->setText(text, setTextStatus);
499 if (U_FAILURE(setTextStatus))
500 WTF_LOG_ERROR("BreakIterator::setText failed with status %d", setTextStatus);
507 static void setText16(TextBreakIterator* iter, const UChar* string, int length)
509 UErrorCode errorCode = U_ZERO_ERROR;
510 UText uText = UTEXT_INITIALIZER;
511 utext_openUChars(&uText, string, length, &errorCode);
512 if (U_FAILURE(errorCode))
514 iter->setText(&uText, errorCode);
// Returns a word-break iterator set to the UTF-16 text |string| of |length|
// code units. A single ICU word BreakIterator for the current text-break
// locale is kept in a function-local static and re-targeted on every call.
517 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
519 UErrorCode errorCode = U_ZERO_ERROR;
520 static TextBreakIterator* breakIter = 0;
522 breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
523 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
527 setText16(breakIter, string, length);
// Returns a word-break iterator over the |length| characters of |string|
// starting at |start|, dispatching to the 8-bit or 16-bit overload.
// NOTE(review): the condition choosing between the two overloads is not
// visible in this excerpt; presumably string.is8Bit() - confirm.
531 TextBreakIterator* wordBreakIterator(const String& string, int start, int length)
// Empty strings are handled separately before any character access.
533 if (string.isEmpty())
536 return wordBreakIterator(string.characters8() + start, length);
537 return wordBreakIterator(string.characters16() + start, length);
// Takes a line-break iterator for |locale| from the shared pool and sets it to
// the Latin-1 text |string| of |length| characters, preceded by the optional
// UTF-16 |priorContext|. Pair each call with releaseLineBreakIterator().
540 TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
542 TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
// Stack UText whose extra storage is the scratch buffer embedded next to it.
546 UTextWithBuffer textLocal;
547 textLocal.text = emptyText;
548 textLocal.text.extraSize = sizeof(textLocal.buffer);
549 textLocal.text.pExtra = textLocal.buffer;
551 UErrorCode openStatus = U_ZERO_ERROR;
552 UText* text = textOpenLatin1(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
553 if (U_FAILURE(openStatus)) {
554 WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
558 UErrorCode setTextStatus = U_ZERO_ERROR;
559 iterator->setText(text, setTextStatus);
560 if (U_FAILURE(setTextStatus)) {
// NOTE(review): the message names ubrk_setUText, but the failing call above is
// BreakIterator::setText.
561 WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
// Takes a line-break iterator for |locale| from the shared pool and sets it to
// the UTF-16 text |string| of |length| code units, preceded by the optional
// UTF-16 |priorContext|. Pair each call with releaseLineBreakIterator().
570 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
572 TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
// The UTF-16 provider needs no scratch buffer, so a plain stack UText suffices.
576 UText textLocal = UTEXT_INITIALIZER;
578 UErrorCode openStatus = U_ZERO_ERROR;
579 UText* text = textOpenUTF16(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
580 if (U_FAILURE(openStatus)) {
581 WTF_LOG_ERROR("textOpenUTF16 failed with status %d", openStatus);
585 UErrorCode setTextStatus = U_ZERO_ERROR;
586 iterator->setText(text, setTextStatus);
587 if (U_FAILURE(setTextStatus)) {
// NOTE(review): the message names ubrk_setUText, but the failing call above is
// BreakIterator::setText.
588 WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
597 void releaseLineBreakIterator(TextBreakIterator* iterator)
599 ASSERT_ARG(iterator, iterator);
601 LineBreakIteratorPool::sharedPool().put(iterator);
// Single-slot cache holding one idle character-break iterator. Updated under a
// mutex by compareAndSwapNonSharedCharacterBreakIterator().
604 static TextBreakIterator* nonSharedCharacterBreakIterator;
// Compare-and-swap on the cached iterator slot, performed while holding a
// function-local static Mutex: the slot is overwritten with |newValue| only
// when it currently equals |expected|.
// NOTE(review): the return statements are not visible in this excerpt;
// presumably the result reports whether the swap happened - confirm.
606 static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue)
608 DEFINE_STATIC_LOCAL(Mutex, nonSharedCharacterBreakIteratorMutex, ());
609 MutexLocker locker(nonSharedCharacterBreakIteratorMutex);
// Slot changed since the caller looked at it: leave it alone.
610 if (nonSharedCharacterBreakIterator != expected)
612 nonSharedCharacterBreakIterator = newValue;
// Constructs a character-break iterator over |string|. Empty strings are
// handled up front. For some strings the 8-bit character pointer and length
// are recorded; otherwise an ICU iterator is set up over the 16-bit characters.
// NOTE(review): the member initializer list and the condition separating the
// two paths are not visible in this excerpt; presumably the split is on
// m_is8Bit - confirm.
616 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const String& string)
623 if (string.isEmpty())
626 m_is8Bit = string.is8Bit();
629 m_charaters8 = string.characters8();
631 m_length = string.length();
635 createIteratorForBuffer(string.characters16(), string.length());
// Constructs a character-break iterator over the UTF-16 |buffer| of |length|
// code units by setting up an ICU iterator over it.
// NOTE(review): the member initializer list is not visible in this excerpt.
638 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const UChar* buffer, unsigned length)
645 createIteratorForBuffer(buffer, length);
// Obtains an ICU character-break iterator for this object and sets it to
// |buffer|. The cached iterator is claimed with a compare-and-swap if one is
// available; otherwise a new iterator is created for the current text-break
// locale.
648 void NonSharedCharacterBreakIterator::createIteratorForBuffer(const UChar* buffer, unsigned length)
650 m_iterator = nonSharedCharacterBreakIterator;
// NOTE(review): despite its name, |createdIterator| is true when the cached
// iterator was successfully claimed (slot swapped to 0), not when a new one
// was created.
651 bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0);
652 if (!createdIterator) {
653 UErrorCode errorCode = U_ZERO_ERROR;
654 m_iterator = icu::BreakIterator::createCharacterInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
655 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
658 setText16(m_iterator, buffer, length);
// Tries to park this object's iterator in the cache slot, which succeeds only
// if the slot is currently empty (expected value 0).
// NOTE(review): the statement executed when the swap fails is not visible in
// this excerpt; presumably the iterator is destroyed - confirm.
661 NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
665 if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator))
// Advances to the next character break. One path forwards to the ICU
// iterator; the other walks the text directly, returning TextBreakDone once
// m_offset has reached m_length and otherwise advancing m_offset by the length
// of the cluster starting there.
// NOTE(review): the condition choosing between the two paths is not visible in
// this excerpt.
669 int NonSharedCharacterBreakIterator::next()
672 return m_iterator->next();
674 if (m_offset >= m_length)
675 return TextBreakDone;
677 m_offset += clusterLengthStartingAt(m_offset);
// Returns the current break position; the visible path forwards to the ICU
// iterator.
// NOTE(review): any alternative path is not visible in this excerpt.
681 int NonSharedCharacterBreakIterator::current()
684 return m_iterator->current();
// Reports whether |offset| is a character boundary. One path asks the ICU
// iterator; the other treats every offset as a break unless it is an LF that
// directly follows a CR.
// NOTE(review): the condition choosing between the two paths is not visible in
// this excerpt.
688 bool NonSharedCharacterBreakIterator::isBreak(int offset) const
691 return m_iterator->isBoundary(offset);
692 return !isLFAfterCR(offset);
// Returns the character boundary preceding |offset|, or TextBreakDone when
// there is none. One path forwards to the ICU iterator; the other special-cases
// an LF that directly follows a CR.
// NOTE(review): several conditions and return statements are not visible in
// this excerpt.
695 int NonSharedCharacterBreakIterator::preceding(int offset) const
698 return m_iterator->preceding(offset);
700 return TextBreakDone;
701 if (isLFAfterCR(offset))
// Returns the character boundary following |offset|. One path forwards to the
// ICU iterator; the other returns TextBreakDone when |offset| is at or beyond
// m_length and otherwise skips the cluster that starts at |offset|.
// NOTE(review): the condition choosing between the two paths is not visible in
// this excerpt.
706 int NonSharedCharacterBreakIterator::following(int offset) const
709 return m_iterator->following(offset);
// The cast makes negative offsets compare as large values, so they also take this exit.
710 if (static_cast<unsigned>(offset) >= m_length)
711 return TextBreakDone;
712 return offset + clusterLengthStartingAt(offset);
// Returns a sentence-break iterator set to the UTF-16 text |string| of
// |length| code units. A single ICU sentence BreakIterator for the current
// text-break locale is kept in a function-local static and re-targeted on
// every call.
715 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
717 UErrorCode openStatus = U_ZERO_ERROR;
718 static TextBreakIterator* iterator = 0;
720 iterator = icu::BreakIterator::createSentenceInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
721 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
726 setText16(iterator, string, length);
730 bool isWordTextBreak(TextBreakIterator* iterator)
732 icu::RuleBasedBreakIterator* ruleBasedBreakIterator = static_cast<icu::RuleBasedBreakIterator*>(iterator);
733 int ruleStatus = ruleBasedBreakIterator->getRuleStatus();
734 return ruleStatus != UBRK_WORD_NONE;
// Returns a rule-based break iterator built from the ICU rule source
// |breakRules| and set to the UTF-16 text |string| of |length| code units.
// The iterator is kept in a function-local static.
// NOTE(review): the guard around creation and the declaration of |rules| are
// not visible in this excerpt; because the iterator is static, presumably only
// the rules passed on the first call take effect - confirm.
737 static TextBreakIterator* setUpIteratorWithRules(const char* breakRules, const UChar* string, int length)
742 static TextBreakIterator* iterator = 0;
744 UParseError parseStatus;
745 UErrorCode openStatus = U_ZERO_ERROR;
// Widen the rule source into |rules| so it can be wrapped in an icu::UnicodeString.
747 String(breakRules).appendTo(rules);
749 iterator = new icu::RuleBasedBreakIterator(icu::UnicodeString(rules.data(), rules.size()), parseStatus, openStatus);
750 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
755 setText16(iterator, string, length);
// Returns a break iterator for cursor movement over the UTF-16 text |string|
// of |length| code units, built from the custom rule set below via
// setUpIteratorWithRules().
759 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
761 // This rule set is based on character-break iterator rules of ICU 4.0
762 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
763 // The major differences from the original ones are listed below:
764 // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
765 // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
766 // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790);
767 // * Added rules that prevent a cursor from moving before Japanese half-width katakana voiced marks; and
768 // * Added rules for regional indicator symbols.
769 static const char* const kRules =
770 "$CR = [\\p{Grapheme_Cluster_Break = CR}];"
771 "$LF = [\\p{Grapheme_Cluster_Break = LF}];"
772 "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
773 "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks
774 "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
775 "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
776 "$L = [\\p{Grapheme_Cluster_Break = L}];"
777 "$V = [\\p{Grapheme_Cluster_Break = V}];"
778 "$T = [\\p{Grapheme_Cluster_Break = T}];"
779 "$LV = [\\p{Grapheme_Cluster_Break = LV}];"
780 "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];"
781 "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha
782 "$HinV = \\u094D;" // Devanagari Sign Virama
783 "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha
784 "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha
785 "$BenV = \\u09CD;" // Bengali Sign Virama
786 "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha
787 "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha
788 "$PanV = \\u0A4D;" // Gurmukhi Sign Virama
789 "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha
790 "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha
791 "$GujV = \\u0ACD;" // Gujarati Sign Virama
792 "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha
793 "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha
794 "$OriV = \\u0B4D;" // Oriya Sign Virama
795 "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha
796 "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha
797 "$TelV = \\u0C4D;" // Telugu Sign Virama
798 "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha. NOTE(review): range starts at U+0C14 (AU), not U+0C15 (KA) as the other scripts' xx15 pattern suggests - confirm.
799 "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha
800 "$KanV = \\u0CCD;" // Kannada Sign Virama
801 "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter Ka,...,Ha
802 "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha
803 "$MalV = \\u0D4D;" // Malayalam Sign Virama
804 "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter Ka,...,Ha
805 "$RI = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
809 "$L ($L | $V | $LV | $LVT);"
810 "($LV | $V) ($V | $T);"
812 "[^$Control $CR $LF] $Extend;"
813 "[^$Control $CR $LF] $SpacingMark;"
816 "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward)
817 "$Ben0 $BenV $Ben1;" // Bengali Virama (forward)
818 "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward)
819 "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward)
820 "$Ori0 $OriV $Ori1;" // Oriya Virama (forward)
821 "$Tel0 $TelV $Tel1;" // Telugu Virama (forward)
822 "$Kan0 $KanV $Kan1;" // Kannada Virama (forward)
823 "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward)
826 "($L | $V | $LV | $LVT) $L;"
827 "($V | $T) ($LV | $V);"
829 "$Extend [^$Control $CR $LF];"
830 "$SpacingMark [^$Control $CR $LF];"
833 "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward)
834 "$Ben1 $BenV $Ben0;" // Bengali Virama (backward)
835 "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward)
836 "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward)
837 "$Ori1 $OriV $Ori0;" // Oriya Virama (backward)
838 "$Tel1 $TelV $Tel0;" // Telugu Virama (backward)
839 "$Kan1 $KanV $Kan0;" // Kannada Virama (backward)
840 "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward)
844 return setUpIteratorWithRules(kRules, string, length);