2 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3 * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
23 #include "platform/text/TextBreakIterator.h"
25 #include "platform/text/TextBreakIteratorInternalICU.h"
26 #include "wtf/Assertions.h"
27 #include "wtf/HashMap.h"
28 #include "wtf/PassOwnPtr.h"
29 #include "wtf/ThreadSpecific.h"
30 #include "wtf/ThreadingPrimitives.h"
31 #include "wtf/text/AtomicString.h"
32 #include "wtf/text/CString.h"
33 #include "wtf/text/WTFString.h"
34 #include <unicode/rbbi.h>
35 #include <unicode/ubrk.h>
// Pool of reusable ICU line-break iterators, looked up by locale.
// sharedPool() keeps the pool inside a WTF::ThreadSpecific (presumably one
// pool per thread - confirm against WTF::ThreadSpecific).
// Iterators handed out by take() are tracked in m_vendedIterators together
// with their locale until put() returns them to the pool.
41 class LineBreakIteratorPool {
42 WTF_MAKE_NONCOPYABLE(LineBreakIteratorPool);
44 static LineBreakIteratorPool& sharedPool()
46 static WTF::ThreadSpecific<LineBreakIteratorPool>* pool = new WTF::ThreadSpecific<LineBreakIteratorPool>;
50 static PassOwnPtr<LineBreakIteratorPool> create() { return adoptPtr(new LineBreakIteratorPool); }
// Hands out a line-break iterator for |locale|. A pooled entry whose locale
// compares equal is looked up first; otherwise ICU is asked for a new line
// instance. An empty locale uses currentTextBreakLocaleID().
52 icu::BreakIterator* take(const AtomicString& locale)
54 icu::BreakIterator* iterator = 0;
// Linear scan over the pooled (locale, iterator) entries.
55 for (size_t i = 0; i < m_pool.size(); ++i) {
56 if (m_pool[i].first == locale) {
57 iterator = m_pool[i].second;
64 UErrorCode openStatus = U_ZERO_ERROR;
65 bool localeIsEmpty = locale.isEmpty();
66 iterator = icu::BreakIterator::createLineInstance(localeIsEmpty ? icu::Locale(currentTextBreakLocaleID()) : icu::Locale(locale.utf8().data()), openStatus);
67 // locale comes from a web page and it can be invalid, leading ICU
68 // to fail, in which case we fall back to the default locale.
69 if (!localeIsEmpty && U_FAILURE(openStatus)) {
70 openStatus = U_ZERO_ERROR;
71 iterator = icu::BreakIterator::createLineInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
74 if (U_FAILURE(openStatus)) {
75 WTF_LOG_ERROR("icu::BreakIterator construction failed with status %d", openStatus);
// Remember which locale this iterator was vended for so put() can re-pool it.
80 ASSERT(!m_vendedIterators.contains(iterator));
81 m_vendedIterators.set(iterator, locale);
// Returns a previously vended iterator to the pool. When the pool already
// holds |capacity| entries, the iterator stored at index 0 is deleted first.
85 void put(icu::BreakIterator* iterator)
87 ASSERT_ARG(iterator, m_vendedIterators.contains(iterator));
89 if (m_pool.size() == capacity) {
90 delete(m_pool[0].second);
// m_vendedIterators.take() removes the tracking entry and yields its locale.
94 m_pool.append(Entry(m_vendedIterators.take(iterator), iterator));
98 LineBreakIteratorPool() { }
// Maximum number of idle iterators kept; also the Vector's inline capacity.
100 static const size_t capacity = 4;
// Pool entry: (locale, iterator).
102 typedef pair<AtomicString, icu::BreakIterator*> Entry;
103 typedef Vector<Entry, capacity> Pool;
// Iterators currently handed out, mapped to the locale they were taken for.
105 HashMap<icu::BreakIterator*, AtomicString> m_vendedIterators;
107 friend WTF::ThreadSpecific<LineBreakIteratorPool>::operator LineBreakIteratorPool*();
// Identifies which text region a UText chunk refers to: the prior-context
// string (kept in UText::q by textInit) or the primary string.
// The access callbacks assert that textGetContext never yields NoContext;
// presumably NoContext describes a UText with no chunk loaded yet - confirm.
110 enum TextContext { NoContext, PriorContext, PrimaryContext };
// Number of UChars in the inline scratch buffer declared below.
112 const int textBufferCapacity = 16;
// Scratch buffer member; callers in this file install it as the UText's
// pExtra (textLocal.text.pExtra = textLocal.buffer). The enclosing struct
// header is not visible in this excerpt.
116 UChar buffer[textBufferCapacity];
// Pins |index| against |limit|. |index| is taken by reference. Only the
// upper-bound comparison is visible in this excerpt; the access callbacks
// call this with nativeLength - 1 as the limit.
119 static inline int64_t textPinIndex(int64_t& index, int64_t limit)
123 else if (index > limit)
// Total native length of the text: UText field a plus field b.
// textInit stores the prior-context length in b; a is presumably the primary
// string length (that assignment is not visible in this excerpt).
128 static inline int64_t textNativeLength(UText* text)
130 return text->a + text->b;
// Rebases |pointer| (updated in place) when cloning a UText:
//  - if it points into source's pExtra buffer, it is moved to the same byte
//    offset inside destination's pExtra buffer;
//  - otherwise, if it points into the source UText struct itself, it is moved
//    to the same byte offset inside the destination struct.
133 // Relocate pointer from source into destination as required.
134 static void textFixPointer(const UText* source, UText* destination, const void*& pointer)
136 if (pointer >= source->pExtra && pointer < static_cast<char*>(source->pExtra) + source->extraSize) {
137 // Pointer references source extra buffer.
138 pointer = static_cast<char*>(destination->pExtra) + (static_cast<const char*>(pointer) - static_cast<const char*>(source->pExtra));
139 } else if (pointer >= source && pointer < reinterpret_cast<const char*>(source) + source->sizeOfStruct) {
140 // Pointer references source text structure, but not source extra buffer.
141 pointer = reinterpret_cast<char*>(destination) + (static_cast<const char*>(pointer) - reinterpret_cast<const char*>(source));
// Shallow-clone callback. Deep clones are not supported (asserted below).
// Sets up |destination| with the same extra-buffer size as |source|, copies
// the struct and the extra buffer, then relocates the pointers that may
// reference the source's own storage.
145 static UText* textClone(UText* destination, const UText* source, UBool deep, UErrorCode* status)
147 ASSERT_UNUSED(deep, !deep);
148 if (U_FAILURE(*status))
150 int32_t extraSize = source->extraSize;
151 destination = utext_setup(destination, extraSize, status);
152 if (U_FAILURE(*status))
// Save the fields utext_setup produced for destination; the struct memcpy
// below overwrites them with the source's values.
154 void* extraNew = destination->pExtra;
155 int32_t flags = destination->flags;
// Copy no more than the smaller of the two struct sizes.
156 int sizeToCopy = std::min(source->sizeOfStruct, destination->sizeOfStruct);
157 memcpy(destination, source, sizeToCopy);
158 destination->pExtra = extraNew;
159 destination->flags = flags;
160 memcpy(destination->pExtra, source->pExtra, extraSize);
161 textFixPointer(source, destination, destination->context);
162 textFixPointer(source, destination, destination->p);
163 textFixPointer(source, destination, destination->q);
164 ASSERT(!destination->r);
// chunkContents is a const UChar*; round-trip through const void* so it can
// be passed to textFixPointer by reference.
165 const void * chunkContents = static_cast<const void*>(destination->chunkContents);
166 textFixPointer(source, destination, chunkContents);
167 destination->chunkContents = static_cast<const UChar*>(chunkContents);
// Extract callback that is not expected to be invoked: it asserts and reports
// U_UNSUPPORTED_ERROR through |errorCode|. All other parameters are unnamed
// and unused.
171 static int32_t textExtract(UText*, int64_t, int64_t, UChar*, int32_t, UErrorCode* errorCode)
173 // In the present context, this text provider is used only with ICU functions
174 // that do not perform an extract operation.
175 ASSERT_NOT_REACHED();
176 *errorCode = U_UNSUPPORTED_ERROR;
// Close callback taking the UText being closed. The body is not visible in
// this excerpt; presumably installed in the UTextFuncs tables in this file.
180 static void textClose(UText* text)
// Decides which context |nativeIndex| falls in, where text->b is the length
// of the prior context:
//  - no prior context (b == 0) or an index beyond b: PrimaryContext;
//  - index exactly at b: PrimaryContext when moving forward, PriorContext
//    when moving backward.
// The remaining case (index below b) is handled by a line not visible here.
185 static inline TextContext textGetContext(const UText* text, int64_t nativeIndex, UBool forward)
187 if (!text->b || nativeIndex > text->b)
188 return PrimaryContext;
189 if (nativeIndex == text->b)
190 return forward ? PrimaryContext : PriorContext;
// Reports which context the currently loaded chunk belongs to for the Latin-1
// provider: PrimaryContext when chunkContents points at the pExtra buffer,
// PriorContext otherwise. The result for a null chunkContents is produced by
// a line not visible in this excerpt.
194 static inline TextContext textLatin1GetCurrentContext(const UText* text)
196 if (!text->chunkContents)
198 return text->chunkContents == text->pExtra ? PrimaryContext : PriorContext;
// Repositions the chunk window inside the primary (Latin-1) string and refills
// the pExtra buffer with the widened characters.
// Two window computations are visible: one that starts at |nativeIndex| and
// one that ends at |nativeIndex|; presumably selected by |forward| (the
// branch lines are not visible in this excerpt). Each window spans at most
// extraSize / sizeof(UChar) code units.
201 static void textLatin1MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
203 ASSERT(text->chunkContents == text->pExtra);
205 ASSERT(nativeIndex >= text->b && nativeIndex < nativeLength);
206 text->chunkNativeStart = nativeIndex;
207 text->chunkNativeLimit = nativeIndex + text->extraSize / sizeof(UChar);
// Clamp the window end to the end of the text.
208 if (text->chunkNativeLimit > nativeLength)
209 text->chunkNativeLimit = nativeLength;
211 ASSERT(nativeIndex > text->b && nativeIndex <= nativeLength);
212 text->chunkNativeLimit = nativeIndex;
213 text->chunkNativeStart = nativeIndex - text->extraSize / sizeof(UChar);
// Clamp the window start to the start of the primary string (offset b).
214 if (text->chunkNativeStart < text->b)
215 text->chunkNativeStart = text->b;
217 int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
218 // Ensure chunk length is well defined if computed length exceeds int32_t range.
219 ASSERT(length <= std::numeric_limits<int32_t>::max());
220 text->chunkLength = length <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
221 text->nativeIndexingLimit = text->chunkLength;
// Cursor sits at the chunk start when moving forward, at its end otherwise.
222 text->chunkOffset = forward ? 0 : text->chunkLength;
// Copy LChars from the primary string (text->p, indexed relative to b) into
// the UChar chunk buffer.
223 StringImpl::copyChars(const_cast<UChar*>(text->chunkContents), static_cast<const LChar*>(text->p) + (text->chunkNativeStart - text->b), static_cast<unsigned>(text->chunkLength));
// Switches the Latin-1 provider to the primary context: points chunkContents
// at the pExtra buffer, then positions the chunk via
// textLatin1MoveInPrimaryContext. Expects no chunk or a prior-context chunk
// (chunkContents == q) on entry.
226 static void textLatin1SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
228 ASSERT(!text->chunkContents || text->chunkContents == text->q);
229 text->chunkContents = static_cast<const UChar*>(text->pExtra);
230 textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
// Positions the chunk on the prior context: the chunk always covers the whole
// prior-context range [0, b), and chunkOffset becomes |nativeIndex| relative
// to the chunk start, clamped to the chunk length.
// |nativeLength| and |forward| are only used in assertions.
233 static void textLatin1MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
235 ASSERT(text->chunkContents == text->q);
236 ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
237 ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
238 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
239 text->chunkNativeStart = 0;
240 text->chunkNativeLimit = text->b;
241 text->chunkLength = text->b;
242 text->nativeIndexingLimit = text->chunkLength;
243 int64_t offset = nativeIndex - text->chunkNativeStart;
244 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
245 ASSERT(offset <= std::numeric_limits<int32_t>::max());
246 text->chunkOffset = std::min(offset <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
// Switches the Latin-1 provider to the prior context: points chunkContents at
// the prior-context string (text->q), then positions the chunk via
// textLatin1MoveInPriorContext. Expects no chunk or a primary-context chunk
// (chunkContents == pExtra) on entry.
249 static void textLatin1SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
251 ASSERT(!text->chunkContents || text->chunkContents == text->pExtra);
252 text->chunkContents = static_cast<const UChar*>(text->q);
253 textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
// Handles the cases an access callback can answer without loading a new
// chunk; both access callbacks in this file call it before doing anything
// else. Two groups of checks are visible (presumably the forward and backward
// cases; the branch and return lines are not visible in this excerpt):
//  - index within [chunkNativeStart, chunkNativeLimit): only chunkOffset is
//    updated; index at or past the end of text while the chunk already ends
//    there: chunkOffset = chunkLength and |isAccessible| = FALSE.
//  - index within (chunkNativeStart, chunkNativeLimit]: only chunkOffset is
//    updated; index at or before 0 while the chunk starts at 0:
//    chunkOffset = 0 and |isAccessible| = FALSE.
256 static inline bool textInChunkOrOutOfRange(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward, UBool& isAccessible)
259 if (nativeIndex >= text->chunkNativeStart && nativeIndex < text->chunkNativeLimit) {
260 int64_t offset = nativeIndex - text->chunkNativeStart;
261 // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
262 ASSERT(offset <= std::numeric_limits<int32_t>::max());
263 text->chunkOffset = offset <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
267 if (nativeIndex >= nativeLength && text->chunkNativeLimit == nativeLength) {
268 text->chunkOffset = text->chunkLength;
269 isAccessible = FALSE;
273 if (nativeIndex > text->chunkNativeStart && nativeIndex <= text->chunkNativeLimit) {
274 int64_t offset = nativeIndex - text->chunkNativeStart;
275 // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
276 ASSERT(offset <= std::numeric_limits<int32_t>::max());
277 text->chunkOffset = offset <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
281 if (nativeIndex <= 0 && !text->chunkNativeStart) {
282 text->chunkOffset = 0;
283 isAccessible = FALSE;
// Access callback for the Latin-1 provider. After textInChunkOrOutOfRange has
// had a chance to answer, the index is pinned (upper limit nativeLength - 1)
// and the chunk is either moved within the current context or switched to the
// other context, depending on where the index falls.
290 static UBool textLatin1Access(UText* text, int64_t nativeIndex, UBool forward)
294 int64_t nativeLength = textNativeLength(text);
296 if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
298 nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
299 TextContext currentContext = textLatin1GetCurrentContext(text);
300 TextContext newContext = textGetContext(text, nativeIndex, forward);
301 ASSERT(newContext != NoContext);
// Same context: just reposition the chunk. Different context: switch first.
302 if (newContext == currentContext) {
303 if (currentContext == PrimaryContext) {
304 textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
306 textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
308 } else if (newContext == PrimaryContext) {
309 textLatin1SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
311 ASSERT(newContext == PriorContext);
312 textLatin1SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
// UTextFuncs table that textOpenLatin1 passes to textInit. The initializer
// entries are not visible in this excerpt.
317 static const struct UTextFuncs textLatin1Funcs = {
// Fills in the provider-specific fields of an already set-up UText: the
// function table, the stable-chunks provider property, |string| as context,
// and the prior context pointer/length in q/b.
// |length| is not used by any visible line; presumably it and |string| are
// stored in a/p by lines not visible in this excerpt.
329 static void textInit(UText* text, const UTextFuncs* funcs, const void* string, unsigned length, const UChar* priorContext, int priorContextLength)
331 text->pFuncs = funcs;
332 text->providerProperties = 1 << UTEXT_PROVIDER_STABLE_CHUNKS;
333 text->context = string;
336 text->q = priorContext;
337 text->b = priorContextLength;
// Opens a UText over a Latin-1 string with an optional UTF-16 prior context.
// A null |string| or a |length| above the int32_t maximum is rejected with
// U_ILLEGAL_ARGUMENT_ERROR. The UText embedded in |utWithBuffer| is set up
// with sizeof(buffer) bytes of extra space and then initialised with the
// Latin-1 function table. Does nothing useful if |*status| already indicates
// failure (the early-exit line is not visible in this excerpt).
340 static UText* textOpenLatin1(UTextWithBuffer* utWithBuffer, const LChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
342 if (U_FAILURE(*status))
345 if (!string || length > static_cast<unsigned>(std::numeric_limits<int32_t>::max())) {
346 *status = U_ILLEGAL_ARGUMENT_ERROR;
349 UText* text = utext_setup(&utWithBuffer->text, sizeof(utWithBuffer->buffer), status);
350 if (U_FAILURE(*status)) {
354 textInit(text, &textLatin1Funcs, string, length, priorContext, priorContextLength);
// Reports which context the currently loaded chunk belongs to for the UTF-16
// provider: PrimaryContext when chunkContents points at the primary string
// (text->p), PriorContext otherwise. The result for a null chunkContents is
// produced by a line not visible in this excerpt.
358 static inline TextContext textUTF16GetCurrentContext(const UText* text)
360 if (!text->chunkContents)
362 return text->chunkContents == text->p ? PrimaryContext : PriorContext;
// Positions the chunk on the primary UTF-16 string: the chunk always covers
// the whole primary range [b, nativeLength), and chunkOffset becomes
// |nativeIndex| relative to b, clamped to the chunk length.
// |forward| is only used in assertions.
365 static void textUTF16MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
367 ASSERT(text->chunkContents == text->p);
368 ASSERT_UNUSED(forward, forward ? nativeIndex >= text->b : nativeIndex > text->b);
369 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
370 text->chunkNativeStart = text->b;
371 text->chunkNativeLimit = nativeLength;
372 int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
373 // Ensure chunk length is well defined if computed length exceeds int32_t range.
374 ASSERT(length <= std::numeric_limits<int32_t>::max());
375 text->chunkLength = length <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
376 text->nativeIndexingLimit = text->chunkLength;
377 int64_t offset = nativeIndex - text->chunkNativeStart;
378 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
379 ASSERT(offset <= std::numeric_limits<int32_t>::max());
380 text->chunkOffset = std::min(offset <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
// Switches the UTF-16 provider to the primary context: points chunkContents
// directly at the primary string (text->p), then positions the chunk via
// textUTF16MoveInPrimaryContext. Expects no chunk or a prior-context chunk
// (chunkContents == q) on entry.
383 static void textUTF16SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
385 ASSERT(!text->chunkContents || text->chunkContents == text->q);
386 text->chunkContents = static_cast<const UChar*>(text->p);
387 textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
// Positions the chunk on the prior context for the UTF-16 provider: the chunk
// always covers the whole prior-context range [0, b), and chunkOffset becomes
// |nativeIndex| relative to the chunk start, clamped to the chunk length.
// |nativeLength| and |forward| are only used in assertions.
390 static void textUTF16MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
392 ASSERT(text->chunkContents == text->q);
393 ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
394 ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
395 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
396 text->chunkNativeStart = 0;
397 text->chunkNativeLimit = text->b;
398 text->chunkLength = text->b;
399 text->nativeIndexingLimit = text->chunkLength;
400 int64_t offset = nativeIndex - text->chunkNativeStart;
401 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
402 ASSERT(offset <= std::numeric_limits<int32_t>::max());
403 text->chunkOffset = std::min(offset <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
// Switches the UTF-16 provider to the prior context: points chunkContents at
// the prior-context string (text->q), then positions the chunk via
// textUTF16MoveInPriorContext. Expects no chunk or a primary-context chunk
// (chunkContents == p) on entry.
406 static void textUTF16SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
408 ASSERT(!text->chunkContents || text->chunkContents == text->p);
409 text->chunkContents = static_cast<const UChar*>(text->q);
410 textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
// Access callback for the UTF-16 provider. After textInChunkOrOutOfRange has
// had a chance to answer, the index is pinned (upper limit nativeLength - 1)
// and the chunk is either moved within the current context or switched to the
// other context, depending on where the index falls.
413 static UBool textUTF16Access(UText* text, int64_t nativeIndex, UBool forward)
417 int64_t nativeLength = textNativeLength(text);
419 if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
421 nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
422 TextContext currentContext = textUTF16GetCurrentContext(text);
423 TextContext newContext = textGetContext(text, nativeIndex, forward);
424 ASSERT(newContext != NoContext);
// Same context: just reposition the chunk. Different context: switch first.
425 if (newContext == currentContext) {
426 if (currentContext == PrimaryContext) {
427 textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
429 textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
431 } else if (newContext == PrimaryContext) {
432 textUTF16SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
434 ASSERT(newContext == PriorContext);
435 textUTF16SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
// UTextFuncs table that textOpenUTF16 passes to textInit. The initializer
// entries are not visible in this excerpt.
440 static const struct UTextFuncs textUTF16Funcs = {
// Opens a UText over a UTF-16 string with an optional UTF-16 prior context.
// A null |string| or a |length| above the int32_t maximum is rejected with
// U_ILLEGAL_ARGUMENT_ERROR. |text| is set up with no extra buffer (size 0)
// and then initialised with the UTF-16 function table.
452 static UText* textOpenUTF16(UText* text, const UChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
454 if (U_FAILURE(*status))
457 if (!string || length > static_cast<unsigned>(std::numeric_limits<int32_t>::max())) {
458 *status = U_ILLEGAL_ARGUMENT_ERROR;
462 text = utext_setup(text, 0, status);
463 if (U_FAILURE(*status)) {
467 textInit(text, &textUTF16Funcs, string, length, priorContext, priorContextLength);
// Freshly initialised UText that callers in this file copy into the text
// member of their local UTextWithBuffer before opening a Latin-1 UText.
471 static UText emptyText = UTEXT_INITIALIZER;
// Word-break iterator over a Latin-1 string.
// The iterator is held in a function-local static and created through
// icu::BreakIterator::createWordInstance for currentTextBreakLocaleID().
// The text is attached through a stack-allocated UTextWithBuffer opened with
// textOpenLatin1 (no prior context), then handed to setText.
// Failures from textOpenLatin1 and setText are logged.
473 static TextBreakIterator* wordBreakIterator(const LChar* string, int length)
475 UErrorCode errorCode = U_ZERO_ERROR;
476 static TextBreakIterator* breakIter = 0;
478 breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
479 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
// Point the UText at its own inline buffer before opening it.
484 UTextWithBuffer textLocal;
485 textLocal.text = emptyText;
486 textLocal.text.extraSize = sizeof(textLocal.buffer);
487 textLocal.text.pExtra = textLocal.buffer;
489 UErrorCode openStatus = U_ZERO_ERROR;
490 UText* text = textOpenLatin1(&textLocal, string, length, 0, 0, &openStatus);
491 if (U_FAILURE(openStatus)) {
492 WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
496 UErrorCode setTextStatus = U_ZERO_ERROR;
497 breakIter->setText(text, setTextStatus);
498 if (U_FAILURE(setTextStatus))
499 WTF_LOG_ERROR("BreakIterator::setText failed with status %d", setTextStatus);
// Attaches a UTF-16 string to |iter|: opens a stack UText over the characters
// with utext_openUChars and passes it to iter->setText. The handling of a
// utext_openUChars failure is on a line not visible in this excerpt.
506 static void setText16(TextBreakIterator* iter, const UChar* string, int length)
508 UErrorCode errorCode = U_ZERO_ERROR;
509 UText uText = UTEXT_INITIALIZER;
510 utext_openUChars(&uText, string, length, &errorCode);
511 if (U_FAILURE(errorCode))
513 iter->setText(&uText, errorCode);
// Word-break iterator over a UTF-16 string. The iterator is held in a
// function-local static (separate from the Latin-1 overload's static) and
// created through icu::BreakIterator::createWordInstance for
// currentTextBreakLocaleID(); the text is attached with setText16.
516 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
518 UErrorCode errorCode = U_ZERO_ERROR;
519 static TextBreakIterator* breakIter = 0;
521 breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
522 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
526 setText16(breakIter, string, length);
// Word-break iterator over the |length| characters of |string| beginning at
// |start|. Dispatches to the LChar or UChar overload using characters8() or
// characters16(); the selecting condition and the empty-string result are on
// lines not visible in this excerpt.
530 TextBreakIterator* wordBreakIterator(const String& string, int start, int length)
532 if (string.isEmpty())
535 return wordBreakIterator(string.characters8() + start, length);
536 return wordBreakIterator(string.characters16() + start, length);
// Takes a line-break iterator for |locale| from the shared pool and attaches
// a Latin-1 string to it, with an optional UTF-16 prior context. The text is
// opened through a stack-allocated UTextWithBuffer whose inline buffer serves
// as the UText's extra storage. Failures are logged.
// NOTE(review): the second log message names ubrk_setUText although the call
// made here is iterator->setText - confirm whether the text should be updated.
539 TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
541 TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
545 UTextWithBuffer textLocal;
546 textLocal.text = emptyText;
547 textLocal.text.extraSize = sizeof(textLocal.buffer);
548 textLocal.text.pExtra = textLocal.buffer;
550 UErrorCode openStatus = U_ZERO_ERROR;
551 UText* text = textOpenLatin1(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
552 if (U_FAILURE(openStatus)) {
553 WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
557 UErrorCode setTextStatus = U_ZERO_ERROR;
558 iterator->setText(text, setTextStatus);
559 if (U_FAILURE(setTextStatus)) {
560 WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
// Takes a line-break iterator for |locale| from the shared pool and attaches
// a UTF-16 string to it, with an optional UTF-16 prior context. The text is
// opened through a stack UText via textOpenUTF16. Failures are logged.
// NOTE(review): the second log message names ubrk_setUText although the call
// made here is iterator->setText - confirm whether the text should be updated.
569 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
571 TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
575 UText textLocal = UTEXT_INITIALIZER;
577 UErrorCode openStatus = U_ZERO_ERROR;
578 UText* text = textOpenUTF16(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
579 if (U_FAILURE(openStatus)) {
580 WTF_LOG_ERROR("textOpenUTF16 failed with status %d", openStatus);
584 UErrorCode setTextStatus = U_ZERO_ERROR;
585 iterator->setText(text, setTextStatus);
586 if (U_FAILURE(setTextStatus)) {
587 WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
// Returns |iterator| (which must be non-null) to the shared
// LineBreakIteratorPool via put().
596 void releaseLineBreakIterator(TextBreakIterator* iterator)
598 ASSERT_ARG(iterator, iterator);
600 LineBreakIteratorPool::sharedPool().put(iterator);
// Single cached character-break iterator slot. It is claimed and refilled
// through compareAndSwapNonSharedCharacterBreakIterator, which takes a mutex.
603 static TextBreakIterator* nonSharedCharacterBreakIterator;
// Compare-and-swap on the cached iterator slot, performed under a
// function-local static Mutex: the slot is set to |newValue| only when it
// currently equals |expected|. The return statements are not visible in this
// excerpt; callers treat a true result as "the swap happened".
605 static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue)
607 DEFINE_STATIC_LOCAL(Mutex, nonSharedCharacterBreakIteratorMutex, ());
608 MutexLocker locker(nonSharedCharacterBreakIteratorMutex);
609 if (nonSharedCharacterBreakIterator != expected)
611 nonSharedCharacterBreakIterator = newValue;
// Builds a character-break iterator over |string| and records whether the
// string is 8-bit. The visible lines show two set-ups: one stores the
// characters8() pointer and the length, the other creates an ICU iterator
// over characters16(). Presumably the first is the 8-bit path and skips ICU;
// the branch lines and member initialisers are not visible in this excerpt.
615 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const String& string)
622 if (string.isEmpty())
625 m_is8Bit = string.is8Bit();
628 m_charaters8 = string.characters8();
630 m_length = string.length();
634 createIteratorForBuffer(string.characters16(), string.length());
// Builds a character-break iterator over a raw UTF-16 buffer by delegating to
// createIteratorForBuffer. Member initialisers are not visible in this
// excerpt.
637 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const UChar* buffer, unsigned length)
644 createIteratorForBuffer(buffer, length);
// Obtains an ICU character-break iterator and attaches |buffer| to it.
// First tries to claim the cached global iterator by swapping the slot to 0;
// if the slot was empty or the swap did not succeed, a new character
// instance is created for currentTextBreakLocaleID().
// NOTE(review): despite its name, |createdIterator| is true when the cached
// iterator was successfully claimed, i.e. when nothing new was created.
647 void NonSharedCharacterBreakIterator::createIteratorForBuffer(const UChar* buffer, unsigned length)
649 m_iterator = nonSharedCharacterBreakIterator;
650 bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0);
651 if (!createdIterator) {
652 UErrorCode errorCode = U_ZERO_ERROR;
653 m_iterator = icu::BreakIterator::createCharacterInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
654 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
657 setText16(m_iterator, buffer, length);
// Tries to park m_iterator in the cached global slot, which succeeds only
// when the slot is currently empty (expected value 0). What happens when the
// swap fails is on a line not visible in this excerpt.
660 NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
664 if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator))
// Advances to the next character boundary. One visible path forwards to the
// ICU iterator; the other works on m_offset directly, yielding TextBreakDone
// once m_offset has reached m_length and otherwise advancing by
// clusterLengthStartingAt(m_offset). The condition selecting between the two
// paths is not visible in this excerpt.
668 int NonSharedCharacterBreakIterator::next()
671 return m_iterator->next();
673 if (m_offset >= m_length)
674 return TextBreakDone;
676 m_offset += clusterLengthStartingAt(m_offset);
// Current boundary position. The visible path forwards to the ICU iterator;
// any other path is not visible in this excerpt.
680 int NonSharedCharacterBreakIterator::current()
683 return m_iterator->current();
// Whether |offset| is a character boundary. One visible path asks the ICU
// iterator (isBoundary); the other treats every offset as a boundary except
// where isLFAfterCR(offset) holds. The condition selecting between the two
// paths is not visible in this excerpt.
687 bool NonSharedCharacterBreakIterator::isBreak(int offset) const
690 return m_iterator->isBoundary(offset);
691 return !isLFAfterCR(offset);
// Boundary preceding |offset|. One visible path forwards to the ICU iterator;
// the other can yield TextBreakDone and special-cases isLFAfterCR(offset).
// The conditions and the remaining results are on lines not visible in this
// excerpt.
694 int NonSharedCharacterBreakIterator::preceding(int offset) const
697 return m_iterator->preceding(offset);
699 return TextBreakDone;
700 if (isLFAfterCR(offset))
// Boundary following |offset|. One visible path forwards to the ICU iterator;
// the other yields TextBreakDone when |offset| (compared as unsigned) is at or
// past m_length, and otherwise offset + clusterLengthStartingAt(offset). The
// condition selecting between the two paths is not visible in this excerpt.
705 int NonSharedCharacterBreakIterator::following(int offset) const
708 return m_iterator->following(offset);
709 if (static_cast<unsigned>(offset) >= m_length)
710 return TextBreakDone;
711 return offset + clusterLengthStartingAt(offset);
// Sentence-break iterator over a UTF-16 string. The iterator is held in a
// function-local static and created through
// icu::BreakIterator::createSentenceInstance for currentTextBreakLocaleID();
// the text is attached with setText16.
714 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
716 UErrorCode openStatus = U_ZERO_ERROR;
717 static TextBreakIterator* iterator = 0;
719 iterator = icu::BreakIterator::createSentenceInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
720 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
725 setText16(iterator, string, length);
// Returns true when the rule status of |iterator| is anything other than
// UBRK_WORD_NONE. |iterator| is static_cast to icu::RuleBasedBreakIterator,
// so the caller must pass an iterator of that dynamic type.
729 bool isWordTextBreak(TextBreakIterator* iterator)
731 icu::RuleBasedBreakIterator* ruleBasedBreakIterator = static_cast<icu::RuleBasedBreakIterator*>(iterator);
732 int ruleStatus = ruleBasedBreakIterator->getRuleStatus();
733 return ruleStatus != UBRK_WORD_NONE;
// Builds an icu::RuleBasedBreakIterator from |breakRules| and attaches
// |string| to it with setText16. The rule text is converted through a WTF
// String appended into |rules| before being handed to ICU.
// NOTE(review): the iterator lives in a function-local static; presumably it
// is created only once, so only the first |breakRules| ever passed would take
// effect - the guarding line is not visible in this excerpt, confirm.
736 static TextBreakIterator* setUpIteratorWithRules(const char* breakRules, const UChar* string, int length)
741 static TextBreakIterator* iterator = 0;
743 UParseError parseStatus;
744 UErrorCode openStatus = U_ZERO_ERROR;
746 String(breakRules).appendTo(rules);
748 iterator = new icu::RuleBasedBreakIterator(icu::UnicodeString(rules.data(), rules.size()), parseStatus, openStatus);
749 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
754 setText16(iterator, string, length);
// Break iterator used for cursor movement over a UTF-16 string. It passes the
// custom rule set below (kRules) to setUpIteratorWithRules.
758 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
760 // This rule set is based on character-break iterator rules of ICU 4.0
761 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
762 // The major differences from the original ones are listed below:
763 // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
764 // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
765 // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
766 // * Added rules that prevent a cursor from moving before Japanese half-width katakana voiced marks.
767 // * Added rules for regional indicator symbols.
768 static const char* const kRules =
769 "$CR = [\\p{Grapheme_Cluster_Break = CR}];"
770 "$LF = [\\p{Grapheme_Cluster_Break = LF}];"
771 "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
772 "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks
773 "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
774 "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
775 "$L = [\\p{Grapheme_Cluster_Break = L}];"
776 "$V = [\\p{Grapheme_Cluster_Break = V}];"
777 "$T = [\\p{Grapheme_Cluster_Break = T}];"
778 "$LV = [\\p{Grapheme_Cluster_Break = LV}];"
779 "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];"
780 "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha
781 "$HinV = \\u094D;" // Devanagari Sign Virama
782 "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha
783 "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha
784 "$BenV = \\u09CD;" // Bengali Sign Virama
785 "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha
786 "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha
787 "$PanV = \\u0A4D;" // Gurmukhi Sign Virama
788 "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha
789 "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha
790 "$GujV = \\u0ACD;" // Gujarati Sign Virama
791 "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha
792 "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha
793 "$OriV = \\u0B4D;" // Oriya Sign Virama
794 "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha
795 "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha
796 "$TelV = \\u0C4D;" // Telugu Sign Virama
797 "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha. NOTE(review): range starts at U+0C14 while the sibling scripts start at xx15/xx95 - confirm this is intended.
798 "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha
799 "$KanV = \\u0CCD;" // Kannada Sign Virama
800 "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter Ka,...,Ha
801 "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha
802 "$MalV = \\u0D4D;" // Malayalam Sign Virama
803 "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter Ka,...,Ha
804 "$RI = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
808 "$L ($L | $V | $LV | $LVT);"
809 "($LV | $V) ($V | $T);"
811 "[^$Control $CR $LF] $Extend;"
812 "[^$Control $CR $LF] $SpacingMark;"
815 "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward)
816 "$Ben0 $BenV $Ben1;" // Bengali Virama (forward)
817 "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward)
818 "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward)
819 "$Ori0 $OriV $Ori1;" // Oriya Virama (forward)
820 "$Tel0 $TelV $Tel1;" // Telugu Virama (forward)
821 "$Kan0 $KanV $Kan1;" // Kannada Virama (forward)
822 "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward)
825 "($L | $V | $LV | $LVT) $L;"
826 "($V | $T) ($LV | $V);"
828 "$Extend [^$Control $CR $LF];"
829 "$SpacingMark [^$Control $CR $LF];"
832 "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward)
833 "$Ben1 $BenV $Ben0;" // Bengali Virama (backward)
834 "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward)
835 "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward)
836 "$Ori1 $OriV $Ori0;" // Oriya Virama (backward)
837 "$Tel1 $TelV $Tel0;" // Telugu Virama (backward)
838 "$Kan1 $KanV $Kan0;" // Kannada Virama (backward)
839 "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward)
843 return setUpIteratorWithRules(kRules, string, length);