src/third_party/WebKit/Source/platform/text/TextBreakIteratorICU.cpp

   1 /*
   2  * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
   3  * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Library General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Library General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Library General Public License
  16  * along with this library; see the file COPYING.LIB.  If not, write to
  17  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  18  * Boston, MA 02110-1301, USA.
  19  *
  20  */
  21
  22 #include "config.h"
  23 #include "platform/text/TextBreakIterator.h"
  24
  25 #include "platform/text/TextBreakIteratorInternalICU.h"
  26 #include "wtf/Assertions.h"
  27 #include "wtf/HashMap.h"
  28 #include "wtf/PassOwnPtr.h"
  29 #include "wtf/ThreadSpecific.h"
  30 #include "wtf/ThreadingPrimitives.h"
  31 #include "wtf/text/AtomicString.h"
  32 #include "wtf/text/CString.h"
  33 #include "wtf/text/WTFString.h"
  34 #include <unicode/rbbi.h>
  35 #include <unicode/ubrk.h>
  36
  37 using namespace WTF;
  38 using namespace std;
  39
  40 namespace WebCore {
  41
  42 class LineBreakIteratorPool {
  43     WTF_MAKE_NONCOPYABLE(LineBreakIteratorPool);
  44 public:
  45     static LineBreakIteratorPool& sharedPool()
  46     {
  47         static WTF::ThreadSpecific<LineBreakIteratorPool>* pool = new WTF::ThreadSpecific<LineBreakIteratorPool>;
  48         return **pool;
  49     }
  50
  51     static PassOwnPtr<LineBreakIteratorPool> create() { return adoptPtr(new LineBreakIteratorPool); }
  52
  53     icu::BreakIterator* take(const AtomicString& locale)
  54     {
  55         icu::BreakIterator* iterator = 0;
  56         for (size_t i = 0; i < m_pool.size(); ++i) {
  57             if (m_pool[i].first == locale) {
  58                 iterator = m_pool[i].second;
  59                 m_pool.remove(i);
  60                 break;
  61             }
  62         }
  63
  64         if (!iterator) {
  65             UErrorCode openStatus = U_ZERO_ERROR;
  66             bool localeIsEmpty = locale.isEmpty();
  67             iterator = icu::BreakIterator::createLineInstance(localeIsEmpty ? icu::Locale(currentTextBreakLocaleID()) : icu::Locale(locale.utf8().data()), openStatus);
  68             // locale comes from a web page and it can be invalid, leading ICU
  69             // to fail, in which case we fall back to the default locale.
  70             if (!localeIsEmpty && U_FAILURE(openStatus)) {
  71                 openStatus = U_ZERO_ERROR;
  72                 iterator = icu::BreakIterator::createLineInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
  73             }
  74
  75             if (U_FAILURE(openStatus)) {
  76                 WTF_LOG_ERROR("icu::BreakIterator construction failed with status %d", openStatus);
  77                 return 0;
  78             }
  79         }
  80
  81         ASSERT(!m_vendedIterators.contains(iterator));
  82         m_vendedIterators.set(iterator, locale);
  83         return iterator;
  84     }
  85
  86     void put(icu::BreakIterator* iterator)
  87     {
  88         ASSERT_ARG(iterator, m_vendedIterators.contains(iterator));
  89
  90         if (m_pool.size() == capacity) {
  91             delete(m_pool[0].second);
  92             m_pool.remove(0);
  93         }
  94
  95         m_pool.append(Entry(m_vendedIterators.take(iterator), iterator));
  96     }
  97
  98 private:
  99     LineBreakIteratorPool() { }
 100
 101     static const size_t capacity = 4;
 102
 103     typedef pair<AtomicString, icu::BreakIterator*> Entry;
 104     typedef Vector<Entry, capacity> Pool;
 105     Pool m_pool;
 106     HashMap<icu::BreakIterator*, AtomicString> m_vendedIterators;
 107
 108     friend WTF::ThreadSpecific<LineBreakIteratorPool>::operator LineBreakIteratorPool*();
 109 };
 110
 111 enum TextContext { NoContext, PriorContext, PrimaryContext };
 112
 113 const int textBufferCapacity = 16;
 114
 115 typedef struct {
 116     UText text;
 117     UChar buffer[textBufferCapacity];
 118 } UTextWithBuffer;
 119
 120 static inline int64_t textPinIndex(int64_t& index, int64_t limit)
 121 {
 122     if (index < 0)
 123         index = 0;
 124     else if (index > limit)
 125         index = limit;
 126     return index;
 127 }
 128
 129 static inline int64_t textNativeLength(UText* text)
 130 {
 131     return text->a + text->b;
 132 }
 133
 134 // Relocate pointer from source into destination as required.
 135 static void textFixPointer(const UText* source, UText* destination, const void*& pointer)
 136 {
 137     if (pointer >= source->pExtra && pointer < static_cast<char*>(source->pExtra) + source->extraSize) {
 138         // Pointer references source extra buffer.
 139         pointer = static_cast<char*>(destination->pExtra) + (static_cast<const char*>(pointer) - static_cast<const char*>(source->pExtra));
 140     } else if (pointer >= source && pointer < reinterpret_cast<const char*>(source) + source->sizeOfStruct) {
 141         // Pointer references source text structure, but not source extra buffer.
 142         pointer = reinterpret_cast<char*>(destination) + (static_cast<const char*>(pointer) - reinterpret_cast<const char*>(source));
 143     }
 144 }
 145
 146 static UText* textClone(UText* destination, const UText* source, UBool deep, UErrorCode* status)
 147 {
 148     ASSERT_UNUSED(deep, !deep);
 149     if (U_FAILURE(*status))
 150         return 0;
 151     int32_t extraSize = source->extraSize;
 152     destination = utext_setup(destination, extraSize, status);
 153     if (U_FAILURE(*status))
 154         return destination;
 155     void* extraNew = destination->pExtra;
 156     int32_t flags = destination->flags;
 157     int sizeToCopy = min(source->sizeOfStruct, destination->sizeOfStruct);
 158     memcpy(destination, source, sizeToCopy);
 159     destination->pExtra = extraNew;
 160     destination->flags = flags;
 161     memcpy(destination->pExtra, source->pExtra, extraSize);
 162     textFixPointer(source, destination, destination->context);
 163     textFixPointer(source, destination, destination->p);
 164     textFixPointer(source, destination, destination->q);
 165     ASSERT(!destination->r);
 166     const void * chunkContents = static_cast<const void*>(destination->chunkContents);
 167     textFixPointer(source, destination, chunkContents);
 168     destination->chunkContents = static_cast<const UChar*>(chunkContents);
 169     return destination;
 170 }
 171
 172 static int32_t textExtract(UText*, int64_t, int64_t, UChar*, int32_t, UErrorCode* errorCode)
 173 {
 174     // In the present context, this text provider is used only with ICU functions
 175     // that do not perform an extract operation.
 176     ASSERT_NOT_REACHED();
 177     *errorCode = U_UNSUPPORTED_ERROR;
 178     return 0;
 179 }
 180
 181 static void textClose(UText* text)
 182 {
 183     text->context = 0;
 184 }
 185
 186 static inline TextContext textGetContext(const UText* text, int64_t nativeIndex, UBool forward)
 187 {
 188     if (!text->b || nativeIndex > text->b)
 189         return PrimaryContext;
 190     if (nativeIndex == text->b)
 191         return forward ? PrimaryContext : PriorContext;
 192     return PriorContext;
 193 }
 194
 195 static inline TextContext textLatin1GetCurrentContext(const UText* text)
 196 {
 197     if (!text->chunkContents)
 198         return NoContext;
 199     return text->chunkContents == text->pExtra ? PrimaryContext : PriorContext;
 200 }
 201
 202 static void textLatin1MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
 203 {
 204     ASSERT(text->chunkContents == text->pExtra);
 205     if (forward) {
 206         ASSERT(nativeIndex >= text->b && nativeIndex < nativeLength);
 207         text->chunkNativeStart = nativeIndex;
 208         text->chunkNativeLimit = nativeIndex + text->extraSize / sizeof(UChar);
 209         if (text->chunkNativeLimit > nativeLength)
 210             text->chunkNativeLimit = nativeLength;
 211     } else {
 212         ASSERT(nativeIndex > text->b && nativeIndex <= nativeLength);
 213         text->chunkNativeLimit = nativeIndex;
 214         text->chunkNativeStart = nativeIndex - text->extraSize / sizeof(UChar);
 215         if (text->chunkNativeStart < text->b)
 216             text->chunkNativeStart = text->b;
 217     }
 218     int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
 219     // Ensure chunk length is well defined if computed length exceeds int32_t range.
 220     ASSERT(length <= numeric_limits<int32_t>::max());
 221     text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
 222     text->nativeIndexingLimit = text->chunkLength;
 223     text->chunkOffset = forward ? 0 : text->chunkLength;
 224     StringImpl::copyChars(const_cast<UChar*>(text->chunkContents), static_cast<const LChar*>(text->p) + (text->chunkNativeStart - text->b), static_cast<unsigned>(text->chunkLength));
 225 }
 226
 227 static void textLatin1SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
 228 {
 229     ASSERT(!text->chunkContents || text->chunkContents == text->q);
 230     text->chunkContents = static_cast<const UChar*>(text->pExtra);
 231     textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
 232 }
 233
 234 static void textLatin1MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
 235 {
 236     ASSERT(text->chunkContents == text->q);
 237     ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
 238     ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
 239     ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
 240     text->chunkNativeStart = 0;
 241     text->chunkNativeLimit = text->b;
 242     text->chunkLength = text->b;
 243     text->nativeIndexingLimit = text->chunkLength;
 244     int64_t offset = nativeIndex - text->chunkNativeStart;
 245     // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
 246     ASSERT(offset <= numeric_limits<int32_t>::max());
 247     text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
 248 }
 249
 250 static void textLatin1SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
 251 {
 252     ASSERT(!text->chunkContents || text->chunkContents == text->pExtra);
 253     text->chunkContents = static_cast<const UChar*>(text->q);
 254     textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
 255 }
 256
 257 static inline bool textInChunkOrOutOfRange(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward, UBool& isAccessible)
 258 {
 259     if (forward) {
 260         if (nativeIndex >= text->chunkNativeStart && nativeIndex < text->chunkNativeLimit) {
 261             int64_t offset = nativeIndex - text->chunkNativeStart;
 262             // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
 263             ASSERT(offset <= numeric_limits<int32_t>::max());
 264             text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
 265             isAccessible = TRUE;
 266             return true;
 267         }
 268         if (nativeIndex >= nativeLength && text->chunkNativeLimit == nativeLength) {
 269             text->chunkOffset = text->chunkLength;
 270             isAccessible = FALSE;
 271             return true;
 272         }
 273     } else {
 274         if (nativeIndex > text->chunkNativeStart && nativeIndex <= text->chunkNativeLimit) {
 275             int64_t offset = nativeIndex - text->chunkNativeStart;
 276             // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
 277             ASSERT(offset <= numeric_limits<int32_t>::max());
 278             text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
 279             isAccessible = TRUE;
 280             return true;
 281         }
 282         if (nativeIndex <= 0 && !text->chunkNativeStart) {
 283             text->chunkOffset = 0;
 284             isAccessible = FALSE;
 285             return true;
 286         }
 287     }
 288     return false;
 289 }
 290
 291 static UBool textLatin1Access(UText* text, int64_t nativeIndex, UBool forward)
 292 {
 293     if (!text->context)
 294         return FALSE;
 295     int64_t nativeLength = textNativeLength(text);
 296     UBool isAccessible;
 297     if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
 298         return isAccessible;
 299     nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
 300     TextContext currentContext = textLatin1GetCurrentContext(text);
 301     TextContext newContext = textGetContext(text, nativeIndex, forward);
 302     ASSERT(newContext != NoContext);
 303     if (newContext == currentContext) {
 304         if (currentContext == PrimaryContext) {
 305             textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
 306         } else {
 307             textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
 308         }
 309     } else if (newContext == PrimaryContext) {
 310         textLatin1SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
 311     } else {
 312         ASSERT(newContext == PriorContext);
 313         textLatin1SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
 314     }
 315     return TRUE;
 316 }
 317
 318 static const struct UTextFuncs textLatin1Funcs = {
 319     sizeof(UTextFuncs),
 320     0, 0, 0,
 321     textClone,
 322     textNativeLength,
 323     textLatin1Access,
 324     textExtract,
 325     0, 0, 0, 0,
 326     textClose,
 327     0, 0, 0,
 328 };
 329
 330 static void textInit(UText* text, const UTextFuncs* funcs, const void* string, unsigned length, const UChar* priorContext, int priorContextLength)
 331 {
 332     text->pFuncs = funcs;
 333     text->providerProperties = 1 << UTEXT_PROVIDER_STABLE_CHUNKS;
 334     text->context = string;
 335     text->p = string;
 336     text->a = length;
 337     text->q = priorContext;
 338     text->b = priorContextLength;
 339 }
 340
 341 static UText* textOpenLatin1(UTextWithBuffer* utWithBuffer, const LChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
 342 {
 343     if (U_FAILURE(*status))
 344         return 0;
 345
 346     if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) {
 347         *status = U_ILLEGAL_ARGUMENT_ERROR;
 348         return 0;
 349     }
 350     UText* text = utext_setup(&utWithBuffer->text, sizeof(utWithBuffer->buffer), status);
 351     if (U_FAILURE(*status)) {
 352         ASSERT(!text);
 353         return 0;
 354     }
 355     textInit(text, &textLatin1Funcs, string, length, priorContext, priorContextLength);
 356     return text;
 357 }
 358
 359 static inline TextContext textUTF16GetCurrentContext(const UText* text)
 360 {
 361     if (!text->chunkContents)
 362         return NoContext;
 363     return text->chunkContents == text->p ? PrimaryContext : PriorContext;
 364 }
 365
 366 static void textUTF16MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
 367 {
 368     ASSERT(text->chunkContents == text->p);
 369     ASSERT_UNUSED(forward, forward ? nativeIndex >= text->b : nativeIndex > text->b);
 370     ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
 371     text->chunkNativeStart = text->b;
 372     text->chunkNativeLimit = nativeLength;
 373     int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
 374     // Ensure chunk length is well defined if computed length exceeds int32_t range.
 375     ASSERT(length <= numeric_limits<int32_t>::max());
 376     text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
 377     text->nativeIndexingLimit = text->chunkLength;
 378     int64_t offset = nativeIndex - text->chunkNativeStart;
 379     // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
 380     ASSERT(offset <= numeric_limits<int32_t>::max());
 381     text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
 382 }
 383
 384 static void textUTF16SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
 385 {
 386     ASSERT(!text->chunkContents || text->chunkContents == text->q);
 387     text->chunkContents = static_cast<const UChar*>(text->p);
 388     textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
 389 }
 390
 391 static void textUTF16MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
 392 {
 393     ASSERT(text->chunkContents == text->q);
 394     ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
 395     ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
 396     ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
 397     text->chunkNativeStart = 0;
 398     text->chunkNativeLimit = text->b;
 399     text->chunkLength = text->b;
 400     text->nativeIndexingLimit = text->chunkLength;
 401     int64_t offset = nativeIndex - text->chunkNativeStart;
 402     // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
 403     ASSERT(offset <= numeric_limits<int32_t>::max());
 404     text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
 405 }
 406
 407 static void textUTF16SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
 408 {
 409     ASSERT(!text->chunkContents || text->chunkContents == text->p);
 410     text->chunkContents = static_cast<const UChar*>(text->q);
 411     textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
 412 }
 413
 414 static UBool textUTF16Access(UText* text, int64_t nativeIndex, UBool forward)
 415 {
 416     if (!text->context)
 417         return FALSE;
 418     int64_t nativeLength = textNativeLength(text);
 419     UBool isAccessible;
 420     if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
 421         return isAccessible;
 422     nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
 423     TextContext currentContext = textUTF16GetCurrentContext(text);
 424     TextContext newContext = textGetContext(text, nativeIndex, forward);
 425     ASSERT(newContext != NoContext);
 426     if (newContext == currentContext) {
 427         if (currentContext == PrimaryContext) {
 428             textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
 429         } else {
 430             textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
 431         }
 432     } else if (newContext == PrimaryContext) {
 433         textUTF16SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
 434     } else {
 435         ASSERT(newContext == PriorContext);
 436         textUTF16SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
 437     }
 438     return TRUE;
 439 }
 440
 441 static const struct UTextFuncs textUTF16Funcs = {
 442     sizeof(UTextFuncs),
 443     0, 0, 0,
 444     textClone,
 445     textNativeLength,
 446     textUTF16Access,
 447     textExtract,
 448     0, 0, 0, 0,
 449     textClose,
 450     0, 0, 0,
 451 };
 452
 453 static UText* textOpenUTF16(UText* text, const UChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
 454 {
 455     if (U_FAILURE(*status))
 456         return 0;
 457
 458     if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) {
 459         *status = U_ILLEGAL_ARGUMENT_ERROR;
 460         return 0;
 461     }
 462
 463     text = utext_setup(text, 0, status);
 464     if (U_FAILURE(*status)) {
 465         ASSERT(!text);
 466         return 0;
 467     }
 468     textInit(text, &textUTF16Funcs, string, length, priorContext, priorContextLength);
 469     return text;
 470 }
 471
 472 static UText emptyText = UTEXT_INITIALIZER;
 473
 474 static TextBreakIterator* wordBreakIterator(const LChar* string, int length)
 475 {
 476     UErrorCode errorCode = U_ZERO_ERROR;
 477     static TextBreakIterator* breakIter = 0;
 478     if (!breakIter) {
 479         breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
 480         ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
 481         if (!breakIter)
 482             return 0;
 483     }
 484
 485     UTextWithBuffer textLocal;
 486     textLocal.text = emptyText;
 487     textLocal.text.extraSize = sizeof(textLocal.buffer);
 488     textLocal.text.pExtra = textLocal.buffer;
 489
 490     UErrorCode openStatus = U_ZERO_ERROR;
 491     UText* text = textOpenLatin1(&textLocal, string, length, 0, 0, &openStatus);
 492     if (U_FAILURE(openStatus)) {
 493         WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
 494         return 0;
 495     }
 496
 497     UErrorCode setTextStatus = U_ZERO_ERROR;
 498     breakIter->setText(text, setTextStatus);
 499     if (U_FAILURE(setTextStatus))
 500         WTF_LOG_ERROR("BreakIterator::seText failed with status %d", setTextStatus);
 501
 502     utext_close(text);
 503
 504     return breakIter;
 505 }
 506
 507 static void setText16(TextBreakIterator* iter, const UChar* string, int length)
 508 {
 509     UErrorCode errorCode = U_ZERO_ERROR;
 510     UText uText = UTEXT_INITIALIZER;
 511     utext_openUChars(&uText, string, length, &errorCode);
 512     if (U_FAILURE(errorCode))
 513         return;
 514     iter->setText(&uText, errorCode);
 515 }
 516
 517 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
 518 {
 519     UErrorCode errorCode = U_ZERO_ERROR;
 520     static TextBreakIterator* breakIter = 0;
 521     if (!breakIter) {
 522         breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
 523         ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
 524         if (!breakIter)
 525             return 0;
 526     }
 527     setText16(breakIter, string, length);
 528     return breakIter;
 529 }
 530
 531 TextBreakIterator* wordBreakIterator(const String& string, int start, int length)
 532 {
 533     if (string.isEmpty())
 534         return 0;
 535     if (string.is8Bit())
 536         return wordBreakIterator(string.characters8() + start, length);
 537     return wordBreakIterator(string.characters16() + start, length);
 538 }
 539
 540 TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
 541 {
 542     TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
 543     if (!iterator)
 544         return 0;
 545
 546     UTextWithBuffer textLocal;
 547     textLocal.text = emptyText;
 548     textLocal.text.extraSize = sizeof(textLocal.buffer);
 549     textLocal.text.pExtra = textLocal.buffer;
 550
 551     UErrorCode openStatus = U_ZERO_ERROR;
 552     UText* text = textOpenLatin1(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
 553     if (U_FAILURE(openStatus)) {
 554         WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
 555         return 0;
 556     }
 557
 558     UErrorCode setTextStatus = U_ZERO_ERROR;
 559     iterator->setText(text, setTextStatus);
 560     if (U_FAILURE(setTextStatus)) {
 561         WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
 562         return 0;
 563     }
 564
 565     utext_close(text);
 566
 567     return iterator;
 568 }
 569
 570 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
 571 {
 572     TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
 573     if (!iterator)
 574         return 0;
 575
 576     UText textLocal = UTEXT_INITIALIZER;
 577
 578     UErrorCode openStatus = U_ZERO_ERROR;
 579     UText* text = textOpenUTF16(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
 580     if (U_FAILURE(openStatus)) {
 581         WTF_LOG_ERROR("textOpenUTF16 failed with status %d", openStatus);
 582         return 0;
 583     }
 584
 585     UErrorCode setTextStatus = U_ZERO_ERROR;
 586     iterator->setText(text, setTextStatus);
 587     if (U_FAILURE(setTextStatus)) {
 588         WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
 589         return 0;
 590     }
 591
 592     utext_close(text);
 593
 594     return iterator;
 595 }
 596
 597 void releaseLineBreakIterator(TextBreakIterator* iterator)
 598 {
 599     ASSERT_ARG(iterator, iterator);
 600
 601     LineBreakIteratorPool::sharedPool().put(iterator);
 602 }
 603
 604 static TextBreakIterator* nonSharedCharacterBreakIterator;
 605
 606 static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue)
 607 {
 608     DEFINE_STATIC_LOCAL(Mutex, nonSharedCharacterBreakIteratorMutex, ());
 609     MutexLocker locker(nonSharedCharacterBreakIteratorMutex);
 610     if (nonSharedCharacterBreakIterator != expected)
 611         return false;
 612     nonSharedCharacterBreakIterator = newValue;
 613     return true;
 614 }
 615
 616 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const String& string)
 617     : m_is8Bit(true)
 618     , m_charaters8(0)
 619     , m_offset(0)
 620     , m_length(0)
 621     , m_iterator(0)
 622 {
 623     if (string.isEmpty())
 624         return;
 625
 626     m_is8Bit = string.is8Bit();
 627
 628     if (m_is8Bit) {
 629         m_charaters8 = string.characters8();
 630         m_offset = 0;
 631         m_length = string.length();
 632         return;
 633     }
 634
 635     createIteratorForBuffer(string.characters16(), string.length());
 636 }
 637
 638 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const UChar* buffer, unsigned length)
 639     : m_is8Bit(false)
 640     , m_charaters8(0)
 641     , m_offset(0)
 642     , m_length(0)
 643     , m_iterator(0)
 644 {
 645     createIteratorForBuffer(buffer, length);
 646 }
 647
 648 void NonSharedCharacterBreakIterator::createIteratorForBuffer(const UChar* buffer, unsigned length)
 649 {
 650     m_iterator = nonSharedCharacterBreakIterator;
 651     bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0);
 652     if (!createdIterator) {
 653         UErrorCode errorCode = U_ZERO_ERROR;
 654         m_iterator = icu::BreakIterator::createCharacterInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
 655         ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
 656     }
 657
 658     setText16(m_iterator, buffer, length);
 659 }
 660
 661 NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
 662 {
 663     if (m_is8Bit)
 664         return;
 665     if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator))
 666         delete m_iterator;
 667 }
 668
 669 int NonSharedCharacterBreakIterator::next()
 670 {
 671     if (!m_is8Bit)
 672         return m_iterator->next();
 673
 674     if (m_offset >= m_length)
 675         return TextBreakDone;
 676
 677     m_offset += clusterLengthStartingAt(m_offset);
 678     return m_offset;
 679 }
 680
 681 int NonSharedCharacterBreakIterator::current()
 682 {
 683     if (!m_is8Bit)
 684         return m_iterator->current();
 685     return m_offset;
 686 }
 687
 688 bool NonSharedCharacterBreakIterator::isBreak(int offset) const
 689 {
 690     if (!m_is8Bit)
 691         return m_iterator->isBoundary(offset);
 692     return !isLFAfterCR(offset);
 693 }
 694
 695 int NonSharedCharacterBreakIterator::preceding(int offset) const
 696 {
 697     if (!m_is8Bit)
 698         return m_iterator->preceding(offset);
 699     if (offset <= 0)
 700         return TextBreakDone;
 701     if (isLFAfterCR(offset))
 702         return offset - 2;
 703     return offset - 1;
 704 }
 705
 706 int NonSharedCharacterBreakIterator::following(int offset) const
 707 {
 708     if (!m_is8Bit)
 709         return m_iterator->following(offset);
 710     if (static_cast<unsigned>(offset) >= m_length)
 711         return TextBreakDone;
 712     return offset + clusterLengthStartingAt(offset);
 713 }
 714
 715 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
 716 {
 717     UErrorCode openStatus = U_ZERO_ERROR;
 718     static TextBreakIterator* iterator = 0;
 719     if (!iterator) {
 720         iterator =  icu::BreakIterator::createSentenceInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
 721         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
 722         if (!iterator)
 723             return 0;
 724     }
 725
 726     setText16(iterator, string, length);
 727     return iterator;
 728 }
 729
 730 bool isWordTextBreak(TextBreakIterator* iterator)
 731 {
 732     icu::RuleBasedBreakIterator* ruleBasedBreakIterator = static_cast<icu::RuleBasedBreakIterator*>(iterator);
 733     int ruleStatus = ruleBasedBreakIterator->getRuleStatus();
 734     return ruleStatus != UBRK_WORD_NONE;
 735 }
 736
 737 static TextBreakIterator* setUpIteratorWithRules(const char* breakRules, const UChar* string, int length)
 738 {
 739     if (!string)
 740         return 0;
 741
 742     static TextBreakIterator* iterator = 0;
 743     if (!iterator) {
 744         UParseError parseStatus;
 745         UErrorCode openStatus = U_ZERO_ERROR;
 746         Vector<UChar> rules;
 747         String(breakRules).appendTo(rules);
 748
 749         iterator = new icu::RuleBasedBreakIterator(icu::UnicodeString(rules.data(), rules.size()), parseStatus, openStatus);
 750         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
 751         if (!iterator)
 752             return 0;
 753     }
 754
 755     setText16(iterator, string, length);
 756     return iterator;
 757 }
 758
 759 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
 760 {
 761     // This rule set is based on character-break iterator rules of ICU 4.0
 762     // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
 763     // The major differences from the original ones are listed below:
 764     // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
 765     // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
 766     // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
 767     // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
 768     // * Added rules for regional indicator symbols.
 769     static const char* const kRules =
 770         "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
 771         "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
 772         "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
 773         "$VoiceMarks = [\\uFF9E\\uFF9F];"  // Japanese half-width katakana voiced marks
 774         "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
 775         "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
 776         "$L       = [\\p{Grapheme_Cluster_Break = L}];"
 777         "$V       = [\\p{Grapheme_Cluster_Break = V}];"
 778         "$T       = [\\p{Grapheme_Cluster_Break = T}];"
 779         "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
 780         "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
 781         "$Hin0    = [\\u0905-\\u0939];"    // Devanagari Letter A,...,Ha
 782         "$HinV    = \\u094D;"              // Devanagari Sign Virama
 783         "$Hin1    = [\\u0915-\\u0939];"    // Devanagari Letter Ka,...,Ha
 784         "$Ben0    = [\\u0985-\\u09B9];"    // Bengali Letter A,...,Ha
 785         "$BenV    = \\u09CD;"              // Bengali Sign Virama
 786         "$Ben1    = [\\u0995-\\u09B9];"    // Bengali Letter Ka,...,Ha
 787         "$Pan0    = [\\u0A05-\\u0A39];"    // Gurmukhi Letter A,...,Ha
 788         "$PanV    = \\u0A4D;"              // Gurmukhi Sign Virama
 789         "$Pan1    = [\\u0A15-\\u0A39];"    // Gurmukhi Letter Ka,...,Ha
 790         "$Guj0    = [\\u0A85-\\u0AB9];"    // Gujarati Letter A,...,Ha
 791         "$GujV    = \\u0ACD;"              // Gujarati Sign Virama
 792         "$Guj1    = [\\u0A95-\\u0AB9];"    // Gujarati Letter Ka,...,Ha
 793         "$Ori0    = [\\u0B05-\\u0B39];"    // Oriya Letter A,...,Ha
 794         "$OriV    = \\u0B4D;"              // Oriya Sign Virama
 795         "$Ori1    = [\\u0B15-\\u0B39];"    // Oriya Letter Ka,...,Ha
 796         "$Tel0    = [\\u0C05-\\u0C39];"    // Telugu Letter A,...,Ha
 797         "$TelV    = \\u0C4D;"              // Telugu Sign Virama
 798         "$Tel1    = [\\u0C14-\\u0C39];"    // Telugu Letter Ka,...,Ha
 799         "$Kan0    = [\\u0C85-\\u0CB9];"    // Kannada Letter A,...,Ha
 800         "$KanV    = \\u0CCD;"              // Kannada Sign Virama
 801         "$Kan1    = [\\u0C95-\\u0CB9];"    // Kannada Letter A,...,Ha
 802         "$Mal0    = [\\u0D05-\\u0D39];"    // Malayalam Letter A,...,Ha
 803         "$MalV    = \\u0D4D;"              // Malayalam Sign Virama
 804         "$Mal1    = [\\u0D15-\\u0D39];"    // Malayalam Letter A,...,Ha
 805         "$RI      = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
 806         "!!chain;"
 807         "!!forward;"
 808         "$CR $LF;"
 809         "$L ($L | $V | $LV | $LVT);"
 810         "($LV | $V) ($V | $T);"
 811         "($LVT | $T) $T;"
 812         "[^$Control $CR $LF] $Extend;"
 813         "[^$Control $CR $LF] $SpacingMark;"
 814         "$RI $RI / $RI;"
 815         "$RI $RI;"
 816         "$Hin0 $HinV $Hin1;"               // Devanagari Virama (forward)
 817         "$Ben0 $BenV $Ben1;"               // Bengali Virama (forward)
 818         "$Pan0 $PanV $Pan1;"               // Gurmukhi Virama (forward)
 819         "$Guj0 $GujV $Guj1;"               // Gujarati Virama (forward)
 820         "$Ori0 $OriV $Ori1;"               // Oriya Virama (forward)
 821         "$Tel0 $TelV $Tel1;"               // Telugu Virama (forward)
 822         "$Kan0 $KanV $Kan1;"               // Kannada Virama (forward)
 823         "$Mal0 $MalV $Mal1;"               // Malayalam Virama (forward)
 824         "!!reverse;"
 825         "$LF $CR;"
 826         "($L | $V | $LV | $LVT) $L;"
 827         "($V | $T) ($LV | $V);"
 828         "$T ($LVT | $T);"
 829         "$Extend      [^$Control $CR $LF];"
 830         "$SpacingMark [^$Control $CR $LF];"
 831         "$RI $RI / $RI $RI;"
 832         "$RI $RI;"
 833         "$Hin1 $HinV $Hin0;"               // Devanagari Virama (backward)
 834         "$Ben1 $BenV $Ben0;"               // Bengali Virama (backward)
 835         "$Pan1 $PanV $Pan0;"               // Gurmukhi Virama (backward)
 836         "$Guj1 $GujV $Guj0;"               // Gujarati Virama (backward)
 837         "$Ori1 $OriV $Ori0;"               // Gujarati Virama (backward)
 838         "$Tel1 $TelV $Tel0;"               // Telugu Virama (backward)
 839         "$Kan1 $KanV $Kan0;"               // Kannada Virama (backward)
 840         "$Mal1 $MalV $Mal0;"               // Malayalam Virama (backward)
 841         "!!safe_reverse;"
 842         "!!safe_forward;";
 843
 844     return setUpIteratorWithRules(kRules, string, length);
 845 }
 846
 847 }