src/third_party/WebKit/Source/platform/text/TextBreakIteratorICU.cpp

   1 /*
   2  * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
   3  * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Library General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Library General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Library General Public License
  16  * along with this library; see the file COPYING.LIB.  If not, write to
  17  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  18  * Boston, MA 02110-1301, USA.
  19  *
  20  */
  21
  22 #include "config.h"
  23 #include "platform/text/TextBreakIterator.h"
  24
  25 #include "platform/text/TextBreakIteratorInternalICU.h"
  26 #include "wtf/Assertions.h"
  27 #include "wtf/HashMap.h"
  28 #include "wtf/PassOwnPtr.h"
  29 #include "wtf/ThreadSpecific.h"
  30 #include "wtf/ThreadingPrimitives.h"
  31 #include "wtf/text/AtomicString.h"
  32 #include "wtf/text/CString.h"
  33 #include "wtf/text/WTFString.h"
  34 #include <unicode/rbbi.h>
  35 #include <unicode/ubrk.h>
  36
  37 using namespace WTF;
  38
  39 namespace blink {
  40
  41 class LineBreakIteratorPool {
  42     WTF_MAKE_NONCOPYABLE(LineBreakIteratorPool);
  43 public:
  44     static LineBreakIteratorPool& sharedPool()
  45     {
  46         static WTF::ThreadSpecific<LineBreakIteratorPool>* pool = new WTF::ThreadSpecific<LineBreakIteratorPool>;
  47         return **pool;
  48     }
  49
  50     static PassOwnPtr<LineBreakIteratorPool> create() { return adoptPtr(new LineBreakIteratorPool); }
  51
  52     icu::BreakIterator* take(const AtomicString& locale)
  53     {
  54         icu::BreakIterator* iterator = 0;
  55         for (size_t i = 0; i < m_pool.size(); ++i) {
  56             if (m_pool[i].first == locale) {
  57                 iterator = m_pool[i].second;
  58                 m_pool.remove(i);
  59                 break;
  60             }
  61         }
  62
  63         if (!iterator) {
  64             UErrorCode openStatus = U_ZERO_ERROR;
  65             bool localeIsEmpty = locale.isEmpty();
  66             iterator = icu::BreakIterator::createLineInstance(localeIsEmpty ? icu::Locale(currentTextBreakLocaleID()) : icu::Locale(locale.utf8().data()), openStatus);
  67             // locale comes from a web page and it can be invalid, leading ICU
  68             // to fail, in which case we fall back to the default locale.
  69             if (!localeIsEmpty && U_FAILURE(openStatus)) {
  70                 openStatus = U_ZERO_ERROR;
  71                 iterator = icu::BreakIterator::createLineInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
  72             }
  73
  74             if (U_FAILURE(openStatus)) {
  75                 WTF_LOG_ERROR("icu::BreakIterator construction failed with status %d", openStatus);
  76                 return 0;
  77             }
  78         }
  79
  80         ASSERT(!m_vendedIterators.contains(iterator));
  81         m_vendedIterators.set(iterator, locale);
  82         return iterator;
  83     }
  84
  85     void put(icu::BreakIterator* iterator)
  86     {
  87         ASSERT_ARG(iterator, m_vendedIterators.contains(iterator));
  88
  89         if (m_pool.size() == capacity) {
  90             delete(m_pool[0].second);
  91             m_pool.remove(0);
  92         }
  93
  94         m_pool.append(Entry(m_vendedIterators.take(iterator), iterator));
  95     }
  96
  97 private:
  98     LineBreakIteratorPool() { }
  99
 100     static const size_t capacity = 4;
 101
 102     typedef pair<AtomicString, icu::BreakIterator*> Entry;
 103     typedef Vector<Entry, capacity> Pool;
 104     Pool m_pool;
 105     HashMap<icu::BreakIterator*, AtomicString> m_vendedIterators;
 106
 107     friend WTF::ThreadSpecific<LineBreakIteratorPool>::operator LineBreakIteratorPool*();
 108 };
 109
// Identifies which of the two text runs a UText chunk currently refers to:
// the prior context, the primary string, or (NoContext) no chunk at all.
enum TextContext { NoContext, PriorContext, PrimaryContext };

// Number of UChars in the buffer embedded in UTextWithBuffer.
const int textBufferCapacity = 16;

// A UText together with a small UChar buffer. Callers in this file point
// text.pExtra at |buffer| before opening the Latin-1 text provider.
typedef struct {
    UText text;
    UChar buffer[textBufferCapacity];
} UTextWithBuffer;
 118
 119 static inline int64_t textPinIndex(int64_t& index, int64_t limit)
 120 {
 121     if (index < 0)
 122         index = 0;
 123     else if (index > limit)
 124         index = limit;
 125     return index;
 126 }
 127
 128 static inline int64_t textNativeLength(UText* text)
 129 {
 130     return text->a + text->b;
 131 }
 132
 133 // Relocate pointer from source into destination as required.
 134 static void textFixPointer(const UText* source, UText* destination, const void*& pointer)
 135 {
 136     if (pointer >= source->pExtra && pointer < static_cast<char*>(source->pExtra) + source->extraSize) {
 137         // Pointer references source extra buffer.
 138         pointer = static_cast<char*>(destination->pExtra) + (static_cast<const char*>(pointer) - static_cast<const char*>(source->pExtra));
 139     } else if (pointer >= source && pointer < reinterpret_cast<const char*>(source) + source->sizeOfStruct) {
 140         // Pointer references source text structure, but not source extra buffer.
 141         pointer = reinterpret_cast<char*>(destination) + (static_cast<const char*>(pointer) - reinterpret_cast<const char*>(source));
 142     }
 143 }
 144
 145 static UText* textClone(UText* destination, const UText* source, UBool deep, UErrorCode* status)
 146 {
 147     ASSERT_UNUSED(deep, !deep);
 148     if (U_FAILURE(*status))
 149         return 0;
 150     int32_t extraSize = source->extraSize;
 151     destination = utext_setup(destination, extraSize, status);
 152     if (U_FAILURE(*status))
 153         return destination;
 154     void* extraNew = destination->pExtra;
 155     int32_t flags = destination->flags;
 156     int sizeToCopy = std::min(source->sizeOfStruct, destination->sizeOfStruct);
 157     memcpy(destination, source, sizeToCopy);
 158     destination->pExtra = extraNew;
 159     destination->flags = flags;
 160     memcpy(destination->pExtra, source->pExtra, extraSize);
 161     textFixPointer(source, destination, destination->context);
 162     textFixPointer(source, destination, destination->p);
 163     textFixPointer(source, destination, destination->q);
 164     ASSERT(!destination->r);
 165     const void * chunkContents = static_cast<const void*>(destination->chunkContents);
 166     textFixPointer(source, destination, chunkContents);
 167     destination->chunkContents = static_cast<const UChar*>(chunkContents);
 168     return destination;
 169 }
 170
// UText extract callback. Extraction is not supported by these providers: the
// function asserts in debug builds, reports U_UNSUPPORTED_ERROR through
// |errorCode| and returns 0.
static int32_t textExtract(UText*, int64_t, int64_t, UChar*, int32_t, UErrorCode* errorCode)
{
    // In the present context, this text provider is used only with ICU functions
    // that do not perform an extract operation.
    ASSERT_NOT_REACHED();
    *errorCode = U_UNSUPPORTED_ERROR;
    return 0;
}
 179
// UText close callback. Clears |context|; the access callbacks in this file
// return FALSE for a UText whose context is null.
static void textClose(UText* text)
{
    text->context = 0;
}
 184
 185 static inline TextContext textGetContext(const UText* text, int64_t nativeIndex, UBool forward)
 186 {
 187     if (!text->b || nativeIndex > text->b)
 188         return PrimaryContext;
 189     if (nativeIndex == text->b)
 190         return forward ? PrimaryContext : PriorContext;
 191     return PriorContext;
 192 }
 193
 194 static inline TextContext textLatin1GetCurrentContext(const UText* text)
 195 {
 196     if (!text->chunkContents)
 197         return NoContext;
 198     return text->chunkContents == text->pExtra ? PrimaryContext : PriorContext;
 199 }
 200
// Refills the chunk buffer (|pExtra|) of a Latin-1 UText with the part of the
// primary string around |nativeIndex|. Moving forward, the chunk starts at
// |nativeIndex|; moving backward, it ends there. In both directions the chunk
// holds at most extraSize / sizeof(UChar) code units and is clipped to the
// primary string's range [b, nativeLength].
static void textLatin1MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
{
    ASSERT(text->chunkContents == text->pExtra);
    if (forward) {
        ASSERT(nativeIndex >= text->b && nativeIndex < nativeLength);
        text->chunkNativeStart = nativeIndex;
        text->chunkNativeLimit = nativeIndex + text->extraSize / sizeof(UChar);
        if (text->chunkNativeLimit > nativeLength)
            text->chunkNativeLimit = nativeLength;
    } else {
        ASSERT(nativeIndex > text->b && nativeIndex <= nativeLength);
        text->chunkNativeLimit = nativeIndex;
        text->chunkNativeStart = nativeIndex - text->extraSize / sizeof(UChar);
        if (text->chunkNativeStart < text->b)
            text->chunkNativeStart = text->b;
    }
    int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
    // Ensure chunk length is well defined if computed length exceeds int32_t range.
    ASSERT(length <= std::numeric_limits<int32_t>::max());
    text->chunkLength = length <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
    text->nativeIndexingLimit = text->chunkLength;
    // Position at the start of the chunk when moving forward, at its end otherwise.
    text->chunkOffset = forward ? 0 : text->chunkLength;
    // Copy the LChar source characters into the UChar chunk buffer. Native
    // indices include the prior context, so subtract |b| to index into |p|.
    StringImpl::copyChars(const_cast<UChar*>(text->chunkContents), static_cast<const LChar*>(text->p) + (text->chunkNativeStart - text->b), static_cast<unsigned>(text->chunkLength));
}
 225
 226 static void textLatin1SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
 227 {
 228     ASSERT(!text->chunkContents || text->chunkContents == text->q);
 229     text->chunkContents = static_cast<const UChar*>(text->pExtra);
 230     textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
 231 }
 232
 233 static void textLatin1MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
 234 {
 235     ASSERT(text->chunkContents == text->q);
 236     ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
 237     ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
 238     ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
 239     text->chunkNativeStart = 0;
 240     text->chunkNativeLimit = text->b;
 241     text->chunkLength = text->b;
 242     text->nativeIndexingLimit = text->chunkLength;
 243     int64_t offset = nativeIndex - text->chunkNativeStart;
 244     // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
 245     ASSERT(offset <= std::numeric_limits<int32_t>::max());
 246     text->chunkOffset = std::min(offset <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
 247 }
 248
 249 static void textLatin1SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
 250 {
 251     ASSERT(!text->chunkContents || text->chunkContents == text->pExtra);
 252     text->chunkContents = static_cast<const UChar*>(text->q);
 253     textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
 254 }
 255
 256 static inline bool textInChunkOrOutOfRange(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward, UBool& isAccessible)
 257 {
 258     if (forward) {
 259         if (nativeIndex >= text->chunkNativeStart && nativeIndex < text->chunkNativeLimit) {
 260             int64_t offset = nativeIndex - text->chunkNativeStart;
 261             // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
 262             ASSERT(offset <= std::numeric_limits<int32_t>::max());
 263             text->chunkOffset = offset <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
 264             isAccessible = TRUE;
 265             return true;
 266         }
 267         if (nativeIndex >= nativeLength && text->chunkNativeLimit == nativeLength) {
 268             text->chunkOffset = text->chunkLength;
 269             isAccessible = FALSE;
 270             return true;
 271         }
 272     } else {
 273         if (nativeIndex > text->chunkNativeStart && nativeIndex <= text->chunkNativeLimit) {
 274             int64_t offset = nativeIndex - text->chunkNativeStart;
 275             // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
 276             ASSERT(offset <= std::numeric_limits<int32_t>::max());
 277             text->chunkOffset = offset <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
 278             isAccessible = TRUE;
 279             return true;
 280         }
 281         if (nativeIndex <= 0 && !text->chunkNativeStart) {
 282             text->chunkOffset = 0;
 283             isAccessible = FALSE;
 284             return true;
 285         }
 286     }
 287     return false;
 288 }
 289
 290 static UBool textLatin1Access(UText* text, int64_t nativeIndex, UBool forward)
 291 {
 292     if (!text->context)
 293         return FALSE;
 294     int64_t nativeLength = textNativeLength(text);
 295     UBool isAccessible;
 296     if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
 297         return isAccessible;
 298     nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
 299     TextContext currentContext = textLatin1GetCurrentContext(text);
 300     TextContext newContext = textGetContext(text, nativeIndex, forward);
 301     ASSERT(newContext != NoContext);
 302     if (newContext == currentContext) {
 303         if (currentContext == PrimaryContext) {
 304             textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
 305         } else {
 306             textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
 307         }
 308     } else if (newContext == PrimaryContext) {
 309         textLatin1SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
 310     } else {
 311         ASSERT(newContext == PriorContext);
 312         textLatin1SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
 313     }
 314     return TRUE;
 315 }
 316
// Function table for the Latin-1 text provider. Slot meanings follow ICU's
// UTextFuncs layout; entries left as 0 are operations this provider does not
// implement.
static const struct UTextFuncs textLatin1Funcs = {
    sizeof(UTextFuncs), // tableSize
    0, 0, 0, // reserved
    textClone,
    textNativeLength,
    textLatin1Access,
    textExtract,
    0, 0, 0, 0, // replace, copy, mapOffsetToNative, mapNativeIndexToUTF16
    textClose,
    0, 0, 0, // spare
};
 328
 329 static void textInit(UText* text, const UTextFuncs* funcs, const void* string, unsigned length, const UChar* priorContext, int priorContextLength)
 330 {
 331     text->pFuncs = funcs;
 332     text->providerProperties = 1 << UTEXT_PROVIDER_STABLE_CHUNKS;
 333     text->context = string;
 334     text->p = string;
 335     text->a = length;
 336     text->q = priorContext;
 337     text->b = priorContextLength;
 338 }
 339
 340 static UText* textOpenLatin1(UTextWithBuffer* utWithBuffer, const LChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
 341 {
 342     if (U_FAILURE(*status))
 343         return 0;
 344
 345     if (!string || length > static_cast<unsigned>(std::numeric_limits<int32_t>::max())) {
 346         *status = U_ILLEGAL_ARGUMENT_ERROR;
 347         return 0;
 348     }
 349     UText* text = utext_setup(&utWithBuffer->text, sizeof(utWithBuffer->buffer), status);
 350     if (U_FAILURE(*status)) {
 351         ASSERT(!text);
 352         return 0;
 353     }
 354     textInit(text, &textLatin1Funcs, string, length, priorContext, priorContextLength);
 355     return text;
 356 }
 357
 358 static inline TextContext textUTF16GetCurrentContext(const UText* text)
 359 {
 360     if (!text->chunkContents)
 361         return NoContext;
 362     return text->chunkContents == text->p ? PrimaryContext : PriorContext;
 363 }
 364
 365 static void textUTF16MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
 366 {
 367     ASSERT(text->chunkContents == text->p);
 368     ASSERT_UNUSED(forward, forward ? nativeIndex >= text->b : nativeIndex > text->b);
 369     ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
 370     text->chunkNativeStart = text->b;
 371     text->chunkNativeLimit = nativeLength;
 372     int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
 373     // Ensure chunk length is well defined if computed length exceeds int32_t range.
 374     ASSERT(length <= std::numeric_limits<int32_t>::max());
 375     text->chunkLength = length <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
 376     text->nativeIndexingLimit = text->chunkLength;
 377     int64_t offset = nativeIndex - text->chunkNativeStart;
 378     // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
 379     ASSERT(offset <= std::numeric_limits<int32_t>::max());
 380     text->chunkOffset = std::min(offset <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
 381 }
 382
 383 static void textUTF16SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
 384 {
 385     ASSERT(!text->chunkContents || text->chunkContents == text->q);
 386     text->chunkContents = static_cast<const UChar*>(text->p);
 387     textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
 388 }
 389
 390 static void textUTF16MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
 391 {
 392     ASSERT(text->chunkContents == text->q);
 393     ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
 394     ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
 395     ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
 396     text->chunkNativeStart = 0;
 397     text->chunkNativeLimit = text->b;
 398     text->chunkLength = text->b;
 399     text->nativeIndexingLimit = text->chunkLength;
 400     int64_t offset = nativeIndex - text->chunkNativeStart;
 401     // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
 402     ASSERT(offset <= std::numeric_limits<int32_t>::max());
 403     text->chunkOffset = std::min(offset <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
 404 }
 405
 406 static void textUTF16SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
 407 {
 408     ASSERT(!text->chunkContents || text->chunkContents == text->p);
 409     text->chunkContents = static_cast<const UChar*>(text->q);
 410     textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
 411 }
 412
 413 static UBool textUTF16Access(UText* text, int64_t nativeIndex, UBool forward)
 414 {
 415     if (!text->context)
 416         return FALSE;
 417     int64_t nativeLength = textNativeLength(text);
 418     UBool isAccessible;
 419     if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
 420         return isAccessible;
 421     nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
 422     TextContext currentContext = textUTF16GetCurrentContext(text);
 423     TextContext newContext = textGetContext(text, nativeIndex, forward);
 424     ASSERT(newContext != NoContext);
 425     if (newContext == currentContext) {
 426         if (currentContext == PrimaryContext) {
 427             textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
 428         } else {
 429             textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
 430         }
 431     } else if (newContext == PrimaryContext) {
 432         textUTF16SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
 433     } else {
 434         ASSERT(newContext == PriorContext);
 435         textUTF16SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
 436     }
 437     return TRUE;
 438 }
 439
// Function table for the UTF-16 text provider. Slot meanings follow ICU's
// UTextFuncs layout; entries left as 0 are operations this provider does not
// implement.
static const struct UTextFuncs textUTF16Funcs = {
    sizeof(UTextFuncs), // tableSize
    0, 0, 0, // reserved
    textClone,
    textNativeLength,
    textUTF16Access,
    textExtract,
    0, 0, 0, 0, // replace, copy, mapOffsetToNative, mapNativeIndexToUTF16
    textClose,
    0, 0, 0, // spare
};
 451
 452 static UText* textOpenUTF16(UText* text, const UChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
 453 {
 454     if (U_FAILURE(*status))
 455         return 0;
 456
 457     if (!string || length > static_cast<unsigned>(std::numeric_limits<int32_t>::max())) {
 458         *status = U_ILLEGAL_ARGUMENT_ERROR;
 459         return 0;
 460     }
 461
 462     text = utext_setup(text, 0, status);
 463     if (U_FAILURE(*status)) {
 464         ASSERT(!text);
 465         return 0;
 466     }
 467     textInit(text, &textUTF16Funcs, string, length, priorContext, priorContextLength);
 468     return text;
 469 }
 470
// Pristine UText used as the starting value for stack-allocated
// UTextWithBuffer instances in this file.
static UText emptyText = UTEXT_INITIALIZER;
 472
 473 static TextBreakIterator* wordBreakIterator(const LChar* string, int length)
 474 {
 475     UErrorCode errorCode = U_ZERO_ERROR;
 476     static TextBreakIterator* breakIter = 0;
 477     if (!breakIter) {
 478         breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
 479         ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
 480         if (!breakIter)
 481             return 0;
 482     }
 483
 484     UTextWithBuffer textLocal;
 485     textLocal.text = emptyText;
 486     textLocal.text.extraSize = sizeof(textLocal.buffer);
 487     textLocal.text.pExtra = textLocal.buffer;
 488
 489     UErrorCode openStatus = U_ZERO_ERROR;
 490     UText* text = textOpenLatin1(&textLocal, string, length, 0, 0, &openStatus);
 491     if (U_FAILURE(openStatus)) {
 492         WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
 493         return 0;
 494     }
 495
 496     UErrorCode setTextStatus = U_ZERO_ERROR;
 497     breakIter->setText(text, setTextStatus);
 498     if (U_FAILURE(setTextStatus))
 499         WTF_LOG_ERROR("BreakIterator::seText failed with status %d", setTextStatus);
 500
 501     utext_close(text);
 502
 503     return breakIter;
 504 }
 505
 506 static void setText16(TextBreakIterator* iter, const UChar* string, int length)
 507 {
 508     UErrorCode errorCode = U_ZERO_ERROR;
 509     UText uText = UTEXT_INITIALIZER;
 510     utext_openUChars(&uText, string, length, &errorCode);
 511     if (U_FAILURE(errorCode))
 512         return;
 513     iter->setText(&uText, errorCode);
 514 }
 515
 516 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
 517 {
 518     UErrorCode errorCode = U_ZERO_ERROR;
 519     static TextBreakIterator* breakIter = 0;
 520     if (!breakIter) {
 521         breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
 522         ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
 523         if (!breakIter)
 524             return 0;
 525     }
 526     setText16(breakIter, string, length);
 527     return breakIter;
 528 }
 529
 530 TextBreakIterator* wordBreakIterator(const String& string, int start, int length)
 531 {
 532     if (string.isEmpty())
 533         return 0;
 534     if (string.is8Bit())
 535         return wordBreakIterator(string.characters8() + start, length);
 536     return wordBreakIterator(string.characters16() + start, length);
 537 }
 538
 539 TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
 540 {
 541     TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
 542     if (!iterator)
 543         return 0;
 544
 545     UTextWithBuffer textLocal;
 546     textLocal.text = emptyText;
 547     textLocal.text.extraSize = sizeof(textLocal.buffer);
 548     textLocal.text.pExtra = textLocal.buffer;
 549
 550     UErrorCode openStatus = U_ZERO_ERROR;
 551     UText* text = textOpenLatin1(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
 552     if (U_FAILURE(openStatus)) {
 553         WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
 554         return 0;
 555     }
 556
 557     UErrorCode setTextStatus = U_ZERO_ERROR;
 558     iterator->setText(text, setTextStatus);
 559     if (U_FAILURE(setTextStatus)) {
 560         WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
 561         return 0;
 562     }
 563
 564     utext_close(text);
 565
 566     return iterator;
 567 }
 568
 569 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
 570 {
 571     TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
 572     if (!iterator)
 573         return 0;
 574
 575     UText textLocal = UTEXT_INITIALIZER;
 576
 577     UErrorCode openStatus = U_ZERO_ERROR;
 578     UText* text = textOpenUTF16(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
 579     if (U_FAILURE(openStatus)) {
 580         WTF_LOG_ERROR("textOpenUTF16 failed with status %d", openStatus);
 581         return 0;
 582     }
 583
 584     UErrorCode setTextStatus = U_ZERO_ERROR;
 585     iterator->setText(text, setTextStatus);
 586     if (U_FAILURE(setTextStatus)) {
 587         WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
 588         return 0;
 589     }
 590
 591     utext_close(text);
 592
 593     return iterator;
 594 }
 595
 596 void releaseLineBreakIterator(TextBreakIterator* iterator)
 597 {
 598     ASSERT_ARG(iterator, iterator);
 599
 600     LineBreakIteratorPool::sharedPool().put(iterator);
 601 }
 602
 603 static TextBreakIterator* nonSharedCharacterBreakIterator;
 604
 605 static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue)
 606 {
 607     DEFINE_STATIC_LOCAL(Mutex, nonSharedCharacterBreakIteratorMutex, ());
 608     MutexLocker locker(nonSharedCharacterBreakIteratorMutex);
 609     if (nonSharedCharacterBreakIterator != expected)
 610         return false;
 611     nonSharedCharacterBreakIterator = newValue;
 612     return true;
 613 }
 614
 615 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const String& string)
 616     : m_is8Bit(true)
 617     , m_charaters8(0)
 618     , m_offset(0)
 619     , m_length(0)
 620     , m_iterator(0)
 621 {
 622     if (string.isEmpty())
 623         return;
 624
 625     m_is8Bit = string.is8Bit();
 626
 627     if (m_is8Bit) {
 628         m_charaters8 = string.characters8();
 629         m_offset = 0;
 630         m_length = string.length();
 631         return;
 632     }
 633
 634     createIteratorForBuffer(string.characters16(), string.length());
 635 }
 636
// Builds a character break iterator over the UTF-16 |buffer| of |length| code
// units. This form always uses an ICU iterator (m_is8Bit is false).
NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const UChar* buffer, unsigned length)
    : m_is8Bit(false)
    , m_charaters8(0)
    , m_offset(0)
    , m_length(0)
    , m_iterator(0)
{
    createIteratorForBuffer(buffer, length);
}
 646
 647 void NonSharedCharacterBreakIterator::createIteratorForBuffer(const UChar* buffer, unsigned length)
 648 {
 649     m_iterator = nonSharedCharacterBreakIterator;
 650     bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0);
 651     if (!createdIterator) {
 652         UErrorCode errorCode = U_ZERO_ERROR;
 653         m_iterator = icu::BreakIterator::createCharacterInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
 654         ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
 655     }
 656
 657     setText16(m_iterator, buffer, length);
 658 }
 659
 660 NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
 661 {
 662     if (m_is8Bit)
 663         return;
 664     if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator))
 665         delete m_iterator;
 666 }
 667
 668 int NonSharedCharacterBreakIterator::next()
 669 {
 670     if (!m_is8Bit)
 671         return m_iterator->next();
 672
 673     if (m_offset >= m_length)
 674         return TextBreakDone;
 675
 676     m_offset += clusterLengthStartingAt(m_offset);
 677     return m_offset;
 678 }
 679
 680 int NonSharedCharacterBreakIterator::current()
 681 {
 682     if (!m_is8Bit)
 683         return m_iterator->current();
 684     return m_offset;
 685 }
 686
 687 bool NonSharedCharacterBreakIterator::isBreak(int offset) const
 688 {
 689     if (!m_is8Bit)
 690         return m_iterator->isBoundary(offset);
 691     return !isLFAfterCR(offset);
 692 }
 693
 694 int NonSharedCharacterBreakIterator::preceding(int offset) const
 695 {
 696     if (!m_is8Bit)
 697         return m_iterator->preceding(offset);
 698     if (offset <= 0)
 699         return TextBreakDone;
 700     if (isLFAfterCR(offset))
 701         return offset - 2;
 702     return offset - 1;
 703 }
 704
 705 int NonSharedCharacterBreakIterator::following(int offset) const
 706 {
 707     if (!m_is8Bit)
 708         return m_iterator->following(offset);
 709     if (static_cast<unsigned>(offset) >= m_length)
 710         return TextBreakDone;
 711     return offset + clusterLengthStartingAt(offset);
 712 }
 713
 714 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
 715 {
 716     UErrorCode openStatus = U_ZERO_ERROR;
 717     static TextBreakIterator* iterator = 0;
 718     if (!iterator) {
 719         iterator =  icu::BreakIterator::createSentenceInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
 720         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
 721         if (!iterator)
 722             return 0;
 723     }
 724
 725     setText16(iterator, string, length);
 726     return iterator;
 727 }
 728
 729 bool isWordTextBreak(TextBreakIterator* iterator)
 730 {
 731     icu::RuleBasedBreakIterator* ruleBasedBreakIterator = static_cast<icu::RuleBasedBreakIterator*>(iterator);
 732     int ruleStatus = ruleBasedBreakIterator->getRuleStatus();
 733     return ruleStatus != UBRK_WORD_NONE;
 734 }
 735
 736 static TextBreakIterator* setUpIteratorWithRules(const char* breakRules, const UChar* string, int length)
 737 {
 738     if (!string)
 739         return 0;
 740
 741     static TextBreakIterator* iterator = 0;
 742     if (!iterator) {
 743         UParseError parseStatus;
 744         UErrorCode openStatus = U_ZERO_ERROR;
 745         Vector<UChar> rules;
 746         String(breakRules).appendTo(rules);
 747
 748         iterator = new icu::RuleBasedBreakIterator(icu::UnicodeString(rules.data(), rules.size()), parseStatus, openStatus);
 749         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
 750         if (!iterator)
 751             return 0;
 752     }
 753
 754     setText16(iterator, string, length);
 755     return iterator;
 756 }
 757
 758 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
 759 {
 760     // This rule set is based on character-break iterator rules of ICU 4.0
 761     // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
 762     // The major differences from the original ones are listed below:
 763     // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
 764     // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
 765     // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
 766     // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
 767     // * Added rules for regional indicator symbols.
 768     static const char* const kRules =
 769         "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
 770         "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
 771         "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
 772         "$VoiceMarks = [\\uFF9E\\uFF9F];"  // Japanese half-width katakana voiced marks
 773         "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
 774         "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
 775         "$L       = [\\p{Grapheme_Cluster_Break = L}];"
 776         "$V       = [\\p{Grapheme_Cluster_Break = V}];"
 777         "$T       = [\\p{Grapheme_Cluster_Break = T}];"
 778         "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
 779         "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
 780         "$Hin0    = [\\u0905-\\u0939];"    // Devanagari Letter A,...,Ha
 781         "$HinV    = \\u094D;"              // Devanagari Sign Virama
 782         "$Hin1    = [\\u0915-\\u0939];"    // Devanagari Letter Ka,...,Ha
 783         "$Ben0    = [\\u0985-\\u09B9];"    // Bengali Letter A,...,Ha
 784         "$BenV    = \\u09CD;"              // Bengali Sign Virama
 785         "$Ben1    = [\\u0995-\\u09B9];"    // Bengali Letter Ka,...,Ha
 786         "$Pan0    = [\\u0A05-\\u0A39];"    // Gurmukhi Letter A,...,Ha
 787         "$PanV    = \\u0A4D;"              // Gurmukhi Sign Virama
 788         "$Pan1    = [\\u0A15-\\u0A39];"    // Gurmukhi Letter Ka,...,Ha
 789         "$Guj0    = [\\u0A85-\\u0AB9];"    // Gujarati Letter A,...,Ha
 790         "$GujV    = \\u0ACD;"              // Gujarati Sign Virama
 791         "$Guj1    = [\\u0A95-\\u0AB9];"    // Gujarati Letter Ka,...,Ha
 792         "$Ori0    = [\\u0B05-\\u0B39];"    // Oriya Letter A,...,Ha
 793         "$OriV    = \\u0B4D;"              // Oriya Sign Virama
 794         "$Ori1    = [\\u0B15-\\u0B39];"    // Oriya Letter Ka,...,Ha
 795         "$Tel0    = [\\u0C05-\\u0C39];"    // Telugu Letter A,...,Ha
 796         "$TelV    = \\u0C4D;"              // Telugu Sign Virama
 797         "$Tel1    = [\\u0C14-\\u0C39];"    // Telugu Letter Ka,...,Ha
 798         "$Kan0    = [\\u0C85-\\u0CB9];"    // Kannada Letter A,...,Ha
 799         "$KanV    = \\u0CCD;"              // Kannada Sign Virama
 800         "$Kan1    = [\\u0C95-\\u0CB9];"    // Kannada Letter A,...,Ha
 801         "$Mal0    = [\\u0D05-\\u0D39];"    // Malayalam Letter A,...,Ha
 802         "$MalV    = \\u0D4D;"              // Malayalam Sign Virama
 803         "$Mal1    = [\\u0D15-\\u0D39];"    // Malayalam Letter A,...,Ha
 804         "$RI      = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
 805         "!!chain;"
 806         "!!forward;"
 807         "$CR $LF;"
 808         "$L ($L | $V | $LV | $LVT);"
 809         "($LV | $V) ($V | $T);"
 810         "($LVT | $T) $T;"
 811         "[^$Control $CR $LF] $Extend;"
 812         "[^$Control $CR $LF] $SpacingMark;"
 813         "$RI $RI / $RI;"
 814         "$RI $RI;"
 815         "$Hin0 $HinV $Hin1;"               // Devanagari Virama (forward)
 816         "$Ben0 $BenV $Ben1;"               // Bengali Virama (forward)
 817         "$Pan0 $PanV $Pan1;"               // Gurmukhi Virama (forward)
 818         "$Guj0 $GujV $Guj1;"               // Gujarati Virama (forward)
 819         "$Ori0 $OriV $Ori1;"               // Oriya Virama (forward)
 820         "$Tel0 $TelV $Tel1;"               // Telugu Virama (forward)
 821         "$Kan0 $KanV $Kan1;"               // Kannada Virama (forward)
 822         "$Mal0 $MalV $Mal1;"               // Malayalam Virama (forward)
 823         "!!reverse;"
 824         "$LF $CR;"
 825         "($L | $V | $LV | $LVT) $L;"
 826         "($V | $T) ($LV | $V);"
 827         "$T ($LVT | $T);"
 828         "$Extend      [^$Control $CR $LF];"
 829         "$SpacingMark [^$Control $CR $LF];"
 830         "$RI $RI / $RI $RI;"
 831         "$RI $RI;"
 832         "$Hin1 $HinV $Hin0;"               // Devanagari Virama (backward)
 833         "$Ben1 $BenV $Ben0;"               // Bengali Virama (backward)
 834         "$Pan1 $PanV $Pan0;"               // Gurmukhi Virama (backward)
 835         "$Guj1 $GujV $Guj0;"               // Gujarati Virama (backward)
 836         "$Ori1 $OriV $Ori0;"               // Gujarati Virama (backward)
 837         "$Tel1 $TelV $Tel0;"               // Telugu Virama (backward)
 838         "$Kan1 $KanV $Kan0;"               // Kannada Virama (backward)
 839         "$Mal1 $MalV $Mal0;"               // Malayalam Virama (backward)
 840         "!!safe_reverse;"
 841         "!!safe_forward;";
 842
 843     return setUpIteratorWithRules(kRules, string, length);
 844 }
 845
 846 }