src/third_party/WebKit/Source/platform/text/TextBreakIterator.h

   1 /*
   2  * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
   3  * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Library General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Library General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Library General Public License
  16  * along with this library; see the file COPYING.LIB.  If not, write to
  17  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  18  * Boston, MA 02110-1301, USA.
  19  *
  20  */
  21
  22 #ifndef TextBreakIterator_h
  23 #define TextBreakIterator_h
  24
  25 #include "platform/PlatformExport.h"
  26 #include "wtf/text/AtomicString.h"
  27 #include "wtf/unicode/Unicode.h"
  28
  29 #include <unicode/brkiter.h>
  30
  31 namespace WebCore {
  32
  33 typedef icu::BreakIterator TextBreakIterator;
  34
  35 // Note: The returned iterator is good only until you get another iterator, with the exception of acquireLineBreakIterator.
  36
  37 // This is similar to character break iterator in most cases, but is subject to
  38 // platform UI conventions. One notable example where this can be different
  39 // from character break iterator is Thai prepend characters, see bug 24342.
  40 // Use this for insertion point and selection manipulations.
  41 PLATFORM_EXPORT TextBreakIterator* cursorMovementIterator(const UChar*, int length);
  42
     // Word break iterators. One overload takes a String together with a start
     // offset and a length, the other a UChar buffer and its length.
  43 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const String&, int start, int length);
  44 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const UChar*, int length);
     // Line break iterators for LChar and UChar text. Per the note above, these are
     // the exception to the "good only until you get another iterator" rule; an
     // iterator obtained here is handed back with releaseLineBreakIterator() (this
     // is what LazyLineBreakIterator below does). priorContext may be null when
     // priorContextLength is 0.
     // NOTE(review): priorContext presumably holds the characters immediately
     // preceding the text - confirm against the implementation.
  45 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(const LChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength);
  46 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(const UChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength);
  47 PLATFORM_EXPORT void releaseLineBreakIterator(TextBreakIterator*);
     // NOTE(review): going by the name, iterates over sentence boundaries of the
     // given UChar buffer - confirm against the implementation.
  48 PLATFORM_EXPORT TextBreakIterator* sentenceBreakIterator(const UChar*, int length);
  49
     // NOTE(review): presumably reports whether the boundary the iterator is
     // currently positioned at terminates a word - confirm against the implementation.
  50 PLATFORM_EXPORT bool isWordTextBreak(TextBreakIterator*);
  51
     // NOTE(review): presumably the value an iterator returns once no further
     // boundary exists (ICU's BreakIterator::DONE is also -1) - confirm.
  52 const int TextBreakDone = -1;
  53
  54 class PLATFORM_EXPORT LazyLineBreakIterator {
  55 public:
  56     LazyLineBreakIterator()
  57         : m_iterator(0)
  58         , m_cachedPriorContext(0)
  59         , m_cachedPriorContextLength(0)
  60     {
  61         resetPriorContext();
  62     }
  63
  64     LazyLineBreakIterator(String string, const AtomicString& locale = AtomicString())
  65         : m_string(string)
  66         , m_locale(locale)
  67         , m_iterator(0)
  68         , m_cachedPriorContext(0)
  69         , m_cachedPriorContextLength(0)
  70     {
  71         resetPriorContext();
  72     }
  73
  74     ~LazyLineBreakIterator()
  75     {
  76         if (m_iterator)
  77             releaseLineBreakIterator(m_iterator);
  78     }
  79
  80     String string() const { return m_string; }
  81
  82     UChar lastCharacter() const
  83     {
  84         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
  85         return m_priorContext[1];
  86     }
  87
  88     UChar secondToLastCharacter() const
  89     {
  90         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
  91         return m_priorContext[0];
  92     }
  93
  94     void setPriorContext(UChar last, UChar secondToLast)
  95     {
  96         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
  97         m_priorContext[0] = secondToLast;
  98         m_priorContext[1] = last;
  99     }
 100
 101     void updatePriorContext(UChar last)
 102     {
 103         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
 104         m_priorContext[0] = m_priorContext[1];
 105         m_priorContext[1] = last;
 106     }
 107
 108     void resetPriorContext()
 109     {
 110         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
 111         m_priorContext[0] = 0;
 112         m_priorContext[1] = 0;
 113     }
 114
 115     unsigned priorContextLength() const
 116     {
 117         unsigned priorContextLength = 0;
 118         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
 119         if (m_priorContext[1]) {
 120             ++priorContextLength;
 121             if (m_priorContext[0])
 122                 ++priorContextLength;
 123         }
 124         return priorContextLength;
 125     }
 126
 127     // Obtain text break iterator, possibly previously cached, where this iterator is (or has been)
 128     // initialized to use the previously stored string as the primary breaking context and using
 129     // previously stored prior context if non-empty.
 130     TextBreakIterator* get(unsigned priorContextLength)
 131     {
 132         ASSERT(priorContextLength <= priorContextCapacity);
 133         const UChar* priorContext = priorContextLength ? &m_priorContext[priorContextCapacity - priorContextLength] : 0;
 134         if (!m_iterator) {
 135             if (m_string.is8Bit())
 136                 m_iterator = acquireLineBreakIterator(m_string.characters8(), m_string.length(), m_locale, priorContext, priorContextLength);
 137             else
 138                 m_iterator = acquireLineBreakIterator(m_string.characters16(), m_string.length(), m_locale, priorContext, priorContextLength);
 139             m_cachedPriorContext = priorContext;
 140             m_cachedPriorContextLength = priorContextLength;
 141         } else if (priorContext != m_cachedPriorContext || priorContextLength != m_cachedPriorContextLength) {
 142             this->resetStringAndReleaseIterator(m_string, m_locale);
 143             return this->get(priorContextLength);
 144         }
 145         return m_iterator;
 146     }
 147
 148     void resetStringAndReleaseIterator(String string, const AtomicString& locale)
 149     {
 150         if (m_iterator)
 151             releaseLineBreakIterator(m_iterator);
 152
 153         m_string = string;
 154         m_locale = locale;
 155         m_iterator = 0;
 156         m_cachedPriorContext = 0;
 157         m_cachedPriorContextLength = 0;
 158     }
 159
 160 private:
 161     static const unsigned priorContextCapacity = 2;
 162     String m_string;
 163     AtomicString m_locale;
 164     TextBreakIterator* m_iterator;
 165     UChar m_priorContext[priorContextCapacity];
 166     const UChar* m_cachedPriorContext;
 167     unsigned m_cachedPriorContextLength;
 168 };
 169
 170 // Iterates over "extended grapheme clusters", as defined in UAX #29.
 171 // Note that platform implementations may be less sophisticated - e.g. ICU prior to
 172 // version 4.0 only supports "legacy grapheme clusters".
 173 // Use this for general text processing, e.g. string truncation.
 174
 175 class PLATFORM_EXPORT NonSharedCharacterBreakIterator {
 176     WTF_MAKE_NONCOPYABLE(NonSharedCharacterBreakIterator);
 177 public:
 178     explicit NonSharedCharacterBreakIterator(const String&);
 179     NonSharedCharacterBreakIterator(const UChar*, unsigned length);
 180     ~NonSharedCharacterBreakIterator();
 181
 182     int next();
 183     int current();
 184
 185     bool isBreak(int offset) const;
 186     int preceding(int offset) const;
 187     int following(int offset) const;
 188
 189     bool operator!() const
 190     {
 191         return !m_is8Bit && !m_iterator;
 192     }
 193
 194 private:
 195     void createIteratorForBuffer(const UChar*, unsigned length);
 196
 197     unsigned clusterLengthStartingAt(unsigned offset) const
 198     {
 199         ASSERT(m_is8Bit);
 200         // The only Latin-1 Extended Grapheme Cluster is CR LF
 201         return isCRBeforeLF(offset) ? 2 : 1;
 202     }
 203
 204     bool isCRBeforeLF(unsigned offset) const
 205     {
 206         ASSERT(m_is8Bit);
 207         return m_charaters8[offset] == '\r' && offset + 1 < m_length && m_charaters8[offset + 1] == '\n';
 208     }
 209
 210     bool isLFAfterCR(unsigned offset) const
 211     {
 212         ASSERT(m_is8Bit);
 213         return m_charaters8[offset] == '\n' && offset >= 1 && m_charaters8[offset - 1] == '\r';
 214     }
 215
 216     bool m_is8Bit;
 217
 218     // For 8 bit strings, we implement the iterator ourselves.
 219     const LChar* m_charaters8;
 220     unsigned m_offset;
 221     unsigned m_length;
 222
 223     // For 16 bit strings, we use a TextBreakIterator.
 224     TextBreakIterator* m_iterator;
 225 };
 226
 227 // Counts the number of grapheme clusters. A surrogate pair or a sequence
 228 // of a non-combining character and following combining characters is
 229 // counted as 1 grapheme cluster.
 230 PLATFORM_EXPORT unsigned numGraphemeClusters(const String&);
 231 // Returns the number of characters which will be less than or equal to
 232 // the specified grapheme cluster length.
     // NOTE(review): presumably this is the number of characters spanned by the
     // first N grapheme clusters of the string, N being the unsigned argument -
     // confirm against the implementation.
 233 PLATFORM_EXPORT unsigned numCharactersInGraphemeClusters(const String&, unsigned);
 234
 235 }
 236
 237 #endif