2 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3 * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
22 #ifndef TextBreakIterator_h
23 #define TextBreakIterator_h
25 #include "platform/PlatformExport.h"
26 #include "wtf/text/AtomicString.h"
27 #include "wtf/unicode/Unicode.h"
29 #include <unicode/brkiter.h>
// TextBreakIterator is this header's name for ICU's break iterator class,
// icu::BreakIterator (declared in <unicode/brkiter.h>). All iterator factory
// functions below hand out pointers to this type.
33 typedef icu::BreakIterator TextBreakIterator;
35 // Note: The returned iterator is good only until you get another iterator, with the exception of acquireLineBreakIterator.
37 // This is similar to character break iterator in most cases, but is subject to
38 // platform UI conventions. One notable example where this can be different
39 // from character break iterator is Thai prepend characters, see bug 24342.
40 // Use this for insertion point and selection manipulations.
// Parameters: the text as a UChar buffer, and its length.
// Returns a TextBreakIterator pointer; per the note above, treat it as valid
// only until another iterator is requested.
41 PLATFORM_EXPORT TextBreakIterator* cursorMovementIterator(const UChar*, int length);
// Word break iterators. Two overloads are provided:
//  - one taking a String together with a start offset and a length
//    (NOTE(review): presumably selects the sub-range [start, start + length)
//    of the String - confirm against the implementation);
//  - one taking a raw UChar buffer and its length.
43 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const String&, int start, int length);
44 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const UChar*, int length);
// Line break iterators, with overloads for 8-bit (LChar) and 16-bit (UChar)
// text. Parameters: the text buffer and its length, the locale, and an optional
// buffer of characters that precede the text (priorContext) together with the
// number of such characters (priorContextLength). LazyLineBreakIterator::get()
// passes a null priorContext when priorContextLength is 0.
// An iterator obtained here is handed back with releaseLineBreakIterator(),
// as LazyLineBreakIterator does in its destructor.
45 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(const LChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength);
46 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(const UChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength);
47 PLATFORM_EXPORT void releaseLineBreakIterator(TextBreakIterator*);
// Sentence break iterator over a UChar buffer of the given length.
// Returns a TextBreakIterator pointer.
48 PLATFORM_EXPORT TextBreakIterator* sentenceBreakIterator(const UChar*, int length);
// Queries a TextBreakIterator and returns a bool.
// NOTE(review): presumably reports whether the break the iterator currently
// sits on ends a word (as opposed to whitespace or punctuation) - confirm
// against the implementation, which is not visible in this header.
50 PLATFORM_EXPORT bool isWordTextBreak(TextBreakIterator*);
// Sentinel position value (-1).
// NOTE(review): presumably the value iterator position queries yield when there
// is no further boundary; it has the same value as ICU's BreakIterator::DONE -
// confirm at the call sites.
52 const int TextBreakDone = -1;
// Wraps a line break TextBreakIterator that is obtained on demand inside get()
// and handed back through releaseLineBreakIterator(). Besides the string and
// locale used for breaking, it tracks up to priorContextCapacity (2) characters
// of context that precede the string: m_priorContext[1] is the last character
// before the string and m_priorContext[0] the one before that. A zero entry
// means "no character".
54 class PLATFORM_EXPORT LazyLineBreakIterator {
// Default constructor: starts with no cached prior context.
56 LazyLineBreakIterator()
58 , m_cachedPriorContext(0)
59 , m_cachedPriorContextLength(0)
// Constructs for the given string and optional locale (defaults to an empty
// AtomicString); starts with no cached prior context.
64 LazyLineBreakIterator(String string, const AtomicString& locale = AtomicString())
68 , m_cachedPriorContext(0)
69 , m_cachedPriorContextLength(0)
// Hands the held iterator back via releaseLineBreakIterator().
74 ~LazyLineBreakIterator()
77 releaseLineBreakIterator(m_iterator);
// Returns the stored string (by value).
80 String string() const { return m_string; }
// Returns the character immediately preceding the string (m_priorContext[1]).
// The COMPILE_ASSERT in this and the following accessors pins the hard-coded
// indices 0 and 1 to a two-element m_priorContext array.
82 UChar lastCharacter() const
84 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
85 return m_priorContext[1];
// Returns the character two positions before the string (m_priorContext[0]).
88 UChar secondToLastCharacter() const
90 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
91 return m_priorContext[0];
// Sets both prior context characters. Note the argument order (last first) is
// the reverse of the storage order (second-to-last at index 0).
94 void setPriorContext(UChar last, UChar secondToLast)
96 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
97 m_priorContext[0] = secondToLast;
98 m_priorContext[1] = last;
// Appends one character to the prior context: the previous last character
// becomes the second-to-last, and the previous second-to-last is dropped.
101 void updatePriorContext(UChar last)
103 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
104 m_priorContext[0] = m_priorContext[1];
105 m_priorContext[1] = last;
// Clears both prior context characters to 0.
108 void resetPriorContext()
110 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
111 m_priorContext[0] = 0;
112 m_priorContext[1] = 0;
// Returns how many prior context characters are set (0, 1 or 2). The
// second-to-last character is only counted when the last character is
// non-zero, so a lone m_priorContext[0] yields 0.
115 unsigned priorContextLength() const
117 unsigned priorContextLength = 0;
118 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
119 if (m_priorContext[1]) {
120 ++priorContextLength;
121 if (m_priorContext[0])
122 ++priorContextLength;
124 return priorContextLength;
127 // Obtain text break iterator, possibly previously cached, where this iterator is (or has been)
128 // initialized to use the previously stored string as the primary breaking context and using
129 // previously stored prior context if non-empty.
// priorContextLength must not exceed priorContextCapacity (asserted).
130 TextBreakIterator* get(unsigned priorContextLength)
132 ASSERT(priorContextLength <= priorContextCapacity);
// Point at the trailing priorContextLength entries of m_priorContext, or pass
// a null pointer when no prior context is requested.
133 const UChar* priorContext = priorContextLength ? &m_priorContext[priorContextCapacity - priorContextLength] : 0;
// Pick the acquireLineBreakIterator overload that matches the string's
// character width, then remember which prior context the iterator was
// acquired with.
135 if (m_string.is8Bit())
136 m_iterator = acquireLineBreakIterator(m_string.characters8(), m_string.length(), m_locale, priorContext, priorContextLength);
138 m_iterator = acquireLineBreakIterator(m_string.characters16(), m_string.length(), m_locale, priorContext, priorContextLength);
139 m_cachedPriorContext = priorContext;
140 m_cachedPriorContextLength = priorContextLength;
// Otherwise, if the requested prior context differs from the one recorded at
// acquisition time, release the iterator and call get() again to acquire a
// fresh one with the new context.
141 } else if (priorContext != m_cachedPriorContext || priorContextLength != m_cachedPriorContextLength) {
142 this->resetStringAndReleaseIterator(m_string, m_locale);
143 return this->get(priorContextLength);
// Hands the current iterator back via releaseLineBreakIterator() and clears
// the cached prior context.
// NOTE(review): the name and parameters suggest it also stores the new string
// and locale - confirm.
148 void resetStringAndReleaseIterator(String string, const AtomicString& locale)
151 releaseLineBreakIterator(m_iterator);
156 m_cachedPriorContext = 0;
157 m_cachedPriorContextLength = 0;
// Number of prior context characters tracked; the accessors above assert
// that this is 2.
161 static const unsigned priorContextCapacity = 2;
// Locale passed to acquireLineBreakIterator().
163 AtomicString m_locale;
// Iterator most recently acquired by get(); handed back by the destructor and
// by resetStringAndReleaseIterator().
164 TextBreakIterator* m_iterator;
// Prior context characters: [0] = second-to-last, [1] = last; 0 = unset.
165 UChar m_priorContext[priorContextCapacity];
// Prior context pointer and length that m_iterator was acquired with; compared
// in get() to detect a changed context.
166 const UChar* m_cachedPriorContext;
167 unsigned m_cachedPriorContextLength;
170 // Iterates over "extended grapheme clusters", as defined in UAX #29.
171 // Note that platform implementations may be less sophisticated - e.g. ICU prior to
172 // version 4.0 only supports "legacy grapheme clusters".
173 // Use this for general text processing, e.g. string truncation.
// Character (grapheme cluster) break iterator. The class is declared
// non-copyable. 8-bit strings are handled by the inline helpers below, which
// only need to recognise CR LF; 16-bit strings go through a TextBreakIterator
// (m_iterator).
175 class PLATFORM_EXPORT NonSharedCharacterBreakIterator {
176 WTF_MAKE_NONCOPYABLE(NonSharedCharacterBreakIterator);
// Construct from a String, or from a UChar buffer and its length. Both
// constructors and the destructor are defined out of line.
178 explicit NonSharedCharacterBreakIterator(const String&);
179 NonSharedCharacterBreakIterator(const UChar*, unsigned length);
180 ~NonSharedCharacterBreakIterator();
// Boundary queries by offset; defined out of line.
// NOTE(review): presumably isBreak() tests whether offset is a cluster
// boundary, and preceding()/following() return the nearest boundary before /
// after offset - confirm against the implementation.
185 bool isBreak(int offset) const;
186 int preceding(int offset) const;
187 int following(int offset) const;
// True only when the text is not 8-bit and no iterator is held; in 8-bit mode
// this always returns false because no iterator is needed.
189 bool operator!() const
191 return !m_is8Bit && !m_iterator;
// Sets up the iterator for a 16-bit buffer; defined out of line.
195 void createIteratorForBuffer(const UChar*, unsigned length);
// 8-bit helper: length of the cluster starting at offset - 2 for a CR
// immediately followed by LF, otherwise 1.
197 unsigned clusterLengthStartingAt(unsigned offset) const
200 // The only Latin-1 Extended Grapheme Cluster is CR LF
201 return isCRBeforeLF(offset) ? 2 : 1;
// 8-bit helper: true when the character at offset is '\r' and the next
// character exists (offset + 1 < m_length) and is '\n'.
204 bool isCRBeforeLF(unsigned offset) const
207 return m_charaters8[offset] == '\r' && offset + 1 < m_length && m_charaters8[offset + 1] == '\n';
// 8-bit helper: true when the character at offset is '\n' and a preceding
// character exists (offset >= 1) and is '\r'.
210 bool isLFAfterCR(unsigned offset) const
213 return m_charaters8[offset] == '\n' && offset >= 1 && m_charaters8[offset - 1] == '\r';
218 // For 8 bit strings, we implement the iterator ourselves.
// NOTE(review): "m_charaters8" (sic) is a misspelling of "m_characters8".
// Renaming it would also require updating the out-of-line definitions that
// use this member, so it is left as is here.
219 const LChar* m_charaters8;
223 // For 16 bit strings, we use a TextBreakIterator.
224 TextBreakIterator* m_iterator;
227 // Counts the number of grapheme clusters. A surrogate pair or a sequence
228 // of a non-combining character and following combining characters is
229 // counted as 1 grapheme cluster.
// Takes the String to measure and returns the cluster count.
230 PLATFORM_EXPORT unsigned numGraphemeClusters(const String&);
231 // Returns the number of characters which will be less than or equal to
232 // the specified grapheme cluster length.
// NOTE(review): read as "the number of characters of the String that make up
// at most the given number of grapheme clusters"; the unsigned argument is the
// cluster count - confirm against the implementation.
233 PLATFORM_EXPORT unsigned numCharactersInGraphemeClusters(const String&, unsigned);