1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2006-2014, International Business Machines Corporation *
6 * and others. All Rights Reserved. *
7 *******************************************************************************
13 #include "unicode/utypes.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utext.h"
21 class DictionaryMatcher;
24 /*******************************************************************
25 * DictionaryBreakEngine
29 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
30 * dictionary to determine language-specific breaks.</p>
32 * <p>After it is constructed a DictionaryBreakEngine may be shared between
33 * threads without synchronization.</p>
35 class DictionaryBreakEngine : public LanguageBreakEngine {
38 * The set of characters handled by this engine
45 * The set of break types handled by this engine
52 * <p>Default constructor.</p>
55 DictionaryBreakEngine();
60 * <p>Constructor setting the break types handled.</p>
62 * @param breakTypes A bitmap of types handled by the engine.
64 DictionaryBreakEngine( uint32_t breakTypes );
67 * <p>Virtual destructor.</p>
69 virtual ~DictionaryBreakEngine();
72 * <p>Indicate whether this engine handles a particular character for
73 * a particular kind of break.</p>
75 * @param c A character which begins a run that the engine might handle
76 * @param breakType The type of text break which the caller wants to determine
77 * @return TRUE if this engine handles the particular character and break type.
80 virtual UBool handles( UChar32 c, int32_t breakType ) const;
83 * <p>Find any breaks within a run in the supplied text.</p>
85 * @param text A UText representing the text. The iterator is left at
86 * the end of the run of characters which the engine is capable of handling
87 * that starts from the first (or last) character in the range.
88 * @param startPos The start of the run within the supplied text.
89 * @param endPos The end of the run within the supplied text.
90 * @param reverse Whether the caller is looking for breaks in a reverse direction.
92 * @param breakType The type of break desired, or -1.
93 * @param foundBreaks A UStack that receives the breaks found, if any
94 * @return The number of breaks found.
96 virtual int32_t findBreaks( UText *text,
101 UStack &foundBreaks ) const;
106 * <p>Set the character set handled by this engine.</p>
108 * @param set A UnicodeSet of the set of characters handled by the engine
110 virtual void setCharacters( const UnicodeSet &set );
113 * <p>Set the break types handled by this engine.</p>
115 * @param breakTypes A bitmap of types handled by the engine.
117 // virtual void setBreakTypes( uint32_t breakTypes );
120 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
122 * @param text A UText representing the text
123 * @param rangeStart The start of the range of dictionary characters
124 * @param rangeEnd The end of the range of dictionary characters
125 * @param foundBreaks Output UStack that receives the int32_t break positions
126 * @return The number of breaks found
128 virtual int32_t divideUpDictionaryRange( UText *text,
131 UStack &foundBreaks ) const = 0;
135 /*******************************************************************
140 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
141 * dictionary and heuristics to determine Thai-specific breaks.</p>
143 * <p>After it is constructed a ThaiBreakEngine may be shared between
144 * threads without synchronization.</p>
146 class ThaiBreakEngine : public DictionaryBreakEngine {
149 * The set of characters handled by this engine
153 UnicodeSet fThaiWordSet;
154 UnicodeSet fEndWordSet;
155 UnicodeSet fBeginWordSet;
156 UnicodeSet fSuffixSet;
158 DictionaryMatcher *fDictionary;
163 * <p>Constructor that adopts the supplied dictionary.</p>
165 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
168 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
171 * <p>Virtual destructor.</p>
173 virtual ~ThaiBreakEngine();
177 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
179 * @param text A UText representing the text
180 * @param rangeStart The start of the range of dictionary characters
181 * @param rangeEnd The end of the range of dictionary characters
182 * @param foundBreaks Output UStack that receives the int32_t break positions
183 * @return The number of breaks found
185 virtual int32_t divideUpDictionaryRange( UText *text,
188 UStack &foundBreaks ) const;
192 /*******************************************************************
197 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
198 * dictionary and heuristics to determine Lao-specific breaks.</p>
200 * <p>After it is constructed a LaoBreakEngine may be shared between
201 * threads without synchronization.</p>
203 class LaoBreakEngine : public DictionaryBreakEngine {
206 * The set of characters handled by this engine
210 UnicodeSet fLaoWordSet;
211 UnicodeSet fEndWordSet;
212 UnicodeSet fBeginWordSet;
214 DictionaryMatcher *fDictionary;
219 * <p>Constructor that adopts the supplied dictionary.</p>
221 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
224 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
227 * <p>Virtual destructor.</p>
229 virtual ~LaoBreakEngine();
233 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
235 * @param text A UText representing the text
236 * @param rangeStart The start of the range of dictionary characters
237 * @param rangeEnd The end of the range of dictionary characters
238 * @param foundBreaks Output UStack that receives the int32_t break positions
239 * @return The number of breaks found
241 virtual int32_t divideUpDictionaryRange( UText *text,
244 UStack &foundBreaks ) const;
248 /*******************************************************************
253 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
254 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
256 * <p>After it is constructed a BurmeseBreakEngine may be shared between
257 * threads without synchronization.</p>
259 class BurmeseBreakEngine : public DictionaryBreakEngine {
262 * The set of characters handled by this engine
266 UnicodeSet fBurmeseWordSet;
267 UnicodeSet fEndWordSet;
268 UnicodeSet fBeginWordSet;
270 DictionaryMatcher *fDictionary;
275 * <p>Constructor that adopts the supplied dictionary.</p>
277 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
280 BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
283 * <p>Virtual destructor.</p>
285 virtual ~BurmeseBreakEngine();
289 * <p>Divide up a range of known dictionary characters.</p>
291 * @param text A UText representing the text
292 * @param rangeStart The start of the range of dictionary characters
293 * @param rangeEnd The end of the range of dictionary characters
294 * @param foundBreaks Output UStack that receives the int32_t break positions
295 * @return The number of breaks found
297 virtual int32_t divideUpDictionaryRange( UText *text,
300 UStack &foundBreaks ) const;
304 /*******************************************************************
309 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
310 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
312 * <p>After it is constructed a KhmerBreakEngine may be shared between
313 * threads without synchronization.</p>
315 class KhmerBreakEngine : public DictionaryBreakEngine {
318 * The set of characters handled by this engine
322 UnicodeSet fKhmerWordSet;
323 UnicodeSet fEndWordSet;
324 UnicodeSet fBeginWordSet;
326 DictionaryMatcher *fDictionary;
331 * <p>Constructor that adopts the supplied dictionary.</p>
333 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
336 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
339 * <p>Virtual destructor.</p>
341 virtual ~KhmerBreakEngine();
345 * <p>Divide up a range of known dictionary characters.</p>
347 * @param text A UText representing the text
348 * @param rangeStart The start of the range of dictionary characters
349 * @param rangeEnd The end of the range of dictionary characters
350 * @param foundBreaks Output UStack that receives the int32_t break positions
351 * @return The number of breaks found
353 virtual int32_t divideUpDictionaryRange( UText *text,
356 UStack &foundBreaks ) const;
360 #if !UCONFIG_NO_NORMALIZATION
362 /*******************************************************************
366 // Indicates the language/script that the CjkBreakEngine will handle
373 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
374 * dictionary with costs associated with each word and
375 * Viterbi decoding to determine CJK-specific breaks.</p>
377 class CjkBreakEngine : public DictionaryBreakEngine {
380 * The set of characters handled by this engine
383 UnicodeSet fHangulWordSet;
384 UnicodeSet fHanWordSet;
385 UnicodeSet fKatakanaWordSet;
386 UnicodeSet fHiraganaWordSet;
388 DictionaryMatcher *fDictionary;
389 const Normalizer2 *nfkcNorm2;
394 * <p>Constructor that adopts the supplied dictionary for the given language/script type.</p>
396 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
397 * engine is deleted. The DictionaryMatcher must contain costs for each word
398 * in order for the dictionary to work properly.
400 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
403 * <p>Virtual destructor.</p>
405 virtual ~CjkBreakEngine();
409 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
411 * @param text A UText representing the text
412 * @param rangeStart The start of the range of dictionary characters
413 * @param rangeEnd The end of the range of dictionary characters
414 * @param foundBreaks Output UStack that receives the int32_t break positions
415 * @return The number of breaks found
417 virtual int32_t divideUpDictionaryRange( UText *text,
420 UStack &foundBreaks ) const;