1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*************************************************************************
4 * Copyright (c) 2016, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *************************************************************************
8 #ifndef RBBIMONKEYTEST_H
9 #define RBBIMONKEYTEST_H
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
17 #include "unicode/rbbi.h"
18 #include "unicode/regex.h"
19 #include "unicode/uniset.h"
20 #include "unicode/unistr.h"
21 #include "unicode/uobject.h"
23 #include "simplethread.h"
30 // Develop a tailoring format.
31 // Hook to old tests that use monkey impl to get expected data.
34 class BreakRules; // Forward declaration; the full class definition appears later in this header.
38 * Test the RuleBasedBreakIterator class giving different rules
40 class RBBIMonkeyTest: public IntlTest {
43 virtual ~RBBIMonkeyTest();
// Takes a test index, an exec flag and a reference through which a test name can be
// passed back; par is optional and defaults to NULL.
// NOTE(review): presumably implements the IntlTest dispatch hook of the same name - confirm against intltest.h.
45 void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
50 const char *fParams; // Copy of user parameters passed in from IntlTest.
53 void testRules(const char *ruleFile); // NOTE(review): presumably runs the monkey test against the rules in ruleFile - confirm in the .cpp.
// Static parameter helpers. Each takes the parameter name by value, the parameter
// string by non-const reference, a typed destination by reference, and an ICU error code.
// NOTE(review): presumably each looks up `name` inside `params`, stores the parsed value in the
// destination, and may modify `params`; the meaning of the UBool result is not visible here - confirm in the .cpp.
54 static UBool getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status);
55 static UBool getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status);
56 static UBool getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status);
60 // The following classes are internal to the RBBI Monkey Test implementation.
64 // class CharClass Represents a single character class from the source break rules.
65 // Inherits from UObject because instances are adopted by UHashtable, which ultimately
66 // deletes them using hash's object deleter function.
68 class CharClass: public UObject {
71 UnicodeString fOriginalDef; // set definition as it appeared in user supplied rules.
72 UnicodeString fExpandedDef; // set definition with any embedded named sets replaced by their defs, recursively.
73 LocalPointer<const UnicodeSet> fSet; // The set, held through a LocalPointer; initialized from the constructor's set argument.
// Constructor: initializes fName, fOriginalDef and fExpandedDef from the three string
// arguments and fSet from the set pointer. The body is empty.
// NOTE(review): since fSet is a LocalPointer, this object presumably adopts `set`, so the
// caller should not delete it - confirm against the LocalPointer documentation.
74 CharClass(const UnicodeString &name, const UnicodeString &originalDef, const UnicodeString &expandedDef, const UnicodeSet *set) :
75 fName(name), fOriginalDef(originalDef), fExpandedDef(expandedDef), fSet(set) {}
79 // class BreakRule represents a single rule from a set of break rules.
80 // Each rule has the set definitions expanded, and
81 // is compiled to a regular expression.
83 class BreakRule: public UObject { // Instances are referenced by pointer from BreakRules::fBreakRules.
87 UnicodeString fName; // Name of the rule.
88 UnicodeString fRule; // Rule expression, excluding the name, as written in user source.
89 UnicodeString fExpandedRule; // Rule expression after expanding the set definitions.
90 LocalPointer<RegexMatcher> fRuleMatcher; // Regular expression that matches the rule. NOTE(review): presumably compiled from fExpandedRule - confirm in the .cpp.
94 // class BreakRules represents a complete set of break rules, possibly tailored,
95 // compiled from testdata break rules.
97 class BreakRules: public UObject {
99 BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status); // monkeyImpl: see fMonkeyImpl below. status: ICU error code.
102 void compileRules(UCHARBUF *rules, UErrorCode &status); // NOTE(review): presumably reads rule source through `rules` and populates fBreakRules / fCharClasses - confirm in the .cpp.
// Returns a pointer to a const CharClass for code point c. iter is optional (defaults to NULL).
// NOTE(review): iter presumably carries search state so a caller can continue past a previous
// result; the return value when no class matches is not visible here - confirm in the .cpp.
104 const CharClass *getClassForChar(UChar32 c, int32_t *iter=NULL) const;
107 RBBIMonkeyImpl *fMonkeyImpl; // Pointer back to the owning MonkeyImpl instance (RBBIMonkeyImpl holds its BreakRules in fRuleSet).
108 icu::UVector fBreakRules; // Contents are of type (BreakRule *).
110 LocalUHashtablePointer fCharClasses; // Key is set name (UnicodeString).
111 // Value is (CharClass *)
112 LocalPointer<UVector> fCharClassList; // Char Classes, same contents as fCharClasses values,
113 // but in a vector so they can be accessed by index.
114 UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined.
116 UBreakIteratorType fType; // NOTE(review): presumably the kind of break iterator these rules describe - confirm where it is assigned.
118 CharClass *addCharClass(const UnicodeString &name, const UnicodeString &def, UErrorCode &status); // NOTE(review): presumably creates a CharClass for `def` and registers it under `name` in fCharClasses - confirm.
119 void addRule(const UnicodeString &name, const UnicodeString &def, UErrorCode &status); // NOTE(review): presumably builds a BreakRule from `def` and appends it to fBreakRules - confirm.
120 bool setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status); // NOTE(review): meaning of the bool result (e.g. "keyword recognized") is not visible here - confirm.
121 RuleBasedBreakIterator *createICUBreakIterator(UErrorCode &status); // Returns a raw pointer. NOTE(review): presumably the caller owns the result (cf. RBBIMonkeyImpl::fBI) - confirm.
// NOTE(review): judging by their names, the four matchers below are presumably the regular
// expressions used while parsing rule source (set references, comments, class definitions,
// rule definitions) - confirm in the .cpp.
123 LocalPointer<RegexMatcher> fSetRefsMatcher;
124 LocalPointer<RegexMatcher> fCommentsMatcher;
125 LocalPointer<RegexMatcher> fClassDefMatcher;
126 LocalPointer<RegexMatcher> fRuleDefMatcher;
130 // class MonkeyTestData represents a randomly synthesized test data string together
131 // with the expected break positions obtained by applying
132 // the test break rules.
134 class MonkeyTestData: public UObject {
137 ~MonkeyTestData() {}; // Empty inline destructor.
138 void set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status); // NOTE(review): presumably generates fString using `rand` and fills in the expected-break data from `rules` - confirm in the .cpp.
139 void clearActualBreaks(); // NOTE(review): presumably resets fActualBreaks - confirm in the .cpp.
140 void dump(int32_t around = -1) const; // const: does not modify the test data. NOTE(review): the default -1 presumably means "no particular position" - confirm.
142 uint32_t fRandomSeed; // The initial seed value from the random number generator.
143 const BreakRules *fBkRules; // The break rules used to generate this data.
144 UnicodeString fString; // The text.
145 UnicodeString fExpectedBreaks; // Breaks as found by the reference rules.
146 // Parallel to fString. Non-zero if break preceding.
147 UnicodeString fActualBreaks; // Breaks as found by ICU break iterator.
148 UnicodeString fRuleForPosition; // Index into BreakRules.fBreakRules of rule that applied at each position.
149 // Also parallel to fString.
150 UnicodeString f2ndRuleForPos; // As above. A 2nd rule applies when the preceding rule
151 // didn't cause a break, and a subsequent rule match starts
152 // on the last code point of the preceding match.
159 // class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey
160 // test for one set of break rules.
162 // When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence
163 // between instances of RBBIMonkeyImpl and threads.
165 class RBBIMonkeyImpl: public UObject {
167 RBBIMonkeyImpl(UErrorCode &status);
170 void setup(const char *ruleFileName, UErrorCode &status); // NOTE(review): presumably loads ruleFileName and prepares fRuleSet / fBI / fTestData - confirm in the .cpp.
176 LocalUCHARBUFPointer fRuleCharBuffer; // source file contents of the reference rules.
177 LocalPointer<BreakRules> fRuleSet; // The BreakRules object held by this instance.
178 LocalPointer<RuleBasedBreakIterator> fBI; // NOTE(review): presumably the ICU break iterator under test (cf. BreakRules::createICUBreakIterator) - confirm.
179 LocalPointer<MonkeyTestData> fTestData; // The MonkeyTestData object held by this instance.
180 IntlTest::icu_rand fRandomGenerator; // Random number generator instance belonging to this object.
181 const char *fRuleFileName; // NOTE(review): raw pointer; presumably refers to the name passed to setup() and is not owned here - confirm.
182 UBool fVerbose; // True to do long dump of failing data.
185 UBool fDumpExpansions; // Debug flag to output expanded form of rules and sets.
187 enum CheckDirection { // Used as the `dir` argument of checkResults() below.
191 void clearActualBreaks();
// NOTE(review): judging by their names, the five test* methods below presumably each exercise
// one way of moving through the text with the break iterator (forward iteration, previous(),
// following(), preceding(), isBoundary()) - confirm in the .cpp.
192 void testForwards(UErrorCode &status);
193 void testPrevious(UErrorCode &status);
194 void testFollowing(UErrorCode &status);
195 void testPreceding(UErrorCode &status);
196 void testIsBoundary(UErrorCode &status);
197 void checkResults(const char *msg, CheckDirection dir, UErrorCode &status); // NOTE(review): presumably compares actual against expected breaks, using msg in failure reports - confirm.
199 class RBBIMonkeyThread: public SimpleThread { // SimpleThread subclass whose run() calls back into an RBBIMonkeyImpl.
201 RBBIMonkeyImpl *fMonkeyImpl; // The RBBIMonkeyImpl whose runTest() is called from run().
203 RBBIMonkeyThread(RBBIMonkeyImpl *impl) : fMonkeyImpl(impl) {}; // Stores impl in fMonkeyImpl; the body is empty.
204 void run() U_OVERRIDE { fMonkeyImpl->runTest(); }; // Marked U_OVERRIDE; forwards to fMonkeyImpl->runTest().
207 void openBreakRules(const char *fileName, UErrorCode &status); // NOTE(review): presumably opens fileName into fRuleCharBuffer - confirm in the .cpp.
208 RBBIMonkeyThread fThread; // Thread object embedded in this instance. NOTE(review): presumably constructed with `this` - confirm in the constructor.
212 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */
214 #endif // RBBIMONKEYTEST_H