1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
6 // Copyright (C) 2002-2016, International Business Machines Corporation and others.
7 // All Rights Reserved.
9 // This file contains declarations for class RBBIRuleScanner
16 #include "unicode/utypes.h"
17 #include "unicode/uobject.h"
18 #include "unicode/rbbi.h"
19 #include "unicode/uniset.h"
20 #include "unicode/parseerr.h"
23 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
24 // looks up references to $variables within a set.
30 class RBBIRuleBuilder;
31 class RBBISymbolTable;
34 //--------------------------------------------------------------------------------
36 // class RBBIRuleScanner does the lowest level, character-at-a-time
37 // scanning of break iterator rules.
39 // The output of the scanner is parse trees for
40 // the rule expressions and a list of all Unicode Sets encountered.
43 //--------------------------------------------------------------------------------
45 class RBBIRuleScanner : public UMemory {
49 kStackSize = 100 // The size of the state stack for
50 }; // rules parsing. Corresponds roughly
51 // to the depth of parentheses nesting
52 // that is allowed in the rules.
57 RBBIRuleChar() : fChar(0), fEscaped(FALSE) {};
60 RBBIRuleScanner(RBBIRuleBuilder *rb);
63 virtual ~RBBIRuleScanner();
65 void nextChar(RBBIRuleChar &c); // Get the next char from the input stream.
66 // Stores the result in c; returns nothing (end-of-input signalling: TODO confirm).
68 UBool push(const RBBIRuleChar &c); // Push (unget) one character.
69 // Only a single character may be pushed.
71 void parse(); // Parse the rules, generating two parse
72 // trees, one each for the forward and
74 // reverse rules, and a list of UnicodeSets encountered.
77 * Return a rules string without unnecessary characters.
80 static UnicodeString stripRules(const UnicodeString &rules);
83 UBool doParseActions(int32_t a);
84 void error(UErrorCode e); // error reporting convenience function.
85 void fixOpStack(RBBINode::OpPrecedence p);
87 void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
91 void printNodeStack(const char *title);
93 RBBINode *pushNewNode(RBBINode::NodeType t);
97 RBBIRuleBuilder *fRB; // The rule builder that we are part of.
99 int32_t fScanIndex; // Index of current character being processed
100 // in the rule input string.
101 int32_t fNextIndex; // Index of the next character, which
102 // is the first character not yet scanned.
103 UBool fQuoteMode; // Scan is in a 'quoted region'
104 int32_t fLineNum; // Line number in input file.
105 int32_t fCharNum; // Char position within the line.
106 UChar32 fLastChar; // Previous char, needed to count CR-LF
107 // as a single line, not two.
109 RBBIRuleChar fC; // Current char for parse state machine
111 UnicodeString fVarName; // $variableName, valid when we've just
114 RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule
115 // parsing. index by p[state][char-class]
117 uint16_t fStack[kStackSize]; // State stack, holds state pushes
118 int32_t fStackPtr; // and pops as specified in the state transition table.
121 RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created
122 // during the parse of a rule
123 int32_t fNodeStackPtr;
126 UBool fReverseRule; // True if the rule currently being scanned
127 // is a reverse direction rule (if it
128 // starts with a '!')
130 UBool fLookAheadRule; // True if the rule includes a '/'
131 // somewhere within it.
133 UBool fNoChainInRule; // True if the current rule starts with a '^'.
135 RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of
136 // $variable symbols.
138 UHashtable *fSetTable; // UnicodeSet hash table, holds indexes to
139 // the sets created while parsing rules.
140 // The key is the string used for creating the set.
143 UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during
144 // the scanning of RBBI rules. The
145 // indices for these are assigned by the
146 // Perl script that builds the state tables.
149 int32_t fRuleNum; // Counts each rule as it is scanned.
151 int32_t fOptionStart; // Input index of start of a !!option
152 // keyword, while being scanned.
154 UnicodeSet *gRuleSet_rule_char;
155 UnicodeSet *gRuleSet_white_space;
156 UnicodeSet *gRuleSet_name_char;
157 UnicodeSet *gRuleSet_name_start_char;
159 RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
160 RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class