Imported Upstream version 57.1
[platform/upstream/icu.git] / source / common / rbbiscan.h
1 //
2 //  rbbiscan.h
3 //
4 //  Copyright (C) 2002-2016, International Business Machines Corporation and others.
5 //  All Rights Reserved.
6 //
7 //  This file contains declarations for class RBBIRuleScanner
8 //
9
10
11 #ifndef RBBISCAN_H
12 #define RBBISCAN_H
13
14 #include "unicode/utypes.h"
15 #include "unicode/uobject.h"
16 #include "unicode/rbbi.h"
17 #include "unicode/uniset.h"
18 #include "unicode/parseerr.h"
19 #include "uhash.h"
20 #include "uvector.h"
21 #include "unicode/symtable.h"// Needed for UnicodeSet parsing; it is the interface that
22                           //    looks up references to $variables within a set.
23 #include "rbbinode.h"
24 #include "rbbirpt.h"
25
26 U_NAMESPACE_BEGIN
27
28 class   RBBIRuleBuilder;     // Forward declarations: this header only uses
29 class   RBBISymbolTable;     //   pointers to these two types.
30
31
32 //--------------------------------------------------------------------------------
33 //
34 //  class RBBIRuleScanner does the lowest level, character-at-a-time
35 //                        scanning of break iterator rules.  
36 //
37 //                        The output of the scanner is parse trees for
38 //                        the rule expressions and a list of all Unicode Sets
39 //                        encountered.
40 //
41 //--------------------------------------------------------------------------------
42
43 class RBBIRuleScanner : public UMemory {
44 public:
45
46     enum {
47         kStackSize = 100            // The size of the state stack (fStack) and
48     };                              //   the node stack (fNodeStack) for rules
49                                     //   parsing.  Corresponds roughly to the depth
50                                     //   of parentheses nesting allowed in the rules.
51
52     struct RBBIRuleChar {                  // One character from the rule input.
53         UChar32             fChar;         //   The code point.
54         UBool               fEscaped;      //   Presumably TRUE if the char was escaped - TODO confirm.
55         RBBIRuleChar() : fChar(0), fEscaped(FALSE) {};   // Defaults: fChar 0, fEscaped FALSE.
56     };
57
58     RBBIRuleScanner(RBBIRuleBuilder  *rb);          // rb: presumably the owning rule builder (see fRB) - confirm.
59
60
61     virtual    ~RBBIRuleScanner();
62
63     void        nextChar(RBBIRuleChar &c);          // Get the next char from the input stream,
64                                                     //   storing it in c (no value is returned).
65
66     UBool       push(const RBBIRuleChar &c);        // Push (unget) one character.
67                                                     //   Only a single character may be pushed.
68
69     void        parse();                            // Parse the rules, generating two parse
70                                                     //   trees, one each for the forward and
71                                                     //   reverse rules,
72                                                     //   and a list of UnicodeSets encountered.
73
74     /**
75      * Return a rules string without unnecessary
76      * characters.
77      */
78     static UnicodeString stripRules(const UnicodeString &rules);
79 private:
80
81     UBool       doParseActions(int32_t a);
82     void        error(UErrorCode e);                   // error reporting convenience function.
83     void        fixOpStack(RBBINode::OpPrecedence p);  // NOTE(review): presumably resolves operators pending
84                                                        //   on the node stack, governed by precedence p - confirm.
85     void        findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
86
87     UChar32     nextCharLL();                          // NOTE(review): presumably the low-level fetch behind nextChar() - confirm.
88 #ifdef RBBI_DEBUG
89     void        printNodeStack(const char *title);     // Declared only when RBBI_DEBUG is defined.
90 #endif
91     RBBINode    *pushNewNode(RBBINode::NodeType  t);
92     void        scanSet();
93
94
95     RBBIRuleBuilder               *fRB;              // The rule builder that we are part of.
96
97     int32_t                       fScanIndex;        // Index of current character being processed
98                                                      //   in the rule input string.
99     int32_t                       fNextIndex;        // Index of the next character, which
100                                                      //   is the first character not yet scanned.
101     UBool                         fQuoteMode;        // Scan is in a 'quoted region'
102     int32_t                       fLineNum;          // Line number in input file.
103     int32_t                       fCharNum;          // Char position within the line.
104     UChar32                       fLastChar;         // Previous char, needed to count CR-LF
105                                                      //   as a single line, not two.
106
107     RBBIRuleChar                  fC;                // Current char for parse state machine
108                                                      //   processing.
109     UnicodeString                 fVarName;          // $variableName, valid when we've just
110                                                      //   scanned one.
111
112     RBBIRuleTableEl               **fStateTable;     // State Transition Table for RBBI Rule
113                                                      //   parsing.  indexed by p[state][char-class]
114
115     uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
116     int32_t                       fStackPtr;           //  and pops as specified in the state
117                                                        //  transition rules.
118
119     RBBINode                      *fNodeStack[kStackSize]; // Node stack, holds nodes created
120                                                            //  during the parse of a rule
121     int32_t                        fNodeStackPtr;
122
123
124     UBool                          fReverseRule;     // True if the rule currently being scanned
125                                                      //  is a reverse direction rule (if it
126                                                      //  starts with a '!')
127
128     UBool                          fLookAheadRule;   // True if the rule includes a '/'
129                                                      //   somewhere within it.
130
131     UBool                          fNoChainInRule;   // True if the current rule starts with a '^'.
132
133     RBBISymbolTable               *fSymbolTable;     // symbol table, holds definitions of
134                                                      //   $variable symbols.
135
136     UHashtable                    *fSetTable;        // UnicodeSet hash table, holds indexes to
137                                                      //   the sets created while parsing rules.
138                                                      //   The key is the string used for creating
139                                                      //   the set.
140
141     UnicodeSet                     fRuleSets[10];    // Unicode Sets that are needed during
142                                                      //  the scanning of RBBI rules.  The
143                                                      //  indices for these are assigned by the
144                                                      //  perl script that builds the state tables.
145                                                      //  See rbbirpt.h.
146
147     int32_t                        fRuleNum;         // Counts each rule as it is scanned.
148
149     int32_t                        fOptionStart;     // Input index of start of a !!option
150                                                      //   keyword, while being scanned.
151
152     UnicodeSet *gRuleSet_rule_char;          // NOTE(review): these four sets are per-instance
153     UnicodeSet *gRuleSet_white_space;        //   members despite the 'g' prefix; presumably
154     UnicodeSet *gRuleSet_name_char;          //   they classify rule input characters - confirm
155     UnicodeSet *gRuleSet_name_start_char;    //   against the implementation.
156
157     RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
158     RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
159 };
160
161 U_NAMESPACE_END
162
163 #endif