Imported Upstream version 58.1
[platform/upstream/icu.git] / source / common / rbbirpt.txt
1
2 #*****************************************************************************
3 #
4 #   Copyright (C) 2016 and later: Unicode, Inc. and others.
5 #   License & terms of use: http://www.unicode.org/copyright.html#License
6 #
7 #*****************************************************************************
8 #*****************************************************************************
9 #
10 #   Copyright (C) 2002-2016, International Business Machines Corporation and others.
11 #   All Rights Reserved.
12 #
13 #*****************************************************************************
14 #
15 #  file:  rbbirpt.txt
16 #  ICU Break Iterator Rule Parser State Table
17 #
18 #     This state table is used when reading and parsing a set of RBBI rules.
19 #     The rule parser uses a state machine; the data in this file define the
20 #     state transitions that occur for each input character.
21 #
22 #     *** This file defines the RBBI rule grammar.   This is it.
23 #     *** The determination of what is accepted is here.
24 #
25 #     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
26 #     that are then built with the rule parser.
27 #
28 #    perl rbbicst.pl    < rbbirpt.txt > rbbirpt.h
29
30 #
31 # Here is the syntax of the state definitions in this file:
32 #
33 #
34 #StateName:
35 #   input-char           n next-state           ^push-state     action    
36 #   input-char           n next-state           ^push-state     action    
37 #       |                |   |                      |             |
38 #       |                |   |                      |             |--- action to be performed by state machine
39 #       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
40 #       |                |   |                      |
41 #       |                |   |                      |--- Push this named state onto the state stack.
42 #       |                |   |                           Later, when next state is specified as "pop",
43 #       |                |   |                           the pushed state will become the current state.
44 #       |                |   |
45 #       |                |   |--- Transition to this state if the current input character matches the input
46 #       |                |        character or char class in the left hand column.  "pop" causes the next
47 #       |                |        state to be popped from the state stack.
48 #       |                |
49 #       |                |--- When making the state transition specified on this line, advance to the next
50 #       |                     character from the input only if 'n' appears here.
51 #       |
52 #       |--- Character or named character classes to test for.  If the current character being scanned
53 #            matches, perform the actions and go to the state specified on this line.
54 #            The input character is tested sequentially, in the order written.  The characters and
55 #            character classes tested for do not need to be mutually exclusive.  The first match wins.
56 #            
57
58
59
60
61 #
62 #  start state, scan position is at the beginning of the rules file, or in between two rules.
63 #
64 start:
65     escaped                term                  ^break-rule-end    doExprStart      # rule begins with an escaped char; not consumed here, 'term' re-tests it
66     white_space          n start                                                     # skip white space between rules
67     '^'                  n start-after-caret     ^break-rule-end    doNoChain        # consume '^'; the return state for the rule that follows is pushed here
68     '$'                    scan-var-name         ^assign-or-rule    doExprStart      # '$' is left for scan-var-name to consume; name scan pops to assign-or-rule
69     '!'                  n rev-option                                                # consume '!'; rev-option tells a !!option from a !reverse rule
70     ';'                  n start                                                  # ignore empty rules.
71     eof                    exit                                                      # end of the rule input
72     default                term                  ^break-rule-end    doExprStart      # anything else starts a rule expression; 'term' decides whether it is valid
73     
74 #
75 #  break-rule-end:  Returned from doing a break-rule expression.
76 #
77 break-rule-end:
78     ';'                  n start                                    doEndOfRule      # ';' terminates the rule; go back to scan the next one
79     white_space          n break-rule-end                                            # skip white space in front of the ';'
80     default                errorDeath                               doRuleError      # only ';' may follow a finished rule expression
81      
82 #
83 # start of a rule, after having seen a '^' (inhibits rule chain in).
84 #     Similar to the main 'start' state in most respects, except
85 #          - empty rule is an error.
86 #          - A second '^' is an error.
87 #
88 start-after-caret:
89     escaped                term                                     doExprStart    # no push here: 'start' pushed break-rule-end when it consumed the '^'
90     white_space          n start-after-caret                                       # skip white space after the '^'
91     '^'                    errorDeath                               doRuleError    # two '^'s
92     '$'                    scan-var-name         ^term-var-ref      doExprStart    # rule begins with a variable reference; name scan pops to term-var-ref
93     ';'                    errorDeath                               doRuleError    # ^ ;
94     eof                    errorDeath                               doRuleError    # input ended right after the '^'
95     default                term                                     doExprStart    # start of the rule expression; char is re-tested by 'term'
96  
97 #
98 #   !               We've just scanned a '!', indicating either a !!key word flag or a
99 #                   !Reverse rule.
100 #
101 rev-option:
102     '!'                  n option-scan1                                             # second '!' consumed: a !!keyword follows
103     default                reverse-rule           ^break-rule-end   doReverseDir    # single '!': a !Reverse rule; current char is not consumed
104     
105 option-scan1:
106     name_start_char      n option-scan2                             doOptionStart   # first character of the keyword that follows "!!"
107     default                errorDeath                               doRuleError     # "!!" must be followed directly by a name-start character
108     
109 option-scan2:
110     name_char            n option-scan2                                             # consume the remaining characters of the keyword
111     default                option-scan3                             doOptionEnd     # first non-name char ends the keyword; it is left for option-scan3
112     
113 option-scan3:
114     ';'                  n start                                                    # ';' closes the !!keyword statement
115     white_space          n option-scan3                                             # white space is allowed between the keyword and the ';'
116     default                errorDeath                               doRuleError     # anything else after the keyword is a syntax error
117     
118
119 reverse-rule:
120     default                term                   ^break-rule-end   doExprStart     # unconditional: start the rule expression without consuming input
121     
122     
123 #
124 #  term.  Eat through a single rule character, or a composite thing, which
125 #         could be a parenthesized expression, a variable name, or a Unicode Set.
126 #
127 term:
128     escaped              n expr-mod                                 doRuleChar      # escaped character taken as a single-character term
129     white_space          n term                                                     # skip white space in front of a term
130     rule_char            n expr-mod                                 doRuleChar      # ordinary rule character taken as a single-character term
131     '['                    scan-unicode-set      ^expr-mod                          # '[' is not consumed here; scan-unicode-set handles it, then pops to expr-mod
132     '('                  n term                  ^expr-mod          doLParen        # consume '('; expr-mod is pushed so the pop on the matching ')' resumes there
133     '$'                    scan-var-name         ^term-var-ref                      # variable reference; name scan pops to term-var-ref
134     '.'                  n expr-mod                                 doDotAny        # '.' term
135     default                errorDeath                               doRuleError     # a term is required at this point
136     
137     
138
139 #
140 #  term-var-ref   We've just finished scanning a reference to a $variable.
141 #                 Check that the variable was defined.
142 #                 The variable name scanning is in common with assignment statements,
143 #                 so the check can't be done there.
144 term-var-ref:
145     default                expr-mod                                 doCheckVarDef   # unconditional, consumes nothing: run the defined-variable check, then treat the reference as a term
146     
147     
148 #
149 #   expr-mod      We've just finished scanning a term, now look for the optional
150 #                 trailing '*', '?', '+'
151 #
152 expr-mod:
153     white_space          n  expr-mod                                                # white space may separate a term from its operator
154     '*'                  n  expr-cont                               doUnaryOpStar
155     '+'                  n  expr-cont                               doUnaryOpPlus
156     '?'                  n  expr-cont                               doUnaryOpQuestion
157     default                 expr-cont                                               # no trailing operator; current char is left for expr-cont
158     
159     
160 #
161 #  expr-cont      Expression, continuation.  At a point where additional terms are
162 #                                            allowed, but not required.
163 #
164 expr-cont:
165     escaped                 term                                    doExprCatOperator   # another term begins; char is not consumed, 'term' re-tests it
166     white_space          n  expr-cont                                                   # skip white space between terms
167     rule_char               term                                    doExprCatOperator
168     '['                     term                                    doExprCatOperator
169     '('                     term                                    doExprCatOperator
170     '$'                     term                                    doExprCatOperator
171     '.'                     term                                    doExprCatOperator
172     '/'                     look-ahead                              doExprCatOperator   # '/' is left for the look-ahead state to consume
173     '{'                  n  tag-open                                doExprCatOperator   # consume '{' and scan a {ddd} tag
174     '|'                  n  term                                    doExprOrOperator    # alternation: consume '|', a term must follow
175     ')'                  n  pop                                     doExprRParen        # consume ')' and return to the state pushed for this expression
176     default                 pop                                     doExprFinished      # anything else ends the expression; the pushed state examines the char
177     
178
179 #
180 #   look-ahead    Scanning a '/', which identifies a break point, assuming that the
181 #                 remainder of the expression matches.
182 #
183 #                 Generate a parse tree as if this was a special kind of input symbol
184 #                 appearing in an otherwise normal concatenation expression.
185 #
186 look-ahead:
187     '/'                   n expr-cont-no-slash                      doSlash         # consume the '/'; continue in the state that does not list another '/'
188     default                 errorDeath                                              # NOTE(review): no error action on this row; among the states in this file, this one is only entered with '/' as the current char
189
190
191 #
192 #  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
193 #                                            allowed, but not required.  Just like
194 #                                            expr-cont, above, except that no '/'
195 #                                            look-ahead symbol is permitted.
196 #
197 expr-cont-no-slash:
198     escaped                 term                                    doExprCatOperator   # another term begins; char is not consumed, 'term' re-tests it
199     white_space          n  expr-cont-no-slash                                          # stay in this state so white space after '/' does not re-enable '/' or '{'
200     rule_char               term                                    doExprCatOperator
201     '['                     term                                    doExprCatOperator
202     '('                     term                                    doExprCatOperator
203     '$'                     term                                    doExprCatOperator
204     '.'                     term                                    doExprCatOperator
205     '|'                  n  term                                    doExprOrOperator    # alternation: consume '|', a term must follow
206     ')'                  n  pop                                     doExprRParen        # consume ')' and return to the state pushed for this expression
207     default                 pop                                     doExprFinished      # anything else (including a second '/') ends the expression here
208
209
210 #
211 #   tags             scanning a '{', the opening delimiter for a tag that identifies
212 #                    the kind of match.  Scan the whole {dddd} tag, where d=digit
213 #
214 tag-open:
215     white_space          n  tag-open                                                    # white space is allowed after the '{'
216     digit_char              tag-value                               doStartTagValue     # first digit is not consumed here; tag-value consumes every digit
217     default                 errorDeath                              doTagExpectedError  # '{' must be followed by a digit
218     
219 tag-value:
220     white_space          n  tag-close                                                   # white space ends the digits; only '}' (after more white space) may follow
221     '}'                     tag-close                                                   # '}' is left for tag-close to consume
222     digit_char           n  tag-value                               doTagDigit          # consume one digit of the tag value
223     default                 errorDeath                              doTagExpectedError
224     
225 tag-close:
226     white_space          n  tag-close                                                   # skip white space in front of the '}'
227     '}'                  n  expr-cont-no-tag                        doTagValue          # consume '}'; continue where a second tag is not listed
228     default                 errorDeath                              doTagExpectedError
229     
230     
231     
232 #
233 #  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
234 #                                            allowed, but not required.  Just like
235 #                                            expr-cont, above, except that no "{ddd}"
236 #                                            tagging is permitted.
237 #
238 expr-cont-no-tag:
239     escaped                 term                                    doExprCatOperator   # another term begins; char is not consumed, 'term' re-tests it
240     white_space          n  expr-cont-no-tag                                            # skip white space while staying in the no-tag state
241     rule_char               term                                    doExprCatOperator
242     '['                     term                                    doExprCatOperator
243     '('                     term                                    doExprCatOperator
244     '$'                     term                                    doExprCatOperator
245     '.'                     term                                    doExprCatOperator
246     '/'                     look-ahead                              doExprCatOperator   # '/' is left for the look-ahead state to consume
247     '|'                  n  term                                    doExprOrOperator    # alternation: consume '|', a term must follow
248     ')'                  n  pop                                     doExprRParen        # consume ')' and return to the state pushed for this expression
249     default                 pop                                     doExprFinished      # anything else (including '{', which has no row here) ends the expression
250     
251     
252
253
254 #
255 #   Variable Name Scanning.
256 #
257 #                    The state that branched to here must have pushed a return state
258 #                    to go to after completion of the variable name scanning.
259 #
260 #                    The current input character must be the $ that introduces the name.
261 #                    The $ is consumed here rather than in the state that first detected it
262 #                    so that the doStartVariableName action only needs to happen in one
263 #                    place (here), and the other states don't need to worry about it.
264 #
265 scan-var-name:
266    '$'                  n scan-var-start                            doStartVariableName   # consume the '$' that introduces the name
267    default                errorDeath                                                      # NOTE(review): no error action on this row; every state in this file that branches here does so on '$'
268
269
270 scan-var-start:
271     name_start_char      n scan-var-body                                                      # first character of the variable name
272     default                errorDeath                               doVariableNameExpectedErr   # '$' was not followed by a name-start character
273     
274 scan-var-body:
275     name_char            n scan-var-body                                                # consume the remaining characters of the name
276     default                pop                                      doEndVariableName   # name ended; return to the pushed state without consuming this char
277     
278     
279     
280 #
281 #  scan-unicode-set   Unicode Sets are parsed by the UnicodeSet class.
282 #                     Within the RBBI parser, after finding the first character
283 #                     of a Unicode Set, we just hand the rule input at that
284 #                     point off to the Unicode Set constructor, then pick
285 #                     up parsing after the close of the set.
286 #
287 #                     The action for this state invokes the UnicodeSet parser.
288 #
289 scan-unicode-set:
290     '['                   n pop                                      doScanUnicodeSet   # return to the pushed state once the set has been scanned
291     'p'                   n pop                                      doScanUnicodeSet   # NOTE(review): in this file only the '[' row of 'term' leads here; how 'p' / 'P' would arrive is not visible (presumably \p{...} / \P{...}) - confirm
292     'P'                   n pop                                      doScanUnicodeSet
293     default                 errorDeath                                                  # NOTE(review): no error action on this row
294     
295     
296
297
298
299
300
301 #
302 #  assign-or-rule.   A $variable was encountered at the start of something, could be
303 #                    either an assignment statement or a rule, depending on whether an '='
304 #                    follows the variable name.  We get to this state when the variable name
305 #                    scanning does a return.
306 #
307 assign-or-rule:
308     white_space          n assign-or-rule                                                   # white space may separate the variable name from '='
309     '='                  n term                  ^assign-end        doStartAssign   # variable was target of assignment
310     default                term-var-ref          ^break-rule-end                    # variable was a term in a rule
311
312
313
314 #
315 #  assign-end        This state is entered when the end of the expression on the
316 #                    right hand side of an assignment is found.  We get here via
317 #                    a pop; this state is pushed when the '=' in an assignment is found.
318 #
319 #                    The only thing allowed at this point is a ';'.  The RHS of an
320 #                    assignment must look like a rule expression, and we come here
321 #                    when what is being scanned no longer looks like an expression.
322 #
323 assign-end:
324     ';'                  n start                                    doEndAssign             # ';' completes the assignment; go back to scan the next statement
325     default                errorDeath                               doRuleErrorAssignExpr   # the right hand side did not end at a ';'
326     
327     
328     
329 #
330 # errorDeath.   This state is specified as the next state whenever a syntax error
331 #               in the source rules is detected.  Barring bugs, the state machine will never
332 #               actually get here, but will stop because of the action associated with the error.
333 #               But, just in case, this state asks the state machine to exit.
334 errorDeath:
335     default              n errorDeath                               doExit          # matches any input: consume it, stay in this state and request exit
336
337