source/common/rbbirpt.txt

   1
   2 #*****************************************************************************
   3 #
   4 #   Copyright (C) 2016 and later: Unicode, Inc. and others.
   5 #   License & terms of use: http://www.unicode.org/copyright.html#License
   6 #
   7 #*****************************************************************************
   8 #*****************************************************************************
   9 #
  10 #   Copyright (C) 2002-2016, International Business Machines Corporation and others.
  11 #   All Rights Reserved.
  12 #
  13 #*****************************************************************************
  14 #
  15 #  file:  rbbirpt.txt
  16 #  ICU Break Iterator Rule Parser State Table
  17 #
   18 #     This state table is used when reading and parsing a set of RBBI rules.
  19 #     The rule parser uses a state machine; the data in this file define the
  20 #     state transitions that occur for each input character.
  21 #
  22 #     *** This file defines the RBBI rule grammar.   This is it.
  23 #     *** The determination of what is accepted is here.
  24 #
  25 #     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
  26 #     that are then built with the rule parser.
  27 #
  28 #    perl rbbicst.pl    < rbbirpt.txt > rbbirpt.h
  29
  30 #
  31 # Here is the syntax of the state definitions in this file:
  32 #
  33 #
  34 #StateName:
  35 #   input-char           n next-state           ^push-state     action
  36 #   input-char           n next-state           ^push-state     action
  37 #       |                |   |                      |             |
  38 #       |                |   |                      |             |--- action to be performed by state machine
  39 #       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
  40 #       |                |   |                      |
  41 #       |                |   |                      |--- Push this named state onto the state stack.
  42 #       |                |   |                           Later, when next state is specified as "pop",
  43 #       |                |   |                           the pushed state will become the current state.
  44 #       |                |   |
  45 #       |                |   |--- Transition to this state if the current input character matches the input
  46 #       |                |        character or char class in the left hand column.  "pop" causes the next
  47 #       |                |        state to be popped from the state stack.
  48 #       |                |
  49 #       |                |--- When making the state transition specified on this line, advance to the next
  50 #       |                     character from the input only if 'n' appears here.
  51 #       |
  52 #       |--- Character or named character classes to test for.  If the current character being scanned
   53 #            matches, perform the actions and go to the state specified on this line.
   54 #            The input character is tested sequentially, in the order written.  The characters and
  55 #            character classes tested for do not need to be mutually exclusive.  The first match wins.
  56 #
  57
  58
  59
  60
   61 #
   62 #  start:  The scan position is at the beginning of the rules file, or in between two rules.
   63 #
   64 start:
   65     escaped                term                  ^break-rule-end    doExprStart    # char is left for 'term' to consume
   66     white_space          n start                                                  # skip white space between rules
   67     '^'                  n start-after-caret     ^break-rule-end    doNoChain      # consume '^'; the return state is pushed here
   68     '$'                    scan-var-name         ^assign-or-rule    doExprStart    # '$' is left for scan-var-name, which pops to assign-or-rule
   69     '!'                  n rev-option                                             # consume '!'; rev-option decides what it introduces
   70     ';'                  n start                                                  # ignore empty rules.
   71     eof                    exit                                                   # end of input; 'exit' presumably stops the state machine
   72     default                term                  ^break-rule-end    doExprStart    # anything else is handed to 'term' unconsumed
  73
   74 #
   75 #  break-rule-end:  Returned to (by a pop) after scanning a break-rule expression.
   76 #                   Only optional white space followed by ';' is accepted here.
   77 break-rule-end:
   78     ';'                  n start                                    doEndOfRule    # consume ';' and go back to 'start'
   79     white_space          n break-rule-end                                          # skip white space before the ';'
   80     default                errorDeath                               doRuleError    # anything else is a syntax error
  81
   82 #
   83 # start-after-caret:  start of a rule, after having seen a '^' (inhibits rule chain in).
   84 #     Similar to the main 'start' state in most respects, except
   85 #          - an empty rule, or end of input, is an error.
   86 #          - A second '^' is an error.
   87 #          - no ^break-rule-end push is needed; 'start' pushed it when it saw the '^'.
   88 start-after-caret:
   89     escaped                term                                     doExprStart
   90     white_space          n start-after-caret
   91     '^'                    errorDeath                               doRuleError    # two '^'s
   92     '$'                    scan-var-name         ^term-var-ref      doExprStart    # pops to term-var-ref, not assign-or-rule
   93     ';'                    errorDeath                               doRuleError    # ^ ;
   94     eof                    errorDeath                               doRuleError    # input ended right after '^'
   95     default                term                                     doExprStart
  96
   97 #
   98 #   rev-option:     We've just scanned a '!', indicating either a !!keyword option or a
   99 #                   !reverse rule.
  100 #
  101 rev-option:
  102     '!'                  n option-scan1                                            # second '!': an option name follows
  103     default                reverse-rule           ^break-rule-end   doReverseDir   # single '!': current char is not consumed
 104
  105 option-scan1:
  106     name_start_char      n option-scan2                             doOptionStart  # reached after "!!"; consume the first char of the name
  107     default                errorDeath                               doRuleError    # "!!" not followed by a name
 108
  109 option-scan2:
  110     name_char            n option-scan2                                            # consume the remaining name chars
  111     default                option-scan3                             doOptionEnd    # first non-name char ends the name; it is not consumed
 112
  113 option-scan3:
  114     ';'                  n start                                                   # consume the terminating ';' and return to 'start'
  115     white_space          n option-scan3                                            # skip white space between the name and ';'
  116     default                errorDeath                               doRuleError    # anything else after the option name is an error
 117
 118
  119 reverse-rule:
  120     default                term                   ^break-rule-end   doExprStart    # unconditional: push the return state, hand the current char to 'term'
 121
 122
  123 #
  124 #  term.  Eat through a single rule character, or a composite thing, which
  125 #         could be a parenthesized expression, a variable name, a Unicode Set, or '.'.
  126 #
  127 term:
  128     escaped              n expr-mod                                 doRuleChar
  129     white_space          n term                                                    # skip white space before the term
  130     rule_char            n expr-mod                                 doRuleChar
  131     '['                    scan-unicode-set      ^expr-mod                         # '[' is left for scan-unicode-set; it pops to expr-mod
  132     '('                  n term                  ^expr-mod          doLParen       # consume '('; expr-mod is pushed as the return state
  133     '$'                    scan-var-name         ^term-var-ref                     # '$' is left for scan-var-name; it pops to term-var-ref
  134     '.'                  n expr-mod                                 doDotAny
  135     default                errorDeath                               doRuleError    # nothing that can start a term
 136
 137
 138
  139 #
  140 #  term-var-ref   We've just finished scanning a reference to a $variable.
  141 #                 Check that the variable was defined.
  142 #                 The variable name scanning is in common with assignment statements,
  143 #                 so the check can't be done there.
  144 term-var-ref:
  145     default                expr-mod                                 doCheckVarDef  # unconditional; no input is consumed
 146
 147
  148 #
  149 #   expr-mod      We've just finished scanning a term, now look for the optional
  150 #                 trailing '*', '?', '+'.  Every path leads on to expr-cont.
  151 #
  152 expr-mod:
  153     white_space          n  expr-mod                                                # white space may separate a term from its operator
  154     '*'                  n  expr-cont                               doUnaryOpStar
  155     '+'                  n  expr-cont                               doUnaryOpPlus
  156     '?'                  n  expr-cont                               doUnaryOpQuestion
  157     default                 expr-cont                                               # no operator; current char is left for expr-cont
 158
 159
  160 #
  161 #  expr-cont      Expression, continuation.  At a point where additional terms are
  162 #                                            allowed, but not required.
  163 #
  164 expr-cont:
  165     escaped                 term                                    doExprCatOperator
  166     white_space          n  expr-cont
  167     rule_char               term                                    doExprCatOperator
  168     '['                     term                                    doExprCatOperator
  169     '('                     term                                    doExprCatOperator
  170     '$'                     term                                    doExprCatOperator
  171     '.'                     term                                    doExprCatOperator
  172     '/'                     look-ahead                              doExprCatOperator  # '/' is left for look-ahead to consume
  173     '{'                  n  tag-open                                doExprCatOperator  # consume '{'; a {ddd} tag follows
  174     '|'                  n  term                                    doExprOrOperator
  175     ')'                  n  pop                                     doExprRParen       # consume ')'; return to the pushed state
  176     default                 pop                                     doExprFinished     # not part of the expression; left for the popped state
 177
 178
  179 #
  180 #   look-ahead    Scanning a '/', which identifies a break point, assuming that the
  181 #                 remainder of the expression matches.
  182 #
  183 #                 Generate a parse tree as if this was a special kind of input symbol
  184 #                 appearing in an otherwise normal concatenation expression.
  185 #
  186 look-ahead:
  187     '/'                   n expr-cont-no-slash                      doSlash        # consume the '/'
  188     default                 errorDeath                                             # no error action; states in this file only branch here on '/'
 189
 190
 191 #
 192 #  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
 193 #                                            allowed, but not required.  Just like
 194 #                                            expr-cont, above, except that no '/'
 195 #                                            look-ahead symbol is permitted.
 196 #
 197 expr-cont-no-slash:
 198     escaped                 term                                    doExprCatOperator
 199     white_space          n  expr-cont
 200     rule_char               term                                    doExprCatOperator
 201     '['                     term                                    doExprCatOperator
 202     '('                     term                                    doExprCatOperator
 203     '$'                     term                                    doExprCatOperator
 204     '.'                     term                                    doExprCatOperator
 205     '|'                  n  term                                    doExprOrOperator
 206     ')'                  n  pop                                     doExprRParen
 207     default                 pop                                     doExprFinished
 208
 209
  210 #
  211 #   tag-open         We've just scanned a '{', the opening delimiter for a tag that identifies
  212 #                    the kind of match.  Scan the whole {dddd} tag, where d=digit
  213 #
  214 tag-open:
  215     white_space          n  tag-open                                                # white space is allowed after the '{'
  216     digit_char              tag-value                               doStartTagValue # first digit is not consumed; tag-value handles it
  217     default                 errorDeath                              doTagExpectedError
 218
  219 tag-value:
  220     white_space          n  tag-close                                               # white space ends the digits
  221     '}'                     tag-close                                               # '}' is left for tag-close to consume
  222     digit_char           n  tag-value                               doTagDigit      # consume one digit of the tag value
  223     default                 errorDeath                              doTagExpectedError
 224
  225 tag-close:
  226     white_space          n  tag-close                                               # skip white space before the '}'
  227     '}'                  n  expr-cont-no-tag                        doTagValue      # consume '}'; the tag is complete
  228     default                 errorDeath                              doTagExpectedError
 229
 230
 231
  232 #
  233 #  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
  234 #                                            allowed, but not required.  Just like
  235 #                                            expr-cont, above, except that no "{ddd}"
  236 #                                            tagging is permitted.  Entered after a tag's '}'.
  237 #
  238 expr-cont-no-tag:
  239     escaped                 term                                    doExprCatOperator
  240     white_space          n  expr-cont-no-tag                                        # white space keeps the no-tag restriction
  241     rule_char               term                                    doExprCatOperator
  242     '['                     term                                    doExprCatOperator
  243     '('                     term                                    doExprCatOperator
  244     '$'                     term                                    doExprCatOperator
  245     '.'                     term                                    doExprCatOperator
  246     '/'                     look-ahead                              doExprCatOperator  # '/' is left for look-ahead to consume
  247     '|'                  n  term                                    doExprOrOperator
  248     ')'                  n  pop                                     doExprRParen       # consume ')'; return to the pushed state
  249     default                 pop                                     doExprFinished     # not part of the expression; left for the popped state
 250
 251
 252
 253
  254 #
  255 #   Variable Name Scanning.
  256 #
  257 #                    The state that branched to here must have pushed a return state
  258 #                    to go to after completion of the variable name scanning.
  259 #
  260 #                    The current input character must be the $ that introduces the name.
  261 #                    The $ is consumed here rather than in the state that first detected it
  262 #                    so that the doStartVariableName action only needs to happen in one
  263 #                    place (here), and the other states don't need to worry about it.
  264 #
  265 scan-var-name:
  266    '$'                  n scan-var-start                            doStartVariableName
  267    default                errorDeath                                                   # no error action; states in this file only branch here on '$'
 268
 269
  270 scan-var-start:
  271     name_start_char      n scan-var-body                                           # consume the first char of the variable name
  272     default                errorDeath                               doVariableNameExpectedErr   # '$' not followed by a name
 273
  274 scan-var-body:
  275     name_char            n scan-var-body                                           # consume the rest of the variable name
  276     default                pop                                      doEndVariableName   # name ended; char is left for the pushed return state
 277
 278
 279
  280 #
  281 #  scan-unicode-set   Unicode Sets are parsed by the UnicodeSet class.
  282 #                     Within the RBBI parser, after finding the first character
  283 #                     of a Unicode Set, we just hand the rule input at that
  284 #                     point off to the Unicode Set constructor, then pick
  285 #                     up parsing after the close of the set.
  286 #
  287 #                     The action for this state invokes the UnicodeSet parser.
  288 #
  289 scan-unicode-set:
  290     '['                   n pop                                      doScanUnicodeSet   # within this file, 'term' branches here only on '['
  291     'p'                   n pop                                      doScanUnicodeSet   # NOTE(review): presumably for \p{...} sets - confirm
  292     'P'                   n pop                                      doScanUnicodeSet   # NOTE(review): presumably for \P{...} sets - confirm
  293     default                 errorDeath                                                  # no error action is attached to this transition
 294
 295
 296
 297
 298
 299
 300
  301 #
  302 #  assign-or-rule.   A $variable was encountered at the start of something, could be
  303 #                    either an assignment statement or a rule, depending on whether an '='
  304 #                    follows the variable name.  We get to this state when the variable name
  305 #                    scanning does a return (pop); 'start' pushes this state when it sees '$'.
  306 #
  307 assign-or-rule:
  308     white_space          n assign-or-rule
  309     '='                  n term                  ^assign-end        doStartAssign   # variable was target of assignment
  310     default                term-var-ref          ^break-rule-end                    # variable was a term in a rule
 311
 312
 313
  314 #
  315 #  assign-end        This state is entered when the end of the expression on the
  316 #                    right hand side of an assignment is found.  We get here via
  317 #                    a pop; this state is pushed when the '=' in an assignment is found.
  318 #
  319 #                    The only thing allowed at this point is a ';'.  The RHS of an
  320 #                    assignment must look like a rule expression, and we come here
  321 #                    when what is being scanned no longer looks like an expression.
  322 #                    Note that, unlike break-rule-end, there is no white_space row here.
  323 assign-end:
  324     ';'                  n start                                    doEndAssign     # consume ';' and go back to 'start'
  325     default                errorDeath                               doRuleErrorAssignExpr
 326
 327
 328
  329 #
  330 # errorDeath.   This state is specified as the next state whenever a syntax error
  331 #               in the source rules is detected.  Barring bugs, the state machine will never
  332 #               actually get here, but will stop because of the action associated with the error.
  333 #               But, just in case, this state asks the state machine to exit.
  334 errorDeath:
  335     default              n errorDeath                               doExit          # matches any input; consumes it and stays in this state
 336
 337