source/i18n/nfrule.cpp

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 ******************************************************************************
   5 *   Copyright (C) 1997-2015, International Business Machines
   6 *   Corporation and others.  All Rights Reserved.
   7 ******************************************************************************
   8 *   file name:  nfrule.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 * Modification history
  14 * Date        Name      Comments
  15 * 10/11/2001  Doug      Ported from ICU4J
  16 */
  17
  18 #include "nfrule.h"
  19
  20 #if U_HAVE_RBNF
  21
  22 #include "unicode/localpointer.h"
  23 #include "unicode/rbnf.h"
  24 #include "unicode/tblcoll.h"
  25 #include "unicode/plurfmt.h"
  26 #include "unicode/upluralrules.h"
  27 #include "unicode/coleitr.h"
  28 #include "unicode/uchar.h"
  29 #include "nfrs.h"
  30 #include "nfrlist.h"
  31 #include "nfsubs.h"
  32 #include "patternprops.h"
  33
  34 U_NAMESPACE_BEGIN
  35
  36 NFRule::NFRule(const RuleBasedNumberFormat* _rbnf, const UnicodeString &_ruleText, UErrorCode &status)
  37   : baseValue((int32_t)0)
  38   , radix(10)
  39   , exponent(0)
  40   , decimalPoint(0)
  41   , ruleText(_ruleText)
  42   , sub1(NULL)
  43   , sub2(NULL)
  44   , formatter(_rbnf)
  45   , rulePatternFormat(NULL)
  46 {
  47     if (!ruleText.isEmpty()) {
  48         parseRuleDescriptor(ruleText, status);
  49     }
  50 }
  51
  52 NFRule::~NFRule()
  53 {
  54     if (sub1 != sub2) {
  55         delete sub2;
  56         sub2 = NULL;
  57     }
  58     delete sub1;
  59     sub1 = NULL;
  60     delete rulePatternFormat;
  61     rulePatternFormat = NULL;
  62 }
  63
  64 static const UChar gLeftBracket = 0x005b;
  65 static const UChar gRightBracket = 0x005d;
  66 static const UChar gColon = 0x003a;
  67 static const UChar gZero = 0x0030;
  68 static const UChar gNine = 0x0039;
  69 static const UChar gSpace = 0x0020;
  70 static const UChar gSlash = 0x002f;
  71 static const UChar gGreaterThan = 0x003e;
  72 static const UChar gLessThan = 0x003c;
  73 static const UChar gComma = 0x002c;
  74 static const UChar gDot = 0x002e;
  75 static const UChar gTick = 0x0027;
  76 //static const UChar gMinus = 0x002d;
  77 static const UChar gSemicolon = 0x003b;
  78 static const UChar gX = 0x0078;
  79
  80 static const UChar gMinusX[] =                  {0x2D, 0x78, 0};    /* "-x" */
  81 static const UChar gInf[] =                     {0x49, 0x6E, 0x66, 0}; /* "Inf" */
  82 static const UChar gNaN[] =                     {0x4E, 0x61, 0x4E, 0}; /* "NaN" */
  83
  84 static const UChar gDollarOpenParenthesis[] =   {0x24, 0x28, 0}; /* "$(" */
  85 static const UChar gClosedParenthesisDollar[] = {0x29, 0x24, 0}; /* ")$" */
  86
  87 static const UChar gLessLess[] =                {0x3C, 0x3C, 0};    /* "<<" */
  88 static const UChar gLessPercent[] =             {0x3C, 0x25, 0};    /* "<%" */
  89 static const UChar gLessHash[] =                {0x3C, 0x23, 0};    /* "<#" */
  90 static const UChar gLessZero[] =                {0x3C, 0x30, 0};    /* "<0" */
  91 static const UChar gGreaterGreater[] =          {0x3E, 0x3E, 0};    /* ">>" */
  92 static const UChar gGreaterPercent[] =          {0x3E, 0x25, 0};    /* ">%" */
  93 static const UChar gGreaterHash[] =             {0x3E, 0x23, 0};    /* ">#" */
  94 static const UChar gGreaterZero[] =             {0x3E, 0x30, 0};    /* ">0" */
  95 static const UChar gEqualPercent[] =            {0x3D, 0x25, 0};    /* "=%" */
  96 static const UChar gEqualHash[] =               {0x3D, 0x23, 0};    /* "=#" */
  97 static const UChar gEqualZero[] =               {0x3D, 0x30, 0};    /* "=0" */
  98 static const UChar gGreaterGreaterGreater[] =   {0x3E, 0x3E, 0x3E, 0}; /* ">>>" */
  99
 100 static const UChar * const RULE_PREFIXES[] = {
 101     gLessLess, gLessPercent, gLessHash, gLessZero,
 102     gGreaterGreater, gGreaterPercent,gGreaterHash, gGreaterZero,
 103     gEqualPercent, gEqualHash, gEqualZero, NULL
 104 };
 105
 106 void
 107 NFRule::makeRules(UnicodeString& description,
 108                   NFRuleSet *owner,
 109                   const NFRule *predecessor,
 110                   const RuleBasedNumberFormat *rbnf,
 111                   NFRuleList& rules,
 112                   UErrorCode& status)
 113 {
 114     // we know we're making at least one rule, so go ahead and
 115     // new it up and initialize its basevalue and divisor
 116     // (this also strips the rule descriptor, if any, off the
 117     // descripton string)
 118     NFRule* rule1 = new NFRule(rbnf, description, status);
 119     /* test for NULL */
 120     if (rule1 == 0) {
 121         status = U_MEMORY_ALLOCATION_ERROR;
 122         return;
 123     }
 124     description = rule1->ruleText;
 125
 126     // check the description to see whether there's text enclosed
 127     // in brackets
 128     int32_t brack1 = description.indexOf(gLeftBracket);
 129     int32_t brack2 = brack1 < 0 ? -1 : description.indexOf(gRightBracket);
 130
 131     // if the description doesn't contain a matched pair of brackets,
 132     // or if it's of a type that doesn't recognize bracketed text,
 133     // then leave the description alone, initialize the rule's
 134     // rule text and substitutions, and return that rule
 135     if (brack2 < 0 || brack1 > brack2
 136         || rule1->getType() == kProperFractionRule
 137         || rule1->getType() == kNegativeNumberRule
 138         || rule1->getType() == kInfinityRule
 139         || rule1->getType() == kNaNRule)
 140     {
 141         rule1->extractSubstitutions(owner, description, predecessor, status);
 142     }
 143     else {
 144         // if the description does contain a matched pair of brackets,
 145         // then it's really shorthand for two rules (with one exception)
 146         NFRule* rule2 = NULL;
 147         UnicodeString sbuf;
 148
 149         // we'll actually only split the rule into two rules if its
 150         // base value is an even multiple of its divisor (or it's one
 151         // of the special rules)
 152         if ((rule1->baseValue > 0
 153             && (rule1->baseValue % util64_pow(rule1->radix, rule1->exponent)) == 0)
 154             || rule1->getType() == kImproperFractionRule
 155             || rule1->getType() == kMasterRule) {
 156
 157             // if it passes that test, new up the second rule.  If the
 158             // rule set both rules will belong to is a fraction rule
 159             // set, they both have the same base value; otherwise,
 160             // increment the original rule's base value ("rule1" actually
 161             // goes SECOND in the rule set's rule list)
 162             rule2 = new NFRule(rbnf, UnicodeString(), status);
 163             /* test for NULL */
 164             if (rule2 == 0) {
 165                 status = U_MEMORY_ALLOCATION_ERROR;
 166                 return;
 167             }
 168             if (rule1->baseValue >= 0) {
 169                 rule2->baseValue = rule1->baseValue;
 170                 if (!owner->isFractionRuleSet()) {
 171                     ++rule1->baseValue;
 172                 }
 173             }
 174
 175             // if the description began with "x.x" and contains bracketed
 176             // text, it describes both the improper fraction rule and
 177             // the proper fraction rule
 178             else if (rule1->getType() == kImproperFractionRule) {
 179                 rule2->setType(kProperFractionRule);
 180             }
 181
 182             // if the description began with "x.0" and contains bracketed
 183             // text, it describes both the master rule and the
 184             // improper fraction rule
 185             else if (rule1->getType() == kMasterRule) {
 186                 rule2->baseValue = rule1->baseValue;
 187                 rule1->setType(kImproperFractionRule);
 188             }
 189
 190             // both rules have the same radix and exponent (i.e., the
 191             // same divisor)
 192             rule2->radix = rule1->radix;
 193             rule2->exponent = rule1->exponent;
 194
 195             // rule2's rule text omits the stuff in brackets: initalize
 196             // its rule text and substitutions accordingly
 197             sbuf.append(description, 0, brack1);
 198             if (brack2 + 1 < description.length()) {
 199                 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1);
 200             }
 201             rule2->extractSubstitutions(owner, sbuf, predecessor, status);
 202         }
 203
 204         // rule1's text includes the text in the brackets but omits
 205         // the brackets themselves: initialize _its_ rule text and
 206         // substitutions accordingly
 207         sbuf.setTo(description, 0, brack1);
 208         sbuf.append(description, brack1 + 1, brack2 - brack1 - 1);
 209         if (brack2 + 1 < description.length()) {
 210             sbuf.append(description, brack2 + 1, description.length() - brack2 - 1);
 211         }
 212         rule1->extractSubstitutions(owner, sbuf, predecessor, status);
 213
 214         // if we only have one rule, return it; if we have two, return
 215         // a two-element array containing them (notice that rule2 goes
 216         // BEFORE rule1 in the list: in all cases, rule2 OMITS the
 217         // material in the brackets and rule1 INCLUDES the material
 218         // in the brackets)
 219         if (rule2 != NULL) {
 220             if (rule2->baseValue >= kNoBase) {
 221                 rules.add(rule2);
 222             }
 223             else {
 224                 owner->setNonNumericalRule(rule2);
 225             }
 226         }
 227     }
 228     if (rule1->baseValue >= kNoBase) {
 229         rules.add(rule1);
 230     }
 231     else {
 232         owner->setNonNumericalRule(rule1);
 233     }
 234 }
 235
 236 /**
 237  * This function parses the rule's rule descriptor (i.e., the base
 238  * value and/or other tokens that precede the rule's rule text
 239  * in the description) and sets the rule's base value, radix, and
 240  * exponent according to the descriptor.  (If the description doesn't
 241  * include a rule descriptor, then this function sets everything to
 242  * default values and the rule set sets the rule's real base value).
 243  * @param description The rule's description
 244  * @return If "description" included a rule descriptor, this is
 245  * "description" with the descriptor and any trailing whitespace
 246  * stripped off.  Otherwise; it's "descriptor" unchangd.
 247  */
 248 void
 249 NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status)
 250 {
 251     // the description consists of a rule descriptor and a rule body,
 252     // separated by a colon.  The rule descriptor is optional.  If
 253     // it's omitted, just set the base value to 0.
 254     int32_t p = description.indexOf(gColon);
 255     if (p != -1) {
 256         // copy the descriptor out into its own string and strip it,
 257         // along with any trailing whitespace, out of the original
 258         // description
 259         UnicodeString descriptor;
 260         descriptor.setTo(description, 0, p);
 261
 262         ++p;
 263         while (p < description.length() && PatternProps::isWhiteSpace(description.charAt(p))) {
 264             ++p;
 265         }
 266         description.removeBetween(0, p);
 267
 268         // check first to see if the rule descriptor matches the token
 269         // for one of the special rules.  If it does, set the base
 270         // value to the correct identifier value
 271         int descriptorLength = descriptor.length();
 272         UChar firstChar = descriptor.charAt(0);
 273         UChar lastChar = descriptor.charAt(descriptorLength - 1);
 274         if (firstChar >= gZero && firstChar <= gNine && lastChar != gX) {
 275             // if the rule descriptor begins with a digit, it's a descriptor
 276             // for a normal rule
 277             // since we don't have Long.parseLong, and this isn't much work anyway,
 278             // just build up the value as we encounter the digits.
 279             int64_t val = 0;
 280             p = 0;
 281             UChar c = gSpace;
 282
 283             // begin parsing the descriptor: copy digits
 284             // into "tempValue", skip periods, commas, and spaces,
 285             // stop on a slash or > sign (or at the end of the string),
 286             // and throw an exception on any other character
 287             int64_t ll_10 = 10;
 288             while (p < descriptorLength) {
 289                 c = descriptor.charAt(p);
 290                 if (c >= gZero && c <= gNine) {
 291                     val = val * ll_10 + (int32_t)(c - gZero);
 292                 }
 293                 else if (c == gSlash || c == gGreaterThan) {
 294                     break;
 295                 }
 296                 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) {
 297                 }
 298                 else {
 299                     // throw new IllegalArgumentException("Illegal character in rule descriptor");
 300                     status = U_PARSE_ERROR;
 301                     return;
 302                 }
 303                 ++p;
 304             }
 305
 306             // we have the base value, so set it
 307             setBaseValue(val, status);
 308
 309             // if we stopped the previous loop on a slash, we're
 310             // now parsing the rule's radix.  Again, accumulate digits
 311             // in tempValue, skip punctuation, stop on a > mark, and
 312             // throw an exception on anything else
 313             if (c == gSlash) {
 314                 val = 0;
 315                 ++p;
 316                 int64_t ll_10 = 10;
 317                 while (p < descriptorLength) {
 318                     c = descriptor.charAt(p);
 319                     if (c >= gZero && c <= gNine) {
 320                         val = val * ll_10 + (int32_t)(c - gZero);
 321                     }
 322                     else if (c == gGreaterThan) {
 323                         break;
 324                     }
 325                     else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) {
 326                     }
 327                     else {
 328                         // throw new IllegalArgumentException("Illegal character is rule descriptor");
 329                         status = U_PARSE_ERROR;
 330                         return;
 331                     }
 332                     ++p;
 333                 }
 334
 335                 // tempValue now contain's the rule's radix.  Set it
 336                 // accordingly, and recalculate the rule's exponent
 337                 radix = (int32_t)val;
 338                 if (radix == 0) {
 339                     // throw new IllegalArgumentException("Rule can't have radix of 0");
 340                     status = U_PARSE_ERROR;
 341                 }
 342
 343                 exponent = expectedExponent();
 344             }
 345
 346             // if we stopped the previous loop on a > sign, then continue
 347             // for as long as we still see > signs.  For each one,
 348             // decrement the exponent (unless the exponent is already 0).
 349             // If we see another character before reaching the end of
 350             // the descriptor, that's also a syntax error.
 351             if (c == gGreaterThan) {
 352                 while (p < descriptor.length()) {
 353                     c = descriptor.charAt(p);
 354                     if (c == gGreaterThan && exponent > 0) {
 355                         --exponent;
 356                     } else {
 357                         // throw new IllegalArgumentException("Illegal character in rule descriptor");
 358                         status = U_PARSE_ERROR;
 359                         return;
 360                     }
 361                     ++p;
 362                 }
 363             }
 364         }
 365         else if (0 == descriptor.compare(gMinusX, 2)) {
 366             setType(kNegativeNumberRule);
 367         }
 368         else if (descriptorLength == 3) {
 369             if (firstChar == gZero && lastChar == gX) {
 370                 setBaseValue(kProperFractionRule, status);
 371                 decimalPoint = descriptor.charAt(1);
 372             }
 373             else if (firstChar == gX && lastChar == gX) {
 374                 setBaseValue(kImproperFractionRule, status);
 375                 decimalPoint = descriptor.charAt(1);
 376             }
 377             else if (firstChar == gX && lastChar == gZero) {
 378                 setBaseValue(kMasterRule, status);
 379                 decimalPoint = descriptor.charAt(1);
 380             }
 381             else if (descriptor.compare(gNaN, 3) == 0) {
 382                 setBaseValue(kNaNRule, status);
 383             }
 384             else if (descriptor.compare(gInf, 3) == 0) {
 385                 setBaseValue(kInfinityRule, status);
 386             }
 387         }
 388     }
 389     // else use the default base value for now.
 390
 391     // finally, if the rule body begins with an apostrophe, strip it off
 392     // (this is generally used to put whitespace at the beginning of
 393     // a rule's rule text)
 394     if (description.length() > 0 && description.charAt(0) == gTick) {
 395         description.removeBetween(0, 1);
 396     }
 397
 398     // return the description with all the stuff we've just waded through
 399     // stripped off the front.  It now contains just the rule body.
 400     // return description;
 401 }
 402
 403 /**
 404 * Searches the rule's rule text for the substitution tokens,
 405 * creates the substitutions, and removes the substitution tokens
 406 * from the rule's rule text.
 407 * @param owner The rule set containing this rule
 408 * @param predecessor The rule preseding this one in "owners" rule list
 409 * @param ownersOwner The RuleBasedFormat that owns this rule
 410 */
 411 void
 412 NFRule::extractSubstitutions(const NFRuleSet* ruleSet,
 413                              const UnicodeString &ruleText,
 414                              const NFRule* predecessor,
 415                              UErrorCode& status)
 416 {
 417     if (U_FAILURE(status)) {
 418         return;
 419     }
 420     this->ruleText = ruleText;
 421     sub1 = extractSubstitution(ruleSet, predecessor, status);
 422     if (sub1 == NULL) {
 423         // Small optimization. There is no need to create a redundant NullSubstitution.
 424         sub2 = NULL;
 425     }
 426     else {
 427         sub2 = extractSubstitution(ruleSet, predecessor, status);
 428     }
 429     int32_t pluralRuleStart = this->ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
 430     int32_t pluralRuleEnd = (pluralRuleStart >= 0 ? this->ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) : -1);
 431     if (pluralRuleEnd >= 0) {
 432         int32_t endType = this->ruleText.indexOf(gComma, pluralRuleStart);
 433         if (endType < 0) {
 434             status = U_PARSE_ERROR;
 435             return;
 436         }
 437         UnicodeString type(this->ruleText.tempSubString(pluralRuleStart + 2, endType - pluralRuleStart - 2));
 438         UPluralType pluralType;
 439         if (type.startsWith(UNICODE_STRING_SIMPLE("cardinal"))) {
 440             pluralType = UPLURAL_TYPE_CARDINAL;
 441         }
 442         else if (type.startsWith(UNICODE_STRING_SIMPLE("ordinal"))) {
 443             pluralType = UPLURAL_TYPE_ORDINAL;
 444         }
 445         else {
 446             status = U_ILLEGAL_ARGUMENT_ERROR;
 447             return;
 448         }
 449         rulePatternFormat = formatter->createPluralFormat(pluralType,
 450                 this->ruleText.tempSubString(endType + 1, pluralRuleEnd - endType - 1), status);
 451     }
 452 }
 453
 454 /**
 455 * Searches the rule's rule text for the first substitution token,
 456 * creates a substitution based on it, and removes the token from
 457 * the rule's rule text.
 458 * @param owner The rule set containing this rule
 459 * @param predecessor The rule preceding this one in the rule set's
 460 * rule list
 461 * @param ownersOwner The RuleBasedNumberFormat that owns this rule
 462 * @return The newly-created substitution.  This is never null; if
 463 * the rule text doesn't contain any substitution tokens, this will
 464 * be a NullSubstitution.
 465 */
 466 NFSubstitution *
 467 NFRule::extractSubstitution(const NFRuleSet* ruleSet,
 468                             const NFRule* predecessor,
 469                             UErrorCode& status)
 470 {
 471     NFSubstitution* result = NULL;
 472
 473     // search the rule's rule text for the first two characters of
 474     // a substitution token
 475     int32_t subStart = indexOfAnyRulePrefix();
 476     int32_t subEnd = subStart;
 477
 478     // if we didn't find one, create a null substitution positioned
 479     // at the end of the rule text
 480     if (subStart == -1) {
 481         return NULL;
 482     }
 483
 484     // special-case the ">>>" token, since searching for the > at the
 485     // end will actually find the > in the middle
 486     if (ruleText.indexOf(gGreaterGreaterGreater, 3, 0) == subStart) {
 487         subEnd = subStart + 2;
 488
 489         // otherwise the substitution token ends with the same character
 490         // it began with
 491     } else {
 492         UChar c = ruleText.charAt(subStart);
 493         subEnd = ruleText.indexOf(c, subStart + 1);
 494         // special case for '<%foo<<'
 495         if (c == gLessThan && subEnd != -1 && subEnd < ruleText.length() - 1 && ruleText.charAt(subEnd+1) == c) {
 496             // ordinals use "=#,##0==%abbrev=" as their rule.  Notice that the '==' in the middle
 497             // occurs because of the juxtaposition of two different rules.  The check for '<' is a hack
 498             // to get around this.  Having the duplicate at the front would cause problems with
 499             // rules like "<<%" to format, say, percents...
 500             ++subEnd;
 501         }
 502    }
 503
 504     // if we don't find the end of the token (i.e., if we're on a single,
 505     // unmatched token character), create a null substitution positioned
 506     // at the end of the rule
 507     if (subEnd == -1) {
 508         return NULL;
 509     }
 510
 511     // if we get here, we have a real substitution token (or at least
 512     // some text bounded by substitution token characters).  Use
 513     // makeSubstitution() to create the right kind of substitution
 514     UnicodeString subToken;
 515     subToken.setTo(ruleText, subStart, subEnd + 1 - subStart);
 516     result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleSet,
 517         this->formatter, subToken, status);
 518
 519     // remove the substitution from the rule text
 520     ruleText.removeBetween(subStart, subEnd+1);
 521
 522     return result;
 523 }
 524
 525 /**
 526  * Sets the rule's base value, and causes the radix and exponent
 527  * to be recalculated.  This is used during construction when we
 528  * don't know the rule's base value until after it's been
 529  * constructed.  It should be used at any other time.
 530  * @param The new base value for the rule.
 531  */
 532 void
 533 NFRule::setBaseValue(int64_t newBaseValue, UErrorCode& status)
 534 {
 535     // set the base value
 536     baseValue = newBaseValue;
 537     radix = 10;
 538
 539     // if this isn't a special rule, recalculate the radix and exponent
 540     // (the radix always defaults to 10; if it's supposed to be something
 541     // else, it's cleaned up by the caller and the exponent is
 542     // recalculated again-- the only function that does this is
 543     // NFRule.parseRuleDescriptor() )
 544     if (baseValue >= 1) {
 545         exponent = expectedExponent();
 546
 547         // this function gets called on a fully-constructed rule whose
 548         // description didn't specify a base value.  This means it
 549         // has substitutions, and some substitutions hold on to copies
 550         // of the rule's divisor.  Fix their copies of the divisor.
 551         if (sub1 != NULL) {
 552             sub1->setDivisor(radix, exponent, status);
 553         }
 554         if (sub2 != NULL) {
 555             sub2->setDivisor(radix, exponent, status);
 556         }
 557
 558         // if this is a special rule, its radix and exponent are basically
 559         // ignored.  Set them to "safe" default values
 560     } else {
 561         exponent = 0;
 562     }
 563 }
 564
 565 /**
 566 * This calculates the rule's exponent based on its radix and base
 567 * value.  This will be the highest power the radix can be raised to
 568 * and still produce a result less than or equal to the base value.
 569 */
 570 int16_t
 571 NFRule::expectedExponent() const
 572 {
 573     // since the log of 0, or the log base 0 of something, causes an
 574     // error, declare the exponent in these cases to be 0 (we also
 575     // deal with the special-rule identifiers here)
 576     if (radix == 0 || baseValue < 1) {
 577         return 0;
 578     }
 579
 580     // we get rounding error in some cases-- for example, log 1000 / log 10
 581     // gives us 1.9999999996 instead of 2.  The extra logic here is to take
 582     // that into account
 583     int16_t tempResult = (int16_t)(uprv_log((double)baseValue) / uprv_log((double)radix));
 584     int64_t temp = util64_pow(radix, tempResult + 1);
 585     if (temp <= baseValue) {
 586         tempResult += 1;
 587     }
 588     return tempResult;
 589 }
 590
 591 /**
 592  * Searches the rule's rule text for any of the specified strings.
 593  * @return The index of the first match in the rule's rule text
 594  * (i.e., the first substring in the rule's rule text that matches
 595  * _any_ of the strings in "strings").  If none of the strings in
 596  * "strings" is found in the rule's rule text, returns -1.
 597  */
 598 int32_t
 599 NFRule::indexOfAnyRulePrefix() const
 600 {
 601     int result = -1;
 602     for (int i = 0; RULE_PREFIXES[i]; i++) {
 603         int32_t pos = ruleText.indexOf(*RULE_PREFIXES[i]);
 604         if (pos != -1 && (result == -1 || pos < result)) {
 605             result = pos;
 606         }
 607     }
 608     return result;
 609 }
 610
 611 //-----------------------------------------------------------------------
 612 // boilerplate
 613 //-----------------------------------------------------------------------
 614
 615 static UBool
 616 util_equalSubstitutions(const NFSubstitution* sub1, const NFSubstitution* sub2)
 617 {
 618     if (sub1) {
 619         if (sub2) {
 620             return *sub1 == *sub2;
 621         }
 622     } else if (!sub2) {
 623         return TRUE;
 624     }
 625     return FALSE;
 626 }
 627
 628 /**
 629 * Tests two rules for equality.
 630 * @param that The rule to compare this one against
 631 * @return True is the two rules are functionally equivalent
 632 */
 633 UBool
 634 NFRule::operator==(const NFRule& rhs) const
 635 {
 636     return baseValue == rhs.baseValue
 637         && radix == rhs.radix
 638         && exponent == rhs.exponent
 639         && ruleText == rhs.ruleText
 640         && util_equalSubstitutions(sub1, rhs.sub1)
 641         && util_equalSubstitutions(sub2, rhs.sub2);
 642 }
 643
 644 /**
 645 * Returns a textual representation of the rule.  This won't
 646 * necessarily be the same as the description that this rule
 647 * was created with, but it will produce the same result.
 648 * @return A textual description of the rule
 649 */
 650 static void util_append64(UnicodeString& result, int64_t n)
 651 {
 652     UChar buffer[256];
 653     int32_t len = util64_tou(n, buffer, sizeof(buffer));
 654     UnicodeString temp(buffer, len);
 655     result.append(temp);
 656 }
 657
 658 void
 659 NFRule::_appendRuleText(UnicodeString& result) const
 660 {
 661     switch (getType()) {
 662     case kNegativeNumberRule: result.append(gMinusX, 2); break;
 663     case kImproperFractionRule: result.append(gX).append(decimalPoint == 0 ? gDot : decimalPoint).append(gX); break;
 664     case kProperFractionRule: result.append(gZero).append(decimalPoint == 0 ? gDot : decimalPoint).append(gX); break;
 665     case kMasterRule: result.append(gX).append(decimalPoint == 0 ? gDot : decimalPoint).append(gZero); break;
 666     case kInfinityRule: result.append(gInf, 3); break;
 667     case kNaNRule: result.append(gNaN, 3); break;
 668     default:
 669         // for a normal rule, write out its base value, and if the radix is
 670         // something other than 10, write out the radix (with the preceding
 671         // slash, of course).  Then calculate the expected exponent and if
 672         // if isn't the same as the actual exponent, write an appropriate
 673         // number of > signs.  Finally, terminate the whole thing with
 674         // a colon.
 675         util_append64(result, baseValue);
 676         if (radix != 10) {
 677             result.append(gSlash);
 678             util_append64(result, radix);
 679         }
 680         int numCarets = expectedExponent() - exponent;
 681         for (int i = 0; i < numCarets; i++) {
 682             result.append(gGreaterThan);
 683         }
 684         break;
 685     }
 686     result.append(gColon);
 687     result.append(gSpace);
 688
 689     // if the rule text begins with a space, write an apostrophe
 690     // (whitespace after the rule descriptor is ignored; the
 691     // apostrophe is used to make the whitespace significant)
 692     if (ruleText.charAt(0) == gSpace && (sub1 == NULL || sub1->getPos() != 0)) {
 693         result.append(gTick);
 694     }
 695
 696     // now, write the rule's rule text, inserting appropriate
 697     // substitution tokens in the appropriate places
 698     UnicodeString ruleTextCopy;
 699     ruleTextCopy.setTo(ruleText);
 700
 701     UnicodeString temp;
 702     if (sub2 != NULL) {
 703         sub2->toString(temp);
 704         ruleTextCopy.insert(sub2->getPos(), temp);
 705     }
 706     if (sub1 != NULL) {
 707         sub1->toString(temp);
 708         ruleTextCopy.insert(sub1->getPos(), temp);
 709     }
 710
 711     result.append(ruleTextCopy);
 712
 713     // and finally, top the whole thing off with a semicolon and
 714     // return the result
 715     result.append(gSemicolon);
 716 }
 717
 718 //-----------------------------------------------------------------------
 719 // formatting
 720 //-----------------------------------------------------------------------
 721
 722 /**
 723 * Formats the number, and inserts the resulting text into
 724 * toInsertInto.
 725 * @param number The number being formatted
 726 * @param toInsertInto The string where the resultant text should
 727 * be inserted
 728 * @param pos The position in toInsertInto where the resultant text
 729 * should be inserted
 730 */
 731 void
 732 NFRule::doFormat(int64_t number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const
 733 {
 734     // first, insert the rule's rule text into toInsertInto at the
 735     // specified position, then insert the results of the substitutions
 736     // into the right places in toInsertInto (notice we do the
 737     // substitutions in reverse order so that the offsets don't get
 738     // messed up)
 739     int32_t pluralRuleStart = ruleText.length();
 740     int32_t lengthOffset = 0;
 741     if (!rulePatternFormat) {
 742         toInsertInto.insert(pos, ruleText);
 743     }
 744     else {
 745         pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
 746         int pluralRuleEnd = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart);
 747         int initialLength = toInsertInto.length();
 748         if (pluralRuleEnd < ruleText.length() - 1) {
 749             toInsertInto.insert(pos, ruleText.tempSubString(pluralRuleEnd + 2));
 750         }
 751         toInsertInto.insert(pos,
 752             rulePatternFormat->format((int32_t)(number/uprv_pow(radix, exponent)), status));
 753         if (pluralRuleStart > 0) {
 754             toInsertInto.insert(pos, ruleText.tempSubString(0, pluralRuleStart));
 755         }
 756         lengthOffset = ruleText.length() - (toInsertInto.length() - initialLength);
 757     }
 758
 759     if (sub2 != NULL) {
 760         sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
 761     }
 762     if (sub1 != NULL) {
 763         sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
 764     }
 765 }
 766
 767 /**
 768 * Formats the number, and inserts the resulting text into
 769 * toInsertInto.
 770 * @param number The number being formatted
 771 * @param toInsertInto The string where the resultant text should
 772 * be inserted
 773 * @param pos The position in toInsertInto where the resultant text
 774 * should be inserted
 775 */
 776 void
 777 NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const
 778 {
 779     // first, insert the rule's rule text into toInsertInto at the
 780     // specified position, then insert the results of the substitutions
 781     // into the right places in toInsertInto
 782     // [again, we have two copies of this routine that do the same thing
 783     // so that we don't sacrifice precision in a long by casting it
 784     // to a double]
 785     int32_t pluralRuleStart = ruleText.length();
 786     int32_t lengthOffset = 0;
 787     if (!rulePatternFormat) {
 788         toInsertInto.insert(pos, ruleText);
 789     }
 790     else {
 791         pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
 792         int pluralRuleEnd = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart);
 793         int initialLength = toInsertInto.length();
 794         if (pluralRuleEnd < ruleText.length() - 1) {
 795             toInsertInto.insert(pos, ruleText.tempSubString(pluralRuleEnd + 2));
 796         }
 797         double pluralVal = number;
 798         if (0 <= pluralVal && pluralVal < 1) {
 799             // We're in a fractional rule, and we have to match the NumeratorSubstitution behavior.
 800             // 2.3 can become 0.2999999999999998 for the fraction due to rounding errors.
 801             pluralVal = uprv_round(pluralVal * uprv_pow(radix, exponent));
 802         }
 803         else {
 804             pluralVal = pluralVal / uprv_pow(radix, exponent);
 805         }
 806         toInsertInto.insert(pos, rulePatternFormat->format((int32_t)(pluralVal), status));
 807         if (pluralRuleStart > 0) {
 808             toInsertInto.insert(pos, ruleText.tempSubString(0, pluralRuleStart));
 809         }
 810         lengthOffset = ruleText.length() - (toInsertInto.length() - initialLength);
 811     }
 812
 813     if (sub2 != NULL) {
 814         sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
 815     }
 816     if (sub1 != NULL) {
 817         sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
 818     }
 819 }
 820
 821 /**
 822 * Used by the owning rule set to determine whether to invoke the
 823 * rollback rule (i.e., whether this rule or the one that precedes
 824 * it in the rule set's list should be used to format the number)
 825 * @param The number being formatted
 826 * @return True if the rule set should use the rule that precedes
 827 * this one in its list; false if it should use this rule
 828 */
 829 UBool
 830 NFRule::shouldRollBack(double number) const
 831 {
 832     // we roll back if the rule contains a modulus substitution,
 833     // the number being formatted is an even multiple of the rule's
 834     // divisor, and the rule's base value is NOT an even multiple
 835     // of its divisor
 836     // In other words, if the original description had
 837     //    100: << hundred[ >>];
 838     // that expands into
 839     //    100: << hundred;
 840     //    101: << hundred >>;
 841     // internally.  But when we're formatting 200, if we use the rule
 842     // at 101, which would normally apply, we get "two hundred zero".
 843     // To prevent this, we roll back and use the rule at 100 instead.
 844     // This is the logic that makes this happen: the rule at 101 has
 845     // a modulus substitution, its base value isn't an even multiple
 846     // of 100, and the value we're trying to format _is_ an even
 847     // multiple of 100.  This is called the "rollback rule."
 848     if ((sub1 != NULL && sub1->isModulusSubstitution()) || (sub2 != NULL && sub2->isModulusSubstitution())) {
 849         int64_t re = util64_pow(radix, exponent);
 850         return uprv_fmod(number, (double)re) == 0 && (baseValue % re) != 0;
 851     }
 852     return FALSE;
 853 }
 854
 855 //-----------------------------------------------------------------------
 856 // parsing
 857 //-----------------------------------------------------------------------
 858
/**
* Attempts to parse the string with this rule.
* @param text The string being parsed
* @param parsePosition On entry, the value is ignored and assumed to
* be 0. On exit, this has been updated with the position of the first
* character not consumed by matching the text against this rule
* (if this rule doesn't match the text at all, the parse position
* is left unchanged (presumably at 0) and resVal is set to 0).
* @param isFractionRule True if this rule is contained within a
* fraction rule set.  This is only used if the rule has no
* substitutions.
* @param upperBound Passed on to the substitutions when they parse
* their part of the text.
* @param resVal Receives the result.  If this rule matched the text,
* this is the rule's base value combined appropriately with the
* results of parsing the substitutions.  If nothing matched, this is 0
* and the parse position is left unchanged.
* @return Always TRUE.
*/
 877 #ifdef RBNF_DEBUG
 878 #include <stdio.h>
 879
 880 static void dumpUS(FILE* f, const UnicodeString& us) {
 881   int len = us.length();
 882   char* buf = (char *)uprv_malloc((len+1)*sizeof(char)); //new char[len+1];
 883   if (buf != NULL) {
 884           us.extract(0, len, buf);
 885           buf[len] = 0;
 886           fprintf(f, "%s", buf);
 887           uprv_free(buf); //delete[] buf;
 888   }
 889 }
 890 #endif
 891 UBool
 892 NFRule::doParse(const UnicodeString& text,
 893                 ParsePosition& parsePosition,
 894                 UBool isFractionRule,
 895                 double upperBound,
 896                 Formattable& resVal) const
 897 {
 898     // internally we operate on a copy of the string being parsed
 899     // (because we're going to change it) and use our own ParsePosition
 900     ParsePosition pp;
 901     UnicodeString workText(text);
 902
 903     int32_t sub1Pos = sub1 != NULL ? sub1->getPos() : ruleText.length();
 904     int32_t sub2Pos = sub2 != NULL ? sub2->getPos() : ruleText.length();
 905
 906     // check to see whether the text before the first substitution
 907     // matches the text at the beginning of the string being
 908     // parsed.  If it does, strip that off the front of workText;
 909     // otherwise, dump out with a mismatch
 910     UnicodeString prefix;
 911     prefix.setTo(ruleText, 0, sub1Pos);
 912
 913 #ifdef RBNF_DEBUG
 914     fprintf(stderr, "doParse %p ", this);
 915     {
 916         UnicodeString rt;
 917         _appendRuleText(rt);
 918         dumpUS(stderr, rt);
 919     }
 920
 921     fprintf(stderr, " text: '");
 922     dumpUS(stderr, text);
 923     fprintf(stderr, "' prefix: '");
 924     dumpUS(stderr, prefix);
 925 #endif
 926     stripPrefix(workText, prefix, pp);
 927     int32_t prefixLength = text.length() - workText.length();
 928
 929 #ifdef RBNF_DEBUG
 930     fprintf(stderr, "' pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), sub1Pos);
 931 #endif
 932
 933     if (pp.getIndex() == 0 && sub1Pos != 0) {
 934         // commented out because ParsePosition doesn't have error index in 1.1.x
 935         // restored for ICU4C port
 936         parsePosition.setErrorIndex(pp.getErrorIndex());
 937         resVal.setLong(0);
 938         return TRUE;
 939     }
 940     if (baseValue == kInfinityRule) {
 941         // If you match this, don't try to perform any calculations on it.
 942         parsePosition.setIndex(pp.getIndex());
 943         resVal.setDouble(uprv_getInfinity());
 944         return TRUE;
 945     }
 946     if (baseValue == kNaNRule) {
 947         // If you match this, don't try to perform any calculations on it.
 948         parsePosition.setIndex(pp.getIndex());
 949         resVal.setDouble(uprv_getNaN());
 950         return TRUE;
 951     }
 952
 953     // this is the fun part.  The basic guts of the rule-matching
 954     // logic is matchToDelimiter(), which is called twice.  The first
 955     // time it searches the input string for the rule text BETWEEN
 956     // the substitutions and tries to match the intervening text
 957     // in the input string with the first substitution.  If that
 958     // succeeds, it then calls it again, this time to look for the
 959     // rule text after the second substitution and to match the
 960     // intervening input text against the second substitution.
 961     //
 962     // For example, say we have a rule that looks like this:
 963     //    first << middle >> last;
 964     // and input text that looks like this:
 965     //    first one middle two last
 966     // First we use stripPrefix() to match "first " in both places and
 967     // strip it off the front, leaving
 968     //    one middle two last
 969     // Then we use matchToDelimiter() to match " middle " and try to
 970     // match "one" against a substitution.  If it's successful, we now
 971     // have
 972     //    two last
 973     // We use matchToDelimiter() a second time to match " last" and
 974     // try to match "two" against a substitution.  If "two" matches
 975     // the substitution, we have a successful parse.
 976     //
 977     // Since it's possible in many cases to find multiple instances
 978     // of each of these pieces of rule text in the input string,
 979     // we need to try all the possible combinations of these
 980     // locations.  This prevents us from prematurely declaring a mismatch,
 981     // and makes sure we match as much input text as we can.
 982     int highWaterMark = 0;
 983     double result = 0;
 984     int start = 0;
 985     double tempBaseValue = (double)(baseValue <= 0 ? 0 : baseValue);
 986
 987     UnicodeString temp;
 988     do {
 989         // our partial parse result starts out as this rule's base
 990         // value.  If it finds a successful match, matchToDelimiter()
 991         // will compose this in some way with what it gets back from
 992         // the substitution, giving us a new partial parse result
 993         pp.setIndex(0);
 994
 995         temp.setTo(ruleText, sub1Pos, sub2Pos - sub1Pos);
 996         double partialResult = matchToDelimiter(workText, start, tempBaseValue,
 997             temp, pp, sub1,
 998             upperBound);
 999
1000         // if we got a successful match (or were trying to match a
1001         // null substitution), pp is now pointing at the first unmatched
1002         // character.  Take note of that, and try matchToDelimiter()
1003         // on the input text again
1004         if (pp.getIndex() != 0 || sub1 == NULL) {
1005             start = pp.getIndex();
1006
1007             UnicodeString workText2;
1008             workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getIndex());
1009             ParsePosition pp2;
1010
1011             // the second matchToDelimiter() will compose our previous
1012             // partial result with whatever it gets back from its
1013             // substitution if there's a successful match, giving us
1014             // a real result
1015             temp.setTo(ruleText, sub2Pos, ruleText.length() - sub2Pos);
1016             partialResult = matchToDelimiter(workText2, 0, partialResult,
1017                 temp, pp2, sub2,
1018                 upperBound);
1019
1020             // if we got a successful match on this second
1021             // matchToDelimiter() call, update the high-water mark
1022             // and result (if necessary)
1023             if (pp2.getIndex() != 0 || sub2 == NULL) {
1024                 if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMark) {
1025                     highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex();
1026                     result = partialResult;
1027                 }
1028             }
1029             else {
1030                 // commented out because ParsePosition doesn't have error index in 1.1.x
1031                 // restored for ICU4C port
1032                 int32_t temp = pp2.getErrorIndex() + sub1Pos + pp.getIndex();
1033                 if (temp> parsePosition.getErrorIndex()) {
1034                     parsePosition.setErrorIndex(temp);
1035                 }
1036             }
1037         }
1038         else {
1039             // commented out because ParsePosition doesn't have error index in 1.1.x
1040             // restored for ICU4C port
1041             int32_t temp = sub1Pos + pp.getErrorIndex();
1042             if (temp > parsePosition.getErrorIndex()) {
1043                 parsePosition.setErrorIndex(temp);
1044             }
1045         }
1046         // keep trying to match things until the outer matchToDelimiter()
1047         // call fails to make a match (each time, it picks up where it
1048         // left off the previous time)
1049     } while (sub1Pos != sub2Pos
1050         && pp.getIndex() > 0
1051         && pp.getIndex() < workText.length()
1052         && pp.getIndex() != start);
1053
1054     // update the caller's ParsePosition with our high-water mark
1055     // (i.e., it now points at the first character this function
1056     // didn't match-- the ParsePosition is therefore unchanged if
1057     // we didn't match anything)
1058     parsePosition.setIndex(highWaterMark);
1059     // commented out because ParsePosition doesn't have error index in 1.1.x
1060     // restored for ICU4C port
1061     if (highWaterMark > 0) {
1062         parsePosition.setErrorIndex(0);
1063     }
1064
1065     // this is a hack for one unusual condition: Normally, whether this
1066     // rule belong to a fraction rule set or not is handled by its
1067     // substitutions.  But if that rule HAS NO substitutions, then
1068     // we have to account for it here.  By definition, if the matching
1069     // rule in a fraction rule set has no substitutions, its numerator
1070     // is 1, and so the result is the reciprocal of its base value.
1071     if (isFractionRule && highWaterMark > 0 && sub1 == NULL) {
1072         result = 1 / result;
1073     }
1074
1075     resVal.setDouble(result);
1076     return TRUE; // ??? do we need to worry if it is a long or a double?
1077 }
1078
1079 /**
1080 * This function is used by parse() to match the text being parsed
1081 * against a possible prefix string.  This function
1082 * matches characters from the beginning of the string being parsed
1083 * to characters from the prospective prefix.  If they match, pp is
1084 * updated to the first character not matched, and the result is
1085 * the unparsed part of the string.  If they don't match, the whole
1086 * string is returned, and pp is left unchanged.
1087 * @param text The string being parsed
1088 * @param prefix The text to match against
1089 * @param pp On entry, ignored and assumed to be 0.  On exit, points
1090 * to the first unmatched character (assuming the whole prefix matched),
1091 * or is unchanged (if the whole prefix didn't match).
1092 * @return If things match, this is the unparsed part of "text";
1093 * if they didn't match, this is "text".
1094 */
1095 void
1096 NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const
1097 {
1098     // if the prefix text is empty, dump out without doing anything
1099     if (prefix.length() != 0) {
1100         UErrorCode status = U_ZERO_ERROR;
1101         // use prefixLength() to match the beginning of
1102         // "text" against "prefix".  This function returns the
1103         // number of characters from "text" that matched (or 0 if
1104         // we didn't match the whole prefix)
1105         int32_t pfl = prefixLength(text, prefix, status);
1106         if (U_FAILURE(status)) { // Memory allocation error.
1107                 return;
1108         }
1109         if (pfl != 0) {
1110             // if we got a successful match, update the parse position
1111             // and strip the prefix off of "text"
1112             pp.setIndex(pp.getIndex() + pfl);
1113             text.remove(0, pfl);
1114         }
1115     }
1116 }
1117
1118 /**
1119 * Used by parse() to match a substitution and any following text.
1120 * "text" is searched for instances of "delimiter".  For each instance
1121 * of delimiter, the intervening text is tested to see whether it
1122 * matches the substitution.  The longest match wins.
1123 * @param text The string being parsed
1124 * @param startPos The position in "text" where we should start looking
1125 * for "delimiter".
1126 * @param baseValue A partial parse result (often the rule's base value),
1127 * which is combined with the result from matching the substitution
1128 * @param delimiter The string to search "text" for.
1129 * @param pp Ignored and presumed to be 0 on entry.  If there's a match,
1130 * on exit this will point to the first unmatched character.
1131 * @param sub If we find "delimiter" in "text", this substitution is used
1132 * to match the text between the beginning of the string and the
1133 * position of "delimiter."  (If "delimiter" is the empty string, then
1134 * this function just matches against this substitution and updates
1135 * everything accordingly.)
1136 * @param upperBound When matching the substitution, it will only
1137 * consider rules with base values lower than this value.
1138 * @return If there's a match, this is the result of composing
1139 * baseValue with the result of matching the substitution.  Otherwise,
1140 * this is new Long(0).  It's never null.  If the result is an integer,
1141 * this will be an instance of Long; otherwise, it's an instance of
1142 * Double.
1143 *
1144 * !!! note {dlf} in point of fact, in the java code the caller always converts
1145 * the result to a double, so we might as well return one.
1146 */
1147 double
1148 NFRule::matchToDelimiter(const UnicodeString& text,
1149                          int32_t startPos,
1150                          double _baseValue,
1151                          const UnicodeString& delimiter,
1152                          ParsePosition& pp,
1153                          const NFSubstitution* sub,
1154                          double upperBound) const
1155 {
1156         UErrorCode status = U_ZERO_ERROR;
1157     // if "delimiter" contains real (i.e., non-ignorable) text, search
1158     // it for "delimiter" beginning at "start".  If that succeeds, then
1159     // use "sub"'s doParse() method to match the text before the
1160     // instance of "delimiter" we just found.
1161     if (!allIgnorable(delimiter, status)) {
1162         if (U_FAILURE(status)) { //Memory allocation error.
1163                 return 0;
1164         }
1165         ParsePosition tempPP;
1166         Formattable result;
1167
1168         // use findText() to search for "delimiter".  It returns a two-
1169         // element array: element 0 is the position of the match, and
1170         // element 1 is the number of characters that matched
1171         // "delimiter".
1172         int32_t dLen;
1173         int32_t dPos = findText(text, delimiter, startPos, &dLen);
1174
1175         // if findText() succeeded, isolate the text preceding the
1176         // match, and use "sub" to match that text
1177         while (dPos >= 0) {
1178             UnicodeString subText;
1179             subText.setTo(text, 0, dPos);
1180             if (subText.length() > 0) {
1181                 UBool success = sub->doParse(subText, tempPP, _baseValue, upperBound,
1182 #if UCONFIG_NO_COLLATION
1183                     FALSE,
1184 #else
1185                     formatter->isLenient(),
1186 #endif
1187                     result);
1188
1189                 // if the substitution could match all the text up to
1190                 // where we found "delimiter", then this function has
1191                 // a successful match.  Bump the caller's parse position
1192                 // to point to the first character after the text
1193                 // that matches "delimiter", and return the result
1194                 // we got from parsing the substitution.
1195                 if (success && tempPP.getIndex() == dPos) {
1196                     pp.setIndex(dPos + dLen);
1197                     return result.getDouble();
1198                 }
1199                 else {
1200                     // commented out because ParsePosition doesn't have error index in 1.1.x
1201                     // restored for ICU4C port
1202                     if (tempPP.getErrorIndex() > 0) {
1203                         pp.setErrorIndex(tempPP.getErrorIndex());
1204                     } else {
1205                         pp.setErrorIndex(tempPP.getIndex());
1206                     }
1207                 }
1208             }
1209
1210             // if we didn't match the substitution, search for another
1211             // copy of "delimiter" in "text" and repeat the loop if
1212             // we find it
1213             tempPP.setIndex(0);
1214             dPos = findText(text, delimiter, dPos + dLen, &dLen);
1215         }
1216         // if we make it here, this was an unsuccessful match, and we
1217         // leave pp unchanged and return 0
1218         pp.setIndex(0);
1219         return 0;
1220
1221         // if "delimiter" is empty, or consists only of ignorable characters
1222         // (i.e., is semantically empty), thwe we obviously can't search
1223         // for "delimiter".  Instead, just use "sub" to parse as much of
1224         // "text" as possible.
1225     }
1226     else if (sub == NULL) {
1227         return _baseValue;
1228     }
1229     else {
1230         ParsePosition tempPP;
1231         Formattable result;
1232
1233         // try to match the whole string against the substitution
1234         UBool success = sub->doParse(text, tempPP, _baseValue, upperBound,
1235 #if UCONFIG_NO_COLLATION
1236             FALSE,
1237 #else
1238             formatter->isLenient(),
1239 #endif
1240             result);
1241         if (success && (tempPP.getIndex() != 0)) {
1242             // if there's a successful match (or it's a null
1243             // substitution), update pp to point to the first
1244             // character we didn't match, and pass the result from
1245             // sub.doParse() on through to the caller
1246             pp.setIndex(tempPP.getIndex());
1247             return result.getDouble();
1248         }
1249         else {
1250             // commented out because ParsePosition doesn't have error index in 1.1.x
1251             // restored for ICU4C port
1252             pp.setErrorIndex(tempPP.getErrorIndex());
1253         }
1254
1255         // and if we get to here, then nothing matched, so we return
1256         // 0 and leave pp alone
1257         return 0;
1258     }
1259 }
1260
1261 /**
1262 * Used by stripPrefix() to match characters.  If lenient parse mode
1263 * is off, this just calls startsWith().  If lenient parse mode is on,
1264 * this function uses CollationElementIterators to match characters in
1265 * the strings (only primary-order differences are significant in
1266 * determining whether there's a match).
1267 * @param str The string being tested
1268 * @param prefix The text we're hoping to see at the beginning
1269 * of "str"
1270 * @return If "prefix" is found at the beginning of "str", this
1271 * is the number of characters in "str" that were matched (this
1272 * isn't necessarily the same as the length of "prefix" when matching
1273 * text with a collator).  If there's no match, this is 0.
1274 */
1275 int32_t
1276 NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErrorCode& status) const
1277 {
1278     // if we're looking for an empty prefix, it obviously matches
1279     // zero characters.  Just go ahead and return 0.
1280     if (prefix.length() == 0) {
1281         return 0;
1282     }
1283
1284 #if !UCONFIG_NO_COLLATION
1285     // go through all this grief if we're in lenient-parse mode
1286     if (formatter->isLenient()) {
1287         // get the formatter's collator and use it to create two
1288         // collation element iterators, one over the target string
1289         // and another over the prefix (right now, we'll throw an
1290         // exception if the collator we get back from the formatter
1291         // isn't a RuleBasedCollator, because RuleBasedCollator defines
1292         // the CollationElementIterator protocol.  Hopefully, this
1293         // will change someday.)
1294         const RuleBasedCollator* collator = formatter->getCollator();
1295         if (collator == NULL) {
1296             status = U_MEMORY_ALLOCATION_ERROR;
1297             return 0;
1298         }
1299         LocalPointer<CollationElementIterator> strIter(collator->createCollationElementIterator(str));
1300         LocalPointer<CollationElementIterator> prefixIter(collator->createCollationElementIterator(prefix));
1301         // Check for memory allocation error.
1302         if (strIter.isNull() || prefixIter.isNull()) {
1303             status = U_MEMORY_ALLOCATION_ERROR;
1304             return 0;
1305         }
1306
1307         UErrorCode err = U_ZERO_ERROR;
1308
1309         // The original code was problematic.  Consider this match:
1310         // prefix = "fifty-"
1311         // string = " fifty-7"
1312         // The intent is to match string up to the '7', by matching 'fifty-' at position 1
1313         // in the string.  Unfortunately, we were getting a match, and then computing where
1314         // the match terminated by rematching the string.  The rematch code was using as an
1315         // initial guess the substring of string between 0 and prefix.length.  Because of
1316         // the leading space and trailing hyphen (both ignorable) this was succeeding, leaving
1317         // the position before the hyphen in the string.  Recursing down, we then parsed the
1318         // remaining string '-7' as numeric.  The resulting number turned out as 43 (50 - 7).
1319         // This was not pretty, especially since the string "fifty-7" parsed just fine.
1320         //
1321         // We have newer APIs now, so we can use calls on the iterator to determine what we
1322         // matched up to.  If we terminate because we hit the last element in the string,
1323         // our match terminates at this length.  If we terminate because we hit the last element
1324         // in the target, our match terminates at one before the element iterator position.
1325
1326         // match collation elements between the strings
1327         int32_t oStr = strIter->next(err);
1328         int32_t oPrefix = prefixIter->next(err);
1329
1330         while (oPrefix != CollationElementIterator::NULLORDER) {
1331             // skip over ignorable characters in the target string
1332             while (CollationElementIterator::primaryOrder(oStr) == 0
1333                 && oStr != CollationElementIterator::NULLORDER) {
1334                 oStr = strIter->next(err);
1335             }
1336
1337             // skip over ignorable characters in the prefix
1338             while (CollationElementIterator::primaryOrder(oPrefix) == 0
1339                 && oPrefix != CollationElementIterator::NULLORDER) {
1340                 oPrefix = prefixIter->next(err);
1341             }
1342
1343             // dlf: move this above following test, if we consume the
1344             // entire target, aren't we ok even if the source was also
1345             // entirely consumed?
1346
1347             // if skipping over ignorables brought to the end of
1348             // the prefix, we DID match: drop out of the loop
1349             if (oPrefix == CollationElementIterator::NULLORDER) {
1350                 break;
1351             }
1352
1353             // if skipping over ignorables brought us to the end
1354             // of the target string, we didn't match and return 0
1355             if (oStr == CollationElementIterator::NULLORDER) {
1356                 return 0;
1357             }
1358
1359             // match collation elements from the two strings
1360             // (considering only primary differences).  If we
1361             // get a mismatch, dump out and return 0
1362             if (CollationElementIterator::primaryOrder(oStr)
1363                 != CollationElementIterator::primaryOrder(oPrefix)) {
1364                 return 0;
1365
1366                 // otherwise, advance to the next character in each string
1367                 // and loop (we drop out of the loop when we exhaust
1368                 // collation elements in the prefix)
1369             } else {
1370                 oStr = strIter->next(err);
1371                 oPrefix = prefixIter->next(err);
1372             }
1373         }
1374
1375         int32_t result = strIter->getOffset();
1376         if (oStr != CollationElementIterator::NULLORDER) {
1377             --result; // back over character that we don't want to consume;
1378         }
1379
1380 #ifdef RBNF_DEBUG
1381         fprintf(stderr, "prefix length: %d\n", result);
1382 #endif
1383         return result;
1384 #if 0
1385         //----------------------------------------------------------------
1386         // JDK 1.2-specific API call
1387         // return strIter.getOffset();
1388         //----------------------------------------------------------------
1389         // JDK 1.1 HACK (take out for 1.2-specific code)
1390
1391         // if we make it to here, we have a successful match.  Now we
1392         // have to find out HOW MANY characters from the target string
1393         // matched the prefix (there isn't necessarily a one-to-one
1394         // mapping between collation elements and characters).
1395         // In JDK 1.2, there's a simple getOffset() call we can use.
1396         // In JDK 1.1, on the other hand, we have to go through some
1397         // ugly contortions.  First, use the collator to compare the
1398         // same number of characters from the prefix and target string.
1399         // If they're equal, we're done.
1400         collator->setStrength(Collator::PRIMARY);
1401         if (str.length() >= prefix.length()) {
1402             UnicodeString temp;
1403             temp.setTo(str, 0, prefix.length());
1404             if (collator->equals(temp, prefix)) {
1405 #ifdef RBNF_DEBUG
1406                 fprintf(stderr, "returning: %d\n", prefix.length());
1407 #endif
1408                 return prefix.length();
1409             }
1410         }
1411
1412         // if they're not equal, then we have to compare successively
1413         // larger and larger substrings of the target string until we
1414         // get to one that matches the prefix.  At that point, we know
1415         // how many characters matched the prefix, and we can return.
1416         int32_t p = 1;
1417         while (p <= str.length()) {
1418             UnicodeString temp;
1419             temp.setTo(str, 0, p);
1420             if (collator->equals(temp, prefix)) {
1421                 return p;
1422             } else {
1423                 ++p;
1424             }
1425         }
1426
1427         // SHOULD NEVER GET HERE!!!
1428         return 0;
1429         //----------------------------------------------------------------
1430 #endif
1431
1432         // If lenient parsing is turned off, forget all that crap above.
1433         // Just use String.startsWith() and be done with it.
1434   } else
1435 #endif
1436   {
1437       if (str.startsWith(prefix)) {
1438           return prefix.length();
1439       } else {
1440           return 0;
1441       }
1442   }
1443 }
1444
1445 /**
1446 * Searches a string for another string.  If rulePatternFormat is set,
1447 * its parseType() locates the match, which must also be surrounded by
1448 * the rule text found before and after the pattern markers.  Otherwise,
1449 * if lenient parsing is off, this just calls indexOf(); if lenient
1450 * parsing is on, the search is delegated to findTextLenient().
1451 * @param str The string to search
1452 * @param key The string to search "str" for (not consulted on the
1453 * rulePatternFormat path)
1454 * @param startingAt The index into "str" where the search is to begin
1455 * @param length Output.  Receives the number of characters in "str" that
1456 * matched (which isn't necessarily the same as the length of "key").
1457 * The non-lenient path always stores key.length(), even without a match.
1458 * @return The position of the match, or -1 if there was no match.
1459 */
1460 int32_t
1461 NFRule::findText(const UnicodeString& str,
1462                  const UnicodeString& key,
1463                  int32_t startingAt,
1464                  int32_t* length) const
1465 {
1466     if (rulePatternFormat) {
1467         Formattable result;
1468         FieldPosition position(UNUM_INTEGER_FIELD);
1469         position.setBeginIndex(startingAt);
1470         rulePatternFormat->parseType(str, this, result, position); // position is read back below as the match bounds
1471         int start = position.getBeginIndex();
1472         if (start >= 0) { // NOTE(review): presumably a negative index means parseType() found nothing -- confirm
1473             int32_t pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
1474             int32_t pluralRuleSuffix = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) + 2; // +2: presumably the closing marker's length -- TODO confirm
1475             int32_t matchLen = position.getEndIndex() - start;
1476             UnicodeString prefix(ruleText.tempSubString(0, pluralRuleStart)); // rule text before the opening marker
1477             UnicodeString suffix(ruleText.tempSubString(pluralRuleSuffix)); // rule text from pluralRuleSuffix onward
1478             if (str.compare(start - prefix.length(), prefix.length(), prefix, 0, prefix.length()) == 0
1479                     && str.compare(start + matchLen, suffix.length(), suffix, 0, suffix.length()) == 0)
1480             { // prefix sits immediately before the match and suffix immediately after it
1481                 *length = matchLen + prefix.length() + suffix.length(); // whole span: prefix + match + suffix
1482                 return start - prefix.length();
1483             }
1484         }
1485         *length = 0; // no usable match on the rulePatternFormat path
1486         return -1;
1487     }
1488     if (!formatter->isLenient()) {
1489         // if lenient parsing is turned off, this is easy: just call
1490         // UnicodeString::indexOf() and we're done
1491         *length = key.length(); // stored unconditionally, whatever indexOf() returns
1492         return str.indexOf(key, startingAt);
1493     }
1494     else {
1495         // but if lenient parsing is turned ON, we've got some work
1496         // ahead of us
1497         return findTextLenient(str, key, startingAt, length);
1498     }
1499 }
1500
1501 int32_t
1502 NFRule::findTextLenient(const UnicodeString& str,
1503                  const UnicodeString& key,
1504                  int32_t startingAt,
1505                  int32_t* length) const
1506 {
1507     //----------------------------------------------------------------
1508     // Lenient search: finds the first index p >= startingAt at which
1509     // prefixLength() reports a nonzero match of "key" against the tail
1510     // of "str" beginning at p.  Returns p and stores the matched length
1511     // in *length; if no position matches, or prefixLength() fails,
1512     // returns -1 and stores 0.  This began as a JDK 1.1 HACK: instead
1513     // of mapping between character offsets and collation elements, we
1514     // have to go through this horrible slow mess:
1515     int32_t p = startingAt;
1516     int32_t keyLen = 0;
1517
1518     // basically just isolate smaller and smaller substrings of
1519     // the target string (each running to the end of the string,
1520     // and with the first one running from startingAt to the end)
1521     // and then use prefixLength() to see if the search key is at
1522     // the beginning of each substring.  This is excruciatingly
1523     // slow, but it will locate the key and tell us how long the
1524     // matching text was.
1525     UnicodeString temp;
1526     UErrorCode status = U_ZERO_ERROR; // local only: a failure ends the search but is not reported to the caller
1527     while (p < str.length() && keyLen == 0) { // keyLen is always 0 here; a nonzero value returns from inside the loop
1528         temp.setTo(str, p, str.length() - p); // the tail of str starting at p
1529         keyLen = prefixLength(temp, key, status);
1530         if (U_FAILURE(status)) {
1531             break; // falls through to the "not found" result below
1532         }
1533         if (keyLen != 0) {
1534             *length = keyLen;
1535             return p;
1536         }
1537         ++p;
1538     }
1539     // if we make it to here, we didn't find it.  Return -1 for the
1540     // location.  The length should be ignored, but set it to 0,
1541     // which should be "safe"
1542     *length = 0;
1543     return -1;
1544 }
1545
1546 /**
1547 * Checks to see whether a string consists entirely of ignorable characters.
1548 * @param str The string to test.
1549 * @param status Set to U_MEMORY_ALLOCATION_ERROR (with a FALSE return) when
1550 * lenient parsing is on but the collator or its element iterator is NULL.
1551 * @return TRUE if the string is empty, or if lenient parsing is on and every
1552 * collation element of "str" has a primary order of 0.  FALSE otherwise.
1553 */
1554 UBool
1555 NFRule::allIgnorable(const UnicodeString& str, UErrorCode& status) const
1556 {
1557     // if the string is empty, we can just return true
1558     if (str.length() == 0) {
1559         return TRUE;
1560     }
1561
1562 #if !UCONFIG_NO_COLLATION
1563     // if lenient parsing is turned on, walk through the string with
1564     // a collation element iterator and make sure each collation
1565     // element is 0 (ignorable) at the primary level
1566     if (formatter->isLenient()) {
1567         const RuleBasedCollator* collator = formatter->getCollator();
1568         if (collator == NULL) {
1569             status = U_MEMORY_ALLOCATION_ERROR;
1570             return FALSE;
1571         }
1572         LocalPointer<CollationElementIterator> iter(collator->createCollationElementIterator(str));
1573
1574         // Memory allocation error check.
1575         if (iter.isNull()) {
1576             status = U_MEMORY_ALLOCATION_ERROR;
1577             return FALSE;
1578         }
1579
1580         UErrorCode err = U_ZERO_ERROR; // local only: never inspected and not copied into status
1581         int32_t o = iter->next(err);
1582         while (o != CollationElementIterator::NULLORDER
1583             && CollationElementIterator::primaryOrder(o) == 0) {
1584             o = iter->next(err); // keep skipping elements whose primary order is 0
1585         }
1586
1587         return o == CollationElementIterator::NULLORDER; // TRUE only if the loop ended on NULLORDER, not on a nonzero primary order
1588     }
1589 #endif
1590
1591     // if lenient parsing is turned off, there is no such thing as
1592     // an ignorable character: the empty case returned above, so FALSE
1593     return FALSE;
1594 }
1595
1596 void
1597 NFRule::setDecimalFormatSymbols(const DecimalFormatSymbols& newSymbols, UErrorCode& status) { // forwards newSymbols and status to each non-NULL substitution
1598     if (sub1 != NULL) {
1599         sub1->setDecimalFormatSymbols(newSymbols, status);
1600     }
1601     if (sub2 != NULL) { // checked independently of sub1
1602         sub2->setDecimalFormatSymbols(newSymbols, status);
1603     }
1604 }
1605
1606 U_NAMESPACE_END
1607
1608 /* U_HAVE_RBNF */
1609 #endif