source/i18n/strmatch.h

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4  * Copyright (C) 2001-2011, International Business Machines Corporation
   5  * and others. All Rights Reserved.
   6  **********************************************************************
   7  *   Date        Name        Description
   8  *   07/23/01    aliu        Creation.
   9  **********************************************************************
  10  */
  11 #ifndef STRMATCH_H
  12 #define STRMATCH_H
  13
  14 #include "unicode/utypes.h"
  15
  16 #if !UCONFIG_NO_TRANSLITERATION
  17
  18 #include "unicode/unistr.h"
  19 #include "unicode/unifunct.h"
  20 #include "unicode/unimatch.h"
  21 #include "unicode/unirepl.h"
  22
  23 U_NAMESPACE_BEGIN
  24
  25 class TransliterationRuleData;
  26
  27 /**
  28  * An object that matches a fixed input string, implementing the
  29  * UnicodeMatcher API.  This object also implements the
  30  * UnicodeReplacer API, allowing it to emit the matched text as
  31  * output.  Since the match text may contain flexible match elements,
  32  * such as UnicodeSets, the emitted text is not the match pattern, but
  33  * instead a substring of the actual matched text.  Following
  34  * convention, the output text is the leftmost match seen up to this
  35  * point.
  36  *
  37  * A StringMatcher may represent a segment, in which case it has a
  38  * positive segment number.  This affects how the matcher converts
  39  * itself to a pattern but does not otherwise affect its function.
  40  *
  41  * A StringMatcher that is not a segment should not be used as a
  42  * UnicodeReplacer.
  43  */
  44 class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
  45
  46  public:
  47
  48     /**
  49      * Construct a matcher that matches the given pattern string.
  50      * @param string the pattern to be matched, possibly containing
  51      * stand-ins that represent nested UnicodeMatcher objects.
  52      * @param start inclusive start index of text to be replaced
  53      * @param limit exclusive end index of text to be replaced;
  54      * must be greater than or equal to start
  55      * @param segmentNum the segment number from 1..n, or 0 if this is
  56      * not a segment.
  57      * @param data context object mapping stand-ins to
  58      * UnicodeMatcher objects.
  59      */
  60     StringMatcher(const UnicodeString& string,
  61                   int32_t start,
  62                   int32_t limit,
  63                   int32_t segmentNum,
  64                   const TransliterationRuleData& data);
  65
  66     /**
  67      * Copy constructor
  68      * @param o  the object to be copied.
  69      */
  70     StringMatcher(const StringMatcher& o);
  71
  72     /**
  73      * Destructor
  74      */
  75     virtual ~StringMatcher();
  76
  77     /**
  78      * Implement UnicodeFunctor
  79      * @return a copy of the object.
  80      */
  81     virtual UnicodeFunctor* clone() const;
  82
  83     /**
  84      * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
  85      * and return the pointer.
  86      * @return the UnicodeMatcher pointer.
  87      */
  88     virtual UnicodeMatcher* toMatcher() const;
  89
  90     /**
  91      * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
  92      * and return the pointer.
  93      * @return the UnicodeReplacer pointer.
  94      */
  95     virtual UnicodeReplacer* toReplacer() const;
  96
  97     /**
  98      * Implement UnicodeMatcher
  99      * @param text the text to be matched
 100      * @param offset on input, the index into text at which to begin
 101      * matching.  On output, the limit of the matched text.  The
 102      * number of matched characters is the output value of offset
 103      * minus the input value.  Offset should always point to the
 104      * HIGH SURROGATE (leading code unit) of a pair of surrogates,
 105      * both on entry and upon return.
 106      * @param limit the limit index of text to be matched.  Greater
 107      * than offset for a forward direction match, less than offset for
 108      * a backward direction match.  The last character to be
 109      * considered for matching will be text.charAt(limit-1) in the
 110      * forward direction or text.charAt(limit+1) in the backward
 111      * direction.
 112      * @param incremental  if TRUE, then assume further characters may
 113      * be inserted at limit and check for partial matching.  Otherwise
 114      * assume the text as given is complete.
 115      * @return a match degree value indicating a full match, a partial
 116      * match, or a mismatch.  If incremental is FALSE then
 117      * U_PARTIAL_MATCH should never be returned.
 118      */
 119     virtual UMatchDegree matches(const Replaceable& text,
 120                                  int32_t& offset,
 121                                  int32_t limit,
 122                                  UBool incremental);
 123
 124     /**
 125      * Implement UnicodeMatcher
 126      * @param result            Output param to receive the pattern.
 127      * @param escapeUnprintable if TRUE then escape the unprintable characters.
 128      * @return                  A reference to 'result'.
 129      */
 130     virtual UnicodeString& toPattern(UnicodeString& result,
 131                                      UBool escapeUnprintable = FALSE) const;
 132
 133     /**
 134      * Implement UnicodeMatcher
 135      * Returns TRUE if this matcher will match a character c, where c
 136      * & 0xFF == v, at offset, in the forward direction (with limit >
 137      * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
 138      * indexing.
 139      * @param v    the given value
 140      * @return     TRUE if this matcher will match a character c,
 141      *             where c & 0xFF == v
 142      */
 143     virtual UBool matchesIndexValue(uint8_t v) const;
 144
 145     /**
 146      * Implement UnicodeMatcher
          * @param toUnionTo the set to be added to; presumably receives
          * the characters this matcher can match (TODO confirm against
          * the implementation).
 147      */
 148     virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
 149
 150     /**
 151      * Implement UnicodeFunctor
          * NOTE(review): the parameter is unnamed here; presumably it
          * replaces the 'data' context pointer held by this object --
          * confirm against the implementation.
 152      */
 153     virtual void setData(const TransliterationRuleData*);
 154
 155     /**
 156      * Replace characters in 'text' from 'start' to 'limit' with the
 157      * output text of this object.  Update the 'cursor' parameter to
 158      * give the cursor position and return the length of the
 159      * replacement text.
 160      *
 161      * @param text the text in which characters are to be replaced
 162      * @param start inclusive start index of text to be replaced
 163      * @param limit exclusive end index of text to be replaced;
 164      * must be greater than or equal to start
 165      * @param cursor output parameter for the cursor position.
 166      * Not all replacer objects will update this, but in a complete
 167      * tree of replacer objects, representing the entire output side
 168      * of a transliteration rule, at least one must update it.
 169      * @return the number of 16-bit code units in the text replacing
 170      * the characters at offsets start..(limit-1) in text
 171      */
 172     virtual int32_t replace(Replaceable& text,
 173                             int32_t start,
 174                             int32_t limit,
 175                             int32_t& cursor);
 176
 177     /**
 178      * Returns a string representation of this replacer.  If the
 179      * result of calling this function is passed to the appropriate
 180      * parser, typically TransliteratorParser, it will produce another
 181      * replacer that is equal to this one.
 182      * @param result the string to receive the pattern.  Previous
 183      * contents will be deleted.
 184      * @param escapeUnprintable if TRUE then convert unprintable
 185      * characters to their hex escape representations, \\uxxxx or
 186      * \\Uxxxxxxxx.  Unprintable characters are defined by
 187      * Utility.isUnprintable().
 188      * @return a reference to 'result'.
 189      */
 190     virtual UnicodeString& toReplacerPattern(UnicodeString& result,
 191                                              UBool escapeUnprintable) const;
 192
 193     /**
 194      * Remove any match data.  This must be called before performing a
 195      * set of matches with this segment.
 196      */
 197     void resetMatch();
 198
 199     /**
 200      * ICU "poor man's RTTI", returns a UClassID for the actual class.
 201      */
 202     virtual UClassID getDynamicClassID() const;
 203
 204     /**
 205      * ICU "poor man's RTTI", returns a UClassID for this class.
 206      */
 207     static UClassID U_EXPORT2 getStaticClassID();
 208
 209     /**
 210      * Union the set of all characters that may be output by this object
 211      * into the given set.
 212      * @param toUnionTo the set into which to union the output characters
 213      */
 214     virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
 215
 216  private:
 217
 218     /**
 219      * The text to be matched.
 220      */
 221     UnicodeString pattern;
 222
 223     /**
 224      * Context object that maps stand-ins to matcher and replacer
 225      * objects.
 226      */
 227     const TransliterationRuleData* data;
 228
 229     /**
 230      * The segment number, 1-based, or 0 if not a segment.
 231      */
 232     int32_t segmentNumber;
 233
 234     /**
 235      * Start offset, in the match text, of the <em>rightmost</em>
 236      * match.
          * NOTE(review): the class-level comment describes the output
          * text as the leftmost match seen; confirm which wording
          * reflects the implementation.
 237      */
 238     int32_t matchStart;
 239
 240     /**
 241      * Limit offset, in the match text, of the <em>rightmost</em>
 242      * match.
 243      */
 244     int32_t matchLimit;
 245
 246 };
 247
 248 U_NAMESPACE_END
 249
 250 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 251
 252 #endif