1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
7 ***************************************************************************
8 * Copyright (C) 2002-2016 International Business Machines Corporation
9 * and others. All rights reserved.
10 ***************************************************************************
13 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 #include "unicode/regex.h"
18 #include "unicode/uclean.h"
32 //--------------------------------------------------------------------------
34 // RegexPattern Default Constructor
36 //--------------------------------------------------------------------------
// Default constructor: sets up a newly created RegexPattern so that all of
// its member data is initialized.
37 RegexPattern::RegexPattern() {
38 // Init all of this instance's data.
43 //--------------------------------------------------------------------------
45 // Copy Constructor Note: This is a rather inefficient implementation,
46 // but it probably doesn't matter.
48 //--------------------------------------------------------------------------
// Copy constructor. `other` is forwarded to the UObject base-class copy
// constructor in the member-initializer list.
49 RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) {
56 //--------------------------------------------------------------------------
58 // Assignment Operator
60 //--------------------------------------------------------------------------
// Copy-assignment operator: makes this pattern a copy of `other`.
//
// Steps visible in the body:
//   - take over other's fDeferredStatus;
//   - duplicate the pattern text: utext_clone() of other.fPattern when other
//     has no fPatternString, otherwise a new UnicodeString copy that is then
//     opened as a UText with utext_openConstUnicodeString();
//   - copy the scalar fields and fLiteralText, and copy the *contents* of
//     fInitialChars and fInitialChars8 (assignment through the pointers);
//   - assign() the compiled pattern and the group map;
//   - allocate fSets8 with one entry per element of other.fSets, then copy
//     every UnicodeSet from index 1 upward together with its fSets8 entry;
//   - insert every entry of other.fNamedCaptureMap into fNamedCaptureMap
//     using a newly allocated UnicodeString as the key.
// Allocation failures are recorded in fDeferredStatus as
// U_MEMORY_ALLOCATION_ERROR, and the copy loops test fDeferredStatus on each
// iteration.
61 RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
63 // Source and destination are the same. Don't do anything.
67 // Clean out any previous contents of object being assigned to.
70 // Give target object a default initialization
74 fDeferredStatus = other.fDeferredStatus;
76 if (U_FAILURE(fDeferredStatus)) {
80 if (other.fPatternString == NULL) {
81 fPatternString = NULL;
82 fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus);
84 fPatternString = new UnicodeString(*(other.fPatternString));
85 if (fPatternString == NULL) {
86 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
88 fPattern = utext_openConstUnicodeString(NULL, fPatternString, &fDeferredStatus);
91 if (U_FAILURE(fDeferredStatus)) {
95 fFlags = other.fFlags;
96 fLiteralText = other.fLiteralText;
97 fMinMatchLen = other.fMinMatchLen;
98 fFrameSize = other.fFrameSize;
99 fDataSize = other.fDataSize;
100 fStaticSets = other.fStaticSets;
101 fStaticSets8 = other.fStaticSets8;
103 fStartType = other.fStartType;
104 fInitialStringIdx = other.fInitialStringIdx;
105 fInitialStringLen = other.fInitialStringLen;
106 *fInitialChars = *other.fInitialChars;
107 fInitialChar = other.fInitialChar;
108 *fInitialChars8 = *other.fInitialChars8;
109 fNeedsAltInput = other.fNeedsAltInput;
111 // Copy the pattern. It's just values, nothing deep to copy.
112 fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
113 fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
115 // Copy the Unicode Sets.
116 // Could be made more efficient if the sets were reference counted and shared,
117 // but I doubt that pattern copying will be particularly common.
118 // Note: init() already added an empty element zero to fSets
120 int32_t numSets = other.fSets->size();
121 fSets8 = new Regex8BitSet[numSets];
122 if (fSets8 == NULL) {
123 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
126 for (i=1; i<numSets; i++) {
127 if (U_FAILURE(fDeferredStatus)) {
130 UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
131 UnicodeSet *newSet = new UnicodeSet(*sourceSet);
132 if (newSet == NULL) {
133 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
136 fSets->addElement(newSet, fDeferredStatus);
137 fSets8[i] = other.fSets8[i];
140 // Copy the named capture group hash map.
141 int32_t hashPos = UHASH_FIRST;
142 while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
143 if (U_FAILURE(fDeferredStatus)) {
146 const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
147 UnicodeString *key = new UnicodeString(*name);
148 int32_t val = hashEl->value.integer;
150 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
152 uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
159 //--------------------------------------------------------------------------
161 // init Shared initialization for use by constructors.
162 // Bring an uninitialized RegexPattern up to a default state.
164 //--------------------------------------------------------------------------
// Brings this object to its default state.
//
// Visible work: clears fLiteralText, resets fDeferredStatus to U_ZERO_ERROR,
// resets the start-of-match fields, NULLs the pattern pointers, and then
// allocates the owned containers (fCompiledPat, fGroupMap, fSets,
// fInitialChars, fInitialChars8) and opens fNamedCaptureMap as a hash table
// keyed by UnicodeString. If any of those six pointers is NULL the failure is
// recorded in fDeferredStatus as U_MEMORY_ALLOCATION_ERROR. Finally, slot
// zero of fSets is filled with a placeholder and uprv_deleteUObject is
// installed as the key deleter of fNamedCaptureMap.
165 void RegexPattern::init() {
168 fLiteralText.remove();
171 fDeferredStatus = U_ZERO_ERROR;
178 fStartType = START_NO_INFO;
179 fInitialStringIdx = 0;
180 fInitialStringLen = 0;
181 fInitialChars = NULL;
183 fInitialChars8 = NULL;
184 fNeedsAltInput = FALSE;
185 fNamedCaptureMap = NULL;
187 fPattern = NULL; // will be set later
188 fPatternString = NULL; // may be set later
189 fCompiledPat = new UVector64(fDeferredStatus);
190 fGroupMap = new UVector32(fDeferredStatus);
191 fSets = new UVector(fDeferredStatus);
192 fInitialChars = new UnicodeSet;
193 fInitialChars8 = new Regex8BitSet;
194 fNamedCaptureMap = uhash_open(uhash_hashUnicodeString, // Key hash function
195 uhash_compareUnicodeString, // Key comparator function
196 uhash_compareLong, // Value comparator function
198 if (U_FAILURE(fDeferredStatus)) {
201 if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
202 fInitialChars == NULL || fInitialChars8 == NULL || fNamedCaptureMap == NULL) {
203 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
207 // Slot zero of the vector of sets is reserved. Fill it here.
208 fSets->addElement((int32_t)0, fDeferredStatus);
210 // fNamedCaptureMap owns its key strings, type (UnicodeString *)
211 uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject);
215 //--------------------------------------------------------------------------
217 // zap Delete everything owned by this RegexPattern.
219 //--------------------------------------------------------------------------
// Releases the resources held by this pattern.
//
// Visible work: a walk over fSets starting at index 1 (slot zero is the
// reserved placeholder), deletion of fInitialChars and fInitialChars8,
// utext_close() on fPattern when it is non-NULL, deletion of fPatternString
// when it is non-NULL, and uhash_close() on fNamedCaptureMap. fInitialChars,
// fInitialChars8, fPatternString and fNamedCaptureMap are reset to NULL
// after being released.
220 void RegexPattern::zap() {
224 for (i=1; i<fSets->size(); i++) {
226 s = (UnicodeSet *)fSets->elementAt(i);
237 delete fInitialChars;
238 fInitialChars = NULL;
239 delete fInitialChars8;
240 fInitialChars8 = NULL;
241 if (fPattern != NULL) {
242 utext_close(fPattern);
245 if (fPatternString != NULL) {
246 delete fPatternString;
247 fPatternString = NULL;
249 uhash_close(fNamedCaptureMap);
250 fNamedCaptureMap = NULL;
254 //--------------------------------------------------------------------------
258 //--------------------------------------------------------------------------
// Destructor.
// NOTE(review): presumably releases everything via zap() -- confirm.
259 RegexPattern::~RegexPattern() {
264 //--------------------------------------------------------------------------
268 //--------------------------------------------------------------------------
// Creates a heap-allocated copy of this pattern using the copy constructor.
// NOTE(review): the caller presumably owns the returned object -- confirm.
269 RegexPattern *RegexPattern::clone() const {
270 RegexPattern *copy = new RegexPattern(*this);
275 //--------------------------------------------------------------------------
277 // operator == (comparison) Consider two patterns to be == if the
278 // pattern strings and the flags are the same.
279 // Note that pattern strings with the same
280 // characters can still be considered different.
282 //--------------------------------------------------------------------------
// Equality test.
//
// The pattern text is only examined when both fFlags and fDeferredStatus
// match. If both sides hold a fPatternString, those strings are compared
// directly. Otherwise the UText form is used: the NULL cases are handled
// first, and when both fPattern pointers are non-NULL each UText is rewound
// to native index 0 and the two are compared with utext_equals().
283 UBool RegexPattern::operator ==(const RegexPattern &other) const {
284 if (this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus) {
285 if (this->fPatternString != NULL && other.fPatternString != NULL) {
286 return *(this->fPatternString) == *(other.fPatternString);
287 } else if (this->fPattern == NULL) {
288 if (other.fPattern == NULL) {
291 } else if (other.fPattern != NULL) {
292 UTEXT_SETNATIVEINDEX(this->fPattern, 0);
293 UTEXT_SETNATIVEINDEX(other.fPattern, 0);
294 return utext_equals(this->fPattern, other.fPattern);
300 //---------------------------------------------------------------------
304 //---------------------------------------------------------------------
// Compiles a pattern given as a UnicodeString into a new RegexPattern.
//
// Visible checks, in order:
//   - the incoming `status` is tested with U_FAILURE;
//   - any flag bit outside `allFlags` sets U_REGEX_INVALID_FLAG;
//   - UREGEX_CANON_EQ is accepted by the mask but then reported as
//     U_REGEX_UNIMPLEMENTED;
//   - a failed allocation sets U_MEMORY_ALLOCATION_ERROR, and a failure
//     recorded in the new object's fDeferredStatus is copied to `status`.
// The flags are stored in the new object and the actual compilation is done
// by a RegexCompile instance; `status` is tested again afterwards.
// NOTE(review): `®ex` in the signature looks like a mis-decoded `&regex`
// -- confirm against the upstream file.
305 RegexPattern * U_EXPORT2
306 RegexPattern::compile(const UnicodeString ®ex,
311 if (U_FAILURE(status)) {
315 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
316 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
317 UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL;
319 if ((flags & ~allFlags) != 0) {
320 status = U_REGEX_INVALID_FLAG;
324 if ((flags & UREGEX_CANON_EQ) != 0) {
325 status = U_REGEX_UNIMPLEMENTED;
329 RegexPattern *This = new RegexPattern;
331 status = U_MEMORY_ALLOCATION_ERROR;
334 if (U_FAILURE(This->fDeferredStatus)) {
335 status = This->fDeferredStatus;
339 This->fFlags = flags;
341 RegexCompile compiler(This, status);
342 compiler.compile(regex, pe, status);
344 if (U_FAILURE(status)) {
354 // compile, UText mode
// UText variant of compile(): compiles a pattern supplied as a UText into a
// new RegexPattern.
//
// Visible checks, in order: the incoming `status` is tested; flag bits
// outside `allFlags` set U_REGEX_INVALID_FLAG; UREGEX_CANON_EQ sets
// U_REGEX_UNIMPLEMENTED; allocation failure sets U_MEMORY_ALLOCATION_ERROR;
// a failure in the new object's fDeferredStatus is copied to `status`.
// The flags are then stored and a RegexCompile instance performs the
// compilation, after which `status` is tested again.
356 RegexPattern * U_EXPORT2
357 RegexPattern::compile(UText *regex,
362 if (U_FAILURE(status)) {
366 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
367 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
368 UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL;
370 if ((flags & ~allFlags) != 0) {
371 status = U_REGEX_INVALID_FLAG;
375 if ((flags & UREGEX_CANON_EQ) != 0) {
376 status = U_REGEX_UNIMPLEMENTED;
380 RegexPattern *This = new RegexPattern;
382 status = U_MEMORY_ALLOCATION_ERROR;
385 if (U_FAILURE(This->fDeferredStatus)) {
386 status = This->fDeferredStatus;
390 This->fFlags = flags;
392 RegexCompile compiler(This, status);
393 compiler.compile(regex, pe, status);
395 if (U_FAILURE(status)) {
404 // compile with default flags.
// Convenience overload: compiles `regex` with a flags value of 0 by
// forwarding to compile(regex, 0, pe, err).
// NOTE(review): `®ex` in the signature looks like a mis-decoded `&regex`
// -- confirm against the upstream file.
406 RegexPattern * U_EXPORT2
407 RegexPattern::compile(const UnicodeString ®ex,
411 return compile(regex, 0, pe, err);
416 // compile with default flags, UText mode
// Convenience overload, UText mode: compiles `regex` with a flags value of 0
// by forwarding to compile(regex, 0, pe, err).
418 RegexPattern * U_EXPORT2
419 RegexPattern::compile(UText *regex,
423 return compile(regex, 0, pe, err);
428 // compile with no UParseError parameter.
// Convenience overload for callers that do not want parse-error details:
// forwards to compile(regex, flags, pe, err).
// NOTE(review): `pe` is presumably a local parse-error object declared in
// this function -- confirm.
// NOTE(review): `®ex` in the signature looks like a mis-decoded `&regex`
// -- confirm against the upstream file.
430 RegexPattern * U_EXPORT2
431 RegexPattern::compile(const UnicodeString ®ex,
436 return compile(regex, flags, pe, err);
441 // compile with no UParseError parameter, UText mode
// Convenience overload, UText mode, for callers that do not want parse-error
// details: forwards to compile(regex, flags, pe, err).
// NOTE(review): `pe` is presumably a local parse-error object declared in
// this function -- confirm.
443 RegexPattern * U_EXPORT2
444 RegexPattern::compile(UText *regex,
449 return compile(regex, flags, pe, err);
453 //---------------------------------------------------------------------
457 //---------------------------------------------------------------------
// Accessor for this pattern's flags, returned as a uint32_t.
// NOTE(review): presumably returns fFlags -- confirm.
458 uint32_t RegexPattern::flags() const {
463 //---------------------------------------------------------------------
465 // matcher(UnicodeString, err)
467 //---------------------------------------------------------------------
// Creates a matcher for this pattern and attaches it to `input`.
// The matcher is obtained from matcher(status); when that yields a non-NULL
// object, `status` is stored in the matcher's fDeferredStatus and the matcher
// is reset onto `input`.
468 RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
469 UErrorCode &status) const {
470 RegexMatcher *retMatcher = matcher(status);
471 if (retMatcher != NULL) {
472 retMatcher->fDeferredStatus = status;
473 retMatcher->reset(input);
479 //---------------------------------------------------------------------
483 //---------------------------------------------------------------------
// Creates a RegexMatcher bound to this pattern, with no input attached.
// The incoming `status` is tested first. If this pattern carries a failure
// in fDeferredStatus, that code is copied into `status`. A NULL result from
// the allocation sets U_MEMORY_ALLOCATION_ERROR.
484 RegexMatcher *RegexPattern::matcher(UErrorCode &status) const {
485 RegexMatcher *retMatcher = NULL;
487 if (U_FAILURE(status)) {
490 if (U_FAILURE(fDeferredStatus)) {
491 status = fDeferredStatus;
495 retMatcher = new RegexMatcher(this);
496 if (retMatcher == NULL) {
497 status = U_MEMORY_ALLOCATION_ERROR;
505 //---------------------------------------------------------------------
507 // matches Convenience function to test for a match, starting
508 // with a pattern string and a data string.
510 //---------------------------------------------------------------------
// One-shot convenience: compiles `regex` (flags 0), creates a matcher on
// `input`, and runs matches() on it. Returns FALSE immediately if `status`
// already indicates failure on entry.
// NOTE(review): `pat` is dereferenced on the line right after compile() and
// `matcher` on the line after that, with no NULL or status check in between.
// If compile() or matcher() can return NULL on failure, this relies on
// calling through a NULL pointer -- confirm and guard if so.
// NOTE(review): `®ex` in the signature looks like a mis-decoded `&regex`
// -- confirm against the upstream file.
511 UBool U_EXPORT2 RegexPattern::matches(const UnicodeString ®ex,
512 const UnicodeString &input,
514 UErrorCode &status) {
516 if (U_FAILURE(status)) {return FALSE;}
519 RegexPattern *pat = NULL;
520 RegexMatcher *matcher = NULL;
522 pat = RegexPattern::compile(regex, 0, pe, status);
523 matcher = pat->matcher(input, status);
524 retVal = matcher->matches(status);
533 // matches, UText mode
// One-shot convenience, UText mode: compiles `regex` (flags 0), creates a
// matcher, and -- only when `status` still indicates success -- resets the
// matcher onto `input` and runs matches(). The result defaults to FALSE.
// Returns FALSE immediately if `status` already indicates failure on entry.
// NOTE(review): `pat` is dereferenced on the line right after compile() with
// no NULL or status check in between. If compile() can return NULL on
// failure, this relies on calling through a NULL pointer -- confirm.
535 UBool U_EXPORT2 RegexPattern::matches(UText *regex,
538 UErrorCode &status) {
540 if (U_FAILURE(status)) {return FALSE;}
542 UBool retVal = FALSE;
543 RegexPattern *pat = NULL;
544 RegexMatcher *matcher = NULL;
546 pat = RegexPattern::compile(regex, 0, pe, status);
547 matcher = pat->matcher(status);
548 if (U_SUCCESS(status)) {
549 matcher->reset(input);
550 retVal = matcher->matches(status);
562 //---------------------------------------------------------------------
566 //---------------------------------------------------------------------
// Returns the pattern text as a UnicodeString.
//   - If fPatternString is set, a copy of it is returned.
//   - If there is no fPattern either, an empty string is returned.
//   - Otherwise the text is extracted from the fPattern UText in two passes:
//     the first utext_extract() call (NULL buffer, capacity 0) only obtains
//     the UTF-16 length; after `status` is reset, the second call fills a
//     buffer obtained from result.getBuffer(len16), which is then released
//     with that same length.
// The status codes set by the two extract calls are expected (see the
// trailing comments on those lines) and are kept in a local variable only.
567 UnicodeString RegexPattern::pattern() const {
568 if (fPatternString != NULL) {
569 return *fPatternString;
570 } else if (fPattern == NULL) {
571 return UnicodeString();
573 UErrorCode status = U_ZERO_ERROR;
574 int64_t nativeLen = utext_nativeLength(fPattern);
575 int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error
576 UnicodeString result;
578 status = U_ZERO_ERROR;
579 UChar *resultChars = result.getBuffer(len16);
580 utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning
581 result.releaseBuffer(len16);
590 //---------------------------------------------------------------------
594 //---------------------------------------------------------------------
// Returns the pattern text as a UText.
// Returns NULL if `status` already indicates failure on entry. The body
// branches on whether fPattern is non-NULL; the fallback shown initializes
// the global RegexStaticSets and returns their fEmptyText member.
595 UText *RegexPattern::patternText(UErrorCode &status) const {
596 if (U_FAILURE(status)) {return NULL;}
597 status = U_ZERO_ERROR;
599 if (fPattern != NULL) {
602 RegexStaticSets::initGlobals(&status);
603 return RegexStaticSets::gStaticSets->fEmptyText;
608 //--------------------------------------------------------------------------------
610 // groupNumberFromName()
612 //--------------------------------------------------------------------------------
// Looks up a named capture group and yields its group number.
// The name is looked up in fNamedCaptureMap with uhash_geti();
// U_REGEX_INVALID_CAPTURE_GROUP_NAME is the error this function reports
// through `status`.
// NOTE(review): presumably set when the lookup finds no entry -- confirm.
613 int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const {
614 if (U_FAILURE(status)) {
618 // No need to explicitly check for syntactically valid names.
619 // Invalid ones will never be in the map, and the lookup will fail.
621 int32_t number = uhash_geti(fNamedCaptureMap, &groupName);
623 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
// char* overload: builds a UnicodeString from `groupName` / `nameLength`
// using the US_INV (invariant-character) constructor and delegates to the
// UnicodeString overload.
628 int32_t RegexPattern::groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const {
629 if (U_FAILURE(status)) {
632 UnicodeString name(groupName, nameLength, US_INV);
633 return groupNumberFromName(name, status);
637 //---------------------------------------------------------------------
641 //---------------------------------------------------------------------
// Splits `input` around matches of this pattern, writing the pieces into
// `dest` (which has room for `destCapacity` strings).
// A temporary RegexMatcher is built on this pattern, and the split is only
// attempted when the matcher's fDeferredStatus indicates success.
642 int32_t RegexPattern::split(const UnicodeString &input,
643 UnicodeString dest[],
644 int32_t destCapacity,
645 UErrorCode &status) const
647 if (U_FAILURE(status)) {
651 RegexMatcher m(this);
653 // Check m's status to make sure all is ok.
654 if (U_SUCCESS(m.fDeferredStatus)) {
655 r = m.split(input, dest, destCapacity, status);
// UText variant of split(): splits `input` around matches of this pattern
// into `dest` (capacity `destCapacity`).
// A temporary RegexMatcher is built on this pattern, and the split is only
// attempted when the matcher's fDeferredStatus indicates success.
663 int32_t RegexPattern::split(UText *input,
665 int32_t destCapacity,
666 UErrorCode &status) const
668 if (U_FAILURE(status)) {
672 RegexMatcher m(this);
674 // Check m's status to make sure all is ok.
675 if (U_SUCCESS(m.fDeferredStatus)) {
676 r = m.split(input, dest, destCapacity, status);
682 //---------------------------------------------------------------------
684 // dump Output the compiled form of the pattern.
685 // Debugging function only.
687 //---------------------------------------------------------------------
// Debug helper: prints the single compiled operation stored at `index` in
// fCompiledPat.
//
// The printing code is compiled only when REGEX_DEBUG is defined; otherwise
// `index` is merely cast to void to silence unused-parameter warnings.
// In debug builds the op word is split into its value and type with URX_VAL
// and URX_TYPE, the type is range-checked against the opNames table before
// being used as an index into it, and the index, raw op word and opcode name
// are printed. A switch over the opcode type then prints operand details
// where they exist, e.g. the literal text for string ops (its length comes
// from the following URX_STRING_LEN op) and toPattern() output for set
// references.
688 void RegexPattern::dumpOp(int32_t index) const {
689 (void)index; // Suppress warnings in non-debug build.
690 #if defined(REGEX_DEBUG)
691 static const char * const opNames[] = {URX_OPCODE_NAMES};
692 int32_t op = fCompiledPat->elementAti(index);
693 int32_t val = URX_VAL(op);
694 int32_t type = URX_TYPE(op);
695 int32_t pinnedType = type;
696 if ((uint32_t)pinnedType >= UPRV_LENGTHOF(opNames)) {
700 printf("%4d %08x %-15s ", index, op, opNames[pinnedType]);
708 case URX_BACKSLASH_G:
709 case URX_BACKSLASH_X:
713 // Types with no operand field of interest.
716 case URX_RESERVED_OP:
717 case URX_START_CAPTURE:
718 case URX_END_CAPTURE:
723 case URX_BACKSLASH_B:
724 case URX_BACKSLASH_BU:
725 case URX_BACKSLASH_D:
726 case URX_BACKSLASH_Z:
729 case URX_CTR_INIT_NG:
731 case URX_CTR_LOOP_NG:
732 case URX_RELOC_OPRND:
736 case URX_STO_INP_LOC:
748 case URX_BACKSLASH_H:
749 case URX_BACKSLASH_R:
750 case URX_BACKSLASH_V:
751 // types with an integer operand field.
760 printf("'%s'", CStr(UnicodeString(val))());
767 int32_t lengthOp = fCompiledPat->elementAti(index+1);
768 U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
769 int32_t length = URX_VAL(lengthOp);
770 UnicodeString str(fLiteralText, val, length);
771 printf("%s", CStr(str)());
779 UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
780 set->toPattern(s, TRUE);
781 printf("%s", CStr(s)());
785 case URX_STATIC_SETREF:
786 case URX_STAT_SETREF_N:
789 if (val & URX_NEG_SET) {
793 UnicodeSet *set = fStaticSets[val];
794 set->toPattern(s, TRUE);
795 printf("%s", CStr(s)());
// Debug helper: prints a human-readable summary of this pattern.
//
// The body is compiled only when REGEX_DEBUG is defined. It prints, in
// order: the original pattern text (gathered by iterating the fPattern UText
// code point by code point), the minimum match length, the match start type
// together with its associated data (initial string for START_STRING, the
// first-character set for START_SET, the first character for START_CHAR --
// shown quoted when above 0x20, otherwise in hex), the named capture groups
// with their group numbers, and finally a table header followed by a loop
// over every index of fCompiledPat.
809 void RegexPattern::dumpPattern() const {
810 #if defined(REGEX_DEBUG)
813 UnicodeString patStr;
814 for (UChar32 c = utext_next32From(fPattern, 0); c != U_SENTINEL; c = utext_next32(fPattern)) {
817 printf("Original Pattern: \"%s\"\n", CStr(patStr)());
818 printf(" Min Match Length: %d\n", fMinMatchLen);
819 printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType));
820 if (fStartType == START_STRING) {
821 UnicodeString initialString(fLiteralText,fInitialStringIdx, fInitialStringLen);
822 printf(" Initial match string: \"%s\"\n", CStr(initialString)());
823 } else if (fStartType == START_SET) {
825 fInitialChars->toPattern(s, TRUE);
826 printf(" Match First Chars: %s\n", CStr(s)());
828 } else if (fStartType == START_CHAR) {
829 printf(" First char of Match: ");
830 if (fInitialChar > 0x20) {
831 printf("'%s'\n", CStr(UnicodeString(fInitialChar))());
833 printf("%#x\n", fInitialChar);
837 printf("Named Capture Groups:\n");
838 if (uhash_count(fNamedCaptureMap) == 0) {
841 int32_t pos = UHASH_FIRST;
842 const UHashElement *el = NULL;
843 while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) {
844 const UnicodeString *name = (const UnicodeString *)el->key.pointer;
845 int32_t number = el->value.integer;
846 printf(" %d\t%s\n", number, CStr(*name)());
850 printf("\nIndex Binary Type Operand\n" \
851 "-------------------------------------------\n");
852 for (index = 0; index<fCompiledPat->size(); index++) {
// ICU run-time type identification boilerplate for RegexPattern.
// NOTE(review): presumably expands to the class-ID accessor definitions
// required of UObject subclasses -- confirm against the macro definition.
861 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
864 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS