1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2004-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: uregex.cpp
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15 #include "unicode/regex.h"
16 #include "unicode/uregex.h"
17 #include "unicode/unistr.h"
18 #include "unicode/ustring.h"
19 #include "unicode/uchar.h"
20 #include "unicode/uobject.h"
21 #include "unicode/utf16.h"
// Space left in a buffer of capacity `len` once `idx` units have been used.
// The result is clamped to 0, so it is never negative when idx exceeds len.
32 #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
// State behind the opaque URegularExpression handle used by the uregex_* functions.
// (Only some of the members are visible in this extract.)
34 struct RegularExpression: public UMemory {
// Reference count for the shared pattern data: incremented in uregex_clone(),
// decremented in the destructor.
40 u_atomic_int32_t *fPatRefCount;
// Length value reported by uregex_pattern() for fPatString.
42 int32_t fPatStringLen;
// Matcher through which all match operations for this handle are performed.
43 RegexMatcher *fMatcher;
44 const UChar *fText; // Text from setText()
45 int32_t fTextLength; // Length provided by user with setText(), which
// Tag value that validateRE() compares against fMagic to reject pointers
// that do not refer to a RegularExpression object.
50 static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
52 RegularExpression::RegularExpression() {
// Destructor: gives up this object's reference on the shared pattern data and
// releases the input text buffer when this object owns it.
64 RegularExpression::~RegularExpression() {
// The pattern buffers are freed only when the decrement brings the count to 0,
// i.e. no other clone still references them.
67 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
69 uprv_free(fPatString);
70 uprv_free((void *)fPatRefCount);
// fText is freed only when fOwnsText is set; uregex_setText() stores the caller's
// pointer with fOwnsText == FALSE, so caller memory is left alone.
72 if (fOwnsText && fText!=NULL) {
73 uprv_free((void *)fText);
82 //----------------------------------------------------------------------------------------
84 // validateRE Do boilerplate style checks on API function parameters.
85 // Return TRUE if they look OK.
86 //----------------------------------------------------------------------------------------
// Shared argument check for the uregex_* entry points.
//   re           - handle to check; must be non-NULL and have fMagic == REXP_MAGIC.
//   requiresText - when TRUE, additionally require that input text has been supplied.
//   status       - in/out error code; an incoming failure code fails the check immediately.
// Sets U_ILLEGAL_ARGUMENT_ERROR for a bad handle and U_REGEX_INVALID_STATE when
// text is required but missing.
87 static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) {
88 if (U_FAILURE(*status)) {
91 if (re == NULL || re->fMagic != REXP_MAGIC) {
92 *status = U_ILLEGAL_ARGUMENT_ERROR;
95 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
// Text counts as "set" if fText is non-NULL, or if fOwnsText is TRUE
// (uregex_setUText() leaves fText NULL but sets fOwnsText).
96 if (requiresText && re->fText == NULL && !re->fOwnsText) {
97 *status = U_REGEX_INVALID_STATE;
103 //----------------------------------------------------------------------------------------
107 //----------------------------------------------------------------------------------------
// Opens a regular expression from a UChar pattern string: validates the arguments,
// keeps a private NUL-terminated copy of the pattern, compiles it, and creates a matcher.
//   pattern       - pattern text; must not be NULL.
//   patternLength - length in UChars, or -1 for a NUL-terminated string; 0 is rejected.
//   status        - in/out error code; nothing is done if it already indicates failure.
// Returns the new handle when compilation and matcher creation succeed.
// (The flags / parse-error parameters and the failure-path cleanup are on lines
// not shown in this extract.)
108 U_CAPI URegularExpression * U_EXPORT2
109 uregex_open( const UChar *pattern,
110 int32_t patternLength,
113 UErrorCode *status) {
115 if (U_FAILURE(*status)) {
118 if (pattern == NULL || patternLength < -1 || patternLength == 0) {
119 *status = U_ILLEGAL_ARGUMENT_ERROR;
// Resolve the -1 "NUL-terminated" convention into a real length for sizing the copy.
122 int32_t actualPatLen = patternLength;
123 if (actualPatLen == -1) {
124 actualPatLen = u_strlen(pattern);
127 RegularExpression *re = new RegularExpression;
// NOTE(review): allocated with sizeof(int32_t) but used as u_atomic_int32_t;
// confirm the two types have the same size on all supported platforms.
128 u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
// +1 leaves room for the terminating NUL written after the copy.
129 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
130 if (re == NULL || refC == NULL || patBuf == NULL) {
131 *status = U_MEMORY_ALLOCATION_ERROR;
133 uprv_free((void *)refC);
// The pattern data starts out with a single reference, held by this object.
137 re->fPatRefCount = refC;
138 *re->fPatRefCount = 1;
141 // Make a copy of the pattern string, so we can return it later if asked.
142 // For compiling the pattern, we will use a UText wrapper around
143 // this local copy, to avoid making even more copies.
145 re->fPatString = patBuf;
// Note: this records the caller's patternLength (possibly -1), not actualPatLen;
// uregex_pattern() hands this value back unchanged.
146 re->fPatStringLen = patternLength;
147 u_memcpy(patBuf, pattern, actualPatLen);
148 patBuf[actualPatLen] = 0;
150 UText patText = UTEXT_INITIALIZER;
// patternLength may be -1 here; patBuf was NUL-terminated just above.
151 utext_openUChars(&patText, patBuf, patternLength, status);
154 // Compile the pattern
// Two compile() overloads appear: one taking *pe and one without. Presumably the
// choice depends on whether pe is NULL (the selecting branch is not visible).
157 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
159 re->fPat = RegexPattern::compile(&patText, flags, *status);
161 utext_close(&patText);
163 if (U_FAILURE(*status)) {
168 // Create the matcher object
170 re->fMatcher = re->fPat->matcher(*status);
171 if (U_SUCCESS(*status)) {
172 return (URegularExpression*)re;
181 //----------------------------------------------------------------------------------------
185 //----------------------------------------------------------------------------------------
// Opens a regular expression from a UText pattern: extracts the pattern into a
// private UChar buffer, compiles it from that copy, and creates a matcher.
//   pattern - pattern text; must not be NULL and must not have native length 0.
//   status  - in/out error code; nothing is done if it already indicates failure.
// Returns the new handle when compilation and matcher creation succeed.
// (The flags / parse-error parameters and the failure-path cleanup are on lines
// not shown in this extract.)
186 U_CAPI URegularExpression * U_EXPORT2
187 uregex_openUText(UText *pattern,
190 UErrorCode *status) {
192 if (U_FAILURE(*status)) {
195 if (pattern == NULL) {
196 *status = U_ILLEGAL_ARGUMENT_ERROR;
200 int64_t patternNativeLength = utext_nativeLength(pattern);
202 if (patternNativeLength == 0) {
203 *status = U_ILLEGAL_ARGUMENT_ERROR;
207 RegularExpression *re = new RegularExpression;
// Preflight: extracting into a NULL, zero-capacity buffer yields the UTF-16 length.
// A separate status variable is used so the result of this call does not touch *status.
209 UErrorCode lengthStatus = U_ZERO_ERROR;
210 int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
// NOTE(review): allocated with sizeof(int32_t) but used as u_atomic_int32_t;
// confirm the two types have the same size on all supported platforms.
212 u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
213 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
214 if (re == NULL || refC == NULL || patBuf == NULL) {
215 *status = U_MEMORY_ALLOCATION_ERROR;
217 uprv_free((void *)refC);
// The pattern data starts out with a single reference, held by this object.
221 re->fPatRefCount = refC;
222 *re->fPatRefCount = 1;
225 // Make a copy of the pattern string, so we can return it later if asked.
226 // For compiling the pattern, we will use a read-only UText wrapper
227 // around this local copy, to avoid making even more copies.
229 re->fPatString = patBuf;
230 re->fPatStringLen = pattern16Length;
// Second extract performs the real copy; capacity pattern16Length+1 matches the allocation.
231 utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
233 UText patText = UTEXT_INITIALIZER;
234 utext_openUChars(&patText, patBuf, pattern16Length, status);
237 // Compile the pattern
// Two compile() overloads appear: one taking *pe and one without. Presumably the
// choice depends on whether pe is NULL (the selecting branch is not visible).
240 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
242 re->fPat = RegexPattern::compile(&patText, flags, *status);
244 utext_close(&patText);
246 if (U_FAILURE(*status)) {
251 // Create the matcher object
253 re->fMatcher = re->fPat->matcher(*status);
254 if (U_SUCCESS(*status)) {
255 return (URegularExpression*)re;
264 //----------------------------------------------------------------------------------------
268 //----------------------------------------------------------------------------------------
// Closes a handle. The handle is validated with a local status variable because
// this function has no UErrorCode parameter; what happens after validation
// (presumably deleting the object) is on lines not shown in this extract.
269 U_CAPI void U_EXPORT2
270 uregex_close(URegularExpression *re2) {
271 RegularExpression *re = (RegularExpression*)re2;
272 UErrorCode status = U_ZERO_ERROR;
273 if (validateRE(re, FALSE, &status) == FALSE) {
280 //----------------------------------------------------------------------------------------
284 //----------------------------------------------------------------------------------------
// Creates a second handle that shares the source's compiled pattern but has its
// own matcher. The pattern object, its string copy and the reference counter are
// shared by pointer, and the counter is incremented; input text is not carried over.
//   source2 - handle to clone; validated without requiring text.
//   status  - in/out error code.
285 U_CAPI URegularExpression * U_EXPORT2
286 uregex_clone(const URegularExpression *source2, UErrorCode *status) {
287 RegularExpression *source = (RegularExpression*)source2;
288 if (validateRE(source, FALSE, status) == FALSE) {
292 RegularExpression *clone = new RegularExpression;
294 *status = U_MEMORY_ALLOCATION_ERROR;
// The clone gets an independent matcher built from the shared pattern.
298 clone->fMatcher = source->fPat->matcher(*status);
299 if (U_FAILURE(*status)) {
304 clone->fPat = source->fPat;
305 clone->fPatRefCount = source->fPatRefCount;
306 clone->fPatString = source->fPatString;
307 clone->fPatStringLen = source->fPatStringLen;
// Record the additional owner of the shared pattern data.
308 umtx_atomic_inc(source->fPatRefCount);
309 // Note: fText is not cloned.
311 return (URegularExpression*)clone;
317 //------------------------------------------------------------------------------
321 //------------------------------------------------------------------------------
// Returns the stored copy of the pattern string (fPatString).
//   patLength - optional out-parameter; when non-NULL it receives fPatStringLen.
//               For handles made by uregex_open() that value is the length the
//               caller originally passed, which may be -1.
//   status    - in/out error code.
322 U_CAPI const UChar * U_EXPORT2
323 uregex_pattern(const URegularExpression *regexp2,
325 UErrorCode *status) {
326 RegularExpression *regexp = (RegularExpression*)regexp2;
328 if (validateRE(regexp, FALSE, status) == FALSE) {
331 if (patLength != NULL) {
332 *patLength = regexp->fPatStringLen;
334 return regexp->fPatString;
338 //------------------------------------------------------------------------------
340 // uregex_patternUText
342 //------------------------------------------------------------------------------
// Returns the pattern as a UText by delegating to RegexPattern::patternText().
// NOTE(review): regexp is dereferenced here without a validateRE() check, unlike
// the neighbouring accessors; a NULL or invalid handle is not rejected. Confirm
// whether that is intentional.
343 U_CAPI UText * U_EXPORT2
344 uregex_patternUText(const URegularExpression *regexp2,
345 UErrorCode *status) {
346 RegularExpression *regexp = (RegularExpression*)regexp2;
347 return regexp->fPat->patternText(*status);
351 //------------------------------------------------------------------------------
355 //------------------------------------------------------------------------------
// Reads the flags of the compiled pattern via RegexPattern::flags() after
// validating the handle (input text is not required).
356 U_CAPI int32_t U_EXPORT2
357 uregex_flags(const URegularExpression *regexp2, UErrorCode *status) {
358 RegularExpression *regexp = (RegularExpression*)regexp2;
359 if (validateRE(regexp, FALSE, status) == FALSE) {
362 int32_t flags = regexp->fPat->flags();
367 //------------------------------------------------------------------------------
371 //------------------------------------------------------------------------------
// Sets the subject text from a UChar string. The handle stores the caller's
// pointer rather than a copy (fOwnsText is set to FALSE), so the caller's buffer
// is not freed by this object.
//   text       - subject string; must not be NULL.
//   textLength - length of text; values below -1 are rejected.
//   status     - in/out error code.
372 U_CAPI void U_EXPORT2
373 uregex_setText(URegularExpression *regexp2,
376 UErrorCode *status) {
377 RegularExpression *regexp = (RegularExpression*)regexp2;
378 if (validateRE(regexp, FALSE, status) == FALSE) {
381 if (text == NULL || textLength < -1) {
382 *status = U_ILLEGAL_ARGUMENT_ERROR;
// Release a previously owned text buffer before replacing it.
386 if (regexp->fOwnsText && regexp->fText != NULL) {
387 uprv_free((void *)regexp->fText);
390 regexp->fText = text;
391 regexp->fTextLength = textLength;
392 regexp->fOwnsText = FALSE;
// NOTE(review): the matcher is reset with `input` without first checking *status
// after utext_openUChars(); confirm this is safe if the open fails.
394 UText input = UTEXT_INITIALIZER;
395 utext_openUChars(&input, text, textLength, status);
396 regexp->fMatcher->reset(&input);
397 utext_close(&input); // reset() made a shallow clone, so we don't need this copy
401 //------------------------------------------------------------------------------
405 //------------------------------------------------------------------------------
// Sets the subject text from a UText. No UChar copy is made at this point: fText
// is left NULL and is filled in lazily by uregex_getText(). fOwnsText is set to
// TRUE, which also lets validateRE() treat the text as present.
// (The check that raises U_ILLEGAL_ARGUMENT_ERROR is on a line not shown here.)
406 U_CAPI void U_EXPORT2
407 uregex_setUText(URegularExpression *regexp2,
409 UErrorCode *status) {
410 RegularExpression *regexp = (RegularExpression*)regexp2;
411 if (validateRE(regexp, FALSE, status) == FALSE) {
415 *status = U_ILLEGAL_ARGUMENT_ERROR;
// Release a previously owned text buffer before replacing it.
419 if (regexp->fOwnsText && regexp->fText != NULL) {
420 uprv_free((void *)regexp->fText);
423 regexp->fText = NULL; // only fill it in on request
424 regexp->fTextLength = -1;
425 regexp->fOwnsText = TRUE;
426 regexp->fMatcher->reset(text);
431 //------------------------------------------------------------------------------
435 //------------------------------------------------------------------------------
// Returns the subject text as a UChar string, materialising it from the matcher's
// UText on first request if fText is still NULL.
//   textLength - optional out-parameter; when non-NULL it receives fTextLength.
//   status     - in/out error code.
436 U_CAPI const UChar * U_EXPORT2
437 uregex_getText(URegularExpression *regexp2,
439 UErrorCode *status) {
440 RegularExpression *regexp = (RegularExpression*)regexp2;
441 if (validateRE(regexp, FALSE, status) == FALSE) {
445 if (regexp->fText == NULL) {
446 // need to fill in the text
447 UText *inputText = regexp->fMatcher->inputText();
448 int64_t inputNativeLength = utext_nativeLength(inputText);
// Case 1: the UText already exposes the whole text in one UChar chunk, so
// point at it directly instead of copying.
449 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
450 regexp->fText = inputText->chunkContents;
451 regexp->fTextLength = (int32_t)inputNativeLength;
452 regexp->fOwnsText = FALSE; // because the UText owns it
// Case 2: preflight the UTF-16 length, then extract into a buffer owned by this object.
454 UErrorCode lengthStatus = U_ZERO_ERROR;
455 regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
// NOTE(review): no NULL check on this allocation is visible before it is
// passed to utext_extract(); confirm.
456 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
458 utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
459 regexp->fText = inputChars;
460 regexp->fOwnsText = TRUE; // should already be set but just in case
464 if (textLength != NULL) {
465 *textLength = regexp->fTextLength;
467 return regexp->fText;
471 //------------------------------------------------------------------------------
475 //------------------------------------------------------------------------------
// Returns the subject text as a UText by delegating to RegexMatcher::getInput()
// with the caller-supplied dest, after validating the handle.
476 U_CAPI UText * U_EXPORT2
477 uregex_getUText(URegularExpression *regexp2,
479 UErrorCode *status) {
480 RegularExpression *regexp = (RegularExpression*)regexp2;
481 if (validateRE(regexp, FALSE, status) == FALSE) {
484 return regexp->fMatcher->getInput(dest, *status);
488 //------------------------------------------------------------------------------
490 // uregex_refreshUText
492 //------------------------------------------------------------------------------
// Forwards to RegexMatcher::refreshInputText() with the supplied text after
// validating the handle (input text is not required by the validation).
493 U_CAPI void U_EXPORT2
494 uregex_refreshUText(URegularExpression *regexp2,
496 UErrorCode *status) {
497 RegularExpression *regexp = (RegularExpression*)regexp2;
498 if (validateRE(regexp, FALSE, status) == FALSE) {
501 regexp->fMatcher->refreshInputText(text, *status);
505 //------------------------------------------------------------------------------
509 //------------------------------------------------------------------------------
// Convenience form: widens startIndex to int64_t and delegates to uregex_matches64().
510 U_CAPI UBool U_EXPORT2
511 uregex_matches(URegularExpression *regexp2,
513 UErrorCode *status) {
514 return uregex_matches64( regexp2, (int64_t)startIndex, status);
// Runs RegexMatcher::matches() on the current input. Input text must have been
// set (validateRE is called with requiresText == TRUE).
// A startIndex of -1 selects the overload without a start position; any other
// value is passed to the overload that takes one.
517 U_CAPI UBool U_EXPORT2
518 uregex_matches64(URegularExpression *regexp2,
520 UErrorCode *status) {
521 RegularExpression *regexp = (RegularExpression*)regexp2;
522 UBool result = FALSE;
523 if (validateRE(regexp, TRUE, status) == FALSE) {
526 if (startIndex == -1) {
527 result = regexp->fMatcher->matches(*status);
529 result = regexp->fMatcher->matches(startIndex, *status);
535 //------------------------------------------------------------------------------
539 //------------------------------------------------------------------------------
// Convenience form: widens startIndex to int64_t and delegates to uregex_lookingAt64().
540 U_CAPI UBool U_EXPORT2
541 uregex_lookingAt(URegularExpression *regexp2,
543 UErrorCode *status) {
544 return uregex_lookingAt64( regexp2, (int64_t)startIndex, status);
// Runs RegexMatcher::lookingAt() on the current input. Input text must have been
// set (validateRE is called with requiresText == TRUE).
// A startIndex of -1 selects the overload without a start position; any other
// value is passed to the overload that takes one.
547 U_CAPI UBool U_EXPORT2
548 uregex_lookingAt64(URegularExpression *regexp2,
550 UErrorCode *status) {
551 RegularExpression *regexp = (RegularExpression*)regexp2;
552 UBool result = FALSE;
553 if (validateRE(regexp, TRUE, status) == FALSE) {
556 if (startIndex == -1) {
557 result = regexp->fMatcher->lookingAt(*status);
559 result = regexp->fMatcher->lookingAt(startIndex, *status);
566 //------------------------------------------------------------------------------
570 //------------------------------------------------------------------------------
// Convenience form: widens startIndex to int64_t and delegates to uregex_find64().
571 U_CAPI UBool U_EXPORT2
572 uregex_find(URegularExpression *regexp2,
574 UErrorCode *status) {
575 return uregex_find64( regexp2, (int64_t)startIndex, status);
// Searches the input with RegexMatcher::find(). Input text must have been set.
// With startIndex == -1 the matcher is first reset via resetPreserveRegion() and
// then find() is called without a position; any other value is passed to
// find(startIndex, ...).
578 U_CAPI UBool U_EXPORT2
579 uregex_find64(URegularExpression *regexp2,
581 UErrorCode *status) {
582 RegularExpression *regexp = (RegularExpression*)regexp2;
583 UBool result = FALSE;
584 if (validateRE(regexp, TRUE, status) == FALSE) {
587 if (startIndex == -1) {
588 regexp->fMatcher->resetPreserveRegion();
589 result = regexp->fMatcher->find(*status);
591 result = regexp->fMatcher->find(startIndex, *status);
597 //------------------------------------------------------------------------------
601 //------------------------------------------------------------------------------
// Continues searching by calling RegexMatcher::find() without resetting the
// matcher. Input text must have been set.
602 U_CAPI UBool U_EXPORT2
603 uregex_findNext(URegularExpression *regexp2,
604 UErrorCode *status) {
605 RegularExpression *regexp = (RegularExpression*)regexp2;
606 if (validateRE(regexp, TRUE, status) == FALSE) {
609 UBool result = regexp->fMatcher->find(*status);
613 //------------------------------------------------------------------------------
617 //------------------------------------------------------------------------------
// Reads the group count from RegexMatcher::groupCount() after validating the
// handle; input text is not required.
618 U_CAPI int32_t U_EXPORT2
619 uregex_groupCount(URegularExpression *regexp2,
620 UErrorCode *status) {
621 RegularExpression *regexp = (RegularExpression*)regexp2;
622 if (validateRE(regexp, FALSE, status) == FALSE) {
625 int32_t result = regexp->fMatcher->groupCount();
630 //------------------------------------------------------------------------------
632 // uregex_groupNumberFromName
634 //------------------------------------------------------------------------------
// Looks up a named group's number. The UChar name is wrapped in a temporary
// UnicodeString built from (groupName, nameLength) and passed to
// RegexPattern::groupNumberFromName().
636 uregex_groupNumberFromName(URegularExpression *regexp2,
637 const UChar *groupName,
639 UErrorCode *status) {
640 RegularExpression *regexp = (RegularExpression*)regexp2;
641 if (validateRE(regexp, FALSE, status) == FALSE) {
644 int32_t result = regexp->fPat->groupNumberFromName(UnicodeString(groupName, nameLength), *status);
// Looks up a named group's number from a char* name, passing groupName and
// nameLength straight to the (const char *, length) overload of
// RegexPattern::groupNumberFromName().
649 uregex_groupNumberFromCName(URegularExpression *regexp2,
650 const char *groupName,
652 UErrorCode *status) {
653 RegularExpression *regexp = (RegularExpression*)regexp2;
654 if (validateRE(regexp, FALSE, status) == FALSE) {
657 return regexp->fPat->groupNumberFromName(groupName, nameLength, *status);
660 //------------------------------------------------------------------------------
664 //------------------------------------------------------------------------------
// Copies the text of capture group groupNum from the most recent match into dest.
//   destCapacity - capacity of dest in UChars; must be >= 0, and dest must be
//                  non-NULL whenever it is > 0. A capacity of 0 is a preflight.
//   status       - in/out error code; input text must have been set.
// Two paths: a direct copy out of fText when preflighting or when the UChar text
// is already available, otherwise an extraction from the matcher's UText.
665 U_CAPI int32_t U_EXPORT2
666 uregex_group(URegularExpression *regexp2,
669 int32_t destCapacity,
670 UErrorCode *status) {
671 RegularExpression *regexp = (RegularExpression*)regexp2;
672 if (validateRE(regexp, TRUE, status) == FALSE) {
675 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
676 *status = U_ILLEGAL_ARGUMENT_ERROR;
680 if (destCapacity == 0 || regexp->fText != NULL) {
681 // If preflighting or if we already have the text as UChars,
682 // this is a little cheaper than extracting from the UText
685 // Pick up the range of characters from the matcher
687 int32_t startIx = regexp->fMatcher->start(groupNum, *status);
688 int32_t endIx = regexp->fMatcher->end (groupNum, *status);
689 if (U_FAILURE(*status)) {
694 // Trim length based on buffer capacity
// Three cases: room to spare -> NUL-terminate; exact fit -> warn that the result
// is not terminated; too small -> copy only destCapacity units and report overflow.
696 int32_t fullLength = endIx - startIx;
697 int32_t copyLength = fullLength;
698 if (copyLength < destCapacity) {
699 dest[copyLength] = 0;
700 } else if (copyLength == destCapacity) {
701 *status = U_STRING_NOT_TERMINATED_WARNING;
703 copyLength = destCapacity;
704 *status = U_BUFFER_OVERFLOW_ERROR;
708 // Copy capture group to user's buffer
// NOTE(review): "®exp" on the u_memcpy line below looks like a mis-decoded
// "&regexp" (HTML entity "&reg"); restore it from the upstream source.
710 if (copyLength > 0) {
711 u_memcpy(dest, ®exp->fText[startIx], copyLength);
// UText path: use the 64-bit native offsets and let utext_extract() do the copy.
715 int64_t start = regexp->fMatcher->start64(groupNum, *status);
716 int64_t limit = regexp->fMatcher->end64(groupNum, *status);
717 if (U_FAILURE(*status)) {
721 // Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result.
722 // Zero Length Match: start == end.
723 int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status);
730 //------------------------------------------------------------------------------
734 //------------------------------------------------------------------------------
// Returns capture group groupNum as a UText via RegexMatcher::group().
// If validation fails, returns dest when one was supplied, otherwise a newly
// opened empty UText (opened with a separate local status).
// NOTE(review): groupLength is dereferenced unconditionally on the success path;
// callers must pass a non-NULL pointer. Confirm against the API contract.
735 U_CAPI UText * U_EXPORT2
736 uregex_groupUText(URegularExpression *regexp2,
739 int64_t *groupLength,
740 UErrorCode *status) {
741 RegularExpression *regexp = (RegularExpression*)regexp2;
742 if (validateRE(regexp, TRUE, status) == FALSE) {
743 UErrorCode emptyTextStatus = U_ZERO_ERROR;
744 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
747 return regexp->fMatcher->group(groupNum, dest, *groupLength, *status);
750 //------------------------------------------------------------------------------
754 //------------------------------------------------------------------------------
// Delegates to uregex_start64() and narrows the result to int32_t.
755 U_CAPI int32_t U_EXPORT2
756 uregex_start(URegularExpression *regexp2,
758 UErrorCode *status) {
759 return (int32_t)uregex_start64( regexp2, groupNum, status);
// Reads the start position of group groupNum from the matcher. Input text must
// have been set.
// NOTE(review): this function is declared to return int64_t but stores the value
// from RegexMatcher::start() in an int32_t; uregex_group() uses start64() for
// 64-bit offsets. Confirm whether start64() is the intended call here.
762 U_CAPI int64_t U_EXPORT2
763 uregex_start64(URegularExpression *regexp2,
765 UErrorCode *status) {
766 RegularExpression *regexp = (RegularExpression*)regexp2;
767 if (validateRE(regexp, TRUE, status) == FALSE) {
770 int32_t result = regexp->fMatcher->start(groupNum, *status);
774 //------------------------------------------------------------------------------
778 //------------------------------------------------------------------------------
// Delegates to uregex_end64() and narrows the result to int32_t.
779 U_CAPI int32_t U_EXPORT2
780 uregex_end(URegularExpression *regexp2,
782 UErrorCode *status) {
783 return (int32_t)uregex_end64( regexp2, groupNum, status);
// Reads the end position of group groupNum from the matcher. Input text must
// have been set.
// NOTE(review): this function is declared to return int64_t but stores the value
// from RegexMatcher::end() in an int32_t; uregex_group() uses end64() for 64-bit
// offsets. Confirm whether end64() is the intended call here.
786 U_CAPI int64_t U_EXPORT2
787 uregex_end64(URegularExpression *regexp2,
789 UErrorCode *status) {
790 RegularExpression *regexp = (RegularExpression*)regexp2;
791 if (validateRE(regexp, TRUE, status) == FALSE) {
794 int32_t result = regexp->fMatcher->end(groupNum, *status);
798 //------------------------------------------------------------------------------
802 //------------------------------------------------------------------------------
// Convenience form: widens index to int64_t and delegates to uregex_reset64().
803 U_CAPI void U_EXPORT2
804 uregex_reset(URegularExpression *regexp2,
806 UErrorCode *status) {
807 uregex_reset64( regexp2, (int64_t)index, status);
// Resets the matcher to the given index via RegexMatcher::reset(index, status).
// Input text must have been set.
810 U_CAPI void U_EXPORT2
811 uregex_reset64(URegularExpression *regexp2,
813 UErrorCode *status) {
814 RegularExpression *regexp = (RegularExpression*)regexp2;
815 if (validateRE(regexp, TRUE, status) == FALSE) {
818 regexp->fMatcher->reset(index, *status);
822 //------------------------------------------------------------------------------
826 //------------------------------------------------------------------------------
// Convenience form: widens both bounds to int64_t and delegates to uregex_setRegion64().
827 U_CAPI void U_EXPORT2
828 uregex_setRegion(URegularExpression *regexp2,
831 UErrorCode *status) {
832 uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status);
// Sets the matcher's region to [regionStart, regionLimit) arguments via
// RegexMatcher::region(). Input text must have been set.
835 U_CAPI void U_EXPORT2
836 uregex_setRegion64(URegularExpression *regexp2,
839 UErrorCode *status) {
840 RegularExpression *regexp = (RegularExpression*)regexp2;
841 if (validateRE(regexp, TRUE, status) == FALSE) {
844 regexp->fMatcher->region(regionStart, regionLimit, *status);
848 //------------------------------------------------------------------------------
850 // uregex_setRegionAndStart
852 //------------------------------------------------------------------------------
// Sets the region and a start index in one call, using the four-argument
// RegexMatcher::region() overload. Input text must have been set.
853 U_CAPI void U_EXPORT2
854 uregex_setRegionAndStart(URegularExpression *regexp2,
858 UErrorCode *status) {
859 RegularExpression *regexp = (RegularExpression*)regexp2;
860 if (validateRE(regexp, TRUE, status) == FALSE) {
863 regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status);
866 //------------------------------------------------------------------------------
868 // uregex_regionStart
870 //------------------------------------------------------------------------------
// Delegates to uregex_regionStart64() and narrows the result to int32_t.
871 U_CAPI int32_t U_EXPORT2
872 uregex_regionStart(const URegularExpression *regexp2,
873 UErrorCode *status) {
874 return (int32_t)uregex_regionStart64(regexp2, status);
// Returns the matcher's region start. Input text must have been set.
// NOTE(review): by analogy with start64()/end64() used in uregex_group(), a
// 64-bit region accessor may exist on RegexMatcher; confirm that regionStart()
// does not truncate for this int64_t-returning function.
877 U_CAPI int64_t U_EXPORT2
878 uregex_regionStart64(const URegularExpression *regexp2,
879 UErrorCode *status) {
880 RegularExpression *regexp = (RegularExpression*)regexp2;
881 if (validateRE(regexp, TRUE, status) == FALSE) {
884 return regexp->fMatcher->regionStart();
888 //------------------------------------------------------------------------------
892 //------------------------------------------------------------------------------
// Delegates to uregex_regionEnd64() and narrows the result to int32_t.
893 U_CAPI int32_t U_EXPORT2
894 uregex_regionEnd(const URegularExpression *regexp2,
895 UErrorCode *status) {
896 return (int32_t)uregex_regionEnd64(regexp2, status);
// Returns the matcher's region end. Input text must have been set.
// NOTE(review): by analogy with start64()/end64() used in uregex_group(), a
// 64-bit region accessor may exist on RegexMatcher; confirm that regionEnd()
// does not truncate for this int64_t-returning function.
899 U_CAPI int64_t U_EXPORT2
900 uregex_regionEnd64(const URegularExpression *regexp2,
901 UErrorCode *status) {
902 RegularExpression *regexp = (RegularExpression*)regexp2;
903 if (validateRE(regexp, TRUE, status) == FALSE) {
906 return regexp->fMatcher->regionEnd();
910 //------------------------------------------------------------------------------
912 // uregex_hasTransparentBounds
914 //------------------------------------------------------------------------------
// Returns RegexMatcher::hasTransparentBounds() for a validated handle; input
// text is not required.
915 U_CAPI UBool U_EXPORT2
916 uregex_hasTransparentBounds(const URegularExpression *regexp2,
917 UErrorCode *status) {
918 RegularExpression *regexp = (RegularExpression*)regexp2;
919 if (validateRE(regexp, FALSE, status) == FALSE) {
922 return regexp->fMatcher->hasTransparentBounds();
926 //------------------------------------------------------------------------------
928 // uregex_useTransparentBounds
930 //------------------------------------------------------------------------------
// Passes the flag b to RegexMatcher::useTransparentBounds() for a validated
// handle; input text is not required.
931 U_CAPI void U_EXPORT2
932 uregex_useTransparentBounds(URegularExpression *regexp2,
934 UErrorCode *status) {
935 RegularExpression *regexp = (RegularExpression*)regexp2;
936 if (validateRE(regexp, FALSE, status) == FALSE) {
939 regexp->fMatcher->useTransparentBounds(b);
943 //------------------------------------------------------------------------------
945 // uregex_hasAnchoringBounds
947 //------------------------------------------------------------------------------
// Returns RegexMatcher::hasAnchoringBounds() for a validated handle; input text
// is not required.
948 U_CAPI UBool U_EXPORT2
949 uregex_hasAnchoringBounds(const URegularExpression *regexp2,
950 UErrorCode *status) {
951 RegularExpression *regexp = (RegularExpression*)regexp2;
952 if (validateRE(regexp, FALSE, status) == FALSE) {
955 return regexp->fMatcher->hasAnchoringBounds();
959 //------------------------------------------------------------------------------
961 // uregex_useAnchoringBounds
963 //------------------------------------------------------------------------------
// Passes the flag b to RegexMatcher::useAnchoringBounds() for a validated
// handle; input text is not required.
964 U_CAPI void U_EXPORT2
965 uregex_useAnchoringBounds(URegularExpression *regexp2,
967 UErrorCode *status) {
968 RegularExpression *regexp = (RegularExpression*)regexp2;
969 if (validateRE(regexp, FALSE, status) == FALSE) {
972 regexp->fMatcher->useAnchoringBounds(b);
976 //------------------------------------------------------------------------------
980 //------------------------------------------------------------------------------
// Returns RegexMatcher::hitEnd() for a validated handle. Input text must have
// been set.
981 U_CAPI UBool U_EXPORT2
982 uregex_hitEnd(const URegularExpression *regexp2,
983 UErrorCode *status) {
984 RegularExpression *regexp = (RegularExpression*)regexp2;
985 if (validateRE(regexp, TRUE, status) == FALSE) {
988 return regexp->fMatcher->hitEnd();
992 //------------------------------------------------------------------------------
996 //------------------------------------------------------------------------------
// Returns RegexMatcher::requireEnd() for a validated handle. Input text must
// have been set.
997 U_CAPI UBool U_EXPORT2
998 uregex_requireEnd(const URegularExpression *regexp2,
999 UErrorCode *status) {
1000 RegularExpression *regexp = (RegularExpression*)regexp2;
1001 if (validateRE(regexp, TRUE, status) == FALSE) {
1004 return regexp->fMatcher->requireEnd();
1008 //------------------------------------------------------------------------------
1010 // uregex_setTimeLimit
1012 //------------------------------------------------------------------------------
// Forwards limit to RegexMatcher::setTimeLimit() when the handle validates.
// For an invalid handle the call is skipped; validateRE() has already set *status.
1013 U_CAPI void U_EXPORT2
1014 uregex_setTimeLimit(URegularExpression *regexp2,
1016 UErrorCode *status) {
1017 RegularExpression *regexp = (RegularExpression*)regexp2;
1018 if (validateRE(regexp, FALSE, status)) {
1019 regexp->fMatcher->setTimeLimit(limit, *status);
1025 //------------------------------------------------------------------------------
1027 // uregex_getTimeLimit
1029 //------------------------------------------------------------------------------
// Reads RegexMatcher::getTimeLimit() into retVal when the handle validates.
// (retVal's declaration, its initial value and the return statement are on lines
// not shown in this extract.)
1030 U_CAPI int32_t U_EXPORT2
1031 uregex_getTimeLimit(const URegularExpression *regexp2,
1032 UErrorCode *status) {
1034 RegularExpression *regexp = (RegularExpression*)regexp2;
1035 if (validateRE(regexp, FALSE, status)) {
1036 retVal = regexp->fMatcher->getTimeLimit();
1043 //------------------------------------------------------------------------------
1045 // uregex_setStackLimit
1047 //------------------------------------------------------------------------------
// Forwards limit to RegexMatcher::setStackLimit() when the handle validates.
// For an invalid handle the call is skipped; validateRE() has already set *status.
1048 U_CAPI void U_EXPORT2
1049 uregex_setStackLimit(URegularExpression *regexp2,
1051 UErrorCode *status) {
1052 RegularExpression *regexp = (RegularExpression*)regexp2;
1053 if (validateRE(regexp, FALSE, status)) {
1054 regexp->fMatcher->setStackLimit(limit, *status);
1060 //------------------------------------------------------------------------------
1062 // uregex_getStackLimit
1064 //------------------------------------------------------------------------------
// Reads RegexMatcher::getStackLimit() into retVal when the handle validates.
// (retVal's declaration, its initial value and the return statement are on lines
// not shown in this extract.)
1065 U_CAPI int32_t U_EXPORT2
1066 uregex_getStackLimit(const URegularExpression *regexp2,
1067 UErrorCode *status) {
1069 RegularExpression *regexp = (RegularExpression*)regexp2;
1070 if (validateRE(regexp, FALSE, status)) {
1071 retVal = regexp->fMatcher->getStackLimit();
1077 //------------------------------------------------------------------------------
1079 // uregex_setMatchCallback
1081 //------------------------------------------------------------------------------
// Installs a match callback and its context pointer on the matcher via
// RegexMatcher::setMatchCallback() when the handle validates.
1082 U_CAPI void U_EXPORT2
1083 uregex_setMatchCallback(URegularExpression *regexp2,
1084 URegexMatchCallback *callback,
1085 const void *context,
1086 UErrorCode *status) {
1087 RegularExpression *regexp = (RegularExpression*)regexp2;
1088 if (validateRE(regexp, FALSE, status)) {
1089 regexp->fMatcher->setMatchCallback(callback, context, *status);
1094 //------------------------------------------------------------------------------
1096 // uregex_getMatchCallback
1098 //------------------------------------------------------------------------------
// Retrieves the matcher's match callback and context into the caller's
// out-parameters via RegexMatcher::getMatchCallback().
// Both callback and context are dereferenced without a NULL check, so callers
// must pass valid pointers.
1099 U_CAPI void U_EXPORT2
1100 uregex_getMatchCallback(const URegularExpression *regexp2,
1101 URegexMatchCallback **callback,
1102 const void **context,
1103 UErrorCode *status) {
1104 RegularExpression *regexp = (RegularExpression*)regexp2;
1105 if (validateRE(regexp, FALSE, status)) {
1106 regexp->fMatcher->getMatchCallback(*callback, *context, *status);
1111 //------------------------------------------------------------------------------
1113 // uregex_setFindProgressCallback
1115 //------------------------------------------------------------------------------
// Installs a find-progress callback and its context pointer on the matcher via
// RegexMatcher::setFindProgressCallback() when the handle validates.
1116 U_CAPI void U_EXPORT2
1117 uregex_setFindProgressCallback(URegularExpression *regexp2,
1118 URegexFindProgressCallback *callback,
1119 const void *context,
1120 UErrorCode *status) {
1121 RegularExpression *regexp = (RegularExpression*)regexp2;
1122 if (validateRE(regexp, FALSE, status)) {
1123 regexp->fMatcher->setFindProgressCallback(callback, context, *status);
1128 //------------------------------------------------------------------------------
1130 // uregex_getFindProgressCallback
1132 //------------------------------------------------------------------------------
// Retrieves the matcher's find-progress callback and context into the caller's
// out-parameters via RegexMatcher::getFindProgressCallback().
// Both callback and context are dereferenced without a NULL check, so callers
// must pass valid pointers.
1133 U_CAPI void U_EXPORT2
1134 uregex_getFindProgressCallback(const URegularExpression *regexp2,
1135 URegexFindProgressCallback **callback,
1136 const void **context,
1137 UErrorCode *status) {
1138 RegularExpression *regexp = (RegularExpression*)regexp2;
1139 if (validateRE(regexp, FALSE, status)) {
1140 regexp->fMatcher->getFindProgressCallback(*callback, *context, *status);
1145 //------------------------------------------------------------------------------
1147 // uregex_replaceAll
1149 //------------------------------------------------------------------------------
// Replaces every match in the input with replacementText, writing into destBuf.
// Built from the public pieces: reset to index 0, then for each uregex_findNext()
// hit call uregex_appendReplacement(), and finish with uregex_appendTail().
// The lengths returned by the append calls are accumulated in len.
//   replacementText   - must not be NULL.
//   replacementLength - values below -1 are rejected.
//   destCapacity      - a NULL destBuf is only accepted with a capacity of 0 or less
//                       (a further argument check is on a line not shown here).
//   status            - in/out error code; input text must have been set.
1150 U_CAPI int32_t U_EXPORT2
1151 uregex_replaceAll(URegularExpression *regexp2,
1152 const UChar *replacementText,
1153 int32_t replacementLength,
1155 int32_t destCapacity,
1156 UErrorCode *status) {
1157 RegularExpression *regexp = (RegularExpression*)regexp2;
1158 if (validateRE(regexp, TRUE, status) == FALSE) {
1161 if (replacementText == NULL || replacementLength < -1 ||
1162 (destBuf == NULL && destCapacity > 0) ||
1164 *status = U_ILLEGAL_ARGUMENT_ERROR;
1170 uregex_reset(regexp2, 0, status);
1172 // Note: Separate error code variables for findNext() and appendReplacement()
1173 // are used so that destination buffer overflow errors
1174 // in appendReplacement won't stop findNext() from working.
1175 // appendReplacement() and appendTail() special case incoming buffer
1176 // overflow errors, continuing to return the correct length.
1177 UErrorCode findStatus = *status;
1178 while (uregex_findNext(regexp2, &findStatus)) {
// destBuf and destCapacity are passed by address so each append can advance them.
1179 len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
1180 &destBuf, &destCapacity, status);
1182 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1184 if (U_FAILURE(findStatus)) {
1185 // If anything went wrong with the findNext(), make that error trump
1186 // whatever may have happened with the append() operations.
1187 // Errors in findNext() are not expected.
1188 *status = findStatus;
1195 //------------------------------------------------------------------------------
1197 // uregex_replaceAllUText
1199 //------------------------------------------------------------------------------
// UText form of replace-all: delegates to RegexMatcher::replaceAll() and stores
// its result back into dest.
//   replacementText - must not be NULL.
//   status          - in/out error code; input text must have been set.
1200 U_CAPI UText * U_EXPORT2
1201 uregex_replaceAllUText(URegularExpression *regexp2,
1202 UText *replacementText,
1204 UErrorCode *status) {
1205 RegularExpression *regexp = (RegularExpression*)regexp2;
1206 if (validateRE(regexp, TRUE, status) == FALSE) {
1209 if (replacementText == NULL) {
1210 *status = U_ILLEGAL_ARGUMENT_ERROR;
1214 dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
1219 //------------------------------------------------------------------------------
1221 // uregex_replaceFirst
1223 //------------------------------------------------------------------------------
1224 U_CAPI int32_t U_EXPORT2
1225 uregex_replaceFirst(URegularExpression *regexp2,
1226 const UChar *replacementText,
1227 int32_t replacementLength,
1229 int32_t destCapacity,
1230 UErrorCode *status) {
1231 RegularExpression *regexp = (RegularExpression*)regexp2;
1232 if (validateRE(regexp, TRUE, status) == FALSE) {
1235 if (replacementText == NULL || replacementLength < -1 ||
1236 (destBuf == NULL && destCapacity > 0) ||
1238 *status = U_ILLEGAL_ARGUMENT_ERROR;
1243 UBool findSucceeded;
1244 uregex_reset(regexp2, 0, status);
1245 findSucceeded = uregex_find(regexp2, 0, status);
1246 if (findSucceeded) {
1247 len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
1248 &destBuf, &destCapacity, status);
1250 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1256 //------------------------------------------------------------------------------
1258 // uregex_replaceFirstUText
1260 //------------------------------------------------------------------------------
1261 U_CAPI UText * U_EXPORT2
1262 uregex_replaceFirstUText(URegularExpression *regexp2,
1263 UText *replacementText,
1265 UErrorCode *status) {
1266 RegularExpression *regexp = (RegularExpression*)regexp2;
1267 if (validateRE(regexp, TRUE, status) == FALSE) {
1270 if (replacementText == NULL) {
1271 *status = U_ILLEGAL_ARGUMENT_ERROR;
1275 dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
1280 //------------------------------------------------------------------------------
1282 // uregex_appendReplacement
1284 //------------------------------------------------------------------------------
1288 // Dummy class, because these functions need to be friends of class RegexMatcher,
1289 // and stand-alone C functions don't work as friends
1293 inline static int32_t appendReplacement(RegularExpression *regexp,
1294 const UChar *replacementText,
1295 int32_t replacementLength,
1297 int32_t *destCapacity,
1298 UErrorCode *status);
1300 inline static int32_t appendTail(RegularExpression *regexp,
1302 int32_t *destCapacity,
1303 UErrorCode *status);
1305 inline static int32_t split(RegularExpression *regexp,
1307 int32_t destCapacity,
1308 int32_t *requiredCapacity,
1309 UChar *destFields[],
1310 int32_t destFieldsCapacity,
1311 UErrorCode *status);
1318 static const UChar BACKSLASH = 0x5c;
1319 static const UChar DOLLARSIGN = 0x24;
1320 static const UChar LEFTBRACKET = 0x7b;
1321 static const UChar RIGHTBRACKET = 0x7d;
1324 // Move a character to an output buffer, with bounds checking on the index.
1325 // Index advances even if capacity is exceeded, for preflight size computations.
1326 // This little sequence is used a LOT.
1328 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
1329 if (*idx < bufCapacity) {
1337 // appendReplacement, the actual implementation.
1339 int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
1340 const UChar *replacementText,
1341 int32_t replacementLength,
1343 int32_t *destCapacity,
1344 UErrorCode *status) {
1346 // If we come in with a buffer overflow error, don't suppress the operation.
1347 // A series of appendReplacements, appendTail need to correctly preflight
1348 // the buffer size when an overflow happens somewhere in the middle.
1349 UBool pendingBufferOverflow = FALSE;
1350 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1351 pendingBufferOverflow = TRUE;
1352 *status = U_ZERO_ERROR;
1356 // Validate all paramters
1358 if (validateRE(regexp, TRUE, status) == FALSE) {
1361 if (replacementText == NULL || replacementLength < -1 ||
1362 destCapacity == NULL || destBuf == NULL ||
1363 (*destBuf == NULL && *destCapacity > 0) ||
1364 *destCapacity < 0) {
1365 *status = U_ILLEGAL_ARGUMENT_ERROR;
1369 RegexMatcher *m = regexp->fMatcher;
1370 if (m->fMatch == FALSE) {
1371 *status = U_REGEX_INVALID_STATE;
1375 UChar *dest = *destBuf;
1376 int32_t capacity = *destCapacity;
1377 int32_t destIdx = 0;
1380 // If it wasn't supplied by the caller, get the length of the replacement text.
1381 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
1382 // the fly and avoid this step.
1383 if (replacementLength == -1) {
1384 replacementLength = u_strlen(replacementText);
1387 // Copy input string from the end of previous match to start of current match
1388 if (regexp->fText != NULL) {
1390 int32_t lastMatchEnd;
1391 if (UTEXT_USES_U16(m->fInputText)) {
1392 lastMatchEnd = (int32_t)m->fLastMatchEnd;
1393 matchStart = (int32_t)m->fMatchStart;
1395 // !!!: Would like a better way to do this!
1396 UErrorCode tempStatus = U_ZERO_ERROR;
1397 lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus);
1398 tempStatus = U_ZERO_ERROR;
1399 matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus);
1401 for (i=lastMatchEnd; i<matchStart; i++) {
1402 appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
1405 UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
1406 destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
1407 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity),
1408 &possibleOverflowError);
1410 U_ASSERT(destIdx >= 0);
1412 // scan the replacement text, looking for substitutions ($n) and \escapes.
1413 int32_t replIdx = 0;
1414 while (replIdx < replacementLength && U_SUCCESS(*status)) {
1415 UChar c = replacementText[replIdx];
1417 if (c != DOLLARSIGN && c != BACKSLASH) {
1418 // Common case, no substitution, no escaping,
1419 // just copy the char to the dest buf.
1420 appendToBuf(c, &destIdx, dest, capacity);
1424 if (c == BACKSLASH) {
1425 // Backslash Escape. Copy the following char out without further checks.
1426 // Note: Surrogate pairs don't need any special handling
1427 // The second half wont be a '$' or a '\', and
1428 // will move to the dest normally on the next
1430 if (replIdx >= replacementLength) {
1433 c = replacementText[replIdx];
1435 if (c==0x55/*U*/ || c==0x75/*u*/) {
1436 // We have a \udddd or \Udddddddd escape sequence.
1437 UChar32 escapedChar =
1438 u_unescapeAt(uregex_ucstr_unescape_charAt,
1439 &replIdx, // Index is updated by unescapeAt
1440 replacementLength, // Length of replacement text
1441 (void *)replacementText);
1443 if (escapedChar != (UChar32)0xFFFFFFFF) {
1444 if (escapedChar <= 0xffff) {
1445 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
1447 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
1448 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
1452 // Note: if the \u escape was invalid, just fall through and
1453 // treat it as a plain \<anything> escape.
1456 // Plain backslash escape. Just put out the escaped character.
1457 appendToBuf(c, &destIdx, dest, capacity);
1463 // We've got a $. Pick up the following capture group name or number.
1464 // For numbers, consume only digits that produce a valid capture group for the pattern.
1466 int32_t groupNum = 0;
1467 U_ASSERT(c == DOLLARSIGN);
1469 U16_GET(replacementText, 0, replIdx, replacementLength, c32);
1470 if (u_isdigit(c32)) {
1471 int32_t numDigits = 0;
1472 int32_t numCaptureGroups = m->fPattern->fGroupMap->size();
1474 if (replIdx >= replacementLength) {
1477 U16_GET(replacementText, 0, replIdx, replacementLength, c32);
1478 if (u_isdigit(c32) == FALSE) {
1482 int32_t digitVal = u_charDigitValue(c32);
1483 if (groupNum * 10 + digitVal <= numCaptureGroups) {
1484 groupNum = groupNum * 10 + digitVal;
1485 U16_FWD_1(replacementText, replIdx, replacementLength);
1488 if (numDigits == 0) {
1489 *status = U_INDEX_OUTOFBOUNDS_ERROR;
1494 } else if (c32 == LEFTBRACKET) {
1495 // Scan for Named Capture Group, ${name}.
1496 UnicodeString groupName;
1497 U16_FWD_1(replacementText, replIdx, replacementLength);
1498 while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) {
1499 if (replIdx >= replacementLength) {
1500 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
1503 U16_NEXT(replacementText, replIdx, replacementLength, c32);
1504 if ((c32 >= 0x41 && c32 <= 0x5a) || // A..Z
1505 (c32 >= 0x61 && c32 <= 0x7a) || // a..z
1506 (c32 >= 0x31 && c32 <= 0x39)) { // 0..9
1507 groupName.append(c32);
1508 } else if (c32 == RIGHTBRACKET) {
1509 groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName);
1510 if (groupNum == 0) {
1511 // Name not defined by pattern.
1512 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
1515 // Character was something other than a name char or a closing '}'
1516 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
1520 // $ not followed by {name} or digits.
1521 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
1525 // Finally, append the capture group data to the destination.
1526 if (U_SUCCESS(*status)) {
1527 destIdx += uregex_group((URegularExpression*)regexp, groupNum,
1528 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
1529 if (*status == U_BUFFER_OVERFLOW_ERROR) {
1530 // Ignore buffer overflow when extracting the group. We need to
1531 // continue on to get full size of the untruncated result. We will
1532 // raise our own buffer overflow error at the end.
1533 *status = U_ZERO_ERROR;
1537 if (U_FAILURE(*status)) {
1538 // bad group number or name.
1544 // Nul Terminate the dest buffer if possible.
1545 // Set the appropriate buffer overflow or not terminated error, if needed.
1547 if (destIdx < capacity) {
1549 } else if (U_SUCCESS(*status)) {
1550 if (destIdx == *destCapacity) {
1551 *status = U_STRING_NOT_TERMINATED_WARNING;
1553 *status = U_BUFFER_OVERFLOW_ERROR;
1558 // Return an updated dest buffer and capacity to the caller.
1560 if (destIdx > 0 && *destCapacity > 0) {
1561 if (destIdx < capacity) {
1562 *destBuf += destIdx;
1563 *destCapacity -= destIdx;
1565 *destBuf += capacity;
1570 // If we came in with a buffer overflow, make sure we go out with one also.
1571 // (A zero length match right at the end of the previous match could
1572 // make this function succeed even though a previous call had overflowed the buf)
1573 if (pendingBufferOverflow && U_SUCCESS(*status)) {
1574 *status = U_BUFFER_OVERFLOW_ERROR;
1581 // appendReplacement the actual API function,
1583 U_CAPI int32_t U_EXPORT2
1584 uregex_appendReplacement(URegularExpression *regexp2,
1585 const UChar *replacementText,
1586 int32_t replacementLength,
1588 int32_t *destCapacity,
1589 UErrorCode *status) {
1591 RegularExpression *regexp = (RegularExpression*)regexp2;
1592 return RegexCImpl::appendReplacement(
1593 regexp, replacementText, replacementLength,destBuf, destCapacity, status);
1597 // uregex_appendReplacementUText...can just use the normal C++ method
1599 U_CAPI void U_EXPORT2
1600 uregex_appendReplacementUText(URegularExpression *regexp2,
1603 UErrorCode *status) {
1604 RegularExpression *regexp = (RegularExpression*)regexp2;
1605 regexp->fMatcher->appendReplacement(dest, replText, *status);
1609 //------------------------------------------------------------------------------
1611 // uregex_appendTail
1613 //------------------------------------------------------------------------------
1614 int32_t RegexCImpl::appendTail(RegularExpression *regexp,
1616 int32_t *destCapacity,
1620 // If we come in with a buffer overflow error, don't suppress the operation.
1621 // A series of appendReplacements, appendTail need to correctly preflight
1622 // the buffer size when an overflow happens somewhere in the middle.
1623 UBool pendingBufferOverflow = FALSE;
1624 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1625 pendingBufferOverflow = TRUE;
1626 *status = U_ZERO_ERROR;
1629 if (validateRE(regexp, TRUE, status) == FALSE) {
1633 if (destCapacity == NULL || destBuf == NULL ||
1634 (*destBuf == NULL && *destCapacity > 0) ||
1637 *status = U_ILLEGAL_ARGUMENT_ERROR;
1641 RegexMatcher *m = regexp->fMatcher;
1643 int32_t destIdx = 0;
1644 int32_t destCap = *destCapacity;
1645 UChar *dest = *destBuf;
1647 if (regexp->fText != NULL) {
1649 int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
1650 if (nativeIdx == -1) {
1652 } else if (UTEXT_USES_U16(m->fInputText)) {
1653 srcIdx = (int32_t)nativeIdx;
1655 UErrorCode status = U_ZERO_ERROR;
1656 srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
1660 U_ASSERT(destIdx >= 0);
1662 if (srcIdx == regexp->fTextLength) {
1665 UChar c = regexp->fText[srcIdx];
1666 if (c == 0 && regexp->fTextLength == -1) {
1667 regexp->fTextLength = srcIdx;
1671 if (destIdx < destCap) {
1674 // We've overflowed the dest buffer.
1675 // If the total input string length is known, we can
1676 // compute the total buffer size needed without scanning through the string.
1677 if (regexp->fTextLength > 0) {
1678 destIdx += (regexp->fTextLength - srcIdx);
1688 // The most recent call to find() succeeded.
1689 srcIdx = m->fMatchEnd;
1691 // The last call to find() on this matcher failed().
1692 // Look back to the end of the last find() that succeeded for src index.
1693 srcIdx = m->fLastMatchEnd;
1695 // There has been no successful match with this matcher.
1696 // We want to copy the whole string.
1701 destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
1705 // NUL terminate the output string, if possible, otherwise issue the
1706 // appropriate error or warning.
1708 if (destIdx < destCap) {
1710 } else if (destIdx == destCap) {
1711 *status = U_STRING_NOT_TERMINATED_WARNING;
1713 *status = U_BUFFER_OVERFLOW_ERROR;
1717 // Update the user's buffer ptr and capacity vars to reflect the
1720 if (destIdx < destCap) {
1721 *destBuf += destIdx;
1722 *destCapacity -= destIdx;
1723 } else if (*destBuf != NULL) {
1724 *destBuf += destCap;
1728 if (pendingBufferOverflow && U_SUCCESS(*status)) {
1729 *status = U_BUFFER_OVERFLOW_ERROR;
1737 // appendTail the actual API function
1739 U_CAPI int32_t U_EXPORT2
1740 uregex_appendTail(URegularExpression *regexp2,
1742 int32_t *destCapacity,
1743 UErrorCode *status) {
1744 RegularExpression *regexp = (RegularExpression*)regexp2;
1745 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
1750 // uregex_appendTailUText...can just use the normal C++ method
1752 U_CAPI UText * U_EXPORT2
1753 uregex_appendTailUText(URegularExpression *regexp2,
1755 UErrorCode *status) {
1756 RegularExpression *regexp = (RegularExpression*)regexp2;
1757 return regexp->fMatcher->appendTail(dest, *status);
1761 //------------------------------------------------------------------------------
1763 // copyString Internal utility to copy a string to an output buffer,
1764 // while managing buffer overflow and preflight size
1765 // computation. NUL termination is added to destination,
1766 // and the NUL is counted in the output size.
1768 //------------------------------------------------------------------------------
1770 static void copyString(UChar *destBuffer, // Destination buffer.
1771 int32_t destCapacity, // Total capacity of dest buffer
1772 int32_t *destIndex, // Index into dest buffer. Updated on return.
1773 // Update not clipped to destCapacity.
1774 const UChar *srcPtr, // Pointer to source string
1775 int32_t srcLen) // Source string len.
1778 int32_t di = *destIndex;
1781 for (si=0; si<srcLen; si++) {
1783 if (di < destCapacity) {
1791 if (di<destCapacity) {
1799 //------------------------------------------------------------------------------
1803 //------------------------------------------------------------------------------
1804 int32_t RegexCImpl::split(RegularExpression *regexp,
1806 int32_t destCapacity,
1807 int32_t *requiredCapacity,
1808 UChar *destFields[],
1809 int32_t destFieldsCapacity,
1810 UErrorCode *status) {
1812 // Reset for the input text
1814 regexp->fMatcher->reset();
1815 UText *inputText = regexp->fMatcher->fInputText;
1816 int64_t nextOutputStringStart = 0;
1817 int64_t inputLen = regexp->fMatcher->fInputLength;
1818 if (inputLen == 0) {
1823 // Loop through the input text, searching for the delimiter pattern
1825 int32_t i; // Index of the field being processed.
1826 int32_t destIdx = 0; // Next available position in destBuf;
1827 int32_t numCaptureGroups = regexp->fMatcher->groupCount();
1828 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted
1830 if (i>=destFieldsCapacity-1) {
1831 // There are one or zero output strings left.
1832 // Fill the last output string with whatever is left from the input, then exit the loop.
1833 // ( i will be == destFieldsCapacity if we filled the output array while processing
1834 // capture groups of the delimiter expression, in which case we will discard the
1835 // last capture group saved in favor of the unprocessed remainder of the
1837 if (inputLen > nextOutputStringStart) {
1838 if (i != destFieldsCapacity-1) {
1839 // No fields are left. Recycle the last one for holding the trailing part of
1840 // the input string.
1841 i = destFieldsCapacity-1;
1842 destIdx = (int32_t)(destFields[i] - destFields[0]);
1845 destFields[i] = &destBuf[destIdx];
1846 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1847 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1852 if (regexp->fMatcher->find()) {
1853 // We found another delimiter. Move everything from where we started looking
1854 // up until the start of the delimiter into the next output string.
1855 destFields[i] = &destBuf[destIdx];
1857 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
1858 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
1859 if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1860 tStatus = U_ZERO_ERROR;
1864 nextOutputStringStart = regexp->fMatcher->fMatchEnd;
1866 // If the delimiter pattern has capturing parentheses, the captured
1867 // text goes out into the next n destination strings.
1869 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1870 // If we've run out of output string slots, bail out.
1871 if (i==destFieldsCapacity-1) {
1876 // Set up to extract the capture group contents into the dest buffer.
1877 destFields[i] = &destBuf[destIdx];
1878 tStatus = U_ZERO_ERROR;
1879 int32_t t = uregex_group((URegularExpression*)regexp,
1882 REMAINING_CAPACITY(destIdx, destCapacity),
1884 destIdx += t + 1; // Record the space used in the output string buffer.
1885 // +1 for the NUL that terminates the string.
1886 if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1887 tStatus = U_ZERO_ERROR;
1893 if (nextOutputStringStart == inputLen) {
1894 // The delimiter was at the end of the string.
1895 // Output an empty string, and then we are done.
1896 if (destIdx < destCapacity) {
1897 destBuf[destIdx] = 0;
1899 if (i < destFieldsCapacity-1) {
1902 if (destIdx < destCapacity) {
1903 destFields[i] = destBuf + destIdx;
1912 // We ran off the end of the input while looking for the next delimiter.
1913 // All the remaining text goes into the current output string.
1914 destFields[i] = &destBuf[destIdx];
1915 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1916 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1921 // Zero out any unused portion of the destFields array
1923 for (j=i+1; j<destFieldsCapacity; j++) {
1924 destFields[j] = NULL;
1927 if (requiredCapacity != NULL) {
1928 *requiredCapacity = destIdx;
1930 if (destIdx > destCapacity) {
1931 *status = U_BUFFER_OVERFLOW_ERROR;
1937 // uregex_split The actual API function
1939 U_CAPI int32_t U_EXPORT2
1940 uregex_split(URegularExpression *regexp2,
1942 int32_t destCapacity,
1943 int32_t *requiredCapacity,
1944 UChar *destFields[],
1945 int32_t destFieldsCapacity,
1946 UErrorCode *status) {
1947 RegularExpression *regexp = (RegularExpression*)regexp2;
1948 if (validateRE(regexp, TRUE, status) == FALSE) {
1951 if ((destBuf == NULL && destCapacity > 0) ||
1953 destFields == NULL ||
1954 destFieldsCapacity < 1 ) {
1955 *status = U_ILLEGAL_ARGUMENT_ERROR;
1959 return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
1964 // uregex_splitUText...can just use the normal C++ method
1966 U_CAPI int32_t U_EXPORT2
1967 uregex_splitUText(URegularExpression *regexp2,
1968 UText *destFields[],
1969 int32_t destFieldsCapacity,
1970 UErrorCode *status) {
1971 RegularExpression *regexp = (RegularExpression*)regexp2;
1972 return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
1976 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS