source/tools/genrb/prscmnts.cpp

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4  *******************************************************************************
   5  *   Copyright (C) 2003-2014, International Business Machines
   6  *   Corporation and others.  All Rights Reserved.
   7  *******************************************************************************
   8  *
   9  * File prscmnts.cpp
  10  *
  11  * Modification History:
  12  *
  13  *   Date          Name        Description
  14  *   08/22/2003    ram         Creation.
  15  *******************************************************************************
  16  */
  17
  18 // Safer use of UnicodeString.
  19 #ifndef UNISTR_FROM_CHAR_EXPLICIT
  20 #   define UNISTR_FROM_CHAR_EXPLICIT explicit
  21 #endif
  22
  23 // Less important, but still a good idea.
  24 #ifndef UNISTR_FROM_STRING_EXPLICIT
  25 #   define UNISTR_FROM_STRING_EXPLICIT explicit
  26 #endif
  27
  28 #include "unicode/regex.h"
  29 #include "unicode/unistr.h"
  30 #include "unicode/parseerr.h"
  31 #include "prscmnts.h"
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34
  35 U_NAMESPACE_USE
  36
  37 #if UCONFIG_NO_REGULAR_EXPRESSIONS==0 /* donot compile when RegularExpressions not available */
  38
  39 #define MAX_SPLIT_STRINGS 20
  40
  41 const char *patternStrings[UPC_LIMIT]={
  42     "^translate\\s*(.*)",
  43     "^note\\s*(.*)"
  44 };
  45
  46 U_CFUNC int32_t
  47 removeText(UChar *source, int32_t srcLen,
  48            UnicodeString patString,uint32_t options,
  49            UnicodeString replaceText, UErrorCode *status){
  50
  51     if(status == NULL || U_FAILURE(*status)){
  52         return 0;
  53     }
  54
  55     UnicodeString src(source, srcLen);
  56
  57     RegexMatcher    myMatcher(patString, src, options, *status);
  58     if(U_FAILURE(*status)){
  59         return 0;
  60     }
  61     UnicodeString dest;
  62
  63
  64     dest = myMatcher.replaceAll(replaceText,*status);
  65
  66
  67     return dest.extract(source, srcLen, *status);
  68
  69 }
  70 U_CFUNC int32_t
  71 trim(UChar *src, int32_t srcLen, UErrorCode *status){
  72      srcLen = removeText(src, srcLen, UnicodeString("^[ \\r\\n]+ "), 0, UnicodeString(), status); // remove leading new lines
  73      srcLen = removeText(src, srcLen, UnicodeString("^\\s+"), 0, UnicodeString(), status); // remove leading spaces
  74      srcLen = removeText(src, srcLen, UnicodeString("\\s+$"), 0, UnicodeString(), status); // remvoe trailing spcaes
  75      return srcLen;
  76 }
  77
  78 U_CFUNC int32_t
  79 removeCmtText(UChar* source, int32_t srcLen, UErrorCode* status){
  80     srcLen = trim(source, srcLen, status);
  81     UnicodeString patString("^\\s*?\\*\\s*?");  // remove pattern like " * " at the begining of the line
  82     srcLen = removeText(source, srcLen, patString, UREGEX_MULTILINE, UnicodeString(), status);
  83     return removeText(source, srcLen, UnicodeString("[ \\r\\n]+"), 0, UnicodeString(" "), status);// remove new lines;
  84 }
  85
  86 U_CFUNC int32_t
  87 getText(const UChar* source, int32_t srcLen,
  88         UChar** dest, int32_t destCapacity,
  89         UnicodeString patternString,
  90         UErrorCode* status){
  91
  92     if(status == NULL || U_FAILURE(*status)){
  93         return 0;
  94     }
  95
  96     UnicodeString     stringArray[MAX_SPLIT_STRINGS];
  97     RegexPattern      *pattern = RegexPattern::compile(UnicodeString("@"), 0, *status);
  98     UnicodeString src (source,srcLen);
  99
 100     if (U_FAILURE(*status)) {
 101         return 0;
 102     }
 103     pattern->split(src, stringArray, MAX_SPLIT_STRINGS, *status);
 104
 105     RegexMatcher matcher(patternString, UREGEX_DOTALL, *status);
 106     if (U_FAILURE(*status)) {
 107         return 0;
 108     }
 109     for(int32_t i=0; i<MAX_SPLIT_STRINGS; i++){
 110         matcher.reset(stringArray[i]);
 111         if(matcher.lookingAt(*status)){
 112             UnicodeString out = matcher.group(1, *status);
 113
 114             return out.extract(*dest, destCapacity,*status);
 115         }
 116     }
 117     return 0;
 118 }
 119
 120
 121 #define AT_SIGN  0x0040
 122
 123 U_CFUNC int32_t
 124 getDescription( const UChar* source, int32_t srcLen,
 125                 UChar** dest, int32_t destCapacity,
 126                 UErrorCode* status){
 127     if(status == NULL || U_FAILURE(*status)){
 128         return 0;
 129     }
 130
 131     UnicodeString     stringArray[MAX_SPLIT_STRINGS];
 132     RegexPattern      *pattern = RegexPattern::compile(UnicodeString("@"), UREGEX_MULTILINE, *status);
 133     UnicodeString src(source, srcLen);
 134
 135     if (U_FAILURE(*status)) {
 136         return 0;
 137     }
 138     pattern->split(src, stringArray,MAX_SPLIT_STRINGS , *status);
 139
 140     if(stringArray[0].indexOf((UChar)AT_SIGN)==-1){
 141         int32_t destLen =  stringArray[0].extract(*dest, destCapacity, *status);
 142         return trim(*dest, destLen, status);
 143     }
 144     return 0;
 145 }
 146
 147 U_CFUNC int32_t
 148 getCount(const UChar* source, int32_t srcLen,
 149          UParseCommentsOption option, UErrorCode *status){
 150
 151     if(status == NULL || U_FAILURE(*status)){
 152         return 0;
 153     }
 154
 155     UnicodeString     stringArray[MAX_SPLIT_STRINGS];
 156     RegexPattern      *pattern = RegexPattern::compile(UnicodeString("@"), UREGEX_MULTILINE, *status);
 157     UnicodeString src (source, srcLen);
 158
 159
 160     if (U_FAILURE(*status)) {
 161         return 0;
 162     }
 163     int32_t retLen = pattern->split(src, stringArray, MAX_SPLIT_STRINGS, *status);
 164
 165     UnicodeString patternString(patternStrings[option]);
 166     RegexMatcher matcher(patternString, UREGEX_DOTALL, *status);
 167     if (U_FAILURE(*status)) {
 168         return 0;
 169     }
 170     int32_t count = 0;
 171     for(int32_t i=0; i<retLen; i++){
 172         matcher.reset(stringArray[i]);
 173         if(matcher.lookingAt(*status)){
 174             count++;
 175         }
 176     }
 177     if(option == UPC_TRANSLATE && count > 1){
 178         fprintf(stderr, "Multiple @translate tags cannot be supported.\n");
 179         exit(U_UNSUPPORTED_ERROR);
 180     }
 181     return count;
 182 }
 183
 184 U_CFUNC int32_t
 185 getAt(const UChar* source, int32_t srcLen,
 186         UChar** dest, int32_t destCapacity,
 187         int32_t index,
 188         UParseCommentsOption option,
 189         UErrorCode* status){
 190
 191     if(status == NULL || U_FAILURE(*status)){
 192         return 0;
 193     }
 194
 195     UnicodeString     stringArray[MAX_SPLIT_STRINGS];
 196     RegexPattern      *pattern = RegexPattern::compile(UnicodeString("@"), UREGEX_MULTILINE, *status);
 197     UnicodeString src (source, srcLen);
 198
 199
 200     if (U_FAILURE(*status)) {
 201         return 0;
 202     }
 203     int32_t retLen = pattern->split(src, stringArray, MAX_SPLIT_STRINGS, *status);
 204
 205     UnicodeString patternString(patternStrings[option]);
 206     RegexMatcher matcher(patternString, UREGEX_DOTALL, *status);
 207     if (U_FAILURE(*status)) {
 208         return 0;
 209     }
 210     int32_t count = 0;
 211     for(int32_t i=0; i<retLen; i++){
 212         matcher.reset(stringArray[i]);
 213         if(matcher.lookingAt(*status)){
 214             if(count == index){
 215                 UnicodeString out = matcher.group(1, *status);
 216                 return out.extract(*dest, destCapacity,*status);
 217             }
 218             count++;
 219
 220         }
 221     }
 222     return 0;
 223
 224 }
 225
 226 U_CFUNC int32_t
 227 getTranslate( const UChar* source, int32_t srcLen,
 228               UChar** dest, int32_t destCapacity,
 229               UErrorCode* status){
 230     UnicodeString     notePatternString("^translate\\s*?(.*)");
 231
 232     int32_t destLen = getText(source, srcLen, dest, destCapacity, notePatternString, status);
 233     return trim(*dest, destLen, status);
 234 }
 235
 236 U_CFUNC int32_t
 237 getNote(const UChar* source, int32_t srcLen,
 238         UChar** dest, int32_t destCapacity,
 239         UErrorCode* status){
 240
 241     UnicodeString     notePatternString("^note\\s*?(.*)");
 242     int32_t destLen =  getText(source, srcLen, dest, destCapacity, notePatternString, status);
 243     return trim(*dest, destLen, status);
 244
 245 }
 246
 247 #endif /* UCONFIG_NO_REGULAR_EXPRESSIONS */
 248