source/i18n/unesctrn.cpp

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4  **********************************************************************
   5  *   Copyright (c) 2001-2011, International Business Machines
   6  *   Corporation and others.  All Rights Reserved.
   7  **********************************************************************
   8  *   Date        Name        Description
   9  *   11/19/2001  aliu        Creation.
  10  **********************************************************************
  11  */
  12
  13 #include "unicode/utypes.h"
  14
  15 #if !UCONFIG_NO_TRANSLITERATION
  16
  17 #include "unicode/uchar.h"
  18 #include "unicode/utf16.h"
  19 #include "unesctrn.h"
  20 #include "util.h"
  21
  22 #include "cmemory.h"
  23
  24 U_NAMESPACE_BEGIN
  25
  26 /**
  27  * Special character marking the end of the spec[] array.
  28  */
  29 static const UChar END = 0xFFFF;
  30
  31 // Unicode: "U+10FFFF" hex, min=4, max=6
  32 static const UChar SPEC_Unicode[] = {
  33     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
  34     END
  35 };
  36
  37 // Java: "\\uFFFF" hex, min=4, max=4
  38 static const UChar SPEC_Java[] = {
  39     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
  40     END
  41 };
  42
  43 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
  44 static const UChar SPEC_C[] = {
  45     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
  46     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
  47     END
  48 };
  49
  50 // XML: "&#x10FFFF;" hex, min=1, max=6
  51 static const UChar SPEC_XML[] = {
  52     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
  53     END
  54 };
  55
  56 // XML10: "&#1114111;" dec, min=1, max=7 (not really "Hex-Any")
  57 static const UChar SPEC_XML10[] = {
  58     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
  59     END
  60 };
  61
  62 // Perl: "\\x{263A}" hex, min=1, max=6
  63 static const UChar SPEC_Perl[] = {
  64     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
  65     END
  66 };
  67
  68 // All: Java, C, Perl, XML, XML10, Unicode
  69 static const UChar SPEC_Any[] = {
  70     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
  71     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
  72     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
  73     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
  74     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
  75     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
  76     END
  77 };
  78
// NOTE(review): presumably expands to the RTTI (class ID) member definitions
// for UnescapeTransliterator -- confirm against the macro's definition, which
// is not visible in this file.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
  80
  81 static UChar* copySpec(const UChar* spec) {
  82     int32_t len = 0;
  83     while (spec[len] != END) {
  84         ++len;
  85     }
  86     ++len;
  87     UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
  88     // Check for memory allocation error.
  89     if (result != NULL) {
  90         uprv_memcpy(result, spec, (size_t)len*sizeof(result[0]));
  91     }
  92     return result;
  93 }
  94
  95 /**
  96  * Factory methods.  Ignore the context.
  97  */
  98 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
  99     return new UnescapeTransliterator(ID, SPEC_Unicode);
 100 }
 101 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
 102     return new UnescapeTransliterator(ID, SPEC_Java);
 103 }
 104 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
 105     return new UnescapeTransliterator(ID, SPEC_C);
 106 }
 107 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
 108     return new UnescapeTransliterator(ID, SPEC_XML);
 109 }
 110 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
 111     return new UnescapeTransliterator(ID, SPEC_XML10);
 112 }
 113 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
 114     return new UnescapeTransliterator(ID, SPEC_Perl);
 115 }
 116 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
 117     return new UnescapeTransliterator(ID, SPEC_Any);
 118 }
 119
 120 /**
 121  * Registers standard variants with the system.  Called by
 122  * Transliterator during initialization.
 123  */
 124 void UnescapeTransliterator::registerIDs() {
 125     Token t = integerToken(0);
 126
 127     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
 128
 129     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
 130
 131     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
 132
 133     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
 134
 135     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
 136
 137     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
 138
 139     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
 140 }
 141
 142 /**
 143  * Constructor.  Takes the encoded spec array.
 144  */
 145 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
 146                                                const UChar *newSpec) :
 147     Transliterator(newID, NULL)
 148 {
 149     this->spec = copySpec(newSpec);
 150 }
 151
 152 /**
 153  * Copy constructor.
 154  */
 155 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
 156     Transliterator(o) {
 157     this->spec = copySpec(o.spec);
 158 }
 159
 160 UnescapeTransliterator::~UnescapeTransliterator() {
 161     uprv_free(spec);
 162 }
 163
 164 /**
 165  * Transliterator API.
 166  */
 167 Transliterator* UnescapeTransliterator::clone() const {
 168     return new UnescapeTransliterator(*this);
 169 }
 170
 171 /**
 172  * Implements {@link Transliterator#handleTransliterate}.
 173  */
 174 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
 175                                                  UBool isIncremental) const {
 176     int32_t start = pos.start;
 177     int32_t limit = pos.limit;
 178     int32_t i, j, ipat;
 179
 180     while (start < limit) {
 181         // Loop over the forms in spec[].  Exit this loop when we
 182         // match one of the specs.  Exit the outer loop if a
 183         // partial match is detected and isIncremental is true.
 184         for (j=0, ipat=0; spec[ipat] != END; ++j) {
 185
 186             // Read the header
 187             int32_t prefixLen = spec[ipat++];
 188             int32_t suffixLen = spec[ipat++];
 189             int8_t  radix     = (int8_t) spec[ipat++];
 190             int32_t minDigits = spec[ipat++];
 191             int32_t maxDigits = spec[ipat++];
 192
 193             // s is a copy of start that is advanced over the
 194             // characters as we parse them.
 195             int32_t s = start;
 196             UBool match = TRUE;
 197
 198             for (i=0; i<prefixLen; ++i) {
 199                 if (s >= limit) {
 200                     if (i > 0) {
 201                         // We've already matched a character.  This is
 202                         // a partial match, so we return if in
 203                         // incremental mode.  In non-incremental mode,
 204                         // go to the next spec.
 205                         if (isIncremental) {
 206                             goto exit;
 207                         }
 208                         match = FALSE;
 209                         break;
 210                     }
 211                 }
 212                 UChar c = text.charAt(s++);
 213                 if (c != spec[ipat + i]) {
 214                     match = FALSE;
 215                     break;
 216                 }
 217             }
 218
 219             if (match) {
 220                 UChar32 u = 0;
 221                 int32_t digitCount = 0;
 222                 for (;;) {
 223                     if (s >= limit) {
 224                         // Check for partial match in incremental mode.
 225                         if (s > start && isIncremental) {
 226                             goto exit;
 227                         }
 228                         break;
 229                     }
 230                     UChar32 ch = text.char32At(s);
 231                     int32_t digit = u_digit(ch, radix);
 232                     if (digit < 0) {
 233                         break;
 234                     }
 235                     s += U16_LENGTH(ch);
 236                     u = (u * radix) + digit;
 237                     if (++digitCount == maxDigits) {
 238                         break;
 239                     }
 240                 }
 241
 242                 match = (digitCount >= minDigits);
 243
 244                 if (match) {
 245                     for (i=0; i<suffixLen; ++i) {
 246                         if (s >= limit) {
 247                             // Check for partial match in incremental mode.
 248                             if (s > start && isIncremental) {
 249                                 goto exit;
 250                             }
 251                             match = FALSE;
 252                             break;
 253                         }
 254                         UChar c = text.charAt(s++);
 255                         if (c != spec[ipat + prefixLen + i]) {
 256                             match = FALSE;
 257                             break;
 258                         }
 259                     }
 260
 261                     if (match) {
 262                         // At this point, we have a match
 263                         UnicodeString str(u);
 264                         text.handleReplaceBetween(start, s, str);
 265                         limit -= s - start - str.length();
 266                         // The following break statement leaves the
 267                         // loop that is traversing the forms in
 268                         // spec[].  We then parse the next input
 269                         // character.
 270                         break;
 271                     }
 272                 }
 273             }
 274
 275             ipat += prefixLen + suffixLen;
 276         }
 277
 278         if (start < limit) {
 279             start += U16_LENGTH(text.char32At(start));
 280         }
 281     }
 282
 283   exit:
 284     pos.contextLimit += limit - pos.limit;
 285     pos.limit = limit;
 286     pos.start = start;
 287 }
 288
 289 U_NAMESPACE_END
 290
 291 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
 292
 293 //eof