source/common/normlzr.cpp

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4  *************************************************************************
   5  * COPYRIGHT:
   6  * Copyright (c) 1996-2012, International Business Machines Corporation and
   7  * others. All Rights Reserved.
   8  *************************************************************************
   9  */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_NORMALIZATION
  14
  15 #include "unicode/uniset.h"
  16 #include "unicode/unistr.h"
  17 #include "unicode/chariter.h"
  18 #include "unicode/schriter.h"
  19 #include "unicode/uchriter.h"
  20 #include "unicode/normlzr.h"
  21 #include "unicode/utf16.h"
  22 #include "cmemory.h"
  23 #include "normalizer2impl.h"
  24 #include "uprops.h"  // for uniset_getUnicode32Instance()
  25
  26 U_NAMESPACE_BEGIN
  27
  28 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
  29
  30 //-------------------------------------------------------------------------
  31 // Constructors and other boilerplate
  32 //-------------------------------------------------------------------------
  33
  34 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
  35     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
  36     text(new StringCharacterIterator(str)),
  37     currentIndex(0), nextIndex(0),
  38     buffer(), bufferPos(0)
  39 {
  40     init();
  41 }
  42
  43 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
  44     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
  45     text(new UCharCharacterIterator(str, length)),
  46     currentIndex(0), nextIndex(0),
  47     buffer(), bufferPos(0)
  48 {
  49     init();
  50 }
  51
  52 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
  53     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
  54     text(iter.clone()),
  55     currentIndex(0), nextIndex(0),
  56     buffer(), bufferPos(0)
  57 {
  58     init();
  59 }
  60
  61 Normalizer::Normalizer(const Normalizer &copy) :
  62     UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
  63     text(copy.text->clone()),
  64     currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
  65     buffer(copy.buffer), bufferPos(copy.bufferPos)
  66 {
  67     init();
  68 }
  69
  70 void
  71 Normalizer::init() {
  72     UErrorCode errorCode=U_ZERO_ERROR;
  73     fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
  74     if(fOptions&UNORM_UNICODE_3_2) {
  75         delete fFilteredNorm2;
  76         fNorm2=fFilteredNorm2=
  77             new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
  78     }
  79     if(U_FAILURE(errorCode)) {
  80         errorCode=U_ZERO_ERROR;
  81         fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
  82     }
  83 }
  84
  85 Normalizer::~Normalizer()
  86 {
  87     delete fFilteredNorm2;
  88     delete text;
  89 }
  90
  91 Normalizer*
  92 Normalizer::clone() const
  93 {
  94     return new Normalizer(*this);
  95 }
  96
  97 /**
  98  * Generates a hash code for this iterator.
  99  */
 100 int32_t Normalizer::hashCode() const
 101 {
 102     return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
 103 }
 104
 105 UBool Normalizer::operator==(const Normalizer& that) const
 106 {
 107     return
 108         this==&that ||
 109         (fUMode==that.fUMode &&
 110         fOptions==that.fOptions &&
 111         *text==*that.text &&
 112         buffer==that.buffer &&
 113         bufferPos==that.bufferPos &&
 114         nextIndex==that.nextIndex);
 115 }
 116
 117 //-------------------------------------------------------------------------
 118 // Static utility methods
 119 //-------------------------------------------------------------------------
 120
 121 void U_EXPORT2
 122 Normalizer::normalize(const UnicodeString& source,
 123                       UNormalizationMode mode, int32_t options,
 124                       UnicodeString& result,
 125                       UErrorCode &status) {
 126     if(source.isBogus() || U_FAILURE(status)) {
 127         result.setToBogus();
 128         if(U_SUCCESS(status)) {
 129             status=U_ILLEGAL_ARGUMENT_ERROR;
 130         }
 131     } else {
 132         UnicodeString localDest;
 133         UnicodeString *dest;
 134
 135         if(&source!=&result) {
 136             dest=&result;
 137         } else {
 138             // the source and result strings are the same object, use a temporary one
 139             dest=&localDest;
 140         }
 141         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
 142         if(U_SUCCESS(status)) {
 143             if(options&UNORM_UNICODE_3_2) {
 144                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
 145                     normalize(source, *dest, status);
 146             } else {
 147                 n2->normalize(source, *dest, status);
 148             }
 149         }
 150         if(dest==&localDest && U_SUCCESS(status)) {
 151             result=*dest;
 152         }
 153     }
 154 }
 155
 156 void U_EXPORT2
 157 Normalizer::compose(const UnicodeString& source,
 158                     UBool compat, int32_t options,
 159                     UnicodeString& result,
 160                     UErrorCode &status) {
 161     normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
 162 }
 163
 164 void U_EXPORT2
 165 Normalizer::decompose(const UnicodeString& source,
 166                       UBool compat, int32_t options,
 167                       UnicodeString& result,
 168                       UErrorCode &status) {
 169     normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
 170 }
 171
 172 UNormalizationCheckResult
 173 Normalizer::quickCheck(const UnicodeString& source,
 174                        UNormalizationMode mode, int32_t options,
 175                        UErrorCode &status) {
 176     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
 177     if(U_SUCCESS(status)) {
 178         if(options&UNORM_UNICODE_3_2) {
 179             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
 180                 quickCheck(source, status);
 181         } else {
 182             return n2->quickCheck(source, status);
 183         }
 184     } else {
 185         return UNORM_MAYBE;
 186     }
 187 }
 188
 189 UBool
 190 Normalizer::isNormalized(const UnicodeString& source,
 191                          UNormalizationMode mode, int32_t options,
 192                          UErrorCode &status) {
 193     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
 194     if(U_SUCCESS(status)) {
 195         if(options&UNORM_UNICODE_3_2) {
 196             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
 197                 isNormalized(source, status);
 198         } else {
 199             return n2->isNormalized(source, status);
 200         }
 201     } else {
 202         return FALSE;
 203     }
 204 }
 205
 206 UnicodeString & U_EXPORT2
 207 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
 208                         UnicodeString &result,
 209                         UNormalizationMode mode, int32_t options,
 210                         UErrorCode &errorCode) {
 211     if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
 212         result.setToBogus();
 213         if(U_SUCCESS(errorCode)) {
 214             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 215         }
 216     } else {
 217         UnicodeString localDest;
 218         UnicodeString *dest;
 219
 220         if(&right!=&result) {
 221             dest=&result;
 222         } else {
 223             // the right and result strings are the same object, use a temporary one
 224             dest=&localDest;
 225         }
 226         *dest=left;
 227         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
 228         if(U_SUCCESS(errorCode)) {
 229             if(options&UNORM_UNICODE_3_2) {
 230                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
 231                     append(*dest, right, errorCode);
 232             } else {
 233                 n2->append(*dest, right, errorCode);
 234             }
 235         }
 236         if(dest==&localDest && U_SUCCESS(errorCode)) {
 237             result=*dest;
 238         }
 239     }
 240     return result;
 241 }
 242
 243 //-------------------------------------------------------------------------
 244 // Iteration API
 245 //-------------------------------------------------------------------------
 246
 247 /**
 248  * Return the current character in the normalized text.
 249  */
 250 UChar32 Normalizer::current() {
 251     if(bufferPos<buffer.length() || nextNormalize()) {
 252         return buffer.char32At(bufferPos);
 253     } else {
 254         return DONE;
 255     }
 256 }
 257
 258 /**
 259  * Return the next character in the normalized text and advance
 260  * the iteration position by one.  If the end
 261  * of the text has already been reached, {@link #DONE} is returned.
 262  */
 263 UChar32 Normalizer::next() {
 264     if(bufferPos<buffer.length() ||  nextNormalize()) {
 265         UChar32 c=buffer.char32At(bufferPos);
 266         bufferPos+=U16_LENGTH(c);
 267         return c;
 268     } else {
 269         return DONE;
 270     }
 271 }
 272
 273 /**
 274  * Return the previous character in the normalized text and decrement
 275  * the iteration position by one.  If the beginning
 276  * of the text has already been reached, {@link #DONE} is returned.
 277  */
 278 UChar32 Normalizer::previous() {
 279     if(bufferPos>0 || previousNormalize()) {
 280         UChar32 c=buffer.char32At(bufferPos-1);
 281         bufferPos-=U16_LENGTH(c);
 282         return c;
 283     } else {
 284         return DONE;
 285     }
 286 }
 287
 288 void Normalizer::reset() {
 289     currentIndex=nextIndex=text->setToStart();
 290     clearBuffer();
 291 }
 292
 293 void
 294 Normalizer::setIndexOnly(int32_t index) {
 295     text->setIndex(index);  // pins index
 296     currentIndex=nextIndex=text->getIndex();
 297     clearBuffer();
 298 }
 299
 300 /**
 301  * Return the first character in the normalized text.  This resets
 302  * the <tt>Normalizer's</tt> position to the beginning of the text.
 303  */
 304 UChar32 Normalizer::first() {
 305     reset();
 306     return next();
 307 }
 308
 309 /**
 310  * Return the last character in the normalized text.  This resets
 311  * the <tt>Normalizer's</tt> position to be just before the
 312  * the input text corresponding to that normalized character.
 313  */
 314 UChar32 Normalizer::last() {
 315     currentIndex=nextIndex=text->setToEnd();
 316     clearBuffer();
 317     return previous();
 318 }
 319
 320 /**
 321  * Retrieve the current iteration position in the input text that is
 322  * being normalized.  This method is useful in applications such as
 323  * searching, where you need to be able to determine the position in
 324  * the input text that corresponds to a given normalized output character.
 325  * <p>
 326  * <b>Note:</b> This method sets the position in the <em>input</em>, while
 327  * {@link #next} and {@link #previous} iterate through characters in the
 328  * <em>output</em>.  This means that there is not necessarily a one-to-one
 329  * correspondence between characters returned by <tt>next</tt> and
 330  * <tt>previous</tt> and the indices passed to and returned from
 331  * <tt>setIndex</tt> and {@link #getIndex}.
 332  *
 333  */
 334 int32_t Normalizer::getIndex() const {
 335     if(bufferPos<buffer.length()) {
 336         return currentIndex;
 337     } else {
 338         return nextIndex;
 339     }
 340 }
 341
 342 /**
 343  * Retrieve the index of the start of the input text.  This is the begin index
 344  * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
 345  * over which this <tt>Normalizer</tt> is iterating
 346  */
 347 int32_t Normalizer::startIndex() const {
 348     return text->startIndex();
 349 }
 350
 351 /**
 352  * Retrieve the index of the end of the input text.  This is the end index
 353  * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
 354  * over which this <tt>Normalizer</tt> is iterating
 355  */
 356 int32_t Normalizer::endIndex() const {
 357     return text->endIndex();
 358 }
 359
 360 //-------------------------------------------------------------------------
 361 // Property access methods
 362 //-------------------------------------------------------------------------
 363
 364 void
 365 Normalizer::setMode(UNormalizationMode newMode)
 366 {
 367     fUMode = newMode;
 368     init();
 369 }
 370
 371 UNormalizationMode
 372 Normalizer::getUMode() const
 373 {
 374     return fUMode;
 375 }
 376
 377 void
 378 Normalizer::setOption(int32_t option,
 379                       UBool value)
 380 {
 381     if (value) {
 382         fOptions |= option;
 383     } else {
 384         fOptions &= (~option);
 385     }
 386     init();
 387 }
 388
 389 UBool
 390 Normalizer::getOption(int32_t option) const
 391 {
 392     return (fOptions & option) != 0;
 393 }
 394
 395 /**
 396  * Set the input text over which this <tt>Normalizer</tt> will iterate.
 397  * The iteration position is set to the beginning of the input text.
 398  */
 399 void
 400 Normalizer::setText(const UnicodeString& newText,
 401                     UErrorCode &status)
 402 {
 403     if (U_FAILURE(status)) {
 404         return;
 405     }
 406     CharacterIterator *newIter = new StringCharacterIterator(newText);
 407     if (newIter == NULL) {
 408         status = U_MEMORY_ALLOCATION_ERROR;
 409         return;
 410     }
 411     delete text;
 412     text = newIter;
 413     reset();
 414 }
 415
 416 /**
 417  * Set the input text over which this <tt>Normalizer</tt> will iterate.
 418  * The iteration position is set to the beginning of the string.
 419  */
 420 void
 421 Normalizer::setText(const CharacterIterator& newText,
 422                     UErrorCode &status)
 423 {
 424     if (U_FAILURE(status)) {
 425         return;
 426     }
 427     CharacterIterator *newIter = newText.clone();
 428     if (newIter == NULL) {
 429         status = U_MEMORY_ALLOCATION_ERROR;
 430         return;
 431     }
 432     delete text;
 433     text = newIter;
 434     reset();
 435 }
 436
 437 void
 438 Normalizer::setText(const UChar* newText,
 439                     int32_t length,
 440                     UErrorCode &status)
 441 {
 442     if (U_FAILURE(status)) {
 443         return;
 444     }
 445     CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
 446     if (newIter == NULL) {
 447         status = U_MEMORY_ALLOCATION_ERROR;
 448         return;
 449     }
 450     delete text;
 451     text = newIter;
 452     reset();
 453 }
 454
 455 /**
 456  * Copies the text under iteration into the UnicodeString referred to by "result".
 457  * @param result Receives a copy of the text under iteration.
 458  */
 459 void
 460 Normalizer::getText(UnicodeString&  result)
 461 {
 462     text->getText(result);
 463 }
 464
 465 //-------------------------------------------------------------------------
 466 // Private utility methods
 467 //-------------------------------------------------------------------------
 468
 469 void Normalizer::clearBuffer() {
 470     buffer.remove();
 471     bufferPos=0;
 472 }
 473
 474 UBool
 475 Normalizer::nextNormalize() {
 476     clearBuffer();
 477     currentIndex=nextIndex;
 478     text->setIndex(nextIndex);
 479     if(!text->hasNext()) {
 480         return FALSE;
 481     }
 482     // Skip at least one character so we make progress.
 483     UnicodeString segment(text->next32PostInc());
 484     while(text->hasNext()) {
 485         UChar32 c;
 486         if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
 487             text->move32(-1, CharacterIterator::kCurrent);
 488             break;
 489         }
 490         segment.append(c);
 491     }
 492     nextIndex=text->getIndex();
 493     UErrorCode errorCode=U_ZERO_ERROR;
 494     fNorm2->normalize(segment, buffer, errorCode);
 495     return U_SUCCESS(errorCode) && !buffer.isEmpty();
 496 }
 497
 498 UBool
 499 Normalizer::previousNormalize() {
 500     clearBuffer();
 501     nextIndex=currentIndex;
 502     text->setIndex(currentIndex);
 503     if(!text->hasPrevious()) {
 504         return FALSE;
 505     }
 506     UnicodeString segment;
 507     while(text->hasPrevious()) {
 508         UChar32 c=text->previous32();
 509         segment.insert(0, c);
 510         if(fNorm2->hasBoundaryBefore(c)) {
 511             break;
 512         }
 513     }
 514     currentIndex=text->getIndex();
 515     UErrorCode errorCode=U_ZERO_ERROR;
 516     fNorm2->normalize(segment, buffer, errorCode);
 517     bufferPos=buffer.length();
 518     return U_SUCCESS(errorCode) && !buffer.isEmpty();
 519 }
 520
 521 U_NAMESPACE_END
 522
 523 #endif /* #if !UCONFIG_NO_NORMALIZATION */