src/v8/src/unicode-inl.h

   1 // Copyright 2007-2010 the V8 project authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #ifndef V8_UNICODE_INL_H_
   6 #define V8_UNICODE_INL_H_
   7
   8 #include "src/unicode.h"
   9 #include "src/base/logging.h"
  10 #include "src/utils.h"
  11
  12 namespace unibrow {
  13
  14 template <class T, int s> bool Predicate<T, s>::get(uchar code_point) {
  15   CacheEntry entry = entries_[code_point & kMask];
  16   if (entry.code_point_ == code_point) return entry.value_;
  17   return CalculateValue(code_point);
  18 }
  19
  20 template <class T, int s> bool Predicate<T, s>::CalculateValue(
  21     uchar code_point) {
  22   bool result = T::Is(code_point);
  23   entries_[code_point & kMask] = CacheEntry(code_point, result);
  24   return result;
  25 }
  26
  27 template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n,
  28     uchar* result) {
  29   CacheEntry entry = entries_[c & kMask];
  30   if (entry.code_point_ == c) {
  31     if (entry.offset_ == 0) {
  32       return 0;
  33     } else {
  34       result[0] = c + entry.offset_;
  35       return 1;
  36     }
  37   } else {
  38     return CalculateValue(c, n, result);
  39   }
  40 }
  41
  42 template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
  43     uchar* result) {
  44   bool allow_caching = true;
  45   int length = T::Convert(c, n, result, &allow_caching);
  46   if (allow_caching) {
  47     if (length == 1) {
  48       entries_[c & kMask] = CacheEntry(c, result[0] - c);
  49       return 1;
  50     } else {
  51       entries_[c & kMask] = CacheEntry(c, 0);
  52       return 0;
  53     }
  54   } else {
  55     return length;
  56   }
  57 }
  58
  59
  60 unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
  61   static const int kMask = ~(1 << 6);
  62   if (c <= kMaxOneByteChar) {
  63     str[0] = c;
  64     return 1;
  65   }
  66   str[0] = 0xC0 | (c >> 6);
  67   str[1] = 0x80 | (c & kMask);
  68   return 2;
  69 }
  70
  71 // Encode encodes the UTF-16 code units c and previous into the given str
  72 // buffer, and combines surrogate code units into single code points. If
  73 // replace_invalid is set to true, orphan surrogate code units will be replaced
  74 // with kBadChar.
  75 unsigned Utf8::Encode(char* str,
  76                       uchar c,
  77                       int previous,
  78                       bool replace_invalid) {
  79   static const int kMask = ~(1 << 6);
  80   if (c <= kMaxOneByteChar) {
  81     str[0] = c;
  82     return 1;
  83   } else if (c <= kMaxTwoByteChar) {
  84     str[0] = 0xC0 | (c >> 6);
  85     str[1] = 0x80 | (c & kMask);
  86     return 2;
  87   } else if (c <= kMaxThreeByteChar) {
  88     if (Utf16::IsSurrogatePair(previous, c)) {
  89       const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
  90       return Encode(str - kUnmatchedSize,
  91                     Utf16::CombineSurrogatePair(previous, c),
  92                     Utf16::kNoPreviousCharacter,
  93                     replace_invalid) - kUnmatchedSize;
  94     } else if (replace_invalid &&
  95                (Utf16::IsLeadSurrogate(c) ||
  96                Utf16::IsTrailSurrogate(c))) {
  97       c = kBadChar;
  98     }
  99     str[0] = 0xE0 | (c >> 12);
 100     str[1] = 0x80 | ((c >> 6) & kMask);
 101     str[2] = 0x80 | (c & kMask);
 102     return 3;
 103   } else {
 104     str[0] = 0xF0 | (c >> 18);
 105     str[1] = 0x80 | ((c >> 12) & kMask);
 106     str[2] = 0x80 | ((c >> 6) & kMask);
 107     str[3] = 0x80 | (c & kMask);
 108     return 4;
 109   }
 110 }
 111
 112
 113 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {
 114   if (length <= 0) return kBadChar;
 115   byte first = bytes[0];
 116   // Characters between 0000 and 0007F are encoded as a single character
 117   if (first <= kMaxOneByteChar) {
 118     *cursor += 1;
 119     return first;
 120   }
 121   return CalculateValue(bytes, length, cursor);
 122 }
 123
 124 unsigned Utf8::Length(uchar c, int previous) {
 125   if (c <= kMaxOneByteChar) {
 126     return 1;
 127   } else if (c <= kMaxTwoByteChar) {
 128     return 2;
 129   } else if (c <= kMaxThreeByteChar) {
 130     if (Utf16::IsTrailSurrogate(c) &&
 131         Utf16::IsLeadSurrogate(previous)) {
 132       return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
 133     }
 134     return 3;
 135   } else {
 136     return 4;
 137   }
 138 }
 139
 140 }  // namespace unibrow
 141
 142 #endif  // V8_UNICODE_INL_H_