src/v8/src/unicode-inl.h

   1 // Copyright 2007-2010 the V8 project authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #ifndef V8_UNICODE_INL_H_
   6 #define V8_UNICODE_INL_H_
   7
   8 #include "unicode.h"
   9 #include "checks.h"
  10 #include "platform.h"
  11
  12 namespace unibrow {
  13
  14 template <class T, int s> bool Predicate<T, s>::get(uchar code_point) {
  15   CacheEntry entry = entries_[code_point & kMask];
  16   if (entry.code_point_ == code_point) return entry.value_;
  17   return CalculateValue(code_point);
  18 }
  19
  20 template <class T, int s> bool Predicate<T, s>::CalculateValue(
  21     uchar code_point) {
  22   bool result = T::Is(code_point);
  23   entries_[code_point & kMask] = CacheEntry(code_point, result);
  24   return result;
  25 }
  26
  27 template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n,
  28     uchar* result) {
  29   CacheEntry entry = entries_[c & kMask];
  30   if (entry.code_point_ == c) {
  31     if (entry.offset_ == 0) {
  32       return 0;
  33     } else {
  34       result[0] = c + entry.offset_;
  35       return 1;
  36     }
  37   } else {
  38     return CalculateValue(c, n, result);
  39   }
  40 }
  41
  42 template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
  43     uchar* result) {
  44   bool allow_caching = true;
  45   int length = T::Convert(c, n, result, &allow_caching);
  46   if (allow_caching) {
  47     if (length == 1) {
  48       entries_[c & kMask] = CacheEntry(c, result[0] - c);
  49       return 1;
  50     } else {
  51       entries_[c & kMask] = CacheEntry(c, 0);
  52       return 0;
  53     }
  54   } else {
  55     return length;
  56   }
  57 }
  58
  59
  60 uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
  61   ASSERT(c > Latin1::kMaxChar);
  62   switch (c) {
  63     // This are equivalent characters in unicode.
  64     case 0x39c:
  65     case 0x3bc:
  66       return 0xb5;
  67     // This is an uppercase of a Latin-1 character
  68     // outside of Latin-1.
  69     case 0x178:
  70       return 0xff;
  71   }
  72   return 0;
  73 }
  74
  75
  76 unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
  77   static const int kMask = ~(1 << 6);
  78   if (c <= kMaxOneByteChar) {
  79     str[0] = c;
  80     return 1;
  81   }
  82   str[0] = 0xC0 | (c >> 6);
  83   str[1] = 0x80 | (c & kMask);
  84   return 2;
  85 }
  86
  87 // Encode encodes the UTF-16 code units c and previous into the given str
  88 // buffer, and combines surrogate code units into single code points. If
  89 // replace_invalid is set to true, orphan surrogate code units will be replaced
  90 // with kBadChar.
  91 unsigned Utf8::Encode(char* str,
  92                       uchar c,
  93                       int previous,
  94                       bool replace_invalid) {
  95   static const int kMask = ~(1 << 6);
  96   if (c <= kMaxOneByteChar) {
  97     str[0] = c;
  98     return 1;
  99   } else if (c <= kMaxTwoByteChar) {
 100     str[0] = 0xC0 | (c >> 6);
 101     str[1] = 0x80 | (c & kMask);
 102     return 2;
 103   } else if (c <= kMaxThreeByteChar) {
 104     if (Utf16::IsSurrogatePair(previous, c)) {
 105       const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
 106       return Encode(str - kUnmatchedSize,
 107                     Utf16::CombineSurrogatePair(previous, c),
 108                     Utf16::kNoPreviousCharacter,
 109                     replace_invalid) - kUnmatchedSize;
 110     } else if (replace_invalid &&
 111                (Utf16::IsLeadSurrogate(c) ||
 112                Utf16::IsTrailSurrogate(c))) {
 113       c = kBadChar;
 114     }
 115     str[0] = 0xE0 | (c >> 12);
 116     str[1] = 0x80 | ((c >> 6) & kMask);
 117     str[2] = 0x80 | (c & kMask);
 118     return 3;
 119   } else {
 120     str[0] = 0xF0 | (c >> 18);
 121     str[1] = 0x80 | ((c >> 12) & kMask);
 122     str[2] = 0x80 | ((c >> 6) & kMask);
 123     str[3] = 0x80 | (c & kMask);
 124     return 4;
 125   }
 126 }
 127
 128
 129 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {
 130   if (length <= 0) return kBadChar;
 131   byte first = bytes[0];
 132   // Characters between 0000 and 0007F are encoded as a single character
 133   if (first <= kMaxOneByteChar) {
 134     *cursor += 1;
 135     return first;
 136   }
 137   return CalculateValue(bytes, length, cursor);
 138 }
 139
 140 unsigned Utf8::Length(uchar c, int previous) {
 141   if (c <= kMaxOneByteChar) {
 142     return 1;
 143   } else if (c <= kMaxTwoByteChar) {
 144     return 2;
 145   } else if (c <= kMaxThreeByteChar) {
 146     if (Utf16::IsTrailSurrogate(c) &&
 147         Utf16::IsLeadSurrogate(previous)) {
 148       return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
 149     }
 150     return 3;
 151   } else {
 152     return 4;
 153   }
 154 }
 155
 156 Utf8DecoderBase::Utf8DecoderBase()
 157   : unbuffered_start_(NULL),
 158     utf16_length_(0),
 159     last_byte_of_buffer_unused_(false) {}
 160
 161 Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer,
 162                                  unsigned buffer_length,
 163                                  const uint8_t* stream,
 164                                  unsigned stream_length) {
 165   Reset(buffer, buffer_length, stream, stream_length);
 166 }
 167
 168 template<unsigned kBufferSize>
 169 Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
 170   : Utf8DecoderBase(buffer_,
 171                     kBufferSize,
 172                     reinterpret_cast<const uint8_t*>(stream),
 173                     length) {
 174 }
 175
 176 template<unsigned kBufferSize>
 177 void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
 178   Utf8DecoderBase::Reset(buffer_,
 179                          kBufferSize,
 180                          reinterpret_cast<const uint8_t*>(stream),
 181                          length);
 182 }
 183
 184 template <unsigned kBufferSize>
 185 unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
 186                                               unsigned length) const {
 187   ASSERT(length > 0);
 188   if (length > utf16_length_) length = utf16_length_;
 189   // memcpy everything in buffer.
 190   unsigned buffer_length =
 191       last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
 192   unsigned memcpy_length = length <= buffer_length  ? length : buffer_length;
 193   v8::internal::OS::MemCopy(data, buffer_, memcpy_length*sizeof(uint16_t));
 194   if (length <= buffer_length) return length;
 195   ASSERT(unbuffered_start_ != NULL);
 196   // Copy the rest the slow way.
 197   WriteUtf16Slow(unbuffered_start_,
 198                  data + buffer_length,
 199                  length - buffer_length);
 200   return length;
 201 }
 202
 203 }  // namespace unibrow
 204
 205 #endif  // V8_UNICODE_INL_H_