deps/v8/src/unicode.h

   1 // Copyright 2011 the V8 project authors. All rights reserved.
   2 // Redistribution and use in source and binary forms, with or without
   3 // modification, are permitted provided that the following conditions are
   4 // met:
   5 //
   6 //     * Redistributions of source code must retain the above copyright
   7 //       notice, this list of conditions and the following disclaimer.
   8 //     * Redistributions in binary form must reproduce the above
   9 //       copyright notice, this list of conditions and the following
  10 //       disclaimer in the documentation and/or other materials provided
  11 //       with the distribution.
  12 //     * Neither the name of Google Inc. nor the names of its
  13 //       contributors may be used to endorse or promote products derived
  14 //       from this software without specific prior written permission.
  15 //
  16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27
  28 #ifndef V8_UNICODE_H_
  29 #define V8_UNICODE_H_
  30
  31 #include <sys/types.h>
  32 #include "globals.h"
  33 /**
  34  * \file
  35  * Definitions and convenience functions for working with Unicode.
  36  */
  37
  38 namespace unibrow {
  39
  40 typedef unsigned int uchar;  // Character value type used by this header.
  41 typedef unsigned char byte;  // Raw byte, e.g. the input to Utf8::ValueOf.
  42
  43 /**
  44  * The maximum length of the result of converting the case of a single
  45  * character.
      * None of the kMaxWidth constants declared in this header exceeds it.
  46  */
  47 const int kMaxMappingSize = 4;
  48
  49 template <class T, int size = 256>
  50 class Predicate {
       // Exposes a boolean query on a character through get() and owns a table
       // of kSize CacheEntry records.  get() and CalculateValue() are only
       // declared here; their definitions live outside this header.
       // NOTE(review): presumably T supplies the uncached test and entries_
       // memoizes its answers -- confirm against the definitions.
  51  public:
  52   inline Predicate() { }
  53   inline bool get(uchar c);
  54  private:
  55   friend class Test;
  56   bool CalculateValue(uchar c);
       // One table slot: a character and the boolean stored for it, packed
       // into bit-fields (21 bits + 1 bit).
  57   struct CacheEntry {
         // A default-constructed entry holds code point 0 with value false.
  58     inline CacheEntry() : code_point_(0), value_(0) { }
  59     inline CacheEntry(uchar code_point, bool value)
  60       : code_point_(code_point),
  61         value_(value) { }
         // 21 bits can represent values up to 0x1fffff.
  62     uchar code_point_ : 21;
  63     bool value_ : 1;
  64   };
  65   static const int kSize = size;
       // NOTE(review): kSize - 1 only works as an index mask when size is a
       // power of two -- confirm how get() uses it.
  66   static const int kMask = kSize - 1;
  67   CacheEntry entries_[kSize];
  68 };
  69
  70 // A cache used in case conversion.  It caches the value for characters
  71 // that either have no mapping or map to a single character independent
  72 // of context.  Characters that map to more than one character or that
  73 // map differently depending on context are always looked up.
     // get() and CalculateValue() are only declared here; their definitions
     // live outside this header.
  74 template <class T, int size = 256>
  75 class Mapping {
  76  public:
  77   inline Mapping() { }
       // NOTE(review): presumably c is the character to convert, n the
       // character that follows it (for context-dependent mappings), result
       // receives the converted characters and the return value is their
       // count -- confirm against the definition.
  78   inline int get(uchar c, uchar n, uchar* result);
  79  private:
  80   friend class Test;
  81   int CalculateValue(uchar c, uchar n, uchar* result);
       // One table slot: a character and a signed offset stored for it.
  82   struct CacheEntry {
         // A default-constructed entry is tagged with kNoChar and offset 0.
  83     inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
  84     inline CacheEntry(uchar code_point, signed offset)
  85       : code_point_(code_point),
  86         offset_(offset) { }
  87     uchar code_point_;
  88     signed offset_;
         // 0x1fffff: the largest value that fits in 21 bits.
  89     static const int kNoChar = (1 << 21) - 1;
  90   };
  91   static const int kSize = size;
       // NOTE(review): kSize - 1 only works as an index mask when size is a
       // power of two -- confirm how get() uses it.
  92   static const int kMask = kSize - 1;
  93   CacheEntry entries_[kSize];
  94 };
  95
  96 class UnicodeData {
       // Everything here is private; only the friend class Test can reach it.
       // Both members are declared here and defined outside this header.
  97  private:
  98   friend class Test;
       // NOTE(review): presumably the size in bytes of the Unicode tables --
       // confirm against the definition.
  99   static int GetByteCount();
 100   static const uchar kMaxCodePoint;
 101 };
 102
 103 class Utf16 {
 104  public:
 105   static inline bool IsLeadSurrogate(int code) {
 106     if (code == kNoPreviousCharacter) return false;
 107     return (code & 0xfc00) == 0xd800;
 108   }
 109   static inline bool IsTrailSurrogate(int code) {
 110     if (code == kNoPreviousCharacter) return false;
 111     return (code & 0xfc00) == 0xdc00;
 112   }
 113
 114   static inline int CombineSurrogatePair(uchar lead, uchar trail) {
 115     return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
 116   }
 117   static const int kNoPreviousCharacter = -1;
 118   static const uchar kMaxNonSurrogateCharCode = 0xffff;
 119   // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
 120   // of UTF-8 data.  The special case where the unit is a surrogate
 121   // trail produces 1 byte net, because the encoding of the pair is
 122   // 4 bytes and the 3 bytes that were used to encode the lead surrogate
 123   // can be reclaimed.
 124   static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
 125   // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
 126   // The illegality stems from the surrogate not being part of a pair.
 127   static const int kUtf8BytesToCodeASurrogate = 3;
 128   static inline uint16_t LeadSurrogate(uint32_t char_code) {
 129     return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
 130   }
 131   static inline uint16_t TrailSurrogate(uint32_t char_code) {
 132     return 0xdc00 + (char_code & 0x3ff);
 133   }
 134 };
 135
 136 class Latin1 {
 137  public:
       // Largest character value in the Latin-1 range.
 138   static const unsigned kMaxChar = 0xff;
 139   // Returns 0 if the character does not convert to a single Latin-1
 140   // character or if the character does not convert back to Latin-1 via
 141   // the inverse operation (upper to lower, etc).
 142   static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
 143 };
 144
 145 class Utf8 {
       // Static UTF-8 encoding/decoding entry points and size constants.
       // All functions are only declared here; their definitions live
       // outside this header.
 146  public:
       // NOTE(review): in Length() and Encode(), previous is presumably the
       // preceding UTF-16 code unit or Utf16::kNoPreviousCharacter, used to
       // detect surrogate pairs -- confirm against the definitions.
 147   static inline uchar Length(uchar chr, int previous);
 148   static inline unsigned EncodeOneByte(char* out, uint8_t c);
 149   static inline unsigned Encode(
 150       char* out, uchar c, int previous);
       // NOTE(review): presumably decodes one character from str (at most
       // length bytes) and advances *cursor -- confirm against the definition.
 151   static uchar CalculateValue(const byte* str,
 152                               unsigned length,
 153                               unsigned* cursor);
       // 0xFFFD is U+FFFD REPLACEMENT CHARACTER.
 154   static const uchar kBadChar = 0xFFFD;
 155   static const unsigned kMaxEncodedSize   = 4;
       // Largest character values for UTF-8 sequences of 1, 2, 3 and 4 bytes.
 156   static const unsigned kMaxOneByteChar   = 0x7f;
 157   static const unsigned kMaxTwoByteChar   = 0x7ff;
 158   static const unsigned kMaxThreeByteChar = 0xffff;
 159   static const unsigned kMaxFourByteChar  = 0x1fffff;
 160
 161   // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
 162   // that match are coded as a 4 byte UTF-8 sequence.
       // Hence the saving: 2 * 3 - 4 = 2 bytes.
 163   static const unsigned kBytesSavedByCombiningSurrogates = 2;
 164   static const unsigned kSizeOfUnmatchedSurrogate = 3;
       // Inline counterpart of CalculateValue() with the same parameter list.
 165   static inline uchar ValueOf(const byte* str,
 166                               unsigned length,
 167                               unsigned* cursor);
 168 };
 169
 170
 171 class Utf8DecoderBase {
       // Non-template base of Utf8Decoder.  It works on a caller-supplied
       // uint16_t buffer and a UTF-8 byte stream, and records the stream's
       // length in UTF-16 code units.  Copying is disabled.
 172  public:
 173   // Initialization done in subclass.
 174   inline Utf8DecoderBase();
 175   inline Utf8DecoderBase(uint16_t* buffer,
 176                          unsigned buffer_length,
 177                          const uint8_t* stream,
 178                          unsigned stream_length);
       // Number of UTF-16 code units in the stream, as computed by Reset().
 179   inline unsigned Utf16Length() const { return utf16_length_; }
 180  protected:
 181   // This reads all characters and sets the utf16_length_.
 182   // The first buffer_length utf16 chars are cached in the buffer.
 183   void Reset(uint16_t* buffer,
 184              unsigned buffer_length,
 185              const uint8_t* stream,
 186              unsigned stream_length);
       // NOTE(review): presumably decodes stream into data for the part that
       // did not fit in the buffer -- confirm against the definition.
 187   static void WriteUtf16Slow(const uint8_t* stream,
 188                              uint16_t* data,
 189                              unsigned length);
       // NOTE(review): presumably points at the first stream byte whose
       // decoded form was not cached in the buffer -- confirm.
 190   const uint8_t* unbuffered_start_;
 191   unsigned utf16_length_;
 192   bool last_byte_of_buffer_unused_;
 193  private:
 194   DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
 195 };
 196
 197 template <unsigned kBufferSize>
 198 class Utf8Decoder : public Utf8DecoderBase {
       // Utf8DecoderBase plus an embedded buffer of kBufferSize UTF-16 code
       // units.  The members taking arguments are only declared here; their
       // definitions live outside this header.
 199  public:
 200   inline Utf8Decoder() {}
 201   inline Utf8Decoder(const char* stream, unsigned length);
 202   inline void Reset(const char* stream, unsigned length);
       // NOTE(review): presumably writes up to length UTF-16 code units to
       // data and returns the number written -- confirm against the definition.
 203   inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
 204  private:
 205   uint16_t buffer_[kBufferSize];
 206 };
 207
 208
 209 struct Uppercase {
       // Character-class test, defined outside this header.  By its name it
       // reports whether c is an uppercase character.
 210   static bool Is(uchar c);
 211 };
 212 struct Lowercase {
       // Character-class test, defined outside this header.  By its name it
       // reports whether c is a lowercase character.
 213   static bool Is(uchar c);
 214 };
 215 struct Letter {
       // Character-class test, defined outside this header.  By its name it
       // reports whether c is a letter.
 216   static bool Is(uchar c);
 217 };
 218 struct Space {
       // Character-class test, defined outside this header.
       // NOTE(review): how this set differs from WhiteSpace is not visible
       // here -- check the definitions.
 219   static bool Is(uchar c);
 220 };
 221 struct Number {
       // Character-class test, defined outside this header.  By its name it
       // reports whether c is a numeric character.
 222   static bool Is(uchar c);
 223 };
 224 struct WhiteSpace {
       // Character-class test, defined outside this header.
       // NOTE(review): how this set differs from Space is not visible here --
       // check the definitions.
 225   static bool Is(uchar c);
 226 };
 227 struct LineTerminator {
       // Character-class test, defined outside this header.  By its name it
       // reports whether c is a line terminator.
 228   static bool Is(uchar c);
 229 };
 230 struct CombiningMark {
       // Character-class test, defined outside this header.  By its name it
       // reports whether c is a combining mark.
 231   static bool Is(uchar c);
 232 };
 233 struct ConnectorPunctuation {
       // Character-class test, defined outside this header.  By its name it
       // reports whether c is a connector punctuation character.
 234   static bool Is(uchar c);
 235 };
 236 struct ToLowercase {
       // Lowercase conversion, flagged by kIsToLower == true.
       // NOTE(review): kMaxWidth is presumably the most characters Convert()
       // writes to result -- confirm against the definition.
 237   static const int kMaxWidth = 3;
 238   static const bool kIsToLower = true;
       // Defined outside this header.
       // NOTE(review): presumably converts c (with n as the following
       // character) into result, returns the number of characters written,
       // and reports through *allow_caching_ptr whether the answer may be
       // cached -- confirm.
 239   static int Convert(uchar c,
 240                      uchar n,
 241                      uchar* result,
 242                      bool* allow_caching_ptr);
 243 };
 244 struct ToUppercase {
       // Uppercase conversion, flagged by kIsToLower == false.
       // NOTE(review): kMaxWidth is presumably the most characters Convert()
       // writes to result -- confirm against the definition.
 245   static const int kMaxWidth = 3;
 246   static const bool kIsToLower = false;
       // Defined outside this header.
       // NOTE(review): presumably converts c (with n as the following
       // character) into result, returns the number of characters written,
       // and reports through *allow_caching_ptr whether the answer may be
       // cached -- confirm.
 247   static int Convert(uchar c,
 248                      uchar n,
 249                      uchar* result,
 250                      bool* allow_caching_ptr);
 251 };
 252 struct Ecma262Canonicalize {
       // NOTE(review): by its name, the ECMA-262 Canonicalize mapping; with
       // kMaxWidth == 1 it presumably yields at most one character -- confirm
       // against the definition.
 253   static const int kMaxWidth = 1;
       // Defined outside this header; same parameter list as the other
       // Convert() functions declared in this file.
 254   static int Convert(uchar c,
 255                      uchar n,
 256                      uchar* result,
 257                      bool* allow_caching_ptr);
 258 };
 259 struct Ecma262UnCanonicalize {
       // NOTE(review): by its name, the inverse of Ecma262Canonicalize; with
       // kMaxWidth == 4 it presumably yields up to four characters -- confirm
       // against the definition.
 260   static const int kMaxWidth = 4;
       // Defined outside this header; same parameter list as the other
       // Convert() functions declared in this file.
 261   static int Convert(uchar c,
 262                      uchar n,
 263                      uchar* result,
 264                      bool* allow_caching_ptr);
 265 };
 266 struct CanonicalizationRange {
       // NOTE(review): what this mapping computes is not visible here; with
       // kMaxWidth == 1 it presumably yields at most one character -- confirm
       // against the definition.
 267   static const int kMaxWidth = 1;
       // Defined outside this header; same parameter list as the other
       // Convert() functions declared in this file.
 268   static int Convert(uchar c,
 269                      uchar n,
 270                      uchar* result,
 271                      bool* allow_caching_ptr);
 272 };
 273
 274 }  // namespace unibrow
 275
 276 #endif  // V8_UNICODE_H_