src/v8/src/unicode.h

   1 // Copyright 2011 the V8 project authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #ifndef V8_UNICODE_H_
   6 #define V8_UNICODE_H_
   7
   8 #include <sys/types.h>
   9 #include "src/globals.h"
  10 /**
  11  * \file
 * Definitions and convenience functions for working with Unicode.
  13  */
  14
  15 namespace unibrow {
  16
  17 typedef unsigned int uchar;
  18 typedef unsigned char byte;
  19
  20 /**
  21  * The max length of the result of converting the case of a single
  22  * character.
  23  */
  24 const int kMaxMappingSize = 4;
  25
  26 template <class T, int size = 256>
  27 class Predicate {
  28  public:
  29   inline Predicate() { }
  30   inline bool get(uchar c);
  31  private:
  32   friend class Test;
  33   bool CalculateValue(uchar c);
  34   struct CacheEntry {
  35     inline CacheEntry() : code_point_(0), value_(0) { }
  36     inline CacheEntry(uchar code_point, bool value)
  37       : code_point_(code_point),
  38         value_(value) { }
  39     uchar code_point_ : 21;
  40     bool value_ : 1;
  41   };
  42   static const int kSize = size;
  43   static const int kMask = kSize - 1;
  44   CacheEntry entries_[kSize];
  45 };
  46
  47
  48 // A cache used in case conversion.  It caches the value for characters
  49 // that either have no mapping or map to a single character independent
  50 // of context.  Characters that map to more than one character or that
  51 // map differently depending on context are always looked up.
  52 template <class T, int size = 256>
  53 class Mapping {
  54  public:
  55   inline Mapping() { }
  56   inline int get(uchar c, uchar n, uchar* result);
  57  private:
  58   friend class Test;
  59   int CalculateValue(uchar c, uchar n, uchar* result);
  60   struct CacheEntry {
  61     inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
  62     inline CacheEntry(uchar code_point, signed offset)
  63       : code_point_(code_point),
  64         offset_(offset) { }
  65     uchar code_point_;
  66     signed offset_;
  67     static const int kNoChar = (1 << 21) - 1;
  68   };
  69   static const int kSize = size;
  70   static const int kMask = kSize - 1;
  71   CacheEntry entries_[kSize];
  72 };
  73
  74
  75 class UnicodeData {
  76  private:
  77   friend class Test;
  78   static int GetByteCount();
  79   static const uchar kMaxCodePoint;
  80 };
  81
  82
  83 class Utf16 {
  84  public:
  85   static inline bool IsSurrogatePair(int lead, int trail) {
  86     return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
  87   }
  88   static inline bool IsLeadSurrogate(int code) {
  89     if (code == kNoPreviousCharacter) return false;
  90     return (code & 0xfc00) == 0xd800;
  91   }
  92   static inline bool IsTrailSurrogate(int code) {
  93     if (code == kNoPreviousCharacter) return false;
  94     return (code & 0xfc00) == 0xdc00;
  95   }
  96
  97   static inline int CombineSurrogatePair(uchar lead, uchar trail) {
  98     return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
  99   }
 100   static const int kNoPreviousCharacter = -1;
 101   static const uchar kMaxNonSurrogateCharCode = 0xffff;
 102   // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
 103   // of UTF-8 data.  The special case where the unit is a surrogate
 104   // trail produces 1 byte net, because the encoding of the pair is
 105   // 4 bytes and the 3 bytes that were used to encode the lead surrogate
 106   // can be reclaimed.
 107   static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
 108   // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
 109   // The illegality stems from the surrogate not being part of a pair.
 110   static const int kUtf8BytesToCodeASurrogate = 3;
 111   static inline uint16_t LeadSurrogate(uint32_t char_code) {
 112     return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
 113   }
 114   static inline uint16_t TrailSurrogate(uint32_t char_code) {
 115     return 0xdc00 + (char_code & 0x3ff);
 116   }
 117 };
 118
 119
 120 class Utf8 {
 121  public:
 122   static inline uchar Length(uchar chr, int previous);
 123   static inline unsigned EncodeOneByte(char* out, uint8_t c);
 124   static inline unsigned Encode(char* out,
 125                                 uchar c,
 126                                 int previous,
 127                                 bool replace_invalid = false);
 128   static uchar CalculateValue(const byte* str,
 129                               unsigned length,
 130                               unsigned* cursor);
 131
 132   // The unicode replacement character, used to signal invalid unicode
 133   // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
 134   static const uchar kBadChar = 0xFFFD;
 135   static const unsigned kMaxEncodedSize   = 4;
 136   static const unsigned kMaxOneByteChar   = 0x7f;
 137   static const unsigned kMaxTwoByteChar   = 0x7ff;
 138   static const unsigned kMaxThreeByteChar = 0xffff;
 139   static const unsigned kMaxFourByteChar  = 0x1fffff;
 140
 141   // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
 142   // that match are coded as a 4 byte UTF-8 sequence.
 143   static const unsigned kBytesSavedByCombiningSurrogates = 2;
 144   static const unsigned kSizeOfUnmatchedSurrogate = 3;
 145   // The maximum size a single UTF-16 code unit may take up when encoded as
 146   // UTF-8.
 147   static const unsigned kMax16BitCodeUnitSize  = 3;
 148   static inline uchar ValueOf(const byte* str,
 149                               unsigned length,
 150                               unsigned* cursor);
 151 };
 152
 153 struct Uppercase {
 154   static bool Is(uchar c);
 155 };
 156 struct Lowercase {
 157   static bool Is(uchar c);
 158 };
 159 struct Letter {
 160   static bool Is(uchar c);
 161 };
 162 struct ID_Start {
 163   static bool Is(uchar c);
 164 };
 165 struct ID_Continue {
 166   static bool Is(uchar c);
 167 };
 168 struct WhiteSpace {
 169   static bool Is(uchar c);
 170 };
 171 struct LineTerminator {
 172   static bool Is(uchar c);
 173 };
 174 struct ToLowercase {
 175   static const int kMaxWidth = 3;
 176   static const bool kIsToLower = true;
 177   static int Convert(uchar c,
 178                      uchar n,
 179                      uchar* result,
 180                      bool* allow_caching_ptr);
 181 };
 182 struct ToUppercase {
 183   static const int kMaxWidth = 3;
 184   static const bool kIsToLower = false;
 185   static int Convert(uchar c,
 186                      uchar n,
 187                      uchar* result,
 188                      bool* allow_caching_ptr);
 189 };
 190 struct Ecma262Canonicalize {
 191   static const int kMaxWidth = 1;
 192   static int Convert(uchar c,
 193                      uchar n,
 194                      uchar* result,
 195                      bool* allow_caching_ptr);
 196 };
 197 struct Ecma262UnCanonicalize {
 198   static const int kMaxWidth = 4;
 199   static int Convert(uchar c,
 200                      uchar n,
 201                      uchar* result,
 202                      bool* allow_caching_ptr);
 203 };
 204 struct CanonicalizationRange {
 205   static const int kMaxWidth = 1;
 206   static int Convert(uchar c,
 207                      uchar n,
 208                      uchar* result,
 209                      bool* allow_caching_ptr);
 210 };
 211
 212 }  // namespace unibrow
 213
 214 #endif  // V8_UNICODE_H_