1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 #include <sys/types.h>
35 * Definitions and convenience functions for working with unicode.
// Unsigned integer type used throughout this header for character values
// (code points and UTF-16 code units).
40 typedef unsigned int uchar;
// Unsigned char alias; used below for pointers to raw encoded input
// (e.g. `const byte* str`).
41 typedef unsigned char byte;
44 * The max length of the result of converting the case of a single
45 * character.
47 const int kMaxMappingSize = 4;
// Fragment of the Predicate class template: a fixed-size table of
// (code point, bool) cache entries.
// NOTE(review): the class header, access specifiers and several short lines
// (including part of the second CacheEntry constructor) are missing from this
// view, so the comments below cover only what is visible.
49 template <class T, int size = 256>
52 inline Predicate() { }
// Returns the boolean value for |c|; defined out of line. Presumably consults
// entries_ before falling back to CalculateValue() — confirm.
53 inline bool get(uchar c);
// Computes the value for |c|; presumably the uncached path — confirm against
// the definition.
56 bool CalculateValue(uchar c);
// A default-constructed entry records code point 0 with value 0 (false).
58 inline CacheEntry() : code_point_(0), value_(0) { }
59 inline CacheEntry(uchar code_point, bool value)
60 : code_point_(code_point),
// 21-bit bit-field: wide enough for any value up to 0x1fffff.
62 uchar code_point_ : 21;
// Number of cache slots (the |size| template argument).
65 static const int kSize = size;
// kSize - 1. Presumably used to pick a slot with a bitwise AND, which would
// require |size| to be a power of two — confirm.
66 static const int kMask = kSize - 1;
67 CacheEntry entries_[kSize];
70 // A cache used in case conversion. It caches the value for characters
71 // that either have no mapping or map to a single character independent
72 // of context. Characters that map to more than one character or that
73 // map differently depending on context are always looked up.
// Fragment of the case-conversion cache class template.
// NOTE(review): the class header, access specifiers and part of the second
// CacheEntry constructor are missing from this view.
74 template <class T, int size = 256>
// Converts |c|, writing into |result|. |n| is presumably the following
// character (for context-dependent mappings) and the return value presumably
// the number of uchars written — confirm against the definition.
78 inline int get(uchar c, uchar n, uchar* result);
// Same signature as get(); presumably the uncached path — confirm.
81 int CalculateValue(uchar c, uchar n, uchar* result);
// A default-constructed entry is marked with the kNoChar code point and a
// zero offset.
83 inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
84 inline CacheEntry(uchar code_point, signed offset)
85 : code_point_(code_point),
// (1 << 21) - 1 == 0x1fffff, the all-ones 21-bit value; used by the default
// CacheEntry constructor above as the "no character" marker.
89 static const int kNoChar = (1 << 21) - 1;
// Number of cache slots (the |size| template argument).
91 static const int kSize = size;
// kSize - 1. Presumably used to pick a slot with a bitwise AND, which would
// require |size| to be a power of two — confirm.
92 static const int kMask = kSize - 1;
93 CacheEntry entries_[kSize];
// NOTE(review): these two static members belong to a class whose header line
// is missing from this view; GetByteCount() is only declared here.
99 static int GetByteCount();
// Declared without an in-class initializer, so the value comes from an
// out-of-line definition.
100 static const uchar kMaxCodePoint;
// Returns true iff |code| is a UTF-16 lead (high) surrogate, i.e. lies in
// the range [0xD800, 0xDBFF].
//
// The check is a range comparison over the whole int rather than a mask of
// the low 16 bits, so values above 0xFFFF whose low bits merely look like a
// surrogate (e.g. 0x1D800) are rejected. kNoPreviousCharacter (-1) is outside
// the range and therefore yields false without a dedicated test.
static inline bool IsLeadSurrogate(int code) {
  return code >= 0xd800 && code <= 0xdbff;
}
// Returns true iff |code| is a UTF-16 trail (low) surrogate, i.e. lies in
// the range [0xDC00, 0xDFFF].
//
// The check is a range comparison over the whole int rather than a mask of
// the low 16 bits, so values above 0xFFFF whose low bits merely look like a
// surrogate (e.g. 0x1DC00) are rejected. kNoPreviousCharacter (-1) is outside
// the range and therefore yields false without a dedicated test.
static inline bool IsTrailSurrogate(int code) {
  return code >= 0xdc00 && code <= 0xdfff;
}
114 static inline int CombineSurrogatePair(uchar lead, uchar trail) {
115 return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
// Sentinel (-1) meaning "there is no previous character"; IsLeadSurrogate()
// and IsTrailSurrogate() return false for it.
117 static const int kNoPreviousCharacter = -1;
// 0xffff: the largest value that fits in a single 16-bit UTF-16 code unit.
118 static const uchar kMaxNonSurrogateCharCode = 0xffff;
119 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
120 // of UTF-8 data. The special case where the unit is a surrogate
121 // trail produces 1 byte net, because the encoding of the pair is
122 // 4 bytes and the 3 bytes that were used to encode the lead surrogate
123 // can be reclaimed.
124 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
125 // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
126 // The illegality stems from the surrogate not being part of a pair.
127 static const int kUtf8BytesToCodeASurrogate = 3;
// Computes the lead (high) surrogate of the UTF-16 pair for |char_code|.
// The offset of |char_code| above 0x10000 is taken, and its bits 10..19 are
// added to 0xd800. No range check is performed on |char_code|.
static inline uint16_t LeadSurrogate(uint32_t char_code) {
  const uint32_t offset = char_code - 0x10000;
  const uint32_t upper_ten_bits = (offset >> 10) & 0x3ff;
  return 0xd800 + upper_ten_bits;
}
// Computes the trail (low) surrogate of the UTF-16 pair for |char_code|:
// the low 10 bits of |char_code| added to 0xdc00. No range check is
// performed on |char_code|.
static inline uint16_t TrailSurrogate(uint32_t char_code) {
  const uint32_t lower_ten_bits = char_code & 0x3ff;
  return 0xdc00 + lower_ten_bits;
}
138 static const unsigned kMaxChar = 0xff;
139 // Returns 0 if character does not convert to single latin-1 character
140 // or if the character does not convert back to latin-1 via inverse
141 // operation (upper to lower, etc).
142 static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
// Static UTF-8 helpers and limits.
// NOTE(review): the enclosing class header and the continuation lines of
// CalculateValue() and ValueOf() are missing from this view.
//
// Presumably the number of UTF-8 bytes needed for |chr|, where |previous| is
// the preceding code unit (or a "no previous character" sentinel) so that a
// surrogate pair can be accounted for — confirm against the definition.
147 static inline uchar Length(uchar chr, int previous);
// Encodes the 8-bit value |c| into |out|; the unsigned result is presumably
// the number of bytes written — confirm.
148 static inline unsigned EncodeOneByte(char* out, uint8_t c);
// Encodes |c| into |out|, taking the preceding code unit |previous| into
// account; the unsigned result is presumably the number of bytes written —
// confirm.
149 static inline unsigned Encode(
150 char* out, uchar c, int previous);
// NOTE(review): the rest of this parameter list is not visible here.
151 static uchar CalculateValue(const byte* str,
// U+FFFD, the Unicode replacement character.
154 static const uchar kBadChar = 0xFFFD;
// Maximum number of bytes in one encoded character.
155 static const unsigned kMaxEncodedSize = 4;
// Largest values that fit in a 1-, 2-, 3- and 4-byte UTF-8 sequence
// respectively (7, 11, 16 and 21 payload bits).
156 static const unsigned kMaxOneByteChar = 0x7f;
157 static const unsigned kMaxTwoByteChar = 0x7ff;
158 static const unsigned kMaxThreeByteChar = 0xffff;
159 static const unsigned kMaxFourByteChar = 0x1fffff;
161 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
162 // that match are coded as a 4 byte UTF-8 sequence.
// Two unmatched surrogates take 2 * 3 = 6 bytes, a matched pair 4: 2 saved.
163 static const unsigned kBytesSavedByCombiningSurrogates = 2;
164 static const unsigned kSizeOfUnmatchedSurrogate = 3;
// NOTE(review): the rest of this parameter list is not visible here.
165 static inline uchar ValueOf(const byte* str,
// Base class of Utf8Decoder<kBufferSize>; holds the decoder state that does
// not depend on the buffer size.
// NOTE(review): access specifiers, the closing brace and the continuation
// lines of WriteUtf16Slow() are missing from this view.
171 class Utf8DecoderBase {
173 // Initialization done in subclass.
174 inline Utf8DecoderBase();
// Takes a caller-provided UTF-16 |buffer| of |buffer_length| elements and a
// UTF-8 |stream| of |stream_length| bytes — the same parameters as Reset().
// Presumably forwards to Reset() — confirm against the definition.
175 inline Utf8DecoderBase(uint16_t* buffer,
176 unsigned buffer_length,
177 const uint8_t* stream,
178 unsigned stream_length);
// Returns the stored UTF-16 length (set by Reset(), per the comment below).
179 inline unsigned Utf16Length() const { return utf16_length_; }
181 // This reads all characters and sets the utf16_length_.
182 // The first buffer_length utf16 chars are cached in the buffer.
183 void Reset(uint16_t* buffer,
184 unsigned buffer_length,
185 const uint8_t* stream,
186 unsigned stream_length);
// NOTE(review): the rest of this parameter list is not visible here.
187 static void WriteUtf16Slow(const uint8_t* stream,
// Presumably the first input byte whose decoded output did not fit in the
// buffer — confirm against Reset().
190 const uint8_t* unbuffered_start_;
// Value returned by Utf16Length().
191 unsigned utf16_length_;
// NOTE(review): meaning not derivable from this header; see the definition
// of Reset().
192 bool last_byte_of_buffer_unused_;
// Project macro; by its name it disables copy construction and assignment.
194 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
// Utf8DecoderBase subclass that carries its own UTF-16 buffer of kBufferSize
// code units (buffer_).
// NOTE(review): access specifiers and the closing brace are missing from
// this view.
197 template <unsigned kBufferSize>
198 class Utf8Decoder : public Utf8DecoderBase {
// Empty body; base-class state comes from the Utf8DecoderBase default
// constructor.
200 inline Utf8Decoder() {}
// Constructs a decoder over |length| bytes starting at |stream|.
201 inline Utf8Decoder(const char* stream, unsigned length);
// Re-targets the decoder at |length| bytes starting at |stream|; presumably
// forwards to Utf8DecoderBase::Reset() with buffer_ — confirm.
202 inline void Reset(const char* stream, unsigned length);
// Writes decoded UTF-16 into |data|; |length| presumably bounds the number of
// code units written and the result is presumably the count written —
// confirm against the definition.
203 inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
205 uint16_t buffer_[kBufferSize];
// NOTE(review): six identical predicate declarations, each belonging to a
// different struct whose header line is missing from this view. Each takes a
// character |c| and returns a bool; which character class each one tests
// cannot be determined from here.
210 static bool Is(uchar c);
213 static bool Is(uchar c);
216 static bool Is(uchar c);
219 static bool Is(uchar c);
222 static bool Is(uchar c);
225 static bool Is(uchar c);
// Character-class predicate; by its name, Is(c) presumably reports whether
// |c| is a line terminator — the definition is not visible here.
227 struct LineTerminator {
228 static bool Is(uchar c);
// Character-class predicate; by its name, Is(c) presumably reports whether
// |c| is a combining mark — the definition is not visible here.
230 struct CombiningMark {
231 static bool Is(uchar c);
// Character-class predicate; by its name, Is(c) presumably reports whether
// |c| is connector punctuation — the definition is not visible here.
233 struct ConnectorPunctuation {
234 static bool Is(uchar c);
// NOTE(review): members of two converter structs whose header lines are
// missing from this view; kIsToLower distinguishes them (true for the first,
// false for the second). The middle parameters of both Convert() declarations
// are also missing.
//
// kMaxWidth is presumably the largest number of uchars Convert() can produce
// for one input character — confirm.
237 static const int kMaxWidth = 3;
238 static const bool kIsToLower = true;
// |allow_caching_ptr| is an out-parameter; presumably it tells the caller
// whether the result for |c| may be cached — confirm.
239 static int Convert(uchar c,
242 bool* allow_caching_ptr);
245 static const int kMaxWidth = 3;
246 static const bool kIsToLower = false;
247 static int Convert(uchar c,
250 bool* allow_caching_ptr);
// Converter struct; by its name it presumably implements the ECMA-262
// canonicalization of a character — the definition is not visible here.
// NOTE(review): the middle parameters of Convert() and the closing brace are
// missing from this view.
252 struct Ecma262Canonicalize {
// Presumably the largest number of uchars Convert() can produce — confirm.
253 static const int kMaxWidth = 1;
// |allow_caching_ptr| is an out-parameter; presumably it tells the caller
// whether the result for |c| may be cached — confirm.
254 static int Convert(uchar c,
257 bool* allow_caching_ptr);
// Converter struct; by its name it presumably maps a character to the set of
// characters that canonicalize to the same value — the definition is not
// visible here.
// NOTE(review): the middle parameters of Convert() and the closing brace are
// missing from this view.
259 struct Ecma262UnCanonicalize {
// Presumably the largest number of uchars Convert() can produce — confirm.
260 static const int kMaxWidth = 4;
// |allow_caching_ptr| is an out-parameter; presumably it tells the caller
// whether the result for |c| may be cached — confirm.
261 static int Convert(uchar c,
264 bool* allow_caching_ptr);
// Converter struct with the same shape as the other Convert() providers in
// this header; its exact mapping is not derivable from this declaration.
// NOTE(review): the middle parameters of Convert() and the closing brace are
// missing from this view.
266 struct CanonicalizationRange {
// Presumably the largest number of uchars Convert() can produce — confirm.
267 static const int kMaxWidth = 1;
// |allow_caching_ptr| is an out-parameter; presumably it tells the caller
// whether the result for |c| may be cached — confirm.
268 static int Convert(uchar c,
271 bool* allow_caching_ptr);
274 } // namespace unibrow
276 #endif // V8_UNICODE_H_