1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 #include <sys/types.h>
35 * Definitions and convenience functions for working with unicode.
40 typedef unsigned int uchar;
41 typedef unsigned char byte;
44 * The max length of the result of converting the case of a single character.
47 const int kMaxMappingSize = 4;
// Predicate: a template parameterized on a type T and a cache size
// (256 by default). It owns a fixed-size array of CacheEntry slots.
// NOTE(review): parts of this declaration (class header, access specifiers,
// some members) are not visible here, so how T is used is not described.
49 template <class T, int size = 256>
52 inline Predicate() { }
// Answers the predicate for character c; the inline definition lives elsewhere.
53 inline bool get(uchar c);
// Declared here, defined elsewhere. Presumably computes the answer for c
// when it is not available from entries_ -- confirm against the definition.
56 bool CalculateValue(uchar c);
// A default-constructed cache entry holds code point 0 and value 0.
58 inline CacheEntry() : code_point_(0), value_(0) { }
59 inline CacheEntry(uchar code_point, bool value)
60 : code_point_(code_point),
// The stored code point is a 21-bit bitfield.
62 uchar code_point_ : 21;
// Number of cache slots, taken from the `size` template argument.
65 static const int kSize = size;
// kSize - 1. Presumably used to mask a code point into an index of
// entries_, which would require kSize to be a power of two -- TODO confirm.
66 static const int kMask = kSize - 1;
67 CacheEntry entries_[kSize];
70 // A cache used in case conversion. It caches the value for characters
71 // that either have no mapping or map to a single character independent
72 // of context. Characters that map to more than one character or that
73 // map differently depending on context are always looked up.
74 template <class T, int size = 256>
78 inline int get(uchar c, uchar n, uchar* result);
81 int CalculateValue(uchar c, uchar n, uchar* result);
83 inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
84 inline CacheEntry(uchar code_point, signed offset)
85 : code_point_(code_point),
89 static const int kNoChar = (1 << 21) - 1;
91 static const int kSize = size;
92 static const int kMask = kSize - 1;
93 CacheEntry entries_[kSize];
99 static int GetByteCount();
100 static const uchar kMaxCodePoint;
103 // --- U t f 8 a n d 16 ---
// Buffer: pairs a value of type Data with an unsigned length.
// NOTE(review): the class header and the member declarations are not
// visible here; only the constructors and accessors are documented.
105 template <typename Data>
// Stores the given data and length as-is.
108 inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
// The default-constructed buffer has data_ initialized from 0 and length 0.
109 inline Buffer() : data_(0), length_(0) { }
// Accessors; both return their member by value.
110 Data data() { return data_; }
111 unsigned length() { return length_; }
// Reports whether `code` is a UTF-16 lead surrogate. Returns false for the
// kNoPreviousCharacter sentinel; otherwise returns true when the bits
// selected by the mask 0xfc00 equal 0xd800 (0xD800..0xDBFF for 16-bit input).
120 static inline bool IsLeadSurrogate(int code) {
121 if (code == kNoPreviousCharacter) return false;
// Only bits 10..15 are compared; bits above bit 15 are ignored by the mask.
122 return (code & 0xfc00) == 0xd800;
// Reports whether `code` is a UTF-16 trail surrogate. Returns false for the
// kNoPreviousCharacter sentinel; otherwise returns true when the bits
// selected by the mask 0xfc00 equal 0xdc00 (0xDC00..0xDFFF for 16-bit input).
124 static inline bool IsTrailSurrogate(int code) {
125 if (code == kNoPreviousCharacter) return false;
// Only bits 10..15 are compared; bits above bit 15 are ignored by the mask.
126 return (code & 0xfc00) == 0xdc00;
// Builds a code point from a surrogate pair: the low 10 bits of `lead`
// become bits 10..19, the low 10 bits of `trail` become bits 0..9, and
// 0x10000 is added. Neither argument is checked to actually be a surrogate.
129 static inline int CombineSurrogatePair(uchar lead, uchar trail) {
130 return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
132 static const int kNoPreviousCharacter = -1;
133 static const uchar kMaxNonSurrogateCharCode = 0xffff;
134 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
135 // of UTF-8 data. The special case where the unit is a surrogate
136 // trail produces 1 byte net, because the encoding of the pair is
137 // 4 bytes and the 3 bytes that were used to encode the lead surrogate can be reclaimed.
139 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
140 // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
141 // The illegality stems from the surrogate not being part of a pair.
142 static const int kUtf8BytesToCodeASurrogate = 3;
// Returns the lead surrogate for `char_code`: subtracts 0x10000, takes
// bits 10..19 of the difference and adds them to 0xd800. No range check is
// performed on char_code.
143 static inline uchar LeadSurrogate(int char_code) {
144 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
// Returns the trail surrogate for `char_code`: 0xdc00 plus the low 10 bits
// of char_code. No range check is performed on char_code.
146 static inline uchar TrailSurrogate(int char_code) {
147 return 0xdc00 + (char_code & 0x3ff);
154 static inline uchar Length(uchar chr, int previous);
155 static inline unsigned Encode(
156 char* out, uchar c, int previous);
157 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
158 unsigned capacity, unsigned* chars_read, unsigned* offset);
159 static uchar CalculateValue(const byte* str,
162 static const uchar kBadChar = 0xFFFD;
163 static const unsigned kMaxEncodedSize = 4;
164 static const unsigned kMaxOneByteChar = 0x7f;
165 static const unsigned kMaxTwoByteChar = 0x7ff;
166 static const unsigned kMaxThreeByteChar = 0xffff;
167 static const unsigned kMaxFourByteChar = 0x1fffff;
169 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
170 // that match are coded as a 4 byte UTF-8 sequence.
171 static const unsigned kBytesSavedByCombiningSurrogates = 2;
172 static const unsigned kSizeOfUnmatchedSurrogate = 3;
175 template <unsigned s> friend class Utf8InputBuffer;
177 static inline uchar ValueOf(const byte* str,
182 // --- C h a r a c t e r S t r e a m ---
// CharacterStream: abstract base class for a stream of uchar values.
// Rewind() and FillBuffer() are pure virtual and must be supplied by a
// subclass. NOTE(review): access specifiers and the data member
// declarations are not visible in this view.
184 class CharacterStream {
// Returns the next character; the inline definition lives elsewhere.
186 inline uchar GetNext();
// True while remaining_ is non-zero.
187 inline bool has_more() { return remaining_ != 0; }
188 // Note that default implementation is not efficient.
189 virtual void Seek(unsigned);
191 unsigned Utf16Length();
// Virtual destructor with an empty body.
192 virtual ~CharacterStream() { }
// Static encode/decode helpers operating on a byte buffer. The bool
// results presumably signal whether the character fit within `capacity`
// -- confirm against the definitions.
193 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
195 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
196 unsigned capacity, unsigned& offset);
197 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
198 unsigned capacity, unsigned& offset);
199 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
// Pure virtual hooks that subclasses must implement.
200 virtual void Rewind() = 0;
203 virtual void FillBuffer() = 0;
204 // The number of characters left in the current buffer
206 // The current offset within the buffer
208 // The buffer containing the decoded characters.
212 // --- I n p u t B u f f e r ---
215 * Provides efficient access to encoded characters in strings. It
216 * does so by reading characters one block at a time, rather than one
217 * character at a time, which gives string implementations an
218 * opportunity to optimize the decoding.
// InputBuffer: a CharacterStream subclass parameterized on a Reader type,
// an Input type (Reader* by default) and a buffer size kSize (256 by
// default). It declares Rewind() and FillBuffer(), the two functions that
// are pure virtual in CharacterStream. NOTE(review): access specifiers and
// some members are not visible in this view.
220 template <class Reader, class Input = Reader*, unsigned kSize = 256>
221 class InputBuffer : public CharacterStream {
223 virtual void Rewind();
224 inline void Reset(Input input);
225 void Seek(unsigned position);
226 inline void Reset(unsigned position, Input input);
// Construction is delegated to Reset(input).
229 explicit InputBuffer(Input input) { Reset(input); }
230 virtual void FillBuffer();
232 // A custom offset that can be used by the string implementation to
233 // mark progress within the encoded string.
237 // To avoid heap allocation, we keep an internal buffer to which
238 // the encoded string can write its characters. The string
239 // implementation is free to decide whether it wants to use this
// Embedded storage of kSize bytes.
241 byte util_buffer_[kSize];
244 // --- U t f 8 I n p u t B u f f e r ---
// Utf8InputBuffer: an InputBuffer with Reader = Utf8 and
// Input = Buffer<const char*>, with the buffer size given by the template
// argument s (256 by default).
246 template <unsigned s = 256>
247 class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
249 inline Utf8InputBuffer() { }
// Declared here; the inline definition lives elsewhere.
250 inline Utf8InputBuffer(const char* data, unsigned length);
// Wraps (data, length) in a Buffer<const char*> and forwards it to the
// base class Reset(Input) overload.
251 inline void Reset(const char* data, unsigned length) {
252 InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
253 Buffer<const char*>(data, length));
259 static bool Is(uchar c);
262 static bool Is(uchar c);
265 static bool Is(uchar c);
268 static bool Is(uchar c);
271 static bool Is(uchar c);
274 static bool Is(uchar c);
276 struct LineTerminator {
277 static bool Is(uchar c);
279 struct CombiningMark {
280 static bool Is(uchar c);
282 struct ConnectorPunctuation {
283 static bool Is(uchar c);
286 static const int kMaxWidth = 3;
287 static int Convert(uchar c,
290 bool* allow_caching_ptr);
293 static const int kMaxWidth = 3;
294 static int Convert(uchar c,
297 bool* allow_caching_ptr);
299 struct Ecma262Canonicalize {
300 static const int kMaxWidth = 1;
301 static int Convert(uchar c,
304 bool* allow_caching_ptr);
306 struct Ecma262UnCanonicalize {
307 static const int kMaxWidth = 4;
308 static int Convert(uchar c,
311 bool* allow_caching_ptr);
313 struct CanonicalizationRange {
314 static const int kMaxWidth = 1;
315 static int Convert(uchar c,
318 bool* allow_caching_ptr);
321 } // namespace unibrow
323 #endif // V8_UNICODE_H_