2 *******************************************************************************
4 * Copyright (C) 2016 and later: Unicode, Inc. and others.
5 * License & terms of use: http://www.unicode.org/copyright.html#License
7 *******************************************************************************
8 *******************************************************************************
10 * Copyright (C) 2003-2006, International Business Machines
11 * Corporation and others. All Rights Reserved.
13 *******************************************************************************
14 * file name: uit_len8.c
16 * tab size: 8 (not used)
19 * created on: 2003feb10
20 * created by: Markus W. Scherer
22 * This file contains the implementation of the "lenient UTF-8" UCharIterator
23 * as used in the uciter8 sample code.
24 * UTF-8-style macros are defined as well as the UCharIterator.
25 * The macros are incomplete (do not assemble code points from pairs of
26 * surrogates, see comment below)
27 * but sufficient for the iterator.
31 #include "unicode/utypes.h"
32 #include "unicode/uiter.h"
34 /* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
37 * This code leniently reads 8-bit Unicode strings,
38 * which could contain a mix of UTF-8 and CESU-8.
40 * - supplementary code points may be encoded with dedicated 4-byte sequences
42 * - supplementary code points may be encoded with
43 * pairs of 3-byte sequences, one for each surrogate of the UTF-16 form
45 * - single surrogates are allowed, encoded with their "natural" 3-byte sequences
48 * Right now, the macros do not attempt to assemble code points from pairs of
49 * separately encoded surrogates.
50 * This would not be sufficient for processing based on these macros,
51 * but it is sufficient for a UCharIterator that returns only UChars anyway.
53 * The code is copied and modified from utf_impl.c and utf8.h.
55 * Change 2006feb08: Much of the implementation code is replaced by calling
56 * the utf_impl.c functions which accept a new "strict" parameter value
57 * of -2 implementing exactly this leniency.
/*
 * L8_NEXT(s, i, length, c): lenient forward read of one code point.
 * Visible behavior: loads the byte s[i] into c (as uint8_t) and post-increments
 * i; the multi-byte path passes that byte, the address of i and the length to
 * utf8_nextCharSafeBody() with a last argument of -2, the lenient "strict"
 * value mentioned in the 2006feb08 change note in this file.
 * NOTE(review): the original lines between the two visible statements (and the
 * closing brace) are absent from this listing -- presumably a test on c
 * selects the multi-byte path; confirm against the complete file.
 * Caution: i is expanded more than once ((i)++ and &(i)), so it must be a
 * plain lvalue without side effects.
 */
60 #define L8_NEXT(s, i, length, c) { \
61 (c)=(uint8_t)(s)[(i)++]; \
64 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (int32_t)(length), c, -2); \
/*
 * L8_PREV(s, start, i, c): lenient backward read of one code point.
 * Visible behavior: pre-decrements i and loads the byte s[i] into c (as
 * uint8_t); the multi-byte path passes start, the address of i and that byte
 * to utf8_prevCharSafeBody() with a last argument of -2 (same lenient value
 * that L8_NEXT uses).
 * NOTE(review): the original lines between the two visible statements (and the
 * closing brace) are absent from this listing -- presumably a test on c
 * selects the multi-byte path; confirm against the complete file.
 * Caution: i is expanded more than once (--(i) and &(i)), so it must be a
 * plain lvalue without side effects.
 */
71 #define L8_PREV(s, start, i, c) { \
72 (c)=(uint8_t)(s)[--(i)]; \
75 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -2); \
82 /* lenient-8 UCharIterator -------------------------------------------------- */
85 * This is a copy of the UTF-8 UCharIterator in uiter.cpp,
86 * except that it uses the lenient-8-bit-Unicode macros above.
90 * Minimal implementation:
91 * Maintain a single-UChar buffer for an additional surrogate.
92 * The caller must not modify start and limit because they are used internally.
94 * Use UCharIterator fields as follows:
95 * context pointer to UTF-8 string
96 * length UTF-16 length of the string; -1 until lazy evaluation
97 * start current UTF-8 index
98 * index current UTF-16 index; may be -1="unknown" after setState()
99 * limit UTF-8 length of the string
100 * reservedField supplementary code point
102 * Since UCharIterator delivers 16-bit code units, the iteration can be
103 * currently in the middle of the byte sequence for a supplementary code point.
104 * In this case, reservedField will contain that code point and start will
105 * point to after the corresponding byte sequence. The UTF-16 index will be
106 * one less than what it would otherwise be corresponding to the UTF-8 index.
107 * Otherwise, reservedField will be 0.
111 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
112 * Add implementations that do not call strlen() for iteration but check for NUL.
/*
 * getIndex() callback of the lenient-8 UCharIterator.
 *
 * iter   - iterator whose fields are used as described in the field table
 *          earlier in this file (start/limit are UTF-8 byte offsets,
 *          index/length are UTF-16 values that may be unknown, i.e. <0).
 * origin - selects which index is requested.
 *
 * Visible behavior: when a UTF-16 value is not known yet, it is computed by
 * decoding the UTF-8 text with L8_NEXT and the result is cached back into the
 * iterator (iter->index, iter->length), so this call can modify *iter.
 *
 * NOTE(review): this listing omits many original lines (the origin dispatch,
 * local declarations, the counting loops and the return statements);
 * the comments below describe only the statements that are visible.
 */
115 static int32_t U_CALLCONV
116 lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
123 /* the current UTF-16 index is unknown after setState(), count from the beginning */
126 int32_t i, limit, index;
128 s=(const uint8_t *)iter->context;
/* decode only up to the current UTF-8 offset, not to the end of the text */
130 limit=iter->start; /* count up to the UTF-8 index */
132 L8_NEXT(s, i, limit, c);
/* write the byte offset actually reached back into the iterator */
140 iter->start=i; /* just in case setState() did not get us to a code point boundary */
/* NOTE(review): the condition guarding this assignment is not visible here */
142 iter->length=index; /* in case it was <0 or wrong */
/* a non-zero reservedField means the trail surrogate is still pending */
144 if(iter->reservedField!=0) {
145 --index; /* we are in the middle of a supplementary code point */
/* second visible section: computes the UTF-16 length into a local "length" */
155 int32_t i, limit, length;
157 s=(const uint8_t *)iter->context;
160 * the current UTF-16 index is unknown after setState(),
161 * we must first count from the beginning to here
166 /* count from the beginning to the current index */
168 L8_NEXT(s, i, limit, c);
176 /* assume i==limit==iter->start, set the UTF-16 index */
177 iter->start=i; /* just in case setState() did not get us to a code point boundary */
/* cache the UTF-16 index; it is one less while a trail surrogate is pending */
178 iter->index= iter->reservedField!=0 ? length-1 : length;
182 if(iter->reservedField!=0) {
187 /* count from the current index to the end */
190 L8_NEXT(s, i, limit, c);
201 /* not a valid origin */
202 /* Should never get here! */
/*
 * move() callback of the lenient-8 UCharIterator.
 *
 * iter   - iterator to reposition; start, index, length and reservedField
 *          may all be updated.
 * delta  - signed number of UTF-16 code units to move.
 * origin - reference point that delta is relative to.
 *
 * Visible return values: the new UTF-16 index (iter->index), or
 * UITER_UNKNOWN_INDEX when the position was changed without the UTF-16 index
 * being known, or -1 on the path marked "Error".
 *
 * Visible strategy:
 *   1. compute the requested UTF-16 position "pos" when enough is known;
 *   2. pin to the start or the end of the string when pos is out of range;
 *   3. otherwise pick the cheapest starting point (start, current, or end);
 *   4. walk forward with L8_NEXT or backward, one code point at a time, and
 *      park a supplementary code point in reservedField when the move ends
 *      between its two UTF-16 code units.
 *
 * NOTE(review): this listing omits many original lines (the origin dispatch,
 * several conditions, loop bodies and closing braces); the comments below
 * describe only the statements that are visible.
 */
207 static int32_t U_CALLCONV
208 lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
211 int32_t pos; /* requested UTF-16 index */
212 int32_t i; /* UTF-8 index */
215 /* calculate the requested UTF-16 index */
221 /* iter->index<0 (unknown) is possible */
225 pos=iter->index+delta;
228 /* the current UTF-16 index is unknown after setState(), use only delta */
235 if(iter->length>=0) {
236 pos=iter->length+delta;
239 /* pin to the end, avoid counting the length */
/* jump to the last byte offset and drop any pending supplementary code point */
241 iter->start=iter->limit;
242 iter->reservedField=0;
244 return UITER_UNKNOWN_INDEX;
246 /* the current UTF-16 index is unknown, use only delta */
253 return -1; /* Error */
257 /* shortcuts: pinning to the edges of the string */
/* reset to the very beginning: UTF-16 index 0, byte offset 0, nothing pending */
259 iter->index=iter->start=iter->reservedField=0;
261 } else if(iter->length>=0 && pos>=iter->length) {
/* target is at or past the known end: place the iterator at the end */
262 iter->index=iter->length;
263 iter->start=iter->limit;
264 iter->reservedField=0;
268 /* minimize the number of L8_NEXT/PREV operations */
/* index unknown, or target is nearer to 0 than to the current index */
269 if(iter->index<0 || pos<iter->index/2) {
270 /* go forward from the start instead of backward from the current index */
271 iter->index=iter->start=iter->reservedField=0;
272 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
274 * if we have the UTF-16 index and length and the new position is
275 * closer to the end than the current index,
276 * then go backward from the end instead of forward from the current index
278 iter->index=iter->length;
279 iter->start=iter->limit;
280 iter->reservedField=0;
/* from here on delta is relative to the (possibly re-based) current index */
283 delta=pos-iter->index;
285 return iter->index; /* nothing to do */
288 /* move relative to unknown UTF-16 index */
290 return UITER_UNKNOWN_INDEX; /* nothing to do */
291 } else if(-delta>=iter->start) {
292 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
293 iter->index=iter->start=iter->reservedField=0;
295 } else if(delta>=(iter->limit-iter->start)) {
296 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
297 iter->index=iter->length; /* may or may not be <0 (unknown) */
298 iter->start=iter->limit;
299 iter->reservedField=0;
300 return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX;
306 /* move towards the requested position, pin to the edges of the string */
307 s=(const uint8_t *)iter->context;
308 pos=iter->index; /* could be <0 (unknown) */
/* ---- forward direction ---- */
312 int32_t limit=iter->limit;
/* leave the middle of a supplementary code point: clear the pending value */
313 if(iter->reservedField!=0) {
314 iter->reservedField=0;
/* decode one code point per iteration until delta is used up or the end is hit */
318 while(delta>0 && i<limit) {
319 L8_NEXT(s, i, limit, c);
323 } else if(delta>=2) {
326 } else /* delta==1 */ {
327 /* stop in the middle of a supplementary code point */
328 iter->reservedField=c;
330 break; /* delta=0; */
/*
 * Derive whichever of length/index is still unknown from the other one;
 * the +1 / -1 accounts for a trail surrogate that is still pending.
 * NOTE(review): the enclosing condition is not visible in this listing.
 */
334 if(iter->length<0 && iter->index>=0) {
335 iter->length= iter->reservedField==0 ? pos : pos+1;
336 } else if(iter->index<0 && iter->length>=0) {
337 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
/* ---- backward direction ---- */
340 } else /* delta<0 */ {
342 if(iter->reservedField!=0) {
343 iter->reservedField=0;
344 i-=4; /* we stayed behind the supplementary code point; go before it now */
/* step back one code point per iteration until delta is used up or offset 0 */
348 while(delta<0 && i>0) {
353 } else if(delta<=-2) {
356 } else /* delta==-1 */ {
357 /* stop in the middle of a supplementary code point */
358 i+=4; /* back to behind this supplementary code point for consistent state */
359 iter->reservedField=c;
361 break; /* delta=0; */
/* result: the tracked position when it is meaningful, otherwise see below */
368 return iter->index=pos;
370 /* we started with index<0 (unknown) so pos is bogus */
372 return iter->index=i; /* reached the beginning */
374 /* we still don't know the UTF-16 index */
375 return UITER_UNKNOWN_INDEX;
/*
 * hasNext() callback: reports whether another UTF-16 code unit can be read
 * going forward. That is the case when a supplementary code point is pending
 * in reservedField (its trail surrogate has not been delivered yet) or when
 * the UTF-8 offset iter->start has not reached iter->limit.
 * NOTE(review): the closing brace is not present in this listing.
 */
380 static UBool U_CALLCONV
381 lenient8IteratorHasNext(UCharIterator *iter) {
382 return iter->reservedField!=0 || iter->start<iter->limit;
/*
 * hasPrevious() callback: reports whether the UTF-8 offset iter->start is
 * past the beginning of the text (start>0).
 * NOTE(review): the closing brace is not present in this listing.
 */
385 static UBool U_CALLCONV
386 lenient8IteratorHasPrevious(UCharIterator *iter) {
387 return iter->start>0;
/*
 * current() callback: returns the UTF-16 code unit at the current position
 * without moving the iterator.
 *
 * Visible behavior:
 *  - with a supplementary code point pending in reservedField, the result is
 *    its trail surrogate (U16_TRAIL);
 *  - otherwise, if bytes remain, one code point is decoded with L8_NEXT using
 *    a local copy of the byte offset, so iter->start is left untouched.
 *
 * NOTE(review): the return statements for the decoded code point and for the
 * end-of-text case are not present in this listing.
 */
390 static UChar32 U_CALLCONV
391 lenient8IteratorCurrent(UCharIterator *iter) {
392 if(iter->reservedField!=0) {
393 return U16_TRAIL(iter->reservedField);
394 } else if(iter->start<iter->limit) {
395 const uint8_t *s=(const uint8_t *)iter->context;
/* decode through a scratch offset so the iterator position does not change */
397 int32_t i=iter->start;
399 L8_NEXT(s, i, iter->limit, c);
/* code points up to U+FFFF fit in a single UTF-16 code unit */
402 } else if(c<=0xffff) {
/*
 * next() callback: delivers the UTF-16 code unit at the current position and
 * advances the iterator.
 *
 * Visible behavior:
 *  - with a supplementary code point pending in reservedField, its trail
 *    surrogate is computed and reservedField is cleared;
 *  - otherwise, if bytes remain, L8_NEXT decodes one code point and advances
 *    iter->start directly; on reaching the end, whichever of iter->length /
 *    iter->index is still unknown is derived from the other;
 *  - a decoded supplementary code point (c>0xffff) is parked in reservedField
 *    so that the trail surrogate can be delivered by a later call.
 *
 * NOTE(review): the index updates and all return statements are not present
 * in this listing.
 */
412 static UChar32 U_CALLCONV
413 lenient8IteratorNext(UCharIterator *iter) {
416 if(iter->reservedField!=0) {
417 UChar trail=U16_TRAIL(iter->reservedField);
418 iter->reservedField=0;
/* only touch the UTF-16 index when it is known (>=0) */
419 if((index=iter->index)>=0) {
423 } else if(iter->start<iter->limit) {
424 const uint8_t *s=(const uint8_t *)iter->context;
/* unlike current(), this advances the iterator's own byte offset */
427 L8_NEXT(s, iter->start, iter->limit, c);
428 if((index=iter->index)>=0) {
/* reached the end with a known index: the UTF-16 length can now be cached */
430 if(iter->length<0 && iter->start==iter->limit) {
431 iter->length= c<=0xffff ? index : index+1;
/* reached the end with a known length: recover the unknown UTF-16 index */
433 } else if(iter->start==iter->limit && iter->length>=0) {
434 iter->index= c<=0xffff ? iter->length : iter->length-1;
438 } else if(c<=0xffff) {
/* supplementary code point: remember it until the trail surrogate is read */
441 iter->reservedField=c;
/*
 * previous() callback: moves the iterator back by one UTF-16 code unit and
 * delivers that unit.
 *
 * Visible behavior:
 *  - with a supplementary code point pending in reservedField, its lead
 *    surrogate is computed, reservedField is cleared and iter->start is moved
 *    back by 4 bytes to just before that code point;
 *  - otherwise, if start>0, L8_PREV decodes the preceding code point and
 *    moves iter->start back; near the beginning the UTF-16 index is set from
 *    the byte offset;
 *  - for a decoded supplementary code point, iter->start is moved forward
 *    again by 4 and the code point is parked in reservedField, matching the
 *    "start points after the byte sequence" convention described in the
 *    field notes earlier in this file.
 *
 * NOTE(review): the fixed 4-byte adjustment relies on supplementary code
 * points coming only from 4-byte sequences (the macros do not join pairs of
 * 3-byte surrogates, per the notes in this file). The index updates and all
 * return statements are not present in this listing.
 */
449 static UChar32 U_CALLCONV
450 lenient8IteratorPrevious(UCharIterator *iter) {
453 if(iter->reservedField!=0) {
454 UChar lead=U16_LEAD(iter->reservedField);
455 iter->reservedField=0;
456 iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
/* only touch the UTF-16 index when it is known and non-zero */
457 if((index=iter->index)>0) {
461 } else if(iter->start>0) {
462 const uint8_t *s=(const uint8_t *)iter->context;
/* decode backwards; 0 is the lower bound for the byte offset */
465 L8_PREV(s, 0, iter->start, c);
466 if((index=iter->index)>0) {
/* at byte offset 0 or 1 the UTF-16 index follows directly from the offset */
468 } else if(iter->start<=1) {
469 iter->index= c<=0xffff ? iter->start : iter->start+1;
473 } else if(c<=0xffff) {
476 iter->start+=4; /* back to behind this supplementary code point for consistent state */
477 iter->reservedField=c;
/*
 * getState() callback: packs the iterator position into a 32-bit value.
 * The UTF-8 byte offset iter->start is stored shifted left by one bit;
 * lenient8IteratorSetState() in this file reads the offset back with
 * state>>1 and treats bit 0 as the "in surrogate pair" flag.
 *
 * NOTE(review): the shift is applied to iter->start before the cast to
 * uint32_t; if start is a signed 32-bit field, offsets of 2^30 or more would
 * overflow the signed shift -- consider casting first. Confirm the field type.
 * NOTE(review): the statement inside the reservedField branch and the return
 * statement are not present in this listing.
 */
485 static uint32_t U_CALLCONV
486 lenient8IteratorGetState(const UCharIterator *iter) {
487 uint32_t state=(uint32_t)(iter->start<<1);
488 if(iter->reservedField!=0) {
/*
 * setState() callback: restores a position produced by
 * lenient8IteratorGetState().
 *
 * iter       - iterator to reposition.
 * state      - packed position: UTF-8 byte offset in the upper bits, bit 0
 *              set when the position is inside a surrogate pair.
 * pErrorCode - in/out ICU error code; nothing is done when it is NULL or
 *              already indicates a failure.
 *
 * Errors set (visible):
 *  - U_ILLEGAL_ARGUMENT_ERROR when iter is NULL;
 *  - U_INDEX_OUTOFBOUNDS_ERROR when the decoded offset is outside
 *    [0, iter->limit], or below 4 while the surrogate flag is set, and on one
 *    further path after re-reading the code point.
 *
 * After a successful restore the UTF-16 index is marked unknown (-1).
 *
 * NOTE(review): several original lines (branch bodies, the condition that
 * validates the re-read code point, closing braces) are not present in this
 * listing.
 */
494 static void U_CALLCONV
495 lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
496 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
498 } else if(iter==NULL) {
499 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
500 } else if(state==lenient8IteratorGetState(iter)) {
501 /* setting to the current state: no-op */
/* unpack: upper bits are the byte offset, bit 0 is the surrogate-pair flag */
503 int32_t index=(int32_t)(state>>1); /* UTF-8 index */
504 state&=1; /* 1 if in surrogate pair, must be index>=4 */
/* reject offsets outside the text, or too small to follow a pending code point */
506 if((state==0 ? index<0 : index<4) || iter->limit<index) {
507 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
509 iter->start=index; /* restore UTF-8 byte index */
513 iter->index=-1; /* unknown UTF-16 index */
516 iter->reservedField=0;
518 /* verified index>=4 above */
/* re-read the code point that ends at the restored offset (local index moves back) */
520 L8_PREV((const uint8_t *)iter->context, 0, index, c);
522 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
/* keep the code point so that its trail surrogate is delivered next */
524 iter->reservedField=c;
/*
 * Template UCharIterator wired to the lenient-8 callbacks defined in this
 * file. uiter_setLenient8() copies it into the caller's iterator
 * (*iter=lenient8Iterator) before filling in the per-string fields.
 * NOTE(review): the initializer lines originally numbered 532 and 540 and the
 * closing "};" are not present in this listing.
 */
531 static const UCharIterator lenient8Iterator={
533 lenient8IteratorGetIndex,
534 lenient8IteratorMove,
535 lenient8IteratorHasNext,
536 lenient8IteratorHasPrevious,
537 lenient8IteratorCurrent,
538 lenient8IteratorNext,
539 lenient8IteratorPrevious,
541 lenient8IteratorGetState,
542 lenient8IteratorSetState
/*
 * Public entry point: sets up *iter to iterate over the lenient
 * UTF-8/CESU-8 string s.
 *
 * iter   - iterator structure to initialize.
 * s      - the 8-bit text; a null pointer selects the no-op iterator.
 * length - byte length of s; values below -1 select the no-op iterator.
 *
 * Visible behavior: for valid arguments the lenient8Iterator template is
 * copied into *iter, iter->limit is set from strlen(s) on one path, and the
 * UTF-16 length is pre-set only for strings of at most one byte (where it
 * equals the byte length); otherwise it stays -1 until it is counted lazily.
 * For invalid arguments uiter_setString(iter, NULL, 0) installs a no-op
 * iterator.
 *
 * NOTE(review): presumably strlen() is used only when length<0 and an
 * explicit length is stored otherwise -- those lines are not present in this
 * listing. strlen() returns size_t, so the assignment to iter->limit narrows
 * implicitly; no <string.h> include and no NULL check on iter are visible in
 * this listing -- confirm against the complete file.
 */
545 U_CAPI void U_EXPORT2
546 uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) {
548 if(s!=0 && length>=-1) {
549 *iter=lenient8Iterator;
554 iter->limit=strlen(s);
/* a string of 0 or 1 bytes has the same UTF-16 length; longer ones are counted lazily */
556 iter->length= iter->limit<=1 ? iter->limit : -1;
558 /* set no-op iterator */
559 uiter_setString(iter, NULL, 0);