1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2000-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnvlat1.cpp
10 * tab size: 8 (not used)
13 * created on: 2000feb07
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION
21 #include "unicode/ucnv.h"
22 #include "unicode/uset.h"
23 #include "unicode/utf8.h"
27 /* control optimizations according to the platform */
28 #define LATIN1_UNROLL_FROM_UNICODE 1
30 /* ISO 8859-1 --------------------------------------------------------------- */
32 /* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
/*
 * ISO 8859-1 to Unicode conversion.
 * Reads source, sourceLimit, target, targetLimit and offsets from pArgs and
 * writes the advanced source and target pointers (and the offsets pointer)
 * back into pArgs at the end.
 * Sets *pErrorCode to U_BUFFER_OVERFLOW_ERROR when there are more source
 * bytes than the target can hold.
 */
34 _Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
35 UErrorCode *pErrorCode) {
36 const uint8_t *source;
38 int32_t targetCapacity, length;
43 /* set up the local pointers */
44 source=(const uint8_t *)pArgs->source;
46 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
47 offsets=pArgs->offsets;
52 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
53 * for the minimum of the sourceLength and targetCapacity
/* length = number of source bytes available; targetCapacity ends up as min(length, capacity) */
55 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
56 if(length<=targetCapacity) {
57 targetCapacity=length;
59 /* target will be full */
60 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
61 length=targetCapacity;
64 if(targetCapacity>=8) {
65 /* This loop is unrolled for speed and improved pipelining. */
/* count = number of complete groups of 8 units; targetCapacity and length keep only the remainder (0..7) */
68 loops=count=targetCapacity>>3;
69 length=targetCapacity&=0x7;
/* one offset per output unit of a group: eight consecutive source indexes */
85 offsets[0]=sourceIndex++;
86 offsets[1]=sourceIndex++;
87 offsets[2]=sourceIndex++;
88 offsets[3]=sourceIndex++;
89 offsets[4]=sourceIndex++;
90 offsets[5]=sourceIndex++;
91 offsets[6]=sourceIndex++;
92 offsets[7]=sourceIndex++;
/* handle the units left over after the unrolled groups */
99 while(targetCapacity>0) {
104 /* write back the updated pointers */
105 pArgs->source=(const char *)source;
106 pArgs->target=target;
/* offsets continue with consecutive source indexes; the advanced offsets pointer is stored back */
111 *offsets++=sourceIndex++;
114 pArgs->offsets=offsets;
118 /* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar(). */
/*
 * Consumes one source byte when input is available by advancing
 * pArgs->source by one; with empty input (source at sourceLimit) it sets
 * *pErrorCode to U_INDEX_OUTOFBOUNDS_ERROR.
 * NOTE(review): the return statements are not visible here; presumably the
 * consumed byte is returned as the code point - confirm.
 */
120 _Latin1GetNextUChar(UConverterToUnicodeArgs *pArgs,
121 UErrorCode *pErrorCode) {
122 const uint8_t *source=(const uint8_t *)pArgs->source;
123 if(source<(const uint8_t *)pArgs->sourceLimit) {
/* at least one byte is available: step over it */
124 pArgs->source=(const char *)(source+1);
128 /* no output because of empty input */
129 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
133 /* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets(). */
/*
 * Unicode to single-byte conversion shared by the ISO 8859-1 and US-ASCII
 * converters (both function tables in this file list it).
 * Code units up to `max` are copied as single bytes. For anything else
 * *pErrorCode is set to U_ILLEGAL_CHAR_FOUND when cp is a surrogate and to
 * U_INVALID_CHAR_FOUND otherwise.
 * If no error occurred but input remains and the target is full,
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR.
 * The updated source, target and offsets pointers are written back to pArgs.
 */
135 _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
136 UErrorCode *pErrorCode) {
138 const UChar *source, *sourceLimit;
139 uint8_t *target, *oldTarget;
140 int32_t targetCapacity, length;
148 /* set up the local pointers */
149 cnv=pArgs->converter;
150 source=pArgs->source;
151 sourceLimit=pArgs->sourceLimit;
/* oldTarget remembers the start of the output so the number of bytes written can be computed later */
152 target=oldTarget=(uint8_t *)pArgs->target;
153 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
154 offsets=pArgs->offsets;
/* the largest directly convertible code unit depends on which converter is calling */
156 if(cnv->sharedData==&_Latin1Data) {
157 max=0xff; /* Latin-1 */
159 max=0x7f; /* US-ASCII */
162 /* get the converter state from UConverter */
165 /* sourceIndex=-1 if the current character began in the previous buffer */
166 sourceIndex= cp==0 ? 0 : -1;
169 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
170 * for the minimum of the sourceLength and targetCapacity
172 length=(int32_t)(sourceLimit-source);
173 if(length<targetCapacity) {
174 targetCapacity=length;
177 /* conversion loop */
/* cp!=0: a code point is pending from the previous buffer (see sourceIndex above) */
178 if(cp!=0 && targetCapacity>0) {
182 #if LATIN1_UNROLL_FROM_UNICODE
183 /* unroll the loop with the most common case */
184 if(targetCapacity>=16) {
185 int32_t count, loops;
/* count = number of complete groups of 16 code units */
188 loops=count=targetCapacity>>4;
/*
 * Copy 16 code units optimistically, truncating each to one byte, while
 * OR-ing them together in oredChars so that a single value summarizes
 * the whole group for the validity test that follows.
 */
190 oredChars=u=*source++;
191 *target++=(uint8_t)u;
192 oredChars|=u=*source++;
193 *target++=(uint8_t)u;
194 oredChars|=u=*source++;
195 *target++=(uint8_t)u;
196 oredChars|=u=*source++;
197 *target++=(uint8_t)u;
198 oredChars|=u=*source++;
199 *target++=(uint8_t)u;
200 oredChars|=u=*source++;
201 *target++=(uint8_t)u;
202 oredChars|=u=*source++;
203 *target++=(uint8_t)u;
204 oredChars|=u=*source++;
205 *target++=(uint8_t)u;
206 oredChars|=u=*source++;
207 *target++=(uint8_t)u;
208 oredChars|=u=*source++;
209 *target++=(uint8_t)u;
210 oredChars|=u=*source++;
211 *target++=(uint8_t)u;
212 oredChars|=u=*source++;
213 *target++=(uint8_t)u;
214 oredChars|=u=*source++;
215 *target++=(uint8_t)u;
216 oredChars|=u=*source++;
217 *target++=(uint8_t)u;
218 oredChars|=u=*source++;
219 *target++=(uint8_t)u;
220 oredChars|=u=*source++;
221 *target++=(uint8_t)u;
223 /* were all 16 entries really valid? */
225 /* no, return to the first of these 16 */
/* account for the units handled by the unrolled groups */
232 targetCapacity-=16*count;
/* sixteen consecutive source indexes, one per byte written in a group */
237 *offsets++=sourceIndex++;
238 *offsets++=sourceIndex++;
239 *offsets++=sourceIndex++;
240 *offsets++=sourceIndex++;
241 *offsets++=sourceIndex++;
242 *offsets++=sourceIndex++;
243 *offsets++=sourceIndex++;
244 *offsets++=sourceIndex++;
245 *offsets++=sourceIndex++;
246 *offsets++=sourceIndex++;
247 *offsets++=sourceIndex++;
248 *offsets++=sourceIndex++;
249 *offsets++=sourceIndex++;
250 *offsets++=sourceIndex++;
251 *offsets++=sourceIndex++;
252 *offsets++=sourceIndex++;
259 /* conversion loop */
/* one unit at a time: stops when capacity is used up or a code unit above max has been read */
261 while(targetCapacity>0 && (c=*source++)<=max) {
262 /* convert the Unicode code point */
263 *target++=(uint8_t)c;
/* classify the code unit that could not be converted */
269 if(!U_IS_SURROGATE(cp)) {
270 /* callback(unassigned) */
271 } else if(U_IS_SURROGATE_LEAD(cp)) {
273 if(source<sourceLimit) {
274 /* test the following code unit */
276 if(U16_IS_TRAIL(trail)) {
/* a complete surrogate pair: cp becomes the supplementary code point */
278 cp=U16_GET_SUPPLEMENTARY(cp, trail);
279 /* this codepage does not map supplementary code points */
280 /* callback(unassigned) */
282 /* this is an unmatched lead code unit (1st surrogate) */
283 /* callback(illegal) */
291 /* this is an unmatched trail code unit (2nd surrogate) */
292 /* callback(illegal) */
/* a cp that is still a single surrogate is illegal input; any other cp is merely unmappable */
295 *pErrorCode= U_IS_SURROGATE(cp) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND;
300 /* set offsets since the start */
/* count = number of bytes written since the function started */
302 size_t count=target-oldTarget;
304 *offsets++=sourceIndex++;
/* no error so far, but input remains and the target is full */
309 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
311 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
314 /* write back the updated pointers */
315 pArgs->source=source;
316 pArgs->target=(char *)target;
317 pArgs->offsets=offsets;
320 /* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8(). */
/*
 * Direct UTF-8 to Latin-1 conversion without going through UTF-16.
 * Two-byte sequences with lead byte 0xC2 or 0xC3 (U+0080..U+00FF) are
 * converted inline. Input that this function does not handle makes it set
 * *pErrorCode to U_USING_DEFAULT_WARNING, which per the comments below
 * requests a fallback to the pivoting implementation.
 * U_BUFFER_OVERFLOW_ERROR is set when the target has no room left.
 * A lead byte truncated at the end of the input is saved in the UTF-8
 * converter (toUnicodeStatus, toUBytes[0], mode).
 */
322 ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
323 UConverterToUnicodeArgs *pToUArgs,
324 UErrorCode *pErrorCode) {
326 const uint8_t *source, *sourceLimit;
328 int32_t targetCapacity;
333 /* set up the local pointers */
334 utf8=pToUArgs->converter;
335 source=(uint8_t *)pToUArgs->source;
336 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
337 target=(uint8_t *)pFromUArgs->target;
338 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
340 /* get the converter state from the UTF-8 UConverter */
/* c!=0: a lead byte was saved by an earlier call and now needs its trail byte */
341 c=(UChar32)utf8->toUnicodeStatus;
342 if(c!=0 && source<sourceLimit) {
343 if(targetCapacity==0) {
344 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
/* (uint8_t)(*source-0x80)<=0x3f holds exactly for trail bytes 0x80..0xBF */
346 } else if(c>=0xc2 && c<=0xc3 && (t1=(uint8_t)(*source-0x80)) <= 0x3f) {
/* low 2 bits of the lead byte become bits 7..6, the trail byte supplies bits 5..0 */
348 *target++=(uint8_t)(((c&3)<<6)|t1);
/* the pending lead byte has been consumed */
351 utf8->toUnicodeStatus=0;
354 /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
355 *pErrorCode=U_USING_DEFAULT_WARNING;
361 * Make sure that the last byte sequence before sourceLimit is complete
362 * or runs into a lead byte.
363 * In the conversion loop compare source with sourceLimit only once
364 * per multi-byte character.
365 * For Latin-1, adjust sourceLimit only for 1 trail byte because
366 * the conversion loop handles at most 2-byte sequences.
368 if(source<sourceLimit && U8_IS_LEAD(*(sourceLimit-1))) {
372 /* conversion loop */
373 while(source<sourceLimit) {
374 if(targetCapacity>0) {
/* first branch: the byte b is copied unchanged */
378 *target++=(uint8_t)b;
380 } else if( /* handle U+0080..U+00FF inline */
381 b>=0xc2 && b<=0xc3 &&
382 (t1=(uint8_t)(*source-0x80)) <= 0x3f
/* same 2-byte decoding as for the pending lead byte above */
385 *target++=(uint8_t)(((b&3)<<6)|t1);
388 /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
/* NOTE(review): source-1 presumably un-reads the byte b taken at the top of this iteration - confirm */
389 pToUArgs->source=(char *)(source-1);
390 pFromUArgs->target=(char *)target;
391 *pErrorCode=U_USING_DEFAULT_WARNING;
396 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
402 * The sourceLimit may have been adjusted before the conversion loop
403 * to stop before a truncated sequence.
404 * If so, then collect the truncated sequence now.
405 * For Latin-1, there is at most exactly one lead byte because of the
406 * smaller sourceLimit adjustment logic.
/* reload the caller's real sourceLimit; any byte between source and it is the truncated lead byte */
408 if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
/* remember the lead byte both as the pending state and as the first collected input byte */
409 utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++;
/* mode = lead byte plus its number of trail bytes */
411 utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1;
414 /* write back the updated pointers */
415 pToUArgs->source=(char *)source;
416 pFromUArgs->target=(char *)target;
/*
 * Reports the code points covered by the ISO 8859-1 converter by adding
 * the range 0..0xff through sa->addRange.
 * cnv, which and pErrorCode are not used in the lines visible here.
 */
420 _Latin1GetUnicodeSet(const UConverter *cnv,
422 UConverterUnicodeSet which,
423 UErrorCode *pErrorCode) {
424 sa->addRange(sa->set, 0, 0xff);
/*
 * Function table for the ISO 8859-1 converter.
 * The to-Unicode and the from-Unicode function each fill two consecutive
 * entries, and the Unicode-set function appears further down.
 * NOTE(review): presumably the paired entries are the plain and
 * with-offsets variants - confirm against the UConverterImpl declaration.
 */
427 static const UConverterImpl _Latin1Impl={
437 _Latin1ToUnicodeWithOffsets,
438 _Latin1ToUnicodeWithOffsets,
439 _Latin1FromUnicodeWithOffsets,
440 _Latin1FromUnicodeWithOffsets,
447 _Latin1GetUnicodeSet,
/*
 * Static description of the ISO 8859-1 converter (type UCNV_LATIN_1).
 * NOTE(review): 819 is presumably the IBM code page number, the two 1s the
 * minimum and maximum bytes per character, and 0x1a the substitution byte -
 * confirm against the field order of UConverterStaticData.
 */
453 static const UConverterStaticData _Latin1StaticData={
454 sizeof(UConverterStaticData),
456 819, UCNV_IBM, UCNV_LATIN_1, 1, 1,
457 { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE,
460 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared data object of the ISO 8859-1 converter, built from its static data
 * and function table. _Latin1FromUnicodeWithOffsets compares
 * cnv->sharedData with the address of this object to select max=0xff.
 */
463 const UConverterSharedData _Latin1Data=
464 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Latin1StaticData, &_Latin1Impl);
466 /* US-ASCII ----------------------------------------------------------------- */
468 /* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets(). */
/*
 * US-ASCII to Unicode conversion.
 * Bytes up to 0x7f are copied to the target as UChars. The conversion stops
 * at a byte above 0x7f with *pErrorCode set to U_ILLEGAL_CHAR_FOUND.
 * If no such byte was found but input remains and the target is full,
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR.
 * The updated source, target and offsets pointers are written back to pArgs.
 */
470 _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
471 UErrorCode *pErrorCode) {
472 const uint8_t *source, *sourceLimit;
473 UChar *target, *oldTarget;
474 int32_t targetCapacity, length;
481 /* set up the local pointers */
482 source=(const uint8_t *)pArgs->source;
483 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
/* oldTarget remembers the start of the output so the number of units written can be computed later */
484 target=oldTarget=pArgs->target;
485 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
486 offsets=pArgs->offsets;
488 /* sourceIndex=-1 if the current character began in the previous buffer */
492 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
493 * for the minimum of the sourceLength and targetCapacity
495 length=(int32_t)(sourceLimit-source);
496 if(length<targetCapacity) {
497 targetCapacity=length;
500 if(targetCapacity>=8) {
501 /* This loop is unrolled for speed and improved pipelining. */
502 int32_t count, loops;
/* count = number of complete groups of 8 bytes */
505 loops=count=targetCapacity>>3;
/*
 * Copy 8 bytes optimistically while OR-ing them together in oredChars so
 * that a single value summarizes the whole group for the validity test
 * that follows.
 */
507 oredChars=target[0]=source[0];
508 oredChars|=target[1]=source[1];
509 oredChars|=target[2]=source[2];
510 oredChars|=target[3]=source[3];
511 oredChars|=target[4]=source[4];
512 oredChars|=target[5]=source[5];
513 oredChars|=target[6]=source[6];
514 oredChars|=target[7]=source[7];
516 /* were all 8 entries really valid? */
518 /* no, return to the first of these 8 */
/* account for the units handled by the unrolled groups */
525 targetCapacity-=count*8;
/* eight consecutive source indexes, one per unit written in a group */
530 offsets[0]=sourceIndex++;
531 offsets[1]=sourceIndex++;
532 offsets[2]=sourceIndex++;
533 offsets[3]=sourceIndex++;
534 offsets[4]=sourceIndex++;
535 offsets[5]=sourceIndex++;
536 offsets[6]=sourceIndex++;
537 offsets[7]=sourceIndex++;
544 /* conversion loop */
/* one byte at a time: stops when capacity is used up or a byte above 0x7f has been read */
546 while(targetCapacity>0 && (c=*source++)<=0x7f) {
552 /* callback(illegal); copy the current bytes to toUBytes[] */
553 UConverter *cnv=pArgs->converter;
556 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
/* otherwise: input remains but the target is full */
557 } else if(source<sourceLimit && target>=pArgs->targetLimit) {
559 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
562 /* set offsets since the start */
/* count = number of UChars written since the function started */
564 size_t count=target-oldTarget;
566 *offsets++=sourceIndex++;
571 /* write back the updated pointers */
572 pArgs->source=(const char *)source;
573 pArgs->target=target;
574 pArgs->offsets=offsets;
577 /* This is a table-less version of ucnv_MBCSSingleGetNextUChar(). */
/*
 * Reads the next input byte when one is available and stores the updated
 * source pointer back into pArgs. On one path *pErrorCode is set to
 * U_ILLEGAL_CHAR_FOUND; with empty input it is set to
 * U_INDEX_OUTOFBOUNDS_ERROR.
 * NOTE(review): the byte test and the return statements are not visible
 * here; presumably bytes up to 0x7f are returned as code points and larger
 * bytes are the illegal case - confirm.
 */
579 _ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
580 UErrorCode *pErrorCode) {
581 const uint8_t *source;
584 source=(const uint8_t *)pArgs->source;
585 if(source<(const uint8_t *)pArgs->sourceLimit) {
587 pArgs->source=(const char *)source;
/* illegal input path: the converter object is fetched before the error is reported */
591 UConverter *cnv=pArgs->converter;
594 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
599 /* no output because of empty input */
600 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
604 /* "Convert" UTF-8 to US-ASCII: Validate and copy. */
/*
 * Copies bytes up to 0x7f from the UTF-8 source straight to the target.
 * *pErrorCode is set to U_USING_DEFAULT_WARNING when the UTF-8 converter
 * holds a partial character from an earlier call, or when a non-ASCII byte
 * is found; per the comments below, such input is left to the standard
 * (pivoting) converter.
 * If input remains and the target is full, *pErrorCode is set to
 * U_BUFFER_OVERFLOW_ERROR.
 * The updated source and target pointers are written back.
 */
606 ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
607 UConverterToUnicodeArgs *pToUArgs,
608 UErrorCode *pErrorCode) {
609 const uint8_t *source, *sourceLimit;
611 int32_t targetCapacity, length;
615 if(pToUArgs->converter->toUnicodeStatus!=0) {
616 /* no handling of partial UTF-8 characters here, fall back to pivoting */
617 *pErrorCode=U_USING_DEFAULT_WARNING;
621 /* set up the local pointers */
622 source=(const uint8_t *)pToUArgs->source;
623 sourceLimit=(const uint8_t *)pToUArgs->sourceLimit;
624 target=(uint8_t *)pFromUArgs->target;
625 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
628 * since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter
629 * for the minimum of the sourceLength and targetCapacity
631 length=(int32_t)(sourceLimit-source);
632 if(length<targetCapacity) {
633 targetCapacity=length;
636 /* unroll the loop with the most common case */
637 if(targetCapacity>=16) {
638 int32_t count, loops;
/* count = number of complete groups of 16 bytes */
641 loops=count=targetCapacity>>4;
/*
 * Copy 16 bytes optimistically while OR-ing them together in oredChars so
 * that a single value summarizes the whole group for the validity test
 * that follows.
 */
643 oredChars=*target++=*source++;
644 oredChars|=*target++=*source++;
645 oredChars|=*target++=*source++;
646 oredChars|=*target++=*source++;
647 oredChars|=*target++=*source++;
648 oredChars|=*target++=*source++;
649 oredChars|=*target++=*source++;
650 oredChars|=*target++=*source++;
651 oredChars|=*target++=*source++;
652 oredChars|=*target++=*source++;
653 oredChars|=*target++=*source++;
654 oredChars|=*target++=*source++;
655 oredChars|=*target++=*source++;
656 oredChars|=*target++=*source++;
657 oredChars|=*target++=*source++;
658 oredChars|=*target++=*source++;
660 /* were all 16 entries really valid? */
662 /* no, return to the first of these 16 */
/* account for the bytes handled by the unrolled groups */
669 targetCapacity-=16*count;
672 /* conversion loop */
/*
 * c is read without advancing source in the loop condition, so source
 * still points at the non-ASCII byte when the loop stops on one.
 */
674 while(targetCapacity>0 && (c=*source)<=0x7f) {
681 /* non-ASCII character, handle in standard converter */
682 *pErrorCode=U_USING_DEFAULT_WARNING;
/* otherwise: input remains but the target is full */
683 } else if(source<sourceLimit && target>=(const uint8_t *)pFromUArgs->targetLimit) {
685 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
688 /* write back the updated pointers */
689 pToUArgs->source=(const char *)source;
690 pFromUArgs->target=(char *)target;
/*
 * Reports the code points covered by the US-ASCII converter by adding
 * the range 0..0x7f through sa->addRange.
 * cnv, which and pErrorCode are not used in the lines visible here.
 */
694 _ASCIIGetUnicodeSet(const UConverter *cnv,
696 UConverterUnicodeSet which,
697 UErrorCode *pErrorCode) {
698 sa->addRange(sa->set, 0, 0x7f);
/*
 * Function table for the US-ASCII converter.
 * The to-Unicode entries use the ASCII-specific function. The from-Unicode
 * entries reuse _Latin1FromUnicodeWithOffsets, which selects max=0x7f when
 * the converter's sharedData is not &_Latin1Data.
 * NOTE(review): presumably the paired entries are the plain and
 * with-offsets variants - confirm against the UConverterImpl declaration.
 */
701 static const UConverterImpl _ASCIIImpl={
711 _ASCIIToUnicodeWithOffsets,
712 _ASCIIToUnicodeWithOffsets,
713 _Latin1FromUnicodeWithOffsets,
714 _Latin1FromUnicodeWithOffsets,
/*
 * Static description of the US-ASCII converter (type UCNV_US_ASCII).
 * NOTE(review): 367 is presumably the IBM code page number, the two 1s the
 * minimum and maximum bytes per character, and 0x1a the substitution byte -
 * confirm against the field order of UConverterStaticData.
 */
727 static const UConverterStaticData _ASCIIStaticData={
728 sizeof(UConverterStaticData),
730 367, UCNV_IBM, UCNV_US_ASCII, 1, 1,
731 { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE,
734 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared data object of the US-ASCII converter, built from its static data and function table. */
737 const UConverterSharedData _ASCIIData=
738 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ASCIIStaticData, &_ASCIIImpl);