source/samples/ucnv/convsamp.cpp

   1 /*************************************************************************
   2 *
   3 *   Copyright (C) 2016 and later: Unicode, Inc. and others.
   4 *   License & terms of use: http://www.unicode.org/copyright.html#License
   5 *
   6 **************************************************************************
   7 **************************************************************************
   8 *
   9 *   Copyright (C) 2000-2016, International Business Machines
  10 *   Corporation and others.  All Rights Reserved.
  11 *
  12 ***************************************************************************
  13 *   file name:  convsamp.c
  14 *   encoding:   ASCII (7-bit)
  15 *
  16 *   created on: 2000may30
  17 *   created by: Steven R. Loomis
  18 *
  19 *   Sample code for the ICU conversion routines.
  20 *
  21 * Note: Nothing special is needed to build this sample. Link with
  22 *       the icu UC and icu I18N libraries.
  23 *
  24 *       I use 'assert' for error checking, you probably will want
  25 *       something more flexible.  '***BEGIN SAMPLE***' and
  26 *       '***END SAMPLE***' mark pieces suitable for stand alone
  27 *       code snippets.
  28 *
  29 *
   30 *  Each test can define its own BUFFERSIZE
  31 *
  32 */
  33
  34 #define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
  35
  36 #include <stdio.h>
  37 #include <ctype.h>            /* for isspace, etc.    */
  38 #include <assert.h>
  39 #include <string.h>
  40 #include <stdlib.h>  /* malloc */
  41
  42 #include "cmemory.h"
  43 #include "unicode/utypes.h"   /* Basic ICU data types */
  44 #include "unicode/ucnv.h"     /* C   Converter API    */
  45 #include "unicode/ustring.h"  /* some more string fcns*/
  46 #include "unicode/uchar.h"    /* char names           */
  47 #include "unicode/uloc.h"
  48 #include "unicode/unistr.h"
  49
  50 #include "flagcb.h"
  51
  52 /* Some utility functions */
  53
  54 static const UChar kNone[] = { 0x0000 };
  55
  56 #define U_ASSERT(x)  { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
  57
  58 /* Print a UChar if possible, in seven characters. */
  59 void prettyPrintUChar(UChar c)
  60 {
  61   if(  (c <= 0x007F) &&
  62        (isgraph(c))  ) {
  63     printf(" '%c'   ", (char)(0x00FF&c));
  64   } else if ( c > 0x007F ) {
  65     char buf[1000];
  66     UErrorCode status = U_ZERO_ERROR;
  67     int32_t o;
  68
  69     o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
  70     if(U_SUCCESS(status) && (o>0) ) {
  71       buf[6] = 0;
  72       printf("%7s", buf);
  73     } else {
  74       printf(" ??????");
  75     }
  76   } else {
  77     switch((char)(c & 0x007F)) {
  78     case ' ':
  79       printf(" ' '   ");
  80       break;
  81     case '\t':
  82       printf(" \\t    ");
  83       break;
  84     case '\n':
  85       printf(" \\n    ");
  86       break;
  87     default:
  88       printf("  _    ");
  89       break;
  90     }
  91   }
  92 }
  93
  94
  95 void printUChars(const char  *name = "?",
  96                  const UChar *uch  = kNone,
  97                  int32_t     len   = -1 )
  98 {
  99   int32_t i;
 100
 101   if( (len == -1) && (uch) ) {
 102     len = u_strlen(uch);
 103   }
 104
 105   printf("%5s: ", name);
 106   for( i = 0; i <len; i++) {
 107     printf("%-6d ", i);
 108   }
 109   printf("\n");
 110
 111   printf("%5s: ", "uni");
 112   for( i = 0; i <len; i++) {
 113     printf("\\u%04X ", (int)uch[i]);
 114   }
 115   printf("\n");
 116
 117   printf("%5s:", "ch");
 118   for( i = 0; i <len; i++) {
 119     prettyPrintUChar(uch[i]);
 120   }
 121   printf("\n");
 122 }
 123
 124 void printBytes(const char  *name = "?",
 125                  const char *uch  = "",
 126                  int32_t     len   = -1 )
 127 {
 128   int32_t i;
 129
 130   if( (len == -1) && (uch) ) {
 131     len = strlen(uch);
 132   }
 133
 134   printf("%5s: ", name);
 135   for( i = 0; i <len; i++) {
 136     printf("%-4d ", i);
 137   }
 138   printf("\n");
 139
 140   printf("%5s: ", "uni");
 141   for( i = 0; i <len; i++) {
 142     printf("\\x%02X ", 0x00FF & (int)uch[i]);
 143   }
 144   printf("\n");
 145
 146   printf("%5s:", "ch");
 147   for( i = 0; i <len; i++) {
 148     if(isgraph(0x00FF & (int)uch[i])) {
 149       printf(" '%c' ", (char)uch[i]);
 150     } else {
 151       printf("     ");
 152     }
 153   }
 154   printf("\n");
 155 }
 156
 157 void printUChar(UChar32 ch32)
 158 {
 159     if(ch32 > 0xFFFF) {
 160       printf("ch: U+%06X\n", ch32);
 161     }
 162     else {
 163       UChar ch = (UChar)ch32;
 164       printUChars("C", &ch, 1);
 165     }
 166 }
 167
 168 /*******************************************************************
 169   Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
 170   followed by an exclamation mark (!) into the KOI8-R Russian code page.
 171
 172   This example first creates a UChar String out of the Unicode chars.
 173
 174   targetSize must be set to the amount of space available in the target
 175   buffer. After fromUChars is called,
 176   len will contain the number of bytes in target[] which were
 177   used in the resulting codepage.  In this case, there is a 1:1 mapping
 178   between the input and output characters. The exclamation mark has the
 179   same value in both KOI8-R and Unicode.
 180
 181   src: 0      1      2      3      4      5      6
 182   uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
 183    ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
 184
 185  targ:  0    1    2    3    4    5    6
 186   uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
 187    ch:                                '!'
 188
 189
 190 Converting FROM unicode
 191   to koi8-r.
 192   You must call ucnv_close to clean up the memory used by the
 193   converter.
 194
 195   'len' returns the number of OUTPUT bytes resulting from the
 196   conversion.
 197  */
 198
 199 UErrorCode convsample_02()
 200 {
 201   printf("\n\n==============================================\n"
 202          "Sample 02: C: simple Unicode -> koi8-r conversion\n");
 203
 204
 205   // **************************** START SAMPLE *******************
 206   // "cat<cat>OK"
 207   UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
 208                      0x0430, 0x0021, 0x0000 };
 209   char target[100];
 210   UErrorCode status = U_ZERO_ERROR;
 211   UConverter *conv;
 212   int32_t     len;
 213
 214   // set up the converter
 215   //! [ucnv_open]
 216   conv = ucnv_open("koi8-r", &status);
 217   //! [ucnv_open]
 218   assert(U_SUCCESS(status));
 219
 220   // convert to koi8-r
 221   len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
 222   assert(U_SUCCESS(status));
 223
 224   // close the converter
 225   ucnv_close(conv);
 226
 227   // ***************************** END SAMPLE ********************
 228
 229   // Print it out
 230   printUChars("src", source);
 231   printf("\n");
 232   printBytes("targ", target, len);
 233
 234   return U_ZERO_ERROR;
 235 }
 236
 237
 238 UErrorCode convsample_03()
 239 {
 240   printf("\n\n==============================================\n"
 241          "Sample 03: C: print out all converters\n");
 242
 243   int32_t count;
 244   int32_t i;
 245
 246   // **************************** START SAMPLE *******************
 247   count = ucnv_countAvailable();
 248   printf("Available converters: %d\n", count);
 249
 250   for(i=0;i<count;i++)
 251   {
 252     printf("%s ", ucnv_getAvailableName(i));
 253   }
 254
 255   // ***************************** END SAMPLE ********************
 256
 257   printf("\n");
 258
 259   return U_ZERO_ERROR;
 260 }
 261
 262
 263
 264 #define BUFFERSIZE 17 /* make it interesting :) */
 265
 266 /*
 267   Converting from a codepage to Unicode in bulk..
 268   What is the best way to determine the buffer size?
 269
 270      The 'buffersize' is in bytes of input.
  271     For a given converter, dividing this by the minimum char size
  272     gives you the maximum number of Unicode characters that could be
 273     expected for a given number of input bytes.
 274      see: ucnv_getMinCharSize()
 275
 276      For example, a single byte codepage like 'Latin-3' has a
 277     minimum char size of 1. (It takes at least 1 byte to represent
 278     each Unicode char.) So the unicode buffer has the same number of
 279     UChars as the input buffer has bytes.
 280
 281      In a strictly double byte codepage such as cp1362 (Windows
 282     Korean), the minimum char size is 2. So, only half as many Unicode
 283     chars as bytes are needed.
 284
 285      This work to calculate the buffer size is an optimization. Any
 286     size of input and output buffer can be used, as long as the
 287     program handles the following cases: If the input buffer is empty,
 288     the source pointer will be equal to sourceLimit.  If the output
 289     buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
 290  */
 291
 292 UErrorCode convsample_05()
 293 {
 294   printf("\n\n==============================================\n"
 295          "Sample 05: C: count the number of letters in a UTF-8 document\n");
 296
 297   FILE *f;
 298   int32_t count;
 299   char inBuf[BUFFERSIZE];
 300   const char *source;
 301   const char *sourceLimit;
 302   UChar *uBuf;
 303   UChar *target;
 304   UChar *targetLimit;
 305   UChar *p;
 306   int32_t uBufSize = 0;
 307   UConverter *conv;
 308   UErrorCode status = U_ZERO_ERROR;
 309   uint32_t letters=0, total=0;
 310
 311   f = fopen("data01.txt", "r");
 312   if(!f)
 313   {
 314     fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
 315     return U_FILE_ACCESS_ERROR;
 316   }
 317
 318   // **************************** START SAMPLE *******************
 319   conv = ucnv_open("utf-8", &status);
 320   assert(U_SUCCESS(status));
 321
 322   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 323   printf("input bytes %d / min chars %d = %d UChars\n",
 324          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 325   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 326   assert(uBuf!=NULL);
 327
 328   // grab another buffer's worth
 329   while((!feof(f)) &&
 330         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 331   {
 332     // Convert bytes to unicode
 333     source = inBuf;
 334     sourceLimit = inBuf + count;
 335
 336     do
 337     {
 338         target = uBuf;
 339         targetLimit = uBuf + uBufSize;
 340
 341         ucnv_toUnicode(conv, &target, targetLimit,
 342                        &source, sourceLimit, NULL,
 343                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
 344                                    /* is true (when no more data will come) */
 345                        &status);
 346
 347         if(status == U_BUFFER_OVERFLOW_ERROR)
 348         {
 349           // simply ran out of space - we'll reset the target ptr the next
 350           // time through the loop.
 351           status = U_ZERO_ERROR;
 352         }
 353         else
 354         {
 355           //  Check other errors here.
 356           assert(U_SUCCESS(status));
 357           // Break out of the loop (by force)
 358         }
 359
 360         // Process the Unicode
 361         // Todo: handle UTF-16/surrogates
 362
 363         for(p = uBuf; p<target; p++)
 364         {
 365           if(u_isalpha(*p))
 366             letters++;
 367           total++;
 368         }
 369     } while (source < sourceLimit); // while simply out of space
 370   }
 371
 372   printf("%d letters out of %d total UChars.\n", letters, total);
 373
 374   // ***************************** END SAMPLE ********************
 375   ucnv_close(conv);
 376
 377   printf("\n");
 378
 379   fclose(f);
 380
 381   return U_ZERO_ERROR;
 382 }
 383 #undef BUFFERSIZE
 384
 385 #define BUFFERSIZE 1024
 386 typedef struct
 387 {
 388   UChar32  codepoint;
 389   uint32_t frequency;
 390 } CharFreqInfo;
 391
 392 UErrorCode convsample_06()
 393 {
 394   printf("\n\n==============================================\n"
 395          "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
 396
 397   FILE *f;
 398   int32_t count;
 399   char inBuf[BUFFERSIZE];
 400   const char *source;
 401   const char *sourceLimit;
 402   int32_t uBufSize = 0;
 403   UConverter *conv;
 404   UErrorCode status = U_ZERO_ERROR;
 405   uint32_t letters=0, total=0;
 406
 407   CharFreqInfo   *info;
 408   UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
 409   UChar32   p;
 410
 411   uint32_t ie = 0;
 412   uint32_t gh = 0;
 413   UChar32 l = 0;
 414
 415   f = fopen("data06.txt", "r");
 416   if(!f)
 417   {
 418     fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
 419     return U_FILE_ACCESS_ERROR;
 420   }
 421
 422   info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
 423   if(!info)
 424   {
 425     fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
 426   }
 427
 428   /* reset frequencies */
 429   for(p=0;p<charCount;p++)
 430   {
 431     info[p].codepoint = p;
 432     info[p].frequency = 0;
 433   }
 434
 435   // **************************** START SAMPLE *******************
 436   conv = ucnv_open("utf-8", &status);
 437   assert(U_SUCCESS(status));
 438
 439   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 440   printf("input bytes %d / min chars %d = %d UChars\n",
 441          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 442
 443   // grab another buffer's worth
 444   while((!feof(f)) &&
 445         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 446   {
 447     // Convert bytes to unicode
 448     source = inBuf;
 449     sourceLimit = inBuf + count;
 450
 451     while(source < sourceLimit)
 452     {
 453       p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
 454       if(U_FAILURE(status))
 455       {
 456         fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
 457         status = U_ZERO_ERROR;
 458         continue;
 459       }
 460       U_ASSERT(status);
 461       total++;
 462
 463       if(u_isalpha(p))
 464         letters++;
 465
 466       if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
 467         ie++;
 468
 469       if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
 470         gh++;
 471
 472       if(p>charCount)
 473       {
 474         fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
 475         free(info);
 476         fclose(f);
 477         ucnv_close(conv);
 478         return U_UNSUPPORTED_ERROR;
 479       }
 480       info[p].frequency++;
 481       l = p;
 482     }
 483   }
 484
 485   fclose(f);
 486   ucnv_close(conv);
 487
 488   printf("%d letters out of %d total UChars.\n", letters, total);
 489   printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
 490
 491   // now, we could sort it..
 492
 493   //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
 494
 495   for(p=0;p<charCount;p++)
 496   {
 497     if(info[p].frequency)
 498     {
 499       printf("% 5d U+%06X ", info[p].frequency, p);
 500       if(p <= 0xFFFF)
 501       {
 502         prettyPrintUChar((UChar)p);
 503       }
 504       printf("\n");
 505     }
 506   }
 507   free(info);
 508   // ***************************** END SAMPLE ********************
 509
 510   printf("\n");
 511
 512   return U_ZERO_ERROR;
 513 }
 514 #undef BUFFERSIZE
 515
 516
 517 /******************************************************
 518   You must call ucnv_close to clean up the memory used by the
 519   converter.
 520
 521   'len' returns the number of OUTPUT bytes resulting from the
 522   conversion.
 523  */
 524
 525 UErrorCode convsample_12()
 526 {
 527   printf("\n\n==============================================\n"
 528          "Sample 12: C: simple sjis -> unicode conversion\n");
 529
 530
 531   // **************************** START SAMPLE *******************
 532
 533   char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
 534   UChar target[100];
 535   UErrorCode status = U_ZERO_ERROR;
 536   UConverter *conv;
 537   int32_t     len;
 538
 539   // set up the converter
 540   conv = ucnv_open("shift_jis", &status);
 541   assert(U_SUCCESS(status));
 542
 543   // convert to Unicode
 544   // Note: we can use strlen, we know it's an 8 bit null terminated codepage
 545   target[6] = 0xFDCA;
 546   len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
 547   U_ASSERT(status);
 548   // close the converter
 549   ucnv_close(conv);
 550
 551   // ***************************** END SAMPLE ********************
 552
 553   // Print it out
 554   printBytes("src", source, strlen(source) );
 555   printf("\n");
 556   printUChars("targ", target, len);
 557
 558   return U_ZERO_ERROR;
 559 }
 560
 561 /******************************************************************
 562    C: Convert from codepage to Unicode one at a time.
 563 */
 564
 565 UErrorCode convsample_13()
 566 {
 567   printf("\n\n==============================================\n"
 568          "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
 569
 570
 571   const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
 572   //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
 573   const char *source, *sourceLimit;
 574   UChar32 target;
 575   UErrorCode status = U_ZERO_ERROR;
 576   UConverter *conv = NULL;
 577   int32_t srcCount=0;
 578   int32_t dstCount=0;
 579
 580   srcCount = sizeof(sourceChars);
 581
 582   conv = ucnv_open("Big5", &status);
 583   U_ASSERT(status);
 584
 585   source = sourceChars;
 586   sourceLimit = sourceChars + sizeof(sourceChars);
 587
 588   // **************************** START SAMPLE *******************
 589
 590
 591   printBytes("src",source,sourceLimit-source);
 592
 593   while(source < sourceLimit)
 594   {
 595     puts("");
 596     target = ucnv_getNextUChar (conv,
 597                                 &source,
 598                                 sourceLimit,
 599                                 &status);
 600
 601     //    printBytes("src",source,sourceLimit-source);
 602     U_ASSERT(status);
 603     printUChar(target);
 604     dstCount++;
 605   }
 606
 607
 608   // ************************** END SAMPLE *************************
 609
 610   printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
 611   ucnv_close(conv);
 612
 613   return U_ZERO_ERROR;
 614 }
 615
 616
 617
 618
 619 UBool convsample_20_didSubstitute(const char *source)
 620 {
 621   UChar uchars[100];
 622   char bytes[100];
 623   UConverter *conv = NULL;
 624   UErrorCode status = U_ZERO_ERROR;
 625   uint32_t len, len2;
 626   UBool  flagVal;
 627
 628   FromUFLAGContext * context = NULL;
 629
 630   printf("\n\n==============================================\n"
 631          "Sample 20: C: Test for substitution using callbacks\n");
 632
 633   /* print out the original source */
 634   printBytes("src", source);
 635   printf("\n");
 636
 637   /* First, convert from UTF8 to unicode */
 638   conv = ucnv_open("utf-8", &status);
 639   U_ASSERT(status);
 640
 641   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
 642   U_ASSERT(status);
 643
 644   printUChars("uch", uchars, len);
 645   printf("\n");
 646
 647   /* Now, close the converter */
 648   ucnv_close(conv);
 649
 650   /* Now, convert to windows-1252 */
 651   conv = ucnv_open("windows-1252", &status);
 652   U_ASSERT(status);
 653
 654   /* Converter starts out with the SUBSTITUTE callback set. */
 655
 656   /* initialize our callback */
 657   context = flagCB_fromU_openContext();
 658
 659   /* Set our special callback */
 660   ucnv_setFromUCallBack(conv,
 661                         flagCB_fromU,
 662                         context,
 663                         &(context->subCallback),
 664                         &(context->subContext),
 665                         &status);
 666
 667   U_ASSERT(status);
 668
 669   len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
 670   U_ASSERT(status);
 671
 672   flagVal = context->flag;  /* it's about to go away when we close the cnv */
 673
 674   ucnv_close(conv);
 675
 676   /* print out the original source */
 677   printBytes("bytes", bytes, len2);
 678
 679   return flagVal; /* true if callback was called */
 680 }
 681
 682 UErrorCode convsample_20()
 683 {
 684   const char *sample1 = "abc\xdf\xbf";
 685   const char *sample2 = "abc_def";
 686
 687
 688   if(convsample_20_didSubstitute(sample1))
 689   {
 690     printf("DID substitute.\n******\n");
 691   }
 692   else
 693   {
 694     printf("Did NOT substitute.\n*****\n");
 695   }
 696
 697   if(convsample_20_didSubstitute(sample2))
 698   {
 699     printf("DID substitute.\n******\n");
 700   }
 701   else
 702   {
 703     printf("Did NOT substitute.\n*****\n");
 704   }
 705
 706   return U_ZERO_ERROR;
 707 }
 708
 709 // 21  - C, callback, with clone and debug
 710
 711
 712
 713 UBool convsample_21_didSubstitute(const char *source)
 714 {
 715   UChar uchars[100];
 716   char bytes[100];
 717   UConverter *conv = NULL, *cloneCnv = NULL;
 718   UErrorCode status = U_ZERO_ERROR;
 719   uint32_t len, len2;
 720   int32_t  cloneLen;
 721   UBool  flagVal = FALSE;
 722   UConverterFromUCallback junkCB;
 723
 724   FromUFLAGContext *flagCtx = NULL,
 725                    *cloneFlagCtx = NULL;
 726
 727   debugCBContext   *debugCtx1 = NULL,
 728                    *debugCtx2 = NULL,
 729                    *cloneDebugCtx = NULL;
 730
 731   printf("\n\n==============================================\n"
 732          "Sample 21: C: Test for substitution w/ callbacks & clones \n");
 733
 734   /* print out the original source */
 735   printBytes("src", source);
 736   printf("\n");
 737
 738   /* First, convert from UTF8 to unicode */
 739   conv = ucnv_open("utf-8", &status);
 740   U_ASSERT(status);
 741
 742   len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
 743   U_ASSERT(status);
 744
 745   printUChars("uch", uchars, len);
 746   printf("\n");
 747
 748   /* Now, close the converter */
 749   ucnv_close(conv);
 750
 751   /* Now, convert to windows-1252 */
 752   conv = ucnv_open("windows-1252", &status);
 753   U_ASSERT(status);
 754
 755   /* Converter starts out with the SUBSTITUTE callback set. */
 756
 757   /* initialize our callback */
 758   /* from the 'bottom' innermost, out
 759    *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */
 760
 761 #if DEBUG_TMI
 762   printf("flagCB_fromU = %p\n", &flagCB_fromU);
 763   printf("debugCB_fromU = %p\n", &debugCB_fromU);
 764 #endif
 765
 766   debugCtx1 = debugCB_openContext();
 767    flagCtx  = flagCB_fromU_openContext();
 768   debugCtx2 = debugCB_openContext();
 769
 770   debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
 771   debugCtx1->subContext  =  flagCtx;
 772
 773   flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
 774   flagCtx->subContext    =  debugCtx2;
 775
 776   debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
 777   debugCtx2->subContext  = NULL;
 778
 779   /* Set our special callback */
 780
 781   ucnv_setFromUCallBack(conv,
 782                         debugCB_fromU,
 783                         debugCtx1,
 784                         &(debugCtx2->subCallback),
 785                         &(debugCtx2->subContext),
 786                         &status);
 787
 788   U_ASSERT(status);
 789
 790 #if DEBUG_TMI
 791   printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
 792          conv, debugCtx1, debugCtx1->subCallback,
 793          debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
 794 #endif
 795
 796   cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);
 797
 798   U_ASSERT(status);
 799
 800 #if DEBUG_TMI
 801   printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
 802 #endif
 803
 804   ucnv_close(conv);
 805
 806 #if DEBUG_TMI
 807   printf("%p closed.\n", conv);
 808 #endif
 809
 810   U_ASSERT(status);
 811   /* Now, we have to extract the context */
 812   cloneDebugCtx = NULL;
 813   cloneFlagCtx  = NULL;
 814
 815   ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
 816   if(cloneDebugCtx != NULL) {
 817       cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
 818   }
 819
 820   printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
 821          cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
 822
 823   len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
 824   U_ASSERT(status);
 825
 826   if(cloneFlagCtx != NULL) {
 827       flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
 828   } else {
 829       printf("** Warning, couldn't get the subcallback \n");
 830   }
 831
 832   ucnv_close(cloneCnv);
 833
 834   /* print out the original source */
 835   printBytes("bytes", bytes, len2);
 836
 837   return flagVal; /* true if callback was called */
 838 }
 839
 840 UErrorCode convsample_21()
 841 {
 842   const char *sample1 = "abc\xdf\xbf";
 843   const char *sample2 = "abc_def";
 844
 845   if(convsample_21_didSubstitute(sample1))
 846   {
 847     printf("DID substitute.\n******\n");
 848   }
 849   else
 850   {
 851     printf("Did NOT substitute.\n*****\n");
 852   }
 853
 854   if(convsample_21_didSubstitute(sample2))
 855   {
 856     printf("DID substitute.\n******\n");
 857   }
 858   else
 859   {
 860     printf("Did NOT substitute.\n*****\n");
 861   }
 862
 863   return U_ZERO_ERROR;
 864 }
 865
 866
 867 //  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
 868
 869 #define BUFFERSIZE 17 /* make it interesting :) */
 870
 871 UErrorCode convsample_40()
 872 {
 873   printf("\n\n==============================================\n"
 874     "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
 875
 876   FILE *f;
 877   FILE *out;
 878   int32_t count;
 879   char inBuf[BUFFERSIZE];
 880   const char *source;
 881   const char *sourceLimit;
 882   UChar *uBuf;
 883   UChar *target;
 884   UChar *targetLimit;
 885   int32_t uBufSize = 0;
 886   UConverter *conv = NULL;
 887   UErrorCode status = U_ZERO_ERROR;
 888   uint32_t inbytes=0, total=0;
 889
 890   f = fopen("data02.bin", "rb");
 891   if(!f)
 892   {
 893     fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
 894     return U_FILE_ACCESS_ERROR;
 895   }
 896
 897   out = fopen("data40.utf16", "wb");
 898   if(!out)
 899   {
 900     fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
 901     fclose(f);
 902     return U_FILE_ACCESS_ERROR;
 903   }
 904
 905   // **************************** START SAMPLE *******************
 906   conv = ucnv_openCCSID(37, UCNV_IBM, &status);
 907   assert(U_SUCCESS(status));
 908
 909   uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
 910   printf("input bytes %d / min chars %d = %d UChars\n",
 911          BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
 912   uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
 913   assert(uBuf!=NULL);
 914
 915   // grab another buffer's worth
 916   while((!feof(f)) &&
 917         ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
 918   {
 919     inbytes += count;
 920
 921     // Convert bytes to unicode
 922     source = inBuf;
 923     sourceLimit = inBuf + count;
 924
 925     do
 926     {
 927         target = uBuf;
 928         targetLimit = uBuf + uBufSize;
 929
 930         ucnv_toUnicode( conv, &target, targetLimit,
 931                        &source, sourceLimit, NULL,
 932                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
 933                                    /* is true (when no more data will come) */
 934                          &status);
 935
 936         if(status == U_BUFFER_OVERFLOW_ERROR)
 937         {
 938           // simply ran out of space - we'll reset the target ptr the next
 939           // time through the loop.
 940           status = U_ZERO_ERROR;
 941         }
 942         else
 943         {
 944           //  Check other errors here.
 945           assert(U_SUCCESS(status));
 946           // Break out of the loop (by force)
 947         }
 948
 949         // Process the Unicode
 950         // Todo: handle UTF-16/surrogates
 951         assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
 952                (size_t)(target-uBuf));
 953         total += (target-uBuf);
 954     } while (source < sourceLimit); // while simply out of space
 955   }
 956
 957   printf("%d bytes in,  %d UChars out.\n", inbytes, total);
 958
 959   // ***************************** END SAMPLE ********************
 960   ucnv_close(conv);
 961
 962   fclose(f);
 963   fclose(out);
 964   printf("\n");
 965
 966   return U_ZERO_ERROR;
 967 }
 968 #undef BUFFERSIZE
 969
 970
 971
 972 //  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
 973
 974 #define BUFFERSIZE 24 /* make it interesting :) */
 975
 976 UErrorCode convsample_46()
 977 {
 978   printf("\n\n==============================================\n"
 979     "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
 980
 981   FILE *f;
 982   FILE *out;
 983   int32_t count;
 984   UChar inBuf[BUFFERSIZE];
 985   const UChar *source;
 986   const UChar *sourceLimit;
 987   char *buf;
 988   char *target;
 989   char *targetLimit;
 990
 991   int32_t bufSize = 0;
 992   UConverter *conv = NULL;
 993   UErrorCode status = U_ZERO_ERROR;
 994   uint32_t inchars=0, total=0;
 995
 996   f = fopen("data40.utf16", "rb");
 997   if(!f)
 998   {
 999     fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
1000     return U_FILE_ACCESS_ERROR;
1001   }
1002
1003   out = fopen("data46.out", "wb");
1004   if(!out)
1005   {
1006     fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1007     fclose(f);
1008     return U_FILE_ACCESS_ERROR;
1009   }
1010
1011   // **************************** START SAMPLE *******************
1012   conv = ucnv_open( "iso-8859-2", &status);
1013   assert(U_SUCCESS(status));
1014
1015   bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1016   printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1017          BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1018   buf = (char*)malloc(bufSize * sizeof(char));
1019   assert(buf!=NULL);
1020
1021   // grab another buffer's worth
1022   while((!feof(f)) &&
1023         ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1024   {
1025     inchars += count;
1026
1027     // Convert bytes to unicode
1028     source = inBuf;
1029     sourceLimit = inBuf + count;
1030
1031     do
1032     {
1033         target = buf;
1034         targetLimit = buf + bufSize;
1035
1036         ucnv_fromUnicode( conv, &target, targetLimit,
1037                        &source, sourceLimit, NULL,
1038                        feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
1039                                    /* is true (when no more data will come) */
1040                          &status);
1041
1042         if(status == U_BUFFER_OVERFLOW_ERROR)
1043         {
1044           // simply ran out of space - we'll reset the target ptr the next
1045           // time through the loop.
1046           status = U_ZERO_ERROR;
1047         }
1048         else
1049         {
1050           //  Check other errors here.
1051           assert(U_SUCCESS(status));
1052           // Break out of the loop (by force)
1053         }
1054
1055         // Process the Unicode
1056         assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1057                (size_t)(target-buf));
1058         total += (target-buf);
1059     } while (source < sourceLimit); // while simply out of space
1060   }
1061
1062   printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1063
1064   // ***************************** END SAMPLE ********************
1065   ucnv_close(conv);
1066
1067   fclose(f);
1068   fclose(out);
1069   printf("\n");
1070
1071   return U_ZERO_ERROR;
1072 }
1073 #undef BUFFERSIZE
1074
1075 #define BUFFERSIZE 219
1076
1077 void convsample_50() {
1078   printf("\n\n==============================================\n"
1079          "Sample 50: C: ucnv_detectUnicodeSignature\n");
1080
1081   //! [ucnv_detectUnicodeSignature]
1082   UErrorCode err = U_ZERO_ERROR;
1083   UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
1084   char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1085   int32_t signatureLength = 0;
1086   const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
1087   UConverter *conv = NULL;
1088   UChar output[100];
1089   UChar *target = output, *out;
1090   const char *source = input;
1091   if(encoding!=NULL && U_SUCCESS(err)){
1092     // should signature be discarded ?
1093     conv = ucnv_open(encoding, &err);
1094     // do the conversion
1095     ucnv_toUnicode(conv,
1096                    &target, output + UPRV_LENGTHOF(output),
1097                    &source, input + sizeof(input),
1098                    NULL, TRUE, &err);
1099     out = output;
1100     if (discardSignature){
1101       ++out; // ignore initial U+FEFF
1102     }
1103     while(out != target) {
1104       printf("%04x ", *out++);
1105     }
1106     puts("");
1107   }
1108   //! [ucnv_detectUnicodeSignature]
1109   puts("");
1110 }
1111
1112
1113
1114 /* main */
1115
1116 int main()
1117 {
1118
1119   printf("Default Converter=%s\n", ucnv_getDefaultName() );
1120
1121   convsample_02();  // C  , u->koi8r, conv
1122   convsample_03();  // C,   iterate
1123
1124   convsample_05();  // C,  utf8->u, getNextUChar
1125   convsample_06(); // C freq counter thingy
1126
1127   convsample_12();  // C,  sjis->u, conv
1128   convsample_13();  // C,  big5->u, getNextU
1129
1130   convsample_20();  // C, callback
1131   convsample_21();  // C, callback debug
1132
1133   convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
1134
1135   convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
1136
1137   convsample_50();  // C, detect unicode signature
1138
1139   printf("End of converter samples.\n");
1140
1141   fflush(stdout);
1142   fflush(stderr);
1143
1144   return 0;
1145 }