From 15a2315cb457be0599d7a662e64aa54e560f96f0 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 29 Dec 1999 07:32:44 +0000 Subject: [PATCH] Update. * iconvdata/Makefile (modules): Add ISO-2022-CN. Add link rules for this module. (distribute): Add iso-2022-cn.c and cns11643l2.h. * iconvdata/cns11643l2.h: New file. * iconvdata/iso-2022-cn.c: New file. * iconvdata/gconv-modules: Add entries for ISO-2022-CN module. * iconvdata/cns11643l1.h (cns11643l1_to_ucs4): Make string argument unsigned. (ucs4_to_cns11643l1): Likewise. * iconvdata/euc-tw.c: Correct parameter passed to cns11643l1_to_ucs4. * iconvdata/iso-2022-kr.c: Remove unnecessary test from conversion loop to UCS4. Optimize recognition of escape sequences a bit. --- ChangeLog | 17 ++- iconvdata/Makefile | 7 +- iconvdata/cns11643l1.h | 5 +- iconvdata/cns11643l2.h | 85 +++++++++++ iconvdata/euc-tw.c | 2 +- iconvdata/gconv-modules | 5 + iconvdata/iso-2022-cn.c | 398 ++++++++++++++++++++++++++++++++++++++++++++++++ iconvdata/iso-2022-kr.c | 5 - 8 files changed, 512 insertions(+), 12 deletions(-) create mode 100644 iconvdata/cns11643l2.h create mode 100644 iconvdata/iso-2022-cn.c diff --git a/ChangeLog b/ChangeLog index b9d8ba6..d04d463 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,20 @@ 1999-12-28 Ulrich Drepper - * iconvdata/iso-2022-kr.c: Optimize recognition of escape - sequences a bit. + * iconvdata/Makefile (modules): Add ISO-2022-CN. + Add link rules for this module. + (distribute): Add iso-2022-cn.c and cns11643l2.h. + * iconvdata/cns11643l2.h: New file. + * iconvdata/iso-2022-cn.c: New file. + * iconvdata/gconv-modules: Add entries for ISO-2022-CN module. + + * iconvdata/cns11643l1.h (cns11643l1_to_ucs4): Make string argument + unsigned. + (ucs4_to_cns11643l1): Likewise. + * iconvdata/euc-tw.c: Correct parameter passed to cns11643l1_to_ucs4. + + * iconvdata/iso-2022-kr.c: Remove unnecessary test from conversion + loop to UCS4. + Optimize recognition of escape sequences a bit. 
* sysdeps/unix/sysv/linux/alpha/syscalls.list: Remove oldgetrlimit and oldsetrlimit definitions. diff --git a/iconvdata/Makefile b/iconvdata/Makefile index 8380f10..73bb096 100644 --- a/iconvdata/Makefile +++ b/iconvdata/Makefile @@ -44,7 +44,8 @@ modules := ISO8859-1 ISO8859-2 ISO8859-3 ISO8859-4 ISO8859-5 \ GOST_19768-74 GREEK-CCITT GREEK7 GREEK7-OLD INIS INIS-8 \ INIS-CYRILLIC ISO_6937-2 ISO_2033 ISO_5427 ISO_5427-EXT \ ISO_5428 ISO_10367-BOX MAC-IS MAC-UK NATS-DANO NATS-SEFI \ - SAMI-WS2 ISO-IR-197 TIS-620 KOI8-U GBK ISIRI-3342 GBGBK + SAMI-WS2 ISO-IR-197 TIS-620 KOI8-U GBK ISIRI-3342 GBGBK \ + ISO-2022-CN modules.so := $(addsuffix .so, $(modules)) @@ -73,6 +74,8 @@ $(objpfx)ISO-2022-JP.so: $(objpfx)libJIS.so $(objpfx)libGB.so \ $(objpfx)libCNS.so $(objpfx)libKSC.so LDFLAGS-ISO-2022-KR.so = -Wl,-rpath,'$$ORIGIN' $(objpfx)ISO-2022-KR.so: $(objpfx)libKSC.so +LDFLAGS-ISO-2022-CN.so = -Wl,-rpath,'$$ORIGIN' +$(objpfx)ISO-2022-CN.so: $(objpfx)libGB.so $(objpfx)libCNS.so LDFLAGS-libJIS.so = -Wl,-soname,$(@F) LDFLAGS-libKSC.so = -Wl,-soname,$(@F) @@ -112,7 +115,7 @@ distribute := gconv-modules extra-module.mk gap.awk gaptab.awk \ koi8-r.c ksc5601.c ksc5601.h latin-greek.c latin-greek-1.c \ macintosh.c mac-is.c mac-uk.c nats-dano.c nats-sefi.c sjis.c \ t.61.c uhc.c sami-ws2.c iso-ir-197.c tis-620.c koi8-u.c \ - isiri-3342.c gbgbk.c + isiri-3342.c gbgbk.c iso-2022-cn.c cns11643l2.h # We build the transformation modules only when we build shared libs. 
ifeq (yes,$(build-shared)) diff --git a/iconvdata/cns11643l1.h b/iconvdata/cns11643l1.h index 3e0d042..1aa5918 100644 --- a/iconvdata/cns11643l1.h +++ b/iconvdata/cns11643l1.h @@ -26,7 +26,8 @@ extern const uint16_t __cns11643l1_to_ucs4_tab[]; static inline uint32_t -cns11643l1_to_ucs4 (const char **s, size_t avail, unsigned char offset) +cns11643l1_to_ucs4 (const unsigned char **s, size_t avail, + unsigned char offset) { unsigned char ch = *(*s); unsigned char ch2; @@ -70,7 +71,7 @@ extern const char __cns11643l1_from_ucs4_tab14[][2]; static inline size_t -ucs4_to_cns11643l1 (uint32_t wch, char *s, size_t avail) +ucs4_to_cns11643l1 (uint32_t wch, unsigned char *s, size_t avail) { unsigned int ch = (unsigned int) wch; char buf[2]; diff --git a/iconvdata/cns11643l2.h b/iconvdata/cns11643l2.h new file mode 100644 index 0000000..0fea388 --- /dev/null +++ b/iconvdata/cns11643l2.h @@ -0,0 +1,85 @@ +/* Access functions for CNS 11643, plane 2 handling. + Copyright (C) 1998, 1999 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper , 1998. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +#include +#include + +/* Table for CNS 11643, plane 2 to UCS4 conversion. 
*/ +extern const uint16_t __cns11643l2_to_ucs4_tab[]; + + +static inline uint32_t +cns11643l2_to_ucs4 (const unsigned char **s, size_t avail, + unsigned char offset) +{ + unsigned char ch = *(*s); + unsigned char ch2; + int idx; + + if (ch < offset || (ch - offset) <= 0x20 || (ch - offset) > 0x7d) + return __UNKNOWN_10646_CHAR; + + if (avail < 2) + return 0; + + ch2 = (*s)[1]; + if ((ch2 - offset) <= 0x20 || (ch2 - offset) >= 0x7f) + return __UNKNOWN_10646_CHAR; + + idx = (ch - 0x21 - offset) * 94 + (ch2 - 0x21 - offset); + if (idx > 0x1de1) + return __UNKNOWN_10646_CHAR; + + (*s) += 2; + + return __cns11643l2_to_ucs4_tab[idx] ?: ((*s) -= 2, __UNKNOWN_10646_CHAR); +} + + +/* The table which contains the CNS 11643 level 2 mappings. */ +extern const char __cns11643_from_ucs4_tab[][3]; + + +static inline size_t +ucs4_to_cns11643l2 (uint32_t wch, unsigned char *s, size_t avail) +{ + unsigned int ch = (unsigned int) wch; + const char *cp = NULL; + + if (ch >= 0x4e07 && ch <= 0x9fa4) + { + cp = __cns11643_from_ucs4_tab[ch - 0x4e00]; + if (cp[0] == '\2') + ++cp; + else + cp = NULL; + } + + if (cp == NULL) + return __UNKNOWN_10646_CHAR; + + if (avail < 2) + return 0; + + s[0] = cp[0]; + s[1] = cp[1]; + + return 2; +} diff --git a/iconvdata/euc-tw.c b/iconvdata/euc-tw.c index 4772908..036baae 100644 --- a/iconvdata/euc-tw.c +++ b/iconvdata/euc-tw.c @@ -97,7 +97,7 @@ else \ { \ /* This is code set 1: CNS 11643, plane 1. */ \ - const char *endp = inptr; \ + const unsigned char *endp = inptr; \ \ ch = cns11643l1_to_ucs4 (&endp, \ NEED_LENGTH_TEST ? 
inend - inptr : 2, \ diff --git a/iconvdata/gconv-modules b/iconvdata/gconv-modules index b57a488..c47ca0c 100644 --- a/iconvdata/gconv-modules +++ b/iconvdata/gconv-modules @@ -977,6 +977,11 @@ module ISO-2022-KR// INTERNAL ISO-2022-KR 1 module INTERNAL ISO-2022-KR// ISO-2022-KR 1 # from to module cost +alias CSISO2022CN// ISO-2022-CN// +module ISO-2022-CN// INTERNAL ISO-2022-CN 1 +module INTERNAL ISO-2022-CN// ISO-2022-CN 1 + +# from to module cost alias MAC// MACINTOSH// alias CSMACINTOSH// MACINTOSH// module MACINTOSH// INTERNAL MACINTOSH 1 diff --git a/iconvdata/iso-2022-cn.c b/iconvdata/iso-2022-cn.c new file mode 100644 index 0000000..563d173 --- /dev/null +++ b/iconvdata/iso-2022-cn.c @@ -0,0 +1,398 @@ +/* Conversion module for ISO-2022-CN. + Copyright (C) 1999 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper , 1999. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +#include +#include +#include +#include "gb2312.h" +#include "cns11643l1.h" +#include "cns11643l2.h" + +#include + +/* This makes obvious what everybody knows: 0x1b is the Esc character. 
*/ +#define ESC 0x1b + +/* We have single-byte shift-in and shift-out sequences, and the single + shift sequence SS2 which replaces the SS2 designation for the next + two bytes. */ +#define SI 0x0f +#define SO 0x0e +#define SS2_0 ESC +#define SS2_1 0x4e + +/* Definitions used in the body of the `gconv' function. */ +#define CHARSET_NAME "ISO-2022-CN//" +#define DEFINE_INIT 1 +#define DEFINE_FINI 1 +#define FROM_LOOP from_iso2022cn_loop +#define TO_LOOP to_iso2022cn_loop +#define MIN_NEEDED_FROM 1 +#define MAX_NEEDED_FROM 4 +#define MIN_NEEDED_TO 4 +#define MAX_NEEDED_TO 4 +#define PREPARE_LOOP \ + int save_set; \ + int *setp = &data->__statep->count; +#define EXTRA_LOOP_ARGS , setp + + +/* The COUNT element of the state keeps track of the currently selected + character set. The possible values are: */ +enum +{ + ASCII_set = 0, + GB2312_set, + CNS11643_1_set, + CNS11643_2_set, + CURRENT_MASK = 3, + GB2312_ann = 4, + CNS11643_1_ann = 8, + CNS11643_2_ann = 16 +}; + + +/* Since this is a stateful encoding we have to provide code which resets + the output state to the initial state. This has to be done during the + flushing. */ +#define EMIT_SHIFT_TO_INIT \ + if (data->__statep->count != ASCII_set) \ + { \ + if (FROM_DIRECTION) \ + /* It's easy, we don't have to emit anything, we just reset the \ + state for the input. */ \ + data->__statep->count = ASCII_set; \ + else \ + { \ + unsigned char *outbuf = data->__outbuf; \ + \ + /* We are not in the initial state. To switch back we have \ + to emit `SI'. */ \ + if (outbuf == data->__outbufend) \ + /* We don't have enough room in the output buffer. */ \ + status = __GCONV_FULL_OUTPUT; \ + else \ + { \ + /* Write out the shift sequence. */ \ + *outbuf++ = SI; \ + if (data->__is_last) \ + *written += 1; \ + data->__outbuf = outbuf; \ + data->__statep->count = ASCII_set; \ + } \ + } \ + } + + +/* Since we might have to reset input pointer we must be able to save + and restore the state.
*/ +#define SAVE_RESET_STATE(Save) \ + if (Save) \ + save_set = *setp; \ + else \ + *setp = save_set + + +/* First define the conversion function from ISO-2022-CN to UCS4. */ +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +#define BODY \ + { \ + uint32_t ch = *inptr; \ + \ + /* This is a 7bit character set, disallow all 8bit characters. */ \ + if (ch > 0x7f) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + \ + /* Recognize escape sequences. */ \ + if (ch == ESC) \ + { \ + /* There are two kinds of escape sequences we have to handle: \ + - those announcing the use of GB and CNS characters on the \ + line; we can simply ignore them \ + - the initial byte of the SS2 sequence. \ + */ \ + if (NEED_LENGTH_TEST \ + && (inptr + 1 > inend \ + || (inptr[1] == '$' \ + && (inptr + 2 > inend \ + || (inptr[2] == ')' && inptr + 3 > inend) \ + || (inptr[2] == '*' && inptr + 3 > inend))) \ + || (inptr[1] == SS2_1 && inptr + 3 > inend))) \ + { \ + result = __GCONV_EMPTY_INPUT; \ + break; \ + } \ + if (inptr[1] == '$' \ + && ((inptr[2] == ')' && (inptr[3] == 'A' || inptr[3] == 'G')) \ + || (inptr[2] == '*' && inptr[3] == 'H'))) \ + { \ + /* OK, we accept those character sets. */ \ + if (inptr[3] == 'A') \ + ann = GB2312_ann; \ + else if (inptr[3] == 'G') \ + ann = CNS11643_1_ann; \ + inptr += 4; \ + continue; \ + } \ + } \ + else if (ch == SO) \ + { \ + /* Switch to use GB2312 or CNS 11643 plane 1, depending on which \ + SO designation came last. The only problem is what to do with \ + faulty input files where no designator came. \ + XXX For now I'll default to use GB2312. If this is not the \ + best behaviour (e.g., we should flag an error) let me know. */ \ + ++inptr; \ + set = ann == CNS11643_1_ann ? CNS11643_1_set : GB2312_set; \ + continue; \ + } \ + else if (ch == SI) \ + { \ + /* Switch to use ASCII.
*/ \ + ++inptr; \ + set = ASCII_set; \ + continue; \ + } \ + \ + if (ch == ESC && inptr[1] == SS2_1) \ + { \ + /* This is a character from CNS 11643 plane 2. \ + XXX We could test here whether the use of this character \ + set was announced. */ \ + inptr += 2; \ + ch = cns11643l2_to_ucs4 (&inptr, 2, 0); \ + if (ch == __UNKNOWN_10646_CHAR) \ + { \ + inptr -= 2; \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } \ + else if (set == ASCII_set) \ + { \ + /* Almost done, just advance the input pointer. */ \ + ++inptr; \ + } \ + else \ + { \ + /* That's pretty easy, we have dedicated functions for this. */ \ + if (set == GB2312_set) \ + ch = gb2312_to_ucs4 (&inptr, \ + NEED_LENGTH_TEST ? inend - inptr : 2, 0); \ + else \ + { \ + assert (set == CNS11643_1_set); \ + ch = cns11643l1_to_ucs4 (&inptr, \ + NEED_LENGTH_TEST ? inend - inptr : 2, 0);\ + } \ + \ + if (NEED_LENGTH_TEST && ch == 0) \ + { \ + result = __GCONV_EMPTY_INPUT; \ + break; \ + } \ + else if (ch == __UNKNOWN_10646_CHAR) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } \ + \ + *((uint32_t *) outptr)++ = ch; \ + } +#define EXTRA_LOOP_DECLS , int *setp +#define INIT_PARAMS int set = *setp & CURRENT_MASK; \ + int ann = *setp & ~CURRENT_MASK +#define UPDATE_PARAMS *setp = set | ann +#include <iconv/loop.c> + + +/* Next, define the other direction. */ +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM +#define LOOPFCT TO_LOOP +#define BODY \ + { \ + uint32_t ch; \ + size_t written = 0; \ + \ + ch = *((uint32_t *) inptr); \ + \ + /* First see whether we can write the character using the currently \ + selected character set.
*/ \ + if (ch < 0x80) \ + { \ + if (set != ASCII_set) \ + { \ + *outptr++ = SI; \ + set = ASCII_set; \ + if (NEED_LENGTH_TEST && outptr == outend) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + } \ + \ + *outptr++ = ch; \ + written = 1; \ + \ + /* At the end of the line we have to clear the `ann' flags since \ + every line must contain this information again. */ \ + if (ch == L'\n') \ + ann = 0; \ + } \ + else \ + { \ + char buf[2]; \ + int used; \ + \ + if (set == GB2312_set || (ann & CNS11643_1_ann) == 0) \ + { \ + written = ucs4_to_gb2312 (ch, buf, 2); \ + used = GB2312_set; \ + } \ + else \ + { \ + written = ucs4_to_cns11643l1 (ch, buf, 2); \ + used = CNS11643_1_set; \ + } \ + \ + if (written == __UNKNOWN_10646_CHAR) \ + { \ + /* Cannot convert it using the currently selected SO set. \ + Next try the SS2 set. */ \ + written = ucs4_to_cns11643l2 (ch, buf, 2); \ + if (written != __UNKNOWN_10646_CHAR) \ + /* Yep, that worked. */ \ + used = CNS11643_2_set; \ + else \ + { \ + /* Well, see whether we have to change the SO set. */ \ + if (set == GB2312_set) \ + written = ucs4_to_cns11643l1 (ch, buf, 2); \ + else \ + written = ucs4_to_gb2312 (ch, buf, 2); \ + \ + if (written != __UNKNOWN_10646_CHAR) \ + /* Oh well, then switch SO. */ \ + used = GB2312_set + CNS11643_1_set - set; \ + else \ + { \ + /* Even this does not work. Error. */ \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } \ + } \ + assert (written == 2); \ + \ + /* See whether we have to emit an escape sequence. */ \ + if (set != used) \ + { \ + /* First see whether we announced that we use this \ + character set. 
*/ \ + if ((ann & (2 << used)) == 0) \ + { \ + const char *escseq; \ + \ + if (NEED_LENGTH_TEST && outptr + 4 > outend) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + assert (used >= 1 && used <= 3); \ + escseq = "\e$)A\e$)G\e$*H" + (used - 1) * 4; \ + *outptr++ = *escseq++; \ + *outptr++ = *escseq++; \ + *outptr++ = *escseq++; \ + *outptr++ = *escseq++; \ + \ + if (used == GB2312_set) \ + ann = (ann & CNS11643_2_ann) | GB2312_ann; \ + else if (used == CNS11643_1_set) \ + ann = (ann & CNS11643_2_ann) | CNS11643_1_ann; \ + else \ + ann |= CNS11643_2_ann; \ + } \ + \ + if (used == CNS11643_2_set) \ + { \ + if (outptr + 2 > outend) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + *outptr++ = SS2_0; \ + *outptr++ = SS2_1; \ + } \ + else \ + { \ + /* We only have to emit something if ASCII is currently \ + selected. Otherwise we are switching within the \ + SO charset. */ \ + if (set == ASCII_set) \ + { \ + if (outptr + 1 > outend) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + *outptr++ = SO; \ + } \ + } \ + \ + /* Always test the length here since we have used up all the \ + guaranteed output buffer slots. */ \ + if (outptr + 2 > outend) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + } \ + else if (NEED_LENGTH_TEST && outptr + 2 > outend) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + *outptr++ = buf[0]; \ + *outptr++ = buf[1]; \ + } \ + \ + /* Now that we wrote the output increment the input pointer. */ \ + inptr += 4; \ + } +#define EXTRA_LOOP_DECLS , int *setp +#define INIT_PARAMS int set = *setp & CURRENT_MASK; \ + int ann = *setp & ~CURRENT_MASK +#define UPDATE_PARAMS *setp = set | ann +#include <iconv/loop.c> + + +/* Now define the toplevel functions.
*/ +#include <iconv/skeleton.c> diff --git a/iconvdata/iso-2022-kr.c b/iconvdata/iso-2022-kr.c index 2961490..8ed5d21 100644 --- a/iconvdata/iso-2022-kr.c +++ b/iconvdata/iso-2022-kr.c @@ -166,11 +166,6 @@ enum \ if (set == ASCII_set) \ { \ - if (ch >= 0x80) \ - { \ - result = __GCONV_ILLEGAL_INPUT; \ - break; \ - } \ /* Almost done, just advance the input pointer. */ \ ++inptr; \ } \ -- 2.7.4