From 8676e7d322bcb9ce4010d380a2367dc55cf6a6de Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Thu, 24 Apr 2003 19:06:29 +0000 Subject: [PATCH] Upgrade to Encode 1.93. p4raw-id: //depot/perl@19325 --- ext/Encode/Changes | 34 ++++- ext/Encode/Encode.pm | 4 +- ext/Encode/Makefile.PL | 17 +-- ext/Encode/lib/Encode/CJKConstants.pm | 7 +- ext/Encode/lib/Encode/CN/HZ.pm | 238 +++++++++++++++++++++++----------- ext/Encode/lib/Encode/Encoding.pm | 9 +- ext/Encode/lib/Encode/Guess.pm | 81 +++++++----- ext/Encode/lib/Encode/KR/2022_KR.pm | 4 +- ext/Encode/lib/Encode/PerlIO.pod | 36 +++-- ext/Encode/t/CJKT.t | 24 ++-- ext/Encode/t/enc_eucjp.t | 3 +- ext/Encode/t/enc_module.t | 8 +- ext/Encode/t/guess.t | 15 ++- ext/Encode/ucm/euc-cn.ucm | 34 +---- ext/Encode/ucm/euc-jp.ucm | 32 +---- ext/Encode/ucm/euc-kr.ucm | 34 +---- 16 files changed, 324 insertions(+), 256 deletions(-) diff --git a/ext/Encode/Changes b/ext/Encode/Changes index 730ea01..8d7a054 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,8 +1,38 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 1.92 2003/03/31 03:27:27 dankogai Exp $ +# $Id: Changes,v 1.93 2003/04/24 17:43:16 dankogai Exp $ # -$Revision: 1.92 $ $Date: 2003/03/31 03:27:27 $ +$Revision: 1.93 $ $Date: 2003/04/24 17:43:16 $ +! t/enc_eucjp.t + added "no warnings 'pack'" in for loop to keep bleedperl from + complaining "Character in 'C' format wrapped in pack". +! Makefile.PL + More elegant perl core detection inspired by Ilya Zakharevich + (but further elaborated for general cases). +! lib/Encode/Encoding.pm lib/Encode/PerlIO.pod + POD fixes. +! t/euc-jp.ucm + like cp9??, \x80-\x9F (control + 0x80) are zapped so they + are less likely to be confused w/ ISO-8859-* +! t/CJKT.t + RT tests added (vendor encodings are exemplified) + -- that successfully found a flaw on iso-2022-kr before the patch. +! lib/Encode/CJKConstants.pm lib/Encode/KR/2022_KR.pm + decode("ISO-2022-KR") has been buggy but no one ever sited + that since no one seems to be using it. Bugs discovered by + SADAHIRO-san + Message-Id: <20030416231757.A545.BQW10602@nifty.com> +! lib/Encode/CN/HZ.pm t/perlio.t + HZ is now perlio_ok, thanks to SADAHIRO-san. perlio.t modified + so it adds test for HZ. + Message-Id: <20030416231757.A545.BQW10602@nifty.com> +! lib/Encode/Guess.pm + Now guesses UTF-(16|32)(BE|LE) when the string contains \x00. + So long as the string contains \x{00}-\x{ff} it does not fail. + See perldoc for details. + Message-Id: + +1.92 2003/03/31 03:27:27 ! ucm/big5-eten.ucm ucm/big5-hkscs.ucm Extraneous single-byte chars in range \x80-\xA0 and \xFA-\xFF removed. FYI, IBM's ICU has none of these for java-Big5-1.3_P.ucm diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 7ca8f8d..45d134b 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,9 +1,9 @@ # -# $Id: Encode.pm,v 1.92 2003/03/31 03:46:47 dankogai Exp $ +# $Id: Encode.pm,v 1.93 2003/04/24 17:44:00 dankogai Exp $ # package Encode; use strict; -our $VERSION = do { my @r = (q$Revision: 1.92 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.93 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; use XSLoader (); XSLoader::load(__PACKAGE__, $VERSION); diff --git a/ext/Encode/Makefile.PL b/ext/Encode/Makefile.PL index 2e74fe4..83e4e64 100644 --- a/ext/Encode/Makefile.PL +++ b/ext/Encode/Makefile.PL @@ -2,9 +2,9 @@ use 5.007003; use ExtUtils::MakeMaker; # Just for sure :) -unless($ENV{PERL_CORE}) { - $ENV{PERL_CORE} = 1 if ($^X =~ m{\bminiperl[^/\\\]>:]*$}o); -} +my %ARGV = map { split /=/; defined $_[1] or $_[1]=1; @_ } @ARGV; +$ARGV{DEBUG} and warn "$_ => $ARGV{$_}\n" for keys %ARGV; +$ENV{PERL_CORE} ||= $ARGV{PERL_CORE}; my %tables = ( @@ -22,14 +22,9 @@ my @more_exe_files = qw( unidump ); my @pmlibdirs = qw(lib Encode); -for my $arg (@ARGV){ - if ($arg eq "MORE_SCRIPTS"){ - push @exe_files, @more_exe_files; - } - if ($arg eq "INSTALL_UCM"){ - push @pmlibdirs, "ucm"; - } -} + +$ARGV{MORE_SCRIOPTS} and push @exe_files, @more_exe_files; +$ARGV{INSTALL_UCM} and push @pmlibdirs, "ucm"; WriteMakefile( NAME => "Encode", diff --git a/ext/Encode/lib/Encode/CJKConstants.pm b/ext/Encode/lib/Encode/CJKConstants.pm index 73e83b0..bea2f3e 100644 --- a/ext/Encode/lib/Encode/CJKConstants.pm +++ b/ext/Encode/lib/Encode/CJKConstants.pm @@ -1,13 +1,13 @@ # -# $Id: CJKConstants.pm,v 1.1 2003/02/06 01:52:11 dankogai Exp $ +# $Id: CJKConstants.pm,v 1.2 2003/04/24 17:43:16 dankogai Exp $ # package Encode::CJKConstants; use strict; -our $RCSID = q$Id: CJKConstants.pm,v 1.1 2003/02/06 01:52:11 dankogai Exp $; -our $VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $RCSID = q$Id: CJKConstants.pm,v 1.2 2003/04/24 17:43:16 dankogai Exp $; +our $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Carp; @@ -37,6 +37,7 @@ our %ESC = ( KSC_5601 => "\e\$(C", ASC => "\e\(B", KANA => "\e\(I", + '2022_KR' => "\e\$)C", ); our %RE = diff --git a/ext/Encode/lib/Encode/CN/HZ.pm b/ext/Encode/lib/Encode/CN/HZ.pm index 96577df..1ea1e45 100644 --- a/ext/Encode/lib/Encode/CN/HZ.pm +++ b/ext/Encode/lib/Encode/CN/HZ.pm @@ -3,106 +3,196 @@ package Encode::CN::HZ; use strict; use vars qw($VERSION); -$VERSION = do { my @r = (q$Revision: 1.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +$VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; -use Encode (); +use Encode qw(:fallbacks); use base qw(Encode::Encoding); __PACKAGE__->Define('hz'); -# HZ is only escaped GB, so we implement it with the -# GB2312(raw) encoding here. Cf. RFCs 1842 & 1843. - +# HZ is a combination of ASCII and escaped GB, so we implement it +# with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843. +# not ported for EBCDIC. Which should be used, "~" or "\x7E"? sub needs_lines { 1 } -sub perlio_ok { - return 0; # for the time being -} +sub perlio_ok { 1 } -sub decode +sub decode ($$;$) { + use bytes; my ($obj,$str,$chk) = @_; - my $gb = Encode::find_encoding('gb2312-raw'); - - $str =~ s{~ # starting tilde - (?: - (~) # another tilde - escaped (set $1) - | # or - \n # \n - output nothing - | # or - \{ # opening brace of GB data - ( # set $2 to any number of... - (?: - [^~] # non-tilde GB character - | # or - ~(?!\}) # tilde not followed by a closing brace - )* - ) - ~\} # closing brace of GB data - | # XXX: invalid escape - maybe die on $chk? - ) - }{ - (defined $1) ? '~' # two tildes make one tilde - : - (defined $2) ? $gb->decode($2, $chk) # decode the characters - : - '' # ~\n and invalid escape = '' - }egx; - - return $str; + + my $GB = Encode::find_encoding('gb2312-raw'); + my $ret = ''; + my $in_ascii = 1; # default mode is ASCII. + + while (length $str) { + if ($in_ascii) { # ASCII mode + if ($str =~ s/^([\x00-\x7D\x7F]+)//) { # no '~' => ASCII + $ret .= $1; + # EBCDIC should need ascii2native, but not ported. + } + elsif ($str =~ s/^\x7E\x7E//) { # escaped tilde + $ret .= '~'; + } + elsif ($str =~ s/^\x7E\cJ//) { # '\cJ' == LF in ASCII + 1; # no-op + } + elsif ($str =~ s/^\x7E\x7B//) { # '~{' + $in_ascii = 0; # to GB + } + else { # encounters an invalid escape, \x80 or greater + last; + } + } + else { # GB mode; the byte ranges are as in RFC 1843. + if ($str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)//) { + $ret .= $GB->decode($1, $chk); + } + elsif ($str =~ s/^\x7E\x7D//) { # '~}' + $in_ascii = 1; + } + else { # invalid + last; + } + } + } + $_[1] = $str if $chk; + return $ret; +} + +sub cat_decode { + use bytes; + + my ($obj, undef, $src, $pos, $trm, $chk) = @_; + my ($rdst, $rsrc, $rpos) = \@_[1..3]; + + my $GB = Encode::find_encoding('gb2312-raw'); + my $ret = ''; + my $in_ascii = 1; # default mode is ASCII. + + my $ini_pos = pos($$rsrc); + + substr($src, 0, $pos) = ''; + + my $ini_len = bytes::length($src); + + # $trm is the first of the pair '~~', then 2nd tilde is to be removed. + # XXX: Is better C<$src =~ s/^\x7E// or die if ...>? + $src =~ s/^\x7E// if $trm eq "\x7E"; + + while (length $src) { + my $now; + if ($in_ascii) { # ASCII mode + if ($src =~ s/^([\x00-\x7D\x7F])//) { # no '~' => ASCII + $now = $1; + } + elsif ($src =~ s/^\x7E\x7E//) { # escaped tilde + $now = '~'; + } + elsif ($src =~ s/^\x7E\cJ//) { # '\cJ' == LF in ASCII + next; + } + elsif ($src =~ s/^\x7E\x7B//) { # '~{' + $in_ascii = 0; # to GB + next; + } + else { # encounters an invalid escape, \x80 or greater + last; + } + } + else { # GB mode; the byte ranges are as in RFC 1843. + if ($src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)//) { + $now = $GB->decode($1, $chk); + } + elsif ($src =~ s/^\x7E\x7D//) { # '~}' + $in_ascii = 1; + next; + } + else { # invalid + last; + } + } + + next if ! defined $now; + + $ret .= $now; + + if ($now eq $trm) { + $$rdst .= $ret; + $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src); + pos($$rsrc) = $ini_pos; + return 1; + } + } + + $$rdst .= $ret; + $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src); + pos($$rsrc) = $ini_pos; + return ''; # terminator not found } -sub encode + +sub encode($$;$) { my ($obj,$str,$chk) = @_; - my ($out, $in_gb); - my $gb = Encode::find_encoding('gb2312-raw'); - - $str =~ s/~/~~/g; - - # XXX: Since CHECK and partial decoding has not been implemented yet, - # we'll use a very crude way to test for GB2312ness. - - for my $index (0 .. length($str) - 1) { - no warnings 'utf8'; - - my $char = substr($str, $index, 1); - # try to encode this character - # with CHECK on so it stops at proper place. - # also note that the assignement was braced in eval - # -- dankogai - my $try; - eval{ $try = $gb->encode($char, 1) }; - - if (defined($try)) { # is a GB character: - if ($in_gb) { - $out .= $try; # in GB mode - just append it - } - else { - $in_gb = 1; # enter GB mode, then append it - $out .= "~{$try"; - } - } # not a GB character: - elsif ($in_gb) { - $in_gb = 0; # leave GB mode, then append it - $out .= "~}$char"; + + my $GB = Encode::find_encoding('gb2312-raw'); + my $ret = ''; + my $in_ascii = 1; # default mode is ASCII. + + no warnings 'utf8'; # $str may be malformed UTF8 at the end of a chunk. + + while (length $str) { + if ($str =~ s/^([[:ascii:]]+)//) { + my $tmp = $1; + $tmp =~ s/~/~~/g; # escapes tildes + if (! $in_ascii) { + $ret .= "\x7E\x7D"; # '~}' + $in_ascii = 1; + } + $ret .= pack 'a*', $tmp; # remove UTF8 flag. + } + elsif ($str =~ s/(.)//) { + my $tmp = $GB->encode($1, $chk); + last if !defined $tmp; + if (length $tmp == 2) { # maybe a valid GB char (XXX) + if ($in_ascii) { + $ret .= "\x7E\x7B"; # '~{' + $in_ascii = 0; + } + $ret .= $tmp; + } + elsif (length $tmp) { # maybe FALLBACK in ASCII (XXX) + if (!$in_ascii) { + $ret .= "\x7E\x7D"; # '~}' + $in_ascii = 1; + } + $ret .= $tmp; + } } - else { - $out .= $char; # not in GB mode - just append it + else { # if $str is malformed UTF8 *and* if length $str != 0. + last; } } + $_[1] = $str if $chk; - $out .= '~}' if $in_gb; # add closing brace if needed + # The state at the end of the chunk is discarded, even if in GB mode. + # That results in the combination of GB-OUT and GB-IN, i.e. "~}~{". + # Parhaps it is harmless, but further investigations may be required... - return $out; + if (! $in_ascii) { + $ret .= "\x7E\x7D"; # '~}' + $in_ascii = 1; + } + return $ret; } 1; __END__ - =head1 NAME Encode::CN::HZ -- internally used by Encode::CN diff --git a/ext/Encode/lib/Encode/Encoding.pm b/ext/Encode/lib/Encode/Encoding.pm index 7ec9bf8..0bb4350 100644 --- a/ext/Encode/lib/Encode/Encoding.pm +++ b/ext/Encode/lib/Encode/Encoding.pm @@ -1,7 +1,7 @@ package Encode::Encoding; # Base class for classes which implement encodings use strict; -our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; require Encode; @@ -130,6 +130,13 @@ replacement character. =back +=back + +If you want your encoding to work with L pragma, you should +also implement the method below. + +=over 4 + =item -Ecat_decode($destination, $octets, $offset, $terminator [,$check]) MUST decode I<$octets> with I<$offset> and concatenate it to I<$destination>. diff --git a/ext/Encode/lib/Encode/Guess.pm b/ext/Encode/lib/Encode/Guess.pm index e3634a2..fc8d267 100644 --- a/ext/Encode/lib/Encode/Guess.pm +++ b/ext/Encode/lib/Encode/Guess.pm @@ -2,7 +2,7 @@ package Encode::Guess; use strict; use Encode qw(:fallbacks find_encoding); -our $VERSION = do { my @r = (q$Revision: 1.7 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.8 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; my $Canon = 'Guess'; our $DEBUG = 0; @@ -79,45 +79,66 @@ sub guess { $BOM = unpack('N', $octet); return find_encoding('UTF-32') if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000)); - my %try = %{$obj->{Suspects}}; for my $c (@_){ my $e = find_encoding($c) or die "Unknown encoding: $c"; $try{$e->name} = $e; $DEBUG and warn "Added: ", $e->name; } - my $nline = 1; - for my $line (split /\r\n?|\n/, $octet){ - # cheat 2 -- \e in the string - if ($line =~ /\e/o){ - my @keys = keys %try; - delete @try{qw/utf8 ascii/}; - for my $k (@keys){ - ref($try{$k}) eq 'Encode::XS' and delete $try{$k}; + if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE) + my $utf; + my ($be, $le) = (0, 0); + if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed + $utf = "UTF-32"; + for my $char (unpack('N*', $octet)){ + $char & 0x0000ffff and $be++; + $char & 0xffff0000 and $le++; } - } - my %ok = %try; - # warn join(",", keys %try); - for my $k (keys %try){ - my $scratch = $line; - $try{$k}->decode($scratch, FB_QUIET); - if ($scratch eq ''){ - $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k); - }else{ - use bytes (); - $DEBUG and - warn sprintf("%4d:%-24s not ok; %d bytes left\n", - $nline, $k, bytes::length($scratch)); - delete $ok{$k}; - + }else{ # UTF-16(BE|LE) assumed + $utf = "UTF-16"; + for my $char (unpack('n*', $octet)){ + $char & 0x00ff and $be++; + $char & 0xff00 and $le++; } } - %ok or return "No appropriate encodings found!"; - if (scalar(keys(%ok)) == 1){ - my ($retval) = values(%ok); - return $retval; + $DEBUG and warn "$utf, be == $be, le == $le"; + $be == $le + and return "Encodings ambiguous between $utf BE and LE ($be, $le)"; + $utf .= ($be > $le) ? 'BE' : 'LE'; + return find_encoding($utf); + }else{ + my $nline = 1; + for my $line (split /\r\n?|\n/, $octet){ + # cheat 2 -- \e in the string + if ($line =~ /\e/o){ + my @keys = keys %try; + delete @try{qw/utf8 ascii/}; + for my $k (@keys){ + ref($try{$k}) eq 'Encode::XS' and delete $try{$k}; + } + } + my %ok = %try; + # warn join(",", keys %try); + for my $k (keys %try){ + my $scratch = $line; + $try{$k}->decode($scratch, FB_QUIET); + if ($scratch eq ''){ + $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k); + }else{ + use bytes (); + $DEBUG and + warn sprintf("%4d:%-24s not ok; %d bytes left\n", + $nline, $k, bytes::length($scratch)); + delete $ok{$k}; + } + } + %ok or return "No appropriate encodings found!"; + if (scalar(keys(%ok)) == 1){ + my ($retval) = values(%ok); + return $retval; + } + %try = %ok; $nline++; } - %try = %ok; $nline++; } $try{ascii} or return "Encodings too ambiguous: ", join(" or ", keys %try); diff --git a/ext/Encode/lib/Encode/KR/2022_KR.pm b/ext/Encode/lib/Encode/KR/2022_KR.pm index 1bb584e..ed660d2 100644 --- a/ext/Encode/lib/Encode/KR/2022_KR.pm +++ b/ext/Encode/lib/Encode/KR/2022_KR.pm @@ -1,7 +1,7 @@ package Encode::KR::2022_KR; use strict; -our $VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.6 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode qw(:fallbacks); @@ -29,7 +29,7 @@ sub encode my ($obj, $utf8, $chk) = @_; # empty the input string in the stack so perlio is ok $_[1] = '' if $chk; - my $octet = Encode::encode('euc-jp', $utf8, FB_PERLQQ) ; + my $octet = Encode::encode('euc-kr', $utf8, FB_PERLQQ) ; euc_iso(\$octet); return $octet; } diff --git a/ext/Encode/lib/Encode/PerlIO.pod b/ext/Encode/lib/Encode/PerlIO.pod index e433ea5..1b0547f 100644 --- a/ext/Encode/lib/Encode/PerlIO.pod +++ b/ext/Encode/lib/Encode/PerlIO.pod @@ -98,14 +98,13 @@ Encode converts from the beginning to \x7E, leaving \xe3 in the buffer because it is invalid (partial character). Unfortunately, this scheme does not work well with escape-based -encodings such as ISO-2022-JP. Let's see what happens in that case -in the next chapter. +encodings such as ISO-2022-JP. -=head1 BUGS +=head1 Line Buffering Now let's see what happens when you try to decode from ISO-2022-JP and the buffer ends in the middle of a character. - + JIS208-ESC \x{5f3e} A B C .... ~ \e $ B |DAN | .... 41 42 43 .... 7E 1b 24 41 43 46 .... @@ -118,25 +117,24 @@ area so it has to convert \x43\x46, not \x43. Unlike utf8 and EUC, in escape-based encodings you can't tell if a given octet is a whole character or just part of it. -There are actually several ways to solve this problem but none of -them is fast enough to be practical. From Encode's point of view, -the easiest solution is for PerlIO to implement a line buffer instead -of a fixed-length buffer, but that makes PerlIO really complicated. +Fortunately PerlIO also supports line buffer if you tell PerlIO to use +one instead of fixed buffer. Since ISO-2022-JP is guaranteed to revert to ASCII at the end of the line, partial +character will never happen when line buffer is used. -So for the time being, using escape-based encodings in the -":encoding()" layer of PerlIO does not work well. +To tell PerlIO to use line buffer, implement -Eneeds_lines method +for your encoding object. See L for details. -=head2 Workaround +Thanks to these efforts most encodings that come with Encode support +PerlIO but that still leaves following encodings. -If you still insist, you can at least use ":encoding()" by making sure -the buffer never gets full. Here is an example. + iso-2022-kr + MIME-B + MIME-Header + MIME-Q - use FileHandle; - binmode(STDOUT, ":encoding(7bit-jis)"); - STDOUT->autoflush(1); # don't forget this! - for my $l (@lines){ # $l cannot be longer than 1023 bytes - print $l; - } +Fortunately iso-2022-kr is hardly used (according to Jungshik) and +MIME-* are very unlikely to be fed to PerlIO because they are for mail +headers. See L for details. =head2 How can I tell whether my encoding fully supports PerlIO ? diff --git a/ext/Encode/t/CJKT.t b/ext/Encode/t/CJKT.t index 100b530..1480439 100644 --- a/ext/Encode/t/CJKT.t +++ b/ext/Encode/t/CJKT.t @@ -20,9 +20,7 @@ BEGIN { $| = 1; } use strict; -use Test::More tests => 42; -#use Test::More tests => 73; -#use Test::More qw(no_plan); +use Test::More tests => 60; use Encode; use File::Basename; use File::Spec; @@ -31,17 +29,16 @@ our $DEBUG = shift || 0; my %Charset = ( - 'big5-eten' => [qw(big5-eten cp950 MacChineseTrad)], + 'big5-eten' => [qw(big5-eten)], 'big5-hkscs' => [qw(big5-hkscs)], - gb2312 => [qw(euc-cn gb2312-raw cp936 MacChineseSimp)], - jisx0201 => [qw(euc-jp shiftjis 7bit-jis jis0201-raw - cp932 MacJapanese)], - jisx0212 => [qw(euc-jp 7bit-jis iso-2022-jp-1 jis0208-raw)], - jisx0208 => [qw(euc-jp shiftjis 7bit-jis cp932 MacJapanese - iso-2022-jp iso-2022-jp-1 jis0212-raw)], - ksc5601 => [qw(euc-kr iso-2022-kr ksc5601-raw cp949 MacKorean)], + gb2312 => [qw(euc-cn hz)], + jisx0201 => [qw(euc-jp shiftjis 7bit-jis)], + jisx0208 => [qw(euc-jp shiftjis 7bit-jis iso-2022-jp iso-2022-jp-1)], + jisx0212 => [qw(euc-jp 7bit-jis iso-2022-jp-1)], + ksc5601 => [qw(euc-kr iso-2022-kr johab)], ); + my $dir = dirname(__FILE__); my $seq = 1; @@ -111,4 +108,9 @@ for my $charset (sort keys %Charset){ $seq++; unlink($dst_utf, $dst_enc); + + for my $encoding (@{$Charset{$charset}}){ + my $rt = decode($encoding, encode($encoding, $uni)); + is ($rt, $uni, "RT $encoding"); + } } diff --git a/ext/Encode/t/enc_eucjp.t b/ext/Encode/t/enc_eucjp.t index 4355c12..3d5457d 100644 --- a/ext/Encode/t/enc_eucjp.t +++ b/ext/Encode/t/enc_eucjp.t @@ -1,4 +1,4 @@ -# $Id: enc_eucjp.t,v 1.3 2003/02/20 14:42:34 dankogai Exp $ +# $Id: enc_eucjp.t,v 1.4 2003/04/24 17:43:16 dankogai Exp $ # This is the twin of enc_utf8.t . BEGIN { @@ -30,6 +30,7 @@ print "1.." . (scalar @c + 1) . "\n"; my @f; for my $i (0..$#c) { + no warnings 'pack'; push @f, "f$i"; open(F, ">f$i") or die "$0: failed to open 'f$i' for writing: $!"; binmode(F, ":utf8"); diff --git a/ext/Encode/t/enc_module.t b/ext/Encode/t/enc_module.t index 89703a9..6afed81 100644 --- a/ext/Encode/t/enc_module.t +++ b/ext/Encode/t/enc_module.t @@ -1,4 +1,4 @@ -# $Id: enc_module.t,v 1.4 2003/03/31 03:27:27 dankogai Exp $ +# $Id: enc_module.t,v 1.5 2003/04/24 17:43:16 dankogai Exp $ # This file is in euc-jp BEGIN { require Config; import Config; @@ -27,12 +27,14 @@ use File::Basename; use File::Spec; use File::Compare qw(compare_text); +my $DEBUG = shift || 0; my $dir = dirname(__FILE__); my $file0 = File::Spec->catfile($dir,"enc_module.enc"); my $file1 = File::Spec->catfile($dir,"$$.enc"); my $obj = Mod_EUCJP->new; -local $SIG{__WARN__} = sub{}; # to silence reopening STD(IN|OUT) w/o closing +local $SIG{__WARN__} = sub{ $DEBUG and print STDERR @_ }; +# to silence reopening STD(IN|OUT) w/o closing unless $DEBUG open STDOUT, ">", $file1 or die "$file1:$!"; print $obj->str, "\n"; @@ -51,6 +53,8 @@ while(){ is ($cmp[$i++], $_, "encoding vs. STDIN - $i"); } +# I have tested and found "unless $^O eq 'freebsd'" is not +# necessary but I will leave it for the sake of Enache -- dankogai close STDOUT unless $^O eq 'freebsd'; unlink $file1 unless $cmp; __END__ diff --git a/ext/Encode/t/guess.t b/ext/Encode/t/guess.t index bd403d3..d3ff58c 100644 --- a/ext/Encode/t/guess.t +++ b/ext/Encode/t/guess.t @@ -21,7 +21,7 @@ use File::Spec; use Encode qw(decode encode find_encoding _utf8_off); #use Test::More qw(no_plan); -use Test::More tests => 17; +use Test::More tests => 23; use_ok("Encode::Guess"); { no warnings; @@ -86,4 +86,17 @@ for my $name (keys %CJKT){ is(guess_encoding($test)->name, $name, "CJKT:$name"); } +my $ambiguous = "\x{5c0f}\x{98fc}\x{5f3e}"; +my $english = "The quick brown fox jumps over the black lazy dog."; +for my $utf (qw/UTF-16 UTF-32/){ + for my $bl (qw/BE LE/){ + my $test = encode("$utf$bl" => $english); + is(guess_encoding($test)->name, "$utf$bl", "$utf$bl"); + } +} +for my $bl (qw/BE LE/){ + my $test = encode("UTF-16$bl" => $ambiguous); + my $result = guess_encoding($test); + ok(! ref($result), "UTF-16$bl:$result"); +} __END__; diff --git a/ext/Encode/ucm/euc-cn.ucm b/ext/Encode/ucm/euc-cn.ucm index a6e03fd..e1dacd4 100644 --- a/ext/Encode/ucm/euc-cn.ucm +++ b/ext/Encode/ucm/euc-cn.ucm @@ -1,5 +1,5 @@ # -# $Id: euc-cn.ucm,v 1.0 2002/03/28 23:26:26 dankogai Exp $ +# $Id: euc-cn.ucm,v 1.1 2003/04/24 17:43:16 dankogai Exp $ # # ./compile -n euc-cn -o Encode/euc-cn.ucm Encode/euc-cn.enc "euc-cn" @@ -136,38 +136,6 @@ CHARMAP \x7D |0 # RIGHT CURLY BRACKET \x7E |0 # TILDE \x7F |0 # DELETE - \x80 |0 # - \x81 |0 # - \x82 |0 # BREAK PERMITTED HERE - \x83 |0 # NO BREAK HERE - \x84 |0 # - \x85 |0 # NEXT LINE - \x86 |0 # START OF SELECTED AREA - \x87 |0 # END OF SELECTED AREA - \x88 |0 # CHARACTER TABULATION SET - \x89 |0 # CHARACTER TABULATION WITH JUSTIFICATION - \x8A |0 # LINE TABULATION SET - \x8B |0 # PARTIAL LINE DOWN - \x8C |0 # PARTIAL LINE UP - \x8D |0 # REVERSE LINE FEED - \x8E |0 # SINGLE SHIFT TWO - \x8F |0 # SINGLE SHIFT THREE - \x90 |0 # DEVICE CONTROL STRING - \x91 |0 # PRIVATE USE ONE - \x92 |0 # PRIVATE USE TWO - \x93 |0 # SET TRANSMIT STATE - \x94 |0 # CANCEL CHARACTER - \x95 |0 # MESSAGE WAITING - \x96 |0 # START OF GUARDED AREA - \x97 |0 # END OF GUARDED AREA - \x98 |0 # START OF STRING - \x99 |0 # - \x9A |0 # SINGLE CHARACTER INTRODUCER - \x9B |0 # CONTROL SEQUENCE INTRODUCER - \x9C |0 # STRING TERMINATOR - \x9D |0 # OPERATING SYSTEM COMMAND - \x9E |0 # PRIVACY MESSAGE - \x9F |0 # APPLICATION PROGRAM COMMAND \xA1\xA1 |0 # IDEOGRAPHIC SPACE \xA1\xA2 |0 # IDEOGRAPHIC COMMA \xA1\xA3 |0 # IDEOGRAPHIC FULL STOP diff --git a/ext/Encode/ucm/euc-jp.ucm b/ext/Encode/ucm/euc-jp.ucm index cc1379b..2896223 100644 --- a/ext/Encode/ucm/euc-jp.ucm +++ b/ext/Encode/ucm/euc-jp.ucm @@ -1,5 +1,5 @@ # -# $Id: euc-jp.ucm,v 1.2 2002/04/29 07:01:58 dankogai Exp $ +# $Id: euc-jp.ucm,v 1.3 2003/04/24 17:43:16 dankogai Exp $ # "euc-jp" 1 @@ -141,36 +141,6 @@ CHARMAP # # JIS X 0201 Kana # - \x80 |0 # - \x81 |0 # - \x82 |0 # BREAK PERMITTED HERE - \x83 |0 # NO BREAK HERE - \x84 |0 # - \x85 |0 # NEXT LINE (NEL) - \x86 |0 # START OF SELECTED AREA - \x87 |0 # END OF SELECTED AREA - \x88 |0 # CHARACTER TABULATION SET - \x89 |0 # CHARACTER TABULATION WITH JUSTIFICATION - \x8A |0 # LINE TABULATION SET - \x8B |0 # PARTIAL LINE FORWARD - \x8C |0 # PARTIAL LINE BACKWARD - \x8D |0 # REVERSE LINE FEED - \x90 |0 # DEVICE CONTROL STRING - \x91 |0 # PRIVATE USE ONE - \x92 |0 # PRIVATE USE TWO - \x93 |0 # SET TRANSMIT STATE - \x94 |0 # CANCEL CHARACTER - \x95 |0 # MESSAGE WAITING - \x96 |0 # START OF GUARDED AREA - \x97 |0 # END OF GUARDED AREA - \x98 |0 # START OF STRING - \x99 |0 # - \x9A |0 # SINGLE CHARACTER INTRODUCER - \x9B |0 # CONTROL SEQUENCE INTRODUCER - \x9C |0 # STRING TERMINATOR - \x9D |0 # OPERATING SYSTEM COMMAND - \x9E |0 # PRIVACY MESSAGE - \x9F |0 # APPLICATION PROGRAM COMMAND \x8E\xA1 |0 # HALFWIDTH IDEOGRAPHIC FULL STOP \x8E\xA2 |0 # HALFWIDTH LEFT CORNER BRACKET \x8E\xA3 |0 # HALFWIDTH RIGHT CORNER BRACKET diff --git a/ext/Encode/ucm/euc-kr.ucm b/ext/Encode/ucm/euc-kr.ucm index 9c38a9b..9266233 100644 --- a/ext/Encode/ucm/euc-kr.ucm +++ b/ext/Encode/ucm/euc-kr.ucm @@ -1,5 +1,5 @@ # -# $Id: euc-kr.ucm,v 1.0 2002/03/28 23:26:26 dankogai Exp $ +# $Id: euc-kr.ucm,v 1.1 2003/04/24 17:43:16 dankogai Exp $ # # ./compile -n euc-kr -o Encode/euc-kr.ucm Encode/euc-kr.enc "euc-kr" @@ -136,38 +136,6 @@ CHARMAP \x7D |0 # RIGHT CURLY BRACKET \x7E |0 # TILDE \x7F |0 # DELETE - \x80 |0 # - \x81 |0 # - \x82 |0 # BREAK PERMITTED HERE - \x83 |0 # NO BREAK HERE - \x84 |0 # - \x85 |0 # NEXT LINE - \x86 |0 # START OF SELECTED AREA - \x87 |0 # END OF SELECTED AREA - \x88 |0 # CHARACTER TABULATION SET - \x89 |0 # CHARACTER TABULATION WITH JUSTIFICATION - \x8A |0 # LINE TABULATION SET - \x8B |0 # PARTIAL LINE DOWN - \x8C |0 # PARTIAL LINE UP - \x8D |0 # REVERSE LINE FEED - \x8E |0 # SINGLE SHIFT TWO - \x8F |0 # SINGLE SHIFT THREE - \x90 |0 # DEVICE CONTROL STRING - \x91 |0 # PRIVATE USE ONE - \x92 |0 # PRIVATE USE TWO - \x93 |0 # SET TRANSMIT STATE - \x94 |0 # CANCEL CHARACTER - \x95 |0 # MESSAGE WAITING - \x96 |0 # START OF GUARDED AREA - \x97 |0 # END OF GUARDED AREA - \x98 |0 # START OF STRING - \x99 |0 # - \x9A |0 # SINGLE CHARACTER INTRODUCER - \x9B |0 # CONTROL SEQUENCE INTRODUCER - \x9C |0 # STRING TERMINATOR - \x9D |0 # OPERATING SYSTEM COMMAND - \x9E |0 # PRIVACY MESSAGE - \x9F |0 # APPLICATION PROGRAM COMMAND \xA1\xA1 |0 # IDEOGRAPHIC SPACE \xA1\xA2 |0 # IDEOGRAPHIC COMMA \xA1\xA3 |0 # IDEOGRAPHIC FULL STOP -- 2.7.4