Start of support of ICU-style .ucm files:

author Nick Ing-Simmons <nik@tiuk.ti.com>

Sun, 31 Dec 2000 18:11:54 +0000 (18:11 +0000)

committer Nick Ing-Simmons <nik@tiuk.ti.com>

Sun, 31 Dec 2000 18:11:54 +0000 (18:11 +0000)
author Nick Ing-Simmons <nik@tiuk.ti.com>
Sun, 31 Dec 2000 18:11:54 +0000 (18:11 +0000)
committer Nick Ing-Simmons <nik@tiuk.ti.com>
Sun, 31 Dec 2000 18:11:54 +0000 (18:11 +0000)
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm

index db471cb..dedb8e9 100644 (file)
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -342,7 +342,7 @@ sub from_to
  
  # The global hash is declared in XS code
  $encoding{Unicode}    = bless({},'Encode::Unicode');
-$encoding{iso10646-1} = bless({},'Encode::iso10646_1');
+$encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');
  
  sub encodings
  {
@@ -408,7 +408,8 @@ sub getEncoding
  
  package Encode::Unicode;
  
-# Dummy package that provides the encode interface
+# Dummy package that provides the encode interface but leaves data
+# as UTF-8 encoded. It is here so that from_to()
  
  sub name { 'Unicode' }
  
@@ -533,7 +534,9 @@ sub fromUnicode
   return $str;
  }
  
-package Encode::iso10646_1;#
+package Encode::iso10646_1;
+# Encoding is 16-bit network order Unicode
+# Used for X font encodings
  
  sub name { 'iso10646-1' }
  
diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs

index cca1ddc..b61d89b 100644 (file)
--- a/ext/Encode/Encode.xs
+++ b/ext/Encode/Encode.xs
@@ -340,10 +340,14 @@ encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
     U8 *d = (U8 *) SvGROW(dst, 2*slen+1);
     STRLEN dlen = SvLEN(dst);
     int code;
-   while ((code = do_encode(dir,s,&slen,d,dlen,&dlen)))
+   while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check)))
      {
       SvCUR_set(dst,dlen);
       SvPOK_on(dst);
+
+     if (code == ENCODE_FALLBACK)
+      break;
+
       switch(code)
        {
         case ENCODE_NOSPACE:
diff --git a/ext/Encode/compile b/ext/Encode/compile

index fbb08cd..21478f8 100755 (executable)
--- a/ext/Encode/compile
+++ b/ext/Encode/compile
@@ -4,7 +4,7 @@ use strict;
  
  sub encode_U
  {
- # UTF-8 encocde long hand - only covers part of perl's range
+ # UTF-8 encode long hand - only covers part of perl's range
   my $uv = shift;
   if ($uv < 0x80)
    {
@@ -96,10 +96,17 @@ sub cmp_name
  
  foreach my $enc (sort cmp_name @ARGV)
   {
-  my ($name) = $enc =~ /^.*?([\w-]+)(\.enc)$/;
+  my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
    if (open(E,$enc))
     {
-    compile_enc(\*E,lc($name),\*C);
+    if ($sfx eq 'enc')
+     {
+      compile_enc(\*E,lc($name),\*C);
+     }
+    else
+     {
+      compile_ucm(\*E,lc($name),\*C);
+     }
     }
    else
     {
@@ -135,6 +142,86 @@ close(C);
  close(D);
  close(H);
  
+
+sub compile_ucm
+{
+ my ($fh,$name,$ch) = @_;
+ my $e2u = {};
+ my $u2e = {};
+ my $cs;
+ my %attr;
+ while (<$fh>)
+  {
+   s/#.*$//;
+   last if /^\s*CHARMAP\s*$/i;
+   if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i)
+    {
+     $attr{$1} = $2;
+    }
+  }
+ if (!defined($cs =  $attr{'code_set_name'}))
+  {
+   warn "No <code_set_name> in $name\n";
+  }
+ else
+  {
+   $name = lc($cs);
+  }
+ my $erep;
+ my $urep;
+ if (exists $attr{'subchar'})
+  {
+   my @byte = $attr{'subchar'} =~ /^\s*(?:\\x([0-9a-f]+))+\s*$/;
+   $erep = join('',map(hex($_),@byte));
+  }
+ warn "Scanning $cs\n";
+ my $nfb = 0;
+ my $hfb = 0;
+ while (<$fh>)
+  {
+   s/#.*$//;
+   last if /^\s*END\s+CHARMAP\s*$/i;
+   next if /^\s*$/;
+   my ($u,@byte) = /^<U([0-9a-f]+)>\s+(?:\\x([0-9a-f]+))+\s*(\|[0-3]|)\s*$/i;
+   my $fb = pop(@byte);
+   if (defined($u))
+    {
+     my $uch = encode_U(hex($u));
+     my $ech = join('',map(hex($_),@byte));
+     if (length($fb))
+      {
+       $fb = substr($fb,1);
+       $hfb++;
+      }
+     else
+      {
+       $nfb++;
+       $fb = '0';
+      }
+     # $fb is fallback flag
+     # 0 - round trip safe
+     # 1 - fallback for unicode -> enc
+     # 2 - skip sub-char mapping
+     # 3 - fallback enc -> unicode
+     enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
+     enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
+    }
+   else
+    {
+     warn $_;
+    }
+
+  }
+ if ($nfb && $hfb)
+  {
+   die "$nfb entries without fallback, $hfb entries with\n";
+  }
+ output($ch,$name.'_utf8',$e2u);
+ output($ch,'utf8_'.$name,$u2e);
+ $encoding{$name} = [$e2u->{Cname},$u2e->{Cname},
+                     outstring($ch,$e2u->{Cname}.'_def',$erep),length($erep)];
+}
+
  sub compile_enc
  {
   my ($fh,$name,$ch) = @_;
@@ -173,8 +260,8 @@ sub compile_enc
         if ($val || (!$ch && !$page))
          {
           my $uch = encode_U($val);
-         enter($e2u,$ech,$uch,$e2u);
-         enter($u2e,$uch,$ech,$u2e);
+         enter($e2u,$ech,$uch,$e2u,0);
+         enter($u2e,$uch,$ech,$u2e,0);
          }
         else
          {
@@ -193,18 +280,18 @@ sub compile_enc
  
  sub enter
  {
- my ($a,$s,$d,$t) = @_;
+ my ($a,$s,$d,$t,$fb) = @_;
   $t = $a if @_ < 4;
   my $b = substr($s,0,1);
   my $e = $a->{$b};
   unless ($e)
    {     # 0  1  2  3         4  5
-   $e = [$b,$b,'',{},length($s),0];
+   $e = [$b,$b,'',{},length($s),0,$fb];
     $a->{$b} = $e;
    }
   if (length($s) > 1)
    {
-   enter($e->[3],substr($s,1),$d,$t);
+   enter($e->[3],substr($s,1),$d,$t,$fb);
    }
   else
    {
@@ -260,7 +347,8 @@ sub process
         ord($b) == ord($a->{$l}[1])+1 &&
         $a->{$l}[3] == $a->{$b}[3] &&
         $a->{$l}[4] == $a->{$b}[4] &&
-       $a->{$l}[5] == $a->{$b}[5]
+       $a->{$l}[5] == $a->{$b}[5] &&
+       $a->{$l}[6] == $a->{$b}[6]
         # && length($a->{$l}[2]) < 16
        )
      {
@@ -316,7 +404,7 @@ sub outtable
   print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
   foreach my $b (@{$a->{'Entries'}})
    {
-   my ($s,$e,$out,$t,$end,$l) = @{$a->{$b}};
+   my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
     my $sc = ord($s);
     my $ec = ord($e);
     print  $fh "{";
diff --git a/ext/Encode/encengine.c b/ext/Encode/encengine.c

index f317250..4c68dd9 100644 (file)
--- a/ext/Encode/encengine.c
+++ b/ext/Encode/encengine.c
@@ -92,7 +92,7 @@ we add a flag to re-add the removed byte to the source we could handle
  #include "encode.h"
  
  int
-do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout)
+do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout, int approx)
  {
   const U8 *s    = src;
   const U8 *send = s+*slen;
@@ -106,9 +106,9 @@ do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STR
     U8 byte = *s;
     while (byte > e->max)
      e++;
-   if (byte >= e->min && e->slen)
+   if (byte >= e->min && e->slen && (approx || !e->slen & 0x80))
      {
-     const U8 *cend = s + e->slen;
+     const U8 *cend = s + (e->slen & 0x7f);
       if (cend <= send)
        {
         STRLEN n;
@@ -136,7 +136,11 @@ do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STR
         enc = e->next;
         s++;
         if (s == cend)
-        last = s;
+        {
+         if (approx && (e->slen & 0x80))
+          code = ENCODE_FALLBACK;
+         last = s;
+        }
        }
       else
        {
diff --git a/ext/Encode/encode.h b/ext/Encode/encode.h

index 604b97f..853ad04 100644 (file)
--- a/ext/Encode/encode.h
+++ b/ext/Encode/encode.h
@@ -28,12 +28,13 @@ struct encode_s
  
  #ifdef U8
  extern int do_encode(encpage_t *enc, const U8 *src, STRLEN *slen,
-                     U8 *dst, STRLEN dlen, STRLEN *dout);
+                     U8 *dst, STRLEN dlen, STRLEN *dout, int approx);
  
  extern void Encode_DefineEncoding(encode_t *enc);
  #endif
  
-#define ENCODE_NOSPACE 1
-#define ENCODE_PARTIAL 2
-#define ENCODE_NOREP   3
+#define ENCODE_NOSPACE  1
+#define ENCODE_PARTIAL  2
+#define ENCODE_NOREP    3
+#define ENCODE_FALLBACK 4
  #endif
author	Nick Ing-Simmons <nik@tiuk.ti.com>
	Sun, 31 Dec 2000 18:11:54 +0000 (18:11 +0000)
committer	Nick Ing-Simmons <nik@tiuk.ti.com>
	Sun, 31 Dec 2000 18:11:54 +0000 (18:11 +0000)
ext/Encode/Encode.pm		patch \| blob \| history
ext/Encode/Encode.xs		patch \| blob \| history
ext/Encode/compile		patch \| blob \| history
ext/Encode/encengine.c		patch \| blob \| history
ext/Encode/encode.h		patch \| blob \| history