Collect some stats during compile process.

author Nick Ing-Simmons <nik@tiuk.ti.com>

Wed, 30 Jan 2002 09:57:47 +0000 (09:57 +0000)

committer Nick Ing-Simmons <nik@tiuk.ti.com>

Wed, 30 Jan 2002 09:57:47 +0000 (09:57 +0000)
author Nick Ing-Simmons <nik@tiuk.ti.com>
Wed, 30 Jan 2002 09:57:47 +0000 (09:57 +0000)
committer Nick Ing-Simmons <nik@tiuk.ti.com>
Wed, 30 Jan 2002 09:57:47 +0000 (09:57 +0000)
diff --git a/ext/Encode/EUC_JP/Makefile.PL b/ext/Encode/EUC_JP/Makefile.PL

index d9ff3bc..2b51469 100644 (file)
--- a/ext/Encode/EUC_JP/Makefile.PL
+++ b/ext/Encode/EUC_JP/Makefile.PL
@@ -2,7 +2,9 @@ use 5.7.2;
  use strict;
  use ExtUtils::MakeMaker;
  
-my %tables = (EUC_JP  => ['euc-jp.ucm' ],
+my %tables = (EUC_JP  => ['euc-jp.ucm',
+              # 'euc-kr.ucm', 'euc-cn.ucm'
+              ],
               );
  
  
diff --git a/ext/Encode/compile b/ext/Encode/compile

index b5f659a..bb2683f 100755 (executable)
--- a/ext/Encode/compile
+++ b/ext/Encode/compile
@@ -117,6 +117,9 @@ else
  
  my %encoding;
  my %strings;
+my $saved = 0;
+my $subsave = 0;
+my $strings = 0;
  
  sub cmp_name
  {
@@ -156,6 +159,7 @@ foreach my $enc (sort cmp_name @encfiles)
  
  if ($doC)
   {
+  print STDERR "Writing compiled form\n";
    foreach my $name (sort cmp_name keys %encoding)
     {
      my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
@@ -214,6 +218,9 @@ END
     }
    close(D);
    close(H);
+  printf STDERR "%d bytes in string tables\n",$strings;
+  printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",$saved,100*$saved/$strings if $saved;
+  printf STDERR "%d bytes (%.3g%%) saved using substrings\n",$subsave,100*$subsave/$strings if $subsave;
   }
  elsif ($doEnc)
   {
@@ -235,6 +242,7 @@ elsif ($doUcm)
  close(C);
  
  
+
  sub compile_ucm
  {
   my ($fh,$name) = @_;
@@ -270,7 +278,7 @@ sub compile_ucm
     push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
     $erep = join('',map(chr(hex($_)),@byte));
    }
- print "Scanning $name ($cs)\n";
+ print "Reading $name ($cs)\n";
   my $nfb = 0;
   my $hfb = 0;
   while (<$fh>)
@@ -414,11 +422,17 @@ sub enter
    }
  }
  
+
+
  sub outstring
  {
   my ($fh,$name,$s) = @_;
   my $sym = $strings{$s};
- unless ($sym)
+ if ($sym)
+  {
+   $saved += length($s);
+  }
+ else
    {
     foreach my $o (keys %strings)
      {
@@ -427,10 +441,12 @@ sub outstring
        {
         $sym = $strings{$o};
         $sym .= sprintf("+0x%02x",$i) if ($i);
+       $subsave += length($s);
         return $sym;
        }
      }
     $strings{$s} = $sym = $name;
+   $strings += length($s);
     printf $fh "\nstatic const U8 %s[%d] =\n",$name,length($s);
     # Do in chunks of 16 chars to constrain line length
    # Assumes ANSI C adjacent string literal concatenation
diff --git a/ext/Encode/encode.h b/ext/Encode/encode.h

index aecc66e..f19cdc2 100644 (file)
--- a/ext/Encode/encode.h
+++ b/ext/Encode/encode.h
@@ -1,38 +1,77 @@
  #ifndef ENCODE_H
  #define ENCODE_H
+
  #ifndef U8
+/* A tad devious this:
+   perl normally has a #define for U8 - if that isn't present
+   then we typedef it - leaving it #ifndef so we can do data parts without
+   getting extern references to the code parts
+ */
  typedef unsigned char U8;
  #endif
  
  typedef struct encpage_s encpage_t;
  
+
  struct encpage_s
  {
- const U8   *seq;
- encpage_t  *next;
- U8         min;
- U8         max;
- U8         dlen;
- U8         slen;
+ /* fields ordered to pack nicely on 32-bit machines */
+ const U8   *seq;       /* Packed output sequences we generate if we match */
+ encpage_t  *next;      /* Page to go to if we match */
+ U8         min;        /* Min value of octet to match this entry */
+ U8         max;        /* Max value of octet to match this entry */
+ U8         dlen;       /* destination length - size of entries in seq */
+ U8         slen;       /* source length - number of source octets needed */
  };
  
+/*
+   At any point in a translation there is a page pointer which points at an array
+   of the above structures.
+
+   Basic operation :
+   get octet from source stream.
+   if (octet >= min && octet <= max) {
+      if slen is 0 then we cannot represent this character.
+      if we have less than slen octets (including this one) then we have a partial character.
+      otherwise
+       copy dlen octets from seq + dlen*(octet-min) to output
+       (dlen may be zero if we don't know yet.)
+       load page pointer with next to continue.
+       (if slen is one this is the end of a character)
+       get next octet.
+   }
+   else {
+      increment the page pointer to look at next slot in the array
+   }
+
+   arrays SHALL be constructed so there is an entry which matches ..0xFF at the end,
+   and either maps it or indicates no representation.
+
+   if MSB of slen is set then mapping is an approximate "FALLBACK" entry.
+
+*/
+
+
  typedef struct encode_s encode_t;
  struct encode_s
  {
- encpage_t  *t_utf8;
- encpage_t  *f_utf8;
- const U8   *rep;
- int        replen;
- U8         min_el;
- U8         max_el;
- const char *name[2];
+ encpage_t  *t_utf8;    /* Starting table for translation from the encoding to UTF-8 form */
+ encpage_t  *f_utf8;    /* Starting table for translation from UTF-8 to the encoding */
+ const U8   *rep;       /* Replacement character in this encoding e.g. "?" */
+ int        replen;     /* Number of octets to represent replacement character */
+ U8         min_el;     /* Minimum octets to represent a character */
+ U8         max_el;     /* Maximum octets to represent a character */
+ const char *name[2];   /* name(s) of this encoding */
  };
  
  #ifdef U8
+/* See comment at top of file for deviousness */
+
  extern int do_encode(encpage_t *enc, const U8 *src, STRLEN *slen,
                       U8 *dst, STRLEN dlen, STRLEN *dout, int approx);
  
  extern void Encode_DefineEncoding(encode_t *enc);
+
  #endif
  
  #define ENCODE_NOSPACE  1
author	Nick Ing-Simmons <nik@tiuk.ti.com>
	Wed, 30 Jan 2002 09:57:47 +0000 (09:57 +0000)
committer	Nick Ing-Simmons <nik@tiuk.ti.com>
	Wed, 30 Jan 2002 09:57:47 +0000 (09:57 +0000)
ext/Encode/EUC_JP/Makefile.PL		patch \| blob \| history
ext/Encode/compile		patch \| blob \| history
ext/Encode/encode.h		patch \| blob \| history