codepage/gensubset.pl

   1 #!/usr/bin/perl
   2 #
   3 # Generate a subset of the UnicodeData.txt file, available from
   4 # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
   5 #
   6 # Usage:
   7 #   gensubset.pl [subset files] < UnicodeData.txt > MiniUCD.txt
   8 #
   9
  10 %need_these = ();
  11
  12 # Mark as needed all the characters mentioned in the relevant files
  13 foreach $file (@ARGV) {
  14     open(F, '<', $file) or die;
  15     while (defined($line = <F>)) {
  16         $line =~ s/\s*(\#.*|)$//; # Remove comments and final blanks
  17         @f = split(/\s+/, $line);
  18         next if (scalar @f != 2);
  19         $need_these{hex $f[1]}++;
  20     }
  21     close(F);
  22 }
  23
  24 # Also mark as needed any case variants of those
  25 # (Note: this doesn't necessarily provide the full transitive closure,
  26 # but we shouldn't need it.)
  27 while (defined($line = <STDIN>)) {
  28     @f = split(/;/, $line);
  29     if ($f[0] =~ /^([0-9a-f]+)$/i) {
  30         $r = hex $f[0];
  31         if ($need_these{$r}) {
  32             $need_these{hex $f[12]}++ if ($f[12] ne '');
  33             $need_these{hex $f[13]}++ if ($f[13] ne '');
  34             $need_these{hex $f[14]}++ if ($f[14] ne '');
  35         }
  36     }
  37 }
  38
  39 # Finally, write out the subset
  40 seek(STDIN, 0, 0);
  41 while (defined($line = <STDIN>)) {
  42     ($v, $l) = split(/;/, $line, 2);
  43     if ($v =~ /^([0-9a-f]+)\-([0-9a-f]+)$/i) {
  44         # This isn't actually the format... fix that if it ever matters
  45         $r1 = hex $1;
  46         $r2 = hex $2;
  47     } elsif ($v =~ /^([0-9a-f]+)$/i) {
  48         $r1 = $r2 = hex $1;
  49     } else {
  50         next;
  51     }
  52     for ($r = $r1; $r <= $r2; $r++) {
  53         printf "%04X;%s", $r, $l if ($need_these{$r});
  54     }
  55 }
  56
  57