1 #!@perl_bindir@/perl -w
3 # Structurally diffs two SGML/XML files.
4 # Copyright (C) 2000 Frederik Fouvry
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # Send bug reports, comments, suggestions, improvements etc. to
21 # Frederik Fouvry <fouvry@sfs.nphil.uni-tuebingen.de>.
25 use vars qw($nsgmls $errors $errorlog $VERSION);
26 use Getopt::Long 2.01;
31 $nsgmls = "@jade_bindir@/nsgmls";
33 #----------------------------------------------------------------------
35 # - add text occurrences to diff file, such that diff has a bit more
36 # context; it might help in some cases (and perhaps ruin it in others).
37 # - Wait for suggestions ;-)
39 # Note: the input file need not be valid, nor is it necessary to have
40 # the DTDs. nsgmls always returns a structure.
41 #----------------------------------------------------------------------
44 chomp(my $progname = `basename $0`);
46 my ($opt_a, $opt_s, $opt_h, $opt_v, $opt_c) = (0, 0, 0, 0, "");
47 &GetOptions("h|help" => \$opt_h,
48 "v|version" => \$opt_v,
49 "s|statistics!" => \$opt_s,
50 "a|attributes!" => \$opt_a,
51 "c|context=s" => \$opt_c);
52 # -a includes the attribute values in the diff
53 # -s prints external entity information at the end
56 # -c add some context to improve the diff results
59 print STDOUT "DocBook-utils version @VERSION@ (sgmldiff v$VERSION)\n";
63 # Check number of arguments
64 if ($opt_h == 1 || @ARGV != 2) {
65 print STDERR "DocBook-utils version @VERSION@ (sgmldiff v$VERSION)
67 Usage: $progname [options] file1 file2
68 where the options are:
69 -a, --attributes includes the attribute values in the diff
70 -s, --statistics prints some SGML information at the end
71 -h, --help prints this usage information
72 -v, --version prints the version on the standard output
73 -c, --context adds more context to the diff, which may
74 improve the results. It takes \"attributes\", \"textpos\"
75 or \"nesting\" as an argument e.g. -c textpos, which can
76 also be combined: -c nesting,attributes
82 $opt_a = 1 if $opt_c =~ /attributes/; # -c attributes = -a
86 my $diff1 = "$file1.difftmp$$";
88 my $diff2 = "$file2.difftmp$$";
89 $errors = "-E0 -e -g"; # allow any number of errors
90 # and show precise context position of error
91 $errorlog = "-f /dev/null";
94 # Get structure of the files
95 my ($lines1,@allfile1) = &prepare($file1, $diff1);
96 my ($lines2,@allfile2) = &prepare($file2, $diff2);
97 my @lines1 = split(/@/,$lines1);
98 my @lines2 = split(/@/,$lines2);
100 # Do diff and rebuild the original input
101 open(SDIFF,"diff $diff1 $diff2 |");
103 while (defined($_)) {
105 my ($start1, $start2, $command, $d1, $d2, $end1, $end2);
107 if ($_ =~ /^(\d+)(,(\d+))?([acd])(\d+)(,(\d+))?$/) {
111 if (defined $3) { $d1 = $3-$1; } else { $d1 = 0; };
112 if (defined $7) { $d2 = $7-$5; } else { $d2 = 0; };
116 print "$lines1[$start1]"
117 .($lines1[$end1] > $lines1[$start1] ? ",$lines1[$end1]" : "")
118 ."$command$lines2[$start2]"
119 .($lines2[$end2] > $lines2[$start2] ? ",$lines2[$end2]" : "")
121 # Print lines of first file
123 while (defined $_ && /^< /) {
124 print &normalise_text($allfile1[$start1++],"< ");
129 # Print lines of second file
130 $_ = <SDIFF> if defined($_) && $_ =~ /^---$/;
131 while (defined $_ && /^> /) {
132 print &normalise_text($allfile2[$start2++],"> ");
143 #---------------------------------------------------------------------
144 # Process nsgmls output: keep all stuff that is important for the
145 # structure comparison. Make two structures: one that is diffed
146 # (without text) (DIFF) and one that is used to present the
147 # differences to the user (@full). For more info: see SP
148 # documentation, nsgmls output format.
151 my($filename,$todiffname) = @_;
155 my ($system_identifier, $public_identifier, $f_info, $empty) = ("", "", "");
156 my %statistics = (notation => {},
161 my @line_numbered = ();
164 open(ESIS, "$nsgmls -l $errors $errorlog -onotation-sysid -oid -oempty $filename | "); #-oentity generates strange output; ? -ononsgml
165 open(DIFF, "> $todiffname");
168 if ($_ =~ /^\((.+)$/) {
169 print DIFF "$indent<$1";
170 print DIFF " ".join(" ",@attributes)
171 if (@attributes > 0 && defined($opt_a) && $opt_a == 1);
173 push @line_numbered, "$line";
174 push @full, "$indent<$1".(@attributes > 0 ? " ".join(" ",@attributes) : "").">\n";
176 $indent .= " " if $opt_c =~ /nesting/;
177 } elsif ($_ =~ /^\)(.+)$/) {
179 $indent = substr($indent,0,-1) if $opt_c =~ /nesting/;
180 push @line_numbered, "$line" unless $empty;
181 push @full, "$indent</$gi>\n" unless $empty;
182 print DIFF "$indent</$gi>\n" unless $empty;
184 } elsif ($_ =~ /^-(.*)$/) {
186 my @a = split(/\\n/, $data);
187 push @line_numbered, "$line";
188 push @full, "$data\n";
190 print DIFF ($opt_c =~ /textpos/ ? "-" : "")."\n";
191 } elsif ($_ =~ /^\&(.*)$/) {
193 push @line_numbered, "$line";
195 } elsif ($_ =~ /^\?(.*)$/) {
196 print DIFF "<?$1>\n";
197 push @line_numbered, "$line";
198 push @full, "<?$1>\n";
199 } elsif ($_ =~ /^A(\S+)\s+(IMPLIED|CDATA (.*)|NOTATION (.*)|ENTITY (.*)|TOKEN (.*)|ID (.*))$/) {
202 if ($val eq "IMPLIED") {
203 # don't print anything
204 } elsif ($val =~ /^CDATA (.*)$/) {
205 @attributes = (@attributes, "$attr=\"$1\"");
206 } elsif ($val =~ /^NOTATION (.*)$/) {
207 @attributes = (@attributes, "$attr=\"$1\"");
208 } elsif ($val =~ /^ENTITY (.*)$/) {
209 @attributes = (@attributes, "$attr=\"$1\"");
210 } elsif ($val =~ /^TOKEN (.*)$/) {
211 @attributes = (@attributes, "$attr=\"$1\"");
212 } elsif ($val =~ /^ID (.*)$/) {
213 @attributes = (@attributes, "$attr=\"$1\"");
215 warn "Unrecognised construction `$val'";
217 } elsif ($_ =~ /^D(\S+)\s+(IMPLIED|CDATA (.*)|NOTATION (.*)|ENTITY (.*)|TOKEN (.*)|ID (.*))$/) {
218 # as yet never printed out
222 if ($val eq "IMPLIED") {
223 # don't print anything
224 } elsif ($val =~ /^CDATA (.*)$/) {
225 @e_attributes = (@e_attributes, "$attr=\"$1\"");
226 } elsif ($val =~ /^NOTATION (.*)$/) {
227 @e_attributes = (@e_attributes, "$attr=\"$1\"");
228 } elsif ($val =~ /^ENTITY (.*)$/) {
229 @e_attributes = (@e_attributes, "$attr=\"$1\"");
230 } elsif ($val =~ /^TOKEN (.*)$/) {
231 @e_attributes = (@e_attributes, "$attr=\"$1\"");
232 } elsif ($val =~ /^ID (.*)$/) {
233 @e_attributes = (@e_attributes, "$attr=\"$1\"");
235 warn "Unrecognised construction `$val'";
238 } elsif ($_ =~ /^a(\S+)\s+(\S+)\s+(.*)$/) {
240 } elsif ($_ =~ /^N(.*)$/) {
241 $statistics{notation}->{$1}->{pubid} = "$public_identifier"
242 unless $public_identifier eq "";
243 $statistics{notation}->{$1}->{sysid} = "$system_identifier"
244 unless $system_identifier eq "";
245 $statistics{notation}->{$1}->{emsysid} = "$f_info"
246 unless $f_info eq "";
247 $system_identifier = "";
248 $public_identifier = "";
250 } elsif ($_ =~ /^E(\S+)\s+(CDATA|NDATA|SDATA)\s+(.*)$/) {
251 $statistics{external_data}->{$1}->{pubid} = "$public_identifier $2 $3"
252 unless $public_identifier eq "";
253 $statistics{external_data}->{$1}->{sysid} = "$system_identifier $2 $3"
254 unless $system_identifier eq "";
255 $statistics{external_data}->{$1}->{emsysid} = "$f_info"
256 unless $f_info eq "";
257 $system_identifier = "";
258 $public_identifier = "";
260 } elsif ($_ =~ /^I(\S+)\s+(CDATA|SDATA|PI|TEXT)\s+(.*)$/) {
264 if ($typ =~ /^CDATA$/) {
266 push @line_numbered, "$line";
268 } elsif ($typ =~ /^SDATA$/) {
270 } elsif ($typ =~ /^PI$/) {
272 } elsif ($typ =~ /^TEXT$/) {
277 } elsif ($_ =~ /^S(.*)$/) {
278 $statistics{subdocument}->{$1}->{pubid} = "$public_identifier"
279 unless $public_identifier eq "";
280 $statistics{subdocument}->{$1}->{sysid} = "$system_identifier"
281 unless $system_identifier eq "";
282 $statistics{subdocument}->{$1}->{emsysid} = "$f_info"
283 unless $f_info eq "";
284 $system_identifier = "";
285 $public_identifier = "";
287 } elsif ($_ =~ /^T(.*)$/) {
288 $statistics{text}->{$1}->{pubid} = "$public_identifier"
289 unless $public_identifier eq "";
290 $statistics{text}->{$1}->{sysid} = "$system_identifier"
291 unless $system_identifier eq "";
292 $statistics{text}->{$1}->{emsysid} = "$f_info"
293 unless $f_info eq "";
294 $system_identifier = "";
295 $public_identifier = "";
297 } elsif ($_ =~ /^s(.*)$/) {
298 $system_identifier = "$1";
299 } elsif ($_ =~ /^p(.*)$/) {
300 $public_identifier = "$1";
301 } elsif ($_ =~ /^f(.*)$/) {
303 } elsif ($_ =~ /^{(.*)$/) {
305 } elsif ($_ =~ /^}(.*)$/) {
307 } elsif ($_ =~ /^L((\d+)( (.+))?)$/) {
309 # only line is set; nothing else is done
310 # print DIFF "----------$4----------\n" if defined($4);
311 # push @full, (defined($4) ? "----------$4----------\n" : "")."L$line\n";
312 # push @line_numbered, "$line";
313 $statistics{files}->{$4} = 1 if defined $4;
314 } elsif ($_ =~ /^#(.*)$/) {
316 } elsif ($_ =~ /^C$/) {
317 print STDERR "====================\n";
318 print STDERR "The file `$filename' is a valid document.\n";
319 } elsif ($_ =~ /^i$/) {
321 # only output with the option -oincluded
322 # for elements that are allowed by inclusion exception
323 } elsif ($_ =~ /^e$/) {
325 # only output with the option -oempty
327 warn "Unrecognised construction `$_'";
332 if (defined $opt_s && $opt_s == 1) {
333 print STDERR "--------------------\n";
334 print STDERR "Used SGML text files:\n"
335 unless keys(%{$statistics{files}}) == 0;
336 foreach my $f (keys %{$statistics{files}}) {
337 print STDERR " $f\n";
339 delete $statistics{files};
341 foreach my $k (keys %statistics) {
343 if ($k eq "external_data") { $stat_text1 .= "<!ENTITY"; }
344 elsif ($k eq "notation") { $stat_text1 .= "<!NOTATION"; }
345 elsif ($k eq "subdocument") { $stat_text1 .= "<!SUBDOC"; }
346 elsif ($k eq "text") { $stat_text1 .= "Entity"; }
347 else { die "Wrong statistics value"};
348 foreach my $l (keys %{$statistics{$k}}) {
349 $stat_text .= "$stat_text1 $l ";
350 foreach my $m (keys %{$statistics{$k}->{$l}}) {
351 my $value = "$statistics{$k}->{$l}->{$m}";
352 if ($m eq "pubid" && defined($value)) {
353 $stat_text .= "PUBLIC \"$value\">\n";
354 } elsif ($m eq "sysid" && defined($value)) {
355 $stat_text .= "SYSTEM \"$value\">\n";
358 my $value = $statistics{$k}->{$l}->{emsysid};
359 if (defined($value) && $value ne "") {
360 $value =~ s/^<(.*)>(.*)/$2/o, my $si = $1;
361 $si =~ s/^osfile$/FILE/io;
362 $stat_text .= " Full name of system identifier ($si) actually referred to:\n \"$value\"\n";
364 $stat_text .= " No system identifier could be generated\n";
368 if ($stat_text ne "") {
369 print STDERR "SGML information for `$filename':\n$stat_text";
371 print STDERR "No SGML information for `$filename'\n"
374 return (join("@",@line_numbered),@full);
377 #----------------------------------------------------------------------
378 # Normalise data text from nsgmls (i.e. don't print the escaped text).
380 my($string,$prefix) = @_;
381 my $result = "$prefix";
383 my ($char,$state,$c);
385 @string = split(//,$string);
386 foreach $c (@string) {
387 if (defined $state) { # we're in an escape sequence
388 if ($state eq "escape") { # which just started
389 if ($c eq "\\") { # slash
392 } elsif ($c eq "|") { # pipe
393 warn "Unresolved SDATA ";
396 } elsif ($c eq "n") { # newline
397 $result .= "\n$prefix";
399 } elsif ($c eq "%" || $c eq "#") { # character
400 # `\\#\d+;' is character number in internal character set
401 # (if not representable by output encoding)
402 # `\\%\d+;' is character number in document character set
403 # (numeric char ref to non-SGML chars in fixed char set mode)
406 } elsif ($c =~ /^[0-7]$/) { # character
410 die "Unrecognised construction";
412 } elsif ($state eq "decchar") { # reading a character code
416 $result .= chr($char);
420 } elsif ($state eq "octchar") { # reading a charactre code
421 if (length($char) < 2) {
423 } else { # length == 2
424 $result .= chr(oct($char.$c));
429 die "State `$state' does not exist, stopped ";
431 } elsif ($c eq "\\") { # an escape starts
433 } else { # normal case
441 #----------------------------------------------------------------------
444 warn "`$a' not implemented yet.
445 Please send a message to the maintainer (see source file) and include
446 an example (e.g. the input that caused this message)";