4 $version = '$Revision: 1.14 $'; # Module revision, kept as an RCS/CVS-style $Revision$ keyword string.
8 SGMLS - class for postprocessing the output from the B<sgmls> and
15 my $parse = new SGMLS(STDIN);
17 my $event = $parse->next_event;
22 ($event->type eq 'start_element') && do {
23 my $element = $event->data; # An object of class SGMLS_Element
24 [[your code for the beginning of an element]]
28 ($event->type eq 'end_element') && do {
29 my $element = $event->data; # An object of class SGMLS_Element
30 [[your code for the end of an element]]
34 ($event->type eq 'cdata') && do {
35 my $cdata = $event->data; # A string
36 [[your code for character data]]
40 ($event->type eq 'sdata') && do {
41 my $sdata = $event->data; # A string
42 [[your code for system data]]
46 ($event->type eq 're') && do {
47 [[your code for a record end]]
51 ($event->type eq 'pi') && do {
52 my $pi = $event->data; # A string
53 [[your code for a processing instruction]]
57 ($event->type eq 'entity') && do {
58 my $entity = $event->data; # An object of class SGMLS_Entity
59 [[your code for an external entity]]
63 ($event->type eq 'start_subdoc') && do {
64 my $entity = $event->data; # An object of class SGMLS_Entity
65 [[your code for the beginning of a subdoc entity]]
69 ($event->type eq 'end_subdoc') && do {
70 my $entity = $event->data; # An object of class SGMLS_Entity
71 [[your code for the end of a subdoc entity]]
75 ($event->type eq 'conforming') && do {
76 [[your code for a conforming document]]
80 die "Internal error: unknown event type " . $event->type . "\n";
83 $event = $parse->next_event;
88 The B<SGMLS> package consists of several related classes: see
89 L<"SGMLS">, L<"SGMLS_Event">, L<"SGMLS_Element">,
90 L<"SGMLS_Attribute">, L<"SGMLS_Notation">, and L<"SGMLS_Entity">. All
91 of these classes are available when you specify
95 Generally, the only object which you will create explicitly will
96 belong to the C<SGMLS> class; all of the others will then be created
97 automatically for you over the course of the parse. Much fuller
98 documentation is available in the C<.sgml> files in the C<DOC/>
99 directory of the C<SGMLS.pm> distribution.
101 =head2 The C<SGMLS> class
103 This class holds a single parse. When you create an instance of it,
104 you specify a file handle as an argument (if you are reading the
105 output of B<sgmls> or B<nsgmls> from a pipe, the file handle will
106 ordinarily be C<STDIN>):
108 my $parse = new SGMLS(STDIN);
110 The most important method for this class is C<next_event>, which reads
111 and returns the next major event from the input stream. It is
112 important to note that the C<SGMLS> class deals with most B<ESIS>
113 events itself: attributes and entity definitions, for example, are
114 collected and stored automatically and invisibly to the user. The
115 following list contains all of the methods for the C<SGMLS> class:
117 =item C<next_event()>: Return an C<SGMLS_Event> object containing the
118 next major event from the SGML parse.
120 =item C<element()>: Return an C<SGMLS_Element> object containing the
121 current element in the document.
123 =item C<file()>: Return a string containing the name of the current
124 SGML source file (this will work only if the C<-l> option was given to
125 B<sgmls> or B<nsgmls>).
127 =item C<line()>: Return a string containing the current line number
128 from the source file (this will work only if the C<-l> option was
129 given to B<sgmls> or B<nsgmls>).
131 =item C<appinfo()>: Return a string containing the C<APPINFO>
132 parameter (if any) from the SGML declaration.
134 =item C<notation(NNAME)>: Return an C<SGMLS_Notation> object
135 representing the notation named C<NNAME>. With newer versions of
136 B<nsgmls>, all notations are available; otherwise, only the notations
137 which are actually used will be available.
139 =item C<entity(ENAME)>: Return an C<SGMLS_Entity> object representing
140 the entity named C<ENAME>. With newer versions of B<nsgmls>, all
141 entities are available; otherwise, only external data entities and
142 internal entities used as attribute values will be available.
144 =item C<ext()>: Return a reference to an associative array for
145 user-defined extensions.
148 =head2 The C<SGMLS_Event> class
150 This class holds a single major event, as generated by the
151 C<next_event> method in the C<SGMLS> class. It uses the following
154 =item C<type()>: Return a string describing the type of event:
155 "start_element", "end_element", "cdata", "sdata", "re", "pi",
156 "entity", "start_subdoc", "end_subdoc", and "conforming". See
157 L<"SYNOPSIS">, above, for the values associated with each of these.
159 =item C<data()>: Return the data associated with the current event (if
160 any). For "start_element" and "end_element", returns an
161 C<SGMLS_Element> object; for "entity", "start_subdoc", and
162 "end_subdoc", returns an C<SGMLS_Entity> object; for "cdata", "sdata",
163 and "pi", returns a string; and for "re" and "conforming", returns the
164 empty string. See L<"SYNOPSIS">, above, for an example of this
167 =item C<key()>: Return a string key to the event, such as an element
168 or entity name (otherwise, the same as C<data()>).
170 =item C<file()>: Return the current file name, as in the C<SGMLS>
173 =item C<line()>: Return the current line number, as in the C<SGMLS>
176 =item C<element()>: Return the current element, as in the C<SGMLS>
179 =item C<parse()>: Return the C<SGMLS> object which generated the
182 =item C<entity(ENAME)>: Look up an entity, as in the C<SGMLS> class.
184 =item C<notation(NNAME)>: Look up a notation, as in the C<SGMLS>
187 =item C<ext()>: Return a reference to an associative array for
188 user-defined extensions.
191 =head2 The C<SGMLS_Element> class
193 This class is used for elements, and contains all associated
194 information (such as the element's attributes). It recognises the
197 =item C<name()>: Return a string containing the name, or Generic
198 Identifier, of the element, in upper case.
200 =item C<parent()>: Return the C<SGMLS_Element> object for the
201 element's parent (if any).
203 =item C<parse()>: Return the C<SGMLS> object for the current parse.
205 =item C<attributes()>: Return a reference to an associative array of
206 attribute names and C<SGMLS_Attribute> structures. Attribute names
207 will be all in upper case.
209 =item C<attribute_names()>: Return an array of strings containing the
210 names of all attributes defined for the current element, in upper
213 =item C<attribute(ANAME)>: Return the C<SGMLS_Attribute> structure for
214 the attribute C<ANAME>.
216 =item C<set_attribute(ATTRIB)>: Add the C<SGMLS_Attribute> object
217 C<ATTRIB> to the current element, replacing any other attribute
218 structure with the same name.
220 =item C<in(GI)>: Return C<true> (ie. 1) if the string C<GI> is the
221 name of the current element's parent, or C<false> (ie. 0) if it is
224 =item C<within(GI)>: Return C<true> (ie. 1) if the string C<GI> is the
225 name of any of the ancestors of the current element, or C<false>
226 (ie. 0) if it is not.
228 =item C<ext()>: Return a reference to an associative array for
229 user-defined extensions.
232 =head2 The C<SGMLS_Attribute> class
234 Each instance of an attribute for each C<SGMLS_Element> is an object
235 belonging to this class, which recognises the following methods:
237 =item C<name()>: Return a string containing the name of the current
238 attribute, all in upper case.
240 =item C<type()>: Return a string containing the type of the current
241 attribute, all in upper case. Available types are "IMPLIED", "CDATA",
242 "NOTATION", "ENTITY", and "TOKEN".
244 =item C<value()>: Return the value of the current attribute, if any.
245 This will be an empty string if the type is "IMPLIED", a string of
246 some sort if the type is "CDATA" or "TOKEN" (if it is "TOKEN", you may
247 want to split the string into a series of separate tokens), an
248 C<SGMLS_Notation> object if the type is "NOTATION", or an
249 C<SGMLS_Entity> object if the type is "ENTITY". Note that if the
250 type is "CDATA", the value will I<not> have escape sequences for 8-bit
251 characters, record ends, or SDATA processed -- that will be your
254 =item C<is_implied()>: Return C<true> (ie. 1) if the value of the
255 attribute is implied, or C<false> (ie. 0) if it is specified in the
258 =item C<set_type(TYPE)>: Change the type of the attribute to the
259 string C<TYPE> (which should be all in upper case). Available types
260 are "IMPLIED", "CDATA", "NOTATION", "ENTITY", and "TOKEN".
262 =item C<set_value(VALUE)>: Change the value of the attribute to
263 C<VALUE>, which may be a string, an C<SGMLS_Entity> object, or an
264 C<SGMLS_Notation> object, depending on the attribute's type.
266 =item C<ext()>: Return a reference to an associative array available
267 for user-defined extensions.
270 =head2 The C<SGMLS_Notation> class
272 All declared notations appear as objects belonging to this class,
273 which recognises the following methods:
275 =item C<name()>: Return a string containing the name of the notation.
277 =item C<sysid()>: Return a string containing the system identifier of
278 the notation, if any.
280 =item C<pubid()>: Return a string containing the public identifier of
281 the notation, if any.
283 =item C<ext()>: Return a reference to an associative array available
284 for user-defined extensions.
287 =head2 The C<SGMLS_Entity> class
289 All declared entities appear as objects belonging to this class, which
290 recognises the following methods:
292 =item C<name()>: Return a string containing the name of the entity, in
295 =item C<type()>: Return a string containing the type of the entity, in
296 upper case. Available types are "CDATA", "SDATA", "NDATA" (external
297 entities only), "SUBDOC", "PI" (newer versions of B<nsgmls> only), or
298 "TEXT" (newer versions of B<nsgmls> only).
300 =item C<value()>: Return a string containing the value of the entity,
303 =item C<sysid()>: Return a string containing the system identifier of
304 the entity (if any), if it is external.
306 =item C<pubid()>: Return a string containing the public identifier of
307 the entity (if any), if it is external.
309 =item C<filenames()>: Return an array of strings containing any file
310 names generated from the identifiers, if the entity is external.
312 =item C<notation()>: Return the C<SGMLS_Notation> object associated
313 with the entity, if it is external.
315 =item C<data_attributes()>: Return a reference to an associative array
316 of data attribute names (in upper case) and the associated
317 C<SGMLS_Attribute> objects for the current entity.
319 =item C<data_attribute_names()>: Return an array of data attribute
320 names (in upper case) for the current entity.
322 =item C<data_attribute(ANAME)>: Return the C<SGMLS_Attribute> object
323 for the data attribute named C<ANAME> for the current entity.
325 =item C<set_data_attribute(ATTRIB)>: Add the C<SGMLS_Attribute> object
326 C<ATTRIB> to the current entity, replacing any other data attribute
329 =item C<ext()>: Return a reference to an associative array for
330 user-defined extensions.
333 =head1 AUTHOR AND COPYRIGHT
335 Copyright 1994 and 1995 by David Megginson,
336 C<dmeggins@aix1.uottawa.ca>. Distributed under the terms of the GNU
337 General Public License (version 2, 1991) -- see the file C<COPYING>
338 which is included in the B<SGMLS.pm> distribution.
343 L<SGMLS::Output> and L<SGMLS::Refs>.
348 # Data class for a single SGMLS ESIS output event. The object will
349 # keep information about its own current element and, if available,
350 # the source file and line where the event appeared.
352 # Event types are as follows:
354 # -------------------------------------------------------
355 # 'start_element' SGMLS_Element
356 # 'end_element' SGMLS_Element
361 # 'entity' SGMLS_Entity
362 # 'start_subdoc' SGMLS_Entity
363 # 'end_subdoc' SGMLS_Entity
364 # 'conforming' [none]
370 my ($class,$type,$data,$parse) = @_;
381 sub type { return $_[0]->[0]; } # Element 0 of the event array; per the POD, the event type string (e.g. 'start_element').
382 sub data { return $_[0]->[1]; } # Element 1; per the POD, the event's data (an element, an entity, or a string).
383 sub file { return $_[0]->[2]; } # Element 2; per the POD, the current file name -- constructor body not visible here, TODO confirm.
384 sub line { return $_[0]->[3]; } # Element 3; per the POD, the current line number -- TODO confirm against the constructor.
385 sub element { return $_[0]->[4]; } # Element 4; per the POD, the current element at the time of the event.
386 sub parse { return $_[0]->[5]; } # Element 5; the parse object (entity() and notation() delegate their lookups to it).
387 sub ext { return $_[0]->[6]; } # Element 6; per the POD, a hash reference for user-defined extensions.
388 # Generate a key for the event.
391 if (ref($self->data) eq SGMLS_Element ||
392 ref($self->data) eq SGMLS_Entity) {
393 return $self->data->name;
398 # Look up an entity in the parse.
400 my ($self,$ename) = (@_);
401 return $self->parse->entity($ename);
403 # Look up a notation in the parse.
405 my ($self,$nname) = (@_);
406 return $self->parse->notation($nname);
411 # Data class for a single SGML attribute. The object will know its
412 # type, and will keep a value unless the type is 'IMPLIED', in which
413 # case no meaningful value is available.
415 # Attribute types are as follows:
417 # ---------------------------------------
420 # NOTATION SGMLS_Notation
421 # ENTITY SGMLS_Entity
424 package SGMLS_Attribute;
428 my ($class,$name,$type,$value) = @_;
429 return bless [$name,$type,$value,{}];
432 sub name { return $_[0]->[0]; } # Attribute name: slot 0 of the array blessed by the constructor.
433 sub type { return $_[0]->[1]; } # Attribute type string: slot 1 (is_implied compares it with 'IMPLIED').
434 sub value { return $_[0]->[2]; } # Attribute value exactly as handed to the constructor: slot 2.
435 sub ext { return $_[0]->[3]; } # Slot 3: hash reference the constructor creates empty, for user extensions.
436 # Return 1 if the value is implied.
439 return ($self->type eq 'IMPLIED');
441 # Set the attribute's type.
443 my ($self,$type) = @_;
447 # Set the attribute's value.
449 my ($self,$value) = @_;
455 # Data class for a single element of an SGML document. The object will not
456 # know about its children (data or other elements), but it keeps track of its
457 # parent and its attributes.
459 package SGMLS_Element;
463 my ($class,$name,$parent,$attributes,$parse) = @_;
464 return bless [$name,$parent,$attributes,$parse,{}];
467 sub name { return $_[0]->[0]; } # Element name: slot 0 of the array blessed by the constructor.
468 sub parent { return $_[0]->[1]; } # Slot 1: the parent element; callers test it for truth, so it may be a false value.
469 sub parse { return $_[0]->[3]; } # Slot 3: the owning parse object (slot 2 holds the attributes and has no plain accessor here).
470 sub ext { return $_[0]->[4]; } # Slot 4: hash reference the constructor creates empty, for user extensions.
472 # Return the associative array of
473 # attributes, parsing it the first
477 if (ref($self->[2]) eq 'ARRAY') {
479 foreach (@{$self->[2]}) {
480 /^(\S+) (IMPLIED|CDATA|NOTATION|ENTITY|TOKEN)( (.*))?$/
481 || croak "Bad attribute event data: $_";
482 my ($name,$type,$value) = ($1,$2,$4);
483 if ($type eq 'NOTATION') {
484 $value = $self->parse->notation($value);
485 } elsif ($type eq 'ENTITY') {
486 $value = $self->parse->entity($value);
489 new SGMLS_Attribute($name,$type,$value);
495 # Return a list of attribute names.
496 sub attribute_names {
498 return keys(%{$self->attributes});
500 # Find an attribute by name.
502 my ($self,$aname) = @_;
503 return $self->attributes->{$aname};
505 # Add a new attribute.
507 my ($self,$attribute) = @_;
508 $self->attributes->{$attribute->name} = $attribute;
510 # Check parent by name.
512 my ($self,$name) = @_;
513 if ($self->parent && $self->parent->name eq $name) {
514 return $self->parent;
519 # Check ancestors by name.
521 my ($self,$name) = @_;
522 for ($self = $self->parent; $self; $self = $self->parent) {
523 return $self if ($self->name eq $name);
530 # Data class for an SGML notation. The only information available
531 # will be the name, the sysid, and the pubid -- the rest is up to the
532 # processing application.
534 package SGMLS_Notation;
538 my ($class,$name,$sysid,$pubid) = @_;
539 return bless [$name,$sysid,$pubid,{}];
542 sub name { return $_[0]->[0]; } # Notation name: slot 0 of the array blessed by the constructor.
543 sub sysid { return $_[0]->[1]; } # Slot 1: system identifier as handed to the constructor.
544 sub pubid { return $_[0]->[2]; } # Slot 2: public identifier as handed to the constructor.
545 sub ext { return $_[0]->[3]; } # Slot 3: hash reference the constructor creates empty, for user extensions.
548 # Data class for a single SGML entity. All entities will have a name
549 # and a type. Internal entities will be of type CDATA or SDATA only,
550 # and will have a value rather than a notation and sysid/pubid. External
551 # CDATA, NDATA, and SDATA entities will always have notations attached,
552 # and SUBDOC entities are always external (and will be parsed by SGMLS).
554 # Entity types are as follows:
555 # Type Internal External
556 # -----------------------------------------------------------
561 # (newer versions of NSGMLS only:)
565 package SGMLS_Entity;
569 my ($class,$name,$type,$value,$sysid,$pubid,$filenames,$notation) = @_;
570 return bless [$name,$type,$value,{},$sysid,$pubid,$filenames,$notation,{}];
573 sub name { return $_[0]->[0]; } # Entity name: slot 0 of the array blessed by the constructor.
574 sub type { return $_[0]->[1]; } # Slot 1: entity type string as handed to the constructor.
575 sub value { return $_[0]->[2]; } # Slot 2: entity value as handed to the constructor.
576 sub data_attributes { return $_[0]->[3]; } # Slot 3: hash reference (created empty) that set_data_attribute fills, keyed by attribute name.
577 sub sysid { return $_[0]->[4]; } # Slot 4: system identifier; undef when the constructor was not given one.
578 sub pubid { return $_[0]->[5]; } # Slot 5: public identifier; undef when the constructor was not given one.
579 sub filenames { return $_[0]->[6]; } # Slot 6: the $filenames argument as stored (the parser passes an array reference), not a flattened list.
580 sub notation { return $_[0]->[7]; } # Slot 7: the $notation argument as stored; undef when none was given.
581 sub ext { return $_[0]->[8]; } # Slot 8: hash reference the constructor creates empty, for user extensions.
582 # Return a list of data-attribute names.
583 sub data_attribute_names {
585 return keys(%{$self->data_attributes});
587 # Find a data attribute by name.
589 my ($self,$aname) = @_;
590 return $self->data_attributes->{$aname};
592 # Add a new data attribute.
593 sub set_data_attribute {
594 my ($self,$data_attribute) = @_;
595 $self->data_attributes()->{$data_attribute->name} = $data_attribute;
601 # Data class for a single SGMLS parse. The constructor takes a single
602 # argument, a file handle from which the SGMLS ESIS events will be read
603 # (it may be a pipe, a fifo, a file, a socket, etc.). It is essential
604 # that no two SGMLS objects have the same handle.
609 my ($class,$handle) = @_;
611 # Force unqualified filehandles into caller's package
612 my ($package) = caller;
613 $handle =~ s/^[^':]+$/$package\:\:$&/;
618 'current_element' => '',
619 'current_attributes' => [],
620 'current_entities' => {},
621 'entity_stack' => [],
622 'current_notations' => {},
623 'notation_stack' => [],
624 'current_sysid' => '',
625 'current_pubid' => '',
626 'current_filenames' => [],
627 'current_file' => '',
628 'current_line' => '',
634 sub element { return $_[0]->{'current_element'}; } # Innermost open element; the constructor initialises this to ''.
635 sub file { return $_[0]->{'current_file'}; } # File name most recently taken from an 'L' record ('' initially).
636 sub line { return $_[0]->{'current_line'}; } # Line number most recently taken from an 'L' record ('' initially).
637 sub appinfo { return $_[0]->{'appinfo'}; } # Value stored when a '#' (APPINFO) record is read.
638 sub ext { return $_[0]->{'ext'}; } # Value under the 'ext' key; per the POD, a hash reference for user-defined extensions.
640 # Given its name, look up a notation.
642 my ($self,$nname) = @_;
643 return $self->{'current_notations'}->{$nname};
645 # Given its name, look up an entity.
647 my ($self,$ename) = @_;
648 return $self->{'current_entities'}->{$ename};
651 # Return the next SGMLS_Event, or ''
652 # if the document has finished.
655 my $handle = $self->{'handle'};
657 # If there are any queued up events,
659 if ($#{$self->{event_stack}} >= 0) {
660 return pop @{$self->{event_stack}};
663 dispatch: while (!eof($handle)) {
665 my $c = getc($handle);
666 my $data = <$handle>;
669 ($c eq '(') && do { # start an element
670 $self->{'current_element'} =
671 new SGMLS_Element($data,
672 $self->{'current_element'},
673 $self->{'current_attributes'},
675 $self->{'current_attributes'} = [];
676 return new SGMLS_Event('start_element',
677 $self->{'current_element'},
681 ($c eq ')') && do { # end an element
682 my $old = $self->{'current_element'};
683 $self->{'current_element'} = $self->{'current_element'}->parent;
684 return new SGMLS_Event('end_element',$old,$self);
687 ($c eq '-') && do { # some data
690 while ($data =~ /\\(\\|n|\||[0-7]{1,3})/) {
693 # beginning or end of SDATA
696 unshift(@{$self->{'event_stack'}},
697 new SGMLS_Event($sdata_flag?'sdata':'cdata',
702 $sdata_flag = !$sdata_flag;
704 } elsif ($1 eq 'n') {
706 unshift(@{$self->{'event_stack'}},
707 new SGMLS_Event($sdata_flag?'sdata':'cdata',
712 unshift(@{$self->{'event_stack'}},
713 new SGMLS_Event('re','',$self));
714 } elsif ($1 eq '\\') {
717 $out .= chr(oct($1));
722 unshift(@{$self->{'event_stack'}},
723 new SGMLS_Event($sdata_flag?'sdata':'cdata',
727 return $self->next_event;
730 ($c eq '&') && do { # external entity reference
731 return new SGMLS_Event('entity',
732 ($self->{'current_entities'}->{$data}
733 || croak "Unknown external entity: $data\n"),
737 ($c eq '?') && do { # processing instruction
738 return new SGMLS_Event('pi',
743 ($c eq 'A') && do { # attribute declaration
744 # (will parse only on demand)
745 push @{$self->{'current_attributes'}}, $data;
749 ($c eq 'a') && do { # link attribute declaration
750 # NOT YET IMPLEMENTED!
754 ($c eq 'D') && do { # data attribute declaration
755 $data =~ /^(\S+) (\S+) (\S+)( (.*))?$/
756 || croak "Bad data-attribute event data: $data";
757 my ($ename,$aname,$type,$value) = ($1,$2,$3,$5);
758 my $entity = $self->{'current_entities'}->{$ename};
759 my $attribute = new SGMLS_Attribute($aname,$type,$value);
760 $entity->set_data_attribute($attribute);
764 ($c eq 'N') && do { # notation declaration
765 $self->{'current_notations'}->{$data} =
766 new SGMLS_Notation($data,
767 $self->{'current_sysid'},
768 $self->{'current_pubid'});
769 $self->{'current_sysid'} = '';
770 $self->{'current_pubid'} = '';
774 ($c eq 'E') && do { # external entity declaration
775 $data =~ /^(\S+) (\S+) (\S+)$/
776 || croak "Bad external entity event data: $data";
777 my ($name,$type,$nname) = ($1,$2,$3);
778 my $notation = $self->{'current_notations'}->{$nname} if $nname;
779 $self->{'current_entities'}->{$name} =
780 new SGMLS_Entity($name,
783 $self->{'current_sysid'},
784 $self->{'current_pubid'},
785 $self->{'current_filenames'},
787 $self->{'current_sysid'} = '';
788 $self->{'current_pubid'} = '';
789 $self->{'current_filenames'} = [];
793 ($c eq 'I') && do { # internal entity declaration
794 $data =~ /^(\S+) (\S+) (.*)$/
795 || croak "Bad external entity event data: $data";
796 my ($name,$type,$value) = ($1,$2,$3);
797 $self->{'current_entities'}->{$name} =
798 new SGMLS_Entity($name, $type, $value);
802 ($c eq 'T') && do { # external text entity declaration
803 $self->{'current_entities'}->{$data} =
804 new SGMLS_Entity($data,
807 $self->{'current_sysid'},
808 $self->{'current_pubid'},
809 $self->{'current_filenames'},
811 $self->{'current_sysid'} = '';
812 $self->{'current_pubid'} = '';
813 $self->{'current_filenames'} = [];
817 ($c eq 'S') && do { # subdocument entity declaration
818 $self->{'current_entities'}->{$data} =
819 new SGMLS_Entity($data,
822 $self->{'current_sysid'},
823 $self->{'current_pubid'},
824 $self->{'current_filenames'},
826 $self->{'current_sysid'} = '';
827 $self->{'current_pubid'} = '';
828 $self->{'current_filenames'} = [];
832 ($c eq 's') && do { # system id
833 $self->{'current_sysid'} = $data;
837 ($c eq 'p') && do { # public id
838 $self->{'current_pubid'} = $data;
842 ($c eq 'f') && do { # generated filename
843 push @{$self->{'current_filenames'}}, $data;
847 ($c eq '{') && do { # begin subdocument entity
848 my $subdoc = ($self->{'current_entities'}->{$data}||
849 croak "Unknown SUBDOC entity $data\n");
850 push @{$self->{'notation_stack'}}, $self->{'current_notations'};
851 push @{$self->{'entity_stack'}}, $self->{'current_entities'};
852 $self->{'current_notations'} = {};
853 $self->{'current_entities'} = {};
854 return new SGMLS_Event('start_subdoc',
859 ($c eq '}') && do { # end subdocument entity
860 $self->{'current_notations'} = pop @{$self->{'notation_stack'}};
861 $self->{'current_entities'} = pop @{$self->{'entity_stack'}};
862 return new SGMLS_Event('end_subdoc',
863 ($self->{'current_entities'}->{$data} ||
864 croak "Unknown SUBDOC entity $data\n"),
868 ($c eq 'L') && do { # line number (and file name)
869 $data =~ /^(\d+)( (.*))?$/;
870 $self->{'current_line'} = $1;
871 $self->{'current_file'} = $3 if $3;
875 ($c eq '#') && do { # APPINFO parameter
876 $self->{'appinfo'} = $data;
880 ($c eq 'C') && do { # document is conforming
881 return new SGMLS_Event('conforming','',$self);
889 ########################################################################
893 ########################################################################