_______________________________________________________________________________
+2011-10-15 Release 3.69
+
+Gisle Aas (4):
+ Documentation fix; encode_utf8 mixup [RT#71151]
+ Make it clearer that there are 2 (actually 3) options for handling "UTF-8 garbage"
+ Github is the official repo
+ Can't be bothered to try to fix the failures that occur on perl-5.6
+
+Barbie (1):
+ fix to TokeParser to correctly handle option configuration
+
+Jon Jensen (1):
+ Aesthetic change: remove extra ;
+
+Ville Skyttä (1):
+ Trim surrounding whitespace from extracted URLs.
+
+
+_______________________________________________________________________________
+2010-09-01 Release 3.68
+
+Gisle Aas (1):
+ Declare the encoding of the POD to be utf8
+
+
+_______________________________________________________________________________
+2010-08-17 Release 3.67
+
+Nicholas Clark (1):
+ bleadperl 2154eca7 breaks HTML::Parser 3.66 [RT#60368]
+
+
+_______________________________________________________________________________
+2010-07-09 Release 3.66
+
+Gisle Aas (1):
+ Fix entity decoding in utf8_mode for the title header
+
+
+_______________________________________________________________________________
2010-04-04 Release 3.65
Gisle Aas (1):
--- #YAML:1.0
name: HTML-Parser
-version: 3.65
+version: 3.69
abstract: HTML parser class
author:
- Gisle Aas <gisle@activestate.com>
Test::More: 0
requires:
HTML::Tagset: 3
- perl: 5.006
+ perl: 5.008
XSLoader: 0
resources:
MailingList: mailto:libwww@perl.org
- repository: http://gitorious.org/projects/perl-html-parser
+ repository: http://github.com/gisle/html-parser
no_index:
directory:
- t
- inc
-generated_by: ExtUtils::MakeMaker version 6.56
+generated_by: ExtUtils::MakeMaker version 6.57_05
meta-spec:
url: http://module-build.sourceforge.net/META-spec-v1.4.html
version: 1.4
AUTHOR => 'Gisle Aas <gisle@activestate.com>',
LICENSE => 'perl',
- MIN_PERL_VERSION => 5.006,
+ MIN_PERL_VERSION => 5.008,
PREREQ_PM => {
'HTML::Tagset' => 3,
'XSLoader' => 0,
build_requires => { 'Test::More' => 0 },
recommends => { 'HTTP::Headers' => 0 },
resources => {
- repository => 'http://gitorious.org/projects/perl-html-parser',
+ repository => 'http://github.com/gisle/html-parser',
MailingList => 'mailto:libwww@perl.org',
}
},
use strict;
use vars qw($VERSION @ISA);
-$VERSION = "3.65";
+$VERSION = "3.69";
require HTML::Entities;
General entities are decoded in the attribute values and
one layer of matching quotes enclosing the attribute values is removed.
-The Unicode character set is assumed for entity decoding. With Perl
-version 5.6 or earlier only the Latin-1 range is supported, and
-entities for characters outside the range 0..255 are left unchanged.
+The Unicode character set is assumed for entity decoding.
=item C<@attr>
for any entities that expand to characters with code above 127. This
is not a good thing.
-The solution is to use the Encode::encode_utf8() on the data before
-feeding it to the $p->parse(). For $p->parse_file() pass a file that
-has been opened in ":utf8" mode.
+The recommended solution is to apply Encode::decode_utf8() on the data before
+feeding it to the $p->parse(). For $p->parse_file() pass a file that has been
+opened in ":utf8" mode.
-The parser can process raw undecoded UTF-8 sanely if the C<utf8_mode>
-is enabled or if the "attr", "@attr" or "dtext" argspecs is avoided.
+The alternative solution is to enable the C<utf8_mode> and not decode before
+passing strings to $p->parse(). The parser can process raw undecoded UTF-8
+sanely if the C<utf8_mode> is enabled, or if the "attr", "@attr" or "dtext"
+argspecs are avoided.
=item Parsing string decoded with wrong endianness
static SV*
check_handler(pTHX_ SV* h)
{
+ SvGETMAGIC(h);
if (SvROK(h)) {
SV* myref = SvRV(h);
if (SvTYPE(myref) == SVt_PVCV)
PREREQUISITES
In order to install and use this package you will need Perl version
-5.6 or better. The HTML::Tagset module should be installed.
+5.8 or better. The HTML::Tagset module should be installed.
If you intend to use the HTML::HeadParser you probably want to install
libwww-perl too.
package HTML::Entities;
+=encoding utf8
+
=head1 NAME
HTML::Entities - Encode or decode strings with HTML entities
=item decode_entities( $string, ... )
This routine replaces HTML entities found in the $string with the
-corresponding Unicode character. Under perl 5.6 and earlier only
-characters in the Latin-1 range are replaced. Unrecognized
-entities are left alone.
+corresponding Unicode character. Unrecognized entities are left alone.
If multiple strings are provided as argument they are each decoded
separately and the same number of strings are returned.
@EXPORT = qw(encode_entities decode_entities _decode_entities);
@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
-$VERSION = "3.64";
+$VERSION = "3.69";
sub Version { $VERSION; }
require HTML::Parser; # for fast XS implemented decode_entities
use strict;
use vars qw($VERSION $DEBUG);
#$DEBUG = 1;
-$VERSION = "3.62";
+$VERSION = "3.69";
=item $hp = HTML::HeadParser->new
$text =~ s/\s+/ /g;
print "FLUSH $tag => '$text'\n" if $DEBUG;
if ($tag eq 'title') {
+ my $decoded;
+ $decoded = utf8::decode($text) if $self->utf8_mode && defined &utf8::decode;
HTML::Entities::decode($text);
+ utf8::encode($text) if $decoded;
$self->{'header'}->push_header(Title => $text);
}
$self->{'tag'} = $self->{'text'} = '';
$self->{'header'}->push_header($key => $attr->{content});
} elsif ($tag eq 'base') {
return unless exists $attr->{href};
- $self->{'header'}->push_header('Content-Base' => $attr->{href});
+ (my $base = $attr->{href}) =~ s/^\s+//; $base =~ s/\s+$//; # HTML5
+ $self->{'header'}->push_header('Content-Base' => $base);
} elsif ($tag eq 'isindex') {
# This is a non-standard header. Perhaps we should just ignore
# this element
} elsif ($tag eq 'link') {
return unless exists $attr->{href};
# <link href="http:..." rel="xxx" rev="xxx" title="xxx">
- my $h_val = "<" . delete($attr->{href}) . ">";
+ my $href = delete($attr->{href});
+ $href =~ s/^\s+//; $href =~ s/\s+$//; # HTML5
+ my $h_val = "<$href>";
for (sort keys %{$attr}) {
next if $_ eq "/"; # XHTML junk
$h_val .= qq(; $_="$attr->{$_}");
}
BEGIN {
- *utf8_mode = sub { 1 } unless HTML::Entities::UNICODE_SUPPORT;;
+ *utf8_mode = sub { 1 } unless HTML::Entities::UNICODE_SUPPORT;
}
1;
require HTML::Parser;
@ISA = qw(HTML::Parser);
-$VERSION = "3.60";
+$VERSION = "3.69";
=head1 NAME
my $a;
for $a (@$links) {
next unless exists $attr->{$a};
- push(@links, $a, $base ? URI->new($attr->{$a}, $base)->abs($base)
- : $attr->{$a});
+ (my $link = $attr->{$a}) =~ s/^\s+//; $link =~ s/\s+$//; # HTML5
+ push(@links, $a, $base ? URI->new($link, $base)->abs($base) : $link);
}
return unless @links;
$self->_found_link($tag, @links);
require HTML::PullParser;
@ISA=qw(HTML::PullParser);
-$VERSION = "3.57";
+$VERSION = "3.69";
use strict;
use Carp ();
{
my $class = shift;
my %cnf;
+
if (@_ == 1) {
my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
%cnf = ($type => $_[0]);
}
else {
+ unshift @_, (ref($_[0]) eq "SCALAR") ? "doc" : "file" if(scalar(@_) % 2 == 1);
%cnf = @_;
}
my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
- my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
+ my $self = $class->SUPER::new(%ARGS, %cnf) || return undef;
$self->{textify} = $textify;
$self;
-Name: perl-HTML-Parser
-Summary: Perl module for parsing HTML
-Version: 3.65
-Release: 2
-Group: Development/Libraries
-License: GPL+ or Artistic
-URL: http://search.cpan.org/dist/HTML-Parser/
-Source0: %{name}-%{version}.tar.gz
-Source1001: perl-HTML-Parser.manifest
-Requires: perl(:MODULE_COMPAT_%(eval "`%{__perl} -V:version`"; echo $version))
-Requires: perl(HTML::Tagset) >= 3.03
-BuildRequires: perl(HTML::Tagset) >= 3.03, perl(ExtUtils::MakeMaker), perl(Test::Simple)
+%define real_name HTML-Parser
+Name: perl-%{real_name}
+Summary: Perl module for parsing HTML
+Version: 3.69
+Release: 3%{?dist}
+License: GPL+ or Artistic
+Group: Development/Libraries
+Source0: %{real_name}-%{version}.tar.gz
+Source1001: packaging/perl-HTML-Parser.manifest
+URL: http://search.cpan.org/dist/HTML-Parser/
+Requires: perl(:MODULE_COMPAT_%(eval "`%{__perl} -V:version`"; echo $version))
+BuildRequires: perl(Carp)
+BuildRequires: perl(ExtUtils::MakeMaker)
+BuildRequires: perl(ExtUtils::ParseXS)
+BuildRequires: perl(HTML::Tagset) >= 3
+BuildRequires: perl(Test::More)
+BuildRequires: perl(URI)
+BuildRequires: perl(XSLoader)
+BuildRequires: perl(Pod::Simple)
+BuildRequires: perl(Test::Pod)
+Requires: perl(HTML::Tagset) >= 3
+Requires: perl(URI)
+Requires: perl(XSLoader)
+%if %{undefined perl_bootstrap}
+# This creates a cycle with perl-HTTP-Message. Weaken the dependency here because
+# it's just a recommended dependency per META.yml.
+BuildRequires: perl(HTTP::Headers)
+Requires: perl(HTTP::Headers)
+%endif
+
+%{?perl_default_filter}
+%{?perl_default_subpackage_tests}
%description
The HTML-Parser module for perl to parse and extract information from
HTML documents, including the HTML::Entities, HTML::HeadParser,
HTML::LinkExtor, HTML::PullParser, and HTML::TokeParser modules.
-
%prep
-%setup -q
+%setup -q -n HTML-Parser-3.69
chmod -c a-x eg/*
%build
cp %{SOURCE1001} .
-
-if test -f Makefile.PL; then
-%{__perl} Makefile.PL INSTALLDIRS=vendor
-make %{?jobs:-j%jobs}
-else
-%{__perl} Build.PL --installdirs vendor
-./Build
-fi
+%{__perl} Makefile.PL INSTALLDIRS=vendor OPTIMIZE="%{optflags}"
+make %{?_smp_mflags}
%install
-rm -rf %{buildroot}
-if test -f Makefile.PL; then
make pure_install PERL_INSTALL_ROOT=%{buildroot}
-else
-./Build install --installdirs vendor
-fi
+#file=%{buildroot}%{_mandir}/man3/HTML::Entities.3pm
+#iconv -f iso-8859-1 -t utf-8 <"$file" > "${file}_"
+#mv -f "${file}_" "$file"
find %{buildroot} -type f -name .packlist -exec rm -f {} ';'
-find %{buildroot} -depth -type d -exec rmdir {} 2>/dev/null ';'
find %{buildroot} -type f -name '*.bs' -empty -exec rm -f {} ';'
-%{_fixperms} %{buildroot}/*
+find %{buildroot} -depth -type d -exec rmdir {} 2>/dev/null ';'
+chmod -R u+w %{buildroot}/*
-file=$RPM_BUILD_ROOT%{_mandir}/man3/HTML::Entities.3pm
-iconv -f iso-8859-1 -t utf-8 <"$file" > "${file}_"
-mv -f "${file}_" "$file"
-chmod -R u+w $RPM_BUILD_ROOT/*
+%check
+make test
%files
%manifest perl-HTML-Parser.manifest
+%doc Changes README TODO eg/
%{perl_vendorarch}/HTML/*
%{perl_vendorarch}/auto/HTML/*
-%doc %{_mandir}/man3/*.3pm*
+#%{_mandir}/man3/*.3pm*
+
+
#!perl -w
use strict;
-use Test::More tests => 15;
+use Test::More tests => 16;
{ package H;
sub new { bless {}, shift; }
ok(!$p->as_string);
SKIP: {
- skip "Need Unicode support", 4 if $] < 5.008;
+ skip "Need Unicode support", 5 if $] < 5.008;
# Test that the Unicode BOM does not confuse us?
$p = HTML::HeadParser->new(H->new);
is($p->header("title"), "Parkinson's disease");
is($p->header("link")->[0], '<../../css/ummAdam.css>; rel="stylesheet"; type="text/css"');
+
+ $p = HTML::HeadParser->new(H->new);
+ $p->utf8_mode(1);
+ $p->parse(<<"EOT"); # example from http://www.mjw.com.pl/
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="pl" lang="pl"> \r
+\r
+<head profile="http://gmpg.org/xfn/11">\r
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r
+\r
+<title> ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa – MJW</title>\r
+<link rel="shortcut icon" href="favicon.ico" type="image/x-icon" />\r
+
+EOT
+ $p->eof;
+ is($p->header("title"), "ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa \xE2\x80\x93 MJW");
}