From c9f9cdcc515e626a053e0ab037fee97c204d8856 Mon Sep 17 00:00:00 2001 From: "Graydon, Tracy" Date: Thu, 6 Dec 2012 14:05:52 -0800 Subject: [PATCH] Fix the package build issues: missing deps --- Changes | 40 +++++++++++++++++++++ META.yml | 8 ++--- Makefile.PL | 4 +-- Parser.pm | 18 +++++----- Parser.xs | 1 + README | 2 +- eg/hform | 0 lib/HTML/Entities.pm | 8 ++--- lib/HTML/HeadParser.pm | 14 +++++--- lib/HTML/LinkExtor.pm | 6 ++-- lib/HTML/TokeParser.pm | 6 ++-- packaging/perl-HTML-Parser.spec | 80 +++++++++++++++++++++++------------------ t/headparser.t | 20 +++++++++-- 13 files changed, 142 insertions(+), 65 deletions(-) mode change 100644 => 100755 eg/hform diff --git a/Changes b/Changes index 948e5dc..df5932d 100644 --- a/Changes +++ b/Changes @@ -1,4 +1,44 @@ _______________________________________________________________________________ +2011-10-15 Release 3.69 + +Gisle Aas (4): + Documentation fix; encode_utf8 mixup [RT#71151] + Make it clearer that there are 2 (actually 3) options for handing "UTF-8 garbage" + Github is the official repo + Can't be bothered to try to fix the failures that occur on perl-5.6 + +Barbie (1): + fix to TokeParser to correctly handle option configuration + +Jon Jensen (1): + Aesthetic change: remove extra ; + +Ville Skyttä (1): + Trim surrounding whitespace from extracted URLs. 
+ + +_______________________________________________________________________________ +2010-09-01 Release 3.68 + +Gisle Aas (1): + Declare the encoding of the POD to be utf8 + + +_______________________________________________________________________________ +2010-08-17 Release 3.67 + +Nicholas Clark (1): + bleadperl 2154eca7 breaks HTML::Parser 3.66 [RT#60368] + + +_______________________________________________________________________________ +2010-07-09 Release 3.66 + +Gisle Aas (1): + Fix entity decoding in utf8_mode for the title header + + +_______________________________________________________________________________ 2010-04-04 Release 3.65 Gisle Aas (1): diff --git a/META.yml b/META.yml index 4b3ea92..96ff56e 100644 --- a/META.yml +++ b/META.yml @@ -1,6 +1,6 @@ --- #YAML:1.0 name: HTML-Parser -version: 3.65 +version: 3.69 abstract: HTML parser class author: - Gisle Aas @@ -13,16 +13,16 @@ build_requires: Test::More: 0 requires: HTML::Tagset: 3 - perl: 5.006 + perl: 5.008 XSLoader: 0 resources: MailingList: mailto:libwww@perl.org - repository: http://gitorious.org/projects/perl-html-parser + repository: http://github.com/gisle/html-parser no_index: directory: - t - inc -generated_by: ExtUtils::MakeMaker version 6.56 +generated_by: ExtUtils::MakeMaker version 6.57_05 meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: 1.4 diff --git a/Makefile.PL b/Makefile.PL index 70ad50c..3e99a55 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -8,7 +8,7 @@ WriteMakefile( AUTHOR => 'Gisle Aas ', LICENSE => 'perl', - MIN_PERL_VERSION => 5.006, + MIN_PERL_VERSION => 5.008, PREREQ_PM => { 'HTML::Tagset' => 3, 'XSLoader' => 0, @@ -17,7 +17,7 @@ WriteMakefile( build_requires => { 'Test::More' => 0 }, recommends => { 'HTTP::Headers' => 0 }, resources => { - repository => 'http://gitorious.org/projects/perl-html-parser', + repository => 'http://github.com/gisle/html-parser', MailingList => 'mailto:libwww@perl.org', } }, diff --git a/Parser.pm b/Parser.pm 
index 154fb2f..dccbc5f 100644 --- a/Parser.pm +++ b/Parser.pm @@ -9,7 +9,7 @@ package HTML::Parser; use strict; use vars qw($VERSION @ISA); -$VERSION = "3.65"; +$VERSION = "3.69"; require HTML::Entities; @@ -650,9 +650,7 @@ names are forced to lower case. General entities are decoded in the attribute values and one layer of matching quotes enclosing the attribute values is removed. -The Unicode character set is assumed for entity decoding. With Perl -version 5.6 or earlier only the Latin-1 range is supported, and -entities for characters outside the range 0..255 are left unchanged. +The Unicode character set is assumed for entity decoding. =item C<@attr> @@ -1192,12 +1190,14 @@ The result of decoding will be a mix of encoded and decoded characters for any entities that expand to characters with code above 127. This is not a good thing. -The solution is to use the Encode::encode_utf8() on the data before -feeding it to the $p->parse(). For $p->parse_file() pass a file that -has been opened in ":utf8" mode. +The recommended solution is to apply Encode::decode_utf8() on the data before +feeding it to the $p->parse(). For $p->parse_file() pass a file that has been +opened in ":utf8" mode. -The parser can process raw undecoded UTF-8 sanely if the C<utf8_mode> -is enabled or if the "attr", "@attr" or "dtext" argspecs is avoided. +The alternative solution is to enable the C<utf8_mode> and not decode before +passing strings to $p->parse(). The parser can process raw undecoded UTF-8 +sanely if the C<utf8_mode> is enabled, or if the "attr", "@attr" or "dtext" +argspecs are avoided. 
=item Parsing string decoded with wrong endianness diff --git a/Parser.xs b/Parser.xs index ffad00b..331e0e9 100644 --- a/Parser.xs +++ b/Parser.xs @@ -125,6 +125,7 @@ newSVpvn(char *s, STRLEN len) static SV* check_handler(pTHX_ SV* h) { + SvGETMAGIC(h); if (SvROK(h)) { SV* myref = SvRV(h); if (SvTYPE(myref) == SVt_PVCV) diff --git a/README b/README index ade4bf7..928d585 100644 --- a/README +++ b/README @@ -34,7 +34,7 @@ HTML::Parser to create and extract information from HTML syntax trees PREREQUISITES In order to install and use this package you will need Perl version -5.6 or better. The HTML::Tagset module should be installed. +5.8 or better. The HTML::Tagset module should be installed. If you intend to use the HTML::HeadParser you probably want to install libwww-perl too. diff --git a/eg/hform b/eg/hform old mode 100644 new mode 100755 diff --git a/lib/HTML/Entities.pm b/lib/HTML/Entities.pm index 922faf2..ecd8e0d 100644 --- a/lib/HTML/Entities.pm +++ b/lib/HTML/Entities.pm @@ -1,5 +1,7 @@ package HTML::Entities; +=encoding utf8 + =head1 NAME HTML::Entities - Encode or decode strings with HTML entities @@ -32,9 +34,7 @@ character entities. The module provides the following functions: =item decode_entities( $string, ... ) This routine replaces HTML entities found in the $string with the -corresponding Unicode character. Under perl 5.6 and earlier only -characters in the Latin-1 range are replaced. Unrecognized -entities are left alone. +corresponding Unicode character. Unrecognized entities are left alone. If multiple strings are provided as argument they are each decoded separately and the same number of strings are returned. 
@@ -146,7 +146,7 @@ require Exporter; @EXPORT = qw(encode_entities decode_entities _decode_entities); @EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric); -$VERSION = "3.64"; +$VERSION = "3.69"; sub Version { $VERSION; } require HTML::Parser; # for fast XS implemented decode_entities diff --git a/lib/HTML/HeadParser.pm b/lib/HTML/HeadParser.pm index be65fa2..fe6916e 100644 --- a/lib/HTML/HeadParser.pm +++ b/lib/HTML/HeadParser.pm @@ -87,7 +87,7 @@ use HTML::Entities (); use strict; use vars qw($VERSION $DEBUG); #$DEBUG = 1; -$VERSION = "3.62"; +$VERSION = "3.69"; =item $hp = HTML::HeadParser->new @@ -157,7 +157,10 @@ sub flush_text # internal $text =~ s/\s+/ /g; print "FLUSH $tag => '$text'\n" if $DEBUG; if ($tag eq 'title') { + my $decoded; + $decoded = utf8::decode($text) if $self->utf8_mode && defined &utf8::decode; HTML::Entities::decode($text); + utf8::encode($text) if $decoded; $self->{'header'}->push_header(Title => $text); } $self->{'tag'} = $self->{'text'} = ''; @@ -204,7 +207,8 @@ sub start $self->{'header'}->push_header($key => $attr->{content}); } elsif ($tag eq 'base') { return unless exists $attr->{href}; - $self->{'header'}->push_header('Content-Base' => $attr->{href}); + (my $base = $attr->{href}) =~ s/^\s+//; $base =~ s/\s+$//; # HTML5 + $self->{'header'}->push_header('Content-Base' => $base); } elsif ($tag eq 'isindex') { # This is a non-standard header. Perhaps we should just ignore # this element @@ -215,7 +219,9 @@ sub start } elsif ($tag eq 'link') { return unless exists $attr->{href}; # - my $h_val = "<" . delete($attr->{href}) . 
">"; + my $href = delete($attr->{href}); + $href =~ s/^\s+//; $href =~ s/\s+$//; # HTML5 + my $h_val = "<$href>"; for (sort keys %{$attr}) { next if $_ eq "/"; # XHTML junk $h_val .= qq(; $_="$attr->{$_}"); @@ -262,7 +268,7 @@ sub text } BEGIN { - *utf8_mode = sub { 1 } unless HTML::Entities::UNICODE_SUPPORT;; + *utf8_mode = sub { 1 } unless HTML::Entities::UNICODE_SUPPORT; } 1; diff --git a/lib/HTML/LinkExtor.pm b/lib/HTML/LinkExtor.pm index 8d50439..c2f08c6 100644 --- a/lib/HTML/LinkExtor.pm +++ b/lib/HTML/LinkExtor.pm @@ -2,7 +2,7 @@ package HTML::LinkExtor; require HTML::Parser; @ISA = qw(HTML::Parser); -$VERSION = "3.60"; +$VERSION = "3.69"; =head1 NAME @@ -83,8 +83,8 @@ sub _start_tag my $a; for $a (@$links) { next unless exists $attr->{$a}; - push(@links, $a, $base ? URI->new($attr->{$a}, $base)->abs($base) - : $attr->{$a}); + (my $link = $attr->{$a}) =~ s/^\s+//; $link =~ s/\s+$//; # HTML5 + push(@links, $a, $base ? URI->new($link, $base)->abs($base) : $link); } return unless @links; $self->_found_link($tag, @links); diff --git a/lib/HTML/TokeParser.pm b/lib/HTML/TokeParser.pm index 94128db..959b96f 100644 --- a/lib/HTML/TokeParser.pm +++ b/lib/HTML/TokeParser.pm @@ -2,7 +2,7 @@ package HTML::TokeParser; require HTML::PullParser; @ISA=qw(HTML::PullParser); -$VERSION = "3.57"; +$VERSION = "3.69"; use strict; use Carp (); @@ -27,17 +27,19 @@ sub new { my $class = shift; my %cnf; + if (@_ == 1) { my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file"; %cnf = ($type => $_[0]); } else { + unshift @_, (ref($_[0]) eq "SCALAR") ? 
"doc" : "file" if(scalar(@_) % 2 == 1); %cnf = @_; } my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"}; - my $self = $class->SUPER::new(%cnf, %ARGS) || return undef; + my $self = $class->SUPER::new(%ARGS, %cnf) || return undef; $self->{textify} = $textify; $self; diff --git a/packaging/perl-HTML-Parser.spec b/packaging/perl-HTML-Parser.spec index 0c4ec8a..3447484 100644 --- a/packaging/perl-HTML-Parser.spec +++ b/packaging/perl-HTML-Parser.spec @@ -1,58 +1,70 @@ -Name: perl-HTML-Parser -Summary: Perl module for parsing HTML -Version: 3.65 -Release: 2 -Group: Development/Libraries -License: GPL+ or Artistic -URL: http://search.cpan.org/dist/HTML-Parser/ -Source0: %{name}-%{version}.tar.gz -Source1001: perl-HTML-Parser.manifest -Requires: perl(:MODULE_COMPAT_%(eval "`%{__perl} -V:version`"; echo $version)) -Requires: perl(HTML::Tagset) >= 3.03 -BuildRequires: perl(HTML::Tagset) >= 3.03, perl(ExtUtils::MakeMaker), perl(Test::Simple) +%define real_name HTML-Parser +Name: perl-%{real_name} +Summary: Perl module for parsing HTML +Version: 3.69 +Release: 3%{?dist} +License: GPL+ or Artistic +Group: Development/Libraries +Source0: %{real_name}-%{version}.tar.gz +Source1001: packaging/perl-HTML-Parser.manifest +URL: http://search.cpan.org/dist/HTML-Parser/ +Requires: perl(:MODULE_COMPAT_%(eval "`%{__perl} -V:version`"; echo $version)) +BuildRequires: perl(Carp) +BuildRequires: perl(ExtUtils::MakeMaker) +BuildRequires: perl(ExtUtils::ParseXS) +BuildRequires: perl(HTML::Tagset) >= 3 +BuildRequires: perl(Test::More) +BuildRequires: perl(URI) +BuildRequires: perl(XSLoader) +BuildRequires: perl(Pod::Simple) +BuildRequires: perl(Test::Pod) +Requires: perl(HTML::Tagset) >= 3 +Requires: perl(URI) +Requires: perl(XSLoader) +%if %{undefined perl_bootstrap} +# This creates cycle with perl-HTTP-Message. Weaken the dependency here because +# it's just a recommended dependency per META.yml. 
+BuildRequires: perl(HTTP::Headers) +Requires: perl(HTTP::Headers) +%endif + +%{?perl_default_filter} +%{?perl_default_subpackage_tests} %description The HTML-Parser module for perl to parse and extract information from HTML documents, including the HTML::Entities, HTML::HeadParser, HTML::LinkExtor, HTML::PullParser, and HTML::TokeParser modules. - %prep -%setup -q +%setup -q -n HTML-Parser-3.69 chmod -c a-x eg/* %build cp %{SOURCE1001} . - -if test -f Makefile.PL; then -%{__perl} Makefile.PL INSTALLDIRS=vendor -make %{?jobs:-j%jobs} -else -%{__perl} Build.PL --installdirs vendor -./Build -fi +%{__perl} Makefile.PL INSTALLDIRS=vendor OPTIMIZE="%{optflags}" +make %{?_smp_mflags} %install -rm -rf %{buildroot} -if test -f Makefile.PL; then make pure_install PERL_INSTALL_ROOT=%{buildroot} -else -./Build install --installdirs vendor -fi +#file=%{buildroot}%{_mandir}/man3/HTML::Entities.3pm +#iconv -f iso-8859-1 -t utf-8 <"$file" > "${file}_" +#mv -f "${file}_" "$file" find %{buildroot} -type f -name .packlist -exec rm -f {} ';' -find %{buildroot} -depth -type d -exec rmdir {} 2>/dev/null ';' find %{buildroot} -type f -name '*.bs' -empty -exec rm -f {} ';' -%{_fixperms} %{buildroot}/* +find %{buildroot} -depth -type d -exec rmdir {} 2>/dev/null ';' +chmod -R u+w %{buildroot}/* -file=$RPM_BUILD_ROOT%{_mandir}/man3/HTML::Entities.3pm -iconv -f iso-8859-1 -t utf-8 <"$file" > "${file}_" -mv -f "${file}_" "$file" -chmod -R u+w $RPM_BUILD_ROOT/* +%check +make test %files %manifest perl-HTML-Parser.manifest +%doc Changes README TODO eg/ %{perl_vendorarch}/HTML/* %{perl_vendorarch}/auto/HTML/* -%doc %{_mandir}/man3/*.3pm* +#%{_mandir}/man3/*.3pm* + + diff --git a/t/headparser.t b/t/headparser.t index adcde7a..985eaff 100644 --- a/t/headparser.t +++ b/t/headparser.t @@ -1,7 +1,7 @@ #!perl -w use strict; -use Test::More tests => 15; +use Test::More tests => 16; { package H; sub new { bless {}, shift; } @@ -147,7 +147,7 @@ unlink($file) or warn "Can't unlink $file: $!"; 
ok(!$p->as_string); SKIP: { - skip "Need Unicode support", 4 if $] < 5.008; + skip "Need Unicode support", 5 if $] < 5.008; # Test that the Unicode BOM does not confuse us? $p = HTML::HeadParser->new(H->new); @@ -177,4 +177,20 @@ EOT is($p->header("title"), "Parkinson's disease"); is($p->header("link")->[0], '<../../css/ummAdam.css>; rel="stylesheet"; type="text/css"'); + + $p = HTML::HeadParser->new(H->new); + $p->utf8_mode(1); + $p->parse(<<"EOT"); # example from http://www.mjw.com.pl/ +\r + \r +\r +\r +\r +\r + <title>ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa &ndash; MJW</title>\r +\r + +EOT + $p->eof; + is($p->header("title"), "ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa \xE2\x80\x93 MJW"); } -- 2.7.4