From 2435e5d3caac53d7a120839117ce82b9e4700caa Mon Sep 17 00:00:00 2001 From: Brian Fraser Date: Sun, 10 Jul 2011 11:06:47 -0700 Subject: [PATCH] pad.c: flags checking for the UTF8 flag when necessary --- MANIFEST | 1 + ext/XS-APItest/t/fetch_pad_names.t | 321 +++++++++++++++++++++++++++++++++++++ pad.c | 12 +- 3 files changed, 331 insertions(+), 3 deletions(-) create mode 100644 ext/XS-APItest/t/fetch_pad_names.t diff --git a/MANIFEST b/MANIFEST index c0ec184..9a12bb3 100644 --- a/MANIFEST +++ b/MANIFEST @@ -3721,6 +3721,7 @@ ext/XS-APItest/t/copyhints.t test hv_copy_hints_hv() API ext/XS-APItest/t/customop.t XS::APItest: tests for custom ops ext/XS-APItest/t/eval-filter.t Simple source filter/eval test ext/XS-APItest/t/exception.t XS::APItest extension +ext/XS-APItest/t/fetch_pad_names.t Tests for UTF8 names in pad ext/XS-APItest/t/grok.t XS::APItest: tests for grok* functions ext/XS-APItest/t/hash.t XS::APItest: tests for hash related APIs ext/XS-APItest/t/keyword_multiline.t test keyword plugin parsing across lines diff --git a/ext/XS-APItest/t/fetch_pad_names.t b/ext/XS-APItest/t/fetch_pad_names.t new file mode 100644 index 0000000..384ca36 --- /dev/null +++ b/ext/XS-APItest/t/fetch_pad_names.t @@ -0,0 +1,321 @@ +use strict; +use warnings; +use Encode (); + +use Test::More tests => 77; + +use XS::APItest qw( fetch_pad_names pad_scalar ); + +local $SIG{__WARN__} = sub { warn $_[0] unless $_[0] =~ /Wide character in print at/ }; + +ok defined &fetch_pad_names, "sub imported"; +ok defined &pad_scalar; + +my $cv = sub { + my $test; +}; + +ok fetch_pad_names($cv), "Fetch working."; +is ref fetch_pad_names($cv), ref [], 'Fetch returns an arrayref'; +is @{fetch_pad_names($cv)}, 1, 'Sub has one lexical.'; +is fetch_pad_names($cv)->[0], '$test', "Fetching a simple scalar works."; + +$cv = sub { + use utf8; + + my $zest = 'invariant'; + my $zèst = 'latin-1'; + + return [pad_scalar(1, "zèst"), pad_scalar(1, "z\350st"), pad_scalar(1, "z\303\250st")]; +}; + +my $names_av = fetch_pad_names($cv); +my $flagged = my $unflagged = "\$z\x{c3}\x{a8}st"; +Encode::_utf8_on($flagged); + +general_tests( $cv->(), $names_av, { + results => [ + { cmp => 'latin-1', msg => 'Fetches through UTF-8.' }, + { cmp => 'latin-1', msg => 'Fetches through Latin-1.' }, + { cmp => 'NOT_IN_PAD', msg => "Doesn't fetch using octets." }, + ], + pad_size => { + total => { cmp => 2, msg => 'Sub has two lexicals.' }, + utf8 => { cmp => 0, msg => '' }, + invariant => { cmp => 2, msg => '' }, + }, + vars => [ + { name => '$zest', msg => 'Sub has [\$zest].', type => 'ok' }, + { name => "\$z\x{e8}st", msg => "Sub has [\$t\x{e8}st].", type => 'ok' }, + { name => $unflagged, msg => "Sub doesn't have [$unflagged].", type => 'not ok' }, + { name => $flagged, msg => "But does have it when flagged.", type => 'ok' }, + ], +}); + +$cv = do { + my $ascii = 'Defined'; + sub { + use utf8; + my $партнеры = $ascii; + return [$партнеры, pad_scalar(1, "партнеры"), pad_scalar(1, "\320\277\320\260\321\200\321\202\320\275\320\265\321\200\321\213")]; + }; +}; + +$names_av = fetch_pad_names($cv); +my $hex_var = "\$\x{43f}\x{430}\x{440}\x{442}\x{43d}\x{435}\x{440}\x{44b}"; +$flagged = $unflagged = "\$\320\277\320\260\321\200\321\202\320\275\320\265\321\200\321\213"; +Encode::_utf8_on($flagged); + +my $russian_var = do { + use utf8; + '$партнеры'; +}; + +general_tests( $cv->(), $names_av, { + results => [ + { cmp => 'Defined', msg => 'UTF-8 fetching works.' }, + { cmp => 'Defined', msg => 'pad_scalar fetch.' }, + { cmp => 'NOT_IN_PAD', msg => "Doesn't fetch using octets." }, + ], + pad_size => { + total => { cmp => 2, msg => 'Sub has two lexicals, including those it closed over.' }, + utf8 => { cmp => 1, msg => 'UTF-8 in the pad.' }, + invariant => { cmp => 1, msg => '' }, + }, + vars => [ + { name => '$ascii', msg => 'Sub has [$ascii].', type => 'ok' }, + { name => $russian_var, msg => "Sub has [$russian_var].", type => 'ok' }, + { name => $hex_var, msg => "Sub has [$hex_var].", type => 'ok' }, + { name => $unflagged, msg => "Sub doesn't have [$unflagged]", type => 'not ok' }, + { name => $flagged, msg => "But does have it when flagged.", type => 'ok' }, + ], +}); + +my $leon1 = "\$L\x{e9}on"; +my $leon2 = my $leon3 = "\$L\x{c3}\x{a9}on"; +Encode::_utf8_on($leon2); + +local $@; +$cv = eval <<"END"; + sub { + use utf8; + my \$Leon = 'Invariant'; + my $leon1 = 'Latin-1'; + return [ \$Leon, $leon1, $leon2, pad_scalar(1, "L\x{e9}on"), pad_scalar(1, "L\x{c3}\x{a9}on")]; + }; +END + +my $err = $@; +ok !$err, $@; + +$names_av = fetch_pad_names($cv); + +general_tests( $cv->(), $names_av, { + results => [ + { cmp => 'Invariant', msg => '' }, + { cmp => 'Latin-1', msg => "Fetched through [$leon1]" }, + { cmp => 'Latin-1', msg => "Fetched through [$leon2]" }, + { cmp => 'Latin-1', msg => 'pad_scalar fetch.' }, + { cmp => 'NOT_IN_PAD', msg => "Doesn't fetch using octets." }, + ], + pad_size => { + total => { cmp => 2, msg => 'Sub has two lexicals' }, + utf8 => { cmp => 0, msg => 'Latin-1 not upgraded to UTF-8.' }, + invariant => { cmp => 2, msg => '' }, + }, + vars => [ + { name => '$Leon', msg => 'Sub has [$Leon].', type => 'ok' }, + { name => $leon1, msg => "Sub has [$leon1].", type => 'ok' }, + { name => $leon2, msg => "Sub has [$leon2].", type => 'ok' }, + { name => $leon3, msg => "Sub doesn't have [$leon3]", type => 'not ok' }, + ], +}); + + +{ + use utf8; + my $Cèon = 4; + my $str1 = "\$C\x{e8}on"; + my $str2 = my $str3 = "\$C\x{c3}\x{a8}on"; + Encode::_utf8_on($str2); + + local $@; + $cv = eval <<"END_EVAL"; + sub { [ \$Cèon, $str1, $str2 ] }; +END_EVAL + + $err = $@; + ok !$err; + + $names_av = fetch_pad_names($cv); + + general_tests( $cv->(), $names_av, { + results => [ ({ SKIP => 1 }) x 3 ], + pad_size => { + total => { cmp => 1, msg => 'Sub has one lexical, which it closed over.' }, + utf8 => { cmp => 0, msg => '' }, + invariant => { cmp => 1, msg => '' }, + }, + vars => [ + { name => '$Ceon', msg => "Sub doesn't have [\$Ceon].", type => 'not ok' }, + map({ { name => $_, msg => "Sub has [$_].", type => 'ok' } } $str1, $str2 ), + { name => $str3, msg => "Sub doesn't have [$str3]", type => 'not ok' }, + ], + }); + +} + +#XXX: This will most certainly break once clean stashes are out. +$cv = sub { + use utf8; + our $戦国 = 10; + { + no strict 'refs'; + my ($symref, $encoded_sym) = (__PACKAGE__ . "::戦国") x 2; + utf8::encode($encoded_sym); + return [ $戦国, ${$symref}, ${$encoded_sym} ]; + } +}; + +my $flagged_our = my $unflagged_our = "\$\346\210\246\345\233\275"; +Encode::_utf8_on($flagged_our); + +$names_av = fetch_pad_names($cv); + +general_tests( $cv->(), $names_av, { + results => [ + { cmp => '10', msg => 'Fetched UTF-8 our var.' }, + ({ cmp => '10', msg => "Symref fetch." }) x 2, + ], + pad_size => { + total => { cmp => 3, msg => 'Sub has three lexicals.' }, + utf8 => { cmp => 1, msg => 'Japanese stored as UTF-8.' }, + invariant => { cmp => 2, msg => '' }, + }, + vars => [ + { name => "\$\x{6226}\x{56fd}", msg => "Sub has [\$\x{6226}\x{56fd}].", type => 'ok' }, + { name => $flagged_our, msg => "Sub has [$flagged_our].", type => 'ok' }, + { name => $unflagged_our, msg => "Sub doesn't have [$unflagged_our]", type => 'not ok' }, + ], +}); + + +{ + +use utf8; +{ + my $test; + BEGIN { + $test = "t\x{c3}\x{a8}st"; + Encode::_utf8_on($test); + } + use constant test => $test; +} + +$cv = sub { + my $tèst = 'Good'; + + return [ + $tèst, + pad_scalar(1, "tèst"), #"UTF-8" + pad_scalar(1, "t\350st"), #"Latin-1" + pad_scalar(1, "t\x{c3}\x{a8}st"), #"Octal" + pad_scalar(1, test()), #'UTF-8 enc' + ]; +}; + +$names_av = fetch_pad_names($cv); + +general_tests( $cv->(), $names_av, { + results => [ + { cmp => 'Good', msg => 'Fetched through Perl.' }, + { cmp => 'Good', msg => "pad_scalar: UTF-8 works." }, + { cmp => 'Good', msg => "pad_scalar: Latin-1 works." }, + { cmp => 'NOT_IN_PAD', msg => "pad_scalar: Doesn't fetch through octets." }, + { cmp => 'Good', msg => "pad_scalar: UTF-8-through-encoding works." }, + ], + pad_size => { + total => { cmp => 1, msg => 'Sub has one lexical.' }, + utf8 => { cmp => 0, msg => '' }, + invariant => { cmp => 1, msg => '' }, + }, + vars => [], +}); + +} + +$cv = do { + use utf8; + sub { + my $ニコニコ = 'katakana'; + my $にこにこ = 'hiragana'; + + return [ + $ニコニコ, + $にこにこ, + pad_scalar(1, "にこにこ"), + pad_scalar(1, "\x{306b}\x{3053}\x{306b}\x{3053}"), + pad_scalar(1, "\343\201\253\343\201\223\343\201\253\343\201\223"), + pad_scalar(1, "ニコニコ"), + pad_scalar(1, "\x{30cb}\x{30b3}\x{30cb}\x{30b3}"), + pad_scalar(1, "\343\203\213\343\202\263\343\203\213\343\202\263"), + ]; + } +}; + +$names_av = fetch_pad_names($cv); + +general_tests( $cv->(), $names_av, { + results => [ + { cmp => 'katakana', msg => '' }, + { cmp => 'hiragana', msg => '' }, + { cmp => 'hiragana', msg => '' }, + { cmp => 'hiragana', msg => '' }, + { cmp => 'NOT_IN_PAD', msg => '' }, + { cmp => 'katakana', msg => '' }, + { cmp => 'katakana', msg => '' }, + { cmp => 'NOT_IN_PAD', msg => '' }, + ], + pad_size => { + total => { cmp => 2, msg => 'Sub has two lexicals.' }, + utf8 => { cmp => 2, msg => '' }, + invariant => { cmp => 0, msg => '' }, + }, + vars => [], +}); + +{ + { + my $utf8_e; + BEGIN { + $utf8_e = "e"; + Encode::_utf8_on($utf8_e); + } + use constant utf8_e => $utf8_e; + } + my $e = 'Invariant'; + is pad_scalar(1, "e"), pad_scalar(1, utf8_e), 'Fetches the same thing, even if invariant but with differing utf8ness.'; +} + + +sub general_tests { + my ($results, $names_av, $tests) = @_; + + for my $i (0..$#$results) { + next if $tests->{results}[$i]{SKIP}; + is $results->[$i], $tests->{results}[$i]{cmp}, $tests->{results}[$i]{msg}; + } + + is @$names_av, $tests->{pad_size}{total}{cmp}, $tests->{pad_size}{total}{msg}; + is grep( Encode::is_utf8($_), @$names_av), $tests->{pad_size}{utf8}{cmp}; + is grep( !Encode::is_utf8($_), @$names_av), $tests->{pad_size}{invariant}{cmp}; + + for my $var (@{$tests->{vars}}) { + if ($var->{type} eq 'ok') { + ok $var->{name} ~~ $names_av, $var->{msg}; + } else { + ok !($var->{name} ~~ $names_av), $var->{msg}; + } + } + +} diff --git a/pad.c b/pad.c index afdc808..450fe2e 100644 --- a/pad.c +++ b/pad.c @@ -501,7 +501,7 @@ Perl_pad_add_name_pvn(pTHX_ const char *namepv, STRLEN namelen, PERL_ARGS_ASSERT_PAD_ADD_NAME_PVN; - if (flags & ~(padadd_OUR|padadd_STATE|padadd_NO_DUP_CHECK)) + if (flags & ~(padadd_OUR|padadd_STATE|padadd_NO_DUP_CHECK|padadd_UTF8_NAME)) Perl_croak(aTHX_ "panic: pad_add_name_pvn illegal flag bits 0x%" UVxf, (UV)flags); @@ -513,7 +513,7 @@ Perl_pad_add_name_pvn(pTHX_ const char *namepv, STRLEN namelen, pad_check_dup(namesv, flags & padadd_OUR, ourstash); } - offset = pad_alloc_name(namesv, flags, typestash, ourstash); + offset = pad_alloc_name(namesv, flags & ~padadd_UTF8_NAME, typestash, ourstash); /* not yet introduced */ COP_SEQ_RANGE_LOW_set(namesv, PERL_PADSEQ_INTRO); @@ -813,7 +813,7 @@ Perl_pad_findmy_pvn(pTHX_ const char *namepv, STRLEN namelen, U32 flags) pad_peg("pad_findmy_pvn"); - if (flags) + if (flags & ~padadd_UTF8_NAME) Perl_croak(aTHX_ "panic: pad_findmy_pvn illegal flag bits 0x%" UVxf, (UV)flags); @@ -874,6 +874,8 @@ Perl_pad_findmy_sv(pTHX_ SV *name, U32 flags) STRLEN namelen; PERL_ARGS_ASSERT_PAD_FINDMY_SV; namepv = SvPV(name, namelen); + if (SvUTF8(name)) + flags |= padadd_UTF8_NAME; return pad_findmy_pvn(namepv, namelen, flags); } @@ -968,6 +970,10 @@ S_pad_findlex(pTHX_ const char *namepv, STRLEN namelen, U32 flags, const CV* cv, PERL_ARGS_ASSERT_PAD_FINDLEX; + if (flags & ~padadd_UTF8_NAME) + Perl_croak(aTHX_ "panic: pad_findlex illegal flag bits 0x%" UVxf, + (UV)flags); + *out_flags = 0; DEBUG_Xv(PerlIO_printf(Perl_debug_log, -- 2.7.4