From 2435e5d3caac53d7a120839117ce82b9e4700caa Mon Sep 17 00:00:00 2001
From: Brian Fraser <fraserbn@gmail.com>
Date: Sun, 10 Jul 2011 11:06:47 -0700
Subject: [PATCH] pad.c: flags checking for the UTF8 flag when necessary

---
 MANIFEST                           |   1 +
 ext/XS-APItest/t/fetch_pad_names.t | 321 +++++++++++++++++++++++++++++++++++++
 pad.c                              |  12 +-
 3 files changed, 331 insertions(+), 3 deletions(-)
 create mode 100644 ext/XS-APItest/t/fetch_pad_names.t

diff --git a/MANIFEST b/MANIFEST
index c0ec184..9a12bb3 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -3721,6 +3721,7 @@ ext/XS-APItest/t/copyhints.t	test hv_copy_hints_hv() API
 ext/XS-APItest/t/customop.t	XS::APItest: tests for custom ops
 ext/XS-APItest/t/eval-filter.t	Simple source filter/eval test
 ext/XS-APItest/t/exception.t	XS::APItest extension
+ext/XS-APItest/t/fetch_pad_names.t	Tests for UTF8 names in pad
 ext/XS-APItest/t/grok.t		XS::APItest: tests for grok* functions
 ext/XS-APItest/t/hash.t		XS::APItest: tests for hash related APIs
 ext/XS-APItest/t/keyword_multiline.t	test keyword plugin parsing across lines
diff --git a/ext/XS-APItest/t/fetch_pad_names.t b/ext/XS-APItest/t/fetch_pad_names.t
new file mode 100644
index 0000000..384ca36
--- /dev/null
+++ b/ext/XS-APItest/t/fetch_pad_names.t
@@ -0,0 +1,321 @@
+use strict;
+use warnings;
+use Encode ();
+
+use Test::More tests => 77;
+
+use XS::APItest qw( fetch_pad_names pad_scalar );
+
+local $SIG{__WARN__} = sub { warn $_[0] unless $_[0] =~ /Wide character in print at/ };
+
+ok defined &fetch_pad_names, "sub imported";
+ok defined &pad_scalar;
+
+my $cv = sub {
+    my $test;
+};
+
+ok fetch_pad_names($cv), "Fetch working.";
+is ref fetch_pad_names($cv), ref [], 'Fetch returns an arrayref';
+is @{fetch_pad_names($cv)}, 1, 'Sub has one lexical.';
+is fetch_pad_names($cv)->[0], '$test', "Fetching a simple scalar works.";
+
+$cv = sub {
+    use utf8;
+
+    my $zest = 'invariant';
+    my $zÃ¨st = 'latin-1';
+    
+    return [pad_scalar(1, "zÃ¨st"), pad_scalar(1, "z\350st"), pad_scalar(1, "z\303\250st")];
+};
+
+my $names_av    = fetch_pad_names($cv);
+my $flagged     = my $unflagged = "\$z\x{c3}\x{a8}st";
+Encode::_utf8_on($flagged);
+
+general_tests( $cv->(), $names_av, {
+    results => [
+                { cmp => 'latin-1', msg => 'Fetches through UTF-8.' },
+                { cmp => 'latin-1', msg => 'Fetches through Latin-1.' },
+                { cmp => 'NOT_IN_PAD', msg => "Doesn't fetch using octets." },
+               ],
+    pad_size => {
+                    total     => { cmp => 2, msg => 'Sub has two lexicals.' },
+                    utf8      => { cmp => 0, msg => '' },
+                    invariant => { cmp => 2, msg => '' },
+                },
+    vars    => [
+                { name => '$zest', msg => 'Sub has [\$zest].', type => 'ok' },
+                { name =>  "\$z\x{e8}st", msg => "Sub has [\$t\x{e8}st].", type => 'ok' },
+                { name => $unflagged, msg => "Sub doesn't have [$unflagged].", type => 'not ok' },
+                { name => $flagged, msg => "But does have it when flagged.", type => 'ok' },
+               ],
+});
+
+$cv = do {
+    my $ascii = 'Defined';
+    sub {
+        use utf8;
+        my $Ð¿Ð°ÑÑÐ½ÐµÑÑ = $ascii;
+        return [$Ð¿Ð°ÑÑÐ½ÐµÑÑ, pad_scalar(1, "Ð¿Ð°ÑÑÐ½ÐµÑÑ"), pad_scalar(1, "\320\277\320\260\321\200\321\202\320\275\320\265\321\200\321\213")];
+    };
+};
+
+$names_av     = fetch_pad_names($cv);
+my $hex_var   =  "\$\x{43f}\x{430}\x{440}\x{442}\x{43d}\x{435}\x{440}\x{44b}";
+$flagged      = $unflagged = "\$\320\277\320\260\321\200\321\202\320\275\320\265\321\200\321\213";
+Encode::_utf8_on($flagged);
+
+my $russian_var = do {
+    use utf8;
+    '$Ð¿Ð°ÑÑÐ½ÐµÑÑ';
+};
+
+general_tests( $cv->(), $names_av, {
+    results => [
+                { cmp => 'Defined', msg => 'UTF-8 fetching works.' },
+                { cmp => 'Defined', msg => 'pad_scalar fetch.' },
+                { cmp => 'NOT_IN_PAD', msg => "Doesn't fetch using octets." },
+               ],
+    pad_size => {
+                    total     => { cmp => 2, msg => 'Sub has two lexicals, including those it closed over.' },
+                    utf8      => { cmp => 1, msg => 'UTF-8 in the pad.' },
+                    invariant => { cmp => 1, msg => '' },
+                },
+    vars    => [
+                { name => '$ascii', msg => 'Sub has [$ascii].', type => 'ok' },
+                { name => $russian_var, msg => "Sub has [$russian_var].", type => 'ok' },
+                { name => $hex_var, msg => "Sub has [$hex_var].", type => 'ok' },
+                { name => $unflagged, msg => "Sub doesn't have [$unflagged]", type => 'not ok' },
+                { name => $flagged, msg => "But does have it when flagged.", type => 'ok' },
+               ],
+});
+
+my $leon1 = "\$L\x{e9}on";
+my $leon2 = my $leon3 = "\$L\x{c3}\x{a9}on";
+Encode::_utf8_on($leon2);
+
+local $@;
+$cv = eval <<"END";
+    sub {
+        use utf8;
+        my \$Leon = 'Invariant';
+        my $leon1 = 'Latin-1';
+        return [ \$Leon, $leon1, $leon2, pad_scalar(1, "L\x{e9}on"), pad_scalar(1, "L\x{c3}\x{a9}on")];
+    };
+END
+
+my $err = $@;
+ok !$err, $@;
+
+$names_av = fetch_pad_names($cv);
+
+general_tests( $cv->(), $names_av, {
+    results => [
+                { cmp => 'Invariant', msg => '' },
+                { cmp => 'Latin-1', msg => "Fetched through [$leon1]" },
+                { cmp => 'Latin-1', msg => "Fetched through [$leon2]" },
+                { cmp => 'Latin-1', msg => 'pad_scalar fetch.' },
+                { cmp => 'NOT_IN_PAD', msg => "Doesn't fetch using octets." },
+               ],
+    pad_size => {
+                    total     => { cmp => 2, msg => 'Sub has two lexicals' },
+                    utf8      => { cmp => 0, msg => 'Latin-1 not upgraded to UTF-8.' },
+                    invariant => { cmp => 2, msg => '' },
+                },
+    vars    => [
+                { name => '$Leon', msg => 'Sub has [$Leon].', type => 'ok' },
+                { name => $leon1, msg => "Sub has [$leon1].", type => 'ok' },
+                { name => $leon2, msg => "Sub has [$leon2].", type => 'ok' },
+                { name => $leon3, msg => "Sub doesn't have [$leon3]", type => 'not ok' },
+               ],
+});
+
+
+{
+    use utf8;
+    my $CÃ¨on = 4;
+    my $str1 = "\$C\x{e8}on";
+    my $str2 = my $str3 = "\$C\x{c3}\x{a8}on";
+    Encode::_utf8_on($str2);
+
+    local $@;
+    $cv = eval <<"END_EVAL";
+        sub { [ \$CÃ¨on, $str1, $str2 ] };
+END_EVAL
+    
+    $err = $@;
+    ok !$err;
+
+    $names_av = fetch_pad_names($cv);
+
+    general_tests( $cv->(), $names_av, {
+        results => [ ({ SKIP => 1 }) x 3 ],
+        pad_size => {
+                  total => { cmp => 1, msg => 'Sub has one lexical, which it closed over.' },
+                  utf8      => { cmp => 0, msg => '' },
+                  invariant => { cmp => 1, msg => '' },
+                    },
+        vars    => [
+                { name => '$Ceon', msg => "Sub doesn't have [\$Ceon].", type => 'not ok' },
+                map({ { name => $_, msg => "Sub has [$_].", type => 'ok' } } $str1, $str2 ),
+                { name => $str3, msg => "Sub doesn't have [$str3]", type => 'not ok' },
+                   ],
+    });
+
+}
+
+#XXX: This will most certainly break once clean stashes are out.
+$cv = sub {
+    use utf8;
+    our $æ¦å½ = 10;
+    {
+        no strict 'refs';
+        my ($symref, $encoded_sym) = (__PACKAGE__ . "::æ¦å½") x 2;
+        utf8::encode($encoded_sym);
+        return [ $æ¦å½, ${$symref}, ${$encoded_sym} ];
+    }
+};
+
+my $flagged_our = my $unflagged_our = "\$\346\210\246\345\233\275";
+Encode::_utf8_on($flagged_our);
+
+$names_av = fetch_pad_names($cv);
+
+general_tests( $cv->(), $names_av, {
+    results => [
+                { cmp => '10', msg => 'Fetched UTF-8 our var.' },
+                ({ cmp => '10', msg => "Symref fetch." }) x 2,
+               ],
+    pad_size => {
+                    total     => { cmp => 3, msg => 'Sub has three lexicals.' },
+                    utf8      => { cmp => 1, msg => 'Japanese stored as UTF-8.' },
+                    invariant => { cmp => 2, msg => '' },
+                },
+    vars    => [
+                { name => "\$\x{6226}\x{56fd}", msg => "Sub has [\$\x{6226}\x{56fd}].", type => 'ok' },
+                { name => $flagged_our, msg => "Sub has [$flagged_our].", type => 'ok' },
+                { name => $unflagged_our, msg => "Sub doesn't have [$unflagged_our]", type => 'not ok' },
+               ],
+});
+
+
+{
+
+use utf8;
+{
+    my $test;
+    BEGIN {
+        $test = "t\x{c3}\x{a8}st";
+        Encode::_utf8_on($test);
+    }
+    use constant test => $test;
+}
+
+$cv = sub {
+    my $tÃ¨st = 'Good';
+
+    return [
+        $tÃ¨st,
+        pad_scalar(1, "tÃ¨st"),              #"UTF-8"
+        pad_scalar(1, "t\350st"),           #"Latin-1"
+        pad_scalar(1, "t\x{c3}\x{a8}st"),   #"Octal"
+        pad_scalar(1, test()),              #'UTF-8 enc'
+        ];
+};
+
+$names_av = fetch_pad_names($cv);
+
+general_tests( $cv->(), $names_av, {
+    results => [
+                { cmp => 'Good', msg => 'Fetched through Perl.' },
+                { cmp => 'Good', msg => "pad_scalar: UTF-8 works." },
+                { cmp => 'Good', msg => "pad_scalar: Latin-1 works." },
+                { cmp => 'NOT_IN_PAD', msg => "pad_scalar: Doesn't fetch through octets." },
+                { cmp => 'Good', msg => "pad_scalar: UTF-8-through-encoding works." },
+               ],
+    pad_size => {
+                    total     => { cmp => 1, msg => 'Sub has one lexical.' },
+                    utf8      => { cmp => 0, msg => '' },
+                    invariant => { cmp => 1, msg => '' },
+                },
+    vars    => [],
+});
+
+}
+
+$cv = do {
+    use utf8;
+    sub {
+        my $ãã³ãã³ = 'katakana';
+        my $ã«ãã«ã = 'hiragana';
+
+        return [
+                $ãã³ãã³,
+                $ã«ãã«ã,
+                pad_scalar(1, "ã«ãã«ã"),
+                pad_scalar(1, "\x{306b}\x{3053}\x{306b}\x{3053}"),
+                pad_scalar(1, "\343\201\253\343\201\223\343\201\253\343\201\223"),
+                pad_scalar(1, "ãã³ãã³"),
+                pad_scalar(1, "\x{30cb}\x{30b3}\x{30cb}\x{30b3}"),
+                pad_scalar(1, "\343\203\213\343\202\263\343\203\213\343\202\263"),
+            ];
+    }
+};
+
+$names_av = fetch_pad_names($cv);
+
+general_tests( $cv->(), $names_av, {
+    results => [
+                { cmp => 'katakana', msg => '' },
+                { cmp => 'hiragana', msg => '' },
+                { cmp => 'hiragana', msg => '' },
+                { cmp => 'hiragana', msg => '' },
+                { cmp => 'NOT_IN_PAD', msg => '' },
+                { cmp => 'katakana', msg => '' },
+                { cmp => 'katakana', msg => '' },
+                { cmp => 'NOT_IN_PAD', msg => '' },
+               ],
+    pad_size => {
+                    total     => { cmp => 2, msg => 'Sub has two lexicals.' },
+                    utf8      => { cmp => 2, msg => '' },
+                    invariant => { cmp => 0, msg => '' },
+                },
+    vars    => [],
+});
+
+{
+    {
+        my $utf8_e;
+        BEGIN {
+            $utf8_e = "e";
+            Encode::_utf8_on($utf8_e);
+        }
+        use constant utf8_e => $utf8_e;
+    }
+    my $e = 'Invariant';
+    is pad_scalar(1, "e"), pad_scalar(1, utf8_e), 'Fetches the same thing, even if invariant but with differing utf8ness.';
+}
+
+
+sub general_tests {
+    my ($results, $names_av, $tests) = @_;
+
+    for my $i (0..$#$results) {
+        next if $tests->{results}[$i]{SKIP};
+        is $results->[$i], $tests->{results}[$i]{cmp}, $tests->{results}[$i]{msg};
+    }
+
+    is @$names_av, $tests->{pad_size}{total}{cmp}, $tests->{pad_size}{total}{msg};
+    is grep( Encode::is_utf8($_), @$names_av), $tests->{pad_size}{utf8}{cmp};
+    is grep( !Encode::is_utf8($_), @$names_av), $tests->{pad_size}{invariant}{cmp};
+
+    for my $var (@{$tests->{vars}}) {
+        if ($var->{type} eq 'ok') {
+            ok $var->{name} ~~ $names_av, $var->{msg};
+        } else {
+            ok !($var->{name} ~~ $names_av), $var->{msg};
+        }
+    }
+
+}
diff --git a/pad.c b/pad.c
index afdc808..450fe2e 100644
--- a/pad.c
+++ b/pad.c
@@ -501,7 +501,7 @@ Perl_pad_add_name_pvn(pTHX_ const char *namepv, STRLEN namelen,
 
     PERL_ARGS_ASSERT_PAD_ADD_NAME_PVN;
 
-    if (flags & ~(padadd_OUR|padadd_STATE|padadd_NO_DUP_CHECK))
+    if (flags & ~(padadd_OUR|padadd_STATE|padadd_NO_DUP_CHECK|padadd_UTF8_NAME))
 	Perl_croak(aTHX_ "panic: pad_add_name_pvn illegal flag bits 0x%" UVxf,
 		   (UV)flags);
 
@@ -513,7 +513,7 @@ Perl_pad_add_name_pvn(pTHX_ const char *namepv, STRLEN namelen,
 	pad_check_dup(namesv, flags & padadd_OUR, ourstash);
     }
 
-    offset = pad_alloc_name(namesv, flags, typestash, ourstash);
+    offset = pad_alloc_name(namesv, flags & ~padadd_UTF8_NAME, typestash, ourstash);
 
     /* not yet introduced */
     COP_SEQ_RANGE_LOW_set(namesv, PERL_PADSEQ_INTRO);
@@ -813,7 +813,7 @@ Perl_pad_findmy_pvn(pTHX_ const char *namepv, STRLEN namelen, U32 flags)
 
     pad_peg("pad_findmy_pvn");
 
-    if (flags)
+    if (flags & ~padadd_UTF8_NAME)
 	Perl_croak(aTHX_ "panic: pad_findmy_pvn illegal flag bits 0x%" UVxf,
 		   (UV)flags);
 
@@ -874,6 +874,8 @@ Perl_pad_findmy_sv(pTHX_ SV *name, U32 flags)
     STRLEN namelen;
     PERL_ARGS_ASSERT_PAD_FINDMY_SV;
     namepv = SvPV(name, namelen);
+    if (SvUTF8(name))
+        flags |= padadd_UTF8_NAME;
     return pad_findmy_pvn(namepv, namelen, flags);
 }
 
@@ -968,6 +970,10 @@ S_pad_findlex(pTHX_ const char *namepv, STRLEN namelen, U32 flags, const CV* cv,
 
     PERL_ARGS_ASSERT_PAD_FINDLEX;
 
+    if (flags & ~padadd_UTF8_NAME)
+	Perl_croak(aTHX_ "panic: pad_findlex illegal flag bits 0x%" UVxf,
+		   (UV)flags);
+
     *out_flags = 0;
 
     DEBUG_Xv(PerlIO_printf(Perl_debug_log,
-- 
2.7.4