Introduce C<use feature "unicode_strings">

author Rafael Garcia-Suarez <rgs@consttype.org>

Sun, 20 Dec 2009 15:23:36 +0000 (16:23 +0100)

committer Rafael Garcia-Suarez <rgs@consttype.org>

Sun, 20 Dec 2009 15:28:36 +0000 (16:28 +0100)
author Rafael Garcia-Suarez <rgs@consttype.org>
Sun, 20 Dec 2009 15:23:36 +0000 (16:23 +0100)
committer Rafael Garcia-Suarez <rgs@consttype.org>
Sun, 20 Dec 2009 15:28:36 +0000 (16:28 +0100)
diff --git a/MANIFEST b/MANIFEST

index 76a5568..2d79cbc 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -3454,6 +3454,7 @@ lib/ExtUtils/XSSymSet.pm  on VMS, manage linker symbols when building extensions
  lib/fastcwd.pl                 a faster but more dangerous getcwd
  lib/feature.pm                 Pragma to enable new syntax
  lib/feature.t                  See if features work
+lib/feature/unicode_strings.t  See if feature "unicode_strings" works
  lib/File/Basename.pm           Emulate the basename program
  lib/File/Basename.t            See if File::Basename works
  lib/File/CheckTree.pm          Perl module supporting wholesale file mode validation
@@ -3492,8 +3493,6 @@ lib/importenv.pl          Perl routine to get environment into variables
  lib/integer.pm                 For "use integer"
  lib/integer.t                  For "use integer" testing
  lib/Internals.t                        For Internals::* testing
-lib/legacy.pm                  Pragma to preserve legacy behavior
-lib/legacy.t                   For "use legacy" testing
  lib/less.pm                    For "use less"
  lib/less.t                     See if less support works
  lib/locale.pm                  For "use locale"
diff --git a/Porting/Maintainers.pl b/Porting/Maintainers.pl

index c4a1a79..bb5e61b 100755 (executable)
--- a/Porting/Maintainers.pl
+++ b/Porting/Maintainers.pl
@@ -1941,6 +1941,7 @@ use File::Glob qw(:case);
                                 lib/exceptions.pl
                                 lib/fastcwd.pl
                                 lib/feature.{pm,t}
+                               lib/feature/
                                 lib/filetest.{pm,t}
                                 lib/find.pl
                                 lib/finddepth.pl
diff --git a/lib/feature.pm b/lib/feature.pm

index 915b5c7..649ccb3 100644 (file)
--- a/lib/feature.pm
+++ b/lib/feature.pm
@@ -1,19 +1,24 @@
  package feature;
  
-our $VERSION = '1.13';
+our $VERSION = '1.14';
  
  # (feature name) => (internal name, used in %^H)
  my %feature = (
-    switch => 'feature_switch',
-    say    => "feature_say",
-    state  => "feature_state",
+    switch          => 'feature_switch',
+    say             => "feature_say",
+    state           => "feature_state",
+    unicode_strings => "feature_unicode",
  );
  
+# This gets set (for now) in $^H as well as in %^H,
+# for runtime speed of the uc/lc/ucfirst/lcfirst functions.
+our $hint_uni8bit = 0x00000800;
+
  # NB. the latest bundle must be loaded by the -E switch (see toke.c)
  
  my %feature_bundle = (
      "5.10" => [qw(switch say state)],
-    "5.11" => [qw(switch say state)],
+    "5.11" => [qw(switch say state unicode_strings)],
  );
  
  # special case
@@ -43,9 +48,9 @@ feature - Perl pragma to enable new syntactic features
  
  It is usually impossible to add new syntax to Perl without breaking
  some existing programs. This pragma provides a way to minimize that
-risk. New syntactic constructs can be enabled by C<use feature 'foo'>,
-and will be parsed only when the appropriate feature pragma is in
-scope.
+risk. New syntactic constructs, or new semantic meanings to older
+constructs, can be enabled by C<use feature 'foo'>, and will be parsed
+only when the appropriate feature pragma is in scope.
  
  =head2 Lexical effect
  
@@ -95,6 +100,80 @@ variables.
  
  See L<perlsub/"Persistent Private Variables"> for details.
  
+=head2 the 'unicode_strings' feature
+
+C<use feature 'unicode_strings'> tells the compiler to treat
+strings containing codepoints from 128 to 255 as Unicode. It is available
+starting with Perl 5.11.3.
+
+In greater detail:
+
+This feature modifies the semantics for the 128 characters on ASCII
+systems that have the 8th bit set.  (See L</EBCDIC platforms> below for
+EBCDIC systems.) By default, unless C<S<use locale>> is specified, or the
+scalar containing such a character is known by Perl to be encoded in UTF8,
+the semantics are essentially that the characters have an ordinal number,
+and that's it.  They are caseless, and aren't anything: they're not
+controls, not letters, not punctuation, ..., not anything.
+
+This behavior stems from when Perl did not support Unicode, and ASCII was the
+only known character set outside of C<S<use locale>>.  In order to not
+possibly break pre-Unicode programs, these characters have retained their old
+non-meanings, except when it is clear to Perl that Unicode is what is meant,
+for example by calling utf8::upgrade() on a scalar, or if the scalar also
+contains characters that are only available in Unicode.  Then these 128
+characters take on their Unicode meanings.
+
+The problem with this behavior is that a scalar that encodes these characters
+has a different meaning depending on if it is stored as utf8 or not.
+In general, the internal storage method should not affect the
+external behavior.
+
+The behavior is known to have effects on these areas:
+
+=over 4
+
+=item *
+
+Changing the case of a scalar, that is, using C<uc()>, C<ucfirst()>, C<lc()>,
+and C<lcfirst()>, or C<\L>, C<\U>, C<\u> and C<\l> in regular expression
+substitutions.
+
+=item *
+
+Using caseless (C</i>) regular expression matching
+
+=item *
+
+Matching a number of properties in regular expressions, such as C<\w>
+
+=item *
+
+User-defined case change mappings.  You can create a C<ToUpper()> function, for
+example, which overrides Perl's built-in case mappings.  The scalar must be
+encoded in utf8 for your function to actually be invoked.
+
+=back
+
+B<This lack of semantics for these characters is currently the default,>
+outside of C<use locale>.  See below for EBCDIC.
+
+To turn on B<case changing semantics only> for these characters, use
+C<use feature "unicode_strings">.
+
+The other old (legacy) behaviors regarding these characters are currently
+unaffected by this pragma.
+
+=head4 EBCDIC platforms
+
+On EBCDIC platforms, the situation is somewhat different.  The legacy
+semantics are whatever the underlying semantics of the native C language
+library are.  Each of the three EBCDIC encodings currently known by Perl is an
+isomorph of the Latin-1 character set.  That means every character in Latin-1
+has a corresponding EBCDIC equivalent, and vice-versa.  Specifying C<S<use
+feature 'unicode_strings'>> currently makes sure that all EBCDIC characters have the same
+B<casing only> semantics as their corresponding Latin-1 characters.
+
  =head1 FEATURE BUNDLES
  
  It's possible to load a whole slew of features in one go, using
@@ -164,6 +243,7 @@ sub import {
             unknown_feature($name);
         }
         $^H{$feature{$name}} = 1;
+        $^H |= $hint_uni8bit if $name eq 'unicode_strings';
      }
  }
  
@@ -173,6 +253,7 @@ sub unimport {
      # A bare C<no feature> should disable *all* features
      if (!@_) {
         delete @^H{ values(%feature) };
+        $^H &= ~ $hint_uni8bit;
         return;
      }
  
@@ -194,6 +275,7 @@ sub unimport {
         }
         else {
             delete $^H{$feature{$name}};
+            $^H &= ~ $hint_uni8bit if $name eq 'unicode_strings';
         }
      }
  }
diff --git a/lib/legacy.t b/lib/feature/unicode_strings.t

similarity index 98%

rename from lib/legacy.t

rename to lib/feature/unicode_strings.t

index 1f0cce9..dce34bd 100644 (file)
--- a/lib/legacy.t
+++ b/lib/feature/unicode_strings.t
@@ -84,7 +84,7 @@ for my  $prefix (\%empty, \%posix, \%cyrillic, \%latin1) {
              my $cp = sprintf "U+%04X", $i;
  
              # First try using latin1 (Unicode) semantics.
-            no legacy "unicode8bit";    
+            use feature "unicode_strings";    
  
              my $phrase = 'with uni8bit';
              my $char = chr($i);
@@ -112,7 +112,7 @@ for my  $prefix (\%empty, \%posix, \%cyrillic, \%latin1) {
              }
  
              # Then try with posix semantics.
-            use legacy "unicode8bit";
+            no feature "unicode_strings";
              $phrase = 'no uni8bit';
  
              # These don't contribute anything in this case.
diff --git a/lib/legacy.pm b/lib/legacy.pm

deleted file mode 100755 (executable)

index 1ea7c07..0000000
--- a/lib/legacy.pm
+++ /dev/null
@@ -1,199 +0,0 @@
-package legacy;
-
-our $VERSION = '1.00';
-
-$unicode8bit::hint_not_uni8bit = 0x00000800;
-
-my %legacy_bundle = (
-    "5.10" => [qw(unicode8bit)],
-    "5.11" => [qw(unicode8bit)],
-);
-
-my %legacy = ( 'unicode8bit' => '0' );
-
-=head1 NAME
-
-legacy - Perl pragma to preserve legacy behaviors or enable new non-default behaviors
-
-=head1 SYNOPSIS
-
- use legacy ':5.10'; # Keeps semantics the same as in perl 5.10
-
- use legacy qw(unicode8bit);
-
- no legacy;
-
- no legacy qw(unicode8bit);
-
-=head1 DESCRIPTION
-
-Some programs may rely on behaviors that for others are problematic or
-even wrong.  A new version of Perl may change behaviors from past ones,
-and when it is viewed that the old way of doing things may be required
-to still be supported, the new behavior will be able to be turned off by using
-this pragma.
-
-Additionally, a new behavior may be supported in a new version of Perl, but
-for whatever reason the default remains the old one.  This pragma can enable
-the new behavior.
-
-Like other pragmas (C<use feature>, for example), C<use legacy qw(foo)> will
-only make the legacy behavior for "foo" available from that point to the end of
-the enclosing block.
-
-=head2 B<use legacy>
-
-Preserve the old way of doing things when a new version of Perl is
-released that would otherwise change the behavior.
-
-The one current possibility is:
-
-=head3 unicode8bit
-
-Use legacy semantics for the 128 characters on ASCII systems that have the 8th
-bit set.  (See L</EBCDIC platforms> below for EBCDIC systems.)  Unless
-C<S<use locale>> is specified, or the scalar containing such a character is
-known by Perl to be encoded in UTF8, the semantics are essentially that the
-characters have an ordinal number, and that's it.  They are caseless, and
-aren't anything: they're not controls, not letters, not punctuation, ..., not
-anything.
-
-This behavior stems from when Perl did not support Unicode, and ASCII was the
-only known character set outside of C<S<use locale>>.  In order to not
-possibly break pre-Unicode programs, these characters have retained their old
-non-meanings, except when it is clear to Perl that Unicode is what is meant,
-for example by calling utf8::upgrade() on a scalar, or if the scalar also
-contains characters that are only available in Unicode.  Then these 128
-characters take on their Unicode meanings.
-
-The problem with this behavior is that a scalar that encodes these characters
-has a different meaning depending on if it is stored as utf8 or not.
-In general, the internal storage method should not affect the
-external behavior.
-
-The behavior is known to have effects on these areas:
-
-=over 4
-
-=item *
-
-Changing the case of a scalar, that is, using C<uc()>, C<ucfirst()>, C<lc()>,
-and C<lcfirst()>, or C<\L>, C<\U>, C<\u> and C<\l> in regular expression
-substitutions.
-
-=item *
-
-Using caseless (C</i>) regular expression matching
-
-=item *
-
-Matching a number of properties in regular expressions, such as C<\w>
-
-=item *
-
-User-defined case change mappings.  You can create a C<ToUpper()> function, for
-example, which overrides Perl's built-in case mappings.  The scalar must be
-encoded in utf8 for your function to actually be invoked.
-
-=back
-
-B<This lack of semantics for these characters is currently the default,>
-outside of C<use locale>.  See below for EBCDIC.
-To turn on B<case changing semantics only> for these characters, use
-C<S<no legacy>>.
-The other legacy behaviors regarding these characters are currently
-unaffected by this pragma.
-
-=head4 EBCDIC platforms
-
-On EBCDIC platforms, the situation is somewhat different.  The legacy
-semantics are whatever the underlying semantics of the native C language
-library are.  Each of the three EBCDIC encodings currently known by Perl is an
-isomorph of the Latin-1 character set.  That means every character in Latin-1
-has a corresponding EBCDIC equivalent, and vice-versa.  Specifying C<S<no
-legacy>> currently makes sure that all EBCDIC characters have the same
-B<casing only> semantics as their corresponding Latin-1 characters.
-
-=head2 B<no legacy>
-
-Turn on a new behavior in a version of Perl that understands
-it but has it turned off by default.  For example, C<no legacy 'foo'> turns on
-behavior C<foo> in the lexical scope of the pragma.  C<no legacy>
-without any modifier turns on all new behaviors known to the pragma.
-
-=head1 LEGACY BUNDLES
-
-It's possible to turn off all new behaviors past a given release by
-using a I<legacy bundle>, which is the name of the release prefixed with
-a colon, to distinguish it from an individual legacy behavior.
-
-Specifying sub-versions such as the C<0> in C<5.10.0> in legacy bundles has
-no effect: legacy bundles are guaranteed to be the same for all sub-versions.
-
-Legacy bundles are not allowed with C<no legacy>.
-
-=cut
-
-sub import {
-    my $class = shift;
-    if (@_ == 0) {
-        croak("No legacy behaviors specified");
-    }
-    while (@_) {
-        my $name = shift(@_);
-        if (substr($name, 0, 1) eq ":") {
-            my $v = substr($name, 1);
-            if (!exists $legacy_bundle{$v}) {
-                $v =~ s/^([0-9]+)\.([0-9]+).[0-9]+$/$1.$2/;
-                if (!exists $legacy_bundle{$v}) {
-                    unknown_legacy_bundle(substr($name, 1));
-                }
-            }
-            unshift @_, @{$legacy_bundle{$v}};
-            next;
-        }
-        $^H |= $unicode8bit::hint_not_uni8bit;   # The only valid thing as of yet
-    }
-}
-
-
-sub unimport {
-    my $class = shift;
-
-    # A bare C<no legacy> should disable *all* legacy behaviors
-    if (!@_) {
-        unshift @_, keys(%legacy);
-    }
-
-    while (@_) {
-        my $name = shift;
-        if (substr($name, 0, 1) eq ":") {
-            croak(sprintf('Legacy bundles (%s) are not allowed in "no legacy"',
-                $name));
-        }
-        if (!exists($legacy{$name})) {
-            unknown_legacy($name);
-        }
-        else {
-            $^H &= ~ $unicode8bit::hint_not_uni8bit; # The only valid thing now
-        }
-    }
-}
-
-sub unknown_legacy {
-    my $legacy = shift;
-    croak(sprintf('Legacy "%s" is not supported by Perl %vd', $legacy, $^V));
-}
-
-sub unknown_legacy_bundle {
-    my $legacy = shift;
-    croak(sprintf('Legacy bundle "%s" is not supported by Perl %vd',
-        $legacy, $^V));
-}
-
-sub croak {
-    require Carp;
-    Carp::croak(@_);
-}
-
-1;
diff --git a/perl.h b/perl.h

index adff169..5988e78 100644 (file)
--- a/perl.h
+++ b/perl.h
@@ -4773,7 +4773,7 @@ enum {            /* pass one of these to get_vtbl */
  #define HINT_BLOCK_SCOPE       0x00000100
  #define HINT_STRICT_SUBS       0x00000200 /* strict pragma */
  #define HINT_STRICT_VARS       0x00000400 /* strict pragma */
-#define HINT_NOT_UNI_8_BIT     0x00000800 /* unicode8bit pragma */
+#define HINT_UNI_8_BIT         0x00000800 /* unicode_strings feature */
  
  /* The HINT_NEW_* constants are used by the overload pragma */
  #define HINT_NEW_INTEGER       0x00001000
diff --git a/t/lib/feature/bundle b/t/lib/feature/bundle

index a869c75..11fde32 100644 (file)
--- a/t/lib/feature/bundle
+++ b/t/lib/feature/bundle
@@ -7,6 +7,25 @@ say "Hello", "world";
  EXPECT
  Helloworld
  ########
+# Standard feature bundle, no 5.11
+use feature ":5.10";
+say ord uc chr 233;
+EXPECT
+233
+########
+# Standard feature bundle, 5.11
+use feature ":5.11";
+say ord uc chr 233;
+EXPECT
+201
+########
+# Standard feature bundle, 5.11
+use feature ":5.11";
+use utf8;
+say ord "\ué"; # this is utf8
+EXPECT
+201
+########
  # more specific: 5.10.0 maps to 5.10
  use feature ":5.10.0";
  say "Hello", "world";
diff --git a/t/uni/overload.t b/t/uni/overload.t

index da9b07b..7bf4841 100644 (file)
--- a/t/uni/overload.t
+++ b/t/uni/overload.t
@@ -35,7 +35,7 @@ package main;
  
  # These tests are based on characters 128-255 not having latin1, and hence
  # Unicode, semantics
-use legacy 'unicode8bit';
+# no feature "unicode_strings";
  
  # Bug 34297
  foreach my $t ("ASCII", "B\366se") {
diff --git a/toke.c b/toke.c

index db9eca3..19241c4 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -583,7 +583,7 @@ S_missingterm(pTHX_ char *s)
         ((0 != (PL_hints & HINT_LOCALIZE_HH))                           \
             && S_feature_is_enabled(aTHX_ STR_WITH_LEN(name)))
  /* The longest string we pass in.  */
-#define MAX_FEATURE_LEN (sizeof("switch")-1)
+#define MAX_FEATURE_LEN (sizeof("unicode_strings")-1)
  
  /*
   * S_feature_is_enabled
diff --git a/utf8.h b/utf8.h

index 8fef274..9eed545 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -206,7 +206,7 @@ encoded character.
  
  #define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES)
  #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)
-#define IN_UNI_8_BIT ( (! (CopHINTS_get(PL_curcop) & HINT_NOT_UNI_8_BIT)) \
+#define IN_UNI_8_BIT ( (CopHINTS_get(PL_curcop) & HINT_UNI_8_BIT) \
                         && ! IN_LOCALE_RUNTIME && ! IN_BYTES)
  
  #define UTF8_ALLOW_EMPTY               0x0001
author	Rafael Garcia-Suarez <rgs@consttype.org>
	Sun, 20 Dec 2009 15:23:36 +0000 (16:23 +0100)
committer	Rafael Garcia-Suarez <rgs@consttype.org>
	Sun, 20 Dec 2009 15:28:36 +0000 (16:28 +0100)
MANIFEST		patch \| blob \| history
Porting/Maintainers.pl		patch \| blob \| history
lib/feature.pm		patch \| blob \| history
lib/feature/unicode_strings.t	[moved from lib/legacy.t with 98% similarity]	patch \| blob \| history
lib/legacy.pm	[deleted file]	patch \| blob \| history
perl.h		patch \| blob \| history
t/lib/feature/bundle		patch \| blob \| history
t/uni/overload.t		patch \| blob \| history
toke.c		patch \| blob \| history
utf8.h		patch \| blob \| history