If Unicode keys are entered to a hash, a bit is turned on.

author Jarkko Hietaniemi <jhi@iki.fi>

Fri, 22 Mar 2002 04:07:13 +0000 (04:07 +0000)

committer Jarkko Hietaniemi <jhi@iki.fi>

Fri, 22 Mar 2002 04:07:13 +0000 (04:07 +0000)
author Jarkko Hietaniemi <jhi@iki.fi>
Fri, 22 Mar 2002 04:07:13 +0000 (04:07 +0000)
committer Jarkko Hietaniemi <jhi@iki.fi>
Fri, 22 Mar 2002 04:07:13 +0000 (04:07 +0000)
diff --git a/doop.c b/doop.c

index e2faa87..20379a9 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -1336,8 +1336,19 @@ Perl_do_kv(pTHX)
      PUTBACK;   /* hv_iternext and hv_iterval might clobber stack_sp */
      while ((entry = hv_iternext(keys))) {
         SPAGAIN;
-       if (dokeys)
-           XPUSHs(hv_iterkeysv(entry));        /* won't clobber stack_sp */
+       if (dokeys) {
+           SV* sv = hv_iterkeysv(entry);
+           if (HvUTF8KEYS((SV*)hv) && !DO_UTF8(sv)) {
+               STRLEN len, i;
+               char* s = SvPV(sv, len);
+               for (i = 0; i < len && NATIVE_IS_INVARIANT(s[i]); i++);
+               if (i < len) {
+                   sv = newSVsv(sv);
+                   sv_utf8_upgrade(sv);
+               }
+           }
+           XPUSHs(sv); /* won't clobber stack_sp */
+       }
         if (dovalues) {
             PUTBACK;
             tmpstr = realhv ?
diff --git a/dump.c b/dump.c

index b4b37bb..48a3b38 100644 (file)
--- a/dump.c
+++ b/dump.c
@@ -980,6 +980,7 @@ Perl_do_sv_dump(pTHX_ I32 level, PerlIO *file, SV *sv, I32 nest, I32 maxnest, bo
      case SVt_PVHV:
         if (HvSHAREKEYS(sv))    sv_catpv(d, "SHAREKEYS,");
         if (HvLAZYDEL(sv))      sv_catpv(d, "LAZYDEL,");
+       if (HvUTF8KEYS(sv))     sv_catpv(d, "UTF8,");
         break;
      case SVt_PVGV:
         if (GvINTRO(sv))        sv_catpv(d, "INTRO,");
diff --git a/ext/Devel/Peek/Peek.t b/ext/Devel/Peek/Peek.t

index bd42d93..f577369 100644 (file)
--- a/ext/Devel/Peek/Peek.t
+++ b/ext/Devel/Peek/Peek.t
@@ -347,8 +347,8 @@ do_test(19,
    RV = $ADDR
    SV = PVHV\\($ADDR\\) at $ADDR
      REFCNT = 2
-    FLAGS = \\(SHAREKEYS\\)
-    IV = 1
+    FLAGS = \\(SHAREKEYS,UTF8\\)
+    UV = 1
      NV = $FLOAT
      ARRAY = $ADDR  \\(0:7, 1:1\\)
      hash quality = 100.0%
@@ -373,8 +373,8 @@ do_test(19,
    RV = $ADDR
    SV = PVHV\\($ADDR\\) at $ADDR
      REFCNT = 2
-    FLAGS = \\(SHAREKEYS\\)
-    IV = 1
+    FLAGS = \\(SHAREKEYS,UTF8\\)
+    UV = 1
      NV = 0
      ARRAY = $ADDR  \\(0:7, 1:1\\)
      hash quality = 100.0%
diff --git a/hv.c b/hv.c

index 41aa8bb..f92e31e 100644 (file)
--- a/hv.c
+++ b/hv.c
@@ -488,11 +488,13 @@ Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 has
  #endif
         }
      }
+
      if (is_utf8) {
         STRLEN tmplen = klen;
         /* See the note in hv_fetch(). --jhi */
         key = (char*)bytes_from_utf8((U8*)key, &tmplen, &is_utf8);
         klen = tmplen;
+       HvUTF8KEYS_on((SV*)hv);
      }
  
      if (!hash)
@@ -615,8 +617,10 @@ Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, register U32 hash)
      keysave = key = SvPV(keysv, klen);
      is_utf8 = (SvUTF8(keysv) != 0);
  
-    if (is_utf8)
+    if (is_utf8) {
         key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8);
+       HvUTF8KEYS_on((SV*)hv);
+    }
  
      if (!hash)
         PERL_HASH(hash, key, klen);
@@ -773,6 +777,8 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags)
                 else
                     hv_free_ent(hv, entry);
                 xhv->xhv_keys--; /* HvKEYS(hv)-- */
+               if (xhv->xhv_keys == 0)
+                   HvUTF8KEYS_off(hv);
                 xhv->xhv_placeholders--;
                 return Nullsv;
             }
@@ -810,6 +816,8 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags)
             else
                 hv_free_ent(hv, entry);
             xhv->xhv_keys--; /* HvKEYS(hv)-- */
+           if (xhv->xhv_keys == 0)
+               HvUTF8KEYS_off(hv);
         }
         return sv;
      }
@@ -920,6 +928,8 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash)
             else
                 hv_free_ent(hv, entry);
             xhv->xhv_keys--; /* HvKEYS(hv)-- */
+          if (xhv->xhv_keys == 0)
+               HvUTF8KEYS_off(hv);
             xhv->xhv_placeholders--;
             return Nullsv;
         }
@@ -956,6 +966,8 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash)
             else
                 hv_free_ent(hv, entry);
             xhv->xhv_keys--; /* HvKEYS(hv)-- */
+           if (xhv->xhv_keys == 0)
+               HvUTF8KEYS_off(hv);
         }
         return sv;
      }
@@ -1478,6 +1490,8 @@ Perl_hv_clear(pTHX_ HV *hv)
  
      if (SvRMAGICAL(hv))
         mg_clear((SV*)hv);
+
+    HvUTF8KEYS_off(hv);
  }
  
  STATIC void
diff --git a/hv.h b/hv.h

index 369bf3c..3d51075 100644 (file)
--- a/hv.h
+++ b/hv.h
@@ -159,11 +159,14 @@ C<SV*>.
  #define HvTOTALKEYS(hv)                XHvTOTALKEYS((XPVHV*)  SvANY(hv))
  #define HvPLACEHOLDERS(hv)     XHvPLACEHOLDERS((XPVHV*)  SvANY(hv))
  
-
  #define HvSHAREKEYS(hv)                (SvFLAGS(hv) & SVphv_SHAREKEYS)
  #define HvSHAREKEYS_on(hv)     (SvFLAGS(hv) |= SVphv_SHAREKEYS)
  #define HvSHAREKEYS_off(hv)    (SvFLAGS(hv) &= ~SVphv_SHAREKEYS)
  
+#define HvUTF8KEYS(hv)         (SvFLAGS(hv) & SVphv_UTF8KEYS)
+#define HvUTF8KEYS_on(hv)      (SvFLAGS(hv) |= SVphv_UTF8KEYS)
+#define HvUTF8KEYS_off(hv)     (SvFLAGS(hv) &= ~SVphv_UTF8KEYS)
+
  #define HvLAZYDEL(hv)          (SvFLAGS(hv) & SVphv_LAZYDEL)
  #define HvLAZYDEL_on(hv)       (SvFLAGS(hv) |= SVphv_LAZYDEL)
  #define HvLAZYDEL_off(hv)      (SvFLAGS(hv) &= ~SVphv_LAZYDEL)
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod

index 4cb8325..9ba32ee 100644 (file)
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -113,8 +113,8 @@ Character semantics have the following effects:
  
  =item *
  
-Strings and patterns may contain characters that have an ordinal value
-larger than 255.
+Strings (including hash keys) and regular expression patterns may
+contain characters that have an ordinal value larger than 255.
  
  If you use a Unicode editor to edit your program, Unicode characters
  may occur directly within the literal strings in one of the various
@@ -128,18 +128,20 @@ hexadecimal, into the curlies. For instance, a smiley face is C<\x{263A}>.
  This works only for characters with a code 0x100 and above.
  
  Additionally, if you
+
     use charnames ':full';
+
  you can use the C<\N{...}> notation, putting the official Unicode character
  name within the curlies. For example, C<\N{WHITE SMILING FACE}>.
  This works for all characters that have names.
  
  =item *
  
-If an appropriate L<encoding> is specified,
-identifiers within the Perl script may contain Unicode alphanumeric
-characters, including ideographs.  (You are currently on your own when
-it comes to using the canonical forms of characters--Perl doesn't
-(yet) attempt to canonicalize variable names for you.)
+If an appropriate L<encoding> is specified, identifiers within the
+Perl script may contain Unicode alphanumeric characters, including
+ideographs.  (You are currently on your own when it comes to using the
+canonical forms of characters--Perl doesn't (yet) attempt to
+canonicalize variable names for you.)
  
  =item *
  
@@ -846,8 +848,7 @@ B<any subsequent file open>, is UTF-8.
  
  Perl tries really hard to work both with Unicode and the old byte
  oriented world: most often this is nice, but sometimes this causes
-problems.  See L</BUGS> for example how sometimes using locales
-with Unicode can help with these problems.
+problems.
  
  =back
  
@@ -959,19 +960,10 @@ Use of locales with Unicode data may lead to odd results.  Currently
  there is some attempt to apply 8-bit locale info to characters in the
  range 0..255, but this is demonstrably incorrect for locales that use
  characters above that range when mapped into Unicode.  It will also
-tend to run slower.  Avoidance of locales is strongly encouraged,
-with one known expection, see the next paragraph.
-
-If the keys of a hash are "mixed", that is, some keys are Unicode,
-while some keys are "byte", the keys may behave differently in regular
-expressions since the definition of character classes like C</\w/>
-is different for byte strings and character strings.  This problem can
-sometimes be helped by using an appropriate locale (see L<perllocale>).
-Another way is to force all the strings to be character encoded by
-using utf8::upgrade() (see L<utf8>).
+tend to run slower.  Use of locales with Unicode is discouraged.
  
  Some functions are slower when working on UTF-8 encoded strings than
-on byte encoded strings. All functions that need to hop over
+on byte encoded strings.  All functions that need to hop over
  characters such as length(), substr() or index() can work B<much>
  faster when the underlying data are byte-encoded. Witness the
  following benchmark:
diff --git a/pp.c b/pp.c

index 15bf351..757b4f0 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -3686,7 +3686,17 @@ PP(pp_each)
  
      EXTEND(SP, 2);
      if (entry) {
-       PUSHs(hv_iterkeysv(entry));     /* won't clobber stack_sp */
+        SV* sv = hv_iterkeysv(entry);
+       if (HvUTF8KEYS((SV*)hash) && !DO_UTF8(sv)) {
+           STRLEN len, i;
+           char* s = SvPV(sv, len);
+           for (i = 0; i < len && NATIVE_IS_INVARIANT(s[i]); i++);
+           if (i < len) {
+               sv = newSVsv(sv);
+               sv_utf8_upgrade(sv);
+           }
+       }
+       PUSHs(sv);      /* won't clobber stack_sp */
         if (gimme == G_ARRAY) {
             SV *val;
             PUTBACK;
diff --git a/sv.h b/sv.h

index b956768..9671bd7 100644 (file)
--- a/sv.h
+++ b/sv.h
@@ -235,6 +235,7 @@ perform the upgrade if necessary.  See C<svtype>.
  
  #define SVphv_SHAREKEYS 0x20000000     /* keys live on shared string table */
  #define SVphv_LAZYDEL  0x40000000      /* entry in xhv_eiter must be deleted */
+#define SVphv_UTF8KEYS         0x80000000      /* keys when fetched are UTF8 */
  
  #define SVprv_WEAKREF   0x80000000      /* Weak reference */
  
diff --git a/t/op/pat.t b/t/op/pat.t

index b5dff4b..001a5b0 100755 (executable)
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -6,7 +6,7 @@
  
  $| = 1;
  
-print "1..892\n";
+print "1..903\n";
  
  BEGIN {
      chdir 't' if -d 't';
@@ -2771,3 +2771,36 @@ print "# some Unicode properties\n";
         ++$test;
      }
  }
+
+
+{
+    my $test = 893;
+
+    print "# Unicode hash keys and \\w\n";
+    # This is not really a regex test but regexes bring
+    # out the issue nicely.
+    use strict;
+    my $u3 = "f\x{df}\x{100}";
+    my $u2 = substr($u3,0,2);
+    my $u1 = substr($u2,0,1);
+    my %u = ( $u1 => $u1, $u2 => $u2, $u3 => $u3 );  
+
+    for (keys %u) {
+       print /^\w+$/ && $u{$_} =~ /^\w+$/ ?
+           "ok $test\n" : "not ok $test\n";
+       $test++;
+   }
+
+    for (each %u) {
+       print /^\w+$/ && $u{$_} =~ /^\w+$/ ?
+           "ok $test\n" : "not ok $test\n";
+       $test++;
+   }
+
+    for (%u) {
+       print /^\w+$/ && $u{$_} =~ /^\w+$/ ?
+           "ok $test\n" : "not ok $test\n";
+       $test++;
+   }
+}
+
author	Jarkko Hietaniemi <jhi@iki.fi>
	Fri, 22 Mar 2002 04:07:13 +0000 (04:07 +0000)
committer	Jarkko Hietaniemi <jhi@iki.fi>
	Fri, 22 Mar 2002 04:07:13 +0000 (04:07 +0000)
doop.c		patch \| blob \| history
dump.c		patch \| blob \| history
ext/Devel/Peek/Peek.t		patch \| blob \| history
hv.c		patch \| blob \| history
hv.h		patch \| blob \| history
pod/perlunicode.pod		patch \| blob \| history
pp.c		patch \| blob \| history
sv.h		patch \| blob \| history
t/op/pat.t		patch \| blob \| history