PUTBACK; /* hv_iternext and hv_iterval might clobber stack_sp */
while ((entry = hv_iternext(keys))) {
SPAGAIN;
- if (dokeys)
- XPUSHs(hv_iterkeysv(entry)); /* won't clobber stack_sp */
+ if (dokeys) {
+ SV* sv = hv_iterkeysv(entry);
+ if (HvUTF8KEYS((SV*)hv) && !DO_UTF8(sv)) {
+ STRLEN len, i;
+ char* s = SvPV(sv, len);
+ for (i = 0; i < len && NATIVE_IS_INVARIANT(s[i]); i++);
+ if (i < len) {
+ sv = newSVsv(sv);
+ sv_utf8_upgrade(sv);
+ }
+ }
+ XPUSHs(sv); /* won't clobber stack_sp */
+ }
if (dovalues) {
PUTBACK;
tmpstr = realhv ?
case SVt_PVHV:
if (HvSHAREKEYS(sv)) sv_catpv(d, "SHAREKEYS,");
if (HvLAZYDEL(sv)) sv_catpv(d, "LAZYDEL,");
+ if (HvUTF8KEYS(sv)) sv_catpv(d, "UTF8,");
break;
case SVt_PVGV:
if (GvINTRO(sv)) sv_catpv(d, "INTRO,");
RV = $ADDR
SV = PVHV\\($ADDR\\) at $ADDR
REFCNT = 2
- FLAGS = \\(SHAREKEYS\\)
- IV = 1
+ FLAGS = \\(SHAREKEYS,UTF8\\)
+ UV = 1
NV = $FLOAT
ARRAY = $ADDR \\(0:7, 1:1\\)
hash quality = 100.0%
RV = $ADDR
SV = PVHV\\($ADDR\\) at $ADDR
REFCNT = 2
- FLAGS = \\(SHAREKEYS\\)
- IV = 1
+ FLAGS = \\(SHAREKEYS,UTF8\\)
+ UV = 1
NV = 0
ARRAY = $ADDR \\(0:7, 1:1\\)
hash quality = 100.0%
#endif
}
}
+
if (is_utf8) {
STRLEN tmplen = klen;
/* See the note in hv_fetch(). --jhi */
key = (char*)bytes_from_utf8((U8*)key, &tmplen, &is_utf8);
klen = tmplen;
+ HvUTF8KEYS_on((SV*)hv);
}
if (!hash)
keysave = key = SvPV(keysv, klen);
is_utf8 = (SvUTF8(keysv) != 0);
- if (is_utf8)
+ if (is_utf8) {
key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8);
+ HvUTF8KEYS_on((SV*)hv);
+ }
if (!hash)
PERL_HASH(hash, key, klen);
else
hv_free_ent(hv, entry);
xhv->xhv_keys--; /* HvKEYS(hv)-- */
+ if (xhv->xhv_keys == 0)
+ HvUTF8KEYS_off(hv);
xhv->xhv_placeholders--;
return Nullsv;
}
else
hv_free_ent(hv, entry);
xhv->xhv_keys--; /* HvKEYS(hv)-- */
+ if (xhv->xhv_keys == 0)
+ HvUTF8KEYS_off(hv);
}
return sv;
}
else
hv_free_ent(hv, entry);
xhv->xhv_keys--; /* HvKEYS(hv)-- */
+ if (xhv->xhv_keys == 0)
+ HvUTF8KEYS_off(hv);
xhv->xhv_placeholders--;
return Nullsv;
}
else
hv_free_ent(hv, entry);
xhv->xhv_keys--; /* HvKEYS(hv)-- */
+ if (xhv->xhv_keys == 0)
+ HvUTF8KEYS_off(hv);
}
return sv;
}
if (SvRMAGICAL(hv))
mg_clear((SV*)hv);
+
+ HvUTF8KEYS_off(hv);
}
STATIC void
#define HvTOTALKEYS(hv) XHvTOTALKEYS((XPVHV*) SvANY(hv))
#define HvPLACEHOLDERS(hv) XHvPLACEHOLDERS((XPVHV*) SvANY(hv))
-
#define HvSHAREKEYS(hv) (SvFLAGS(hv) & SVphv_SHAREKEYS)
#define HvSHAREKEYS_on(hv) (SvFLAGS(hv) |= SVphv_SHAREKEYS)
#define HvSHAREKEYS_off(hv) (SvFLAGS(hv) &= ~SVphv_SHAREKEYS)
+#define HvUTF8KEYS(hv) (SvFLAGS(hv) & SVphv_UTF8KEYS)
+#define HvUTF8KEYS_on(hv) (SvFLAGS(hv) |= SVphv_UTF8KEYS)
+#define HvUTF8KEYS_off(hv) (SvFLAGS(hv) &= ~SVphv_UTF8KEYS)
+
#define HvLAZYDEL(hv) (SvFLAGS(hv) & SVphv_LAZYDEL)
#define HvLAZYDEL_on(hv) (SvFLAGS(hv) |= SVphv_LAZYDEL)
#define HvLAZYDEL_off(hv) (SvFLAGS(hv) &= ~SVphv_LAZYDEL)
=item *
-Strings and patterns may contain characters that have an ordinal value
-larger than 255.
+Strings (including hash keys) and regular expression patterns may
+contain characters that have an ordinal value larger than 255.
If you use a Unicode editor to edit your program, Unicode characters
may occur directly within the literal strings in one of the various
This works only for characters with a code 0x100 and above.
Additionally, if you
+
use charnames ':full';
+
you can use the C<\N{...}> notation, putting the official Unicode character
name within the curlies. For example, C<\N{WHITE SMILING FACE}>.
This works for all characters that have names.
=item *
-If an appropriate L<encoding> is specified,
-identifiers within the Perl script may contain Unicode alphanumeric
-characters, including ideographs. (You are currently on your own when
-it comes to using the canonical forms of characters--Perl doesn't
-(yet) attempt to canonicalize variable names for you.)
+If an appropriate L<encoding> is specified, identifiers within the
+Perl script may contain Unicode alphanumeric characters, including
+ideographs. (You are currently on your own when it comes to using the
+canonical forms of characters--Perl doesn't (yet) attempt to
+canonicalize variable names for you.)
=item *
Perl tries really hard to work both with Unicode and the old byte
oriented world: most often this is nice, but sometimes this causes
-problems. See L</BUGS> for example how sometimes using locales
-with Unicode can help with these problems.
+problems.
=back
there is some attempt to apply 8-bit locale info to characters in the
range 0..255, but this is demonstrably incorrect for locales that use
characters above that range when mapped into Unicode. It will also
-tend to run slower. Avoidance of locales is strongly encouraged,
-with one known expection, see the next paragraph.
-
-If the keys of a hash are "mixed", that is, some keys are Unicode,
-while some keys are "byte", the keys may behave differently in regular
-expressions since the definition of character classes like C</\w/>
-is different for byte strings and character strings. This problem can
-sometimes be helped by using an appropriate locale (see L<perllocale>).
-Another way is to force all the strings to be character encoded by
-using utf8::upgrade() (see L<utf8>).
+tend to run slower. Use of locales with Unicode is discouraged.
Some functions are slower when working on UTF-8 encoded strings than
-on byte encoded strings. All functions that need to hop over
+on byte encoded strings. All functions that need to hop over
characters such as length(), substr() or index() can work B<much>
faster when the underlying data are byte-encoded. Witness the
following benchmark:
EXTEND(SP, 2);
if (entry) {
- PUSHs(hv_iterkeysv(entry)); /* won't clobber stack_sp */
+ SV* sv = hv_iterkeysv(entry);
+ if (HvUTF8KEYS((SV*)hash) && !DO_UTF8(sv)) {
+ STRLEN len, i;
+ char* s = SvPV(sv, len);
+ for (i = 0; i < len && NATIVE_IS_INVARIANT(s[i]); i++);
+ if (i < len) {
+ sv = newSVsv(sv);
+ sv_utf8_upgrade(sv);
+ }
+ }
+ PUSHs(sv); /* won't clobber stack_sp */
if (gimme == G_ARRAY) {
SV *val;
PUTBACK;
#define SVphv_SHAREKEYS 0x20000000 /* keys live on shared string table */
#define SVphv_LAZYDEL 0x40000000 /* entry in xhv_eiter must be deleted */
+#define SVphv_UTF8KEYS 0x80000000 /* keys when fetched are UTF8 */
#define SVprv_WEAKREF 0x80000000 /* Weak reference */
$| = 1;
-print "1..892\n";
+print "1..903\n";
BEGIN {
chdir 't' if -d 't';
++$test;
}
}
+
+
+{
+ my $test = 893;
+
+ print "# Unicode hash keys and \\w\n";
+ # This is not really a regex test but regexes bring
+ # out the issue nicely.
+ use strict;
+ my $u3 = "f\x{df}\x{100}";
+ my $u2 = substr($u3,0,2);
+ my $u1 = substr($u2,0,1);
+ my %u = ( $u1 => $u1, $u2 => $u2, $u3 => $u3 );
+
+ for (keys %u) {
+ print /^\w+$/ && $u{$_} =~ /^\w+$/ ?
+ "ok $test\n" : "not ok $test\n";
+ $test++;
+ }
+
+ for (each %u) {
+ print /^\w+$/ && $u{$_} =~ /^\w+$/ ?
+ "ok $test\n" : "not ok $test\n";
+ $test++;
+ }
+
+ for (%u) {
+ print /^\w+$/ && $u{$_} =~ /^\w+$/ ?
+ "ok $test\n" : "not ok $test\n";
+ $test++;
+ }
+}
+