From bfa0ee78b652802412c3cab86bb873ed67ea6550 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 2 Feb 2014 12:20:42 -0700 Subject: [PATCH] 'use utf8' should imply /u regex matching This should be true even if the pattern isn't in utf8. --- regcomp.c | 14 ++++++++++---- t/re/pat.t | 6 +++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/regcomp.c b/regcomp.c index ab5fc04..bcd159c 100644 --- a/regcomp.c +++ b/regcomp.c @@ -6289,7 +6289,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* ignore the utf8ness if the pattern is 0 length */ RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); - RExC_uni_semantics = 0; + + /* 'use utf8' in the program indicates Unicode rules are wanted */ + RExC_uni_semantics = (PL_hints & HINT_UTF8); + RExC_contains_locale = 0; RExC_contains_i = 0; pRExC_state->runtime_code_qr = NULL; @@ -6340,10 +6343,13 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (initial_charset == REGEX_LOCALE_CHARSET) { RExC_contains_locale = 1; } - else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + else if ((RExC_utf8 || RExC_uni_semantics) + && initial_charset == REGEX_DEPENDS_CHARSET) + { - /* Set to use unicode semantics if the pattern is in utf8 and has the - * 'depends' charset specified, as it means unicode when utf8 */ + /* Set to use unicode semantics if the 'depends' charset is specified, + * and either the pattern is in utf8 (as it means unicode when utf8), + * or we already know we want unicode rules */ set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); } diff --git a/t/re/pat.t b/t/re/pat.t index 91274e6..6211065 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -20,7 +20,7 @@ BEGIN { require './test.pl'; } -plan tests => 712; # Update this when adding/deleting tests. +plan tests => 713; # Update this when adding/deleting tests. 
run_tests() unless caller; @@ -1538,6 +1538,10 @@ EOP like "\x{AA}", qr/a?[\W_]/d, "\\W with /d synthetic start class works"; } + { + use utf8; + unlike("\xe0", qr/\W/, "'use utf8' implies /u"); + } } # End of sub run_tests -- 2.7.4