From a5961de5f4215b5cd376e88c8c5d267c7f7123f6 Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Sat, 22 Dec 2001 02:47:08 +0000 Subject: [PATCH] Unicode casefolding fixes. p4raw-id: //depot/perl@13843 --- op.c | 4 ++++ regcomp.c | 22 ++++++++++------------ regexec.c | 7 +++++-- t/op/pat.t | 35 ++++++++++++++++++++++++++++++++++- 4 files changed, 53 insertions(+), 15 deletions(-) diff --git a/op.c b/op.c index 9b1556e..c733052 100644 --- a/op.c +++ b/op.c @@ -3127,12 +3127,16 @@ Perl_pmruntime(pTHX_ OP *o, OP *expr, OP *repl) p = SvPV(pat, plen); pm->op_pmflags |= PMf_SKIPWHITE; } + if (DO_UTF8(pat) || (PL_hints & HINT_UTF8)) + pm->op_pmdynflags |= PMdf_UTF8; PM_SETRE(pm, CALLREGCOMP(aTHX_ p, p + plen, pm)); if (strEQ("\\s+", PM_GETRE(pm)->precomp)) pm->op_pmflags |= PMf_WHITE; op_free(expr); } else { + if (PL_hints & HINT_UTF8) + pm->op_pmdynflags |= PMdf_UTF8; if (pm->op_pmflags & PMf_KEEP || !(PL_hints & HINT_RE_EVAL)) expr = newUNOP((!(PL_hints & HINT_RE_EVAL) ? OP_REGCRESET diff --git a/regcomp.c b/regcomp.c index 53d8947..463b778 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1690,17 +1690,15 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) if (exp == NULL) FAIL("NULL regexp argument"); - /* XXXX This looks very suspicious... */ - if (pm->op_pmdynflags & PMdf_CMP_UTF8) - RExC_utf8 = 1; - else - RExC_utf8 = 0; + RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; RExC_precomp = exp; - DEBUG_r(if (!PL_colorset) reginitcolors()); - DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", - PL_colors[4],PL_colors[5],PL_colors[0], - (int)(xend - exp), RExC_precomp, PL_colors[1])); + DEBUG_r({ + if (!PL_colorset) reginitcolors(); + PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", + PL_colors[4],PL_colors[5],PL_colors[0], + (int)(xend - exp), RExC_precomp, PL_colors[1]); + }); RExC_flags16 = pm->op_pmflags; RExC_sawback = 0; @@ -3967,10 +3965,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) } else #endif - for (i = prevvalue; i <= ceilvalue; i++) - ANYOF_BITMAP_SET(ret, i); + for (i = prevvalue; i <= ceilvalue; i++) + ANYOF_BITMAP_SET(ret, i); } - if (value > 255) { + if (value > 255 || UTF) { ANYOF_FLAGS(ret) |= ANYOF_UNICODE; if (prevvalue < value) Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", diff --git a/regexec.c b/regexec.c index 35a0a6c..b7528e7 100644 --- a/regexec.c +++ b/regexec.c @@ -4110,9 +4110,12 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) match = TRUE; else if (flags & ANYOF_FOLD) { STRLEN ulen; - U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; + U8 tmpbuf[UTF8_MAXLEN_FOLD+1]; - toLOWER_utf8(p, tmpbuf, &ulen); + to_utf8_fold(p, tmpbuf, &ulen); + if (swash_fetch(sw, tmpbuf, do_utf8)) + match = TRUE; + to_utf8_upper(p, tmpbuf, &ulen); if (swash_fetch(sw, tmpbuf, do_utf8)) match = TRUE; } diff --git a/t/op/pat.t b/t/op/pat.t index 077b957..ee7a736 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -6,7 +6,7 @@ $| = 1; -print "1..757\n"; +print "1..769\n"; BEGIN { chdir 't' if -d 't'; @@ -2291,3 +2291,36 @@ print "# some Unicode properties\n"; print "not " unless "A\x{100}" =~ /A/i; print "ok 757\n"; } + +{ + use charnames ':full'; + + print "# LATIN LETTER A WITH GRAVE\n"; + my $lower = "\N{LATIN SMALL LETTER A WITH GRAVE}"; + my $UPPER = "\N{LATIN CAPITAL LETTER A WITH GRAVE}"; + + print $lower =~ m/$UPPER/i ? "ok 758\n" : "not ok 758\n"; + print $UPPER =~ m/$lower/i ? "ok 759\n" : "not ok 759\n"; + print $lower =~ m/[$UPPER]/i ? "ok 760\n" : "not ok 760\n"; + print $UPPER =~ m/[$lower]/i ? "ok 761\n" : "not ok 761\n"; + + print "# GREEK LETTER ALPHA WITH VRACHY\n"; + + $lower = "\N{GREEK CAPITAL LETTER ALPHA WITH VRACHY}"; + $UPPER = "\N{GREEK SMALL LETTER ALPHA WITH VRACHY}"; + + print $lower =~ m/$UPPER/i ? "ok 762\n" : "not ok 762\n"; + print $UPPER =~ m/$lower/i ? "ok 763\n" : "not ok 763\n"; + print $lower =~ m/[$UPPER]/i ? "ok 764\n" : "not ok 764\n"; + print $UPPER =~ m/[$lower]/i ? "ok 765\n" : "not ok 765\n"; + + print "# LATIN LETTER Y WITH DIAERESIS\n"; + + $lower = "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"; + $UPPER = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; + + print $lower =~ m/$UPPER/i ? "ok 766\n" : "not ok 766\n"; + print $UPPER =~ m/$lower/i ? "ok 767\n" : "not ok 767\n"; + print $lower =~ m/[$UPPER]/i ? "ok 768\n" : "not ok 768\n"; + print $UPPER =~ m/[$lower]/i ? "ok 769\n" : "not ok 769\n"; +} -- 2.7.4