From d50a4f90cab527593b2dd218f71b66a6be555490 Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Sun, 27 Feb 2011 18:44:43 -0700
Subject: [PATCH] Handle [folds] of 0-255 without swashes

Commit 56ca34cada940c7f6aae9a59da266e541530041e had the side effect of
causing regular expressions with things like [a-z], or even just [k] to
go out to disk to read tables to create swashes because it knew that
some of those characters matched outside the bitmap (and due to
l1_char_class_tab.h it knew which ones had those matches), but it didn't
know what the characters were that participated in those folds.

This patch hard-codes the Unicode 6.0 rules into regcomp.c for the
code points 0-255, so that the very slow utf8_heavy is not invoked on
them.  (Code points above 255 will continue to invoke it.)  It would,
of course, be better if these rules could be regen'd into regcomp.c, as
there is a risk that the standard will change, and the code will not.
But I don't think that has ever happened; in other words, I think that
the rules haven't changed so far since Day 1 of Unicode.  (That would
not be the case if we were doing simple case folding, as the capital
sharp ss which folds to U+00DF was added later.)  And the Standard is
getting more stable in this area.  I believe one of their stability
policies now forbids them from adding something that simply folds to
one of the characters that already has a fold, such as M and m.
Ligatures are frowned on, and I doubt that new ones would be encoded,
so that leaves a new Unicode character that folds to a Latin-1
character plus some sort of mark.  For those, this code is a no-op, so
those aren't a problem either.
---
 pod/perldiag.pod |   9 ++++
 regcomp.c        | 123 +++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 119 insertions(+), 13 deletions(-)

diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index aae2dd3..ce2a5d2 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -3607,6 +3607,15 @@ redirected it with select().)
 "Can't locate object method \"%s\" via package \"%s\"".  It often means
 that a method requires a package that has not been loaded.
 
+=item Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;
+
+(W regex, deprecated) You used a regular expression with
+case-insensitive matching, and there is a bug in Perl in which the
+built-in regular expression folding rules are not accurate.  This may
+lead to incorrect results.  Please report this as a bug using the
+"perlbug" utility.  (This message is marked deprecated so that it will
+be turned on by default.)
+
 =item Perl_my_%s() not available
 
 (F) Your platform has very uncommon byte-order and integer size,
diff --git a/regcomp.c b/regcomp.c
index d767265..76579a0 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -9222,12 +9222,15 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8
      * Locale folding is done at run-time, so this function should not be
      * called for nodes that are for locales.
      *
-     * This function simply sets the bit corresponding to the fold of the input
+     * This function sets the bit corresponding to the fold of the input
      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
      * 'F' is 'f'.
      *
-     * It also sets any necessary flags, and returns the number of bits that
-     * actually changed from 0 to 1 */
+     * It also knows about the characters that are in the bitmap that have
+     * folds that are matchable only outside it, and sets the appropriate lists
+     * and flags.
+     *
+     * It returns the number of bits that actually changed from 0 to 1 */
 
     U8 stored = 0;
     U8 fold;
@@ -9242,17 +9245,111 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8
         ANYOF_BITMAP_SET(node, fold);
         stored++;
     }
-    if ((_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED))
-	|| (! UNI_SEMANTICS
+    if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
+	/* Certain Latin1 characters have matches outside the bitmap.  To get
+	 * here, 'value' is one of those characters.   None of these matches is
+	 * valid for ASCII characters under /aa, which have been excluded by
+	 * the 'if' above.  The matches fall into three categories:
+	 * 1) They are singly folded-to or -from an above 255 character, as
+	 *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
+	 *    WITH DIAERESIS;
+	 * 2) They are part of a multi-char fold with another character in the
+	 *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
+	 * 3) They are part of a multi-char fold with a character not in the
+	 *    bitmap, such as various ligatures.
+	 * We aren't dealing fully with multi-char folds, except we do deal
+	 * with the pattern containing a character that has a multi-char fold
+	 * (not so much the inverse).
+	 * For types 1) and 3), the matches only happen when the target string
+	 * is utf8; that's not true for 2), and we set a flag for it.
+	 *
+	 * The code below adds to the passed in inversion list the single fold
+	 * closures for 'value'.  The values are hard-coded here so that an
+	 * innocent-looking character class, like /[ks]/i won't have to go out
+	 * to disk to find the possible matches.  XXX It would be better to
+	 * generate these via regen, in case a new version of the Unicode
+	 * standard adds new mappings, though that is not really likely. */
+	switch (value) {
+	    case 'k':
+	    case 'K':
+		/* KELVIN SIGN */
+		*invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
+		break;
+	    case 's':
+	    case 'S':
+		/* LATIN SMALL LETTER LONG S */
+		*invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
+		break;
+	    case MICRO_SIGN:
+		*invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+						 GREEK_SMALL_LETTER_MU);
+		*invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+						 GREEK_CAPITAL_LETTER_MU);
+		break;
+	    case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
+	    case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
+		/* ANGSTROM SIGN */
+		*invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
+		if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
+		    *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+						     PL_fold_latin1[value]);
+		}
+		break;
+	    case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
+		*invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+					LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
+		break;
+	    case LATIN_SMALL_LETTER_SHARP_S:
+
+		/* Under /d and /u, this can match the two chars "ss" */
+		if (! MORE_ASCII_RESTRICTED) {
+		    add_alternate(alternate_ptr, (U8 *) "ss", 2);
+
+		    /* And under /u, it can match even if the target is not
+		     * utf8 */
+		    if (UNI_SEMANTICS) {
+			ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
+		    }
+		}
+		break;
+	    case 'F': case 'f':
+	    case 'I': case 'i':
+	    case 'L': case 'l':
+	    case 'T': case 't':
+		/* These all are targets of multi-character folds, which can
+		 * occur with only Latin1 characters in the fold, so they
+		 * can match if the target string isn't UTF-8 */
+		ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
+		break;
+	    case 'A': case 'a':
+	    case 'H': case 'h':
+	    case 'J': case 'j':
+	    case 'N': case 'n':
+	    case 'W': case 'w':
+	    case 'Y': case 'y':
+		/* These all are targets of multi-character folds, which occur
+		 * only with a non-Latin1 character as part of the fold, so
+		 * they can't match unless the target string is in UTF-8, so no
+		 * action here is necessary */
+		break;
+	    default:
+		/* Use deprecated warning to increase the chances of this
+		 * being output */
+		ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
+		break;
+	}
+    }
+    else if (DEPENDS_SEMANTICS
 	    && ! isASCII(value)
-	    && PL_fold_latin1[value] != value))
-    {   /* A character that has a fold outside of Latin1 matches outside the
-           bitmap, but only when the target string is utf8.  Similarly when we
-           don't have unicode semantics for the above ASCII Latin-1 characters,
-           and they have a fold, they should match if the target is utf8, and
-	   not otherwise.  We add the character here, and calculate the fold
-	   later, with the other nonbitmap folds */
-	*invlist_ptr = add_range_to_invlist(*invlist_ptr, value, value);
+	    && PL_fold_latin1[value] != value)
+    {
+	   /* Under DEPENDS rules, non-ASCII Latin1 characters match their
+	    * folds only when the target string is in UTF-8.  We add the fold
+	    * here to the list of things to match outside the bitmap, which
+	    * won't be looked at unless it is UTF8 (or else if something else
+	    * says to look even if not utf8, but those things better not happen
+	    * under DEPENDS semantics). */
+	*invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
     }
 
     return stored;
-- 
2.7.4