From 56ca34cada940c7f6aae9a59da266e541530041e Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Wed, 2 Feb 2011 12:01:34 -0700
Subject: [PATCH] Move ANYOF folding from regexec to regcomp

This is for security as well as performance.  It allows Unicode properties to
not be matched case sensitively.  As a result the swash inversion hash is
converted from having utf8 keys to numeric, code point, keys.

It also for the first time fixes the bug where /i doesn't work for a code point
not at the end of a range in a bracketed character class has a multi-character
fold
---
 lib/unicore/mktables    |  15 ++++
 pod/perldelta.pod       |  24 +++++
 pod/perlrecharclass.pod |  25 +++++-
 pod/perlunicode.pod     |  27 ++++++
 regcomp.c               | 230 ++++++++++++++++++++++++++++++++++++++++++------
 regexec.c               |   2 +
 utf8.c                  |  15 ++--
 7 files changed, 300 insertions(+), 38 deletions(-)
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 5e3f0c5..ab90ba3 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -12395,6 +12395,7 @@ sub make_table_pod_entries($) {
     my $string_count = clarify_number($count);
     my $status = $input_table->status;
     my $status_info = $input_table->status_info;
+    my $caseless_equivalent = $input_table->caseless_equivalent;
 
     my $entry_for_first_table; # The entry for the first table output.
                            # Almost certainly, it is the parent.
@@ -12618,6 +12619,11 @@ sub make_table_pod_entries($) {
                 }
             }
 
+            if ($caseless_equivalent != 0) {
+                $parenthesized .=  '; ' if $parenthesized ne "";
+                $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
+            }
+
 
             # Warn if this property isn't the same as one that a
             # semi-casual user might expect.  The other components of this
@@ -13060,6 +13066,15 @@ Numbers in (parentheses) indicate the total number of code points matched by
 the property.  For emphasis, those properties that match no code points at all
 are listed as well in a separate section following the table.
 
+Most properties match the same code points regardless of whether C<"/i">
+case-insensitive matching is specified or not.  But a few properties are
+affected.  These are shown with the notation
+
+ (/i= other_property)
+
+in the second column.  Under case-insensitive matching they match the
+same code pode points as the property "other_property".
+
 There is no description given for most non-Perl defined properties (See
 $unicode_reference_url for that).
 
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 9d6e258..3f0d2d8 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -51,6 +51,30 @@ XXX For a release on a stable branch, this section aspires to be:
 
 [ List each incompatible change as a =head2 entry ]
 
+=head2 Most C<\p{}> properties are now immune from case-insensitive matching
+
+For most Unicode properties, it doesn't make sense to have them match
+differently under C</i> case-insensitive matching than not.  And doing
+so leads to unexpected results and potential security holes.  For
+example
+
+ m/\p{ASCII_Hex_Digit}+/i
+
+could previously match non-ASCII characters because of the Unicode
+matching rules.  There were a number of bugs in this feature until an
+earlier release in the 5.13 series.  Now this release reverts, and
+removes the feature completely except for the few properties where
+people have come to expect it, namely the ones where casing is an
+integral part of their functionality, such as C<m/\p{Uppercase}/i> and
+C<m/\p{Lowercase}/i>, both of which match the exact same code points,
+namely those matched by C<m/\p{Cased}/i>.  Details are in
+L<perlrecharclass/Unicode Properties>.
+
+User-defined property handlers that need to match differently under
+C</i> must change to read the new boolean parameter passed it which is
+non-zero if case-insensitive matching is in effect; 0 if not.  See
+L<perluniprops/User-Defined Character Properties>.
+
 =head1 Deprecations
 
 XXX Any deprecated features, syntax, modules etc. should be listed here.
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
index e7f6f8a..de2256f 100644
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -273,13 +273,32 @@ C</\pLl/> is valid, but means something different.
 It matches a two character string: a letter (Unicode property C<\pL>),
 followed by a lowercase C<l>.
 
-For more details, see L<perlunicode/Unicode Character Properties>; for a
+Note that almost all properties are immune to case-insensitive matching.
+That is, adding a C</i> regular expression modifier does not change what
+they match.  There are two sets that are affected.  The first set is
+C<Uppercase_Letter>,
+C<Lowercase_Letter>,
+and C<Titlecase_Letter>,
+all of which match C<Cased_Letter> under C</i> matching.
+And the second set is
+C<Uppercase>,
+C<Lowercase>,
+and C<Titlecase>,
+all of which match C<Cased> under C</i> matching.
+(The difference between these sets is that some things, such as Roman
+Numerals come in both upper and lower case so they are C<Cased>, but
+aren't considered to be letters, so they aren't C<Cased_Letter>s.)
+This set also includes its subsets C<PosixUpper> and C<PosixLower> both
+of which under C</i> matching match C<PosixAlpha>.
+
+For more details on Unicode properties, see L<perlunicode/Unicode
+Character Properties>; for a
 complete list of possible properties, see
-L<perluniprops/Properties accessible through \p{} and \P{}>.
+L<perluniprops/Properties accessible through \p{} and \P{}>,
+which notes all forms that have C</i> differences.
 It is also possible to define your own properties. This is discussed in
 L<perlunicode/User-Defined Character Properties>.
 
-
 =head4 Examples
 
  "a"  =~  /\w/      # Match, "a" is a 'word' character.
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index 360af1d..7f3a795 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -349,6 +349,27 @@ You can also use negation in both C<\p{}> and C<\P{}> by introducing a caret
 (^) between the first brace and the property name: C<\p{^Tamil}> is
 equal to C<\P{Tamil}>.
 
+Almost all properties are immune to case-insensitive matching.  That is,
+adding a C</i> regular expression modifier does not change what they
+match.  There are two sets that are affected.
+The first set is
+C<Uppercase_Letter>,
+C<Lowercase_Letter>,
+and C<Titlecase_Letter>,
+all of which match C<Cased_Letter> under C</i> matching.
+And the second set is
+C<Uppercase>,
+C<Lowercase>,
+and C<Titlecase>,
+all of which match C<Cased> under C</i> matching.
+This set also includes its subsets C<PosixUpper> and C<PosixLower> both
+of which under C</i> matching match C<PosixAlpha>.
+(The difference between these sets is that some things, such as Roman
+Numerals come in both upper and lower case so they are C<Cased>, but aren't considered to be
+letters, so they aren't C<Cased_Letter>s.)
+L<perluniprops> includes a notation for all forms that have C</i>
+differences.
+
 =head3 B<General_Category>
 
 Every Unicode character is assigned a general category, which is the "most
@@ -773,6 +794,12 @@ C<\p> or C<\P> construct.
 
 
 Note that the effect is compile-time and immutable once defined.
+However the subroutines are passed a single parameter which is 0 if
+case-sensitive matching is in effect, and non-zero if caseless matching
+is in effect.  The subroutine may return different values depending on
+the value of the flag, and one set of values will immutably be in effect
+for all case-sensitive matches; the other set for all case-insensitive
+matches.
 
 The subroutines must return a specially-formatted string, with one
 or more newline-separated lines.  Each line must be one of the following:
diff --git a/regcomp.c b/regcomp.c
index 0c5ad08..8007dd3 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -8872,14 +8872,14 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
 ANYOF_##NAME:                                                                  \
 	for (value = 0; value < 256; value++)                                  \
 	    if (TEST)                                                          \
-	    stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
+	    stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);  \
     yesno = '+';                                                               \
     what = WORD;                                                               \
     break;                                                                     \
 case ANYOF_N##NAME:                                                            \
 	for (value = 0; value < 256; value++)                                  \
 	    if (!TEST)                                                         \
-	    stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
+	    stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);  \
     yesno = '!';                                                               \
     what = WORD;                                                               \
     break
@@ -8894,14 +8894,14 @@ ANYOF_##NAME:                                                                  \
     else if (UNI_SEMANTICS) {                                                  \
         for (value = 0; value < 256; value++) {                                \
             if (TEST_8(value)) stored +=                                       \
-                      S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
+                      S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);  \
         }                                                                      \
     }                                                                          \
     else {                                                                     \
         for (value = 0; value < 128; value++) {                                \
             if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
 		S_set_regclass_bit(aTHX_ pRExC_state, ret,                     \
-			           (U8) UNI_TO_NATIVE(value));                 \
+			           (U8) UNI_TO_NATIVE(value), &nonbitmap);                 \
         }                                                                      \
     }                                                                          \
     yesno = '+';                                                               \
@@ -8912,18 +8912,18 @@ case ANYOF_N##NAME:                                                            \
     else if (UNI_SEMANTICS) {                                                  \
         for (value = 0; value < 256; value++) {                                \
             if (! TEST_8(value)) stored +=                                     \
-		    S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);    \
+		    S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);    \
         }                                                                      \
     }                                                                          \
     else {                                                                     \
         for (value = 0; value < 128; value++) {                                \
             if (! TEST_7(UNI_TO_NATIVE(value))) stored += S_set_regclass_bit(  \
-			aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value));    \
+			aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap);    \
         }                                                                      \
 	if (ASCII_RESTRICTED) {                                                \
 	    for (value = 128; value < 256; value++) {                          \
              stored += S_set_regclass_bit(                                     \
-			   aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value)); \
+			   aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \
 	    }                                                                  \
 	    ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8;                  \
 	}                                                                      \
@@ -8957,7 +8957,7 @@ case ANYOF_N##NAME:                                                            \
 #endif
 
 STATIC U8
-S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
+S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
 {
 
     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
@@ -8991,6 +8991,10 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8
            don't have unicode semantics for the above ASCII Latin-1 characters,
            and they have a fold, they should match if the target is utf8, and
            not otherwise */
+	if (! *nonbitmap_ptr) {
+	    *nonbitmap_ptr = _new_invlist(2);
+	}
+	*nonbitmap_ptr = add_range_to_invlist(*nonbitmap_ptr, value, value);
 	ANYOF_FLAGS(node) |= ANYOF_UTF8;
     }
 
@@ -8999,7 +9003,7 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8
 
 
 PERL_STATIC_INLINE U8
-S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
+S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
 {
     /* This inline function sets a bit in the bitmap if not already set, and if
      * appropriate, its fold, returning the number of bits that actually
@@ -9015,7 +9019,7 @@ S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 valu
     stored = 1;
 
     if (FOLD && ! LOC) {	/* Locale folds aren't known until runtime */
-	stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
+	stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value, nonbitmap_ptr);
     }
 
     return stored;
@@ -9043,6 +9047,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
     bool need_class = 0;
     SV *listsv = NULL;
     UV n;
+    HV* nonbitmap = NULL;
     AV* unicode_alternate  = NULL;
 #ifdef EBCDIC
     UV literal_endpoint = 0;
@@ -9064,8 +9069,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
     /* Assume we are going to generate an ANYOF node. */
     ret = reganode(pRExC_state, ANYOF, 0);
 
-    if (!SIZE_ONLY)
+
+    if (!SIZE_ONLY) {
 	ANYOF_FLAGS(ret) = 0;
+    }
 
     if (UCHARAT(RExC_parse) == '^') {	/* Complement of range. */
 	RExC_naughty++;
@@ -9350,9 +9357,9 @@ parseit:
 
 		    if (prevvalue < 256) {
 			stored +=
-                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
+                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue, &nonbitmap);
 			stored +=
-                         S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
+                         S_set_regclass_bit(aTHX_ pRExC_state, ret, '-', &nonbitmap);
 		    }
 		    else {
 			ANYOF_FLAGS(ret) |= ANYOF_UTF8;
@@ -9404,7 +9411,7 @@ parseit:
 		    else {
 			for (value = 0; value < 128; value++)
 			    stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
 		    }
 		    yesno = '+';
 		    what = NULL;	/* Doesn't match outside ascii, so
@@ -9416,7 +9423,7 @@ parseit:
 		    else {
 			for (value = 128; value < 256; value++)
 			    stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
 		    }
 		    ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
 		    yesno = '!';
@@ -9429,7 +9436,7 @@ parseit:
 			/* consecutive digits assumed */
 			for (value = '0'; value <= '9'; value++)
 			    stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);
 		    }
 		    yesno = '+';
 		    what = POSIX_CC_UNI_NAME("Digit");
@@ -9441,10 +9448,10 @@ parseit:
 			/* consecutive digits assumed */
 			for (value = 0; value < '0'; value++)
 			    stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);
 			for (value = '9' + 1; value < 256; value++)
 			    stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);
 		    }
 		    yesno = '!';
 		    what = POSIX_CC_UNI_NAME("Digit");
@@ -9494,13 +9501,17 @@ parseit:
 		    }
 		    if (!SIZE_ONLY)
 			stored +=
-                            S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
+                            S_set_regclass_bit(aTHX_ pRExC_state, ret, '-', &nonbitmap);
 		} else
 		    range = 1;	/* yeah, it's a range! */
 		continue;	/* but do it the next time */
 	    }
 	}
 
+	if (value > 255) {
+	    RExC_uni_semantics = 1;
+	}
+
 	/* now is the next time */
 	if (!SIZE_ONLY) {
 	    if (prevvalue < 256) {
@@ -9517,25 +9528,32 @@ parseit:
 			for (i = prevvalue; i <= ceilvalue; i++)
 			    if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
 				stored +=
-                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
+                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap);
 			    }
 		    } else {
 			for (i = prevvalue; i <= ceilvalue; i++)
 			    if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
 				stored +=
-                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
+                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap);
 			    }
 		    }
 		}
 		else
 #endif
 		      for (i = prevvalue; i <= ceilvalue; i++) {
-			stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
+			stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap);
 	              }
 	  }
-	  if (value > 255 || UTF) {
-	        const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
-		const UV natvalue      = NATIVE_TO_UNI(value);
+	  if (value > 255) {
+	    const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
+	    const UV natvalue      = NATIVE_TO_UNI(value);
+	    if (! nonbitmap) {
+		nonbitmap = _new_invlist(2);
+	    }
+	    nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
+	    ANYOF_FLAGS(ret) |= ANYOF_UTF8;
+	}
+#if 0
 
 		/* If the code point requires utf8 to represent, and we are not
 		 * folding, it can't match unless the target is in utf8.  Only
@@ -9632,6 +9650,7 @@ parseit:
 		    }
 		}
 	    }
+#endif
 #ifdef EBCDIC
 	    literal_endpoint = 0;
 #endif
@@ -9646,6 +9665,167 @@ parseit:
         return ret;
     /****** !SIZE_ONLY AFTER HERE *********/
 
+    /* Finish up the non-bitmap entries */
+    if (nonbitmap) {
+	UV* nonbitmap_array;
+	UV i;
+
+	/* If folding, we add to the list all characters that could fold to or
+	 * from the ones already on the list */
+	if (FOLD) {
+	    HV* fold_intersection;
+	    UV* fold_list;
+
+	    /* This is a list of all the characters that participate in folds
+	     * (except marks, etc in multi-char folds */
+	    if (! PL_utf8_foldable) {
+		SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
+		PL_utf8_foldable = _swash_to_invlist(swash);
+	    }
+
+	    /* This is a hash that for a particular fold gives all characters
+	     * that are involved in it */
+	    if (! PL_utf8_foldclosures) {
+
+		/* If the folds haven't been read in, call a fold
+		    * function to force that */
+		if (! PL_utf8_tofold) {
+		    U8 dummy[UTF8_MAXBYTES+1];
+		    STRLEN dummy_len;
+		    to_utf8_fold((U8*) "A", dummy, &dummy_len);
+		}
+		PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
+	    }
+
+	    /* Only the characters in this class that participate in folds need
+	     * be checked.  Get the intersection of this class and all the
+	     * possible characters that are foldable.  This can quickly narrow
+	     * down a large class */
+	    fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
+
+	    /* Now look at the foldable characters in this class individually */
+	    fold_list = invlist_array(fold_intersection);
+	    for (i = 0; i < invlist_len(fold_intersection); i++) {
+		UV j;
+
+		/* The next entry is the beginning of the range that is in the
+		 * class */
+		UV start = fold_list[i++];
+
+
+		/* The next entry is the beginning of the next range, which
+		 * isn't in the class, so the end of the current range is one
+		 * less than that */
+		UV end = fold_list[i] - 1;
+
+		/* Look at every character in the range */
+		for (j = start; j <= end; j++) {
+
+		    /* Get its fold */
+		    U8 foldbuf[UTF8_MAXBYTES_CASE+1];
+		    STRLEN foldlen;
+		    const UV f = to_uni_fold(j, foldbuf, &foldlen);
+
+		    if (foldlen > (STRLEN)UNISKIP(f)) {
+
+			/* Any multicharacter foldings (disallowed in
+			 * lookbehind patterns) require the following
+			 * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
+			 * E folds into "pq" and F folds into "rst", all other
+			 * characters fold to single characters.  We save away
+			 * these multicharacter foldings, to be later saved as
+			 * part of the additional "s" data. */
+			if (! RExC_in_lookbehind) {
+			    /* XXX Discard this fold if any are latin1 and LOC */
+			    SV *sv;
+
+			    if (!unicode_alternate) {
+				unicode_alternate = newAV();
+			    }
+			    sv = newSVpvn_utf8((char*)foldbuf, foldlen, TRUE);
+			    av_push(unicode_alternate, sv);
+
+			    /* This node is variable length */
+			    OP(ret) = ANYOFV;
+			    ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+			}
+		    }
+		    else { /* Single character fold */
+			SV** listp;
+
+			/* Consider "k" =~ /[K]/i.  The line above would have
+			 * just folded the 'k' to itself, and that isn't going
+			 * to match 'K'.  So we look through the closure of
+			 * everything that folds to 'k'.  That will find the
+			 * 'K'.  Initialize the list, if necessary */
+
+			/* The data structure is a hash with the keys every
+			 * character that is folded to, like 'k', and the
+			 * values each an array of everything that folds to its
+			 * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
+			if ((listp = hv_fetch(PL_utf8_foldclosures,
+				      (char *) foldbuf, foldlen, FALSE)))
+			{
+			    AV* list = (AV*) *listp;
+			    IV k;
+			    for (k = 0; k <= av_len(list); k++) {
+				SV** c_p = av_fetch(list, k, FALSE);
+				UV c;
+				if (c_p == NULL) {
+				    Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+				}
+				c = SvUV(*c_p);
+
+				if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
+				    stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) c, &nonbitmap);
+				}
+				    /* It may be that the code point is already
+				     * in this range or already in the bitmap,
+				     * XXX THink about LOC
+				     * in which case we need do nothing */
+				else if ((c < start || c > end)
+					 && (c > 255
+					     || ! ANYOF_BITMAP_TEST(ret, c)))
+				{
+				    nonbitmap = add_range_to_invlist(nonbitmap, c, c);
+				}
+			    }
+			}
+		    }
+		}
+	    }
+	    invlist_destroy(fold_intersection);
+	} /* End of processing all the folds */
+
+	/*  Here have the full list of items to match that aren't in the
+	 *  bitmap.  Convert to the structure that the rest of the code is
+	 *  expecting.   XXX That rest of the code should convert to this
+	 *  structure */
+	nonbitmap_array = invlist_array(nonbitmap);
+	for (i = 0; i < invlist_len(nonbitmap); i++) {
+
+	    /* The next entry is the beginning of the range that is in the
+	     * class */
+	    UV start = nonbitmap_array[i++];
+
+	    /* The next entry is the beginning of the next range, which isn't
+	     * in the class, so the end of the current range is one less than
+	     * that */
+	    UV end = nonbitmap_array[i] - 1;
+
+	    if (start == end) {
+		Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
+	    }
+	    else {
+		/* The \t sets the whole range */
+		Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
+			/* XXX EBCDIC */
+				   start, end);
+	    }
+	}
+	invlist_destroy(nonbitmap);
+    }
+
     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
      * set the FOLD flag yet, so this this does optimize those.  It doesn't
      * optimize locale.  Doing so perhaps could be done as long as there is
diff --git a/regexec.c b/regexec.c
index b59b8bf..476a966 100644
--- a/regexec.c
+++ b/regexec.c
@@ -6813,6 +6813,7 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
 			    }
 			}
 		    }
+#if 0
 		    if (!match) { /* See if the folded version matches */
 			SV** listp;
 
@@ -6881,6 +6882,7 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
 			    }
 			}
 		    }
+#endif
 		}
 
 		/* If we allocated a string above, free it */
diff --git a/utf8.c b/utf8.c
index 18ff1d8..6053465 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2657,10 +2657,6 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
 	    char* key_end = (char *) uvuni_to_utf8((U8*) key, val);
 	    STRLEN key_len = key_end - key;
 
-	    /* And the value is what the forward mapping is from. */
-	    char utf8_inverse[UTF8_MAXBYTES+1];
-	    char *utf8_inverse_end = (char *) uvuni_to_utf8((U8*) utf8_inverse, inverse);
-
 	    /* Get the list for the map */
 	    if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
 		list = (AV*) *listp;
@@ -2679,22 +2675,21 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
 		    Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
 		}
 		entry = *entryp;
-		if (SvCUR(entry) != key_len) {
-		    continue;
-		}
-		if (memEQ(key, SvPVX(entry), key_len)) {
+		if (SvUV(entry) == val) {
 		    found_key = TRUE;
 		    break;
 		}
 	    }
+
+	    /* Make sure there is a mapping to itself on the list */
 	    if (! found_key) {
-		element = newSVpvn_flags(key, key_len, SVf_UTF8);
+		element = newSVuv(val);
 		av_push(list, element);
 	    }
 
 
 	    /* Simply add the value to the list */
-	    element = newSVpvn_flags(utf8_inverse, utf8_inverse_end - utf8_inverse, SVf_UTF8);
+	    element = newSVuv(inverse);
 	    av_push(list, element);
 
 	    /* swash_get() increments the value of val for each element in the
-- 
2.7.4