From 9fc2026fc4845ba3d730fceeba2309fe9843f1b2 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 18 Aug 2013 09:00:11 -0600 Subject: [PATCH] utf8.c: Add omitted fold cases The LATIN SMALL LETTER SHARP S can't fold to 'ss' under /iaa because the definition of /aa prohibits it, but it can fold to two consecutive instances of LATIN SMALL LETTER LONG S. A capital sharp s can do the same, and that was fixed in 1ca267a5, but this one was overlooked then. It turns out that another possibility was also overlooked in 1ca267a5. Both U+FB05 (LATIN SMALL LIGATURE LONG S T) and U+FB06 (LATIN SMALL LIGATURE ST) fold to the string 'st', except under /iaa these folds are prohibited. But U+FB05 and U+FB06 are equivalent to each other under /iaa. This wasn't working until now. This commit changes things so both fold to U+FB06. This bug would only surface during /iaa matching, and I don't believe there are any current code paths which lead to it, hence no tests are added by this commit. However, a future commit will introduce such a code path, and existing tests will then catch the bug. 
--- utf8.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/utf8.c b/utf8.c index 1647f18..01f7d7e 100644 --- a/utf8.c +++ b/utf8.c @@ -2811,7 +2811,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b if (flags & FOLD_FLAGS_LOCALE) { - /* Special case this character, as what normally gets returned + /* Special case these characters, as what normally gets returned * under locale doesn't work */ if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1 && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8, @@ -2819,6 +2819,12 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b { goto return_long_s; } + else if (UTF8SKIP(p) == sizeof(LATIN_SMALL_LIGATURE_LONG_S_T) - 1 + && memEQ((char *) p, LATIN_SMALL_LIGATURE_LONG_S_T_UTF8, + sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1)) + { + goto return_ligature_st; + } return check_locale_boundary_crossing(p, result, ustrp, lenp); } else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) { @@ -2826,8 +2832,8 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b } else { /* This is called when changing the case of a utf8-encoded - * character above the Latin1 range, and the result should not - * contain an ASCII character. */ + * character above the ASCII range, and the result should not + * contain an ASCII character. */ UV original; /* To store the first code point of

*/ @@ -2840,11 +2846,16 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b /* Crossed, have to return the original */ original = valid_utf8_to_uvchr(p, lenp); - /* But in this one instance, there is an alternative we can + /* But in these instances, there is an alternative we can * return that is valid */ - if (original == LATIN_CAPITAL_LETTER_SHARP_S) { + if (original == LATIN_CAPITAL_LETTER_SHARP_S + || original == LATIN_SMALL_LETTER_SHARP_S) + { goto return_long_s; } + else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) { + goto return_ligature_st; + } Copy(p, ustrp, *lenp, char); return original; } @@ -2883,6 +2894,14 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, ustrp, *lenp, U8); return LATIN_SMALL_LETTER_LONG_S; + + return_ligature_st: + /* Two folds to 'st' are prohibited by the options; instead we pick one and + * have the other one fold to it */ + + *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1; + Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8); + return LATIN_SMALL_LIGATURE_ST; } /* Note: -- 2.7.4