regcharclass: Add tricky fold characters.
author Karl Williamson <public@khwilliamson.com>
Sun, 20 Mar 2011 01:29:17 +0000 (19:29 -0600)
committer Karl Williamson <public@khwilliamson.com>
Sun, 20 Mar 2011 18:16:13 +0000 (12:16 -0600)
The set of tricky fold characters needs to be expanded to include the
characters that map to the same things as members of the original set.
This isn't because the new ones have a length issue; it's because they
get left out of comparisons as a result of the special regnodes
generated for the tricky ones.

regcharclass.h
regen/regcharclass.pl

index ea5cb99..47d4b41 100644 (file)
 /*
        TRICKYFOLD: Problematic fold case letters.
 
-       0x00DF  # LATIN1 SMALL LETTER SHARP S
+       0x00DF  # LATIN SMALL LETTER SHARP S
        0x0390  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
        0x03B0  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+       0x1E9E  # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
+       0x1FD3  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
+       0x1FE3  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
 */
 /*** GENERATED CODE ***/
 #define is_TRICKYFOLD(s,is_utf8)                                            \
        ( ( 0x9F == ((U8*)s)[1] ) ? 2 : 0 )                                 \
     : ( 0xCE == ((U8*)s)[0] ) ?                                             \
        ( ( 0x90 == ((U8*)s)[1] || 0xB0 == ((U8*)s)[1] ) ? 2 : 0 )          \
+    : ( 0xE1 == ((U8*)s)[0] ) ?                                             \
+       ( ( 0xBA == ((U8*)s)[1] ) ?                                         \
+           ( ( 0x9E == ((U8*)s)[2] ) ? 3 : 0 )                             \
+       : ( 0xBF == ((U8*)s)[1] ) ?                                         \
+           ( ( 0x93 == ((U8*)s)[2] || 0xA3 == ((U8*)s)[2] ) ? 3 : 0 )      \
+       : 0 )                                                               \
     : 0 )                                                                   \
 : ( 0xDF == ((U8*)s)[0] ) )
 
 /*** GENERATED CODE ***/
 #define is_TRICKYFOLD_safe(s,e,is_utf8)                                     \
-( ((e)-(s) > 1) ?                                                           \
+( ((e)-(s) > 2) ?                                                           \
+    ( ( is_utf8 ) ?                                                         \
+       ( ( 0xC3 == ((U8*)s)[0] ) ?                                         \
+           ( ( 0x9F == ((U8*)s)[1] ) ? 2 : 0 )                             \
+       : ( 0xCE == ((U8*)s)[0] ) ?                                         \
+           ( ( 0x90 == ((U8*)s)[1] || 0xB0 == ((U8*)s)[1] ) ? 2 : 0 )      \
+       : ( 0xE1 == ((U8*)s)[0] ) ?                                         \
+           ( ( 0xBA == ((U8*)s)[1] ) ?                                     \
+               ( ( 0x9E == ((U8*)s)[2] ) ? 3 : 0 )                         \
+           : ( 0xBF == ((U8*)s)[1] ) ?                                     \
+               ( ( 0x93 == ((U8*)s)[2] || 0xA3 == ((U8*)s)[2] ) ? 3 : 0 )  \
+           : 0 )                                                           \
+       : 0 )                                                               \
+    : ( 0xDF == ((U8*)s)[0] ) )                                             \
+: ((e)-(s) > 1) ?                                                           \
     ( ( is_utf8 ) ?                                                         \
        ( ( 0xC3 == ((U8*)s)[0] ) ?                                         \
            ( ( 0x9F == ((U8*)s)[1] ) ? 2 : 0 )                             \
 #define is_TRICKYFOLD_cp(cp)                                                \
 ( 0xDF == cp || ( 0xDF < cp &&                                              \
 ( 0x390 == cp || ( 0x390 < cp &&                                            \
-0x3B0 == cp ) ) ) )
+( 0x3B0 == cp || ( 0x3B0 < cp &&                                            \
+( 0x1E9E == cp || ( 0x1E9E < cp &&                                          \
+( 0x1FD3 == cp || ( 0x1FD3 < cp &&                                          \
+0x1FE3 == cp ) ) ) ) ) ) ) ) ) )
 
 /*** GENERATED CODE ***/
 #define what_TRICKYFOLD(s,is_utf8)                                          \
     : ( 0xCE == ((U8*)s)[0] ) ?                                             \
        ( ( 0x90 == ((U8*)s)[1] ) ? 0x390                                   \
        : ( 0xB0 == ((U8*)s)[1] ) ? 0x3B0 : 0 )                             \
+    : ( 0xE1 == ((U8*)s)[0] ) ?                                             \
+       ( ( 0xBA == ((U8*)s)[1] ) ?                                         \
+           ( ( 0x9E == ((U8*)s)[2] ) ? 0x1E9E : 0 )                        \
+       : ( 0xBF == ((U8*)s)[1] ) ?                                         \
+           ( ( 0x93 == ((U8*)s)[2] ) ? 0x1FD3                              \
+           : ( 0xA3 == ((U8*)s)[2] ) ? 0x1FE3 : 0 )                        \
+       : 0 )                                                               \
     : 0 )                                                                   \
 : ( 0xDF == ((U8*)s)[0] ) ? 0xDF : 0 )
 
 /*** GENERATED CODE ***/
 #define what_TRICKYFOLD_safe(s,e,is_utf8)                                   \
-( ((e)-(s) > 1) ?                                                           \
+( ((e)-(s) > 2) ?                                                           \
+    ( ( is_utf8 ) ?                                                         \
+       ( ( 0xC3 == ((U8*)s)[0] ) ?                                         \
+           ( ( 0x9F == ((U8*)s)[1] ) ? 0xDF : 0 )                          \
+       : ( 0xCE == ((U8*)s)[0] ) ?                                         \
+           ( ( 0x90 == ((U8*)s)[1] ) ? 0x390                               \
+           : ( 0xB0 == ((U8*)s)[1] ) ? 0x3B0 : 0 )                         \
+       : ( 0xE1 == ((U8*)s)[0] ) ?                                         \
+           ( ( 0xBA == ((U8*)s)[1] ) ?                                     \
+               ( ( 0x9E == ((U8*)s)[2] ) ? 0x1E9E : 0 )                    \
+           : ( 0xBF == ((U8*)s)[1] ) ?                                     \
+               ( ( 0x93 == ((U8*)s)[2] ) ? 0x1FD3                          \
+               : ( 0xA3 == ((U8*)s)[2] ) ? 0x1FE3 : 0 )                    \
+           : 0 )                                                           \
+       : 0 )                                                               \
+    : ( 0xDF == ((U8*)s)[0] ) ? 0xDF : 0 )                                  \
+: ((e)-(s) > 1) ?                                                           \
     ( ( is_utf8 ) ?                                                         \
        ( ( 0xC3 == ((U8*)s)[0] ) ?                                         \
            ( ( 0x9F == ((U8*)s)[1] ) ? 0xDF : 0 )                          \
     : ( 0xCE == ((U8*)s)[0] ) ?                                             \
        ( ( 0x90 == ((U8*)s)[1] ) ? len=2, 0x390                            \
        : ( 0xB0 == ((U8*)s)[1] ) ? len=2, 0x3B0 : 0 )                      \
+    : ( 0xE1 == ((U8*)s)[0] ) ?                                             \
+       ( ( 0xBA == ((U8*)s)[1] ) ?                                         \
+           ( ( 0x9E == ((U8*)s)[2] ) ? len=3, 0x1E9E : 0 )                 \
+       : ( 0xBF == ((U8*)s)[1] ) ?                                         \
+           ( ( 0x93 == ((U8*)s)[2] ) ? len=3, 0x1FD3                       \
+           : ( 0xA3 == ((U8*)s)[2] ) ? len=3, 0x1FE3 : 0 )                 \
+       : 0 )                                                               \
     : 0 )                                                                   \
 : ( 0xDF == ((U8*)s)[0] ) ? len=1, 0xDF : 0 )
 
 /*** GENERATED CODE ***/
 #define what_len_TRICKYFOLD_safe(s,e,is_utf8,len)                           \
-( ((e)-(s) > 1) ?                                                           \
+( ((e)-(s) > 2) ?                                                           \
+    ( ( is_utf8 ) ?                                                         \
+       ( ( 0xC3 == ((U8*)s)[0] ) ?                                         \
+           ( ( 0x9F == ((U8*)s)[1] ) ? len=2, 0xDF : 0 )                   \
+       : ( 0xCE == ((U8*)s)[0] ) ?                                         \
+           ( ( 0x90 == ((U8*)s)[1] ) ? len=2, 0x390                        \
+           : ( 0xB0 == ((U8*)s)[1] ) ? len=2, 0x3B0 : 0 )                  \
+       : ( 0xE1 == ((U8*)s)[0] ) ?                                         \
+           ( ( 0xBA == ((U8*)s)[1] ) ?                                     \
+               ( ( 0x9E == ((U8*)s)[2] ) ? len=3, 0x1E9E : 0 )             \
+           : ( 0xBF == ((U8*)s)[1] ) ?                                     \
+               ( ( 0x93 == ((U8*)s)[2] ) ? len=3, 0x1FD3                   \
+               : ( 0xA3 == ((U8*)s)[2] ) ? len=3, 0x1FE3 : 0 )             \
+           : 0 )                                                           \
+       : 0 )                                                               \
+    : ( 0xDF == ((U8*)s)[0] ) ? len=1, 0xDF : 0 )                           \
+: ((e)-(s) > 1) ?                                                           \
     ( ( is_utf8 ) ?                                                         \
        ( ( 0xC3 == ((U8*)s)[0] ) ?                                         \
            ( ( 0x9F == ((U8*)s)[1] ) ? len=2, 0xDF : 0 )                   \
index c3ea8a6..2e89b2d 100755 (executable)
@@ -731,6 +731,9 @@ VERTWS: Vertical Whitespace: \v \V
 
 TRICKYFOLD: Problematic fold case letters.
 => generic cp generic-cp generic-both :fast safe
-0x00DF # LATIN1 SMALL LETTER SHARP S
+0x00DF # LATIN SMALL LETTER SHARP S
 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+0x1E9E  # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
+0x1FD3  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
+0x1FE3  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0