From 706b72db21d25ec26ed7411e5221550ffb08ff21 Mon Sep 17 00:00:00 2001 From: Christian Persch Date: Sun, 12 Feb 2012 21:20:33 +0100 Subject: [PATCH] regex: Use glib for unicode data Use g_unichar_type() and g_unichar_get_script() instead of pcre tables. --- glib/pcre/Makefile.am | 1 - glib/pcre/pcre_compile.c | 26 +- glib/pcre/pcre_dfa_exec.c | 96 +++--- glib/pcre/pcre_exec.c | 26 +- glib/pcre/pcre_internal.h | 11 +- glib/pcre/pcre_tables.c | 16 + glib/pcre/pcre_xclass.c | 24 +- glib/pcre/ucp.h | 265 ++++++++-------- glib/update-pcre/ucp.patch | 732 ++++++++++++++++++++++++++++++++++++++++++++- 9 files changed, 958 insertions(+), 239 deletions(-) diff --git a/glib/pcre/Makefile.am b/glib/pcre/Makefile.am index 21da5c5..1981953 100644 --- a/glib/pcre/Makefile.am +++ b/glib/pcre/Makefile.am @@ -51,7 +51,6 @@ libpcre_la_SOURCES = \ pcre_string_utils.c \ pcre_study.c \ pcre_tables.c \ - pcre_ucd.c \ pcre_valid_utf8.c \ pcre_version.c \ pcre_xclass.c \ diff --git a/glib/pcre/pcre_compile.c b/glib/pcre/pcre_compile.c index eb985df..b44055a 100644 --- a/glib/pcre/pcre_compile.c +++ b/glib/pcre/pcre_compile.c @@ -2890,43 +2890,43 @@ Returns: TRUE if auto-possessifying is OK static BOOL check_char_prop(int c, int ptype, int pdata, BOOL negated) { -const ucd_record *prop = GET_UCD(c); +const pcre_uint8 chartype = UCD_CHARTYPE(c); switch(ptype) { case PT_LAMP: - return (prop->chartype == ucp_Lu || - prop->chartype == ucp_Ll || - prop->chartype == ucp_Lt) == negated; + return (chartype == ucp_Lu || + chartype == ucp_Ll || + chartype == ucp_Lt) == negated; case PT_GC: - return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated; + return (pdata == PRIV(ucp_gentype)[chartype]) == negated; case PT_PC: - return (pdata == prop->chartype) == negated; + return (pdata == chartype) == negated; case PT_SC: - return (pdata == prop->script) == negated; + return (pdata == UCD_SCRIPT(c)) == negated; /* These are specials */ case PT_ALNUM: - return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated; + return (PRIV(ucp_gentype)[chartype] == ucp_L || + PRIV(ucp_gentype)[chartype] == ucp_N) == negated; case PT_SPACE: /* Perl space */ - return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + return (PRIV(ucp_gentype)[chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) == negated; case PT_PXSPACE: /* POSIX space */ - return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + return (PRIV(ucp_gentype)[chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) == negated; case PT_WORD: - return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N || + return (PRIV(ucp_gentype)[chartype] == ucp_L || + PRIV(ucp_gentype)[chartype] == ucp_N || c == CHAR_UNDERSCORE) == negated; } return FALSE; diff --git a/glib/pcre/pcre_dfa_exec.c b/glib/pcre/pcre_dfa_exec.c index 21d7be6..41ff65b 100644 --- a/glib/pcre/pcre_dfa_exec.c +++ b/glib/pcre/pcre_dfa_exec.c @@ -1015,7 +1015,7 @@ for (;;) if (clen > 0) { BOOL OK; - const ucd_record * prop = GET_UCD(c); + const pcre_uint8 chartype = UCD_CHARTYPE(c); switch(code[1]) { case PT_ANY: @@ -1023,43 +1023,43 @@ for (;;) break; case PT_LAMP: - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || - prop->chartype == ucp_Lt; + OK = chartype == ucp_Lu || chartype == ucp_Ll || + chartype == ucp_Lt; break; case PT_GC: - OK = PRIV(ucp_gentype)[prop->chartype] == code[2]; + OK = PRIV(ucp_gentype)[chartype] == code[2]; break; case PT_PC: - OK = prop->chartype == code[2]; + OK = chartype == code[2]; break; case PT_SC: - OK = prop->script == code[2]; + OK = UCD_SCRIPT(c) == code[2]; break; /* These are specials for combination cases. */ case PT_ALNUM: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N; + OK = PRIV(ucp_gentype)[chartype] == ucp_L || + PRIV(ucp_gentype)[chartype] == ucp_N; break; case PT_SPACE: /* Perl space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; break; case PT_PXSPACE: /* POSIX space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR; break; case PT_WORD: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N || + OK = PRIV(ucp_gentype)[chartype] == ucp_L || + PRIV(ucp_gentype)[chartype] == ucp_N || c == CHAR_UNDERSCORE; break; @@ -1209,7 +1209,7 @@ for (;;) if (clen > 0) { BOOL OK; - const ucd_record * prop = GET_UCD(c); + const pcre_uint8 chartype = UCD_CHARTYPE(c); switch(code[2]) { case PT_ANY: @@ -1217,43 +1217,43 @@ for (;;) break; case PT_LAMP: - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || - prop->chartype == ucp_Lt; + OK = chartype == ucp_Lu || chartype == ucp_Ll || + chartype == ucp_Lt; break; case PT_GC: - OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; + OK = PRIV(ucp_gentype)[chartype] == code[3]; break; case PT_PC: - OK = prop->chartype == code[3]; + OK = chartype == code[3]; break; case PT_SC: - OK = prop->script == code[3]; + OK = UCD_SCRIPT(c) == code[3]; break; /* These are specials for combination cases. */ case PT_ALNUM: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N; + OK = PRIV(ucp_gentype)[chartype] == ucp_L || + PRIV(ucp_gentype)[chartype] == ucp_N; break; case PT_SPACE: /* Perl space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; break; case PT_PXSPACE: /* POSIX space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR; break; case PT_WORD: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N || + OK = PRIV(ucp_gentype)[chartype] == ucp_L || + PRIV(ucp_gentype)[chartype] == ucp_N || c == CHAR_UNDERSCORE; break; @@ -1456,7 +1456,7 @@ for (;;) if (clen > 0) { BOOL OK; - const ucd_record * prop = GET_UCD(c); + const pcre_uint8 chartype = UCD_CHARTYPE(c); switch(code[2]) { case PT_ANY: @@ -1464,43 +1464,43 @@ for (;;) break; case PT_LAMP: - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || - prop->chartype == ucp_Lt; + OK = chartype == ucp_Lu || chartype == ucp_Ll || + chartype == ucp_Lt; break; case PT_GC: - OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; + OK = PRIV(ucp_gentype)[chartype] == code[3]; break; case PT_PC: - OK = prop->chartype == code[3]; + OK = chartype == code[3]; break; case PT_SC: - OK = prop->script == code[3]; + OK = UCD_SCRIPT(c) == code[3]; break; /* These are specials for combination cases. */ case PT_ALNUM: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N; + OK = PRIV(ucp_gentype)[chartype] == ucp_L || + PRIV(ucp_gentype)[chartype] == ucp_N; break; case PT_SPACE: /* Perl space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; break; case PT_PXSPACE: /* POSIX space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR; break; case PT_WORD: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N || + OK = PRIV(ucp_gentype)[chartype] == ucp_L || + PRIV(ucp_gentype)[chartype] == ucp_N || c == CHAR_UNDERSCORE; break; @@ -1728,7 +1728,7 @@ for (;;) if (clen > 0) { BOOL OK; - const ucd_record * prop = GET_UCD(c); + const pcre_uint8 chartype = UCD_CHARTYPE(c); switch(code[1 + IMM2_SIZE + 1]) { case PT_ANY: @@ -1736,43 +1736,43 @@ for (;;) break; case PT_LAMP: - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || - prop->chartype == ucp_Lt; + OK = chartype == ucp_Lu || chartype == ucp_Ll || + chartype == ucp_Lt; break; case PT_GC: - OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2]; + OK = PRIV(ucp_gentype)[chartype] == code[1 + IMM2_SIZE + 2]; break; case PT_PC: - OK = prop->chartype == code[1 + IMM2_SIZE + 2]; + OK = chartype == code[1 + IMM2_SIZE + 2]; break; case PT_SC: - OK = prop->script == code[1 + IMM2_SIZE + 2]; + OK = UCD_SCRIPT(c) == code[1 + IMM2_SIZE + 2]; break; /* These are specials for combination cases. */ case PT_ALNUM: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N; + OK = PRIV(ucp_gentype)[chartype] == ucp_L || + PRIV(ucp_gentype)[chartype] == ucp_N; break; case PT_SPACE: /* Perl space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; break; case PT_PXSPACE: /* POSIX space */ - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + OK = PRIV(ucp_gentype)[chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR; break; case PT_WORD: - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N || + OK = PRIV(ucp_gentype)[chartype] == ucp_L || + PRIV(ucp_gentype)[chartype] == ucp_N || c == CHAR_UNDERSCORE; break; diff --git a/glib/pcre/pcre_exec.c b/glib/pcre/pcre_exec.c index b715353..8eb3162 100644 --- a/glib/pcre/pcre_exec.c +++ b/glib/pcre/pcre_exec.c @@ -2507,7 +2507,7 @@ for (;;) } GETCHARINCTEST(c, eptr); { - const ucd_record *prop = GET_UCD(c); + const pcre_uint8 chartype = UCD_CHARTYPE(c); switch(ecode[1]) { @@ -2516,44 +2516,44 @@ for (;;) break; case PT_LAMP: - if ((prop->chartype == ucp_Lu || - prop->chartype == ucp_Ll || - prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) + if ((chartype == ucp_Lu || + chartype == ucp_Ll || + chartype == ucp_Lt) == (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); break; case PT_GC: - if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP)) + if ((ecode[2] != PRIV(ucp_gentype)[chartype]) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; case PT_PC: - if ((ecode[2] != prop->chartype) == (op == OP_PROP)) + if ((ecode[2] != chartype) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; case PT_SC: - if ((ecode[2] != prop->script) == (op == OP_PROP)) + if ((ecode[2] != UCD_SCRIPT(c)) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; /* These are specials */ case PT_ALNUM: - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP)) + if ((PRIV(ucp_gentype)[chartype] == ucp_L || + PRIV(ucp_gentype)[chartype] == ucp_N) == (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); break; case PT_SPACE: /* Perl space */ - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + if ((PRIV(ucp_gentype)[chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) == (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); break; case PT_PXSPACE: /* POSIX space */ - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + if ((PRIV(ucp_gentype)[chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) == (op == OP_NOTPROP)) @@ -2561,8 +2561,8 @@ for (;;) break; case PT_WORD: - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N || + if ((PRIV(ucp_gentype)[chartype] == ucp_L || + PRIV(ucp_gentype)[chartype] == ucp_N || c == CHAR_UNDERSCORE) == (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); break; diff --git a/glib/pcre/pcre_internal.h b/glib/pcre/pcre_internal.h index e5a4b6a..41c7ee3 100644 --- a/glib/pcre/pcre_internal.h +++ b/glib/pcre/pcre_internal.h @@ -2315,15 +2315,12 @@ extern const int PRIV(ucp_typerange)[]; #ifdef SUPPORT_UCP /* UCD access macros */ -#define UCD_BLOCK_SIZE 128 -#define GET_UCD(ch) (PRIV(ucd_records) + \ - PRIV(ucd_stage2)[PRIV(ucd_stage1)[(ch) / UCD_BLOCK_SIZE] * \ - UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE]) +unsigned int _pcre_ucp_othercase(const unsigned int c); -#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype -#define UCD_SCRIPT(ch) GET_UCD(ch)->script +#define UCD_CHARTYPE(ch) (pcre_uint8)g_unichar_type((gunichar)(ch)) +#define UCD_SCRIPT(ch) (pcre_uint8)g_unichar_get_script((gunichar)(ch)) #define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)] -#define UCD_OTHERCASE(ch) (ch + GET_UCD(ch)->other_case) +#define UCD_OTHERCASE(ch) (_pcre_ucp_othercase(ch)) #endif /* SUPPORT_UCP */ diff --git a/glib/pcre/pcre_tables.c b/glib/pcre/pcre_tables.c index c8134ec..47becc7 100644 --- a/glib/pcre/pcre_tables.c +++ b/glib/pcre/pcre_tables.c @@ -563,6 +563,22 @@ const ucp_type_table PRIV(utt)[] = { const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table); +unsigned int +_pcre_ucp_othercase(const unsigned int c) +{ + int other_case = NOTACHAR; + + if (g_unichar_islower(c)) + other_case = g_unichar_toupper(c); + else if (g_unichar_isupper(c)) + other_case = g_unichar_tolower(c); + + if (other_case == c) + other_case = NOTACHAR; + + return other_case; +} + #endif /* SUPPORT_UTF */ /* End of pcre_tables.c */ diff --git a/glib/pcre/pcre_xclass.c b/glib/pcre/pcre_xclass.c index dca7a39..e5a55d7 100644 --- a/glib/pcre/pcre_xclass.c +++ b/glib/pcre/pcre_xclass.c @@ -127,7 +127,7 @@ while ((t = *data++) != XCL_END) #ifdef SUPPORT_UCP else /* XCL_PROP & XCL_NOTPROP */ { - const ucd_record *prop = GET_UCD(c); + const pcre_uint8 chartype = UCD_CHARTYPE(c); switch(*data) { @@ -136,46 +136,46 @@ while ((t = *data++) != XCL_END) break; case PT_LAMP: - if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || - prop->chartype == ucp_Lt) == (t == XCL_PROP)) return !negated; + if ((chartype == ucp_Lu || chartype == ucp_Ll || + chartype == ucp_Lt) == (t == XCL_PROP)) return !negated; break; case PT_GC: - if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == (t == XCL_PROP)) + if ((data[1] == PRIV(ucp_gentype)[chartype]) == (t == XCL_PROP)) return !negated; break; case PT_PC: - if ((data[1] == prop->chartype) == (t == XCL_PROP)) return !negated; + if ((data[1] == chartype) == (t == XCL_PROP)) return !negated; break; case PT_SC: - if ((data[1] == prop->script) == (t == XCL_PROP)) return !negated; + if ((data[1] == UCD_SCRIPT(c)) == (t == XCL_PROP)) return !negated; break; case PT_ALNUM: - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (t == XCL_PROP)) + if ((PRIV(ucp_gentype)[chartype] == ucp_L || + PRIV(ucp_gentype)[chartype] == ucp_N) == (t == XCL_PROP)) return !negated; break; case PT_SPACE: /* Perl space */ - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + if ((PRIV(ucp_gentype)[chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP)) return !negated; break; case PT_PXSPACE: /* POSIX space */ - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + if ((PRIV(ucp_gentype)[chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP)) return !negated; break; case PT_WORD: - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || - PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE) + if ((PRIV(ucp_gentype)[chartype] == ucp_L || + PRIV(ucp_gentype)[chartype] == ucp_N || c == CHAR_UNDERSCORE) == (t == XCL_PROP)) return !negated; break; diff --git a/glib/pcre/ucp.h b/glib/pcre/ucp.h index 59c3bec..53a48c9 100644 --- a/glib/pcre/ucp.h +++ b/glib/pcre/ucp.h @@ -10,6 +10,7 @@ the UCD access macros. New values that are added for new releases of Unicode should always be at the end of each enum, for backwards compatibility. */ /* These are the general character categories. */ +#include "gunicode.h" enum { ucp_C, /* Other */ @@ -24,148 +25,148 @@ enum { /* These are the particular character types. */ enum { - ucp_Cc, /* Control */ - ucp_Cf, /* Format */ - ucp_Cn, /* Unassigned */ - ucp_Co, /* Private use */ - ucp_Cs, /* Surrogate */ - ucp_Ll, /* Lower case letter */ - ucp_Lm, /* Modifier letter */ - ucp_Lo, /* Other letter */ - ucp_Lt, /* Title case letter */ - ucp_Lu, /* Upper case letter */ - ucp_Mc, /* Spacing mark */ - ucp_Me, /* Enclosing mark */ - ucp_Mn, /* Non-spacing mark */ - ucp_Nd, /* Decimal number */ - ucp_Nl, /* Letter number */ - ucp_No, /* Other number */ - ucp_Pc, /* Connector punctuation */ - ucp_Pd, /* Dash punctuation */ - ucp_Pe, /* Close punctuation */ - ucp_Pf, /* Final punctuation */ - ucp_Pi, /* Initial punctuation */ - ucp_Po, /* Other punctuation */ - ucp_Ps, /* Open punctuation */ - ucp_Sc, /* Currency symbol */ - ucp_Sk, /* Modifier symbol */ - ucp_Sm, /* Mathematical symbol */ - ucp_So, /* Other symbol */ - ucp_Zl, /* Line separator */ - ucp_Zp, /* Paragraph separator */ - ucp_Zs /* Space separator */ + ucp_Cc = G_UNICODE_CONTROL, /* Control */ + ucp_Cf = G_UNICODE_FORMAT, /* Format */ + ucp_Cn = G_UNICODE_UNASSIGNED, /* Unassigned */ + ucp_Co = G_UNICODE_PRIVATE_USE, /* Private use */ + ucp_Cs = G_UNICODE_SURROGATE, /* Surrogate */ + ucp_Ll = G_UNICODE_LOWERCASE_LETTER, /* Lower case letter */ + ucp_Lm = G_UNICODE_MODIFIER_LETTER, /* Modifier letter */ + ucp_Lo = G_UNICODE_OTHER_LETTER, /* Other letter */ + ucp_Lt = G_UNICODE_TITLECASE_LETTER, /* Title case letter */ + ucp_Lu = G_UNICODE_UPPERCASE_LETTER, /* Upper case letter */ + ucp_Mc = G_UNICODE_SPACING_MARK, /* Spacing mark */ + ucp_Me = G_UNICODE_ENCLOSING_MARK, /* Enclosing mark */ + ucp_Mn = G_UNICODE_NON_SPACING_MARK, /* Non-spacing mark */ + ucp_Nd = G_UNICODE_DECIMAL_NUMBER, /* Decimal number */ + ucp_Nl = G_UNICODE_LETTER_NUMBER, /* Letter number */ + ucp_No = G_UNICODE_OTHER_NUMBER, /* Other number */ + ucp_Pc = G_UNICODE_CONNECT_PUNCTUATION, /* Connector punctuation */ + ucp_Pd = G_UNICODE_DASH_PUNCTUATION, /* Dash punctuation */ + ucp_Pe = G_UNICODE_CLOSE_PUNCTUATION, /* Close punctuation */ + ucp_Pf = G_UNICODE_FINAL_PUNCTUATION, /* Final punctuation */ + ucp_Pi = G_UNICODE_INITIAL_PUNCTUATION, /* Initial punctuation */ + ucp_Po = G_UNICODE_OTHER_PUNCTUATION, /* Other punctuation */ + ucp_Ps = G_UNICODE_OPEN_PUNCTUATION, /* Open punctuation */ + ucp_Sc = G_UNICODE_CURRENCY_SYMBOL, /* Currency symbol */ + ucp_Sk = G_UNICODE_MODIFIER_SYMBOL, /* Modifier symbol */ + ucp_Sm = G_UNICODE_MATH_SYMBOL, /* Mathematical symbol */ + ucp_So = G_UNICODE_OTHER_SYMBOL, /* Other symbol */ + ucp_Zl = G_UNICODE_LINE_SEPARATOR, /* Line separator */ + ucp_Zp = G_UNICODE_PARAGRAPH_SEPARATOR, /* Paragraph separator */ + ucp_Zs = G_UNICODE_SPACE_SEPARATOR /* Space separator */ }; /* These are the script identifications. */ enum { - ucp_Arabic, - ucp_Armenian, - ucp_Bengali, - ucp_Bopomofo, - ucp_Braille, - ucp_Buginese, - ucp_Buhid, - ucp_Canadian_Aboriginal, - ucp_Cherokee, - ucp_Common, - ucp_Coptic, - ucp_Cypriot, - ucp_Cyrillic, - ucp_Deseret, - ucp_Devanagari, - ucp_Ethiopic, - ucp_Georgian, - ucp_Glagolitic, - ucp_Gothic, - ucp_Greek, - ucp_Gujarati, - ucp_Gurmukhi, - ucp_Han, - ucp_Hangul, - ucp_Hanunoo, - ucp_Hebrew, - ucp_Hiragana, - ucp_Inherited, - ucp_Kannada, - ucp_Katakana, - ucp_Kharoshthi, - ucp_Khmer, - ucp_Lao, - ucp_Latin, - ucp_Limbu, - ucp_Linear_B, - ucp_Malayalam, - ucp_Mongolian, - ucp_Myanmar, - ucp_New_Tai_Lue, - ucp_Ogham, - ucp_Old_Italic, - ucp_Old_Persian, - ucp_Oriya, - ucp_Osmanya, - ucp_Runic, - ucp_Shavian, - ucp_Sinhala, - ucp_Syloti_Nagri, - ucp_Syriac, - ucp_Tagalog, - ucp_Tagbanwa, - ucp_Tai_Le, - ucp_Tamil, - ucp_Telugu, - ucp_Thaana, - ucp_Thai, - ucp_Tibetan, - ucp_Tifinagh, - ucp_Ugaritic, - ucp_Yi, + ucp_Arabic = G_UNICODE_SCRIPT_ARABIC, + ucp_Armenian = G_UNICODE_SCRIPT_ARMENIAN, + ucp_Bengali = G_UNICODE_SCRIPT_BENGALI, + ucp_Bopomofo = G_UNICODE_SCRIPT_BOPOMOFO, + ucp_Braille = G_UNICODE_SCRIPT_BRAILLE, + ucp_Buginese = G_UNICODE_SCRIPT_BUGINESE, + ucp_Buhid = G_UNICODE_SCRIPT_BUHID, + ucp_Canadian_Aboriginal = G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL, + ucp_Cherokee = G_UNICODE_SCRIPT_CHEROKEE, + ucp_Common = G_UNICODE_SCRIPT_COMMON, + ucp_Coptic = G_UNICODE_SCRIPT_COPTIC, + ucp_Cypriot = G_UNICODE_SCRIPT_CYPRIOT, + ucp_Cyrillic = G_UNICODE_SCRIPT_CYRILLIC, + ucp_Deseret = G_UNICODE_SCRIPT_DESERET, + ucp_Devanagari = G_UNICODE_SCRIPT_DEVANAGARI, + ucp_Ethiopic = G_UNICODE_SCRIPT_ETHIOPIC, + ucp_Georgian = G_UNICODE_SCRIPT_GEORGIAN, + ucp_Glagolitic = G_UNICODE_SCRIPT_GLAGOLITIC, + ucp_Gothic = G_UNICODE_SCRIPT_GOTHIC, + ucp_Greek = G_UNICODE_SCRIPT_GREEK, + ucp_Gujarati = G_UNICODE_SCRIPT_GUJARATI, + ucp_Gurmukhi = G_UNICODE_SCRIPT_GURMUKHI, + ucp_Han = G_UNICODE_SCRIPT_HAN, + ucp_Hangul = G_UNICODE_SCRIPT_HANGUL, + ucp_Hanunoo = G_UNICODE_SCRIPT_HANUNOO, + ucp_Hebrew = G_UNICODE_SCRIPT_HEBREW, + ucp_Hiragana = G_UNICODE_SCRIPT_HIRAGANA, + ucp_Inherited = G_UNICODE_SCRIPT_INHERITED, + ucp_Kannada = G_UNICODE_SCRIPT_KANNADA, + ucp_Katakana = G_UNICODE_SCRIPT_KATAKANA, + ucp_Kharoshthi = G_UNICODE_SCRIPT_KHAROSHTHI, + ucp_Khmer = G_UNICODE_SCRIPT_KHMER, + ucp_Lao = G_UNICODE_SCRIPT_LAO, + ucp_Latin = G_UNICODE_SCRIPT_LATIN, + ucp_Limbu = G_UNICODE_SCRIPT_LIMBU, + ucp_Linear_B = G_UNICODE_SCRIPT_LINEAR_B, + ucp_Malayalam = G_UNICODE_SCRIPT_MALAYALAM, + ucp_Mongolian = G_UNICODE_SCRIPT_MONGOLIAN, + ucp_Myanmar = G_UNICODE_SCRIPT_MYANMAR, + ucp_New_Tai_Lue = G_UNICODE_SCRIPT_NEW_TAI_LUE, + ucp_Ogham = G_UNICODE_SCRIPT_OGHAM, + ucp_Old_Italic = G_UNICODE_SCRIPT_OLD_ITALIC, + ucp_Old_Persian = G_UNICODE_SCRIPT_OLD_PERSIAN, + ucp_Oriya = G_UNICODE_SCRIPT_ORIYA, + ucp_Osmanya = G_UNICODE_SCRIPT_OSMANYA, + ucp_Runic = G_UNICODE_SCRIPT_RUNIC, + ucp_Shavian = G_UNICODE_SCRIPT_SHAVIAN, + ucp_Sinhala = G_UNICODE_SCRIPT_SINHALA, + ucp_Syloti_Nagri = G_UNICODE_SCRIPT_SYLOTI_NAGRI, + ucp_Syriac = G_UNICODE_SCRIPT_SYRIAC, + ucp_Tagalog = G_UNICODE_SCRIPT_TAGALOG, + ucp_Tagbanwa = G_UNICODE_SCRIPT_TAGBANWA, + ucp_Tai_Le = G_UNICODE_SCRIPT_TAI_LE, + ucp_Tamil = G_UNICODE_SCRIPT_TAMIL, + ucp_Telugu = G_UNICODE_SCRIPT_TELUGU, + ucp_Thaana = G_UNICODE_SCRIPT_THAANA, + ucp_Thai = G_UNICODE_SCRIPT_THAI, + ucp_Tibetan = G_UNICODE_SCRIPT_TIBETAN, + ucp_Tifinagh = G_UNICODE_SCRIPT_TIFINAGH, + ucp_Ugaritic = G_UNICODE_SCRIPT_UGARITIC, + ucp_Yi = G_UNICODE_SCRIPT_YI, /* New for Unicode 5.0: */ - ucp_Balinese, - ucp_Cuneiform, - ucp_Nko, - ucp_Phags_Pa, - ucp_Phoenician, + ucp_Balinese = G_UNICODE_SCRIPT_BALINESE, + ucp_Cuneiform = G_UNICODE_SCRIPT_CUNEIFORM, + ucp_Nko = G_UNICODE_SCRIPT_NKO, + ucp_Phags_Pa = G_UNICODE_SCRIPT_PHAGS_PA, + ucp_Phoenician = G_UNICODE_SCRIPT_PHOENICIAN, /* New for Unicode 5.1: */ - ucp_Carian, - ucp_Cham, - ucp_Kayah_Li, - ucp_Lepcha, - ucp_Lycian, - ucp_Lydian, - ucp_Ol_Chiki, - ucp_Rejang, - ucp_Saurashtra, - ucp_Sundanese, - ucp_Vai, + ucp_Carian = G_UNICODE_SCRIPT_CARIAN, + ucp_Cham = G_UNICODE_SCRIPT_CHAM, + ucp_Kayah_Li = G_UNICODE_SCRIPT_KAYAH_LI, + ucp_Lepcha = G_UNICODE_SCRIPT_LEPCHA, + ucp_Lycian = G_UNICODE_SCRIPT_LYCIAN, + ucp_Lydian = G_UNICODE_SCRIPT_LYDIAN, + ucp_Ol_Chiki = G_UNICODE_SCRIPT_OL_CHIKI, + ucp_Rejang = G_UNICODE_SCRIPT_REJANG, + ucp_Saurashtra = G_UNICODE_SCRIPT_SAURASHTRA, + ucp_Sundanese = G_UNICODE_SCRIPT_SUNDANESE, + ucp_Vai = G_UNICODE_SCRIPT_VAI, /* New for Unicode 5.2: */ - ucp_Avestan, - ucp_Bamum, - ucp_Egyptian_Hieroglyphs, - ucp_Imperial_Aramaic, - ucp_Inscriptional_Pahlavi, - ucp_Inscriptional_Parthian, - ucp_Javanese, - ucp_Kaithi, - ucp_Lisu, - ucp_Meetei_Mayek, - ucp_Old_South_Arabian, - ucp_Old_Turkic, - ucp_Samaritan, - ucp_Tai_Tham, - ucp_Tai_Viet, + ucp_Avestan = G_UNICODE_SCRIPT_AVESTAN, + ucp_Bamum = G_UNICODE_SCRIPT_BAMUM, + ucp_Egyptian_Hieroglyphs = G_UNICODE_SCRIPT_EGYPTIAN_HIEROGLYPHS, + ucp_Imperial_Aramaic = G_UNICODE_SCRIPT_IMPERIAL_ARAMAIC, + ucp_Inscriptional_Pahlavi = G_UNICODE_SCRIPT_INSCRIPTIONAL_PAHLAVI, + ucp_Inscriptional_Parthian = G_UNICODE_SCRIPT_INSCRIPTIONAL_PARTHIAN, + ucp_Javanese = G_UNICODE_SCRIPT_JAVANESE, + ucp_Kaithi = G_UNICODE_SCRIPT_KAITHI, + ucp_Lisu = G_UNICODE_SCRIPT_LISU, + ucp_Meetei_Mayek = G_UNICODE_SCRIPT_MEETEI_MAYEK, + ucp_Old_South_Arabian = G_UNICODE_SCRIPT_OLD_SOUTH_ARABIAN, + ucp_Old_Turkic = G_UNICODE_SCRIPT_OLD_TURKIC, + ucp_Samaritan = G_UNICODE_SCRIPT_SAMARITAN, + ucp_Tai_Tham = G_UNICODE_SCRIPT_TAI_THAM, + ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET, /* New for Unicode 6.0.0: */ - ucp_Batak, - ucp_Brahmi, - ucp_Mandaic, + ucp_Batak = G_UNICODE_SCRIPT_BATAK, + ucp_Brahmi = G_UNICODE_SCRIPT_BRAHMI, + ucp_Mandaic = G_UNICODE_SCRIPT_MANDAIC, /* New for Unicode 6.1.0: */ - ucp_Chakma, - ucp_Meroitic_Cursive, - ucp_Meroitic_Hieroglyphs, - ucp_Miao, - ucp_Sharada, - ucp_Sora_Sompeng, - ucp_Takri + ucp_Chakma = G_UNICODE_SCRIPT_CHAKMA, + ucp_Meroitic_Cursive = G_UNICODE_SCRIPT_MEROITIC_CURSIVE, + ucp_Meroitic_Hieroglyphs = G_UNICODE_SCRIPT_MEROITIC_HIEROGLYPHS, + ucp_Miao = G_UNICODE_SCRIPT_MIAO, + ucp_Sharada = G_UNICODE_SCRIPT_SHARADA, + ucp_Sora_Sompeng = G_UNICODE_SCRIPT_SORA_SOMPENG, + ucp_Takri = G_UNICODE_SCRIPT_TAKRI, }; #endif diff --git a/glib/update-pcre/ucp.patch b/glib/update-pcre/ucp.patch index 8abd812..1b82f27 100644 --- a/glib/update-pcre/ucp.patch +++ b/glib/update-pcre/ucp.patch @@ -1,6 +1,632 @@ ---- pcre/ucp.h 2006-07-05 13:28:01.000000000 +0200 -+++ pcre/ucp.h 2006-10-09 16:27:19.000000000 +0200 -@@ -60,72 +60,72 @@ enum { +From 384879be07418fc6224b6603a2e8ca6f11e178fc Mon Sep 17 00:00:00 2001 +From: Christian Persch +Date: Sun, 12 Feb 2012 21:20:33 +0100 +Subject: [PATCH] regex: Use glib for unicode data + +Use g_unichar_type() and g_unichar_get_script() instead of pcre tables. +--- + glib/pcre/Makefile.am | 1 - + glib/pcre/pcre_compile.c | 26 +++--- + glib/pcre/pcre_dfa_exec.c | 96 ++++++++-------- + glib/pcre/pcre_exec.c | 26 +++--- + glib/pcre/pcre_internal.h | 11 +-- + glib/pcre/pcre_tables.c | 16 +++ + glib/pcre/pcre_xclass.c | 24 ++-- + glib/pcre/ucp.h | 265 +++++++++++++++++++++++---------------------- + 8 files changed, 239 insertions(+), 226 deletions(-) + +diff --git a/glib/pcre/Makefile.am b/glib/pcre/Makefile.am +index 21da5c5..1981953 100644 +--- a/glib/pcre/Makefile.am ++++ b/glib/pcre/Makefile.am +@@ -51,7 +51,6 @@ libpcre_la_SOURCES = \ + pcre_string_utils.c \ + pcre_study.c \ + pcre_tables.c \ +- pcre_ucd.c \ + pcre_valid_utf8.c \ + pcre_version.c \ + pcre_xclass.c \ +diff --git a/glib/pcre/pcre_compile.c b/glib/pcre/pcre_compile.c +index eb985df..b44055a 100644 +--- a/glib/pcre/pcre_compile.c ++++ b/glib/pcre/pcre_compile.c +@@ -2890,43 +2890,43 @@ Returns: TRUE if auto-possessifying is OK + static BOOL + check_char_prop(int c, int ptype, int pdata, BOOL negated) + { +-const ucd_record *prop = GET_UCD(c); ++const pcre_uint8 chartype = UCD_CHARTYPE(c); + switch(ptype) + { + case PT_LAMP: +- return (prop->chartype == ucp_Lu || +- prop->chartype == ucp_Ll || +- prop->chartype == ucp_Lt) == negated; ++ return (chartype == ucp_Lu || ++ chartype == ucp_Ll || ++ chartype == ucp_Lt) == negated; + + case PT_GC: +- return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated; ++ return (pdata == PRIV(ucp_gentype)[chartype]) == negated; + + case PT_PC: +- return (pdata == prop->chartype) == negated; ++ return (pdata == chartype) == negated; + + case PT_SC: +- return (pdata == prop->script) == negated; ++ return (pdata == UCD_SCRIPT(c)) == negated; + + /* These are specials */ + + case PT_ALNUM: +- return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || +- PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated; ++ return (PRIV(ucp_gentype)[chartype] == ucp_L || ++ PRIV(ucp_gentype)[chartype] == ucp_N) == negated; + + case PT_SPACE: /* Perl space */ +- return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z || ++ return (PRIV(ucp_gentype)[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) + == negated; + + case PT_PXSPACE: /* POSIX space */ +- return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z || ++ return (PRIV(ucp_gentype)[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR) + == negated; + + case PT_WORD: +- return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || +- PRIV(ucp_gentype)[prop->chartype] == ucp_N || ++ return (PRIV(ucp_gentype)[chartype] == ucp_L || ++ PRIV(ucp_gentype)[chartype] == ucp_N || + c == CHAR_UNDERSCORE) == negated; + } + return FALSE; +diff --git a/glib/pcre/pcre_dfa_exec.c b/glib/pcre/pcre_dfa_exec.c +index 21d7be6..41ff65b 100644 +--- a/glib/pcre/pcre_dfa_exec.c ++++ b/glib/pcre/pcre_dfa_exec.c +@@ -1015,7 +1015,7 @@ for (;;) + if (clen > 0) + { + BOOL OK; +- const ucd_record * prop = GET_UCD(c); ++ const pcre_uint8 chartype = UCD_CHARTYPE(c); + switch(code[1]) + { + case PT_ANY: +@@ -1023,43 +1023,43 @@ for (;;) + break; + + case PT_LAMP: +- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || +- prop->chartype == ucp_Lt; ++ OK = chartype == ucp_Lu || chartype == ucp_Ll || ++ chartype == ucp_Lt; + break; + + case PT_GC: +- OK = PRIV(ucp_gentype)[prop->chartype] == code[2]; ++ OK = PRIV(ucp_gentype)[chartype] == code[2]; + break; + + case PT_PC: +- OK = prop->chartype == code[2]; ++ OK = chartype == code[2]; + break; + + case PT_SC: +- OK = prop->script == code[2]; ++ OK = UCD_SCRIPT(c) == code[2]; + break; + + /* These are specials for combination cases. */ + + case PT_ALNUM: +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || +- PRIV(ucp_gentype)[prop->chartype] == ucp_N; ++ OK = PRIV(ucp_gentype)[chartype] == ucp_L || ++ PRIV(ucp_gentype)[chartype] == ucp_N; + break; + + case PT_SPACE: /* Perl space */ +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || ++ OK = PRIV(ucp_gentype)[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; + break; + + case PT_PXSPACE: /* POSIX space */ +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || ++ OK = PRIV(ucp_gentype)[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR; + break; + + case PT_WORD: +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || +- PRIV(ucp_gentype)[prop->chartype] == ucp_N || ++ OK = PRIV(ucp_gentype)[chartype] == ucp_L || ++ PRIV(ucp_gentype)[chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + +@@ -1209,7 +1209,7 @@ for (;;) + if (clen > 0) + { + BOOL OK; +- const ucd_record * prop = GET_UCD(c); ++ const pcre_uint8 chartype = UCD_CHARTYPE(c); + switch(code[2]) + { + case PT_ANY: +@@ -1217,43 +1217,43 @@ for (;;) + break; + + case PT_LAMP: +- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || +- prop->chartype == ucp_Lt; ++ OK = chartype == ucp_Lu || chartype == ucp_Ll || ++ chartype == ucp_Lt; + break; + + case PT_GC: +- OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; ++ OK = PRIV(ucp_gentype)[chartype] == code[3]; + break; + + case PT_PC: +- OK = prop->chartype == code[3]; ++ OK = chartype == code[3]; + break; + + case PT_SC: +- OK = prop->script == code[3]; ++ OK = UCD_SCRIPT(c) == code[3]; + break; + + /* These are specials for combination cases. */ + + case PT_ALNUM: +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || +- PRIV(ucp_gentype)[prop->chartype] == ucp_N; ++ OK = PRIV(ucp_gentype)[chartype] == ucp_L || ++ PRIV(ucp_gentype)[chartype] == ucp_N; + break; + + case PT_SPACE: /* Perl space */ +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || ++ OK = PRIV(ucp_gentype)[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; + break; + + case PT_PXSPACE: /* POSIX space */ +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || ++ OK = PRIV(ucp_gentype)[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR; + break; + + case PT_WORD: +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || +- PRIV(ucp_gentype)[prop->chartype] == ucp_N || ++ OK = PRIV(ucp_gentype)[chartype] == ucp_L || ++ PRIV(ucp_gentype)[chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + +@@ -1456,7 +1456,7 @@ for (;;) + if (clen > 0) + { + BOOL OK; +- const ucd_record * prop = GET_UCD(c); ++ const pcre_uint8 chartype = UCD_CHARTYPE(c); + switch(code[2]) + { + case PT_ANY: +@@ -1464,43 +1464,43 @@ for (;;) + break; + + case PT_LAMP: +- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || +- prop->chartype == ucp_Lt; ++ OK = chartype == ucp_Lu || chartype == ucp_Ll || ++ chartype == ucp_Lt; + break; + + case PT_GC: +- OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; ++ OK = PRIV(ucp_gentype)[chartype] == code[3]; + break; + + case PT_PC: +- OK = prop->chartype == code[3]; ++ OK = chartype == code[3]; + break; + + case PT_SC: +- OK = prop->script == code[3]; ++ OK = UCD_SCRIPT(c) == code[3]; + break; + + /* These are specials for combination cases. */ + + case PT_ALNUM: +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || +- PRIV(ucp_gentype)[prop->chartype] == ucp_N; ++ OK = PRIV(ucp_gentype)[chartype] == ucp_L || ++ PRIV(ucp_gentype)[chartype] == ucp_N; + break; + + case PT_SPACE: /* Perl space */ +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || ++ OK = PRIV(ucp_gentype)[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; + break; + + case PT_PXSPACE: /* POSIX space */ +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || ++ OK = PRIV(ucp_gentype)[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR; + break; + + case PT_WORD: +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || +- PRIV(ucp_gentype)[prop->chartype] == ucp_N || ++ OK = PRIV(ucp_gentype)[chartype] == ucp_L || ++ PRIV(ucp_gentype)[chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + +@@ -1728,7 +1728,7 @@ for (;;) + if (clen > 0) + { + BOOL OK; +- const ucd_record * prop = GET_UCD(c); ++ const pcre_uint8 chartype = UCD_CHARTYPE(c); + switch(code[1 + IMM2_SIZE + 1]) + { + case PT_ANY: +@@ -1736,43 +1736,43 @@ for (;;) + break; + + case PT_LAMP: +- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || +- prop->chartype == ucp_Lt; ++ OK = chartype == ucp_Lu || chartype == ucp_Ll || ++ chartype == ucp_Lt; + break; + + case PT_GC: +- OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2]; ++ OK = PRIV(ucp_gentype)[chartype] == code[1 + IMM2_SIZE + 2]; + break; + + case PT_PC: +- OK = prop->chartype == code[1 + IMM2_SIZE + 2]; ++ OK = chartype == code[1 + IMM2_SIZE + 2]; + break; + + case PT_SC: +- OK = prop->script == code[1 + IMM2_SIZE + 2]; ++ OK = UCD_SCRIPT(c) == code[1 + IMM2_SIZE + 2]; + break; + + /* These are specials for combination cases. */ + + case PT_ALNUM: +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || +- PRIV(ucp_gentype)[prop->chartype] == ucp_N; ++ OK = PRIV(ucp_gentype)[chartype] == ucp_L || ++ PRIV(ucp_gentype)[chartype] == ucp_N; + break; + + case PT_SPACE: /* Perl space */ +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || ++ OK = PRIV(ucp_gentype)[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; + break; + + case PT_PXSPACE: /* POSIX space */ +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || ++ OK = PRIV(ucp_gentype)[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR; + break; + + case PT_WORD: +- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || +- PRIV(ucp_gentype)[prop->chartype] == ucp_N || ++ OK = PRIV(ucp_gentype)[chartype] == ucp_L || ++ PRIV(ucp_gentype)[chartype] == ucp_N || + c == CHAR_UNDERSCORE; + break; + +diff --git a/glib/pcre/pcre_exec.c b/glib/pcre/pcre_exec.c +index b715353..8eb3162 100644 +--- a/glib/pcre/pcre_exec.c ++++ b/glib/pcre/pcre_exec.c +@@ -2507,7 +2507,7 @@ for (;;) + } + GETCHARINCTEST(c, eptr); + { +- const ucd_record *prop = GET_UCD(c); ++ const pcre_uint8 chartype = UCD_CHARTYPE(c); + + switch(ecode[1]) + { +@@ -2516,44 +2516,44 @@ for (;;) + break; + + case PT_LAMP: +- if ((prop->chartype == ucp_Lu || +- prop->chartype == ucp_Ll || +- prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) ++ if ((chartype == ucp_Lu || ++ chartype == ucp_Ll || ++ chartype == ucp_Lt) == (op == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + break; + + case PT_GC: +- if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP)) ++ if ((ecode[2] != PRIV(ucp_gentype)[chartype]) == (op == OP_PROP)) + RRETURN(MATCH_NOMATCH); + break; + + case PT_PC: +- if ((ecode[2] != prop->chartype) == (op == OP_PROP)) ++ if ((ecode[2] != chartype) == (op == OP_PROP)) + RRETURN(MATCH_NOMATCH); + break; + + case PT_SC: +- if ((ecode[2] != prop->script) == (op == OP_PROP)) ++ if ((ecode[2] != UCD_SCRIPT(c)) == (op == OP_PROP)) + RRETURN(MATCH_NOMATCH); + break; + + /* These are specials */ + + case PT_ALNUM: +- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || +- PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP)) ++ if ((PRIV(ucp_gentype)[chartype] == ucp_L || ++ PRIV(ucp_gentype)[chartype] == ucp_N) == (op == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + break; + + case PT_SPACE: /* Perl space */ +- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || ++ if ((PRIV(ucp_gentype)[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) + == (op == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + break; + + case PT_PXSPACE: /* POSIX space */ +- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || ++ if ((PRIV(ucp_gentype)[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR) + == (op == OP_NOTPROP)) +@@ -2561,8 +2561,8 @@ for (;;) + break; + + case PT_WORD: +- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || +- PRIV(ucp_gentype)[prop->chartype] == ucp_N || ++ if ((PRIV(ucp_gentype)[chartype] == ucp_L || ++ PRIV(ucp_gentype)[chartype] == ucp_N || + c == CHAR_UNDERSCORE) == (op == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + break; +diff --git a/glib/pcre/pcre_internal.h b/glib/pcre/pcre_internal.h +index e5a4b6a..41c7ee3 100644 +--- a/glib/pcre/pcre_internal.h ++++ b/glib/pcre/pcre_internal.h +@@ -2315,15 +2315,12 @@ extern const int PRIV(ucp_typerange)[]; + #ifdef SUPPORT_UCP + /* UCD access macros */ + +-#define UCD_BLOCK_SIZE 128 +-#define GET_UCD(ch) (PRIV(ucd_records) + \ +- PRIV(ucd_stage2)[PRIV(ucd_stage1)[(ch) / UCD_BLOCK_SIZE] * \ +- UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE]) ++unsigned int _pcre_ucp_othercase(const unsigned int c); + +-#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype +-#define UCD_SCRIPT(ch) GET_UCD(ch)->script ++#define UCD_CHARTYPE(ch) (pcre_uint8)g_unichar_type((gunichar)(ch)) ++#define UCD_SCRIPT(ch) (pcre_uint8)g_unichar_get_script((gunichar)(ch)) + #define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)] +-#define UCD_OTHERCASE(ch) (ch + GET_UCD(ch)->other_case) ++#define UCD_OTHERCASE(ch) (_pcre_ucp_othercase(ch)) + + #endif /* SUPPORT_UCP */ + +diff --git a/glib/pcre/pcre_tables.c b/glib/pcre/pcre_tables.c +index c8134ec..47becc7 100644 +--- a/glib/pcre/pcre_tables.c ++++ b/glib/pcre/pcre_tables.c +@@ -563,6 +563,22 @@ const ucp_type_table PRIV(utt)[] = { + + const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table); + ++unsigned int ++_pcre_ucp_othercase(const unsigned int c) ++{ ++ int other_case = NOTACHAR; ++ ++ if (g_unichar_islower(c)) ++ other_case = g_unichar_toupper(c); ++ else if (g_unichar_isupper(c)) ++ other_case = g_unichar_tolower(c); ++ ++ if (other_case == c) ++ other_case = NOTACHAR; ++ ++ return other_case; ++} ++ + #endif /* SUPPORT_UTF */ + + /* End of pcre_tables.c */ +diff --git a/glib/pcre/pcre_xclass.c b/glib/pcre/pcre_xclass.c +index dca7a39..e5a55d7 100644 +--- a/glib/pcre/pcre_xclass.c ++++ b/glib/pcre/pcre_xclass.c +@@ -127,7 +127,7 @@ while ((t = *data++) != XCL_END) + #ifdef SUPPORT_UCP + else /* XCL_PROP & XCL_NOTPROP */ + { +- const ucd_record *prop = GET_UCD(c); ++ const pcre_uint8 chartype = UCD_CHARTYPE(c); + + switch(*data) + { +@@ -136,46 +136,46 @@ while ((t = *data++) != XCL_END) + break; + + case PT_LAMP: +- if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || +- prop->chartype == ucp_Lt) == (t == XCL_PROP)) return !negated; ++ if ((chartype == ucp_Lu || chartype == ucp_Ll || ++ chartype == ucp_Lt) == (t == XCL_PROP)) return !negated; + break; + + case PT_GC: +- if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == (t == XCL_PROP)) ++ if ((data[1] == PRIV(ucp_gentype)[chartype]) == (t == XCL_PROP)) + return !negated; + break; + + case PT_PC: +- if ((data[1] == prop->chartype) == (t == XCL_PROP)) return !negated; ++ if ((data[1] == chartype) == (t == XCL_PROP)) return !negated; + break; + + case PT_SC: +- if ((data[1] == prop->script) == (t == XCL_PROP)) return !negated; ++ if ((data[1] == UCD_SCRIPT(c)) == (t == XCL_PROP)) return !negated; + break; + + case PT_ALNUM: +- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || +- PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (t == XCL_PROP)) ++ if ((PRIV(ucp_gentype)[chartype] == ucp_L || ++ PRIV(ucp_gentype)[chartype] == ucp_N) == (t == XCL_PROP)) + return !negated; + break; + + case PT_SPACE: /* Perl space */ +- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || ++ if ((PRIV(ucp_gentype)[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) + == (t == XCL_PROP)) + return !negated; + break; + + case PT_PXSPACE: /* POSIX space */ +- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || ++ if ((PRIV(ucp_gentype)[chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP)) + return !negated; + break; + + case PT_WORD: +- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || +- PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE) ++ if ((PRIV(ucp_gentype)[chartype] == ucp_L || ++ PRIV(ucp_gentype)[chartype] == ucp_N || c == CHAR_UNDERSCORE) + == (t == XCL_PROP)) + return !negated; + break; +diff --git a/glib/pcre/ucp.h b/glib/pcre/ucp.h +index 59c3bec..53a48c9 100644 +--- a/glib/pcre/ucp.h ++++ b/glib/pcre/ucp.h +@@ -10,6 +10,7 @@ the UCD access macros. New values that are added for new releases of Unicode + should always be at the end of each enum, for backwards compatibility. */ + + /* These are the general character categories. */ ++#include "gunicode.h" + + enum { + ucp_C, /* Other */ +@@ -24,148 +25,148 @@ enum { + /* These are the particular character types. */ + + enum { +- ucp_Cc, /* Control */ +- ucp_Cf, /* Format */ +- ucp_Cn, /* Unassigned */ +- ucp_Co, /* Private use */ +- ucp_Cs, /* Surrogate */ +- ucp_Ll, /* Lower case letter */ +- ucp_Lm, /* Modifier letter */ +- ucp_Lo, /* Other letter */ +- ucp_Lt, /* Title case letter */ +- ucp_Lu, /* Upper case letter */ +- ucp_Mc, /* Spacing mark */ +- ucp_Me, /* Enclosing mark */ +- ucp_Mn, /* Non-spacing mark */ +- ucp_Nd, /* Decimal number */ +- ucp_Nl, /* Letter number */ +- ucp_No, /* Other number */ +- ucp_Pc, /* Connector punctuation */ +- ucp_Pd, /* Dash punctuation */ +- ucp_Pe, /* Close punctuation */ +- ucp_Pf, /* Final punctuation */ +- ucp_Pi, /* Initial punctuation */ +- ucp_Po, /* Other punctuation */ +- ucp_Ps, /* Open punctuation */ +- ucp_Sc, /* Currency symbol */ +- ucp_Sk, /* Modifier symbol */ +- ucp_Sm, /* Mathematical symbol */ +- ucp_So, /* Other symbol */ +- ucp_Zl, /* Line separator */ +- ucp_Zp, /* Paragraph separator */ +- ucp_Zs /* Space separator */ ++ ucp_Cc = G_UNICODE_CONTROL, /* Control */ ++ ucp_Cf = G_UNICODE_FORMAT, /* Format */ ++ ucp_Cn = G_UNICODE_UNASSIGNED, /* Unassigned */ ++ ucp_Co = G_UNICODE_PRIVATE_USE, /* Private use */ ++ ucp_Cs = G_UNICODE_SURROGATE, /* Surrogate */ ++ ucp_Ll = G_UNICODE_LOWERCASE_LETTER, /* Lower case letter */ ++ ucp_Lm = G_UNICODE_MODIFIER_LETTER, /* Modifier letter */ ++ ucp_Lo = G_UNICODE_OTHER_LETTER, /* Other letter */ ++ ucp_Lt = G_UNICODE_TITLECASE_LETTER, /* Title case letter */ ++ ucp_Lu = G_UNICODE_UPPERCASE_LETTER, /* Upper case letter */ ++ ucp_Mc = G_UNICODE_SPACING_MARK, /* Spacing mark */ ++ ucp_Me = G_UNICODE_ENCLOSING_MARK, /* Enclosing mark */ ++ ucp_Mn = G_UNICODE_NON_SPACING_MARK, /* Non-spacing mark */ ++ ucp_Nd = G_UNICODE_DECIMAL_NUMBER, /* Decimal number */ ++ ucp_Nl = G_UNICODE_LETTER_NUMBER, /* Letter number */ ++ ucp_No = G_UNICODE_OTHER_NUMBER, /* Other number */ ++ ucp_Pc = G_UNICODE_CONNECT_PUNCTUATION, /* Connector punctuation */ ++ ucp_Pd = G_UNICODE_DASH_PUNCTUATION, /* Dash punctuation */ ++ ucp_Pe = G_UNICODE_CLOSE_PUNCTUATION, /* Close punctuation */ ++ ucp_Pf = G_UNICODE_FINAL_PUNCTUATION, /* Final punctuation */ ++ ucp_Pi = G_UNICODE_INITIAL_PUNCTUATION, /* Initial punctuation */ ++ ucp_Po = G_UNICODE_OTHER_PUNCTUATION, /* Other punctuation */ ++ ucp_Ps = G_UNICODE_OPEN_PUNCTUATION, /* Open punctuation */ ++ ucp_Sc = G_UNICODE_CURRENCY_SYMBOL, /* Currency symbol */ ++ ucp_Sk = G_UNICODE_MODIFIER_SYMBOL, /* Modifier symbol */ ++ ucp_Sm = G_UNICODE_MATH_SYMBOL, /* Mathematical symbol */ ++ ucp_So = G_UNICODE_OTHER_SYMBOL, /* Other symbol */ ++ ucp_Zl = G_UNICODE_LINE_SEPARATOR, /* Line separator */ ++ ucp_Zp = G_UNICODE_PARAGRAPH_SEPARATOR, /* Paragraph separator */ ++ ucp_Zs = G_UNICODE_SPACE_SEPARATOR /* Space separator */ + }; + /* These are the script identifications. */ enum { @@ -65,11 +691,6 @@ - ucp_Tifinagh, - ucp_Ugaritic, - ucp_Yi, -- ucp_Balinese, /* New for Unicode 5.0.0 */ -- ucp_Cuneiform, /* New for Unicode 5.0.0 */ -- ucp_Nko, /* New for Unicode 5.0.0 */ -- ucp_Phags_Pa, /* New for Unicode 5.0.0 */ -- ucp_Phoenician /* New for Unicode 5.0.0 */ + ucp_Arabic = G_UNICODE_SCRIPT_ARABIC, + ucp_Armenian = G_UNICODE_SCRIPT_ARMENIAN, + ucp_Bengali = G_UNICODE_SCRIPT_BENGALI, @@ -131,11 +752,96 @@ + ucp_Tifinagh = G_UNICODE_SCRIPT_TIFINAGH, + ucp_Ugaritic = G_UNICODE_SCRIPT_UGARITIC, + ucp_Yi = G_UNICODE_SCRIPT_YI, -+ ucp_Balinese = G_UNICODE_SCRIPT_BALINESE, /* New for Unicode 5.0.0 */ -+ ucp_Cuneiform = G_UNICODE_SCRIPT_CUNEIFORM, /* New for Unicode 5.0.0 */ -+ ucp_Nko = G_UNICODE_SCRIPT_NKO, /* New for Unicode 5.0.0 */ -+ ucp_Phags_Pa = G_UNICODE_SCRIPT_PHAGS_PA, /* New for Unicode 5.0.0 */ -+ ucp_Phoenician = G_UNICODE_SCRIPT_PHOENICIAN /* New for Unicode 5.0.0 */ + /* New for Unicode 5.0: */ +- ucp_Balinese, +- ucp_Cuneiform, +- ucp_Nko, +- ucp_Phags_Pa, +- ucp_Phoenician, ++ ucp_Balinese = G_UNICODE_SCRIPT_BALINESE, ++ ucp_Cuneiform = G_UNICODE_SCRIPT_CUNEIFORM, ++ ucp_Nko = G_UNICODE_SCRIPT_NKO, ++ ucp_Phags_Pa = G_UNICODE_SCRIPT_PHAGS_PA, ++ ucp_Phoenician = G_UNICODE_SCRIPT_PHOENICIAN, + /* New for Unicode 5.1: */ +- ucp_Carian, +- ucp_Cham, +- ucp_Kayah_Li, +- ucp_Lepcha, +- ucp_Lycian, +- ucp_Lydian, +- ucp_Ol_Chiki, +- ucp_Rejang, +- ucp_Saurashtra, +- ucp_Sundanese, +- ucp_Vai, ++ ucp_Carian = G_UNICODE_SCRIPT_CARIAN, ++ ucp_Cham = G_UNICODE_SCRIPT_CHAM, ++ ucp_Kayah_Li = G_UNICODE_SCRIPT_KAYAH_LI, ++ ucp_Lepcha = G_UNICODE_SCRIPT_LEPCHA, ++ ucp_Lycian = G_UNICODE_SCRIPT_LYCIAN, ++ ucp_Lydian = G_UNICODE_SCRIPT_LYDIAN, ++ ucp_Ol_Chiki = G_UNICODE_SCRIPT_OL_CHIKI, ++ ucp_Rejang = G_UNICODE_SCRIPT_REJANG, ++ ucp_Saurashtra = G_UNICODE_SCRIPT_SAURASHTRA, ++ ucp_Sundanese = G_UNICODE_SCRIPT_SUNDANESE, ++ ucp_Vai = G_UNICODE_SCRIPT_VAI, + /* New for Unicode 5.2: */ +- ucp_Avestan, +- ucp_Bamum, +- ucp_Egyptian_Hieroglyphs, +- ucp_Imperial_Aramaic, +- ucp_Inscriptional_Pahlavi, +- ucp_Inscriptional_Parthian, +- ucp_Javanese, +- ucp_Kaithi, +- ucp_Lisu, +- ucp_Meetei_Mayek, +- ucp_Old_South_Arabian, +- ucp_Old_Turkic, +- ucp_Samaritan, +- ucp_Tai_Tham, +- ucp_Tai_Viet, ++ ucp_Avestan = G_UNICODE_SCRIPT_AVESTAN, ++ ucp_Bamum = G_UNICODE_SCRIPT_BAMUM, ++ ucp_Egyptian_Hieroglyphs = G_UNICODE_SCRIPT_EGYPTIAN_HIEROGLYPHS, ++ ucp_Imperial_Aramaic = G_UNICODE_SCRIPT_IMPERIAL_ARAMAIC, ++ ucp_Inscriptional_Pahlavi = G_UNICODE_SCRIPT_INSCRIPTIONAL_PAHLAVI, ++ ucp_Inscriptional_Parthian = G_UNICODE_SCRIPT_INSCRIPTIONAL_PARTHIAN, ++ ucp_Javanese = G_UNICODE_SCRIPT_JAVANESE, ++ ucp_Kaithi = G_UNICODE_SCRIPT_KAITHI, ++ ucp_Lisu = G_UNICODE_SCRIPT_LISU, ++ ucp_Meetei_Mayek = G_UNICODE_SCRIPT_MEETEI_MAYEK, ++ ucp_Old_South_Arabian = G_UNICODE_SCRIPT_OLD_SOUTH_ARABIAN, ++ ucp_Old_Turkic = G_UNICODE_SCRIPT_OLD_TURKIC, ++ ucp_Samaritan = G_UNICODE_SCRIPT_SAMARITAN, ++ ucp_Tai_Tham = G_UNICODE_SCRIPT_TAI_THAM, ++ ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET, + /* New for Unicode 6.0.0: */ +- ucp_Batak, +- ucp_Brahmi, +- ucp_Mandaic, ++ ucp_Batak = G_UNICODE_SCRIPT_BATAK, ++ ucp_Brahmi = G_UNICODE_SCRIPT_BRAHMI, ++ ucp_Mandaic = G_UNICODE_SCRIPT_MANDAIC, + /* New for Unicode 6.1.0: */ +- ucp_Chakma, +- ucp_Meroitic_Cursive, +- ucp_Meroitic_Hieroglyphs, +- ucp_Miao, +- ucp_Sharada, +- ucp_Sora_Sompeng, +- ucp_Takri ++ ucp_Chakma = G_UNICODE_SCRIPT_CHAKMA, ++ ucp_Meroitic_Cursive = G_UNICODE_SCRIPT_MEROITIC_CURSIVE, ++ ucp_Meroitic_Hieroglyphs = G_UNICODE_SCRIPT_MEROITIC_HIEROGLYPHS, ++ ucp_Miao = G_UNICODE_SCRIPT_MIAO, ++ ucp_Sharada = G_UNICODE_SCRIPT_SHARADA, ++ ucp_Sora_Sompeng = G_UNICODE_SCRIPT_SORA_SOMPENG, ++ ucp_Takri = G_UNICODE_SCRIPT_TAKRI, }; #endif +-- +1.7.5.1.217.g4e3aa.dirty + -- 2.7.4