From c7bfd937a3e5c6aa90e7e8e2f34825d43be67969 Mon Sep 17 00:00:00 2001 From: Michael Schroeder Date: Mon, 6 May 2013 11:23:31 +0200 Subject: [PATCH] add utf8 helpers to util.c --- ext/repo_rpmdb.c | 78 ++++---------------------------- ext/repo_rpmdb_pubkey.c | 81 ++++----------------------------- src/libsolv.ver | 3 ++ src/util.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++ src/util.h | 3 ++ 5 files changed, 141 insertions(+), 142 deletions(-) diff --git a/ext/repo_rpmdb.c b/ext/repo_rpmdb.c index 5ed1c4e..6b1a196 100644 --- a/ext/repo_rpmdb.c +++ b/ext/repo_rpmdb.c @@ -380,76 +380,14 @@ static char *headtoevr(RpmHead *h) static void setutf8string(Repodata *repodata, Id handle, Id tag, const char *str) { - const unsigned char *cp; - int state = 0; - int c; - unsigned char *buf = 0, *bp; - - /* check if it's already utf8, code taken from screen ;-) */ - cp = (const unsigned char *)str; - while ((c = *cp++) != 0) - { - if (state) - { - if ((c & 0xc0) != 0x80) - break; /* encoding error */ - c = (c & 0x3f) | (state << 6); - if (!(state & 0x40000000)) - { - /* check for overlong sequences */ - if ((c & 0x820823e0) == 0x80000000) - break; - else if ((c & 0x020821f0) == 0x02000000) - break; - else if ((c & 0x000820f8) == 0x00080000) - break; - else if ((c & 0x0000207c) == 0x00002000) - break; - } - } - else - { - /* new sequence */ - if (c >= 0xfe) - break; - else if (c >= 0xfc) - c = (c & 0x01) | 0xbffffffc; /* 5 bytes to follow */ - else if (c >= 0xf8) - c = (c & 0x03) | 0xbfffff00; /* 4 */ - else if (c >= 0xf0) - c = (c & 0x07) | 0xbfffc000; /* 3 */ - else if (c >= 0xe0) - c = (c & 0x0f) | 0xbff00000; /* 2 */ - else if (c >= 0xc2) - c = (c & 0x1f) | 0xfc000000; /* 1 */ - else if (c >= 0x80) - break; - } - state = (c & 0x80000000) ? c : 0; - } - if (c) + if (str[solv_validutf8(str)]) { - /* not utf8, assume latin1 */ - buf = solv_malloc(2 * strlen(str) + 1); - cp = (const unsigned char *)str; - str = (char *)buf; - bp = buf; - while ((c = *cp++) != 0) - { - if (c >= 0xc0) - { - *bp++ = 0xc3; - c ^= 0xc0 ^ 0x80; - } - else if (c >= 0x80) - *bp++ = 0xc2; - *bp++ = c; - } - *bp++ = 0; + char *ustr = solv_latin1toutf8(str); /* not utf8, assume latin1 */ + repodata_set_str(repodata, handle, tag, ustr); + solv_free(ustr); } - repodata_set_str(repodata, handle, tag, str); - if (buf) - solv_free(buf); + else + repodata_set_str(repodata, handle, tag, str); } @@ -820,9 +758,9 @@ addchangelog(Repodata *data, Id handle, RpmHead *rpmhead) if (ct[i]) repodata_set_num(data, h, SOLVABLE_CHANGELOG_TIME, ct[i]); if (cn[i]) - repodata_set_str(data, h, SOLVABLE_CHANGELOG_AUTHOR, cn[i]); + setutf8string(data, h, SOLVABLE_CHANGELOG_AUTHOR, cn[i]); if (cx[i]) - repodata_set_str(data, h, SOLVABLE_CHANGELOG_TEXT, cx[i]); + setutf8string(data, h, SOLVABLE_CHANGELOG_TEXT, cx[i]); queue_push(&hq, h); } for (i = 0; i < hq.count; i++) diff --git a/ext/repo_rpmdb_pubkey.c b/ext/repo_rpmdb_pubkey.c index 2ca79c7..25bc60d 100644 --- a/ext/repo_rpmdb_pubkey.c +++ b/ext/repo_rpmdb_pubkey.c @@ -39,80 +39,17 @@ #include "chksum.h" #include "repo_rpmdb.h" -/* FIXME: dedup with repo_rpmdb.c */ -static void +static void setutf8string(Repodata *repodata, Id handle, Id tag, const char *str) { - const unsigned char *cp; - int state = 0; - int c; - unsigned char *buf = 0, *bp; - - /* check if it's already utf8, code taken from screen ;-) */ - cp = (const unsigned char *)str; - while ((c = *cp++) != 0) - { - if (state) - { - if ((c & 0xc0) != 0x80) - break; /* encoding error */ - c = (c & 0x3f) | (state << 6); - if (!(state & 0x40000000)) - { - /* check for overlong sequences */ - if ((c & 0x820823e0) == 0x80000000) - break; - else if ((c & 0x020821f0) == 0x02000000) - break; - else if ((c & 0x000820f8) == 0x00080000) - break; - else if ((c & 0x0000207c) == 0x00002000) - break; - } - } - else - { - /* new sequence */ - if (c >= 0xfe) - break; - else if (c >= 0xfc) - c = (c & 0x01) | 0xbffffffc; /* 5 bytes to follow */ - else if (c >= 0xf8) - c = (c & 0x03) | 0xbfffff00; /* 4 */ - else if (c >= 0xf0) - c = (c & 0x07) | 0xbfffc000; /* 3 */ - else if (c >= 0xe0) - c = (c & 0x0f) | 0xbff00000; /* 2 */ - else if (c >= 0xc2) - c = (c & 0x1f) | 0xfc000000; /* 1 */ - else if (c >= 0x80) - break; - } - state = (c & 0x80000000) ? c : 0; - } - if (c) - { - /* not utf8, assume latin1 */ - buf = solv_malloc(2 * strlen(str) + 1); - cp = (const unsigned char *)str; - str = (char *)buf; - bp = buf; - while ((c = *cp++) != 0) - { - if (c >= 0xc0) - { - *bp++ = 0xc3; - c ^= 0xc0 ^ 0x80; - } - else if (c >= 0x80) - *bp++ = 0xc2; - *bp++ = c; - } - *bp++ = 0; - } - repodata_set_str(repodata, handle, tag, str); - if (buf) - solv_free(buf); + if (str[solv_validutf8(str)]) + { + char *ustr = solv_latin1toutf8(str); /* not utf8, assume latin1 */ + repodata_set_str(repodata, handle, tag, ustr); + solv_free(ustr); + } + else + repodata_set_str(repodata, handle, tag, str); } static char * diff --git a/src/libsolv.ver b/src/libsolv.ver index 5cf7fa8..f1fcbc8 100644 --- a/src/libsolv.ver +++ b/src/libsolv.ver @@ -253,14 +253,17 @@ SOLV_1.0 { solv_dupjoin; solv_free; solv_hex2bin; + solv_latin1toutf8; solv_malloc; solv_malloc2; solv_oom; solv_realloc; solv_realloc2; + solv_replacebadutf8; solv_sort; solv_strdup; solv_timems; + solv_validutf8; solv_vercmp; solv_vercmp_deb; solv_vercmp_haiku; diff --git a/src/util.c b/src/util.c index 4fe2d63..4a24096 100644 --- a/src/util.c +++ b/src/util.c @@ -243,4 +243,122 @@ solv_bin2hex(const unsigned char *buf, int l, char *str) return str; } +size_t +solv_validutf8(const char *buf) +{ + const unsigned char *p; + int x; + + for (p = (const unsigned char *)buf; (x = *p) != 0; p++) + { + if (x < 0x80) + continue; + if (x < 0xc0) + break; + if (x < 0xe0) + { + /* one byte to follow */ + if ((p[1] & 0xc0) != 0x80) + break; + if ((x & 0x1e) == 0) + break; /* not minimal */ + p += 1; + continue; + } + if (x < 0xf0) + { + /* two bytes to follow */ + if ((p[1] & 0xc0) != 0x80 || (p[2] & 0xc0) != 0x80) + break; + if ((x & 0x0f) == 0 && (p[1] & 0x20) == 0) + break; /* not minimal */ + if (x == 0xed && (p[1] & 0x20) != 0) + break; /* d800-dfff surrogate */ + if (x == 0xef && p[1] == 0xbf && (p[2] == 0xbe || p[2] == 0xbf)) + break; /* fffe or ffff */ + p += 2; + continue; + } + if (x < 0xf8) + { + /* three bytes to follow */ + if ((p[1] & 0xc0) != 0x80 || (p[2] & 0xc0) != 0x80 || (p[3] & 0xc0) != 0x80) + break; + if ((x & 0x07) == 0 && (p[1] & 0x30) == 0) + break; /* not minimal */ + if ((x & 0x07) > 4 || ((x & 0x07) == 4 && (p[1] & 0x30) != 0)) + break; /* above 0x10ffff */ + p += 3; + continue; + } + break; /* maybe valid utf8, but above 0x10ffff */ + } + return (const char *)p - buf; +} + +char * +solv_latin1toutf8(const char *buf) +{ + int l = 1; + const char *p; + char *r, *rp; + + for (p = buf; *p; p++) + if ((*(const unsigned char *)p & 128) != 0) + l++; + r = rp = solv_malloc(p - buf + l); + for (p = buf; *p; p++) + { + if ((*(const unsigned char *)p & 128) != 0) + { + *rp++ = *(const unsigned char *)p & 64 ? 0xc3 : 0xc2; + *rp++ = *p & 0xbf; + } + else + *rp++ = *p; + } + *rp = 0; + return r; +} + +char * +solv_replacebadutf8(const char *buf) +{ + size_t l, nl; + const char *p; + char *r = 0, *rp = 0; + + for (;;) + { + for (p = buf, nl = 0; *p; ) + { + l = solv_validutf8(p); + if (rp && l) + { + memcpy(rp, p, l); + rp += l; + } + nl += l; + p += l; + if (!*p) + break; + /* found a bad char, replace with 0xfffd */ + if (rp) + { + *rp++ = 0xef; + *rp++ = 0xbf; + *rp++ = 0xbd; + } + nl += 3; + p++; + while ((*(const unsigned char *)p & 0xc0) == 0x80) + p++; + } + if (rp) + break; + r = rp = solv_malloc(nl + 1); + } + *rp = 0; + return r; +} diff --git a/src/util.h b/src/util.h index 82ad777..0c15d95 100644 --- a/src/util.h +++ b/src/util.h @@ -38,6 +38,9 @@ extern char *solv_dupjoin(const char *str1, const char *str2, const char *str3); extern char *solv_dupappend(const char *str1, const char *str2, const char *str3); extern int solv_hex2bin(const char **strp, unsigned char *buf, int bufl); extern char *solv_bin2hex(const unsigned char *buf, int l, char *str); +extern size_t solv_validutf8(const char *buf); +extern char *solv_latin1toutf8(const char *buf); +extern char *solv_replacebadutf8(const char *buf); static inline void *solv_extend(void *buf, size_t len, size_t nmemb, size_t size, size_t block) -- 2.7.4