static void
setutf8string(Repodata *repodata, Id handle, Id tag, const char *str)
{
- const unsigned char *cp;
- int state = 0;
- int c;
- unsigned char *buf = 0, *bp;
-
- /* check if it's already utf8, code taken from screen ;-) */
- cp = (const unsigned char *)str;
- while ((c = *cp++) != 0)
- {
- if (state)
- {
- if ((c & 0xc0) != 0x80)
- break; /* encoding error */
- c = (c & 0x3f) | (state << 6);
- if (!(state & 0x40000000))
- {
- /* check for overlong sequences */
- if ((c & 0x820823e0) == 0x80000000)
- break;
- else if ((c & 0x020821f0) == 0x02000000)
- break;
- else if ((c & 0x000820f8) == 0x00080000)
- break;
- else if ((c & 0x0000207c) == 0x00002000)
- break;
- }
- }
- else
- {
- /* new sequence */
- if (c >= 0xfe)
- break;
- else if (c >= 0xfc)
- c = (c & 0x01) | 0xbffffffc; /* 5 bytes to follow */
- else if (c >= 0xf8)
- c = (c & 0x03) | 0xbfffff00; /* 4 */
- else if (c >= 0xf0)
- c = (c & 0x07) | 0xbfffc000; /* 3 */
- else if (c >= 0xe0)
- c = (c & 0x0f) | 0xbff00000; /* 2 */
- else if (c >= 0xc2)
- c = (c & 0x1f) | 0xfc000000; /* 1 */
- else if (c >= 0x80)
- break;
- }
- state = (c & 0x80000000) ? c : 0;
- }
- if (c)
+ if (str[solv_validutf8(str)]) /* validation stopped before the terminating NUL: str is not valid utf8 */
 {
- /* not utf8, assume latin1 */
- buf = solv_malloc(2 * strlen(str) + 1);
- cp = (const unsigned char *)str;
- str = (char *)buf;
- bp = buf;
- while ((c = *cp++) != 0)
- {
- if (c >= 0xc0)
- {
- *bp++ = 0xc3;
- c ^= 0xc0 ^ 0x80;
- }
- else if (c >= 0x80)
- *bp++ = 0xc2;
- *bp++ = c;
- }
- *bp++ = 0;
+ char *ustr = solv_latin1toutf8(str); /* not utf8, assume latin1; ustr is a converted temporary copy */
+ repodata_set_str(repodata, handle, tag, ustr);
+ solv_free(ustr); /* the copy is released right after it has been handed to repodata_set_str */
 }
- repodata_set_str(repodata, handle, tag, str);
- if (buf)
- solv_free(buf);
+ else
+ repodata_set_str(repodata, handle, tag, str); /* accepted as utf8, store the caller's string unchanged */
}
if (ct[i])
repodata_set_num(data, h, SOLVABLE_CHANGELOG_TIME, ct[i]);
if (cn[i])
- repodata_set_str(data, h, SOLVABLE_CHANGELOG_AUTHOR, cn[i]);
+ setutf8string(data, h, SOLVABLE_CHANGELOG_AUTHOR, cn[i]);
if (cx[i])
- repodata_set_str(data, h, SOLVABLE_CHANGELOG_TEXT, cx[i]);
+ setutf8string(data, h, SOLVABLE_CHANGELOG_TEXT, cx[i]);
queue_push(&hq, h);
}
for (i = 0; i < hq.count; i++)
#include "chksum.h"
#include "repo_rpmdb.h"
-/* FIXME: dedup with repo_rpmdb.c */
-static void
+static void
setutf8string(Repodata *repodata, Id handle, Id tag, const char *str)
{
- const unsigned char *cp;
- int state = 0;
- int c;
- unsigned char *buf = 0, *bp;
-
- /* check if it's already utf8, code taken from screen ;-) */
- cp = (const unsigned char *)str;
- while ((c = *cp++) != 0)
- {
- if (state)
- {
- if ((c & 0xc0) != 0x80)
- break; /* encoding error */
- c = (c & 0x3f) | (state << 6);
- if (!(state & 0x40000000))
- {
- /* check for overlong sequences */
- if ((c & 0x820823e0) == 0x80000000)
- break;
- else if ((c & 0x020821f0) == 0x02000000)
- break;
- else if ((c & 0x000820f8) == 0x00080000)
- break;
- else if ((c & 0x0000207c) == 0x00002000)
- break;
- }
- }
- else
- {
- /* new sequence */
- if (c >= 0xfe)
- break;
- else if (c >= 0xfc)
- c = (c & 0x01) | 0xbffffffc; /* 5 bytes to follow */
- else if (c >= 0xf8)
- c = (c & 0x03) | 0xbfffff00; /* 4 */
- else if (c >= 0xf0)
- c = (c & 0x07) | 0xbfffc000; /* 3 */
- else if (c >= 0xe0)
- c = (c & 0x0f) | 0xbff00000; /* 2 */
- else if (c >= 0xc2)
- c = (c & 0x1f) | 0xfc000000; /* 1 */
- else if (c >= 0x80)
- break;
- }
- state = (c & 0x80000000) ? c : 0;
- }
- if (c)
- {
- /* not utf8, assume latin1 */
- buf = solv_malloc(2 * strlen(str) + 1);
- cp = (const unsigned char *)str;
- str = (char *)buf;
- bp = buf;
- while ((c = *cp++) != 0)
- {
- if (c >= 0xc0)
- {
- *bp++ = 0xc3;
- c ^= 0xc0 ^ 0x80;
- }
- else if (c >= 0x80)
- *bp++ = 0xc2;
- *bp++ = c;
- }
- *bp++ = 0;
- }
- repodata_set_str(repodata, handle, tag, str);
- if (buf)
- solv_free(buf);
+ if (str[solv_validutf8(str)]) /* validation stopped before the terminating NUL: str is not valid utf8 */
+ {
+ char *ustr = solv_latin1toutf8(str); /* not utf8, assume latin1; ustr is a converted temporary copy */
+ repodata_set_str(repodata, handle, tag, ustr);
+ solv_free(ustr); /* the copy is released right after it has been handed to repodata_set_str */
+ }
+ else
+ repodata_set_str(repodata, handle, tag, str); /* accepted as utf8, store the caller's string unchanged */
}
static char *
solv_dupjoin;
solv_free;
solv_hex2bin;
+ solv_latin1toutf8;
solv_malloc;
solv_malloc2;
solv_oom;
solv_realloc;
solv_realloc2;
+ solv_replacebadutf8;
solv_sort;
solv_strdup;
solv_timems;
+ solv_validutf8;
solv_vercmp;
solv_vercmp_deb;
solv_vercmp_haiku;
return str;
}
+size_t
+solv_validutf8(const char *buf)
+{
+ const unsigned char *p;
+ int x;
+ /*
+  * Walk buf up to its terminating NUL and stop at the first byte that does
+  * not start an accepted sequence. The return value is the offset of that
+  * byte, so buf[result] is NUL exactly when the whole string was accepted.
+  *
+  * Rejected: stray continuation bytes, truncated sequences, overlong
+  * ("not minimal") forms, the surrogate range d800-dfff, fffe/ffff, and
+  * anything that would decode above 0x10ffff.
+  *
+  * A NUL inside a multi-byte sequence fails the (byte & 0xc0) == 0x80 test
+  * and the || chains stop at the first failing test, so nothing behind the
+  * terminator is looked at.
+  */
+ for (p = (const unsigned char *)buf; (x = *p) != 0; p++)
+ {
+ if (x < 0x80)
+ continue; /* plain ascii */
+ if (x < 0xc0)
+ break; /* 0x80-0xbf: continuation byte without a lead byte */
+ if (x < 0xe0)
+ {
+ /* one byte to follow */
+ if ((p[1] & 0xc0) != 0x80)
+ break;
+ if ((x & 0x1e) == 0)
+ break; /* not minimal: lead byte 0xc0 or 0xc1, value would be below 0x80 */
+ p += 1; /* step onto the last byte of the sequence, the loop's p++ does the rest */
+ continue;
+ }
+ if (x < 0xf0)
+ {
+ /* two bytes to follow */
+ if ((p[1] & 0xc0) != 0x80 || (p[2] & 0xc0) != 0x80)
+ break;
+ if ((x & 0x0f) == 0 && (p[1] & 0x20) == 0)
+ break; /* not minimal: 0xe0 followed by 0x80-0x9f, value would be below 0x800 */
+ if (x == 0xed && (p[1] & 0x20) != 0)
+ break; /* d800-dfff surrogate: 0xed followed by 0xa0-0xbf */
+ if (x == 0xef && p[1] == 0xbf && (p[2] == 0xbe || p[2] == 0xbf))
+ break; /* fffe or ffff */
+ p += 2;
+ continue;
+ }
+ if (x < 0xf8)
+ {
+ /* three bytes to follow */
+ if ((p[1] & 0xc0) != 0x80 || (p[2] & 0xc0) != 0x80 || (p[3] & 0xc0) != 0x80)
+ break;
+ if ((x & 0x07) == 0 && (p[1] & 0x30) == 0)
+ break; /* not minimal: 0xf0 followed by 0x80-0x8f, value would be below 0x10000 */
+ if ((x & 0x07) > 4 || ((x & 0x07) == 4 && (p[1] & 0x30) != 0))
+ break; /* above 0x10ffff: lead 0xf5-0xf7, or 0xf4 followed by 0x90-0xbf */
+ p += 3;
+ continue;
+ }
+ break; /* lead byte 0xf8-0xff: maybe a longer utf8 form, but above 0x10ffff */
+ }
+ return (const char *)p - buf;
+}
+
+char *
+solv_latin1toutf8(const char *buf)
+{
+ int l = 1; /* bytes needed on top of the input length: 1 for the NUL, +1 per high-bit byte */
+ const char *p;
+ char *r, *rp;
+ /*
+  * Return a newly solv_malloc()ed copy of buf in which every byte with the
+  * high bit set (treated as a latin1 character) is expanded to its two-byte
+  * utf8 form; bytes below 0x80 are copied unchanged. buf is not checked
+  * for already being utf8, that is up to the caller.
+  *
+  * First pass: count the bytes that need expanding, so that the result can
+  * be allocated with its exact size.
+  */
+ for (p = buf; *p; p++)
+ if ((*(const unsigned char *)p & 128) != 0)
+ l++;
+ r = rp = solv_malloc(p - buf + l); /* p now points at the NUL, so p - buf is the input length */
+ for (p = buf; *p; p++)
+ {
+ if ((*(const unsigned char *)p & 128) != 0)
+ {
+ *rp++ = *(const unsigned char *)p & 64 ? 0xc3 : 0xc2; /* lead byte: 0xc3 for 0xc0-0xff, 0xc2 for 0x80-0xbf */
+ *rp++ = *p & 0xbf; /* clearing bit 6 maps the byte into the 0x80-0xbf continuation range */
+ }
+ else
+ *rp++ = *p;
+ }
+ *rp = 0;
+ return r;
+}
+
+char *
+solv_replacebadutf8(const char *buf)
+{
+ size_t l, nl;
+ const char *p;
+ char *r = 0, *rp = 0;
+ /*
+  * Return a newly solv_malloc()ed copy of buf in which every stretch that
+  * solv_validutf8() refuses is replaced by the three bytes ef bf bd (0xfffd).
+  *
+  * The outer loop runs the same scan twice: the first time rp is still
+  * zero, so nothing is written and only the result length nl is computed;
+  * after the allocation the second run fills the buffer and leaves.
+  */
+ for (;;)
+ {
+ for (p = buf, nl = 0; *p; )
+ {
+ l = solv_validutf8(p); /* length of the accepted run starting at p */
+ if (rp && l)
+ {
+ memcpy(rp, p, l);
+ rp += l;
+ }
+ nl += l;
+ p += l;
+ if (!*p)
+ break; /* the accepted run reached the end of the string */
+ /* found a bad char, replace with 0xfffd (utf8 bytes ef bf bd) */
+ if (rp)
+ {
+ *rp++ = 0xef;
+ *rp++ = 0xbf;
+ *rp++ = 0xbd;
+ }
+ nl += 3;
+ p++; /* skip the byte that was refused ... */
+ while ((*(const unsigned char *)p & 0xc0) == 0x80)
+ p++; /* ... and all continuation bytes directly behind it */
+ }
+ if (rp)
+ break; /* that was the filling pass */
+ r = rp = solv_malloc(nl + 1); /* + 1 for the terminating NUL */
+ }
+ *rp = 0;
+ return r;
+}
extern char *solv_dupappend(const char *str1, const char *str2, const char *str3);
extern int solv_hex2bin(const char **strp, unsigned char *buf, int bufl);
extern char *solv_bin2hex(const unsigned char *buf, int l, char *str);
+extern size_t solv_validutf8(const char *buf); /* offset of the first byte not accepted as utf8; buf[offset] is NUL if all of buf was accepted */
+extern char *solv_latin1toutf8(const char *buf); /* newly allocated copy of buf with every high-bit byte expanded as a latin1 character */
+extern char *solv_replacebadutf8(const char *buf); /* newly allocated copy of buf with the parts solv_validutf8 refuses replaced by 0xfffd */
static inline void *solv_extend(void *buf, size_t len, size_t nmemb, size_t size, size_t block)