From 70c82f1f7124a73b17591fe9620ca13da2c60c97 Mon Sep 17 00:00:00 2001 From: Gustavo Sverzut Barbieri Date: Fri, 7 Dec 2007 16:06:35 +0000 Subject: [PATCH] Charset conversion support. Now one can register various charsets to try to convert from and then call lms_charset_conv() to get it converted to UTF-8. All plugins were fixed to use this function. No non-UTF-8 data should be stored on DB anymore. --- src/bin/test.c | 17 +- src/lib/Makefile.am | 4 +- src/lib/lightmediascanner.c | 33 ++++ src/lib/lightmediascanner.h | 3 + src/lib/lightmediascanner_charset_conv.c | 291 +++++++++++++++++++++++++++++++ src/lib/lightmediascanner_charset_conv.h | 79 +++++++++ src/lib/lightmediascanner_plugin.h | 2 + src/plugins/id3lib/id3lib.cpp | 9 + src/plugins/jpeg/jpeg.c | 5 + src/plugins/m3u/m3u.c | 1 + src/plugins/pls/pls.c | 1 + src/plugins/video-dummy/video-dummy.c | 1 + 12 files changed, 440 insertions(+), 6 deletions(-) create mode 100644 src/lib/lightmediascanner_charset_conv.c create mode 100644 src/lib/lightmediascanner_charset_conv.h diff --git a/src/bin/test.c b/src/bin/test.c index 2ee4d44..dad2d64 100644 --- a/src/bin/test.c +++ b/src/bin/test.c @@ -12,7 +12,7 @@ usage(const char *prgname) fprintf(stderr, "Usage:\n" "\t%s " - "\n" + " \n" "\n", prgname); } @@ -20,7 +20,7 @@ usage(const char *prgname) int main(int argc, char *argv[]) { - char *db_path, *parser_name, *scan_path; + char *db_path, *parser_name, *charset, *scan_path; lms_t *lms; lms_plugin_t *parser; int commit_interval, slave_timeout; @@ -34,7 +34,8 @@ main(int argc, char *argv[]) slave_timeout = atoi(argv[2]); db_path = argv[3]; parser_name = argv[4]; - scan_path = argv[5]; + charset = argv[5]; + scan_path = argv[6]; lms = lms_new(db_path); if (!lms) { @@ -55,15 +56,21 @@ main(int argc, char *argv[]) return -2; } + if (lms_charset_add(lms, charset) != 0) { + fprintf(stderr, "ERROR: could not add charset '%s'\n", charset); + lms_free(lms); + return -3; + } + if (lms_process(lms, scan_path) != 0) { fprintf(stderr, "ERROR: processing \"%s\".\n", scan_path); lms_free(lms); - return -3; + return -4; } if (lms_free(lms) != 0) { fprintf(stderr, "ERROR: could not close light media scanner.\n"); - return -4; + return -5; } return 0; diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am index 86d2d92..8f206d1 100644 --- a/src/lib/Makefile.am +++ b/src/lib/Makefile.am @@ -6,7 +6,8 @@ include_HEADERS = \ lightmediascanner.h \ lightmediascanner_plugin.h \ lightmediascanner_utils.h \ - lightmediascanner_db.h + lightmediascanner_db.h \ + lightmediascanner_charset_conv.h noinst_HEADERS = lightmediascanner_db_private.h lib_LTLIBRARIES = liblightmediascanner.la @@ -14,6 +15,7 @@ lib_LTLIBRARIES = liblightmediascanner.la liblightmediascanner_la_SOURCES = \ lightmediascanner.c \ lightmediascanner_utils.c \ + lightmediascanner_charset_conv.c \ lightmediascanner_db_common.c \ lightmediascanner_db_image.c \ lightmediascanner_db_audio.c \ diff --git a/src/lib/lightmediascanner.c b/src/lib/lightmediascanner.c index a5ed083..a1411e9 100644 --- a/src/lib/lightmediascanner.c +++ b/src/lib/lightmediascanner.c @@ -40,6 +40,7 @@ #include "lightmediascanner.h" #include "lightmediascanner_plugin.h" #include "lightmediascanner_db_private.h" +#include "lightmediascanner_charset_conv.h" #define PATH_SIZE PATH_MAX #define DEFAULT_SLAVE_TIMEOUT 1000 @@ -68,6 +69,7 @@ struct parser { struct lms { struct parser *parsers; int n_parsers; + lms_charset_conv_t *cs_conv; char *db_path; int slave_timeout; unsigned int commit_interval; @@ -608,6 +610,7 @@ _retrieve_file_status(struct db *db, struct lms_file_info *finfo) static void _ctxt_init(struct lms_context *ctxt, const lms_t *lms, const struct db *db) { + ctxt->cs_conv = lms->cs_conv; ctxt->db = db->handle; } @@ -1218,11 +1221,18 @@ lms_new(const char *db_path) return NULL; } + lms->cs_conv = lms_charset_conv_new(); + if (!lms->cs_conv) { + free(lms); + return NULL; + } + lms->commit_interval = DEFAULT_COMMIT_INTERVAL; lms->slave_timeout = DEFAULT_SLAVE_TIMEOUT; lms->db_path = strdup(db_path); if (!lms->db_path) { perror("strdup"); + lms_charset_conv_free(lms->cs_conv); free(lms); return NULL; } @@ -1249,6 +1259,7 @@ lms_free(lms_t *lms) } free(lms->db_path); + lms_charset_conv_free(lms->cs_conv); free(lms); return 0; } @@ -1491,3 +1502,25 @@ lms_set_commit_interval(lms_t *lms, unsigned int transactions) lms->commit_interval = transactions; } + +int +lms_charset_add(lms_t *lms, const char *charset) +{ + if (!lms) { + fprintf(stderr, "ERROR: lms_charset_add(NULL)\n"); + return -1; + } + + return lms_charset_conv_add(lms->cs_conv, charset); +} + +int +lms_charset_del(lms_t *lms, const char *charset) +{ + if (!lms) { + fprintf(stderr, "ERROR: lms_charset_del(NULL)\n"); + return -1; + } + + return lms_charset_conv_del(lms->cs_conv, charset); +} diff --git a/src/lib/lightmediascanner.h b/src/lib/lightmediascanner.h index a4cb28e..203d8ef 100644 --- a/src/lib/lightmediascanner.h +++ b/src/lib/lightmediascanner.h @@ -83,6 +83,9 @@ extern "C" { API lms_plugin_t *lms_parser_find_and_add(lms_t *lms, const char *name) GNUC_NON_NULL(1, 2); API int lms_parser_del(lms_t *lms, lms_plugin_t *handle) GNUC_NON_NULL(1, 2); + API int lms_charset_add(lms_t *lms, const char *charset) GNUC_NON_NULL(1, 2); + API int lms_charset_del(lms_t *lms, const char *charset) GNUC_NON_NULL(1, 2); + #ifdef __cplusplus } #endif diff --git a/src/lib/lightmediascanner_charset_conv.c b/src/lib/lightmediascanner_charset_conv.c new file mode 100644 index 0000000..e3d7ca6 --- /dev/null +++ b/src/lib/lightmediascanner_charset_conv.c @@ -0,0 +1,291 @@ +#include "lightmediascanner_charset_conv.h" +#include +#include +#include +#include +#include + +struct lms_charset_conv { + iconv_t check; + iconv_t fallback; + unsigned int size; + iconv_t *convs; + char **names; +}; + +lms_charset_conv_t * +lms_charset_conv_new(void) +{ + lms_charset_conv_t *lcc; + + lcc = malloc(sizeof(*lcc)); + if (!lcc) { + perror("malloc"); + return NULL; + } + + lcc->check = iconv_open("UTF-8", "UTF-8"); + if (lcc->check == (iconv_t)-1) { + perror("ERROR: could not create conversion checker"); + goto error_check; + } + + lcc->fallback = iconv_open("UTF-8//IGNORE", "UTF-8"); + if (lcc->fallback == (iconv_t)-1) { + perror("ERROR: could not create conversion fallback"); + goto error_fallback; + } + + lcc->size = 0; + lcc->convs = NULL; + lcc->names = NULL; + return lcc; + + error_fallback: + iconv_close(lcc->check); + error_check: + free(lcc); + + return NULL; +} + +void +lms_charset_conv_free(lms_charset_conv_t *lcc) +{ + int i; + + if (!lcc) + return; + + iconv_close(lcc->check); + iconv_close(lcc->fallback); + + for (i = 0; i < lcc->size; i++) { + iconv_close(lcc->convs[i]); + free(lcc->names[i]); + } + + if (lcc->convs) + free(lcc->convs); + if (lcc->names) + free(lcc->names); + free(lcc); +} + +int +lms_charset_conv_add(lms_charset_conv_t *lcc, const char *charset) +{ + iconv_t cd, *convs; + char **names; + int idx, ns; + + if (!lcc) + return -1; + + if (!charset) + return -2; + + cd = iconv_open("UTF-8", charset); + if (cd == (iconv_t)-1) { + fprintf(stderr, "ERROR: could not add conversion charset '%s': %s\n", + charset, strerror(errno)); + return -3; + } + + idx = lcc->size; + ns = lcc->size + 1; + + convs = realloc(lcc->convs, ns * sizeof(*convs)); + if (!convs) + goto realloc_error; + lcc->convs = convs; + lcc->convs[idx] = cd; + + names = realloc(lcc->names, ns * sizeof(*names)); + if (!names) + goto realloc_error; + lcc->names = names; + lcc->names[idx] = strdup(charset); + if (!lcc->names[idx]) + goto realloc_error; + + lcc->size = ns; + return 0; + + realloc_error: + perror("realloc"); + iconv_close(cd); + return -4; +} + +static int +_find(const lms_charset_conv_t *lcc, const char *charset) +{ + int i; + + for (i = 0; i < lcc->size; i++) + if (strcmp(lcc->names[i], charset) == 0) + return i; + + return -1; +} + +int +lms_charset_conv_del(lms_charset_conv_t *lcc, const char *charset) +{ + iconv_t *convs; + char **names; + int idx; + + if (!lcc) + return -1; + + if (!charset) + return -2; + + idx = _find(lcc, charset); + if (idx < 0) { + fprintf(stderr, "ERROR: could not find charset '%s'\n", charset); + return -3; + } + + iconv_close(lcc->convs[idx]); + free(lcc->names[idx]); + + lcc->size--; + for (; idx < lcc->size; idx++) { + lcc->convs[idx] = lcc->convs[idx + 1]; + lcc->names[idx] = lcc->names[idx + 1]; + } + + convs = realloc(lcc->convs, lcc->size * sizeof(*convs)); + if (convs) + lcc->convs = convs; + else + perror("could not realloc 'convs'"); + + names = realloc(lcc->names, lcc->size * sizeof(*names)); + if (names) + lcc->names = names; + else + perror("could not realloc 'names'"); + + return 0; +} + +static int +_check(lms_charset_conv_t *lcc, const char *istr, unsigned int ilen, char *ostr, unsigned int olen) +{ + char *inbuf, *outbuf; + int inlen, outlen; + size_t r; + + inbuf = (char *)istr; + inlen = ilen; + outbuf = ostr; + outlen = olen; + + iconv(lcc->check, NULL, NULL, NULL, NULL); + r = iconv(lcc->check, &inbuf, &inlen, &outbuf, &outlen); + if (r == (size_t)-1) + return -1; + else + return 0; +} + +static int +_conv(iconv_t cd, char **p_str, unsigned int *p_len, char *ostr, unsigned int olen) +{ + char *inbuf, *outbuf; + int inlen, outlen; + size_t r; + + inbuf = *p_str; + inlen = *p_len; + outbuf = ostr; + outlen = olen; + + iconv(cd, NULL, NULL, NULL, NULL); + r = iconv(cd, &inbuf, &inlen, &outbuf, &outlen); + if (r == (size_t)-1) + return -1; + + *p_len = olen - outlen; + free(*p_str); + *p_str = ostr; + + outbuf = realloc(*p_str, *p_len + 1); + if (!outbuf) + perror("realloc"); + else + *p_str = outbuf; + + (*p_str)[*p_len] = '\0'; + + return 0; +} + +int +lms_charset_conv(lms_charset_conv_t *lcc, char **p_str, unsigned int *p_len) +{ + char *outstr; + int i, outlen; + + if (!lcc) + return -1; + if (!p_str) + return -2; + if (!p_len) + return -3; + if (!*p_str || !*p_len) + return 0; + + outlen = 2 * *p_len; + outstr = malloc(outlen + 1); + if (!outstr) { + perror("malloc"); + return -4; + } + + if (_check(lcc, *p_str, *p_len, outstr, outlen) == 0) { + free(outstr); + return 0; + } + + for (i = 0; i < lcc->size; i++) + if (_conv(lcc->convs[i], p_str, p_len, outstr, outlen) == 0) + return 0; + + fprintf(stderr, + "WARNING: could not convert '%*s' to any charset, use fallback\n", + *p_len, *p_str); + i = _conv(lcc->fallback, p_str, p_len, outstr, outlen); + if (i < 0) { + memset(*p_str, '?', *p_len); + free(outstr); + } + return i; +} + +int +lms_charset_conv_check(lms_charset_conv_t *lcc, const char *str, unsigned int len) +{ + char *outstr; + int r, outlen; + + if (!lcc) + return -1; + if (!str || !len) + return 0; + + outlen = 2 * len; + outstr = malloc(outlen); + if (!outstr) { + perror("malloc"); + return -2; + } + + r = _check(lcc, str, len, outstr, outlen); + free(outstr); + return r; +} diff --git a/src/lib/lightmediascanner_charset_conv.h b/src/lib/lightmediascanner_charset_conv.h new file mode 100644 index 0000000..14a8fed --- /dev/null +++ b/src/lib/lightmediascanner_charset_conv.h @@ -0,0 +1,79 @@ +/** + * Copyright (C) 2007 by INdT + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * @author Gustavo Sverzut Barbieri + */ + +#ifndef _LIGHTMEDIASCANNER_CHARSET_CONV_H_ +#define _LIGHTMEDIASCANNER_CHARSET_CONV_H_ 1 + +#ifdef GNUC_MALLOC +#undef GNUC_MALLOC +#endif +#ifdef GNUC_WARN_UNUSED_RESULT +#undef GNUC_WARN_UNUSED_RESULT +#endif +#ifdef GNUC_NON_NULL +#undef GNUC_NON_NULL +#endif +#ifdef API +#undef API +#endif + +#ifdef __GNUC__ +# if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96) +# define GNUC_MALLOC __attribute__((__malloc__)) +# else +# define GNUC_MALLOC +# endif +# if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) +# define GNUC_WARN_UNUSED_RESULT __attribute__((warn_unused_result)) +# define GNUC_NON_NULL(...) __attribute__((nonnull(__VA_ARGS__))) +# else +# define GNUC_WARN_UNUSED_RESULT +# define GNUC_NON_NULL(...) +# endif +# if __GNUC__ >= 4 +# define API __attribute__ ((visibility("default"))) +# else +# define API +# endif +#else +# define GNUC_MALLOC +# define GNUC_WARN_UNUSED_RESULT +# define GNUC_NON_NULL(...) +# define API +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + typedef struct lms_charset_conv lms_charset_conv_t; + + API lms_charset_conv_t *lms_charset_conv_new(void) GNUC_MALLOC GNUC_WARN_UNUSED_RESULT; + API void lms_charset_conv_free(lms_charset_conv_t *lcc) GNUC_NON_NULL(1); + API int lms_charset_conv_add(lms_charset_conv_t *lcc, const char *charset) GNUC_NON_NULL(1, 2); + API int lms_charset_conv_del(lms_charset_conv_t *lcc, const char *charset) GNUC_NON_NULL(1, 2); + + API int lms_charset_conv(lms_charset_conv_t *lcc, char **p_str, unsigned int *p_len) GNUC_NON_NULL(1, 2, 3); + API int lms_charset_conv_check(lms_charset_conv_t *lcc, const char *str, unsigned int len) GNUC_NON_NULL(1, 2); + +#ifdef __cplusplus +} +#endif +#endif /* _LIGHTMEDIASCANNER_CHARSET_CONV_H_ */ diff --git a/src/lib/lightmediascanner_plugin.h b/src/lib/lightmediascanner_plugin.h index d9306d1..5c30667 100644 --- a/src/lib/lightmediascanner_plugin.h +++ b/src/lib/lightmediascanner_plugin.h @@ -22,6 +22,7 @@ #define _LIGHTMEDIASCANNER_PLUGIN_H_ 1 #include +#include #include #include @@ -41,6 +42,7 @@ extern "C" { struct lms_context { sqlite3 *db; + lms_charset_conv_t *cs_conv; }; typedef void *(*lms_plugin_match_fn_t)(lms_plugin_t *p, const char *path, int len, int base); diff --git a/src/plugins/id3lib/id3lib.cpp b/src/plugins/id3lib/id3lib.cpp index 7bb2f5f..5916ff3 100644 --- a/src/plugins/id3lib/id3lib.cpp +++ b/src/plugins/id3lib/id3lib.cpp @@ -220,6 +220,15 @@ _parse(struct plugin *plugin, struct lms_context *ctxt, const struct lms_file_in info.title.str[info.title.len] = '\0'; } + if (info.title.str) + lms_charset_conv(ctxt->cs_conv, &info.title.str, &info.title.len); + if (info.artist.str) + lms_charset_conv(ctxt->cs_conv, &info.artist.str, &info.artist.len); + if (info.album.str) + lms_charset_conv(ctxt->cs_conv, &info.album.str, &info.album.len); + if (info.genre.str) + lms_charset_conv(ctxt->cs_conv, &info.genre.str, &info.genre.len); + info.id = finfo->id; r = lms_db_audio_add(plugin->audio_db, &info); diff --git a/src/plugins/jpeg/jpeg.c b/src/plugins/jpeg/jpeg.c index 054dc66..f32e397 100644 --- a/src/plugins/jpeg/jpeg.c +++ b/src/plugins/jpeg/jpeg.c @@ -644,6 +644,11 @@ _parse(struct plugin *plugin, struct lms_context *ctxt, const struct lms_file_in info.title.str[info.title.len] = '\0'; } + if (info.title.str) + lms_charset_conv(ctxt->cs_conv, &info.title.str, &info.title.len); + if (info.artist.str) + lms_charset_conv(ctxt->cs_conv, &info.artist.str, &info.artist.len); + info.id = finfo->id; r = lms_db_image_add(plugin->img_db, &info); diff --git a/src/plugins/m3u/m3u.c b/src/plugins/m3u/m3u.c index f3be26f..d5c907e 100644 --- a/src/plugins/m3u/m3u.c +++ b/src/plugins/m3u/m3u.c @@ -129,6 +129,7 @@ _parse(struct plugin *plugin, struct lms_context *ctxt, const struct lms_file_in info.title.str = malloc((info.title.len + 1) * sizeof(char)); memcpy(info.title.str, finfo->path + finfo->base, info.title.len); info.title.str[info.title.len] = '\0'; + lms_charset_conv(ctxt->cs_conv, &info.title.str, &info.title.len); info.id = finfo->id; r = lms_db_playlist_add(plugin->playlist_db, &info); diff --git a/src/plugins/pls/pls.c b/src/plugins/pls/pls.c index 8e96ad0..08533ab 100644 --- a/src/plugins/pls/pls.c +++ b/src/plugins/pls/pls.c @@ -311,6 +311,7 @@ _parse(struct plugin *plugin, struct lms_context *ctxt, const struct lms_file_in info.title.str = malloc((info.title.len + 1) * sizeof(char)); memcpy(info.title.str, finfo->path + finfo->base, info.title.len); info.title.str[info.title.len] = '\0'; + lms_charset_conv(ctxt->cs_conv, &info.title.str, &info.title.len); info.id = finfo->id; r = lms_db_playlist_add(plugin->playlist_db, &info); diff --git a/src/plugins/video-dummy/video-dummy.c b/src/plugins/video-dummy/video-dummy.c index 40533e9..af292c4 100644 --- a/src/plugins/video-dummy/video-dummy.c +++ b/src/plugins/video-dummy/video-dummy.c @@ -76,6 +76,7 @@ _parse(struct plugin *plugin, struct lms_context *ctxt, const struct lms_file_in info.title.str = malloc((info.title.len + 1) * sizeof(char)); memcpy(info.title.str, finfo->path + finfo->base, info.title.len); info.title.str[info.title.len] = '\0'; + lms_charset_conv(ctxt->cs_conv, &info.title.str, &info.title.len); info.id = finfo->id; r = lms_db_video_add(plugin->video_db, &info); -- 2.7.4