From 0cd95b5c8ed6a5295c22e5f8bd58893d55ff4867 Mon Sep 17 00:00:00 2001
From: Tomas Mlcoch
Date: Sun, 26 May 2013 00:01:08 +0200
Subject: [PATCH] xml_parser: Draft of new xml parsing module

---
 src/CMakeLists.txt                |   9 +-
 src/xml_parser.c                  |  84 +++++++++
 src/xml_parser.h                  |  86 ++++++++++
 src/xml_parser_filelists.c        | 319 ++++++++++++++++++++++++++++++++++++++
 src/xml_parser_internal.h         | 113 ++++++++++++++
 src/xml_parser_other.c            |   0
 src/xml_parser_primary.c          |   0
 tests/CMakeLists.txt              |   4 +
 tests/test_xml_parser_filelists.c | 100 ++++++++++++
 9 files changed, 713 insertions(+), 2 deletions(-)
 create mode 100644 src/xml_parser.c
 create mode 100644 src/xml_parser.h
 create mode 100644 src/xml_parser_filelists.c
 create mode 100644 src/xml_parser_internal.h
 create mode 100644 src/xml_parser_other.c
 create mode 100644 src/xml_parser_primary.c
 create mode 100644 tests/test_xml_parser_filelists.c

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cf0d601..a06fbd6 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -13,7 +13,11 @@ SET (createrepo_c_SRCS
     xml_dump_filelists.c
     xml_dump_other.c
     xml_dump_primary.c
-    xml_file.c)
+    xml_file.c
+    xml_parser.c
+    xml_parser_filelists.c
+    xml_parser_other.c
+    xml_parser_primary.c)
 
 SET(headers
     compression_wrapper.h
@@ -30,7 +34,8 @@ SET(headers
     sqlite.h
     version.h
     xml_dump.h
-    xml_file.h)
+    xml_file.h
+    xml_parser.h)
 
 ADD_LIBRARY(libcreaterepo_c SHARED ${createrepo_c_SRCS})
 TARGET_LINK_LIBRARIES(libcreaterepo_c ${BZIP2_LIBRARIES})
diff --git a/src/xml_parser.c b/src/xml_parser.c
new file mode 100644
index 0000000..9cdccf1
--- /dev/null
+++ b/src/xml_parser.c
@@ -0,0 +1,84 @@
+#include <assert.h>
+#include <string.h>
+#include <glib.h>
+#include "error.h"
+#include "xml_parser.h"
+#include "xml_parser_internal.h"
+#include "misc.h"
+
+/* Allocate parser data with an empty content buffer and message string. */
+cr_ParserData *
+cr_xml_parser_data(void)
+{
+    cr_ParserData *pd = g_new0(cr_ParserData, 1);
+    pd->ret = CRE_OK;
+    pd->content = g_malloc(CONTENT_REALLOC_STEP);
+    pd->content[0] = '\0';
+    pd->acontent = CONTENT_REALLOC_STEP;
+    pd->msgs = g_string_new(NULL);
+
+    return pd;
+}
+
+/* Free parser data created by cr_xml_parser_data(); NULL is ignored. */
+void
+cr_xml_parser_data_free(cr_ParserData *pd)
+{
+    if (!pd)
+        return;
+
+    g_free(pd->content);
+    g_string_free(pd->msgs, TRUE);
+    g_free(pd);
+}
+
+/* Expat character data handler: appends the chunk to pd->content and
+ * keeps the buffer NUL-terminated. */
+void XMLCALL
+cr_char_handler(void *pdata, const XML_Char *s, int len)
+{
+    int needed;
+    cr_ParserData *pd = pdata;
+
+    if (pd->ret != CRE_OK)
+        return; /* There was an error -> do nothing */
+
+    if (!pd->docontent)
+        return; /* Do not store the content */
+
+    /* The buffer comes from g_malloc() and is released with g_free(), so
+     * it is resized with g_realloc(), which aborts on OOM instead of
+     * returning NULL. Room is kept for the terminating '\0'. */
+    needed = pd->lcontent + len + 1;
+    if (needed > pd->acontent) {
+        pd->acontent = needed + CONTENT_REALLOC_STEP;
+        pd->content = g_realloc(pd->content, pd->acontent);
+    }
+
+    memcpy(pd->content + pd->lcontent, s, (size_t) len);
+    pd->lcontent += len;
+    pd->content[pd->lcontent] = '\0';
+}
+
+/* Default "new package" callback: sets *pkg to the result of
+ * cr_package_new() and ignores the identification arguments. */
+int
+cr_newpkgcb(cr_Package **pkg,
+            const char *pkgId,
+            const char *name,
+            const char *arch,
+            void *cbdata,
+            GError **err)
+{
+    CR_UNUSED(pkgId);
+    CR_UNUSED(name);
+    CR_UNUSED(arch);
+    CR_UNUSED(cbdata);
+
+    assert(pkg && *pkg == NULL);
+    assert(!err || *err == NULL);
+
+    *pkg = cr_package_new();
+
+    return CRE_OK;
+}
diff --git a/src/xml_parser.h b/src/xml_parser.h
new file mode 100644
index 0000000..1ecec4e
--- /dev/null
+++ b/src/xml_parser.h
@@ -0,0 +1,86 @@
+/* createrepo_c - Library of routines for manipulation with repodata
+ * Copyright (C) 2013 Tomas Mlcoch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+ * USA.
+ */
+
+#ifndef __C_CREATEREPOLIB_XML_PARSER_H__
+#define __C_CREATEREPOLIB_XML_PARSER_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "package.h"
+
+/** \defgroup xml_parser XML parser API.
+ * \addtogroup xml_parser
+ * @{
+ */
+
+/** Callback for XML parser which is called when a package element is parsed.
+ * @param pkg Currently parsed package.
+ * @param cbdata User data.
+ * @param err GError **
+ * @return 0 - OK, non-zero - ERROR (stops the parsing)
+ */
+typedef int (*cr_XmlParserPkgCb)(cr_Package *pkg,
+                                 void *cbdata,
+                                 GError **err);
+
+/** Callback for XML parser which is called when a new package object parsing
+ * is started. This function has to set *pkg to package object which will
+ * be populated by parser. The object could be empty, or already partially
+ * filled (by other XML parsers) package object.
+ * If the pointer is set to NULL, current package will be skipped.
+ * Note: For the primary.xml file pkgId, name and arch are NULL!
+ * @param pkg Package that will be populated.
+ * @param pkgId pkgId (hash) of the new package.
+ * @param name Name of the new package.
+ * @param arch Arch of the new package.
+ * @param cbdata User data.
+ * @param err GError **
+ * @return 0 - OK, non-zero - ERR (stops the parsing)
+ */
+typedef int (*cr_XmlParserNewPkgCb)(cr_Package **pkg,
+                                    const char *pkgId,
+                                    const char *name,
+                                    const char *arch,
+                                    void *cbdata,
+                                    GError **err);
+
+int cr_newpkgcb(cr_Package **pkg,
+                const char *pkgId,
+                const char *name,
+                const char *arch,
+                void *cbdata,
+                GError **err);
+
+int cr_xml_parse_filelists(const char *path,
+                           cr_XmlParserNewPkgCb newpkgcb,
+                           void *newpkgcb_data,
+                           cr_XmlParserPkgCb pkgcb,
+                           void *pkgcb_data,
+                           GError **err);
+
+
+/** @} */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __C_CREATEREPOLIB_XML_PARSER_H__ */
diff --git a/src/xml_parser_filelists.c b/src/xml_parser_filelists.c
new file mode 100644
index 0000000..5d3d382
--- /dev/null
+++ b/src/xml_parser_filelists.c
@@ -0,0 +1,343 @@
+/* createrepo_c - Library of routines for manipulation with repodata
+ * Copyright (C) 2013 Tomas Mlcoch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+ * USA.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <glib.h>
+#include <expat.h>
+#include "xml_parser_internal.h"
+#include "xml_parser.h"
+#include "error.h"
+#include "package.h"
+#include "logging.h"
+#include "misc.h"
+
+typedef enum {
+    STATE_START,
+    STATE_FILELISTS,
+    STATE_PACKAGE,
+    STATE_VERSION,
+    STATE_FILE,
+    NUMSTATES,
+} cr_FilState;
+
+/* NOTE: Same states in the first column must be together!!!
+ * Performance tip: More frequent elements should be listed
+ * first in its group (eg: element "package" (STATE_PACKAGE)
+ * has a "file" element listed first, because it is more frequent
+ * than a "version" element). */
+static cr_StatesSwitch stateswitches[] = {
+    { STATE_START,      "filelists",    STATE_FILELISTS,    0 },
+    { STATE_FILELISTS,  "package",      STATE_PACKAGE,      0 },
+    { STATE_PACKAGE,    "file",         STATE_FILE,         1 },
+    { STATE_PACKAGE,    "version",      STATE_VERSION,      0 },
+    { NUMSTATES,        NULL,           NUMSTATES,          0 },
+};
+
+/* Expat start-element handler: follows the stateswitches table to enter
+ * the state of a known sub-element. Unknown elements only increase
+ * pd->depth, so they and everything nested in them are ignored. */
+static void XMLCALL
+cr_start_handler(void *pdata, const char *element, const char **attr)
+{
+    GError *tmp_err = NULL;
+    cr_ParserData *pd = pdata;
+    cr_StatesSwitch *sw;
+
+    if (pd->ret != CRE_OK)
+        return; // There was an error -> do nothing
+
+    if (pd->depth != pd->statedepth) {
+        // There probably was an unknown element
+        pd->depth++;
+        return;
+    }
+    pd->depth++;
+
+    if (!pd->swtab[pd->state])
+        return; // Current element should not have any sub elements
+
+    /* TODO TEST THIS */
+    if (!pd->pkg && pd->state != STATE_FILELISTS && pd->state != STATE_START)
+        return; // Do not parse current package tag and its content
+
+    // Find current state by its name
+    for (sw = pd->swtab[pd->state]; sw->from == pd->state; sw++)
+        if (!strcmp(element, sw->ename))
+            break;
+    if (sw->from != pd->state)
+        return; // There is no state for the name -> skip
+
+    // Update parser data
+    pd->state = sw->to;
+    pd->docontent = sw->docontent;
+    pd->statedepth = pd->depth;
+    pd->lcontent = 0;
+    pd->content[0] = '\0';
+
+    switch(pd->state) {
+    case STATE_START:
+    case STATE_FILELISTS:
+        break;
+
+    case STATE_PACKAGE: {
+        /* TODO: Parse all attrs in single loop instead of use cr_find_attr */
+        const char *pkgId = cr_find_attr("pkgid", attr);
+        const char *name = cr_find_attr("name", attr);
+        const char *arch = cr_find_attr("arch", attr);
+
+        if (!pkgId) {
+            pd->ret = CRE_BADXMLFILELISTS;
+            g_set_error(pd->err, CR_XML_PARSER_FIL_ERROR, CRE_BADXMLFILELISTS,
+                        "Package pkgid attributte is missing!");
+            break;
+        }
+
+        if (pd->newpkgcb(&pd->pkg,
+                         pkgId,
+                         name,
+                         arch,
+                         pd->newpkgcb_data,
+                         &tmp_err))
+        {
+            pd->ret = CRE_CBINTERRUPTED;
+            if (tmp_err)
+                g_propagate_prefixed_error(pd->err,
+                                           tmp_err,
+                                           "Parsing interrupted:");
+            else
+                g_set_error(pd->err, CR_XML_PARSER_FIL_ERROR, CRE_CBINTERRUPTED,
+                            "Parsing interrupted");
+        }
+
+        /* TODO: Insert name and pkg id to the package */
+        break;
+    }
+
+    case STATE_VERSION:
+        /* TODO: Parse version */
+        break;
+
+    case STATE_FILE: {
+        const char *type = cr_find_attr("type", attr);
+        pd->last_file_type = FILE_FILE;
+        if (type) {
+            if (!strcmp(type, "dir"))
+                pd->last_file_type = FILE_DIR;
+            else if (!strcmp(type, "ghost"))
+                pd->last_file_type = FILE_GHOST;
+            else
+                g_string_append_printf(pd->msgs,
+                                       "Unknown file type \"%s\";",
+                                       type);
+        }
+        break;
+    }
+
+    default:
+        break;
+    }
+}
+
+/* Expat end-element handler: returns to the parent state (pd->sbtab) and
+ * finishes the element being closed: a file is added to the current
+ * package, a completed package is passed to pd->pkgcb. */
+static void XMLCALL
+cr_end_handler(void *pdata, const char *element)
+{
+    cr_ParserData *pd = pdata;
+    GError *tmp_err = NULL;
+    unsigned int state = pd->state;
+
+    CR_UNUSED(element);
+
+    if (pd->ret != CRE_OK)
+        return; /* There was an error -> do nothing */
+
+    if (pd->depth != pd->statedepth) {
+        /* Back from the unknown state */
+        pd->depth--;
+        return;
+    }
+
+    pd->depth--;
+    pd->statedepth--;
+    pd->state = pd->sbtab[pd->state];
+    pd->docontent = 0;
+
+    switch (state) {
+    case STATE_START:
+    case STATE_FILELISTS:
+    case STATE_VERSION:
+        break;
+
+    case STATE_PACKAGE:
+        if (!pd->pkg)
+            return;
+
+        if (pd->pkgcb(pd->pkg, pd->pkgcb_data, &tmp_err)) {
+            pd->ret = CRE_CBINTERRUPTED;
+            if (tmp_err)
+                g_propagate_prefixed_error(pd->err,
+                                           tmp_err,
+                                           "Parsing interrupted:");
+            else
+                g_set_error(pd->err, CR_XML_PARSER_FIL_ERROR, CRE_CBINTERRUPTED,
+                            "Parsing interrupted");
+        }
+        pd->pkg = NULL;
+        break;
+
+    case STATE_FILE: {
+        if (!pd->pkg || !pd->content)
+            break;
+
+        cr_PackageFile *pkg_file = cr_package_file_new();
+        pkg_file->name = cr_safe_string_chunk_insert(pd->pkg->chunk,
+                                                cr_get_filename(pd->content));
+        pkg_file->path = cr_safe_string_chunk_insert(pd->pkg->chunk,
+                                                pd->content);
+        switch (pd->last_file_type) {
+            case FILE_FILE:  pkg_file->type = NULL;    break; // NULL => "file"
+            case FILE_DIR:   pkg_file->type = "dir";   break;
+            case FILE_GHOST: pkg_file->type = "ghost"; break;
+        }
+
+        pd->pkg->files = g_slist_prepend(pd->pkg->files, pkg_file);
+        break;
+    }
+
+    default:
+        break;
+    }
+}
+
+/* Parse the (possibly compressed) filelists xml file at path.
+ * newpkgcb may be NULL (cr_newpkgcb is used then), pkgcb is mandatory.
+ * Returns CRE_OK on success; otherwise an error code is returned and
+ * err is set. */
+int
+cr_xml_parse_filelists(const char *path,
+                       cr_XmlParserNewPkgCb newpkgcb,
+                       void *newpkgcb_data,
+                       cr_XmlParserPkgCb pkgcb,
+                       void *pkgcb_data,
+                       GError **err)
+{
+    int ret = CRE_OK;
+    CR_FILE *f;
+    cr_ParserData *pd;
+    XML_Parser parser;
+
+    assert(path);
+    assert(pkgcb);
+    assert(!err || *err == NULL);
+
+    if (!newpkgcb)
+        newpkgcb = cr_newpkgcb;
+
+    f = cr_open(path, CR_CW_MODE_READ, CR_CW_AUTO_DETECT_COMPRESSION);
+    if (!f) {
+        g_set_error(err, CR_XML_PARSER_FIL_ERROR, CRE_IO, "Cannot open %s", path);
+        return CRE_IO;
+    }
+
+    parser = XML_ParserCreate(NULL);
+    if (!parser) {
+        g_set_error(err, CR_XML_PARSER_FIL_ERROR, CRE_MEMORY,
+                    "Out of memory: Cannot create xml parser");
+        cr_close(f);
+        return CRE_MEMORY;
+    }
+    XML_SetElementHandler(parser, cr_start_handler, cr_end_handler);
+    XML_SetCharacterDataHandler(parser, cr_char_handler);
+
+    pd = cr_xml_parser_data();
+    pd->parser = &parser;
+    pd->state = STATE_START;
+    pd->newpkgcb_data = newpkgcb_data;
+    pd->newpkgcb = newpkgcb;
+    pd->pkgcb_data = pkgcb_data;
+    pd->pkgcb = pkgcb;
+    pd->err = err; /* Errors raised inside the handlers go to the caller */
+    pd->swtab = g_new0(cr_StatesSwitch *, NUMSTATES);
+    pd->sbtab = g_new0(unsigned int, NUMSTATES);
+    for (cr_StatesSwitch *sw = stateswitches; sw->from != NUMSTATES; sw++) {
+        if (!pd->swtab[sw->from])
+            pd->swtab[sw->from] = sw;
+        pd->sbtab[sw->to] = sw->from;
+    }
+
+    XML_SetUserData(parser, pd);
+
+    while (1) {
+        int len;
+        int parsed;
+        void *buf = XML_GetBuffer(parser, XML_BUFFER_SIZE);
+        if (!buf) {
+            ret = CRE_MEMORY;
+            g_set_error(err, CR_XML_PARSER_FIL_ERROR, CRE_MEMORY,
+                        "Out of memory: Cannot allocate buffer for xml parser");
+            break;
+        }
+
+        len = cr_read(f, buf, XML_BUFFER_SIZE);
+        if (len < 0) {
+            ret = CRE_IO;
+            g_critical("%s: Cannot read for parsing : %s\n",
+                       __func__, strerror(errno));
+            g_set_error(err, CR_XML_PARSER_FIL_ERROR, CRE_IO,
+                        "Error while reading xml");
+            break;
+        }
+
+        parsed = XML_ParseBuffer(parser, len, len == 0);
+
+        /* A handler failure comes first: it has already set err */
+        if (pd->ret != CRE_OK) {
+            ret = pd->ret;
+            break;
+        }
+
+        if (!parsed) {
+            ret = CRE_XMLPARSER;
+            g_critical("%s: parsing error: %s\n",
+                       __func__, XML_ErrorString(XML_GetErrorCode(parser)));
+            g_set_error(err, CR_XML_PARSER_FIL_ERROR, CRE_XMLPARSER,
+                        "Parse error at line: %d (%s)",
+                        (int) XML_GetCurrentLineNumber(parser),
+                        (char *) XML_ErrorString(XML_GetErrorCode(parser)));
+            break;
+        }
+
+        if (len == 0)
+            break;
+    }
+
+    g_free(pd->swtab);
+    g_free(pd->sbtab);
+    cr_xml_parser_data_free(pd);
+    XML_ParserFree(parser);
+    cr_close(f);
+
+    return ret;
+}
diff --git a/src/xml_parser_internal.h b/src/xml_parser_internal.h
new file mode 100644
index 0000000..39fa360
--- /dev/null
+++ b/src/xml_parser_internal.h
@@ -0,0 +1,113 @@
+/* createrepo_c - Library of routines for manipulation with repodata
+ * Copyright (C) 2013 Tomas Mlcoch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+ * USA.
+ */
+
+#ifndef __C_CREATEREPOLIB_XML_PARSER_INTERNAL_H__
+#define __C_CREATEREPOLIB_XML_PARSER_INTERNAL_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <string.h>
+#include <glib.h>
+#include <expat.h>
+#include "xml_parser.h"
+#include "error.h"
+#include "package.h"
+
+#define XML_BUFFER_SIZE         8192
+#define CONTENT_REALLOC_STEP    256
+
+typedef enum {
+    FILE_FILE,
+    FILE_DIR,
+    FILE_GHOST,
+    FILE_SENTINEL,
+} cr_FileType;
+
+typedef struct {
+    unsigned int from;  /*!< State (current tag) */
+    char *ename;        /*!< String name of sub-tag */
+    unsigned int to;    /*!< State of sub-tag */
+    int docontent;      /*!< Read text content of element? */
+} cr_StatesSwitch;
+
+typedef struct _cr_ParserData {
+    int          ret;        /*!< status of parsing (return code) */
+    int          depth;      /*!< nesting depth of the processed element */
+    int          statedepth; /*!< depth at which the current state was set */
+    unsigned int state;      /*!< current state */
+
+    /* Tag content related values */
+
+    int docontent;  /*!< Store text content of the current element? */
+    char *content;  /*!< Text content of the element */
+    int lcontent;   /*!< The content length */
+    int acontent;   /*!< Available bytes in the content */
+
+    XML_Parser *parser;         /*!< The parser */
+    cr_StatesSwitch **swtab;    /*!< Pointers to stateswitches table */
+    unsigned int *sbtab;        /*!< sbtab[to_state] = from_state */
+
+    /* Package stuff */
+
+    void *newpkgcb_data;        /*!< User data for the newpkgcb. */
+    cr_XmlParserNewPkgCb newpkgcb;      /*!<
+        Callback called to get (create new or use existing from a previous
+        parsing of other or primary xml file) pkg object for the currently
+        loaded pkg. */
+    void *pkgcb_data;           /*!<
+        User data for the pkgcb. */
+    cr_XmlParserPkgCb pkgcb;    /*!<
+        Callback called when a single pkg data are completely parsed. */
+    cr_Package *pkg;            /*!<
+        The package which is currently loaded. */
+    GString *msgs;              /*!<
+        Messages from xml parser (warnings about unknown elements etc.) */
+    GError **err;               /*!<
+        Error message */
+
+    /* Filelists related stuff */
+
+    int last_file_type; /*!< cr_FileType of the file element being parsed */
+} cr_ParserData;
+
+cr_ParserData *cr_xml_parser_data(void);
+
+void cr_xml_parser_data_free(cr_ParserData *pd);
+
+static inline const char *
+cr_find_attr(const char *name, const char **attr)
+{
+    while (*attr) {
+        if (!strcmp(name, *attr))
+            return attr[1];
+        attr += 2;
+    }
+
+    return NULL;
+}
+
+void XMLCALL cr_char_handler(void *pdata, const XML_Char *s, int len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __C_CREATEREPOLIB_XML_PARSER_INTERNAL_H__ */
diff --git a/src/xml_parser_other.c b/src/xml_parser_other.c
new file mode 100644
index 0000000..e69de29
diff --git a/src/xml_parser_primary.c b/src/xml_parser_primary.c
new file mode 100644
index 0000000..e69de29
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 097086d..08136e5 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -18,6 +18,10 @@ ADD_EXECUTABLE(test_xml_file test_xml_file.c)
 TARGET_LINK_LIBRARIES(test_xml_file libcreaterepo_c ${GLIB2_LIBRARIES})
 ADD_DEPENDENCIES(tests test_xml_file)
 
+ADD_EXECUTABLE(test_xml_parser_filelists test_xml_parser_filelists.c)
+TARGET_LINK_LIBRARIES(test_xml_parser_filelists libcreaterepo_c ${GLIB2_LIBRARIES})
+ADD_DEPENDENCIES(tests test_xml_parser_filelists)
+
 CONFIGURE_FILE("run_gtester.sh.in" "run_gtester.sh")
 ADD_TEST(test_main run_gtester.sh)
 
diff --git a/tests/test_xml_parser_filelists.c b/tests/test_xml_parser_filelists.c
new file mode 100644
index 0000000..73d315f
--- /dev/null
+++ b/tests/test_xml_parser_filelists.c
@@ -0,0 +1,103 @@
+/* createrepo_c - Library of routines for manipulation with repodata
+ * Copyright (C) 2013 Tomas Mlcoch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+ * USA.
+ */
+
+#include <glib.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "fixtures.h"
+#include "createrepo/error.h"
+#include "createrepo/package.h"
+#include "createrepo/misc.h"
+#include "createrepo/xml_parser.h"
+
+/* Package callback: counts the parsed packages in *cbdata (when given). */
+static int
+pkgcb(cr_Package *pkg, void *cbdata, GError **err)
+{
+    CR_UNUSED(pkg);
+    g_assert(!err || *err == NULL);
+    if (cbdata)
+        *((int *)cbdata) += 1;
+    return CRE_OK;
+}
+
+static void test_cr_xml_parse_filelists_00(void)
+{
+    int ret;
+    GError *tmp_err = NULL;
+
+    ret = cr_xml_parse_filelists(TEST_REPO_00_FILELISTS,
+                                 NULL,
+                                 NULL,
+                                 pkgcb,
+                                 NULL,
+                                 &tmp_err);
+
+    g_assert(tmp_err == NULL);
+    g_assert_cmpint(ret, ==, CRE_OK);
+}
+
+static void test_cr_xml_parse_filelists_01(void)
+{
+    int ret;
+    int parsed = 0;
+    GError *tmp_err = NULL;
+
+    ret = cr_xml_parse_filelists(TEST_REPO_01_FILELISTS,
+                                 NULL,
+                                 NULL,
+                                 pkgcb,
+                                 &parsed,
+                                 &tmp_err);
+
+    g_assert(tmp_err == NULL);
+    g_assert_cmpint(ret, ==, CRE_OK);
+    g_assert_cmpint(parsed, ==, 1);
+}
+
+static void test_cr_xml_parse_filelists_02(void)
+{
+    int ret;
+    int parsed = 0;
+    GError *tmp_err = NULL;
+
+    ret = cr_xml_parse_filelists(TEST_REPO_02_FILELISTS,
+                                 NULL,
+                                 NULL,
+                                 pkgcb,
+                                 &parsed,
+                                 &tmp_err);
+
+    g_assert(tmp_err == NULL);
+    g_assert_cmpint(ret, ==, CRE_OK);
+    g_assert_cmpint(parsed, ==, 2);
+}
+
+int main(int argc, char *argv[])
+{
+    g_test_init(&argc, &argv, NULL);
+
+    g_test_add_func("/xml_parser_filelists/test_cr_xml_parse_filelists_00",
+                    test_cr_xml_parse_filelists_00);
+    g_test_add_func("/xml_parser_filelists/test_cr_xml_parse_filelists_01",
+                    test_cr_xml_parse_filelists_01);
+    g_test_add_func("/xml_parser_filelists/test_cr_xml_parse_filelists_02",
+                    test_cr_xml_parse_filelists_02);
+    return g_test_run();
+}
-- 
2.7.4