From: Seungbae Shin Date: Fri, 1 Apr 2022 07:14:23 +0000 (+0900) Subject: Refactoring ebook plugin X-Git-Tag: accepted/tizen/unified/20220603.141253~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=7f2c8044d67cbe36b27a894125fc1904b047122e;p=platform%2Fcore%2Fmultimedia%2Flibmedia-service.git Refactoring ebook plugin - Apply strategy pattern - Separate files for classes and interfaces Change-Id: Icd06ad64b41ba7f799fa445cb12e2a6af9c99fbf --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 924d079..9d74a25 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,6 +26,10 @@ SET(HASH_SRCS SET(EBOOKPLUGIN_SRCS plugin/media-ebook-plugin.cpp + plugin/media-ebook-plugin-regmatch.cpp + plugin/media-ebook-plugin-dbinserter.cpp + plugin/media-ebook-plugin-pdf.cpp + plugin/media-ebook-plugin-epub.cpp ) SET(CONTENTPLUGIN_SRCS plugin/media-content-plugin.c diff --git a/plugin/media-ebook-plugin-dbinserter.cpp b/plugin/media-ebook-plugin-dbinserter.cpp new file mode 100644 index 0000000..73a2d3a --- /dev/null +++ b/plugin/media-ebook-plugin-dbinserter.cpp @@ -0,0 +1,66 @@ +/* + * libmedia-service + * + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include "media-ebook-plugin-dbinserter.h" + +#ifdef LOG_TAG +#undef LOG_TAG +#endif + +#define LOG_TAG "MEDIA_SERVICE" + +#define INSERT_QUERY "INSERT INTO words(file_id, word) SELECT id, ? FROM files WHERE path=? ON CONFLICT (file_id, word) DO UPDATE SET frequency=frequency+1;" +#define TOKEN_KEY "\\s+" +#define SPECIAL_CHAR "[\\{\\}\\[\\]\\/?.,;:|\\)*~`!^\\-_+<>@\\#$%&\\\\=\\(\\\'\\\"]" + +bool DbInserter::run(const std::string& text) +{ + if (!dbHandle || filePath.empty() || text.empty()) + return false; + + auto sqlite_handle = static_cast(const_cast(dbHandle)); + + const std::regex sp(SPECIAL_CHAR); + std::string _text = std::regex_replace(text, sp, ""); + const std::regex re(TOKEN_KEY); + auto words_begin = std::sregex_token_iterator(_text.begin(), _text.end(), re, -1); + auto words_end = std::sregex_token_iterator(); + + bool isTransaction = (sqlite3_exec(sqlite_handle, "BEGIN;", NULL, NULL, NULL) == SQLITE_OK); + + sqlite3_stmt *stmt = NULL; + sqlite3_prepare_v2(sqlite_handle, INSERT_QUERY, -1, &stmt, NULL); + + for (auto i = words_begin; i != words_end; ++i) { + sqlite3_bind_text(stmt, 1, (*i).str().c_str(), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(stmt, 2, filePath.c_str(), -1, SQLITE_TRANSIENT); + sqlite3_step(stmt); + sqlite3_reset(stmt); + } + + sqlite3_finalize(stmt); + + if (isTransaction) + sqlite3_exec(sqlite_handle, "COMMIT;", NULL, NULL, NULL); + + return true; +} diff --git a/plugin/media-ebook-plugin-dbinserter.h b/plugin/media-ebook-plugin-dbinserter.h new file mode 100644 index 0000000..57e1f07 --- /dev/null +++ b/plugin/media-ebook-plugin-dbinserter.h @@ -0,0 +1,40 @@ +/* + * libmedia-service + * + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#ifndef __MEDIA_EBOOK_PLUGIN_DBINSERTER_H__ +#define __MEDIA_EBOOK_PLUGIN_DBINSERTER_H__ + +#include +#include "media-ebook-plugin-interface.h" + +class DbInserter : public IRunnable +{ +public: + DbInserter(std::string path, const void* handle) + : filePath(path), dbHandle(handle) { } + ~DbInserter() override = default; + + bool run(const std::string& text) override; + +private: + std::string filePath {}; + const void *dbHandle {}; +}; + +#endif diff --git a/plugin/media-ebook-plugin-epub.cpp b/plugin/media-ebook-plugin-epub.cpp new file mode 100644 index 0000000..59b969e --- /dev/null +++ b/plugin/media-ebook-plugin-epub.cpp @@ -0,0 +1,168 @@ +/* + * libmedia-service + * + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "media-ebook-plugin-epub.h" + +#include +#include +#include + +#ifdef LOG_TAG +#undef LOG_TAG +#endif + +#define LOG_TAG "MEDIA_SERVICE" + +using namespace EBook; + +Epub::Epub(std::string path, std::unique_ptr runner) + : runner(std::move(runner)) +{ + if (path.empty()) { + LOGE("invalid path"); + return; + } + + LOGD("%s", path.c_str()); + + int err = 0; + z = zip_open(path.c_str(), ZIP_RDONLY, &err); + if (err != 0) + LOGE("zip_open failed"); +} + +Epub::~Epub() +{ + if (!z) + return; + + zip_close(z); + z = nullptr; +} + +bool Epub::find() +{ + zip_stat_t sb {}; + + int entry_len = zip_get_num_entries(z, ZIP_FL_UNCHANGED); + for (int i = 0; i < entry_len; i++) { + if (!g_str_has_suffix(zip_get_name(z, i, ZIP_FL_ENC_GUESS), "html")) + continue; + + if (zip_stat_index(z, i, 0, &sb) != 0) + continue; + + zip_file_t *file = zip_fopen_index(z, i, 0); + if (!file) + continue; + + std::vector file_buf(sb.size); + + zip_int64_t readn = zip_fread(file, file_buf.data(), sb.size); + zip_fclose(file); + + if ((readn == static_cast(sb.size)) && + htmlFind(file_buf.data(), sb.size)) + return true; + } + + return false; +} + +bool Epub::htmlFind(const char* html_buf, int buf_size) +{ + htmlDocPtr doc = htmlReadMemory(html_buf, buf_size, "/", NULL, + HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); + + if (!doc) { + LOGE("htmlReadMemory failed"); + return false; + } + + bool found = htmlNodeFindRecursive(xmlDocGetRootElement(doc)); + + xmlFreeDoc(doc); + + return found; +} + +bool Epub::htmlNodeFindRecursive(xmlNodePtr node) +{ + for (xmlNodePtr cur = node; cur; cur = cur->next) { + if (cur->type == XML_TEXT_NODE && + runner->run(reinterpret_cast(cur->content))) + return true; + + if (htmlNodeFindRecursive(cur->children)) + return true; + } + + return false; +} + +void Epub::insert() +{ + zip_stat_t sb {}; + + int entry_len = zip_get_num_entries(z, ZIP_FL_UNCHANGED); + for (int i = 0; i < entry_len; i++) { + if (!g_str_has_suffix(zip_get_name(z, i, ZIP_FL_ENC_GUESS), "html")) + continue; + + if (zip_stat_index(z, i, 0, &sb) != 0) + continue; + + zip_file_t *file = zip_fopen_index(z, i, 0); + if (!file) + continue; + + std::vector file_buf(sb.size); + + zip_int64_t readn = zip_fread(file, file_buf.data(), sb.size); + zip_fclose(file); + + if (readn == static_cast(sb.size)) + htmlInsert(file_buf.data(), sb.size); + } +} + +void Epub::htmlInsert(const char* html_buf, int buf_size) +{ + htmlDocPtr doc = htmlReadMemory(html_buf, buf_size, "/", NULL, + HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); + + if (!doc) { + LOGE("htmlReadMemory failed"); + return; + } + + htmlNodeInsertRecursive(xmlDocGetRootElement(doc)); + + xmlFreeDoc(doc); +} + +void Epub::htmlNodeInsertRecursive(xmlNodePtr node) +{ + for (xmlNodePtr cur = node; cur; cur = cur->next) { + if (cur->type == XML_TEXT_NODE) + runner->run(reinterpret_cast(cur->content)); + + htmlNodeInsertRecursive(cur->children); + } +} diff --git a/plugin/media-ebook-plugin-epub.h b/plugin/media-ebook-plugin-epub.h new file mode 100644 index 0000000..61c1d16 --- /dev/null +++ b/plugin/media-ebook-plugin-epub.h @@ -0,0 +1,56 @@ +/* + * libmedia-service + * + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#ifndef __MEDIA_EBOOK_PLUGIN_EPUB_H__ +#define __MEDIA_EBOOK_PLUGIN_EPUB_H__ + +#include "media-ebook-plugin-interface.h" + +#include +#include + +#include +#include +#include +#include + +namespace EBook { + +class Epub : public IFindable, public IInsertable +{ +public: + Epub(std::string path, std::unique_ptr runner); + ~Epub() override; + + bool find() override; + void insert() override; + +private: + bool htmlFind(const char* html_buf, int buf_size); + bool htmlNodeFindRecursive(xmlNodePtr node); + void htmlInsert(const char* html_buf, int buf_size); + void htmlNodeInsertRecursive(xmlNodePtr node); + + zip_t* z {}; + std::unique_ptr runner {}; +}; + +} + +#endif diff --git a/plugin/media-ebook-plugin-interface.h b/plugin/media-ebook-plugin-interface.h new file mode 100644 index 0000000..cdd617b --- /dev/null +++ b/plugin/media-ebook-plugin-interface.h @@ -0,0 +1,46 @@ +/* + * libmedia-service + * + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#ifndef __MEDIA_EBOOK_PLUGIN_INTERFACE_H__ +#define __MEDIA_EBOOK_PLUGIN_INTERFACE_H__ + +#include + +class IFindable +{ +public: + virtual ~IFindable() = default; + virtual bool find() = 0; +}; + +class IInsertable +{ +public: + virtual ~IInsertable() = default; + virtual void insert() = 0; +}; + +class IRunnable +{ +public: + virtual ~IRunnable() = default; + virtual bool run(const std::string& text) = 0; +}; + +#endif \ No newline at end of file diff --git a/plugin/media-ebook-plugin-pdf.cpp b/plugin/media-ebook-plugin-pdf.cpp new file mode 100644 index 0000000..058d317 --- /dev/null +++ b/plugin/media-ebook-plugin-pdf.cpp @@ -0,0 +1,148 @@ +/* + * libmedia-service + * + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "media-ebook-plugin-pdf.h" + +#include +#include + +#ifdef LOG_TAG +#undef LOG_TAG +#endif + +#define LOG_TAG "MEDIA_SERVICE" + +using namespace EBook; + +Pdf::Pdf(std::string path, std::unique_ptr runner) + : runner(std::move(runner)) +{ + if (path.empty()) { + LOGE("invalid path"); + return; + } + + LOGD("%s", path.c_str()); + + try { + pdf.Load(path.c_str()); + loaded = true; + } catch (const PoDoFo::PdfError& e) { + LOGE("Initialization failed : %s", e.what()); + } +} + +bool Pdf::find() +{ + if (!loaded) + return false; + + for (int n = 0; n < pdf.GetPageCount(); ++n) + if (runner->run(parseTextFromPage(n))) + return true; + + return false; +} + +void Pdf::insert() +{ + if (!loaded) + return; + + for (int n = 0; n < pdf.GetPageCount(); ++n) + runner->run(parseTextFromPage(n)); +} + +std::string Pdf::parseTextFromPage(unsigned int index) +{ + std::string fullText; + + PoDoFo::EPdfContentsType type; + PoDoFo::PdfVariant var; + PoDoFo::PdfFont *cur_font = NULL; + bool text_block = false; + const char* tok; + std::stack stack; + PoDoFo::PdfString unicode; + PoDoFo::PdfArray array; + + PoDoFo::PdfPage* page = pdf.GetPage(index); + if (!page) + return fullText; + + PoDoFo::PdfContentsTokenizer tokenizer(page); + + while (tokenizer.ReadNext(type, tok, var)) { + if (type != PoDoFo::ePdfContentsType_Keyword) { + if (text_block) + stack.push(var); + + continue; + } + + if (!text_block && strcmp(tok, "BT") == 0) { + text_block = true; + continue; + } else if (text_block && strcmp(tok, "ET") == 0) { + text_block = false; + } + + if (!text_block) + continue; + + if (strcmp(tok, "Tf") == 0) { + if (stack.size() < 2) { + cur_font = NULL; + continue; + } + + stack.pop(); + cur_font = pdf.GetFont(page->GetFromResources(PoDoFo::PdfName("Font"), stack.top().GetName())); + } else if (strcmp(tok, "Tj") == 0 || strcmp(tok, "'") == 0 || strcmp(tok, "\"") == 0) { + if (stack.empty()) + continue; + + if (!cur_font || !cur_font->GetEncoding()) + continue; + + unicode = cur_font->GetEncoding()->ConvertToUnicode(stack.top().GetString(), cur_font); + fullText += unicode.GetStringUtf8(); + + stack.pop(); + } else if (strcmp(tok, "TJ") == 0) { + if (stack.empty()) + continue; + + array = stack.top().GetArray(); + stack.pop(); + + for (int i = 0; i < static_cast(array.GetSize()); i++) { + if (array[i].IsString() || array[i].IsHexString()) { + if (!cur_font || !cur_font->GetEncoding()) + continue; + + unicode = cur_font->GetEncoding()->ConvertToUnicode(array[i].GetString(), cur_font); + fullText += unicode.GetStringUtf8(); + } + } + } + } + + return fullText; +} diff --git a/plugin/media-ebook-plugin-pdf.h b/plugin/media-ebook-plugin-pdf.h new file mode 100644 index 0000000..d86d5ae --- /dev/null +++ b/plugin/media-ebook-plugin-pdf.h @@ -0,0 +1,49 @@ +/* + * libmedia-service + * + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#ifndef __MEDIA_EBOOK_PLUGIN_PDF_H__ +#define __MEDIA_EBOOK_PLUGIN_PDF_H__ + +#include +#include +#include +#include "media-ebook-plugin-interface.h" + +namespace EBook { + +class Pdf : public IFindable, public IInsertable +{ +public: + Pdf(std::string path, std::unique_ptr runner); + ~Pdf() override = default; + + bool find() override; + void insert() override; + +private: + std::string parseTextFromPage(unsigned int index); + + bool loaded {}; + PoDoFo::PdfMemDocument pdf {}; + std::unique_ptr runner {}; +}; + +} + +#endif diff --git a/plugin/media-ebook-plugin-regmatch.cpp b/plugin/media-ebook-plugin-regmatch.cpp new file mode 100644 index 0000000..bfa060d --- /dev/null +++ b/plugin/media-ebook-plugin-regmatch.cpp @@ -0,0 +1,42 @@ +/* + * libmedia-service + * + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +#include "media-ebook-plugin-regmatch.h" +#include +#include + +#ifdef LOG_TAG +#undef LOG_TAG +#endif + +#define LOG_TAG "MEDIA_SERVICE" + +bool RegMatch::run(const std::string& text) +{ + if (text.empty()) + return false; + + std::regex re(keyword, std::regex::icase); + + if (std::regex_search(text, re)) { + LOGD("Found [%s]", keyword.c_str()); + return true; + } + + return false; +} \ No newline at end of file diff --git a/plugin/media-ebook-plugin-regmatch.h b/plugin/media-ebook-plugin-regmatch.h new file mode 100644 index 0000000..c73240d --- /dev/null +++ b/plugin/media-ebook-plugin-regmatch.h @@ -0,0 +1,39 @@ +/* + * libmedia-service + * + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#ifndef __MEDIA_EBOOK_PLUGIN_REGMATCH_H__ +#define __MEDIA_EBOOK_PLUGIN_REGMATCH_H__ + +#include +#include "media-ebook-plugin-interface.h" + +class RegMatch : public IRunnable +{ +public: + explicit RegMatch(const char *keyword) + : keyword(keyword) { } + ~RegMatch() override = default; + + bool run(const std::string& text) override; + +private: + std::string keyword {}; +}; + +#endif \ No newline at end of file diff --git a/plugin/media-ebook-plugin.cpp b/plugin/media-ebook-plugin.cpp index 57776ac..321fce1 100644 --- a/plugin/media-ebook-plugin.cpp +++ b/plugin/media-ebook-plugin.cpp @@ -16,477 +16,39 @@ * limitations under the License. * */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "media-ebook-plugin-interface.h" +#include "media-ebook-plugin-epub.h" +#include "media-ebook-plugin-pdf.h" +#include "media-ebook-plugin-dbinserter.h" +#include "media-ebook-plugin-regmatch.h" -#include -#include -#include +#include #ifdef LOG_TAG #undef LOG_TAG #endif #define LOG_TAG "MEDIA_SERVICE" -#define INSERT_QUERY "INSERT INTO words(file_id, word) SELECT id, ? FROM files WHERE path=? ON CONFLICT (file_id, word) DO UPDATE SET frequency=frequency+1;" -#define TOKEN_KEY "\\s+" -#define SPECIAL_CHAR "[\\{\\}\\[\\]\\/?.,;:|\\)*~`!^\\-_+<>@\\#$%&\\\\=\\(\\\'\\\"]" - -class TextFinderInterface -{ -public: - virtual ~TextFinderInterface() = default; - virtual bool find(const char *keyword) = 0; - virtual void insert() = 0; -}; - -class TextFinder : public TextFinderInterface -{ -public: - virtual ~TextFinder() = default; - bool match(std::string& text, const char *keyword); - void batchInsert(std::string& text); - - sqlite3 *dbHandle {}; - const char *filePath {}; -}; - -bool TextFinder::match(std::string& text, const char *keyword) -{ - if (!keyword) - return false; - - if (text.empty()) - return false; - - std::regex re(keyword, std::regex::icase); - - if (std::regex_search(text, re)) { - LOGD("Found [%s]", keyword); - return true; - } - - return false; -} - -void TextFinder::batchInsert(std::string& text) -{ - if (!dbHandle || !filePath || text.empty()) - return; - - sqlite3_stmt *stmt = NULL; - const std::regex sp(SPECIAL_CHAR); - std::string temp = std::regex_replace(text, sp, ""); - bool isTransaction = false; - - if (sqlite3_exec(dbHandle, "BEGIN;", NULL, NULL, NULL) == SQLITE_OK) - isTransaction = true; - - sqlite3_prepare_v2(dbHandle, INSERT_QUERY, -1, &stmt, NULL); - - const std::regex re(TOKEN_KEY); - std::sregex_token_iterator end; - - for (std::sregex_token_iterator i(temp.begin(), temp.end(), re, -1); i != end; ++i) { - sqlite3_bind_text(stmt, 1, (*i).str().c_str(), -1, SQLITE_TRANSIENT); - sqlite3_bind_text(stmt, 2, filePath, -1, SQLITE_TRANSIENT); - sqlite3_step(stmt); - sqlite3_reset(stmt); - } - - sqlite3_finalize(stmt); - - if (isTransaction) - sqlite3_exec(dbHandle, "COMMIT;", NULL, NULL, NULL); -} - -/*---------------- PDF -----------------------*/ -class PdfTextFinder : public TextFinder -{ -public: - explicit PdfTextFinder(const char *path); - PdfTextFinder(sqlite3 *handle, const char *path); - bool find(const char *keyword) override; - void insert() override; - -private: - std::string parseTextFromPage(unsigned int index); - bool loaded {}; - - PoDoFo::PdfMemDocument pdf {}; -}; - -PdfTextFinder::PdfTextFinder(const char *path) -{ - if (!path) { - LOGE("invalid path"); - return; - } - - LOGD("%s", path); - - try { - pdf.Load(path); - loaded = true; - } catch (const PoDoFo::PdfError& e) { - LOGE("Initialization failed : %s", e.what()); - } -} - -PdfTextFinder::PdfTextFinder(sqlite3 *handle, const char *path) -{ - if (!handle) { - LOGE("invalid handle"); - return; - } - - if (!path) { - LOGE("invalid path"); - return; - } - - LOGD("%s", path); - - try { - pdf.Load(path); - loaded = true; - dbHandle = handle; - filePath = path; - } catch (const PoDoFo::PdfError& e) { - LOGE("Initialization failed : %s", e.what()); - } -} - -bool PdfTextFinder::find(const char *keyword) -{ - if (!loaded) - return false; - - if (!keyword) { - LOGE("Invalid keyword"); - return false; - } - - for (int n = 0; n < pdf.GetPageCount(); ++n) { - auto text = parseTextFromPage(n); - if (match(text, keyword)) - return true; - } - - return false; -} - -void PdfTextFinder::insert() -{ - if (!loaded) - return; - - for (int n = 0; n < pdf.GetPageCount(); ++n) { - auto text = parseTextFromPage(n); - batchInsert(text); - } -} - -std::string PdfTextFinder::parseTextFromPage(unsigned int index) -{ - std::string fullText; - - PoDoFo::EPdfContentsType type; - PoDoFo::PdfVariant var; - PoDoFo::PdfFont *cur_font = NULL; - bool text_block = false; - const char *tok; - std::stack stack; - PoDoFo::PdfString unicode; - PoDoFo::PdfArray array; - - PoDoFo::PdfPage* page = pdf.GetPage(index); - if (!page) - return fullText; - - PoDoFo::PdfContentsTokenizer tokenizer(page); - - while (tokenizer.ReadNext(type, tok, var)) { - if (type != PoDoFo::ePdfContentsType_Keyword) { - if (text_block) - stack.push(var); - - continue; - } - - if (!text_block && strcmp(tok, "BT") == 0) { - text_block = true; - continue; - } else if (text_block && strcmp(tok, "ET") == 0) { - text_block = false; - } - - if (!text_block) - continue; - - if (strcmp(tok, "Tf") == 0) { - if (stack.size() < 2) { - cur_font = NULL; - continue; - } - - stack.pop(); - cur_font = pdf.GetFont(page->GetFromResources(PoDoFo::PdfName("Font"), stack.top().GetName())); - } else if (strcmp(tok, "Tj") == 0 || strcmp(tok, "'") == 0 || strcmp(tok, "\"") == 0) { - if (stack.empty()) - continue; - - if (!cur_font || !cur_font->GetEncoding()) - continue; - - unicode = cur_font->GetEncoding()->ConvertToUnicode(stack.top().GetString(), cur_font); - fullText += unicode.GetStringUtf8(); - - stack.pop(); - } else if (strcmp(tok, "TJ") == 0) { - if (stack.empty()) - continue; - - array = stack.top().GetArray(); - stack.pop(); - - for (int i = 0; i < static_cast(array.GetSize()); i++) { - if (array[i].IsString() || array[i].IsHexString()) { - if (!cur_font || !cur_font->GetEncoding()) - continue; - - unicode = cur_font->GetEncoding()->ConvertToUnicode(array[i].GetString(), cur_font); - fullText += unicode.GetStringUtf8(); - } - } - } - } - - return fullText; -} - -/*---------------- EPUB -----------------------*/ - -class EpubTextFinder : public TextFinder -{ -public: - explicit EpubTextFinder(const char *path); - EpubTextFinder(sqlite3 *handle, const char *path); - bool find(const char *keyword) override; - void insert() override; - - ~EpubTextFinder() override; - -private: - bool htmlNodeFindRecursive(xmlNodePtr node, const char *keyword); - void htmlNodeFindRecursiveForDb(xmlNodePtr node); - bool htmlFind(const char *html_buf, int buf_size, const char *keyword); - void htmlFindForDb(const char *html_buf, int buf_size); - - zip_t *z {}; -}; - -EpubTextFinder::EpubTextFinder(const char *path) -{ - if (!path) { - LOGE("invalid path"); - return; - } - - LOGD("%s", path); - - int err = 0; - z = zip_open(path, ZIP_RDONLY, &err); - if (err != 0) - LOGE("zip_open failed"); -} - -EpubTextFinder::EpubTextFinder(sqlite3 *handle, const char *path) -{ - if (!handle) { - LOGE("invalid handle"); - return; - } - - if (!path) { - LOGE("invalid path"); - return; - } - - LOGD("%s", path); - - int err = 0; - z = zip_open(path, ZIP_RDONLY, &err); - if (err != 0) - LOGE("zip_open failed"); - dbHandle = handle; - filePath = path; -} - -EpubTextFinder::~EpubTextFinder() -{ - if (!z) - return; - - zip_close(z); - z = nullptr; -} - -bool EpubTextFinder::find(const char *keyword) -{ - zip_stat_t sb = {0, }; - - if (!keyword) { - LOGE("Invalid keyword"); - return false; - } - - int entry_len = zip_get_num_entries(z, ZIP_FL_UNCHANGED); - for (int i = 0; i < entry_len; i++) { - if (!g_str_has_suffix(zip_get_name(z, i, ZIP_FL_ENC_GUESS), "html")) - continue; - - if (zip_stat_index(z, i, 0, &sb) != 0) - continue; - - zip_file_t *file = zip_fopen_index(z, i, 0); - if (!file) - continue; - - std::vector file_buf(sb.size); - - zip_int64_t readn = zip_fread(file, file_buf.data(), sb.size); - zip_fclose(file); - - if ((readn == static_cast(sb.size)) && - htmlFind(file_buf.data(), sb.size, keyword)) - return true; - } - - return false; -} - -void EpubTextFinder::insert() -{ - zip_stat_t sb = {0, }; - - int entry_len = zip_get_num_entries(z, ZIP_FL_UNCHANGED); - for (int i = 0; i < entry_len; i++) { - if (!g_str_has_suffix(zip_get_name(z, i, ZIP_FL_ENC_GUESS), "html")) - continue; - - if (zip_stat_index(z, i, 0, &sb) != 0) - continue; - - zip_file_t *file = zip_fopen_index(z, i, 0); - if (!file) - continue; - - std::vector file_buf(sb.size); - - zip_int64_t readn = zip_fread(file, file_buf.data(), sb.size); - zip_fclose(file); - - if (readn == static_cast(sb.size)) - htmlFindForDb(file_buf.data(), sb.size); - } -} - -bool EpubTextFinder::htmlNodeFindRecursive(xmlNodePtr node, const char *keyword) -{ - for (xmlNodePtr cur = node; cur; cur = cur->next) { - if (cur->type == XML_TEXT_NODE) { - std::string text(reinterpret_cast(cur->content)); - if (match(text, keyword)) - return true; - } - - if (htmlNodeFindRecursive(cur->children, keyword)) - return true; - } - - return false; -} - -void EpubTextFinder::htmlNodeFindRecursiveForDb(xmlNodePtr node) -{ - for (xmlNodePtr cur = node; cur; cur = cur->next) { - if (cur->type == XML_TEXT_NODE) { - std::string text(reinterpret_cast(cur->content)); - batchInsert(text); - } - - htmlNodeFindRecursiveForDb(cur->children); - } -} - -void EpubTextFinder::htmlFindForDb(const char *html_buf, int buf_size) -{ - htmlDocPtr doc = htmlReadMemory(html_buf, buf_size, "/", NULL, - HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); - - if (!doc) { - LOGE("htmlReadMemory failed"); - return; - } - - htmlNodeFindRecursiveForDb(xmlDocGetRootElement(doc)); - xmlFreeDoc(doc); -} - -bool EpubTextFinder::htmlFind(const char *html_buf, int buf_size, const char *keyword) -{ - htmlDocPtr doc = htmlReadMemory(html_buf, buf_size, "/", NULL, - HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); - - if (!doc) { - LOGE("htmlReadMemory failed"); - return false; - } - - bool result = htmlNodeFindRecursive(xmlDocGetRootElement(doc), keyword); - - xmlFreeDoc(doc); - - return result; -} +using namespace std; extern "C" bool media_svc_pdf_is_keyword_included(const char *path, const char *keyword) { - /* ToDo: factory pattern */ - std::unique_ptr ebookText = std::make_unique(path); - - return ebookText->find(keyword); + return unique_ptr{ make_unique(path, make_unique(keyword)) }->find(); } extern "C" bool media_svc_epub_is_keyword_included(const char *path, const char *keyword) { - /* ToDo: factory pattern */ - std::unique_ptr ebookText = std::make_unique(path); - - return ebookText->find(keyword); + return unique_ptr{ make_unique(path, make_unique(keyword)) }->find(); } -extern "C" void media_svc_pdf_insert_to_db(sqlite3 *handle, const char *path) +extern "C" void media_svc_pdf_insert_to_db(void *handle, const char *path) { - std::unique_ptr ebookText = std::make_unique(handle, path); - - ebookText->insert(); + unique_ptr{ make_unique(path, make_unique(path, handle)) }->insert(); } -extern "C" void media_svc_epub_insert_to_db(sqlite3 *handle, const char *path) +extern "C" void media_svc_epub_insert_to_db(void *handle, const char *path) { - std::unique_ptr ebookText = std::make_unique(handle, path); - - ebookText->insert(); + unique_ptr{ make_unique(path, make_unique(path, handle)) }->insert(); }