4 * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved.
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
19 #include <podofo/podofo.h>
27 #include <libxml/xmlmemory.h>
28 #include <libxml/parser.h>
29 #include <libxml/HTMLparser.h>
/* Tag string for this file's log output. NOTE(review): assumed to be consumed by the LOGD/LOGE macros — confirm. */
39 #define LOG_TAG "MEDIA_SERVICE"
/*
 * Upsert used by TextFinder::batchInsert(): parameter 1 is the word, parameter 2
 * is the file path used to look up files.id. A new (file_id, word) row is
 * inserted, or `frequency` is incremented when that pair already exists.
 */
40 #define INSERT_QUERY "INSERT INTO words(file_id, word) SELECT id, ? FROM files WHERE path=? ON CONFLICT (file_id, word) DO UPDATE SET frequency=frequency+1;"
/* Regex matching a run of whitespace; batchInsert() splits text on it. */
41 #define TOKEN_KEY "\\s+"
/* Regex character class of punctuation/symbol characters that batchInsert() removes before splitting. */
42 #define SPECIAL_CHAR "[\\{\\}\\[\\]\\/?.,;:|\\)*~`!^\\-_+<>@\\#$%&\\\\=\\(\\\'\\\"]"
/* Abstract interface shared by the e-book text finders created in the extern "C" entry points. */
44 class TextFinderInterface
47 virtual ~TextFinderInterface() = default;
/* Searches the document for the keyword; the implementations return true on the first match. */
48 virtual bool find(const char *keyword) = 0;
/* Feeds the document's text to the word table; takes no arguments and returns nothing. */
49 virtual void insert() = 0;
/* Common base of the format-specific finders: keyword matching and word insertion on already-extracted text. */
52 class TextFinder : public TextFinderInterface
55 virtual ~TextFinder() = default;
/* Case-insensitive std::regex search of keyword inside text. */
56 bool match(std::string& text, const char *keyword);
/* Strips SPECIAL_CHAR from text, splits it on TOKEN_KEY and executes INSERT_QUERY once per token. */
57 void batchInsert(std::string& text);
/* Path of the document; bound as the second parameter of INSERT_QUERY. */
60 const char *filePath {};
63 bool TextFinder::match(std::string& text, const char *keyword)
71 std::regex re(keyword, std::regex::icase);
73 if (std::regex_search(text, re)) {
74 LOGD("Found [%s]", keyword);
81 void TextFinder::batchInsert(std::string& text)
83 if (!dbHandle || !filePath || text.empty())
86 sqlite3_stmt *stmt = NULL;
87 const std::regex sp(SPECIAL_CHAR);
88 std::string temp = std::regex_replace(text, sp, "");
89 bool isTransaction = false;
91 if (sqlite3_exec(dbHandle, "BEGIN;", NULL, NULL, NULL) == SQLITE_OK)
94 sqlite3_prepare_v2(dbHandle, INSERT_QUERY, -1, &stmt, NULL);
96 const std::regex re(TOKEN_KEY);
97 std::sregex_token_iterator end;
99 for (std::sregex_token_iterator i(temp.begin(), temp.end(), re, -1); i != end; ++i) {
100 sqlite3_bind_text(stmt, 1, (*i).str().c_str(), -1, SQLITE_TRANSIENT);
101 sqlite3_bind_text(stmt, 2, filePath, -1, SQLITE_TRANSIENT);
106 sqlite3_finalize(stmt);
109 sqlite3_exec(dbHandle, "COMMIT;", NULL, NULL, NULL);
112 /*---------------- PDF -----------------------*/
/* TextFinder implementation for PDF files, backed by a PoDoFo in-memory document. */
113 class PdfTextFinder : public TextFinder
/* Search-only construction: no database handle is supplied. */
116 explicit PdfTextFinder(const char *path);
/* Construction for indexing: takes the sqlite3 handle in addition to the path. */
117 PdfTextFinder(sqlite3 *handle, const char *path);
118 bool find(const char *keyword) override;
119 void insert() override;
/* Returns the text collected from the content stream of page `index` (zero-based, as driven by the callers' loops). */
122 std::string parseTextFromPage(unsigned int index);
/* Document whose pages find()/insert() iterate over. */
125 PoDoFo::PdfMemDocument pdf {};
/*
 * Search-only constructor. An invalid path is logged. A PoDoFo::PdfError
 * raised during initialization is caught and logged instead of propagating
 * to the caller.
 */
128 PdfTextFinder::PdfTextFinder(const char *path)
131 LOGE("invalid path");
140 } catch (const PoDoFo::PdfError& e) {
141 LOGE("Initialization failed : %s", e.what());
/*
 * Indexing constructor. An invalid database handle or path is logged. A
 * PoDoFo::PdfError raised during initialization is caught and logged instead
 * of propagating to the caller.
 */
145 PdfTextFinder::PdfTextFinder(sqlite3 *handle, const char *path)
148 LOGE("invalid handle");
153 LOGE("invalid path");
164 } catch (const PoDoFo::PdfError& e) {
165 LOGE("Initialization failed : %s", e.what());
/*
 * Looks for keyword in the PDF, one page at a time: each page's text is
 * extracted with parseTextFromPage() and tested with match().
 * An invalid keyword is logged.
 */
169 bool PdfTextFinder::find(const char *keyword)
175 LOGE("Invalid keyword");
179 for (int n = 0; n < pdf.GetPageCount(); ++n) {
180 auto text = parseTextFromPage(n);
181 if (match(text, keyword))
/* Walks every page of the PDF and extracts its text with parseTextFromPage(). */
188 void PdfTextFinder::insert()
193 for (int n = 0; n < pdf.GetPageCount(); ++n) {
194 auto text = parseTextFromPage(n);
/*
 * Extracts the text shown on one page by tokenizing its content stream.
 *
 * Keyword tokens are interpreted as PDF operators: BT/ET toggle the
 * text-block flag, Tf selects the current font, and Tj, ', " and TJ supply
 * strings that are converted to UTF-8 through the current font's encoding and
 * appended to the result.
 *
 * index : page index passed to PdfMemDocument::GetPage()
 * Returns the accumulated UTF-8 text.
 */
199 std::string PdfTextFinder::parseTextFromPage(unsigned int index)
201 std::string fullText;
203 PoDoFo::EPdfContentsType type;
204 PoDoFo::PdfVariant var;
/* Font selected by the most recent Tf operator; NULL until one is seen. */
205 PoDoFo::PdfFont *cur_font = NULL;
/* True between a BT and its matching ET. */
206 bool text_block = false;
/* Operand stack; the operators below read their arguments from its top. */
208 std::stack<PoDoFo::PdfVariant> stack;
209 PoDoFo::PdfString unicode;
210 PoDoFo::PdfArray array;
212 PoDoFo::PdfPage* page = pdf.GetPage(index);
216 PoDoFo::PdfContentsTokenizer tokenizer(page);
218 while (tokenizer.ReadNext(type, tok, var)) {
/* Non-keyword tokens take this branch; NOTE(review): presumably they are operands pushed onto the stack — confirm. */
219 if (type != PoDoFo::ePdfContentsType_Keyword) {
226 if (!text_block && strcmp(tok, "BT") == 0) {
229 } else if (text_block && strcmp(tok, "ET") == 0) {
236 if (strcmp(tok, "Tf") == 0) {
/* Tf takes two operands (font name and size). */
237 if (stack.size() < 2) {
/* Resolve the font name through the page's /Font resource dictionary. */
243 cur_font = pdf.GetFont(page->GetFromResources(PoDoFo::PdfName("Font"), stack.top().GetName()));
/* Tj, ' and " each show the single string on top of the stack. */
244 } else if (strcmp(tok, "Tj") == 0 || strcmp(tok, "'") == 0 || strcmp(tok, "\"") == 0) {
/* Without a font and encoding the bytes cannot be decoded. */
248 if (!cur_font || !cur_font->GetEncoding())
251 unicode = cur_font->GetEncoding()->ConvertToUnicode(stack.top().GetString(), cur_font);
252 fullText += unicode.GetStringUtf8();
/* TJ shows an array; only its string elements are decoded below. */
255 } else if (strcmp(tok, "TJ") == 0) {
259 array = stack.top().GetArray();
262 for (int i = 0; i < static_cast<int>(array.GetSize()); i++) {
263 if (array[i].IsString() || array[i].IsHexString()) {
264 if (!cur_font || !cur_font->GetEncoding())
267 unicode = cur_font->GetEncoding()->ConvertToUnicode(array[i].GetString(), cur_font);
268 fullText += unicode.GetStringUtf8();
277 /*---------------- EPUB -----------------------*/
/* TextFinder implementation for EPUB files: reads the archive's *html entries with libzip and parses them with libxml2. */
279 class EpubTextFinder : public TextFinder
/* Search-only construction: no database handle is supplied. */
282 explicit EpubTextFinder(const char *path);
/* Construction for indexing: takes the sqlite3 handle in addition to the path. */
283 EpubTextFinder(sqlite3 *handle, const char *path);
284 bool find(const char *keyword) override;
285 void insert() override;
287 ~EpubTextFinder() override;
/* Walks node, its following siblings and all descendants; true as soon as a text node matches keyword. */
290 bool htmlNodeFindRecursive(xmlNodePtr node, const char *keyword);
/* Same traversal as htmlNodeFindRecursive(), without a keyword and without a result. */
291 void htmlNodeFindRecursiveForDb(xmlNodePtr node);
/* Parses an in-memory HTML buffer of buf_size bytes and searches it for keyword. */
292 bool htmlFind(const char *html_buf, int buf_size, const char *keyword);
/* Parses an in-memory HTML buffer of buf_size bytes and walks it with htmlNodeFindRecursiveForDb(). */
293 void htmlFindForDb(const char *html_buf, int buf_size);
/*
 * Search-only constructor: opens the EPUB at path as a read-only zip archive.
 * An invalid path and a zip_open() failure are each logged.
 */
298 EpubTextFinder::EpubTextFinder(const char *path)
301 LOGE("invalid path");
/* Read-only open; the libzip error code is written to err. */
308 z = zip_open(path, ZIP_RDONLY, &err);
310 LOGE("zip_open failed");
/*
 * Indexing constructor: opens the EPUB at path as a read-only zip archive.
 * An invalid database handle, an invalid path and a zip_open() failure are
 * each logged.
 */
313 EpubTextFinder::EpubTextFinder(sqlite3 *handle, const char *path)
316 LOGE("invalid handle");
321 LOGE("invalid path");
/* Read-only open; the libzip error code is written to err. */
328 z = zip_open(path, ZIP_RDONLY, &err);
330 LOGE("zip_open failed");
/* Destructor. NOTE(review): presumably releases the zip archive opened by the constructors — confirm against the body. */
336 EpubTextFinder::~EpubTextFinder()
345 bool EpubTextFinder::find(const char *keyword)
347 zip_stat_t sb = {0, };
350 LOGE("Invalid keyword");
354 int entry_len = zip_get_num_entries(z, ZIP_FL_UNCHANGED);
355 for (int i = 0; i < entry_len; i++) {
356 if (!g_str_has_suffix(zip_get_name(z, i, ZIP_FL_ENC_GUESS), "html"))
359 if (zip_stat_index(z, i, 0, &sb) != 0)
362 zip_file_t *file = zip_fopen_index(z, i, 0);
366 std::vector<char> file_buf(sb.size);
368 zip_int64_t readn = zip_fread(file, file_buf.data(), sb.size);
371 if ((readn == static_cast<zip_int64_t>(sb.size)) &&
372 htmlFind(file_buf.data(), sb.size, keyword))
379 void EpubTextFinder::insert()
381 zip_stat_t sb = {0, };
383 int entry_len = zip_get_num_entries(z, ZIP_FL_UNCHANGED);
384 for (int i = 0; i < entry_len; i++) {
385 if (!g_str_has_suffix(zip_get_name(z, i, ZIP_FL_ENC_GUESS), "html"))
388 if (zip_stat_index(z, i, 0, &sb) != 0)
391 zip_file_t *file = zip_fopen_index(z, i, 0);
395 std::vector<char> file_buf(sb.size);
397 zip_int64_t readn = zip_fread(file, file_buf.data(), sb.size);
400 if (readn == static_cast<zip_int64_t>(sb.size))
401 htmlFindForDb(file_buf.data(), sb.size);
405 bool EpubTextFinder::htmlNodeFindRecursive(xmlNodePtr node, const char *keyword)
407 for (xmlNodePtr cur = node; cur; cur = cur->next) {
408 if (cur->type == XML_TEXT_NODE) {
409 std::string text(reinterpret_cast<char*>(cur->content));
410 if (match(text, keyword))
414 if (htmlNodeFindRecursive(cur->children, keyword))
421 void EpubTextFinder::htmlNodeFindRecursiveForDb(xmlNodePtr node)
423 for (xmlNodePtr cur = node; cur; cur = cur->next) {
424 if (cur->type == XML_TEXT_NODE) {
425 std::string text(reinterpret_cast<char*>(cur->content));
429 htmlNodeFindRecursiveForDb(cur->children);
433 void EpubTextFinder::htmlFindForDb(const char *html_buf, int buf_size)
435 htmlDocPtr doc = htmlReadMemory(html_buf, buf_size, "/", NULL,
436 HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
439 LOGE("htmlReadMemory failed");
443 htmlNodeFindRecursiveForDb(xmlDocGetRootElement(doc));
447 bool EpubTextFinder::htmlFind(const char *html_buf, int buf_size, const char *keyword)
449 htmlDocPtr doc = htmlReadMemory(html_buf, buf_size, "/", NULL,
450 HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
453 LOGE("htmlReadMemory failed");
457 bool result = htmlNodeFindRecursive(xmlDocGetRootElement(doc), keyword);
464 extern "C" bool media_svc_pdf_is_keyword_included(const char *path, const char *keyword)
466 /* ToDo: factory pattern */
467 std::unique_ptr<TextFinderInterface> ebookText = std::make_unique<PdfTextFinder>(path);
469 return ebookText->find(keyword);
472 extern "C" bool media_svc_epub_is_keyword_included(const char *path, const char *keyword)
474 /* ToDo: factory pattern */
475 std::unique_ptr<TextFinderInterface> ebookText = std::make_unique<EpubTextFinder>(path);
477 return ebookText->find(keyword);
480 extern "C" void media_svc_pdf_insert_to_db(sqlite3 *handle, const char *path)
482 std::unique_ptr<TextFinderInterface> ebookText = std::make_unique<PdfTextFinder>(handle, path);
487 extern "C" void media_svc_epub_insert_to_db(sqlite3 *handle, const char *path)
489 std::unique_ptr<TextFinderInterface> ebookText = std::make_unique<EpubTextFinder>(handle, path);