From 3817d3b07aa3374e1e3414b88d69c3d68efd770b Mon Sep 17 00:00:00 2001 From: Minje Ahn Date: Thu, 22 Jul 2021 10:26:13 +0900 Subject: [PATCH] Add to search ebooks with keywords Change-Id: I7c4210616e56c28bb4ab5ff5e58721c37ea0263d Signed-off-by: Minje Ahn --- CMakeLists.txt | 6 +- include/media-svc.h | 2 + src/common/media-svc-util-epub.c | 120 +++++++++++++++++++++++ src/common/media-svc-util-pdf.cpp | 162 +++++++++++++++++++++++++++++++ src/common/media-svc-util.c | 49 +++++----- src/common/media-svc.c | 30 ++++++ src/include/common/media-svc-util-epub.h | 29 ++++++ src/include/common/media-svc-util-pdf.h | 37 +++++++ src/include/common/media-svc-util.h | 1 + 9 files changed, 411 insertions(+), 25 deletions(-) create mode 100755 src/common/media-svc-util-epub.c create mode 100644 src/common/media-svc-util-pdf.cpp create mode 100755 src/include/common/media-svc-util-epub.h create mode 100755 src/include/common/media-svc-util-pdf.h diff --git a/CMakeLists.txt b/CMakeLists.txt index f715e16..5d62899 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.6) -PROJECT(media-service C) +PROJECT(media-service C CXX) SET(VERSION_MAJOR 1) SET(VERSION "${VERSION_MAJOR}.0.0") @@ -12,6 +12,8 @@ SET(SRCS src/common/media-svc-album.c src/common/media-svc-media-folder.c src/common/media-svc-db-utils.c + src/common/media-svc-util-pdf.cpp + src/common/media-svc-util-epub.c src/common/media-svc-util.c src/common/media-svc-noti.c src/common/media-svc-storage.c @@ -56,9 +58,11 @@ pkg_check_modules(pkgs REQUIRED glib-2.0 dlog sqlite3 icu-i18n libexif mm-filein FOREACH(flag ${pkgs_CFLAGS}) SET(EXTRA_CFLAGS "${EXTRA_CFLAGS} ${flag}") + SET(EXTRA_CXXFLAGS "${EXTRA_CXXFLAGS} ${flag}") ENDFOREACH(flag) SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_CFLAGS} -fPIC -Wall -Werror -D_FORTIFY_SOURCE=2") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXXFLAGS} -fPIC -Wall -Werror -D_FORTIFY_SOURCE=2") SET(CMAKE_C_FLAGS_DEBUG "-O0 -g") 
SET(CMAKE_C_FLAGS_RELEASE "-O2 -fPIC") diff --git a/include/media-svc.h b/include/media-svc.h index 1398b02..94bb2fc 100755 --- a/include/media-svc.h +++ b/include/media-svc.h @@ -75,6 +75,8 @@ int media_svc_send_query(uid_t uid); int media_svc_get_media_type(const char *path, int *mediatype); int media_svc_create_thumbnail(const char *file_path, int media_type, uid_t uid, char **thumbnail_path); +int media_svc_get_book_by_keyword(sqlite3 *handle, const char *keyword, GList **result); + #ifdef __cplusplus } #endif diff --git a/src/common/media-svc-util-epub.c b/src/common/media-svc-util-epub.c new file mode 100755 index 0000000..e558a21 --- /dev/null +++ b/src/common/media-svc-util-epub.c @@ -0,0 +1,120 @@ +/* + * libmedia-service + * + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+/* Case-insensitive match. NOTE: keyword is handed to GRegex as the pattern, so regex metacharacters are interpreted. */
+static bool __media_svc_epub_find_keyword(const char *text, const char *keyword)
+{
+	media_svc_retv_if(!text, false);
+	media_svc_retv_if(!keyword, false);
+
+	if (!g_regex_match_simple(keyword, text, G_REGEX_CASELESS, (GRegexMatchFlags)0))
+		return false;
+
+	media_svc_debug("Found");
+	return true;
+}
+
+/* Searches node, its following siblings and all of their descendants for a text node matching keyword. */
+static bool __media_svc_epub_find_html_body(xmlNodePtr node, const char *keyword)
+{
+	xmlNodePtr cur_node = NULL;
+
+	for (cur_node = node; cur_node; cur_node = cur_node->next) {
+		if (cur_node->type == XML_TEXT_NODE && __media_svc_epub_find_keyword((const char *)cur_node->content, keyword))
+			return true;
+
+		if (__media_svc_epub_find_html_body(cur_node->children, keyword))
+			return true;
+	}
+
+	return false;
+}
+
+/* Parses an in-memory HTML buffer of buf_size bytes and reports whether any of its text nodes matches keyword. */
+static bool __media_svc_epub_check_html(const char *html_buf, int buf_size, const char *keyword)
+{
+	htmlDocPtr doc = NULL;
+	bool result = false;
+
+	doc = htmlReadMemory(html_buf, buf_size, "/", NULL, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
+	media_svc_retvm_if(!doc, false, "htmlReadMemory failed");
+
+	result = __media_svc_epub_find_html_body(xmlDocGetRootElement(doc), keyword);
+	xmlFreeDoc(doc);
+
+	return result;
+}
+
+/*
+ * Returns true when any archive entry whose name ends in "html" inside the EPUB (zip) at path matches keyword.
+ * Entries that cannot be stat'ed, opened or fully read are skipped; false is returned for NULL arguments.
+ */
+bool _media_svc_epub_is_keyword_included(const char *path, const char *keyword)
+{
+	int err = 0;
+	zip_t *z = NULL;
+	zip_stat_t sb = {0, };
+	zip_file_t *file = NULL;
+	zip_int64_t entry_len = 0;
+	zip_int64_t i = 0;
+	const char *name = NULL;
+	char *file_buf = NULL;
+	bool found = false;
+
+	media_svc_retvm_if(!path, false, "Invalid path");
+	media_svc_retvm_if(!keyword, false, "Invalid keyword");
+
+	/* zip_open() signals failure through a NULL handle; err only carries the reason. */
+	z = zip_open(path, ZIP_RDONLY, &err);
+	media_svc_retvm_if(!z, false, "zip_open failed");
+
+	entry_len = zip_get_num_entries(z, ZIP_FL_UNCHANGED);
+	for (i = 0; i < entry_len && !found; i++) {
+		name = zip_get_name(z, i, ZIP_FL_ENC_GUESS);
+		if (!name || !g_str_has_suffix(name, "html"))
+			continue;
+
+		/* Skip entries without a usable size: unknown, empty, or too large for the int length the HTML parser takes. */
+		if (zip_stat_index(z, i, 0, &sb) != 0 || !(sb.valid & ZIP_STAT_SIZE) || sb.size == 0 || sb.size > G_MAXINT)
+			continue;
+
+		file = zip_fopen_index(z, i, 0);
+		if (!file)
+			continue;
+
+		file_buf = g_malloc0(sb.size);
+		if (zip_fread(file, file_buf, sb.size) == (zip_int64_t)sb.size)
+			found = __media_svc_epub_check_html(file_buf, (int)sb.size, keyword);
+
+		g_free(file_buf);
+		zip_fclose(file); /* every zip_fopen_index() needs a matching zip_fclose() */
+	}
+
+	zip_close(z);
+	return found;
+}
\ No newline at end of file
diff --git a/src/common/media-svc-util-pdf.cpp b/src/common/media-svc-util-pdf.cpp
new file mode 100644
index 0000000..08001ac
--- /dev/null
+++ b/src/common/media-svc-util-pdf.cpp
@@ -0,0 +1,162 @@
+/*
+ * libmedia-service
+ *
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+
+using namespace std;
+using namespace PoDoFo;
+
+/* Case-insensitive match. NOTE: keyword is handed to GRegex as the pattern, so regex metacharacters are interpreted. */
+static bool __media_svc_pdf_find_keyword(const char *full, const char *keyword)
+{
+	media_svc_retv_if(!full, false);
+	media_svc_retv_if(!keyword, false);
+
+	if (!g_regex_match_simple(keyword, full, G_REGEX_CASELESS, (GRegexMatchFlags)0))
+		return false;
+
+	media_svc_debug("Found");
+	return true;
+}
+
+/*
+ * Collects the text shown inside the BT/ET blocks of one page (Tf picks the font; Tj, ', " and TJ show strings).
+ * Returns a newly allocated UTF-8 string (release with g_free()), or NULL when the page yields no text.
+ * keyword is only NULL-checked here. PoDoFo exceptions propagate once the partial text has been released.
+ */
+static char * __media_svc_pdf_parse_text(PdfMemDocument *pdf, PdfPage *page, const char *keyword)
+{
+	EPdfContentsType type;
+	PdfVariant var;
+	PdfFont *cur_font = NULL;
+	PdfObject *font_obj = NULL;
+	bool text_block = false;
+	const char *tok;
+	stack<PdfVariant> operands;
+	PdfString unicode;
+	PdfArray array;
+	GString *full_text = NULL;
+
+	media_svc_retv_if(!pdf, NULL);
+	media_svc_retv_if(!page, NULL);
+	media_svc_retv_if(!keyword, NULL);
+
+	PdfContentsTokenizer tokenizer(page);
+	full_text = g_string_new(NULL);
+
+	try {
+		while (tokenizer.ReadNext(type, tok, var)) {
+			if (type != ePdfContentsType_Keyword) {
+				/* Operands are only collected while inside a text block. */
+				if (text_block)
+					operands.push(var);
+				continue;
+			}
+
+			if (!text_block && strcmp(tok, "BT") == 0) {
+				text_block = true;
+				continue;
+			} else if (text_block && strcmp(tok, "ET") == 0) {
+				text_block = false;
+			}
+
+			if (!text_block)
+				continue;
+
+			if (strcmp(tok, "Tf") == 0) {
+				cur_font = NULL;
+				if (operands.size() < 2)
+					continue;
+
+				/* Tf operands are "name size": drop the size, then look the name up. A missing font object leaves cur_font NULL. */
+				operands.pop();
+				font_obj = page->GetFromResources(PdfName("Font"), operands.top().GetName());
+				if (font_obj)
+					cur_font = pdf->GetFont(font_obj);
+			} else if (strcmp(tok, "Tj") == 0 || strcmp(tok, "'") == 0 || strcmp(tok, "\"") == 0) {
+				if (operands.empty() || !cur_font || !cur_font->GetEncoding())
+					continue;
+
+				unicode = cur_font->GetEncoding()->ConvertToUnicode(operands.top().GetString(), cur_font);
+				g_string_append(full_text, unicode.GetStringUtf8().c_str());
+				operands.pop();
+			} else if (strcmp(tok, "TJ") == 0) {
+				if (operands.empty())
+					continue;
+
+				array = operands.top().GetArray();
+				operands.pop();
+				for (int i = 0; i < static_cast<int>(array.GetSize()); i++) {
+					if (!cur_font || !cur_font->GetEncoding() || !(array[i].IsString() || array[i].IsHexString()))
+						continue;
+
+					unicode = cur_font->GetEncoding()->ConvertToUnicode(array[i].GetString(), cur_font);
+					g_string_append(full_text, unicode.GetStringUtf8().c_str());
+				}
+			}
+		}
+	} catch (...) {
+		g_string_free(full_text, TRUE);
+		throw;
+	}
+
+	/* A GString starts out as an empty string; report that as "no text". */
+	if (full_text->len == 0) {
+		g_string_free(full_text, TRUE);
+		return NULL;
+	}
+
+	return g_string_free(full_text, FALSE);
+}
+
+/*
+ * Returns true when the text of any page of the PDF at path matches keyword.
+ * Failures are logged and reported as "not found"; no C++ exception escapes into the C callers.
+ */
+bool _media_svc_pdf_is_keyword_included(const char *path, const char *keyword)
+{
+	gchar *full_text = NULL;
+	bool res = false;
+
+	media_svc_retvm_if(!path, false, "Invalid path");
+	media_svc_retvm_if(!keyword, false, "Invalid keyword");
+
+	try {
+		PdfMemDocument pdf(path);
+		const int page_count = pdf.GetPageCount();
+		/* PoDoFo page indexes are zero-based; stop at the first page that matches. */
+		for (int n = 0; n < page_count && !res; ++n) {
+			full_text = __media_svc_pdf_parse_text(&pdf, pdf.GetPage(n), keyword);
+			if (!full_text)
+				continue;
+
+			res = __media_svc_pdf_find_keyword(full_text, keyword);
+			g_free(full_text);
+		}
+	} catch (const PdfError& e) {
+		media_svc_error("Initialization failed : %s", e.what());
+	} catch (...) {
+		media_svc_error("Unexpected exception");
+	}
+
+	return res;
+}
diff --git a/src/common/media-svc-util.c b/src/common/media-svc-util.c
index bf20193..a58a7c2 100644
--- a/src/common/media-svc-util.c
+++ b/src/common/media-svc-util.c
@@ -51,7 +51,9 @@
 #include "media-svc-hash.h"
 #include "media-svc-album.h"
 #include "media-svc-localize_ch.h"
-/*For ebook*/
+#include "media-svc-util-pdf.h"
+#include "media-svc-util-epub.h"
+/*For ebook metadata */
 #include
 #include
 #include
@@ -1243,27 +1245,19 @@ static gboolean __media_svc_get_epub_root_file(zip_t *z, char **opf_file)
 {
 	gchar *buf = NULL;
 	gchar *tmp_buf = NULL;
-	int len = 0;
 	xmlDocPtr doc = NULL;
 	xmlNodePtr node = NULL;
 
media_svc_retvm_if(!z, FALSE, "z is NULL"); media_svc_retvm_if(!opf_file, FALSE, "opf_file is NULL"); - tmp_buf = __media_svc_get_zipfile_data(z, "META-INF/container.xml"); - media_svc_retvm_if(!tmp_buf, FALSE, "tmp_buf is NULL"); + buf = __media_svc_get_zipfile_data(z, "META-INF/container.xml"); + media_svc_retvm_if(!buf, FALSE, "buf is NULL"); - len = strlen(tmp_buf); - while (0 < len) { - len--; - - if (tmp_buf[len] == '>') - break; - } - - buf = g_strndup(tmp_buf, len + 1); - g_free(tmp_buf); + tmp_buf = g_strrstr(buf, ">"); + if (tmp_buf) + *(tmp_buf + 1) = '\0'; doc = xmlParseDoc((const xmlChar *)buf); g_free(buf); @@ -1402,19 +1396,15 @@ static int __media_svc_get_pdf_metadata(media_svc_content_info_s *content_info) meta_buf = g_malloc0(end_pos - start_pos + 1); - if (read(fd, meta_buf, end_pos - start_pos) != end_pos - start_pos) { - g_free(meta_buf); - goto NEXT; - } - - if (__media_svc_get_xml_metadata((const xmlChar *)meta_buf, TRUE, content_info)) { - g_free(meta_buf); - break; + if (read(fd, meta_buf, end_pos - start_pos) == end_pos - start_pos) { + if (__media_svc_get_xml_metadata((const xmlChar *)meta_buf, TRUE, content_info)) { + g_free(meta_buf); + break; + } } g_free(meta_buf); - meta_buf = NULL; -NEXT: + start_pos = 0; end_pos = 0; } @@ -1586,3 +1576,14 @@ bool _media_svc_is_valid_storage_type(ms_user_storage_type_e storage_type) return false; } } + +bool _media_svc_is_keyword_included(const char *path, const char *keyword) +{ + media_svc_retvm_if(!path, false, "Invalid path"); + media_svc_retvm_if(!keyword, false, "Invalid keyword"); + + if (g_str_has_suffix(path, "epub") || g_str_has_suffix(path, "EPUB")) + return _media_svc_epub_is_keyword_included(path, keyword); + else + return _media_svc_pdf_is_keyword_included(path, keyword); +} diff --git a/src/common/media-svc.c b/src/common/media-svc.c index a029935..d238e09 100755 --- a/src/common/media-svc.c +++ b/src/common/media-svc.c @@ -781,3 +781,33 @@ int media_svc_create_thumbnail(const char 
*file_path, int media_type, uid_t uid, return ret; } + +int media_svc_get_book_by_keyword(sqlite3 *handle, const char *keyword, GList **result) +{ + int ret = MS_MEDIA_ERR_NONE; + GList *item_list = NULL; + GList *iter = NULL; + char *query = NULL; + char *tmp_path = NULL; + + media_svc_retvm_if(!handle, MS_MEDIA_ERR_INVALID_PARAMETER, "db handle is NULL"); + media_svc_retvm_if(!keyword, MS_MEDIA_ERR_INVALID_PARAMETER, "keyword is NULL"); + media_svc_retvm_if(!result, MS_MEDIA_ERR_INVALID_PARAMETER, "result is NULL"); + + query = sqlite3_mprintf("SELECT media_path FROM %q WHERE media_type=%d AND validity=1;", + DB_TABLE_MEDIA, MEDIA_SVC_MEDIA_TYPE_BOOK); + + ret = _media_svc_get_media(handle, query, &item_list); + media_svc_retvm_if(ret != MS_MEDIA_ERR_NONE, ret, "_media_svc_get_media failed"); + + for (iter = item_list; iter; iter = g_list_next(iter)) { + tmp_path = (char *)iter->data; + + if (_media_svc_is_keyword_included(tmp_path, keyword)) + *result = g_list_append(*result, g_strdup(tmp_path)); + } + + g_list_free_full(item_list, g_free); + + return ret; +} diff --git a/src/include/common/media-svc-util-epub.h b/src/include/common/media-svc-util-epub.h new file mode 100755 index 0000000..2f3824c --- /dev/null +++ b/src/include/common/media-svc-util-epub.h @@ -0,0 +1,29 @@ +/* + * libmedia-service + * + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + + + +#ifndef _MEDIA_SVC_UTIL_EPUB_H_ +#define _MEDIA_SVC_UTIL_EPUB_H_ + +#include + +bool _media_svc_epub_is_keyword_included(const char *path, const char *keyword); + +#endif /*_MEDIA_SVC_UTIL_EPUB_H_*/ diff --git a/src/include/common/media-svc-util-pdf.h b/src/include/common/media-svc-util-pdf.h new file mode 100755 index 0000000..c0f945a --- /dev/null +++ b/src/include/common/media-svc-util-pdf.h @@ -0,0 +1,37 @@ +/* + * libmedia-service + * + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + + +#ifndef _MEDIA_SVC_UTIL_PDF_H_ +#define _MEDIA_SVC_UTIL_PDF_H_ + +#ifdef __cplusplus +extern "C" { +#endif + + +bool _media_svc_pdf_is_keyword_included(const char *path, const char *keyword); + + +#ifdef __cplusplus +} +#endif + +#endif /*_MEDIA_SVC_UTIL_PDF_H_*/ diff --git a/src/include/common/media-svc-util.h b/src/include/common/media-svc-util.h index b56b1a7..ad3c5b2 100755 --- a/src/include/common/media-svc-util.h +++ b/src/include/common/media-svc-util.h @@ -141,6 +141,7 @@ bool _media_svc_check_pinyin_support(void); int _media_svc_extract_music_metadata_for_update(media_svc_content_info_s *content_info, const char *path); int _media_svc_get_media_type(const char *path, int *mediatype); bool _media_svc_is_valid_storage_type(ms_user_storage_type_e storage_type); +bool _media_svc_is_keyword_included(const char *path, const char *keyword); #ifdef __cplusplus } -- 2.7.4