Add to search ebooks with keywords 90/261590/5
authorMinje Ahn <minje.ahn@samsung.com>
Thu, 22 Jul 2021 01:26:13 +0000 (10:26 +0900)
committerMinje ahn <minje.ahn@samsung.com>
Mon, 26 Jul 2021 02:07:48 +0000 (02:07 +0000)
Change-Id: I7c4210616e56c28bb4ab5ff5e58721c37ea0263d
Signed-off-by: Minje Ahn <minje.ahn@samsung.com>
CMakeLists.txt
include/media-svc.h
src/common/media-svc-util-epub.c [new file with mode: 0755]
src/common/media-svc-util-pdf.cpp [new file with mode: 0644]
src/common/media-svc-util.c
src/common/media-svc.c
src/include/common/media-svc-util-epub.h [new file with mode: 0755]
src/include/common/media-svc-util-pdf.h [new file with mode: 0755]
src/include/common/media-svc-util.h

index f715e16..5d62899 100644 (file)
@@ -1,5 +1,5 @@
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
-PROJECT(media-service C)
+PROJECT(media-service C CXX)
 SET(VERSION_MAJOR 1)
 SET(VERSION "${VERSION_MAJOR}.0.0")
 
@@ -12,6 +12,8 @@ SET(SRCS
        src/common/media-svc-album.c
        src/common/media-svc-media-folder.c
        src/common/media-svc-db-utils.c
+       src/common/media-svc-util-pdf.cpp
+       src/common/media-svc-util-epub.c
        src/common/media-svc-util.c
        src/common/media-svc-noti.c
        src/common/media-svc-storage.c
@@ -56,9 +58,11 @@ pkg_check_modules(pkgs REQUIRED glib-2.0 dlog sqlite3 icu-i18n libexif mm-filein
 
 FOREACH(flag ${pkgs_CFLAGS})
        SET(EXTRA_CFLAGS "${EXTRA_CFLAGS} ${flag}")
+       SET(EXTRA_CXXFLAGS "${EXTRA_CXXFLAGS} ${flag}")
 ENDFOREACH(flag)
 
 SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_CFLAGS} -fPIC -Wall -Werror -D_FORTIFY_SOURCE=2")
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXXFLAGS} -fPIC -Wall -Werror -D_FORTIFY_SOURCE=2")
 SET(CMAKE_C_FLAGS_DEBUG "-O0 -g")
 SET(CMAKE_C_FLAGS_RELEASE "-O2 -fPIC")
 
index 1398b02..94bb2fc 100755 (executable)
@@ -75,6 +75,8 @@ int media_svc_send_query(uid_t uid);
 int media_svc_get_media_type(const char *path, int *mediatype);
 int media_svc_create_thumbnail(const char *file_path, int media_type, uid_t uid, char **thumbnail_path);
 
+int media_svc_get_book_by_keyword(sqlite3 *handle, const char *keyword, GList **result);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/common/media-svc-util-epub.c b/src/common/media-svc-util-epub.c
new file mode 100755 (executable)
index 0000000..e558a21
--- /dev/null
@@ -0,0 +1,120 @@
+/*
+ * libmedia-service
+ *
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include <media-svc-util-epub.h>
+#include <media-svc-debug.h>
+
+#include <zip.h>
+#include <libxml/xmlmemory.h>
+#include <libxml/parser.h>
+#include <libxml/HTMLparser.h>
+#include <glib.h>
+
+/*
+ * Report whether @keyword matches somewhere inside @text, ignoring case.
+ *
+ * @keyword is handed to g_regex_match_simple() as the pattern, so regular
+ * expression metacharacters in it are interpreted as such.
+ * NOTE(review): confirm that regex semantics (rather than a literal substring
+ * search) are what callers expect for a "keyword".
+ *
+ * Returns true on a match; false when there is no match or when either
+ * argument is NULL.
+ */
+static bool __media_svc_epub_find_keyword(const char *text, const char *keyword)
+{
+       gboolean matched = FALSE;
+
+       media_svc_retv_if(!text, false);
+       media_svc_retv_if(!keyword, false);
+
+       matched = g_regex_match_simple(keyword, text, G_REGEX_CASELESS, (GRegexMatchFlags)0);
+       if (!matched)
+               return false;
+
+       media_svc_debug("Found");
+
+       return true;
+}
+
+/*
+ * Depth-first search of a parsed HTML tree for a text node matching @keyword.
+ *
+ * Starting at @node, every sibling is visited in order; the content of each
+ * text node is tested first and then the node's children are searched
+ * recursively. The walk stops at the first match.
+ *
+ * Returns true when a matching text node is found, false otherwise
+ * (including when @node is NULL).
+ */
+static bool __media_svc_epub_find_html_body(xmlNodePtr node, const char *keyword)
+{
+       xmlNodePtr walker = node;
+
+       while (walker) {
+               if (walker->type == XML_TEXT_NODE) {
+                       if (__media_svc_epub_find_keyword((const char *)walker->content, keyword))
+                               return true;
+               }
+
+               if (__media_svc_epub_find_html_body(walker->children, keyword))
+                       return true;
+
+               walker = walker->next;
+       }
+
+       return false;
+}
+
+/*
+ * Parse an in-memory HTML document and search its text nodes for @keyword.
+ *
+ * @html_buf: buffer holding the HTML data.
+ * @buf_size: number of bytes in @html_buf.
+ * @keyword: pattern forwarded to the text-node search.
+ *
+ * The parser is run with blanks removed, errors and warnings silenced and
+ * network access disabled. The parsed document is freed before returning.
+ *
+ * Returns true when the keyword is found; false when it is not found or the
+ * buffer cannot be parsed.
+ */
+static bool __media_svc_epub_check_html(const char *html_buf, int buf_size, const char *keyword)
+{
+       const int parse_opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
+       htmlDocPtr parsed = htmlReadMemory(html_buf, buf_size, "/", NULL, parse_opts);
+       bool found = false;
+
+       media_svc_retvm_if(!parsed, false, "htmlReadMemory failed");
+
+       found = __media_svc_epub_find_html_body(xmlDocGetRootElement(parsed), keyword);
+
+       xmlFreeDoc(parsed);
+
+       return found;
+}
+
+/*
+ * Check whether any HTML entry of the EPUB (zip) archive at @path contains
+ * @keyword.
+ *
+ * Every archive entry whose name ends in "html" is read fully into memory,
+ * parsed as HTML and its text nodes are searched. The scan stops at the first
+ * entry that matches.
+ *
+ * @path: file system path of the EPUB archive.
+ * @keyword: pattern to look for (see __media_svc_epub_find_keyword()).
+ *
+ * Returns true when a match is found; false when there is none, when an
+ * argument is NULL or when the archive cannot be opened. Entries that cannot
+ * be stat'ed, opened or completely read are skipped.
+ */
+bool _media_svc_epub_is_keyword_included(const char *path, const char *keyword)
+{
+       int err = 0;
+       zip_t *z = NULL;
+       zip_stat_t sb = {0, };
+       zip_file_t *file = NULL;
+       const char *entry_name = NULL;
+       zip_int64_t entry_len = 0;
+       zip_int64_t i = 0;
+       char *file_buf = NULL;
+       bool found = false;
+
+       media_svc_retvm_if(!path, false, "Invalid path");
+       media_svc_retvm_if(!keyword, false, "Invalid keyword");
+
+       /* The returned handle, not the error slot, tells whether the open succeeded. */
+       z = zip_open(path, ZIP_RDONLY, &err);
+       media_svc_retvm_if(!z, false, "zip_open failed");
+
+       entry_len = zip_get_num_entries(z, ZIP_FL_UNCHANGED);
+       for (i = 0; i < entry_len && !found; i++) {
+               entry_name = zip_get_name(z, i, ZIP_FL_ENC_GUESS);
+               if (!entry_name || !g_str_has_suffix(entry_name, "html"))
+                       continue;
+
+               if (zip_stat_index(z, i, 0, &sb) != 0)
+                       continue;
+
+               /*
+                * Skip entries without a usable size: an empty entry has nothing to
+                * search, and the HTML parser takes the length as an int.
+                */
+               if (!(sb.valid & ZIP_STAT_SIZE) || sb.size == 0 || sb.size > (zip_uint64_t)G_MAXINT)
+                       continue;
+
+               file = zip_fopen_index(z, i, 0);
+               if (!file)
+                       continue;
+
+               file_buf = g_malloc0(sb.size);
+
+               if (zip_fread(file, file_buf, sb.size) == (zip_int64_t)sb.size)
+                       found = __media_svc_epub_check_html(file_buf, (int)sb.size, keyword);
+
+               g_free(file_buf);
+               file_buf = NULL;
+
+               /* Each opened entry must be closed, otherwise its handle leaks. */
+               zip_fclose(file);
+               file = NULL;
+       }
+
+       zip_close(z);
+
+       return found;
+}
\ No newline at end of file
diff --git a/src/common/media-svc-util-pdf.cpp b/src/common/media-svc-util-pdf.cpp
new file mode 100644 (file)
index 0000000..08001ac
--- /dev/null
@@ -0,0 +1,162 @@
+/*
+ * libmedia-service
+ *
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+#include <podofo/podofo.h>
+#include <stack>
+#include <media-svc-util-pdf.h>
+#include <media-svc-debug.h>
+#include <glib.h>
+
+using namespace std;
+using namespace PoDoFo;
+
+/*
+ * Report whether @keyword matches somewhere inside @full, ignoring case.
+ *
+ * @keyword is handed to g_regex_match_simple() as the pattern, so regular
+ * expression metacharacters in it are interpreted as such.
+ * NOTE(review): confirm that regex semantics (rather than a literal substring
+ * search) are what callers expect for a "keyword".
+ *
+ * Returns true on a match; false when there is no match or when either
+ * argument is NULL.
+ */
+static bool __media_svc_pdf_find_keyword(const char *full, const char *keyword)
+{
+       gboolean matched = FALSE;
+
+       media_svc_retv_if(!full, false);
+       media_svc_retv_if(!keyword, false);
+
+       matched = g_regex_match_simple(keyword, full, G_REGEX_CASELESS, (GRegexMatchFlags)0);
+       if (!matched)
+               return false;
+
+       media_svc_debug("Found");
+
+       return true;
+}
+
+static char * __media_svc_pdf_parse_text(PdfMemDocument *pdf, PdfPage *page, const char *keyword)
+{
+       EPdfContentsType type;
+       PdfVariant var;
+       PdfFont *cur_font = NULL;
+       bool text_block = false;
+       const char *tok;
+       stack<PdfVariant> stack;
+       PdfString unicode;
+       PdfArray array;
+
+       GString *full_text = NULL;
+       gchar *tmp_text = NULL;
+
+       media_svc_retv_if(!pdf, NULL);
+       media_svc_retv_if(!page, NULL);
+       media_svc_retv_if(!keyword, NULL);
+
+       PdfContentsTokenizer tokenizer(page);
+
+       full_text = g_string_new(NULL);
+
+       while (tokenizer.ReadNext(type, tok, var)) {
+               if (type == ePdfContentsType_Keyword) {
+                       if (!text_block && strcmp(tok, "BT") == 0) {
+                               text_block = true;
+                               continue;
+                       } else if (text_block && strcmp(tok, "ET") == 0) {
+                               text_block = false;
+                       }
+
+                       if (!text_block)
+                               continue;
+
+                       if (strcmp(tok, "Tf") == 0) {
+                               if (stack.size() < 2) {
+                                       cur_font = NULL;
+                                       continue;
+                               }
+
+                               stack.pop();
+                               cur_font = pdf->GetFont(page->GetFromResources(PdfName("Font"), stack.top().GetName()));
+                       } else if (strcmp(tok, "Tj") == 0 || strcmp(tok, "'") == 0 || strcmp(tok, "\"") == 0) {
+                               if (stack.empty())
+                                       continue;
+
+                               if (!cur_font || !cur_font->GetEncoding())
+                                       continue;
+
+                               unicode = cur_font->GetEncoding()->ConvertToUnicode(stack.top().GetString(), cur_font);
+                               full_text = g_string_append(full_text, unicode.GetStringUtf8().c_str());
+
+                               stack.pop();
+                       } else if (strcmp(tok, "TJ") == 0) {
+                               if (stack.empty())
+                                       continue;
+
+                               array = stack.top().GetArray();
+                               stack.pop();
+
+                               for (int i = 0; i < static_cast<int>(array.GetSize()); i++) {
+                                       if (array[i].IsString() || array[i].IsHexString()) {
+                                               if (!cur_font || !cur_font->GetEncoding())
+                                                       continue;
+
+                                               unicode = cur_font->GetEncoding()->ConvertToUnicode(array[i].GetString(), cur_font);
+                                               full_text = g_string_append(full_text, unicode.GetStringUtf8().c_str());
+                                       }
+                               }
+                       }
+               } else {
+                       if (text_block)
+                               stack.push(var);
+               }
+       }
+
+       while (!stack.empty())
+               stack.pop();
+
+       tmp_text = g_string_free(full_text, FALSE);
+
+       /* GString start with an empty string. */
+       if (strlen(tmp_text) == 0) {
+               g_free(tmp_text);
+               return NULL;
+       } else {
+               return tmp_text;
+       }
+}
+
+bool _media_svc_pdf_is_keyword_included(const char *path, const char *keyword)
+{
+       bool res = false;
+       gchar *full_text = NULL;
+
+       media_svc_retvm_if(!path, false, "Invalid path");
+       media_svc_retvm_if(!keyword, false, "Invalid keyword");
+
+       try {
+               PdfMemDocument pdf(path);
+
+               // PDF format starts from 1..
+               // GetPageCount() is a value, not a calculation.. So, it does not affect the performance of this forloop.
+               for (int n = 0; n < pdf.GetPageCount(); ++n) {
+                       PdfPage *page = pdf.GetPage(n);
+
+                       full_text = __media_svc_pdf_parse_text(&pdf, page, keyword);
+
+                       if (full_text) {
+                               res = __media_svc_pdf_find_keyword(full_text, keyword);
+                               g_free(full_text);
+
+                               if (res)
+                                       return res;
+                       }
+               }
+       } catch (const PdfError& e) {
+               media_svc_error("Initialization failed : %s", e.what());
+       }
+
+       return false;
+}
index bf20193..a58a7c2 100644 (file)
@@ -51,7 +51,9 @@
 #include "media-svc-hash.h"
 #include "media-svc-album.h"
 #include "media-svc-localize_ch.h"
-/*For ebook*/
+#include "media-svc-util-pdf.h"
+#include "media-svc-util-epub.h"
+/*For ebook metadata */
 #include <zip.h>
 #include <libxml/xmlmemory.h>
 #include <libxml/parser.h>
@@ -1243,27 +1245,19 @@ static gboolean __media_svc_get_epub_root_file(zip_t *z, char **opf_file)
 {
        gchar *buf = NULL;
        gchar *tmp_buf = NULL;
-       int len = 0;
        xmlDocPtr doc = NULL;
        xmlNodePtr node = NULL;
 
        media_svc_retvm_if(!z, FALSE, "z is NULL");
        media_svc_retvm_if(!opf_file, FALSE, "opf_file is NULL");
 
-       tmp_buf = __media_svc_get_zipfile_data(z, "META-INF/container.xml");
-       media_svc_retvm_if(!tmp_buf, FALSE, "tmp_buf is NULL");
+       buf = __media_svc_get_zipfile_data(z, "META-INF/container.xml");
+       media_svc_retvm_if(!buf, FALSE, "buf is NULL");
 
-       len = strlen(tmp_buf);
 
-       while (0 < len) {
-               len--;
-
-               if (tmp_buf[len] == '>')
-                       break;
-       }
-
-       buf = g_strndup(tmp_buf, len + 1);
-       g_free(tmp_buf);
+       tmp_buf = g_strrstr(buf, ">");
+       if (tmp_buf)
+               *(tmp_buf + 1) = '\0';
 
        doc = xmlParseDoc((const xmlChar *)buf);
        g_free(buf);
@@ -1402,19 +1396,15 @@ static int __media_svc_get_pdf_metadata(media_svc_content_info_s *content_info)
 
                        meta_buf = g_malloc0(end_pos - start_pos + 1);
 
-                       if (read(fd, meta_buf, end_pos - start_pos) != end_pos - start_pos) {
-                               g_free(meta_buf);
-                               goto NEXT;
-                       }
-
-                       if (__media_svc_get_xml_metadata((const xmlChar *)meta_buf, TRUE, content_info)) {
-                               g_free(meta_buf);
-                               break;
+                       if (read(fd, meta_buf, end_pos - start_pos) == end_pos - start_pos) {
+                               if (__media_svc_get_xml_metadata((const xmlChar *)meta_buf, TRUE, content_info)) {
+                                       g_free(meta_buf);
+                                       break;
+                               }
                        }
 
                        g_free(meta_buf);
-                       meta_buf = NULL;
-NEXT:
+
                        start_pos = 0;
                        end_pos = 0;
                }
@@ -1586,3 +1576,14 @@ bool _media_svc_is_valid_storage_type(ms_user_storage_type_e storage_type)
                return false;
        }
 }
+
+/*
+ * Check whether the e-book at @path contains @keyword.
+ *
+ * The file is treated as an EPUB when its path ends in "epub", compared
+ * without regard to ASCII case (so "book.epub", "BOOK.EPUB" and "Book.Epub"
+ * are all accepted); every other path is handed to the PDF search.
+ *
+ * Returns the result of the format-specific search, or false when either
+ * argument is NULL.
+ */
+bool _media_svc_is_keyword_included(const char *path, const char *keyword)
+{
+       gchar *lower_path = NULL;
+       bool is_epub = false;
+
+       media_svc_retvm_if(!path, false, "Invalid path");
+       media_svc_retvm_if(!keyword, false, "Invalid keyword");
+
+       /* Lower-case a copy so the suffix test does not depend on letter case. */
+       lower_path = g_ascii_strdown(path, -1);
+       is_epub = g_str_has_suffix(lower_path, "epub");
+       g_free(lower_path);
+
+       if (is_epub)
+               return _media_svc_epub_is_keyword_included(path, keyword);
+
+       return _media_svc_pdf_is_keyword_included(path, keyword);
+}
index a029935..d238e09 100755 (executable)
@@ -781,3 +781,33 @@ int media_svc_create_thumbnail(const char *file_path, int media_type, uid_t uid,
 
        return ret;
 }
+
+/*
+ * Collect the paths of all valid book items whose content contains @keyword.
+ *
+ * The media table is queried for the path of every row with the book media
+ * type and validity=1; each file is then searched for @keyword and the path
+ * of every matching file is duplicated and appended to *@result.
+ *
+ * @handle: open database handle.
+ * @keyword: pattern to search for inside each book.
+ * @result: list that receives newly allocated path strings. Matches are
+ *          appended to whatever *@result already holds, so the caller is
+ *          expected to initialize it and to free the strings afterwards.
+ *
+ * Returns MS_MEDIA_ERR_NONE on success, MS_MEDIA_ERR_INVALID_PARAMETER when an
+ * argument is NULL, or the error reported by _media_svc_get_media().
+ * NOTE(review): ownership of the query string is assumed to pass to
+ * _media_svc_get_media() - confirm against its implementation.
+ */
+int media_svc_get_book_by_keyword(sqlite3 *handle, const char *keyword, GList **result)
+{
+       int ret = MS_MEDIA_ERR_NONE;
+       GList *book_paths = NULL;
+       GList *node = NULL;
+       char *sql = NULL;
+
+       media_svc_retvm_if(!handle, MS_MEDIA_ERR_INVALID_PARAMETER, "db handle is NULL");
+       media_svc_retvm_if(!keyword, MS_MEDIA_ERR_INVALID_PARAMETER, "keyword is NULL");
+       media_svc_retvm_if(!result, MS_MEDIA_ERR_INVALID_PARAMETER, "result is NULL");
+
+       sql = sqlite3_mprintf("SELECT media_path FROM %q WHERE media_type=%d AND validity=1;",
+                               DB_TABLE_MEDIA, MEDIA_SVC_MEDIA_TYPE_BOOK);
+
+       ret = _media_svc_get_media(handle, sql, &book_paths);
+       media_svc_retvm_if(ret != MS_MEDIA_ERR_NONE, ret, "_media_svc_get_media failed");
+
+       node = book_paths;
+       while (node) {
+               const char *book_path = (const char *)node->data;
+
+               if (_media_svc_is_keyword_included(book_path, keyword))
+                       *result = g_list_append(*result, g_strdup(book_path));
+
+               node = g_list_next(node);
+       }
+
+       g_list_free_full(book_paths, g_free);
+
+       return ret;
+}
diff --git a/src/include/common/media-svc-util-epub.h b/src/include/common/media-svc-util-epub.h
new file mode 100755 (executable)
index 0000000..2f3824c
--- /dev/null
@@ -0,0 +1,29 @@
+/*
+ * libmedia-service
+ *
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+
+#ifndef _MEDIA_SVC_UTIL_EPUB_H_
+#define _MEDIA_SVC_UTIL_EPUB_H_
+
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Search the HTML entries of the EPUB archive at @path for @keyword.
+ * Returns true when a match is found; false otherwise, including when an
+ * argument is NULL or the archive cannot be opened.
+ */
+bool _media_svc_epub_is_keyword_included(const char *path, const char *keyword);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_MEDIA_SVC_UTIL_EPUB_H_*/
diff --git a/src/include/common/media-svc-util-pdf.h b/src/include/common/media-svc-util-pdf.h
new file mode 100755 (executable)
index 0000000..c0f945a
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * libmedia-service
+ *
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+
+#ifndef _MEDIA_SVC_UTIL_PDF_H_
+#define _MEDIA_SVC_UTIL_PDF_H_
+
+/* The prototype below uses bool; include it here so C callers need no prior include. */
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Search the page text of the PDF document at @path for @keyword.
+ * Returns true when a page matches; false otherwise, including when an
+ * argument is NULL or the document cannot be loaded.
+ */
+bool _media_svc_pdf_is_keyword_included(const char *path, const char *keyword);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_MEDIA_SVC_UTIL_PDF_H_*/
index b56b1a7..ad3c5b2 100755 (executable)
@@ -141,6 +141,7 @@ bool _media_svc_check_pinyin_support(void);
 int _media_svc_extract_music_metadata_for_update(media_svc_content_info_s *content_info, const char *path);
 int _media_svc_get_media_type(const char *path, int *mediatype);
 bool _media_svc_is_valid_storage_type(ms_user_storage_type_e storage_type);
+bool _media_svc_is_keyword_included(const char *path, const char *keyword);
 
 #ifdef __cplusplus
 }