4 * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved.
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
19 #include <podofo/podofo.h>
21 #include <media-svc-util-pdf.h>
22 #include <media-svc-debug.h>
26 using namespace PoDoFo;
/*
 * Tests whether @keyword matches somewhere in @full.
 *
 * @keyword is passed to g_regex_match_simple() as the *pattern* and @full as
 * the string to search, with G_REGEX_CASELESS set. The match is therefore
 * case-insensitive, and regular-expression metacharacters inside @keyword are
 * interpreted as regex syntax rather than matched literally.
 *
 * Both arguments are guarded with media_svc_retv_if(); presumably a NULL
 * argument makes the function return false -- confirm against the macro
 * definition.
 */
28 static bool __media_svc_pdf_find_keyword(const char *full, const char *keyword)
30 media_svc_retv_if(!full, false);
31 media_svc_retv_if(!keyword, false);
/* Argument order matters: pattern (keyword) first, subject text second. */
33 if (g_regex_match_simple(keyword, full, G_REGEX_CASELESS, (GRegexMatchFlags)0)) {
34 media_svc_debug("Found");
/*
 * Collects the text shown on @page into a single UTF-8 buffer.
 *
 * The page's content stream is read token by token with PdfContentsTokenizer.
 * Keyword tokens are compared against the PDF text operators:
 *   BT / ET       - begin / end of a text object (tracked in text_block)
 *   Tf            - select the current font (looked up in the page's "Font"
 *                   resources through @pdf)
 *   Tj, ', "      - show a single string operand
 *   TJ            - show an array whose string elements are text
 * Operands are taken from the local `stack`; presumably non-keyword tokens are
 * pushed onto it elsewhere in the loop -- confirm.
 *
 * Each string is converted with the current font's encoding
 * (ConvertToUnicode) and appended to a GString as UTF-8.
 *
 * @keyword is only NULL-checked in the code visible here.
 *
 * The character data is detached from the GString with
 * g_string_free(..., FALSE), so the resulting buffer is no longer owned by the
 * GString; presumably the caller must release it with g_free() -- confirm.
 */
41 static char * __media_svc_pdf_parse_text(PdfMemDocument *pdf, PdfPage *page, const char *keyword)
43 EPdfContentsType type;
/* Font selected by the most recent Tf operator; NULL until one is seen. */
45 PdfFont *cur_font = NULL;
/* True while the tokenizer is between a BT and its matching ET. */
46 bool text_block = false;
/* Operand stack for the operators handled below. */
48 stack<PdfVariant> stack;
52 GString *full_text = NULL;
53 gchar *tmp_text = NULL;
55 media_svc_retv_if(!pdf, NULL);
56 media_svc_retv_if(!page, NULL);
57 media_svc_retv_if(!keyword, NULL);
59 PdfContentsTokenizer tokenizer(page);
/* g_string_new(NULL) yields an empty, growable string. */
61 full_text = g_string_new(NULL);
63 while (tokenizer.ReadNext(type, tok, var)) {
64 if (type == ePdfContentsType_Keyword) {
/* BT is only honoured outside a text object, ET only inside one. */
65 if (!text_block && strcmp(tok, "BT") == 0) {
68 } else if (text_block && strcmp(tok, "ET") == 0) {
/* Tf takes two operands (font name and size), hence the size check. */
75 if (strcmp(tok, "Tf") == 0) {
76 if (stack.size() < 2) {
/*
 * The operand on top of the stack is read as the font's resource name;
 * presumably the size operand has already been popped -- confirm.
 */
82 cur_font = pdf->GetFont(page->GetFromResources(PdfName("Font"), stack.top().GetName()));
/* All three operators show the string operand on top of the stack. */
83 } else if (strcmp(tok, "Tj") == 0 || strcmp(tok, "'") == 0 || strcmp(tok, "\"") == 0) {
/* Without a font and its encoding the string cannot be decoded. */
87 if (!cur_font || !cur_font->GetEncoding())
90 unicode = cur_font->GetEncoding()->ConvertToUnicode(stack.top().GetString(), cur_font);
91 full_text = g_string_append(full_text, unicode.GetStringUtf8().c_str());
94 } else if (strcmp(tok, "TJ") == 0) {
98 array = stack.top().GetArray();
/* Only string / hex-string elements carry text; other elements are ignored. */
101 for (int i = 0; i < static_cast<int>(array.GetSize()); i++) {
102 if (array[i].IsString() || array[i].IsHexString()) {
103 if (!cur_font || !cur_font->GetEncoding())
106 unicode = cur_font->GetEncoding()->ConvertToUnicode(array[i].GetString(), cur_font);
107 full_text = g_string_append(full_text, unicode.GetStringUtf8().c_str());
/* Loops until the operand stack is empty once tokenizing has finished. */
117 while (!stack.empty())
/* FALSE keeps the character data alive and hands it over as a plain gchar *. */
120 tmp_text = g_string_free(full_text, FALSE);
122 /* A GString starts out as an empty string, so zero length means nothing was appended. */
123 if (strlen(tmp_text) == 0) {
/*
 * Searches the PDF file at @path for @keyword.
 *
 * The document is loaded with PdfMemDocument, then each page's text is
 * extracted with __media_svc_pdf_parse_text() and checked with
 * __media_svc_pdf_find_keyword().
 *
 * Both arguments are guarded with media_svc_retvm_if(); presumably a NULL
 * @path or @keyword makes the function return false after logging the given
 * message -- confirm against the macro definition.
 *
 * A PdfError is caught and logged rather than propagated to the caller.
 */
131 bool _media_svc_pdf_is_keyword_included(const char *path, const char *keyword)
/* Extracted text of the page currently being examined. */
134 gchar *full_text = NULL;
136 media_svc_retvm_if(!path, false, "Invalid path");
137 media_svc_retvm_if(!keyword, false, "Invalid keyword");
/* Opens and parses the file. */
140 PdfMemDocument pdf(path);
142 // GetPage() is called with indices 0 .. GetPageCount() - 1 here (zero-based), not with 1-based page numbers.
143 // Per the original author, GetPageCount() returns a stored value rather than computing it, so calling it in the loop condition is not a performance concern.
144 for (int n = 0; n < pdf.GetPageCount(); ++n) {
145 PdfPage *page = pdf.GetPage(n);
147 full_text = __media_svc_pdf_parse_text(&pdf, page, keyword);
150 res = __media_svc_pdf_find_keyword(full_text, keyword);
/* Handler for PoDoFo errors; the failure is logged rather than rethrown here. */
157 } catch (const PdfError& e) {
158 media_svc_error("Initialization failed : %s", e.what());