4 * Copyright (c) 2022 Samsung Electronics Co., Ltd. All rights reserved.
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
20 #include "media-ebook-plugin-pdf.h"
29 #define LOG_TAG "MEDIA_SERVICE"
31 using namespace EBook;
// Constructs a Pdf object for the document located at `path`.
// `path`   - file path; it is logged and passed to pdf.Load().
// `runner` - IRunnable taken by unique_ptr; ownership is moved into the `runner` member.
// NOTE(review): only part of this constructor is visible here (the `try` opener, the braces
// and whatever follows the error log are missing) - confirm details against the full file.
33 Pdf::Pdf(std::string path, std::unique_ptr<IRunnable> runner)
34 : runner(std::move(runner))
// Debug-log the path that is about to be opened.
41 LOGD("%s", path.c_str());
// Load the document into the `pdf` member.
44 pdf.Load(path.c_str());
// A PoDoFo::PdfError is caught and reported through the error log; presumably it originates
// from the Load() call above - the matching `try` is not visible, so verify.
46 } catch (const PoDoFo::PdfError& e) {
47 LOGE("Initialization failed : %s", e.what());
// Visits the pages in index order (0 .. GetPageCount() - 1), extracts each page's text with
// parseTextFromPage() and passes it to runner->run(). The result of run() is tested; what
// happens when it is truthy is not visible in this view (the enclosing method's signature
// and the branch body are missing).
// NOTE(review): `n` is a signed int while parseTextFromPage() takes an unsigned int index.
56 for (int n = 0; n < pdf.GetPageCount(); ++n)
57 if (runner->run(parseTextFromPage(n)))
// Visits the pages in index order (0 .. GetPageCount() - 1), extracts each page's text with
// parseTextFromPage() and passes it to runner->run(). Unlike the loop above, the result of
// run() is ignored, so every page is always processed.
// NOTE(review): the enclosing method's signature is not visible in this view.
68 for (int n = 0; n < pdf.GetPageCount(); ++n)
69 runner->run(parseTextFromPage(n));
// Extracts the text of one page by scanning the page's content tokens and decoding the
// operands of the text-showing operators (Tj, ', ", TJ) with the currently selected font.
// `index` - page index handed to pdf.GetPage().
// Returns a std::string; decoded text is appended as UTF-8 to `fullText`.
// NOTE(review): several lines of this function are not visible in this view (the declarations
// of `tok` and `fullText`, the bodies of most guards, the operand-stack push/pop handling and
// the end of the function) - the comments below describe only what the visible lines show.
72 std::string Pdf::parseTextFromPage(unsigned int index)
// Token kind, token value and keyword state filled in by the tokenizer on each iteration.
76 PoDoFo::EPdfContentsType type;
77 PoDoFo::PdfVariant var;
// Font selected by the most recent "Tf" operator; null until one has been seen.
// NOTE(review): `nullptr` would be the idiomatic C++11 spelling of NULL.
78 PoDoFo::PdfFont *cur_font = NULL;
// Tracks whether the scanner is currently between a "BT" and an "ET" keyword.
79 bool text_block = false;
// Operand stack: operators read their arguments from here.
81 std::stack<PoDoFo::PdfVariant> stack;
// Scratch values reused for every decoded string / TJ array.
82 PoDoFo::PdfString unicode;
83 PoDoFo::PdfArray array;
// NOTE(review): no null check on `page` is visible before it is used below - confirm.
85 PoDoFo::PdfPage* page = pdf.GetPage(index);
89 PoDoFo::PdfContentsTokenizer tokenizer(page);
// Read tokens until the tokenizer reports that there are no more.
91 while (tokenizer.ReadNext(type, tok, var)) {
// Anything that is not a keyword takes this branch; presumably it is pushed onto `stack`
// as an operand (branch body not visible - verify).
92 if (type != PoDoFo::ePdfContentsType_Keyword) {
// "BT" is only honoured outside a text block and "ET" only inside one; presumably these
// branches toggle `text_block` (bodies not visible - verify).
99 if (!text_block && strcmp(tok, "BT") == 0) {
102 } else if (text_block && strcmp(tok, "ET") == 0) {
// "Tf": font selection. Fewer than two operands on the stack is treated as a special case
// (handling not visible here).
109 if (strcmp(tok, "Tf") == 0) {
110 if (stack.size() < 2) {
// Resolve the name on top of the stack in the page's "Font" resources and fetch the
// matching PdfFont from the document.
116 cur_font = pdf.GetFont(page->GetFromResources(PoDoFo::PdfName("Font"), stack.top().GetName()));
// "Tj", "'" and "\"": the string operand is taken from the top of the stack.
117 } else if (strcmp(tok, "Tj") == 0 || strcmp(tok, "'") == 0 || strcmp(tok, "\"") == 0) {
// Guard: decoding needs both a font and its encoding (action taken otherwise not visible).
121 if (!cur_font || !cur_font->GetEncoding())
// Decode the string through the font's encoding and append its UTF-8 form to the result.
124 unicode = cur_font->GetEncoding()->ConvertToUnicode(stack.top().GetString(), cur_font);
125 fullText += unicode.GetStringUtf8();
// "TJ": the operand on top of the stack is an array; only its string elements are decoded.
128 } else if (strcmp(tok, "TJ") == 0) {
// Copy the array operand so its elements can be walked by index.
132 array = stack.top().GetArray();
135 for (int i = 0; i < static_cast<int>(array.GetSize()); i++) {
// Elements that are neither strings nor hex strings are skipped.
136 if (array[i].IsString() || array[i].IsHexString()) {
// Same font/encoding guard as for "Tj" (action taken otherwise not visible).
137 if (!cur_font || !cur_font->GetEncoding())
// Decode this element and append its UTF-8 form to the result.
140 unicode = cur_font->GetEncoding()->ConvertToUnicode(array[i].GetString(), cur_font);
141 fullText += unicode.GetStringUtf8();