4 * Copyright (c) 2022 Samsung Electronics Co., Ltd. All rights reserved.
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
20 #include "media-ebook-plugin-pdf.h"
29 #define LOG_TAG "MEDIA_SERVICE"
31 using namespace EBook;
/**
 * Constructs a Pdf object for the document located at @p path.
 *
 * @param path   Path of the PDF file; it is logged and then handed to pdf.Load().
 * @param runner Callback object whose ownership is moved into the `runner`
 *               member; other members of this class call runner->run() with
 *               the text produced by parseTextFromPage().
 */
33 Pdf::Pdf(std::string path, std::unique_ptr<IRunnable> runner)
34 : runner(std::move(runner))
41 LOGD("%s", path.c_str());
// Switch off PoDoFo's own error logging before the document is loaded.
44 PoDoFo::PdfError::EnableLogging(false);
// Load the document at `path` into the `pdf` member.
46 pdf.Load(path.c_str());
// A PoDoFo::PdfError is caught by const reference and its message is logged.
48 } catch (const PoDoFo::PdfError& e) {
49 LOGE("Initialization failed : %s", e.what());
58 for (int n = 0; n < pdf.GetPageCount(); ++n)
59 if (runner->run(parseTextFromPage(n)))
70 for (int n = 0; n < pdf.GetPageCount(); ++n)
71 runner->run(parseTextFromPage(n));
74 std::string Pdf::parseTextFromPage(unsigned int index)
78 PoDoFo::EPdfContentsType type;
79 PoDoFo::PdfVariant var;
80 PoDoFo::PdfFont *cur_font = NULL;
81 bool text_block = false;
83 std::stack<PoDoFo::PdfVariant> stack;
84 PoDoFo::PdfString unicode;
85 PoDoFo::PdfArray array;
87 PoDoFo::PdfPage* page = pdf.GetPage(index);
91 PoDoFo::PdfContentsTokenizer tokenizer(page);
93 while (tokenizer.ReadNext(type, tok, var)) {
94 if (type != PoDoFo::ePdfContentsType_Keyword) {
101 if (!text_block && strcmp(tok, "BT") == 0) {
104 } else if (text_block && strcmp(tok, "ET") == 0) {
111 if (strcmp(tok, "Tf") == 0) {
112 if (stack.size() < 2) {
118 cur_font = pdf.GetFont(page->GetFromResources(PoDoFo::PdfName("Font"), stack.top().GetName()));
119 } else if (strcmp(tok, "Tj") == 0 || strcmp(tok, "'") == 0 || strcmp(tok, "\"") == 0) {
123 if (!cur_font || !cur_font->GetEncoding())
126 unicode = cur_font->GetEncoding()->ConvertToUnicode(stack.top().GetString(), cur_font);
127 fullText += unicode.GetStringUtf8();
130 } else if (strcmp(tok, "TJ") == 0) {
134 array = stack.top().GetArray();
137 for (int i = 0; i < static_cast<int>(array.GetSize()); i++) {
138 if (array[i].IsString() || array[i].IsHexString()) {
139 if (!cur_font || !cur_font->GetEncoding())
142 unicode = cur_font->GetEncoding()->ConvertToUnicode(array[i].GetString(), cur_font);
143 fullText += unicode.GetStringUtf8();