1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
7 #include "../../include/fpdfapi/fpdf_page.h"
8 #include "../../include/fpdfapi/fpdf_pageobj.h"
9 #include "../../include/fpdftext/fpdf_text.h"
12 #if !defined(_FPDFAPI_MINI_) || defined(_FXCORE_FEATURE_ALL_)
13 extern FX_LPCSTR FCS_GetAltStr(FX_WCHAR);
// Converts a single wide character to a byte string for the code page
// `destcp`; `defchar` is the final fallback text.
// NOTE(review): the conditions guarding each return, and whatever separates
// the two conversion paths below (possibly a platform #if/#else), are not
// visible in this view -- confirm before changing control flow.
14 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, int destcp, FX_LPCSTR defchar)
// Emit the character as one byte via a narrowing cast to char.
18 return CFX_ByteString((char)unicode);
// Ask FCS_GetAltStr for a replacement string for this character.
20 FX_LPCSTR altstr = FCS_GetAltStr(unicode);
22 return CFX_ByteString(altstr, -1);
// Otherwise hand back the caller-supplied default text.
24 return CFX_ByteString(defchar, -1);
// Second path: convert through FXSYS_WideCharToMultiByte into `buf`
// (the capacity argument passed is 10). Presumably bDef reports that a
// default character was substituted -- TODO confirm against the helper.
28 int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10, NULL, &bDef);
// `ret` is used as the byte length of the converted text.
30 return CFX_ByteString(buf, ret);
// Same alternative-string / default fallback chain as in the first path.
32 FX_LPCSTR altstr = FCS_GetAltStr(unicode);
34 return CFX_ByteString(altstr, -1);
36 return CFX_ByteString(defchar, -1);
// Default constructor for CTextPage.
// NOTE(review): the constructor body is not visible in this view.
38 CTextPage::CTextPage()
// Destructor: walks both pointer arrays owned by the page (m_BaseLines and
// m_TextColumns) and fetches each stored element with its concrete type.
// NOTE(review): the statements that release each element are not visible
// here -- presumably each pointer is deleted; confirm.
41 CTextPage::~CTextPage()
// Visit every baseline stored in m_BaseLines.
44 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
45 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
// Visit every column stored in m_TextColumns.
48 for (i = 0; i < m_TextColumns.GetSize(); i ++) {
49 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
// Feeds one page object into the text layout model.
// Only objects whose m_Type is PDFPAGE_TEXT are treated as text: the object
// is cast to CPDF_TextObject, its per-character positions are computed, a
// space width is estimated, and the characters are handed to InsertTextBox
// either one by one (non-zero baseline angle) or grouped into segments.
// NOTE(review): several declarations (spacew, cc, char_box, str,
// space_count) and the early-exit statements are not visible in this view.
53 void CTextPage::ProcessObject(CPDF_PageObject* pObject)
// Non-text objects take this branch (presumably an early return -- confirm).
55 if (pObject->m_Type != PDFPAGE_TEXT) {
58 CPDF_TextObject* pText = (CPDF_TextObject*)pObject;
59 CPDF_Font* pFont = pText->m_TextState.GetFont();
60 int count = pText->CountItems();
// Two floats are reserved per item and filled by CalcCharPos; below they are
// read as [cc * 2] = left and [cc * 2 + 1] = right.
// NOTE(review): the matching free for this FX_Alloc is not visible here;
// confirm every exit path releases pPosArray.
61 FX_FLOAT* pPosArray = FX_Alloc(FX_FLOAT, count * 2);
63 pText->CalcCharPos(pPosArray);
65 FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH();
66 FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV();
// Look up the font's char code for U+0020 to size a space.
67 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
// Note: FX_DWORD compared against the int literal -1.
69 if (space_charcode != -1) {
// Glyph width is scaled by the horizontal font size and divided by 1000.
70 spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
// Alternative estimate: a quarter of the horizontal font size.
73 spacew = fontsize_h / 4;
// Text with a non-zero baseline angle: every character becomes its own box.
75 if (pText->m_TextState.GetBaselineAngle() != 0) {
77 CFX_AffineMatrix matrix;
78 pText->GetTextMatrix(&matrix);
79 for (int i = 0; i < pText->m_nChars; i ++) {
// With exactly one character the m_pCharCodes pointer value itself is the
// char code; otherwise it is indexed as an array.
80 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
// (FX_DWORD)-1 entries are special-cased (handling not visible here).
81 if (charcode == (FX_DWORD) - 1) {
85 pFont->GetCharBBox(charcode, char_box);
// Horizontal extent comes from pPosArray when it is non-null, else from the
// glyph box scaled by font size / 1000.
86 FX_FLOAT char_left = pPosArray ? pPosArray[cc * 2] : char_box.left * pText->m_TextState.GetFontSize() / 1000;
87 FX_FLOAT char_right = pPosArray ? pPosArray[cc * 2 + 1] : char_box.right * pText->m_TextState.GetFontSize() / 1000;
88 FX_FLOAT char_top = char_box.top * pText->m_TextState.GetFontSize() / 1000;
89 FX_FLOAT char_bottom = char_box.bottom * pText->m_TextState.GetFontSize() / 1000;
// Transform the character origin (char_left, 0) and its rectangle through
// the text matrix; char_origy is then used as the baseline for this box.
91 FX_FLOAT char_origx, char_origy;
92 matrix.Transform(char_left, 0, char_origx, char_origy);
93 matrix.TransformRect(char_left, char_right, char_top, char_bottom);
95 pFont->AppendChar(str, charcode);
96 InsertTextBox(NULL, char_origy, char_left, char_right, char_top,
97 char_bottom, spacew, fontsize_v, str, pFont);
// Segment path: rescale all positions by horizontal size / nominal size.
// NOTE(review): pPosArray is dereferenced here without the null check used
// in the per-character path above -- confirm it cannot be NULL.
104 FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize();
105 for (int ii = 0; ii < count * 2; ii ++) {
106 pPosArray[ii] *= ratio_h;
// All segments of this object share the object's baseline and vertical box.
108 FX_FLOAT baseline = pText->m_PosY;
109 CTextBaseLine* pBaseLine = NULL;
110 FX_FLOAT topy = pText->m_Top;
111 FX_FLOAT bottomy = pText->m_Bottom;
112 FX_FLOAT leftx = pText->m_Left;
// `segment` accumulates char codes of the run currently being built.
114 CFX_ByteString segment;
116 FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0;
117 for (int i = 0; i < pText->m_nChars; i ++) {
118 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
119 if (charcode == (FX_DWORD) - 1) {
122 FX_FLOAT char_left = pPosArray[cc * 2];
123 FX_FLOAT char_right = pPosArray[cc * 2 + 1];
// Flush the segment when this character starts left of the previous one or
// sits more than half a space width past the previous right edge.
125 if (char_left < last_left || (char_left - last_right) > spacew / 2) {
126 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
127 topy, bottomy, spacew, fontsize_v, segment, pFont);
128 segment_left = char_left;
// More than one counted space also flushes the segment; exactly one is kept
// inside the segment as a literal space. (How space_count is computed is not
// visible here.)
131 if (space_count > 1) {
132 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
133 topy, bottomy, spacew, fontsize_v, segment, pFont);
135 } else if (space_count == 1) {
136 pFont->AppendChar(segment, ' ');
// An empty segment starts at the current character.
138 if (segment.GetLength() == 0) {
139 segment_left = char_left;
141 segment_right = char_right;
142 pFont->AppendChar(segment, charcode);
144 last_left = char_left;
145 last_right = char_right;
// Flush whatever is left after the last character.
147 if (segment.GetLength())
148 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
149 topy, bottomy, spacew, fontsize_v, segment, pFont);
152 static void ConvertPDFString(CFX_ByteString& result, CFX_ByteString& src, CPDF_Font* pFont);
// Adds a run of text to the page model and returns the baseline it landed on.
//   pBaseLine - baseline to reuse, or NULL to search/create one for `basey`.
//   basey     - baseline coordinate used for lookup and for a new line.
//   leftx/rightx/topy/bottomy - box of the run.
//   spacew, fontsize_v        - forwarded to the baseline's InsertTextBox.
//   str   - char codes in the encoding of pFont.
//   pFont - font used to decode `str` into Unicode.
// NOTE(review): return statements are not visible in this view.
153 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, FX_FLOAT basey, FX_FLOAT leftx,
154 FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy, FX_FLOAT spacew, FX_FLOAT fontsize_v,
155 CFX_ByteString& str, CPDF_Font* pFont)
// Empty input is special-cased (presumably returns early -- confirm).
157 if (str.GetLength() == 0) {
160 if (pBaseLine == NULL) {
// Scan existing baselines for an exact match on the baseline coordinate.
162 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
163 CTextBaseLine* pExistLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
164 if (pExistLine->m_BaseLine == basey) {
165 pBaseLine = pExistLine;
// A stored line below `basey` is tested for; presumably the scan stops here
// so `i` becomes the insertion index for a descending order -- confirm.
168 if (pExistLine->m_BaseLine < basey) {
// No match: create a new baseline and insert it at index i.
172 if (pBaseLine == NULL) {
173 pBaseLine = FX_NEW CTextBaseLine;
174 if (NULL == pBaseLine) {
177 pBaseLine->m_BaseLine = basey;
178 m_BaseLines.InsertAt(i, pBaseLine);
// Decode the byte string one char code at a time through the font.
182 FX_LPCSTR pStr = str;
183 int len = str.GetLength(), offset = 0;
184 while (offset < len) {
185 FX_DWORD ch = pFont->GetNextChar(pStr, offset);
186 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch);
// With no Unicode mapping, the raw char code is appended as a wide char.
187 if (unicode_str.IsEmpty()) {
188 text += (FX_WCHAR)ch;
194 pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, text);
// Produces the page's text lines from the collected baselines.
//   lines     - receives the output (the append itself is not visible here).
//   iMinWidth - width passed to each baseline's WriteOutput; it is raised to
//               the computed MaxWidth when that is larger.
// NOTE(review): many statements (declarations, guards, frees) are missing
// from this view; the comments below describe only what is visible.
197 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth)
199 FX_FLOAT lastheight = -1;
200 FX_FLOAT lastbaseline = -1;
201 FX_FLOAT MinLeftX = 1000000;
202 FX_FLOAT MaxRightX = 0;
// Pass 1: find the horizontal extent over all baselines that report a width.
204 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
205 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
206 FX_FLOAT leftx, rightx;
207 if (pBaseLine->GetWidth(leftx, rightx)) {
208 if (leftx < MinLeftX) {
211 if (rightx > MaxRightX) {
// Pass 2: merge adjacent boxes inside each baseline.
216 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
217 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
218 pBaseLine->MergeBoxes();
// Pass 3: fold a baseline into its predecessor when CanMerge allows it.
// NOTE(review): the array is modified during iteration; deletion of the
// removed line and any index adjustment are not visible here -- confirm.
220 for (i = 1; i < m_BaseLines.GetSize(); i ++) {
221 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
222 CTextBaseLine* pPrevLine = (CTextBaseLine*)m_BaseLines.GetAt(i - 1);
223 if (pBaseLine->CanMerge(pPrevLine)) {
224 pPrevLine->Merge(pBaseLine);
226 m_BaseLines.RemoveAt(i);
// One estimated character width count per baseline.
// NOTE(review): the matching free for `widths` is not visible here.
231 int* widths = FX_Alloc(int, m_BaseLines.GetSize());
233 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
235 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
237 FX_FLOAT TotalWidth = 0;
239 pBaseLine->CountChars(TotalChars, TotalWidth, minchars);
// NOTE(review): any guard against TotalChars == 0 is not visible here.
241 FX_FLOAT charwidth = TotalWidth / TotalChars;
// Page extent divided by the line's average character width.
242 widths[i] = (int)((MaxRightX - MinLeftX) / charwidth);
// Values above 1000 are special-cased (the assignment is not visible).
244 if (widths[i] > 1000) {
// Never go below the minimum reported by CountChars.
247 if (widths[i] < minchars) {
248 widths[i] = minchars;
// Average of the per-line widths, rounded by adding 0.5 before truncation.
251 int AvgWidth = 0, widthcount = 0;
252 for (i = 0; i < m_BaseLines.GetSize(); i ++)
254 AvgWidth += widths[i];
257 AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5);
// Largest per-line width, capped at 6/5 of the average.
259 for (i = 0; i < m_BaseLines.GetSize(); i ++)
260 if (MaxWidth < widths[i]) {
261 MaxWidth = widths[i];
263 if (MaxWidth > AvgWidth * 6 / 5) {
264 MaxWidth = AvgWidth * 6 / 5;
// Widen the requested output width if the estimate is larger.
267 if (iMinWidth < MaxWidth) {
268 iMinWidth = MaxWidth;
// Another MergeBoxes pass over every baseline (its guard, if any, is not
// visible here).
272 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
273 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
274 pBaseLine->MergeBoxes();
// Final pass: render each baseline into a string.
279 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
280 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
// lastheight starts at -1, so this is skipped for the first baseline.
281 if (lastheight >= 0) {
282 FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine;
// Vertical gap of at least 1.5x either line's font size; presumably a blank
// line is emitted here (the statement is not visible) -- confirm.
283 if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) {
287 lastheight = pBaseLine->m_MaxFontSizeV;
288 lastbaseline = pBaseLine->m_BaseLine;
290 pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth);
// Appends the normalized form of `wChar` to `sDest`.
// The character is first passed through FX_GetMirrorChar, then
// FX_Unicode_GetNormalization is called twice: once with a NULL buffer to
// obtain a count, and again with a buffer of that size to fetch the data.
// NOTE(review): the handling of the count (e.g. zero) and the delete[] for
// the buffer are not visible in this view -- confirm the buffer is released.
294 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest)
296 wChar = FX_GetMirrorChar(wChar, TRUE, FALSE);
297 FX_LPWSTR pDst = NULL;
// First call: query how many characters the normalization yields.
298 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
// Second call: fill a freshly allocated buffer of nCount characters.
303 pDst = new FX_WCHAR[nCount];
304 FX_Unicode_GetNormalization(wChar, pDst);
305 for (int nIndex = 0; nIndex < nCount; nIndex++) {
306 sDest += pDst[nIndex];
// Rewrites `str` using bidi run information from IFX_BidiChar and
// NormalizeCompositeChar, building the result in sBuffer.
// The run list `order` is read as consecutive triples: entries j, j+1, j+2
// are used as start, count1 and ret. The ret values tested are 0, 1 and 2;
// their meaning is defined by IFX_BidiChar::GetBidiInfo, not shown here.
// NOTE(review): large parts of this function (declaration of `order`, the
// run bookkeeping, release of BidiChar, the final assignment to str) are not
// visible in this view; treat the comments below as partial.
310 void NormalizeString(CFX_WideString& str)
// Nothing to do for an empty string (branch body not visible).
312 if (str.GetLength() <= 0) {
315 CFX_WideString sBuffer;
316 IFX_BidiChar* BidiChar = IFX_BidiChar::Create();
317 if (NULL == BidiChar) {
321 FX_BOOL bR2L = FALSE;
322 FX_INT32 start = 0, count = 0, i = 0;
// Tallies tested below to choose between the two output orderings.
323 int nR2L = 0, nL2R = 0;
// Feed characters to the bidi helper; when AppendChar returns non-zero,
// fetch the run via GetBidiInfo.
324 for (i = 0; i < str.GetLength(); i++) {
325 if(BidiChar->AppendChar(str.GetAt(i))) {
326 FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
333 } else if (ret == 1) {
// Flush the final pending run.
339 if(BidiChar->EndChar()) {
340 FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
347 } else if(ret == 1) {
// Branch taken when nR2L is positive and at least nL2R: walk the triples
// from the end of `order` towards the front.
352 if(nR2L > 0 && nR2L >= nL2R) {
356 int count = order.GetSize();
357 for(int j = count - 1; j > 0; j -= 3) {
358 int ret = order.GetAt(j);
359 int start = order.GetAt(j - 2);
360 int count1 = order.GetAt(j - 1);
// Runs with ret 2 or 0 are emitted back to front, character by character.
361 if(ret == 2 || ret == 0) {
362 for(int i = start + count1 - 1; i >= start; i--) {
363 NormalizeCompositeChar(str[i], sBuffer);
367 FX_BOOL bSymbol = FALSE;
// Scan `order` entries at index i until one equals 2 (the update of i is
// not visible here).
368 while(i > 0 && order.GetAt(i) != 2) {
369 bSymbol = !order.GetAt(i);
372 int end = start + count1 ;
380 for(int m = start; m < end; m++) {
386 for(; n <= i; n += 3) {
387 int start = order.GetAt(n - 2);
388 int count1 = order.GetAt(n - 1);
389 int end = start + count1 ;
390 for(int m = start; m < end; m++) {
// Other branch: walk the triples from the front of `order`.
398 int count = order.GetSize();
399 FX_BOOL bL2R = FALSE;
400 for(int j = 0; j < count; j += 3) {
401 int ret = order.GetAt(j + 2);
402 int start = order.GetAt(j);
403 int count1 = order.GetAt(j + 1);
404 if(ret == 2 || (j == 0 && ret == 0 && !bL2R)) {
406 while(bR2L && i < count) {
407 if(order.GetAt(i + 2) == 1) {
418 int end = str.GetLength() - 1;
420 end = order.GetAt(i) - 1;
// NOTE(review): this loop iterates over n but indexes str with i, so the
// same character would be appended on every iteration. It looks like this
// should be str[n] -- confirm against the intended behaviour.
423 for(int n = end; n >= start; n--) {
424 NormalizeCompositeChar(str[i], sBuffer);
427 int end = start + count1 ;
428 for(int i = start; i < end; i++) {
// Checks whether `str` consists only of the characters '0'-'9', '-', '+',
// '.' and ' '. Any other character triggers the branch below (presumably
// returning FALSE; the return statements are not visible in this view).
438 static FX_BOOL IsNumber(CFX_WideString& str)
440 for (int i = 0; i < str.GetLength(); i ++) {
441 FX_WCHAR ch = str[i];
442 if ((ch < '0' || ch > '9') && ch != '-' && ch != '+' && ch != '.' && ch != ' ') {
// Builds m_TextColumns from the right edges of all text boxes, filters the
// columns by m_Count against a quarter of the baseline count, and then
// assigns a column to every box whose text passes IsNumber.
// NOTE(review): several statements (count increment, continue, delete of
// removed columns, index adjustment) are not visible in this view.
448 void CTextPage::FindColumns()
// Pass 1: cluster the right edges of the boxes into columns.
451 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
452 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
453 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
454 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
455 CTextColumn* pColumn = FindColumn(pTextBox->m_Right);
// No column near this right edge yet: start one with a single member.
456 if (pColumn == NULL) {
457 pColumn = FX_NEW CTextColumn;
459 pColumn->m_Count = 1;
460 pColumn->m_AvgPos = pTextBox->m_Right;
461 pColumn->m_TextPos = -1;
462 m_TextColumns.Add(pColumn);
// Existing column: fold this edge into the running average position.
465 pColumn->m_AvgPos = (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) /
466 (pColumn->m_Count + 1);
// Pass 2: columns are checked against a count threshold of a quarter of the
// baselines; the RemoveAt below applies to those not handled by the visible
// `>= mincount` branch (presumably the ones that fall short -- confirm).
471 int mincount = m_BaseLines.GetSize() / 4;
472 for (i = 0; i < m_TextColumns.GetSize(); i ++) {
473 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
474 if (pTextColumn->m_Count >= mincount) {
478 m_TextColumns.RemoveAt(i);
// Pass 3: boxes whose text passes IsNumber get a column looked up from
// their right edge.
481 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
482 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
483 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
484 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
485 if (IsNumber(pTextBox->m_Text)) {
486 pTextBox->m_pColumn = FindColumn(pTextBox->m_Right);
// Searches m_TextColumns for a column whose m_AvgPos lies strictly within
// one unit of `xpos` (xpos - 1 < m_AvgPos < xpos + 1).
// NOTE(review): the return statements are not visible in this view;
// callers compare the result against NULL.
491 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos)
493 for (int i = 0; i < m_TextColumns.GetSize(); i ++) {
494 CTextColumn* pColumn = (CTextColumn*)m_TextColumns.GetAt(i);
495 if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) {
// Takes a text object and returns nothing.
// NOTE(review): the body is not visible in this view.
501 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj)
// Default constructor for CTextBaseLine.
// NOTE(review): member initialisation is not visible in this view.
504 CTextBaseLine::CTextBaseLine()
// Destructor: visits every CTextBox pointer stored in m_TextList.
// NOTE(review): the statement that releases each box is not visible here --
// presumably each one is deleted; confirm.
510 CTextBaseLine::~CTextBaseLine()
512 for (int i = 0; i < m_TextList.GetSize(); i ++) {
513 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
// Creates a CTextBox for `text` and inserts it into m_TextList by left edge:
// the scan tests for the first existing box whose m_Left is greater than
// leftx, and the new box is inserted at index i. Also updates the line's
// aggregate m_MaxFontSizeV (and, per the visible test, m_Bottom).
// NOTE(review): the m_Top update, the loop break and the m_Top assignment
// on the new box are not visible in this view.
517 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy,
518 FX_FLOAT spacew, FX_FLOAT fontsize_v, const CFX_WideString& text)
// Line bottom is compared with the new box's bottom.
523 if (m_Bottom > bottomy) {
// Track the largest vertical font size on this line.
526 if (m_MaxFontSizeV < fontsize_v) {
527 m_MaxFontSizeV = fontsize_v;
// Find the insertion index that keeps boxes ordered by left edge.
530 for (i = 0; i < m_TextList.GetSize(); i ++) {
531 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
532 if (pText->m_Left > leftx) {
536 CTextBox* pText = FX_NEW CTextBox;
540 pText->m_Text = text;
541 pText->m_Left = leftx;
542 pText->m_Right = rightx;
544 pText->m_Bottom = bottomy;
545 pText->m_SpaceWidth = spacew;
546 pText->m_FontSizeV = fontsize_v;
// New boxes start without a column assignment.
547 pText->m_pColumn = NULL;
548 m_TextList.InsertAt(i, pText);
550 FX_BOOL GetIntersection(FX_FLOAT low1, FX_FLOAT high1, FX_FLOAT low2, FX_FLOAT high2,
551 FX_FLOAT& interlow, FX_FLOAT& interhigh);
// Decides whether this baseline and `pOther` may be merged into one line.
// Visible checks: the two lines must intersect vertically, the overlap is
// compared with half of each line's height, and every pair of boxes that
// overlaps horizontally is tested further.
// NOTE(review): the return statements for each check are not visible in
// this view, so the exact accept/reject outcome of each test is unconfirmed.
552 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther)
554 FX_FLOAT inter_top, inter_bottom;
// Vertical overlap of the two lines' [bottom, top] ranges.
555 if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top,
556 inter_bottom, inter_top)) {
559 FX_FLOAT inter_h = inter_top - inter_bottom;
// Overlap smaller than half the height of both lines.
560 if (inter_h < (m_Top - m_Bottom) / 2 && inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) {
// Absolute distance between the two baselines.
563 FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine);
564 for (int i = 0; i < m_TextList.GetSize(); i ++) {
565 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
566 for (int j = 0; j < pOther->m_TextList.GetSize(); j ++) {
567 CTextBox* pOtherText = (CTextBox*)pOther->m_TextList.GetAt(j);
568 FX_FLOAT inter_left, inter_right;
// Horizontal overlap of the two boxes.
569 if (!GetIntersection(pText->m_Left, pText->m_Right,
570 pOtherText->m_Left, pOtherText->m_Right, inter_left, inter_right)) {
573 FX_FLOAT inter_w = inter_right - inter_left;
// Overlap narrower than half a space width for both boxes.
574 if (inter_w < pText->m_SpaceWidth / 2 && inter_w < pOtherText->m_SpaceWidth / 2) {
// NOTE(review): height is computed as m_Top - m_Bottom earlier in this
// function but as m_Bottom - m_Top here, so this threshold has the opposite
// sign -- looks inverted; confirm the intended comparison.
577 if (dy >= (pText->m_Bottom - pText->m_Top) / 2 ||
578 dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) {
// Copies every text box of `pOther` into this baseline by re-inserting its
// geometry, space width, font size and text through InsertTextBox. The
// boxes of `pOther` themselves are left in place by the visible code.
585 void CTextBaseLine::Merge(CTextBaseLine* pOther)
587 for (int i = 0; i < pOther->m_TextList.GetSize(); i ++) {
588 CTextBox* pText = (CTextBox*)pOther->m_TextList.GetAt(i);
589 InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom,
590 pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text);
// Reports the horizontal extent of the line, ignoring boxes whose text is
// exactly L" ": leftx is the m_Left of the first such box from the front,
// rightx the m_Right of the first such box from the back.
// NOTE(review): loop breaks and return statements are not visible in this
// view; the caller treats a non-zero result as "extent available".
593 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx)
// Forward scan for the first box that is not a lone space.
596 for (i = 0; i < m_TextList.GetSize(); i ++) {
597 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
598 if (pText->m_Text != L" ") {
// Scan ran off the end: no qualifying box on this line.
602 if (i == m_TextList.GetSize()) {
605 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
606 leftx = pText->m_Left;
// Backward scan for the last box that is not a lone space.
607 for (i = m_TextList.GetSize() - 1; i >= 0; i --) {
608 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
609 if (pText->m_Text != L" ") {
613 pText = (CTextBox*)m_TextList.GetAt(i);
614 rightx = pText->m_Right;
// Joins neighbouring text boxes on this line when the gap between them is
// under twice the space width, inserting separator text for larger gaps.
// NOTE(review): the loop construct, the index updates and the deletion of
// the absorbed box are not visible in this view.
617 void CTextBaseLine::MergeBoxes()
// Stop once there is no following box to compare with.
621 if (i >= m_TextList.GetSize() - 1) {
624 CTextBox* pThisText = (CTextBox*)m_TextList.GetAt(i);
625 CTextBox* pNextText = (CTextBox*)m_TextList.GetAt(i + 1);
// Gap between this box's right edge and the next box's left edge.
626 FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right;
// Use this box's space width, or the next box's when this one's is zero.
627 FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) ?
628 pNextText->m_SpaceWidth : pThisText->m_SpaceWidth;
629 if (spacew > 0.0 && dx < spacew * 2) {
630 pThisText->m_Right = pNextText->m_Right;
// Separator depends on the gap: above 1.5 space widths the string literal
// below is appended, above a third of a space width a single L' '.
// NOTE(review): both literals read as one space here; verify the wide-gap
// literal was not meant to be longer.
631 if (dx > spacew * 1.5) {
632 pThisText->m_Text += L" ";
633 } else if (dx > spacew / 3) {
634 pThisText->m_Text += L' ';
636 pThisText->m_Text += pNextText->m_Text;
// Carry forward the next box's space width unless it is zero.
637 pThisText->m_SpaceWidth = pNextText->m_SpaceWidth == 0.0 ?
638 spacew : pNextText->m_SpaceWidth;
639 m_TextList.RemoveAt(i + 1);
// Renders this baseline's boxes into `str`, mapping each box's horizontal
// position to a character offset: (position - leftx) * iTextWidth /
// pagewidth, rounded by adding 0.5 before truncation.
// NOTE(review): the rest of the parameter list, the declarations of lastpos
// and xpos, and the statements that append to `str` are not visible in this
// view.
646 void CTextBaseLine::WriteOutput(CFX_WideString& str, FX_FLOAT leftx, FX_FLOAT pagewidth,
650 for (int i = 0; i < m_TextList.GetSize(); i ++) {
651 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
// Boxes assigned to a column: the offset is derived from the column's
// average position and then reduced by the text length.
653 if (pText->m_pColumn) {
654 xpos = (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth + 0.5);
655 xpos -= pText->m_Text.GetLength();
// Other boxes: the offset is derived from the box's left edge.
657 xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5);
// Computed offset does not advance past the previous output position
// (the handling of this case is not visible here).
659 if (xpos <= lastpos) {
// Iterates over the offsets between the previous output and this box
// (loop body not visible; presumably padding -- confirm).
662 for (int j = lastpos + 1; j < xpos; j ++) {
// Normalize a copy so the stored box text stays unchanged.
665 CFX_WideString sSrc(pText->m_Text);
666 NormalizeString(sSrc);
// Length is taken from the stored text, not from the normalized copy.
669 lastpos = xpos + pText->m_Text.GetLength();
// Accumulates statistics over the boxes on this line:
//   count    += number of characters in each counted box,
//   width    += horizontal extent (m_Right - m_Left) of each counted box,
//   minchars += text length + 1 per counted box.
// NOTE(review): any initialisation of the out-parameters and the statement
// inside the narrow-box branch are not visible in this view; presumably
// boxes narrower than 0.002 are skipped -- confirm.
672 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars)
675 for (int i = 0; i < m_TextList.GetSize(); i ++) {
676 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
677 if (pText->m_Right - pText->m_Left < 0.002) {
680 count += pText->m_Text.GetLength();
681 width += pText->m_Right - pText->m_Left;
682 minchars += pText->m_Text.GetLength() + 1;
685 #define PI 3.1415926535897932384626433832795
// Examines the baseline angle of the text objects on `page`, tallies them
// into three buckets, and when one bucket holds more than two thirds of
// total_count applies the corresponding matrix to the page and to page_bbox.
// NOTE(review): the iteration construct, the total_count increment, the
// filtering of angles and the guards around `index` are not visible in this
// view.
686 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox)
688 int total_count = 0, rotated_count[3] = {0, 0, 0};
689 FX_POSITION pos = page.GetFirstObjectPosition();
691 CPDF_PageObject* pObj = page.GetNextObject(pos);
// Only text objects are considered.
692 if (pObj->m_Type != PDFPAGE_TEXT) {
696 CPDF_TextObject* pText = (CPDF_TextObject*)pObj;
697 FX_FLOAT angle = pText->m_TextState.GetBaselineAngle();
// Scale by 180 / PI and round by adding 0.5 before truncation to int.
701 int degree = (int)(angle * 180 / PI + 0.5);
// Bucket index from the number of 90-degree steps.
// NOTE(review): this expression yields -1 when degree / 90 % 3 is 0; the
// guard that must precede the array write below is not visible -- confirm.
708 int index = degree / 90 % 3 - 1;
712 rotated_count[index] ++;
// Nothing counted: no decision can be made (branch body not visible).
714 if (total_count == 0) {
// Pick the matrix for whichever bucket holds more than 2/3 of the total.
717 CFX_AffineMatrix matrix;
718 if (rotated_count[0] > total_count * 2 / 3) {
719 matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight());
720 } else if (rotated_count[1] > total_count * 2 / 3) {
721 matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight());
722 } else if (rotated_count[2] > total_count * 2 / 3) {
723 matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0);
// Apply the same matrix to the page content and to the caller's bbox.
727 page.Transform(matrix);
728 page_bbox.Transform(&matrix);
// Extracts the text of one page as wide-string lines.
//   lines     - passed to CTextPage::WriteOutput to receive the result.
//   pDoc      - document that owns the page.
//   pPage     - page dictionary to load.
//   iMinWidth - forwarded to CTextPage::WriteOutput.
//   flags     - PDF2TXT_* bits tested below (AUTO_ROTATE, AUTO_WIDTH,
//               KEEP_COLUMN, INCLUDE_INVISIBLE).
// NOTE(review): the declarations of `page` and `texts`, the object loop
// construct and any argument checks are not visible in this view.
730 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
731 int iMinWidth, FX_DWORD flags)
738 page.Load(pDoc, pPage);
// Parse with m_bTextOnly set and m_bSeparateForm cleared.
739 CPDF_ParseOptions options;
740 options.m_bTextOnly = TRUE;
741 options.m_bSeparateForm = FALSE;
742 page.ParseContent(&options);
743 CFX_FloatRect page_bbox = page.GetPageBBox();
// Optional step: CheckRotate may transform both the page and page_bbox.
744 if (flags & PDF2TXT_AUTO_ROTATE) {
745 CheckRotate(page, page_bbox);
// Configure the layout model from the flags.
748 texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH;
749 texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN;
750 texts.m_bBreakSpace = TRUE;
751 FX_POSITION pos = page.GetFirstObjectPosition();
753 CPDF_PageObject* pObject = page.GetNextObject(pos);
// Unless PDF2TXT_INCLUDE_INVISIBLE is set, objects whose rectangle is not
// contained in the page bbox take the branch below (presumably skipped --
// confirm).
754 if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) {
755 CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right, pObject->m_Top);
756 if (!page_bbox.Contains(rect)) {
760 texts.ProcessObject(pObject);
762 texts.WriteOutput(lines, iMinWidth);
// Byte-string variant of PDF_GetPageText_Unicode: obtains the wide-string
// lines, then converts every character with CharFromUnicodeAlt using the
// code page from FXSYS_GetACP() and "?" as the fallback text.
// NOTE(review): the declaration of `str` and the statement that stores each
// converted line into `lines` are not visible in this view.
764 void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
765 int iMinWidth, FX_DWORD flags)
768 CFX_WideStringArray wlines;
769 PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags);
770 for (int i = 0; i < wlines.GetSize(); i ++) {
771 CFX_WideString wstr = wlines[i];
// Convert character by character and append to the output string.
773 for (int c = 0; c < wstr.GetLength(); c ++) {
774 str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?");
780 extern void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects* pPage, FX_BOOL bUseLF,
781 CFX_PtrArray* pObjArray);
// Loads and parses one page (m_bTextOnly set, m_bSeparateForm cleared) and
// delegates to _PDF_GetTextStream_Unicode to write its text into `buffer`,
// passing TRUE for bUseLF and NULL for the object array.
// `flags` is not used by any statement visible here.
// NOTE(review): the declaration of `page` is not visible in this view.
782 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPDF_Dictionary* pPage, FX_DWORD flags)
// Size hint for the output buffer.
784 buffer.EstimateSize(0, 10240);
786 page.Load(pDoc, pPage);
787 CPDF_ParseOptions options;
788 options.m_bTextOnly = TRUE;
789 options.m_bSeparateForm = FALSE;
790 page.ParseContent(&options);
791 _PDF_GetTextStream_Unicode(buffer, &page, TRUE, NULL);