File indexing completed on 2024-05-12 08:32:57
0001 /* 0002 SPDX-FileCopyrightText: 2005 Piotr Szymanski <niedakh@gmail.com> 0003 0004 SPDX-License-Identifier: GPL-2.0-or-later 0005 */ 0006 0007 #include "textpage.h" 0008 #include "textpage_p.h" 0009 0010 #include <QDebug> 0011 0012 #include "area.h" 0013 #include "debug_p.h" 0014 #include "misc.h" 0015 #include "page.h" 0016 #include "page_p.h" 0017 0018 #include <cstring> 0019 0020 #include <QVarLengthArray> 0021 #include <QtAlgorithms> 0022 0023 using namespace Okular; 0024 0025 class SearchPoint 0026 { 0027 public: 0028 SearchPoint() 0029 : offset_begin(-1) 0030 , offset_end(-1) 0031 { 0032 } 0033 0034 /** The TinyTextEntity containing the first character of the match. */ 0035 TextList::ConstIterator it_begin; 0036 0037 /** The TinyTextEntity containing the last character of the match. */ 0038 TextList::ConstIterator it_end; 0039 0040 /** The index of the first character of the match in (*it_begin)->text(). 0041 * Satisfies 0 <= offset_begin < (*it_begin)->text().length(). 0042 */ 0043 int offset_begin; 0044 0045 /** One plus the index of the last character of the match in (*it_end)->text(). 0046 * Satisfies 0 < offset_end <= (*it_end)->text().length(). 0047 */ 0048 int offset_end; 0049 }; 0050 0051 /* text comparison functions */ 0052 0053 static bool CaseInsensitiveCmpFn(QStringView from, QStringView to) 0054 { 0055 #ifdef DEBUG_TEXTPAGE 0056 qDebug(OkularCoreDebug) << from << ":" << to << "(case insensitive)"; 0057 #endif 0058 return from.compare(to, Qt::CaseInsensitive) == 0; 0059 } 0060 0061 static bool CaseSensitiveCmpFn(QStringView from, QStringView to) 0062 { 0063 #ifdef DEBUG_TEXTPAGE 0064 qDebug(OkularCoreDebug) << from << ":" << to << "(case sensitive)"; 0065 #endif 0066 return from.compare(to, Qt::CaseSensitive) == 0; 0067 } 0068 0069 /** 0070 * Returns true iff segments [@p left1, @p right1] and [@p left2, @p right2] on the real line 0071 * overlap within @p threshold percent, i. e. iff the ratio of the length of the 0072 * intersection of the segments to the length of the shortest of the two input segments 0073 * is not smaller than the threshold. 0074 */ 0075 static bool segmentsOverlap(double left1, double right1, double left2, double right2, int threshold) 0076 { 0077 // check if one consumes another fully (speed optimization) 0078 0079 if (left1 <= left2 && right1 >= right2) { 0080 return true; 0081 } 0082 0083 if (left1 >= left2 && right1 <= right2) { 0084 return true; 0085 } 0086 0087 // check if there is overlap above threshold 0088 if (right2 >= left1 && right1 >= left2) { 0089 double overlap = (right2 >= right1) ? right1 - left2 : right2 - left1; 0090 0091 double length1 = right1 - left1, length2 = right2 - left2; 0092 0093 return overlap * 100 >= threshold * qMin(length1, length2); 0094 } 0095 0096 return false; 0097 } 0098 0099 static bool doesConsumeY(const QRect first, const QRect second, int threshold) 0100 { 0101 return segmentsOverlap(first.top(), first.bottom(), second.top(), second.bottom(), threshold); 0102 } 0103 0104 static bool doesConsumeY(const NormalizedRect &first, const NormalizedRect &second, int threshold) 0105 { 0106 return segmentsOverlap(first.top, first.bottom, second.top, second.bottom, threshold); 0107 } 0108 0109 /* 0110 Rationale behind TinyTextEntity: 0111 0112 instead of storing directly a QString for the text of an entity, 0113 we store the UTF-16 data and their length. This way, we save about 0114 4 int's wrt a QString, and we can create a new string from that 0115 raw data (that's the only penalty of that). 0116 Even better, if the string we need to store has at most 0117 MaxStaticChars characters, then we store those in place of the QChar* 0118 that would be used (with new[] + free[]) for the data. 0119 */ 0120 class TinyTextEntity 0121 { 0122 static const int MaxStaticChars = sizeof(void *) / sizeof(QChar); 0123 0124 public: 0125 TinyTextEntity(const QString &text, const NormalizedRect &rect) 0126 : area(rect) 0127 { 0128 Q_ASSERT_X(!text.isEmpty(), "TinyTextEntity", "empty string"); 0129 Q_ASSERT_X(sizeof(d) == sizeof(void *), "TinyTextEntity", "internal storage is wider than QChar*, fix it!"); 0130 length = text.length(); 0131 switch (length) { 0132 #if QT_POINTER_SIZE >= 8 0133 case 4: 0134 d.qc[3] = text.at(3).unicode(); 0135 // fall through 0136 case 3: 0137 d.qc[2] = text.at(2).unicode(); 0138 #endif 0139 // fall through 0140 case 2: 0141 d.qc[1] = text.at(1).unicode(); 0142 // fall through 0143 case 1: 0144 d.qc[0] = text.at(0).unicode(); 0145 break; 0146 default: 0147 d.data = new QChar[length]; 0148 std::memcpy(d.data, text.constData(), length * sizeof(QChar)); 0149 } 0150 } 0151 0152 ~TinyTextEntity() 0153 { 0154 if (length > MaxStaticChars) { 0155 delete[] d.data; 0156 } 0157 } 0158 0159 inline QString text() const 0160 { 0161 return length <= MaxStaticChars ? QString::fromRawData((const QChar *)&d.qc[0], length) : QString::fromRawData(d.data, length); 0162 } 0163 0164 inline NormalizedRect transformedArea(const QTransform &matrix) const 0165 { 0166 NormalizedRect transformed_area = area; 0167 transformed_area.transform(matrix); 0168 return transformed_area; 0169 } 0170 0171 NormalizedRect area; 0172 0173 private: 0174 Q_DISABLE_COPY(TinyTextEntity) 0175 0176 union { 0177 QChar *data; 0178 ushort qc[MaxStaticChars]; 0179 } d; 0180 int length; 0181 }; 0182 0183 TextEntity::TextEntity(const QString &text, NormalizedRect *area) 0184 : m_text(text) 0185 , m_area(area) 0186 , d(nullptr) 0187 { 0188 } 0189 0190 TextEntity::~TextEntity() 0191 { 0192 delete m_area; 0193 } 0194 0195 QString TextEntity::text() const 0196 { 0197 return m_text; 0198 } 0199 0200 NormalizedRect *TextEntity::area() const 0201 { 0202 return m_area; 0203 } 0204 0205 NormalizedRect TextEntity::transformedArea(const QTransform &matrix) const 0206 { 0207 NormalizedRect transformed_area = *m_area; 0208 transformed_area.transform(matrix); 0209 return transformed_area; 0210 } 0211 0212 TextPagePrivate::TextPagePrivate() 0213 : m_page(nullptr) 0214 { 0215 } 0216 0217 TextPagePrivate::~TextPagePrivate() 0218 { 0219 qDeleteAll(m_searchPoints); 0220 qDeleteAll(m_words); 0221 } 0222 0223 TextPage::TextPage() 0224 : d(new TextPagePrivate()) 0225 { 0226 } 0227 0228 TextPage::TextPage(const TextEntity::List &words) 0229 : d(new TextPagePrivate()) 0230 { 0231 TextEntity::List::ConstIterator it = words.constBegin(), itEnd = words.constEnd(); 0232 for (; it != itEnd; ++it) { 0233 TextEntity *e = *it; 0234 if (!e->text().isEmpty()) { 0235 d->m_words.append(new TinyTextEntity(e->text(), *e->area())); 0236 } 0237 delete e; 0238 } 0239 } 0240 0241 TextPage::~TextPage() 0242 { 0243 delete d; 0244 } 0245 0246 void TextPage::append(const QString &text, NormalizedRect *area) 0247 { 0248 if (!text.isEmpty()) { 0249 if (!d->m_words.isEmpty()) { 0250 TinyTextEntity *lastEntity = d->m_words.last(); 0251 const QString concatText = lastEntity->text() + text.normalized(QString::NormalizationForm_KC); 0252 if (concatText != concatText.normalized(QString::NormalizationForm_KC)) { 0253 // If this happens it means that the new text + old one have combined, for example A and ◌̊ form Å 0254 NormalizedRect newArea = *area | lastEntity->area; 0255 delete area; 0256 delete lastEntity; 0257 d->m_words.removeLast(); 0258 d->m_words.append(new TinyTextEntity(concatText.normalized(QString::NormalizationForm_KC), newArea)); 0259 return; 0260 } 0261 } 0262 0263 d->m_words.append(new TinyTextEntity(text.normalized(QString::NormalizationForm_KC), *area)); 0264 } 0265 delete area; 0266 } 0267 0268 struct WordWithCharacters { 0269 WordWithCharacters(TinyTextEntity *w, const TextList &c) 0270 : word(w) 0271 , characters(c) 0272 { 0273 } 0274 0275 inline QString text() const 0276 { 0277 return word->text(); 0278 } 0279 0280 inline const NormalizedRect &area() const 0281 { 0282 return word->area; 0283 } 0284 0285 TinyTextEntity *word; 0286 TextList characters; 0287 }; 0288 typedef QList<WordWithCharacters> WordsWithCharacters; 0289 0290 /** 0291 * We will divide the whole page in some regions depending on the horizontal and 0292 * vertical spacing among different regions. Each region will have an area and an 0293 * associated WordsWithCharacters in sorted order. 0294 */ 0295 class RegionText 0296 { 0297 public: 0298 RegionText() {}; 0299 0300 RegionText(const WordsWithCharacters &wordsWithCharacters, const QRect area) 0301 : m_region_wordWithCharacters(wordsWithCharacters) 0302 , m_area(area) 0303 { 0304 } 0305 0306 inline QString string() const 0307 { 0308 QString res; 0309 for (const WordWithCharacters &word : m_region_wordWithCharacters) { 0310 res += word.text(); 0311 } 0312 return res; 0313 } 0314 0315 inline WordsWithCharacters text() const 0316 { 0317 return m_region_wordWithCharacters; 0318 } 0319 0320 inline QRect area() const 0321 { 0322 return m_area; 0323 } 0324 0325 inline void setArea(const QRect area) 0326 { 0327 m_area = area; 0328 } 0329 0330 inline void setText(const WordsWithCharacters &wordsWithCharacters) 0331 { 0332 m_region_wordWithCharacters = wordsWithCharacters; 0333 } 0334 0335 private: 0336 WordsWithCharacters m_region_wordWithCharacters; 0337 QRect m_area; 0338 }; 0339 0340 RegularAreaRect *TextPage::textArea(TextSelection *sel) const 0341 { 0342 if (d->m_words.isEmpty()) { 0343 return new RegularAreaRect(); 0344 } 0345 0346 /** 0347 It works like this: 0348 There are two cursors, we need to select all the text between them. The coordinates are normalised, leftTop is (0,0) 0349 rightBottom is (1,1), so for cursors start (sx,sy) and end (ex,ey) we start with finding text rectangles under those 0350 points, if not we search for the first that is to the right to it in the same baseline, if none found, then we search 0351 for the first rectangle with a baseline under the cursor, having two points that are the best rectangles to both 0352 of the cursors: (rx,ry)x(tx,ty) for start and (ux,uy)x(vx,vy) for end, we do a 0353 1. (rx,ry)x(1,ty) 0354 2. (0,ty)x(1,uy) 0355 3. (0,uy)x(vx,vy) 0356 0357 To find the closest rectangle to cursor (cx,cy) we search for a rectangle that either contains the cursor 0358 or that has a left border >= cx and bottom border >= cy. 0359 */ 0360 RegularAreaRect *ret = new RegularAreaRect; 0361 0362 PagePrivate *pagePrivate = PagePrivate::get(d->m_page); 0363 const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform(); 0364 const double scaleX = d->m_page->width(); 0365 const double scaleY = d->m_page->height(); 0366 0367 NormalizedPoint startC = sel->start(); 0368 NormalizedPoint endC = sel->end(); 0369 NormalizedPoint temp; 0370 0371 // if startPoint is right to endPoint swap them 0372 if (startC.x > endC.x) { 0373 temp = startC; 0374 startC = endC; 0375 endC = temp; 0376 } 0377 0378 // minX,maxX,minY,maxY gives the bounding rectangle coordinates of the document 0379 const NormalizedRect boundingRect = d->m_page->boundingBox(); 0380 const QRect content = boundingRect.geometry(scaleX, scaleY); 0381 const double minX = content.left(); 0382 const double maxX = content.right(); 0383 const double minY = content.top(); 0384 const double maxY = content.bottom(); 0385 0386 /** 0387 * We will now find out the TinyTextEntity for the startRectangle and TinyTextEntity for 0388 * the endRectangle. We have four cases: 0389 * 0390 * Case 1(a): both startpoint and endpoint are out of the bounding Rectangle and at one side, so the rectangle made of start 0391 * and endPoint are outof the bounding rect (do not intersect) 0392 * 0393 * Case 1(b): both startpoint and endpoint are out of bounding rect, but they are in different side, so is their rectangle 0394 * 0395 * Case 2(a): find the rectangle which contains start and endpoint and having some 0396 * TextEntity 0397 * 0398 * Case 2(b): if 2(a) fails (if startPoint and endPoint both are unchanged), then we check whether there is any 0399 * TextEntity within the rect made by startPoint and endPoint 0400 * 0401 * Case 3: Now, we may have two type of selection. 0402 * 1. startpoint is left-top of start_end and endpoint is right-bottom 0403 * 2. startpoint is left-bottom of start_end and endpoint is top-right 0404 * 0405 * Also, as 2(b) is passed, we might have it,itEnd or both unchanged, but the fact is that we have 0406 * text within them. so, we need to search for the best suitable textposition for start and end. 0407 * 0408 * Case 3(a): We search the nearest rectangle consisting of some 0409 * TinyTextEntity right to or bottom of the startPoint for selection 01. 0410 * And, for selection 02, we have to search for right and top 0411 * 0412 * Case 3(b): For endpoint, we have to find the point top of or left to 0413 * endpoint if we have selection 01. 0414 * Otherwise, the search will be left and bottom 0415 */ 0416 0417 // we know that startC.x > endC.x, we need to decide which is top and which is bottom 0418 const NormalizedRect start_end = (startC.y < endC.y) ? NormalizedRect(startC.x, startC.y, endC.x, endC.y) : NormalizedRect(startC.x, endC.y, endC.x, startC.y); 0419 0420 // Case 1(a) 0421 if (!boundingRect.intersects(start_end)) { 0422 return ret; 0423 } else { 0424 // case 1(b) 0425 /** 0426 note that, after swapping of start and end, we know that, 0427 start is always left to end. but, we cannot say start is 0428 positioned upper than end. 0429 **/ 0430 // if start is left to content rect take it to content rect boundary 0431 if (startC.x * scaleX < minX) { 0432 startC.x = minX / scaleX; 0433 } 0434 if (endC.x * scaleX > maxX) { 0435 endC.x = maxX / scaleX; 0436 } 0437 0438 // if start is top to end (selection type 01) 0439 if (startC.y * scaleY < minY) { 0440 startC.y = minY / scaleY; 0441 } 0442 if (endC.y * scaleY > maxY) { 0443 endC.y = maxY / scaleY; 0444 } 0445 0446 // if start is bottom to end (selection type 02) 0447 if (startC.y * scaleY > maxY) { 0448 startC.y = maxY / scaleY; 0449 } 0450 if (endC.y * scaleY < minY) { 0451 endC.y = minY / scaleY; 0452 } 0453 } 0454 0455 TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd(); 0456 TextList::ConstIterator start = it, end = itEnd, tmpIt = it; //, tmpItEnd = itEnd; 0457 const MergeSide side = d->m_page ? (MergeSide)d->m_page->totalOrientation() : MergeRight; 0458 0459 NormalizedRect tmp; 0460 // case 2(a) 0461 for (; it != itEnd; ++it) { 0462 tmp = (*it)->area; 0463 if (tmp.contains(startC.x, startC.y)) { 0464 start = it; 0465 } 0466 if (tmp.contains(endC.x, endC.y)) { 0467 end = it; 0468 } 0469 } 0470 0471 // case 2(b) 0472 it = tmpIt; 0473 if (start == it && end == itEnd) { 0474 for (; it != itEnd; ++it) { 0475 // is there any text rectangle within the start_end rect 0476 tmp = (*it)->area; 0477 if (start_end.intersects(tmp)) { 0478 break; 0479 } 0480 } 0481 0482 // we have searched every text entities, but none is within the rectangle created by start and end 0483 // so, no selection should be done 0484 if (it == itEnd) { 0485 return ret; 0486 } 0487 } 0488 it = tmpIt; 0489 bool selection_two_start = false; 0490 0491 // case 3.a 0492 if (start == it) { 0493 NormalizedRect rect; 0494 0495 // selection type 01 0496 if (startC.y <= endC.y) { 0497 for (; it != itEnd; ++it) { 0498 rect = (*it)->area; 0499 bool flagV = !rect.isBottom(startC); 0500 0501 if (flagV && rect.isRight(startC)) { 0502 start = it; 0503 break; 0504 } 0505 } 0506 } 0507 0508 // selection type 02 0509 else { 0510 selection_two_start = true; 0511 int distance = scaleX + scaleY + 100; 0512 0513 for (; it != itEnd; ++it) { 0514 rect = (*it)->area; 0515 0516 if (rect.isBottomOrLevel(startC) && rect.isRight(startC)) { 0517 QRect entRect = rect.geometry(scaleX, scaleY); 0518 int xdist, ydist; 0519 xdist = entRect.center().x() - startC.x * scaleX; 0520 ydist = entRect.center().y() - startC.y * scaleY; 0521 0522 // make them positive 0523 if (xdist < 0) { 0524 xdist = -xdist; 0525 } 0526 if (ydist < 0) { 0527 ydist = -ydist; 0528 } 0529 0530 if ((xdist + ydist) < distance) { 0531 distance = xdist + ydist; 0532 start = it; 0533 } 0534 } 0535 } 0536 } 0537 } 0538 0539 // case 3.b 0540 if (end == itEnd) { 0541 it = tmpIt; 0542 itEnd = itEnd - 1; 0543 0544 NormalizedRect rect; 0545 0546 if (startC.y <= endC.y) { 0547 for (; itEnd >= it; itEnd--) { 0548 rect = (*itEnd)->area; 0549 bool flagV = !rect.isTop(endC); 0550 0551 if (flagV && rect.isLeft(endC)) { 0552 end = itEnd; 0553 break; 0554 } 0555 } 0556 } 0557 0558 else { 0559 int distance = scaleX + scaleY + 100; 0560 for (; itEnd >= it; itEnd--) { 0561 rect = (*itEnd)->area; 0562 0563 if (rect.isTopOrLevel(endC) && rect.isLeft(endC)) { 0564 QRect entRect = rect.geometry(scaleX, scaleY); 0565 int xdist, ydist; 0566 xdist = entRect.center().x() - endC.x * scaleX; 0567 ydist = entRect.center().y() - endC.y * scaleY; 0568 0569 // make them positive 0570 if (xdist < 0) { 0571 xdist = -xdist; 0572 } 0573 if (ydist < 0) { 0574 ydist = -ydist; 0575 } 0576 0577 if ((xdist + ydist) < distance) { 0578 distance = xdist + ydist; 0579 end = itEnd; 0580 } 0581 } 0582 } 0583 } 0584 } 0585 0586 /* if start and end in selection 02 are in the same column, and we 0587 start at an empty space we have to remove the selection of last 0588 character 0589 */ 0590 if (selection_two_start) { 0591 if (start > end) { 0592 start = start - 1; 0593 } 0594 } 0595 0596 // if start is less than end swap them 0597 if (start > end) { 0598 it = start; 0599 start = end; 0600 end = it; 0601 } 0602 0603 // removes the possibility of crash, in case none of 1 to 3 is true 0604 if (end == d->m_words.constEnd()) { 0605 end--; 0606 } 0607 0608 for (; start <= end; start++) { 0609 ret->appendShape((*start)->transformedArea(matrix), side); 0610 } 0611 0612 return ret; 0613 } 0614 0615 RegularAreaRect *TextPage::findText(int searchID, const QString &query, SearchDirection direct, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area) 0616 { 0617 SearchDirection dir = direct; 0618 // invalid search request 0619 if (d->m_words.isEmpty() || query.isEmpty() || (area && area->isNull())) { 0620 return nullptr; 0621 } 0622 TextList::ConstIterator start; 0623 int start_offset = 0; 0624 TextList::ConstIterator end; 0625 const QMap<int, SearchPoint *>::const_iterator sIt = d->m_searchPoints.constFind(searchID); 0626 if (sIt == d->m_searchPoints.constEnd()) { 0627 // if no previous run of this search is found, then set it to start 0628 // from the beginning (respecting the search direction) 0629 if (dir == NextResult) { 0630 dir = FromTop; 0631 } else if (dir == PreviousResult) { 0632 dir = FromBottom; 0633 } 0634 } 0635 bool forward = true; 0636 switch (dir) { 0637 case FromTop: 0638 start = d->m_words.constBegin(); 0639 start_offset = 0; 0640 end = d->m_words.constEnd(); 0641 break; 0642 case FromBottom: 0643 start = d->m_words.constEnd(); 0644 start_offset = 0; 0645 end = d->m_words.constBegin(); 0646 forward = false; 0647 break; 0648 case NextResult: 0649 start = (*sIt)->it_end; 0650 start_offset = (*sIt)->offset_end; 0651 end = d->m_words.constEnd(); 0652 break; 0653 case PreviousResult: 0654 start = (*sIt)->it_begin; 0655 start_offset = (*sIt)->offset_begin; 0656 end = d->m_words.constBegin(); 0657 forward = false; 0658 break; 0659 }; 0660 RegularAreaRect *ret = nullptr; 0661 const TextComparisonFunction cmpFn = caseSensitivity == Qt::CaseSensitive ? CaseSensitiveCmpFn : CaseInsensitiveCmpFn; 0662 if (forward) { 0663 ret = d->findTextInternalForward(searchID, query, cmpFn, start, start_offset, end); 0664 } else { 0665 ret = d->findTextInternalBackward(searchID, query, cmpFn, start, start_offset, end); 0666 } 0667 return ret; 0668 } 0669 0670 // hyphenated '-' must be at the end of a word, so hyphenation means 0671 // we have a '-' just followed by a '\n' character 0672 // check if the string contains a '-' character 0673 // if the '-' is the last entry 0674 static int stringLengthAdaptedWithHyphen(const QString &str, TextList::ConstIterator it, TextList::ConstIterator textListEnd) 0675 { 0676 const int len = str.length(); 0677 0678 // hyphenated '-' must be at the end of a word, so hyphenation means 0679 // we have a '-' just followed by a '\n' character 0680 // check if the string contains a '-' character 0681 // if the '-' is the last entry 0682 if (str.endsWith(QLatin1Char('-'))) { 0683 // validity chek of it + 1 0684 if ((it + 1) != textListEnd) { 0685 // 1. if the next character is '\n' 0686 const QString &lookahedStr = (*(it + 1))->text(); 0687 if (lookahedStr.startsWith(QLatin1Char('\n'))) { 0688 return len - 1; 0689 } 0690 0691 // 2. if the next word is in a different line or not 0692 const NormalizedRect &hyphenArea = (*it)->area; 0693 const NormalizedRect &lookaheadArea = (*(it + 1))->area; 0694 0695 // lookahead to check whether both the '-' rect and next character rect overlap 0696 if (!doesConsumeY(hyphenArea, lookaheadArea, 70)) { 0697 return len - 1; 0698 } 0699 } 0700 } 0701 // else if it is the second last entry - for example in pdf format 0702 else if (str.endsWith(QLatin1String("-\n"))) { 0703 return len - 2; 0704 } 0705 0706 return len; 0707 } 0708 0709 RegularAreaRect *TextPagePrivate::searchPointToArea(const SearchPoint *sp) 0710 { 0711 PagePrivate *pagePrivate = PagePrivate::get(m_page); 0712 const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform(); 0713 RegularAreaRect *ret = new RegularAreaRect; 0714 0715 for (TextList::ConstIterator it = sp->it_begin;; it++) { 0716 const TinyTextEntity *curEntity = *it; 0717 ret->append(curEntity->transformedArea(matrix)); 0718 0719 if (it == sp->it_end) { 0720 break; 0721 } 0722 } 0723 0724 ret->simplify(); 0725 return ret; 0726 } 0727 0728 RegularAreaRect *TextPagePrivate::findTextInternalForward(int searchID, const QString &_query, TextComparisonFunction comparer, TextList::ConstIterator start, int start_offset, TextList::ConstIterator end) 0729 { 0730 // normalize query search all unicode (including glyphs) 0731 const QString query = _query.normalized(QString::NormalizationForm_KC); 0732 0733 // j is the current position in our query 0734 // queryLeft is the length of the query we have left to match 0735 int j = 0, queryLeft = query.length(); 0736 0737 TextList::ConstIterator it = start; 0738 int offset = start_offset; 0739 0740 TextList::ConstIterator it_begin = TextList::ConstIterator(); 0741 int offset_begin = 0; // dummy initial value to suppress compiler warnings 0742 0743 while (it != end) { 0744 const TinyTextEntity *curEntity = *it; 0745 const QString &str = curEntity->text(); 0746 const int strLen = str.length(); 0747 const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd()); 0748 // adjustedLen <= strLen 0749 0750 if (offset >= strLen) { 0751 it++; 0752 offset = 0; 0753 continue; 0754 } 0755 0756 if (it_begin == TextList::ConstIterator()) { 0757 it_begin = it; 0758 offset_begin = offset; 0759 } 0760 0761 // Let the user write the hyphen or not when searching for text 0762 int matchedLen = -1; 0763 for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) { 0764 // we have equal (or less than) area of the query left as the length of the current 0765 // entity 0766 const int min = qMin(queryLeft, matchingLen - offset); 0767 if (comparer(QStringView {str}.mid(offset, min), QStringView {query}.mid(j, min))) { 0768 matchedLen = min; 0769 break; 0770 } 0771 } 0772 0773 if (matchedLen == -1) { 0774 // we have not matched 0775 // this means we do not have a complete match 0776 // we need to get back to query start 0777 // and continue the search from this place 0778 #ifdef DEBUG_TEXTPAGE 0779 qCDebug(OkularCoreDebug) << "\tnot matched"; 0780 #endif 0781 j = 0; 0782 queryLeft = query.length(); 0783 it = it_begin; 0784 offset = offset_begin + 1; 0785 it_begin = TextList::ConstIterator(); 0786 } else { 0787 // we have a match 0788 // move the current position in the query 0789 // to the position after the length of this string 0790 // we matched 0791 // subtract the length of the current entity from 0792 // the left length of the query 0793 #ifdef DEBUG_TEXTPAGE 0794 qCDebug(OkularCoreDebug) << "\tmatched" << matchedLen; 0795 #endif 0796 j += matchedLen; 0797 queryLeft -= matchedLen; 0798 0799 if (queryLeft == 0) { 0800 // save or update the search point for the current searchID 0801 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID); 0802 if (sIt == m_searchPoints.end()) { 0803 sIt = m_searchPoints.insert(searchID, new SearchPoint); 0804 } 0805 SearchPoint *sp = *sIt; 0806 sp->it_begin = it_begin; 0807 sp->it_end = it; 0808 sp->offset_begin = offset_begin; 0809 sp->offset_end = offset + matchedLen; 0810 return searchPointToArea(sp); 0811 } 0812 0813 it++; 0814 offset = 0; 0815 } 0816 } 0817 // end of loop - it means that we've ended the textentities 0818 0819 const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID); 0820 if (sIt != m_searchPoints.end()) { 0821 SearchPoint *sp = *sIt; 0822 m_searchPoints.erase(sIt); 0823 delete sp; 0824 } 0825 return nullptr; 0826 } 0827 0828 RegularAreaRect *TextPagePrivate::findTextInternalBackward(int searchID, const QString &_query, TextComparisonFunction comparer, TextList::ConstIterator start, int start_offset, TextList::ConstIterator end) 0829 { 0830 // normalize query to search all unicode (including glyphs) 0831 const QString query = _query.normalized(QString::NormalizationForm_KC); 0832 0833 // j is the current position in our query 0834 // len is the length of the string in TextEntity 0835 // queryLeft is the length of the query we have left 0836 int j = query.length(), queryLeft = query.length(); 0837 0838 TextList::ConstIterator it = start; 0839 int offset = start_offset; 0840 0841 TextList::ConstIterator it_begin = TextList::ConstIterator(); 0842 int offset_begin = 0; // dummy initial value to suppress compiler warnings 0843 0844 while (true) { 0845 if (offset <= 0) { 0846 if (it == end) { 0847 break; 0848 } 0849 it--; 0850 } 0851 0852 const TinyTextEntity *curEntity = *it; 0853 const QString &str = curEntity->text(); 0854 const int strLen = str.length(); 0855 const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd()); 0856 // adjustedLen <= strLen 0857 0858 if (offset <= 0) { 0859 offset = strLen; 0860 } 0861 0862 if (it_begin == TextList::ConstIterator()) { 0863 it_begin = it; 0864 offset_begin = offset; 0865 } 0866 0867 // Let the user write the hyphen or not when searching for text 0868 int matchedLen = -1; 0869 // we have equal (or less than) area of the query left as the length of the current 0870 // entity 0871 for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) { 0872 const int hyphenOffset = (strLen - matchingLen); 0873 const int min = qMin(queryLeft + hyphenOffset, offset); 0874 if (comparer(QStringView {str}.mid(offset - min, min - hyphenOffset), QStringView {query}.mid(j - min + hyphenOffset, min - hyphenOffset))) { 0875 matchedLen = min - hyphenOffset; 0876 break; 0877 } 0878 } 0879 0880 if (matchedLen == -1) { 0881 // we have not matched 0882 // this means we do not have a complete match 0883 // we need to get back to query start 0884 // and continue the search from this place 0885 #ifdef DEBUG_TEXTPAGE 0886 qCDebug(OkularCoreDebug) << "\tnot matched"; 0887 #endif 0888 0889 j = query.length(); 0890 queryLeft = query.length(); 0891 it = it_begin; 0892 offset = offset_begin - 1; 0893 it_begin = TextList::ConstIterator(); 0894 } else { 0895 // we have a match 0896 // move the current position in the query 0897 // to the position after the length of this string 0898 // we matched 0899 // subtract the length of the current entity from 0900 // the left length of the query 0901 #ifdef DEBUG_TEXTPAGE 0902 qCDebug(OkularCoreDebug) << "\tmatched"; 0903 #endif 0904 j -= matchedLen; 0905 queryLeft -= matchedLen; 0906 0907 if (queryLeft == 0) { 0908 // save or update the search point for the current searchID 0909 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID); 0910 if (sIt == m_searchPoints.end()) { 0911 sIt = m_searchPoints.insert(searchID, new SearchPoint); 0912 } 0913 SearchPoint *sp = *sIt; 0914 sp->it_begin = it; 0915 sp->it_end = it_begin; 0916 sp->offset_begin = offset - matchedLen; 0917 sp->offset_end = offset_begin; 0918 return searchPointToArea(sp); 0919 } 0920 0921 offset = 0; 0922 } 0923 } 0924 // end of loop - it means that we've ended the textentities 0925 0926 const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID); 0927 if (sIt != m_searchPoints.end()) { 0928 SearchPoint *sp = *sIt; 0929 m_searchPoints.erase(sIt); 0930 delete sp; 0931 } 0932 return nullptr; 0933 } 0934 0935 QString TextPage::text(const RegularAreaRect *area) const 0936 { 0937 return text(area, AnyPixelTextAreaInclusionBehaviour); 0938 } 0939 0940 QString TextPage::text(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const 0941 { 0942 if (area && area->isNull()) { 0943 return QString(); 0944 } 0945 0946 TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd(); 0947 QString ret; 0948 if (area) { 0949 for (; it != itEnd; ++it) { 0950 if (b == AnyPixelTextAreaInclusionBehaviour) { 0951 if (area->intersects((*it)->area)) { 0952 ret += (*it)->text(); 0953 } 0954 } else { 0955 NormalizedPoint center = (*it)->area.center(); 0956 if (area->contains(center.x, center.y)) { 0957 ret += (*it)->text(); 0958 } 0959 } 0960 } 0961 } else { 0962 for (; it != itEnd; ++it) { 0963 ret += (*it)->text(); 0964 } 0965 } 0966 return ret; 0967 } 0968 0969 static bool compareTinyTextEntityX(const WordWithCharacters &first, const WordWithCharacters &second) 0970 { 0971 QRect firstArea = first.area().roundedGeometry(1000, 1000); 0972 QRect secondArea = second.area().roundedGeometry(1000, 1000); 0973 0974 return firstArea.left() < secondArea.left(); 0975 } 0976 0977 static bool compareTinyTextEntityY(const WordWithCharacters &first, const WordWithCharacters &second) 0978 { 0979 const QRect firstArea = first.area().roundedGeometry(1000, 1000); 0980 const QRect secondArea = second.area().roundedGeometry(1000, 1000); 0981 0982 return firstArea.top() < secondArea.top(); 0983 } 0984 0985 /** 0986 * Sets a new world list. Deleting the contents of the old one 0987 */ 0988 void TextPagePrivate::setWordList(const TextList &list) 0989 { 0990 qDeleteAll(m_words); 0991 m_words = list; 0992 } 0993 0994 /** 0995 * Remove all the spaces in between texts. It will make all the generators 0996 * same, whether they save spaces(like pdf) or not(like djvu). 0997 */ 0998 static void removeSpace(TextList *words) 0999 { 1000 TextList::Iterator it = words->begin(); 1001 const QString str(QLatin1Char(' ')); 1002 1003 while (it != words->end()) { 1004 if ((*it)->text() == str) { 1005 it = words->erase(it); 1006 } else { 1007 ++it; 1008 } 1009 } 1010 } 1011 1012 /** 1013 * We will read the TinyTextEntity from characters and try to create words from there. 1014 * Note: characters might be already characters for some generators, but we will keep 1015 * the nomenclature characters for the generator produced data. The resulting 1016 * WordsWithCharacters memory has to be managed by the caller, both the 1017 * WordWithCharacters::word and WordWithCharacters::characters contents 1018 */ 1019 static WordsWithCharacters makeWordFromCharacters(const TextList &characters, int pageWidth, int pageHeight) 1020 { 1021 /** 1022 * We will traverse characters and try to create words from the TinyTextEntities in it. 1023 * We will search TinyTextEntity blocks and merge them until we get a 1024 * space between two consecutive TinyTextEntities. When we get a space 1025 * we can take it as a end of word. Then we store the word as a TinyTextEntity 1026 * and keep it in newList. 1027 1028 * We create a RegionText named regionWord that contains the word and the characters associated with it and 1029 * a rectangle area of the element in newList. 1030 1031 */ 1032 WordsWithCharacters wordsWithCharacters; 1033 1034 TextList::ConstIterator it = characters.begin(), itEnd = characters.end(), tmpIt; 1035 int newLeft, newRight, newTop, newBottom; 1036 1037 for (; it != itEnd; it++) { 1038 QString textString = (*it)->text(); 1039 QString newString; 1040 QRect lineArea = (*it)->area.roundedGeometry(pageWidth, pageHeight), elementArea; 1041 TextList wordCharacters; 1042 tmpIt = it; 1043 int space = 0; 1044 1045 while (!space) { 1046 if (!textString.isEmpty()) { 1047 newString.append(textString); 1048 1049 // when textString is the start of the word 1050 if (tmpIt == it) { 1051 NormalizedRect newRect(lineArea, pageWidth, pageHeight); 1052 wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect)); 1053 } else { 1054 NormalizedRect newRect(elementArea, pageWidth, pageHeight); 1055 wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect)); 1056 } 1057 } 1058 1059 ++it; 1060 1061 /* 1062 we must have to put this line before the if condition of it==itEnd 1063 otherwise the last character can be missed 1064 */ 1065 if (it == itEnd) { 1066 break; 1067 } 1068 elementArea = (*it)->area.roundedGeometry(pageWidth, pageHeight); 1069 if (!doesConsumeY(elementArea, lineArea, 60)) { 1070 --it; 1071 break; 1072 } 1073 1074 const int text_y1 = elementArea.top(), text_x1 = elementArea.left(), text_y2 = elementArea.y() + elementArea.height(), text_x2 = elementArea.x() + elementArea.width(); 1075 const int line_y1 = lineArea.top(), line_x1 = lineArea.left(), line_y2 = lineArea.y() + lineArea.height(), line_x2 = lineArea.x() + lineArea.width(); 1076 1077 space = elementArea.left() - lineArea.right(); 1078 1079 if (space != 0) { 1080 it--; 1081 break; 1082 } 1083 1084 newLeft = text_x1 < line_x1 ? text_x1 : line_x1; 1085 newRight = line_x2 > text_x2 ? line_x2 : text_x2; 1086 newTop = text_y1 > line_y1 ? line_y1 : text_y1; 1087 newBottom = text_y2 > line_y2 ? text_y2 : line_y2; 1088 1089 lineArea.setLeft(newLeft); 1090 lineArea.setTop(newTop); 1091 lineArea.setWidth(newRight - newLeft); 1092 lineArea.setHeight(newBottom - newTop); 1093 1094 textString = (*it)->text(); 1095 } 1096 1097 // if newString is not empty, save it 1098 if (!newString.isEmpty()) { 1099 const NormalizedRect newRect(lineArea, pageWidth, pageHeight); 1100 TinyTextEntity *word = new TinyTextEntity(newString.normalized(QString::NormalizationForm_KC), newRect); 1101 wordsWithCharacters.append(WordWithCharacters(word, wordCharacters)); 1102 } 1103 1104 if (it == itEnd) { 1105 break; 1106 } 1107 } 1108 1109 return wordsWithCharacters; 1110 } 1111 1112 /** 1113 * Create Lines from the words and sort them 1114 */ 1115 QList<QPair<WordsWithCharacters, QRect>> makeAndSortLines(const WordsWithCharacters &wordsTmp, int pageWidth, int pageHeight) 1116 { 1117 /** 1118 * We cannot assume that the generator will give us texts in the right order. 1119 * We can only assume that we will get texts in the page and their bounding 1120 * rectangle. The texts can be character, word, half-word anything. 1121 * So, we need to: 1122 ** 1123 * 1. Sort rectangles/boxes containing texts by y0(top) 1124 * 2. Create textline where there is y overlap between TinyTextEntity 's 1125 * 3. Within each line sort the TinyTextEntity 's by x0(left) 1126 */ 1127 1128 QList<QPair<WordsWithCharacters, QRect>> lines; 1129 1130 /* 1131 Make a new copy of the TextList in the words, so that the wordsTmp and lines do 1132 not contain same pointers for all the TinyTextEntity. 1133 */ 1134 QList<WordWithCharacters> words = wordsTmp; 1135 1136 // Step 1 1137 std::sort(words.begin(), words.end(), compareTinyTextEntityY); 1138 1139 // Step 2 1140 QList<WordWithCharacters>::Iterator it = words.begin(), itEnd = words.end(); 1141 1142 // for every non-space texts(characters/words) in the textList 1143 for (; it != itEnd; it++) { 1144 const QRect elementArea = (*it).area().roundedGeometry(pageWidth, pageHeight); 1145 bool found = false; 1146 1147 for (QPair<WordsWithCharacters, QRect> &linesI : lines) { 1148 /* the line area which will be expanded 1149 line_rects is only necessary to preserve the topmin and bottommax of all 1150 the texts in the line, left and right is not necessary at all 1151 */ 1152 QRect &lineArea = linesI.second; 1153 const int text_y1 = elementArea.top(), text_y2 = elementArea.top() + elementArea.height(), text_x1 = elementArea.left(), text_x2 = elementArea.left() + elementArea.width(); 1154 const int line_y1 = lineArea.top(), line_y2 = lineArea.top() + lineArea.height(), line_x1 = lineArea.left(), line_x2 = lineArea.left() + lineArea.width(); 1155 1156 /* 1157 if the new text and the line has y overlapping parts of more than 70%, 1158 the text will be added to this line 1159 */ 1160 if (doesConsumeY(elementArea, lineArea, 70)) { 1161 WordsWithCharacters &line = linesI.first; 1162 line.append(*it); 1163 1164 const int newLeft = line_x1 < text_x1 ? line_x1 : text_x1; 1165 const int newRight = line_x2 > text_x2 ? line_x2 : text_x2; 1166 const int newTop = line_y1 < text_y1 ? line_y1 : text_y1; 1167 const int newBottom = text_y2 > line_y2 ? text_y2 : line_y2; 1168 1169 lineArea = QRect(newLeft, newTop, newRight - newLeft, newBottom - newTop); 1170 found = true; 1171 } 1172 1173 if (found) { 1174 break; 1175 } 1176 } 1177 1178 /* when we have found a new line create a new TextList containing 1179 only one element and append it to the lines 1180 */ 1181 if (!found) { 1182 WordsWithCharacters tmp; 1183 tmp.append((*it)); 1184 lines.append(QPair<WordsWithCharacters, QRect>(tmp, elementArea)); 1185 } 1186 } 1187 1188 // Step 3 1189 for (QPair<WordsWithCharacters, QRect> &line : lines) { 1190 WordsWithCharacters &list = line.first; 1191 std::sort(list.begin(), list.end(), compareTinyTextEntityX); 1192 } 1193 1194 return lines; 1195 } 1196 1197 /** 1198 * Calculate Statistical information from the lines we made previously 1199 */ 1200 static void calculateStatisticalInformation(const QList<WordWithCharacters> &words, int pageWidth, int pageHeight, int *word_spacing, int *line_spacing, int *col_spacing) 1201 { 1202 /** 1203 * For the region, defined by line_rects and lines 1204 * 1. Make line statistical analysis to find the line spacing 1205 * 2. Make character statistical analysis to differentiate between 1206 * word spacing and column spacing. 1207 */ 1208 1209 /** 1210 * Step 0 1211 */ 1212 const QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(words, pageWidth, pageHeight); 1213 1214 /** 1215 * Step 1 1216 */ 1217 QMap<int, int> line_space_stat; 1218 for (int i = 0; i < sortedLines.length(); i++) { 1219 const QRect rectUpper = sortedLines.at(i).second; 1220 1221 if (i + 1 == sortedLines.length()) { 1222 break; 1223 } 1224 const QRect rectLower = sortedLines.at(i + 1).second; 1225 1226 int linespace = rectLower.top() - (rectUpper.top() + rectUpper.height()); 1227 if (linespace < 0) { 1228 linespace = -linespace; 1229 } 1230 1231 if (line_space_stat.contains(linespace)) { 1232 line_space_stat[linespace]++; 1233 } else { 1234 line_space_stat[linespace] = 1; 1235 } 1236 } 1237 1238 *line_spacing = 0; 1239 int weighted_count = 0; 1240 QMapIterator<int, int> iterate_linespace(line_space_stat); 1241 1242 while (iterate_linespace.hasNext()) { 1243 iterate_linespace.next(); 1244 *line_spacing += iterate_linespace.value() * iterate_linespace.key(); 1245 weighted_count += iterate_linespace.value(); 1246 } 1247 if (*line_spacing != 0) { 1248 *line_spacing = (int)((double)*line_spacing / (double)weighted_count + 0.5); 1249 } 1250 1251 /** 1252 * Step 2 1253 */ 1254 // We would like to use QMap instead of QHash as it will keep the keys sorted 1255 QMap<int, int> hor_space_stat; 1256 QMap<int, int> col_space_stat; 1257 QList<QList<QRect>> space_rects; 1258 QVector<QRect> max_hor_space_rects; 1259 1260 // Space in every line 1261 for (const QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) { 1262 const WordsWithCharacters list = sortedLine.first; 1263 QList<QRect> line_space_rects; 1264 int maxSpace = 0, minSpace = pageWidth; 1265 1266 // for every TinyTextEntity element in the line 1267 WordsWithCharacters::ConstIterator it = list.begin(), itEnd = list.end(); 1268 QRect max_area1, max_area2; 1269 // for every line 1270 for (; it != itEnd; it++) { 1271 const QRect area1 = (*it).area().roundedGeometry(pageWidth, pageHeight); 1272 if (it + 1 == itEnd) { 1273 break; 1274 } 1275 1276 const QRect area2 = (*(it + 1)).area().roundedGeometry(pageWidth, pageHeight); 1277 int space = area2.left() - area1.right(); 1278 1279 if (space > maxSpace) { 1280 max_area1 = area1; 1281 max_area2 = area2; 1282 maxSpace = space; 1283 } 1284 1285 if (space < minSpace && space != 0) { 1286 minSpace = space; 1287 } 1288 1289 // if we found a real space, whose length is not zero and also less than the pageWidth 1290 if (space != 0 && space != pageWidth) { 1291 // increase the count of the space amount 1292 if (hor_space_stat.contains(space)) { 1293 hor_space_stat[space]++; 1294 } else { 1295 hor_space_stat[space] = 1; 1296 } 1297 1298 int left, right, top, bottom; 1299 1300 left = area1.right(); 1301 right = area2.left(); 1302 1303 top = area2.top() < area1.top() ? area2.top() : area1.top(); 1304 bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom(); 1305 1306 QRect rect(left, top, right - left, bottom - top); 1307 line_space_rects.append(rect); 1308 } 1309 } 1310 1311 space_rects.append(line_space_rects); 1312 1313 if (hor_space_stat.contains(maxSpace)) { 1314 if (hor_space_stat[maxSpace] != 1) { 1315 hor_space_stat[maxSpace]--; 1316 } else { 1317 hor_space_stat.remove(maxSpace); 1318 } 1319 } 1320 1321 if (maxSpace != 0) { 1322 if (col_space_stat.contains(maxSpace)) { 1323 col_space_stat[maxSpace]++; 1324 } else { 1325 col_space_stat[maxSpace] = 1; 1326 } 1327 1328 // store the max rect of each line 1329 const int left = max_area1.right(); 1330 const int right = max_area2.left(); 1331 const int top = (max_area1.top() > max_area2.top()) ? max_area2.top() : max_area1.top(); 1332 const int bottom = (max_area1.bottom() < max_area2.bottom()) ? max_area2.bottom() : max_area1.bottom(); 1333 1334 const QRect rect(left, top, right - left, bottom - top); 1335 max_hor_space_rects.append(rect); 1336 } else { 1337 max_hor_space_rects.append(QRect(0, 0, 0, 0)); 1338 } 1339 } 1340 1341 // All the between word space counts are in hor_space_stat 1342 *word_spacing = 0; 1343 weighted_count = 0; 1344 QMapIterator<int, int> iterate(hor_space_stat); 1345 1346 while (iterate.hasNext()) { 1347 iterate.next(); 1348 1349 if (iterate.key() > 0) { 1350 *word_spacing += iterate.value() * iterate.key(); 1351 weighted_count += iterate.value(); 1352 } 1353 } 1354 if (weighted_count) { 1355 *word_spacing = (int)((double)*word_spacing / (double)weighted_count + 0.5); 1356 } 1357 1358 *col_spacing = 0; 1359 QMapIterator<int, int> iterate_col(col_space_stat); 1360 1361 while (iterate_col.hasNext()) { 1362 iterate_col.next(); 1363 if (iterate_col.value() > *col_spacing) { 1364 *col_spacing = iterate_col.value(); 1365 } 1366 } 1367 *col_spacing = col_space_stat.key(*col_spacing); 1368 1369 // if there is just one line in a region, there is no point in dividing it 1370 if (sortedLines.length() == 1) { 1371 *word_spacing = *col_spacing; 1372 } 1373 } 1374 1375 /** 1376 * Implements the XY Cut algorithm for textpage segmentation 1377 * The resulting RegionTextList will contain RegionText whose WordsWithCharacters::word and 1378 * WordsWithCharacters::characters are reused from wordsWithCharacters (i.e. no new nor delete happens in this function) 1379 */ 1380 static RegionTextList XYCutForBoundingBoxes(const QList<WordWithCharacters> &wordsWithCharacters, int pageWidth, int pageHeight) 1381 { 1382 RegionTextList tree; 1383 QRect contentRect(0, 0, pageWidth, pageHeight); 1384 const RegionText root(wordsWithCharacters, contentRect); 1385 1386 // start the tree with the root, it is our only region at the start 1387 tree.push_back(root); 1388 1389 int i = 0; 1390 1391 // while traversing the tree has not been ended 1392 while (i < tree.length()) { 1393 const RegionText node = tree.at(i); 1394 QRect regionRect = node.area(); 1395 1396 /** 1397 * 1. calculation of projection profiles 1398 */ 1399 // allocate the size of proj profiles and initialize with 0 1400 int size_proj_y = node.area().height(); 1401 int size_proj_x = node.area().width(); 1402 // dynamic memory allocation 1403 QVarLengthArray<int> proj_on_xaxis(size_proj_x); 1404 QVarLengthArray<int> proj_on_yaxis(size_proj_y); 1405 1406 for (int j = 0; j < size_proj_y; ++j) { 1407 proj_on_yaxis[j] = 0; 1408 } 1409 for (int j = 0; j < size_proj_x; ++j) { 1410 proj_on_xaxis[j] = 0; 1411 } 1412 1413 const QList<WordWithCharacters> list = node.text(); 1414 1415 // Calculate tcx and tcy locally for each new region 1416 int word_spacing, line_spacing, column_spacing; 1417 calculateStatisticalInformation(list, pageWidth, pageHeight, &word_spacing, &line_spacing, &column_spacing); 1418 1419 const int tcx = word_spacing * 2; 1420 const int tcy = line_spacing * 2; 1421 1422 int maxX = 0, maxY = 0; 1423 int avgX = 0; 1424 int count; 1425 1426 // for every text in the region 1427 for (const WordWithCharacters &wwc : list) { 1428 TinyTextEntity *ent = wwc.word; 1429 const QRect entRect = ent->area.geometry(pageWidth, pageHeight); 1430 1431 // calculate vertical projection profile proj_on_xaxis1 1432 for (int k = entRect.left(); k <= entRect.left() + entRect.width(); ++k) { 1433 if ((k - regionRect.left()) < size_proj_x && (k - regionRect.left()) >= 0) { 1434 proj_on_xaxis[k - regionRect.left()] += entRect.height(); 1435 } 1436 } 1437 1438 // calculate horizontal projection profile in the same way 1439 for (int k = entRect.top(); k <= entRect.top() + entRect.height(); ++k) { 1440 if ((k - regionRect.top()) < size_proj_y && (k - regionRect.top()) >= 0) { 1441 proj_on_yaxis[k - regionRect.top()] += entRect.width(); 1442 } 1443 } 1444 } 1445 1446 for (int j = 0; j < size_proj_y; ++j) { 1447 if (proj_on_yaxis[j] > maxY) { 1448 maxY = proj_on_yaxis[j]; 1449 } 1450 } 1451 1452 avgX = count = 0; 1453 for (int j = 0; j < size_proj_x; ++j) { 1454 if (proj_on_xaxis[j] > maxX) { 1455 maxX = proj_on_xaxis[j]; 1456 } 1457 if (proj_on_xaxis[j]) { 1458 count++; 1459 avgX += proj_on_xaxis[j]; 1460 } 1461 } 1462 if (count) { 1463 avgX /= count; 1464 } 1465 1466 /** 1467 * 2. Cleanup Boundary White Spaces and removal of noise 1468 */ 1469 int xbegin = 0, xend = size_proj_x - 1; 1470 int ybegin = 0, yend = size_proj_y - 1; 1471 while (xbegin < size_proj_x && proj_on_xaxis[xbegin] <= 0) { 1472 xbegin++; 1473 } 1474 while (xend >= 0 && proj_on_xaxis[xend] <= 0) { 1475 xend--; 1476 } 1477 while (ybegin < size_proj_y && proj_on_yaxis[ybegin] <= 0) { 1478 ybegin++; 1479 } 1480 while (yend >= 0 && proj_on_yaxis[yend] <= 0) { 1481 yend--; 1482 } 1483 1484 // update the regionRect 1485 int old_left = regionRect.left(), old_top = regionRect.top(); 1486 regionRect.setLeft(old_left + xbegin); 1487 regionRect.setRight(old_left + xend); 1488 regionRect.setTop(old_top + ybegin); 1489 regionRect.setBottom(old_top + yend); 1490 1491 int tnx = (int)((double)avgX * 10.0 / 100.0 + 0.5), tny = 0; 1492 for (int j = 0; j < size_proj_x; ++j) { 1493 proj_on_xaxis[j] -= tnx; 1494 } 1495 for (int j = 0; j < size_proj_y; ++j) { 1496 proj_on_yaxis[j] -= tny; 1497 } 1498 1499 /** 1500 * 3. Find the Widest gap 1501 */ 1502 int gap_hor = -1, pos_hor = -1; 1503 int begin = -1, end = -1; 1504 1505 // find all hor_gaps and find the maximum between them 1506 for (int j = 1; j < size_proj_y; ++j) { 1507 // transition from white to black 1508 if (begin >= 0 && proj_on_yaxis[j - 1] <= 0 && proj_on_yaxis[j] > 0) { 1509 end = j; 1510 } 1511 1512 // transition from black to white 1513 if (proj_on_yaxis[j - 1] > 0 && proj_on_yaxis[j] <= 0) { 1514 begin = j; 1515 } 1516 1517 if (begin > 0 && end > 0 && end - begin > gap_hor) { 1518 gap_hor = end - begin; 1519 pos_hor = (end + begin) / 2; 1520 begin = -1; 1521 end = -1; 1522 } 1523 } 1524 1525 begin = -1, end = -1; 1526 int gap_ver = -1, pos_ver = -1; 1527 1528 // find all the ver_gaps and find the maximum between them 1529 for (int j = 1; j < size_proj_x; ++j) { 1530 // transition from white to black 1531 if (begin >= 0 && proj_on_xaxis[j - 1] <= 0 && proj_on_xaxis[j] > 0) { 1532 end = j; 1533 } 1534 1535 // transition from black to white 1536 if (proj_on_xaxis[j - 1] > 0 && proj_on_xaxis[j] <= 0) { 1537 begin = j; 1538 } 1539 1540 if (begin > 0 && end > 0 && end - begin > gap_ver) { 1541 gap_ver = end - begin; 1542 pos_ver = (end + begin) / 2; 1543 begin = -1; 1544 end = -1; 1545 } 1546 } 1547 1548 int cut_pos_x = pos_ver, cut_pos_y = pos_hor; 1549 int gap_x = gap_ver, gap_y = gap_hor; 1550 1551 /** 1552 * 4. Cut the region and make nodes (left,right) or (up,down) 1553 */ 1554 bool cut_hor = false, cut_ver = false; 1555 1556 // For horizontal cut 1557 const int topHeight = cut_pos_y - (regionRect.top() - old_top); 1558 const QRect topRect(regionRect.left(), regionRect.top(), regionRect.width(), topHeight); 1559 const QRect bottomRect(regionRect.left(), regionRect.top() + topHeight, regionRect.width(), regionRect.height() - topHeight); 1560 1561 // For vertical Cut 1562 const int leftWidth = cut_pos_x - (regionRect.left() - old_left); 1563 const QRect leftRect(regionRect.left(), regionRect.top(), leftWidth, regionRect.height()); 1564 const QRect rightRect(regionRect.left() + leftWidth, regionRect.top(), regionRect.width() - leftWidth, regionRect.height()); 1565 1566 if (gap_y >= gap_x && gap_y >= tcy) { 1567 cut_hor = true; 1568 } else if (gap_y >= gap_x && gap_y <= tcy && gap_x >= tcx) { 1569 cut_ver = true; 1570 } else if (gap_x >= gap_y && gap_x >= tcx) { 1571 cut_ver = true; 1572 } else if (gap_x >= gap_y && gap_x <= tcx && gap_y >= tcy) { 1573 cut_hor = true; 1574 } else { 1575 // no cut possible 1576 // we can now update the node rectangle with the shrinked rectangle 1577 RegionText tmpNode = tree.at(i); 1578 tmpNode.setArea(regionRect); 1579 tree.replace(i, tmpNode); 1580 i++; 1581 continue; 1582 } 1583 1584 WordsWithCharacters list1, list2; 1585 1586 // horizontal cut, topRect and bottomRect 1587 if (cut_hor) { 1588 for (const WordWithCharacters &word : list) { 1589 const QRect wordRect = word.area().geometry(pageWidth, pageHeight); 1590 1591 if (topRect.intersects(wordRect)) { 1592 list1.append(word); 1593 } else { 1594 list2.append(word); 1595 } 1596 } 1597 1598 RegionText node1(list1, topRect); 1599 RegionText node2(list2, bottomRect); 1600 1601 tree.replace(i, node1); 1602 tree.insert(i + 1, node2); 1603 } 1604 1605 // vertical cut, leftRect and rightRect 1606 else if (cut_ver) { 1607 for (const WordWithCharacters &word : list) { 1608 const QRect wordRect = word.area().geometry(pageWidth, pageHeight); 1609 1610 if (leftRect.intersects(wordRect)) { 1611 list1.append(word); 1612 } else { 1613 list2.append(word); 1614 } 1615 } 1616 1617 RegionText node1(list1, leftRect); 1618 RegionText node2(list2, rightRect); 1619 1620 tree.replace(i, node1); 1621 tree.insert(i + 1, node2); 1622 } 1623 } 1624 1625 return tree; 1626 } 1627 1628 /** 1629 * Add spaces in between words in a line. It reuses the pointers passed in tree and might add new ones. You will need to take care of deleting them if needed 1630 */ 1631 WordsWithCharacters addNecessarySpace(RegionTextList tree, int pageWidth, int pageHeight) 1632 { 1633 /** 1634 * 1. Call makeAndSortLines before adding spaces in between words in a line 1635 * 2. Now add spaces between every two words in a line 1636 * 3. Finally, extract all the space separated texts from each region and return it 1637 */ 1638 1639 // Only change the texts under RegionTexts, not the area 1640 for (RegionText &tmpRegion : tree) { 1641 // Step 01 1642 QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(tmpRegion.text(), pageWidth, pageHeight); 1643 1644 // Step 02 1645 for (QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) { 1646 WordsWithCharacters &list = sortedLine.first; 1647 for (int k = 0; k < list.length(); k++) { 1648 const QRect area1 = list.at(k).area().roundedGeometry(pageWidth, pageHeight); 1649 if (k + 1 >= list.length()) { 1650 break; 1651 } 1652 1653 const QRect area2 = list.at(k + 1).area().roundedGeometry(pageWidth, pageHeight); 1654 const int space = area2.left() - area1.right(); 1655 1656 if (space != 0) { 1657 // Make a TinyTextEntity of string space and push it between it and it+1 1658 const int left = area1.right(); 1659 const int right = area2.left(); 1660 const int top = area2.top() < area1.top() ? area2.top() : area1.top(); 1661 const int bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom(); 1662 1663 const QString spaceStr(QStringLiteral(" ")); 1664 const QRect rect(QPoint(left, top), QPoint(right, bottom)); 1665 const NormalizedRect entRect(rect, pageWidth, pageHeight); 1666 TinyTextEntity *ent1 = new TinyTextEntity(spaceStr, entRect); 1667 TinyTextEntity *ent2 = new TinyTextEntity(spaceStr, entRect); 1668 WordWithCharacters word(ent1, QList<TinyTextEntity *>() << ent2); 1669 1670 list.insert(k + 1, word); 1671 1672 // Skip the space 1673 k++; 1674 } 1675 } 1676 } 1677 1678 WordsWithCharacters tmpList; 1679 for (const QPair<WordsWithCharacters, QRect> &sortedLine : std::as_const(sortedLines)) { 1680 tmpList += sortedLine.first; 1681 } 1682 tmpRegion.setText(tmpList); 1683 } 1684 1685 // Step 03 1686 WordsWithCharacters tmp; 1687 for (const RegionText &tmpRegion : std::as_const(tree)) { 1688 tmp += tmpRegion.text(); 1689 } 1690 return tmp; 1691 } 1692 1693 /** 1694 * Correct the textOrder, all layout recognition works here 1695 */ 1696 void TextPagePrivate::correctTextOrder() 1697 { 1698 // m_page->width() and m_page->height() are in pixels at 1699 // 100% zoom level, and thus depend on display DPI. 1700 // To avoid Okular failing on lowDPI displays, 1701 // we scale pageWidth and pageHeight so their sum equals 2000. 1702 const double scalingFactor = 2000.0 / (m_page->width() + m_page->height()); 1703 const int pageWidth = (int)(scalingFactor * m_page->width()); 1704 const int pageHeight = (int)(scalingFactor * m_page->height()); 1705 1706 TextList characters = m_words; 1707 1708 /** 1709 * Remove spaces from the text 1710 */ 1711 removeSpace(&characters); 1712 1713 /** 1714 * Construct words from characters 1715 */ 1716 const QList<WordWithCharacters> wordsWithCharacters = makeWordFromCharacters(characters, pageWidth, pageHeight); 1717 1718 /** 1719 * Make a XY Cut tree for segmentation of the texts 1720 */ 1721 const RegionTextList tree = XYCutForBoundingBoxes(wordsWithCharacters, pageWidth, pageHeight); 1722 1723 /** 1724 * Add spaces to the word 1725 */ 1726 const WordsWithCharacters listWithWordsAndSpaces = addNecessarySpace(tree, pageWidth, pageHeight); 1727 1728 /** 1729 * Break the words into characters 1730 */ 1731 TextList listOfCharacters; 1732 for (const WordWithCharacters &word : listWithWordsAndSpaces) { 1733 delete word.word; 1734 listOfCharacters.append(word.characters); 1735 } 1736 setWordList(listOfCharacters); 1737 } 1738 1739 TextEntity::List TextPage::words(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const 1740 { 1741 if (area && area->isNull()) { 1742 return TextEntity::List(); 1743 } 1744 1745 TextEntity::List ret; 1746 if (area) { 1747 for (const TinyTextEntity *te : std::as_const(d->m_words)) { 1748 if (b == AnyPixelTextAreaInclusionBehaviour) { 1749 if (area->intersects(te->area)) { 1750 ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area))); 1751 } 1752 } else { 1753 const NormalizedPoint center = te->area.center(); 1754 if (area->contains(center.x, center.y)) { 1755 ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area))); 1756 } 1757 } 1758 } 1759 } else { 1760 for (const TinyTextEntity *te : std::as_const(d->m_words)) { 1761 ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area))); 1762 } 1763 } 1764 return ret; 1765 } 1766 1767 RegularAreaRect *TextPage::wordAt(const NormalizedPoint &p, QString *word) const 1768 { 1769 TextList::ConstIterator itBegin = d->m_words.constBegin(), itEnd = d->m_words.constEnd(); 1770 TextList::ConstIterator it = itBegin; 1771 TextList::ConstIterator posIt = itEnd; 1772 for (; it != itEnd; ++it) { 1773 if ((*it)->area.contains(p.x, p.y)) { 1774 posIt = it; 1775 break; 1776 } 1777 } 1778 if (posIt != itEnd) { 1779 if ((*posIt)->text().simplified().isEmpty()) { 1780 return nullptr; 1781 } 1782 // Find the first TinyTextEntity of the word 1783 while (posIt != itBegin) { 1784 --posIt; 1785 const QString itText = (*posIt)->text(); 1786 if (itText.right(1).at(0).isSpace()) { 1787 if (itText.endsWith(QLatin1String("-\n"))) { 1788 // Is an hyphenated word 1789 // continue searching the start of the word back 1790 continue; 1791 } 1792 1793 if (itText == QLatin1String("\n") && posIt != itBegin) { 1794 --posIt; 1795 if ((*posIt)->text().endsWith(QLatin1String("-"))) { 1796 // Is an hyphenated word 1797 // continue searching the start of the word back 1798 continue; 1799 } 1800 ++posIt; 1801 } 1802 1803 ++posIt; 1804 break; 1805 } 1806 } 1807 RegularAreaRect *ret = new RegularAreaRect(); 1808 QString foundWord; 1809 for (; posIt != itEnd; ++posIt) { 1810 const QString itText = (*posIt)->text(); 1811 if (itText.simplified().isEmpty()) { 1812 break; 1813 } 1814 1815 ret->appendShape((*posIt)->area); 1816 foundWord += (*posIt)->text(); 1817 if (itText.right(1).at(0).isSpace()) { 1818 if (!foundWord.endsWith(QLatin1String("-\n"))) { 1819 break; 1820 } 1821 } 1822 } 1823 1824 if (word) { 1825 *word = foundWord; 1826 } 1827 return ret; 1828 } else { 1829 return nullptr; 1830 } 1831 }