File indexing completed on 2024-04-28 04:32:47
0001 /* 0002 SPDX-FileCopyrightText: 2005 Piotr Szymanski <niedakh@gmail.com> 0003 0004 SPDX-License-Identifier: GPL-2.0-or-later 0005 */ 0006 0007 #include "textpage.h" 0008 #include "textpage_p.h" 0009 0010 #include <QDebug> 0011 0012 #include "area.h" 0013 #include "debug_p.h" 0014 #include "misc.h" 0015 #include "page.h" 0016 #include "page_p.h" 0017 #include <unordered_set> 0018 0019 #include <cstring> 0020 0021 #include <QVarLengthArray> 0022 #include <QtAlgorithms> 0023 0024 using namespace Okular; 0025 0026 // Many of the strings are being reused; especially 0027 // those less than 2 letters are very common 0028 // Use the implicit shared bits from QString to 0029 // not keep multiple same strings around, but just up the 0030 // refcount a bit 0031 // The main reason for '2' is that most calls here happens 0032 // in auxillary threads that are following a document 0033 // and keeping the pool thread_local gives quite a bit 0034 // of advantage here. 0035 // Some calls though comes from the main thread, so we 0036 // shouldn't keep all the long strings allocated in the main 0037 // thread around forever. 0038 // '2' has been chosen by random testing, and guesswork. 0039 static QString fromPool(const QString &str) 0040 { 0041 if (str.length() > 2) { 0042 return str; 0043 } 0044 0045 thread_local std::unordered_set<QString> pool; 0046 auto [iterator, success] = pool.insert(str); 0047 return *iterator; 0048 } 0049 0050 class SearchPoint 0051 { 0052 public: 0053 SearchPoint() 0054 : offset_begin(-1) 0055 , offset_end(-1) 0056 { 0057 } 0058 0059 /** The TextEntity containing the first character of the match. */ 0060 TextEntity::List::ConstIterator it_begin; 0061 0062 /** The TextEntity containing the last character of the match. */ 0063 TextEntity::List::ConstIterator it_end; 0064 0065 /** The index of the first character of the match in (*it_begin)->text(). 0066 * Satisfies 0 <= offset_begin < (*it_begin)->text().length(). 0067 */ 0068 int offset_begin; 0069 0070 /** One plus the index of the last character of the match in (*it_end)->text(). 0071 * Satisfies 0 < offset_end <= (*it_end)->text().length(). 0072 */ 0073 int offset_end; 0074 }; 0075 0076 /* text comparison functions */ 0077 0078 static bool CaseInsensitiveCmpFn(QStringView from, QStringView to) 0079 { 0080 #ifdef DEBUG_TEXTPAGE 0081 qDebug(OkularCoreDebug) << from << ":" << to << "(case insensitive)"; 0082 #endif 0083 return from.compare(to, Qt::CaseInsensitive) == 0; 0084 } 0085 0086 static bool CaseSensitiveCmpFn(QStringView from, QStringView to) 0087 { 0088 #ifdef DEBUG_TEXTPAGE 0089 qDebug(OkularCoreDebug) << from << ":" << to << "(case sensitive)"; 0090 #endif 0091 return from.compare(to, Qt::CaseSensitive) == 0; 0092 } 0093 0094 /** 0095 * Returns true iff segments [@p left1, @p right1] and [@p left2, @p right2] on the real line 0096 * overlap within @p threshold percent, i. e. iff the ratio of the length of the 0097 * intersection of the segments to the length of the shortest of the two input segments 0098 * is not smaller than the threshold. 0099 */ 0100 static bool segmentsOverlap(double left1, double right1, double left2, double right2, int threshold) 0101 { 0102 // check if one consumes another fully (speed optimization) 0103 0104 if (left1 <= left2 && right1 >= right2) { 0105 return true; 0106 } 0107 0108 if (left1 >= left2 && right1 <= right2) { 0109 return true; 0110 } 0111 0112 // check if there is overlap above threshold 0113 if (right2 >= left1 && right1 >= left2) { 0114 double overlap = (right2 >= right1) ? right1 - left2 : right2 - left1; 0115 0116 double length1 = right1 - left1, length2 = right2 - left2; 0117 0118 return overlap * 100 >= threshold * qMin(length1, length2); 0119 } 0120 0121 return false; 0122 } 0123 0124 static bool doesConsumeY(const QRect first, const QRect second, int threshold) 0125 { 0126 return segmentsOverlap(first.top(), first.bottom(), second.top(), second.bottom(), threshold); 0127 } 0128 0129 static bool doesConsumeY(const NormalizedRect &first, const NormalizedRect &second, int threshold) 0130 { 0131 return segmentsOverlap(first.top, first.bottom, second.top, second.bottom, threshold); 0132 } 0133 0134 TextEntity::TextEntity(const QString &text, const NormalizedRect &area) 0135 : m_text(fromPool(text)) 0136 , m_area(area) 0137 { 0138 } 0139 0140 TextEntity::~TextEntity() = default; 0141 0142 QString TextEntity::text() const 0143 { 0144 return m_text; 0145 } 0146 0147 NormalizedRect TextEntity::area() const 0148 { 0149 return m_area; 0150 } 0151 0152 NormalizedRect TextEntity::transformedArea(const QTransform &matrix) const 0153 { 0154 NormalizedRect transformed_area = m_area; 0155 transformed_area.transform(matrix); 0156 return transformed_area; 0157 } 0158 0159 TextPagePrivate::TextPagePrivate() 0160 : m_page(nullptr) 0161 { 0162 } 0163 0164 TextPagePrivate::~TextPagePrivate() 0165 { 0166 qDeleteAll(m_searchPoints); 0167 } 0168 0169 TextPage::TextPage() 0170 : d(new TextPagePrivate()) 0171 { 0172 } 0173 0174 TextPage::TextPage(const TextEntity::List &words) 0175 : d(new TextPagePrivate()) 0176 { 0177 d->m_words = words; 0178 } 0179 0180 TextPage::~TextPage() 0181 { 0182 delete d; 0183 } 0184 0185 void TextPage::append(const QString &text, const NormalizedRect &area) 0186 { 0187 if (!text.isEmpty()) { 0188 if (!d->m_words.isEmpty()) { 0189 TextEntity &lastEntity = d->m_words.last(); 0190 const QString concatText = lastEntity.text() + text.normalized(QString::NormalizationForm_KC); 0191 if (concatText != concatText.normalized(QString::NormalizationForm_KC)) { 0192 // If this happens it means that the new text + old one have combined, for example A and ◌̊ form Å 0193 NormalizedRect newArea = area | lastEntity.area(); 0194 d->m_words.removeLast(); 0195 d->m_words.append(TextEntity(concatText.normalized(QString::NormalizationForm_KC), newArea)); 0196 return; 0197 } 0198 } 0199 0200 d->m_words.append(TextEntity(text.normalized(QString::NormalizationForm_KC), area)); 0201 } 0202 } 0203 0204 struct WordWithCharacters { 0205 WordWithCharacters(const TextEntity &w, const TextEntity::List &c) 0206 : word(w) 0207 , characters(c) 0208 { 0209 } 0210 0211 inline QString text() const 0212 { 0213 return word.text(); 0214 } 0215 0216 inline NormalizedRect area() const 0217 { 0218 return word.area(); 0219 } 0220 0221 TextEntity word; 0222 TextEntity::List characters; 0223 }; 0224 typedef QList<WordWithCharacters> WordsWithCharacters; 0225 0226 /** 0227 * We will divide the whole page in some regions depending on the horizontal and 0228 * vertical spacing among different regions. Each region will have an area and an 0229 * associated WordsWithCharacters in sorted order. 0230 */ 0231 class RegionText 0232 { 0233 public: 0234 RegionText() {}; 0235 0236 RegionText(const WordsWithCharacters &wordsWithCharacters, const QRect area) 0237 : m_region_wordWithCharacters(wordsWithCharacters) 0238 , m_area(area) 0239 { 0240 } 0241 0242 inline QString string() const 0243 { 0244 QString res; 0245 for (const WordWithCharacters &word : m_region_wordWithCharacters) { 0246 res += word.text(); 0247 } 0248 return res; 0249 } 0250 0251 inline WordsWithCharacters text() const 0252 { 0253 return m_region_wordWithCharacters; 0254 } 0255 0256 inline QRect area() const 0257 { 0258 return m_area; 0259 } 0260 0261 inline void setArea(const QRect area) 0262 { 0263 m_area = area; 0264 } 0265 0266 inline void setText(const WordsWithCharacters &wordsWithCharacters) 0267 { 0268 m_region_wordWithCharacters = wordsWithCharacters; 0269 } 0270 0271 private: 0272 WordsWithCharacters m_region_wordWithCharacters; 0273 QRect m_area; 0274 }; 0275 0276 RegularAreaRect *TextPage::textArea(TextSelection *sel) const 0277 { 0278 if (d->m_words.isEmpty()) { 0279 return new RegularAreaRect(); 0280 } 0281 0282 /** 0283 It works like this: 0284 There are two cursors, we need to select all the text between them. The coordinates are normalised, leftTop is (0,0) 0285 rightBottom is (1,1), so for cursors start (sx,sy) and end (ex,ey) we start with finding text rectangles under those 0286 points, if not we search for the first that is to the right to it in the same baseline, if none found, then we search 0287 for the first rectangle with a baseline under the cursor, having two points that are the best rectangles to both 0288 of the cursors: (rx,ry)x(tx,ty) for start and (ux,uy)x(vx,vy) for end, we do a 0289 1. (rx,ry)x(1,ty) 0290 2. (0,ty)x(1,uy) 0291 3. (0,uy)x(vx,vy) 0292 0293 To find the closest rectangle to cursor (cx,cy) we search for a rectangle that either contains the cursor 0294 or that has a left border >= cx and bottom border >= cy. 0295 */ 0296 RegularAreaRect *ret = new RegularAreaRect; 0297 0298 PagePrivate *pagePrivate = PagePrivate::get(d->m_page); 0299 const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform(); 0300 const double scaleX = d->m_page->width(); 0301 const double scaleY = d->m_page->height(); 0302 0303 NormalizedPoint startC = sel->start(); 0304 NormalizedPoint endC = sel->end(); 0305 NormalizedPoint temp; 0306 0307 // if startPoint is right to endPoint swap them 0308 if (startC.x > endC.x) { 0309 temp = startC; 0310 startC = endC; 0311 endC = temp; 0312 } 0313 0314 // minX,maxX,minY,maxY gives the bounding rectangle coordinates of the document 0315 const NormalizedRect boundingRect = d->m_page->boundingBox(); 0316 const QRect content = boundingRect.geometry(scaleX, scaleY); 0317 const double minX = content.left(); 0318 const double maxX = content.right(); 0319 const double minY = content.top(); 0320 const double maxY = content.bottom(); 0321 0322 /** 0323 * We will now find out the TinyTextEntity for the startRectangle and TinyTextEntity for 0324 * the endRectangle. We have four cases: 0325 * 0326 * Case 1(a): both startpoint and endpoint are out of the bounding Rectangle and at one side, so the rectangle made of start 0327 * and endPoint are outof the bounding rect (do not intersect) 0328 * 0329 * Case 1(b): both startpoint and endpoint are out of bounding rect, but they are in different side, so is their rectangle 0330 * 0331 * Case 2(a): find the rectangle which contains start and endpoint and having some 0332 * TextEntity 0333 * 0334 * Case 2(b): if 2(a) fails (if startPoint and endPoint both are unchanged), then we check whether there is any 0335 * TextEntity within the rect made by startPoint and endPoint 0336 * 0337 * Case 3: Now, we may have two type of selection. 0338 * 1. startpoint is left-top of start_end and endpoint is right-bottom 0339 * 2. startpoint is left-bottom of start_end and endpoint is top-right 0340 * 0341 * Also, as 2(b) is passed, we might have it,itEnd or both unchanged, but the fact is that we have 0342 * text within them. so, we need to search for the best suitable textposition for start and end. 0343 * 0344 * Case 3(a): We search the nearest rectangle consisting of some 0345 * TinyTextEntity right to or bottom of the startPoint for selection 01. 0346 * And, for selection 02, we have to search for right and top 0347 * 0348 * Case 3(b): For endpoint, we have to find the point top of or left to 0349 * endpoint if we have selection 01. 0350 * Otherwise, the search will be left and bottom 0351 */ 0352 0353 // we know that startC.x > endC.x, we need to decide which is top and which is bottom 0354 const NormalizedRect start_end = (startC.y < endC.y) ? NormalizedRect(startC.x, startC.y, endC.x, endC.y) : NormalizedRect(startC.x, endC.y, endC.x, startC.y); 0355 0356 // Case 1(a) 0357 if (!boundingRect.intersects(start_end)) { 0358 return ret; 0359 } else { 0360 // case 1(b) 0361 /** 0362 note that, after swapping of start and end, we know that, 0363 start is always left to end. but, we cannot say start is 0364 positioned upper than end. 0365 **/ 0366 // if start is left to content rect take it to content rect boundary 0367 if (startC.x * scaleX < minX) { 0368 startC.x = minX / scaleX; 0369 } 0370 if (endC.x * scaleX > maxX) { 0371 endC.x = maxX / scaleX; 0372 } 0373 0374 // if start is top to end (selection type 01) 0375 if (startC.y * scaleY < minY) { 0376 startC.y = minY / scaleY; 0377 } 0378 if (endC.y * scaleY > maxY) { 0379 endC.y = maxY / scaleY; 0380 } 0381 0382 // if start is bottom to end (selection type 02) 0383 if (startC.y * scaleY > maxY) { 0384 startC.y = maxY / scaleY; 0385 } 0386 if (endC.y * scaleY < minY) { 0387 endC.y = minY / scaleY; 0388 } 0389 } 0390 0391 TextEntity::List::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd(); 0392 TextEntity::List::ConstIterator start = it, end = itEnd, tmpIt = it; //, tmpItEnd = itEnd; 0393 const MergeSide side = d->m_page ? (MergeSide)d->m_page->totalOrientation() : MergeRight; 0394 0395 NormalizedRect tmp; 0396 // case 2(a) 0397 for (; it != itEnd; ++it) { 0398 tmp = it->area(); 0399 if (tmp.contains(startC.x, startC.y)) { 0400 start = it; 0401 } 0402 if (tmp.contains(endC.x, endC.y)) { 0403 end = it; 0404 } 0405 } 0406 0407 // case 2(b) 0408 it = tmpIt; 0409 if (start == it && end == itEnd) { 0410 for (; it != itEnd; ++it) { 0411 // is there any text rectangle within the start_end rect 0412 tmp = it->area(); 0413 if (start_end.intersects(tmp)) { 0414 break; 0415 } 0416 } 0417 0418 // we have searched every text entities, but none is within the rectangle created by start and end 0419 // so, no selection should be done 0420 if (it == itEnd) { 0421 return ret; 0422 } 0423 } 0424 it = tmpIt; 0425 bool selection_two_start = false; 0426 0427 // case 3.a 0428 if (start == it) { 0429 NormalizedRect rect; 0430 0431 // selection type 01 0432 if (startC.y <= endC.y) { 0433 for (; it != itEnd; ++it) { 0434 rect = it->area(); 0435 bool flagV = !rect.isBottom(startC); 0436 0437 if (flagV && rect.isRight(startC)) { 0438 start = it; 0439 break; 0440 } 0441 } 0442 } 0443 0444 // selection type 02 0445 else { 0446 selection_two_start = true; 0447 int distance = scaleX + scaleY + 100; 0448 0449 for (; it != itEnd; ++it) { 0450 rect = it->area(); 0451 0452 if (rect.isBottomOrLevel(startC) && rect.isRight(startC)) { 0453 QRect entRect = rect.geometry(scaleX, scaleY); 0454 int xdist, ydist; 0455 xdist = entRect.center().x() - startC.x * scaleX; 0456 ydist = entRect.center().y() - startC.y * scaleY; 0457 0458 // make them positive 0459 if (xdist < 0) { 0460 xdist = -xdist; 0461 } 0462 if (ydist < 0) { 0463 ydist = -ydist; 0464 } 0465 0466 if ((xdist + ydist) < distance) { 0467 distance = xdist + ydist; 0468 start = it; 0469 } 0470 } 0471 } 0472 } 0473 } 0474 0475 // case 3.b 0476 if (end == itEnd) { 0477 it = tmpIt; 0478 itEnd = itEnd - 1; 0479 0480 NormalizedRect rect; 0481 0482 if (startC.y <= endC.y) { 0483 for (; itEnd >= it; itEnd--) { 0484 rect = itEnd->area(); 0485 bool flagV = !rect.isTop(endC); 0486 0487 if (flagV && rect.isLeft(endC)) { 0488 end = itEnd; 0489 break; 0490 } 0491 } 0492 } 0493 0494 else { 0495 int distance = scaleX + scaleY + 100; 0496 for (; itEnd >= it; itEnd--) { 0497 rect = itEnd->area(); 0498 0499 if (rect.isTopOrLevel(endC) && rect.isLeft(endC)) { 0500 QRect entRect = rect.geometry(scaleX, scaleY); 0501 int xdist, ydist; 0502 xdist = entRect.center().x() - endC.x * scaleX; 0503 ydist = entRect.center().y() - endC.y * scaleY; 0504 0505 // make them positive 0506 if (xdist < 0) { 0507 xdist = -xdist; 0508 } 0509 if (ydist < 0) { 0510 ydist = -ydist; 0511 } 0512 0513 if ((xdist + ydist) < distance) { 0514 distance = xdist + ydist; 0515 end = itEnd; 0516 } 0517 } 0518 } 0519 } 0520 } 0521 0522 /* if start and end in selection 02 are in the same column, and we 0523 start at an empty space we have to remove the selection of last 0524 character 0525 */ 0526 if (selection_two_start) { 0527 if (start > end) { 0528 start = start - 1; 0529 } 0530 } 0531 0532 // if start is less than end swap them 0533 if (start > end) { 0534 it = start; 0535 start = end; 0536 end = it; 0537 } 0538 0539 // removes the possibility of crash, in case none of 1 to 3 is true 0540 if (end == d->m_words.constEnd()) { 0541 end--; 0542 } 0543 0544 for (; start <= end; start++) { 0545 ret->appendShape(start->transformedArea(matrix), side); 0546 } 0547 0548 return ret; 0549 } 0550 0551 RegularAreaRect *TextPage::findText(int searchID, const QString &query, SearchDirection direct, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area) 0552 { 0553 SearchDirection dir = direct; 0554 // invalid search request 0555 if (d->m_words.isEmpty() || query.isEmpty() || (area && area->isNull())) { 0556 return nullptr; 0557 } 0558 TextEntity::List::ConstIterator start; 0559 int start_offset = 0; 0560 TextEntity::List::ConstIterator end; 0561 const QMap<int, SearchPoint *>::const_iterator sIt = d->m_searchPoints.constFind(searchID); 0562 if (sIt == d->m_searchPoints.constEnd()) { 0563 // if no previous run of this search is found, then set it to start 0564 // from the beginning (respecting the search direction) 0565 if (dir == NextResult) { 0566 dir = FromTop; 0567 } else if (dir == PreviousResult) { 0568 dir = FromBottom; 0569 } 0570 } 0571 bool forward = true; 0572 switch (dir) { 0573 case FromTop: 0574 start = d->m_words.constBegin(); 0575 start_offset = 0; 0576 end = d->m_words.constEnd(); 0577 break; 0578 case FromBottom: 0579 start = d->m_words.constEnd(); 0580 start_offset = 0; 0581 end = d->m_words.constBegin(); 0582 forward = false; 0583 break; 0584 case NextResult: 0585 start = (*sIt)->it_end; 0586 start_offset = (*sIt)->offset_end; 0587 end = d->m_words.constEnd(); 0588 break; 0589 case PreviousResult: 0590 start = (*sIt)->it_begin; 0591 start_offset = (*sIt)->offset_begin; 0592 end = d->m_words.constBegin(); 0593 forward = false; 0594 break; 0595 }; 0596 RegularAreaRect *ret = nullptr; 0597 const TextComparisonFunction cmpFn = caseSensitivity == Qt::CaseSensitive ? CaseSensitiveCmpFn : CaseInsensitiveCmpFn; 0598 if (forward) { 0599 ret = d->findTextInternalForward(searchID, query, cmpFn, start, start_offset, end); 0600 } else { 0601 ret = d->findTextInternalBackward(searchID, query, cmpFn, start, start_offset, end); 0602 } 0603 return ret; 0604 } 0605 0606 // hyphenated '-' must be at the end of a word, so hyphenation means 0607 // we have a '-' just followed by a '\n' character 0608 // check if the string contains a '-' character 0609 // if the '-' is the last entry 0610 static int stringLengthAdaptedWithHyphen(const QString &str, TextEntity::List::ConstIterator it, TextEntity::List::ConstIterator textListEnd) 0611 { 0612 const int len = str.length(); 0613 0614 // hyphenated '-' must be at the end of a word, so hyphenation means 0615 // we have a '-' just followed by a '\n' character 0616 // check if the string contains a '-' character 0617 // if the '-' is the last entry 0618 if (str.endsWith(QLatin1Char('-'))) { 0619 // validity chek of it + 1 0620 if ((it + 1) != textListEnd) { 0621 // 1. if the next character is '\n' 0622 const QString &lookahedStr = (it + 1)->text(); 0623 if (lookahedStr.startsWith(QLatin1Char('\n'))) { 0624 return len - 1; 0625 } 0626 0627 // 2. if the next word is in a different line or not 0628 const NormalizedRect &hyphenArea = it->area(); 0629 const NormalizedRect &lookaheadArea = (it + 1)->area(); 0630 0631 // lookahead to check whether both the '-' rect and next character rect overlap 0632 if (!doesConsumeY(hyphenArea, lookaheadArea, 70)) { 0633 return len - 1; 0634 } 0635 } 0636 } 0637 // else if it is the second last entry - for example in pdf format 0638 else if (str.endsWith(QLatin1String("-\n"))) { 0639 return len - 2; 0640 } 0641 0642 return len; 0643 } 0644 0645 RegularAreaRect *TextPagePrivate::searchPointToArea(const SearchPoint *sp) 0646 { 0647 PagePrivate *pagePrivate = PagePrivate::get(m_page); 0648 const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform(); 0649 RegularAreaRect *ret = new RegularAreaRect; 0650 0651 for (TextEntity::List::ConstIterator it = sp->it_begin;; it++) { 0652 const TextEntity &curEntity = *it; 0653 ret->append(curEntity.transformedArea(matrix)); 0654 0655 if (it == sp->it_end) { 0656 break; 0657 } 0658 } 0659 0660 ret->simplify(); 0661 return ret; 0662 } 0663 0664 RegularAreaRect *TextPagePrivate::findTextInternalForward(int searchID, const QString &_query, TextComparisonFunction comparer, TextEntity::List::ConstIterator start, int start_offset, TextEntity::List::ConstIterator end) 0665 { 0666 // normalize query search all unicode (including glyphs) 0667 const QString query = _query.normalized(QString::NormalizationForm_KC); 0668 0669 // j is the current position in our query 0670 // queryLeft is the length of the query we have left to match 0671 int j = 0, queryLeft = query.length(); 0672 0673 TextEntity::List::ConstIterator it = start; 0674 int offset = start_offset; 0675 0676 TextEntity::List::ConstIterator it_begin = TextEntity::List::ConstIterator(); 0677 int offset_begin = 0; // dummy initial value to suppress compiler warnings 0678 0679 while (it != end) { 0680 const TextEntity &curEntity = *it; 0681 const QString &str = curEntity.text(); 0682 const int strLen = str.length(); 0683 const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd()); 0684 // adjustedLen <= strLen 0685 0686 if (offset >= strLen) { 0687 it++; 0688 offset = 0; 0689 continue; 0690 } 0691 0692 if (it_begin == TextEntity::List::ConstIterator()) { 0693 it_begin = it; 0694 offset_begin = offset; 0695 } 0696 0697 // Let the user write the hyphen or not when searching for text 0698 int matchedLen = -1; 0699 for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) { 0700 // we have equal (or less than) area of the query left as the length of the current 0701 // entity 0702 const int min = qMin(queryLeft, matchingLen - offset); 0703 if (comparer(QStringView {str}.mid(offset, min), QStringView {query}.mid(j, min))) { 0704 matchedLen = min; 0705 break; 0706 } 0707 } 0708 0709 if (matchedLen == -1) { 0710 // we have not matched 0711 // this means we do not have a complete match 0712 // we need to get back to query start 0713 // and continue the search from this place 0714 #ifdef DEBUG_TEXTPAGE 0715 qCDebug(OkularCoreDebug) << "\tnot matched"; 0716 #endif 0717 j = 0; 0718 queryLeft = query.length(); 0719 it = it_begin; 0720 offset = offset_begin + 1; 0721 it_begin = TextEntity::List::ConstIterator(); 0722 } else { 0723 // we have a match 0724 // move the current position in the query 0725 // to the position after the length of this string 0726 // we matched 0727 // subtract the length of the current entity from 0728 // the left length of the query 0729 #ifdef DEBUG_TEXTPAGE 0730 qCDebug(OkularCoreDebug) << "\tmatched" << matchedLen; 0731 #endif 0732 j += matchedLen; 0733 queryLeft -= matchedLen; 0734 0735 if (queryLeft == 0) { 0736 // save or update the search point for the current searchID 0737 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID); 0738 if (sIt == m_searchPoints.end()) { 0739 sIt = m_searchPoints.insert(searchID, new SearchPoint); 0740 } 0741 SearchPoint *sp = *sIt; 0742 sp->it_begin = it_begin; 0743 sp->it_end = it; 0744 sp->offset_begin = offset_begin; 0745 sp->offset_end = offset + matchedLen; 0746 return searchPointToArea(sp); 0747 } 0748 0749 it++; 0750 offset = 0; 0751 } 0752 } 0753 // end of loop - it means that we've ended the textentities 0754 0755 const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID); 0756 if (sIt != m_searchPoints.end()) { 0757 SearchPoint *sp = *sIt; 0758 m_searchPoints.erase(sIt); 0759 delete sp; 0760 } 0761 return nullptr; 0762 } 0763 0764 RegularAreaRect *TextPagePrivate::findTextInternalBackward(int searchID, const QString &_query, TextComparisonFunction comparer, TextEntity::List::ConstIterator start, int start_offset, TextEntity::List::ConstIterator end) 0765 { 0766 // normalize query to search all unicode (including glyphs) 0767 const QString query = _query.normalized(QString::NormalizationForm_KC); 0768 0769 // j is the current position in our query 0770 // len is the length of the string in TextEntity 0771 // queryLeft is the length of the query we have left 0772 int j = query.length(), queryLeft = query.length(); 0773 0774 TextEntity::List::ConstIterator it = start; 0775 int offset = start_offset; 0776 0777 TextEntity::List::ConstIterator it_begin = TextEntity::List::ConstIterator(); 0778 int offset_begin = 0; // dummy initial value to suppress compiler warnings 0779 0780 while (true) { 0781 if (offset <= 0) { 0782 if (it == end) { 0783 break; 0784 } 0785 it--; 0786 } 0787 0788 const TextEntity &curEntity = *it; 0789 const QString &str = curEntity.text(); 0790 const int strLen = str.length(); 0791 const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd()); 0792 // adjustedLen <= strLen 0793 0794 if (offset <= 0) { 0795 offset = strLen; 0796 } 0797 0798 if (it_begin == TextEntity::List::ConstIterator()) { 0799 it_begin = it; 0800 offset_begin = offset; 0801 } 0802 0803 // Let the user write the hyphen or not when searching for text 0804 int matchedLen = -1; 0805 // we have equal (or less than) area of the query left as the length of the current 0806 // entity 0807 for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) { 0808 const int hyphenOffset = (strLen - matchingLen); 0809 const int min = qMin(queryLeft + hyphenOffset, offset); 0810 if (comparer(QStringView {str}.mid(offset - min, min - hyphenOffset), QStringView {query}.mid(j - min + hyphenOffset, min - hyphenOffset))) { 0811 matchedLen = min - hyphenOffset; 0812 break; 0813 } 0814 } 0815 0816 if (matchedLen == -1) { 0817 // we have not matched 0818 // this means we do not have a complete match 0819 // we need to get back to query start 0820 // and continue the search from this place 0821 #ifdef DEBUG_TEXTPAGE 0822 qCDebug(OkularCoreDebug) << "\tnot matched"; 0823 #endif 0824 0825 j = query.length(); 0826 queryLeft = query.length(); 0827 it = it_begin; 0828 offset = offset_begin - 1; 0829 it_begin = TextEntity::List::ConstIterator(); 0830 } else { 0831 // we have a match 0832 // move the current position in the query 0833 // to the position after the length of this string 0834 // we matched 0835 // subtract the length of the current entity from 0836 // the left length of the query 0837 #ifdef DEBUG_TEXTPAGE 0838 qCDebug(OkularCoreDebug) << "\tmatched"; 0839 #endif 0840 j -= matchedLen; 0841 queryLeft -= matchedLen; 0842 0843 if (queryLeft == 0) { 0844 // save or update the search point for the current searchID 0845 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID); 0846 if (sIt == m_searchPoints.end()) { 0847 sIt = m_searchPoints.insert(searchID, new SearchPoint); 0848 } 0849 SearchPoint *sp = *sIt; 0850 sp->it_begin = it; 0851 sp->it_end = it_begin; 0852 sp->offset_begin = offset - matchedLen; 0853 sp->offset_end = offset_begin; 0854 return searchPointToArea(sp); 0855 } 0856 0857 offset = 0; 0858 } 0859 } 0860 // end of loop - it means that we've ended the textentities 0861 0862 const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID); 0863 if (sIt != m_searchPoints.end()) { 0864 SearchPoint *sp = *sIt; 0865 m_searchPoints.erase(sIt); 0866 delete sp; 0867 } 0868 return nullptr; 0869 } 0870 0871 QString TextPage::text(const RegularAreaRect *area) const 0872 { 0873 return text(area, AnyPixelTextAreaInclusionBehaviour); 0874 } 0875 0876 QString TextPage::text(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const 0877 { 0878 if (area && area->isNull()) { 0879 return QString(); 0880 } 0881 0882 TextEntity::List::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd(); 0883 QString ret; 0884 if (area) { 0885 for (; it != itEnd; ++it) { 0886 if (b == AnyPixelTextAreaInclusionBehaviour) { 0887 if (area->intersects(it->area())) { 0888 ret += it->text(); 0889 } 0890 } else { 0891 NormalizedPoint center = it->area().center(); 0892 if (area->contains(center.x, center.y)) { 0893 ret += it->text(); 0894 } 0895 } 0896 } 0897 } else { 0898 for (; it != itEnd; ++it) { 0899 ret += it->text(); 0900 } 0901 } 0902 return ret; 0903 } 0904 0905 static bool compareTinyTextEntityX(const WordWithCharacters &first, const WordWithCharacters &second) 0906 { 0907 QRect firstArea = first.area().roundedGeometry(1000, 1000); 0908 QRect secondArea = second.area().roundedGeometry(1000, 1000); 0909 0910 return firstArea.left() < secondArea.left(); 0911 } 0912 0913 static bool compareTinyTextEntityY(const WordWithCharacters &first, const WordWithCharacters &second) 0914 { 0915 const QRect firstArea = first.area().roundedGeometry(1000, 1000); 0916 const QRect secondArea = second.area().roundedGeometry(1000, 1000); 0917 0918 return firstArea.top() < secondArea.top(); 0919 } 0920 0921 /** 0922 * Sets a new world list. Deleting the contents of the old one 0923 */ 0924 void TextPagePrivate::setWordList(const TextEntity::List &list) 0925 { 0926 m_words = list; 0927 } 0928 0929 /** 0930 * Remove all the spaces in between texts. It will make all the generators 0931 * same, whether they save spaces(like pdf) or not(like djvu). 0932 */ 0933 static void removeSpace(TextEntity::List *words) 0934 { 0935 TextEntity::List::Iterator it = words->begin(); 0936 const QString str(QLatin1Char(' ')); 0937 0938 while (it != words->end()) { 0939 if (it->text() == str) { 0940 it = words->erase(it); 0941 } else { 0942 ++it; 0943 } 0944 } 0945 } 0946 0947 /** 0948 * We will read the TinyTextEntity from characters and try to create words from there. 0949 * Note: characters might be already characters for some generators, but we will keep 0950 * the nomenclature characters for the generator produced data. The resulting 0951 * WordsWithCharacters memory has to be managed by the caller, both the 0952 * WordWithCharacters::word and WordWithCharacters::characters contents 0953 */ 0954 static WordsWithCharacters makeWordFromCharacters(const TextEntity::List &characters, int pageWidth, int pageHeight) 0955 { 0956 /** 0957 * We will traverse characters and try to create words from the TinyTextEntities in it. 0958 * We will search TinyTextEntity blocks and merge them until we get a 0959 * space between two consecutive TinyTextEntities. When we get a space 0960 * we can take it as a end of word. Then we store the word as a TinyTextEntity 0961 * and keep it in newList. 0962 0963 * We create a RegionText named regionWord that contains the word and the characters associated with it and 0964 * a rectangle area of the element in newList. 0965 0966 */ 0967 WordsWithCharacters wordsWithCharacters; 0968 0969 TextEntity::List::ConstIterator it = characters.begin(), itEnd = characters.end(), tmpIt; 0970 int newLeft, newRight, newTop, newBottom; 0971 0972 for (; it != itEnd; it++) { 0973 QString textString = it->text(); 0974 QString newString; 0975 QRect lineArea = it->area().roundedGeometry(pageWidth, pageHeight); 0976 QRect elementArea; 0977 TextEntity::List wordCharacters; 0978 tmpIt = it; 0979 int space = 0; 0980 0981 while (!space) { 0982 if (!textString.isEmpty()) { 0983 newString.append(textString); 0984 0985 // when textString is the start of the word 0986 if (tmpIt == it) { 0987 NormalizedRect newRect(lineArea, pageWidth, pageHeight); 0988 wordCharacters.append(TextEntity(textString.normalized(QString::NormalizationForm_KC), newRect)); 0989 } else { 0990 NormalizedRect newRect(elementArea, pageWidth, pageHeight); 0991 wordCharacters.append(TextEntity(textString.normalized(QString::NormalizationForm_KC), newRect)); 0992 } 0993 } 0994 0995 ++it; 0996 0997 /* 0998 we must have to put this line before the if condition of it==itEnd 0999 otherwise the last character can be missed 1000 */ 1001 if (it == itEnd) { 1002 break; 1003 } 1004 elementArea = it->area().roundedGeometry(pageWidth, pageHeight); 1005 if (!doesConsumeY(elementArea, lineArea, 60)) { 1006 --it; 1007 break; 1008 } 1009 1010 const int text_y1 = elementArea.top(), text_x1 = elementArea.left(), text_y2 = elementArea.y() + elementArea.height(), text_x2 = elementArea.x() + elementArea.width(); 1011 const int line_y1 = lineArea.top(), line_x1 = lineArea.left(), line_y2 = lineArea.y() + lineArea.height(), line_x2 = lineArea.x() + lineArea.width(); 1012 1013 space = elementArea.left() - lineArea.right(); 1014 1015 if (space != 0) { 1016 it--; 1017 break; 1018 } 1019 1020 newLeft = text_x1 < line_x1 ? text_x1 : line_x1; 1021 newRight = line_x2 > text_x2 ? line_x2 : text_x2; 1022 newTop = text_y1 > line_y1 ? line_y1 : text_y1; 1023 newBottom = text_y2 > line_y2 ? text_y2 : line_y2; 1024 1025 lineArea.setLeft(newLeft); 1026 lineArea.setTop(newTop); 1027 lineArea.setWidth(newRight - newLeft); 1028 lineArea.setHeight(newBottom - newTop); 1029 1030 textString = it->text(); 1031 } 1032 1033 // if newString is not empty, save it 1034 if (!newString.isEmpty()) { 1035 const NormalizedRect newRect(lineArea, pageWidth, pageHeight); 1036 TextEntity word = TextEntity(newString.normalized(QString::NormalizationForm_KC), newRect); 1037 wordsWithCharacters.append(WordWithCharacters(word, wordCharacters)); 1038 } 1039 1040 if (it == itEnd) { 1041 break; 1042 } 1043 } 1044 1045 wordsWithCharacters.shrink_to_fit(); 1046 return wordsWithCharacters; 1047 } 1048 1049 /** 1050 * Create Lines from the words and sort them 1051 */ 1052 QList<QPair<WordsWithCharacters, QRect>> makeAndSortLines(const WordsWithCharacters &wordsTmp, int pageWidth, int pageHeight) 1053 { 1054 /** 1055 * We cannot assume that the generator will give us texts in the right order. 1056 * We can only assume that we will get texts in the page and their bounding 1057 * rectangle. The texts can be character, word, half-word anything. 1058 * So, we need to: 1059 ** 1060 * 1. Sort rectangles/boxes containing texts by y0(top) 1061 * 2. Create textline where there is y overlap between TinyTextEntity 's 1062 * 3. Within each line sort the TinyTextEntity 's by x0(left) 1063 */ 1064 1065 QList<QPair<WordsWithCharacters, QRect>> lines; 1066 1067 /* 1068 Make a new copy of the TextList in the words, so that the wordsTmp and lines do 1069 not contain same pointers for all the TinyTextEntity. 1070 */ 1071 QList<WordWithCharacters> words = wordsTmp; 1072 1073 // Step 1 1074 std::sort(words.begin(), words.end(), compareTinyTextEntityY); 1075 1076 // Step 2 1077 QList<WordWithCharacters>::Iterator it = words.begin(), itEnd = words.end(); 1078 1079 // for every non-space texts(characters/words) in the textList 1080 for (; it != itEnd; it++) { 1081 const QRect elementArea = (*it).area().roundedGeometry(pageWidth, pageHeight); 1082 bool found = false; 1083 1084 for (QPair<WordsWithCharacters, QRect> &linesI : lines) { 1085 /* the line area which will be expanded 1086 line_rects is only necessary to preserve the topmin and bottommax of all 1087 the texts in the line, left and right is not necessary at all 1088 */ 1089 QRect &lineArea = linesI.second; 1090 const int text_y1 = elementArea.top(), text_y2 = elementArea.top() + elementArea.height(), text_x1 = elementArea.left(), text_x2 = elementArea.left() + elementArea.width(); 1091 const int line_y1 = lineArea.top(), line_y2 = lineArea.top() + lineArea.height(), line_x1 = lineArea.left(), line_x2 = lineArea.left() + lineArea.width(); 1092 1093 /* 1094 if the new text and the line has y overlapping parts of more than 70%, 1095 the text will be added to this line 1096 */ 1097 if (doesConsumeY(elementArea, lineArea, 70)) { 1098 WordsWithCharacters &line = linesI.first; 1099 line.append(*it); 1100 1101 const int newLeft = line_x1 < text_x1 ? line_x1 : text_x1; 1102 const int newRight = line_x2 > text_x2 ? line_x2 : text_x2; 1103 const int newTop = line_y1 < text_y1 ? line_y1 : text_y1; 1104 const int newBottom = text_y2 > line_y2 ? text_y2 : line_y2; 1105 1106 lineArea = QRect(newLeft, newTop, newRight - newLeft, newBottom - newTop); 1107 found = true; 1108 } 1109 1110 if (found) { 1111 break; 1112 } 1113 } 1114 1115 /* when we have found a new line create a new TextList containing 1116 only one element and append it to the lines 1117 */ 1118 if (!found) { 1119 lines.append(QPair<WordsWithCharacters, QRect>({*it}, elementArea)); 1120 } 1121 } 1122 1123 // Step 3 1124 for (QPair<WordsWithCharacters, QRect> &line : lines) { 1125 WordsWithCharacters &list = line.first; 1126 std::sort(list.begin(), list.end(), compareTinyTextEntityX); 1127 } 1128 lines.shrink_to_fit(); 1129 1130 return lines; 1131 } 1132 1133 /** 1134 * Calculate Statistical information from the lines we made previously 1135 */ 1136 static void calculateStatisticalInformation(const QList<WordWithCharacters> &words, int pageWidth, int pageHeight, int *word_spacing, int *line_spacing, int *col_spacing) 1137 { 1138 /** 1139 * For the region, defined by line_rects and lines 1140 * 1. Make line statistical analysis to find the line spacing 1141 * 2. Make character statistical analysis to differentiate between 1142 * word spacing and column spacing. 1143 */ 1144 1145 /** 1146 * Step 0 1147 */ 1148 const QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(words, pageWidth, pageHeight); 1149 1150 /** 1151 * Step 1 1152 */ 1153 QMap<int, int> line_space_stat; 1154 for (int i = 0; i < sortedLines.length(); i++) { 1155 const QRect rectUpper = sortedLines.at(i).second; 1156 1157 if (i + 1 == sortedLines.length()) { 1158 break; 1159 } 1160 const QRect rectLower = sortedLines.at(i + 1).second; 1161 1162 int linespace = rectLower.top() - (rectUpper.top() + rectUpper.height()); 1163 if (linespace < 0) { 1164 linespace = -linespace; 1165 } 1166 1167 if (line_space_stat.contains(linespace)) { 1168 line_space_stat[linespace]++; 1169 } else { 1170 line_space_stat[linespace] = 1; 1171 } 1172 } 1173 1174 *line_spacing = 0; 1175 int weighted_count = 0; 1176 QMapIterator<int, int> iterate_linespace(line_space_stat); 1177 1178 while (iterate_linespace.hasNext()) { 1179 iterate_linespace.next(); 1180 *line_spacing += iterate_linespace.value() * iterate_linespace.key(); 1181 weighted_count += iterate_linespace.value(); 1182 } 1183 if (*line_spacing != 0) { 1184 *line_spacing = (int)((double)*line_spacing / (double)weighted_count + 0.5); 1185 } 1186 1187 /** 1188 * Step 2 1189 */ 1190 // We would like to use QMap instead of QHash as it will keep the keys sorted 1191 QMap<int, int> hor_space_stat; 1192 QMap<int, int> col_space_stat; 1193 QList<QList<QRect>> space_rects; 1194 QVector<QRect> max_hor_space_rects; 1195 1196 // Space in every line 1197 for (const QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) { 1198 const WordsWithCharacters list = sortedLine.first; 1199 QList<QRect> line_space_rects; 1200 int maxSpace = 0, minSpace = pageWidth; 1201 1202 // for every TinyTextEntity element in the line 1203 WordsWithCharacters::ConstIterator it = list.begin(), itEnd = list.end(); 1204 QRect max_area1, max_area2; 1205 // for every line 1206 for (; it != itEnd; it++) { 1207 const QRect area1 = (*it).area().roundedGeometry(pageWidth, pageHeight); 1208 if (it + 1 == itEnd) { 1209 break; 1210 } 1211 1212 const QRect area2 = (*(it + 1)).area().roundedGeometry(pageWidth, pageHeight); 1213 int space = area2.left() - area1.right(); 1214 1215 if (space > maxSpace) { 1216 max_area1 = area1; 1217 max_area2 = area2; 1218 maxSpace = space; 1219 } 1220 1221 if (space < minSpace && space != 0) { 1222 minSpace = space; 1223 } 1224 1225 // if we found a real space, whose length is not zero and also less than the pageWidth 1226 if (space != 0 && space != pageWidth) { 1227 // increase the count of the space amount 1228 if (hor_space_stat.contains(space)) { 1229 hor_space_stat[space]++; 1230 } else { 1231 hor_space_stat[space] = 1; 1232 } 1233 1234 int left, right, top, bottom; 1235 1236 left = area1.right(); 1237 right = area2.left(); 1238 1239 top = area2.top() < area1.top() ? area2.top() : area1.top(); 1240 bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom(); 1241 1242 QRect rect(left, top, right - left, bottom - top); 1243 line_space_rects.append(rect); 1244 } 1245 } 1246 1247 space_rects.append(line_space_rects); 1248 1249 if (hor_space_stat.contains(maxSpace)) { 1250 if (hor_space_stat[maxSpace] != 1) { 1251 hor_space_stat[maxSpace]--; 1252 } else { 1253 hor_space_stat.remove(maxSpace); 1254 } 1255 } 1256 1257 if (maxSpace != 0) { 1258 if (col_space_stat.contains(maxSpace)) { 1259 col_space_stat[maxSpace]++; 1260 } else { 1261 col_space_stat[maxSpace] = 1; 1262 } 1263 1264 // store the max rect of each line 1265 const int left = max_area1.right(); 1266 const int right = max_area2.left(); 1267 const int top = (max_area1.top() > max_area2.top()) ? max_area2.top() : max_area1.top(); 1268 const int bottom = (max_area1.bottom() < max_area2.bottom()) ? max_area2.bottom() : max_area1.bottom(); 1269 1270 const QRect rect(left, top, right - left, bottom - top); 1271 max_hor_space_rects.append(rect); 1272 } else { 1273 max_hor_space_rects.append(QRect(0, 0, 0, 0)); 1274 } 1275 } 1276 1277 // All the between word space counts are in hor_space_stat 1278 *word_spacing = 0; 1279 weighted_count = 0; 1280 QMapIterator<int, int> iterate(hor_space_stat); 1281 1282 while (iterate.hasNext()) { 1283 iterate.next(); 1284 1285 if (iterate.key() > 0) { 1286 *word_spacing += iterate.value() * iterate.key(); 1287 weighted_count += iterate.value(); 1288 } 1289 } 1290 if (weighted_count) { 1291 *word_spacing = (int)((double)*word_spacing / (double)weighted_count + 0.5); 1292 } 1293 1294 *col_spacing = 0; 1295 QMapIterator<int, int> iterate_col(col_space_stat); 1296 1297 while (iterate_col.hasNext()) { 1298 iterate_col.next(); 1299 if (iterate_col.value() > *col_spacing) { 1300 *col_spacing = iterate_col.value(); 1301 } 1302 } 1303 *col_spacing = col_space_stat.key(*col_spacing); 1304 1305 // if there is just one line in a region, there is no point in dividing it 1306 if (sortedLines.length() == 1) { 1307 *word_spacing = *col_spacing; 1308 } 1309 } 1310 1311 /** 1312 * Implements the XY Cut algorithm for textpage segmentation 1313 * The resulting RegionTextList will contain RegionText whose WordsWithCharacters::word and 1314 * WordsWithCharacters::characters are reused from wordsWithCharacters (i.e. no new nor delete happens in this function) 1315 */ 1316 static RegionTextList XYCutForBoundingBoxes(const QList<WordWithCharacters> &wordsWithCharacters, int pageWidth, int pageHeight) 1317 { 1318 RegionTextList tree; 1319 QRect contentRect(0, 0, pageWidth, pageHeight); 1320 const RegionText root(wordsWithCharacters, contentRect); 1321 1322 // start the tree with the root, it is our only region at the start 1323 tree.push_back(root); 1324 1325 int i = 0; 1326 1327 // while traversing the tree has not been ended 1328 while (i < tree.length()) { 1329 const RegionText node = tree.at(i); 1330 QRect regionRect = node.area(); 1331 1332 /** 1333 * 1. calculation of projection profiles 1334 */ 1335 // allocate the size of proj profiles and initialize with 0 1336 int size_proj_y = node.area().height(); 1337 int size_proj_x = node.area().width(); 1338 // dynamic memory allocation 1339 QVarLengthArray<int> proj_on_xaxis(size_proj_x); 1340 QVarLengthArray<int> proj_on_yaxis(size_proj_y); 1341 1342 for (int j = 0; j < size_proj_y; ++j) { 1343 proj_on_yaxis[j] = 0; 1344 } 1345 for (int j = 0; j < size_proj_x; ++j) { 1346 proj_on_xaxis[j] = 0; 1347 } 1348 1349 const QList<WordWithCharacters> list = node.text(); 1350 1351 // Calculate tcx and tcy locally for each new region 1352 int word_spacing, line_spacing, column_spacing; 1353 calculateStatisticalInformation(list, pageWidth, pageHeight, &word_spacing, &line_spacing, &column_spacing); 1354 1355 const int tcx = word_spacing * 2; 1356 const int tcy = line_spacing * 2; 1357 1358 int maxX = 0, maxY = 0; 1359 int avgX = 0; 1360 int count; 1361 1362 // for every text in the region 1363 for (const WordWithCharacters &wwc : list) { 1364 const QRect entRect = wwc.area().geometry(pageWidth, pageHeight); 1365 1366 // calculate vertical projection profile proj_on_xaxis1 1367 for (int k = entRect.left(); k <= entRect.left() + entRect.width(); ++k) { 1368 if ((k - regionRect.left()) < size_proj_x && (k - regionRect.left()) >= 0) { 1369 proj_on_xaxis[k - regionRect.left()] += entRect.height(); 1370 } 1371 } 1372 1373 // calculate horizontal projection profile in the same way 1374 for (int k = entRect.top(); k <= entRect.top() + entRect.height(); ++k) { 1375 if ((k - regionRect.top()) < size_proj_y && (k - regionRect.top()) >= 0) { 1376 proj_on_yaxis[k - regionRect.top()] += entRect.width(); 1377 } 1378 } 1379 } 1380 1381 for (int j = 0; j < size_proj_y; ++j) { 1382 if (proj_on_yaxis[j] > maxY) { 1383 maxY = proj_on_yaxis[j]; 1384 } 1385 } 1386 1387 avgX = count = 0; 1388 for (int j = 0; j < size_proj_x; ++j) { 1389 if (proj_on_xaxis[j] > maxX) { 1390 maxX = proj_on_xaxis[j]; 1391 } 1392 if (proj_on_xaxis[j]) { 1393 count++; 1394 avgX += proj_on_xaxis[j]; 1395 } 1396 } 1397 if (count) { 1398 avgX /= count; 1399 } 1400 1401 /** 1402 * 2. Cleanup Boundary White Spaces and removal of noise 1403 */ 1404 int xbegin = 0, xend = size_proj_x - 1; 1405 int ybegin = 0, yend = size_proj_y - 1; 1406 while (xbegin < size_proj_x && proj_on_xaxis[xbegin] <= 0) { 1407 xbegin++; 1408 } 1409 while (xend >= 0 && proj_on_xaxis[xend] <= 0) { 1410 xend--; 1411 } 1412 while (ybegin < size_proj_y && proj_on_yaxis[ybegin] <= 0) { 1413 ybegin++; 1414 } 1415 while (yend >= 0 && proj_on_yaxis[yend] <= 0) { 1416 yend--; 1417 } 1418 1419 // update the regionRect 1420 int old_left = regionRect.left(), old_top = regionRect.top(); 1421 regionRect.setLeft(old_left + xbegin); 1422 regionRect.setRight(old_left + xend); 1423 regionRect.setTop(old_top + ybegin); 1424 regionRect.setBottom(old_top + yend); 1425 1426 int tnx = (int)((double)avgX * 10.0 / 100.0 + 0.5), tny = 0; 1427 for (int j = 0; j < size_proj_x; ++j) { 1428 proj_on_xaxis[j] -= tnx; 1429 } 1430 for (int j = 0; j < size_proj_y; ++j) { 1431 proj_on_yaxis[j] -= tny; 1432 } 1433 1434 /** 1435 * 3. Find the Widest gap 1436 */ 1437 int gap_hor = -1, pos_hor = -1; 1438 int begin = -1, end = -1; 1439 1440 // find all hor_gaps and find the maximum between them 1441 for (int j = 1; j < size_proj_y; ++j) { 1442 // transition from white to black 1443 if (begin >= 0 && proj_on_yaxis[j - 1] <= 0 && proj_on_yaxis[j] > 0) { 1444 end = j; 1445 } 1446 1447 // transition from black to white 1448 if (proj_on_yaxis[j - 1] > 0 && proj_on_yaxis[j] <= 0) { 1449 begin = j; 1450 } 1451 1452 if (begin > 0 && end > 0 && end - begin > gap_hor) { 1453 gap_hor = end - begin; 1454 pos_hor = (end + begin) / 2; 1455 begin = -1; 1456 end = -1; 1457 } 1458 } 1459 1460 begin = -1, end = -1; 1461 int gap_ver = -1, pos_ver = -1; 1462 1463 // find all the ver_gaps and find the maximum between them 1464 for (int j = 1; j < size_proj_x; ++j) { 1465 // transition from white to black 1466 if (begin >= 0 && proj_on_xaxis[j - 1] <= 0 && proj_on_xaxis[j] > 0) { 1467 end = j; 1468 } 1469 1470 // transition from black to white 1471 if (proj_on_xaxis[j - 1] > 0 && proj_on_xaxis[j] <= 0) { 1472 begin = j; 1473 } 1474 1475 if (begin > 0 && end > 0 && end - begin > gap_ver) { 1476 gap_ver = end - begin; 1477 pos_ver = (end + begin) / 2; 1478 begin = -1; 1479 end = -1; 1480 } 1481 } 1482 1483 int cut_pos_x = pos_ver, cut_pos_y = pos_hor; 1484 int gap_x = gap_ver, gap_y = gap_hor; 1485 1486 /** 1487 * 4. Cut the region and make nodes (left,right) or (up,down) 1488 */ 1489 bool cut_hor = false, cut_ver = false; 1490 1491 // For horizontal cut 1492 const int topHeight = cut_pos_y - (regionRect.top() - old_top); 1493 const QRect topRect(regionRect.left(), regionRect.top(), regionRect.width(), topHeight); 1494 const QRect bottomRect(regionRect.left(), regionRect.top() + topHeight, regionRect.width(), regionRect.height() - topHeight); 1495 1496 // For vertical Cut 1497 const int leftWidth = cut_pos_x - (regionRect.left() - old_left); 1498 const QRect leftRect(regionRect.left(), regionRect.top(), leftWidth, regionRect.height()); 1499 const QRect rightRect(regionRect.left() + leftWidth, regionRect.top(), regionRect.width() - leftWidth, regionRect.height()); 1500 1501 if (gap_y >= gap_x && gap_y >= tcy) { 1502 cut_hor = true; 1503 } else if (gap_y >= gap_x && gap_y <= tcy && gap_x >= tcx) { 1504 cut_ver = true; 1505 } else if (gap_x >= gap_y && gap_x >= tcx) { 1506 cut_ver = true; 1507 } else if (gap_x >= gap_y && gap_x <= tcx && gap_y >= tcy) { 1508 cut_hor = true; 1509 } else { 1510 // no cut possible 1511 // we can now update the node rectangle with the shrinked rectangle 1512 RegionText tmpNode = tree.at(i); 1513 tmpNode.setArea(regionRect); 1514 tree.replace(i, tmpNode); 1515 i++; 1516 continue; 1517 } 1518 1519 WordsWithCharacters list1, list2; 1520 1521 // horizontal cut, topRect and bottomRect 1522 if (cut_hor) { 1523 for (const WordWithCharacters &word : list) { 1524 const QRect wordRect = word.area().geometry(pageWidth, pageHeight); 1525 1526 if (topRect.intersects(wordRect)) { 1527 list1.append(word); 1528 } else { 1529 list2.append(word); 1530 } 1531 } 1532 1533 RegionText node1(list1, topRect); 1534 RegionText node2(list2, bottomRect); 1535 1536 tree.replace(i, node1); 1537 tree.insert(i + 1, node2); 1538 } 1539 1540 // vertical cut, leftRect and rightRect 1541 else if (cut_ver) { 1542 for (const WordWithCharacters &word : list) { 1543 const QRect wordRect = word.area().geometry(pageWidth, pageHeight); 1544 1545 if (leftRect.intersects(wordRect)) { 1546 list1.append(word); 1547 } else { 1548 list2.append(word); 1549 } 1550 } 1551 1552 RegionText node1(list1, leftRect); 1553 RegionText node2(list2, rightRect); 1554 1555 tree.replace(i, node1); 1556 tree.insert(i + 1, node2); 1557 } 1558 } 1559 1560 tree.shrink_to_fit(); 1561 return tree; 1562 } 1563 1564 /** 1565 * Add spaces in between words in a line. It reuses the pointers passed in tree and might add new ones. You will need to take care of deleting them if needed 1566 */ 1567 TextEntity::List addNecessarySpace(RegionTextList tree, int pageWidth, int pageHeight) 1568 { 1569 /** 1570 * 1. Call makeAndSortLines before adding spaces in between words in a line 1571 * 2. Now add spaces between every two words in a line 1572 * 3. Finally, extract all the space separated texts from each region and return it 1573 */ 1574 1575 TextEntity::List res; 1576 // Only change the texts under RegionTexts, not the area 1577 for (const RegionText &tmpRegion : std::as_const(tree)) { 1578 // Step 01 1579 QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(tmpRegion.text(), pageWidth, pageHeight); 1580 int counter = 0; 1581 1582 // Step 02 1583 for (QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) { 1584 WordsWithCharacters &list = sortedLine.first; 1585 for (int k = 0; k < list.length(); k++) { 1586 const QRect area1 = list.at(k).area().roundedGeometry(pageWidth, pageHeight); 1587 if (k + 1 >= list.length()) { 1588 break; 1589 } 1590 1591 const QRect area2 = list.at(k + 1).area().roundedGeometry(pageWidth, pageHeight); 1592 const int space = area2.left() - area1.right(); 1593 1594 if (space != 0) { 1595 // Make a TinyTextEntity of string space and push it between it and it+1 1596 const int left = area1.right(); 1597 const int right = area2.left(); 1598 const int top = area2.top() < area1.top() ? area2.top() : area1.top(); 1599 const int bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom(); 1600 1601 const QString spaceStr(QStringLiteral(" ")); 1602 const QRect rect(QPoint(left, top), QPoint(right, bottom)); 1603 const NormalizedRect entRect(rect, pageWidth, pageHeight); 1604 TextEntity ent1 = TextEntity(spaceStr, entRect); 1605 WordWithCharacters word(ent1, QList<TextEntity>() << ent1); 1606 1607 list.insert(k + 1, word); 1608 1609 // Skip the space 1610 k++; 1611 } 1612 } 1613 counter += list.length(); 1614 } 1615 res.reserve(res.length() + counter); 1616 1617 for (const QPair<WordsWithCharacters, QRect> &sortedLine : std::as_const(sortedLines)) { 1618 for (const WordWithCharacters &word : sortedLine.first) { 1619 res += word.characters; 1620 } 1621 } 1622 } 1623 1624 // Step 03 1625 res.shrink_to_fit(); 1626 return res; 1627 } 1628 1629 /** 1630 * Correct the textOrder, all layout recognition works here 1631 */ 1632 void TextPagePrivate::correctTextOrder() 1633 { 1634 // m_page->width() and m_page->height() are in pixels at 1635 // 100% zoom level, and thus depend on display DPI. 1636 // To avoid Okular failing on lowDPI displays, 1637 // we scale pageWidth and pageHeight so their sum equals 2000. 1638 const double scalingFactor = 2000.0 / (m_page->width() + m_page->height()); 1639 const int pageWidth = (int)(scalingFactor * m_page->width()); 1640 const int pageHeight = (int)(scalingFactor * m_page->height()); 1641 1642 TextEntity::List characters = m_words; 1643 1644 /** 1645 * Remove spaces from the text 1646 */ 1647 removeSpace(&characters); 1648 1649 /** 1650 * Construct words from characters 1651 */ 1652 const QList<WordWithCharacters> wordsWithCharacters = makeWordFromCharacters(characters, pageWidth, pageHeight); 1653 1654 /** 1655 * Make a XY Cut tree for segmentation of the texts 1656 */ 1657 RegionTextList tree = XYCutForBoundingBoxes(wordsWithCharacters, pageWidth, pageHeight); 1658 1659 /** 1660 * Add spaces to the word 1661 */ 1662 const auto listOfCharacters = addNecessarySpace(std::move(tree), pageWidth, pageHeight); 1663 1664 setWordList(listOfCharacters); 1665 } 1666 1667 TextEntity::List TextPage::words(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const 1668 { 1669 if (area && area->isNull()) { 1670 return TextEntity::List(); 1671 } 1672 1673 TextEntity::List ret; 1674 if (area) { 1675 for (const TextEntity &te : std::as_const(d->m_words)) { 1676 if (b == AnyPixelTextAreaInclusionBehaviour) { 1677 if (area->intersects(te.area())) { 1678 ret.append(te); 1679 } 1680 } else { 1681 const NormalizedPoint center = te.area().center(); 1682 if (area->contains(center.x, center.y)) { 1683 ret.append(te); 1684 } 1685 } 1686 } 1687 } else { 1688 for (const TextEntity &te : std::as_const(d->m_words)) { 1689 ret.append(te); 1690 } 1691 } 1692 return ret; 1693 } 1694 1695 RegularAreaRect *TextPage::wordAt(const NormalizedPoint &p, QString *word) const 1696 { 1697 TextEntity::List::ConstIterator itBegin = d->m_words.constBegin(), itEnd = d->m_words.constEnd(); 1698 TextEntity::List::ConstIterator it = itBegin; 1699 TextEntity::List::ConstIterator posIt = itEnd; 1700 for (; it != itEnd; ++it) { 1701 if (it->area().contains(p.x, p.y)) { 1702 posIt = it; 1703 break; 1704 } 1705 } 1706 if (posIt != itEnd) { 1707 if (posIt->text().simplified().isEmpty()) { 1708 return nullptr; 1709 } 1710 // Find the first TinyTextEntity of the word 1711 while (posIt != itBegin) { 1712 --posIt; 1713 const QString itText = posIt->text(); 1714 if (itText.right(1).at(0).isSpace()) { 1715 if (itText.endsWith(QLatin1String("-\n"))) { 1716 // Is an hyphenated word 1717 // continue searching the start of the word back 1718 continue; 1719 } 1720 1721 if (itText == QLatin1String("\n") && posIt != itBegin) { 1722 --posIt; 1723 if (posIt->text().endsWith(QLatin1String("-"))) { 1724 // Is an hyphenated word 1725 // continue searching the start of the word back 1726 continue; 1727 } 1728 ++posIt; 1729 } 1730 1731 ++posIt; 1732 break; 1733 } 1734 } 1735 RegularAreaRect *ret = new RegularAreaRect(); 1736 QString foundWord; 1737 for (; posIt != itEnd; ++posIt) { 1738 const QString itText = posIt->text(); 1739 if (itText.simplified().isEmpty()) { 1740 break; 1741 } 1742 1743 ret->appendShape(posIt->area()); 1744 foundWord += posIt->text(); 1745 if (itText.right(1).at(0).isSpace()) { 1746 if (!foundWord.endsWith(QLatin1String("-\n"))) { 1747 break; 1748 } 1749 } 1750 } 1751 1752 if (word) { 1753 *word = foundWord; 1754 } 1755 return ret; 1756 } else { 1757 return nullptr; 1758 } 1759 }